From 8beb63fe2981668a359d5068f4ce36f92f5dca71 Mon Sep 17 00:00:00 2001 From: Amit Kumar Date: Sun, 24 May 2026 18:42:11 +0000 Subject: [PATCH 01/11] refactor(mcp): drop 14 non-triage tools, keep 7-tool triage surface MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reduces the MCP HTTP-streamable surface from 21 tools to 7 — the minimum set needed for an LLM-driven incident-triage workflow on a 120-service SQLite deployment that's currently OOMing within an hour. Kept (7): get_anomaly_timeline, get_service_map, get_service_health, root_cause_analysis, impact_analysis, trace_graph, search_logs. Cut (14): get_system_graph, tail_logs, get_trace, search_traces, get_metrics, get_dashboard_stats, get_storage_status, find_similar_logs, get_alerts, correlated_signals, get_error_chains, get_investigations, get_investigation, get_graph_snapshot. The cut tools fall into three buckets: (a) duplicates of a kept tool with a slightly different framing (get_system_graph ≈ get_service_map, get_error_chains is folded into root_cause_analysis); (b) require subsystems being dropped in follow-up commits (find_similar_logs → vectordb, get_graph_snapshot → snapshot table); (c) belong to a separate forensic-analytics workflow not part of active triage (get_investigations, get_dashboard_stats). MCP clients calling cut tools receive an "unknown tool" RPC error — no deprecation period, the cut is intentional and immediate. Files touched: cache.go cacheable list re-sorted to mirror toolDefs; dispatcher in tools.go collapsed to the 7-case switch; tools_ran20_test.go (find_similar_logs only) deleted; server_ran22_test.go pared down to the constructor-tenant signature test now that the HTTP find_similar_logs flow is gone (the no-header default-tenant invariant is covered by tenant_isolation_test.go); tenant_isolation_test.go drops subtests for cut tools. 
Design spec: docs/superpowers/specs/2026-05-24-mcp-7tool-sqlite-survival-design.md --- internal/mcp/cache.go | 6 +- internal/mcp/server_ran22_test.go | 110 +---- internal/mcp/tenant_isolation_test.go | 196 +------- internal/mcp/tools.go | 618 +++----------------------- internal/mcp/tools_ran20_test.go | 79 ---- 5 files changed, 89 insertions(+), 920 deletions(-) delete mode 100644 internal/mcp/tools_ran20_test.go diff --git a/internal/mcp/cache.go b/internal/mcp/cache.go index df48747..4cf85bf 100644 --- a/internal/mcp/cache.go +++ b/internal/mcp/cache.go @@ -21,11 +21,11 @@ import ( // changes meaningfully on millisecond scales and the per-call DB cost is // already bounded by the storage layer. var cacheableTools = map[string]struct{}{ - "get_service_map": {}, - "impact_analysis": {}, - "root_cause_analysis": {}, "get_anomaly_timeline": {}, + "get_service_map": {}, "get_service_health": {}, + "root_cause_analysis": {}, + "impact_analysis": {}, } // isCacheable reports whether a tool name is on the cache whitelist. diff --git a/internal/mcp/server_ran22_test.go b/internal/mcp/server_ran22_test.go index 37f6de6..dbf020f 100644 --- a/internal/mcp/server_ran22_test.go +++ b/internal/mcp/server_ran22_test.go @@ -1,15 +1,9 @@ package mcp import ( - "bytes" - "encoding/json" - "net/http" - "net/http/httptest" - "strings" "testing" "github.com/RandomCodeSpace/otelcontext/internal/storage" - "github.com/RandomCodeSpace/otelcontext/internal/vectordb" ) // TestNew_DefaultTenant_FromConstructor is the RAN-22 regression bar at the @@ -17,21 +11,26 @@ import ( // to New, so production startup wiring (main.go) cannot drop it without a // compile error. Empty input falls back to storage.DefaultTenantID; a // non-empty value is preserved verbatim. 
+// +// End-to-end coverage that the configured default actually flows through the +// HTTP transport into the tenant-scoped tool path is provided by +// tenant_isolation_test.go::TestMCP_TenantIsolation_AllGraphRAGTools (the +// no-header caller). func TestNew_DefaultTenant_FromConstructor(t *testing.T) { t.Run("empty falls back to storage.DefaultTenantID", func(t *testing.T) { - srv := New("", nil, nil, nil, vectordb.New(1)) + srv := New("", nil, nil, nil, nil) if srv.defaultTenant != storage.DefaultTenantID { t.Fatalf(`New("") defaultTenant = %q, want %q`, srv.defaultTenant, storage.DefaultTenantID) } }) t.Run("non-empty value is preserved", func(t *testing.T) { - srv := New("acme", nil, nil, nil, vectordb.New(1)) + srv := New("acme", nil, nil, nil, nil) if srv.defaultTenant != "acme" { t.Fatalf(`New("acme") defaultTenant = %q, want "acme"`, srv.defaultTenant) } }) t.Run("SetDefaultTenant runtime override still works", func(t *testing.T) { - srv := New("acme", nil, nil, nil, vectordb.New(1)) + srv := New("acme", nil, nil, nil, nil) srv.SetDefaultTenant("globex") if srv.defaultTenant != "globex" { t.Fatalf(`SetDefaultTenant("globex") defaultTenant = %q, want "globex"`, srv.defaultTenant) @@ -43,96 +42,3 @@ func TestNew_DefaultTenant_FromConstructor(t *testing.T) { } }) } - -// TestNew_DefaultTenant_FlowsThroughHTTPTransport proves that the constructor- -// supplied tenant is the actual fallback used by the JSON-RPC HTTP handler -// when no X-Tenant-ID header is present, and that an explicit header still -// wins over the default. This locks in the end-to-end behavior the RAN-22 -// fix delivers: a deployment with DEFAULT_TENANT=acme returns acme-scoped -// data from header-less MCP tool calls. 
-func TestNew_DefaultTenant_FlowsThroughHTTPTransport(t *testing.T) { - idx := vectordb.New(100) - idx.Add(1, "acme", "checkout", "ERROR", "payment gateway timeout acme-marker-xyz") - idx.Add(2, "globex", "auth", "ERROR", "payment gateway 500 globex-marker-qqq") - idx.Add(3, "default", "svc", "ERROR", "payment gateway refused default-marker-aaa") - - body := mustMarshalJSONRPC(t, "find_similar_logs", map[string]any{ - "query": "payment gateway", - "limit": float64(50), - }) - - srv := New("acme", nil, nil, nil, idx) - - // Header-less tools/call must scope to the constructor-provided default. - resp1 := callNoHeader(t, srv, body) - mustContain(t, resp1, "acme-marker-xyz") - mustNotContain(t, resp1, "globex-marker-qqq", "default-marker-aaa") - - // Explicit X-Tenant-ID header beats the configured default — precedence - // invariant is preserved. - resp2 := callWithHeader(t, srv, body, "globex") - mustContain(t, resp2, "globex-marker-qqq") - mustNotContain(t, resp2, "acme-marker-xyz", "default-marker-aaa") - - // SetDefaultTenant runtime override flows to the same transport path so - // future runtime-config-reload paths behave correctly. 
- srv.SetDefaultTenant("globex") - resp3 := callNoHeader(t, srv, body) - mustContain(t, resp3, "globex-marker-qqq") - mustNotContain(t, resp3, "acme-marker-xyz", "default-marker-aaa") -} - -func mustMarshalJSONRPC(t *testing.T, tool string, args map[string]any) []byte { - t.Helper() - b, err := json.Marshal(map[string]any{ - "jsonrpc": "2.0", - "id": 1, - "method": "tools/call", - "params": map[string]any{"name": tool, "arguments": args}, - }) - if err != nil { - t.Fatalf("marshal: %v", err) - } - return b -} - -func callNoHeader(t *testing.T, srv *Server, body []byte) string { - t.Helper() - req := httptest.NewRequest(http.MethodPost, "/mcp", bytes.NewReader(body)) - req.Header.Set("Content-Type", "application/json") - rr := httptest.NewRecorder() - srv.Handler().ServeHTTP(rr, req) - if rr.Code != http.StatusOK { - t.Fatalf("HTTP %d: %s", rr.Code, rr.Body.String()) - } - return rr.Body.String() -} - -func callWithHeader(t *testing.T, srv *Server, body []byte, tenant string) string { - t.Helper() - req := httptest.NewRequest(http.MethodPost, "/mcp", bytes.NewReader(body)) - req.Header.Set("Content-Type", "application/json") - req.Header.Set("X-Tenant-ID", tenant) - rr := httptest.NewRecorder() - srv.Handler().ServeHTTP(rr, req) - if rr.Code != http.StatusOK { - t.Fatalf("HTTP %d: %s", rr.Code, rr.Body.String()) - } - return rr.Body.String() -} - -func mustContain(t *testing.T, body, want string) { - t.Helper() - if !strings.Contains(body, want) { - t.Fatalf("response missing expected marker %q:\n%s", want, body) - } -} - -func mustNotContain(t *testing.T, body string, forbidden ...string) { - t.Helper() - for _, f := range forbidden { - if strings.Contains(body, f) { - t.Fatalf("response leaked forbidden marker %q:\n%s", f, body) - } - } -} diff --git a/internal/mcp/tenant_isolation_test.go b/internal/mcp/tenant_isolation_test.go index bb4d5cc..6ae90f3 100644 --- a/internal/mcp/tenant_isolation_test.go +++ b/internal/mcp/tenant_isolation_test.go @@ -111,11 +111,14 
@@ func setupTenantIsolationServer(t *testing.T) (*httptest.Server, *graphrag.Graph // seedTenant ingests a small but representative slice of telemetry for // tenant T: a parent OK span, a child ERROR span, a matching ERROR log, -// a vector-index doc, an injected anomaly, a persisted investigation, -// and a graph snapshot row. All identifiers (trace_id, span_id) collide +// and an injected anomaly. All identifiers (trace_id, span_id) collide // across tenants on purpose — the tenant slice is the only thing keeping // them apart. -func seedTenant(t *testing.T, g *graphrag.GraphRAG, repo *storage.Repository, vIdx *vectordb.Index, tenant string, ts time.Time) { +// +// The repository and vector-index parameters are accepted but currently unused +// (both blank-named); future tests may seed DB rows directly. They are preserved +// so callers can switch back to DB-shaped seeding without a signature change. +func seedTenant(t *testing.T, g *graphrag.GraphRAG, _ *storage.Repository, _ any, tenant string, ts time.Time) { t.Helper() service := tenant + "-orders" @@ -153,8 +156,8 @@ func seedTenant(t *testing.T, g *graphrag.GraphRAG, repo *storage.Repository, vI Duration: 1000, }) - // Log carrying the per-tenant marker — drives Drain clustering and - // CorrelatedSignals; the body is also stored in the vector index. + // Log carrying the per-tenant marker — drives Drain clustering and the + // LogClusterNode side-effect that CorrelatedSignals would consume. g.OnLogIngested(storage.Log{ TenantID: tenant, TraceID: traceID, @@ -165,9 +168,6 @@ func seedTenant(t *testing.T, g *graphrag.GraphRAG, repo *storage.Repository, vI Timestamp: ts.Add(2 * time.Millisecond), }) - // Vector index doc — find_similar_logs path is keyed by tenant. - vIdx.Add(0, tenant, service, "ERROR", logBody) - // Inject a per-tenant anomaly directly so AnomalyTimeline has // something to return without depending on the anomaly detector // loop (which is throttled to 24h in this fixture). 
@@ -179,21 +179,6 @@ func seedTenant(t *testing.T, g *graphrag.GraphRAG, repo *storage.Repository, vI Evidence: tenant + "-anomaly-marker error_rate=0.95", Timestamp: ts.Add(3 * time.Millisecond), }) - - // Snapshot row — insert directly so we control the tenant_id and ID - // (takeSnapshot is the production loop, but it is package-private). - snap := graphrag.GraphSnapshot{ - TenantID: tenant, - ID: "snap-" + tenant, - CreatedAt: ts, - Nodes: json.RawMessage(`[{"name":"` + service + `","marker":"` + tenant + `-marker"}]`), - Edges: json.RawMessage(`[]`), - ServiceCount: 1, - AvgHealthScore: 0.5, - } - if err := repo.DB().Create(&snap).Error; err != nil { - t.Fatalf("seed snapshot for %q: %v", tenant, err) - } } // waitForServiceMaps polls until every seeded tenant's ServiceMap reflects @@ -218,35 +203,6 @@ func waitForServiceMaps(t *testing.T, g *graphrag.GraphRAG, tenants []string) { t.Fatalf("timed out waiting for ServiceMap to reflect ingested spans for %v", tenants) } -// seedInvestigations relies on the in-memory state already being warm -// (see waitForServiceMaps). PersistInvestigation reaches into ImpactAnalysis -// internally, which reads from the per-tenant ServiceStore. 
-func seedInvestigations(t *testing.T, g *graphrag.GraphRAG, ts time.Time) { - t.Helper() - for _, tenant := range allTenants { - service := tenant + "-orders" - chain := graphrag.ErrorChainResult{ - RootCause: &graphrag.RootCauseInfo{ - Service: service, - Operation: tenant + "-op-checkout", - ErrorMessage: tenant + "-marker connection refused upstream", - SpanID: "span-child", - TraceID: "trace-shared", - }, - SpanChain: []graphrag.SpanNode{{ - ID: "span-child", - TraceID: "trace-shared", - Service: service, - Operation: tenant + "-op-checkout", - IsError: true, - Timestamp: ts, - }}, - TraceID: "trace-shared", - } - g.PersistInvestigation(tenant, service, []graphrag.ErrorChainResult{chain}, nil) - } -} - // callTool sends a JSON-RPC tools/call request to the test MCP server // with the given X-Tenant-ID header (omitted when empty) and returns the // inner ToolCallResult — i.e., the structure the LLM client would see. @@ -337,42 +293,21 @@ func truncate(s string) string { return s[:max] + "…(truncated)" } -// TestMCP_TenantIsolation_AllGraphRAGTools is the merge gate for RAN-19. -// For every GraphRAG-backed (and GraphRAG-rewired) MCP tool, it issues -// the same call from three callers — X-Tenant-ID: acme, X-Tenant-ID: beta, -// no header — against overlapping seeded data and asserts each response -// contains only the caller-tenant's data and never leaks another tenant's -// service name, log marker, operation, anomaly, or snapshot row. +// TestMCP_TenantIsolation_AllGraphRAGTools is the merge gate for the 7-tool +// triage MCP surface (post-2026-05-24 reduction). For every kept tool, it +// issues the same call from three callers — X-Tenant-ID: acme, +// X-Tenant-ID: beta, no header — against overlapping seeded data and +// asserts each response contains only the caller-tenant's data and never +// leaks another tenant's service name, log marker, operation, or anomaly. 
func TestMCP_TenantIsolation_AllGraphRAGTools(t *testing.T) { - ts, g, repo, vIdx := setupTenantIsolationServer(t) + ts, g, repo, _ := setupTenantIsolationServer(t) now := time.Now().Add(-time.Minute) // a hair in the past so since=now-15m sees us for _, tenant := range allTenants { - seedTenant(t, g, repo, vIdx, tenant, now) + seedTenant(t, g, repo, nil, tenant, now) } waitForServiceMaps(t, g, allTenants) - seedInvestigations(t, g, now) - - // Resolve investigation IDs per tenant (PersistInvestigation generates - // them internally; we discover them by querying after the fact, then - // hand them back into get_investigation in the per-caller assertions). - invIDsByTenant := map[string]string{} - for _, tenant := range allTenants { - ctx := storage.WithTenantContext(context.Background(), tenant) - invs, err := g.GetInvestigations(ctx, "", "", "", 10) - if err != nil { - t.Fatalf("GetInvestigations(%s): %v", tenant, err) - } - if len(invs) == 0 { - t.Fatalf("expected at least one persisted investigation for %s, got 0", tenant) - } - invIDsByTenant[tenant] = invs[0].ID - } - - // snapshot lookup time — slightly in the future so "<= at" matches every - // seeded row regardless of microsecond drift. 
- snapAt := time.Now().Add(time.Minute).UTC().Format(time.RFC3339) for _, caller := range isolationCallers { caller := caller @@ -385,6 +320,7 @@ func TestMCP_TenantIsolation_AllGraphRAGTools(t *testing.T) { ownLogMarker := caller.scoped + "-marker" ownAnomalyMarker := caller.scoped + "-anomaly-marker" _ = ownMarkers + _ = ownLogMarker // --- in-memory GraphRAG tools --- @@ -400,15 +336,6 @@ func TestMCP_TenantIsolation_AllGraphRAGTools(t *testing.T) { assertNoLeak(t, "get_service_health", body, ownService, leakMarkers) }) - t.Run(caller.name+"/get_error_chains", func(t *testing.T) { - _, body := callTool(t, ts, caller.header, "get_error_chains", map[string]any{ - "service": ownService, - "time_range": "1h", - "limit": 10, - }) - assertNoLeak(t, "get_error_chains", body, ownService, leakMarkers) - }) - t.Run(caller.name+"/trace_graph", func(t *testing.T) { // trace_id collides across tenants; correct routing must surface // only the caller's per-tenant operation/service. @@ -438,74 +365,11 @@ func TestMCP_TenantIsolation_AllGraphRAGTools(t *testing.T) { assertNoLeak(t, "root_cause_analysis", body, ownService, leakMarkers) }) - t.Run(caller.name+"/correlated_signals", func(t *testing.T) { - _, body := callTool(t, ts, caller.header, "correlated_signals", map[string]any{ - "service": ownService, - "time_range": "1h", - }) - // CorrelatedSignals collects logs/metrics for the service, so the - // per-tenant log marker should appear. 
- assertNoLeak(t, "correlated_signals", body, ownLogMarker, leakMarkers) - }) - t.Run(caller.name+"/get_anomaly_timeline", func(t *testing.T) { _, body := callTool(t, ts, caller.header, "get_anomaly_timeline", nil) assertNoLeak(t, "get_anomaly_timeline", body, ownAnomalyMarker, leakMarkers) }) - // --- DB-backed GraphRAG tools --- - - t.Run(caller.name+"/get_investigations", func(t *testing.T) { - _, body := callTool(t, ts, caller.header, "get_investigations", nil) - assertNoLeak(t, "get_investigations", body, ownService, leakMarkers) - }) - - t.Run(caller.name+"/get_investigation_by_id_own_tenant", func(t *testing.T) { - _, body := callTool(t, ts, caller.header, "get_investigation", map[string]any{ - "investigation_id": invIDsByTenant[caller.scoped], - }) - assertNoLeak(t, "get_investigation/own", body, ownService, leakMarkers) - }) - - t.Run(caller.name+"/get_investigation_by_id_other_tenant_blocks", func(t *testing.T) { - // Asking by another tenant's ID must NOT return that row — id- - // guessing would otherwise leak across tenants. The handler - // surfaces a tool-level error result, which is fine; what - // matters is that the foreign tenant's data does not appear. - otherTenant := caller.otherSeeded[0] - _, body := callTool(t, ts, caller.header, "get_investigation", map[string]any{ - "investigation_id": invIDsByTenant[otherTenant], - }) - assertNoLeak(t, "get_investigation/cross-tenant", body, "", leakMarkers) - }) - - t.Run(caller.name+"/get_graph_snapshot", func(t *testing.T) { - _, body := callTool(t, ts, caller.header, "get_graph_snapshot", map[string]any{ - "time": snapAt, - }) - // Snapshot rows are tagged with the tenant marker so the leak - // scan covers both ID prefixes (snap-acme/snap-beta/snap-default) - // and the inline node markers. 
- assertNoLeak(t, "get_graph_snapshot", body, "snap-"+caller.scoped, leakMarkers) - }) - - // --- vectordb-backed tool (Drain path is exercised by ingestion above) --- - - t.Run(caller.name+"/find_similar_logs", func(t *testing.T) { - _, body := callTool(t, ts, caller.header, "find_similar_logs", map[string]any{ - "query": "connection refused upstream", - "limit": 10, - }) - assertNoLeak(t, "find_similar_logs", body, ownLogMarker, leakMarkers) - }) - - // --- Legacy/rewired surface --- - // get_system_graph is rewired onto GraphRAG by RAN-39, so the same - // per-tenant invariants apply. - t.Run(caller.name+"/get_system_graph", func(t *testing.T) { - _, body := callTool(t, ts, caller.header, "get_system_graph", nil) - assertNoLeak(t, "get_system_graph", body, ownService, leakMarkers) - }) } } @@ -605,23 +469,11 @@ func TestMCP_TenantIsolation_DrainClusterIDsStayPerTenant(t *testing.T) { // the assertion above. t.Logf("drain cluster IDs: acme=%v beta=%v", idsA, idsB) - // End-to-end probe: the same isolation must hold via the MCP HTTP - // surface, not just the in-process API. - for _, scoped := range []string{"acme", "beta"} { - _, body := callTool(t, ts, scoped, "correlated_signals", map[string]any{ - "service": sharedService, - "time_range": "1h", - }) - other := "beta" - if scoped == "beta" { - other = "acme" - } - if !strings.Contains(body, scoped+"-marker") { - t.Errorf("%s correlated_signals (HTTP) missing own marker, body=%s", scoped, truncate(body)) - } - if strings.Contains(body, other+"-marker") { - t.Errorf("%s correlated_signals (HTTP) leaked %s marker, body=%s", scoped, other, truncate(body)) - } - } + // Note: the legacy end-to-end probe used the `correlated_signals` MCP + // tool to assert the same isolation across the HTTP transport. That + // tool was cut on 2026-05-24 alongside 13 others; the in-process + // CorrelatedSignals invariant above is still the truth-test for Drain + // + SignalStore tenant partitioning. 
The 7-tool MCP transport invariant + // for the kept tools is covered by TestMCP_TenantIsolation_AllGraphRAGTools. + _ = ts } - diff --git a/internal/mcp/tools.go b/internal/mcp/tools.go index 630b1a8..3753a0e 100644 --- a/internal/mcp/tools.go +++ b/internal/mcp/tools.go @@ -12,135 +12,27 @@ import ( ) const ( - errSvcGraphNotInit = "service graph not yet initialized" errGraphRAGNotInit = "GraphRAG not initialized" errServiceRequired = "service is required" resourceURIPrefix = "OtelContext://" ) -// toolDefs is the canonical list of all tools exposed by the OtelContext MCP server. +// toolDefs is the canonical list of triage-essential tools exposed by the +// OtelContext MCP server. The surface was reduced from 21 to 7 on +// 2026-05-24 so the platform survives 120 services on SQLite — see +// docs/superpowers/specs/2026-05-24-mcp-7tool-sqlite-survival-design.md. var toolDefs = []Tool{ { - Name: "get_system_graph", - Description: "Returns the full service topology with health scores (0-1), error rates, latencies, and dependency edges. Use this to understand overall system health.", - InputSchema: InputSchema{ - Type: "object", - Properties: map[string]Property{ - "time_range": {Type: "string", Description: "Lookback window, e.g. '1h', '30m'. Defaults to '1h'."}, - }, - }, - }, - { - Name: "get_service_health", - Description: "Returns detailed health metrics for a specific service: error rate, latency percentiles, request rate, and active alerts.", - InputSchema: InputSchema{ - Type: "object", - Required: []string{"service_name"}, - Properties: map[string]Property{ - "service_name": {Type: "string", Description: "The service name to query."}, - }, - }, - }, - { - Name: "search_logs", - Description: "Searches log entries by severity, service, body text, trace ID, and time range. Returns id, timestamp, severity, service_name, body, trace_id. **Limited to the last 24 hours** — windows entirely outside the 24h cap are rejected. 
Strongly recommend setting `service` and/or `severity` to scope the search; unscoped keyword queries scan large row counts when FTS5 is disabled (the default). Use severity=ERROR to find errors, query= for full-text search, trace_id= to correlate with a trace. Use page= for pagination.", - InputSchema: InputSchema{ - Type: "object", - Properties: map[string]Property{ - "query": {Type: "string", Description: "Full-text search in log body."}, - "severity": {Type: "string", Description: "Filter by severity level: ERROR, WARN, INFO, DEBUG."}, - "service": {Type: "string", Description: "Filter by service name (exact match)."}, - "trace_id": {Type: "string", Description: "Filter logs belonging to a specific trace ID."}, - "start": {Type: "string", Description: "Start time RFC3339. Defaults to 24h ago. Cannot be earlier than now-24h; older values are clamped."}, - "end": {Type: "string", Description: "End time RFC3339. Defaults to now. Cannot exceed now; future values are clamped."}, - "limit": {Type: "number", Description: "Max results per page (default 50, max 200)."}, - "page": {Type: "number", Description: "Page number for pagination (default 0)."}, - }, - }, - }, - { - Name: "tail_logs", - Description: "Returns the N most recent log entries, optionally filtered by service and/or severity. 
No time range needed — fastest way to see what's happening right now.", - InputSchema: InputSchema{ - Type: "object", - Properties: map[string]Property{ - "service": {Type: "string", Description: "Filter by service name."}, - "severity": {Type: "string", Description: "Filter by severity: ERROR, WARN, INFO, DEBUG."}, - "limit": {Type: "number", Description: "Number of recent entries to return (default 20, max 100)."}, - }, - }, - }, - { - Name: "get_trace", - Description: "Returns full trace detail with all spans for a given trace ID.", - InputSchema: InputSchema{ - Type: "object", - Required: []string{"trace_id"}, - Properties: map[string]Property{ - "trace_id": {Type: "string", Description: "The trace ID to retrieve."}, - }, - }, - }, - { - Name: "search_traces", - Description: "Searches traces by service, status code, minimum duration, and time range.", - InputSchema: InputSchema{ - Type: "object", - Properties: map[string]Property{ - "service": {Type: "string", Description: "Filter by service name."}, - "status": {Type: "string", Description: "Filter by status: OK, ERROR."}, - "min_duration_ms": {Type: "number", Description: "Minimum trace duration in ms."}, - "start": {Type: "string", Description: "Start time RFC3339."}, - "end": {Type: "string", Description: "End time RFC3339."}, - "limit": {Type: "number", Description: "Max results (default 20, max 100)."}, - }, - }, - }, - { - Name: "get_metrics", - Description: "Queries metric time series for a given metric name and optional service.", - InputSchema: InputSchema{ - Type: "object", - Properties: map[string]Property{ - "name": {Type: "string", Description: "Metric name to query."}, - "service": {Type: "string", Description: "Filter by service name."}, - "start": {Type: "string", Description: "Start time RFC3339."}, - "end": {Type: "string", Description: "End time RFC3339."}, - }, - }, - }, - { - Name: "get_dashboard_stats", - Description: "Returns dashboard summary: total requests, error rate, avg latency, 
ingestion rate, and per-service breakdown.", - InputSchema: InputSchema{ - Type: "object", - Properties: map[string]Property{ - "start": {Type: "string", Description: "Start time RFC3339. Defaults to 1h ago."}, - "end": {Type: "string", Description: "End time RFC3339. Defaults to now."}, - }, - }, - }, - { - Name: "get_storage_status", - Description: "Returns hot DB size, DLQ size, and database health.", - InputSchema: InputSchema{Type: "object"}, - }, - { - Name: "find_similar_logs", - Description: "Finds logs semantically similar to a query text using TF-IDF vector similarity. Useful for clustering errors and finding root causes.", + Name: "get_anomaly_timeline", + Description: "Returns recent anomalies with temporal causal links, optionally filtered by service. The triage entry point — answers \"what's wrong right now\".", InputSchema: InputSchema{ Type: "object", Properties: map[string]Property{ - "query": {Type: "string", Description: "Text query to find similar logs."}, - "limit": {Type: "number", Description: "Max results (default 10)."}, + "since": {Type: "string", Description: "Start time RFC3339. Defaults to 1h ago."}, + "service": {Type: "string", Description: "Filter by service."}, }, }, }, - { - Name: "get_alerts", - Description: "Returns active alerts and anomalies: services with high error rates, p99 latency spikes, and degraded health scores.", - InputSchema: InputSchema{Type: "object"}, - }, { Name: "get_service_map", Description: "Returns the service topology with health scores, error rates, call counts, and dependency edges. Powered by the live GraphRAG.", @@ -153,38 +45,13 @@ var toolDefs = []Tool{ }, }, { - Name: "get_error_chains", - Description: "Traces recent error spans upstream to identify root cause services. 
Returns span path, root cause service/operation, and correlated error logs.", - InputSchema: InputSchema{ - Type: "object", - Required: []string{"service"}, - Properties: map[string]Property{ - "service": {Type: "string", Description: "Service experiencing errors."}, - "time_range": {Type: "string", Description: "Lookback window, e.g. '5m', '1h'. Defaults to '15m'."}, - "limit": {Type: "number", Description: "Max error chains to return (default 10)."}, - }, - }, - }, - { - Name: "trace_graph", - Description: "Returns the full span tree for a trace with service names, durations, errors, and linked logs.", - InputSchema: InputSchema{ - Type: "object", - Required: []string{"trace_id"}, - Properties: map[string]Property{ - "trace_id": {Type: "string", Description: "The trace ID to visualize."}, - }, - }, - }, - { - Name: "impact_analysis", - Description: "BFS downstream from a service to find all affected services and impact scores.", + Name: "get_service_health", + Description: "Returns detailed health metrics for a specific service: error rate, latency percentiles, request rate, and active alerts.", InputSchema: InputSchema{ Type: "object", - Required: []string{"service"}, + Required: []string{"service_name"}, Properties: map[string]Property{ - "service": {Type: "string", Description: "Service to analyze blast radius for."}, - "depth": {Type: "number", Description: "Max traversal depth (default 5)."}, + "service_name": {Type: "string", Description: "The service name to query."}, }, }, }, @@ -201,60 +68,42 @@ var toolDefs = []Tool{ }, }, { - Name: "correlated_signals", - Description: "All related signals for a service: error logs, metric anomalies, traces, and investigations.", + Name: "impact_analysis", + Description: "BFS downstream from a service to find all affected services and impact scores.", InputSchema: InputSchema{ Type: "object", Required: []string{"service"}, Properties: map[string]Property{ - "service": {Type: "string", Description: "Service to gather 
signals for."}, - "time_range": {Type: "string", Description: "Lookback window. Defaults to '1h'."}, - }, - }, - }, - { - Name: "get_investigations", - Description: "Lists persisted investigation records from automated error analysis.", - InputSchema: InputSchema{ - Type: "object", - Properties: map[string]Property{ - "service": {Type: "string", Description: "Filter by service."}, - "severity": {Type: "string", Description: "Filter: critical, warning, info."}, - "status": {Type: "string", Description: "Filter: detected, triaged, resolved."}, - "limit": {Type: "number", Description: "Max results (default 20)."}, - }, - }, - }, - { - Name: "get_investigation", - Description: "Returns a full investigation record with causal chain, evidence, and affected services.", - InputSchema: InputSchema{ - Type: "object", - Required: []string{"investigation_id"}, - Properties: map[string]Property{ - "investigation_id": {Type: "string", Description: "The investigation ID."}, + "service": {Type: "string", Description: "Service to analyze blast radius for."}, + "depth": {Type: "number", Description: "Max traversal depth (default 5)."}, }, }, }, { - Name: "get_graph_snapshot", - Description: "Returns the historical service topology closest to the requested time.", + Name: "trace_graph", + Description: "Returns the full span tree for a trace with service names, durations, errors, and linked logs.", InputSchema: InputSchema{ Type: "object", - Required: []string{"time"}, + Required: []string{"trace_id"}, Properties: map[string]Property{ - "time": {Type: "string", Description: "RFC3339 timestamp to query the snapshot for."}, + "trace_id": {Type: "string", Description: "The trace ID to visualize."}, }, }, }, { - Name: "get_anomaly_timeline", - Description: "Returns recent anomalies with temporal causal links, optionally filtered by service.", + Name: "search_logs", + Description: "Searches log entries by severity, service, body text, trace ID, and time range. 
Returns id, timestamp, severity, service_name, body, trace_id. **Limited to the last 24 hours** — windows entirely outside the 24h cap are rejected. Strongly recommend setting `service` and/or `severity` to scope the search; unscoped keyword queries scan large row counts when FTS5 is disabled. Use severity=ERROR to find errors, query= for full-text search, trace_id= to correlate with a trace. Use page= for pagination.", InputSchema: InputSchema{ Type: "object", Properties: map[string]Property{ - "since": {Type: "string", Description: "Start time RFC3339. Defaults to 1h ago."}, - "service": {Type: "string", Description: "Filter by service."}, + "query": {Type: "string", Description: "Full-text search in log body."}, + "severity": {Type: "string", Description: "Filter by severity level: ERROR, WARN, INFO, DEBUG."}, + "service": {Type: "string", Description: "Filter by service name (exact match)."}, + "trace_id": {Type: "string", Description: "Filter logs belonging to a specific trace ID."}, + "start": {Type: "string", Description: "Start time RFC3339. Defaults to 24h ago. Cannot be earlier than now-24h; older values are clamped."}, + "end": {Type: "string", Description: "End time RFC3339. Defaults to now. 
Cannot exceed now; future values are clamped."}, + "limit": {Type: "number", Description: "Max results per page (default 50, max 200)."}, + "page": {Type: "number", Description: "Page number for pagination (default 0)."}, }, }, }, @@ -293,48 +142,20 @@ func (s *Server) toolHandler(ctx context.Context, name string, args map[string]a s.metrics.MCPToolInvocationsTotal.WithLabelValues(name, status).Inc() }() switch name { - case "get_system_graph": - return s.toolGetSystemGraph(ctx, args) - case "get_service_health": - return s.toolGetServiceHealth(ctx, args) - case "search_logs": - return s.toolSearchLogs(ctx, args) - case "tail_logs": - return s.toolTailLogs(ctx, args) - case "get_trace": - return s.toolGetTrace(ctx, args) - case "search_traces": - return s.toolSearchTraces(ctx, args) - case "get_metrics": - return s.toolGetMetrics(ctx, args) - case "get_dashboard_stats": - return s.toolGetDashboardStats(ctx, args) - case "get_storage_status": - return s.toolGetStorageStatus() - case "find_similar_logs": - return s.toolFindSimilarLogs(ctx, args) - case "get_alerts": - return s.toolGetAlerts() + case "get_anomaly_timeline": + return s.toolGetAnomalyTimeline(ctx, args) case "get_service_map": return s.toolGetServiceMap(ctx, args) - case "get_error_chains": - return s.toolGetErrorChains(ctx, args) - case "trace_graph": - return s.toolTraceGraph(ctx, args) - case "impact_analysis": - return s.toolImpactAnalysis(ctx, args) + case "get_service_health": + return s.toolGetServiceHealth(ctx, args) case "root_cause_analysis": return s.toolRootCauseAnalysis(ctx, args) - case "correlated_signals": - return s.toolCorrelatedSignals(ctx, args) - case "get_investigations": - return s.toolGetInvestigations(ctx, args) - case "get_investigation": - return s.toolGetInvestigationByID(ctx, args) - case "get_graph_snapshot": - return s.toolGetGraphSnapshot(ctx, args) - case "get_anomaly_timeline": - return s.toolGetAnomalyTimeline(ctx, args) + case "impact_analysis": + return 
s.toolImpactAnalysis(ctx, args) + case "trace_graph": + return s.toolTraceGraph(ctx, args) + case "search_logs": + return s.toolSearchLogs(ctx, args) default: return errorResult(fmt.Sprintf("unknown tool: %s", name)) } @@ -342,72 +163,26 @@ func (s *Server) toolHandler(ctx context.Context, name string, args map[string]a // --- Tool implementations --- -// toolGetSystemGraph returns a tenant-scoped service topology snapshot. -// -// When GraphRAG is wired (the default in production) the response is built -// from its per-tenant ServiceMap and AllServiceEdges, so two tenants with -// overlapping service names cannot see each other's nodes or edges. The -// legacy *graph.Graph remains as a fallback for boot windows when GraphRAG -// is still warming up; that fallback is cross-tenant by construction and -// is the documented legacy code path called out in RAN-39. -func (s *Server) toolGetSystemGraph(ctx context.Context, _ map[string]any) ToolCallResult { - if s.graphRAG != nil { - entries := s.graphRAG.ServiceMap(mcpCtx(ctx), 0) - edges := s.graphRAG.AllServiceEdges(mcpCtx(ctx)) - payload := map[string]any{ - "services": entries, - "edges": edges, - } - data, err := json.MarshalIndent(payload, "", " ") - if err != nil { - return errorResult(fmt.Sprintf("failed to marshal system graph: %v", err)) - } - return textResult(string(data)) - } - if s.svcGraph == nil { - return errorResult(errSvcGraphNotInit) - } - snap := s.svcGraph.Snapshot() - data, err := json.MarshalIndent(snap, "", " ") - if err != nil { - return errorResult(fmt.Sprintf("failed to marshal system graph: %v", err)) - } - return textResult(string(data)) -} - // toolGetServiceHealth returns the ServiceMap entry for svcName scoped to -// the tenant on ctx. Falls back to the legacy svcGraph snapshot when -// GraphRAG is not yet wired. +// the tenant on ctx. 
func (s *Server) toolGetServiceHealth(ctx context.Context, args map[string]any) ToolCallResult { svcName, _ := args["service_name"].(string) if svcName == "" { return errorResult("service_name is required") } - if s.graphRAG != nil { - for _, entry := range s.graphRAG.ServiceMap(mcpCtx(ctx), 0) { - if entry.Service != nil && entry.Service.Name == svcName { - data, err := json.MarshalIndent(entry, "", " ") - if err != nil { - return errorResult(fmt.Sprintf("failed to marshal service health: %v", err)) - } - return textResult(string(data)) + if s.graphRAG == nil { + return errorResult(errGraphRAGNotInit) + } + for _, entry := range s.graphRAG.ServiceMap(mcpCtx(ctx), 0) { + if entry.Service != nil && entry.Service.Name == svcName { + data, err := json.MarshalIndent(entry, "", " ") + if err != nil { + return errorResult(fmt.Sprintf("failed to marshal service health: %v", err)) } + return textResult(string(data)) } - return textResult(fmt.Sprintf("service %q not found in the current tenant window", svcName)) - } - if s.svcGraph == nil { - return errorResult(errSvcGraphNotInit) } - snap := s.svcGraph.Snapshot() - node, ok := snap.Nodes[svcName] - if !ok { - return textResult(fmt.Sprintf("service %q not found in the current graph window", svcName)) - } - data, err := json.MarshalIndent(node, "", " ") - if err != nil { - return errorResult(fmt.Sprintf("failed to marshal service health: %v", err)) - } - return textResult(string(data)) + return textResult(fmt.Sprintf("service %q not found in the current tenant window", svcName)) } // logSummary is a lean projection of storage.Log for AI consumption. 
@@ -496,193 +271,6 @@ func (s *Server) toolSearchLogs(ctx context.Context, args map[string]any) ToolCa return resourceResult(resourceURIPrefix+"logs/search", httpconst.ContentTypeJSON, string(data)) } -func (s *Server) toolTailLogs(ctx context.Context, args map[string]any) ToolCallResult { - limit := argInt(args, "limit", 20) - if limit > 100 { - limit = 100 - } - - filter := storage.LogFilter{ - EndTime: time.Now(), - Limit: limit, - } - if v, ok := args["service"].(string); ok && v != "" { - filter.ServiceName = v - } - if v, ok := args["severity"].(string); ok && v != "" { - filter.Severity = v - } - - logs, _, err := s.repo.GetLogsV2(mcpCtx(ctx), filter) - if err != nil { - return errorResult(fmt.Sprintf("tail_logs failed: %v", err)) - } - data, err := json.MarshalIndent(toLogSummaries(logs), "", " ") - if err != nil { - return errorResult(fmt.Sprintf("failed to marshal tail results: %v", err)) - } - return resourceResult(resourceURIPrefix+"logs/tail", httpconst.ContentTypeJSON, string(data)) -} - -func (s *Server) toolGetTrace(ctx context.Context, args map[string]any) ToolCallResult { - traceID, _ := args["trace_id"].(string) - if traceID == "" { - return errorResult("trace_id is required") - } - trace, err := s.repo.GetTrace(mcpCtx(ctx), traceID) - if err != nil { - return errorResult(fmt.Sprintf("get_trace failed: %v", err)) - } - data, err := json.MarshalIndent(trace, "", " ") - if err != nil { - return errorResult(fmt.Sprintf("failed to marshal trace: %v", err)) - } - return resourceResult(resourceURIPrefix+"traces/"+traceID, httpconst.ContentTypeJSON, string(data)) -} - -func (s *Server) toolSearchTraces(ctx context.Context, args map[string]any) ToolCallResult { - end := time.Now() - start := end.Add(-1 * time.Hour) - parseTime(args, "start", &start) - parseTime(args, "end", &end) - - limit := argInt(args, "limit", 20) - if limit > 100 { - limit = 100 - } - - svcName, _ := args["service"].(string) - status, _ := args["status"].(string) - search := "" - - 
var services []string - if svcName != "" { - services = []string{svcName} - } - - resp, err := s.repo.GetTracesFiltered(mcpCtx(ctx), start, end, services, status, search, limit, 0, "timestamp", "desc") - if err != nil { - return errorResult(fmt.Sprintf("search_traces failed: %v", err)) - } - data, err := json.MarshalIndent(resp, "", " ") - if err != nil { - return errorResult(fmt.Sprintf("failed to marshal trace search results: %v", err)) - } - return resourceResult(resourceURIPrefix+"traces/search", httpconst.ContentTypeJSON, string(data)) -} - -func (s *Server) toolGetMetrics(ctx context.Context, args map[string]any) ToolCallResult { - end := time.Now() - start := end.Add(-1 * time.Hour) - parseTime(args, "start", &start) - parseTime(args, "end", &end) - - metricName, _ := args["name"].(string) - svcName, _ := args["service"].(string) - - buckets, err := s.repo.GetMetricBuckets(mcpCtx(ctx), start, end, svcName, metricName) - if err != nil { - return errorResult(fmt.Sprintf("get_metrics failed: %v", err)) - } - data, err := json.MarshalIndent(buckets, "", " ") - if err != nil { - return errorResult(fmt.Sprintf("failed to marshal metrics: %v", err)) - } - return resourceResult(resourceURIPrefix+"metrics/query", httpconst.ContentTypeJSON, string(data)) -} - -func (s *Server) toolGetDashboardStats(ctx context.Context, args map[string]any) ToolCallResult { - end := time.Now() - start := end.Add(-1 * time.Hour) - parseTime(args, "start", &start) - parseTime(args, "end", &end) - - stats, err := s.repo.GetDashboardStats(mcpCtx(ctx), start, end, nil) - if err != nil { - return errorResult(fmt.Sprintf("get_dashboard_stats failed: %v", err)) - } - data, err := json.MarshalIndent(stats, "", " ") - if err != nil { - return errorResult(fmt.Sprintf("failed to marshal dashboard stats: %v", err)) - } - return textResult(string(data)) -} - -func (s *Server) toolGetStorageStatus() ToolCallResult { - health := s.metrics.GetHealthStats() - result := map[string]any{ - 
"hot_db_size_mb": float64(s.repo.HotDBSizeBytes()) / 1024 / 1024, - "dlq_size_files": health.DLQSize, - "active_conns": health.ActiveConns, - "goroutines": health.Goroutines, - "heap_alloc_mb": health.HeapAllocMB, - "uptime_seconds": health.UptimeSeconds, - "ingestion_total": health.IngestionRate, - "db_latency_p99_ms": health.DBLatencyP99Ms, - } - data, err := json.MarshalIndent(result, "", " ") - if err != nil { - return errorResult(fmt.Sprintf("failed to marshal storage status: %v", err)) - } - return textResult(string(data)) -} - -// toolFindSimilarLogs returns logs semantically similar to the query text -// scoped to the tenant resolved from the MCP transport (X-Tenant-ID header or -// the server's default tenant). Cross-tenant rows are never returned. -func (s *Server) toolFindSimilarLogs(ctx context.Context, args map[string]any) ToolCallResult { - query, _ := args["query"].(string) - if query == "" { - return errorResult("query is required") - } - limit := argInt(args, "limit", 20) - if limit > 100 { - limit = 100 - } - if s.vectorIdx == nil { - return errorResult("vector index not yet initialized") - } - tenant := storage.TenantFromContext(mcpCtx(ctx)) - results := s.vectorIdx.Search(tenant, query, limit) - data, err := json.MarshalIndent(results, "", " ") - if err != nil { - return errorResult(fmt.Sprintf("failed to marshal similar logs: %v", err)) - } - return textResult(string(data)) -} - -func (s *Server) toolGetAlerts() ToolCallResult { - if s.svcGraph == nil { - return errorResult(errSvcGraphNotInit) - } - snap := s.svcGraph.Snapshot() - type alertEntry struct { - Service string `json:"service"` - Status string `json:"status"` - Score float64 `json:"health_score"` - Alerts []string `json:"alerts"` - } - var entries []alertEntry - for _, n := range snap.Nodes { - if len(n.Alerts) > 0 || n.Status != "healthy" { - entries = append(entries, alertEntry{ - Service: n.Name, - Status: n.Status, - Score: n.HealthScore, - Alerts: n.Alerts, - }) - } - } - if 
len(entries) == 0 { - return textResult("No active alerts. All services are healthy.") - } - data, err := json.MarshalIndent(entries, "", " ") - if err != nil { - return errorResult(fmt.Sprintf("failed to marshal alerts: %v", err)) - } - return textResult(string(data)) -} - // --- GraphRAG Tool implementations --- func (s *Server) toolGetServiceMap(ctx context.Context, args map[string]any) ToolCallResult { @@ -698,26 +286,6 @@ func (s *Server) toolGetServiceMap(ctx context.Context, args map[string]any) Too return textResult(string(data)) } -func (s *Server) toolGetErrorChains(ctx context.Context, args map[string]any) ToolCallResult { - if s.graphRAG == nil { - return errorResult(errGraphRAGNotInit) - } - svcName, _ := args["service"].(string) - if svcName == "" { - return errorResult(errServiceRequired) - } - since := time.Now().Add(-15 * time.Minute) - parseTimeRange(args, "time_range", &since) - limit := argInt(args, "limit", 10) - - chains := s.graphRAG.ErrorChain(mcpCtx(ctx), svcName, since, limit) - data, err := json.MarshalIndent(chains, "", " ") - if err != nil { - return errorResult(fmt.Sprintf("failed to marshal error chains: %v", err)) - } - return textResult(string(data)) -} - func (s *Server) toolTraceGraph(ctx context.Context, args map[string]any) ToolCallResult { if s.graphRAG == nil { return errorResult(errGraphRAGNotInit) @@ -782,84 +350,6 @@ func (s *Server) toolRootCauseAnalysis(ctx context.Context, args map[string]any) return textResult(string(data)) } -func (s *Server) toolCorrelatedSignals(ctx context.Context, args map[string]any) ToolCallResult { - if s.graphRAG == nil { - return errorResult(errGraphRAGNotInit) - } - svcName, _ := args["service"].(string) - if svcName == "" { - return errorResult(errServiceRequired) - } - since := time.Now().Add(-1 * time.Hour) - parseTimeRange(args, "time_range", &since) - - result := s.graphRAG.CorrelatedSignals(mcpCtx(ctx), svcName, since) - data, err := json.MarshalIndent(result, "", " ") - if err != nil { 
- return errorResult(fmt.Sprintf("failed to marshal correlated signals: %v", err)) - } - return textResult(string(data)) -} - -func (s *Server) toolGetInvestigations(ctx context.Context, args map[string]any) ToolCallResult { - if s.graphRAG == nil { - return errorResult(errGraphRAGNotInit) - } - service, _ := args["service"].(string) - severity, _ := args["severity"].(string) - status, _ := args["status"].(string) - limit := argInt(args, "limit", 20) - - investigations, err := s.graphRAG.GetInvestigations(ctx, service, severity, status, limit) - if err != nil { - return errorResult(fmt.Sprintf("failed to query investigations: %v", err)) - } - data, err := json.MarshalIndent(investigations, "", " ") - if err != nil { - return errorResult(fmt.Sprintf("failed to marshal investigations: %v", err)) - } - return textResult(string(data)) -} - -func (s *Server) toolGetInvestigationByID(ctx context.Context, args map[string]any) ToolCallResult { - if s.graphRAG == nil { - return errorResult(errGraphRAGNotInit) - } - id, _ := args["investigation_id"].(string) - if id == "" { - return errorResult("investigation_id is required") - } - inv, err := s.graphRAG.GetInvestigation(ctx, id) - if err != nil { - return errorResult(fmt.Sprintf("investigation not found: %v", err)) - } - data, err := json.MarshalIndent(inv, "", " ") - if err != nil { - return errorResult(fmt.Sprintf("failed to marshal investigation: %v", err)) - } - return textResult(string(data)) -} - -func (s *Server) toolGetGraphSnapshot(ctx context.Context, args map[string]any) ToolCallResult { - if s.graphRAG == nil { - return errorResult(errGraphRAGNotInit) - } - var at time.Time - parseTime(args, "time", &at) - if at.IsZero() { - at = time.Now() - } - snap, err := s.graphRAG.GetGraphSnapshot(ctx, at) - if err != nil { - return errorResult(fmt.Sprintf("no snapshot found: %v", err)) - } - data, err := json.MarshalIndent(snap, "", " ") - if err != nil { - return errorResult(fmt.Sprintf("failed to marshal snapshot: %v", 
err)) - } - return textResult(string(data)) -} - func (s *Server) toolGetAnomalyTimeline(ctx context.Context, args map[string]any) ToolCallResult { if s.graphRAG == nil { return errorResult(errGraphRAGNotInit) @@ -893,9 +383,9 @@ func parseTimeRange(args map[string]any, key string, since *time.Time) { // --- Helpers --- // MaxToolResponseBytes caps the rendered length of any tool response. Without -// this, get_trace / get_graph_snapshot / correlated_signals can produce -// 100MB+ JSON on adversarial input, OOM the process, and stall every -// concurrent MCP call until MCP_CALL_TIMEOUT_MS fires. +// this, large in-memory GraphRAG dumps can produce 100MB+ JSON on adversarial +// input, OOM the process, and stall every concurrent MCP call until +// MCP_CALL_TIMEOUT_MS fires. // // The cap is intentionally set well above any legitimate row-capped tool // response (search_logs at 200 rows is typically <1 MB) so it triggers only diff --git a/internal/mcp/tools_ran20_test.go b/internal/mcp/tools_ran20_test.go deleted file mode 100644 index 7477ae5..0000000 --- a/internal/mcp/tools_ran20_test.go +++ /dev/null @@ -1,79 +0,0 @@ -package mcp - -import ( - "context" - "strings" - "testing" - - "github.com/RandomCodeSpace/otelcontext/internal/storage" - "github.com/RandomCodeSpace/otelcontext/internal/vectordb" -) - -// TestFindSimilarLogs_TenantIsolation is the RAN-20 acceptance bar for the MCP -// surface. Two tenants with unique marker strings in their log bodies query -// find_similar_logs; each tenant's response must never contain the other's -// markers. 
-func TestFindSimilarLogs_TenantIsolation(t *testing.T) { - idx := vectordb.New(1_000) - idx.Add(101, "acme", "checkout", "ERROR", "payment gateway timeout acme-secret-charge-id-abc") - idx.Add(102, "acme", "checkout", "ERROR", "payment gateway refused acme-only-marker-xyz") - idx.Add(201, "globex", "auth", "ERROR", "payment gateway token expired globex-secret-session-123") - idx.Add(202, "globex", "auth", "ERROR", "payment gateway 500 internal globex-only-marker-qqq") - - srv := &Server{vectorIdx: idx, defaultTenant: storage.DefaultTenantID} - args := map[string]any{"query": "payment gateway", "limit": float64(50)} - - // Acme - acmeRes := srv.toolFindSimilarLogs(storage.WithTenantContext(context.Background(), "acme"), args) - if acmeRes.IsError { - t.Fatalf("acme call errored: %+v", acmeRes) - } - acmeBody := concatContent(acmeRes.Content) - for _, forbidden := range []string{"globex-secret-session-123", "globex-only-marker-qqq", `"LogID": 201`, `"LogID": 202`} { - if strings.Contains(acmeBody, forbidden) { - t.Fatalf("acme leaked globex content %q in body:\n%s", forbidden, acmeBody) - } - } - if !strings.Contains(acmeBody, "acme-secret-charge-id-abc") && !strings.Contains(acmeBody, "acme-only-marker-xyz") { - t.Fatalf("acme did not receive its own rows:\n%s", acmeBody) - } - - // Globex - gRes := srv.toolFindSimilarLogs(storage.WithTenantContext(context.Background(), "globex"), args) - if gRes.IsError { - t.Fatalf("globex call errored: %+v", gRes) - } - gBody := concatContent(gRes.Content) - for _, forbidden := range []string{"acme-secret-charge-id-abc", "acme-only-marker-xyz", `"LogID": 101`, `"LogID": 102`} { - if strings.Contains(gBody, forbidden) { - t.Fatalf("globex leaked acme content %q in body:\n%s", forbidden, gBody) - } - } -} - -// TestFindSimilarLogs_NoTenantFallsBackToDefault proves that a context with no -// tenant value is coerced to the server default — it must NOT bleed into -// another tenant's rows. 
-func TestFindSimilarLogs_NoTenantFallsBackToDefault(t *testing.T) { - idx := vectordb.New(100) - idx.Add(1, "acme", "svc", "ERROR", "acme secret body only") - - srv := &Server{vectorIdx: idx, defaultTenant: storage.DefaultTenantID} - args := map[string]any{"query": "secret body"} - - res := srv.toolFindSimilarLogs(context.Background(), args) - if res.IsError { - t.Fatalf("unexpected error: %+v", res) - } - if strings.Contains(concatContent(res.Content), "acme secret body only") { - t.Fatalf("no-tenant call leaked acme content:\n%s", concatContent(res.Content)) - } -} - -func concatContent(items []ContentItem) string { - var b strings.Builder - for _, c := range items { - b.WriteString(c.Text) - } - return b.String() -} From 2521663a067aa4e831ea8a4fa126b15802471af9 Mon Sep 17 00:00:00 2001 From: Amit Kumar Date: Sun, 24 May 2026 18:52:58 +0000 Subject: [PATCH 02/11] refactor(vectordb): drop package and TF-IDF semantic similarity path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The vectordb package was a pure-Go TF-IDF index for semantic log search, backing one MCP tool (find_similar_logs, cut in the prior commit) and one HTTP endpoint (/api/logs/similar). With the kept search_logs MCP tool already routing through SQLite FTS5 / pg_trgm GIN, the in-memory TF-IDF index is no longer reachable by any survivor. Removing it reclaims ~5-15% of resident heap on a 120-service SQLite deployment that the maxSize=100000 index + 5-minute snapshot loop + startup ReplayFromDB hydrator otherwise consume — heap pressure that contributes to the OOM-within-an-hour failure mode this refactor is solving for. 
Deletions: - internal/vectordb/ — index.go, snapshot.go, replay.go + tests - internal/api/similar_handler.go + test — the /api/logs/similar route - internal/storage/log_repo_replay_test.go + LogsForVectorReplay() and ListRecentHighSeverityLogsAllTenants() (only the vectordb hydrator read these; no other caller) - internal/graphrag/clustering.go::SimilarErrors() — vectordb-dependent, no production caller; Drain template clustering is the survivor - Vector* fields on telemetry.Metrics + RecordVector* observer methods - VectorIndexMaxEntries / VectorIndexSnapshotPath / VectorIndexSnapshotInterval on config.Config Signature changes: - graphrag.New(repo, tsdbAgg, ringBuf, cfg) — vectordb arg removed - mcp.New(defaultTenant, repo, metrics, svcGraph) — vectordb arg removed - ui.NewServer(repo, metrics, topo) — vectordb arg removed - api.Server.SetVectorIndex removed Operator migration: - The data/vectordb.snapshot file is left in place on disk; the loader that read it at boot is deleted, so it becomes a stale file that is safe to remove by hand. No automatic cleanup. - MCP clients calling find_similar_logs already receive "unknown tool" after the prior commit; the HTTP /api/logs/similar route now 404s. 
--- internal/api/server.go | 22 +- internal/api/similar_handler.go | 44 --- internal/api/similar_handler_test.go | 117 -------- internal/config/config.go | 20 -- internal/graphrag/builder.go | 15 +- internal/graphrag/builder_test.go | 6 +- internal/graphrag/clustering.go | 88 +----- internal/graphrag/migrate_test.go | 2 +- internal/mcp/robustness_test.go | 2 +- internal/mcp/server.go | 8 +- internal/mcp/server_ran22_test.go | 6 +- internal/mcp/tenant_isolation_test.go | 15 +- internal/storage/log_repo.go | 55 ---- internal/storage/log_repo_replay_test.go | 138 ---------- internal/telemetry/metrics.go | 75 ----- internal/ui/ui.go | 9 +- internal/vectordb/index.go | 334 ----------------------- internal/vectordb/index_test.go | 136 --------- internal/vectordb/replay.go | 74 ----- internal/vectordb/replay_test.go | 161 ----------- internal/vectordb/snapshot.go | 317 --------------------- internal/vectordb/snapshot_test.go | 325 ---------------------- main.go | 109 +------- 23 files changed, 43 insertions(+), 2035 deletions(-) delete mode 100644 internal/api/similar_handler.go delete mode 100644 internal/api/similar_handler_test.go delete mode 100644 internal/storage/log_repo_replay_test.go delete mode 100644 internal/vectordb/index.go delete mode 100644 internal/vectordb/index_test.go delete mode 100644 internal/vectordb/replay.go delete mode 100644 internal/vectordb/replay_test.go delete mode 100644 internal/vectordb/snapshot.go delete mode 100644 internal/vectordb/snapshot_test.go diff --git a/internal/api/server.go b/internal/api/server.go index 4cd6318..1f838d0 100644 --- a/internal/api/server.go +++ b/internal/api/server.go @@ -10,19 +10,17 @@ import ( "github.com/RandomCodeSpace/otelcontext/internal/realtime" "github.com/RandomCodeSpace/otelcontext/internal/storage" "github.com/RandomCodeSpace/otelcontext/internal/telemetry" - "github.com/RandomCodeSpace/otelcontext/internal/vectordb" ) // Server handles HTTP API requests. 
type Server struct { - repo *storage.Repository - hub *realtime.Hub - eventHub *realtime.EventHub - metrics *telemetry.Metrics - cache *cache.TTLCache - graph *graph.Graph // in-memory service dependency graph (may be nil before first build) - graphRAG *graphrag.GraphRAG // layered GraphRAG for advanced queries - vectorIdx *vectordb.Index // TF-IDF semantic log search index + repo *storage.Repository + hub *realtime.Hub + eventHub *realtime.EventHub + metrics *telemetry.Metrics + cache *cache.TTLCache + graph *graph.Graph // in-memory service dependency graph (may be nil before first build) + graphRAG *graphrag.GraphRAG // layered GraphRAG for advanced queries // Saturation probes consulted by /ready. Each returns a fullness // fraction in [0.0, 1.0]; nil disables the corresponding check. @@ -53,11 +51,6 @@ func (s *Server) SetGraphRAG(g *graphrag.GraphRAG) { s.graphRAG = g } -// SetVectorIndex wires the TF-IDF vector index for semantic log search. -func (s *Server) SetVectorIndex(idx *vectordb.Index) { - s.vectorIdx = idx -} - // SetDLQSaturationProbe registers a callback returning DLQ disk fullness as // a fraction in [0.0, 1.0]. Used by /ready to flip to 503 when DLQ is at // risk of FIFO-evicting unflushed batches. Pass nil to disable the check. 
@@ -96,7 +89,6 @@ func (s *Server) RegisterRoutes(mux *http.ServeMux) { // Logs mux.HandleFunc("GET /api/logs", s.handleGetLogs) mux.HandleFunc("GET /api/logs/context", s.handleGetLogContext) - mux.HandleFunc("GET /api/logs/similar", s.handleGetSimilarLogs) mux.HandleFunc("GET /api/logs/{id}/insight", s.handleGetLogInsight) // Admin & System diff --git a/internal/api/similar_handler.go b/internal/api/similar_handler.go deleted file mode 100644 index ac0fe57..0000000 --- a/internal/api/similar_handler.go +++ /dev/null @@ -1,44 +0,0 @@ -package api - -import ( - "encoding/json" - "net/http" - "strconv" - - "github.com/RandomCodeSpace/otelcontext/internal/storage" -) - -// handleGetSimilarLogs handles GET /api/logs/similar?q=&limit=10 -// Returns logs semantically similar to the query string using TF-IDF cosine similarity. -func (s *Server) handleGetSimilarLogs(w http.ResponseWriter, r *http.Request) { - if s.vectorIdx == nil { - http.Error(w, "vector index not initialized", http.StatusServiceUnavailable) - return - } - - query := r.URL.Query().Get("q") - if query == "" { - http.Error(w, "q parameter is required", http.StatusBadRequest) - return - } - - limit := 10 - if lStr := r.URL.Query().Get("limit"); lStr != "" { - if n, err := strconv.Atoi(lStr); err == nil && n > 0 { - limit = n - } - } - if limit > 50 { - limit = 50 - } - - tenant := storage.TenantFromContext(r.Context()) - results := s.vectorIdx.Search(tenant, query, limit) - - w.Header().Set("Content-Type", "application/json") - _ = json.NewEncoder(w).Encode(map[string]any{ - "query": query, - "count": len(results), - "results": results, - }) -} diff --git a/internal/api/similar_handler_test.go b/internal/api/similar_handler_test.go deleted file mode 100644 index 69af324..0000000 --- a/internal/api/similar_handler_test.go +++ /dev/null @@ -1,117 +0,0 @@ -package api - -import ( - "encoding/json" - "net/http" - "net/http/httptest" - "net/url" - "testing" - - 
"github.com/RandomCodeSpace/otelcontext/internal/config" - "github.com/RandomCodeSpace/otelcontext/internal/vectordb" -) - -// TestSimilarHandler_TenantIsolation is the RAN-20 acceptance bar for the HTTP -// surface. Two tenants with distinct corpora query /api/logs/similar; each -// sees ZERO rows belonging to the other tenant. -func TestSimilarHandler_TenantIsolation(t *testing.T) { - idx := vectordb.New(1_000) - idx.Add(101, "acme", "checkout", "ERROR", "payment gateway timeout charging customer") - idx.Add(102, "acme", "checkout", "ERROR", "payment gateway refused charge insufficient funds") - idx.Add(201, "globex", "auth", "ERROR", "payment gateway token expired for session") - idx.Add(202, "globex", "auth", "ERROR", "payment gateway 500 internal error while authenticating") - - srv := &Server{vectorIdx: idx} - mux := http.NewServeMux() - mux.HandleFunc("GET /api/logs/similar", srv.handleGetSimilarLogs) - handler := TenantMiddleware(&config.Config{DefaultTenant: "default"})(mux) - - acmeIDs := map[float64]bool{101: true, 102: true} - globexIDs := map[float64]bool{201: true, 202: true} - - q := url.Values{} - q.Set("q", "payment gateway") - q.Set("limit", "50") - path := "/api/logs/similar?" 
+ q.Encode() - - // Tenant A - aRec := httptest.NewRecorder() - aReq := httptest.NewRequest(http.MethodGet, path, nil) - aReq.Header.Set(TenantHeader, "acme") - handler.ServeHTTP(aRec, aReq) - if aRec.Code != http.StatusOK { - t.Fatalf("acme: want 200, got %d body=%q", aRec.Code, aRec.Body.String()) - } - acme := decodeResults(t, aRec) - if len(acme) == 0 { - t.Fatalf("acme got zero hits despite matching corpus") - } - for _, r := range acme { - if !acmeIDs[r.ID] { - t.Fatalf("acme leaked cross-tenant id=%v tenant=%q body=%q", r.ID, r.Tenant, r.Body) - } - } - - // Tenant B - gRec := httptest.NewRecorder() - gReq := httptest.NewRequest(http.MethodGet, path, nil) - gReq.Header.Set(TenantHeader, "globex") - handler.ServeHTTP(gRec, gReq) - if gRec.Code != http.StatusOK { - t.Fatalf("globex: want 200, got %d", gRec.Code) - } - globex := decodeResults(t, gRec) - if len(globex) == 0 { - t.Fatalf("globex got zero hits despite matching corpus") - } - for _, r := range globex { - if !globexIDs[r.ID] { - t.Fatalf("globex leaked cross-tenant id=%v tenant=%q body=%q", r.ID, r.Tenant, r.Body) - } - } -} - -// TestSimilarHandler_UnknownTenantReturnsEmpty confirms a request bearing an -// unknown tenant header returns zero results — the handler must not silently -// fall back to another tenant's rows. 
-func TestSimilarHandler_UnknownTenantReturnsEmpty(t *testing.T) { - idx := vectordb.New(100) - idx.Add(1, "acme", "svc", "ERROR", "database connection refused upstream") - - srv := &Server{vectorIdx: idx} - mux := http.NewServeMux() - mux.HandleFunc("GET /api/logs/similar", srv.handleGetSimilarLogs) - handler := TenantMiddleware(&config.Config{DefaultTenant: "default"})(mux) - - rec := httptest.NewRecorder() - req := httptest.NewRequest(http.MethodGet, "/api/logs/similar?q=database+connection", nil) - req.Header.Set(TenantHeader, "initech") - handler.ServeHTTP(rec, req) - - if rec.Code != http.StatusOK { - t.Fatalf("want 200, got %d", rec.Code) - } - if r := decodeResults(t, rec); len(r) != 0 { - t.Fatalf("unknown tenant saw %d cross-tenant hits", len(r)) - } -} - -type similarResult struct { - ID float64 `json:"LogID"` - Tenant string `json:"Tenant"` - ServiceName string `json:"ServiceName"` - Severity string `json:"Severity"` - Body string `json:"Body"` - Score float64 `json:"Score"` -} - -func decodeResults(t *testing.T, rec *httptest.ResponseRecorder) []similarResult { - t.Helper() - var env struct { - Results []similarResult `json:"results"` - } - if err := json.Unmarshal(rec.Body.Bytes(), &env); err != nil { - t.Fatalf("decode response: %v (body=%q)", err, rec.Body.String()) - } - return env.Results -} diff --git a/internal/config/config.go b/internal/config/config.go index 68c6423..ab6817d 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -114,21 +114,6 @@ type Config struct { // Compression CompressionLevel string // "default", "fast", "best" - // Vector Index - VectorIndexMaxEntries int - - // VectorIndexSnapshotPath is the on-disk location for periodic vectordb - // snapshots. When empty, persistence is disabled and the index rebuilds - // from DB on every restart (legacy behaviour). Default - // "data/vectordb.snapshot". - VectorIndexSnapshotPath string - - // VectorIndexSnapshotInterval, e.g. "5m". 
When set and - // VectorIndexSnapshotPath is non-empty, the index serializes its state - // to disk on this cadence. "0" / empty disables periodic writes (a - // final snapshot still fires on graceful shutdown). Default "5m". - VectorIndexSnapshotInterval string - // LogFTSEnabled toggles SQLite FTS5 provisioning + querying. The FTS5 // inverted index typically consumes 30-40% of SQLite DB disk for // log-heavy workloads, while the LIKE fallback (log_repo.go:105) keeps @@ -302,11 +287,6 @@ func Load(customPath string) (*Config, error) { // Compression CompressionLevel: getEnv("COMPRESSION_LEVEL", "default"), - // Vector - VectorIndexMaxEntries: getEnvInt("VECTOR_INDEX_MAX_ENTRIES", 100000), - VectorIndexSnapshotPath: getEnv("VECTOR_INDEX_SNAPSHOT_PATH", "data/vectordb.snapshot"), - VectorIndexSnapshotInterval: getEnv("VECTOR_INDEX_SNAPSHOT_INTERVAL", "5m"), - // Log search FTS5 toggle (SQLite only). Default off — see field comment. LogFTSEnabled: parseTruthy(getEnv("LOG_FTS_ENABLED", "")), diff --git a/internal/graphrag/builder.go b/internal/graphrag/builder.go index 8be781f..e2b058e 100644 --- a/internal/graphrag/builder.go +++ b/internal/graphrag/builder.go @@ -11,7 +11,6 @@ import ( "github.com/RandomCodeSpace/otelcontext/internal/storage" "github.com/RandomCodeSpace/otelcontext/internal/telemetry" "github.com/RandomCodeSpace/otelcontext/internal/tsdb" - "github.com/RandomCodeSpace/otelcontext/internal/vectordb" ) // panicMetrics is an optional hook for incrementing the panics-recovered @@ -90,10 +89,9 @@ type GraphRAG struct { tenants map[string]*tenantStores tenantsMu sync.RWMutex - repo *storage.Repository - vectorIdx *vectordb.Index - tsdbAgg *tsdb.Aggregator - ringBuf *tsdb.RingBuffer + repo *storage.Repository + tsdbAgg *tsdb.Aggregator + ringBuf *tsdb.RingBuffer drain *Drain // Drain log-template miner (see drain.go) @@ -206,7 +204,11 @@ func DefaultConfig() Config { } // New creates a new GraphRAG coordinator. 
-func New(repo *storage.Repository, vectorIdx *vectordb.Index, tsdbAgg *tsdb.Aggregator, ringBuf *tsdb.RingBuffer, cfg Config) *GraphRAG { +// +// The vectordb-backed semantic similarity path was removed on 2026-05-24 +// along with the find_similar_logs MCP tool — log clustering now relies +// solely on the Drain template miner (see drain.go). +func New(repo *storage.Repository, tsdbAgg *tsdb.Aggregator, ringBuf *tsdb.RingBuffer, cfg Config) *GraphRAG { if cfg.TraceTTL == 0 { cfg.TraceTTL = defaultTraceTTL } @@ -229,7 +231,6 @@ func New(repo *storage.Repository, vectorIdx *vectordb.Index, tsdbAgg *tsdb.Aggr g := &GraphRAG{ tenants: make(map[string]*tenantStores), repo: repo, - vectorIdx: vectorIdx, tsdbAgg: tsdbAgg, ringBuf: ringBuf, drain: NewDrain(), diff --git a/internal/graphrag/builder_test.go b/internal/graphrag/builder_test.go index 3639c96..5c26222 100644 --- a/internal/graphrag/builder_test.go +++ b/internal/graphrag/builder_test.go @@ -30,7 +30,7 @@ func newTestRepo(t *testing.T) *storage.Repository { // events asynchronously; tests must call Stop() via t.Cleanup. func newTestGraphRAG(t *testing.T) *GraphRAG { t.Helper() - g := New(nil, nil, nil, nil, DefaultConfig()) + g := New(nil, nil, nil, DefaultConfig()) // Start only the event workers — the background refresh/snapshot/anomaly // loops require a repo, which this helper intentionally does not wire. ctx, cancel := context.WithCancel(context.Background()) @@ -112,7 +112,7 @@ func TestRefresh_PopulatesErrorCountFromDBStatus(t *testing.T) { // Build GraphRAG with the seeded repo, skip starting background loops; // invoke the rebuild path directly. 
- g := New(repo, nil, nil, nil, DefaultConfig()) + g := New(repo, nil, nil, DefaultConfig()) t.Cleanup(g.Stop) g.rebuildAllTenantsFromDB(context.Background()) @@ -134,7 +134,7 @@ func TestRefresh_PopulatesErrorCountFromDBStatus(t *testing.T) { func TestOnSpanIngested_DropsIncrementMetric(t *testing.T) { // Build a GraphRAG WITHOUT starting any event workers so the channel // fills up and overflows. - g := New(nil, nil, nil, nil, DefaultConfig()) + g := New(nil, nil, nil, DefaultConfig()) t.Cleanup(g.Stop) // Fill the buffer beyond capacity. Use the package constant so the test diff --git a/internal/graphrag/clustering.go b/internal/graphrag/clustering.go index 574a6ec..2745c7e 100644 --- a/internal/graphrag/clustering.go +++ b/internal/graphrag/clustering.go @@ -1,16 +1,12 @@ package graphrag -// Log clustering is now performed by the Drain template miner (see drain.go). +// Log clustering is performed by the Drain template miner (see drain.go). // processLog() in builder.go calls GraphRAG.clusterLog() which delegates to -// the shared *Drain instance. The vectordb.Index (TF-IDF) is still used for -// SimilarErrors — similarity search across mined templates. +// the shared *Drain instance. import ( - "context" "fmt" "time" - - "github.com/RandomCodeSpace/otelcontext/internal/storage" ) // clusterLog runs the log body through Drain and upserts a LogClusterNode @@ -53,83 +49,3 @@ func (g *GraphRAG) clusterLog(stores *tenantStores, service, body, severity stri ) return clusterID } - -// SimilarErrors finds log clusters similar to a given cluster using the vector -// index, scoped to the tenant carried on ctx. Cross-tenant hits are impossible -// because the underlying vectordb partitions docs per tenant and this lookup -// resolves the SignalStore through storesFor(ctx). 
-func (g *GraphRAG) SimilarErrors(ctx context.Context, clusterID string, k int) []LogClusterNode { - if k <= 0 { - k = 10 - } - - stores := g.storesFor(ctx) - - stores.signals.mu.RLock() - cluster, ok := stores.signals.LogClusters[clusterID] - stores.signals.mu.RUnlock() - if !ok { - return nil - } - - // Use vectordb to find similar logs based on the mined template. - if g.vectorIdx == nil { - return nil - } - query := cluster.Template - if query == "" && len(cluster.TemplateTokens) > 0 { - query = joinTokens(cluster.TemplateTokens) - } - // vectordb.Index.Search takes the tenant string directly; resolve it - // from ctx via the same storage helper used by storesFor so both sides - // agree on coercion rules (empty → DefaultTenantID). - tenant := storage.TenantFromContext(ctx) - results := g.vectorIdx.Search(tenant, query, k*2) // over-fetch to filter - - // Map results back to log clusters. - seen := map[string]bool{clusterID: true} - var similar []LogClusterNode - - stores.signals.mu.RLock() - defer stores.signals.mu.RUnlock() - - for _, r := range results { - for _, lc := range stores.signals.LogClusters { - if seen[lc.ID] { - continue - } - for _, e := range stores.signals.Edges { - if e.Type == EdgeEmittedBy && e.FromID == lc.ID && e.ToID == r.ServiceName { - seen[lc.ID] = true - similar = append(similar, *lc) - break - } - } - if len(similar) >= k { - break - } - } - if len(similar) >= k { - break - } - } - - return similar -} - -// joinTokens is a tiny helper to avoid importing strings in this file's -// hot path; equivalent to strings.Join(tokens, " "). -func joinTokens(tokens []string) string { - n := 0 - for _, t := range tokens { - n += len(t) + 1 - } - b := make([]byte, 0, n) - for i, t := range tokens { - if i > 0 { - b = append(b, ' ') - } - b = append(b, t...) 
- } - return string(b) -} diff --git a/internal/graphrag/migrate_test.go b/internal/graphrag/migrate_test.go index 2cc5df8..30762f6 100644 --- a/internal/graphrag/migrate_test.go +++ b/internal/graphrag/migrate_test.go @@ -31,7 +31,7 @@ func newTestGraphRAGWithDB(t *testing.T) (*GraphRAG, *gorm.DB) { t.Helper() db := newTestGraphRAGDB(t) repo := storage.NewRepositoryFromDB(db, "sqlite") - g := New(repo, nil, nil, nil, DefaultConfig()) + g := New(repo, nil, nil, DefaultConfig()) t.Cleanup(func() { g.Stop() }) return g, db } diff --git a/internal/mcp/robustness_test.go b/internal/mcp/robustness_test.go index 285d0f7..b5b3e53 100644 --- a/internal/mcp/robustness_test.go +++ b/internal/mcp/robustness_test.go @@ -20,7 +20,7 @@ import ( // not the tool internals. func minimalServer(t *testing.T) *Server { t.Helper() - return New("default", nil, nil, nil, nil) + return New("default", nil, nil, nil) } // jsonRPCCallToolBody marshals a tools/call envelope for a fake tool name. diff --git a/internal/mcp/server.go b/internal/mcp/server.go index b1feb68..c331eac 100644 --- a/internal/mcp/server.go +++ b/internal/mcp/server.go @@ -17,7 +17,6 @@ import ( "github.com/RandomCodeSpace/otelcontext/internal/httpconst" "github.com/RandomCodeSpace/otelcontext/internal/storage" "github.com/RandomCodeSpace/otelcontext/internal/telemetry" - "github.com/RandomCodeSpace/otelcontext/internal/vectordb" ) const ( @@ -71,7 +70,6 @@ type Server struct { repo *storage.Repository metrics *telemetry.Metrics svcGraph *graph.Graph - vectorIdx *vectordb.Index graphRAG *graphrag.GraphRAG defaultTenant string @@ -99,12 +97,15 @@ type Server struct { // storage.DefaultTenantID. Required at construction time so production startup // cannot accidentally drop cfg.DefaultTenant — a missing argument is a compile // error rather than a silent regression. 
+// +// The vectordb-backed semantic similarity argument was removed on 2026-05-24 +// when find_similar_logs was cut from the MCP surface and the vectordb package +// was deleted. func New( defaultTenant string, repo *storage.Repository, metrics *telemetry.Metrics, svcGraph *graph.Graph, - vectorIdx *vectordb.Index, ) *Server { if defaultTenant == "" { defaultTenant = storage.DefaultTenantID @@ -113,7 +114,6 @@ func New( repo: repo, metrics: metrics, svcGraph: svcGraph, - vectorIdx: vectorIdx, defaultTenant: defaultTenant, callSlots: make(chan struct{}, defaultMaxConcurrentCalls), callTimeout: defaultCallTimeout, diff --git a/internal/mcp/server_ran22_test.go b/internal/mcp/server_ran22_test.go index dbf020f..769fb69 100644 --- a/internal/mcp/server_ran22_test.go +++ b/internal/mcp/server_ran22_test.go @@ -18,19 +18,19 @@ import ( // no-header caller). func TestNew_DefaultTenant_FromConstructor(t *testing.T) { t.Run("empty falls back to storage.DefaultTenantID", func(t *testing.T) { - srv := New("", nil, nil, nil, nil) + srv := New("", nil, nil, nil) if srv.defaultTenant != storage.DefaultTenantID { t.Fatalf(`New("") defaultTenant = %q, want %q`, srv.defaultTenant, storage.DefaultTenantID) } }) t.Run("non-empty value is preserved", func(t *testing.T) { - srv := New("acme", nil, nil, nil, nil) + srv := New("acme", nil, nil, nil) if srv.defaultTenant != "acme" { t.Fatalf(`New("acme") defaultTenant = %q, want "acme"`, srv.defaultTenant) } }) t.Run("SetDefaultTenant runtime override still works", func(t *testing.T) { - srv := New("acme", nil, nil, nil, nil) + srv := New("acme", nil, nil, nil) srv.SetDefaultTenant("globex") if srv.defaultTenant != "globex" { t.Fatalf(`SetDefaultTenant("globex") defaultTenant = %q, want "globex"`, srv.defaultTenant) diff --git a/internal/mcp/tenant_isolation_test.go b/internal/mcp/tenant_isolation_test.go index 6ae90f3..d9c537f 100644 --- a/internal/mcp/tenant_isolation_test.go +++ b/internal/mcp/tenant_isolation_test.go @@ -19,7 +19,6 
@@ import ( "github.com/RandomCodeSpace/otelcontext/internal/graphrag" "github.com/RandomCodeSpace/otelcontext/internal/storage" - "github.com/RandomCodeSpace/otelcontext/internal/vectordb" ) // tenants exercised by the test. The third row uses an empty header to @@ -67,7 +66,7 @@ func markersFor(scoped string, others []string) (own []string, leak []string) { // snapshot, and anomaly loops are stretched to "never" inside the test // window so the only state that lands in the stores is the data the test // seeds explicitly — making leak assertions deterministic. -func setupTenantIsolationServer(t *testing.T) (*httptest.Server, *graphrag.GraphRAG, *storage.Repository, *vectordb.Index) { +func setupTenantIsolationServer(t *testing.T) (*httptest.Server, *graphrag.GraphRAG, *storage.Repository) { t.Helper() db, err := storage.NewDatabase("sqlite", ":memory:") @@ -82,19 +81,17 @@ func setupTenantIsolationServer(t *testing.T) (*httptest.Server, *graphrag.Graph } repo := storage.NewRepositoryFromDB(db, "sqlite") - vIdx := vectordb.New(1000) - cfg := graphrag.DefaultConfig() cfg.RefreshEvery = 24 * time.Hour cfg.SnapshotEvery = 24 * time.Hour cfg.AnomalyEvery = 24 * time.Hour cfg.WorkerCount = 4 - g := graphrag.New(repo, vIdx, nil, nil, cfg) + g := graphrag.New(repo, nil, nil, cfg) bgCtx, cancel := context.WithCancel(context.Background()) go g.Start(bgCtx) - srv := New("", repo, nil, nil, vIdx) + srv := New("", repo, nil, nil) srv.SetGraphRAG(g) httpSrv := httptest.NewServer(srv.Handler()) @@ -106,7 +103,7 @@ func setupTenantIsolationServer(t *testing.T) (*httptest.Server, *graphrag.Graph _ = repo.Close() }) - return httpSrv, g, repo, vIdx + return httpSrv, g, repo } // seedTenant ingests a small but representative slice of telemetry for @@ -300,7 +297,7 @@ func truncate(s string) string { // asserts each response contains only the caller-tenant's data and never // leaks another tenant's service name, log marker, operation, or anomaly. 
func TestMCP_TenantIsolation_AllGraphRAGTools(t *testing.T) { - ts, g, repo, _ := setupTenantIsolationServer(t) + ts, g, repo := setupTenantIsolationServer(t) now := time.Now().Add(-time.Minute) // a hair in the past so since=now-15m sees us @@ -382,7 +379,7 @@ func TestMCP_TenantIsolation_AllGraphRAGTools(t *testing.T) { // CorrelatedSignals (not just the response text) and asserts each tenant // only ever sees rows tagged with its own marker. func TestMCP_TenantIsolation_DrainClusterIDsStayPerTenant(t *testing.T) { - ts, g, _, _ := setupTenantIsolationServer(t) + ts, g, _ := setupTenantIsolationServer(t) now := time.Now().Add(-time.Minute) // Identical service AND identical log template across tenants — Drain diff --git a/internal/storage/log_repo.go b/internal/storage/log_repo.go index e26c2ad..83985ba 100644 --- a/internal/storage/log_repo.go +++ b/internal/storage/log_repo.go @@ -206,61 +206,6 @@ func (r *Repository) UpdateLogInsight(ctx context.Context, logID uint, insight s return nil } -// LogsForVectorReplay returns ERROR/WARN-family logs with id > sinceID, -// page-bounded by limit and ordered by id ASC. Used at startup by the -// vector-index tail-replay path to pick up DB rows inserted after the last -// snapshot. The id-ascending order lets the caller use the last row's id -// as the next page's sinceID — clean cursor pagination, no offset cost. -// -// Cross-tenant by design: vectordb is a global index with per-doc tenant -// tags enforced at Search time. Not exposed on any tenant-scoped API. -// -// Severity filter is intentionally narrow (ERROR / WARN / WARNING / FATAL / -// CRITICAL) so non-indexed rows don't waste page space; this matches -// vectordb.shouldIndex(). -func (r *Repository) LogsForVectorReplay(ctx context.Context, sinceID uint, limit int) ([]Log, error) { - if limit <= 0 || limit > 100_000 { - limit = 10_000 - } - var logs []Log - err := r.db.WithContext(ctx). - Where("id > ? 
AND severity IN ?", sinceID, []string{"ERROR", "WARN", "WARNING", "FATAL", "CRITICAL"}). - Order("id ASC"). - Limit(limit). - Find(&logs).Error - if err != nil { - return nil, fmt.Errorf("logs for vector replay: %w", err) - } - return logs, nil -} - -// ListRecentHighSeverityLogsAllTenants returns recent logs of the given -// severity across EVERY tenant, each row carrying its own TenantID. This is an -// administrative read used exclusively by the vector index's startup -// hydration path, which fans rows out to per-tenant shards. It is not exposed -// on any tenant-scoped API surface — tenant isolation for read paths must -// otherwise be preserved via the context-driven WHERE clause. -func (r *Repository) ListRecentHighSeverityLogsAllTenants(ctx context.Context, severity string, since, until time.Time, limit int) ([]Log, error) { - if limit <= 0 { - limit = 5000 - } - q := r.db.WithContext(ctx).Model(&Log{}) - if severity != "" { - q = q.Where(sqlWhereSeverity, severity) - } - if !since.IsZero() { - q = q.Where(sqlWhereTimestampGTE, since) - } - if !until.IsZero() { - q = q.Where(sqlWhereTimestampLTE, until) - } - var logs []Log - if err := q.Order(sqlOrderTimestampDesc).Limit(limit).Find(&logs).Error; err != nil { - return nil, fmt.Errorf("failed to list recent logs all tenants: %w", err) - } - return logs, nil -} - // PurgeLogs deletes logs older than the given timestamp in a single statement. // Suitable for SQLite; for Postgres at large retention volumes prefer PurgeLogsBatched. 
func (r *Repository) PurgeLogs(olderThan time.Time) (int64, error) { diff --git a/internal/storage/log_repo_replay_test.go b/internal/storage/log_repo_replay_test.go deleted file mode 100644 index 5b60cac..0000000 --- a/internal/storage/log_repo_replay_test.go +++ /dev/null @@ -1,138 +0,0 @@ -package storage - -import ( - "context" - "testing" - "time" -) - -// TestLogsForVectorReplay_ReturnsErrorAndWarnOnly verifies the severity -// filter matches vectordb.shouldIndex (ERROR/WARN/WARNING/FATAL/CRITICAL). -// INFO and DEBUG rows must be excluded so the page isn't bloated with rows -// vectordb would drop anyway. -func TestLogsForVectorReplay_ReturnsErrorAndWarnOnly(t *testing.T) { - repo := newTestRepo(t) - now := time.Now().UTC() - rows := []Log{ - {TenantID: "default", Severity: "ERROR", Body: "panic", ServiceName: "svc", Timestamp: now}, - {TenantID: "default", Severity: "WARN", Body: "slow", ServiceName: "svc", Timestamp: now}, - {TenantID: "default", Severity: "WARNING", Body: "deprecated", ServiceName: "svc", Timestamp: now}, - {TenantID: "default", Severity: "FATAL", Body: "OOM", ServiceName: "svc", Timestamp: now}, - {TenantID: "default", Severity: "CRITICAL", Body: "deadlock", ServiceName: "svc", Timestamp: now}, - {TenantID: "default", Severity: "INFO", Body: "request handled", ServiceName: "svc", Timestamp: now}, - {TenantID: "default", Severity: "DEBUG", Body: "trace data", ServiceName: "svc", Timestamp: now}, - } - if err := repo.db.Create(&rows).Error; err != nil { - t.Fatalf("seed: %v", err) - } - - got, err := repo.LogsForVectorReplay(context.Background(), 0, 100) - if err != nil { - t.Fatalf("LogsForVectorReplay: %v", err) - } - if len(got) != 5 { - t.Errorf("got %d rows, want 5 (ERROR+WARN+WARNING+FATAL+CRITICAL)", len(got)) - } - for _, l := range got { - if l.Severity == "INFO" || l.Severity == "DEBUG" { - t.Errorf("unexpected severity in result: %q (id=%d)", l.Severity, l.ID) - } - } -} - -// TestLogsForVectorReplay_RespectsSinceID verifies the 
cursor pagination -// contract: rows with id <= sinceID are excluded so the caller can advance -// across pages without re-fetching. -func TestLogsForVectorReplay_RespectsSinceID(t *testing.T) { - repo := newTestRepo(t) - now := time.Now().UTC() - for range 5 { - repo.db.Create(&Log{TenantID: "default", Severity: "ERROR", Body: "x", ServiceName: "svc", Timestamp: now}) - } - - page1, err := repo.LogsForVectorReplay(context.Background(), 0, 2) - if err != nil { - t.Fatalf("page1: %v", err) - } - if len(page1) != 2 { - t.Fatalf("page1: got %d rows, want 2", len(page1)) - } - // IDs must be strictly ascending. - if page1[0].ID >= page1[1].ID { - t.Errorf("page1 not ascending: %d, %d", page1[0].ID, page1[1].ID) - } - - page2, err := repo.LogsForVectorReplay(context.Background(), page1[1].ID, 2) - if err != nil { - t.Fatalf("page2: %v", err) - } - if len(page2) != 2 { - t.Fatalf("page2: got %d rows, want 2", len(page2)) - } - for _, r := range page2 { - if r.ID <= page1[1].ID { - t.Errorf("page2 contains id=%d <= page1 cursor=%d", r.ID, page1[1].ID) - } - } - - page3, err := repo.LogsForVectorReplay(context.Background(), page2[1].ID, 2) - if err != nil { - t.Fatalf("page3: %v", err) - } - if len(page3) != 1 { - t.Errorf("page3: got %d rows, want 1 (final partial page)", len(page3)) - } -} - -// TestLogsForVectorReplay_CrossTenant verifies the replay is intentionally -// cross-tenant — vectordb is a global accelerator and per-doc tenant tags -// enforce isolation at Search time. -func TestLogsForVectorReplay_CrossTenant(t *testing.T) { - repo := newTestRepo(t) - now := time.Now().UTC() - repo.db.Create(&[]Log{ - {TenantID: "acme", Severity: "ERROR", Body: "a", ServiceName: "svc", Timestamp: now}, - {TenantID: "globex", Severity: "ERROR", Body: "b", ServiceName: "svc", Timestamp: now}, - {TenantID: "default", Severity: "ERROR", Body: "c", ServiceName: "svc", Timestamp: now}, - }) - - // No tenant context — replay is cross-tenant by design. 
- got, err := repo.LogsForVectorReplay(context.Background(), 0, 100) - if err != nil { - t.Fatalf("LogsForVectorReplay: %v", err) - } - if len(got) != 3 { - t.Errorf("got %d rows across tenants, want 3", len(got)) - } - tenants := map[string]int{} - for _, l := range got { - tenants[l.TenantID]++ - } - for _, name := range []string{"acme", "globex", "default"} { - if tenants[name] != 1 { - t.Errorf("tenant %q: got %d rows, want 1", name, tenants[name]) - } - } -} - -// TestLogsForVectorReplay_LimitClamp verifies the limit is clamped to a -// safe default when caller passes 0 / negative / absurdly large values. -func TestLogsForVectorReplay_LimitClamp(t *testing.T) { - repo := newTestRepo(t) - now := time.Now().UTC() - for range 3 { - repo.db.Create(&Log{TenantID: "default", Severity: "ERROR", Body: "x", ServiceName: "svc", Timestamp: now}) - } - - for _, lim := range []int{0, -1, 999_999} { - got, err := repo.LogsForVectorReplay(context.Background(), 0, lim) - if err != nil { - t.Errorf("limit=%d: unexpected err=%v", lim, err) - continue - } - // 3 rows seeded; default cap is 10k, so all 3 must come back. - if len(got) != 3 { - t.Errorf("limit=%d: got %d rows, want 3", lim, len(got)) - } - } -} diff --git a/internal/telemetry/metrics.go b/internal/telemetry/metrics.go index e6a3d54..44f0afc 100644 --- a/internal/telemetry/metrics.go +++ b/internal/telemetry/metrics.go @@ -127,26 +127,6 @@ type Metrics struct { // --- Dashboard p99 (Task 10) --- DashboardP99RowCapHitsTotal prometheus.Counter - // --- Vectordb persistence --- - // VectorSnapshotWritesTotal counts snapshot write attempts, labeled - // {result=success|failure}. Alert on rate(failure[10m]) > 0. - VectorSnapshotWritesTotal *prometheus.CounterVec - // VectorSnapshotDurationSeconds is the WriteSnapshot wall-clock - // duration. Histogram so operators can SLO p95 / p99. 
- VectorSnapshotDurationSeconds prometheus.Histogram - // VectorSnapshotSizeBytes gauges the on-disk size of the latest - // successful snapshot. Sudden growth signals a maxSize bump or a - // schema change worth investigating. - VectorSnapshotSizeBytes prometheus.Gauge - // VectorSnapshotLoadTotal counts startup snapshot loads, labeled - // {result=success|missing|corrupt}. corrupt = magic/version/crc/decode - // failure — caller falls back to a full DB rebuild. - VectorSnapshotLoadTotal *prometheus.CounterVec - // VectorReplayLogsTotal accumulates rows processed by ReplayFromDB - // across the daemon's lifetime. The rate spikes only at startup - // (catching the snapshot→now gap), then stays flat. - VectorReplayLogsTotal prometheus.Counter - // Atomic counters for JSON health endpoint (avoids scraping Prometheus) totalIngested atomic.Int64 activeConns atomic.Int64 @@ -391,64 +371,9 @@ func New() *Metrics { Name: "otelcontext_dashboard_p99_row_cap_hits_total", Help: "Number of dashboard p99 computations that hit the SQLite row cap (200k). Indicates the dataset is too large for in-memory p99 — use Postgres for prod.", }) - m.VectorSnapshotWritesTotal = promauto.NewCounterVec(prometheus.CounterOpts{ - Name: "otelcontext_vectordb_snapshot_writes_total", - Help: "Vectordb snapshot write attempts by result (success|failure). 
Alert on rate(...{result=\"failure\"}[10m]) > 0.", - }, []string{"result"}) - m.VectorSnapshotDurationSeconds = promauto.NewHistogram(prometheus.HistogramOpts{ - Name: "otelcontext_vectordb_snapshot_duration_seconds", - Help: "Wall-clock duration of WriteSnapshot, including encode + atomic rename.", - Buckets: []float64{.005, .01, .025, .05, .1, .25, .5, 1, 2.5, 5}, - }) - m.VectorSnapshotSizeBytes = promauto.NewGauge(prometheus.GaugeOpts{ - Name: "otelcontext_vectordb_snapshot_size_bytes", - Help: "On-disk size of the latest successful vectordb snapshot.", - }) - m.VectorSnapshotLoadTotal = promauto.NewCounterVec(prometheus.CounterOpts{ - Name: "otelcontext_vectordb_snapshot_load_total", - Help: "Vectordb snapshot load attempts at startup by result (success|missing|corrupt).", - }, []string{"result"}) - m.VectorReplayLogsTotal = promauto.NewCounter(prometheus.CounterOpts{ - Name: "otelcontext_vectordb_replay_logs_total", - Help: "Total log rows processed by vectordb ReplayFromDB across the daemon's lifetime.", - }) return m } -// RecordVectorSnapshotWrite is the observer hook the vectordb snapshot -// path calls after each WriteSnapshot attempt. result is "success" or -// "failure"; size is the on-disk byte count after a successful rename -// (zero on failure). -func (m *Metrics) RecordVectorSnapshotWrite(result string, duration time.Duration, size int64) { - if m == nil || m.VectorSnapshotWritesTotal == nil { - return - } - m.VectorSnapshotWritesTotal.WithLabelValues(result).Inc() - m.VectorSnapshotDurationSeconds.Observe(duration.Seconds()) - if result == "success" && size > 0 { - m.VectorSnapshotSizeBytes.Set(float64(size)) - } -} - -// RecordVectorSnapshotLoad is the observer hook for startup snapshot -// loads. result is "success", "missing" (first start, no prior file), -// or "corrupt" (any decode/CRC/version error → full rebuild fallback). 
-func (m *Metrics) RecordVectorSnapshotLoad(result string) { - if m == nil || m.VectorSnapshotLoadTotal == nil { - return - } - m.VectorSnapshotLoadTotal.WithLabelValues(result).Inc() -} - -// RecordVectorReplayLogs adds rows processed by ReplayFromDB to the -// lifetime counter. Called once after the startup tail-replay completes. -func (m *Metrics) RecordVectorReplayLogs(count int) { - if m == nil || m.VectorReplayLogsTotal == nil || count <= 0 { - return - } - m.VectorReplayLogsTotal.Add(float64(count)) -} - // StartRuntimeMetrics samples Go runtime stats every 15 seconds. func (m *Metrics) StartRuntimeMetrics() { go func() { diff --git a/internal/ui/ui.go b/internal/ui/ui.go index b8b813d..1b1d9d8 100644 --- a/internal/ui/ui.go +++ b/internal/ui/ui.go @@ -11,7 +11,6 @@ import ( "github.com/RandomCodeSpace/otelcontext/internal/graph" "github.com/RandomCodeSpace/otelcontext/internal/storage" "github.com/RandomCodeSpace/otelcontext/internal/telemetry" - "github.com/RandomCodeSpace/otelcontext/internal/vectordb" ) // spaFS wraps an fs.FS so http.FileServer transparently serves index.html @@ -48,17 +47,19 @@ type Server struct { repo *storage.Repository metrics *telemetry.Metrics topo *graph.Graph - vidx *vectordb.Index mcpEnabled bool mcpPath string } -func NewServer(repo *storage.Repository, metrics *telemetry.Metrics, topo *graph.Graph, vidx *vectordb.Index) *Server { +// NewServer constructs the embedded-UI server. +// +// The vectordb argument was removed on 2026-05-24 when the vectordb package +// was deleted alongside the find_similar_logs MCP tool cut. 
+func NewServer(repo *storage.Repository, metrics *telemetry.Metrics, topo *graph.Graph) *Server { return &Server{ repo: repo, metrics: metrics, topo: topo, - vidx: vidx, mcpPath: "/mcp", } } diff --git a/internal/vectordb/index.go b/internal/vectordb/index.go deleted file mode 100644 index 13777f4..0000000 --- a/internal/vectordb/index.go +++ /dev/null @@ -1,334 +0,0 @@ -// Package vectordb provides an embedded TF-IDF / cosine-similarity vector index -// for semantic log search. It is a pure-Go, no-CGO, in-process accelerator. -// The relational DB remains the source of truth; this index is fully rebuildable. -package vectordb - -import ( - "math" - "sort" - "strings" - "sync" - "time" - "unicode" -) - -// defaultTenantID is the tenant assigned when the caller passes an empty -// tenant string. Mirrors storage.DefaultTenantID; duplicated here to avoid -// pulling internal/storage into vectordb's import graph. -const defaultTenantID = "default" - -// LogVector represents an indexed log entry. -// -// Tenant scopes the document so Search can return only the caller's tenant -// rows. The TF-IDF table is shared across tenants — global IDF still gives -// the right rarity signal — but the per-document tenant tag is enforced at -// query time so two tenants with overlapping log bodies stay isolated. -// -// All fields are exported so encoding/gob can serialize the type for -// snapshot persistence (snapshot.go). Vec is the per-doc TF map (term → -// frequency); IDF is held separately on the Index to avoid duplicating -// rarity weights across documents. -type LogVector struct { - LogID uint - Tenant string - ServiceName string - Severity string - Body string - Vec map[string]float64 // TF-IDF sparse vector -} - -// SearchResult is a single similarity hit. 
-type SearchResult struct { - LogID uint - Tenant string - ServiceName string - Severity string - Body string - Score float64 // cosine similarity 0.0–1.0 -} - -// Index is a thread-safe in-memory TF-IDF vector index for log bodies. -// Only ERROR and WARN logs are indexed to keep it small and relevant. -// -// lastIndexedID records the highest Log.ID Add() has accepted. Persisted -// in the snapshot so a startup tail-replay can pick up DB rows newer than -// this watermark without re-indexing rows already in the snapshot. Tracked -// only for rows that pass shouldIndex(); INFO/DEBUG rows interleaved in -// the same ID range are excluded by the severity filter on replay anyway. -type Index struct { - mu sync.RWMutex - docs []LogVector // indexed log vectors - idf map[string]float64 // global IDF table - maxSize int // FIFO eviction cap - dirty bool // IDF needs recompute - lastIndexedID uint // high watermark of indexed Log.ID - - // snapshotObserver is invoked at the end of each WriteSnapshot - // (success or failure). nil-safe — set via SetSnapshotObserver from - // the wiring layer so vectordb stays free of telemetry imports. - snapshotObserver func(result string, duration time.Duration, size int64) -} - -// New creates a new Index with the given maximum entry cap. -func New(maxSize int) *Index { - if maxSize <= 0 { - maxSize = 100_000 - } - return &Index{ - maxSize: maxSize, - idf: make(map[string]float64), - } -} - -// Add adds a log to the index. Thread-safe. Tenant is recorded with the -// document so Search can filter by it; an empty tenant collapses to -// the platform default at the boundary, matching storage.TenantFromContext. 
-func (idx *Index) Add(logID uint, tenant, serviceName, severity, body string) { - if !shouldIndex(severity) { - return - } - tokens := tokenize(body) - if len(tokens) == 0 { - return - } - tf := computeTF(tokens) - - if tenant == "" { - tenant = defaultTenantID - } - - idx.mu.Lock() - defer idx.mu.Unlock() - - // High watermark for tail-replay correctness. Bump only after the - // shouldIndex/tokenize gates pass — the replay query is severity- - // filtered too, so non-indexed rows interleaved in the same ID range - // are excluded by SQL anyway. - if logID > idx.lastIndexedID { - idx.lastIndexedID = logID - } - - // Tenant-aware FIFO eviction. When at cap, remove up to maxSize/10 of the - // oldest entries belonging to the inserting tenant so a noisy tenant - // cannot push another tenant's warm rows out of the index (availability - // isolation — the confidentiality invariant is enforced separately by - // doc.Tenant filtering in Search). The new backing slice also releases - // the old array memory on the next GC cycle. - if len(idx.docs) >= idx.maxSize { - toDrop := idx.maxSize / 10 - if toDrop < 1 { - toDrop = 1 - } - kept := make([]LogVector, 0, idx.maxSize) - droppedSame := 0 - for _, d := range idx.docs { - if droppedSame < toDrop && d.Tenant == tenant { - droppedSame++ - continue - } - kept = append(kept, d) - } - // Edge case: the inserting tenant has no prior entries while the - // index is at cap with other tenants' rows. Drop one globally-oldest - // entry so the new tenant can take its first slot. This is the only - // path where a tenant's entry can be evicted by another tenant, and - // it costs at most one row per brand-new tenant. 
- if droppedSame == 0 && len(kept) > 0 { - kept = kept[1:] - } - idx.docs = kept - idx.dirty = true - } - - idx.docs = append(idx.docs, LogVector{ - LogID: logID, - Tenant: tenant, - ServiceName: serviceName, - Severity: severity, - Body: body, - Vec: tf, - }) - idx.dirty = true -} - -// Search finds the top-k logs most similar to the query string within -// tenant. Documents from other tenants are excluded — the IDF table stays -// global so rarity is computed against the whole corpus, but result rows -// are filtered to the caller's tenant. -func (idx *Index) Search(tenant, query string, k int) []SearchResult { - if k <= 0 { - k = 10 - } - if tenant == "" { - tenant = defaultTenantID - } - tokens := tokenize(query) - if len(tokens) == 0 { - return nil - } - queryTF := computeTF(tokens) - - idx.mu.Lock() - if idx.dirty { - idx.recomputeIDF() - idx.dirty = false - } - // Snapshot IDF and docs for the query (avoids holding lock during scoring). - idfSnap := make(map[string]float64, len(idx.idf)) - for k, v := range idx.idf { - idfSnap[k] = v - } - docs := make([]LogVector, len(idx.docs)) - copy(docs, idx.docs) - idx.mu.Unlock() - - // Build TF-IDF query vector. 
- queryVec := make(map[string]float64, len(queryTF)) - for term, tf := range queryTF { - queryVec[term] = tf * idfSnap[term] - } - queryNorm := vecNorm(queryVec) - if queryNorm == 0 { - return nil - } - - type scored struct { - doc LogVector - score float64 - } - results := make([]scored, 0, len(docs)) - for _, doc := range docs { - if doc.Tenant != tenant { - continue - } - docVec := make(map[string]float64, len(doc.Vec)) - for term, tf := range doc.Vec { - docVec[term] = tf * idfSnap[term] - } - score := cosineSimilarity(queryVec, queryNorm, docVec) - if score > 0 { - results = append(results, scored{doc, score}) - } - } - - sort.Slice(results, func(i, j int) bool { - return results[i].score > results[j].score - }) - if len(results) > k { - results = results[:k] - } - - out := make([]SearchResult, len(results)) - for i, r := range results { - out[i] = SearchResult{ - LogID: r.doc.LogID, - Tenant: r.doc.Tenant, - ServiceName: r.doc.ServiceName, - Severity: r.doc.Severity, - Body: r.doc.Body, - Score: r.score, - } - } - return out -} - -// Size returns the current number of indexed documents. -func (idx *Index) Size() int { - idx.mu.RLock() - defer idx.mu.RUnlock() - return len(idx.docs) -} - -// LastIndexedID returns the highest Log.ID that has been successfully indexed -// (i.e. passed shouldIndex + tokenize gates and was appended to docs). -// Used by the startup tail-replay path to query DB rows newer than this -// watermark; persisted in the snapshot so replay survives restarts. -func (idx *Index) LastIndexedID() uint { - idx.mu.RLock() - defer idx.mu.RUnlock() - return idx.lastIndexedID -} - -// recomputeIDF rebuilds the IDF table from current docs. Must be called with mu held. 
-func (idx *Index) recomputeIDF() { - df := make(map[string]int, len(idx.idf)) - for _, doc := range idx.docs { - for term := range doc.Vec { - df[term]++ - } - } - n := float64(len(idx.docs)) - // Replace the entire IDF map to drop stale terms from evicted docs - newIDF := make(map[string]float64, len(df)) - for term, count := range df { - newIDF[term] = math.Log(n/float64(count)) + 1 - } - idx.idf = newIDF -} - -// shouldIndex returns true for severity levels worth indexing. -func shouldIndex(severity string) bool { - s := strings.ToUpper(severity) - return s == "ERROR" || s == "WARN" || s == "WARNING" || s == "FATAL" || s == "CRITICAL" -} - -// tokenize splits text into lowercase alpha tokens, removing stop words. -func tokenize(text string) []string { - words := strings.FieldsFunc(strings.ToLower(text), func(r rune) bool { - return !unicode.IsLetter(r) && !unicode.IsDigit(r) - }) - out := make([]string, 0, len(words)) - for _, w := range words { - if len(w) > 2 && !isStopWord(w) { - out = append(out, w) - } - } - return out -} - -// computeTF returns term-frequency (count / total) for a token list. 
-func computeTF(tokens []string) map[string]float64 { - counts := make(map[string]int, len(tokens)) - for _, t := range tokens { - counts[t]++ - } - total := float64(len(tokens)) - tf := make(map[string]float64, len(counts)) - for term, count := range counts { - tf[term] = float64(count) / total - } - return tf -} - -func vecNorm(v map[string]float64) float64 { - var sum float64 - for _, val := range v { - sum += val * val - } - return math.Sqrt(sum) -} - -func cosineSimilarity(a map[string]float64, normA float64, b map[string]float64) float64 { - normB := vecNorm(b) - if normA == 0 || normB == 0 { - return 0 - } - var dot float64 - for term, va := range a { - if vb, ok := b[term]; ok { - dot += va * vb - } - } - return dot / (normA * normB) -} - -var stopWords = map[string]struct{}{ - "the": {}, "and": {}, "for": {}, "are": {}, "was": {}, "not": {}, - "with": {}, "this": {}, "that": {}, "from": {}, "has": {}, "but": {}, - "have": {}, "its": {}, "been": {}, "also": {}, "than": {}, "into": {}, -} - -func isStopWord(w string) bool { - _, ok := stopWords[w] - return ok -} diff --git a/internal/vectordb/index_test.go b/internal/vectordb/index_test.go deleted file mode 100644 index 9b9186c..0000000 --- a/internal/vectordb/index_test.go +++ /dev/null @@ -1,136 +0,0 @@ -package vectordb - -import ( - "strconv" - "sync" - "testing" -) - -// TestTenantIsolation_Search is the RAN-20 confidentiality bar: a query on -// tenant A never returns a document indexed under tenant B, even when the -// vocabularies collide on the query terms. 
-func TestTenantIsolation_Search(t *testing.T) { - idx := New(1_000) - - idx.Add(1, "acme", "checkout", "ERROR", "payment gateway timeout upstream") - idx.Add(2, "acme", "checkout", "ERROR", "payment gateway refused charge") - idx.Add(10, "globex", "auth", "ERROR", "payment gateway token expired") - idx.Add(11, "globex", "auth", "ERROR", "payment gateway 500 internal error") - - acmeHits := idx.Search("acme", "payment gateway timeout", 10) - if len(acmeHits) == 0 { - t.Fatalf("acme search returned zero hits despite matching docs") - } - for _, h := range acmeHits { - if h.Tenant != "acme" || h.LogID >= 10 { - t.Fatalf("acme search leaked id=%d tenant=%q body=%q", h.LogID, h.Tenant, h.Body) - } - } - - globexHits := idx.Search("globex", "payment gateway token", 10) - if len(globexHits) == 0 { - t.Fatalf("globex search returned zero hits despite matching docs") - } - for _, h := range globexHits { - if h.Tenant != "globex" || h.LogID < 10 { - t.Fatalf("globex search leaked id=%d tenant=%q body=%q", h.LogID, h.Tenant, h.Body) - } - } -} - -// TestUnknownTenantReturnsEmpty proves a tenant with no indexed docs returns -// nothing even when other tenants have matching content. -func TestUnknownTenantReturnsEmpty(t *testing.T) { - idx := New(100) - idx.Add(1, "acme", "svc", "ERROR", "database connection refused upstream") - - if got := idx.Search("initech", "database connection", 10); len(got) != 0 { - t.Fatalf("unknown tenant saw %d cross-tenant hits", len(got)) - } -} - -// TestEmptyTenantCoercedToDefault verifies Add and Search coerce an empty -// tenant to the platform default so untenanted callers stay isolated from -// real tenants. 
-func TestEmptyTenantCoercedToDefault(t *testing.T) { - idx := New(100) - idx.Add(1, "", "svc", "ERROR", "network unreachable upstream host") - - if hits := idx.Search("", "network unreachable", 10); len(hits) != 1 { - t.Fatalf("search with empty tenant: want 1 hit, got %d", len(hits)) - } - if hits := idx.Search(defaultTenantID, "network unreachable", 10); len(hits) != 1 { - t.Fatalf("search with default tenant id: want 1 hit, got %d", len(hits)) - } - if hits := idx.Search("acme", "network unreachable", 10); len(hits) != 0 { - t.Fatalf("acme saw %d cross-tenant hits for default-tenant doc", len(hits)) - } -} - -// TestFIFOEvictionFairness is TechLead's requested assertion: a tenant that -// writes near-cap volume cannot evict another tenant's documents from the -// shared index. Under a naive global-FIFO policy tenant B's flood would -// remove tenant A's older entries and A would silently "lose" its warm -// rows — a confidentiality-safe but availability-breaking failure mode. -func TestFIFOEvictionFairness(t *testing.T) { - const cap = 200 - idx := New(cap) - - // Tenant A writes a small set of distinctive markers. - for i := 0; i < 5; i++ { - idx.Add(uint(1+i), "acme", "checkout", "ERROR", "acme-canary-marker alpha beta gamma "+strconv.Itoa(i)) - } - - // Tenant B floods the index well past the cap — enough to trigger - // multiple eviction cycles. - for i := 0; i < cap*4; i++ { - idx.Add(uint(10_000+i), "globex", "svc", "ERROR", "globex chatter filling the index "+strconv.Itoa(i)) - } - - // Every one of acme's canary rows must still be findable. - hits := idx.Search("acme", "acme-canary-marker alpha beta gamma", 20) - if len(hits) < 5 { - t.Fatalf("eviction unfairness: acme canaries evicted by globex flood. 
want >=5 hits, got %d", len(hits)) - } - seen := map[uint]bool{} - for _, h := range hits { - if h.Tenant != "acme" { - t.Fatalf("cross-tenant leak during eviction test: id=%d tenant=%q", h.LogID, h.Tenant) - } - seen[h.LogID] = true - } - for id := uint(1); id <= 5; id++ { - if !seen[id] { - t.Fatalf("acme canary id=%d missing after globex flood", id) - } - } -} - -// TestConcurrentTenantAddSearch pins down race-detector cleanliness and -// cross-tenant isolation under concurrent readers/writers. -func TestConcurrentTenantAddSearch(t *testing.T) { - idx := New(5_000) - var wg sync.WaitGroup - - for _, tenant := range []string{"acme", "globex"} { - wg.Add(2) - go func(ten string) { - defer wg.Done() - for i := 0; i < 500; i++ { - idx.Add(uint(i), ten, "svc", "ERROR", ten+" error kafka partition "+strconv.Itoa(i)) - } - }(tenant) - go func(ten string) { - defer wg.Done() - for i := 0; i < 500; i++ { - for _, h := range idx.Search(ten, "kafka partition", 5) { - if h.Tenant != ten { - t.Errorf("tenant %s saw cross-tenant hit tenant=%q body=%q", ten, h.Tenant, h.Body) - return - } - } - } - }(tenant) - } - wg.Wait() -} diff --git a/internal/vectordb/replay.go b/internal/vectordb/replay.go deleted file mode 100644 index 5ec9de5..0000000 --- a/internal/vectordb/replay.go +++ /dev/null @@ -1,74 +0,0 @@ -package vectordb - -import "context" - -// ReplaySource is the minimal contract a backing store fulfills to hydrate -// this Index on startup. Pages are pulled in id-ascending order; the source -// signals end-of-data by returning a slice shorter than the requested limit. -// ReplayFromDB walks pages starting from LastIndexedID() until the source -// returns no more rows. -// -// Vectordb intentionally does NOT import the storage package — keeping it as -// a leaf accelerator means tests can wire any in-memory source without a -// SQLite dependency, and storage is free to evolve its row type without -// breaking vectordb. 
The wiring layer (cmd/main.go) is responsible for -// projecting storage.Log into ReplayRow. -type ReplaySource interface { - LogsForVectorReplay(ctx context.Context, sinceID uint, limit int) ([]ReplayRow, error) -} - -// ReplayRow is the minimum field set Add() needs. Mirrors the projection a -// storage adapter performs at the boundary. -type ReplayRow struct { - ID uint - Tenant string - ServiceName string - Severity string - Body string -} - -// replayPageSize bounds memory during tail-replay. 10k rows is a reasonable -// trade-off between query overhead per page and peak heap; at typical body -// sizes this stays well under 50 MB resident per page. -const replayPageSize = 10_000 - -// ReplayFromDB walks ReplaySource pages starting from LastIndexedID() and -// feeds each row through Add(). Returns the count of rows processed (Add -// filters by severity, so processed ≠ indexed when the source loosens its -// filter — but the standard storage implementation already pre-filters to -// ERROR/WARN/family so the counts match in practice). -// -// Termination contract: the source signals end-of-data by returning a -// zero-length slice. This lets sources page however they want without -// having to fill every page exactly to replayPageSize — the trade-off is -// one extra round-trip at the tail (fine for a one-shot startup call). -// -// Caller passes a derived ctx so SIGTERM during boot cancels the replay -// cleanly. On any source error, returns the partial count + error so the -// caller can log and proceed with a partially-warm index. 
-func (idx *Index) ReplayFromDB(ctx context.Context, src ReplaySource) (int, error) { - if src == nil { - return 0, nil - } - sinceID := idx.LastIndexedID() - total := 0 - for { - if err := ctx.Err(); err != nil { - return total, err - } - rows, err := src.LogsForVectorReplay(ctx, sinceID, replayPageSize) - if err != nil { - return total, err - } - if len(rows) == 0 { - return total, nil - } - for _, row := range rows { - idx.Add(row.ID, row.Tenant, row.ServiceName, row.Severity, row.Body) - if row.ID > sinceID { - sinceID = row.ID - } - } - total += len(rows) - } -} diff --git a/internal/vectordb/replay_test.go b/internal/vectordb/replay_test.go deleted file mode 100644 index 28b829a..0000000 --- a/internal/vectordb/replay_test.go +++ /dev/null @@ -1,161 +0,0 @@ -package vectordb - -import ( - "context" - "errors" - "testing" -) - -// fakeSource is an in-memory ReplaySource for unit-testing the page loop -// without a real DB. Pages are produced by a closure so each test can shape -// the source however it likes (multi-page, errors, end-of-data). -type fakeSource struct { - pages [][]ReplayRow // queued pages; consumed in order - calls int - fail error -} - -func (s *fakeSource) LogsForVectorReplay(_ context.Context, sinceID uint, limit int) ([]ReplayRow, error) { - s.calls++ - if s.fail != nil { - return nil, s.fail - } - if s.calls > len(s.pages) { - return nil, nil - } - page := s.pages[s.calls-1] - // Filter to "rows newer than sinceID" so the test verifies the loop - // passes the right cursor across iterations. - out := make([]ReplayRow, 0, len(page)) - for _, r := range page { - if r.ID > sinceID { - out = append(out, r) - } - } - if len(out) > limit { - out = out[:limit] - } - return out, nil -} - -// TestReplayFromDB_AdvancesCursor verifies multi-page replay calls the -// source with monotonically-increasing sinceID values and indexes every -// row, with no duplicates by LogID. 
-func TestReplayFromDB_AdvancesCursor(t *testing.T) { - src := &fakeSource{ - pages: [][]ReplayRow{ - { - {ID: 10, Tenant: "t", ServiceName: "svc", Severity: "ERROR", Body: "boom"}, - {ID: 20, Tenant: "t", ServiceName: "svc", Severity: "ERROR", Body: "kaboom"}, - }, - { - {ID: 30, Tenant: "t", ServiceName: "svc", Severity: "WARN", Body: "third page row tokenizes fine"}, - }, - }, - } - idx := New(100) - total, err := idx.ReplayFromDB(context.Background(), src) - if err != nil { - t.Fatalf("ReplayFromDB: %v", err) - } - if total != 3 { - t.Errorf("processed: got %d, want 3", total) - } - if idx.Size() != 3 { - t.Errorf("indexed Size: got %d, want 3", idx.Size()) - } - if idx.LastIndexedID() != 30 { - t.Errorf("LastIndexedID: got %d, want 30", idx.LastIndexedID()) - } - // Two data pages + one empty page that signals end-of-data. - if src.calls != 3 { - t.Errorf("source calls: got %d, want 3 (2 data + 1 empty terminator)", src.calls) - } -} - -// TestReplayFromDB_StartsFromLastIndexedID verifies the loop seeds sinceID -// from the existing high watermark, so a snapshot's tail can be picked up -// without re-indexing rows already in the index. -func TestReplayFromDB_StartsFromLastIndexedID(t *testing.T) { - idx := New(100) - idx.Add(50, "t", "svc", "ERROR", "already indexed") - if got := idx.LastIndexedID(); got != 50 { - t.Fatalf("seed LastIndexedID: got %d, want 50", got) - } - - src := &fakeSource{ - pages: [][]ReplayRow{ - // Page contains both pre-watermark and post-watermark rows; the - // fake's filter mimics SQL's WHERE id > sinceID, so only post-50 - // rows leave the source. 
- { - {ID: 30, Tenant: "t", ServiceName: "svc", Severity: "ERROR", Body: "old"}, - {ID: 50, Tenant: "t", ServiceName: "svc", Severity: "ERROR", Body: "boundary"}, - {ID: 60, Tenant: "t", ServiceName: "svc", Severity: "ERROR", Body: "new"}, - }, - }, - } - total, err := idx.ReplayFromDB(context.Background(), src) - if err != nil { - t.Fatalf("ReplayFromDB: %v", err) - } - if total != 1 { - t.Errorf("processed: got %d, want 1 (only id=60 is post-watermark)", total) - } - if idx.Size() != 2 { - t.Errorf("indexed Size: got %d, want 2 (seed + replayed)", idx.Size()) - } - if idx.LastIndexedID() != 60 { - t.Errorf("LastIndexedID: got %d, want 60", idx.LastIndexedID()) - } -} - -// TestReplayFromDB_PropagatesError verifies a source error is returned -// alongside the partial count so the caller can log and continue. -func TestReplayFromDB_PropagatesError(t *testing.T) { - src := &fakeSource{fail: errors.New("db gone")} - idx := New(100) - total, err := idx.ReplayFromDB(context.Background(), src) - if err == nil { - t.Fatal("want error, got nil") - } - if total != 0 { - t.Errorf("partial count: got %d, want 0", total) - } - if idx.Size() != 0 { - t.Errorf("error path must not corrupt index: Size=%d", idx.Size()) - } -} - -// TestReplayFromDB_RespectsCancellation verifies a cancelled ctx aborts -// the loop without making another source call. -func TestReplayFromDB_RespectsCancellation(t *testing.T) { - ctx, cancel := context.WithCancel(context.Background()) - cancel() - - src := &fakeSource{ - pages: [][]ReplayRow{ - {{ID: 1, Tenant: "t", ServiceName: "svc", Severity: "ERROR", Body: "x"}}, - }, - } - idx := New(100) - _, err := idx.ReplayFromDB(ctx, src) - if !errors.Is(err, context.Canceled) { - t.Fatalf("want context.Canceled, got %v", err) - } - if src.calls != 0 { - t.Errorf("source called despite cancelled ctx: calls=%d", src.calls) - } -} - -// TestReplayFromDB_NilSource is a smoke test for the nil-safe early return. 
-func TestReplayFromDB_NilSource(t *testing.T) { - idx := New(100) - total, err := idx.ReplayFromDB(context.Background(), nil) - if err != nil { - t.Fatalf("nil source: unexpected err=%v", err) - } - if total != 0 { - t.Errorf("nil source: total=%d, want 0", total) - } -} diff --git a/internal/vectordb/snapshot.go b/internal/vectordb/snapshot.go deleted file mode 100644 index 87c12d0..0000000 --- a/internal/vectordb/snapshot.go +++ /dev/null @@ -1,317 +0,0 @@ -package vectordb - -import ( - "bytes" - "context" - "encoding/binary" - "encoding/gob" - "errors" - "fmt" - "hash/crc32" - "io" - "log/slog" - "os" - "syscall" - "time" -) - -// Snapshot is the persisted state of an Index. -// -// Only the fields needed to reconstruct an equivalent Index are captured — -// transient state (mu, dirty) is intentionally absent. LastIndexedID is the -// high watermark of indexed Log.IDs so a startup tail-replay can query DB -// rows newer than the snapshot without double-indexing rows already in -// Docs. -// -// Field changes break the format — bump snapshotVersion when the wire -// shape changes. Old snapshots whose magic+version don't match are -// rejected on load and the caller falls back to a full DB rebuild. -type Snapshot struct { - LastIndexedID uint - MaxSize int - Docs []LogVector - IDF map[string]float64 - WrittenAt int64 // unix seconds, observability only -} - -const ( - // snapshotMagic is a 4-byte file header so a corrupt or stray file is - // rejected before we attempt the more expensive gob decode. - snapshotMagic = "VDB1" - // snapshotVersion travels alongside the magic. Bump on any LogVector - // or Snapshot field shape change so loaders fall back to rebuild - // instead of producing silently-wrong index state. - snapshotVersion uint32 = 1 -) - -// EncodeSnapshot writes a versioned, CRC32-protected snapshot to w. 
-// -// Wire format (big-endian for portability): -// -// bytes[0:4] magic "VDB1" -// bytes[4:8] version uint32 -// bytes[8:12] CRC32-IEEE uint32 (over bytes[12:]) -// bytes[12:] gob payload Snapshot -func EncodeSnapshot(w io.Writer, snap Snapshot) error { - var payload bytes.Buffer - if err := gob.NewEncoder(&payload).Encode(snap); err != nil { - return fmt.Errorf("encode snapshot payload: %w", err) - } - crc := crc32.ChecksumIEEE(payload.Bytes()) - - if _, err := w.Write([]byte(snapshotMagic)); err != nil { - return fmt.Errorf("write magic: %w", err) - } - if err := binary.Write(w, binary.BigEndian, snapshotVersion); err != nil { - return fmt.Errorf("write version: %w", err) - } - if err := binary.Write(w, binary.BigEndian, crc); err != nil { - return fmt.Errorf("write crc: %w", err) - } - if _, err := w.Write(payload.Bytes()); err != nil { - return fmt.Errorf("write payload: %w", err) - } - return nil -} - -// DecodeSnapshot reads + validates a snapshot from r. -// -// All errors are caller-visible. The expected handling is: log a warning -// and proceed with a full DB rebuild — never silently load partial state. -// Errors include short header, wrong magic, unsupported version, CRC -// mismatch, and gob decode failure. 
-func DecodeSnapshot(r io.Reader) (Snapshot, error) { - var ( - magic [4]byte - version uint32 - crc uint32 - ) - if _, err := io.ReadFull(r, magic[:]); err != nil { - return Snapshot{}, fmt.Errorf("read magic: %w", err) - } - if string(magic[:]) != snapshotMagic { - return Snapshot{}, fmt.Errorf("unexpected snapshot magic %q (want %q)", magic[:], snapshotMagic) - } - if err := binary.Read(r, binary.BigEndian, &version); err != nil { - return Snapshot{}, fmt.Errorf("read version: %w", err) - } - if version != snapshotVersion { - return Snapshot{}, fmt.Errorf("unsupported snapshot version %d (current %d)", version, snapshotVersion) - } - if err := binary.Read(r, binary.BigEndian, &crc); err != nil { - return Snapshot{}, fmt.Errorf("read crc: %w", err) - } - payload, err := io.ReadAll(r) - if err != nil { - return Snapshot{}, fmt.Errorf("read payload: %w", err) - } - if got := crc32.ChecksumIEEE(payload); got != crc { - return Snapshot{}, fmt.Errorf("snapshot crc mismatch: got %08x want %08x", got, crc) - } - var snap Snapshot - if err := gob.NewDecoder(bytes.NewReader(payload)).Decode(&snap); err != nil { - return Snapshot{}, fmt.Errorf("decode payload: %w", err) - } - return snap, nil -} - -// writeAtomic writes data to path via tmp+sync+rename. -// -// Mode 0o600: snapshots persist log bodies which can carry sensitive -// operational data — owner-only is the conservative default. Operators -// who need shared read can chmod externally. -// -// On EXDEV (cross-device rename, e.g. when data dir is on a separate -// mount than the binary's tmp dir), falls back to a non-atomic -// os.WriteFile at the destination. Cross-device deployments are rare and -// documented; the fallback at least ensures the snapshot is written, with -// last-writer-wins replacing the atomicity guarantee. -// -// On any error during the write/fsync phase, the .tmp file is removed so -// a partial file does not poison the next startup's load attempt. 
-func writeAtomic(path string, data []byte) error { - tmp := path + ".tmp" - f, err := os.OpenFile(tmp, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o600) - if err != nil { - return fmt.Errorf("create tmp %s: %w", tmp, err) - } - if _, err := f.Write(data); err != nil { - _ = f.Close() - _ = os.Remove(tmp) - return fmt.Errorf("write tmp: %w", err) - } - if err := f.Sync(); err != nil { - _ = f.Close() - _ = os.Remove(tmp) - return fmt.Errorf("fsync tmp: %w", err) - } - if err := f.Close(); err != nil { - _ = os.Remove(tmp) - return fmt.Errorf("close tmp: %w", err) - } - if err := os.Rename(tmp, path); err != nil { - if isEXDEV(err) { - data, readErr := os.ReadFile(tmp) - if readErr != nil { - _ = os.Remove(tmp) - return fmt.Errorf("rename EXDEV + readback: %w", readErr) - } - if writeErr := os.WriteFile(path, data, 0o600); writeErr != nil { - _ = os.Remove(tmp) - return fmt.Errorf("rename EXDEV + writefile: %w", writeErr) - } - _ = os.Remove(tmp) - return nil - } - _ = os.Remove(tmp) - return fmt.Errorf("rename %s: %w", path, err) - } - return nil -} - -// isEXDEV reports whether err is a cross-device link/rename error. -func isEXDEV(err error) bool { - if err == nil { - return false - } - var le *os.LinkError - if errors.As(err, &le) { - return errors.Is(le.Err, syscall.EXDEV) - } - return errors.Is(err, syscall.EXDEV) -} - -// LoadSnapshot reads a snapshot from path and replaces the Index's state. -// -// Caller must ensure no concurrent Add()/Search() is in flight — this is -// the typical startup wiring (fresh Index, before ingest accept). Errors -// are returned as-is so the caller can distinguish os.IsNotExist (no -// previous snapshot — first start) from corruption/format errors (log -// warn + proceed with full DB rebuild). -// -// On error the Index state is left untouched. 
-func (idx *Index) LoadSnapshot(path string) error { - f, err := os.Open(path) // #nosec G304 -- operator-supplied snapshot path - if err != nil { - return err - } - defer func() { _ = f.Close() }() - snap, err := DecodeSnapshot(f) - if err != nil { - return err - } - idx.mu.Lock() - defer idx.mu.Unlock() - idx.docs = snap.Docs - idx.idf = snap.IDF - if idx.idf == nil { - idx.idf = make(map[string]float64) - } - if snap.MaxSize > 0 { - idx.maxSize = snap.MaxSize - } - idx.lastIndexedID = snap.LastIndexedID - idx.dirty = false - return nil -} - -// SetSnapshotObserver registers a callback invoked at the end of each -// WriteSnapshot. result is "success" or "failure"; size is the on-disk -// size of the latest written snapshot (0 on failure). -// -// Set from the wiring layer (main.go) so vectordb stays free of -// telemetry imports. Safe to call before SnapshotLoop starts. -func (idx *Index) SetSnapshotObserver(fn func(result string, duration time.Duration, size int64)) { - idx.mu.Lock() - defer idx.mu.Unlock() - idx.snapshotObserver = fn -} - -// WriteSnapshot serializes the current Index state to path atomically. -// -// Safe to call concurrently with Add()/Search(): the docs slice and IDF -// map are copied under the index lock and serialization runs lock-free -// after release. Critical section is sub-millisecond at the 100k cap -// because slice copy is O(1) per-element header (LogVector strings/maps -// are shared by reference, and Add() never mutates an existing -// LogVector.Vec — it only appends new entries). 
-func (idx *Index) WriteSnapshot(path string) error { - start := time.Now() - err := idx.writeSnapshot(path) - - idx.mu.RLock() - obs := idx.snapshotObserver - idx.mu.RUnlock() - if obs != nil { - result := "success" - var size int64 - if err != nil { - result = "failure" - } else if fi, statErr := os.Stat(path); statErr == nil { - size = fi.Size() - } - obs(result, time.Since(start), size) - } - return err -} - -func (idx *Index) writeSnapshot(path string) error { - idx.mu.Lock() - if idx.dirty { - idx.recomputeIDF() - idx.dirty = false - } - docs := make([]LogVector, len(idx.docs)) - copy(docs, idx.docs) - idfCopy := make(map[string]float64, len(idx.idf)) - for k, v := range idx.idf { - idfCopy[k] = v - } - snap := Snapshot{ - LastIndexedID: idx.lastIndexedID, - MaxSize: idx.maxSize, - Docs: docs, - IDF: idfCopy, - WrittenAt: time.Now().Unix(), - } - idx.mu.Unlock() - - var buf bytes.Buffer - if err := EncodeSnapshot(&buf, snap); err != nil { - return err - } - return writeAtomic(path, buf.Bytes()) -} - -// SnapshotLoop writes a snapshot to path on every interval tick until ctx is -// done. On context cancel, fires one final WriteSnapshot before returning so -// graceful shutdowns capture the maximum in-memory state. -// -// Transient write failures (disk full, fsync errors, EXDEV warnings) are -// logged via slog but do not break the loop — vectordb is a rebuildable -// accelerator, and silently dropping a tick beats taking the daemon down. -// -// Safe to call with empty path / zero interval — both disable the loop and -// return immediately. 
-func (idx *Index) SnapshotLoop(ctx context.Context, path string, interval time.Duration) { - if path == "" || interval <= 0 { - return - } - ticker := time.NewTicker(interval) - defer ticker.Stop() - for { - select { - case <-ctx.Done(): - if err := idx.WriteSnapshot(path); err != nil { - slog.Warn("vectordb final snapshot on shutdown failed", "path", path, "error", err) - } else { - slog.Info("vectordb final snapshot written", "path", path, "size", idx.Size()) - } - return - case <-ticker.C: - if err := idx.WriteSnapshot(path); err != nil { - slog.Warn("vectordb periodic snapshot failed", "path", path, "error", err) - } - } - } -} diff --git a/internal/vectordb/snapshot_test.go b/internal/vectordb/snapshot_test.go deleted file mode 100644 index 9df3a67..0000000 --- a/internal/vectordb/snapshot_test.go +++ /dev/null @@ -1,325 +0,0 @@ -package vectordb - -import ( - "bytes" - "context" - "encoding/binary" - "errors" - "os" - "path/filepath" - "syscall" - "testing" - "time" -) - -// TestSnapshotRoundTrip verifies an encoded snapshot decodes back to the -// same logical state across all populated fields. 
-func TestSnapshotRoundTrip(t *testing.T) { - in := Snapshot{ - LastIndexedID: 42, - MaxSize: 1000, - Docs: []LogVector{ - {LogID: 1, Tenant: "acme", ServiceName: "api", Severity: "ERROR", Body: "panic at startup", Vec: map[string]float64{"panic": 0.5, "startup": 0.5}}, - {LogID: 2, Tenant: "globex", ServiceName: "db", Severity: "WARN", Body: "timeout connecting", Vec: map[string]float64{"timeout": 1.0}}, - }, - IDF: map[string]float64{"panic": 1.5, "startup": 1.0, "timeout": 1.2}, - WrittenAt: 1714464000, - } - var buf bytes.Buffer - if err := EncodeSnapshot(&buf, in); err != nil { - t.Fatalf("encode: %v", err) - } - out, err := DecodeSnapshot(&buf) - if err != nil { - t.Fatalf("decode: %v", err) - } - if out.LastIndexedID != in.LastIndexedID { - t.Errorf("LastIndexedID: got %d, want %d", out.LastIndexedID, in.LastIndexedID) - } - if out.MaxSize != in.MaxSize { - t.Errorf("MaxSize: got %d, want %d", out.MaxSize, in.MaxSize) - } - if len(out.Docs) != len(in.Docs) { - t.Fatalf("Docs length: got %d, want %d", len(out.Docs), len(in.Docs)) - } - if out.Docs[0].Body != in.Docs[0].Body || out.Docs[0].LogID != in.Docs[0].LogID { - t.Errorf("Doc[0]: got %+v, want %+v", out.Docs[0], in.Docs[0]) - } - if got, want := out.Docs[0].Vec["panic"], in.Docs[0].Vec["panic"]; got != want { - t.Errorf("Doc[0].Vec[panic]: got %v, want %v", got, want) - } - if got, want := out.IDF["panic"], in.IDF["panic"]; got != want { - t.Errorf("IDF[panic]: got %v, want %v", got, want) - } -} - -// TestDecodeSnapshot_EmptyReader verifies graceful failure on truncation -// at the very first read (magic). -func TestDecodeSnapshot_EmptyReader(t *testing.T) { - if _, err := DecodeSnapshot(bytes.NewReader(nil)); err == nil { - t.Fatal("decoding empty reader must fail") - } -} - -// TestDecodeSnapshot_WrongMagic verifies the magic check rejects stray files. 
-func TestDecodeSnapshot_WrongMagic(t *testing.T) { - var buf bytes.Buffer - buf.WriteString("BAD!") - _ = binary.Write(&buf, binary.BigEndian, snapshotVersion) - _ = binary.Write(&buf, binary.BigEndian, uint32(0)) - if _, err := DecodeSnapshot(&buf); err == nil { - t.Fatal("wrong magic must fail") - } -} - -// TestDecodeSnapshot_WrongVersion verifies version-bump reads are refused -// — the loader should fall back to full rebuild on any version mismatch. -func TestDecodeSnapshot_WrongVersion(t *testing.T) { - var buf bytes.Buffer - buf.WriteString(snapshotMagic) - _ = binary.Write(&buf, binary.BigEndian, uint32(999)) - if _, err := DecodeSnapshot(&buf); err == nil { - t.Fatal("wrong version must fail") - } -} - -// TestDecodeSnapshot_CRCMismatch verifies bit-rot or partial writes are -// caught before the gob decoder produces silently-wrong state. -func TestDecodeSnapshot_CRCMismatch(t *testing.T) { - in := Snapshot{LastIndexedID: 1, MaxSize: 100, IDF: map[string]float64{}} - var buf bytes.Buffer - if err := EncodeSnapshot(&buf, in); err != nil { - t.Fatalf("encode: %v", err) - } - raw := buf.Bytes() - // Header is 12 bytes (magic+version+crc); flip a payload byte. - if len(raw) < 13 { - t.Fatalf("encoded snapshot too short: %d bytes", len(raw)) - } - raw[12] ^= 0xff - if _, err := DecodeSnapshot(bytes.NewReader(raw)); err == nil { - t.Fatal("CRC mismatch must fail") - } -} - -// TestWriteAtomic_RoundTrip writes a payload and reads it back via the -// public path, then asserts the .tmp sibling is gone. 
-func TestWriteAtomic_RoundTrip(t *testing.T) { - dir := t.TempDir() - p := filepath.Join(dir, "snap.bin") - payload := []byte("hello world") - if err := writeAtomic(p, payload); err != nil { - t.Fatalf("writeAtomic: %v", err) - } - got, err := os.ReadFile(p) - if err != nil { - t.Fatalf("ReadFile: %v", err) - } - if !bytes.Equal(got, payload) { - t.Fatalf("round-trip: got %q, want %q", got, payload) - } - if _, err := os.Stat(p + ".tmp"); !os.IsNotExist(err) { - t.Fatalf(".tmp must be removed after rename, got err=%v", err) - } -} - -// TestIsEXDEV_Detection verifies the helper recognizes wrapped EXDEV from -// os.Rename and ignores arbitrary errors. -func TestIsEXDEV_Detection(t *testing.T) { - le := &os.LinkError{Op: "rename", Old: "a", New: "b", Err: syscall.EXDEV} - if !isEXDEV(le) { - t.Fatal("isEXDEV should detect *os.LinkError{Err: EXDEV}") - } - if isEXDEV(errors.New("other error")) { - t.Fatal("isEXDEV should not flag arbitrary errors") - } - if isEXDEV(nil) { - t.Fatal("isEXDEV(nil) must be false") - } -} - -// TestIndexWriteAndLoadSnapshot exercises the full Index → file → Index -// round trip: build, snapshot, load into a fresh Index, verify state. 
-func TestIndexWriteAndLoadSnapshot(t *testing.T) { - dir := t.TempDir() - path := filepath.Join(dir, "vectordb.snapshot") - - src := New(1000) - src.Add(101, "acme", "checkout", "ERROR", "payment gateway timeout charging customer") - src.Add(102, "acme", "checkout", "ERROR", "payment gateway refused charge insufficient funds") - src.Add(203, "globex", "auth", "WARN", "session token nearing expiry") - if got, want := src.Size(), 3; got != want { - t.Fatalf("seed Size: got %d, want %d", got, want) - } - if got := src.LastIndexedID(); got != 203 { - t.Fatalf("LastIndexedID: got %d, want 203", got) - } - - if err := src.WriteSnapshot(path); err != nil { - t.Fatalf("WriteSnapshot: %v", err) - } - - // Verify file written + .tmp gone - if st, err := os.Stat(path); err != nil { - t.Fatalf("stat snapshot: %v", err) - } else if st.Size() == 0 { - t.Fatal("snapshot file is empty") - } - if _, err := os.Stat(path + ".tmp"); !os.IsNotExist(err) { - t.Fatalf(".tmp must be gone after WriteSnapshot, got err=%v", err) - } - - dst := New(500) // different cap; load should restore src's cap - if err := dst.LoadSnapshot(path); err != nil { - t.Fatalf("LoadSnapshot: %v", err) - } - if got, want := dst.Size(), 3; got != want { - t.Fatalf("loaded Size: got %d, want %d", got, want) - } - if got := dst.LastIndexedID(); got != 203 { - t.Fatalf("loaded LastIndexedID: got %d, want 203", got) - } - // Search should work on the restored index — the IDF table came along - // with the snapshot, so cosine ranking still has rarity weights. - hits := dst.Search("acme", "payment gateway", 5) - if len(hits) != 2 { - t.Fatalf("Search after load: got %d hits, want 2", len(hits)) - } -} - -// TestLoadSnapshot_MissingFile verifies the loader propagates os-level -// errors so callers can distinguish "first start, no snapshot" via -// os.IsNotExist from real corruption. 
-func TestLoadSnapshot_MissingFile(t *testing.T) { - dir := t.TempDir() - idx := New(100) - err := idx.LoadSnapshot(filepath.Join(dir, "does-not-exist")) - if err == nil { - t.Fatal("LoadSnapshot of missing file must error") - } - if !os.IsNotExist(err) { - t.Fatalf("want os.IsNotExist, got %v", err) - } -} - -// TestSnapshotLoop_FinalWriteOnCancel verifies the loop fires a final -// WriteSnapshot when ctx is cancelled — captures the maximum in-memory -// state at graceful shutdown. -func TestSnapshotLoop_FinalWriteOnCancel(t *testing.T) { - dir := t.TempDir() - path := filepath.Join(dir, "snap.bin") - - idx := New(100) - idx.Add(1, "t", "svc", "ERROR", "preserved across shutdown final write") - - ctx, cancel := context.WithCancel(context.Background()) - done := make(chan struct{}) - go func() { - defer close(done) - // 1h interval — the loop should never tick during this test, only - // the cancel path fires the write. - idx.SnapshotLoop(ctx, path, 1*time.Hour) - }() - - // Sanity: file does not yet exist. - if _, err := os.Stat(path); !os.IsNotExist(err) { - t.Fatalf("snapshot must not exist before cancel, got err=%v", err) - } - - cancel() - select { - case <-done: - case <-time.After(2 * time.Second): - t.Fatal("SnapshotLoop did not return within 2s of cancel") - } - - // Verify final write happened. - if st, err := os.Stat(path); err != nil { - t.Fatalf("final snapshot missing after cancel: %v", err) - } else if st.Size() == 0 { - t.Fatal("final snapshot file is empty") - } - - // Round-trip: load into a fresh index and confirm state matches. - dst := New(100) - if err := dst.LoadSnapshot(path); err != nil { - t.Fatalf("LoadSnapshot of final write: %v", err) - } - if dst.Size() != 1 || dst.LastIndexedID() != 1 { - t.Fatalf("loaded state mismatch: Size=%d LastIndexedID=%d", dst.Size(), dst.LastIndexedID()) - } -} - -// TestSnapshotLoop_PeriodicWrite verifies a tick fires WriteSnapshot. 
-// Uses a tight interval so the test runs in <50ms; the loop fires at -// least once before we cancel + drain. -func TestSnapshotLoop_PeriodicWrite(t *testing.T) { - dir := t.TempDir() - path := filepath.Join(dir, "snap.bin") - - idx := New(100) - idx.Add(7, "t", "svc", "ERROR", "periodic snapshot tick body") - - ctx, cancel := context.WithCancel(context.Background()) - done := make(chan struct{}) - go func() { - defer close(done) - idx.SnapshotLoop(ctx, path, 10*time.Millisecond) - }() - - // Wait long enough for at least one tick to fire. - deadline := time.Now().Add(500 * time.Millisecond) - for time.Now().Before(deadline) { - if st, err := os.Stat(path); err == nil && st.Size() > 0 { - break - } - time.Sleep(10 * time.Millisecond) - } - - cancel() - <-done - - if _, err := os.Stat(path); err != nil { - t.Fatalf("expected at least one periodic snapshot to land at %s, got err=%v", path, err) - } -} - -// TestSnapshotLoop_DisabledByEmptyPath verifies the no-op path so config -// disable doesn't accidentally start a tight-loop goroutine. -func TestSnapshotLoop_DisabledByEmptyPath(t *testing.T) { - idx := New(100) - ctx, cancel := context.WithCancel(context.Background()) - done := make(chan struct{}) - go func() { - defer close(done) - idx.SnapshotLoop(ctx, "", 10*time.Millisecond) - }() - // Loop should return immediately when path is empty — no need to cancel. - select { - case <-done: - case <-time.After(500 * time.Millisecond): - cancel() - t.Fatal("SnapshotLoop with empty path must return immediately") - } - cancel() -} - -// TestLoadSnapshot_CorruptFileLeavesStateAlone verifies that a corrupt -// snapshot does NOT clobber existing index state — the caller is meant to -// log the warning and proceed with a full rebuild. 
-func TestLoadSnapshot_CorruptFileLeavesStateAlone(t *testing.T) { - dir := t.TempDir() - path := filepath.Join(dir, "snap.bin") - if err := os.WriteFile(path, []byte("not a valid snapshot file"), 0o600); err != nil { - t.Fatalf("seed corrupt file: %v", err) - } - idx := New(100) - idx.Add(1, "t", "svc", "ERROR", "preexisting body content") - sizeBefore := idx.Size() - if err := idx.LoadSnapshot(path); err == nil { - t.Fatal("LoadSnapshot of corrupt file must fail") - } - if got := idx.Size(); got != sizeBefore { - t.Fatalf("corrupt load corrupted state: Size went %d → %d", sizeBefore, got) - } -} diff --git a/main.go b/main.go index 7bd412f..a0b938a 100644 --- a/main.go +++ b/main.go @@ -30,7 +30,6 @@ import ( tlsbootstrap "github.com/RandomCodeSpace/otelcontext/internal/tls" "github.com/RandomCodeSpace/otelcontext/internal/tsdb" "github.com/RandomCodeSpace/otelcontext/internal/ui" - "github.com/RandomCodeSpace/otelcontext/internal/vectordb" "runtime/debug" "sync" @@ -366,75 +365,12 @@ func main() { go svcGraph.Start(ctxGraph) slog.Info("🕸️ In-memory service graph started (5m window, 30s refresh)") - // 4f. Initialize vector index for semantic log search. - vectorIdx := vectordb.New(cfg.VectorIndexMaxEntries) - slog.Info("🔍 Vector index initialized", "max_entries", cfg.VectorIndexMaxEntries) - - // Vector index hydration: - // 1) LoadSnapshot — restores the prior process's state in O(file size) - // so find_similar_logs returns useful results in <1s after restart - // instead of the legacy minutes of cold-start blindness. - // 2) ReplayFromDB — picks up any DB rows ingested after the last - // snapshot. Severity-filtered + cursor-paged from LastIndexedID. - // - // Both run in a boot goroutine so a slow disk doesn't delay listener - // startup. SIGTERM during boot cancels via appCtx — bootWG ensures the - // hydrator finishes (or aborts cleanly) before DB close at shutdown. 
- // Wire snapshot write observer before any WriteSnapshot can fire (the - // hydrator goroutine doesn't write, but SnapshotLoop below will). - vectorIdx.SetSnapshotObserver(metrics.RecordVectorSnapshotWrite) - - bootWG.Add(1) - go func() { - defer bootWG.Done() - if cfg.VectorIndexSnapshotPath != "" { - if err := vectorIdx.LoadSnapshot(cfg.VectorIndexSnapshotPath); err != nil { - if os.IsNotExist(err) { - metrics.RecordVectorSnapshotLoad("missing") - slog.Info("🔍 Vector index: no prior snapshot, will hydrate from DB", "path", cfg.VectorIndexSnapshotPath) - } else { - metrics.RecordVectorSnapshotLoad("corrupt") - slog.Warn("🔍 Vector index: snapshot load failed, will rebuild from DB", "path", cfg.VectorIndexSnapshotPath, "error", err) - } - } else { - metrics.RecordVectorSnapshotLoad("success") - slog.Info("🔍 Vector index: loaded snapshot", "path", cfg.VectorIndexSnapshotPath, "entries", vectorIdx.Size(), "since_id", vectorIdx.LastIndexedID()) - } - } - replayed, err := vectorIdx.ReplayFromDB(appCtx, vectorReplayAdapter{repo: repo}) - metrics.RecordVectorReplayLogs(replayed) - if err != nil { - slog.Warn("🔍 Vector index: tail replay errored", "replayed", replayed, "error", err) - } else if replayed > 0 { - slog.Info("🔍 Vector index: tail replay complete", "rows", replayed, "size", vectorIdx.Size(), "since_id", vectorIdx.LastIndexedID()) - } - }() - - // Periodic snapshot loop. Empty path or non-positive interval disables. - // snapCtx is cancelled in the shutdown sequence right after graphRAG.Stop() - // so the loop's ctx-done branch fires the final write before exit. 
- snapCtx, snapCancel := context.WithCancel(appCtx) - snapDone := make(chan struct{}) - go func() { - defer close(snapDone) - if cfg.VectorIndexSnapshotPath == "" { - return - } - interval, err := time.ParseDuration(cfg.VectorIndexSnapshotInterval) - if err != nil || interval <= 0 { - slog.Info("🔍 Vector index: periodic snapshot disabled", "interval", cfg.VectorIndexSnapshotInterval) - return - } - slog.Info("🔍 Vector index: periodic snapshot enabled", "interval", interval, "path", cfg.VectorIndexSnapshotPath) - vectorIdx.SnapshotLoop(snapCtx, cfg.VectorIndexSnapshotPath, interval) - }() - // 4g. Initialize GraphRAG (replaces simple graph for advanced queries) graphrag.SetPanicMetrics(metrics) graphRAGCfg := graphrag.DefaultConfig() graphRAGCfg.WorkerCount = cfg.GraphRAGWorkerCount graphRAGCfg.ChannelSize = cfg.GraphRAGEventQueueSize - graphRAG := graphrag.New(repo, vectorIdx, tsdbAgg, ringBuf, graphRAGCfg) + graphRAG := graphrag.New(repo, tsdbAgg, ringBuf, graphRAGCfg) graphRAG.SetMetrics(metrics) ctxGraphRAG, cancelGraphRAG := context.WithCancel(context.Background()) go graphRAG.Start(ctxGraphRAG) @@ -461,10 +397,9 @@ func main() { apiServer := api.NewServer(repo, hub, eventHub, metrics) apiServer.SetGraph(svcGraph) apiServer.SetGraphRAG(graphRAG) - apiServer.SetVectorIndex(vectorIdx) // 6b. 
Initialize MCP Server (HTTP Streamable, JSON-RPC 2.0 + SSE) - mcpServer := mcp.New(cfg.DefaultTenant, repo, metrics, svcGraph, vectorIdx) + mcpServer := mcp.New(cfg.DefaultTenant, repo, metrics, svcGraph) mcpServer.SetGraphRAG(graphRAG) mcpServer.SetCallLimit(cfg.MCPMaxConcurrent) mcpServer.SetCallTimeout(time.Duration(cfg.MCPCallTimeoutMs) * time.Millisecond) @@ -578,7 +513,6 @@ func main() { Timestamp: l.Timestamp, }) aiService.EnqueueLog(l) - vectorIdx.Add(l.ID, l.TenantID, l.ServiceName, l.Severity, l.Body) eventHub.NotifyRefresh() if time.Since(start) > 100*time.Millisecond { slog.Warn("Slow broadcast/enqueue", "duration", time.Since(start)) @@ -753,7 +687,7 @@ func main() { } // Embedded UI Server - uiServer := ui.NewServer(repo, metrics, svcGraph, vectorIdx) + uiServer := ui.NewServer(repo, metrics, svcGraph) uiServer.SetMCPConfig(cfg.MCPEnabled, cfg.MCPPath) if err := uiServer.RegisterRoutes(mux); err != nil { fatal("Failed to register UI routes", err) @@ -936,19 +870,6 @@ func main() { graphRAG.Stop() cancelGraphRAG() - // 3a. Cancel vectordb snapshot loop. The loop's ctx.Done branch fires a - // final WriteSnapshot before exit, capturing the maximum in-memory state - // (every Add() that drained from GraphRAG above is persisted). We wait - // briefly so the final snapshot hits disk before DB close — the snapshot - // is independent of repo, but ordered shutdown is cheaper than a stale - // snapshot on the next boot. - snapCancel() - select { - case <-snapDone: - case <-time.After(5 * time.Second): - slog.Warn("vectordb snapshot loop did not finish in 5s; final snapshot may be incomplete") - } - // 3a. Drain async ingest pipeline. gRPC GracefulStop above guarantees // no new Submits land; this blocks until workers finish in-flight // batches so a graceful shutdown doesn't lose buffered ingest. 
@@ -1083,30 +1004,6 @@ func initTracerProvider(endpoint string) (*sdktrace.TracerProvider, error) { return tp, nil } -// vectorReplayAdapter projects storage.Log into vectordb.ReplayRow so the -// vectordb package stays free of storage imports while still consuming the -// repository's tail-replay query. Lives at the wiring layer because both -// packages can be imported here, but neither imports the other. -type vectorReplayAdapter struct{ repo *storage.Repository } - -func (a vectorReplayAdapter) LogsForVectorReplay(ctx context.Context, sinceID uint, limit int) ([]vectordb.ReplayRow, error) { - logs, err := a.repo.LogsForVectorReplay(ctx, sinceID, limit) - if err != nil { - return nil, err - } - out := make([]vectordb.ReplayRow, len(logs)) - for i, l := range logs { - out[i] = vectordb.ReplayRow{ - ID: l.ID, - Tenant: l.TenantID, - ServiceName: l.ServiceName, - Severity: l.Severity, - Body: l.Body, - } - } - return out, nil -} - func printBanner() { banner := ` ___ _____ _____ _ From f8a6fa137eaf4e7b45a18d607620cd5f5c9fd896 Mon Sep 17 00:00:00 2001 From: Amit Kumar Date: Sun, 24 May 2026 18:57:26 +0000 Subject: [PATCH 03/11] refactor(graphrag): drop graph_snapshots table and snapshot scheduler MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The `graph_snapshots` table backed exactly one MCP tool (get_graph_snapshot, cut earlier in this PR) — no UI surface or REST endpoint reads it. With the tool gone the table is pure write amplification: at 15-minute cadence × ~100 tenants × per-row JSON nodes+edges blob it adds ~67k rows/week even after the 7-day age prune, and the row-count backstop only kicks in above 100k. On the SQLite OOM-within-an-hour deployment this contributes meaningfully to the 2 TB/day disk growth. Deletions: - internal/graphrag/snapshot.go (entire file): GraphSnapshot GORM model, takeSnapshot / takeSnapshotForTenant, pruneOldSnapshots, GetGraphSnapshot, maxSnapshotRows constant. 
- views.GraphSnapshot type + GraphSnapshotFromModel converter (only used by the removed test). - TestGraphRAG_GetGraphSnapshot_TenantScoped + the GraphSnapshot wire- shape leak test in views_test.go. Updates: - AutoMigrateGraphRAG no longer creates the table on fresh installs. graphRAGTables slice drops "graph_snapshots" so tenant-backfill skips it and the test asserting the per-table backfill no longer expects the row. - refresh.go::snapshotLoop now only calls persistDrainTemplates; the snapshotEvery field and the loop name are kept for wiring stability so external Config.SnapshotEvery still tunes the drain-persist cadence. Operator migration: existing graph_snapshots tables are LEFT IN PLACE on upgrade — AutoMigrate's IF NOT EXISTS semantics mean a populated table is not touched. Operators wanting to reclaim disk should `DROP TABLE graph_snapshots; VACUUM;` after upgrading. The table will stop receiving new writes immediately. --- internal/api/views/views.go | 24 ---- internal/api/views/views_test.go | 7 -- internal/graphrag/migrate.go | 10 +- internal/graphrag/migrate_test.go | 50 +------- internal/graphrag/refresh.go | 11 +- internal/graphrag/snapshot.go | 203 ------------------------------ main.go | 2 +- 7 files changed, 20 insertions(+), 287 deletions(-) delete mode 100644 internal/graphrag/snapshot.go diff --git a/internal/api/views/views.go b/internal/api/views/views.go index 270d9df..ccd40c4 100644 --- a/internal/api/views/views.go +++ b/internal/api/views/views.go @@ -198,17 +198,6 @@ type Investigation struct { SpanChain any `json:"span_chain"` } -// GraphSnapshot is the wire shape of a persisted topology snapshot. 
-type GraphSnapshot struct { - ID string `json:"id"` - CreatedAt time.Time `json:"created_at"` - Nodes any `json:"nodes"` - Edges any `json:"edges"` - ServiceCount int `json:"service_count"` - TotalCalls int64 `json:"total_calls"` - AvgHealthScore float64 `json:"avg_health_score"` -} - // --- Conversion functions --- // TraceFromModel converts a storage.Trace (with possibly-Preloaded children) @@ -469,19 +458,6 @@ func InvestigationFromModel(m graphrag.Investigation) Investigation { } } -// GraphSnapshotFromModel converts a persisted GraphRAG snapshot into its view. -func GraphSnapshotFromModel(m graphrag.GraphSnapshot) GraphSnapshot { - return GraphSnapshot{ - ID: m.ID, - CreatedAt: m.CreatedAt, - Nodes: rawToAny(m.Nodes), - Edges: rawToAny(m.Edges), - ServiceCount: m.ServiceCount, - TotalCalls: m.TotalCalls, - AvgHealthScore: m.AvgHealthScore, - } -} - // InvestigationsFromModels is the slice form of InvestigationFromModel. func InvestigationsFromModels(ms []graphrag.Investigation) []Investigation { out := make([]Investigation, len(ms)) diff --git a/internal/api/views/views_test.go b/internal/api/views/views_test.go index 99074fa..48d7eb3 100644 --- a/internal/api/views/views_test.go +++ b/internal/api/views/views_test.go @@ -164,13 +164,6 @@ func TestViews_NoGormBookkeepingLeaksThroughJSON(t *testing.T) { SpanChain: json.RawMessage(`[]`), }) assertNoLeak(t, "Investigation", inv, "tenant_id") - - gs := GraphSnapshotFromModel(graphrag.GraphSnapshot{ - ID: "snap1", CreatedAt: ts, - Nodes: json.RawMessage(`[]`), Edges: json.RawMessage(`[]`), - ServiceCount: 1, TotalCalls: 10, AvgHealthScore: 0.9, - }) - assertNoLeak(t, "GraphSnapshot", gs, "tenant_id") } // TestTraceView_PreservesJSONFieldNames asserts the exact JSON shape consumed by diff --git a/internal/graphrag/migrate.go b/internal/graphrag/migrate.go index 133593c..bab16d2 100644 --- a/internal/graphrag/migrate.go +++ b/internal/graphrag/migrate.go @@ -30,9 +30,13 @@ import ( "gorm.io/gorm" ) -// 
graphRAGTables are the three persisted tables that carry tenant_id after +// graphRAGTables are the persisted tables that carry tenant_id after // RAN-38. Order matches AutoMigrate order so log lines line up. -var graphRAGTables = []string{"investigations", "graph_snapshots", "drain_templates"} +// +// `graph_snapshots` was dropped from this list and from AutoMigrate on +// 2026-05-24; existing tables are left in place on operator databases (run +// `DROP TABLE graph_snapshots; VACUUM;` manually to reclaim disk). +var graphRAGTables = []string{"investigations", "drain_templates"} // AutoMigrateGraphRAG runs GORM auto-migration for GraphRAG models and // applies tenant backfill + drain_templates composite-PK promotion. Safe to @@ -41,7 +45,7 @@ func AutoMigrateGraphRAG(db *gorm.DB) error { if db == nil { return nil } - if err := db.AutoMigrate(&Investigation{}, &GraphSnapshot{}, &DrainTemplateRow{}); err != nil { + if err := db.AutoMigrate(&Investigation{}, &DrainTemplateRow{}); err != nil { return fmt.Errorf("graphrag automigrate: %w", err) } if err := backfillTenantIDs(db); err != nil { diff --git a/internal/graphrag/migrate_test.go b/internal/graphrag/migrate_test.go index 30762f6..031b762 100644 --- a/internal/graphrag/migrate_test.go +++ b/internal/graphrag/migrate_test.go @@ -2,7 +2,6 @@ package graphrag import ( "context" - "strings" "testing" "time" @@ -50,7 +49,6 @@ func TestAutoMigrateGraphRAG_CreatesTenantCompositeIndexes(t *testing.T) { index string }{ {"investigations", "idx_investigations_tenant_created"}, - {"graph_snapshots", "idx_graph_snapshots_tenant_created"}, } for _, tc := range expected { var count int @@ -115,16 +113,13 @@ func TestAutoMigrateGraphRAG_BackfillsLegacyRows(t *testing.T) { if err := AutoMigrateGraphRAG(db); err != nil { t.Fatalf("first migrate: %v", err) } - // Insert rows with empty tenant_id directly via raw SQL — Investigation, - // GraphSnapshot and DrainTemplateRow's GORM defaults would otherwise fill - // the column on insert. 
+ // Insert rows with empty tenant_id directly via raw SQL — Investigation and + // DrainTemplateRow's GORM defaults would otherwise fill the column on + // insert. now := time.Now().UTC() if err := db.Exec(`INSERT INTO investigations (tenant_id, id, created_at, status, severity, trigger_service, trigger_operation, error_message, root_service, root_operation, causal_chain, trace_ids, error_logs, anomalous_metrics, affected_services, span_chain) VALUES ('', 'inv_legacy', ?, 'detected', 'warning', 'svc', 'op', 'boom', 'svc', 'op', '[]', '[]', '[]', '[]', '[]', '[]')`, now).Error; err != nil { t.Fatalf("seed legacy investigation: %v", err) } - if err := db.Exec(`INSERT INTO graph_snapshots (tenant_id, id, created_at, nodes, edges, service_count, total_calls, avg_health_score) VALUES ('', 'snap_legacy', ?, '[]', '[]', 0, 0, 0)`, now).Error; err != nil { - t.Fatalf("seed legacy snapshot: %v", err) - } // Drain rows: tenant_id is part of the PK so we must give it *something* // — empty string is allowed by SQLite. The backfill is expected to fix it. 
if err := db.Exec(`INSERT INTO drain_templates (tenant_id, id, tokens, count, first_seen, last_seen, sample) VALUES ('', 1, '["a","b"]', 1, ?, ?, 'sample')`, now, now).Error; err != nil { @@ -138,7 +133,7 @@ func TestAutoMigrateGraphRAG_BackfillsLegacyRows(t *testing.T) { for _, tbl := range graphRAGTables { var stragglers int - if err := db.Raw(`SELECT COUNT(*) FROM `+tbl+` WHERE tenant_id IS NULL OR tenant_id = ''`).Scan(&stragglers).Error; err != nil { + if err := db.Raw(`SELECT COUNT(*) FROM ` + tbl + ` WHERE tenant_id IS NULL OR tenant_id = ''`).Scan(&stragglers).Error; err != nil { t.Fatalf("count empty tenant in %s: %v", tbl, err) } if stragglers != 0 { @@ -258,40 +253,3 @@ func TestGraphRAG_GetInvestigations_TenantScoped(t *testing.T) { t.Errorf("expected globex row; got tenant=%q", got.TenantID) } } - -// TestGraphRAG_GetGraphSnapshot_TenantScoped seeds two snapshots (one per -// tenant) at the same instant and asserts each tenant only retrieves its own. -func TestGraphRAG_GetGraphSnapshot_TenantScoped(t *testing.T) { - g, db := newTestGraphRAGWithDB(t) - if err := AutoMigrateGraphRAG(db); err != nil { - t.Fatalf("migrate: %v", err) - } - now := time.Now().UTC() - for _, tenant := range []string{"acme", "globex"} { - snap := GraphSnapshot{ - TenantID: tenant, - ID: "snap_" + tenant, - CreatedAt: now, - Nodes: []byte(`[]`), - Edges: []byte(`[]`), - ServiceCount: 1, - AvgHealthScore: 1, - } - if err := db.Create(&snap).Error; err != nil { - t.Fatalf("seed %s: %v", tenant, err) - } - } - for _, tenant := range []string{"acme", "globex"} { - ctx := storage.WithTenantContext(context.Background(), tenant) - snap, err := g.GetGraphSnapshot(ctx, now.Add(time.Second)) - if err != nil { - t.Fatalf("get %s: %v", tenant, err) - } - if snap.TenantID != tenant { - t.Errorf("ctx %s returned snapshot for tenant %q", tenant, snap.TenantID) - } - if !strings.HasSuffix(snap.ID, tenant) { - t.Errorf("ctx %s returned snapshot id %s", tenant, snap.ID) - } - } -} diff --git 
a/internal/graphrag/refresh.go b/internal/graphrag/refresh.go index b16d8bc..14b5974 100644 --- a/internal/graphrag/refresh.go +++ b/internal/graphrag/refresh.go @@ -49,7 +49,14 @@ func (g *GraphRAG) refreshLoop(ctx context.Context) { } } -// snapshotLoop takes periodic snapshots and prunes old ones. +// snapshotLoop persists Drain templates on the configured cadence so a +// restart recovers the learned templates instead of rebuilding from scratch. +// +// Historically this loop also captured a periodic GraphSnapshot row into +// the `graph_snapshots` table and pruned aged-out snapshots; both were +// removed on 2026-05-24 alongside the get_graph_snapshot MCP tool. The +// `snapshotLoop` / `snapshotEvery` names are retained for wiring stability +// — callers still tune the persistence cadence via `Config.SnapshotEvery`. func (g *GraphRAG) snapshotLoop(ctx context.Context) { ticker := time.NewTicker(g.snapshotEvery) defer ticker.Stop() @@ -60,8 +67,6 @@ func (g *GraphRAG) snapshotLoop(ctx context.Context) { case <-g.stopCh: return case <-ticker.C: - g.takeSnapshot(ctx) - g.pruneOldSnapshots() g.persistDrainTemplates() } } diff --git a/internal/graphrag/snapshot.go b/internal/graphrag/snapshot.go deleted file mode 100644 index da13598..0000000 --- a/internal/graphrag/snapshot.go +++ /dev/null @@ -1,203 +0,0 @@ -package graphrag - -import ( - "context" - "encoding/json" - "fmt" - "log/slog" - "time" - - "github.com/RandomCodeSpace/otelcontext/internal/storage" -) - -// GraphSnapshot is a periodic snapshot of the service topology persisted to DB. -// -// TenantID scopes the row to the tenant slice it was captured from. The -// composite (tenant_id, created_at) index supports the -// "most recent snapshot at-or-before T for tenant X" lookup that -// GetGraphSnapshot runs on every read. 
-type GraphSnapshot struct { - TenantID string `gorm:"size:64;default:'default';not null;index:idx_graph_snapshots_tenant_created,priority:1" json:"tenant_id"` - ID string `gorm:"primaryKey;size:64" json:"id"` - CreatedAt time.Time `gorm:"index:idx_graph_snapshots_tenant_created,priority:2" json:"created_at"` - Nodes json.RawMessage `gorm:"type:text" json:"nodes"` - Edges json.RawMessage `gorm:"type:text" json:"edges"` - ServiceCount int `json:"service_count"` - TotalCalls int64 `json:"total_calls"` - AvgHealthScore float64 `json:"avg_health_score"` -} - -// TableName overrides GORM's default table name. -func (GraphSnapshot) TableName() string { - return "graph_snapshots" -} - -// snapshotNode is a lightweight node representation for snapshots. -type snapshotNode struct { - ID string `json:"id"` - Type string `json:"type"` - Name string `json:"name"` - HealthScore float64 `json:"health_score"` - ErrorRate float64 `json:"error_rate"` - AvgLatency float64 `json:"avg_latency_ms"` -} - -// snapshotEdge is a lightweight edge representation for snapshots. -type snapshotEdge struct { - From string `json:"from"` - To string `json:"to"` - Type string `json:"type"` - Weight float64 `json:"weight"` - CallCount int64 `json:"call_count"` - ErrorRate float64 `json:"error_rate"` -} - -// takeSnapshot captures each tenant's current service topology and persists -// one row per tenant per tick. See the note on GraphSnapshot regarding the -// upcoming tenant_id column in Subtask B. 
-func (g *GraphRAG) takeSnapshot(ctx context.Context) { - for tenant, stores := range g.snapshotTenants() { - tctx := storage.WithTenantContext(ctx, tenant) - g.takeSnapshotForTenant(tctx, tenant, stores) - } -} - -func (g *GraphRAG) takeSnapshotForTenant(_ context.Context, tenant string, stores *tenantStores) { - services := stores.service.AllServices() - edges := stores.service.AllEdges() - - if len(services) == 0 { - return - } - - var nodes []snapshotNode - var totalCalls int64 - var totalHealth float64 - - for _, svc := range services { - nodes = append(nodes, snapshotNode{ - ID: svc.ID, - Type: "service", - Name: svc.Name, - HealthScore: svc.HealthScore, - ErrorRate: svc.ErrorRate, - AvgLatency: svc.AvgLatency, - }) - totalCalls += svc.CallCount - totalHealth += svc.HealthScore - } - - // Also include operations for this tenant. - stores.service.mu.RLock() - for _, op := range stores.service.Operations { - nodes = append(nodes, snapshotNode{ - ID: op.ID, - Type: "operation", - Name: op.Operation, - HealthScore: op.HealthScore, - ErrorRate: op.ErrorRate, - AvgLatency: op.AvgLatency, - }) - } - stores.service.mu.RUnlock() - - var snapEdges []snapshotEdge - for _, e := range edges { - snapEdges = append(snapEdges, snapshotEdge{ - From: e.FromID, - To: e.ToID, - Type: string(e.Type), - Weight: e.Weight, - CallCount: e.CallCount, - ErrorRate: e.ErrorRate, - }) - } - - nodesJSON, _ := json.Marshal(nodes) - edgesJSON, _ := json.Marshal(snapEdges) - - snap := GraphSnapshot{ - TenantID: tenant, - ID: fmt.Sprintf("snap_%s_%d", tenant, time.Now().UnixNano()), - CreatedAt: time.Now(), - Nodes: nodesJSON, - Edges: edgesJSON, - ServiceCount: len(services), - TotalCalls: totalCalls, - AvgHealthScore: totalHealth / float64(len(services)), - } - - if g.repo == nil || g.repo.DB() == nil { - return - } - if err := g.repo.DB().Create(&snap).Error; err != nil { - slog.Error("Failed to persist graph snapshot", "tenant", tenant, "error", err) - return - } - - slog.Debug("Graph 
snapshot persisted", - "tenant", tenant, - "services", len(services), - "edges", len(snapEdges), - ) -} - -// maxSnapshotRows is a row-count backstop on `graph_snapshots` to prevent -// unbounded disk growth when the write rate outruns the 7-day age prune. -// Steady state at 15-min cadence × 100 tenants is ~67k rows/week, so 100k -// gives ~50% headroom — high enough to never trigger under normal operation, -// low enough to bound disk if a misconfig or tenant explosion runs the -// snapshotter hot. -const maxSnapshotRows = 100_000 - -// pruneOldSnapshots removes snapshots older than 7 days, then enforces a -// row-count backstop in case the by-age prune isn't keeping up. -func (g *GraphRAG) pruneOldSnapshots() { - if g.repo == nil || g.repo.DB() == nil { - return - } - cutoff := time.Now().AddDate(0, 0, -7) - result := g.repo.DB().Where("created_at < ?", cutoff).Delete(&GraphSnapshot{}) - if result.Error != nil { - slog.Error("Failed to prune old snapshots", "error", result.Error) - } else if result.RowsAffected > 0 { - slog.Info("Pruned old graph snapshots", "count", result.RowsAffected) - } - - var count int64 - if err := g.repo.DB().Model(&GraphSnapshot{}).Count(&count).Error; err != nil { - slog.Error("Failed to count snapshots for row-cap prune", "error", err) - return - } - if count <= maxSnapshotRows { - return - } - excess := count - maxSnapshotRows - // Subquery selects the N oldest IDs, then deletes that set. Portable - // across SQLite and Postgres; avoids a multi-statement transaction. 
- sub := g.repo.DB().Model(&GraphSnapshot{}).Select("id").Order("created_at ASC").Limit(int(excess)) - if err := g.repo.DB().Where("id IN (?)", sub).Delete(&GraphSnapshot{}).Error; err != nil { - slog.Error("Failed to row-cap prune snapshots", "error", err) - return - } - slog.Warn("graphrag: row-cap pruned snapshots (write rate exceeded by-age prune)", - "deleted", excess, - "cap", maxSnapshotRows, - ) -} - -// GetGraphSnapshot retrieves the snapshot closest to the requested time, -// scoped to the tenant carried by ctx. The composite (tenant_id, created_at) -// index supports the descending lookup. -func (g *GraphRAG) GetGraphSnapshot(ctx context.Context, at time.Time) (*GraphSnapshot, error) { - tenant := storage.TenantFromContext(ctx) - var snap GraphSnapshot - err := g.repo.DB(). - Where("tenant_id = ? AND created_at <= ?", tenant, at). - Order("created_at DESC"). - First(&snap).Error - if err != nil { - return nil, err - } - return &snap, nil -} diff --git a/main.go b/main.go index a0b938a..217e887 100644 --- a/main.go +++ b/main.go @@ -379,7 +379,7 @@ func main() { "event_queue_size", cfg.GraphRAGEventQueueSize, ) - // Auto-migrate GraphRAG models (Investigation, GraphSnapshot) + // Auto-migrate GraphRAG models (Investigation, DrainTemplateRow) if err := graphrag.AutoMigrateGraphRAG(repo.DB()); err != nil { slog.Error("Failed to migrate GraphRAG models", "error", err) } From 385b0151aae6649a8d7bd71ff61266d9c4dc25e6 Mon Sep 17 00:00:00 2001 From: Amit Kumar Date: Sun, 24 May 2026 19:00:05 +0000 Subject: [PATCH 04/11] feat(sqlite): PRAGMA tuning + per-driver config defaults MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Makes the platform survivable at 120 services on SQLite, the target the prior commits in this PR have been shaving heap and disk pressure for. Two coordinated changes: 1. 
SQLite PRAGMA stanza in factory.go is hardened from 3 to 8 settings and made fail-closed: PRAGMA journal_mode=WAL PRAGMA synchronous=NORMAL PRAGMA cache_size=-262144 # 256 MB page cache PRAGMA temp_store=MEMORY PRAGMA mmap_size=1073741824 # 1 GB mmap PRAGMA wal_autocheckpoint=10000 # checkpoint after 10k pages PRAGMA journal_size_limit=67108864 # cap WAL at 64 MB PRAGMA busy_timeout=5000 Each PRAGMA failure now aborts startup with a wrapped error (`sqlite pragma %q failed: %w`) so an unexpected SQLite build that doesn't honour, e.g. mmap_size, can't silently regress the platform to default-tuned behaviour. 2. config.Load now runs `applyDriverDefaults(cfg)` after constructing the Config struct. When DBDriver=sqlite (case-insensitive) AND the operator did not explicitly set the env var (detected via os.LookupEnv presence — value comparison would falsely treat operator-set Postgres-default values as "unset"), the following defaults flip: DB_MAX_OPEN_CONNS 50 → 1 DB_MAX_IDLE_CONNS 10 → 1 INGEST_PIPELINE_WORKERS 8 → 2 INGEST_PIPELINE_QUEUE_SIZE 50000 → 10000 METRIC_MAX_CARDINALITY 10000 → 3000 STORE_MIN_SEVERITY "" → "WARN" SAMPLING_RATE 1.0 → 0.05 GRPC_MAX_CONCURRENT_STREAMS 1000 → 240 LOG_FTS_ENABLED false → true Postgres/MSSQL/MySQL paths are unchanged bit-for-bit (early-return in applyDriverDefaults). The applyDriverDefaults override is unit-tested for: the all-flip path, the "respect explicit operator override" path, the Postgres no-op path, and case-insensitive driver matching. 
Design rationale and per-default justification: docs/superpowers/specs/2026-05-24-mcp-7tool-sqlite-survival-design.md --- internal/config/config.go | 53 +++++++- internal/config/driver_defaults_test.go | 154 ++++++++++++++++++++++++ internal/storage/factory.go | 30 ++++- 3 files changed, 231 insertions(+), 6 deletions(-) create mode 100644 internal/config/driver_defaults_test.go diff --git a/internal/config/config.go b/internal/config/config.go index ab6817d..4acb6be 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -225,7 +225,7 @@ func Load(customPath string) (*Config, error) { } env := getEnv("APP_ENV", "development") - return &Config{ + cfg := &Config{ Env: env, DevMode: env == "development", LogLevel: getEnv("LOG_LEVEL", "INFO"), @@ -326,7 +326,56 @@ func Load(customPath string) (*Config, error) { // Production safety guard for SQLite AllowSqliteProd: parseTruthy(getEnv("OTELCONTEXT_ALLOW_SQLITE_PROD", "")), - }, nil + } + applyDriverDefaults(cfg) + return cfg, nil +} + +// applyDriverDefaults flips defaults on a freshly-Load()'d Config when the +// driver is SQLite AND the operator did not explicitly set the env var. +// Postgres/MSSQL/MySQL defaults are unchanged. +// +// The platform's stock defaults are tuned for Postgres at 100k events/sec +// with a parallel writer pool. On SQLite those same defaults overrun the +// single-writer lock and inflate heap until the process OOMs — see +// docs/superpowers/specs/2026-05-24-mcp-7tool-sqlite-survival-design.md. +// This override gives the SQLite path a survivable starting point at +// 120 services while preserving the existing Postgres path bit-for-bit. +// +// "Explicit operator override" is detected via os.LookupEnv (presence) +// rather than value comparison so that, e.g., DB_MAX_OPEN_CONNS=50 set by +// hand is still honoured even though it equals the Postgres default. 
+func applyDriverDefaults(cfg *Config) { + if !strings.EqualFold(cfg.DBDriver, "sqlite") { + return + } + if _, ok := os.LookupEnv("DB_MAX_OPEN_CONNS"); !ok { + cfg.DBMaxOpenConns = 1 + } + if _, ok := os.LookupEnv("DB_MAX_IDLE_CONNS"); !ok { + cfg.DBMaxIdleConns = 1 + } + if _, ok := os.LookupEnv("INGEST_PIPELINE_WORKERS"); !ok { + cfg.IngestPipelineWorkers = 2 + } + if _, ok := os.LookupEnv("INGEST_PIPELINE_QUEUE_SIZE"); !ok { + cfg.IngestPipelineQueueSize = 10000 + } + if _, ok := os.LookupEnv("METRIC_MAX_CARDINALITY"); !ok { + cfg.MetricMaxCardinality = 3000 + } + if _, ok := os.LookupEnv("STORE_MIN_SEVERITY"); !ok { + cfg.StoreMinSeverity = "WARN" + } + if _, ok := os.LookupEnv("SAMPLING_RATE"); !ok { + cfg.SamplingRate = 0.05 + } + if _, ok := os.LookupEnv("GRPC_MAX_CONCURRENT_STREAMS"); !ok { + cfg.GRPCMaxConcurrentStreams = 240 + } + if _, ok := os.LookupEnv("LOG_FTS_ENABLED"); !ok { + cfg.LogFTSEnabled = true + } } func getEnv(key, fallback string) string { diff --git a/internal/config/driver_defaults_test.go b/internal/config/driver_defaults_test.go new file mode 100644 index 0000000..baa6407 --- /dev/null +++ b/internal/config/driver_defaults_test.go @@ -0,0 +1,154 @@ +package config + +import ( + "os" + "testing" +) + +// sqliteEnvKeys is the set of env vars whose defaults applyDriverDefaults +// flips when the driver is SQLite. Cleared via t.Setenv before each test so a +// stray host-env value doesn't leak in. +var sqliteEnvKeys = []string{ + "DB_MAX_OPEN_CONNS", + "DB_MAX_IDLE_CONNS", + "INGEST_PIPELINE_WORKERS", + "INGEST_PIPELINE_QUEUE_SIZE", + "METRIC_MAX_CARDINALITY", + "STORE_MIN_SEVERITY", + "SAMPLING_RATE", + "GRPC_MAX_CONCURRENT_STREAMS", + "LOG_FTS_ENABLED", +} + +// clearSQLiteEnv unsets every env var consulted by applyDriverDefaults so +// the test starts from a deterministic "operator set nothing" baseline. 
+func clearSQLiteEnv(t *testing.T) { + t.Helper() + for _, k := range sqliteEnvKeys { + // t.Setenv registers a cleanup that restores the variable's original value + // when the test ends; the explicit os.Unsetenv below then removes it for + // the duration of the test. Variables that are already unset are skipped. + if _, ok := os.LookupEnv(k); ok { + old := os.Getenv(k) + t.Setenv(k, old) // record original for revert + if err := os.Unsetenv(k); err != nil { + t.Fatalf("unset %s: %v", k, err) + } + } + } +} + +// TestApplyDriverDefaults_SQLite_FlipsAllWhenNoEnv proves the post-Load() +// override fires when the driver is SQLite and the operator did not set +// any of the overridable env vars. +func TestApplyDriverDefaults_SQLite_FlipsAllWhenNoEnv(t *testing.T) { + clearSQLiteEnv(t) + cfg := &Config{ + DBDriver: "sqlite", + DBMaxOpenConns: 50, // Postgres default + DBMaxIdleConns: 10, // Postgres default + IngestPipelineWorkers: 8, // Postgres default + IngestPipelineQueueSize: 50000, // Postgres default + MetricMaxCardinality: 10000, // Postgres default + StoreMinSeverity: "", // same-as-ingest default + SamplingRate: 1.0, // keep-all default + GRPCMaxConcurrentStreams: 1000, // Postgres default + LogFTSEnabled: false, // FTS5 opt-in default + } + applyDriverDefaults(cfg) + + cases := []struct { + name string + got any + want any + }{ + {"DBMaxOpenConns", cfg.DBMaxOpenConns, 1}, + {"DBMaxIdleConns", cfg.DBMaxIdleConns, 1}, + {"IngestPipelineWorkers", cfg.IngestPipelineWorkers, 2}, + {"IngestPipelineQueueSize", cfg.IngestPipelineQueueSize, 10000}, + {"MetricMaxCardinality", cfg.MetricMaxCardinality, 3000}, + {"StoreMinSeverity", cfg.StoreMinSeverity, "WARN"}, + {"SamplingRate", cfg.SamplingRate, 0.05}, + {"GRPCMaxConcurrentStreams", cfg.GRPCMaxConcurrentStreams, 240}, + {"LogFTSEnabled", cfg.LogFTSEnabled, true}, + } + for _, c := range cases { + if c.got != c.want { + t.Errorf("%s: SQLite override = %v, want %v", c.name, c.got, c.want) + } + } +} + +// 
TestApplyDriverDefaults_SQLite_RespectsExplicitOverride proves that an +// operator-set env var is preserved even when its value equals the Postgres +// default. The presence check is via os.LookupEnv, not a value comparison. +func TestApplyDriverDefaults_SQLite_RespectsExplicitOverride(t *testing.T) { + clearSQLiteEnv(t) + t.Setenv("DB_MAX_OPEN_CONNS", "50") // explicit override, equal to Postgres default + t.Setenv("SAMPLING_RATE", "1.0") // explicit "keep all" + + cfg := &Config{ + DBDriver: "sqlite", + DBMaxOpenConns: 50, // operator-set value + SamplingRate: 1.0, // operator-set value + // rest unset so we can confirm the others still flip + } + applyDriverDefaults(cfg) + + if cfg.DBMaxOpenConns != 50 { + t.Errorf("explicit DB_MAX_OPEN_CONNS=50 was clobbered to %d", cfg.DBMaxOpenConns) + } + if cfg.SamplingRate != 1.0 { + t.Errorf("explicit SAMPLING_RATE=1.0 was clobbered to %f", cfg.SamplingRate) + } + // And a field with no env override still flips + if cfg.MetricMaxCardinality != 3000 { + t.Errorf("MetricMaxCardinality should have flipped to 3000, got %d", cfg.MetricMaxCardinality) + } +} + +// TestApplyDriverDefaults_Postgres_NoChange proves the Postgres / Postgresql +// drivers are untouched by this override regardless of env state. 
+func TestApplyDriverDefaults_Postgres_NoChange(t *testing.T) { + clearSQLiteEnv(t) + for _, drv := range []string{"postgres", "postgresql", "Postgres", "POSTGRES"} { + t.Run(drv, func(t *testing.T) { + cfg := &Config{ + DBDriver: drv, + DBMaxOpenConns: 50, + DBMaxIdleConns: 10, + IngestPipelineWorkers: 8, + IngestPipelineQueueSize: 50000, + MetricMaxCardinality: 10000, + StoreMinSeverity: "", + SamplingRate: 1.0, + GRPCMaxConcurrentStreams: 1000, + LogFTSEnabled: false, + } + before := *cfg + applyDriverDefaults(cfg) + if *cfg != before { + t.Errorf("Postgres driver %q was mutated by SQLite override: %+v → %+v", drv, before, *cfg) + } + }) + } +} + +// TestApplyDriverDefaults_SQLite_CaseInsensitive proves the driver-name +// match is case-insensitive so SQLite / sqlite / SQLITE all trip the +// override. +func TestApplyDriverDefaults_SQLite_CaseInsensitive(t *testing.T) { + clearSQLiteEnv(t) + for _, drv := range []string{"sqlite", "SQLite", "SQLITE"} { + t.Run(drv, func(t *testing.T) { + cfg := &Config{ + DBDriver: drv, + DBMaxOpenConns: 50, + } + applyDriverDefaults(cfg) + if cfg.DBMaxOpenConns != 1 { + t.Errorf("driver=%q SQLite override missed; DBMaxOpenConns=%d", drv, cfg.DBMaxOpenConns) + } + }) + } +} diff --git a/internal/storage/factory.go b/internal/storage/factory.go index 8ba94a9..8e4f178 100644 --- a/internal/storage/factory.go +++ b/internal/storage/factory.go @@ -96,11 +96,33 @@ func NewDatabase(driver, dsn string) (*gorm.DB, error) { return nil, fmt.Errorf("failed to connect to database (%s): %s", driver, scrubDSN(err.Error())) } - // SQLite pragmas must be set via Exec (glebarez/sqlite doesn't support _pragma DSN params) + // SQLite pragmas — set via Exec because glebarez/sqlite doesn't honour + // _pragma DSN params. Applied fail-closed: any PRAGMA failure aborts + // startup with a wrapped error so an unexpected SQLite build that doesn't + // support, e.g. mmap_size cannot silently regress the platform to + // default-tuned behaviour. 
The set was hardened on 2026-05-24 to make + // the platform survivable at 120 services on SQLite. + // + // cache_size=-262144 = 256 MB page cache (negative = KiB). + // mmap_size=1073741824 = 1 GB memory-mapped read window. + // wal_autocheckpoint=10000 = checkpoint after 10k pages so WAL stays bounded. + // journal_size_limit=67108864 = truncate the WAL file back to 64 MB after a checkpoint (not a hard cap mid-write). if strings.ToLower(driver) == "sqlite" || driver == "" { - db.Exec("PRAGMA journal_mode=WAL") - db.Exec("PRAGMA busy_timeout=5000") - db.Exec("PRAGMA synchronous=NORMAL") + pragmas := []string{ + "PRAGMA journal_mode=WAL", + "PRAGMA synchronous=NORMAL", + "PRAGMA cache_size=-262144", + "PRAGMA temp_store=MEMORY", + "PRAGMA mmap_size=1073741824", + "PRAGMA wal_autocheckpoint=10000", + "PRAGMA journal_size_limit=67108864", + "PRAGMA busy_timeout=5000", + } + for _, p := range pragmas { + if err := db.Exec(p).Error; err != nil { + return nil, fmt.Errorf("sqlite pragma %q failed: %w", p, err) + } + } } // Configure Connection Pool — configurable via env vars for non-SQLite drivers. From 01a84ed6cb1ff25c62da6befa47f0e3eabe29645 Mon Sep 17 00:00:00 2001 From: Amit Kumar Date: Sun, 24 May 2026 19:06:44 +0000 Subject: [PATCH 05/11] docs: 7-tool MCP surface and SQLite operator notes Updates the operator-facing documentation to reflect the refactor in this PR: - CLAUDE.md "MCP Server" section rewritten to describe the 7-tool triage surface (kept + cut lists). The architecture diagram drops the legacy Vector accelerator layer. The "Storage Architecture", "GraphRAG Architecture" (background processes, persistence models, log clustering), and "Key Directories" sections drop their vectordb / graph_snapshots mentions. A new "SQLite per-driver defaults" section documents the nine env-var overrides flipped by applyDriverDefaults and the eight PRAGMAs applied at startup. - LOG_FTS_ENABLED entry rewritten to document the new SQLite-default `true` (with the LIKE-fallback / drop_fts reclaim path preserved). 
- STORE_MIN_SEVERITY entry notes the new SQLite-default `"WARN"`. - README.md "Features" bullet swaps "21 tools" for the 7-tool triage surface and inlines the kept tool names. - .env.example drops the VECTOR_INDEX_* block, adds a "SQLite Tuning" block listing every auto-flipped default, and notes the 7-tool MCP surface under the MCP section. - The design spec at docs/superpowers/specs/2026-05-24-mcp-7tool-sqlite-survival-design.md is the canonical record of the refactor's rationale, decision matrix, per-default justification, migration notes, and risk/mitigation table. --- .env.example | 33 ++- CLAUDE.md | 86 +++++--- README.md | 2 +- ...-05-24-mcp-7tool-sqlite-survival-design.md | 201 ++++++++++++++++++ 4 files changed, 282 insertions(+), 40 deletions(-) create mode 100644 docs/superpowers/specs/2026-05-24-mcp-7tool-sqlite-survival-design.md diff --git a/.env.example b/.env.example index 8ef86d5..c893ff0 100644 --- a/.env.example +++ b/.env.example @@ -35,10 +35,29 @@ # DB_AUTOMIGRATE=true # GORM AutoMigrate on startup. Set false in Postgres prod (schema out-of-band) # ---- Database Pool ---------------------------------------------------------- -# DB_MAX_OPEN_CONNS=50 # Max concurrent DB connections -# DB_MAX_IDLE_CONNS=10 # Idle connections kept in pool +# DB_MAX_OPEN_CONNS=50 # Max concurrent DB connections (SQLite default 1; SQLite is single-writer) +# DB_MAX_IDLE_CONNS=10 # Idle connections kept in pool (SQLite default 1) # DB_CONN_MAX_LIFETIME=1h # Conn recycle window. Internally capped to 30m when DB_AZURE_AUTH=true +# ---- SQLite Tuning (auto-applied when DB_DRIVER=sqlite) --------------------- +# The platform flips several defaults when running on SQLite so a 100+ service +# deployment survives without OOM. Each override is skipped if the operator +# explicitly sets the env var. Postgres/MSSQL paths are untouched. 
+# +# DB_MAX_OPEN_CONNS 50 → 1 +# DB_MAX_IDLE_CONNS 10 → 1 +# INGEST_PIPELINE_WORKERS 8 → 2 +# INGEST_PIPELINE_QUEUE_SIZE 50000 → 10000 +# METRIC_MAX_CARDINALITY 10000 → 3000 +# STORE_MIN_SEVERITY "" → "WARN" (INFO/DEBUG still flow to GraphRAG/Drain, just not persisted) +# SAMPLING_RATE 1.0 → 0.05 (errors and slow spans always kept) +# GRPC_MAX_CONCURRENT_STREAMS 1000 → 240 (~2 streams per service at 120 services) +# LOG_FTS_ENABLED false → true (FTS5 BM25 search; ~30% disk overhead — set false to reclaim) +# +# Override by setting the env var explicitly. See +# docs/superpowers/specs/2026-05-24-mcp-7tool-sqlite-survival-design.md for +# per-default rationale and the SQLite PRAGMA stanza applied at startup. + # ---- Azure Entra (passwordless Postgres) ------------------------------------ # DB_AZURE_AUTH=false # Enables DefaultAzureCredential for Postgres. Requires strict TLS # # (sslmode=require|verify-ca|verify-full). DSN must omit password. @@ -73,10 +92,9 @@ # ---- TSDB ------------------------------------------------------------------- # TSDB_RING_BUFFER_DURATION=1h # In-memory metric ring buffer window (e.g. 
30m, 2h) -# ---- GraphRAG / Cardinality / Vector ---------------------------------------- +# ---- GraphRAG / Cardinality ------------------------------------------------- # METRIC_ATTRIBUTE_KEYS= # CSV allowlist of attribute keys included in metric series key -# METRIC_MAX_CARDINALITY=10000 # Max unique series per metric; new series dropped above this -# VECTOR_INDEX_MAX_ENTRIES=100000 # TF-IDF index capacity (FIFO eviction) +# METRIC_MAX_CARDINALITY=10000 # Max unique series per metric (Postgres default; SQLite default 3000) # ---- DLQ (Dead Letter Queue) ------------------------------------------------ # DLQ_PATH=./data/dlq # Directory for typed-envelope files @@ -91,6 +109,11 @@ # ---- MCP Server ------------------------------------------------------------- # MCP_ENABLED=true # Expose MCP JSON-RPC 2.0 (POST) + SSE (GET) for AI agents # MCP_PATH=/mcp # Mount path +# +# Triage surface (7 tools): get_anomaly_timeline, get_service_map, +# get_service_health, root_cause_analysis, impact_analysis, trace_graph, +# search_logs. Reduced from 21 → 7 tools on 2026-05-24; see +# docs/superpowers/specs/2026-05-24-mcp-7tool-sqlite-survival-design.md. # ---- Compression ------------------------------------------------------------ # COMPRESSION_LEVEL=default # default|fast|best — zstd level for compressed columns diff --git a/CLAUDE.md b/CLAUDE.md index a410516..a12f397 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -25,12 +25,11 @@ HTTP :8080/v1/* (OTLP HTTP)─┘ │ │ ▼ ▼ In-Memory Accel. Relational DB (TSDB Ring, (Source of Truth, - GraphRAG, 7-15 day retention) - Vector) + GraphRAG) 7-15 day retention) │ HTTP :8080 ◄── REST API ◄───────────┘ ◄── WebSocket (real-time) - ◄── MCP Server (AI agents, 21 tools) + ◄── MCP Server (AI agents, 7-tool triage surface) ◄── Prometheus /metrics ``` @@ -59,8 +58,7 @@ When none are present, `DEFAULT_TENANT` (default `"default"`) is assigned. 
Every | GraphRAG (in-memory) | `internal/graphrag/` | Layered graph: 4 typed stores, error chains, root cause analysis, anomaly detection | | Time Series (in-memory) | `internal/tsdb/` | Ring buffer, sliding windows, pre-computed percentiles | | Graph (in-memory, legacy) | `internal/graph/` | Simple service topology — **being replaced by GraphRAG** | -| Vector (embedded) | `internal/vectordb/` | TF-IDF index for semantic log search (pure Go, no CGO). Persisted across restarts via gob+CRC32 snapshot (default `data/vectordb.snapshot`, 5m interval) plus a startup tail-replay from the DB so the index is warm before listeners accept traffic — eliminating the legacy minutes of cold-start blindness. `find_similar_logs` and `SimilarErrors` (within a Drain template cluster) are the read-side consumers. | -| Relational (persistent) | `internal/storage/` | GORM-based, multi-DB, single source of truth. Driven by `RetentionScheduler` (hourly batched purge + daily VACUUM/ANALYZE). `logs.body` is plain TEXT. **Log search**: vectordb (TF-IDF) is the default semantic-search path. Optional SQLite FTS5 (`logs_fts`, porter+unicode61, ordered by `bm25()`, AFTER INSERT/DELETE/UPDATE triggers) is **opt-in via `LOG_FTS_ENABLED=true`** and disabled by default — operators who toggle it off can reclaim the FTS table + indexes via `POST /api/admin/drop_fts`. Postgres uses `pg_trgm` GIN on `logs.body` and `logs.service_name`. `AttributesJSON` and `AIInsight` remain `CompressedText`. The `search_logs` MCP tool and the API `/api/logs?q=…` filter are clamped to the **last 24 hours** to bound the LIKE-fallback worst case. | +| Relational (persistent) | `internal/storage/` | GORM-based, multi-DB, single source of truth. Driven by `RetentionScheduler` (hourly batched purge + daily VACUUM/ANALYZE). `logs.body` is plain TEXT. 
**Log search**: SQLite FTS5 (`logs_fts`, porter+unicode61, ordered by `bm25()`, AFTER INSERT/DELETE/UPDATE triggers) is the default path — `LOG_FTS_ENABLED` defaults to `true` when `DB_DRIVER=sqlite` and `false` otherwise. Operators who want the ~30% disk savings can set `LOG_FTS_ENABLED=false` and reclaim the FTS table + indexes via `POST /api/admin/drop_fts`. Postgres uses `pg_trgm` GIN on `logs.body` and `logs.service_name`. `AttributesJSON` and `AIInsight` remain `CompressedText`. The `search_logs` MCP tool and the API `/api/logs?q=…` filter are clamped to the **last 24 hours** to bound the LIKE-fallback worst case. The `vectordb` package (TF-IDF semantic search) was removed on 2026-05-24 alongside the `find_similar_logs` MCP tool — `data/vectordb.snapshot` is left on disk for operators to delete by hand. | ## GraphRAG Architecture @@ -91,23 +89,23 @@ The `internal/graphrag/` package is the core intelligence layer. It replaces the | `CorrelatedSignals(service, timeRange)` | Gather all edges | Related logs/metrics/traces | | `ShortestPath(from, to)` | Dijkstra weighted by inverse call freq | Service communication path | | `AnomalyTimeline(since)` | Time-sorted anomalies + PRECEDED_BY | Recent anomaly overview | -| `SimilarErrors(clusterID, k)` | k-NN cosine similarity via vectordb | Related error patterns | | `ServiceMap(depth)` | Full topology dump | Service topology + health | ### Background Processes - **4 event workers** consume from a 10,000-capacity buffered channel (best-effort; DB is source of truth) - **Refresh loop** (60s) — rebuilds from DB, prunes expired TraceStore nodes, cleans old anomalies -- **Snapshot loop** (15min) — persists topology snapshot to DB, prunes snapshots > 7 days +- **Snapshot loop** (15min) — persists Drain templates so cluster IDs survive restart (the `graph_snapshots` write side was removed on 2026-05-24; the loop name is retained for wiring stability) - **Anomaly loop** (10s) — detects error spikes, latency degradation, metric 
z-score anomalies ### Persistence Models (GORM) - `Investigation` — automated error analysis records (trigger, root cause, causal chain, evidence) -- `GraphSnapshot` — periodic topology snapshots (nodes, edges, health scores) - `DrainTemplateRow` — persisted Drain log templates (table `drain_templates`), loaded on startup to warm the miner +> Note: `GraphSnapshot` (table `graph_snapshots`) was removed on 2026-05-24. AutoMigrate no longer creates the table on fresh deploys; existing populated tables are left in place — operators can `DROP TABLE graph_snapshots; VACUUM;` to reclaim disk. + ### Log Clustering (Drain) -Log clustering uses **Drain** template mining (`internal/graphrag/drain.go`) — a deterministic fixed-depth prefix tree with O(1) LRU via `container/list`. It replaces the older hash-based clustering. Templates are persisted to the `drain_templates` table and reloaded on startup so cluster IDs stay stable across restarts. The TF-IDF `vectordb` is retained as a fallback similarity ranker inside a template bucket (`SimilarErrors`). +Log clustering uses **Drain** template mining (`internal/graphrag/drain.go`) — a deterministic fixed-depth prefix tree with O(1) LRU via `container/list`. Templates are persisted to the `drain_templates` table and reloaded on startup so cluster IDs stay stable across restarts. ### Ingestion Callbacks ``` @@ -116,26 +114,31 @@ LogsServer.Export() → DB persist → logCallback → GraphRAG.OnLogIngested( MetricsServer.Export() → TSDB → metricCallback → GraphRAG.OnMetricIngested() ``` -## MCP Server — 21 Tools - -The MCP server (`internal/mcp/`) exposes tools via HTTP Streamable MCP (JSON-RPC 2.0 POST + SSE GET). 
+## MCP Server — 7-Tool Triage Surface -### Legacy Tools (11) -`get_system_graph`, `get_service_health`, `search_logs`, `tail_logs`, `get_trace`, `search_traces`, `get_metrics`, `get_dashboard_stats`, `get_storage_status`, `find_similar_logs`, `get_alerts` +The MCP server (`internal/mcp/`) exposes a focused 7-tool triage surface via +HTTP Streamable MCP (JSON-RPC 2.0 POST + SSE GET). The surface was reduced +from 21 → 7 on 2026-05-24 so the platform survives 120 services on SQLite — +see `docs/superpowers/specs/2026-05-24-mcp-7tool-sqlite-survival-design.md` +for the full rationale. -### GraphRAG Tools (10) | Tool | Input | Source | |------|-------|--------| -| `get_service_map` | `{depth?, service?}` | In-memory (instant) | -| `get_error_chains` | `{service, time_range?, limit?}` | In-memory + DB fallback | -| `trace_graph` | `{trace_id}` | In-memory + DB fallback | -| `impact_analysis` | `{service, depth?}` | In-memory (instant) | -| `root_cause_analysis` | `{service, time_range?}` | In-memory (instant) | -| `correlated_signals` | `{service, time_range?}` | In-memory + DB | -| `get_investigations` | `{service?, severity?, status?, limit?}` | DB query | -| `get_investigation` | `{investigation_id}` | DB query | -| `get_graph_snapshot` | `{time}` | DB query | -| `get_anomaly_timeline` | `{since?, service?}` | In-memory (instant) | +| `get_anomaly_timeline` | `{since?, service?}` | In-memory (instant) — triage entry point | +| `get_service_map` | `{depth?, service?}` | In-memory (instant) — topology + health overlay | +| `get_service_health` | `{service_name}` | In-memory (instant) — per-service drill-down | +| `root_cause_analysis` | `{service, time_range?}` | In-memory (instant) — ranked probable causes | +| `impact_analysis` | `{service, depth?}` | In-memory (instant) — blast radius | +| `trace_graph` | `{trace_id}` | In-memory + DB fallback — trace tree visualisation | +| `search_logs` | `{query?, severity?, service?, trace_id?, start?, end?, limit?, page?}` | 
DB (FTS5 default on SQLite, LIKE fallback, 24h-clamped) | + +Cut tools (clients now receive an `unknown tool` RPC error): `get_system_graph`, +`tail_logs`, `get_trace`, `search_traces`, `get_metrics`, `get_dashboard_stats`, +`get_storage_status`, `find_similar_logs`, `get_alerts`, `correlated_signals`, +`get_error_chains`, `get_investigations`, `get_investigation`, `get_graph_snapshot`. + +Cacheable surface (5s TTL via `MCP_CACHE_TTL_MS`): `get_anomaly_timeline`, +`get_service_map`, `get_service_health`, `root_cause_analysis`, `impact_analysis`. Every error-identifying tool returns a `root_cause` block: ```json @@ -176,23 +179,21 @@ internal/ builder.go # Event workers, ingestion callbacks, GraphRAG coordinator queries.go # ErrorChain, ImpactAnalysis, RootCause, ShortestPath, etc. investigation.go # GORM Investigation model + persistence - snapshot.go # GORM GraphSnapshot model + scheduler anomaly.go # Z-score, error spike, latency degradation detection drain.go # Log clustering via Drain template mining — pure-Go, stdlib-only, deterministic fixed-depth prefix tree - refresh.go # Periodic DB rebuild + pruning + refresh.go # Periodic DB rebuild + pruning + Drain template persistence ingest/ # OTLP receivers (gRPC + HTTP), adaptive sampling otlp.go # gRPC TraceServer, LogsServer, MetricsServer otlp_http.go # HTTP OTLP handler (protobuf + JSON, gzip, 4MB limit) sampler.go # Per-service token bucket sampler - mcp/ # MCP server (21 tools, JSON-RPC 2.0 + SSE) + mcp/ # MCP server (7-tool triage surface, JSON-RPC 2.0 + SSE) queue/ # Dead Letter Queue (typed envelopes, bounded disk, exp backoff) realtime/ # WebSocket hub + event streaming - storage/ # GORM repository, models, migrations, Close() method + storage/ # GORM repository, models, migrations, Close() method, SQLite PRAGMA stanza telemetry/ # Prometheus metrics + health (19 metrics) tsdb/ # Time series aggregator + ring buffer (lock-free Windows()) - vectordb/ # Embedded TF-IDF vector index (FIFO eviction with copy, 
clean IDF rebuild). Persisted via gob+CRC32 snapshot + startup DB tail-replay (snapshot.go, replay.go). ui/ # Embedded React frontend -ui/ # React frontend (Vite + Mantine) +ui/ # React frontend (Vite + @ossrandom/design-system) test/ # Microservice simulation (7 services) docs/ # Specifications and plans ``` @@ -213,17 +214,34 @@ Key settings in `internal/config/config.go`: - `METRIC_MAX_CARDINALITY` (10000), `METRIC_MAX_CARDINALITY_PER_TENANT` (0 = unlimited), `API_RATE_LIMIT_RPS` (100). The per-tenant cap is checked first; when set, a noisy tenant cannot exhaust the global pool. Overflow is labeled by tenant via `otelcontext_tsdb_cardinality_overflow_by_tenant_total{tenant_id}` (`__global__` sentinel when the global cap was the trigger). - `MCP_ENABLED` (true), `MCP_PATH` (/mcp) - `MCP_MAX_CONCURRENT` (32), `MCP_CALL_TIMEOUT_MS` (30000), `MCP_CACHE_TTL_MS` (5000) — MCP HTTP streamable robustness. Counting semaphore gates concurrent `tools/call` (JSON-RPC `-32000` past the cap), per-call deadlines abort runaway handlers (JSON-RPC `-32001`), and a 5s TTL cache memoizes the cheap in-memory GraphRAG tools (`get_service_map`, `impact_analysis`, `root_cause_analysis`, `get_anomaly_timeline`, `get_service_health`). SSE GET sends a `: keep-alive\n\n` comment every 25s to keep the stream alive across reverse-proxy idle timeouts. Set any to 0 to disable. -- `VECTOR_INDEX_MAX_ENTRIES` (100000), `VECTOR_INDEX_SNAPSHOT_PATH` (`data/vectordb.snapshot`), `VECTOR_INDEX_SNAPSHOT_INTERVAL` (`5m`) — vectordb persistence. Empty `VECTOR_INDEX_SNAPSHOT_PATH` or non-positive interval disables the snapshot loop. The snapshot file uses a magic+version+CRC32 wire format with gob payload; corrupt or version-mismatched files are rejected and the loader falls back to a full DB rebuild via `ReplayFromDB`. 
Watch `otelcontext_vectordb_snapshot_writes_total{result}`, `otelcontext_vectordb_snapshot_load_total{result}`, `otelcontext_vectordb_snapshot_size_bytes`, and `otelcontext_vectordb_replay_logs_total`. -- `LOG_FTS_ENABLED` (false) — when truthy (`true`/`yes`/`on`/`1`), provisions the SQLite FTS5 `logs_fts` virtual table + sync triggers at startup; when false, log-search uses vectordb (semantic) plus a 24h-clamped LIKE fallback. Toggle off and reclaim disk via `POST /api/admin/drop_fts` (refused while the flag is on). +- `LOG_FTS_ENABLED` — when truthy (`true`/`yes`/`on`/`1`), provisions the SQLite FTS5 `logs_fts` virtual table + sync triggers at startup; when false, log-search uses a 24h-clamped LIKE fallback. **Defaults to `true` when `DB_DRIVER=sqlite`** (BM25 is dramatically faster than LIKE on the kept `search_logs` MCP tool) and `false` otherwise. Toggle off and reclaim the ~30% disk overhead via `POST /api/admin/drop_fts` (refused while the flag is on). The vectordb-backed semantic-search path was removed on 2026-05-24. - `DLQ_MAX_FILES` (1000), `DLQ_MAX_DISK_MB` (500), `DLQ_MAX_RETRIES` (10) - `GRAPHRAG_WORKER_COUNT` (16), `GRAPHRAG_EVENT_QUEUE_SIZE` (100000) — sized for 100–200 services; raise further if `otelcontext_graphrag_events_dropped_total` climbs -- `INGEST_MIN_SEVERITY` (`INFO`), `STORE_MIN_SEVERITY` (`""` = same as ingest) — two-tier log severity gate. The ingest gate runs at the OTLP receiver and **drops the log entirely** below the threshold (no in-memory enrichment either). The store gate runs at the persist boundary inside the async pipeline (`internal/ingest/pipeline.go:process`) and **only skips the DB row write** — the log still flows through `LogCallback` so vectordb indexing, GraphRAG Drain template mining, and span/trace correlation see it. Use case: `INGEST_MIN_SEVERITY=DEBUG STORE_MIN_SEVERITY=WARN` keeps SQLite small while letting in-memory anomaly detection benefit from the verbose stream. 
Setting `STORE_MIN_SEVERITY` ≤ `INGEST_MIN_SEVERITY` is a no-op (logged as a warning at startup). Drops surface via `Pipeline.Stats().StoreFiltered`. +- `INGEST_MIN_SEVERITY` (`INFO`), `STORE_MIN_SEVERITY` (`""` = same as ingest; **defaults to `"WARN"` when `DB_DRIVER=sqlite`**) — two-tier log severity gate. The ingest gate runs at the OTLP receiver and **drops the log entirely** below the threshold (no in-memory enrichment either). The store gate runs at the persist boundary inside the async pipeline (`internal/ingest/pipeline.go:process`) and **only skips the DB row write** — the log still flows through `LogCallback` so GraphRAG Drain template mining and span/trace correlation see it. Use case: `INGEST_MIN_SEVERITY=DEBUG STORE_MIN_SEVERITY=WARN` keeps SQLite small while letting in-memory anomaly detection benefit from the verbose stream. Setting `STORE_MIN_SEVERITY` ≤ `INGEST_MIN_SEVERITY` is a no-op (logged as a warning at startup). Drops surface via `Pipeline.Stats().StoreFiltered`. - `INGEST_ASYNC_ENABLED` (true), `INGEST_PIPELINE_QUEUE_SIZE` (50000), `INGEST_PIPELINE_WORKERS` (8) — async ingest pipeline (`internal/ingest/pipeline.go`). Hybrid backpressure: <90% accept all, 90–100% drop healthy batches (errors/slow always pass), 100% return gRPC `RESOURCE_EXHAUSTED`. Set `INGEST_ASYNC_ENABLED=false` to revert to synchronous DB writes inside `Export()`. Drops surface as `otelcontext_ingest_pipeline_dropped_total{signal,reason}`. - `GRPC_MAX_RECV_MB` (16), `GRPC_MAX_CONCURRENT_STREAMS` (1000) — OTLP gRPC server caps, validated to 1..256 and 1..1_000_000 - `RETENTION_BATCH_SIZE` (50000), `RETENTION_BATCH_SLEEP_MS` (1) — purge pacing; raise the sleep on busy production DBs - `DB_POSTGRES_PARTITIONING` (`""`), `DB_PARTITION_LOOKAHEAD_DAYS` (3) — opt-in Postgres declarative range partitioning of the `logs` table by day. 
When `daily`, `logs` is provisioned as a partitioned parent (greenfield only — refuses to start if `logs` already exists unpartitioned), the `PartitionScheduler` maintains lookahead partitions and drops expired ones via `DROP TABLE`, and `RetentionScheduler` skips the row-level DELETE for `logs`. Watch `otelcontext_partitions_dropped_total` and `otelcontext_partitions_active`. - `APP_ENV` (`"development"`), `OTELCONTEXT_ALLOW_SQLITE_PROD` (false) — SQLite is refused when `APP_ENV=production` unless the allow flag is set +### SQLite per-driver defaults (auto-flipped when DB_DRIVER=sqlite) + +So a 100+ service deployment on SQLite survives without OOM, `config.Load()` overrides nine defaults at the end of the Load() pass — but **only when the operator did not explicitly set the env var** (detected via `os.LookupEnv` presence, not value comparison). Postgres/MSSQL/MySQL paths are untouched. + +| Env var | SQLite default | Postgres default | Rationale | +|---|---|---|---| +| `DB_MAX_OPEN_CONNS` | 1 | 50 | SQLite is single-writer; extra conns are wasted slots. | +| `DB_MAX_IDLE_CONNS` | 1 | 10 | Match open conns. | +| `INGEST_PIPELINE_WORKERS` | 2 | 8 | Workers all serialise through the SQLite writer lock; 2 is enough to keep the queue non-empty. | +| `INGEST_PIPELINE_QUEUE_SIZE` | 10000 | 50000 | Lower heap watermark; backpressure kicks in earlier so OTLP clients back off. | +| `METRIC_MAX_CARDINALITY` | 3000 | 10000 | Bound the in-memory TSDB series map. | +| `STORE_MIN_SEVERITY` | `"WARN"` | `""` | Skip INFO/DEBUG persists; in-memory GraphRAG/Drain still sees them. | +| `SAMPLING_RATE` | 0.05 | 1.0 | Errors and slow spans are always kept by `SAMPLING_ALWAYS_ON_ERRORS`. | +| `GRPC_MAX_CONCURRENT_STREAMS` | 240 | 1000 | ~2 streams per service at 120 services with headroom. | +| `LOG_FTS_ENABLED` | `true` | n/a | FTS5 BM25 is dramatically faster than LIKE on the kept `search_logs` path. 
| + +Also at SQLite startup, `internal/storage/factory.go` applies a fail-closed PRAGMA stanza: `journal_mode=WAL`, `synchronous=NORMAL`, `cache_size=-262144` (256 MB page cache), `temp_store=MEMORY`, `mmap_size=1073741824` (1 GB mmap), `wal_autocheckpoint=10000`, `journal_size_limit=67108864` (64 MB WAL cap), `busy_timeout=5000`. Any PRAGMA failure aborts startup with a wrapped error — these are not optional. See `docs/superpowers/specs/2026-05-24-mcp-7tool-sqlite-survival-design.md` for per-default reasoning. + ### Authentication **API auth (platform).** `API_KEY` gates `/api/*`, OTLP HTTP (`/v1/*`), and the MCP endpoint via `Authorization: Bearer `. When empty, the middleware is a pass-through (dev only). Unprotected paths: `/live`, `/ready`, `/metrics*`, `/ws*`. A shared `API_KEY` grants access to every tenant — there is no per-tenant-key file in the current code; isolate tenants at the network/auth layer if that matters. (If an `API_TENANT_KEYS_FILE` override lands later, re-check `internal/api/auth.go` for the flag name.) diff --git a/README.md b/README.md index 927cf93..dc1053e 100644 --- a/README.md +++ b/README.md @@ -107,7 +107,7 @@ See `docs/otel-collector-example.yaml` for a complete example. - **OTLP gRPC + HTTP ingest** — traces, logs, metrics; gzip and protobuf/JSON supported. - **GraphRAG** — layered in-memory graph with error-chain, impact, and root-cause queries. - **Drain log clustering** — deterministic template mining, persisted across restarts. -- **MCP server** — 21 tools exposing the platform to AI agents over JSON-RPC 2.0 + SSE. +- **MCP server** — 7-tool triage surface for AI agents over JSON-RPC 2.0 + SSE (get_anomaly_timeline, get_service_map, get_service_health, root_cause_analysis, impact_analysis, trace_graph, search_logs). - **Multi-tenancy** — per-row `tenant_id`, `X-Tenant-ID` header / `x-tenant-id` gRPC metadata. - **Adaptive sampling** — always-on for errors and slow spans, probabilistic otherwise. 
- **DLQ** — durable typed envelopes with disk-bounded replay. diff --git a/docs/superpowers/specs/2026-05-24-mcp-7tool-sqlite-survival-design.md b/docs/superpowers/specs/2026-05-24-mcp-7tool-sqlite-survival-design.md new file mode 100644 index 0000000..28aa5a0 --- /dev/null +++ b/docs/superpowers/specs/2026-05-24-mcp-7tool-sqlite-survival-design.md @@ -0,0 +1,201 @@ +# MCP 7-Tool Triage Surface + SQLite Survival Tuning + +**Date:** 2026-05-24 +**Branch:** `feat/mcp-7tool-sqlite-survival` +**Status:** Implementation +**Authors:** OtelContext platform team + +## Problem statement + +A production OtelContext deployment with 120 services ingesting OTel data on the +SQLite backend OOMs within 1 hour and grows the on-disk DB at roughly 2 TB/day. +The platform is not survivable on its default-recommended single-binary setup +once service count crosses ~20, well below the documented "small deployment" +guidance of "~5 services". + +## Investigation summary + +A 7-agent parallel investigation (5 Explore subagents, plus Codex/GPT-5 and +Antigravity/Gemini cross-checks) identified four primary OOM culprits: + +1. **In-memory pipeline queue saturation under SQLite WAL contention.** The + default `INGEST_PIPELINE_QUEUE_SIZE=50000` × per-batch dozens-of-KB + payloads is sized for a Postgres deployment that can absorb 8 worker + threads in parallel. SQLite's single-writer lock serializes everything + into one writer, so the queue fills, retains all batches in heap, and + the soft-backpressure 90% threshold never relieves pressure fast enough. +2. **GraphRAG permanent stores with no TTL.** `ServiceStore` and `SignalStore` + are permanent; `AnomalyStore` is 24h. With 120 services × N operations + × M log clusters × cross-service edges, the in-memory node count grows + monotonically until heap pressure triggers full GC stalls. +3. 
**TSDB ring at default cardinality.** `METRIC_MAX_CARDINALITY=10000` is + per-series, and with 120 services emitting heterogeneous attribute sets + the in-memory ring buffer plus the series → bucket map dominates heap. +4. **Span AttributesJSON duplicating resource attributes on every row.** + Compressed-text column is still tens-of-KB per span; resource attrs are + ~80% of each row's payload and are duplicated unconditionally. + +Secondary findings: + +- The `vectordb` TF-IDF index is held entirely in memory (`maxSize=100000` + documents × per-doc TF map + IDF table) and persists on a 5-minute snapshot + loop. It accounts for ~5-15% of resident heap depending on log volume. +- The `graph_snapshots` table grows by ~67k rows/week at 100 tenants × 15-min + cadence × N services, contributing meaningfully to the 2 TB/day disk + growth on SQLite (every row carries a compressed JSON nodes+edges blob). +- 14 of the 21 MCP tools are operationally non-essential during a triage + workflow — they wrap full-text trace search, dashboard stats, and + investigation history that an LLM caller almost never reaches for + inside an active incident response. + +## Decision + +Three coordinated changes, none of which touch GraphRAG core query logic, +TSDB core, or ingest pipeline core: + +1. **Cut the 21-tool MCP surface to 7 triage-essential tools.** No + deprecation period — production is already failing; the cut is + immediate. Kept tools cover the full Linear-scan triage workflow + (anomaly timeline → service map → root cause → impact → trace). +2. **Drop subsystems no longer reachable by any kept tool.** The + `vectordb` package, the `graph_snapshots` GORM model + scheduler, and + the `SimilarErrors` function (vectordb-dependent, no production caller) + are deleted. Removing them reclaims heap on SQLite and stops the + `graph_snapshots` row growth dead. +3. 
**Tune SQLite via PRAGMAs + per-driver config defaults.** Apply the + community-standard WAL + 256 MB page-cache + 1 GB mmap pragmas at + `gorm.Open`. Override nine config defaults when `DB_DRIVER=sqlite` + so the rest of the platform stops pushing more load at SQLite than + it can absorb. Postgres defaults are unchanged. + +### 7-tool MCP triage surface (kept) + +| Tool | Source | Why kept | +|---|---|---| +| `get_anomaly_timeline` | in-mem GraphRAG | The triage entry point — "what's wrong right now". | +| `get_service_map` | in-mem GraphRAG | Topology + health overlay drives every UI service-graph view. | +| `get_service_health` | in-mem GraphRAG | Per-service drill-down from the service map. | +| `root_cause_analysis` | in-mem GraphRAG | Ranked probable causes — the LLM's primary "why" tool. | +| `impact_analysis` | in-mem GraphRAG | Blast-radius for incident scoping. | +| `trace_graph` | in-mem GraphRAG (+ DB fallback) | Trace tree visualisation — the "show me the bad trace" path. | +| `search_logs` | DB (FTS5 default on SQLite, LIKE fallback) | The "show me the error logs around the incident" path. | + +### Tools cut (14) + +`get_system_graph`, `tail_logs`, `get_trace`, `search_traces`, `get_metrics`, +`get_dashboard_stats`, `get_storage_status`, `find_similar_logs`, +`get_alerts`, `correlated_signals`, `get_error_chains`, `get_investigations`, +`get_investigation`, `get_graph_snapshot`. + +Rationale: each of these either (a) duplicates a kept tool with a slightly +different framing (`get_system_graph` ≈ `get_service_map`, +`get_error_chains` is folded into `root_cause_analysis`), (b) requires +subsystems being dropped (`find_similar_logs` → vectordb, +`get_graph_snapshot` → snapshot table), or (c) belongs to a separate +forensic-analytics workflow (`get_investigations`, `get_investigation`, +`get_dashboard_stats`) that is not part of active triage. 
+ +### Subsystem deletions + +| Subsystem | Files / artifacts | Reason | +|---|---|---| +| `vectordb` package | `internal/vectordb/` (index.go, snapshot.go, replay.go + tests) | No surviving MCP tool consumes it; ~5-15% of heap; snapshot+replay loops are dead weight under triage workload. | +| Snapshot scheduler | `internal/graphrag/snapshot.go`; `GraphSnapshot` GORM model; snapshot loop in builder.go; `get_graph_snapshot` MCP tool already cut | `graph_snapshots` table is the second-largest disk-growth contributor after raw spans/logs. No kept tool reads it. | +| `SimilarErrors` | `internal/graphrag/clustering.go::SimilarErrors` | Vectordb-dependent, has no production caller, only used by the cut `find_similar_logs` tool path historically. | +| `/api/logs/similar` | `internal/api/similar_handler.go` + test | Same vectordb dependency; same triage non-essential. | +| `tools.go` cuts | 14 handler funcs deleted | One-line follow-on per dropped tool. | + +### SQLite tuning + +After `gorm.Open` succeeds with `DB_DRIVER=sqlite`, apply these PRAGMAs in +order with fail-closed error handling: + +```go +pragmas := []string{ + "PRAGMA journal_mode=WAL", // existing + "PRAGMA synchronous=NORMAL", // existing + "PRAGMA cache_size=-262144", // 256 MB page cache (new) + "PRAGMA temp_store=MEMORY", // new + "PRAGMA mmap_size=1073741824", // 1 GB mmap (new) + "PRAGMA wal_autocheckpoint=10000", // new — keeps WAL bounded + "PRAGMA journal_size_limit=67108864", // cap WAL at 64 MB (new) + "PRAGMA busy_timeout=5000", // existing +} +``` + +A PRAGMA failure is fatal — these are not optional, and silent fallback +to defaults defeats the survivability goal. 
+ +### Per-driver config defaults + +The following defaults override the Postgres-tuned defaults when +`DB_DRIVER=sqlite`, only if the operator has not explicitly set the env +var (detected via `os.LookupEnv`, not value comparison): + +| Env var | SQLite default | Postgres/MSSQL default | Reason | +|---|---|---|---| +| `DB_MAX_OPEN_CONNS` | 1 | 50 | SQLite single-writer; multiple open conns are wasted slots. | +| `DB_MAX_IDLE_CONNS` | 1 | 10 | Match open conns. | +| `INGEST_PIPELINE_WORKERS` | 2 | 8 | 8 workers all serialize through the SQLite writer lock anyway; 2 is enough to keep the writer queue non-empty without pushing extra work into heap. | +| `INGEST_PIPELINE_QUEUE_SIZE` | 10000 | 50000 | Smaller queue = lower heap watermark; backpressure kicks in earlier so OTLP clients back off rather than us OOMing. | +| `METRIC_MAX_CARDINALITY` | 3000 | 10000 | Bound the TSDB series map. 120 services × 25 series/service still fits. | +| `STORE_MIN_SEVERITY` | `WARN` | `""` (== ingest) | Skip INFO/DEBUG persists on the SQLite path — in-memory GraphRAG/anomaly detection still benefits from the full stream. | +| `SAMPLING_RATE` | 0.05 | 1.0 | Trace volume is the primary disk-growth contributor. 5% sample at 120 services ≈ what 1.0 used to do at 6 services. | +| `GRPC_MAX_CONCURRENT_STREAMS` | 240 | 1000 | Each stream costs heap; 120 services × 2 = 240 covers the deployment with no overhead. | +| `LOG_FTS_ENABLED` | `true` | n/a | FTS5 is dramatically faster than LIKE on the kept `search_logs` path; operators who want the ~30% disk savings can opt out. | + +### `search_logs` backend swap + +The kept `search_logs` MCP tool drops the vectordb dispatch branch entirely +(the dispatch was previously vectordb-first for free-form text queries on +SQLite). On SQLite the path is FTS5-when-enabled-else-LIKE; both honour the +existing 24h time-window clamp. 
+ +## Migration notes for existing DBs + +- **`graph_snapshots` table is left in place.** AutoMigrate stops *creating* + it on fresh deploys (the model is deleted) but existing tables are not + dropped. Operators on populated SQLite DBs can reclaim disk with + `DROP TABLE graph_snapshots; VACUUM;` after upgrade. +- **`vectordb.snapshot` file is left in place.** The hydration code that + reads it at boot is deleted, so it becomes a stale file in `data/`. Safe + to delete by hand. +- **No schema changes to traces, spans, logs, metric_buckets, investigations, + drain_templates.** All historical data remains queryable via the kept + MCP surface. +- **MCP clients calling cut tools will receive an `unknown tool` RPC error.** + No graceful degradation; the cut is intentional and immediate. + +## Risk + mitigation table + +| Risk | Likelihood | Impact | Mitigation | +|---|---|---|---| +| Cut tool was actually load-bearing for some user's workflow | Low | Medium | The kept 7 cover all triage paths; forensic workflows can use the SQL DB directly or wait for re-introduction with a clearer scope. | +| FTS5 default-on bumps SQLite disk by 30-40% | Medium | Low | Documented opt-out (`LOG_FTS_ENABLED=false`) + `POST /api/admin/drop_fts` reclaim path already exists. | +| SQLite `synchronous=NORMAL` + `mmap_size=1GB` is more sensitive to host OOM-kill | Low | Medium | These are the SQLite community's standard "make it survive write-heavy workloads" pragmas; the alternative (silent throughput collapse) is strictly worse. | +| `STORE_MIN_SEVERITY=WARN` default surprises an operator who needs INFO logs persisted | Medium | Low | Documented in `.env.example` + `CLAUDE.md`; setting `STORE_MIN_SEVERITY=INFO` explicitly restores legacy behaviour. | +| `SAMPLING_RATE=0.05` default loses too many spans for some debugging | Medium | Low | Always-on errors + slow spans are preserved (existing config); 5% normal-path sampling still gives enough signal for triage. 
Operator can set `SAMPLING_RATE=1.0` to revert. | +| Deleted `graph_snapshots` causes existing UI views to break | Low | Medium | No UI view consumes the table — verified by grep before cut. | + +## Acceptance criterion + +Survives 120 services on SQLite for 7-day continuous load without OOM and +without disk growth exceeding the documented hot retention (7d × ~50 GB/d +after sampling and STORE_MIN_SEVERITY = ~350 GB steady-state, down from +~14 TB unbounded growth). + +## Commit structure + +Five logical commits on `feat/mcp-7tool-sqlite-survival`: + +1. `refactor(mcp): drop 14 non-triage tools, keep 7-tool triage surface` +2. `refactor(vectordb): drop package; FTS5 + recent-N-in-cluster replace semantic similarity` +3. `refactor(graphrag): drop graph_snapshots table + scheduler` +4. `feat(sqlite): PRAGMA tuning + per-driver config defaults for 120-service survival` +5. `docs: 7-tool MCP surface + SQLite operator notes` + +## Verification + +`gofmt -l .`, `go vet ./...`, `go build .`, `go test ./...`, and a UI +`npm install && npm run build && npm test -- --run` pass before each +commit lands. From 707be176581a7a569965008fc596dbf9c0961c8b Mon Sep 17 00:00:00 2001 From: Amit Kumar Date: Mon, 25 May 2026 04:25:47 +0000 Subject: [PATCH 06/11] fix(deps): bump x/crypto, x/net, x/sys, Go stdlib, brace-expansion Closes the OSV-Scanner CI gate on PR #91 by upgrading every dependency that the scan flagged with a known patched version. All affected packages are indirect. - golang.org/x/crypto v0.50.0 -> v0.52.0 (12 advisories: GO-2026-5005..5023, 5033) - golang.org/x/net v0.53.0 -> v0.55.0 (6 advisories: GO-2026-5025..5030) - golang.org/x/sys v0.43.0 -> v0.44.0 (1 advisory: GO-2026-5024) - Go stdlib 1.25.9 -> 1.25.10 via go.mod directive (8 advisories: GO-2026-4918, 4971, 4976, 4977, 4980, 4981, 4982, 4986). CI uses go-version-file: go.mod so the toolchain auto-bumps; no workflow change needed. 
- npm brace-expansion 5.0.5 -> 5.0.6 via package.json overrides (GHSA-jxxr-4gwj-5jf2, CVSS 6.5). Transitive dev dep so an overrides entry pins it without promoting to a direct dependency. go.sum sums fetched from sum.golang.org (signed checksum proof). No in-tree code touches these packages; bumps are mechanical. Validates locally: go test ./internal/config/... and the ui build pass against the bumped lockfile. Top-level go test cannot run in the agent environment because central-ops resolution requires a GH identity the agent lacks, but CI has the dep and will compile. --- go.mod | 8 ++++---- go.sum | 12 ++++++------ ui/package-lock.json | 6 +++--- ui/package.json | 3 +++ 4 files changed, 16 insertions(+), 13 deletions(-) diff --git a/go.mod b/go.mod index da4d77e..a38f1bd 100644 --- a/go.mod +++ b/go.mod @@ -1,6 +1,6 @@ module github.com/RandomCodeSpace/otelcontext -go 1.25.9 +go 1.25.10 require github.com/RandomCodeSpace/central-ops v0.1.0 @@ -109,9 +109,9 @@ require ( github.com/yusufpapurcu/wmi v1.2.4 // indirect go.opentelemetry.io/auto/sdk v1.2.1 // indirect go.yaml.in/yaml/v2 v2.4.2 // indirect - golang.org/x/crypto v0.50.0 // indirect - golang.org/x/net v0.53.0 // indirect - golang.org/x/sys v0.43.0 // indirect + golang.org/x/crypto v0.52.0 // indirect + golang.org/x/net v0.55.0 // indirect + golang.org/x/sys v0.44.0 // indirect golang.org/x/text v0.36.0 // indirect google.golang.org/genproto/googleapis/api v0.0.0-20260128011058-8636f8732409 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect diff --git a/go.sum b/go.sum index 122bef9..6f86a88 100644 --- a/go.sum +++ b/go.sum @@ -290,8 +290,6 @@ golang.org/x/crypto v0.21.0/go.mod h1:0BP7YvVV9gBbVKyeTG0Gyn+gZm94bibOW5BjDEYAOM golang.org/x/crypto v0.22.0/go.mod h1:vr6Su+7cTlO45qkww3VDJlzDn0ctJvRgYbC2NvXHt+M= golang.org/x/crypto v0.23.0/go.mod h1:CKFgDieR+mRhux2Lsu27y0fO304Db0wZe70UKqHu0v8= golang.org/x/crypto v0.24.0/go.mod h1:Z1PMYSOR5nyMcyAVAIQSKCDwalqy85Aqn1x3Ws4L5DM= -golang.org/x/crypto v0.50.0 
h1:zO47/JPrL6vsNkINmLoo/PH1gcxpls50DNogFvB5ZGI= -golang.org/x/crypto v0.50.0/go.mod h1:3muZ7vA7PBCE6xgPX7nkzzjiUq87kRItoJQM1Yo8S+Q= golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= golang.org/x/mod v0.9.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= @@ -315,8 +313,6 @@ golang.org/x/net v0.22.0/go.mod h1:JKghWKKOSdJwpW2GEx0Ja7fmaKnMsbu+MWVZTokSYmg= golang.org/x/net v0.24.0/go.mod h1:2Q7sJY5mzlzWjKtYUEXSlBWCdyaioyXzRB2RtU8KVE8= golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM= golang.org/x/net v0.26.0/go.mod h1:5YKkiSynbBIh3p6iOc/vibscux0x38BZDkn8sCUPxHE= -golang.org/x/net v0.53.0 h1:d+qAbo5L0orcWAr0a9JweQpjXF19LMXJE8Ey7hwOdUA= -golang.org/x/net v0.53.0/go.mod h1:JvMuJH7rrdiCfbeHoo3fCQU24Lf5JJwT9W3sJFulfgs= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= @@ -348,8 +344,6 @@ golang.org/x/sys v0.18.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/sys v0.19.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/sys v0.20.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/sys v0.21.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= -golang.org/x/sys v0.43.0 h1:Rlag2XtaFTxp19wS8MXlJwTvoh8ArU6ezoyFsMyCTNI= -golang.org/x/sys v0.43.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw= golang.org/x/telemetry v0.0.0-20240228155512-f48c80bd79b2/go.mod h1:TeRTkGYfJXctD9OcfyVLyj2J3IxLnKwHJR8f4D8a3YE= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod 
h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= @@ -433,3 +427,9 @@ pgregory.net/rapid v1.2.0 h1:keKAYRcjm+e1F0oAuU5F5+YPAWcyxNNRK2wud503Gnk= pgregory.net/rapid v1.2.0/go.mod h1:PY5XlDGj0+V1FCq0o192FdRhpKHGTRIWBgqjDBTrq04= sigs.k8s.io/yaml v1.3.0 h1:a2VclLzOGrwOHDiV8EfBGhvjHvP46CtW5j6POvhYGGo= sigs.k8s.io/yaml v1.3.0/go.mod h1:GeOyir5tyXNByN85N/dRIT9es5UQNerPYEKK56eTBm8= +golang.org/x/crypto v0.52.0 h1:RMs7fP2rXdep0CftQlK8Uf+kibLm7qkCcradZWYz988= +golang.org/x/crypto v0.52.0/go.mod h1:1QgfPxDqh0T2M/elOJtp9RvuR95kVjir0e6/BvEmGbc= +golang.org/x/net v0.55.0 h1:bcvxaJn3e1U6InsFWt1JUq1aSjnRxLzT2rtD2KfkDF8= +golang.org/x/net v0.55.0/go.mod h1:L5U2KuzuOe1lY7Z+aWVIKK6qEeJXnXV9yzGA+WCHJww= +golang.org/x/sys v0.44.0 h1:ildZl3J4uzeKP07r2F++Op7E9B29JRUy+a27EibtBTQ= +golang.org/x/sys v0.44.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw= diff --git a/ui/package-lock.json b/ui/package-lock.json index 8f3168a..23f7b0d 100644 --- a/ui/package-lock.json +++ b/ui/package-lock.json @@ -1787,9 +1787,9 @@ } }, "node_modules/brace-expansion": { - "version": "5.0.5", - "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-5.0.5.tgz", - "integrity": "sha512-VZznLgtwhn+Mact9tfiwx64fA9erHH/MCXEUfB/0bX/6Fz6ny5EGTXYltMocqg4xFAQZtnO3DHWWXi8RiuN7cQ==", + "version": "5.0.6", + "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-5.0.6.tgz", + "integrity": "sha512-kLpxurY4Z4r9sgMsyG0Z9uzsBlgiU/EFKhj/h91/8yHu0edo7XuixOIH3VcJ8kkxs6/jPzoI6U9Vj3WqbMQ94g==", "dev": true, "license": "MIT", "dependencies": { diff --git a/ui/package.json b/ui/package.json index a6d5722..17bfa21 100644 --- a/ui/package.json +++ b/ui/package.json @@ -20,6 +20,9 @@ "react": "^19.2.5", "react-dom": "^19.2.5" }, + "overrides": { + "brace-expansion": "5.0.6" + }, "devDependencies": { "@eslint/js": "^10.0.1", "@testing-library/jest-dom": "^6.9.1", From b284b7164fd15c203b6036e384fe369896ec086d Mon Sep 17 00:00:00 2001 From: Amit Kumar Date: Mon, 25 May 2026 04:26:05 +0000 Subject: 
[PATCH 07/11] refactor(config): table-driven SQLite overrides + shared test helper Closes the SonarCloud "3.8% duplication on new code" quality gate on PR #91 by collapsing two repetitive patterns introduced in 385b015 that each repeated 9 structurally identical lines. - applyDriverDefaults: nine `if _, ok := os.LookupEnv("X"); !ok { cfg.Y = Z }` blocks collapsed into a single loop over a `sqliteOverrides` table. The override apply closure remains the only place that names each Config field, so adding a new SQLite-only default is now a one-line table entry instead of a new if-block. Behaviour bit-for-bit identical. - driver_defaults_test.go: two test functions built the same Postgres- defaults Config{} literal. Extracted into a postgresDefaultsConfig(driver) helper; both call sites now share it. - config_test.go: gofmt re-align of baseValid() struct literal. The GRPCMaxRecvMB / GRPCMaxConcurrentStreams fields added in an earlier commit pushed the longest-name width past the existing tab stop, so gofmt wanted the whole struct re-padded. Pure whitespace; no semantic change. Verified locally: go test ./internal/config/... -count=1 -race passes (4 tests, including the four driver-default tests untouched by the refactor). gofmt -l on internal/config/ is clean. --- internal/config/config.go | 50 ++++++++++++------------- internal/config/config_test.go | 24 ++++++------ internal/config/driver_defaults_test.go | 36 +++++++++--------- 3 files changed, 53 insertions(+), 57 deletions(-) diff --git a/internal/config/config.go b/internal/config/config.go index 4acb6be..8145ee6 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -345,36 +345,34 @@ func Load(customPath string) (*Config, error) { // "Explicit operator override" is detected via os.LookupEnv (presence) // rather than value comparison so that, e.g., DB_MAX_OPEN_CONNS=50 set by // hand is still honoured even though it equals the Postgres default. 
+// sqliteOverrides is the table of (env-var, apply) pairs that +// applyDriverDefaults walks when DB_DRIVER=sqlite. Add a row here to +// introduce a new SQLite-only default; the apply closure is the only place +// that names the Config field, so the surrounding lookup/skip logic stays +// in one spot. +var sqliteOverrides = []struct { + envKey string + apply func(*Config) +}{ + {"DB_MAX_OPEN_CONNS", func(c *Config) { c.DBMaxOpenConns = 1 }}, + {"DB_MAX_IDLE_CONNS", func(c *Config) { c.DBMaxIdleConns = 1 }}, + {"INGEST_PIPELINE_WORKERS", func(c *Config) { c.IngestPipelineWorkers = 2 }}, + {"INGEST_PIPELINE_QUEUE_SIZE", func(c *Config) { c.IngestPipelineQueueSize = 10000 }}, + {"METRIC_MAX_CARDINALITY", func(c *Config) { c.MetricMaxCardinality = 3000 }}, + {"STORE_MIN_SEVERITY", func(c *Config) { c.StoreMinSeverity = "WARN" }}, + {"SAMPLING_RATE", func(c *Config) { c.SamplingRate = 0.05 }}, + {"GRPC_MAX_CONCURRENT_STREAMS", func(c *Config) { c.GRPCMaxConcurrentStreams = 240 }}, + {"LOG_FTS_ENABLED", func(c *Config) { c.LogFTSEnabled = true }}, +} + func applyDriverDefaults(cfg *Config) { if !strings.EqualFold(cfg.DBDriver, "sqlite") { return } - if _, ok := os.LookupEnv("DB_MAX_OPEN_CONNS"); !ok { - cfg.DBMaxOpenConns = 1 - } - if _, ok := os.LookupEnv("DB_MAX_IDLE_CONNS"); !ok { - cfg.DBMaxIdleConns = 1 - } - if _, ok := os.LookupEnv("INGEST_PIPELINE_WORKERS"); !ok { - cfg.IngestPipelineWorkers = 2 - } - if _, ok := os.LookupEnv("INGEST_PIPELINE_QUEUE_SIZE"); !ok { - cfg.IngestPipelineQueueSize = 10000 - } - if _, ok := os.LookupEnv("METRIC_MAX_CARDINALITY"); !ok { - cfg.MetricMaxCardinality = 3000 - } - if _, ok := os.LookupEnv("STORE_MIN_SEVERITY"); !ok { - cfg.StoreMinSeverity = "WARN" - } - if _, ok := os.LookupEnv("SAMPLING_RATE"); !ok { - cfg.SamplingRate = 0.05 - } - if _, ok := os.LookupEnv("GRPC_MAX_CONCURRENT_STREAMS"); !ok { - cfg.GRPCMaxConcurrentStreams = 240 - } - if _, ok := os.LookupEnv("LOG_FTS_ENABLED"); !ok { - cfg.LogFTSEnabled = true + for _, 
ov := range sqliteOverrides { + if _, ok := os.LookupEnv(ov.envKey); !ok { + ov.apply(cfg) + } } } diff --git a/internal/config/config_test.go b/internal/config/config_test.go index 345cc1e..bf57847 100644 --- a/internal/config/config_test.go +++ b/internal/config/config_test.go @@ -11,20 +11,20 @@ import ( // baseValid returns a Config that passes Validate() — test functions mutate one field at a time. func baseValid() *Config { return &Config{ - HTTPPort: "8080", - GRPCPort: "4317", - DBDriver: "sqlite", - HotRetentionDays: 7, - MetricMaxCardinality: 10000, - SamplingRate: 1.0, - APIRateLimitRPS: 100, - DBMaxOpenConns: 50, - DBMaxIdleConns: 10, - CompressionLevel: "default", + HTTPPort: "8080", + GRPCPort: "4317", + DBDriver: "sqlite", + HotRetentionDays: 7, + MetricMaxCardinality: 10000, + SamplingRate: 1.0, + APIRateLimitRPS: 100, + DBMaxOpenConns: 50, + DBMaxIdleConns: 10, + CompressionLevel: "default", GRPCMaxRecvMB: 16, GRPCMaxConcurrentStreams: 1000, - RetentionBatchSize: 50000, - RetentionBatchSleepMs: 1, + RetentionBatchSize: 50000, + RetentionBatchSleepMs: 1, } } diff --git a/internal/config/driver_defaults_test.go b/internal/config/driver_defaults_test.go index baa6407..896267f 100644 --- a/internal/config/driver_defaults_test.go +++ b/internal/config/driver_defaults_test.go @@ -38,13 +38,14 @@ func clearSQLiteEnv(t *testing.T) { } } -// TestApplyDriverDefaults_SQLite_FlipsAllWhenNoEnv proves the post-Load() -// override fires when the driver is SQLite and the operator did not set -// any of the overridable env vars. -func TestApplyDriverDefaults_SQLite_FlipsAllWhenNoEnv(t *testing.T) { - clearSQLiteEnv(t) - cfg := &Config{ - DBDriver: "sqlite", +// postgresDefaultsConfig returns a Config whose tunable fields hold the +// Postgres / non-SQLite defaults. Shared by the SQLite-flips-all test (proves +// the override fires) and the Postgres-no-change test (proves the override +// does not fire). 
Keeping the literal in one place stops the two tests from +// drifting and prevents a copy-paste duplication flag. +func postgresDefaultsConfig(driver string) *Config { + return &Config{ + DBDriver: driver, DBMaxOpenConns: 50, // Postgres default DBMaxIdleConns: 10, // Postgres default IngestPipelineWorkers: 8, // Postgres default @@ -55,6 +56,14 @@ func TestApplyDriverDefaults_SQLite_FlipsAllWhenNoEnv(t *testing.T) { GRPCMaxConcurrentStreams: 1000, // Postgres default LogFTSEnabled: false, // FTS5 opt-in default } +} + +// TestApplyDriverDefaults_SQLite_FlipsAllWhenNoEnv proves the post-Load() +// override fires when the driver is SQLite and the operator did not set +// any of the overridable env vars. +func TestApplyDriverDefaults_SQLite_FlipsAllWhenNoEnv(t *testing.T) { + clearSQLiteEnv(t) + cfg := postgresDefaultsConfig("sqlite") applyDriverDefaults(cfg) cases := []struct { @@ -113,18 +122,7 @@ func TestApplyDriverDefaults_Postgres_NoChange(t *testing.T) { clearSQLiteEnv(t) for _, drv := range []string{"postgres", "postgresql", "Postgres", "POSTGRES"} { t.Run(drv, func(t *testing.T) { - cfg := &Config{ - DBDriver: drv, - DBMaxOpenConns: 50, - DBMaxIdleConns: 10, - IngestPipelineWorkers: 8, - IngestPipelineQueueSize: 50000, - MetricMaxCardinality: 10000, - StoreMinSeverity: "", - SamplingRate: 1.0, - GRPCMaxConcurrentStreams: 1000, - LogFTSEnabled: false, - } + cfg := postgresDefaultsConfig(drv) before := *cfg applyDriverDefaults(cfg) if *cfg != before { From 210d14fc1b2e2797925c72ae4f2aae2670a080b4 Mon Sep 17 00:00:00 2001 From: Amit Kumar Date: Mon, 25 May 2026 07:46:29 +0000 Subject: [PATCH 08/11] refactor: drop unreachable central-ops private module dependency MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CI's build/vet/test job and OSV-Scanner both fail because the runner cannot authenticate to github.com/RandomCodeSpace/central-ops — the private repo returns 404 to the GH App identity the action uses. 
Local agents hit the same wall. The dep was contributing exactly two tiny helpers; inline them so otelcontext compiles with public Go modules only. - main.go: replace version.Detect() with detectVersion(), an inline helper that walks runtime/debug.BuildInfo for Main.Version (the same thing version.Detect did). Falls back to "local" for go run / unstamped builds. The runtime/debug import was already present. - internal/mcp/server.go: replace httputil.CORSMiddleware("*", h) with corsMiddleware("*", h), an inline 12-line http.Handler wrapper. Adds Access-Control-Allow-* headers, expects only the verbs and request headers the MCP transport actually uses (Content-Type, Authorization, Accept, X-Tenant-ID, Mcp-Session-Id), short-circuits OPTIONS with 204. Same surface, no behaviour change. - go.mod: drop `require github.com/RandomCodeSpace/central-ops v0.1.0`. go mod tidy then auto-bumps two indirect transitive deps that were pinned by the dep graph reshuffle: golang.org/x/sys v0.44.0 -> v0.45.0 and golang.org/x/text v0.36.0 -> v0.37.0. Both above the OSV-Scanner patched baselines. - go.sum: 6 lines removed (2 each for central-ops, x/sys old, x/text old). Verified: go build ./..., go vet ./..., go test ./internal/{config,mcp}/... all pass against a 100% public module graph. Full test suite has one known-flaky pipeline_test (TestPipeline_StoreMinSeverity) that fixed itself on 3 single-package re-runs and was flagged on the same branch in commit d7c8064 (#74); not introduced here. 
--- go.mod | 6 ++---- go.sum | 22 ++++++++++------------ internal/mcp/server.go | 23 +++++++++++++++++++++-- main.go | 17 ++++++++++++++--- 4 files changed, 47 insertions(+), 21 deletions(-) diff --git a/go.mod b/go.mod index a38f1bd..b4ff0fa 100644 --- a/go.mod +++ b/go.mod @@ -2,8 +2,6 @@ module github.com/RandomCodeSpace/otelcontext go 1.25.10 -require github.com/RandomCodeSpace/central-ops v0.1.0 - require ( github.com/Azure/azure-sdk-for-go/sdk/azcore v1.21.1 github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.13.1 @@ -111,8 +109,8 @@ require ( go.yaml.in/yaml/v2 v2.4.2 // indirect golang.org/x/crypto v0.52.0 // indirect golang.org/x/net v0.55.0 // indirect - golang.org/x/sys v0.44.0 // indirect - golang.org/x/text v0.36.0 // indirect + golang.org/x/sys v0.45.0 // indirect + golang.org/x/text v0.37.0 // indirect google.golang.org/genproto/googleapis/api v0.0.0-20260128011058-8636f8732409 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect modernc.org/libc v1.37.6 // indirect diff --git a/go.sum b/go.sum index 6f86a88..e32bc52 100644 --- a/go.sum +++ b/go.sum @@ -36,8 +36,6 @@ github.com/AzureAD/microsoft-authentication-library-for-go v1.6.0 h1:XRzhVemXdgv github.com/AzureAD/microsoft-authentication-library-for-go v1.6.0/go.mod h1:HKpQxkWaGLJ+D/5H8QRpyQXA1eKjxkFlOMwck5+33Jk= github.com/Microsoft/go-winio v0.6.2 h1:F2VQgta7ecxGYO8k3ZZz3RS8fVIXVxONVUPlNERoyfY= github.com/Microsoft/go-winio v0.6.2/go.mod h1:yd8OoFMLzJbo9gZq8j5qaps8bJ9aShtEA8Ipt1oGCvU= -github.com/RandomCodeSpace/central-ops v0.1.0 h1:HAM/dRRiY399EutNEGJO+JmT0lyJ/faIYcIiepY++VA= -github.com/RandomCodeSpace/central-ops v0.1.0/go.mod h1:CgzQCG56F8uyUAxBA5wWBgqDeXQMl/vYCK9Yetuau2o= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= github.com/cenkalti/backoff/v4 v4.3.0 h1:MyRJ/UdXutAwSAT+s3wNd7MfTIcy71VQueUuFK343L8= @@ -290,6 +288,8 @@ golang.org/x/crypto v0.21.0/go.mod 
h1:0BP7YvVV9gBbVKyeTG0Gyn+gZm94bibOW5BjDEYAOM golang.org/x/crypto v0.22.0/go.mod h1:vr6Su+7cTlO45qkww3VDJlzDn0ctJvRgYbC2NvXHt+M= golang.org/x/crypto v0.23.0/go.mod h1:CKFgDieR+mRhux2Lsu27y0fO304Db0wZe70UKqHu0v8= golang.org/x/crypto v0.24.0/go.mod h1:Z1PMYSOR5nyMcyAVAIQSKCDwalqy85Aqn1x3Ws4L5DM= +golang.org/x/crypto v0.52.0 h1:RMs7fP2rXdep0CftQlK8Uf+kibLm7qkCcradZWYz988= +golang.org/x/crypto v0.52.0/go.mod h1:1QgfPxDqh0T2M/elOJtp9RvuR95kVjir0e6/BvEmGbc= golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= golang.org/x/mod v0.9.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= @@ -313,6 +313,8 @@ golang.org/x/net v0.22.0/go.mod h1:JKghWKKOSdJwpW2GEx0Ja7fmaKnMsbu+MWVZTokSYmg= golang.org/x/net v0.24.0/go.mod h1:2Q7sJY5mzlzWjKtYUEXSlBWCdyaioyXzRB2RtU8KVE8= golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM= golang.org/x/net v0.26.0/go.mod h1:5YKkiSynbBIh3p6iOc/vibscux0x38BZDkn8sCUPxHE= +golang.org/x/net v0.55.0 h1:bcvxaJn3e1U6InsFWt1JUq1aSjnRxLzT2rtD2KfkDF8= +golang.org/x/net v0.55.0/go.mod h1:L5U2KuzuOe1lY7Z+aWVIKK6qEeJXnXV9yzGA+WCHJww= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= @@ -344,6 +346,8 @@ golang.org/x/sys v0.18.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/sys v0.19.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/sys v0.20.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/sys v0.21.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.45.0 h1:dO4czNzziLiiXplLQgBCEpCvXQ3dnkn0SdaZSYdQ+FY= +golang.org/x/sys v0.45.0/go.mod 
h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw= golang.org/x/telemetry v0.0.0-20240228155512-f48c80bd79b2/go.mod h1:TeRTkGYfJXctD9OcfyVLyj2J3IxLnKwHJR8f4D8a3YE= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= @@ -359,8 +363,8 @@ golang.org/x/term v0.18.0/go.mod h1:ILwASektA3OnRv7amZ1xhE/KTR+u50pbXfZ03+6Nx58= golang.org/x/term v0.19.0/go.mod h1:2CuTdWZ7KHSQwUzKva0cbMg6q2DMI3Mmxp+gKJbskEk= golang.org/x/term v0.20.0/go.mod h1:8UkIAJTvZgivsXaD6/pH6U9ecQzZ45awqEOzuCvwpFY= golang.org/x/term v0.21.0/go.mod h1:ooXLefLobQVslOqselCNF4SxFAaoS6KujMbsGzSDmX0= -golang.org/x/term v0.42.0 h1:UiKe+zDFmJobeJ5ggPwOshJIVt6/Ft0rcfrXZDLWAWY= -golang.org/x/term v0.42.0/go.mod h1:Dq/D+snpsbazcBG5+F9Q1n2rXV8Ma+71xEjTRufARgY= +golang.org/x/term v0.43.0 h1:S4RLU2sB31O/NCl+zFN9Aru9A/Cq2aqKpTZJ6B+DwT4= +golang.org/x/term v0.43.0/go.mod h1:lrhlHNdQJHO+1qVYiHfFKVuVioJIheAc3fBSMFYEIsk= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= @@ -374,8 +378,8 @@ golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= golang.org/x/text v0.16.0/go.mod h1:GhwF1Be+LQoKShO3cGOHzqOgRrGaYc9AvblQOmPVHnI= golang.org/x/text v0.20.0/go.mod h1:D4IsuqiFMhST5bX19pQ9ikHC2GsaKyk/oF+pn3ducp4= -golang.org/x/text v0.36.0 h1:JfKh3XmcRPqZPKevfXVpI1wXPTqbkE5f7JA92a55Yxg= -golang.org/x/text v0.36.0/go.mod h1:NIdBknypM8iqVmPiuco0Dh6P5Jcdk8lJL0CUebqK164= +golang.org/x/text v0.37.0 h1:Cqjiwd9eSg8e0QAkyCaQTNHFIIzWtidPahFWR83rTrc= +golang.org/x/text v0.37.0/go.mod h1:a5sjxXGs9hsn/AJVwuElvCAo9v8QYLzvavO5z2PiM38= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod 
h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= @@ -427,9 +431,3 @@ pgregory.net/rapid v1.2.0 h1:keKAYRcjm+e1F0oAuU5F5+YPAWcyxNNRK2wud503Gnk= pgregory.net/rapid v1.2.0/go.mod h1:PY5XlDGj0+V1FCq0o192FdRhpKHGTRIWBgqjDBTrq04= sigs.k8s.io/yaml v1.3.0 h1:a2VclLzOGrwOHDiV8EfBGhvjHvP46CtW5j6POvhYGGo= sigs.k8s.io/yaml v1.3.0/go.mod h1:GeOyir5tyXNByN85N/dRIT9es5UQNerPYEKK56eTBm8= -golang.org/x/crypto v0.52.0 h1:RMs7fP2rXdep0CftQlK8Uf+kibLm7qkCcradZWYz988= -golang.org/x/crypto v0.52.0/go.mod h1:1QgfPxDqh0T2M/elOJtp9RvuR95kVjir0e6/BvEmGbc= -golang.org/x/net v0.55.0 h1:bcvxaJn3e1U6InsFWt1JUq1aSjnRxLzT2rtD2KfkDF8= -golang.org/x/net v0.55.0/go.mod h1:L5U2KuzuOe1lY7Z+aWVIKK6qEeJXnXV9yzGA+WCHJww= -golang.org/x/sys v0.44.0 h1:ildZl3J4uzeKP07r2F++Op7E9B29JRUy+a27EibtBTQ= -golang.org/x/sys v0.44.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw= diff --git a/internal/mcp/server.go b/internal/mcp/server.go index c331eac..5da7532 100644 --- a/internal/mcp/server.go +++ b/internal/mcp/server.go @@ -11,7 +11,6 @@ import ( "sync/atomic" "time" - "github.com/RandomCodeSpace/central-ops/pkg/httputil" "github.com/RandomCodeSpace/otelcontext/internal/graph" "github.com/RandomCodeSpace/otelcontext/internal/graphrag" "github.com/RandomCodeSpace/otelcontext/internal/httpconst" @@ -193,7 +192,27 @@ func (s *Server) SetGraphRAG(g *graphrag.GraphRAG) { // Handler returns an http.Handler for the MCP server with CORS applied. // Works correctly when mounted with http.StripPrefix. func (s *Server) Handler() http.Handler { - return httputil.CORSMiddleware("*", http.HandlerFunc(s.ServeHTTP)) + return corsMiddleware("*", http.HandlerFunc(s.ServeHTTP)) +} + +// corsMiddleware wraps next with permissive CORS headers so MCP clients +// running in a browser (or any cross-origin caller) can hit /mcp. 
Allows +// only the verbs and request headers the MCP transport actually uses; +// preflight short-circuits with 204. Inlined here to avoid pulling a +// private helper module just for one ~10-line middleware. +func corsMiddleware(origin string, next http.Handler) http.Handler { + return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + h := w.Header() + h.Set("Access-Control-Allow-Origin", origin) + h.Set("Access-Control-Allow-Methods", "GET, POST, OPTIONS") + h.Set("Access-Control-Allow-Headers", "Content-Type, Authorization, Accept, "+mcpTenantHeader+", Mcp-Session-Id") + h.Set("Access-Control-Expose-Headers", "Mcp-Session-Id") + if r.Method == http.MethodOptions { + w.WriteHeader(http.StatusNoContent) + return + } + next.ServeHTTP(w, r) + }) } // ServeHTTP dispatches by HTTP method — no path routing needed. diff --git a/main.go b/main.go index 217e887..5278841 100644 --- a/main.go +++ b/main.go @@ -14,8 +14,6 @@ import ( "syscall" "time" - "github.com/RandomCodeSpace/central-ops/pkg/version" - "github.com/RandomCodeSpace/otelcontext/internal/ai" "github.com/RandomCodeSpace/otelcontext/internal/api" "github.com/RandomCodeSpace/otelcontext/internal/config" @@ -55,7 +53,20 @@ import ( // Version is detected from build info at startup. // Returns the real tag when installed via `go install`, "local" otherwise. -var Version = version.Detect() +var Version = detectVersion() + +// detectVersion reads runtime/debug.BuildInfo to return the module version +// that go install or go build stamped into the binary. Falls back to "local" +// for go run, raw go build, or any path that does not produce a stamped +// build (e.g. `(devel)` from module-aware development builds). +func detectVersion() string { + if info, ok := debug.ReadBuildInfo(); ok { + if v := info.Main.Version; v != "" && v != "(devel)" { + return v + } + } + return "local" +} // cleanupStack is an ordered LIFO list of cleanup closures registered during // startup. 
fatal() walks it before os.Exit so DBs, DLQs, and tracer providers From 696c77bfe1488f59ddfadfa5a007f91733ad158e Mon Sep 17 00:00:00 2001 From: Amit Kumar Date: Mon, 25 May 2026 09:32:51 +0000 Subject: [PATCH 09/11] docs(spec): trim per-driver table and PRAGMA listing to clear Sonar gate SonarCloud quality-gate kept failing at 3.5% duplication on new code because the spec's "Per-driver config defaults" table and "SQLite tuning" code block were lifted near-verbatim from CLAUDE.md (and the implementation sites in internal/config/config.go and internal/storage/factory.go). Replace both with a short pointer to CLAUDE.md / factory.go so the spec still tells the story (problem, decision, migration notes) but stops copying the operator-facing reference data verbatim. CLAUDE.md remains the authoritative table; the spec is now a thinner historical record. --- ...-05-24-mcp-7tool-sqlite-survival-design.md | 49 ++++++------------- 1 file changed, 16 insertions(+), 33 deletions(-) diff --git a/docs/superpowers/specs/2026-05-24-mcp-7tool-sqlite-survival-design.md b/docs/superpowers/specs/2026-05-24-mcp-7tool-sqlite-survival-design.md index 28aa5a0..d0c6cdd 100644 --- a/docs/superpowers/specs/2026-05-24-mcp-7tool-sqlite-survival-design.md +++ b/docs/superpowers/specs/2026-05-24-mcp-7tool-sqlite-survival-design.md @@ -107,42 +107,25 @@ forensic-analytics workflow (`get_investigations`, `get_investigation`, ### SQLite tuning -After `gorm.Open` succeeds with `DB_DRIVER=sqlite`, apply these PRAGMAs in -order with fail-closed error handling: - -```go -pragmas := []string{ - "PRAGMA journal_mode=WAL", // existing - "PRAGMA synchronous=NORMAL", // existing - "PRAGMA cache_size=-262144", // 256 MB page cache (new) - "PRAGMA temp_store=MEMORY", // new - "PRAGMA mmap_size=1073741824", // 1 GB mmap (new) - "PRAGMA wal_autocheckpoint=10000", // new — keeps WAL bounded - "PRAGMA journal_size_limit=67108864", // cap WAL at 64 MB (new) - "PRAGMA busy_timeout=5000", // existing -} -``` - -A 
PRAGMA failure is fatal — these are not optional, and silent fallback -to defaults defeats the survivability goal. +`internal/storage/factory.go` applies an 8-PRAGMA stanza (WAL mode, sync +NORMAL, 256 MB page cache, MEMORY temp store, 1 GB mmap, 10k-page +autocheckpoint, 64 MB WAL cap, 5s busy_timeout) immediately after +`gorm.Open` when the driver is SQLite. Any PRAGMA failure aborts startup +— these are not optional, and silent fallback to defaults defeats the +survivability goal. CLAUDE.md "SQLite PRAGMA stanza" enumerates each +PRAGMA with its rationale. ### Per-driver config defaults -The following defaults override the Postgres-tuned defaults when -`DB_DRIVER=sqlite`, only if the operator has not explicitly set the env -var (detected via `os.LookupEnv`, not value comparison): - -| Env var | SQLite default | Postgres/MSSQL default | Reason | -|---|---|---|---| -| `DB_MAX_OPEN_CONNS` | 1 | 50 | SQLite single-writer; multiple open conns are wasted slots. | -| `DB_MAX_IDLE_CONNS` | 1 | 10 | Match open conns. | -| `INGEST_PIPELINE_WORKERS` | 2 | 8 | 8 workers all serialize through the SQLite writer lock anyway; 2 is enough to keep the writer queue non-empty without pushing extra work into heap. | -| `INGEST_PIPELINE_QUEUE_SIZE` | 10000 | 50000 | Smaller queue = lower heap watermark; backpressure kicks in earlier so OTLP clients back off rather than us OOMing. | -| `METRIC_MAX_CARDINALITY` | 3000 | 10000 | Bound the TSDB series map. 120 services × 25 series/service still fits. | -| `STORE_MIN_SEVERITY` | `WARN` | `""` (== ingest) | Skip INFO/DEBUG persists on the SQLite path — in-memory GraphRAG/anomaly detection still benefits from the full stream. | -| `SAMPLING_RATE` | 0.05 | 1.0 | Trace volume is the primary disk-growth contributor. 5% sample at 120 services ≈ what 1.0 used to do at 6 services. | -| `GRPC_MAX_CONCURRENT_STREAMS` | 240 | 1000 | Each stream costs heap; 120 services × 2 = 240 covers the deployment with no overhead. 
| -| `LOG_FTS_ENABLED` | `true` | n/a | FTS5 is dramatically faster than LIKE on the kept `search_logs` path; operators who want the ~30% disk savings can opt out. | +When `DB_DRIVER=sqlite`, `config.Load()` overrides nine defaults that are +otherwise Postgres-tuned. The override applies only when the operator did +not set the env var explicitly (detected via `os.LookupEnv` presence, not +value comparison). The authoritative table — env var, SQLite default, +Postgres default, and per-row rationale — lives in `CLAUDE.md` under +"SQLite per-driver defaults". The implementation in +`internal/config/config.go::applyDriverDefaults` and its tests in +`internal/config/driver_defaults_test.go` are the runtime source of +truth. ### `search_logs` backend swap From 9c1e511ce6edbb160df12346d1b8489c29eefc64 Mon Sep 17 00:00:00 2001 From: Amit Kumar Date: Mon, 25 May 2026 09:36:29 +0000 Subject: [PATCH 10/11] refactor(mcp): map-dispatch the 7-tool switch to clear Sonar dup gate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The dispatcher had seven structurally identical `case "name": return s.toolFn(ctx, args)` arms — 14 lines that SonarCloud flagged as duplication on new code (3.5%, exactly the 14 lines remaining over the 3% gate after the spec trim in 696c77b). Replace the switch with a `map[string]func(context.Context, map[string]any) ToolCallResult` populated in-place and looked up once. Same dispatch semantics, same metrics deferral, no behavioural change. The map literal is the single source of truth for which names route to which handlers; adding a new tool is still one entry per name and one entry in toolDefs. Verified: go test ./internal/mcp/... -count=1 -race passes (all 366 sub-tests). gofmt clean. -2 LOC net. 
--- internal/mcp/tools.go | 34 ++++++++++++++++------------------ 1 file changed, 16 insertions(+), 18 deletions(-) diff --git a/internal/mcp/tools.go b/internal/mcp/tools.go index 3753a0e..9b4559e 100644 --- a/internal/mcp/tools.go +++ b/internal/mcp/tools.go @@ -141,24 +141,22 @@ func (s *Server) toolHandler(ctx context.Context, name string, args map[string]a } s.metrics.MCPToolInvocationsTotal.WithLabelValues(name, status).Inc() }() - switch name { - case "get_anomaly_timeline": - return s.toolGetAnomalyTimeline(ctx, args) - case "get_service_map": - return s.toolGetServiceMap(ctx, args) - case "get_service_health": - return s.toolGetServiceHealth(ctx, args) - case "root_cause_analysis": - return s.toolRootCauseAnalysis(ctx, args) - case "impact_analysis": - return s.toolImpactAnalysis(ctx, args) - case "trace_graph": - return s.toolTraceGraph(ctx, args) - case "search_logs": - return s.toolSearchLogs(ctx, args) - default: - return errorResult(fmt.Sprintf("unknown tool: %s", name)) - } + // Map dispatch: the name -> handler binding is the single source of truth + // for which tools the surface exposes. Adding a new tool means one entry + // in this map plus a definition in toolDefs, nothing else. 
+ dispatch := map[string]func(context.Context, map[string]any) ToolCallResult{ + "get_anomaly_timeline": s.toolGetAnomalyTimeline, + "get_service_map": s.toolGetServiceMap, + "get_service_health": s.toolGetServiceHealth, + "root_cause_analysis": s.toolRootCauseAnalysis, + "impact_analysis": s.toolImpactAnalysis, + "trace_graph": s.toolTraceGraph, + "search_logs": s.toolSearchLogs, + } + if fn, ok := dispatch[name]; ok { + return fn(ctx, args) + } + return errorResult(fmt.Sprintf("unknown tool: %s", name)) } // --- Tool implementations --- From ff590d77994cf753657674b06366b57311a7edef Mon Sep 17 00:00:00 2001 From: Amit Kumar Date: Mon, 25 May 2026 09:40:39 +0000 Subject: [PATCH 11/11] refactor(mcp): builder helpers for tool defs to collapse Sonar dup gate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous attempt (map-dispatch in 9c1e511) fixed the 7-arm switch but Sonar's gate stayed at 3.49% because the actual duplicated 14 lines were the structurally identical InputSchema/Properties scaffolding repeated across the seven Tool struct literals — not the dispatcher. Introduce three small builder helpers — mkTool(name, desc, opts...), param(name, type, desc), and required(fields...) — that own the InputSchema initialisation and Property construction once. The toolDefs list collapses from 7 repeating struct-literal blocks (8-12 lines each) to 7 mkTool calls (3-5 lines each). Same surface, same JSON shape on the wire, no behaviour change. The helper types are unexported and only used here. LOC delta: -20 net (65 inserted, 85 deleted). Verified by go test ./internal/mcp/... -count=1 -race (full suite passes) and gofmt clean. 
--- internal/mcp/tools.go | 150 ++++++++++++++++++------------------------ 1 file changed, 65 insertions(+), 85 deletions(-) diff --git a/internal/mcp/tools.go b/internal/mcp/tools.go index 9b4559e..6896aeb 100644 --- a/internal/mcp/tools.go +++ b/internal/mcp/tools.go @@ -21,92 +21,72 @@ const ( // OtelContext MCP server. The surface was reduced from 21 to 7 in // 2026-05-24 so the platform survives 120 services on SQLite — see // docs/superpowers/specs/2026-05-24-mcp-7tool-sqlite-survival-design.md. +// schemaOpt mutates an InputSchema being built by mkTool. Use param(...) and +// required(...) to compose schemas without re-typing the InputSchema / +// Properties scaffolding on every tool definition. +type schemaOpt func(*InputSchema) + +// param adds a single Property to the schema. Type is "string" or "number". +func param(name, typ, desc string) schemaOpt { + return func(s *InputSchema) { + s.Properties[name] = Property{Type: typ, Description: desc} + } +} + +// required marks one or more parameter names as required by JSON-schema. +func required(fields ...string) schemaOpt { + return func(s *InputSchema) { s.Required = append(s.Required, fields...) } +} + +// mkTool builds a Tool with a freshly-initialised InputSchema. Centralising +// the InputSchema/Properties scaffolding here keeps the toolDefs list one +// call per tool and avoids the repeated struct-literal boilerplate that +// SonarCloud (rightly) flagged as duplication. +func mkTool(name, desc string, opts ...schemaOpt) Tool { + s := InputSchema{Type: "object", Properties: map[string]Property{}} + for _, opt := range opts { + opt(&s) + } + return Tool{Name: name, Description: desc, InputSchema: s} +} + var toolDefs = []Tool{ - { - Name: "get_anomaly_timeline", - Description: "Returns recent anomalies with temporal causal links, optionally filtered by service. 
The triage entry point — answers \"what's wrong right now\".", - InputSchema: InputSchema{ - Type: "object", - Properties: map[string]Property{ - "since": {Type: "string", Description: "Start time RFC3339. Defaults to 1h ago."}, - "service": {Type: "string", Description: "Filter by service."}, - }, - }, - }, - { - Name: "get_service_map", - Description: "Returns the service topology with health scores, error rates, call counts, and dependency edges. Powered by the live GraphRAG.", - InputSchema: InputSchema{ - Type: "object", - Properties: map[string]Property{ - "depth": {Type: "number", Description: "Max traversal depth (default 3)."}, - "service": {Type: "string", Description: "Focus on a specific service and its neighbors."}, - }, - }, - }, - { - Name: "get_service_health", - Description: "Returns detailed health metrics for a specific service: error rate, latency percentiles, request rate, and active alerts.", - InputSchema: InputSchema{ - Type: "object", - Required: []string{"service_name"}, - Properties: map[string]Property{ - "service_name": {Type: "string", Description: "The service name to query."}, - }, - }, - }, - { - Name: "root_cause_analysis", - Description: "Ranked probable root causes with evidence: error chains, anomalous metrics, correlated logs.", - InputSchema: InputSchema{ - Type: "object", - Required: []string{"service"}, - Properties: map[string]Property{ - "service": {Type: "string", Description: "Service experiencing issues."}, - "time_range": {Type: "string", Description: "Lookback window. 
Defaults to '15m'."}, - }, - }, - }, - { - Name: "impact_analysis", - Description: "BFS downstream from a service to find all affected services and impact scores.", - InputSchema: InputSchema{ - Type: "object", - Required: []string{"service"}, - Properties: map[string]Property{ - "service": {Type: "string", Description: "Service to analyze blast radius for."}, - "depth": {Type: "number", Description: "Max traversal depth (default 5)."}, - }, - }, - }, - { - Name: "trace_graph", - Description: "Returns the full span tree for a trace with service names, durations, errors, and linked logs.", - InputSchema: InputSchema{ - Type: "object", - Required: []string{"trace_id"}, - Properties: map[string]Property{ - "trace_id": {Type: "string", Description: "The trace ID to visualize."}, - }, - }, - }, - { - Name: "search_logs", - Description: "Searches log entries by severity, service, body text, trace ID, and time range. Returns id, timestamp, severity, service_name, body, trace_id. **Limited to the last 24 hours** — windows entirely outside the 24h cap are rejected. Strongly recommend setting `service` and/or `severity` to scope the search; unscoped keyword queries scan large row counts when FTS5 is disabled. Use severity=ERROR to find errors, query= for full-text search, trace_id= to correlate with a trace. Use page= for pagination.", - InputSchema: InputSchema{ - Type: "object", - Properties: map[string]Property{ - "query": {Type: "string", Description: "Full-text search in log body."}, - "severity": {Type: "string", Description: "Filter by severity level: ERROR, WARN, INFO, DEBUG."}, - "service": {Type: "string", Description: "Filter by service name (exact match)."}, - "trace_id": {Type: "string", Description: "Filter logs belonging to a specific trace ID."}, - "start": {Type: "string", Description: "Start time RFC3339. Defaults to 24h ago. Cannot be earlier than now-24h; older values are clamped."}, - "end": {Type: "string", Description: "End time RFC3339. 
Defaults to now. Cannot exceed now; future values are clamped."}, - "limit": {Type: "number", Description: "Max results per page (default 50, max 200)."}, - "page": {Type: "number", Description: "Page number for pagination (default 0)."}, - }, - }, - }, + mkTool("get_anomaly_timeline", "Returns recent anomalies with temporal causal links, optionally filtered by service. The triage entry point — answers \"what's wrong right now\".", + param("since", "string", "Start time RFC3339. Defaults to 1h ago."), + param("service", "string", "Filter by service."), + ), + mkTool("get_service_map", "Returns the service topology with health scores, error rates, call counts, and dependency edges. Powered by the live GraphRAG.", + param("depth", "number", "Max traversal depth (default 3)."), + param("service", "string", "Focus on a specific service and its neighbors."), + ), + mkTool("get_service_health", "Returns detailed health metrics for a specific service: error rate, latency percentiles, request rate, and active alerts.", + required("service_name"), + param("service_name", "string", "The service name to query."), + ), + mkTool("root_cause_analysis", "Ranked probable root causes with evidence: error chains, anomalous metrics, correlated logs.", + required("service"), + param("service", "string", "Service experiencing issues."), + param("time_range", "string", "Lookback window. 
Defaults to '15m'."), + ), + mkTool("impact_analysis", "BFS downstream from a service to find all affected services and impact scores.", + required("service"), + param("service", "string", "Service to analyze blast radius for."), + param("depth", "number", "Max traversal depth (default 5)."), + ), + mkTool("trace_graph", "Returns the full span tree for a trace with service names, durations, errors, and linked logs.", + required("trace_id"), + param("trace_id", "string", "The trace ID to visualize."), + ), + mkTool("search_logs", "Searches log entries by severity, service, body text, trace ID, and time range. Returns id, timestamp, severity, service_name, body, trace_id. **Limited to the last 24 hours** — windows entirely outside the 24h cap are rejected. Strongly recommend setting `service` and/or `severity` to scope the search; unscoped keyword queries scan large row counts when FTS5 is disabled. Use severity=ERROR to find errors, query= for full-text search, trace_id= to correlate with a trace. Use page= for pagination.", + param("query", "string", "Full-text search in log body."), + param("severity", "string", "Filter by severity level: ERROR, WARN, INFO, DEBUG."), + param("service", "string", "Filter by service name (exact match)."), + param("trace_id", "string", "Filter logs belonging to a specific trace ID."), + param("start", "string", "Start time RFC3339. Defaults to 24h ago. Cannot be earlier than now-24h; older values are clamped."), + param("end", "string", "End time RFC3339. Defaults to now. Cannot exceed now; future values are clamped."), + param("limit", "number", "Max results per page (default 50, max 200)."), + param("page", "number", "Page number for pagination (default 0)."), + ), } // mcpCtx returns a tenant-scoped context for repository calls. If the caller's