From 7eed2858e16d7df874636e862a0a88c6d311ed61 Mon Sep 17 00:00:00 2001 From: Amit Kumar Date: Tue, 28 Apr 2026 09:53:23 +0000 Subject: [PATCH 1/2] fix(api): source services dropdown from GraphRAG ServiceStore, not the traces table MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit /api/metadata/services queried `SELECT DISTINCT service_name FROM traces`, which silently dropped any service that only ever appeared as a callee deep in a fan-out. Root cause: TraceServer.Export inserts a Trace row for every span (not just root spans), the table has uniqueIndex(tenant_id, trace_id), so the deepest-fan-out span loses the insert race for its trace_id and its service_name never lands. Result on the simulator: 7 services emit telemetry but only 6 appear in the dropdown — shipping-service (deepest hop) is invisible while user-service occasionally wins the race. Fix: read the dropdown from the in-memory GraphRAG ServiceStore, which sees every UpsertService call regardless of span depth. This is the same source /api/system/graph already uses, so the dropdown now matches the system map exactly. No DB query — GraphRAG's own 60s refresh loop is responsible for cold-start population. Cold-start (first ~60s after restart) returns []; the JSON encoder emits `[]` not `null` so the UI dropdown stays a valid array on first paint. Tests in internal/graphrag/service_names_test.go cover: - a 3-deep fan-out where the grandchild service must still appear (the exact bug condition) - tenant scoping (no leak across tenants) - empty store returns [] not nil Live verification: with the simulator hammering 9001-9007 and otelcontext running on HTTP_PORT=37778, /api/metadata/services now returns all 7 services and matches /api/system/graph exactly. Co-Authored-By: Claude Opus 4.7 (1M context) --- internal/api/metrics_handlers.go | 21 ++++-- internal/graphrag/queries.go | 14 ++++ internal/graphrag/service_names_test.go | 86 +++++++++++++++++++++++++ 3 files changed, 116 insertions(+), 5 deletions(-) create mode 100644 internal/graphrag/service_names_test.go diff --git a/internal/api/metrics_handlers.go b/internal/api/metrics_handlers.go index 2fba8e5..371f76e 100644 --- a/internal/api/metrics_handlers.go +++ b/internal/api/metrics_handlers.go @@ -168,12 +168,23 @@ func (s *Server) handleGetMetricNames(w http.ResponseWriter, r *http.Request) { _ = json.NewEncoder(w).Encode(names) } +// handleGetServices returns the list of services the caller's tenant has +// emitted any span for. Read from the in-memory GraphRAG ServiceStore so +// the dropdown matches /api/system/graph exactly — and so a service that +// only appears as a downstream callee (e.g. shipping-service deep in a +// fan-out) isn't silently dropped because some other span won the +// trace_id-uniqueness race for the legacy `traces` table query. +// +// Cold-start (first ~60s after restart, before the GraphRAG refresh loop +// rebuilds from DB) returns an empty list, which is correct: nothing has +// been ingested yet that the dropdown could meaningfully display. func (s *Server) handleGetServices(w http.ResponseWriter, r *http.Request) { - services, err := s.repo.GetServices(r.Context()) - if err != nil { - slog.Error("Failed to get services metadata", "error", err) - http.Error(w, err.Error(), http.StatusInternalServerError) - return + var services []string + if s.graphRAG != nil { + services = s.graphRAG.ServiceNames(r.Context()) + } + if services == nil { + services = []string{} } w.Header().Set("Content-Type", "application/json") _ = json.NewEncoder(w).Encode(services) diff --git a/internal/graphrag/queries.go b/internal/graphrag/queries.go index c05b343..595db18 100644 --- a/internal/graphrag/queries.go +++ b/internal/graphrag/queries.go @@ -355,6 +355,20 @@ func (g *GraphRAG) AllServiceEdges(ctx context.Context) []*Edge { return g.storesFor(ctx).service.AllEdges() } +// ServiceNames returns every service the caller's tenant has emitted any +// span for, sorted ascending. Reads from the in-memory ServiceStore — no +// DB scan. Used by /api/metadata/services so the dropdown matches the +// system map exactly (both are sourced from the same store). +func (g *GraphRAG) ServiceNames(ctx context.Context) []string { + services := g.storesFor(ctx).service.AllServices() + names := make([]string, 0, len(services)) + for _, svc := range services { + names = append(names, svc.Name) + } + sort.Strings(names) + return names +} + // ServiceMap returns the service topology with health scores for the API. type ServiceMapEntry struct { Service *ServiceNode `json:"service"` diff --git a/internal/graphrag/service_names_test.go b/internal/graphrag/service_names_test.go new file mode 100644 index 0000000..b392c8d --- /dev/null +++ b/internal/graphrag/service_names_test.go @@ -0,0 +1,86 @@ +package graphrag + +import ( + "context" + "reflect" + "testing" + "time" + + "github.com/RandomCodeSpace/otelcontext/internal/storage" +) + +// TestServiceNames_IncludesDeepCalleesAndIsTenantScoped covers the bug that +// motivated this method: prior to ServiceNames, /api/metadata/services +// queried `SELECT DISTINCT service_name FROM traces`, which silently dropped +// any service that only ever appeared as a callee (deep in a fan-out) — its +// span lost the trace_id-uniqueness race and never made it into the traces +// table. ServiceNames reads from the in-memory ServiceStore where every +// span — root or child — registers via UpsertService. +func TestServiceNames_IncludesDeepCalleesAndIsTenantScoped(t *testing.T) { + g := newTestGraphRAG(t) + + now := time.Now() + mk := func(tenant, service, traceID, spanID, parentSpan string) storage.Span { + return storage.Span{ + TenantID: tenant, + TraceID: traceID, + SpanID: spanID, + ParentSpanID: parentSpan, + OperationName: "/op", + ServiceName: service, + Status: "STATUS_CODE_OK", + StartTime: now, + EndTime: now.Add(time.Millisecond), + Duration: 1000, + } + } + + // Tenant A: a 3-deep fan-out under one trace_id. order is the root, + // payment is a child, shipping is a grandchild — exactly the pattern + // where shipping previously got dropped from the dropdown. + g.OnSpanIngested(mk("tenant-a", "order", "t-a-1", "root", "")) + g.OnSpanIngested(mk("tenant-a", "payment", "t-a-1", "child", "root")) + g.OnSpanIngested(mk("tenant-a", "shipping", "t-a-1", "grand", "child")) + + // Tenant B: a single root in a separate tenant. Must not leak into A. + g.OnSpanIngested(mk("tenant-b", "audit", "t-b-1", "root", "")) + + ctxA := storage.WithTenantContext(context.Background(), "tenant-a") + ctxB := storage.WithTenantContext(context.Background(), "tenant-b") + + // Async event workers — poll until both tenants have settled. + deadline := time.Now().Add(2 * time.Second) + for time.Now().Before(deadline) { + if len(g.ServiceNames(ctxA)) >= 3 && len(g.ServiceNames(ctxB)) >= 1 { + break + } + time.Sleep(20 * time.Millisecond) + } + + gotA := g.ServiceNames(ctxA) + wantA := []string{"order", "payment", "shipping"} // sorted ascending + if !reflect.DeepEqual(gotA, wantA) { + t.Fatalf("tenant-a ServiceNames = %v, want %v (sorted, includes deep callee)", gotA, wantA) + } + + gotB := g.ServiceNames(ctxB) + wantB := []string{"audit"} + if !reflect.DeepEqual(gotB, wantB) { + t.Fatalf("tenant-b ServiceNames = %v, want %v (no leak from tenant-a)", gotB, wantB) + } +} + +// TestServiceNames_EmptyOnColdStart asserts that a freshly-constructed +// GraphRAG with no ingested spans returns an empty slice (never nil), +// matching the JSON-encoder expectation of `[]` rather than `null` so the +// UI dropdown stays a valid array on first paint. +func TestServiceNames_EmptyOnColdStart(t *testing.T) { + g := newTestGraphRAG(t) + got := g.ServiceNames(context.Background()) + if got == nil { + t.Fatal("ServiceNames returned nil; want empty slice for json `[]` round-trip") + } + if len(got) != 0 { + t.Fatalf("ServiceNames on empty store = %v, want []", got) + } +} From e6a88f3ed411bdfd287927a06cbc80f4f4de4175 Mon Sep 17 00:00:00 2001 From: Amit Kumar Date: Tue, 28 Apr 2026 09:54:06 +0000 Subject: [PATCH 2/2] chore(test): use Join-Path in run_simulation.ps1 for cross-platform support The simulator's build and process-start steps used Windows-style \$TmpDir\\$svc.exe path concatenation, producing files with literal backslashes when run via pwsh on Linux. Replaced with Join-Path so the script works on both Windows and Linux without behaviour change. This is the path used to run the simulator that exposed the services-dropdown bug fixed in this PR. Co-Authored-By: Claude Opus 4.7 (1M context) --- test/run_simulation.ps1 | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/run_simulation.ps1 b/test/run_simulation.ps1 index 2f9bc9c..60d3261 100644 --- a/test/run_simulation.ps1 +++ b/test/run_simulation.ps1 @@ -53,7 +53,7 @@ $Services = @( Push-Location $RootDir foreach ($svc in $Services) { Write-Host (" {0,-26}" -f $svc.Name) -NoNewline -ForegroundColor DarkGray - go build -o "$TmpDir\$($svc.Name).exe" "./test/$($svc.Name)" 2>&1 | Out-Null + go build -o (Join-Path $TmpDir "$($svc.Name).exe") "./test/$($svc.Name)" 2>&1 | Out-Null Write-Host "built" -ForegroundColor DarkGray } Pop-Location @@ -65,9 +65,9 @@ Write-Host "[2/3] Starting services..." -ForegroundColor Yellow $Procs = @() foreach ($svc in $Services) { - $exe = "$TmpDir\$($svc.Name).exe" - $stdout = "$LogDir\$($svc.Name).stdout" - $stderr = "$LogDir\$($svc.Name).stderr" + $exe = Join-Path $TmpDir "$($svc.Name).exe" + $stdout = Join-Path $LogDir "$($svc.Name).stdout" + $stderr = Join-Path $LogDir "$($svc.Name).stderr" $proc = Start-Process -FilePath $exe -NoNewWindow -PassThru ` -RedirectStandardOutput $stdout ` -RedirectStandardError $stderr