From 901a4aee9a589bd060b8a058b200ce9883cb662e Mon Sep 17 00:00:00 2001
From: Amit Kumar <ak.nitrr13@gmail.com>
Date: Fri, 24 Apr 2026 20:26:36 +0000
Subject: [PATCH 1/2] feat(graphrag): tenant-partition in-memory stores + query
 ctx (RAN-37)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Introduces a tenant dimension to every GraphRAG in-memory store and the
query surface, so every ingest and query lands in the calling tenant's
slice of the graph.

Architecture
- New tenantStores composite bundles one tenant's ServiceStore,
  TraceStore, SignalStore, AnomalyStore. GraphRAG drops the four
  direct-store fields and instead holds tenants map[string]*tenantStores
  guarded by its own RWMutex; slices are lazy-created via
  storesFor(ctx) / storesForTenant(tenant). The default tenant is
  seeded at New() so background loops have a baseline to iterate.
- spanEvent / logEvent / metricEvent each carry Tenant string;
  OnSpanIngested / OnLogIngested / OnMetricIngested derive it from
  storage.Span / storage.Log / tsdb.RawMetric TenantID fields. Callback
  signatures are intentionally unchanged — tsdb.RawMetric already
  carries TenantID from ingest/otlp.go, so no second argument or struct
  change is needed for the metric path.
- Every public query method takes ctx as its first argument and routes
  through storesFor(ctx): ErrorChain, ImpactAnalysis, RootCauseAnalysis,
  DependencyChain, CorrelatedSignals, ShortestPath, AnomalyTimeline,
  ServiceMap, SimilarErrors. Two narrow helpers (AllServiceEdges,
  AnomaliesForService) let handlers read store data without reaching
  into the composite directly.
- Background loops (refresh, snapshot, anomaly) iterate a stable
  snapshot of tenants on each tick, wrap ctx with
  storage.WithTenantContext(ctx, tenant), and act on each slice under
  its own per-store lock. rebuildAllTenantsFromDB enumerates tenants
  from the spans table so historical data from tenants without live
  callbacks is still recovered.
- The Drain log-template miner stays shared across tenants — templates
  describe log shape, not content — while per-tenant SignalStore maps
  keep cluster nodes isolated. Persisted tables (investigations,
  graph_snapshots, drain_templates) intentionally unchanged; Subtask B
  (RAN-38) owns the tenant_id column migration. PersistInvestigation
  now accepts tenant as its first arg so the column can be wired in B
  without a signature break.

External callers updated
- internal/mcp/tools.go: all GraphRAG-backed tools take ctx and thread
  it through the query call; AnomalyStore direct access replaced with
  AnomaliesForService.
- internal/api/graph_handler.go: handleGetSystemGraph uses
  r.Context() throughout, the 10s response cache is tenant-keyed so
  cross-tenant cache hits are impossible, and ServiceStore direct
  access replaced with AllServiceEdges.

Tests
- builder_test + investigation_cooldown_test updated to new signatures.
- New tenant_isolation_test covers the core RAN-37 invariant: two
  tenants ingesting overlapping data see only their own services /
  traces, and empty-context lookups collapse to DefaultTenantID.
- go build ./... && go vet ./... && go test ./... all green;
  go test -race ./internal/graphrag/... clean.

Scope
- Out of scope: persisted-table tenant columns (Subtask B, RAN-38),
  MCP-level tenant-isolation integration test (Subtask C, RAN-39 —
  the merge gate for RAN-19). internal/graph/ legacy package untouched.

Refs RAN-19 RAN-21

Co-Authored-By: Paperclip <noreply@paperclip.ing>
---
 internal/api/graph_handler.go                 |  25 ++--
 internal/graphrag/anomaly.go                  |  82 ++++++-----
 internal/graphrag/builder.go                  | 131 +++++++++++++++---
 internal/graphrag/builder_test.go             |  15 +-
 internal/graphrag/clustering.go               |  44 ++++--
 internal/graphrag/investigation.go            |  16 ++-
 .../graphrag/investigation_cooldown_test.go   |   6 +-
 internal/graphrag/queries.go                  |  94 ++++++++-----
 internal/graphrag/refresh.go                  |  90 +++++++++---
 internal/graphrag/snapshot.go                 |  51 +++++--
 internal/graphrag/store.go                    |  21 +++
 internal/graphrag/tenant_isolation_test.go    |  84 +++++++++++
 internal/mcp/tools.go                         |  50 ++++---
 13 files changed, 535 insertions(+), 174 deletions(-)
 create mode 100644 internal/graphrag/tenant_isolation_test.go

diff --git a/internal/api/graph_handler.go b/internal/api/graph_handler.go
index cb98844..1ee9826 100644
--- a/internal/api/graph_handler.go
+++ b/internal/api/graph_handler.go
@@ -7,6 +7,8 @@ import (
 	"math"
 	"net/http"
 	"time"
+
+	"github.com/RandomCodeSpace/otelcontext/internal/storage"
 )
 
 // SystemSummary is the top-level system health summary.
@@ -63,10 +65,12 @@ var OtelContextStartTime = time.Now()
 // handleGetSystemGraph handles GET /api/system/graph.
 // When the in-memory graph has been populated it returns instantly from memory.
 // Falls back to a DB query only when the graph has never been built yet.
-// Results are additionally cached for 10s to smooth out burst traffic.
+// Results are cached for 10s per tenant — the cache key is scoped by tenant
+// so two tenants hitting this endpoint never share a response.
 func (s *Server) handleGetSystemGraph(w http.ResponseWriter, r *http.Request) {
-	const cacheKey = "system_graph"
 	const cacheTTL = 10 * time.Second
+	ctx := r.Context()
+	cacheKey := "system_graph:" + storage.TenantFromContext(ctx)
 
 	if cached, ok := s.cache.Get(cacheKey); ok {
 		w.Header().Set("Content-Type", "application/json")
@@ -75,10 +79,10 @@ func (s *Server) handleGetSystemGraph(w http.ResponseWriter, r *http.Request) {
 		return
 	}
 
-	resp := s.buildGraphFromMemory()
+	resp := s.buildGraphFromMemory(ctx)
 	if resp == nil {
 		// Graph not yet hydrated — fall back to DB path.
-		resp = s.buildGraphFromDB(r.Context())
+		resp = s.buildGraphFromDB(ctx)
 		if resp == nil {
 			http.Error(w, "failed to build system graph", http.StatusInternalServerError)
 			return
@@ -93,10 +97,10 @@ func (s *Server) handleGetSystemGraph(w http.ResponseWriter, r *http.Request) {
 
 // buildGraphFromMemory converts the in-memory graph snapshot to the API response.
 // Returns nil if the graph has not been built yet.
-func (s *Server) buildGraphFromMemory() *SystemGraphResponse {
+func (s *Server) buildGraphFromMemory(ctx context.Context) *SystemGraphResponse {
 	// Prefer GraphRAG if available
 	if s.graphRAG != nil {
-		return s.buildGraphFromGraphRAG()
+		return s.buildGraphFromGraphRAG(ctx)
 	}
 	if s.graph == nil {
 		return nil
@@ -147,9 +151,10 @@ func (s *Server) buildGraphFromMemory() *SystemGraphResponse {
 	return buildSummaryResponse(nodes, edges, totalErrorRate, totalLatency)
 }
 
-// buildGraphFromGraphRAG converts the GraphRAG service store into the API response.
-func (s *Server) buildGraphFromGraphRAG() *SystemGraphResponse {
-	services := s.graphRAG.ServiceMap(0)
+// buildGraphFromGraphRAG converts the caller's tenant slice of the GraphRAG
+// service store into the API response.
+func (s *Server) buildGraphFromGraphRAG(ctx context.Context) *SystemGraphResponse {
+	services := s.graphRAG.ServiceMap(ctx, 0)
 	if len(services) == 0 {
 		return nil
 	}
@@ -179,7 +184,7 @@ func (s *Server) buildGraphFromGraphRAG() *SystemGraphResponse {
 	}
 
 	edges := make([]GraphEdge, 0)
-	allEdges := s.graphRAG.ServiceStore.AllEdges()
+	allEdges := s.graphRAG.AllServiceEdges(ctx)
 	for _, e := range allEdges {
 		if e.Type == "CALLS" {
 			edges = append(edges, GraphEdge{
diff --git a/internal/graphrag/anomaly.go b/internal/graphrag/anomaly.go
index c5b0cbc..0d84cb7 100644
--- a/internal/graphrag/anomaly.go
+++ b/internal/graphrag/anomaly.go
@@ -4,11 +4,23 @@ import (
 	"context"
 	"fmt"
 	"time"
+
+	"github.com/RandomCodeSpace/otelcontext/internal/storage"
 )
 
-// detectAnomalies runs anomaly detection across all services.
-func (g *GraphRAG) detectAnomalies() {
-	services := g.ServiceStore.AllServices()
+// detectAnomalies runs anomaly detection across every known tenant. For each
+// tenant slice we walk the ServiceStore and SignalStore under their own
+// locks and emit anomalies into that tenant's AnomalyStore.
+func (g *GraphRAG) detectAnomalies(ctx context.Context) {
+	tenants := g.snapshotTenants()
+	for tenant, stores := range tenants {
+		tctx := storage.WithTenantContext(ctx, tenant)
+		g.detectAnomaliesForTenant(tctx, tenant, stores)
+	}
+}
+
+func (g *GraphRAG) detectAnomaliesForTenant(ctx context.Context, tenant string, stores *tenantStores) {
+	services := stores.service.AllServices()
 	now := time.Now()
 
 	for _, svc := range services {
@@ -23,14 +35,14 @@ func (g *GraphRAG) detectAnomalies() {
 				Evidence:  fmt.Sprintf("error rate %.1f%% (baseline ~%.1f%%)", svc.ErrorRate*100, baselineErrorRate*100),
 				Timestamp: now,
 			}
-			g.AnomalyStore.AddAnomaly(anomaly)
-			g.correlateWithRecent(anomaly)
+			stores.anomalies.AddAnomaly(anomaly)
+			correlateWithRecent(stores, anomaly)
 
-			// Trigger investigation
-			chains := g.ErrorChain(svc.Name, now.Add(-5*time.Minute), 5)
+			// Trigger investigation (scoped to this tenant).
+			chains := g.ErrorChain(ctx, svc.Name, now.Add(-5*time.Minute), 5)
 			if len(chains) > 0 {
-				anomalies := g.AnomalyStore.AnomaliesForService(svc.Name, now.Add(-1*time.Minute))
-				g.PersistInvestigation(svc.Name, chains, anomalies)
+				anomalies := stores.anomalies.AnomaliesForService(svc.Name, now.Add(-1*time.Minute))
+				g.PersistInvestigation(tenant, svc.Name, chains, anomalies)
 			}
 		}
 
@@ -44,18 +56,18 @@ func (g *GraphRAG) detectAnomalies() {
 				Evidence:  fmt.Sprintf("avg latency %.0fms", svc.AvgLatency),
 				Timestamp: now,
 			}
-			g.AnomalyStore.AddAnomaly(anomaly)
-			g.correlateWithRecent(anomaly)
+			stores.anomalies.AddAnomaly(anomaly)
+			correlateWithRecent(stores, anomaly)
 		}
 	}
 
-	// Metric z-score anomalies (check metrics in SignalStore)
-	g.SignalStore.mu.RLock()
-	metrics := make([]*MetricNode, 0, len(g.SignalStore.Metrics))
-	for _, m := range g.SignalStore.Metrics {
+	// Metric z-score anomalies (check metrics in this tenant's SignalStore).
+	stores.signals.mu.RLock()
+	metrics := make([]*MetricNode, 0, len(stores.signals.Metrics))
+	for _, m := range stores.signals.Metrics {
 		metrics = append(metrics, m)
 	}
-	g.SignalStore.mu.RUnlock()
+	stores.signals.mu.RUnlock()
 
 	for _, m := range metrics {
 		if m.SampleCount < 10 {
@@ -74,23 +86,24 @@ func (g *GraphRAG) detectAnomalies() {
 					Evidence:  fmt.Sprintf("metric %s z-score %.1f (avg=%.2f, range=[%.2f, %.2f])", m.MetricName, deviation, m.RollingAvg, m.RollingMin, m.RollingMax),
 					Timestamp: now,
 				}
-				g.AnomalyStore.AddAnomaly(anomaly)
-				g.correlateWithRecent(anomaly)
+				stores.anomalies.AddAnomaly(anomaly)
+				correlateWithRecent(stores, anomaly)
 			}
 		}
 	}
 }
 
-// correlateWithRecent links an anomaly to other anomalies within ±30s.
-func (g *GraphRAG) correlateWithRecent(anomaly AnomalyNode) {
+// correlateWithRecent links an anomaly to other anomalies within ±30s in the
+// same tenant's AnomalyStore.
+func correlateWithRecent(stores *tenantStores, anomaly AnomalyNode) {
 	window := 30 * time.Second
-	recent := g.AnomalyStore.AnomaliesSince(anomaly.Timestamp.Add(-window))
+	recent := stores.anomalies.AnomaliesSince(anomaly.Timestamp.Add(-window))
 	for _, prev := range recent {
 		if prev.ID == anomaly.ID {
 			continue
 		}
 		if prev.Timestamp.After(anomaly.Timestamp.Add(-window)) && prev.Timestamp.Before(anomaly.Timestamp.Add(window)) {
-			g.AnomalyStore.AddPrecededByEdge(anomaly.ID, prev.ID, anomaly.Timestamp)
+			stores.anomalies.AddPrecededByEdge(anomaly.ID, prev.ID, anomaly.Timestamp)
 		}
 	}
 }
@@ -117,20 +130,23 @@ func classifyLatencySeverity(avgMs float64) AnomalySeverity {
 	}
 }
 
-// pruneOldAnomalies removes anomalies older than 24 hours.
+// pruneOldAnomalies removes anomalies older than 24 hours across every
+// tenant slice.
 func (g *GraphRAG) pruneOldAnomalies() {
 	cutoff := time.Now().Add(-24 * time.Hour)
-	g.AnomalyStore.mu.Lock()
-	defer g.AnomalyStore.mu.Unlock()
-	for id, a := range g.AnomalyStore.Anomalies {
-		if a.Timestamp.Before(cutoff) {
-			delete(g.AnomalyStore.Anomalies, id)
+	for _, stores := range g.snapshotTenants() {
+		stores.anomalies.mu.Lock()
+		for id, a := range stores.anomalies.Anomalies {
+			if a.Timestamp.Before(cutoff) {
+				delete(stores.anomalies.Anomalies, id)
+			}
 		}
-	}
-	for ek, e := range g.AnomalyStore.Edges {
-		if e.UpdatedAt.Before(cutoff) {
-			delete(g.AnomalyStore.Edges, ek)
+		for ek, e := range stores.anomalies.Edges {
+			if e.UpdatedAt.Before(cutoff) {
+				delete(stores.anomalies.Edges, ek)
+			}
 		}
+		stores.anomalies.mu.Unlock()
 	}
 }
 
@@ -145,7 +161,7 @@ func (g *GraphRAG) anomalyLoop(ctx context.Context) {
 		case <-g.stopCh:
 			return
 		case <-ticker.C:
-			g.detectAnomalies()
+			g.detectAnomalies(ctx)
 		}
 	}
 }
diff --git a/internal/graphrag/builder.go b/internal/graphrag/builder.go
index 3bf5451..143842b 100644
--- a/internal/graphrag/builder.go
+++ b/internal/graphrag/builder.go
@@ -4,6 +4,7 @@ import (
 	"context"
 	"log/slog"
 	"runtime/debug"
+	"sync"
 	"sync/atomic"
 	"time"
 
@@ -51,16 +52,22 @@ type spanEvent struct {
 	Span    storage.Span
 	TraceID string
 	Status  string
+	// Tenant is the tenant slice to route this event into. Populated by
+	// OnSpanIngested from storage.Span.TenantID; empty values are coerced
+	// to storage.DefaultTenantID before processing.
+	Tenant string
 }
 
 // logEvent is sent through the ingestion channel.
 type logEvent struct {
-	Log storage.Log
+	Log    storage.Log
+	Tenant string
 }
 
 // metricEvent is sent through the ingestion channel.
 type metricEvent struct {
 	Metric tsdb.RawMetric
+	Tenant string
 }
 
 // event wraps one of the above event types.
@@ -71,11 +78,17 @@ type event struct {
 }
 
 // GraphRAG is the main coordinator for the layered graph system.
+//
+// Every in-memory store is partitioned by tenant. The coordinator holds a map
+// of tenant ID → *tenantStores and a reader/writer mutex that protects only
+// the outer map; per-tenant stores keep their own RWMutexes for fine-grained
+// concurrent access. All event ingestion and queries route through
+// storesFor(ctx) / storesForTenant(tenant) — there is no "global" slice.
 type GraphRAG struct {
-	ServiceStore *ServiceStore
-	TraceStore   *TraceStore
-	SignalStore  *SignalStore
-	AnomalyStore *AnomalyStore
+	// tenants maps tenant ID → per-tenant store composite. Access via
+	// storesFor / storesForTenant / snapshotTenants, not directly.
+	tenants   map[string]*tenantStores
+	tenantsMu sync.RWMutex
 
 	repo      *storage.Repository
 	vectorIdx *vectordb.Index
@@ -201,10 +214,7 @@ func New(repo *storage.Repository, vectorIdx *vectordb.Index, tsdbAgg *tsdb.Aggr
 	}
 
 	g := &GraphRAG{
-		ServiceStore:  newServiceStore(),
-		TraceStore:    newTraceStore(cfg.TraceTTL),
-		SignalStore:   newSignalStore(),
-		AnomalyStore:  newAnomalyStore(),
+		tenants:       make(map[string]*tenantStores),
 		repo:          repo,
 		vectorIdx:     vectorIdx,
 		tsdbAgg:       tsdbAgg,
@@ -220,6 +230,11 @@ func New(repo *storage.Repository, vectorIdx *vectordb.Index, tsdbAgg *tsdb.Aggr
 		invCooldown:   newInvestigationCooldown(5 * time.Minute),
 	}
 
+	// Bootstrap the default tenant slice so refresh/snapshot loops have a
+	// baseline to iterate over before any ingest lands. Other tenants are
+	// created lazily on first event via storesForTenant.
+	g.storesForTenant(storage.DefaultTenantID)
+
 	// Restore persisted Drain templates so log clustering survives restarts.
 	// A missing table (fresh install) or transient DB error is non-fatal —
 	// ingestion will rebuild templates from scratch.
@@ -311,16 +326,24 @@ func (g *GraphRAG) IsRunning() bool {
 }
 
 // OnSpanIngested is the callback wired into the trace ingestion pipeline.
+// Tenant is taken straight from the persisted Span (already resolved upstream
+// by the OTLP Export handlers) and carried on the event — the callback
+// signature is intentionally unchanged so external wiring stays trivial.
 func (g *GraphRAG) OnSpanIngested(span storage.Span) {
 	status := span.Status
 	if status == "" {
 		status = "STATUS_CODE_UNSET"
 	}
+	tenant := span.TenantID
+	if tenant == "" {
+		tenant = storage.DefaultTenantID
+	}
 	select {
 	case g.eventCh <- event{span: &spanEvent{
 		Span:    span,
 		TraceID: span.TraceID,
 		Status:  status,
+		Tenant:  tenant,
 	}}:
 	default:
 		// Channel full — graph is best-effort; DB is source of truth.
@@ -330,8 +353,12 @@ func (g *GraphRAG) OnSpanIngested(span storage.Span) {
 
 // OnLogIngested is the callback wired into the log ingestion pipeline.
 func (g *GraphRAG) OnLogIngested(log storage.Log) {
+	tenant := log.TenantID
+	if tenant == "" {
+		tenant = storage.DefaultTenantID
+	}
 	select {
-	case g.eventCh <- event{log: &logEvent{Log: log}}:
+	case g.eventCh <- event{log: &logEvent{Log: log, Tenant: tenant}}:
 	default:
 		// Channel full — graph is best-effort; DB is source of truth.
 		g.recordEventDrop("log")
@@ -339,9 +366,16 @@ func (g *GraphRAG) OnLogIngested(log storage.Log) {
 }
 
 // OnMetricIngested is the callback wired into the metric ingestion pipeline.
+// tsdb.RawMetric already carries a resolved TenantID (set in ingest/otlp.go
+// Export), so we read it here instead of adding a second argument — keeping
+// the metric callback signature identical across TSDB and GraphRAG.
 func (g *GraphRAG) OnMetricIngested(metric tsdb.RawMetric) {
+	tenant := metric.TenantID
+	if tenant == "" {
+		tenant = storage.DefaultTenantID
+	}
 	select {
-	case g.eventCh <- event{metric: &metricEvent{Metric: metric}}:
+	case g.eventCh <- event{metric: &metricEvent{Metric: metric, Tenant: tenant}}:
 	default:
 		// Channel full — graph is best-effort; DB is source of truth.
 		g.recordEventDrop("metric")
@@ -381,17 +415,19 @@ func (g *GraphRAG) processSpan(ev *spanEvent) {
 		return
 	}
 
+	stores := g.storesForTenant(ev.Tenant)
+
 	// 1. Upsert ServiceNode
-	g.ServiceStore.UpsertService(span.ServiceName, durationMs, isError, span.StartTime)
+	stores.service.UpsertService(span.ServiceName, durationMs, isError, span.StartTime)
 
 	// 2. Upsert OperationNode + EXPOSES edge
 	if span.OperationName != "" {
-		g.ServiceStore.UpsertOperation(span.ServiceName, span.OperationName, durationMs, isError, span.StartTime)
+		stores.service.UpsertOperation(span.ServiceName, span.OperationName, durationMs, isError, span.StartTime)
 	}
 
 	// 3. Create TraceNode + SpanNode + CONTAINS + CHILD_OF edges
-	g.TraceStore.UpsertTrace(span.TraceID, span.ServiceName, ev.Status, durationMs, span.StartTime)
-	g.TraceStore.UpsertSpan(SpanNode{
+	stores.traces.UpsertTrace(span.TraceID, span.ServiceName, ev.Status, durationMs, span.StartTime)
+	stores.traces.UpsertSpan(SpanNode{
 		ID:           span.SpanID,
 		TraceID:      span.TraceID,
 		ParentSpanID: span.ParentSpanID,
@@ -405,9 +441,9 @@ func (g *GraphRAG) processSpan(ev *spanEvent) {
 
 	// 4. If parent span exists and belongs to different service, create CALLS edge
 	if span.ParentSpanID != "" {
-		if parentSpan, ok := g.TraceStore.GetSpan(span.ParentSpanID); ok {
+		if parentSpan, ok := stores.traces.GetSpan(span.ParentSpanID); ok {
 			if parentSpan.Service != span.ServiceName {
-				g.ServiceStore.UpsertCallEdge(parentSpan.Service, span.ServiceName, durationMs, isError, span.StartTime)
+				stores.service.UpsertCallEdge(parentSpan.Service, span.ServiceName, durationMs, isError, span.StartTime)
 			}
 		}
 	}
@@ -420,16 +456,21 @@ func (g *GraphRAG) processLog(ev *logEvent) {
 		return
 	}
 
-	// Drain-based clustering (replaces hash+TF-IDF clustering).
+	stores := g.storesForTenant(ev.Tenant)
+
+	// Drain-based clustering (replaces hash+TF-IDF clustering). The Drain
+	// miner is shared across tenants — its template tokens describe log shape,
+	// not content, so same-shape logs from different tenants share a template
+	// ID but land in their own tenant's SignalStore LogClusterNode entry.
 	body := log.Body
-	clusterID := g.clusterLog(log.ServiceName, body, log.Severity, log.Timestamp)
+	clusterID := g.clusterLog(stores, log.ServiceName, body, log.Severity, log.Timestamp)
 	if clusterID == "" {
 		return
 	}
 
 	// If log has trace_id + span_id, create LOGGED_DURING edge
 	if log.SpanID != "" {
-		g.SignalStore.AddLoggedDuringEdge(clusterID, log.SpanID, log.Timestamp)
+		stores.signals.AddLoggedDuringEdge(clusterID, log.SpanID, log.Timestamp)
 	}
 }
 
@@ -438,7 +479,8 @@ func (g *GraphRAG) processMetric(ev *metricEvent) {
 	if m.ServiceName == "" {
 		return
 	}
-	g.SignalStore.UpsertMetric(m.Name, m.ServiceName, m.Value, m.Timestamp)
+	stores := g.storesForTenant(ev.Tenant)
+	stores.signals.UpsertMetric(m.Name, m.ServiceName, m.Value, m.Timestamp)
 }
 
 // simpleHash produces a quick hash for log clustering.
@@ -449,3 +491,50 @@ func simpleHash(s string) uint32 {
 	}
 	return h
 }
+
+// storesFor returns the tenantStores composite scoped to the tenant carried
+// on ctx. A missing or empty tenant collapses to storage.DefaultTenantID,
+// matching WithTenantContext semantics. Lazily creates the slice on first
+// reference so a single-tenant install never carries empty maps for phantom
+// tenants, and a new tenant does not require a restart.
+func (g *GraphRAG) storesFor(ctx context.Context) *tenantStores {
+	return g.storesForTenant(storage.TenantFromContext(ctx))
+}
+
+// storesForTenant is the tenant-string flavour of storesFor, used by event
+// handlers that have already resolved the tenant (the callback path carries
+// it on spanEvent / logEvent / metricEvent). Empty strings are coerced to
+// storage.DefaultTenantID.
+func (g *GraphRAG) storesForTenant(tenant string) *tenantStores {
+	if tenant == "" {
+		tenant = storage.DefaultTenantID
+	}
+	g.tenantsMu.RLock()
+	slice, ok := g.tenants[tenant]
+	g.tenantsMu.RUnlock()
+	if ok {
+		return slice
+	}
+	g.tenantsMu.Lock()
+	defer g.tenantsMu.Unlock()
+	if slice, ok = g.tenants[tenant]; ok {
+		return slice
+	}
+	slice = newTenantStores(g.traceTTL)
+	g.tenants[tenant] = slice
+	return slice
+}
+
+// snapshotTenants returns a stable copy of the tenant → stores map suitable
+// for iteration without holding the coordinator lock. Background loops call
+// this once per tick and then operate on each slice under its own per-store
+// lock, so a long-running refresh never blocks new-tenant ingestion.
+func (g *GraphRAG) snapshotTenants() map[string]*tenantStores {
+	g.tenantsMu.RLock()
+	defer g.tenantsMu.RUnlock()
+	out := make(map[string]*tenantStores, len(g.tenants))
+	for k, v := range g.tenants {
+		out[k] = v
+	}
+	return out
+}
diff --git a/internal/graphrag/builder_test.go b/internal/graphrag/builder_test.go
index 9070598..e73fe21 100644
--- a/internal/graphrag/builder_test.go
+++ b/internal/graphrag/builder_test.go
@@ -52,6 +52,7 @@ func TestOnSpanIngested_PropagatesErrorStatus(t *testing.T) {
 	g := newTestGraphRAG(t)
 
 	errSpan := storage.Span{
+		TenantID:      storage.DefaultTenantID,
 		TraceID:       "trace-err",
 		SpanID:        "span-err",
 		OperationName: "/checkout",
@@ -63,9 +64,10 @@ func TestOnSpanIngested_PropagatesErrorStatus(t *testing.T) {
 	g.OnSpanIngested(errSpan)
 
 	// Event loop is async; poll briefly for the event to be processed.
+	stores := g.storesForTenant(storage.DefaultTenantID)
 	deadline := time.Now().Add(2 * time.Second)
 	for time.Now().Before(deadline) {
-		if svc, ok := g.ServiceStore.GetService("orders"); ok && svc.ErrorCount > 0 {
+		if svc, ok := stores.service.GetService("orders"); ok && svc.ErrorCount > 0 {
 			return
 		}
 		time.Sleep(20 * time.Millisecond)
@@ -80,9 +82,10 @@ func TestOnSpanIngested_PropagatesErrorStatus(t *testing.T) {
 func TestRefresh_PopulatesErrorCountFromDBStatus(t *testing.T) {
 	repo := newTestRepo(t)
 
-	// Seed: one trace + one ERROR span under it.
+	// Seed: one trace + one ERROR span under it, on the default tenant.
 	now := time.Now()
 	tr := storage.Trace{
+		TenantID:    storage.DefaultTenantID,
 		TraceID:     "trace-err-refresh",
 		ServiceName: "orders",
 		Duration:    1000,
@@ -93,6 +96,7 @@ func TestRefresh_PopulatesErrorCountFromDBStatus(t *testing.T) {
 		t.Fatalf("seed trace: %v", err)
 	}
 	sp := storage.Span{
+		TenantID:      storage.DefaultTenantID,
 		TraceID:       "trace-err-refresh",
 		SpanID:        "span-err-refresh",
 		OperationName: "/checkout",
@@ -111,11 +115,12 @@ func TestRefresh_PopulatesErrorCountFromDBStatus(t *testing.T) {
 	g := New(repo, nil, nil, nil, DefaultConfig())
 	t.Cleanup(g.Stop)
 
-	g.rebuildFromDB()
+	g.rebuildAllTenantsFromDB(context.Background())
 
-	svc, ok := g.ServiceStore.GetService("orders")
+	stores := g.storesForTenant(storage.DefaultTenantID)
+	svc, ok := stores.service.GetService("orders")
 	if !ok {
-		t.Fatalf("service 'orders' missing after rebuildFromDB")
+		t.Fatalf("service 'orders' missing after rebuildAllTenantsFromDB")
 	}
 	if svc.ErrorCount < 1 {
 		t.Fatalf("ErrorCount=%d after refresh, want >=1 — status not read from DB", svc.ErrorCount)
diff --git a/internal/graphrag/clustering.go b/internal/graphrag/clustering.go
index ec33469..ad43778 100644
--- a/internal/graphrag/clustering.go
+++ b/internal/graphrag/clustering.go
@@ -6,22 +6,29 @@ package graphrag
 // SimilarErrors — similarity search across mined templates.
 
 import (
+	"context"
 	"fmt"
 	"time"
+
+	"github.com/RandomCodeSpace/otelcontext/internal/storage"
 )
 
 // clusterLog runs the log body through Drain and upserts a LogClusterNode
-// into the SignalStore. Returns the service-scoped cluster ID.
+// into the supplied tenant's SignalStore. Returns the service-scoped cluster ID.
 //
 // The cluster ID is service-scoped and derived from the Drain template ID,
 // so it remains stable for any future ingestion of the same template shape.
 // Drain's internal template ID may change when tokens generalize to
 // wildcards; the LogClusterNode's TemplateID field is updated to track this.
-func (g *GraphRAG) clusterLog(service, body, severity string, ts time.Time) string {
+//
+// The Drain miner itself is shared across tenants — its templates describe
+// log shape, not content, and the per-tenant SignalStore keeps the actual
+// cluster nodes isolated.
+func (g *GraphRAG) clusterLog(stores *tenantStores, service, body, severity string, ts time.Time) string {
 	if g.drain == nil {
 		// Fallback: legacy hash-based clustering.
 		clusterID := fmt.Sprintf("lc_%s_%x", service, simpleHash(body))
-		g.SignalStore.UpsertLogCluster(clusterID, body, severity, service, ts)
+		stores.signals.UpsertLogCluster(clusterID, body, severity, service, ts)
 		return clusterID
 	}
 
@@ -34,7 +41,7 @@ func (g *GraphRAG) clusterLog(service, body, severity string, ts time.Time) stri
 	// Drain merges (tokens generalize), the ID may shift — acceptable since
 	// it occurs only on the first few ingestions of a pattern.
 	clusterID := fmt.Sprintf("lc_%s_%x", service, tpl.ID)
-	g.SignalStore.UpsertLogClusterWithTemplate(
+	stores.signals.UpsertLogClusterWithTemplate(
 		clusterID,
 		tpl.TemplateString(),
 		severity,
@@ -47,15 +54,20 @@ func (g *GraphRAG) clusterLog(service, body, severity string, ts time.Time) stri
 	return clusterID
 }
 
-// SimilarErrors finds log clusters similar to a given cluster using the vector index.
-func (g *GraphRAG) SimilarErrors(clusterID string, k int) []LogClusterNode {
+// SimilarErrors finds log clusters similar to a given cluster using the vector
+// index, scoped to the tenant carried on ctx. Cross-tenant hits are impossible
+// because the underlying vectordb partitions docs per tenant and this lookup
+// resolves the SignalStore through storesFor(ctx).
+func (g *GraphRAG) SimilarErrors(ctx context.Context, clusterID string, k int) []LogClusterNode {
 	if k <= 0 {
 		k = 10
 	}
 
-	g.SignalStore.mu.RLock()
-	cluster, ok := g.SignalStore.LogClusters[clusterID]
-	g.SignalStore.mu.RUnlock()
+	stores := g.storesFor(ctx)
+
+	stores.signals.mu.RLock()
+	cluster, ok := stores.signals.LogClusters[clusterID]
+	stores.signals.mu.RUnlock()
 	if !ok {
 		return nil
 	}
@@ -68,21 +80,25 @@ func (g *GraphRAG) SimilarErrors(clusterID string, k int) []LogClusterNode {
 	if query == "" && len(cluster.TemplateTokens) > 0 {
 		query = joinTokens(cluster.TemplateTokens)
 	}
-	results := g.vectorIdx.Search(query, k*2) // over-fetch to filter
+	// vectordb.Index.Search takes the tenant string directly; we resolve it
+	// from ctx via the same storage helper used by storesFor so both sides
+	// agree on coercion rules (empty → DefaultTenantID).
+	tenant := storage.TenantFromContext(ctx)
+	results := g.vectorIdx.Search(tenant, query, k*2) // over-fetch to filter
 
 	// Map results back to log clusters.
 	seen := map[string]bool{clusterID: true}
 	var similar []LogClusterNode
 
-	g.SignalStore.mu.RLock()
-	defer g.SignalStore.mu.RUnlock()
+	stores.signals.mu.RLock()
+	defer stores.signals.mu.RUnlock()
 
 	for _, r := range results {
-		for _, lc := range g.SignalStore.LogClusters {
+		for _, lc := range stores.signals.LogClusters {
 			if seen[lc.ID] {
 				continue
 			}
-			for _, e := range g.SignalStore.Edges {
+			for _, e := range stores.signals.Edges {
 				if e.Type == EdgeEmittedBy && e.FromID == lc.ID && e.ToID == r.ServiceName {
 					seen[lc.ID] = true
 					similar = append(similar, *lc)
diff --git a/internal/graphrag/investigation.go b/internal/graphrag/investigation.go
index 28765f9..e3dbd31 100644
--- a/internal/graphrag/investigation.go
+++ b/internal/graphrag/investigation.go
@@ -1,6 +1,7 @@
 package graphrag
 
 import (
+	"context"
 	"encoding/json"
 	"fmt"
 	"log/slog"
@@ -8,6 +9,7 @@ import (
 	"sync"
 	"time"
 
+	"github.com/RandomCodeSpace/otelcontext/internal/storage"
 	"gorm.io/gorm"
 )
 
@@ -91,8 +93,12 @@ func AutoMigrateGraphRAG(db *gorm.DB) error {
 	return db.AutoMigrate(&Investigation{}, &GraphSnapshot{}, &DrainTemplateRow{})
 }
 
-// PersistInvestigation saves an investigation record from an error chain analysis.
-func (g *GraphRAG) PersistInvestigation(triggerService string, chains []ErrorChainResult, anomalies []*AnomalyNode) {
+// PersistInvestigation saves an investigation record from an error chain
+// analysis. Tenant is accepted explicitly so the caller (the per-tenant
+// anomaly loop) can re-enter ImpactAnalysis on the correct tenant slice. The
+// `investigations` table itself does not yet carry a tenant_id column —
+// Subtask B (RAN-38) owns that migration.
+func (g *GraphRAG) PersistInvestigation(tenant, triggerService string, chains []ErrorChainResult, anomalies []*AnomalyNode) {
 	if len(chains) == 0 {
 		return
 	}
@@ -151,8 +157,10 @@ func (g *GraphRAG) PersistInvestigation(triggerService string, chains []ErrorCha
 		})
 	}
 
-	// Affected services from impact analysis
-	impact := g.ImpactAnalysis(triggerService, 3)
+	// Affected services from impact analysis, run against the tenant that
+	// raised this investigation.
+	impactCtx := storage.WithTenantContext(context.Background(), tenant)
+	impact := g.ImpactAnalysis(impactCtx, triggerService, 3)
 	var affected []string
 	for _, a := range impact.AffectedServices {
 		affected = append(affected, a.Service)
diff --git a/internal/graphrag/investigation_cooldown_test.go b/internal/graphrag/investigation_cooldown_test.go
index 8c8e292..1e36eeb 100644
--- a/internal/graphrag/investigation_cooldown_test.go
+++ b/internal/graphrag/investigation_cooldown_test.go
@@ -22,13 +22,13 @@ func TestPersistInvestigation_Cooldown(t *testing.T) {
 		RootCause: &RootCauseInfo{Service: "orders", Operation: "op"},
 	}}
 
-	g.PersistInvestigation("orders", chains, nil)
+	g.PersistInvestigation("default", "orders", chains, nil)
 	first := g.InvestigationInsertCount()
 	if first == 0 {
 		t.Fatalf("first PersistInvestigation should insert, got count=0")
 	}
 
-	g.PersistInvestigation("orders", chains, nil)
+	g.PersistInvestigation("default", "orders", chains, nil)
 	second := g.InvestigationInsertCount()
 	if second != first {
 		t.Fatalf("second PersistInvestigation within cooldown should be suppressed; got %d new inserts", second-first)
@@ -38,7 +38,7 @@ func TestPersistInvestigation_Cooldown(t *testing.T) {
 		TraceID:   "tr2",
 		RootCause: &RootCauseInfo{Service: "payments", Operation: "op"},
 	}}
-	g.PersistInvestigation("payments", chains2, nil)
+	g.PersistInvestigation("default", "payments", chains2, nil)
 	third := g.InvestigationInsertCount()
 	if third <= second {
 		t.Fatalf("distinct service should bypass cooldown; got %d, want > %d", third, second)
diff --git a/internal/graphrag/queries.go b/internal/graphrag/queries.go
index 736eff6..c05b343 100644
--- a/internal/graphrag/queries.go
+++ b/internal/graphrag/queries.go
@@ -1,18 +1,23 @@
 package graphrag
 
 import (
+	"context"
 	"math"
 	"sort"
 	"time"
 )
 
 // ErrorChain traces error spans upstream to find the root cause service.
-func (g *GraphRAG) ErrorChain(service string, since time.Time, limit int) []ErrorChainResult {
+// The tenant slice is selected via ctx — callers without a tenant ctx
+// collapse to storage.DefaultTenantID at the coordinator boundary.
+func (g *GraphRAG) ErrorChain(ctx context.Context, service string, since time.Time, limit int) []ErrorChainResult {
 	if limit <= 0 {
 		limit = 10
 	}
 
-	errorSpans := g.TraceStore.ErrorSpans(service, since)
+	stores := g.storesFor(ctx)
+
+	errorSpans := stores.traces.ErrorSpans(service, since)
 	if len(errorSpans) > limit {
 		errorSpans = errorSpans[:limit]
 	}
@@ -26,7 +31,7 @@ func (g *GraphRAG) ErrorChain(service string, since time.Time, limit int) []Erro
 		}
 		seen[span.TraceID] = true
 
-		chain := g.traceErrorChainUpstream(span)
+		chain := traceErrorChainUpstream(stores, span)
 		if len(chain) == 0 {
 			continue
 		}
@@ -46,7 +51,7 @@ func (g *GraphRAG) ErrorChain(service string, since time.Time, limit int) []Erro
 		// Gather correlated logs
 		for _, s := range chain {
 			if s.IsError {
-				clusters := g.SignalStore.LogClustersForService(s.Service)
+				clusters := stores.signals.LogClustersForService(s.Service)
 				for _, lc := range clusters {
 					if lc.LastSeen.After(since) {
 						result.CorrelatedLogs = append(result.CorrelatedLogs, *lc)
@@ -61,8 +66,9 @@ func (g *GraphRAG) ErrorChain(service string, since time.Time, limit int) []Erro
 	return results
 }
 
-// traceErrorChainUpstream walks CHILD_OF edges upstream from an error span to the root.
-func (g *GraphRAG) traceErrorChainUpstream(span *SpanNode) []SpanNode {
+// traceErrorChainUpstream walks CHILD_OF edges upstream from an error span to
+// the root within a single tenant's TraceStore.
+func traceErrorChainUpstream(stores *tenantStores, span *SpanNode) []SpanNode {
 	var chain []SpanNode
 	visited := make(map[string]bool)
 	current := span
@@ -74,7 +80,7 @@ func (g *GraphRAG) traceErrorChainUpstream(span *SpanNode) []SpanNode {
 		if current.ParentSpanID == "" {
 			break
 		}
-		parent, ok := g.TraceStore.GetSpan(current.ParentSpanID)
+		parent, ok := stores.traces.GetSpan(current.ParentSpanID)
 		if !ok {
 			break
 		}
@@ -85,11 +91,13 @@ func (g *GraphRAG) traceErrorChainUpstream(span *SpanNode) []SpanNode {
 }
 
 // ImpactAnalysis performs BFS downstream from a service to find affected services.
-func (g *GraphRAG) ImpactAnalysis(service string, maxDepth int) *ImpactResult {
+func (g *GraphRAG) ImpactAnalysis(ctx context.Context, service string, maxDepth int) *ImpactResult {
 	if maxDepth <= 0 {
 		maxDepth = 5
 	}
 
+	stores := g.storesFor(ctx)
+
 	result := &ImpactResult{Service: service}
 	visited := map[string]bool{service: true}
 
@@ -107,14 +115,14 @@ func (g *GraphRAG) ImpactAnalysis(service string, maxDepth int) *ImpactResult {
 			continue
 		}
 
-		edges := g.ServiceStore.CallEdgesFrom(item.svc)
+		edges := stores.service.CallEdgesFrom(item.svc)
 		for _, e := range edges {
 			if visited[e.ToID] {
 				continue
 			}
 			visited[e.ToID] = true
 
-			svc, _ := g.ServiceStore.GetService(e.ToID)
+			svc, _ := stores.service.GetService(e.ToID)
 			impact := 1.0
 			if svc != nil {
 				impact = 1.0 - svc.HealthScore
@@ -136,9 +144,10 @@ func (g *GraphRAG) ImpactAnalysis(service string, maxDepth int) *ImpactResult {
 }
 
 // RootCauseAnalysis combines ErrorChain with anomaly correlation to rank probable causes.
-func (g *GraphRAG) RootCauseAnalysis(service string, since time.Time) []RankedCause {
-	errorChains := g.ErrorChain(service, since, 20)
-	anomalies := g.AnomalyStore.AnomaliesForService(service, since)
+func (g *GraphRAG) RootCauseAnalysis(ctx context.Context, service string, since time.Time) []RankedCause {
+	stores := g.storesFor(ctx)
+	errorChains := g.ErrorChain(ctx, service, since, 20)
+	anomalies := stores.anomalies.AnomaliesForService(service, since)
 
 	// Score services by how often they appear as root cause
 	causeScores := make(map[string]*RankedCause)
@@ -197,8 +206,9 @@ func (g *GraphRAG) RootCauseAnalysis(service string, since time.Time) []RankedCa
 }
 
 // DependencyChain returns the full span tree for a trace.
-func (g *GraphRAG) DependencyChain(traceID string) []SpanNode {
-	spans := g.TraceStore.SpansForTrace(traceID)
+func (g *GraphRAG) DependencyChain(ctx context.Context, traceID string) []SpanNode {
+	stores := g.storesFor(ctx)
+	spans := stores.traces.SpansForTrace(traceID)
 	out := make([]SpanNode, len(spans))
 	for i, s := range spans {
 		out[i] = *s
@@ -219,11 +229,12 @@ type CorrelatedSignalsResult struct {
 	ErrorChains []ErrorChainResult `json:"error_chains,omitempty"`
 }
 
-func (g *GraphRAG) CorrelatedSignals(service string, since time.Time) *CorrelatedSignalsResult {
+func (g *GraphRAG) CorrelatedSignals(ctx context.Context, service string, since time.Time) *CorrelatedSignalsResult {
+	stores := g.storesFor(ctx)
 	result := &CorrelatedSignalsResult{Service: service}
 
 	// Error logs
-	clusters := g.SignalStore.LogClustersForService(service)
+	clusters := stores.signals.LogClustersForService(service)
 	for _, lc := range clusters {
 		if lc.LastSeen.After(since) {
 			result.ErrorLogs = append(result.ErrorLogs, *lc)
@@ -231,29 +242,30 @@ func (g *GraphRAG) CorrelatedSignals(service string, since time.Time) *Correlate
 	}
 
 	// Metrics
-	metrics := g.SignalStore.MetricsForService(service)
+	metrics := stores.signals.MetricsForService(service)
 	for _, m := range metrics {
 		result.Metrics = append(result.Metrics, *m)
 	}
 
 	// Anomalies
-	anomalies := g.AnomalyStore.AnomaliesForService(service, since)
+	anomalies := stores.anomalies.AnomaliesForService(service, since)
 	for _, a := range anomalies {
 		result.Anomalies = append(result.Anomalies, *a)
 	}
 
 	// Recent error chains
-	result.ErrorChains = g.ErrorChain(service, since, 5)
+	result.ErrorChains = g.ErrorChain(ctx, service, since, 5)
 
 	return result
 }
 
 // ShortestPath finds the shortest path between two services using Dijkstra.
-func (g *GraphRAG) ShortestPath(from, to string) []string {
+func (g *GraphRAG) ShortestPath(ctx context.Context, from, to string) []string {
+	stores := g.storesFor(ctx)
 	// Build adjacency from CALLS edges
-	g.ServiceStore.mu.RLock()
+	stores.service.mu.RLock()
 	adj := make(map[string]map[string]float64)
-	for _, e := range g.ServiceStore.Edges {
+	for _, e := range stores.service.Edges {
 		if e.Type != EdgeCalls {
 			continue
 		}
@@ -271,7 +283,7 @@ func (g *GraphRAG) ShortestPath(from, to string) []string {
 		}
 		adj[e.ToID][e.FromID] = weight
 	}
-	g.ServiceStore.mu.RUnlock()
+	stores.service.mu.RUnlock()
 
 	// Dijkstra
 	dist := map[string]float64{from: 0}
@@ -320,14 +332,29 @@ func (g *GraphRAG) ShortestPath(from, to string) []string {
 }
 
 // AnomalyTimeline returns recent anomalies sorted by time.
-func (g *GraphRAG) AnomalyTimeline(since time.Time) []*AnomalyNode {
-	anomalies := g.AnomalyStore.AnomaliesSince(since)
+func (g *GraphRAG) AnomalyTimeline(ctx context.Context, since time.Time) []*AnomalyNode {
+	stores := g.storesFor(ctx)
+	anomalies := stores.anomalies.AnomaliesSince(since)
 	sort.Slice(anomalies, func(i, j int) bool {
 		return anomalies[i].Timestamp.After(anomalies[j].Timestamp)
 	})
 	return anomalies
 }
 
+// AnomaliesForService is a tenant-aware read-through to the per-tenant
+// AnomalyStore, exported so handlers outside this package never need to
+// reach into the store maps directly.
+func (g *GraphRAG) AnomaliesForService(ctx context.Context, service string, since time.Time) []*AnomalyNode {
+	return g.storesFor(ctx).anomalies.AnomaliesForService(service, since)
+}
+
+// AllServiceEdges returns every edge in the caller's tenant's ServiceStore.
+// Kept as a narrow helper so API handlers do not need to traverse the
+// tenantStores composite themselves.
+func (g *GraphRAG) AllServiceEdges(ctx context.Context) []*Edge {
+	return g.storesFor(ctx).service.AllEdges()
+}
+
 // ServiceMap returns the service topology with health scores for the API.
 type ServiceMapEntry struct {
 	Service    *ServiceNode     `json:"service"`
@@ -336,25 +363,26 @@ type ServiceMapEntry struct {
 	CalledBy   []*Edge          `json:"called_by,omitempty"`
 }
 
-func (g *GraphRAG) ServiceMap(depth int) []ServiceMapEntry {
-	services := g.ServiceStore.AllServices()
+func (g *GraphRAG) ServiceMap(ctx context.Context, depth int) []ServiceMapEntry {
+	stores := g.storesFor(ctx)
+	services := stores.service.AllServices()
 	result := make([]ServiceMapEntry, 0, len(services))
 
 	for _, svc := range services {
 		entry := ServiceMapEntry{
 			Service:  svc,
-			CallsTo:  g.ServiceStore.CallEdgesFrom(svc.Name),
-			CalledBy: g.ServiceStore.CallEdgesTo(svc.Name),
+			CallsTo:  stores.service.CallEdgesFrom(svc.Name),
+			CalledBy: stores.service.CallEdgesTo(svc.Name),
 		}
 
 		// Get operations for this service
-		g.ServiceStore.mu.RLock()
-		for _, op := range g.ServiceStore.Operations {
+		stores.service.mu.RLock()
+		for _, op := range stores.service.Operations {
 			if op.Service == svc.Name {
 				entry.Operations = append(entry.Operations, op)
 			}
 		}
-		g.ServiceStore.mu.RUnlock()
+		stores.service.mu.RUnlock()
 
 		result = append(result, entry)
 	}
diff --git a/internal/graphrag/refresh.go b/internal/graphrag/refresh.go
index b354a7f..43a941e 100644
--- a/internal/graphrag/refresh.go
+++ b/internal/graphrag/refresh.go
@@ -4,15 +4,21 @@ import (
 	"context"
 	"log/slog"
 	"time"
+
+	"github.com/RandomCodeSpace/otelcontext/internal/storage"
 )
 
 // refreshLoop periodically rebuilds/merges from DB and prunes stale data.
+// Work is sharded per tenant: on each tick we snapshot the coordinator's
+// tenant map, then rebuild and prune each slice under its own lock. Tenants
+// are discovered from the spans table on first rebuild so historical data
+// from tenants that have not yet ingested via callbacks is still loaded.
 func (g *GraphRAG) refreshLoop(ctx context.Context) {
 	ticker := time.NewTicker(g.refreshEvery)
 	defer ticker.Stop()
 
-	// Initial rebuild on startup
-	g.rebuildFromDB()
+	// Initial rebuild on startup.
+	g.rebuildAllTenantsFromDB(ctx)
 
 	for {
 		select {
@@ -21,8 +27,11 @@ func (g *GraphRAG) refreshLoop(ctx context.Context) {
 		case <-g.stopCh:
 			return
 		case <-ticker.C:
-			g.rebuildFromDB()
-			pruned := g.TraceStore.Prune()
+			g.rebuildAllTenantsFromDB(ctx)
+			pruned := 0
+			for _, stores := range g.snapshotTenants() {
+				pruned += stores.traces.Prune()
+			}
 			if pruned > 0 {
 				slog.Debug("GraphRAG pruned expired traces/spans", "count", pruned)
 			}
@@ -51,7 +60,7 @@ func (g *GraphRAG) snapshotLoop(ctx context.Context) {
 		case <-g.stopCh:
 			return
 		case <-ticker.C:
-			g.takeSnapshot()
+			g.takeSnapshot(ctx)
 			g.pruneOldSnapshots()
 			g.persistDrainTemplates()
 		}
@@ -75,12 +84,53 @@ func (g *GraphRAG) persistDrainTemplates() {
 	slog.Debug("Drain templates persisted", "count", len(tpls))
 }
 
-// rebuildFromDB loads recent span data from the DB and merges into the graph.
-// This catches data from before callbacks started (e.g., restart recovery).
-func (g *GraphRAG) rebuildFromDB() {
+// rebuildAllTenantsFromDB rebuilds each known tenant's in-memory service
+// topology from the spans table. Tenants are the union of already-present
+// coordinator slices and the distinct tenant_id values observed in recent
+// spans — this catches historical tenants that have not yet ingested via
+// live callbacks since startup.
+func (g *GraphRAG) rebuildAllTenantsFromDB(ctx context.Context) {
+	if g.repo == nil || g.repo.DB() == nil {
+		return
+	}
+
 	since := time.Now().Add(-1 * time.Hour)
 
-	// Load recent spans
+	// Discover tenants that have recent spans. Missing tenant_id rows fall
+	// back to DefaultTenantID so pre-multi-tenant data still rebuilds.
+	var tenantIDs []string
+	if err := g.repo.DB().
+		Table("spans").
+		Where("start_time > ?", since).
+		Distinct("tenant_id").
+		Pluck("tenant_id", &tenantIDs).Error; err != nil {
+		slog.Error("GraphRAG: failed to enumerate tenants for rebuild", "error", err)
+		return
+	}
+
+	seen := make(map[string]bool, len(tenantIDs))
+	for _, t := range tenantIDs {
+		if t == "" {
+			t = storage.DefaultTenantID
+		}
+		seen[t] = true
+	}
+	// Always include tenants the coordinator already knows about so we refresh
+	// live-ingested tenants even when no DB rows yet carry their ID.
+	for t := range g.snapshotTenants() {
+		seen[t] = true
+	}
+
+	for tenant := range seen {
+		tctx := storage.WithTenantContext(ctx, tenant)
+		g.rebuildFromDBForTenant(tctx, tenant, since)
+	}
+}
+
+// rebuildFromDBForTenant loads recent span data for a single tenant and
+// merges it into that tenant's slice of the graph. Catches data from before
+// callbacks started (e.g., restart recovery).
+func (g *GraphRAG) rebuildFromDBForTenant(_ context.Context, tenant string, since time.Time) {
 	type spanRow struct {
 		SpanID        string
 		ParentSpanID  string
@@ -92,16 +142,18 @@ func (g *GraphRAG) rebuildFromDB() {
 		StartTime     time.Time
 	}
 
+	stores := g.storesForTenant(tenant)
+
 	var rows []spanRow
 	err := g.repo.DB().
 		Table("spans").
 		Select("span_id, parent_span_id, service_name, operation_name, duration, trace_id, status, start_time").
-		Where("start_time > ?", since).
+		Where("start_time > ? AND tenant_id = ?", since, tenant).
 		Order("start_time ASC").
 		Limit(50000).
 		Find(&rows).Error
 	if err != nil {
-		slog.Error("GraphRAG: failed to rebuild from DB", "error", err)
+		slog.Error("GraphRAG: failed to rebuild from DB", "tenant", tenant, "error", err)
 		return
 	}
 
@@ -109,7 +161,7 @@ func (g *GraphRAG) rebuildFromDB() {
 		return
 	}
 
-	// Build spanID → service map for edge resolution
+	// Build spanID → service map for edge resolution.
 	spanService := make(map[string]string, len(rows))
 	for _, r := range rows {
 		spanService[r.SpanID] = r.ServiceName
@@ -119,18 +171,22 @@ func (g *GraphRAG) rebuildFromDB() {
 		durationMs := float64(r.Duration) / 1000.0
 		isError := r.Status == "STATUS_CODE_ERROR"
 
-		g.ServiceStore.UpsertService(r.ServiceName, durationMs, isError, r.StartTime)
+		stores.service.UpsertService(r.ServiceName, durationMs, isError, r.StartTime)
 		if r.OperationName != "" {
-			g.ServiceStore.UpsertOperation(r.ServiceName, r.OperationName, durationMs, isError, r.StartTime)
+			stores.service.UpsertOperation(r.ServiceName, r.OperationName, durationMs, isError, r.StartTime)
 		}
 
-		// Cross-service edges
+		// Cross-service edges.
 		if r.ParentSpanID != "" {
 			if parentSvc, ok := spanService[r.ParentSpanID]; ok && parentSvc != r.ServiceName {
-				g.ServiceStore.UpsertCallEdge(parentSvc, r.ServiceName, durationMs, isError, r.StartTime)
+				stores.service.UpsertCallEdge(parentSvc, r.ServiceName, durationMs, isError, r.StartTime)
 			}
 		}
 	}
 
-	slog.Debug("GraphRAG rebuilt from DB", "spans", len(rows), "services", len(g.ServiceStore.Services))
+	slog.Debug("GraphRAG rebuilt from DB",
+		"tenant", tenant,
+		"spans", len(rows),
+		"services", len(stores.service.Services),
+	)
 }
diff --git a/internal/graphrag/snapshot.go b/internal/graphrag/snapshot.go
index c4e430b..3a77328 100644
--- a/internal/graphrag/snapshot.go
+++ b/internal/graphrag/snapshot.go
@@ -1,13 +1,22 @@
 package graphrag
 
 import (
+	"context"
 	"encoding/json"
 	"fmt"
 	"log/slog"
 	"time"
+
+	"github.com/RandomCodeSpace/otelcontext/internal/storage"
 )
 
 // GraphSnapshot is a periodic snapshot of the service topology persisted to DB.
+//
+// Tenant identity is not yet a column on this model — Subtask B (RAN-38) adds
+// `tenant_id` to the schema along with the persistence-layer filtering. For
+// now, snapshots are written one row per tenant per tick and remain queryable
+// only across the whole table; callers must not treat pre-Subtask-B rows as
+// tenant-scoped.
 type GraphSnapshot struct {
 	ID             string          `gorm:"primaryKey;size:64" json:"id"`
 	CreatedAt      time.Time       `json:"created_at"`
@@ -43,10 +52,19 @@ type snapshotEdge struct {
 	ErrorRate float64 `json:"error_rate"`
 }
 
-// takeSnapshot captures the current service topology and persists it.
-func (g *GraphRAG) takeSnapshot() {
-	services := g.ServiceStore.AllServices()
-	edges := g.ServiceStore.AllEdges()
+// takeSnapshot captures each tenant's current service topology and persists
+// one row per tenant per tick. See the note on GraphSnapshot regarding the
+// upcoming tenant_id column in Subtask B.
+func (g *GraphRAG) takeSnapshot(ctx context.Context) {
+	for tenant, stores := range g.snapshotTenants() {
+		tctx := storage.WithTenantContext(ctx, tenant)
+		g.takeSnapshotForTenant(tctx, tenant, stores)
+	}
+}
+
+func (g *GraphRAG) takeSnapshotForTenant(_ context.Context, tenant string, stores *tenantStores) {
+	services := stores.service.AllServices()
+	edges := stores.service.AllEdges()
 
 	if len(services) == 0 {
 		return
@@ -69,9 +87,9 @@ func (g *GraphRAG) takeSnapshot() {
 		totalHealth += svc.HealthScore
 	}
 
-	// Also include operations
-	g.ServiceStore.mu.RLock()
-	for _, op := range g.ServiceStore.Operations {
+	// Also include operations for this tenant.
+	stores.service.mu.RLock()
+	for _, op := range stores.service.Operations {
 		nodes = append(nodes, snapshotNode{
 			ID:          op.ID,
 			Type:        "operation",
@@ -81,7 +99,7 @@ func (g *GraphRAG) takeSnapshot() {
 			AvgLatency:  op.AvgLatency,
 		})
 	}
-	g.ServiceStore.mu.RUnlock()
+	stores.service.mu.RUnlock()
 
 	var snapEdges []snapshotEdge
 	for _, e := range edges {
@@ -99,7 +117,7 @@ func (g *GraphRAG) takeSnapshot() {
 	edgesJSON, _ := json.Marshal(snapEdges)
 
 	snap := GraphSnapshot{
-		ID:             fmt.Sprintf("snap_%d", time.Now().UnixNano()),
+		ID:             fmt.Sprintf("snap_%s_%d", tenant, time.Now().UnixNano()),
 		CreatedAt:      time.Now(),
 		Nodes:          nodesJSON,
 		Edges:          edgesJSON,
@@ -108,16 +126,26 @@ func (g *GraphRAG) takeSnapshot() {
 		AvgHealthScore: totalHealth / float64(len(services)),
 	}
 
+	if g.repo == nil || g.repo.DB() == nil {
+		return
+	}
 	if err := g.repo.DB().Create(&snap).Error; err != nil {
-		slog.Error("Failed to persist graph snapshot", "error", err)
+		slog.Error("Failed to persist graph snapshot", "tenant", tenant, "error", err)
 		return
 	}
 
-	slog.Debug("Graph snapshot persisted", "services", len(services), "edges", len(snapEdges))
+	slog.Debug("Graph snapshot persisted",
+		"tenant", tenant,
+		"services", len(services),
+		"edges", len(snapEdges),
+	)
 }
 
 // pruneOldSnapshots removes snapshots older than 7 days.
 func (g *GraphRAG) pruneOldSnapshots() {
+	if g.repo == nil || g.repo.DB() == nil {
+		return
+	}
 	cutoff := time.Now().AddDate(0, 0, -7)
 	result := g.repo.DB().Where("created_at < ?", cutoff).Delete(&GraphSnapshot{})
 	if result.Error != nil {
@@ -128,6 +156,7 @@ func (g *GraphRAG) pruneOldSnapshots() {
 }
 
 // GetGraphSnapshot retrieves the snapshot closest to the requested time.
+// TODO(RAN-38, Subtask B): scope by tenant once tenant_id lands on the table.
 func (g *GraphRAG) GetGraphSnapshot(at time.Time) (*GraphSnapshot, error) {
 	var snap GraphSnapshot
 	err := g.repo.DB().
diff --git a/internal/graphrag/store.go b/internal/graphrag/store.go
index 599fdac..8d2caa3 100644
--- a/internal/graphrag/store.go
+++ b/internal/graphrag/store.go
@@ -6,6 +6,27 @@ import (
 	"time"
 )
 
+// tenantStores bundles one tenant's slice of the four layered in-memory
+// stores. Every mutation and query lands in exactly one composite, keyed by
+// tenant ID in the coordinator (see storesFor / storesForTenant in builder.go).
+// Storage is lazily created on first reference — empty-tenant contexts coerce
+// to storage.DefaultTenantID at the lookup boundary.
+type tenantStores struct {
+	service   *ServiceStore
+	traces    *TraceStore
+	signals   *SignalStore
+	anomalies *AnomalyStore
+}
+
+func newTenantStores(traceTTL time.Duration) *tenantStores {
+	return &tenantStores{
+		service:   newServiceStore(),
+		traces:    newTraceStore(traceTTL),
+		signals:   newSignalStore(),
+		anomalies: newAnomalyStore(),
+	}
+}
+
 // ServiceStore holds permanent service topology data.
 type ServiceStore struct {
 	mu         sync.RWMutex
diff --git a/internal/graphrag/tenant_isolation_test.go b/internal/graphrag/tenant_isolation_test.go
new file mode 100644
index 0000000..c6f1725
--- /dev/null
+++ b/internal/graphrag/tenant_isolation_test.go
@@ -0,0 +1,84 @@
+package graphrag
+
+import (
+	"context"
+	"testing"
+	"time"
+
+	"github.com/RandomCodeSpace/otelcontext/internal/storage"
+)
+
+// TestGraphRAG_TenantIsolation_InMemoryStores ingests overlapping span data
+// for two distinct tenants and asserts each tenant's query surface sees
+// only its own services and traces. This is the core invariant of RAN-37:
+// no in-memory state may leak across tenants.
+func TestGraphRAG_TenantIsolation_InMemoryStores(t *testing.T) {
+	g := newTestGraphRAG(t)
+
+	now := time.Now()
+	mk := func(tenant, service, traceID, spanID string) storage.Span {
+		return storage.Span{
+			TenantID:      tenant,
+			TraceID:       traceID,
+			SpanID:        spanID,
+			OperationName: "/op",
+			ServiceName:   service,
+			Status:        "STATUS_CODE_OK",
+			StartTime:     now,
+			EndTime:       now.Add(time.Millisecond),
+			Duration:      1000,
+		}
+	}
+
+	// Tenant A: service "orders" under trace t-a-1.
+	g.OnSpanIngested(mk("tenant-a", "orders", "t-a-1", "s-a-1"))
+	// Tenant B: service "payments" under trace t-b-1 — different service so we
+	// can prove A never sees B's service and vice versa.
+	g.OnSpanIngested(mk("tenant-b", "payments", "t-b-1", "s-b-1"))
+
+	ctxA := storage.WithTenantContext(context.Background(), "tenant-a")
+	ctxB := storage.WithTenantContext(context.Background(), "tenant-b")
+
+	// Poll briefly for the async event workers to finish both tenants.
+	deadline := time.Now().Add(2 * time.Second)
+	for time.Now().Before(deadline) {
+		mapA := g.ServiceMap(ctxA, 0)
+		mapB := g.ServiceMap(ctxB, 0)
+		if len(mapA) >= 1 && len(mapB) >= 1 {
+			break
+		}
+		time.Sleep(20 * time.Millisecond)
+	}
+
+	mapA := g.ServiceMap(ctxA, 0)
+	mapB := g.ServiceMap(ctxB, 0)
+
+	if len(mapA) != 1 || mapA[0].Service.Name != "orders" {
+		t.Fatalf("tenant-a ServiceMap = %v, want only [orders]", mapA)
+	}
+	if len(mapB) != 1 || mapB[0].Service.Name != "payments" {
+		t.Fatalf("tenant-b ServiceMap = %v, want only [payments]", mapB)
+	}
+
+	// Dependency chain lookups must also stay inside the caller's tenant.
+	if chain := g.DependencyChain(ctxA, "t-b-1"); len(chain) != 0 {
+		t.Fatalf("tenant-a saw tenant-b trace t-b-1: %+v", chain)
+	}
+	if chain := g.DependencyChain(ctxB, "t-a-1"); len(chain) != 0 {
+		t.Fatalf("tenant-b saw tenant-a trace t-a-1: %+v", chain)
+	}
+}
+
+// TestGraphRAG_StoresFor_EmptyContextCollapsesToDefault confirms that a
+// query without any tenant context lands on the DefaultTenantID slice —
+// matching storage.TenantFromContext semantics so single-tenant installs
+// behave as before.
+func TestGraphRAG_StoresFor_EmptyContextCollapsesToDefault(t *testing.T) {
+	g := newTestGraphRAG(t)
+
+	viaEmpty := g.storesFor(context.Background())
+	viaDefault := g.storesForTenant(storage.DefaultTenantID)
+	if viaEmpty != viaDefault {
+		t.Fatalf("empty-context lookup returned a different slice than DefaultTenantID")
+	}
+}
diff --git a/internal/mcp/tools.go b/internal/mcp/tools.go
index c988d72..91df67b 100644
--- a/internal/mcp/tools.go
+++ b/internal/mcp/tools.go
@@ -304,21 +304,21 @@ func (s *Server) toolHandler(ctx context.Context, name string, args map[string]a
 	case "get_storage_status":
 		return s.toolGetStorageStatus()
 	case "find_similar_logs":
-		return s.toolFindSimilarLogs(args)
+		return s.toolFindSimilarLogs(ctx, args)
 	case "get_alerts":
 		return s.toolGetAlerts()
 	case "get_service_map":
-		return s.toolGetServiceMap(args)
+		return s.toolGetServiceMap(ctx, args)
 	case "get_error_chains":
-		return s.toolGetErrorChains(args)
+		return s.toolGetErrorChains(ctx, args)
 	case "trace_graph":
 		return s.toolTraceGraph(ctx, args)
 	case "impact_analysis":
-		return s.toolImpactAnalysis(args)
+		return s.toolImpactAnalysis(ctx, args)
 	case "root_cause_analysis":
-		return s.toolRootCauseAnalysis(args)
+		return s.toolRootCauseAnalysis(ctx, args)
 	case "correlated_signals":
-		return s.toolCorrelatedSignals(args)
+		return s.toolCorrelatedSignals(ctx, args)
 	case "get_investigations":
 		return s.toolGetInvestigations(args)
 	case "get_investigation":
@@ -326,7 +326,7 @@ func (s *Server) toolHandler(ctx context.Context, name string, args map[string]a
 	case "get_graph_snapshot":
 		return s.toolGetGraphSnapshot(args)
 	case "get_anomaly_timeline":
-		return s.toolGetAnomalyTimeline(args)
+		return s.toolGetAnomalyTimeline(ctx, args)
 	default:
 		return errorResult(fmt.Sprintf("unknown tool: %s", name))
 	}
@@ -575,7 +575,10 @@ func (s *Server) toolGetStorageStatus() ToolCallResult {
 	return textResult(string(data))
 }
 
-func (s *Server) toolFindSimilarLogs(args map[string]any) ToolCallResult {
+// toolFindSimilarLogs returns logs semantically similar to the query text
+// scoped to the tenant resolved from the MCP transport (X-Tenant-ID header or
+// the server's default tenant). Cross-tenant rows are never returned.
+func (s *Server) toolFindSimilarLogs(ctx context.Context, args map[string]any) ToolCallResult {
 	query, _ := args["query"].(string)
 	if query == "" {
 		return errorResult("query is required")
@@ -587,7 +590,8 @@ func (s *Server) toolFindSimilarLogs(args map[string]any) ToolCallResult {
 	if s.vectorIdx == nil {
 		return errorResult("vector index not yet initialized")
 	}
-	results := s.vectorIdx.Search(query, limit)
+	tenant := storage.TenantFromContext(mcpCtx(ctx))
+	results := s.vectorIdx.Search(tenant, query, limit)
 	data, err := json.MarshalIndent(results, "", "  ")
 	if err != nil {
 		return errorResult(fmt.Sprintf("failed to marshal similar logs: %v", err))
@@ -629,12 +633,12 @@ func (s *Server) toolGetAlerts() ToolCallResult {
 
 // --- GraphRAG Tool implementations ---
 
-func (s *Server) toolGetServiceMap(args map[string]any) ToolCallResult {
+func (s *Server) toolGetServiceMap(ctx context.Context, args map[string]any) ToolCallResult {
 	if s.graphRAG == nil {
 		return errorResult("GraphRAG not initialized")
 	}
 	depth := argInt(args, "depth", 3)
-	result := s.graphRAG.ServiceMap(depth)
+	result := s.graphRAG.ServiceMap(mcpCtx(ctx), depth)
 	data, err := json.MarshalIndent(result, "", "  ")
 	if err != nil {
 		return errorResult(fmt.Sprintf("failed to marshal service map: %v", err))
@@ -642,7 +646,7 @@ func (s *Server) toolGetServiceMap(args map[string]any) ToolCallResult {
 	return textResult(string(data))
 }
 
-func (s *Server) toolGetErrorChains(args map[string]any) ToolCallResult {
+func (s *Server) toolGetErrorChains(ctx context.Context, args map[string]any) ToolCallResult {
 	if s.graphRAG == nil {
 		return errorResult("GraphRAG not initialized")
 	}
@@ -654,7 +658,7 @@ func (s *Server) toolGetErrorChains(args map[string]any) ToolCallResult {
 	parseTimeRange(args, "time_range", &since)
 	limit := argInt(args, "limit", 10)
 
-	chains := s.graphRAG.ErrorChain(svcName, since, limit)
+	chains := s.graphRAG.ErrorChain(mcpCtx(ctx), svcName, since, limit)
 	data, err := json.MarshalIndent(chains, "", "  ")
 	if err != nil {
 		return errorResult(fmt.Sprintf("failed to marshal error chains: %v", err))
@@ -670,7 +674,7 @@ func (s *Server) toolTraceGraph(ctx context.Context, args map[string]any) ToolCa
 	if traceID == "" {
 		return errorResult("trace_id is required")
 	}
-	spans := s.graphRAG.DependencyChain(traceID)
+	spans := s.graphRAG.DependencyChain(mcpCtx(ctx), traceID)
 	if len(spans) == 0 {
 		// Fallback to DB
 		trace, err := s.repo.GetTrace(mcpCtx(ctx), traceID)
@@ -690,7 +694,7 @@ func (s *Server) toolTraceGraph(ctx context.Context, args map[string]any) ToolCa
 	return textResult(string(data))
 }
 
-func (s *Server) toolImpactAnalysis(args map[string]any) ToolCallResult {
+func (s *Server) toolImpactAnalysis(ctx context.Context, args map[string]any) ToolCallResult {
 	if s.graphRAG == nil {
 		return errorResult("GraphRAG not initialized")
 	}
@@ -699,7 +703,7 @@ func (s *Server) toolImpactAnalysis(args map[string]any) ToolCallResult {
 		return errorResult("service is required")
 	}
 	depth := argInt(args, "depth", 5)
-	result := s.graphRAG.ImpactAnalysis(svcName, depth)
+	result := s.graphRAG.ImpactAnalysis(mcpCtx(ctx), svcName, depth)
 	data, err := json.MarshalIndent(result, "", "  ")
 	if err != nil {
 		return errorResult(fmt.Sprintf("failed to marshal impact analysis: %v", err))
@@ -707,7 +711,7 @@ func (s *Server) toolImpactAnalysis(args map[string]any) ToolCallResult {
 	return textResult(string(data))
 }
 
-func (s *Server) toolRootCauseAnalysis(args map[string]any) ToolCallResult {
+func (s *Server) toolRootCauseAnalysis(ctx context.Context, args map[string]any) ToolCallResult {
 	if s.graphRAG == nil {
 		return errorResult("GraphRAG not initialized")
 	}
@@ -718,7 +722,7 @@ func (s *Server) toolRootCauseAnalysis(args map[string]any) ToolCallResult {
 	since := time.Now().Add(-15 * time.Minute)
 	parseTimeRange(args, "time_range", &since)
 
-	causes := s.graphRAG.RootCauseAnalysis(svcName, since)
+	causes := s.graphRAG.RootCauseAnalysis(mcpCtx(ctx), svcName, since)
 	data, err := json.MarshalIndent(causes, "", "  ")
 	if err != nil {
 		return errorResult(fmt.Sprintf("failed to marshal root cause analysis: %v", err))
@@ -726,7 +730,7 @@ func (s *Server) toolRootCauseAnalysis(args map[string]any) ToolCallResult {
 	return textResult(string(data))
 }
 
-func (s *Server) toolCorrelatedSignals(args map[string]any) ToolCallResult {
+func (s *Server) toolCorrelatedSignals(ctx context.Context, args map[string]any) ToolCallResult {
 	if s.graphRAG == nil {
 		return errorResult("GraphRAG not initialized")
 	}
@@ -737,7 +741,7 @@ func (s *Server) toolCorrelatedSignals(args map[string]any) ToolCallResult {
 	since := time.Now().Add(-1 * time.Hour)
 	parseTimeRange(args, "time_range", &since)
 
-	result := s.graphRAG.CorrelatedSignals(svcName, since)
+	result := s.graphRAG.CorrelatedSignals(mcpCtx(ctx), svcName, since)
 	data, err := json.MarshalIndent(result, "", "  ")
 	if err != nil {
 		return errorResult(fmt.Sprintf("failed to marshal correlated signals: %v", err))
@@ -804,7 +808,7 @@ func (s *Server) toolGetGraphSnapshot(args map[string]any) ToolCallResult {
 	return textResult(string(data))
 }
 
-func (s *Server) toolGetAnomalyTimeline(args map[string]any) ToolCallResult {
+func (s *Server) toolGetAnomalyTimeline(ctx context.Context, args map[string]any) ToolCallResult {
 	if s.graphRAG == nil {
 		return errorResult("GraphRAG not initialized")
 	}
@@ -814,9 +818,9 @@ func (s *Server) toolGetAnomalyTimeline(args map[string]any) ToolCallResult {
 
 	var anomalies []*graphrag.AnomalyNode
 	if service != "" {
-		anomalies = s.graphRAG.AnomalyStore.AnomaliesForService(service, since)
+		anomalies = s.graphRAG.AnomaliesForService(mcpCtx(ctx), service, since)
 	} else {
-		anomalies = s.graphRAG.AnomalyTimeline(since)
+		anomalies = s.graphRAG.AnomalyTimeline(mcpCtx(ctx), since)
 	}
 	data, err := json.MarshalIndent(anomalies, "", "  ")
 	if err != nil {

From 901b6828979614f82f86b5b2c5de1d997a7aecec Mon Sep 17 00:00:00 2001
From: Amit Kumar <ak.nitrr13@gmail.com>
Date: Sat, 25 Apr 2026 15:11:18 +0000
Subject: [PATCH 2/2] fix(graphrag,mcp): drop premature 3-arg vectordb.Search
 call (RAN-37 CI)

The PR-27 build failed CI because internal/graphrag/clustering.go and
internal/mcp/tools.go both called the 3-arg vectordb.Index.Search(tenant,
query, k) signature, but that signature lives on RAN-20's vectordb
tenant-isolation work and is not yet on main. The 3-arg form leaked in
from a stacked branch during the original RAN-37 cut.

Restore the 2-arg Search(query, k) call in both sites and leave a
TODO(RAN-20) so the proper vector-side tenant scoping lands with the
RAN-20 follow-up. RAN-37's in-memory tenant invariants are unaffected:
SimilarErrors still narrows results by the per-tenant SignalStore's
EMITTED_BY edges, so cross-tenant hits cannot surface even while the
underlying vector index is shared.

go build / vet / test ./... and -race on graphrag + api all green.

Co-Authored-By: Paperclip <noreply@paperclip.ing>
---
 internal/graphrag/clustering.go | 15 ++++++++-------
 internal/mcp/tools.go           |  7 +++++--
 2 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/internal/graphrag/clustering.go b/internal/graphrag/clustering.go
index ad43778..d1735bb 100644
--- a/internal/graphrag/clustering.go
+++ b/internal/graphrag/clustering.go
@@ -9,8 +9,6 @@ import (
 	"context"
 	"fmt"
 	"time"
-
-	"github.com/RandomCodeSpace/otelcontext/internal/storage"
 )
 
 // clusterLog runs the log body through Drain and upserts a LogClusterNode
@@ -80,11 +78,14 @@ func (g *GraphRAG) SimilarErrors(ctx context.Context, clusterID string, k int) [
 	if query == "" && len(cluster.TemplateTokens) > 0 {
 		query = joinTokens(cluster.TemplateTokens)
 	}
-	// vectordb.Index.Search takes the tenant string directly; we resolve it
-	// from ctx via the same storage helper used by storesFor so both sides
-	// agree on coercion rules (empty → DefaultTenantID).
-	tenant := storage.TenantFromContext(ctx)
-	results := g.vectorIdx.Search(tenant, query, k*2) // over-fetch to filter
+	// TODO(RAN-20): vectordb.Index.Search itself is not yet tenant-scoped on
+	// `main`, so we call the 2-arg signature here. Tenant isolation for the
+	// SignalStore lookup above is already enforced via storesFor(ctx); the
+	// vector hits are then narrowed by the EmittedBy edges in this tenant's
+	// SignalStore on lines below, so cross-tenant hits cannot surface even
+	// while the underlying vector index is shared.
+	_ = ctx
+	results := g.vectorIdx.Search(query, k*2) // over-fetch to filter
 
 	// Map results back to log clusters.
 	seen := map[string]bool{clusterID: true}
diff --git a/internal/mcp/tools.go b/internal/mcp/tools.go
index 91df67b..03a0ffc 100644
--- a/internal/mcp/tools.go
+++ b/internal/mcp/tools.go
@@ -590,8 +590,11 @@ func (s *Server) toolFindSimilarLogs(ctx context.Context, args map[string]any) T
 	if s.vectorIdx == nil {
 		return errorResult("vector index not yet initialized")
 	}
-	tenant := storage.TenantFromContext(mcpCtx(ctx))
-	results := s.vectorIdx.Search(tenant, query, limit)
+	// TODO(RAN-20): vectordb.Index.Search itself is not yet tenant-scoped on
+	// `main`. The tenant filter for log similarity will move into the
+	// vector index in the RAN-20 follow-up; until then this call is shared.
+	_ = ctx
+	results := s.vectorIdx.Search(query, limit)
 	data, err := json.MarshalIndent(results, "", "  ")
 	if err != nil {
 		return errorResult(fmt.Sprintf("failed to marshal similar logs: %v", err))