From 0c72a5edd1f29faa0bb73ffcc2b194db636ddb63 Mon Sep 17 00:00:00 2001 From: Amit Kumar Date: Thu, 23 Apr 2026 19:26:53 +0000 Subject: [PATCH 1/3] chore: post-robustness follow-ups - .gitignore: exclude build artifacts (/bin, /loadsim, /otelcontext) - graphrag: canonicalize investigation cooldown key (lower + trim) so "Orders" / "orders " / "ORDERS" share a bucket; otherwise trivial casing differences would bypass the cooldown guard - graphrag: consolidate time.Now() calls in PersistInvestigation to a single now := time.Now() at the top - graphrag/refresh.go: spell out the cooldown-prune cutoff's dependency on RefreshEvery to keep future tuning honest - storage: delete dead verifyP99Index helper and math import in metrics_p99_test.go Co-Authored-By: Claude Opus 4.7 (1M context) --- .gitignore | 6 ++++- internal/graphrag/investigation.go | 25 ++++++++++++++----- .../graphrag/investigation_cooldown_test.go | 17 +++++++++++++ internal/graphrag/refresh.go | 8 ++++-- internal/storage/metrics_p99_test.go | 13 ---------- 5 files changed, 47 insertions(+), 22 deletions(-) diff --git a/.gitignore b/.gitignore index 7336a95..3c690b7 100644 --- a/.gitignore +++ b/.gitignore @@ -126,4 +126,8 @@ go.work.sum *.stderr *scheduled_tasks* tmp/ -data/ \ No newline at end of file +data/ + +/bin/ +/loadsim +/otelcontext \ No newline at end of file diff --git a/internal/graphrag/investigation.go b/internal/graphrag/investigation.go index b4f9d9a..28765f9 100644 --- a/internal/graphrag/investigation.go +++ b/internal/graphrag/investigation.go @@ -4,6 +4,7 @@ import ( "encoding/json" "fmt" "log/slog" + "strings" "sync" "time" @@ -39,6 +40,16 @@ func (c *investigationCooldown) allow(key string, now time.Time) bool { return true } +// cooldownKey builds a case- and whitespace-insensitive key from the tuple +// (trigger_service, root_service, root_operation). Service names emitted +// from different instrumentations occasionally differ in casing or have +// trailing whitespace; canonicalizing here prevents those variants from +// bypassing the cooldown guard. +func cooldownKey(triggerService, rootService, rootOperation string) string { + norm := func(s string) string { return strings.ToLower(strings.TrimSpace(s)) } + return norm(triggerService) + "|" + norm(rootService) + "|" + norm(rootOperation) +} + // prune drops entries older than cutoff to bound map size. Called from // the refresh tick. func (c *investigationCooldown) prune(cutoff time.Time) { @@ -91,19 +102,21 @@ func (g *GraphRAG) PersistInvestigation(triggerService string, chains []ErrorCha return } + now := time.Now() + // Cooldown: suppress repeat investigations for the same // (trigger_service, root_service, root_operation) inside a sliding window. - // Without this guard, a stuck service produces one insert every anomaly - // tick (default 10s) indefinitely. - key := triggerService + "|" + firstChain.RootCause.Service + "|" + firstChain.RootCause.Operation - if g.invCooldown != nil && !g.invCooldown.allow(key, time.Now()) { + // Keys are canonicalized (lower + trim) so "Orders" and "orders " share a + // bucket — otherwise trivial casing differences would bypass the guard. + key := cooldownKey(triggerService, firstChain.RootCause.Service, firstChain.RootCause.Operation) + if g.invCooldown != nil && !g.invCooldown.allow(key, now) { return } // Increment BEFORE db.Create so the counter reflects "cooldown allowed; // persist attempted". See InvestigationInsertCount's doc comment. g.invInserts.Add(1) - id := fmt.Sprintf("inv_%d", time.Now().UnixNano()) + id := fmt.Sprintf("inv_%d", now.UnixNano()) severity := "warning" if len(anomalies) > 0 { @@ -160,7 +173,7 @@ func (g *GraphRAG) PersistInvestigation(triggerService string, chains []ErrorCha inv := Investigation{ ID: id, - CreatedAt: time.Now(), + CreatedAt: now, Status: "detected", Severity: severity, TriggerService: triggerService, diff --git a/internal/graphrag/investigation_cooldown_test.go b/internal/graphrag/investigation_cooldown_test.go index 2d90933..8c8e292 100644 --- a/internal/graphrag/investigation_cooldown_test.go +++ b/internal/graphrag/investigation_cooldown_test.go @@ -44,3 +44,20 @@ func TestPersistInvestigation_Cooldown(t *testing.T) { t.Fatalf("distinct service should bypass cooldown; got %d, want > %d", third, second) } } + +// TestCooldownKey_Canonical verifies the key normalizes case and trims +// whitespace so "Orders" / "orders " / "ORDERS" land in the same bucket. +func TestCooldownKey_Canonical(t *testing.T) { + cases := [][3]string{ + {"orders", "orders", "op"}, + {"Orders", "ORDERS", "op"}, + {" orders ", "orders", " op "}, + {"ORDERS", "Orders ", "OP"}, + } + want := cooldownKey(cases[0][0], cases[0][1], cases[0][2]) + for _, c := range cases[1:] { + if got := cooldownKey(c[0], c[1], c[2]); got != want { + t.Errorf("cooldownKey%v = %q, want %q", c, got, want) + } + } +} diff --git a/internal/graphrag/refresh.go b/internal/graphrag/refresh.go index 66a3cf8..b354a7f 100644 --- a/internal/graphrag/refresh.go +++ b/internal/graphrag/refresh.go @@ -27,8 +27,12 @@ func (g *GraphRAG) refreshLoop(ctx context.Context) { slog.Debug("GraphRAG pruned expired traces/spans", "count", pruned) } g.pruneOldAnomalies() - // Bound the investigation cooldown map. 2× window keeps - // entries through the active suppression plus a grace period. + // Bound the investigation cooldown map. The 10m cutoff is 2× + // the cooldown window (5m) — it retains entries through the + // active suppression plus a grace period. This assumes the + // refresh tick runs at least every 10 minutes; if RefreshEvery + // grows larger, raise the cutoff in lockstep, otherwise a stuck + // service could bypass the cooldown between prunes. if g.invCooldown != nil { g.invCooldown.prune(time.Now().Add(-10 * time.Minute)) } diff --git a/internal/storage/metrics_p99_test.go b/internal/storage/metrics_p99_test.go index eb4c515..3898695 100644 --- a/internal/storage/metrics_p99_test.go +++ b/internal/storage/metrics_p99_test.go @@ -2,7 +2,6 @@ package storage import ( "context" - "math" "testing" "time" @@ -222,18 +221,6 @@ func p99Itoa(n int) string { return string(buf[pos:]) } -// verifyP99Index is the reference formula used in assertions. -func verifyP99Index(n int) int { - idx := int(math.Ceil(float64(n)*0.99)) - 1 - if idx < 0 { - idx = 0 - } - if idx >= n { - idx = n - 1 - } - return idx -} - // --------------------------------------------------------------------------- // Critical 2: verify MySQL branch preserves tenant filter // --------------------------------------------------------------------------- From df021c84d07215cec49452a5ec50cd2a8df7866d Mon Sep 17 00:00:00 2001 From: Amit Kumar Date: Fri, 24 Apr 2026 15:17:00 +0000 Subject: [PATCH 2/3] checkpoint: pre-yolo 2026-04-24T15:17:00 From 4dda1f47f1b406e2cff48d0c98c2c73523d2ac96 Mon Sep 17 00:00:00 2001 From: Amit Kumar Date: Fri, 24 Apr 2026 15:18:23 +0000 Subject: [PATCH 3/3] ci: replace deleted central-ops reusable workflow with local pipeline The reusable workflow at RandomCodeSpace/central-ops/.github/workflows /reusable-pipeline.yml@main returns 404 (repo deleted), which has been silently failing CI for multiple commits. Runs the same gates locally: go build, go vet, go test -race, and a build-tag compile-check for the loadsim simulator. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/ci.yml | 36 +++++++++++++++++++++++++----------- 1 file changed, 25 insertions(+), 11 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 59c72be..3c45635 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -8,16 +8,30 @@ on: workflow_dispatch: permissions: - contents: write - security-events: write - id-token: write - actions: read + contents: read jobs: - pipeline: - uses: RandomCodeSpace/central-ops/.github/workflows/reusable-pipeline.yml@main - secrets: inherit - with: - go_version_file: go.mod - cgo_enabled: "1" - go_test_flags: "-race -timeout 120s" \ No newline at end of file + build-and-test: + name: build · vet · test + runs-on: ubuntu-latest + env: + CGO_ENABLED: "1" + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-go@v5 + with: + go-version-file: go.mod + cache: true + + - name: go build + run: go build ./... + + - name: go vet + run: go vet ./... + + - name: go test (race) + run: go test -race -timeout 180s ./... + + - name: loadsim build tag compiles + run: go build -tags loadtest ./test/loadsim/...