JLRansom · JLRansom · Mar 19, 2026 · Mar 19, 2026 · Mar 19, 2026
diff --git a/__tests__/unit/health-baselines.test.ts b/__tests__/unit/health-baselines.test.ts
diff --git a/__tests__/unit/health-cache.test.ts b/__tests__/unit/health-cache.test.ts
@@ -3,16 +3,65 @@
  *
  * These tests use a real DB (isolated per test file via setup.ts) to verify
  * that computeAndCacheHealthMetrics correctly reads from DB and caches results.
+ *
+ * The second describe block ("with ROLE_BASELINES_ENABLED") covers the
+ * role-relative baseline integration path, seeding both agent records and
+ * task-run history to exercise the full normalization pipeline.
  */
-import { describe, it, expect, beforeEach } from 'vitest'
+import { describe, it, expect, beforeEach, afterEach } from 'vitest'
 import {
   getCachedHealthMetrics,
   computeAndCacheHealthMetrics,
   invalidateHealthCache,
 } from '@/lib/health-cache'
 import { dbAddTaskRun } from '@/lib/db/repositories/taskRunRepo'
+import { dbAddAgent } from '@/lib/db/repositories/agentRepo'
+import { invalidateRoleBaselines } from '@/lib/health-baselines'
 import { makeTestTaskRun } from '../helpers/test-utils'
 import { randomUUID } from 'crypto'
+import type { Agent } from '@/lib/types'
+
+// ---------------------------------------------------------------------------
+// Helpers
+// ---------------------------------------------------------------------------
+
+/** Creates a minimal Agent record and persists it to the DB. */
+function seedAgent(agentId: string, type: Agent['type'] = 'coder'): Agent {
+  const agent: Agent = {
+    id: agentId,
+    type,
+    prompt: '',
+    status: 'idle',
+    events: [],
+    createdAt: Date.now(),
+  }
+  dbAddAgent(agent)
+  return agent
+}
+
+/**
+ * Seeds `count` task runs for the given agentId inside the 7-day rolling
+ * window so hasEnoughData will be true.
+ */
+function seedRunsForAgent(
+  agentId: string,
+  role: Agent['type'],
+  count: number,
+  status: 'done' | 'failed' = 'done',
+): void {
+  const now = Date.now()
+  for (let i = 0; i < count; i++) {
+    dbAddTaskRun(
+      makeTestTaskRun(randomUUID(), {
+        agentId,
+        role,
+        status,
+        completedAt: now - i * 60_000,
+        startedAt: now - i * 60_000 - 30_000,
+      }),
+    )
+  }
+}
 
 beforeEach(() => {
   // Clear cache between tests to avoid cross-test contamination
@@ -110,3 +159,82 @@ describe('invalidateHealthCache', () => {
     expect(() => invalidateHealthCache('ghost-agent')).not.toThrow()
   })
 })
+
+// ---------------------------------------------------------------------------
+// Role-baseline integration (ROLE_BASELINES_ENABLED=true)
+// ---------------------------------------------------------------------------
+
+describe('computeAndCacheHealthMetrics with ROLE_BASELINES_ENABLED', () => {
+  beforeEach(() => {
+    invalidateHealthCache()
+    invalidateRoleBaselines()
+    process.env.ROLE_BASELINES_ENABLED = 'true'
+  })
+
+  afterEach(() => {
+    delete process.env.ROLE_BASELINES_ENABLED
+    invalidateRoleBaselines()
+  })
+
+  it('returns raw metrics (no normalisation) when feature flag is off', () => {
+    delete process.env.ROLE_BASELINES_ENABLED // ensure flag is off
+    const agentId = randomUUID()
+    seedAgent(agentId, 'coder')
+    seedRunsForAgent(agentId, 'coder', 6)
+
+    const metrics = computeAndCacheHealthMetrics(agentId)
+    // Without baseline, completionRate should be the raw value (1.0 = all done)
+    expect(metrics.completionRate).toBe(1)
+    expect(metrics.errorDensity).toBe(0)
+  })
+
+  it('applies role-relative normalisation when feature flag is enabled', () => {
+    // Set up: role median completionRate = 0.5 (mixed cohort)
+    // Build a cohort of 3 agents for the 'researcher' role with 50% completion
+    const cohortAgents = [randomUUID(), randomUUID(), randomUUID()]
+    for (const id of cohortAgents) {
+      seedAgent(id, 'researcher')
+      // 3 done + 3 failed = 50% completion rate per agent
+      seedRunsForAgent(id, 'researcher', 3, 'done')
+      seedRunsForAgent(id, 'researcher', 3, 'failed')
+    }
+
+    // Now test an agent that has 100% completion rate
+    const agentId = randomUUID()
+    seedAgent(agentId, 'researcher')
+    seedRunsForAgent(agentId, 'researcher', 6, 'done')
+
+    // Invalidate baseline cache so it recomputes from the seeded cohort
+    invalidateRoleBaselines()
+
+    const metrics = computeAndCacheHealthMetrics(agentId)
+
+    // The agent performs above role norm:
+    //   raw completionRate = 1.0, role median ≈ 0.5
+    //   normalised = min(1.0 / 0.5, 1) = 1.0 (capped)
+    expect(metrics.hasEnoughData).toBe(true)
+    expect(metrics.completionRate).toBe(1)
+  })
+
+  it('degrades to raw metrics when agent is not found in DB', () => {
+    // agentId with no DB record — getRoleBaseline should not be called
+    const ghostAgentId = randomUUID()
+    // Seed runs directly without creating the agent record
+    seedRunsForAgent(ghostAgentId, 'coder', 6)
+
+    const metrics = computeAndCacheHealthMetrics(ghostAgentId)
+    // Should succeed without throwing; returns raw metrics
+    expect(metrics).toBeDefined()
+    expect(metrics.hasEnoughData).toBe(true)
+  })
+
+  it('stores normalised metrics in cache and returns the same value on hit', () => {
+    const agentId = randomUUID()
+    seedAgent(agentId, 'coder')
+    seedRunsForAgent(agentId, 'coder', 6)
+
+    const computed = computeAndCacheHealthMetrics(agentId)
+    const cached = getCachedHealthMetrics(agentId, 60_000)
+    expect(cached).toEqual(computed)
+  })
+})
diff --git a/context/current-sprint.md b/context/current-sprint.md
@@ -1,12 +1,13 @@
 # Current Sprint
 
-> Last updated: 2026-03-17
+> Last updated: 2026-03-18
 
 ## Active Worktrees
 
-*(none — all branches merged, worktrees cleaned up)*
+- `feat/role-aware-health-baselines` — PR #29 open, awaiting review (depends on #26, #27)
 
 ## In Progress
+- [ ] Role-aware health baselines — PR #29 open (awaiting #26 + #27 as prerequisites)
 - [ ] Authentication & role-based access (protect dashboard + API routes)
 - [ ] Agent log search & filtering
 

diff --git a/context/decisions.md b/context/decisions.md
@@ -2,6 +2,14 @@
 
 > Append new entries at the top. Keep each entry ≤ 10 lines.
 
+## ADR-040 — Role-aware health baselines (2026-03-18)
+**Decision:** Add `lib/health-baselines.ts` — a dedicated module that computes and caches 30-day median baselines (completionRate, errorDensity, weeklyThroughput) per agent role. `applyRoleBaseline()` in `health.ts` normalises raw sub-metrics relative to the role's median before badge thresholds are applied, so a tester at 72% completion looks healthy if testers average 70% but alarming if they average 92%.
+**Architecture:** `BaselineNorms` interface defined in `health.ts` (not `health-baselines.ts`) to break the potential circular import. `health-baselines.ts` satisfies it structurally. 5-minute TTL cache; `MIN_COHORT_SIZE = 3` guard falls back to flat thresholds when cohort is too thin. Feature-flagged via `ROLE_BASELINES_ENABLED` env var (default off) for safe A/B rollout.
+**Degradation contract:** Cohort < 3 → flat thresholds. Agent < 5 runs → skip (hasEnoughData unchanged). Baseline metric null or zero → skip that sub-metric. DB error → return stale cache or null-metrics. New role → cohort = 0 → flat thresholds until data accumulates (≈2–3 weeks).
+**New role warning:** Adding a new AgentType requires a baseline recalibration period; documented in `docs/agent-types.md`.
+**Affects:** `lib/health-baselines.ts` (new), `lib/health.ts` (+`applyRoleBaseline`, `BaselineNorms`, `MIN_COHORT_SIZE_FOR_BASELINE`), `lib/health-cache.ts` (+role lookup, feature flag), `lib/db/repositories/taskRunRepo.ts` (+`dbGetTaskRunsByRole`), `__tests__/unit/health-baselines.test.ts` (new), `__tests__/unit/health-cache.test.ts` (extended), `docs/agent-types.md`.
+**PR:** #28 (`feat/role-aware-health-baselines`) — depends on #26, #27.
+
 ## ADR-037 — Meetings toggle in HistoryList + global meetings API (2026-03-14)
 **Decision:** Added `mode` state (`'runs' | 'meetings'`) to `HistoryList`. Meetings view shows a filterable table (date, topic, project, status, agents, tokens, cost) via a new global `GET /api/meetings` endpoint. Endpoint enriches data from `dbGetAllMeetings()` + per-meeting message aggregations (agentCount, totalTokens, totalCostUsd). Supports `from`/`to` epoch-ms and `status` query params.
 **Why:** History tab covers all runs; logical to extend it to meetings. Global endpoint needed because /history has no projectId.

diff --git a/docs/agent-types.md b/docs/agent-types.md
@@ -74,3 +74,31 @@ tester      → 'testing'
 ```
 
 Cards are automatically moved to the matching column when an agent of that role starts.
+
+## Health Baselines and New Role Addition
+
+When `ROLE_BASELINES_ENABLED=true`, each agent's health sub-metrics are
+normalised against the median performance of its role cohort (see
+`lib/health-baselines.ts` and ADR-040).
+
+**Adding a new AgentType requires a baseline recalibration period.**
+
+When a sixth (or later) role is added to the `AgentType` union:
+
+1. At deploy time, the new role has zero historical runs → `cohortSize = 0`
+   → `MIN_COHORT_SIZE` guard fires → health scores fall back to flat thresholds
+   automatically. No crash, no code change needed.
+2. After approximately **2–3 weeks** of production traffic, once the cohort
+   accumulates ≥ 3 agents each with ≥ 5 runs per week, the baseline will
+   self-populate on the next 5-minute cache refresh.
+3. During the recalibration window, health badges for the new role display raw
+   absolute values (same as before `ROLE_BASELINES_ENABLED` was set).
+
+**Action required when adding a new role:**
+- Update `AgentType` in `lib/types.ts`
+- Update `ROLE_COLORS` / `ROLE_HEX` in `lib/constants.ts`
+- Update pipeline logic in `lib/services/agentService.ts` if it joins the pipeline
+- Note the recalibration period in the PR description so the team knows to
+  monitor badge accuracy for the new role for 2–3 weeks post-deploy
+- Call `invalidateRoleBaselines()` in any migration script that back-fills
+  historical runs for the new role (so the cache doesn't serve stale data)
diff --git a/lib/db/repositories/taskRunRepo.ts b/lib/db/repositories/taskRunRepo.ts
@@ -178,3 +178,30 @@ export function dbGetRecentTaskRunsByAgent(agentId: string, since: number): Task
     .all()
     .map(rowToTaskRun)
 }
+
+/**
+ * Returns all task runs for agents of the given role within a trailing time
+ * window, ordered by completedAt DESC.
+ *
+ * Used by `lib/health-baselines.ts` to compute per-role aggregate baselines.
+ * Fetching all runs for the role in one query is more efficient than issuing N
+ * per-agent queries and merging in JS.
+ *
+ * @param role     - The agent role string (e.g. 'coder', 'researcher').
+ * @param windowMs - Look-back window in ms (default: 30 days).
+ * @param now      - Injectable epoch ms for deterministic tests.
+ */
+export function dbGetTaskRunsByRole(
+  role: string,
+  windowMs: number = 30 * 24 * 60 * 60 * 1_000,
+  now: number = Date.now(),
+): TaskRun[] {
+  const since = now - windowMs
+  return db
+    .select()
+    .from(taskRuns)
+    .where(and(eq(taskRuns.role, role), gte(taskRuns.completedAt, since)))
+    .orderBy(desc(taskRuns.completedAt))
+    .all()
+    .map(rowToTaskRun)
+}