diff --git a/__tests__/unit/health-baselines.test.ts b/__tests__/unit/health-baselines.test.ts new file mode 100644 index 0000000..9a2466f --- /dev/null +++ b/__tests__/unit/health-baselines.test.ts @@ -0,0 +1,509 @@ +/** + * Unit tests for lib/health-baselines.ts and the applyRoleBaseline() function + * exported from lib/health.ts. + * + * Tests are grouped into four areas: + * 1. medianOf() helper + * 2. computeRoleBaseline() — computation logic and degradation ladder + * 3. getRoleBaseline() — cache TTL and invalidation + * 4. applyRoleBaseline() — normalisation rules and guards + * + * Uses a real isolated SQLite DB (via setup.ts) so that computeRoleBaseline + * exercises the full DB→compute path rather than mocking internals. + */ +import { describe, it, expect, beforeEach, vi, afterEach } from 'vitest' +import { randomUUID } from 'crypto' +import { dbAddTaskRun } from '@/lib/db/repositories/taskRunRepo' +import { makeTestTaskRun } from '../helpers/test-utils' +import { + computeRoleBaseline, + getRoleBaseline, + invalidateRoleBaselines, + medianOf, + MIN_COHORT_SIZE, + BASELINE_TTL_MS, + BASELINE_WINDOW_MS, +} from '@/lib/health-baselines' +import { + applyRoleBaseline, + MIN_COHORT_SIZE_FOR_BASELINE, +} from '@/lib/health' +import type { AgentHealthMetrics } from '@/lib/types' + +// --------------------------------------------------------------------------- +// Fixed reference time +// --------------------------------------------------------------------------- + +/** Wednesday 2026-03-11 noon UTC — same anchor used in health.test.ts */ +const NOW = new Date('2026-03-11T12:00:00Z').getTime() + +// --------------------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------------------- + +/** + * Seeds `count` task runs for the given agentId and role inside the 7-day + * rolling window that `computeHealthMetrics` uses, spaced 30 minutes apart. 
+ * + * Runs are placed within the last 24 hours so they safely fall inside the + * rolling window regardless of which NOW anchor is used. All `count` runs + * will therefore contribute to `hasEnoughData` and the per-agent metrics + * that `computeRoleBaseline` collects. + */ +function seedRunsForAgent( + agentId: string, + role: string, + count: number, + status: 'done' | 'failed' = 'done', + now: number = NOW, +): void { + for (let i = 0; i < count; i++) { + // Space 30 min apart, most recent first, all within last 24 h + const completedAt = now - (i + 1) * 30 * 60_000 + dbAddTaskRun( + makeTestTaskRun(randomUUID(), { + agentId, + role, + status, + completedAt, + startedAt: completedAt - 30_000, + }), + ) + } +} + +// --------------------------------------------------------------------------- +// 1. medianOf() +// --------------------------------------------------------------------------- + +describe('medianOf', () => { + it('returns null for an empty array', () => { + expect(medianOf([])).toBeNull() + }) + + it('returns the single value for a one-element array', () => { + expect(medianOf([42])).toBe(42) + }) + + it('returns the middle value for an odd-length array', () => { + expect(medianOf([3, 1, 4, 1, 5])).toBe(3) // sorted: [1,1,3,4,5] + }) + + it('returns the average of the two middle values for an even-length array', () => { + expect(medianOf([1, 3, 5, 7])).toBe(4) // (3+5)/2 + }) + + it('does not mutate the input array', () => { + const input = [5, 2, 8, 1] + medianOf(input) + expect(input).toEqual([5, 2, 8, 1]) + }) +}) + +// --------------------------------------------------------------------------- +// 2. 
computeRoleBaseline() +// --------------------------------------------------------------------------- + +describe('computeRoleBaseline', () => { + beforeEach(() => { + invalidateRoleBaselines() + }) + + it('returns null-metric baseline with cohortSize=0 when no runs exist for the role', () => { + const result = computeRoleBaseline('tester', NOW) + expect(result.cohortSize).toBe(0) + expect(result.medianCompletionRate).toBeNull() + expect(result.medianErrorDensity).toBeNull() + expect(result.medianWeeklyThroughput).toBeNull() + expect(result.role).toBe('tester') + expect(result.computedAt).toBe(NOW) + }) + + it('returns null-metric baseline when cohort has fewer than MIN_COHORT_SIZE qualifying agents', () => { + // Seed 2 agents with enough runs — below the MIN_COHORT_SIZE of 3 + for (let i = 0; i < MIN_COHORT_SIZE - 1; i++) { + seedRunsForAgent(randomUUID(), 'writer', 6, 'done', NOW) + } + const result = computeRoleBaseline('writer', NOW) + expect(result.cohortSize).toBeLessThan(MIN_COHORT_SIZE) + expect(result.medianCompletionRate).toBeNull() + expect(result.medianErrorDensity).toBeNull() + }) + + it('returns meaningful baselines when cohort meets MIN_COHORT_SIZE', () => { + // Seed 3 agents, each with 6 done runs → completion rate = 1, error density = 0 + for (let i = 0; i < MIN_COHORT_SIZE; i++) { + seedRunsForAgent(randomUUID(), 'researcher', 6, 'done', NOW) + } + const result = computeRoleBaseline('researcher', NOW) + expect(result.cohortSize).toBe(MIN_COHORT_SIZE) + expect(result.medianCompletionRate).toBe(1) + expect(result.medianErrorDensity).toBe(0) + expect(result.medianWeeklyThroughput).not.toBeNull() + expect(result.medianWeeklyThroughput!).toBeGreaterThan(0) + }) + + it('only counts agents with >= MIN_RUNS_THRESHOLD runs as qualifying', () => { + // 2 agents with enough runs + 1 with too few — only 2 qualify, below cohort min + seedRunsForAgent(randomUUID(), 'coder', 6, 'done', NOW) + seedRunsForAgent(randomUUID(), 'coder', 6, 'done', NOW) + 
+ seedRunsForAgent(randomUUID(), 'coder', 2, 'done', NOW) // sparse — not qualifying + const result = computeRoleBaseline('coder', NOW) + expect(result.cohortSize).toBe(2) // only 2 qualify + expect(result.medianCompletionRate).toBeNull() // below MIN_COHORT_SIZE + }) + + it('excludes runs outside the 30-day window when determining qualifying agents', () => { + const agentId = randomUUID() + // Seed 6 runs ~66 days ago (36 days before the 30-day window opens) — outside BASELINE_WINDOW_MS + const ancient = NOW - BASELINE_WINDOW_MS - 36 * 24 * 3600_000 + for (let i = 0; i < 6; i++) { + dbAddTaskRun( + makeTestTaskRun(randomUUID(), { + agentId, + role: 'senior-coder', + status: 'done', + completedAt: ancient + i * 60_000, + startedAt: ancient + i * 60_000 - 30_000, + }), + ) + } + // No other agents → cohort = 0 + const result = computeRoleBaseline('senior-coder', NOW) + expect(result.cohortSize).toBe(0) + }) + + it('computes correct median completionRate across a mixed cohort', () => { + // 3 agents: rates ~1.0, ~0.5, ~0.0 → median = 0.5 + const agents = [randomUUID(), randomUUID(), randomUUID()] + // Agent 0: all done → rate 1.0 + seedRunsForAgent(agents[0], 'tester', 6, 'done', NOW) + // Agent 1: half done, half failed → rate 0.5 + seedRunsForAgent(agents[1], 'tester', 3, 'done', NOW) + seedRunsForAgent(agents[1], 'tester', 3, 'failed', NOW) + // Agent 2: all failed → rate 0.0 + seedRunsForAgent(agents[2], 'tester', 6, 'failed', NOW) + + const result = computeRoleBaseline('tester', NOW) + expect(result.cohortSize).toBe(3) + expect(result.medianCompletionRate).toBeCloseTo(0.5, 5) + expect(result.medianErrorDensity).toBeCloseTo(0.5, 5) + }) + + it('returns cohortSize correctly even when below threshold', () => { + // senior-coder is only seeded in the "excludes runs outside window" test above, + // and those runs are outside BASELINE_WINDOW_MS — so they produce 0 qualifying agents. + // Adding 1 agent with inside-window runs here means cohortSize = 1 (below MIN_COHORT_SIZE). 
+ seedRunsForAgent(randomUUID(), 'senior-coder', 6, 'done', NOW) + const result = computeRoleBaseline('senior-coder', NOW) + expect(result.cohortSize).toBe(1) + }) +}) + +// --------------------------------------------------------------------------- +// 3. getRoleBaseline() — cache behaviour +// --------------------------------------------------------------------------- + +describe('getRoleBaseline', () => { + beforeEach(() => { + invalidateRoleBaselines() + vi.useFakeTimers() + vi.setSystemTime(NOW) + }) + + afterEach(() => { + vi.useRealTimers() + }) + + it('returns a baseline without throwing even for an empty role cohort', () => { + const result = getRoleBaseline('researcher') + expect(result).toBeDefined() + expect(result.role).toBe('researcher') + }) + + it('returns cached baseline within TTL without recomputing', () => { + // Seed enough data so a real baseline exists + for (let i = 0; i < MIN_COHORT_SIZE; i++) { + seedRunsForAgent(randomUUID(), 'coder', 6, 'done', NOW) + } + + const first = getRoleBaseline('coder') + // Advance time but stay within TTL + vi.advanceTimersByTime(BASELINE_TTL_MS - 1_000) + const second = getRoleBaseline('coder') + // Same object reference means the cache was hit + expect(second).toBe(first) + }) + + it('recomputes baseline after TTL expires', () => { + for (let i = 0; i < MIN_COHORT_SIZE; i++) { + seedRunsForAgent(randomUUID(), 'coder', 6, 'done', NOW) + } + + const first = getRoleBaseline('coder') + // Advance past TTL + vi.advanceTimersByTime(BASELINE_TTL_MS + 1_000) + const second = getRoleBaseline('coder') + // Should be a fresh object (different reference) + expect(second).not.toBe(first) + }) + + it('invalidateRoleBaselines() for a specific role forces recompute', () => { + for (let i = 0; i < MIN_COHORT_SIZE; i++) { + seedRunsForAgent(randomUUID(), 'writer', 6, 'done', NOW) + } + + const first = getRoleBaseline('writer') + invalidateRoleBaselines('writer') + const second = getRoleBaseline('writer') + 
+ expect(second).not.toBe(first) + }) + + it('invalidateRoleBaselines() without argument clears all roles', () => { + for (let i = 0; i < MIN_COHORT_SIZE; i++) { + seedRunsForAgent(randomUUID(), 'coder', 6, 'done', NOW) + seedRunsForAgent(randomUUID(), 'tester', 6, 'done', NOW) + } + const c1 = getRoleBaseline('coder') + const t1 = getRoleBaseline('tester') + + invalidateRoleBaselines() + + const c2 = getRoleBaseline('coder') + const t2 = getRoleBaseline('tester') + + expect(c2).not.toBe(c1) + expect(t2).not.toBe(t1) + }) + + it('does not throw when called for an unknown/new agent type', () => { + // Force TypeScript to accept an arbitrary string via cast (simulates future role addition) + expect(() => getRoleBaseline('pilot' as Parameters<typeof getRoleBaseline>[0])).not.toThrow() + }) +}) + +// --------------------------------------------------------------------------- +// 4. applyRoleBaseline() — normalisation rules and guards +// --------------------------------------------------------------------------- + +/** Helper: build a fully-populated AgentHealthMetrics with hasEnoughData=true */ +function makeMetrics(overrides: Partial<AgentHealthMetrics> = {}): AgentHealthMetrics { + return { + completionRate: 0.8, + throughputTrend: 1.5, + errorDensity: 0.2, + idleSeconds: 3600, + hasEnoughData: true, + ...overrides, + } +} + +describe('applyRoleBaseline', () => { + // ── Guard conditions ──────────────────────────────────────────────────────── + + it('returns raw metrics unchanged when baseline is null', () => { + const raw = makeMetrics() + expect(applyRoleBaseline(raw, null)).toEqual(raw) + }) + + it('returns raw metrics unchanged when cohortSize < MIN_COHORT_SIZE_FOR_BASELINE', () => { + const raw = makeMetrics() + const baseline = { + medianCompletionRate: 0.9, + medianErrorDensity: 0.1, + cohortSize: MIN_COHORT_SIZE_FOR_BASELINE - 1, + } + expect(applyRoleBaseline(raw, baseline)).toEqual(raw) + }) + + it('returns raw metrics unchanged when hasEnoughData is false', () => { + const raw = makeMetrics({ 
hasEnoughData: false, completionRate: null, errorDensity: null }) + const baseline = { + medianCompletionRate: 0.9, + medianErrorDensity: 0.1, + cohortSize: MIN_COHORT_SIZE_FOR_BASELINE, + } + expect(applyRoleBaseline(raw, baseline)).toEqual(raw) + }) + + // ── Normalisation — completionRate ────────────────────────────────────────── + + it('normalises completionRate: raw / median, capped at 1', () => { + const raw = makeMetrics({ completionRate: 0.72 }) + const baseline = { + medianCompletionRate: 0.70, + medianErrorDensity: 0.30, + cohortSize: MIN_COHORT_SIZE_FOR_BASELINE, + } + const result = applyRoleBaseline(raw, baseline) + // 0.72 / 0.70 = 1.028... → capped at 1.0 + expect(result.completionRate).toBe(1) + }) + + it('normalises completionRate below 1 when agent underperforms role', () => { + const raw = makeMetrics({ completionRate: 0.72 }) + const baseline = { + medianCompletionRate: 0.92, + medianErrorDensity: 0.08, + cohortSize: MIN_COHORT_SIZE_FOR_BASELINE, + } + const result = applyRoleBaseline(raw, baseline) + // 0.72 / 0.92 ≈ 0.782 + expect(result.completionRate).toBeCloseTo(0.72 / 0.92, 10) + expect(result.completionRate!).toBeLessThan(1) + }) + + it('skips completionRate normalisation when raw completionRate is null', () => { + const raw = makeMetrics({ completionRate: null }) + const baseline = { + medianCompletionRate: 0.9, + medianErrorDensity: 0.1, + cohortSize: MIN_COHORT_SIZE_FOR_BASELINE, + } + const result = applyRoleBaseline(raw, baseline) + expect(result.completionRate).toBeNull() + }) + + it('skips completionRate normalisation when baseline median is null', () => { + const raw = makeMetrics({ completionRate: 0.8 }) + const baseline = { + medianCompletionRate: null, + medianErrorDensity: 0.1, + cohortSize: MIN_COHORT_SIZE_FOR_BASELINE, + } + const result = applyRoleBaseline(raw, baseline) + expect(result.completionRate).toBe(0.8) // unchanged + }) + + it('skips completionRate normalisation when baseline median is zero (division guard)', () 
=> { + const raw = makeMetrics({ completionRate: 0.8 }) + const baseline = { + medianCompletionRate: 0, // would cause division by zero + medianErrorDensity: 0.1, + cohortSize: MIN_COHORT_SIZE_FOR_BASELINE, + } + const result = applyRoleBaseline(raw, baseline) + expect(result.completionRate).toBe(0.8) // unchanged + }) + + // ── Normalisation — errorDensity ──────────────────────────────────────────── + + it('normalises errorDensity: raw / median (higher ratio = worse than norm)', () => { + const raw = makeMetrics({ errorDensity: 0.3 }) + const baseline = { + medianCompletionRate: 0.7, + medianErrorDensity: 0.3, + cohortSize: MIN_COHORT_SIZE_FOR_BASELINE, + } + const result = applyRoleBaseline(raw, baseline) + // 0.3 / 0.3 = 1.0 — exactly at role norm + expect(result.errorDensity).toBeCloseTo(1.0, 10) + }) + + it('normalises errorDensity below 1 when agent has fewer errors than role norm', () => { + const raw = makeMetrics({ errorDensity: 0.1 }) + const baseline = { + medianCompletionRate: 0.7, + medianErrorDensity: 0.3, + cohortSize: MIN_COHORT_SIZE_FOR_BASELINE, + } + const result = applyRoleBaseline(raw, baseline) + // 0.1 / 0.3 ≈ 0.333 + expect(result.errorDensity).toBeCloseTo(0.1 / 0.3, 10) + }) + + it('normalises errorDensity above 1 when agent has more errors than role norm', () => { + const raw = makeMetrics({ errorDensity: 0.6 }) + const baseline = { + medianCompletionRate: 0.4, + medianErrorDensity: 0.3, + cohortSize: MIN_COHORT_SIZE_FOR_BASELINE, + } + const result = applyRoleBaseline(raw, baseline) + // 0.6 / 0.3 = 2.0 — twice the role error rate + expect(result.errorDensity).toBeCloseTo(2.0, 10) + }) + + it('skips errorDensity normalisation when raw errorDensity is null', () => { + const raw = makeMetrics({ errorDensity: null }) + const baseline = { + medianCompletionRate: 0.9, + medianErrorDensity: 0.1, + cohortSize: MIN_COHORT_SIZE_FOR_BASELINE, + } + const result = applyRoleBaseline(raw, baseline) + expect(result.errorDensity).toBeNull() + }) + + 
it('skips errorDensity normalisation when baseline median is null', () => { + const raw = makeMetrics({ errorDensity: 0.2 }) + const baseline = { + medianCompletionRate: 0.8, + medianErrorDensity: null, + cohortSize: MIN_COHORT_SIZE_FOR_BASELINE, + } + const result = applyRoleBaseline(raw, baseline) + expect(result.errorDensity).toBe(0.2) // unchanged + }) + + it('skips errorDensity normalisation when baseline median is zero (division guard)', () => { + const raw = makeMetrics({ errorDensity: 0.2 }) + const baseline = { + medianCompletionRate: 0.8, + medianErrorDensity: 0, // would cause division by zero + cohortSize: MIN_COHORT_SIZE_FOR_BASELINE, + } + const result = applyRoleBaseline(raw, baseline) + expect(result.errorDensity).toBe(0.2) // unchanged + }) + + // ── Unchanged fields ───────────────────────────────────────────────────────── + + it('leaves throughputTrend unchanged (slope value, not role-dependent)', () => { + const raw = makeMetrics({ throughputTrend: -2.5 }) + const baseline = { + medianCompletionRate: 0.9, + medianErrorDensity: 0.1, + cohortSize: MIN_COHORT_SIZE_FOR_BASELINE, + } + const result = applyRoleBaseline(raw, baseline) + expect(result.throughputTrend).toBe(-2.5) + }) + + it('leaves idleSeconds unchanged (absolute wall-clock metric)', () => { + const raw = makeMetrics({ idleSeconds: 86400 }) + const baseline = { + medianCompletionRate: 0.9, + medianErrorDensity: 0.1, + cohortSize: MIN_COHORT_SIZE_FOR_BASELINE, + } + const result = applyRoleBaseline(raw, baseline) + expect(result.idleSeconds).toBe(86400) + }) + + it('leaves hasEnoughData unchanged', () => { + const raw = makeMetrics({ hasEnoughData: true }) + const baseline = { + medianCompletionRate: 0.9, + medianErrorDensity: 0.1, + cohortSize: MIN_COHORT_SIZE_FOR_BASELINE, + } + const result = applyRoleBaseline(raw, baseline) + expect(result.hasEnoughData).toBe(true) + }) + + // ── New role / unknown type graceful degradation ───────────────────────────── + + it('degrades to flat 
thresholds for a brand-new role with zero baseline data', () => { + // Simulate a freshly added 6th role with no run history + const raw = makeMetrics({ completionRate: 0.6, errorDensity: 0.4 }) + const emptyBaseline = { + medianCompletionRate: null, + medianErrorDensity: null, + cohortSize: 0, // zero — below MIN_COHORT_SIZE_FOR_BASELINE + } + const result = applyRoleBaseline(raw, emptyBaseline) + // Should pass through unchanged — flat thresholds apply + expect(result).toEqual(raw) + }) +}) diff --git a/__tests__/unit/health-cache.test.ts b/__tests__/unit/health-cache.test.ts index 601189a..6da31ae 100644 --- a/__tests__/unit/health-cache.test.ts +++ b/__tests__/unit/health-cache.test.ts @@ -3,16 +3,65 @@ * * These tests use a real DB (isolated per test file via setup.ts) to verify * that computeAndCacheHealthMetrics correctly reads from DB and caches results. + * + * The second describe block ("with ROLE_BASELINES_ENABLED") covers the + * role-relative baseline integration path, seeding both agent records and + * task-run history to exercise the full normalization pipeline. */ -import { describe, it, expect, beforeEach } from 'vitest' +import { describe, it, expect, beforeEach, afterEach } from 'vitest' import { getCachedHealthMetrics, computeAndCacheHealthMetrics, invalidateHealthCache, } from '@/lib/health-cache' import { dbAddTaskRun } from '@/lib/db/repositories/taskRunRepo' +import { dbAddAgent } from '@/lib/db/repositories/agentRepo' +import { invalidateRoleBaselines } from '@/lib/health-baselines' import { makeTestTaskRun } from '../helpers/test-utils' import { randomUUID } from 'crypto' +import type { Agent } from '@/lib/types' + +// --------------------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------------------- + +/** Creates a minimal Agent record and persists it to the DB. 
*/ +function seedAgent(agentId: string, type: Agent['type'] = 'coder'): Agent { + const agent: Agent = { + id: agentId, + type, + prompt: '', + status: 'idle', + events: [], + createdAt: Date.now(), + } + dbAddAgent(agent) + return agent +} + +/** + * Seeds `count` task runs for the given agentId inside the 7-day rolling + * window so hasEnoughData will be true. + */ +function seedRunsForAgent( + agentId: string, + role: Agent['type'], + count: number, + status: 'done' | 'failed' = 'done', +): void { + const now = Date.now() + for (let i = 0; i < count; i++) { + dbAddTaskRun( + makeTestTaskRun(randomUUID(), { + agentId, + role, + status, + completedAt: now - i * 60_000, + startedAt: now - i * 60_000 - 30_000, + }), + ) + } +} beforeEach(() => { // Clear cache between tests to avoid cross-test contamination @@ -110,3 +159,82 @@ describe('invalidateHealthCache', () => { expect(() => invalidateHealthCache('ghost-agent')).not.toThrow() }) }) + +// --------------------------------------------------------------------------- +// Role-baseline integration (ROLE_BASELINES_ENABLED=true) +// --------------------------------------------------------------------------- + +describe('computeAndCacheHealthMetrics with ROLE_BASELINES_ENABLED', () => { + beforeEach(() => { + invalidateHealthCache() + invalidateRoleBaselines() + process.env.ROLE_BASELINES_ENABLED = 'true' + }) + + afterEach(() => { + delete process.env.ROLE_BASELINES_ENABLED + invalidateRoleBaselines() + }) + + it('returns raw metrics (no normalisation) when feature flag is off', () => { + delete process.env.ROLE_BASELINES_ENABLED // ensure flag is off + const agentId = randomUUID() + seedAgent(agentId, 'coder') + seedRunsForAgent(agentId, 'coder', 6) + + const metrics = computeAndCacheHealthMetrics(agentId) + // Without baseline, completionRate should be the raw value (1.0 = all done) + expect(metrics.completionRate).toBe(1) + expect(metrics.errorDensity).toBe(0) + }) + + it('applies role-relative normalisation when 
feature flag is enabled', () => { + // Set up: role median completionRate = 0.5 (mixed cohort) + // Build a cohort of 3 agents for the 'researcher' role with 50% completion + const cohortAgents = [randomUUID(), randomUUID(), randomUUID()] + for (const id of cohortAgents) { + seedAgent(id, 'researcher') + // 3 done + 3 failed = 50% completion rate per agent + seedRunsForAgent(id, 'researcher', 3, 'done') + seedRunsForAgent(id, 'researcher', 3, 'failed') + } + + // Now test an agent that has 100% completion rate + const agentId = randomUUID() + seedAgent(agentId, 'researcher') + seedRunsForAgent(agentId, 'researcher', 6, 'done') + + // Invalidate baseline cache so it recomputes from the seeded cohort + invalidateRoleBaselines() + + const metrics = computeAndCacheHealthMetrics(agentId) + + // The agent performs above role norm: + // raw completionRate = 1.0, role median ≈ 0.5 + // normalised = min(1.0 / 0.5, 1) = 1.0 (capped) + expect(metrics.hasEnoughData).toBe(true) + expect(metrics.completionRate).toBe(1) + }) + + it('degrades to raw metrics when agent is not found in DB', () => { + // agentId with no DB record — getRoleBaseline should not be called + const ghostAgentId = randomUUID() + // Seed runs directly without creating the agent record + seedRunsForAgent(ghostAgentId, 'coder', 6) + + const metrics = computeAndCacheHealthMetrics(ghostAgentId) + // Should succeed without throwing; returns raw metrics + expect(metrics).toBeDefined() + expect(metrics.hasEnoughData).toBe(true) + }) + + it('stores normalised metrics in cache and returns the same value on hit', () => { + const agentId = randomUUID() + seedAgent(agentId, 'coder') + seedRunsForAgent(agentId, 'coder', 6) + + const computed = computeAndCacheHealthMetrics(agentId) + const cached = getCachedHealthMetrics(agentId, 60_000) + expect(cached).toEqual(computed) + }) +}) diff --git a/context/current-sprint.md b/context/current-sprint.md index 1075775..18ffd31 100644 --- a/context/current-sprint.md +++ 
b/context/current-sprint.md @@ -1,12 +1,13 @@ # Current Sprint -> Last updated: 2026-03-17 +> Last updated: 2026-03-18 ## Active Worktrees -*(none — all branches merged, worktrees cleaned up)* +- `feat/role-aware-health-baselines` — PR #29 open, awaiting review (depends on #26, #27) ## In Progress +- [ ] Role-aware health baselines — PR #29 open (awaiting #26 + #27 as prerequisites) - [ ] Authentication & role-based access (protect dashboard + API routes) - [ ] Agent log search & filtering diff --git a/context/decisions.md b/context/decisions.md index 5ac2abe..9070d72 100644 --- a/context/decisions.md +++ b/context/decisions.md @@ -2,6 +2,14 @@ > Append new entries at the top. Keep each entry ≤ 10 lines. +## ADR-040 — Role-aware health baselines (2026-03-18) +**Decision:** Add `lib/health-baselines.ts` — a dedicated module that computes and caches 30-day median baselines (completionRate, errorDensity, weeklyThroughput) per agent role. `applyRoleBaseline()` in `health.ts` normalises raw sub-metrics relative to the role's median before badge thresholds are applied, so a tester at 72% completion looks healthy if testers average 70% but alarming if they average 92%. +**Architecture:** `BaselineNorms` interface defined in `health.ts` (not `health-baselines.ts`) to break the potential circular import. `health-baselines.ts` satisfies it structurally. 5-minute TTL cache; `MIN_COHORT_SIZE = 3` guard falls back to flat thresholds when cohort is too thin. Feature-flagged via `ROLE_BASELINES_ENABLED` env var (default off) for safe A/B rollout. +**Degradation contract:** Cohort < 3 → flat thresholds. Agent < 5 runs → skip (hasEnoughData unchanged). Baseline metric null or zero → skip that sub-metric. DB error → return stale cache or null-metrics. New role → cohort = 0 → flat thresholds until data accumulates (≈2–3 weeks). +**New role warning:** Adding a new AgentType requires a baseline recalibration period; documented in `docs/agent-types.md`. 
+**Affects:** `lib/health-baselines.ts` (new), `lib/health.ts` (+`applyRoleBaseline`, `BaselineNorms`, `MIN_COHORT_SIZE_FOR_BASELINE`), `lib/health-cache.ts` (+role lookup, feature flag), `lib/db/repositories/taskRunRepo.ts` (+`dbGetTaskRunsByRole`), `__tests__/unit/health-baselines.test.ts` (new), `__tests__/unit/health-cache.test.ts` (extended), `docs/agent-types.md`. +**PR:** #29 (`feat/role-aware-health-baselines`) — depends on #26, #27. + ## ADR-037 — Meetings toggle in HistoryList + global meetings API (2026-03-14) **Decision:** Added `mode` state (`'runs' | 'meetings'`) to `HistoryList`. Meetings view shows a filterable table (date, topic, project, status, agents, tokens, cost) via a new global `GET /api/meetings` endpoint. Endpoint enriches data from `dbGetAllMeetings()` + per-meeting message aggregations (agentCount, totalTokens, totalCostUsd). Supports `from`/`to` epoch-ms and `status` query params. **Why:** History tab covers all runs; logical to extend it to meetings. Global endpoint needed because /history has no projectId. diff --git a/docs/agent-types.md b/docs/agent-types.md index ad1d1e2..18cbdcb 100644 --- a/docs/agent-types.md +++ b/docs/agent-types.md @@ -74,3 +74,31 @@ tester → 'testing' ``` Cards are automatically moved to the matching column when an agent of that role starts. + +## Health Baselines and New Role Addition + +When `ROLE_BASELINES_ENABLED=true`, each agent's health sub-metrics are +normalised against the median performance of its role cohort (see +`lib/health-baselines.ts` and ADR-040). + +**Adding a new AgentType requires a baseline recalibration period.** + +When a sixth (or later) role is added to the `AgentType` union: + +1. At deploy time, the new role has zero historical runs → `cohortSize = 0` + → `MIN_COHORT_SIZE` guard fires → health scores fall back to flat thresholds + automatically. No crash, no code change needed. 
After approximately **2–3 weeks** of production traffic, once the cohort + accumulates ≥ 3 agents each with ≥ 5 runs per week, the baseline will + self-populate on the next 5-minute cache refresh. +3. During the recalibration window, health badges for the new role display raw + absolute values (same as before `ROLE_BASELINES_ENABLED` was set). + +**Action required when adding a new role:** +- Update `AgentType` in `lib/types.ts` +- Update `ROLE_COLORS` / `ROLE_HEX` in `lib/constants.ts` +- Update pipeline logic in `lib/services/agentService.ts` if it joins the pipeline +- Note the recalibration period in the PR description so the team knows to + monitor badge accuracy for the new role for 2–3 weeks post-deploy +- Call `invalidateRoleBaselines()` in any migration script that back-fills + historical runs for the new role (so the cache doesn't serve stale data) diff --git a/lib/db/repositories/taskRunRepo.ts b/lib/db/repositories/taskRunRepo.ts index 10888e2..bec47af 100644 --- a/lib/db/repositories/taskRunRepo.ts +++ b/lib/db/repositories/taskRunRepo.ts @@ -178,3 +178,30 @@ export function dbGetRecentTaskRunsByAgent(agentId: string, since: number): Task .all() .map(rowToTaskRun) } + +/** + * Returns all task runs for agents of the given role within a trailing time + * window, ordered by completedAt DESC. + * + * Used by `lib/health-baselines.ts` to compute per-role aggregate baselines. + * Fetching all runs for the role in one query is more efficient than issuing N + * per-agent queries and merging in JS. + * + * @param role - The agent role string (e.g. 'coder', 'researcher'). + * @param windowMs - Look-back window in ms (default: 30 days). + * @param now - Injectable epoch ms for deterministic tests. 
+ */ +export function dbGetTaskRunsByRole( + role: string, + windowMs: number = 30 * 24 * 60 * 60 * 1_000, + now: number = Date.now(), +): TaskRun[] { + const since = now - windowMs + return db + .select() + .from(taskRuns) + .where(and(eq(taskRuns.role, role), gte(taskRuns.completedAt, since))) + .orderBy(desc(taskRuns.completedAt)) + .all() + .map(rowToTaskRun) +} diff --git a/lib/health-baselines.ts b/lib/health-baselines.ts new file mode 100644 index 0000000..0602ec5 --- /dev/null +++ b/lib/health-baselines.ts @@ -0,0 +1,243 @@ +/** + * Per-role aggregate health baselines. + * + * Computes and caches median performance metrics per agent role so that + * `lib/health-cache.ts` can normalise per-agent health scores against the + * role's own norms rather than flat absolute thresholds. + * + * Intended evolution: the in-memory TTL cache here is a stepping stone. Once + * data volume justifies it, replace `computeRoleBaseline` with a query backed + * by a persistent materialised view, keeping the `getRoleBaseline` API stable. + * + * Usage: + * import { getRoleBaseline, invalidateRoleBaselines } from '@/lib/health-baselines' + */ +import { dbGetTaskRunsByRole } from '@/lib/db/repositories/taskRunRepo' +import { computeHealthMetrics, MIN_RUNS_THRESHOLD } from '@/lib/health' +import type { AgentType, TaskRun } from '@/lib/types' + +// --------------------------------------------------------------------------- +// Constants +// --------------------------------------------------------------------------- + +/** + * Minimum number of agents of a given role, each with at least + * MIN_RUNS_THRESHOLD runs within the baseline window, required to form a + * statistically meaningful cohort. Below this we return null-metric + * baselines and fall back to flat thresholds. + */ +export const MIN_COHORT_SIZE = 3 + +/** + * TTL for cached role baselines (5 minutes). + * Baselines change slowly — far longer than the per-agent 30-second TTL. 
+ */ +export const BASELINE_TTL_MS = 300_000 // 5 minutes + +/** + * Look-back window for baseline computation: 30 days. + * Wider than the per-agent 7-day rolling window to establish stable role + * norms rather than sensitive short-term trends. + */ +export const BASELINE_WINDOW_MS = 30 * 24 * 60 * 60 * 1_000 // 30 days + +// --------------------------------------------------------------------------- +// Types +// --------------------------------------------------------------------------- + +/** + * Aggregate baseline for a single agent role derived from the trailing + * BASELINE_WINDOW_MS of task run history. + * + * Any metric field may be null when the cohort lacks sufficient data (e.g. + * fewer than MIN_COHORT_SIZE qualifying agents → every median is null). + * Callers must treat null as "baseline unavailable for this metric". + */ +export interface RoleBaseline { + role: AgentType + /** Median completion rate across all qualifying agents of this role. */ + medianCompletionRate: number | null + /** Median error density across all qualifying agents of this role. */ + medianErrorDensity: number | null + /** Median average weekly task throughput across qualifying agents. */ + medianWeeklyThroughput: number | null + /** Number of agents that contributed to this baseline. */ + cohortSize: number + /** Epoch ms when this baseline was last computed. */ + computedAt: number +} + +interface CachedBaseline { + baseline: RoleBaseline + /** Epoch ms when the cache entry was stored (used for TTL eviction). */ + computedAt: number +} + +// --------------------------------------------------------------------------- +// Cache store +// --------------------------------------------------------------------------- + +/** In-process TTL cache — keyed by AgentType. 
/**
 * Role → cached baseline store. Typed explicitly so that reads come back as
 * `CachedBaseline | undefined` instead of `any`; entries expire after
 * BASELINE_TTL_MS.
 */
const baselineCache = new Map<AgentType, CachedBaseline>()

// ---------------------------------------------------------------------------
// Internal helpers
// ---------------------------------------------------------------------------

/**
 * Returns the median of a numeric array.
 *
 * Non-finite entries (NaN, ±Infinity) are ignored: a NaN fed to a numeric
 * sort comparator yields an implementation-defined order, so a single bad
 * sample could otherwise corrupt the result or make it NaN.
 *
 * @param values - Samples to summarise; never mutated.
 * @returns The median of the finite samples, or null when there are none, so
 *          callers get a clean "no data" signal rather than NaN.
 */
export function medianOf(values: readonly number[]): number | null {
  // filter() copies, so the in-place sort never touches the caller's array.
  const ordered = values
    .filter((value) => Number.isFinite(value))
    .sort((a, b) => a - b)
  if (ordered.length === 0) return null

  const mid = Math.floor(ordered.length / 2)
  // Even length → mean of the two central samples; odd → the central sample.
  return ordered.length % 2 === 0
    ? (ordered[mid - 1] + ordered[mid]) / 2
    : ordered[mid]
}

// ---------------------------------------------------------------------------
// Core computation
// ---------------------------------------------------------------------------

/**
 * Computes per-role aggregate health baselines from the trailing
 * BASELINE_WINDOW_MS of task runs.
 *
 * Steps:
 *   1. Fetch all runs for this role within the window via DB.
 *   2. Group by agentId and keep only agents with ≥ MIN_RUNS_THRESHOLD runs.
 *   3. If the qualifying cohort is < MIN_COHORT_SIZE, return null-metric baseline.
 *   4. Compute per-agent metrics; take the median of each metric across agents.
 *
 * @param role - The AgentType to compute baselines for.
 * @param now  - Injectable epoch ms for deterministic testing.
/**
 * Contract notes for computeRoleBaseline: a failed DB read never throws (the
 * result degrades to a null-metric baseline with `cohortSize` 0), and the
 * reported `cohortSize` counts only agents whose metrics actually entered the
 * medians, so it can be trusted as a "thin cohort" guard by consumers.
 */
export function computeRoleBaseline(
  role: AgentType,
  now: number = Date.now(),
): RoleBaseline {
  let runs: TaskRun[]
  try {
    runs = dbGetTaskRunsByRole(role, BASELINE_WINDOW_MS, now)
  } catch {
    // DB error — return null-metrics baseline; caller falls back to flat thresholds
    return emptyRoleBaseline(role, now)
  }
  return summariseRoleRuns(role, runs, now)
}

/** Builds a baseline whose medians are all null ("baseline unavailable"). */
function emptyRoleBaseline(
  role: AgentType,
  now: number,
  cohortSize = 0,
): RoleBaseline {
  return {
    role,
    medianCompletionRate: null,
    medianErrorDensity: null,
    medianWeeklyThroughput: null,
    cohortSize,
    computedAt: now,
  }
}

/** Pure aggregation step: turns a role's task runs into its median baseline. */
function summariseRoleRuns(
  role: AgentType,
  runs: readonly TaskRun[],
  now: number,
): RoleBaseline {
  const windowStart = now - BASELINE_WINDOW_MS
  const weeksInWindow = BASELINE_WINDOW_MS / (7 * 24 * 60 * 60 * 1_000)

  // Group runs by agentId
  const byAgent = new Map<string, TaskRun[]>()
  for (const run of runs) {
    const bucket = byAgent.get(run.agentId)
    if (bucket) {
      bucket.push(run)
    } else {
      byAgent.set(run.agentId, [run])
    }
  }

  // Only agents with sufficient run history in the window may contribute.
  // The in-window count is kept so it is not recomputed for the throughput.
  const qualifying: Array<{ agentRuns: TaskRun[]; runsInWindow: number }> = []
  for (const agentRuns of byAgent.values()) {
    const runsInWindow = agentRuns.filter(
      (r) => r.completedAt >= windowStart && r.completedAt <= now,
    ).length
    if (runsInWindow >= MIN_RUNS_THRESHOLD) {
      qualifying.push({ agentRuns, runsInWindow })
    }
  }

  if (qualifying.length < MIN_COHORT_SIZE) {
    // Cohort too small — medians would be statistically meaningless
    return emptyRoleBaseline(role, now, qualifying.length)
  }

  // Collect per-agent metric values for median calculation
  const completionRates: number[] = []
  const errorDensities: number[] = []
  const weeklyThroughputs: number[] = []
  let contributors = 0

  for (const { agentRuns, runsInWindow } of qualifying) {
    const metrics = computeHealthMetrics(agentRuns, now)
    // computeHealthMetrics applies its own sufficiency rule; an agent it
    // rejects contributes nothing and must not be counted in the cohort.
    if (!metrics.hasEnoughData) continue
    contributors += 1

    if (metrics.completionRate !== null) completionRates.push(metrics.completionRate)
    if (metrics.errorDensity !== null) errorDensities.push(metrics.errorDensity)

    // Average weekly throughput = total window runs / weeks in window
    weeklyThroughputs.push(runsInWindow / weeksInWindow)
  }

  if (contributors < MIN_COHORT_SIZE) {
    // Too few agents survived the per-agent data check: publishing medians
    // here would let a thin cohort pass downstream cohortSize gates.
    return emptyRoleBaseline(role, now, contributors)
  }

  return {
    role,
    medianCompletionRate: medianOf(completionRates),
    medianErrorDensity: medianOf(errorDensities),
    medianWeeklyThroughput: medianOf(weeklyThroughputs),
    cohortSize: contributors,
    computedAt: now,
  }
}

// ---------------------------------------------------------------------------
// Public API
// ---------------------------------------------------------------------------

/**
 * Returns a cached role baseline, recomputing if stale or absent.
 *
 * This function never throws. On any DB or computation error it returns a
 * stale cached value if one exists, otherwise a safe null-metrics baseline
 * (which is not cached, so the next call retries).
 * This ensures health scoring always degrades gracefully rather than crashing.
 *
 * @param role - The AgentType whose baseline is requested.
 * @returns A fresh, cached, stale-on-error, or null-metric baseline.
 */
export function getRoleBaseline(role: AgentType): RoleBaseline {
  const cached = baselineCache.get(role)
  const now = Date.now()
  if (cached && now - cached.computedAt < BASELINE_TTL_MS) {
    return cached.baseline
  }

  try {
    // Read the DB directly rather than through computeRoleBaseline: that
    // function converts a DB failure into an empty baseline, which would be
    // cached here and overwrite a still-useful stale entry.
    const runs = dbGetTaskRunsByRole(role, BASELINE_WINDOW_MS, now)
    const baseline = summariseRoleRuns(role, runs, now)
    baselineCache.set(role, { baseline, computedAt: Date.now() })
    return baseline
  } catch {
    // On recomputation failure, prefer a stale cached value over nothing
    return cached?.baseline ?? emptyRoleBaseline(role, Date.now())
  }
}

/**
 * Invalidates cached baselines.
 * - If `role` is provided, clears only that role's cached entry.
+ * - If omitted, clears all roles. + * + * Call this when an agent is created or deleted (cohort size may shift), or + * when a new AgentType is added to the system. + */ +export function invalidateRoleBaselines(role?: AgentType): void { + if (role !== undefined) { + baselineCache.delete(role) + } else { + baselineCache.clear() + } +} diff --git a/lib/health-cache.ts b/lib/health-cache.ts index 6096eb1..7526a3a 100644 --- a/lib/health-cache.ts +++ b/lib/health-cache.ts @@ -1,5 +1,7 @@ -import { computeHealthMetrics, ROLLING_WINDOW_MS } from '@/lib/health' +import { computeHealthMetrics, applyRoleBaseline } from '@/lib/health' import { dbGetTaskRunsByAgent } from '@/lib/db/repositories/taskRunRepo' +import { dbGetAgent } from '@/lib/db/repositories/agentRepo' +import { getRoleBaseline } from '@/lib/health-baselines' import type { AgentHealthMetrics } from '@/lib/types' // --------------------------------------------------------------------------- @@ -37,11 +39,31 @@ export function getCachedHealthMetrics( /** * Queries the DB for all task runs belonging to this agent, computes health - * metrics, stores the result in the cache, and returns it. + * metrics, applies role-relative baseline normalisation when the + * `ROLE_BASELINES_ENABLED` environment variable is set to `"true"`, stores + * the result in the cache, and returns it. + * + * Role-baseline normalisation is gated behind the feature flag so the code + * can be deployed without activating it, enabling an A/B comparison of badge + * behaviour and an emergency rollback without a code revert. */ export function computeAndCacheHealthMetrics(agentId: string): AgentHealthMetrics { const runs = dbGetTaskRunsByAgent(agentId) - const metrics = computeHealthMetrics(runs) + const raw = computeHealthMetrics(runs) + + let metrics = raw + + if (process.env.ROLE_BASELINES_ENABLED === 'true') { + // Look up the agent's role so we can fetch the correct cohort baseline.
+ // dbGetAgent returns undefined for unknown ids; in that case normalisation + // is skipped entirely (applyRoleBaseline is not called) and the raw metrics are cached as-is. + const agent = dbGetAgent(agentId) + if (agent) { + const baseline = getRoleBaseline(agent.type) + metrics = applyRoleBaseline(raw, baseline) + } + } + cache.set(agentId, { metrics, computedAt: Date.now() }) return metrics } diff --git a/lib/health.ts b/lib/health.ts index 958e3de..1d5f528 100644 --- a/lib/health.ts +++ b/lib/health.ts @@ -1,5 +1,24 @@ import type { TaskRun, AgentHealthMetrics } from '@/lib/types' +// --------------------------------------------------------------------------- +// Role-baseline normalisation types +// --------------------------------------------------------------------------- + +/** + * Structural subset of RoleBaseline used by applyRoleBaseline. + * + * Defining it here (rather than importing from health-baselines.ts) avoids a + * circular dependency: health-baselines.ts → health.ts → health-baselines.ts. + * Any object that satisfies this shape — including RoleBaseline — is accepted + * by applyRoleBaseline thanks to TypeScript's structural typing. + */ +export interface BaselineNorms { + medianCompletionRate: number | null + medianErrorDensity: number | null + /** Number of agents that contributed; guards against thin cohorts. */ + cohortSize: number +} + // --------------------------------------------------------------------------- // Constants // --------------------------------------------------------------------------- @@ -117,3 +136,75 @@ export function computeHealthMetrics( hasEnoughData: true, } } + +// --------------------------------------------------------------------------- +// Role-baseline normalisation +// --------------------------------------------------------------------------- + +/** + * The minimum cohort size required to treat a baseline as meaningful.
/**
 * Minimum cohort size for a baseline to be trusted by applyRoleBaseline.
 * Mirrors the constant in health-baselines.ts; duplicated here to keep
 * health.ts free of any import from health-baselines.ts.
 */
export const MIN_COHORT_SIZE_FOR_BASELINE = 3

/**
 * Adjusts raw health metrics relative to the role's aggregate norms.
 *
 * Returns the raw metrics object itself, **unchanged**, when:
 *   - `baseline` is null (caller signals "no baseline available")
 *   - `baseline.cohortSize < MIN_COHORT_SIZE_FOR_BASELINE` (cohort too thin)
 *   - `raw.hasEnoughData` is false (not enough per-agent data to compare)
 *
 * Otherwise returns a copy in which each metric is normalised only when both
 * the raw value and the baseline median are non-null and the median is > 0:
 *   - `completionRate` → raw / medianCompletionRate, clamped to [0, 1]
 *       (1.0 = at or above role norm; <1.0 = below norm)
 *   - `errorDensity`   → raw / medianErrorDensity
 *       (1.0 = at role norm; >1.0 = worse than norm; <1.0 = better than norm)
 * All other fields are copied through untouched.
 *
 * The AgentHealthMetrics shape is **not** changed — this is a purely internal
 * value transform applied before badge thresholds are evaluated.
 *
 * @param raw      - Metrics as produced for a single agent.
 * @param baseline - Role norms, or null when none are available.
 * @returns `raw` itself when a gate applies, otherwise a normalised copy.
 *
 * @internal — exported for unit testing; consume via health-cache.ts
 */
export function applyRoleBaseline(
  raw: AgentHealthMetrics,
  baseline: BaselineNorms | null,
): AgentHealthMetrics {
  // Gate 1: no baseline or cohort too small → flat thresholds apply unchanged
  if (baseline === null || baseline.cohortSize < MIN_COHORT_SIZE_FOR_BASELINE) {
    return raw
  }

  // Gate 2: agent lacks sufficient data → nothing to normalise
  if (!raw.hasEnoughData) return raw

  const { medianCompletionRate, medianErrorDensity } = baseline
  let { completionRate, errorDensity } = raw

  // A null or zero median cannot act as a divisor, so that metric stays raw.
  if (
    completionRate !== null &&
    medianCompletionRate !== null &&
    medianCompletionRate > 0
  ) {
    const ratio = completionRate / medianCompletionRate
    // Enforce both ends of the documented [0, 1] range, not just the top.
    completionRate = Math.min(Math.max(ratio, 0), 1)
  }

  // errorDensity is deliberately left uncapped: a ratio above 1 is the
  // "worse than role norm" signal.
  if (
    errorDensity !== null &&
    medianErrorDensity !== null &&
    medianErrorDensity > 0
  ) {
    errorDensity = errorDensity / medianErrorDensity
  }

  return {
    ...raw,
    completionRate,
    errorDensity,
  }
}