Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
509 changes: 509 additions & 0 deletions __tests__/unit/health-baselines.test.ts

Large diffs are not rendered by default.

130 changes: 129 additions & 1 deletion __tests__/unit/health-cache.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,16 +3,65 @@
*
* These tests use a real DB (isolated per test file via setup.ts) to verify
* that computeAndCacheHealthMetrics correctly reads from DB and caches results.
*
* The "computeAndCacheHealthMetrics with ROLE_BASELINES_ENABLED" describe
* block covers the role-relative baseline integration path, seeding both
* agent records and task-run history to exercise the full normalization
* pipeline.
*/
import { describe, it, expect, beforeEach } from 'vitest'
import { describe, it, expect, beforeEach, afterEach } from 'vitest'
import {
getCachedHealthMetrics,
computeAndCacheHealthMetrics,
invalidateHealthCache,
} from '@/lib/health-cache'
import { dbAddTaskRun } from '@/lib/db/repositories/taskRunRepo'
import { dbAddAgent } from '@/lib/db/repositories/agentRepo'
import { invalidateRoleBaselines } from '@/lib/health-baselines'
import { makeTestTaskRun } from '../helpers/test-utils'
import { randomUUID } from 'crypto'
import type { Agent } from '@/lib/types'

// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------

/**
 * Test helper: builds the smallest valid Agent (empty prompt, `idle` status,
 * no events, `createdAt` = now), writes it to the DB via `dbAddAgent`, and
 * hands the same object back to the caller.
 *
 * @param agentId - Identifier to store on the agent record.
 * @param type - Agent role; defaults to `'coder'`.
 * @returns The Agent object that was persisted.
 */
function seedAgent(agentId: string, type: Agent['type'] = 'coder'): Agent {
  const createdAt = Date.now()
  const record: Agent = { id: agentId, type, prompt: '', status: 'idle', events: [], createdAt }
  dbAddAgent(record)
  return record
}

/**
 * Test helper: persists `count` task runs for `agentId`, each with a fresh
 * random id.
 *
 * Run `i` completes `i` minutes before "now" and starts 30 seconds before it
 * completes, so for the small counts used in these tests every run falls
 * inside the 7-day rolling window and hasEnoughData can become true.
 *
 * @param agentId - Agent the runs are attributed to.
 * @param role - Role recorded on each run.
 * @param count - Number of runs to insert.
 * @param status - Terminal status for every run; defaults to `'done'`.
 */
function seedRunsForAgent(
  agentId: string,
  role: Agent['type'],
  count: number,
  status: 'done' | 'failed' = 'done',
): void {
  const SPACING_MS = 60_000
  const DURATION_MS = 30_000
  const anchor = Date.now()

  let index = 0
  while (index < count) {
    const completedAt = anchor - index * SPACING_MS
    const run = makeTestTaskRun(randomUUID(), {
      agentId,
      role,
      status,
      completedAt,
      startedAt: completedAt - DURATION_MS,
    })
    dbAddTaskRun(run)
    index += 1
  }
}

beforeEach(() => {
// Clear cache between tests to avoid cross-test contamination
Expand Down Expand Up @@ -110,3 +159,82 @@ describe('invalidateHealthCache', () => {
expect(() => invalidateHealthCache('ghost-agent')).not.toThrow()
})
})

// ---------------------------------------------------------------------------
// Role-baseline integration (ROLE_BASELINES_ENABLED=true)
// ---------------------------------------------------------------------------

describe('computeAndCacheHealthMetrics with ROLE_BASELINES_ENABLED', () => {
  // Value of the feature flag before each test, so afterEach can put it back
  // instead of unconditionally deleting it (which would clobber a flag set by
  // the surrounding environment, e.g. a CI job that runs with it enabled).
  let previousFlag: string | undefined

  beforeEach(() => {
    previousFlag = process.env.ROLE_BASELINES_ENABLED
    // Start every test from empty caches with the flag switched on.
    invalidateHealthCache()
    invalidateRoleBaselines()
    process.env.ROLE_BASELINES_ENABLED = 'true'
  })

  afterEach(() => {
    // Restore the pre-test flag state exactly (unset stays unset).
    if (previousFlag === undefined) {
      delete process.env.ROLE_BASELINES_ENABLED
    } else {
      process.env.ROLE_BASELINES_ENABLED = previousFlag
    }
    invalidateRoleBaselines()
  })

  it('returns raw metrics (no normalisation) when feature flag is off', () => {
    delete process.env.ROLE_BASELINES_ENABLED // ensure flag is off
    const agentId = randomUUID()
    seedAgent(agentId, 'coder')
    seedRunsForAgent(agentId, 'coder', 6)

    const metrics = computeAndCacheHealthMetrics(agentId)
    // Without baseline, completionRate should be the raw value (1.0 = all done)
    expect(metrics.completionRate).toBe(1)
    expect(metrics.errorDensity).toBe(0)
  })

  it('applies role-relative normalisation when feature flag is enabled', () => {
    // Set up: role median completionRate = 0.5 (mixed cohort)
    // Build a cohort of 3 agents for the 'researcher' role with 50% completion
    const cohortAgents = [randomUUID(), randomUUID(), randomUUID()]
    for (const id of cohortAgents) {
      seedAgent(id, 'researcher')
      // 3 done + 3 failed = 50% completion rate per agent
      seedRunsForAgent(id, 'researcher', 3, 'done')
      seedRunsForAgent(id, 'researcher', 3, 'failed')
    }

    // Now test an agent that has 100% completion rate
    const agentId = randomUUID()
    seedAgent(agentId, 'researcher')
    seedRunsForAgent(agentId, 'researcher', 6, 'done')

    // Invalidate baseline cache so it recomputes from the seeded cohort
    invalidateRoleBaselines()

    const metrics = computeAndCacheHealthMetrics(agentId)

    // The agent performs above role norm:
    //   raw completionRate = 1.0, role median ≈ 0.5
    //   normalised = min(1.0 / 0.5, 1) = 1.0 (capped)
    // NOTE(review): the raw value is also 1.0, so this assertion passes whether
    // or not normalisation ran. Consider a below-median agent whose normalised
    // value differs from its raw value — confirm against applyRoleBaseline.
    expect(metrics.hasEnoughData).toBe(true)
    expect(metrics.completionRate).toBe(1)
  })

  it('degrades to raw metrics when agent is not found in DB', () => {
    // agentId with no DB record — the role lookup has nothing to resolve, so
    // the baseline path is expected to be skipped.
    const ghostAgentId = randomUUID()
    // Seed runs directly without creating the agent record
    seedRunsForAgent(ghostAgentId, 'coder', 6)

    const metrics = computeAndCacheHealthMetrics(ghostAgentId)
    // Should succeed without throwing; returns raw metrics
    expect(metrics).toBeDefined()
    expect(metrics.hasEnoughData).toBe(true)
  })

  it('stores normalised metrics in cache and returns the same value on hit', () => {
    const agentId = randomUUID()
    seedAgent(agentId, 'coder')
    seedRunsForAgent(agentId, 'coder', 6)

    const computed = computeAndCacheHealthMetrics(agentId)
    // Second argument is presumably the max cache age in ms — confirm against
    // getCachedHealthMetrics.
    const cached = getCachedHealthMetrics(agentId, 60_000)
    expect(cached).toEqual(computed)
  })
})
5 changes: 3 additions & 2 deletions context/current-sprint.md
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
# Current Sprint

> Last updated: 2026-03-17
> Last updated: 2026-03-18

## Active Worktrees

*(none — all branches merged, worktrees cleaned up)*
- `feat/role-aware-health-baselines` — PR #29 open, awaiting review (depends on #26, #27)

## In Progress
- [ ] Role-aware health baselines — PR #29 open (awaiting #26 + #27 as prerequisites)
- [ ] Authentication & role-based access (protect dashboard + API routes)
- [ ] Agent log search & filtering

Expand Down
8 changes: 8 additions & 0 deletions context/decisions.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,14 @@

> Append new entries at the top. Keep each entry ≤ 10 lines.

## ADR-040 — Role-aware health baselines (2026-03-18)
**Decision:** Add `lib/health-baselines.ts` — a dedicated module that computes and caches 30-day median baselines (completionRate, errorDensity, weeklyThroughput) per agent role. `applyRoleBaseline()` in `health.ts` normalises raw sub-metrics relative to the role's median before badge thresholds are applied, so a tester at 72% completion looks healthy if testers average 70% but alarming if they average 92%.
**Architecture:** `BaselineNorms` interface defined in `health.ts` (not `health-baselines.ts`) to break the potential circular import. `health-baselines.ts` satisfies it structurally. 5-minute TTL cache; `MIN_COHORT_SIZE = 3` guard falls back to flat thresholds when cohort is too thin. Feature-flagged via `ROLE_BASELINES_ENABLED` env var (default off) for safe A/B rollout.
**Degradation contract:** Cohort < 3 → flat thresholds. Agent < 5 runs → skip (hasEnoughData unchanged). Baseline metric null or zero → skip that sub-metric. DB error → return stale cache or null-metrics. New role → cohort = 0 → flat thresholds until data accumulates (≈2–3 weeks).
**New role warning:** Adding a new AgentType requires a baseline recalibration period; documented in `docs/agent-types.md`.
**Affects:** `lib/health-baselines.ts` (new), `lib/health.ts` (+`applyRoleBaseline`, `BaselineNorms`, `MIN_COHORT_SIZE_FOR_BASELINE`), `lib/health-cache.ts` (+role lookup, feature flag), `lib/db/repositories/taskRunRepo.ts` (+`dbGetTaskRunsByRole`), `__tests__/unit/health-baselines.test.ts` (new), `__tests__/unit/health-cache.test.ts` (extended), `docs/agent-types.md`.
**PR:** #29 (`feat/role-aware-health-baselines`) — depends on #26, #27.

## ADR-037 — Meetings toggle in HistoryList + global meetings API (2026-03-14)
**Decision:** Added `mode` state (`'runs' | 'meetings'`) to `HistoryList`. Meetings view shows a filterable table (date, topic, project, status, agents, tokens, cost) via a new global `GET /api/meetings` endpoint. Endpoint enriches data from `dbGetAllMeetings()` + per-meeting message aggregations (agentCount, totalTokens, totalCostUsd). Supports `from`/`to` epoch-ms and `status` query params.
**Why:** History tab covers all runs; logical to extend it to meetings. Global endpoint needed because /history has no projectId.
Expand Down
28 changes: 28 additions & 0 deletions docs/agent-types.md
Original file line number Diff line number Diff line change
Expand Up @@ -74,3 +74,31 @@ tester → 'testing'
```

Cards are automatically moved to the matching column when an agent of that role starts.

## Health Baselines and New Role Addition

When `ROLE_BASELINES_ENABLED=true`, each agent's health sub-metrics are
normalised against the median performance of its role cohort (see
`lib/health-baselines.ts` and ADR-040).

**Adding a new AgentType requires a baseline recalibration period.**

When a sixth (or later) role is added to the `AgentType` union:

1. At deploy time, the new role has zero historical runs → `cohortSize = 0`
→ `MIN_COHORT_SIZE` guard fires → health scores fall back to flat thresholds
automatically. No crash, no code change needed.
2. After approximately **2–3 weeks** of production traffic, once the cohort
accumulates ≥ 3 agents each with ≥ 5 runs per week, the baseline will
self-populate on the next 5-minute cache refresh.
3. During the recalibration window, health badges for the new role display raw
absolute values (same as before `ROLE_BASELINES_ENABLED` was set).

**Action required when adding a new role:**
- Update `AgentType` in `lib/types.ts`
- Update `ROLE_COLORS` / `ROLE_HEX` in `lib/constants.ts`
- Update pipeline logic in `lib/services/agentService.ts` if it joins the pipeline
- Note the recalibration period in the PR description so the team knows to
monitor badge accuracy for the new role for 2–3 weeks post-deploy
- Call `invalidateRoleBaselines()` in any migration script that back-fills
historical runs for the new role (so the cache doesn't serve stale data)
27 changes: 27 additions & 0 deletions lib/db/repositories/taskRunRepo.ts
Original file line number Diff line number Diff line change
Expand Up @@ -178,3 +178,30 @@ export function dbGetRecentTaskRunsByAgent(agentId: string, since: number): Task
.all()
.map(rowToTaskRun)
}

/**
 * Fetches every task run recorded for `role` whose `completedAt` is at or
 * after `now - windowMs`, newest completion first.
 *
 * `lib/health-baselines.ts` uses this to build per-role aggregate baselines;
 * a single role-wide query avoids issuing one query per agent and merging
 * the results in JS.
 *
 * @param role - The agent role string (e.g. 'coder', 'researcher').
 * @param windowMs - Size of the look-back window in ms (default: 30 days).
 * @param now - Reference epoch ms, injectable for deterministic tests.
 * @returns Matching runs mapped through `rowToTaskRun`, ordered by
 *   `completedAt` descending.
 */
export function dbGetTaskRunsByRole(
  role: string,
  windowMs: number = 30 * 24 * 60 * 60 * 1_000,
  now: number = Date.now(),
): TaskRun[] {
  const windowStart = now - windowMs
  const roleWithinWindow = and(
    eq(taskRuns.role, role),
    gte(taskRuns.completedAt, windowStart),
  )
  const rows = db
    .select()
    .from(taskRuns)
    .where(roleWithinWindow)
    .orderBy(desc(taskRuns.completedAt))
    .all()
  return rows.map(rowToTaskRun)
}
Loading
Loading