From 93601f229085b37a4ceba470c51c94a922aa6079 Mon Sep 17 00:00:00 2001 From: unknown Date: Sat, 28 Mar 2026 18:04:47 -0700 Subject: [PATCH] Add test coverage as 4th Copeland criterion with anti-gaming safeguards MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Split filesChanged into nonTestFiles (fewer=better) and testFiles (more=better, capped at 3). Anti-gaming: test files only count when agent also changed production code. Prevents score inflation via empty test files. 4 criteria now: tests passed, convergence, code scope, test coverage. Generated by thinktank Opus (5 agents, 4 pass, all 3 passing had identical anti-gaming logic — strong consensus on the approach). Closes #119 Co-Authored-By: Claude Opus 4.6 (1M context) --- docs/architecture.md | 19 ++++-- src/scoring/convergence.test.ts | 112 ++++++++++++++++++++++++++++++-- src/scoring/convergence.ts | 65 ++++++++++++++---- src/types.ts | 3 +- src/utils/display.ts | 10 +-- 5 files changed, 181 insertions(+), 28 deletions(-) diff --git a/docs/architecture.md b/docs/architecture.md index 05be7d5..b761c59 100644 --- a/docs/architecture.md +++ b/docs/architecture.md @@ -119,13 +119,20 @@ The agent with the highest total score is recommended. Ties broken by the first ### Copeland Pairwise Scoring (alternative) -Enabled with `--scoring copeland`. Instead of assigning absolute point values, Copeland scoring compares every pair of agents head-to-head on three criteria: +Enabled with `--scoring copeland`. 
Instead of assigning absolute point values, Copeland scoring compares every pair of agents head-to-head on four criteria: -| Criterion | Better = | -|-----------|----------| -| Tests passed | Passed > Failed | -| Convergence group size | Larger group > Smaller group | -| Files changed | Fewer files > More files | +| Criterion | Better = | Notes | +|-----------|----------|-------| +| Tests passed | Passed > Failed | | +| Convergence group size | Larger group > Smaller group | | +| Non-test files changed | Fewer files > More files | Minimal code scope preferred | +| Test files added/modified | More files > Fewer files | Capped at 3; only counts when agent also changed non-test files | + +Test files are identified by a `.test.` or `.spec.` segment anywhere in the file path (e.g. `foo.test.ts`, `bar.spec.js`); a file named `test.*` or `spec.*` directly inside a directory (e.g. `src/test.ts`) also matches. Every other changed file counts as a non-test file. + +**Anti-gaming:** The test files criterion only applies when the agent also changed production (non-test) code. An agent that only adds test files without changing production code receives no test coverage bonus — this prevents gaming the score with empty test padding. + +**Cap:** The effective test file count is `min(testFiles, 3)`. This means 1 test file < 2 < 3+, but 3 and 10 are treated equally — adequate coverage is rewarded, but excessive test files don't dominate. For each pair (A, B): 1. 
Count how many criteria A wins vs B wins diff --git a/src/scoring/convergence.test.ts b/src/scoring/convergence.test.ts index b9a5c60..48bc699 100644 --- a/src/scoring/convergence.test.ts +++ b/src/scoring/convergence.test.ts @@ -294,7 +294,8 @@ describe("copelandRecommend", () => { assert.equal(score.copelandTotal, 0, `Agent #${score.agentId} should have Copeland score 0`); assert.equal(score.testsWins, 0); assert.equal(score.convergenceWins, 0); - assert.equal(score.filesChangedWins, 0); + assert.equal(score.nonTestFilesWins, 0); + assert.equal(score.testFilesWins, 0); } // Still recommends someone (first agent) assert.ok(result.recommended !== null); @@ -317,9 +318,10 @@ describe("copelandRecommend", () => { const convergence = analyzeConvergence(agents); const result = copelandRecommend(agents, tests, convergence); - // Agent 1 vs Agent 2: tests(+1), convergence(-1), files(-1) → Agent 2 wins - // Agent 1 vs Agent 3: tests(+1), convergence(-1), files(tie) → tie - // Agent 2 vs Agent 3: tests(tie), convergence(tie), files(+1 for 2) → Agent 2 wins + // No test files in any agent, so testFiles criterion is always tied + // Agent 1 vs Agent 2: tests(+1), convergence(-1), scope(-1), testFiles(tie) → Agent 2 wins + // Agent 1 vs Agent 3: tests(+1), convergence(-1), scope(tie), testFiles(tie) → tie + // Agent 2 vs Agent 3: tests(tie), convergence(tie), scope(+1 for 2), testFiles(tie) → Agent 2 wins // So Agent 2 should have the best Copeland score assert.equal(result.recommended, 2); }); @@ -375,8 +377,8 @@ describe("copelandRecommend", () => { // Score1 wins tests and files, score2 wins neither assert.equal(score1.testsWins, 1); assert.equal(score2.testsWins, -1); - assert.equal(score1.filesChangedWins, 1); - assert.equal(score2.filesChangedWins, -1); + assert.equal(score1.nonTestFilesWins, 1); + assert.equal(score2.nonTestFilesWins, -1); }); it("handles single agent", () => { @@ -387,4 +389,102 @@ describe("copelandRecommend", () => { 
assert.equal(result.scores.length, 1); assert.equal(result.scores[0]!.copelandTotal, 0); }); + + it("agent with tests beats agent without when other criteria tie", () => { + // Agent 1 changes 1 prod file + 1 test file; Agent 2 changes only the same prod file; same convergence + const agents = [ + makeAgent({ id: 1, diff: DIFF_A, filesChanged: ["a.ts", "a.test.ts"] }), + makeAgent({ id: 2, diff: DIFF_A, filesChanged: ["a.ts"] }), + ]; + const tests = [ + { agentId: 1, passed: true }, + { agentId: 2, passed: true }, + ]; + const convergence = analyzeConvergence(agents); + const result = copelandRecommend(agents, tests, convergence); + + // Agent 1 wins testFiles criterion (+1 vs 0), ties everything else + assert.equal(result.recommended, 1); + const score1 = result.scores.find((s) => s.agentId === 1); + assert.ok(score1); + assert.ok(score1.testFilesWins > 0); + }); + + it("test-only changes do not get test file bonus", () => { + // Agent 1 changes only test files (no prod code) — should not get testFiles bonus + // Agent 2 changes 1 prod file + const agents = [ + makeAgent({ id: 1, diff: DIFF_A, filesChanged: ["a.test.ts", "b.spec.ts"] }), + makeAgent({ id: 2, diff: DIFF_B, filesChanged: ["x.ts"] }), + ]; + const convergence = analyzeConvergence(agents); + const result = copelandRecommend(agents, [], convergence); + + const score1 = result.scores.find((s) => s.agentId === 1); + const score2 = result.scores.find((s) => s.agentId === 2); + assert.ok(score1); + assert.ok(score2); + + // Agent 1 has 0 effective test files (no prod changes), Agent 2 also has 0 test files + // So testFilesWins should be 0 for both + assert.equal(score1.testFilesWins, 0); + assert.equal(score2.testFilesWins, 0); + + // Agent 1 has 0 nonTestFiles, Agent 2 has 1 — but fewer is better, + // so Agent 1 wins scope. However Agent 2 is not disadvantaged on testFiles. 
+ }); + + it("test file cap prevents gaming with many test files", () => { + // Agent 1: 1 prod file + 10 test files + // Agent 2: 1 prod file + 2 test files + // Agent 3: 1 prod file + 3 test files + // After capping at 3: Agent 1 effective=3, Agent 2 effective=2, Agent 3 effective=3 + const agents = [ + makeAgent({ + id: 1, + diff: DIFF_A, + filesChanged: [ + "a.ts", + "a.test.ts", + "b.test.ts", + "c.test.ts", + "d.test.ts", + "e.test.ts", + "f.test.ts", + "g.test.ts", + "h.test.ts", + "i.test.ts", + "j.test.ts", + ], + }), + makeAgent({ + id: 2, + diff: DIFF_A, + filesChanged: ["a.ts", "a.test.ts", "b.test.ts"], + }), + makeAgent({ + id: 3, + diff: DIFF_A, + filesChanged: ["a.ts", "a.test.ts", "b.test.ts", "c.test.ts"], + }), + ]; + const convergence = analyzeConvergence(agents); + const result = copelandRecommend(agents, [], convergence); + + const score1 = result.scores.find((s) => s.agentId === 1); + const score3 = result.scores.find((s) => s.agentId === 3); + assert.ok(score1); + assert.ok(score3); + + // Agent 1 (10 test files capped to 3) and Agent 3 (3 test files capped to 3) + // should tie on testFiles criterion + // Agent 1 vs Agent 3: testFilesWins contribution should be 0 (tie) + // In pairwise: A1 effective=3 vs A3 effective=3 → tie on testFiles + assert.equal(score1.testFilesWins, score3.testFilesWins); + + // Agent 2 (2 test files) should lose to both Agent 1 and Agent 3 on testFiles + const score2 = result.scores.find((s) => s.agentId === 2); + assert.ok(score2); + assert.ok(score2.testFilesWins < score1.testFilesWins); + }); }); diff --git a/src/scoring/convergence.ts b/src/scoring/convergence.ts index 8351c49..e7cdae9 100644 --- a/src/scoring/convergence.ts +++ b/src/scoring/convergence.ts @@ -177,9 +177,39 @@ export function recommend( return { recommended: bestId, scores: agentScores }; } +const TEST_FILE_PATTERN = /[./](?:test|spec)\./; + +/** Cap for test file criterion — prevents gaming with many test files */ +const TEST_FILE_CAP = 3; + 
+/** + * Count test files (matching *.test.* or *.spec.*) and non-test files separately. + */ +function splitFilesByType(files: string[]): { testFiles: number; nonTestFiles: number } { + let testFiles = 0; + let nonTestFiles = 0; + for (const f of files) { + if (TEST_FILE_PATTERN.test(f)) { + testFiles++; + } else { + nonTestFiles++; + } + } + return { testFiles, nonTestFiles }; +} + +/** + * Effective test file count for scoring: capped at TEST_FILE_CAP, and only + * counts when the agent also changed non-test files (prevents gaming). + */ +function effectiveTestFiles(testFiles: number, nonTestFiles: number): number { + if (nonTestFiles === 0) return 0; + return Math.min(testFiles, TEST_FILE_CAP); +} + /** * Copeland pairwise scoring: compare every pair of agents head-to-head - * on three criteria (tests passed, convergence group size, files changed). + * on four criteria (tests passed, convergence group size, non-test files changed, test files). * For each pair, the agent winning more criteria gets +1, the loser gets -1, ties get 0. * The agent with the highest Copeland score is recommended. */ @@ -197,8 +227,9 @@ export function copelandRecommend( const testsPassed = test?.passed ? 1 : 0; const group = convergence.find((g) => g.agents.includes(agent.id)); const groupSize = group ? 
group.agents.length : 0; - const filesChanged = agent.filesChanged.length; - return { id: agent.id, testsPassed, groupSize, filesChanged }; + const { testFiles, nonTestFiles } = splitFilesByType(agent.filesChanged); + const cappedTestFiles = effectiveTestFiles(testFiles, nonTestFiles); + return { id: agent.id, testsPassed, groupSize, nonTestFiles, cappedTestFiles }; }); // Initialize scores @@ -208,7 +239,8 @@ export function copelandRecommend( agentId: data.id, testsWins: 0, convergenceWins: 0, - filesChangedWins: 0, + nonTestFilesWins: 0, + testFilesWins: 0, copelandTotal: 0, }); } @@ -244,15 +276,26 @@ export function copelandRecommend( scoreMap.get(a.id)!.convergenceWins--; } - // Criterion 3: files changed (fewer is better — minimal changes preferred) - if (a.filesChanged < b.filesChanged) { + // Criterion 3: non-test files changed (fewer is better — minimal code scope) + if (a.nonTestFiles < b.nonTestFiles) { + aWins++; + scoreMap.get(a.id)!.nonTestFilesWins++; + scoreMap.get(b.id)!.nonTestFilesWins--; + } else if (b.nonTestFiles < a.nonTestFiles) { + bWins++; + scoreMap.get(b.id)!.nonTestFilesWins++; + scoreMap.get(a.id)!.nonTestFilesWins--; + } + + // Criterion 4: test files added/modified (more is better, capped, only with prod changes) + if (a.cappedTestFiles > b.cappedTestFiles) { aWins++; - scoreMap.get(a.id)!.filesChangedWins++; - scoreMap.get(b.id)!.filesChangedWins--; - } else if (b.filesChanged < a.filesChanged) { + scoreMap.get(a.id)!.testFilesWins++; + scoreMap.get(b.id)!.testFilesWins--; + } else if (b.cappedTestFiles > a.cappedTestFiles) { bWins++; - scoreMap.get(b.id)!.filesChangedWins++; - scoreMap.get(a.id)!.filesChangedWins--; + scoreMap.get(b.id)!.testFilesWins++; + scoreMap.get(a.id)!.testFilesWins--; } // Overall Copeland: winner of more criteria gets +1, loser -1 diff --git a/src/types.ts b/src/types.ts index ded3549..a9c2f6b 100644 --- a/src/types.ts +++ b/src/types.ts @@ -53,7 +53,8 @@ export interface CopelandScore { agentId: number; 
testsWins: number; convergenceWins: number; - filesChangedWins: number; + nonTestFilesWins: number; + testFilesWins: number; copelandTotal: number; } diff --git a/src/utils/display.ts b/src/utils/display.ts index 6f6c6fa..c1e9faa 100644 --- a/src/utils/display.ts +++ b/src/utils/display.ts @@ -107,10 +107,11 @@ export function displayResults(result: EnsembleResult): void { padRight("Agent", 8) + padRight("Tests", 10) + padRight("Converge", 10) + - padRight("Files", 10) + + padRight("Scope", 10) + + padRight("TestCov", 10) + padRight("Copeland", 10), ); - console.log(" " + pc.dim("─".repeat(48))); + console.log(" " + pc.dim("─".repeat(58))); for (const score of result.copelandScores) { const isRecommended = result.scoring === "copeland" && result.recommended === score.agentId; @@ -121,7 +122,8 @@ export function displayResults(result: EnsembleResult): void { padRight(`#${score.agentId}`, 8) + padRight(fmt(score.testsWins), 10) + padRight(fmt(score.convergenceWins), 10) + - padRight(fmt(score.filesChangedWins), 10) + + padRight(fmt(score.nonTestFilesWins), 10) + + padRight(fmt(score.testFilesWins), 10) + padRight(fmt(score.copelandTotal), 10), ); } @@ -133,7 +135,7 @@ export function displayResults(result: EnsembleResult): void { const method = result.scoring === "copeland" ? "Copeland pairwise" : "weighted"; console.log( pc.cyan(` Recommended: Agent #${result.recommended}`) + - pc.dim(` (${method} scoring: tests + convergence + diff size)`), + pc.dim(` (${method} scoring: tests + convergence + scope + test coverage)`), ); console.log(); }