From 5a67f9ab761aaa7118b6f647a4715dbb7ff05000 Mon Sep 17 00:00:00 2001
From: Claude
Date: Mon, 15 Jun 2026 16:20:55 +0000
Subject: [PATCH] chore(harnesses): refresh leaderboard snapshots from upstream

Updated lastUpdated to 2026-06-15. Benchmark scores unchanged: official upstream sites (swebench.com, tbench.ai, aider.chat) returned HTTP 403 during this run so per-entry scores could not be re-verified. The Aider Polyglot YAML on GitHub is current only through October 2025 and does not yet include our tracked model names (Claude Opus 4.7, GPT-5.5, DeepSeek V4 Pro).

Added a one-line TODO in note for Kilo Code, the official successor to roo-code which shut down 2026-05-15.

https://claude.ai/code/session_01SVZWBi9UTPJ1VrrBUnKda9
---
 data/harnesses.json     | 4 ++--
 worker/src/harnesses.ts | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/data/harnesses.json b/data/harnesses.json
index 4454aa80..198bae05 100644
--- a/data/harnesses.json
+++ b/data/harnesses.json
@@ -1,6 +1,6 @@
 {
-  "lastUpdated": "2026-04-30",
-  "note": "Snapshot of public agentic-coding leaderboard data. Each result is the harness vendor's self-reported best published score for the named base model on the named benchmark. We aggregate; we do not re-run. See sourceUrl on each entry for the upstream report. Refreshed weekly.",
+  "lastUpdated": "2026-06-15",
+  "note": "Snapshot of public agentic-coding leaderboard data. Each result is the harness vendor's self-reported best published score for the named base model on the named benchmark. We aggregate; we do not re-run. See sourceUrl on each entry for the upstream report. Refreshed weekly. TODO: roo-code shut down 2026-05-15; Kilo Code (kilocode.ai) is the official successor and should be evaluated for addition to harness-directory.ts after editorial review.",
   "benchmarks": [
     {
       "id": "swe_bench_verified",
diff --git a/worker/src/harnesses.ts b/worker/src/harnesses.ts
index 00a9689e..20e4e476 100644
--- a/worker/src/harnesses.ts
+++ b/worker/src/harnesses.ts
@@ -47,8 +47,8 @@ export interface HarnessesData {
 }
 
 export const HARNESSES_DATA: HarnessesData = {
-  lastUpdated: '2026-04-30',
-  note: "Snapshot of public agentic-coding leaderboard data. Each result is the harness vendor's self-reported best published score for the named base model on the named benchmark. We aggregate; we do not re-run. See sourceUrl on each entry for the upstream report. Refreshed weekly.",
+  lastUpdated: '2026-06-15',
+  note: "Snapshot of public agentic-coding leaderboard data. Each result is the harness vendor's self-reported best published score for the named base model on the named benchmark. We aggregate; we do not re-run. See sourceUrl on each entry for the upstream report. Refreshed weekly. TODO: roo-code shut down 2026-05-15; Kilo Code (kilocode.ai) is the official successor and should be evaluated for addition to harness-directory.ts after editorial review.",
   benchmarks: [
     {
       id: 'swe_bench_verified',