From 485eed0e6f957cdabf05f5de1b20a438403370b1 Mon Sep 17 00:00:00 2001 From: Jeremy Date: Tue, 19 May 2026 16:40:15 +0100 Subject: [PATCH 1/7] Add onboarding system scan and retrieval benchmarks --- .gitignore | 5 + .prettierignore | 2 + bench/aggregate_v12.py | 164 +++ bench/aggregate_v13.py | 255 ++++ bench/build_report.py | 571 ++++++++ bench/continuity-gold.json | 198 +++ bench/continuity-tasks.json | 142 ++ bench/fetch_contexts.py | 106 ++ bench/fixtures/skills/app-launcher/SKILL.md | 33 + .../fixtures/skills/brand-guidelines/SKILL.md | 31 + bench/fixtures/skills/canvas-design/SKILL.md | 28 + bench/fixtures/skills/claude-api/SKILL.md | 57 + .../skills/comfyui-prompt-builder/SKILL.md | 35 + .../skills/comfyui-workflow-manager/SKILL.md | 42 + .../skills/competitor-analyser/SKILL.md | 26 + .../skills/downloads-cleaner/SKILL.md | 30 + bench/fixtures/skills/email-drafter/SKILL.md | 30 + bench/fixtures/skills/file-organiser/SKILL.md | 33 + bench/fixtures/skills/file-search/SKILL.md | 26 + .../skills/health-tracker-analyst/SKILL.md | 36 + bench/fixtures/skills/internal-comms/SKILL.md | 30 + bench/fixtures/skills/pptx/SKILL.md | 51 + .../fixtures/skills/process-manager/SKILL.md | 28 + .../skills/slack-gif-creator/SKILL.md | 37 + .../fixtures/skills/startup-routine/SKILL.md | 39 + bench/fixtures/skills/system-monitor/SKILL.md | 26 + bench/fixtures/skills/theme-factory/SKILL.md | 31 + bench/fixtures/skills/web-researcher/SKILL.md | 26 + bench/gold-answers.json | 480 +++++++ bench/grade_v2.py | 420 ++++++ bench/legacy/build_addendum.py | 394 ++++++ bench/legacy/build_addendum_v12.py | 358 +++++ bench/legacy/build_addendum_v13.py | 366 +++++ bench/legacy/codex-handoff.md | 164 +++ bench/run_continuity.py | 268 ++++ bench/run_v12.py | 167 +++ bench/run_v13.py | 219 +++ docs/roadmap.md | 248 ++++ docs/specs/system-detection.md | 320 +++++ electron/main.cjs | 28 +- scripts/onboarding-smoke.cjs | 21 +- scripts/quick-search.cjs | 46 + scripts/retrieval-smoke.js | 213 +++ 
scripts/setup-benchmark.cjs | 106 ++ scripts/vectorstore-smoke.js | 45 + server/compiler.js | 13 + server/lib/intelligence-routes.js | 4 +- server/lib/skills.js | 39 + server/lib/smart-compile.js | 62 +- server/lib/system-scan.js | 752 ++++++++++ server/lib/validation.js | 6 +- server/lib/vectorstore.js | 262 +++- server/router.js | 18 + ui/config.js | 66 +- ui/dashboard.js | 2 +- ui/data.js | 4 +- ui/icons.js | 89 ++ ui/index.html | 32 +- ui/onboarding-populate.js | 102 ++ ui/onboarding.js | 1255 ++++++++--------- ui/rules-lab.js | 286 +++- ui/store.js | 12 +- ui/styles/dram-standard-actions.css | 20 +- ui/styles/dram-standard.css | 11 +- ui/styles/onboarding.css | 1093 ++++++++------ ui/styles/shell.css | 77 +- ui/styles/surface-final.css | 1 + ui/styles/tab-config.css | 75 + ui/types.d.ts | 2 + 69 files changed, 8917 insertions(+), 1347 deletions(-) create mode 100644 bench/aggregate_v12.py create mode 100644 bench/aggregate_v13.py create mode 100644 bench/build_report.py create mode 100644 bench/continuity-gold.json create mode 100644 bench/continuity-tasks.json create mode 100644 bench/fetch_contexts.py create mode 100644 bench/fixtures/skills/app-launcher/SKILL.md create mode 100644 bench/fixtures/skills/brand-guidelines/SKILL.md create mode 100644 bench/fixtures/skills/canvas-design/SKILL.md create mode 100644 bench/fixtures/skills/claude-api/SKILL.md create mode 100644 bench/fixtures/skills/comfyui-prompt-builder/SKILL.md create mode 100644 bench/fixtures/skills/comfyui-workflow-manager/SKILL.md create mode 100644 bench/fixtures/skills/competitor-analyser/SKILL.md create mode 100644 bench/fixtures/skills/downloads-cleaner/SKILL.md create mode 100644 bench/fixtures/skills/email-drafter/SKILL.md create mode 100644 bench/fixtures/skills/file-organiser/SKILL.md create mode 100644 bench/fixtures/skills/file-search/SKILL.md create mode 100644 bench/fixtures/skills/health-tracker-analyst/SKILL.md create mode 100644 bench/fixtures/skills/internal-comms/SKILL.md 
create mode 100644 bench/fixtures/skills/pptx/SKILL.md create mode 100644 bench/fixtures/skills/process-manager/SKILL.md create mode 100644 bench/fixtures/skills/slack-gif-creator/SKILL.md create mode 100644 bench/fixtures/skills/startup-routine/SKILL.md create mode 100644 bench/fixtures/skills/system-monitor/SKILL.md create mode 100644 bench/fixtures/skills/theme-factory/SKILL.md create mode 100644 bench/fixtures/skills/web-researcher/SKILL.md create mode 100644 bench/gold-answers.json create mode 100644 bench/grade_v2.py create mode 100644 bench/legacy/build_addendum.py create mode 100644 bench/legacy/build_addendum_v12.py create mode 100644 bench/legacy/build_addendum_v13.py create mode 100644 bench/legacy/codex-handoff.md create mode 100644 bench/run_continuity.py create mode 100644 bench/run_v12.py create mode 100644 bench/run_v13.py create mode 100644 docs/roadmap.md create mode 100644 docs/specs/system-detection.md create mode 100644 scripts/quick-search.cjs create mode 100644 scripts/retrieval-smoke.js create mode 100644 scripts/setup-benchmark.cjs create mode 100644 server/lib/system-scan.js create mode 100644 ui/icons.js create mode 100644 ui/onboarding-populate.js diff --git a/.gitignore b/.gitignore index ab03656..4a4ea66 100644 --- a/.gitignore +++ b/.gitignore @@ -15,6 +15,9 @@ data/modes.json Thumbs.db .env +# Canonical shared context root — all AI tool configs consolidated here +.context/ + # Per-contributor Claude Code config (preview launcher, etc.) 
.claude/ @@ -65,3 +68,5 @@ out/ *.deb __pycache__/ bench/results-latest.json +bench/artifacts/ +bench/data/ diff --git a/.prettierignore b/.prettierignore index 3a1d4a1..daa3bf5 100644 --- a/.prettierignore +++ b/.prettierignore @@ -6,3 +6,5 @@ CONTEXT.md server.out.log server.err.log *.log +bench/artifacts +bench/data diff --git a/bench/aggregate_v12.py b/bench/aggregate_v12.py new file mode 100644 index 0000000..a1bd781 --- /dev/null +++ b/bench/aggregate_v12.py @@ -0,0 +1,164 @@ +#!/usr/bin/env python3 +""" +aggregate_v12.py — Combine deterministic scoring + session grades into the +final v1.2 results file. Produces a per-task table and summary stats. +""" +from __future__ import annotations +import json, statistics +from pathlib import Path + +HERE = Path(__file__).parent +DATA = HERE / "data" / "v1.2" + +det = json.loads((DATA / "results-v12.json").read_text(encoding="utf-8")) +sg = json.loads((DATA / "session_grades.json").read_text(encoding="utf-8")) +resp = json.loads((DATA / "responses-v12.json").read_text(encoding="utf-8")) + +# Index session grades +sgrades = {(g["task_id"], g["mode"]): g for g in sg["grades"]} + +rows = [] +for t in det["results"]: + tid = t["task_id"] + for mode in ("no_context", "smart", "search"): + m = t["modes"].get(mode) + if not m: + continue + g = sgrades.get((tid, mode)) + fr = m["fact_recall"] + rq = m.get("retrieval_quality") + in_tok = m["input_tokens"] + out_tok = m["output_tokens"] + composite = (g["factual"] + g["specific"] + g["complete"]) / 3 if g else 0 + rows.append({ + "task_id": tid, + "mode": mode, + "input_tokens": in_tok, + "output_tokens": out_tok, + "cost_usd": m["cost_usd"], + "latency_ms": m["latency_ms"], + "fact_coverage_pct": fr["coverage_pct"], + "fact_bonus_pct": fr["bonus_pct"], + "forbidden_hits": fr["forbidden_hits"], + "failure_modes": fr["failure_modes"], + "factual": g["factual"] if g else 0, + "specific": g["specific"] if g else 0, + "complete": g["complete"] if g else 0, + "hallucinations": 
g["hallucinations"] if g else 0, + "composite_15": round(composite, 2), # composite on 0-5 scale (per-axis mean) + "composite_pct": round(composite / 5 * 100, 1), # composite as a % + "retrieval": rq, + "judge_notes": g["notes"] if g else "", + }) + +# Per-mode aggregates +def by_mode(field): + out = {} + for mode in ("no_context", "smart", "search"): + vs = [r[field] for r in rows if r["mode"] == mode and r[field] is not None] + if not vs: continue + out[mode] = { + "median": round(statistics.median(vs), 2), + "mean": round(statistics.mean(vs), 2), + "min": min(vs), + "max": max(vs), + } + return out + +def tokens_per_quality_point(mode): + rs = [r for r in rows if r["mode"] == mode and r["composite_15"] > 0] + if not rs: return None + # Use composite on a 0-15 scale (sum of 3 axes, each 0-5) for tk/pt + tk = [r["input_tokens"] for r in rs] + pts = [(r["factual"] + r["specific"] + r["complete"]) for r in rs] + return round(statistics.median(tk) / statistics.median(pts), 1) + +retr = {} +for r in rows: + if r["mode"] == "search" and r["retrieval"]: + for k in ("precision_at_k", "recall_at_k", "mrr"): + retr.setdefault(k, []).append(r["retrieval"][k]) +retr_summary = {k: round(statistics.mean(v), 3) for k, v in retr.items()} + +summary = { + "ran_at": det["ran_at"], + "task_count": len([r for r in rows if r["mode"] == "smart"]), + "task_model": "claude-sonnet-4-5 (session)", + "judges": "session:claude-sonnet-4-5 (single judge, single run; cross-family multi-run pending API keys)", + "tokens": { + "raw_all_baseline": 186654, + "by_mode_input_tokens": by_mode("input_tokens"), + "by_mode_output_tokens": by_mode("output_tokens"), + "by_mode_cost_usd": by_mode("cost_usd"), + "by_mode_latency_ms": by_mode("latency_ms"), + }, + "fact_recall": { + "coverage_must_include_pct": by_mode("fact_coverage_pct"), + "bonus_may_include_pct": by_mode("fact_bonus_pct"), + }, + "decomposed_rubric": { + "factual_1to5": by_mode("factual"), + "specific_1to5": by_mode("specific"), + 
"complete_1to5": by_mode("complete"), + "hallucinations_count": by_mode("hallucinations"), + "composite_pct": by_mode("composite_pct"), + }, + "retrieval_quality_mean": retr_summary, + "tokens_per_quality_point": { + "no_context": tokens_per_quality_point("no_context"), + "smart": tokens_per_quality_point("smart"), + "search": tokens_per_quality_point("search"), + }, +} + +# Headline derived numbers +sm_tk = summary["tokens"]["by_mode_input_tokens"]["smart"]["median"] +se_tk = summary["tokens"]["by_mode_input_tokens"]["search"]["median"] +nc_tk = summary["tokens"]["by_mode_input_tokens"]["no_context"]["median"] +summary["headline"] = { + "smart_saving_vs_raw_all_pct": round(100 * (1 - sm_tk / 186654), 1), + "search_saving_vs_raw_all_pct": round(100 * (1 - se_tk / 186654), 1), + "smart_quality_lift_vs_no_context_pct": round( + summary["decomposed_rubric"]["composite_pct"]["smart"]["median"] + - summary["decomposed_rubric"]["composite_pct"]["no_context"]["median"], 1), + "search_quality_lift_vs_no_context_pct": round( + summary["decomposed_rubric"]["composite_pct"]["search"]["median"] + - summary["decomposed_rubric"]["composite_pct"]["no_context"]["median"], 1), + "search_failure_at_release_note": "Precision@8=0.00 Recall@8=0.00 — retrieval missed internal-comms + slack-gif-creator entirely; predicts the observed quality drop.", +} + +out = {"summary": summary, "rows": rows} +DATA.mkdir(parents=True, exist_ok=True) +(DATA / "results-v12-final.json").write_text(json.dumps(out, indent=2), encoding="utf-8") + +# Print table +print(f"\n{'task':<25} {'mode':<11} {'tokens':>7} {'cov':>5} {'bonus':>6} {'fact':>4} {'spec':>4} {'comp':>4} {'comp%':>6}") +print("-" * 80) +for r in rows: + print(f"{r['task_id']:<25} {r['mode']:<11} {r['input_tokens']:>7,} " + f"{r['fact_coverage_pct']:>4.0f}% {r['fact_bonus_pct']:>5.0f}% " + f"{r['factual']:>4} {r['specific']:>4} {r['complete']:>4} " + f"{r['composite_pct']:>5.0f}%") +print() +print("=" * 80) +print(f"SUMMARY (5 tasks; baseline 
raw_all = 186,654 tokens)") +print("=" * 80) +for k in ("no_context", "smart", "search"): + tk = summary["tokens"]["by_mode_input_tokens"][k]["median"] + saving = round(100 * (1 - tk / 186654), 1) if k != "no_context" else None + cov = summary["fact_recall"]["coverage_must_include_pct"][k]["mean"] + bonus = summary["fact_recall"]["bonus_may_include_pct"][k]["mean"] + comp = summary["decomposed_rubric"]["composite_pct"][k]["median"] + tpqp = summary["tokens_per_quality_point"][k] + print(f" {k:<11} med tk={tk:>7,} save={f'{saving}%' if saving is not None else ' n/a':>6} " + f"cov={cov:>5.1f}% bonus={bonus:>5.1f}% composite={comp:>5.1f}% tk/pt={tpqp}") +print() +print(f"Retrieval quality (mean, search mode):") +for k, v in retr_summary.items(): + print(f" {k:<16} {v:.3f}") +print() +print(f"Headline:") +for k, v in summary["headline"].items(): + print(f" {k}: {v}") + +print(f"\nWrote {DATA / 'results-v12-final.json'}") diff --git a/bench/aggregate_v13.py b/bench/aggregate_v13.py new file mode 100644 index 0000000..b1eca28 --- /dev/null +++ b/bench/aggregate_v13.py @@ -0,0 +1,255 @@ +#!/usr/bin/env python3 +""" +aggregate_v13.py — Aggregate the full 15-task × 3-mode × 3-judge-run benchmark. + +Reports median + IQR per axis, paired comparisons across modes, and the +honest finding that contradicts v1.2's framing. 
+ +Usage: + python bench/aggregate_v13.py # aggregate + print + python bench/aggregate_v13.py --gate # aggregate + gate check (exit 1 on fail) + python bench/aggregate_v13.py --gate --json # aggregate + gate + JSON output +""" +from __future__ import annotations +import argparse, json, statistics, sys +from pathlib import Path + +p = argparse.ArgumentParser() +p.add_argument("--gate", action="store_true", help="Exit 1 if quality or retrieval gates fail") +p.add_argument("--json", action="store_true", help="Output final result as JSON to stdout") +args = p.parse_args() + +HERE = Path(__file__).parent +DATA = HERE / "data" / "v1.3" +data = json.loads((DATA / "results-v13.json").read_text(encoding="utf-8")) +rows = data["rows"] + + +def med(xs): return statistics.median(xs) if xs else 0 +def mean(xs): return statistics.mean(xs) if xs else 0 +def iqr(xs): + if len(xs) < 4: return 0.0 + q = statistics.quantiles(xs, n=4) + return q[2] - q[0] + + +# Per-mode aggregates +modes = ("no_context", "smart", "search") +def mode_rows(m): return [r for r in rows if r["mode"] == m] + +agg = {} +for m in modes: + rs = mode_rows(m) + agg[m] = { + "n": len(rs), + "input_tokens_median": int(med([r["input_tokens"] for r in rs])), + "output_tokens_median": int(med([r["output_tokens"] for r in rs])), + "latency_ms_median": int(med([r["latency_ms"] for r in rs])), + "cost_usd_median": round(med([r["cost_usd"] for r in rs]), 5), + "cost_usd_total": round(sum(r["cost_usd"] for r in rs), 4), + "coverage_pct_mean": round(mean([r["fact_recall"]["coverage_pct"] for r in rs]), 1), + "bonus_pct_mean": round(mean([r["fact_recall"]["bonus_pct"] for r in rs]), 1), + "factual_median": med([r["judge_summary"].get("factual_median", 0) for r in rs]), + "factual_iqr": round(iqr([r["judge_summary"].get("factual_median", 0) for r in rs]), 2), + "specific_median": med([r["judge_summary"].get("specific_median", 0) for r in rs]), + "specific_iqr": round(iqr([r["judge_summary"].get("specific_median", 0) for r 
in rs]), 2), + "complete_median": med([r["judge_summary"].get("complete_median", 0) for r in rs]), + "complete_iqr": round(iqr([r["judge_summary"].get("complete_median", 0) for r in rs]), 2), + "hallucinations_mean": round(mean([r["judge_summary"].get("hallucinations_mean", 0) for r in rs]), 2), + "hallucinations_max": max([r["judge_summary"].get("hallucinations_max", 0) for r in rs]), + "composite_15_median": round(med([ + (r["judge_summary"].get("factual_median", 0) + + r["judge_summary"].get("specific_median", 0) + + r["judge_summary"].get("complete_median", 0)) + for r in rs + ]), 2), + } + +# Paired comparisons: per-task, mode A vs mode B +by_task = {} +for r in rows: + by_task.setdefault(r["task_id"], {})[r["mode"]] = r + +def composite(r): + js = r["judge_summary"] + return js.get("factual_median", 0) + js.get("specific_median", 0) + js.get("complete_median", 0) + +pairs = {"smart_vs_no_context": [], "search_vs_no_context": [], "smart_vs_search": []} +for tid, modes_d in by_task.items(): + if "no_context" in modes_d: + if "smart" in modes_d: + pairs["smart_vs_no_context"].append((tid, composite(modes_d["smart"]) - composite(modes_d["no_context"]))) + if "search" in modes_d: + pairs["search_vs_no_context"].append((tid, composite(modes_d["search"]) - composite(modes_d["no_context"]))) + if "smart" in modes_d and "search" in modes_d: + pairs["smart_vs_search"].append((tid, composite(modes_d["smart"]) - composite(modes_d["search"]))) + +paired_summary = {} +for k, vs in pairs.items(): + diffs = [d for _, d in vs] + paired_summary[k] = { + "n": len(diffs), + "mean_delta": round(mean(diffs), 2), + "median_delta": round(med(diffs), 2), + "wins": sum(1 for d in diffs if d > 0), + "ties": sum(1 for d in diffs if d == 0), + "losses": sum(1 for d in diffs if d < 0), + "worst_task": min(vs, key=lambda x: x[1]) if vs else None, + "best_task": max(vs, key=lambda x: x[1]) if vs else None, + } + +# Retrieval-quality aggregates (search mode only) +retr = 
[r["retrieval_quality"] for r in rows if r["mode"] == "search" and r["retrieval_quality"]] +retr_summary = { + "n": len(retr), + "precision_at_k_mean": round(mean([r["precision_at_k"] for r in retr]), 3), + "recall_at_k_mean": round(mean([r["recall_at_k"] for r in retr]), 3), + "mrr_mean": round(mean([r["mrr"] for r in retr]), 3), + "complete_misses": [r["expected"] for r in retr if r["precision_at_k"] == 0], +} + +# Token-per-quality-point of LIFT over no_context +def lift_efficiency(mode): + rs = [] + for tid, ms in by_task.items(): + if mode not in ms or "no_context" not in ms: + continue + delta_q = composite(ms[mode]) - composite(ms["no_context"]) + delta_tk = ms[mode]["input_tokens"] - ms["no_context"]["input_tokens"] + rs.append((tid, delta_tk, delta_q)) + if not rs: + return None + # only count rows where lift > 0 (the rest are negative-value) + pos = [(tk, q) for _, tk, q in rs if q > 0] + if not pos: + return {"n_positive_lift": 0, "n_total": len(rs)} + return { + "n_total": len(rs), + "n_positive_lift": len(pos), + "n_negative_lift": sum(1 for _, _, q in rs if q < 0), + "n_zero_lift": sum(1 for _, _, q in rs if q == 0), + "median_tk_per_lift_pt": round(med([tk for tk, q in pos]) / med([q for tk, q in pos]), 1), + } + +lift = {m: lift_efficiency(m) for m in ("smart", "search")} + +# Build per-task detail rows for the appendix table +detail = [] +for tid in sorted(by_task.keys()): + ms = by_task[tid] + row = {"task_id": tid, "category": ms[next(iter(ms))]["category"]} + for m in modes: + if m not in ms: continue + r = ms[m] + js = r["judge_summary"] + row[m] = { + "input_tokens": r["input_tokens"], + "coverage": r["fact_recall"]["coverage_pct"], + "bonus": r["fact_recall"]["bonus_pct"], + "factual": js.get("factual_median", 0), + "specific": js.get("specific_median", 0), + "complete": js.get("complete_median", 0), + "halluc": js.get("hallucinations_mean", 0), + "composite_15": composite(r), + } + detail.append(row) + +final = { + "ran_at": data["ran_at"], 
+ "task_model": data["task_model"], + "judge_model": data["judge_model"], + "judge_runs": data["judge_runs"], + "task_count": data["task_count"], + "raw_all_baseline_tokens": 186654, + "by_mode": agg, + "paired": paired_summary, + "retrieval_quality": retr_summary, + "lift_efficiency": lift, + "detail": detail, +} + +DATA.mkdir(parents=True, exist_ok=True) +(DATA / "results-v13-final.json").write_text(json.dumps(final, indent=2), encoding="utf-8") + +# Print summary +print("=" * 96) +print(f"v1.3 RESULTS — task={data['task_model']} judge={data['judge_model']}×{data['judge_runs']} runs N={data['task_count']}") +print("=" * 96) + +print(f"\n{'Mode':<12} {'med tk':>8} {'cov':>5} {'bonus':>6} {'fact':>5} {'spec':>5} {'comp':>5} {'halluc':>7} {'comp/15':>8} {'$':>7}") +for m in modes: + a = agg[m] + print(f"{m:<12} {a['input_tokens_median']:>8,} " + f"{a['coverage_pct_mean']:>4.0f}% {a['bonus_pct_mean']:>5.0f}% " + f"{a['factual_median']:>5.1f} {a['specific_median']:>5.1f} {a['complete_median']:>5.1f} " + f"{a['hallucinations_mean']:>7.2f} {a['composite_15_median']:>7.1f} ${a['cost_usd_total']:>6.2f}") + +print("\nPaired comparisons (composite/15):") +for k, p in paired_summary.items(): + print(f" {k:<24} n={p['n']:<3} mean d={p['mean_delta']:>6.2f} wins/ties/losses = {p['wins']}/{p['ties']}/{p['losses']}") + if p['worst_task']: + print(f" worst: {p['worst_task'][0]} (d={p['worst_task'][1]:.0f}) best: {p['best_task'][0]} (d={p['best_task'][1]:.0f})") + +print("\nRetrieval quality (search mode):") +r = retr_summary +print(f" P@8 mean = {r['precision_at_k_mean']:.3f} R@8 mean = {r['recall_at_k_mean']:.3f} MRR mean = {r['mrr_mean']:.3f}") +print(f" Complete retrieval misses: {len(r['complete_misses'])} of {r['n']} tasks") + +print("\nLift efficiency over no_context (tokens per +1 composite point of lift):") +for m, l in lift.items(): + if l is None: + continue + if l.get("median_tk_per_lift_pt") is None: + print(f" {m}: NO POSITIVE LIFT on any task 
(negative={l['n_negative_lift']}, zero={l['n_zero_lift']}, positive={l['n_positive_lift']})") + else: + print(f" {m}: {l['median_tk_per_lift_pt']:>10,.0f} tk/+1pt " + f"(positive lift {l['n_positive_lift']}/{l['n_total']}; " + f"negative lift {l['n_negative_lift']}; zero {l['n_zero_lift']})") + +print("\nHEADLINE: Smart Compile in its current form") +sv = paired_summary["smart_vs_no_context"] +if sv["mean_delta"] < 0: + print(f" DEGRADES composite quality by {abs(sv['mean_delta']):.2f} points on average vs no-context.") + print(f" Loses on {sv['losses']}/{sv['n']} tasks; wins on {sv['wins']}.") + print(f" Implication: full-skill-body injection at ~75k tokens hits an attention-dilution regime") + print(f" the no-context model doesn't experience. Hypothesis 1 (in its v1.0 form) is contradicted.") +elif sv["mean_delta"] > 0: + print(f" IMPROVES composite quality by {sv['mean_delta']:.2f} points on average vs no-context.") +sr = paired_summary["search_vs_no_context"] +print(f"\n Search vs no_context: mean d = {sr['mean_delta']:+.2f} ({sr['wins']} wins / {sr['ties']} ties / {sr['losses']} losses)") +print(f"\nWrote {DATA / 'results-v13-final.json'}") + +# --gate: check quality gates, exit 1 on failure +if args.json: + json.dump(final, sys.stdout, indent=2) + print() + +if args.gate: + gates_failed = 0 + + # Gate 1: No-context quality — Smart and Search must beat or tie no-context + for mode_key, label in [("smart_vs_no_context", "Smart Compile vs no_context"), + ("search_vs_no_context", "MCP Search vs no_context")]: + p_data = paired_summary[mode_key] + if p_data["mean_delta"] < 0: + print(f"\n [GATE FAIL] {label}: mean delta = {p_data['mean_delta']:.2f} " + f"(losses={p_data['losses']}/{p_data['n']})") + gates_failed += 1 + else: + print(f"\n [GATE PASS] {label}: mean delta = {p_data['mean_delta']:.2f} " + f"(wins={p_data['wins']}, ties={p_data['ties']})") + + # Gate 2: Retrieval quality — Recall@8 must be 1.00 + retr_r = retr_summary + if retr_r["recall_at_k_mean"] < 
1.0: + print(f" [GATE FAIL] R@8 = {retr_r['recall_at_k_mean']:.3f} (expected 1.000). " + f"{len(retr_r['complete_misses'])} complete misses.") + gates_failed += 1 + else: + print(f" [GATE PASS] R@8 = {retr_r['recall_at_k_mean']:.3f}") + + if gates_failed: + print(f"\n {gates_failed} gate(s) FAILED.") + sys.exit(1) + else: + print(f"\n All gates PASSED.") diff --git a/bench/build_report.py b/bench/build_report.py new file mode 100644 index 0000000..89d8c80 --- /dev/null +++ b/bench/build_report.py @@ -0,0 +1,571 @@ +#!/usr/bin/env python3 +"""Build the consolidated Context Engine benchmark report.""" + +from __future__ import annotations + +import html +import json +import math +import statistics +from pathlib import Path + +from PIL import Image, ImageDraw, ImageFont +from reportlab.lib import colors +from reportlab.lib.enums import TA_CENTER, TA_LEFT +from reportlab.lib.pagesizes import A4 +from reportlab.lib.styles import ParagraphStyle, getSampleStyleSheet +from reportlab.lib.units import mm +from reportlab.platypus import ( + Image as RLImage, + KeepTogether, + LongTable, + PageBreak, + Paragraph, + SimpleDocTemplate, + Spacer, + Table, + TableStyle, +) + + +ROOT = Path(__file__).resolve().parents[2] +BENCH = ROOT / "app" / "bench" +CHARTS = BENCH / "artifacts" / "charts" +OUT = BENCH / "artifacts" / "reports" / "Context Engine Benchmark Report v1.0.pdf" + +IVORY = "#f7f3ea" +SLATE = "#222832" +MUTED = "#62666f" +CORAL = "#d8664f" +GOLD = "#c99a42" +BLUE = "#4f6f8f" +GREEN = "#5f8d69" +RED = "#b8524b" +GRID = "#d9d2c6" + +MODES = ["no_context", "smart", "search"] +MODE_LABEL = {"no_context": "No context", "smart": "Smart", "search": "Search"} + + +def load_json(name: str): + return json.loads((BENCH / name).read_text(encoding="utf-8")) + + +tasks = load_json("tasks.json") +task_order = [t["id"] for t in tasks] +v11 = load_json("data/v1.1/results-graded-sample.json") +v12 = load_json("data/v1.2/results-v12-final.json") +v13 = 
load_json("data/v1.3/results-v13-final.json") +v13_rows = load_json("data/v1.3/results-v13.json")["rows"] +responses = load_json("data/v1.3/responses-v13.json") + + +def esc(text) -> str: + return html.escape(str(text)).replace("\n", "
") + + +def pct(x, d=1) -> str: + return f"{x:.{d}f}%" + + +def num(x) -> str: + if isinstance(x, float): + return f"{x:,.1f}" + return f"{x:,}" + + +def font(size=28, bold=False): + candidates = [ + "C:/Windows/Fonts/arialbd.ttf" if bold else "C:/Windows/Fonts/arial.ttf", + "C:/Windows/Fonts/segoeuib.ttf" if bold else "C:/Windows/Fonts/segoeui.ttf", + ] + for candidate in candidates: + try: + return ImageFont.truetype(candidate, size) + except Exception: + pass + return ImageFont.load_default() + + +FONT = font(24) +FONT_B = font(24, True) +FONT_S = font(18) +FONT_SB = font(18, True) +FONT_XS = font(15) + + +def new_chart(title: str, subtitle: str = "", w=1600, h=900): + img = Image.new("RGB", (w, h), IVORY) + d = ImageDraw.Draw(img) + d.text((60, 42), title, fill=SLATE, font=font(38, True)) + if subtitle: + d.text((62, 92), subtitle, fill=MUTED, font=FONT_S) + return img, d + + +def save_chart(img: Image.Image, name: str) -> Path: + CHARTS.mkdir(parents=True, exist_ok=True) + path = CHARTS / name + img.save(path, "PNG", optimize=True) + return path + + +def text_center(d, xy, text, fill=SLATE, fnt=FONT): + box = d.textbbox((0, 0), text, font=fnt) + d.text((xy[0] - (box[2] - box[0]) / 2, xy[1]), text, fill=fill, font=fnt) + + +def chart_token_cost(): + data = [ + ("Raw all", v13["raw_all_baseline_tokens"], SLATE), + ("Smart", v13["by_mode"]["smart"]["input_tokens_median"], CORAL), + ("Search", v13["by_mode"]["search"]["input_tokens_median"], BLUE), + ("No ctx", v13["by_mode"]["no_context"]["input_tokens_median"], GREEN), + ] + img, d = new_chart("Token Cost by Mode", "Median input tokens on a log scale") + x0, y0, x1, y1 = 160, 740, 1480, 170 + for tick in [10, 100, 1000, 10000, 100000]: + y = y0 - (math.log10(tick) - 1) / 4.4 * (y0 - y1) + d.line((x0, y, x1, y), fill=GRID, width=2) + d.text((62, y - 12), f"{tick:,}", fill=MUTED, font=FONT_XS) + bw = 190 + gap = 105 + for i, (label, value, color) in enumerate(data): + x = x0 + 90 + i * (bw + gap) + y = y0 - 
(math.log10(value) - 1) / 4.4 * (y0 - y1) + d.rounded_rectangle((x, y, x + bw, y0), radius=10, fill=color) + text_center(d, (x + bw / 2, y - 42), f"{value:,}", SLATE, FONT_SB) + text_center(d, (x + bw / 2, y0 + 22), label, SLATE, FONT_S) + d.text((160, 812), "Takeaway: four orders of magnitude separate the cheapest path from the naive baseline.", fill=SLATE, font=FONT_SB) + return save_chart(img, "01-token-cost-log.png") + + +def mode_value(task_id, mode, field): + row = next(r for r in v13["detail"] if r["task_id"] == task_id) + return row[mode][field] + + +def chart_quality_heatmap(): + img, d = new_chart("Composite Quality by Task", "Composite score on 0-15 scale") + left, top = 340, 155 + cell_w, cell_h = 260, 38 + for j, mode in enumerate(MODES): + text_center(d, (left + j * cell_w + cell_w / 2, top - 44), MODE_LABEL[mode], SLATE, FONT_SB) + for i, tid in enumerate(task_order): + y = top + i * cell_h + d.text((54, y + 8), tid, fill=SLATE, font=FONT_XS) + for j, mode in enumerate(MODES): + val = mode_value(tid, mode, "composite_15") + ratio = max(0, min(1, val / 15)) + r = int(245 - ratio * 60) + g = int(225 - ratio * 95) + b = int(210 - ratio * 115) + x = left + j * cell_w + d.rectangle((x, y, x + cell_w - 8, y + cell_h - 6), fill=(r, g, b)) + text_center(d, (x + cell_w / 2 - 4, y + 5), f"{val:g}", SLATE, FONT_XS) + d.text((60, 812), "Takeaway: Smart Compile loses on 8 of 15 tasks despite costing far more tokens than no-context.", fill=SLATE, font=FONT_SB) + return save_chart(img, "02-quality-heatmap.png") + + +def chart_smart_delta(): + rows = [] + for tid in task_order: + rows.append((tid, mode_value(tid, "smart", "composite_15") - mode_value(tid, "no_context", "composite_15"))) + rows.sort(key=lambda x: x[1]) + img, d = new_chart("Smart vs No-Context Delta", "Positive values mean Smart wins") + x0, y0, x1, y1 = 780, 760, 1480, 160 + zero = x0 + (0 + 10) / 16 * (x1 - x0) + d.line((zero, y1, zero, y0), fill=SLATE, width=3) + for tick in [-10, -5, 0, 5]: + x = 
x0 + (tick + 10) / 16 * (x1 - x0) + d.line((x, y1, x, y0), fill=GRID, width=1) + text_center(d, (x, y0 + 20), str(tick), MUTED, FONT_XS) + bh = 30 + for i, (tid, delta) in enumerate(rows): + y = y1 + i * 39 + d.text((60, y + 5), tid, fill=SLATE, font=FONT_XS) + x = x0 + (delta + 10) / 16 * (x1 - x0) + color = GREEN if delta > 0 else RED if delta < 0 else MUTED + d.rectangle((min(zero, x), y, max(zero, x), y + bh), fill=color) + d.text((max(zero, x) + 8 if delta >= 0 else min(zero, x) - 42, y + 4), f"{delta:+g}", fill=SLATE, font=FONT_XS) + d.text((60, 812), "Takeaway: when Smart fails, it fails catastrophically; when it wins, it wins modestly.", fill=SLATE, font=FONT_SB) + return save_chart(img, "03-smart-delta.png") + + +def chart_hallucinations(): + img, d = new_chart("Hallucination Distribution", "Mean fabricated facts per response") + x_positions = [380, 800, 1220] + y0, y1 = 730, 160 + for tick in range(0, 6): + y = y0 - tick / 5 * (y0 - y1) + d.line((180, y, 1420, y), fill=GRID, width=1) + d.text((120, y - 10), str(tick), fill=MUTED, font=FONT_XS) + for j, mode in enumerate(MODES): + xs = x_positions[j] + vals = [mode_value(tid, mode, "halluc") for tid in task_order] + for i, val in enumerate(vals): + y = y0 - val / 5 * (y0 - y1) + x = xs + ((i % 5) - 2) * 18 + d.ellipse((x - 9, y - 9, x + 9, y + 9), fill=[GREEN, CORAL, BLUE][j], outline=SLATE) + med = statistics.mean(vals) + y = y0 - med / 5 * (y0 - y1) + d.line((xs - 95, y, xs + 95, y), fill=SLATE, width=4) + text_center(d, (xs, y0 + 24), MODE_LABEL[mode], SLATE, FONT_S) + text_center(d, (xs, y - 34), f"mean {med:.2f}", SLATE, FONT_XS) + d.text((160, 812), "Takeaway: loading more context creates more fabricated facts, not fewer.", fill=SLATE, font=FONT_SB) + return save_chart(img, "04-hallucinations.png") + + +def search_retrieval_rows(): + return [r for r in v13_rows if r["mode"] == "search"] + + +def chart_retrieval_scatter(): + points = [] + for row in search_retrieval_rows(): + tid = row["task_id"] + p 
def chart_lift_efficiency():
    """Draw the "Cost per Positive Quality Lift" bar chart and save it.

    One bar is drawn per mode (Smart, Search). Bar height is the mode's
    ``median_tk_per_lift_pt`` from ``v13["lift_efficiency"]``, scaled against
    the largest median present. Each bar is annotated with its value, its
    label, and its ``n_negative_lift``/``n_total`` count.

    A mode whose median is missing or ``None`` is drawn as a zero-height bar
    labelled "n/a", and an all-zero data set draws zero-height bars, instead
    of raising ``TypeError``/``ZeroDivisionError`` while scaling.

    Returns:
        Whatever ``save_chart`` returns for "06-lift-efficiency.png".
    """
    img, d = new_chart("Cost per Positive Quality Lift", "Tokens per +1 composite point over no-context")
    lift = v13["lift_efficiency"]
    data = [("Smart", lift["smart"], CORAL), ("Search", lift["search"], BLUE)]
    medians = [item.get("median_tk_per_lift_pt") for _, item, _ in data]
    # Scale against the largest usable median; 0 means "nothing to scale by".
    maxv = max((m for m in medians if m is not None), default=0)
    x0, y0, y1 = 260, 700, 180
    for i, ((label, item, color), val) in enumerate(zip(data, medians)):
        # Guard the division: no value or a zero maximum gives a flat bar.
        h = val / maxv * (y0 - y1) if val is not None and maxv else 0
        x = x0 + i * 500
        d.rounded_rectangle((x, y0 - h, x + 260, y0), radius=12, fill=color)
        value_label = f"{val:,.0f}" if val is not None else "n/a"
        text_center(d, (x + 130, y0 - h - 42), value_label, SLATE, FONT_SB)
        text_center(d, (x + 130, y0 + 24), label, SLATE, FONT_S)
        d.text((x - 10, y0 + 68), f"negative-lift tasks: {item['n_negative_lift']}/{item['n_total']}", fill=MUTED, font=FONT_XS)
    # NOTE(review): "76x" is a literal, not derived from the plotted medians;
    # re-check it whenever the v1.3 lift-efficiency data is regenerated.
    d.text((200, 812), "Takeaway: Search is 76x cheaper per positive lift point, but both modes lose on most tasks.", fill=SLATE, font=FONT_SB)
    return save_chart(img, "06-lift-efficiency.png")
def build_charts():
    """Render all eight report charts and return their results in order.

    The position of each entry matters: callers index into the returned list
    (e.g. ``charts[0]`` is the token-cost chart, ``charts[7]`` the
    latency/cost chart).
    """
    builders = (
        chart_token_cost,
        chart_quality_heatmap,
        chart_smart_delta,
        chart_hallucinations,
        chart_retrieval_scatter,
        chart_lift_efficiency,
        chart_methodology,
        chart_latency_cost,
    )
    return [render() for render in builders]
def table(headers, rows, widths, small=False):
    """Build a styled ``LongTable`` from header labels and row cells.

    Args:
        headers: Column titles; each is escaped and wrapped as a paragraph.
        rows: Iterable of rows, each an iterable of cell strings.
        widths: Column widths passed through as ``colWidths``.
        small: Use the ``SMALL`` paragraph style for every cell instead of
            ``BODY``.

    Returns:
        The ``LongTable`` with the header row repeated on each page.
    """
    cell_style = SMALL if small else BODY
    header_cells = [rich(f"{esc(title)}", cell_style) for title in headers]
    grid = [header_cells]
    for cells in rows:
        grid.append([rich(esc(value), cell_style) for value in cells])

    commands = [
        # Header row: dark band with white text.
        ("BACKGROUND", (0, 0), (-1, 0), colors.HexColor(SLATE)),
        ("TEXTCOLOR", (0, 0), (-1, 0), colors.white),
        # Body rows alternate between white and a light tint.
        ("ROWBACKGROUNDS", (0, 1), (-1, -1), [colors.white, colors.HexColor("#f2ede3")]),
        ("GRID", (0, 0), (-1, -1), 0.25, colors.HexColor(GRID)),
        ("VALIGN", (0, 0), (-1, -1), "TOP"),
        ("LEFTPADDING", (0, 0), (-1, -1), 4),
        ("RIGHTPADDING", (0, 0), (-1, -1), 4),
        ("TOPPADDING", (0, 0), (-1, -1), 3),
        ("BOTTOMPADDING", (0, 0), (-1, -1), 3),
    ]
    result = LongTable(grid, colWidths=widths, repeatRows=1)
    result.setStyle(TableStyle(commands))
    return result
Benchmark Report v1.0", TITLE)) + story.append(rich("Token efficiency, quality regression, and roadmap implications across v1.1 to v1.3", SUB)) + story.append(Spacer(1, 18 * mm)) + story.append(rich("Publication-grade summary: Context Engine saves tokens. The current v0.3.x retrieval and Smart Compile implementation does not yet preserve quality. v1.3 retracts the earlier headline while validating the paper's multi-resolution architecture.", QUOTE)) + story.append(PageBreak()) + + +def executive_summary(story): + story.append(rich("Executive Summary", H1)) + story.append(rich("The honest result: v1.3 retracts the v1.1/v1.2 headline. Smart Compile in its current v0.3.x form net-degrades quality versus no-context: mean delta -1.27 composite points on a 15-point scale, with 8 losses, 2 ties, and 5 wins across 15 tasks. Search also under-performs no-context on the corpus: -0.80 mean delta, with 8 losses.", BODY)) + story.append(rich("The validation: token reduction is real and large. Median input tokens fall from 186,654 in the raw-all baseline to 74,279 in Smart and 903 in Search. This validates the token-efficiency part of the architecture but not the quality-preservation claim.", BODY)) + story.append(rich("White paper cross-reference: Section 11 predicted that binary include/exclude selection would not be enough; skills need multi-resolution packaging. v1.3 empirically confirms that prediction. Section 32 Hypothesis 1 - Smart Compile reduces token load without reducing task quality - is not supported by the current implementation.", BODY)) + story.append(rich("Product decision: CE should be framed today as a token-saving and retrieval research system, not as a quality-improving context broker. 
def headline_results(story, charts):
    """Append the "Headline Results" section to ``story``.

    The section holds the token-cost chart, the per-mode aggregate table
    (Table 1), the heatmap and delta charts, the paired-comparison table
    (Table 2) and the latency/cost chart, separated by page breaks.

    Args:
        story: List of flowables, extended in place.
        charts: Chart results as returned by ``build_charts``; indices 0, 1,
            2 and 7 are used here.
    """

    def mode_row(mode):
        # One Table 1 row: formatted aggregates for a single mode.
        agg = v13["by_mode"][mode]
        return [
            MODE_LABEL[mode],
            num(agg["input_tokens_median"]),
            pct(agg["coverage_pct_mean"]),
            pct(agg["bonus_pct_mean"]),
            f"{agg['factual_median']:.1f}",
            f"{agg['specific_median']:.1f}",
            f"{agg['complete_median']:.1f}",
            f"{agg['hallucinations_mean']:.2f}",
            f"{agg['composite_15_median']:.1f}",
            f"${agg['cost_usd_total']:.2f}",
        ]

    def paired_row(key, label):
        # One Table 2 row: paired statistics for a single comparison.
        stats = v13["paired"][key]
        worst, best = stats["worst_task"], stats["best_task"]
        return [
            label,
            str(stats["n"]),
            f"{stats['mean_delta']:+.2f}",
            f"{stats['median_delta']:+.1f}",
            f"{stats['wins']}/{stats['ties']}/{stats['losses']}",
            f"{worst[0]} ({worst[1]:+g})",
            f"{best[0]} ({best[1]:+g})",
        ]

    add = story.append

    add(rich("Headline Results", H1))
    add(chart_flow(charts[0], "Four orders of magnitude separate the cheapest path from the naive raw-all baseline."))

    mode_rows = [mode_row(mode) for mode in MODES]
    mode_headers = ["Mode", "med tk", "Cov", "Bonus", "Fact", "Spec", "Comp", "Halluc", "Comp/15", "Cost"]
    mode_widths = [24*mm, 18*mm, 14*mm, 15*mm, 12*mm, 12*mm, 12*mm, 16*mm, 17*mm, 16*mm]
    add(table(mode_headers, mode_rows, mode_widths, small=True))
    add(rich("Table 1. Per-mode aggregate, all 15 tasks. Fact/Spec/Comp are judge medians on 1-5 axes; composite is on a 0-15 scale.", CAP))
    add(PageBreak())

    add(chart_flow(charts[1], "Smart Compile loses on 8 of 15 tasks despite costing about 2,500x more input tokens than no-context."))
    add(chart_flow(charts[2], "The worst Smart failure is large enough to dominate the mean; positive wins are smaller."))
    add(PageBreak())

    comparisons = [
        ("smart_vs_no_context", "Smart vs no-context"),
        ("search_vs_no_context", "Search vs no-context"),
        ("smart_vs_search", "Smart vs Search"),
    ]
    paired_rows = [paired_row(key, label) for key, label in comparisons]
    paired_headers = ["Comparison", "n", "mean", "median", "W/T/L", "Worst", "Best"]
    paired_widths = [38*mm, 10*mm, 16*mm, 16*mm, 18*mm, 40*mm, 40*mm]
    add(table(paired_headers, paired_rows, paired_widths))
    add(rich("Table 2. Paired comparison by task. Positive deltas mean the first mode outperformed the second.", CAP))
    add(chart_flow(charts[7], "Smart costs more and runs slower while under-performing no-context in aggregate."))
    add(PageBreak())
Injecting many full skill bodies causes attention dilution; the model follows meta-style and process text instead of the user's concrete request.", BODY)) + story.append(rich("Section 19 - Future Evaluation Framework: The paper specifically named Precision@K, Recall@K, MRR, token utility, and compile reduction as future metrics. v1.3 is the first run to instantiate that scaffold. Retrieval quality is low: Precision@8 mean 0.23, Recall@8 mean 0.56, MRR mean 0.31, with 5/15 complete misses.", BODY)) + story.append(rich("Section 32 - Hypothesis 1: 'Smart Compile reduces token load without reducing task quality' is not supported by v0.3.x. The token-load half is true; the quality half is false on this corpus. Hypothesis 2 - multi-resolution packaging improves retrieval efficiency - becomes the more important next test.", BODY)) + story.append(rich("Section 34 - Build Roadmap: Phase 2 Dedup and Rank is the first major improvement to retrieval quality. The benchmark makes that phase urgent. Phase 4 Multi-Resolution Skill Compiler is not optional if Smart Compile is to become quality-preserving.", BODY)) + story.append(PageBreak()) + + +def failure_mode(story): + story.append(rich("Failure Mode Case Study", H1)) + story.append(rich("The clearest failure is comfy-prompt-fantasy. The user asked for a ComfyUI prompt. No-context produced a practical prompt with positive/negative sections and settings. 
def hallucination_retrieval(story, charts):
    """Append the "Hallucinations and Retrieval Quality" section to ``story``.

    Adds the hallucination and retrieval-scatter charts, then a per-task
    table of search-mode retrieval metrics (P@K, R@K, MRR, hits, expected
    sources) built from ``search_retrieval_rows()``.

    Args:
        story: List of flowables, extended in place.
        charts: Chart results as returned by ``build_charts``; indices 3 and
            4 are used here.
    """
    story.append(rich("Hallucinations and Retrieval Quality", H1))
    story.append(chart_flow(charts[3], "Loading more context increases fabricated facts: Smart mean hallucinations 1.49 vs no-context 0.98."))
    story.append(chart_flow(charts[4], "Search quality depends on retrieval quality. Complete misses predict degraded output."))
    story.append(PageBreak())
    rows = []
    for row in search_retrieval_rows():
        rq = row["retrieval_quality"]
        rows.append([
            row["task_id"],
            f"{rq['precision_at_k']:.2f}",
            f"{rq['recall_at_k']:.2f}",
            f"{rq['mrr']:.2f}",
            # Missing or empty lists render as "-" so no cell is left blank.
            ", ".join(rq.get("hits") or []) or "-",
            ", ".join(rq.get("expected") or []) or "-",
        ])
    story.append(table(["Task", "P@K", "R@K", "MRR", "Hits", "Expected"], rows, [38*mm, 13*mm, 13*mm, 13*mm, 42*mm, 55*mm], small=True))
    # Numbered 3: the only earlier numbered tables in the report are 1 and 2.
    story.append(rich("Table 3. Retrieval quality per task for search mode. Complete misses are the strongest warning signal for quality loss.", CAP))
    story.append(PageBreak())
def limitations(story):
    """Append the "Limitations Still Standing" section to ``story``.

    Each limitation is escaped and emitted as its own "- "-prefixed body
    paragraph, followed by a page break.
    """
    caveats = (
        "Judge family is still Anthropic-only: Sonnet task model, Haiku judge. This avoids same-model self-judging but not family-level bias.",
        "The corpus has only 15 tasks. It is representative of this project, not a universal benchmark.",
        "Gold facts and expected sources are manually authored, so they encode human judgement.",
        "No user correction-rate or real workflow completion metric is included yet.",
        "v1.3 measures the current implementation, not the intended Section 11 multi-resolution implementation.",
    )
    story.append(rich("Limitations Still Standing", H1))
    bullets = [rich("- " + esc(text), BODY) for text in caveats]
    story.extend(bullets)
    story.append(PageBreak())
Historical white papers remain read-only records; this report cross-references them rather than rebuilding them.", BODY)) + rows = [ + ["Task corpus", "app/bench/tasks.json", "15 representative tasks"], + ["Gold answers", "app/bench/gold-answers.json", "Must/may/forbidden facts and expected sources"], + ["v1.1 token-only", "app/bench/data/v1.1/results-latest.json", "15-task token run, no quality grading"], + ["v1.1 graded sample", "app/bench/data/v1.1/results-graded-sample.json", "5-task session-as-judge quality sample"], + ["v1.2 final", "app/bench/data/v1.2/results-v12-final.json", "Gold scoring and decomposed rubric on 5-task subset"], + ["v1.3 final", "app/bench/data/v1.3/results-v13-final.json", "15-task aggregate, paired comparisons, lift efficiency"], + ["v1.3 raw rows", "app/bench/data/v1.3/results-v13.json", "45 task/mode rows with retrieval-quality detail"], + ["v1.3 responses", "app/bench/data/v1.3/responses-v13.json", "All task-model outputs used for case studies"], + ["Build script", "app/bench/build_report.py", "Single-command PDF and chart generation"], + ] + story.append(table(["Input", "Path", "Use"], rows, [36*mm, 66*mm, 68*mm], small=True)) + story.append(Spacer(1, 8)) + story.append(rich("Rebuild command: python app/bench/build_report.py. 
def detail_tables(story):
    """Append "Appendix A: Full Per-Task Detail" to ``story``.

    Emits one table row per (task, mode) pair, in ``task_order`` then
    ``MODES`` order, using the per-task records in ``v13["detail"]``.

    Args:
        story: List of flowables, extended in place.

    Raises:
        KeyError: If a task id in ``task_order`` has no record in
            ``v13["detail"]``.
    """
    story.append(rich("Appendix A: Full Per-Task Detail", H1))
    # Index the detail records once instead of rescanning the list per task.
    # setdefault keeps the first record for a duplicated task id, matching
    # what a first-match scan would have picked.
    detail_by_task = {}
    for record in v13["detail"]:
        detail_by_task.setdefault(record["task_id"], record)
    rows = []
    for tid in task_order:
        if tid not in detail_by_task:
            # Name the offending task instead of failing with StopIteration.
            raise KeyError(f"no v1.3 detail record for task {tid!r}")
        detail = detail_by_task[tid]
        for mode in MODES:
            m = detail[mode]
            rows.append([
                tid,
                MODE_LABEL[mode],
                num(m["input_tokens"]),
                pct(m["coverage"], 0),
                pct(m["bonus"], 0),
                f"{m['factual']:.1f}",
                f"{m['specific']:.1f}",
                f"{m['complete']:.1f}",
                f"{m['halluc']:.2f}",
                f"{m['composite_15']:.1f}",
            ])
    story.append(table(["Task", "Mode", "Tokens", "Cov", "Bonus", "Fact", "Spec", "Comp", "Halluc", "Comp/15"], rows, [38*mm, 20*mm, 17*mm, 13*mm, 14*mm, 12*mm, 12*mm, 12*mm, 15*mm, 15*mm], small=True))
    story.append(PageBreak())
if __name__ == "__main__":
    # Script entry point: build the PDF, then print where it was written and
    # its size in units of 1024 * 1024 bytes.
    build_pdf()
    size_mb = OUT.stat().st_size / 1024 ** 2
    print(f"Wrote {OUT} ({size_mb:.2f} MB)")
."], "desc": "S1 chose ruff" }, + { "name": "uses_unittest", "keywords": ["import unittest", "TestCase"] } + ] + }, + "fashion-shoot-prompts": { + "must_inherit": [ + { "name": "uses_juggernaut", "keywords": ["Juggernaut"] }, + { "name": "uses_dpm_karras", "keywords": ["DPM++ 2M Karras", "DPM++ 2M", "Karras"] }, + { "name": "uses_30_steps", "keywords": ["30 steps", "steps: 30", "steps=30"] }, + { "name": "uses_cfg_65", "keywords": ["CFG 6.5", "CFG: 6.5", "cfg=6.5"] }, + { "name": "uses_resolution", "keywords": ["1344x768", "1344 x 768"] }, + { "name": "aesthetic_tokyo", "keywords": ["Tokyo"] }, + { "name": "aesthetic_neon", "keywords": ["neon"] } + ], + "reset_questions": [ + { "name": "asks_sampler", "patterns": ["which sampler", "what sampler"] }, + { "name": "asks_checkpoint", "patterns": ["which.*checkpoint", "which.*model", "SDXL or"] }, + { "name": "asks_aesthetic", "patterns": ["what.*aesthetic", "what.*style", "what.*vibe"] }, + { "name": "asks_resolution", "patterns": ["which resolution", "what resolution", "what.*size"] } + ], + "forbidden_drift": [ + { + "name": "different_sampler", + "keywords": ["Euler a", "DDIM", "UniPC"], + "desc": "S1 chose DPM++ 2M Karras" + }, + { "name": "different_checkpoint", "keywords": ["DreamShaper", "RealVis", "SD 1.5"] } + ] + }, + "slack-release-thread": { + "must_inherit": [ + { "name": "version_referenced", "keywords": ["v0.3.1", "0.3.1"] }, + { "name": "uses_rocket_emoji", "keywords": [":rocket:"] }, + { + "name": "casual_technical_tone", + "keywords": ["folks", "team", "—", "shipped", "rolled out", "going out"] + }, + { "name": "migration_topic", "keywords": ["migration", "migrate", "userData", "auto-migration"] } + ], + "reset_questions": [ + { "name": "asks_version", "patterns": ["which version", "what version", "which release"] }, + { "name": "asks_audience", "patterns": ["who is.*audience", "what channel", "which channel"] }, + { "name": "asks_tone", "patterns": ["what tone", "how formal"] } + ], + 
"forbidden_drift": [ + { "name": "wrong_emoji", "keywords": [":tada:", ":sparkles:", ":fire:"], "desc": "S1 used :rocket:" }, + { "name": "marketing_tone", "keywords": ["world-class", "leverage synergies", "best-in-class"] } + ] + }, + "brand-poster-adapt": { + "must_inherit": [ + { "name": "accent_color", "keywords": ["#CC785C", "Book Cloth", "Clay"] }, + { "name": "background_color", "keywords": ["#F0EEE6", "Ivory"] }, + { "name": "text_color", "keywords": ["#191919", "Slate Dark"] }, + { "name": "display_type", "keywords": ["Styrene A", "Styrene"] }, + { "name": "body_type", "keywords": ["Tiempos"] }, + { "name": "asterisk_motif", "keywords": ["asterisk", "asterisk glyph"] }, + { "name": "headline_kept", "keywords": ["BUILDING WITH CLAUDE", "Building with Claude"] } + ], + "reset_questions": [ + { "name": "asks_palette", "patterns": ["which colors", "what palette", "color scheme"] }, + { "name": "asks_typography", "patterns": ["which font", "what typography", "which typeface"] }, + { "name": "asks_motif", "patterns": ["what graphic", "what motif", "any imagery"] } + ], + "forbidden_drift": [ + { "name": "different_palette", "keywords": ["#FF6B35", "neon", "vibrant gradient", "rainbow"] }, + { "name": "different_typography", "keywords": ["Inter", "Roboto", "Helvetica", "Arial"] } + ] + }, + "memory-consolidation": { + "must_inherit": [ + { + "name": "five_clusters", + "keywords": ["5 clusters", "five clusters", "cluster A", "cluster B", "cluster C"] + }, + { + "name": "merge_targets", + "keywords": [ + "cluster_A_windows", + "cluster A", + "Windows duplicates", + "cluster_B_x100vi", + "cluster B", + "X100VI" + ] + }, + { + "name": "keep_with_stale_tag", + "keywords": ["cluster_D", "cluster D", "cluster_E", "cluster E", "stale tag", "tag as stale"] + }, + { "name": "backup_required", "keywords": ["backup", "snapshot", "save copy", "before mutating"] }, + { "name": "audit_log_path", "keywords": ["data/memory-audits", "memory-audits/2026-05-15.json"] } + ], + 
"reset_questions": [ + { + "name": "asks_what_clusters", + "patterns": ["which.*clusters", "what duplicates", "what was found"] + }, + { + "name": "asks_decisions", + "patterns": ["which.*merge", "what should I merge", "what should be kept"] + }, + { "name": "asks_safety", "patterns": ["should I backup", "do you want a backup"] } + ], + "forbidden_drift": [ + { "name": "no_backup", "keywords": ["overwrites memory.json directly", "no backup needed"] }, + { "name": "merges_wrong", "keywords": ["merge cluster D", "merge cluster E"] } + ] + }, + "comfy-workflow-iteration": { + "must_inherit": [ + { "name": "uses_ultrasharp", "keywords": ["4x-UltraSharp", "UltraSharp"] }, + { "name": "refiner_denoise", "keywords": ["denoise 0.35", "denoise: 0.35", "denoise=0.35"] }, + { "name": "tile_size", "keywords": ["1024 tile", "tile size 1024", "tile_size: 1024"] }, + { "name": "output_4k", "keywords": ["3840x2160", "3840 x 2160", "4K"] }, + { "name": "refiner_checkpoint", "keywords": ["Juggernaut XL", "Juggernaut"] }, + { + "name": "workflow_path", + "keywords": ["E:/comfyui-workflows/product-upscale-4k.json", "product-upscale-4k.json"] + } + ], + "reset_questions": [ + { "name": "asks_upscaler", "patterns": ["which upscaler", "what upscaler model"] }, + { "name": "asks_denoise", "patterns": ["what denoise", "which denoise"] }, + { "name": "asks_resolution", "patterns": ["what resolution", "what output size"] } + ], + "forbidden_drift": [ + { "name": "different_upscaler", "keywords": ["RealESRGAN", "SwinIR", "LDSR"] }, + { "name": "different_denoise", "keywords": ["denoise 0.5", "denoise 0.4", "denoise 0.2"] } + ] + }, + "email-followup-thread": { + "must_inherit": [ + { "name": "patch_version", "keywords": ["v0.3.1", "0.3.1"] }, + { "name": "vulnerability_name", "keywords": ["symlink", "symlink-escape"] }, + { "name": "severity", "keywords": ["moderate"] }, + { "name": "audience", "keywords": ["eng-all", "engineering", "the team"] }, + { "name": "tone_calm", "keywords": 
["update", "updated", "thanks", "appreciated", "no exploit"] } + ], + "reset_questions": [ + { "name": "asks_version", "patterns": ["which version", "which patch"] }, + { "name": "asks_vuln", "patterns": ["what vulnerability", "what was the bug"] }, + { "name": "asks_audience", "patterns": ["who is.*audience", "who should this go to"] } + ], + "forbidden_drift": [ + { "name": "panic_tone", "keywords": ["URGENT", "EMERGENCY", "DISASTER"] }, + { "name": "different_severity", "keywords": ["critical severity", "low severity"] } + ] + } + } +} diff --git a/bench/continuity-tasks.json b/bench/continuity-tasks.json new file mode 100644 index 0000000..c8e983a --- /dev/null +++ b/bench/continuity-tasks.json @@ -0,0 +1,142 @@ +{ + "_meta": { + "purpose": "Measure whether session 2 inherits state established in session 1. Each task simulates a paused multi-step workflow: session 1 makes specific decisions (tool choices, paths, parameters, aesthetic choices); session 2 is prompted to continue. With CE-backed memory, session 2 should respect those decisions; without it, session 2 must ask reset questions or guess.", + "modes_to_test": [ + "no_context_cold (S2 has no memory of S1 decisions; baseline of pain)", + "ce_continue (S2 system prompt includes a memory snapshot of S1 state)", + "ce_continue_compact (S2 system prompt includes only the compact handoff note CE emits)" + ], + "scoring": "Per task, gold facts define what S2 MUST inherit (tool choices, paths, parameters) and what counts as a reset question. Deterministic keyword scoring + LLM judge × 3 runs (decomposed rubric)." + }, + "tasks": [ + { + "id": "strava-project", + "category": "code", + "session_1_summary": "Set up a Python project to analyse a Strava CSV export. 
Decisions made: uv for dependencies, pytest for tests, ruff (not black) for formatting, src-layout, Python 3.12, project at E:/projects/strava-analyzer, package name strava_analyzer.", + "session_1_state": { + "project_path": "E:/projects/strava-analyzer", + "package_name": "strava_analyzer", + "layout": "src layout (src/strava_analyzer/)", + "python_version": "3.12", + "deps_tool": "uv", + "test_framework": "pytest", + "formatter": "ruff", + "files_created": ["pyproject.toml", "src/strava_analyzer/__init__.py", "tests/__init__.py"], + "csv_columns_observed": [ + "Activity Date", + "Activity Type", + "Distance", + "Moving Time", + "Average Heart Rate" + ] + }, + "session_1_compact_handoff": "Strava analyzer project at E:/projects/strava-analyzer (package strava_analyzer, src layout, Python 3.12, uv + pytest + ruff). pyproject.toml and src/strava_analyzer/__init__.py created. CSV cols: Activity Date, Activity Type, Distance, Moving Time, Average Heart Rate.", + "session_2_prompt": "Add a CSV loader function for the Strava export." + }, + { + "id": "fashion-shoot-prompts", + "category": "image-gen", + "session_1_summary": "Generated 3 ComfyUI prompts for a streetwear fashion shoot in Tokyo at dusk. Decisions: SDXL Juggernaut checkpoint, DPM++ 2M Karras sampler, 30 steps, CFG 6.5, 1344x768, aesthetic tags include 'oversized silhouettes', 'reflective surfaces', 'neon bleed', model 'Aya 24yo Japanese-Brazilian'.", + "session_1_state": { + "checkpoint": "Juggernaut XL v9", + "sampler": "DPM++ 2M Karras", + "steps": 30, + "cfg": 6.5, + "resolution": "1344x768", + "aesthetic_tags": ["oversized silhouettes", "reflective surfaces", "neon bleed", "Tokyo dusk"], + "model_persona": "Aya, 24yo Japanese-Brazilian model", + "negative_prompt_baseline": "low quality, blurry, deformed, watermark, modern buildings without neon" + }, + "session_1_compact_handoff": "Tokyo streetwear shoot. Juggernaut XL v9, DPM++ 2M Karras 30 steps CFG 6.5 at 1344x768. 
Aesthetic: oversized silhouettes, reflective surfaces, neon bleed. Model: Aya, 24yo Japanese-Brazilian.", + "session_2_prompt": "Give me 2 more prompt variants with different poses." + }, + { + "id": "slack-release-thread", + "category": "comms", + "session_1_summary": "Drafted a Slack announcement for shipping Context Engine v0.3.1 with the data persistence fix. Decisions: :rocket: emoji (not :tada:), audience '@channel devs', casual-technical tone, specific mention of CE_ROOT pinned to app.getPath('userData') and the asar.unpacked seeding. Length ~150 words. Posted to #context-engine.", + "session_1_state": { + "version_shipped": "v0.3.1", + "emoji": ":rocket:", + "audience": "@channel devs", + "tone": "casual-technical", + "channel": "#context-engine", + "approximate_length_words": 150, + "key_technical_points": [ + "CE_ROOT pinned to app.getPath('userData')", + "asar.unpacked seeding on first run", + "auto-migration of legacy data" + ], + "previous_message_opening": "Rolling out v0.3.1 today —" + }, + "session_1_compact_handoff": "Posted v0.3.1 announcement in #context-engine (@channel devs, :rocket: emoji, casual-technical tone, ~150 words). Covered: CE_ROOT pinned to userData, asar.unpacked seeding, auto-migration. Opened with 'Rolling out v0.3.1 today —'.", + "session_2_prompt": "Draft tomorrow's follow-up message with the migration status." + }, + { + "id": "brand-poster-adapt", + "category": "design", + "session_1_summary": "Designed an A3 poster for a developer workshop in Anthropic brand style. 
Decisions: palette Book Cloth #CC785C accent + Ivory Medium #F0EEE6 background + Slate Dark #191919 text, Styrene A display + Tiempos Text body, layout with a 15-degree asterisk glyph top-right, three-circle Venn graphic at lower-third 15% opacity, 'BUILDING WITH CLAUDE' headline.", + "session_1_state": { + "format_original": "A3 portrait (297x420mm)", + "accent_hex": "#CC785C", + "background_hex": "#F0EEE6", + "text_hex": "#191919", + "display_type": "Styrene A", + "body_type": "Tiempos Text", + "headline": "BUILDING WITH CLAUDE", + "graphic_motif": "asterisk glyph + three-circle Venn", + "asterisk_rotation_degrees": 15, + "venn_opacity_pct": 15 + }, + "session_1_compact_handoff": "Anthropic A3 workshop poster. Palette: Book Cloth #CC785C accent / Ivory #F0EEE6 bg / Slate #191919 text. Type: Styrene A display, Tiempos Text body. Headline 'BUILDING WITH CLAUDE'. Motif: asterisk glyph rotated 15deg + three-circle Venn at 15% opacity.", + "session_2_prompt": "Adapt this for an A4 flyer version." + }, + { + "id": "memory-consolidation", + "category": "meta", + "session_1_summary": "Audited memory.json. Identified 5 duplicate clusters (cluster A: 'Jeremy uses Windows' x3, cluster B: 'X100VI photography' x2, cluster C: 'CE token saving' x2, cluster D: 'morning routine' x2, cluster E: 'health tracker on Android' x2). Proposed merges for A, B, C. Flagged D and E as keep-but-tag-stale.", + "session_1_state": { + "clusters_identified": 5, + "clusters_to_merge": ["cluster_A_windows", "cluster_B_x100vi", "cluster_C_ce_tokens"], + "clusters_to_keep_with_stale_tag": ["cluster_D_morning_routine", "cluster_E_health_android"], + "backup_required_before": true, + "audit_log_path": "data/memory-audits/2026-05-15.json" + }, + "session_1_compact_handoff": "Memory audit done. 5 clusters found. Merge: cluster_A_windows, cluster_B_x100vi, cluster_C_ce_tokens. Keep-with-stale-tag: cluster_D_morning_routine, cluster_E_health_android. Backup first. 
Log at data/memory-audits/2026-05-15.json.", + "session_2_prompt": "Apply the consolidation we discussed." + }, + { + "id": "comfy-workflow-iteration", + "category": "image-gen", + "session_1_summary": "Built a ComfyUI upscale workflow for product photos. Decisions: 4x-UltraSharp upscaler, KSampler refiner at denoise 0.35, tiled VAE decode at 1024 tile size, output target 4K (3840x2160), Juggernaut XL refiner, sharpening pass via ImageFilter at 1.2 strength.", + "session_1_state": { + "upscaler_model": "4x-UltraSharp", + "refiner_denoise": 0.35, + "tile_size": 1024, + "tiled_vae": true, + "output_resolution": "3840x2160", + "refiner_checkpoint": "Juggernaut XL v9", + "sharpening_strength": 1.2, + "workflow_json_path": "E:/comfyui-workflows/product-upscale-4k.json" + }, + "session_1_compact_handoff": "Product upscale workflow saved to E:/comfyui-workflows/product-upscale-4k.json. 4x-UltraSharp upscaler -> KSampler refiner (denoise 0.35, Juggernaut XL v9) -> tiled VAE (1024 tiles) -> 3840x2160 output -> sharpening 1.2.", + "session_2_prompt": "Modify the workflow to also do a face-detail pass." + }, + { + "id": "email-followup-thread", + "category": "comms", + "session_1_summary": "Wrote an internal email about the symlink-escape security patch. Subject '[Security] Symlink-escape patch shipped — no action required for most'. Tone: calm-not-panicked. Severity called out as moderate. Recommended action: update via auto-updater. Sender: Jeremy. Recipients: eng-all@.", + "session_1_state": { + "subject_line": "[Security] Symlink-escape patch shipped — no action required for most", + "vulnerability": "symlink-escape", + "severity_called": "moderate", + "recommended_action": "update via auto-updater", + "audience": "eng-all@", + "tone": "calm-not-panicked", + "patch_version": "v0.3.1", + "no_exploit_observed_stated": true + }, + "session_1_compact_handoff": "Sent email to eng-all@ — Subject '[Security] Symlink-escape patch shipped — no action required for most'. 
Severity moderate, no exploit observed, action: update via auto-updater. Tone calm-not-panicked.", + "session_2_prompt": "Draft the one-week-later follow-up confirming everyone's updated." + } + ] +} diff --git a/bench/fetch_contexts.py b/bench/fetch_contexts.py new file mode 100644 index 0000000..534110e --- /dev/null +++ b/bench/fetch_contexts.py @@ -0,0 +1,106 @@ +#!/usr/bin/env python3 +""" +fetch_contexts.py — Pre-fetch CE smart + search contexts for each task and +dump them to disk. Used when the grading pass needs an LLM that isn't +reachable via API key (e.g. running through Claude Code session auth). + +Output: bench/artifacts/contexts/<task_id>.smart.md and +bench/artifacts/contexts/<task_id>.search.md plus +bench/artifacts/contexts/manifest.json with token counts + skill counts per task. + +Run AFTER `python bench/tokenomics.py` (or alongside; only depends on CE). +""" + +from __future__ import annotations + +import json +import os +import sys +from pathlib import Path + +# Re-use the helpers we already built in tokenomics.py. +sys.path.insert(0, str(Path(__file__).parent)) +from tokenomics import ( # noqa: E402 + DEFAULT_CE_URL, + build_context_text, + build_search_context_text, + count_tokens, + fetch_active_skill_bodies, + fetch_compiled_baseline, + post_json, + reachable, +) + +HERE = Path(__file__).parent +OUT_DIR = HERE / "artifacts" / "contexts" +TASKS_PATH = HERE / "tasks.json" + +CE_URL = os.environ.get("CE_URL", DEFAULT_CE_URL) +MAX_TOKENS = int(os.environ.get("MAX_TOKENS", "16000")) +SEARCH_LIMIT = int(os.environ.get("SEARCH_LIMIT", "8")) + + +def main() -> int: + if not reachable(CE_URL): + sys.stderr.write(f"[!] 
CE not reachable at {CE_URL}\n") + return 1 + + OUT_DIR.mkdir(parents=True, exist_ok=True) + + with TASKS_PATH.open(encoding="utf-8") as f: + tasks = json.load(f) + + print(f"Fetching active skill bodies...") + active_bodies = fetch_active_skill_bodies(CE_URL) + print(f" {len(active_bodies)} active skills loaded") + + manifest = { + "ce_url": CE_URL, + "max_tokens": MAX_TOKENS, + "search_limit": SEARCH_LIMIT, + "active_skill_count": len(active_bodies), + "tasks": [], + } + + for i, task in enumerate(tasks, 1): + tid = task["id"] + print(f" [{i:>2}/{len(tasks)}] {tid:<28}", end=" ", flush=True) + + smart = post_json(CE_URL, "/api/compile/smart", + {"task": task["prompt"], "maxTokens": MAX_TOKENS}) + selected = smart.get("selectedSkillIds") or [] + smart_ctx = build_context_text(selected, active_bodies) + smart_tokens = count_tokens(smart_ctx) + + search = post_json(CE_URL, "/api/search", + {"query": task["prompt"], "limit": SEARCH_LIMIT}) + chunks = search.get("results") or [] + search_ctx = build_search_context_text(chunks) + search_tokens = count_tokens(search_ctx) + + (OUT_DIR / f"{tid}.smart.md").write_text(smart_ctx, encoding="utf-8") + (OUT_DIR / f"{tid}.search.md").write_text(search_ctx, encoding="utf-8") + + manifest["tasks"].append({ + "id": tid, + "category": task.get("category", ""), + "prompt": task["prompt"], + "selected_skill_count": len(selected), + "smart_tokens": smart_tokens, + "search_tokens": search_tokens, + "search_chunk_count": len(chunks), + "smart_path": str((OUT_DIR / f"{tid}.smart.md").relative_to(HERE.parent)), + "search_path": str((OUT_DIR / f"{tid}.search.md").relative_to(HERE.parent)), + }) + print(f"smart {smart_tokens:>6,}tk / search {search_tokens:>5,}tk") + + (OUT_DIR / "manifest.json").write_text( + json.dumps(manifest, indent=2), encoding="utf-8" + ) + print(f"\nWrote {len(tasks)} task contexts to {OUT_DIR.relative_to(HERE.parent)}/") + print(f"Manifest: {(OUT_DIR / 'manifest.json').relative_to(HERE.parent)}") + return 0 + + +if 
__name__ == "__main__": + sys.exit(main()) diff --git a/bench/fixtures/skills/app-launcher/SKILL.md b/bench/fixtures/skills/app-launcher/SKILL.md new file mode 100644 index 0000000..6f44058 --- /dev/null +++ b/bench/fixtures/skills/app-launcher/SKILL.md @@ -0,0 +1,33 @@ +--- +name: App Launcher +description: Windows application launcher shortcuts and automation +triggers: [launch, open, start, Start-Process, shortcut, application, automation] +--- + +# App Launcher + +## Launching Applications + +Use `Start-Process` to launch any application: + +```powershell +Start-Process "C:\Program Files\Slack\slack.exe" +Start-Process "code" # via PATH +Start-Process "https://chat.openai.com" # default browser +``` + +## Finding Application Paths + +Get installed app paths from the registry: + +```powershell +Get-ItemProperty "HKLM:\Software\Microsoft\Windows\CurrentVersion\App Paths\*" | Select-Object '(default)' +``` + +## Startup Folder Automation + +Place shortcuts in `shell:startup` for automatic launch at logon. Use `$env:APPDATA\Microsoft\Windows\Start Menu\Programs\Startup` for per-user startup items. + +## Launch Order + +Use `Start-Process -Wait` to launch apps sequentially, ensuring each is fully loaded before the next starts. Combine with Start-Sleep for apps that need extra initialization time. 
diff --git a/bench/fixtures/skills/brand-guidelines/SKILL.md b/bench/fixtures/skills/brand-guidelines/SKILL.md new file mode 100644 index 0000000..47b9e59 --- /dev/null +++ b/bench/fixtures/skills/brand-guidelines/SKILL.md @@ -0,0 +1,31 @@ +--- +name: Brand Guidelines +description: Anthropic brand design guidelines, colors, typography, and aesthetics +triggers: [brand, design, Anthropic, poster, workshop, typography, color palette] +--- + +# Brand Guidelines + +## Color Palette + +Primary brand colors: +- **Book Cloth / Clay**: #CC785C (signature accent) +- **Ivory**: warm off-white background +- **Kraft**: natural brown tone +- **Slate**: dark neutral + +## Typography + +- **Display typeface**: Styrene (headings, impactful text) +- **Body typeface**: Tiempos (body copy, readability) + +## Design Principles + +- **Restraint**: generous whitespace, minimal elements +- **Editorial quality**: single accent per composition +- **No gradients or neon**: avoid glassmorphism, glow effects +- **Print-ready**: A3 format (297×420mm), CMYK, 300 DPI with 3mm bleed + +## Workshop Poster Template + +A3 poster for developer workshops: title in Styrene, body in Tiempos, one Clay accent element, rest in Slate/Ivory. 
diff --git a/bench/fixtures/skills/canvas-design/SKILL.md b/bench/fixtures/skills/canvas-design/SKILL.md new file mode 100644 index 0000000..f67c8fd --- /dev/null +++ b/bench/fixtures/skills/canvas-design/SKILL.md @@ -0,0 +1,28 @@ +--- +name: Canvas Design +description: Print and digital canvas design specification and layout +triggers: [canvas, layout, A3, poster, design, composition, print, bleed, DPI] +--- + +# Canvas Design + +## Print Specifications + +- **A3 format**: 297×420mm +- **Resolution**: 300 DPI for print, 72 DPI for digital +- **Color mode**: CMYK for print, RGB for digital +- **Bleed**: 3mm bleed on all sides for print production +- **Safe zone**: 10mm inset from trim edge + +## Poster Layout + +- **Hierarchy**: headline (20%), visual (50%), body text (20%), footer (10%) +- **Margins**: generous whitespace around all elements +- **Grid**: single-column or two-column depending on density +- **Visual**: centered hero image or graphic + +## Digital Canvas + +- **Workshop slides**: 1920×1080 (16:9) or 1024×768 (4:3) +- **Social media**: 1200×630 for Open Graph, 1080×1080 for Instagram +- **Presentation**: consistent margins, brand header/footer diff --git a/bench/fixtures/skills/claude-api/SKILL.md b/bench/fixtures/skills/claude-api/SKILL.md new file mode 100644 index 0000000..98631c3 --- /dev/null +++ b/bench/fixtures/skills/claude-api/SKILL.md @@ -0,0 +1,57 @@ +--- +name: Claude API +description: Anthropic Claude API integration, tool use, prompt caching +triggers: [anthropic, claude, API, SDK, tool use, prompt caching, messages, stream] +--- + +# Claude API + +## Tool Use Pattern + +```python +import anthropic + +client = anthropic.Anthropic() +response = client.messages.create( + model="claude-sonnet-4-6", + max_tokens=1024, + tools=[{ + "name": "get_weather", + "description": "Get current weather for a location", + "input_schema": { + "type": "object", + "properties": { + "location": {"type": "string"}, + }, + "required": ["location"] + } + }], + 
messages=[{"role": "user", "content": "What's the weather in London?"}] +) +``` + +## Agent Loop + +Check `response.stop_reason` for `"tool_use"`. When tools are requested, execute the function, add tool_result, and continue: + +```python +while response.stop_reason == "tool_use": + # Execute tool, add result + messages.append({"role": "user", "content": tool_result}) + response = client.messages.create(model=model, messages=messages, tools=tools) +``` + +## Prompt Caching (4.6+) + +Use `cache_control = {"type": "ephemeral"}` on system messages and tools to reduce costs: + +```python +response = client.messages.create( + model="claude-sonnet-4-6", + system=[{"type": "text", "text": system_prompt, "cache_control": {"type": "ephemeral"}}], + tools=tools_with_cache, + messages=messages, +) +``` + +Cache hits shown via `usage.cache_read_input_tokens` and `usage.cache_creation_input_tokens`. diff --git a/bench/fixtures/skills/comfyui-prompt-builder/SKILL.md b/bench/fixtures/skills/comfyui-prompt-builder/SKILL.md new file mode 100644 index 0000000..9454567 --- /dev/null +++ b/bench/fixtures/skills/comfyui-prompt-builder/SKILL.md @@ -0,0 +1,35 @@ +--- +name: ComfyUI Prompt Builder +description: Building effective prompts for ComfyUI image generation workflows +triggers: [comfyui, prompt, stable diffusion, image generation, positive, negative, CFG] +--- + +# ComfyUI Prompt Builder + +## Positive Prompt Structure + +Structure positive prompts with subject, style, environment, and lighting: + +``` +fantasy landscape, rolling fog, warm sunset, dramatic clouds, epic vista, detailed, cinematic lighting +``` + +## Negative Prompts + +Include negative prompts to avoid artifacts: + +``` +low quality, blurry, distorted, deformed, extra limbs, bad anatomy, watermark, text, signature +``` + +## Settings Guidance + +- CFG Scale: 7.0 for balanced creativity +- Sampler: DPM++ 2M Karras for quality +- Steps: 20-30 for standard output +- Resolution: 768x768 or 1024x1024 for SDXL +- Seed: 
-1 for random, fixed for reproducibility + +## Weighted Tokens + +Use parentheses and number weights: `(fog:1.2)` or `(sunset:1.3)` to emphasize elements. Use `(unwanted:0.5)` to reduce emphasis. diff --git a/bench/fixtures/skills/comfyui-workflow-manager/SKILL.md b/bench/fixtures/skills/comfyui-workflow-manager/SKILL.md new file mode 100644 index 0000000..4b0f12b --- /dev/null +++ b/bench/fixtures/skills/comfyui-workflow-manager/SKILL.md @@ -0,0 +1,42 @@ +--- +name: ComfyUI Workflow Manager +description: Managing ComfyUI workflows for upscaling, image generation, prompt integration, and refinement +triggers: [comfyui, workflow, upscale, KSampler, refiner, image processing, node, prompt] +--- + +# ComfyUI Workflow Manager + +## Upscaling Workflow + +Build an upscale workflow: +1. **Load Image** node for input +2. **Upscale Model** node with 4x-UltraSharp ESRGAN model +3. **Upscale Image By** node connected to upscale model +4. **KSampler** for refiner pass at reduced denoise (0.3-0.4) +5. **Save Image** node for output + +## 4K Target + +For 4K output, use 4x upscale factor from 1080p input. Check VAE tile mode (`--tile`) to avoid memory issues at high resolutions. Target resolution: 3840x2160 or 4096x2160. + +## Refiner Pass + +After upscaling, run a KSampler with denoise 0.3-0.4 to add detail without changing composition. Set CFG to 5-6 for the refiner pass. Use the same seed as the original generation if preserving composition. + +## Prompt Workflow Integration + +When building a prompt workflow, combine positive and negative prompt nodes with the KSampler for text-to-image generation. Use the CLIP Text Encode node to convert your positive and negative prompts into the conditioning needed by the KSampler. Load a checkpoint model (like SDXL) and connect it to the CLIP and VAE nodes. + +## Fantasy Landscape Prompt Workflow + +To generate a moody fantasy landscape with rolling fog and warm sunset, build this workflow: +1. 
**Checkpoint Loader**: SDXL or Juggernaut XL model +2. **CLIP Text Encode (Positive Prompt)**: "fantasy landscape, rolling fog, warm sunset, dramatic lighting, epic vista, detailed, cinematic atmosphere, moody" +3. **CLIP Text Encode (Negative Prompt)**: "low quality, blurry, distorted, deformed, bad anatomy, watermark, text" +4. **Empty Latent Image**: 1024x1024 or 768x768 +5. **KSampler**: DPM++ 2M Karras, CFG 7, steps 25, denoise 1.0 +6. **VAE Decode** and **Save Image** + +## Template Workflows + +Save workflows as .json files in `ComfyUI/user/default/workflows/`. Load from the Workflows menu. Share prompt templates as workflow files that include the full node graph — image loading, upscaling, refiner pass, and save node. Use template workflows to save your favourite prompt configurations for reuse. diff --git a/bench/fixtures/skills/competitor-analyser/SKILL.md b/bench/fixtures/skills/competitor-analyser/SKILL.md new file mode 100644 index 0000000..51abc61 --- /dev/null +++ b/bench/fixtures/skills/competitor-analyser/SKILL.md @@ -0,0 +1,26 @@ +--- +name: Competitor Analyser +description: Market and competitor analysis for product ideation and startup planning +triggers: [competitor, market, analysis, startup, business, photographer, SaaS] +--- + +# Competitor Analyser + +## Market Analysis Framework + +1. **Identify competitors**: direct (similar product), indirect (different approach, same need) +2. **Feature comparison**: build a matrix of features across products +3. **Pricing analysis**: free vs freemium vs subscription tiers +4. 
**User pain points**: read reviews, support forums, social media complaints + +## Photography AI Market + +Current landscape for photographer-focused AI tools: +- **Editing**: Adobe Lightroom AI, Luminar Neo, Topaz Photo AI +- **Workflow**: Pixieset, SmugMug, ShootProof for client galleries +- **Culling**: Narrative Select, PhotoMechanic for initial selection +- **Gap**: personal AI assistant that integrates editing, culling, client management, and EXIF analysis + +## Pain Points + +Freelance photographers spend 40% of time on non-shooting tasks: culling, editing, client communication, gallery management. A personal AI assistant could automate the repetitive parts while keeping the creative control. diff --git a/bench/fixtures/skills/downloads-cleaner/SKILL.md b/bench/fixtures/skills/downloads-cleaner/SKILL.md new file mode 100644 index 0000000..4807f74 --- /dev/null +++ b/bench/fixtures/skills/downloads-cleaner/SKILL.md @@ -0,0 +1,30 @@ +--- +name: Downloads Cleaner +description: Windows Downloads folder cleanup and file organization automation +triggers: [downloads, cleanup, organize, file management, archive, stale files, PowerShell] +--- + +# Downloads Cleaner + +## Finding Stale Files + +```powershell +$cutoff = (Get-Date).AddDays(-90) +$stale = Get-ChildItem "$env:USERPROFILE\Downloads" -File | Where-Object { $_.LastWriteTime -lt $cutoff } +$stale | Select-Object Name, LastWriteTime, Length +``` + +## Cleanup Plan + +1. **Review**: first list stale files with dry-run +2. **Archive**: move to `_Archive` folder in Downloads +3. **Delete**: after confirming nothing is needed +4. **Report**: show size recovered and file counts + +## Safety + +Always use `-WhatIf` on destructive operations. Never delete without confirmation. Archive before removing. Create a report of what was moved/deleted. + +## Automation + +Schedule with Task Scheduler to run monthly. Log results to a file. Send summary via email if configured. 
diff --git a/bench/fixtures/skills/email-drafter/SKILL.md b/bench/fixtures/skills/email-drafter/SKILL.md new file mode 100644 index 0000000..02bb59b --- /dev/null +++ b/bench/fixtures/skills/email-drafter/SKILL.md @@ -0,0 +1,30 @@ +--- +name: Email Drafter +description: Professional email drafting for internal and external communications +triggers: [email, draft, subject, message, internal, security, announcement] +--- + +# Email Drafter + +## Security Patch Email Template + +**Subject:** `[Security] Symlink-escape vulnerability patched in v0.3.1` + +**Body:** +1. **What happened**: brief description of the symlink-escape path traversal vulnerability +2. **Impact**: what the vulnerability could allow (local file access, but no remote exploit observed) +3. **Fix**: shipped in v0.3.1 — the path sanitization now prevents symlink traversal +4. **Action**: update to v0.3.1 via auto-update or manual download +5. **Questions**: reply to this thread or DM the security lead + +## Tone Guidelines + +- Professional but direct +- No alarmist language +- State severity clearly +- Include concrete action items +- Sign with sender name and role + +## Structure + +Use `Subject:` line with clear prefix. Body divided into: context, impact, resolution, action items. End with contact info. 
diff --git a/bench/fixtures/skills/file-organiser/SKILL.md b/bench/fixtures/skills/file-organiser/SKILL.md new file mode 100644 index 0000000..77d8e74 --- /dev/null +++ b/bench/fixtures/skills/file-organiser/SKILL.md @@ -0,0 +1,33 @@ +--- +name: File Organiser +description: Windows file organization, categorization, and grouping utilities +triggers: [organize, file, categorize, group, move, archive, extension, cleanup] +--- + +# File Organiser + +## Categorization + +Group files by extension or type: +```powershell +Get-ChildItem -Path $downloads -File | Group-Object Extension | Sort-Object Count -Descending +``` + +## Organizing by Extension + +```powershell +$categories = @{ + ".pdf" = "Documents"; ".docx" = "Documents"; ".xlsx" = "Documents" + ".jpg" = "Images"; ".png" = "Images"; ".gif" = "Images" + ".zip" = "Archives"; ".rar" = "Archives"; ".7z" = "Archives" + ".exe" = "Installers"; ".msi" = "Installers" +} +``` + +## Archive Strategy + +Move files older than 90 days to subfolders. Keep a manifest of archived files. Use date-based folder structure: `Archive\2026\05\` for monthly archiving. Log all moves to an archive audit file. + +## Safety + +Preview changes before executing. Use `-WhatIf` flag to verify. Create restore point before bulk operations. 
diff --git a/bench/fixtures/skills/file-search/SKILL.md b/bench/fixtures/skills/file-search/SKILL.md new file mode 100644 index 0000000..01b629a --- /dev/null +++ b/bench/fixtures/skills/file-search/SKILL.md @@ -0,0 +1,26 @@ +--- +name: File Search +description: Windows file search and discovery utilities +triggers: [search, file, find, Get-ChildItem, directory, locate, Path] +--- + +# File Search + +## Searching Files + +```powershell +Get-ChildItem -Path "$env:USERPROFILE\Downloads" -Recurse -File | Where-Object { $_.Extension -in ".pdf", ".docx", ".zip" } +``` + +## Finding Specific Files + +Search by pattern, date, or size: +```powershell +Get-ChildItem -Path C:\ -Recurse -File -Filter "*.log" -ErrorAction SilentlyContinue +Get-ChildItem -Path $HOME -Recurse -File | Where-Object { $_.LastWriteTime -gt (Get-Date).AddDays(-7) } +Get-ChildItem -Path $HOME -Recurse -File | Sort-Object Length -Descending | Select-Object -First 20 +``` + +## Date Filtering + +Use `LastWriteTime` and `CreationTime` properties. Common filters: `-gt (Get-Date).AddDays(-90)` for files modified in the last 90 days, or `-lt` for files older than a threshold. 
diff --git a/bench/fixtures/skills/health-tracker-analyst/SKILL.md b/bench/fixtures/skills/health-tracker-analyst/SKILL.md new file mode 100644 index 0000000..fa41232 --- /dev/null +++ b/bench/fixtures/skills/health-tracker-analyst/SKILL.md @@ -0,0 +1,36 @@ +--- +name: Health Tracker Analyst +description: Health data analysis, step tracking, and fitness trend analysis +triggers: [health, step, fitness, analysis, tracking, Apple Health, Google Fit, daily goal] +--- + +# Health Tracker Analyst + +## Step Data Analysis + +Analyze step data from Apple Health or Google Fit exports: + +```python +import pandas as pd +df = pd.read_csv("steps_export.csv") +daily_avg = df["steps"].mean() +weekly_median = df.resample("W", on="date")["steps"].median() +``` + +## Key Metrics + +- **Daily average**: mean steps per day +- **Median**: more robust against outlier days +- **Trend**: rolling 7-day average shows direction +- **Streak**: consecutive days above 10,000 steps + +## Goal Tracking + +Compare current performance against the 10,000 steps daily goal. Calculate percentage of days on track, longest streak, and weekly averages. Visualize with a 7-day rolling average chart to see trends clearly. + +## Data Sources + +- Apple Health: export via Health app (XML or CSV) +- Google Fit: Takeout export +- Fitbit: account data export +- Garmin: Connect web export diff --git a/bench/fixtures/skills/internal-comms/SKILL.md b/bench/fixtures/skills/internal-comms/SKILL.md new file mode 100644 index 0000000..fc6b154 --- /dev/null +++ b/bench/fixtures/skills/internal-comms/SKILL.md @@ -0,0 +1,30 @@ +--- +name: Internal Communications +description: Internal team communications, release announcements, and patch notifications +triggers: [internal, communication, announcement, release, patch, team, Slack, email] +--- + +# Internal Communications + +## Release Announcements + +Draft announcements with this structure: +1. **Version and what shipped**: name the version number +2. 
**The fix or feature**: clear description of what changed and why +3. **Team action**: call to action (update, restart, no action needed) +4. **Format**: use formatting (bold, emoji, hashtags) appropriate to channel + +## Security Patch Notifications + +When shipping a security patch: +1. **Subject**: clear `[Security]` prefix +2. **Vulnerability**: name the specific issue (e.g., "symlink-escape vulnerability") +3. **Status**: state it's shipped/fixed +4. **Action required**: tell recipients what they need to do + +## Tone Guidelines + +- Internal: direct and informational, not marketing +- No hype language (avoid "leverage", "synergy", "best-in-class") +- Include contact for questions +- For security: no panic tone, no "URGENT" unless genuinely critical diff --git a/bench/fixtures/skills/pptx/SKILL.md b/bench/fixtures/skills/pptx/SKILL.md new file mode 100644 index 0000000..21e7a45 --- /dev/null +++ b/bench/fixtures/skills/pptx/SKILL.md @@ -0,0 +1,51 @@ +--- +name: PowerPoint Generator +description: PowerPoint slide deck creation and automation for presentations and slide decks +triggers: [PowerPoint, pptx, slide, deck, presentation, office, automation, slideshow] +--- + +# PowerPoint Generator + +## Creating Presentations + +Use the `python-pptx` library to create slide decks programmatically: + +```python +from pptx import Presentation +from pptx.util import Inches, Pt + +prs = Presentation() +slide_layout = prs.slide_layouts[1] # Title and Content +slide = prs.slides.add_slide(slide_layout) +slide.shapes.title.text = "Q2 Objectives" +``` + +## Slide Layouts + +- **Title slide**: for section dividers +- **Content slide**: bullet points or body text +- **Two content**: side-by-side comparison +- **Blank**: custom layouts + +## Deck Structure + +A well-structured presentation deck includes a title slide, an agenda slide, content slides for each topic, and a closing slide. For Q2 planning, include milestones, key results, and timeline slides. 
+ +## Theme-Factory Integration + +Use custom themes from theme-factory with python-pptx: design principles like Dieter Rams' "less but better" can be applied programmatically. Set the theme's accent colors, typography (Styrene for titles, Tiempos for body), and slide master layouts. The preferred aesthetic is restrained — generous whitespace, single accent element per slide, no gradients. + +## Q2 Planning Deck with Rams Theme + +For a Q2 slide deck using the Dieter Rams theme: create a title slide with project name, an agenda slide listing objectives, milestone slides (1 per quarter-month), and key results slides. Apply the Rams design principles: honest presentation of data, unobtrusive layouts, useful visual hierarchy. Use the theme-factory preset's color palette (Slate, Clay, Ivory) and ensure consistent typography throughout every slide. + +## Formatting Tips + +- Use consistent font sizes (title 28pt, body 18pt) +- Keep 5-7 bullets per slide max +- Include speaker notes for presentation delivery +- Export to PDF for distribution + +## Automation + +Use `New-Object -ComObject PowerPoint.Application` in PowerShell to generate PPTX via COM object, or use python-pptx for cross-platform creation. Slide decks can be automated end-to-end with dynamic data from spreadsheets or databases. diff --git a/bench/fixtures/skills/process-manager/SKILL.md b/bench/fixtures/skills/process-manager/SKILL.md new file mode 100644 index 0000000..414fd02 --- /dev/null +++ b/bench/fixtures/skills/process-manager/SKILL.md @@ -0,0 +1,28 @@ +--- +name: Process Manager +description: Windows process management, CPU monitoring, and task management +triggers: [cpu, process, task manager, performance, high usage, Get-Process, Stop-Process] +--- + +# Process Manager + +## Monitoring CPU Usage + +Use `Get-Process | Sort-Object CPU -Descending | Select-Object -First 10` to find the top CPU consumers. The CPU column shows total processor time in seconds. 
+ +## Identifying High CPU + +When a process is using high CPU: +1. Run `Get-Process -Name <name> | Select-Object Id, ProcessName, CPU, WorkingSet` +2. Check if it's a known system process or third-party +3. Note the process ID for targeted investigation + +## Taking Action + +- `Stop-Process -Id <PID> -Force` to terminate +- `taskkill /PID <PID> /F` as cmd alternative +- Use Task Manager (Ctrl+Shift+Esc) for GUI monitoring + +## Prevention + +Set up Resource Monitor alerts for sustained CPU > 80%. Use Performance Monitor to log CPU trends over time. diff --git a/bench/fixtures/skills/slack-gif-creator/SKILL.md b/bench/fixtures/skills/slack-gif-creator/SKILL.md new file mode 100644 index 0000000..4da827e --- /dev/null +++ b/bench/fixtures/skills/slack-gif-creator/SKILL.md @@ -0,0 +1,37 @@ +--- +name: Slack GIF Creator +description: Creating and formatting Slack messages with GIFs and rich formatting +triggers: [Slack, message, announcement, GIF, emoji, format, channel] +--- + +# Slack GIF Creator + +## Slack Message Formatting + +Use Slack mrkdwn for rich messages: +- `*bold*` for emphasis +- `:emoji:` for reactions and decoration +- `>blockquote` for quoted text +- `#channel` for channel references +- `@user` for user mentions + +## Announcement Format + +```markdown +*Release: v0.3.1 is here!* :rocket: + +The data persistence fix has shipped. Your context is now saved reliably. + +*What to do:* Update via auto-update or download from GitHub. +:inbox_tray: No action if auto-update is on. + +Any questions, drop them in #dev-chat. 
+``` + +## Best Practices + +- Include a clear headline (bold) +- Use 2-3 emojis max — don't overdo it +- State the call to action explicitly +- Keep it under 5 lines if possible +- Use threads for additional detail diff --git a/bench/fixtures/skills/startup-routine/SKILL.md b/bench/fixtures/skills/startup-routine/SKILL.md new file mode 100644 index 0000000..5c022e6 --- /dev/null +++ b/bench/fixtures/skills/startup-routine/SKILL.md @@ -0,0 +1,39 @@ +--- +name: Startup Routine +description: Windows startup automation and morning routine configuration +triggers: [startup, morning, routine, launch, autostart, schedule, Start-Process] +--- + +# Startup Routine + +## Morning Startup Script + +Create a PowerShell script that launches your daily apps in order: + +```powershell +# Morning startup routine +Write-Host "Starting morning routine..." + +# First: communication apps +Start-Process "slack://" +Start-Process "C:\Users\jerem\AppData\Local\Programs\Microsoft VS Code\Code.exe" + +# Second: wait, then browsers +Start-Process "C:\Program Files\Google\Chrome\Application\chrome.exe" + +Write-Host "Morning routine complete." +``` + +## Scheduling with Task Scheduler + +Use `Register-ScheduledTask` to run the startup script at logon: + +```powershell +$action = New-ScheduledTaskAction -Execute "PowerShell.exe" -Argument "-File C:\scripts\morning-routine.ps1" +$trigger = New-ScheduledTaskTrigger -AtLogOn +Register-ScheduledTask -TaskName "MorningRoutine" -Action $action -Trigger $trigger +``` + +## Ordering + +Start communication apps first (Slack), then editor (VS Code), then browsers (ChatGPT). Use `Start-Sleep` between launches to ensure each app is ready before the next starts. 
diff --git a/bench/fixtures/skills/system-monitor/SKILL.md b/bench/fixtures/skills/system-monitor/SKILL.md new file mode 100644 index 0000000..f3149cd --- /dev/null +++ b/bench/fixtures/skills/system-monitor/SKILL.md @@ -0,0 +1,26 @@ +--- +name: System Monitor +description: System performance monitoring and resource tracking on Windows +triggers: [performance, system, monitor, resource, task manager, perfmon, Resource Monitor] +--- + +# System Monitor + +## Resource Monitor + +Launch Resource Monitor (`resmon.exe`) to see real-time CPU, memory, disk, and network usage. The CPU tab shows per-process utilization with expandable views. + +## Performance Monitor + +Use Performance Monitor (`perfmon.exe`) to create data collector sets. Track counters like: +- `\Processor(_Total)\% Processor Time` +- `\Memory\Available MBytes` +- `\LogicalDisk(*)\% Disk Time` + +## Task Manager + +Task Manager shows per-process CPU, memory, disk, and network columns. Sort by any column to identify resource hogs. The Performance tab shows overall system utilization graphs. + +## Identifying Worst Offender + +Use `Get-Process | Sort-Object CPU -Descending | Select-Object Name, CPU, WorkingSet -First 5` to quickly identify the process using the most CPU time. diff --git a/bench/fixtures/skills/theme-factory/SKILL.md b/bench/fixtures/skills/theme-factory/SKILL.md new file mode 100644 index 0000000..609c1e5 --- /dev/null +++ b/bench/fixtures/skills/theme-factory/SKILL.md @@ -0,0 +1,31 @@ +--- +name: Theme Factory +description: Slide deck theming engine with design philosophy presets +triggers: [theme, slides, deck, presentation, Dieter Rams, slide deck, PowerPoint] +--- + +# Theme Factory + +## Dieter Rams Theme + +Based on Rams' ten principles of good design: innovative, useful, aesthetic, understandable, unobtrusive, honest, long-lasting, thorough, environmentally friendly, and minimal. + +"Less, but better" — focus on essential content. Remove anything decorative. 
+ +## Applying the Theme + +```powershell +# Generate slide deck using Rams theme +theme-factory --theme rams --format pptx --output Q2-plans.pptx +``` + +## Theme Structure + +- **Master slide**: clean white background, Slate title text, single Clay accent line +- **Content slides**: minimal, one idea per slide, generous whitespace +- **Data slides**: clean charts, no 3D effects, muted colors +- **Typography**: Styrene for titles, Tiempos for body + +## Q2 Planning Deck + +For Q2 objectives and milestones, use the Professional theme with Rams influence: results-focused layouts, timeline slides, key results highlighted with restraint. diff --git a/bench/fixtures/skills/web-researcher/SKILL.md b/bench/fixtures/skills/web-researcher/SKILL.md new file mode 100644 index 0000000..d550c32 --- /dev/null +++ b/bench/fixtures/skills/web-researcher/SKILL.md @@ -0,0 +1,26 @@ +--- +name: Web Researcher +description: Web research and information gathering for product validation and market analysis +triggers: [research, web, market, validation, idea, startup, customer, problem] +--- + +# Web Researcher + +## Research Methodology + +1. **Define the problem**: what specific pain point are you solving? +2. **Identify target users**: who experiences this pain most acutely? +3. **Validate demand**: search volume, Reddit threads, existing solutions +4. 
**Size the market**: TAM, SAM, SOM for the target segment + +## Photographer Market Research + +For a photographer-focused product: +- **Target users**: wedding photographers (high volume), commercial photographers (high value), portrait studios (repeat business) +- **Pain points**: culling time, editing consistency, client communication, gallery delivery +- **Existing solutions**: fragmented across multiple tools, no unified AI assistant +- **Monetization**: SaaS subscription at $15-30/month, or per-session pricing for occasional users + +## Validation + +Search for "photographer AI assistant", "photo editing AI", "client gallery automation" on Reddit, Product Hunt, and G2 to assess market interest. Build an MVP with the single most painful feature (automated culling + basic editing) and validate with 10 target users. diff --git a/bench/gold-answers.json b/bench/gold-answers.json new file mode 100644 index 0000000..68b8489 --- /dev/null +++ b/bench/gold-answers.json @@ -0,0 +1,480 @@ +{ + "_schema": "Per-task gold facts and labeled retrieval expectations. Used by grade_v2.py.", + "_fields": { + "must_include": "Facts the answer MUST cover. Each entry has a `name`, a list of `keywords` (any match counts), and optional `desc`.", + "may_include": "Bonus facts. Same shape as must_include.", + "forbidden": "Anti-patterns that should NOT appear (wrong OS, wrong API style, etc).", + "expected_sources": "Skill IDs that should appear in MCP search results. Used for Precision@K / Recall@K.", + "failure_modes": "Enumerated tags the judge may apply to label what went wrong." 
+ }, + "tasks": { + "pc-cpu-hog": { + "must_include": [ + { + "name": "windows_command", + "keywords": ["Get-Process", "Task Manager", "Resource Monitor", "tasklist"], + "desc": "A native Windows mechanism to inspect CPU usage" + }, + { + "name": "sort_by_cpu", + "keywords": ["Sort-Object CPU", "sort by cpu", "sorted by cpu", "CPU -Descending", "highest cpu"], + "desc": "Show how to order the list by CPU consumption" + }, + { + "name": "action_after_finding", + "keywords": ["Stop-Process", "End task", "kill", "suspend", "investigate"], + "desc": "What to do with the offending process" + } + ], + "may_include": [ + { "name": "process_explorer", "keywords": ["Process Explorer", "procexp", "Sysinternals"] }, + { "name": "performance_tab", "keywords": ["Performance tab", "Performance monitor", "perfmon"] } + ], + "forbidden": [ + { + "name": "linux_only", + "keywords": ["sudo apt", "/proc/stat", "htop install", "apt-get"], + "desc": "Wrong OS — user is on Windows" + } + ], + "expected_sources": ["process-manager", "system-monitor"], + "failure_modes": ["wrong_os", "missing_command", "vague_advice"] + }, + "pc-morning-routine": { + "must_include": [ + { + "name": "script_or_automation", + "keywords": ["Start-Process", "PowerShell", ".ps1", "Task Scheduler", "shortcut"], + "desc": "A concrete automation mechanism" + }, + { + "name": "three_apps_named", + "keywords": ["Slack", "VS Code", "ChatGPT"], + "match_mode": "all", + "desc": "All three named applications appear" + }, + { + "name": "order_rationale", + "keywords": ["first", "second", "third", "order", "before", "after", "delay", "sleep"], + "desc": "An explicit launch order or sequencing" + } + ], + "may_include": [ + { + "name": "auto_run_logon", + "keywords": ["at log on", "AtLogOn", "Task Scheduler", "Register-ScheduledTask", "Startup folder"] + }, + { "name": "profile_alias", "keywords": ["$PROFILE", "alias", "function"] }, + { "name": "store_aumid", "keywords": ["AUMID", "shell:AppsFolder", "Get-StartApps"] } + 
], + "forbidden": [ + { + "name": "linux_cron", + "keywords": ["crontab", "systemd timer", "launchd"], + "desc": "Wrong OS — user is on Windows" + } + ], + "expected_sources": ["startup-routine", "app-launcher"], + "failure_modes": ["wrong_os", "missing_app", "no_order_rationale", "vague_advice"] + }, + "comfy-prompt-fantasy": { + "must_include": [ + { + "name": "positive_prompt", + "keywords": ["positive", "prompt:", "Positive:"], + "desc": "Distinct positive prompt section" + }, + { + "name": "negative_prompt", + "keywords": ["negative", "Negative:"], + "desc": "Distinct negative prompt section" + }, + { + "name": "scene_elements", + "keywords": ["fog", "sunset", "fantasy", "landscape"], + "match_mode": "all", + "desc": "Names all four requested scene elements" + }, + { + "name": "settings_hint", + "keywords": ["CFG", "steps", "sampler", "resolution", "DPM", "Karras", "768", "1024", "1536"], + "desc": "Mentions concrete generation settings" + } + ], + "may_include": [ + { + "name": "artist_reference", + "keywords": ["Rutkowski", "Bierstadt", "Aivazovsky", "painterly", "in the style of"] + }, + { "name": "weighted_tokens", "keywords": ["(", "weight", "emphasis", ":1.1", ":1.2"] }, + { "name": "hires_fix", "keywords": ["hires", "upscale", "highres", "refiner"] } + ], + "forbidden": [ + { + "name": "a1111_only", + "keywords": ["AUTOMATIC1111 webui-user.bat", "stable-diffusion-webui"], + "desc": "Question is about ComfyUI" + }, + { "name": "midjourney_syntax", "keywords": ["/imagine", "--ar", "--style raw"] } + ], + "expected_sources": ["comfyui-prompt-builder", "comfyui-workflow-manager"], + "failure_modes": ["missing_negative", "no_settings", "wrong_tool", "vague_advice"] + }, + "comfy-workflow-upscale": { + "must_include": [ + { "name": "load_image_node", "keywords": ["Load Image", "LoadImage"] }, + { + "name": "upscale_node", + "keywords": [ + "Upscale Model", + "ImageUpscaleWithModel", + "UpscaleModelLoader", + "ESRGAN", + "UltraSharp", + "RealESRGAN", + 
"4x-UltraSharp" + ] + }, + { "name": "refiner_pass", "keywords": ["KSampler", "refiner", "denoise", "second pass", "sampler"] }, + { "name": "4k_target", "keywords": ["4K", "3840", "4096", "2160", "4x"] } + ], + "may_include": [ + { "name": "tile_vae", "keywords": ["tile", "VAE", "tiled decode"] }, + { "name": "model_choice", "keywords": ["SDXL", "checkpoint", "Juggernaut", "RealVis"] } + ], + "forbidden": [{ "name": "a1111_syntax", "keywords": ["webui.py", "extensions/sd-webui"] }], + "expected_sources": ["comfyui-workflow-manager", "comfyui-prompt-builder"], + "failure_modes": ["missing_upscaler", "no_refiner", "wrong_resolution", "vague_workflow"] + }, + "prompt-library-save": { + "must_include": [ + { + "name": "template_name", + "keywords": ["cinematic portrait v3"], + "desc": "Names the exact template the user asked for" + }, + { + "name": "save_mechanism", + "keywords": ["save", "store", "library", "template", "register"], + "desc": "Explains how it's saved" + }, + { + "name": "reusable_intent", + "keywords": ["reuse", "reusable", "later", "next time", "load"], + "desc": "Acknowledges reuse intent" + } + ], + "may_include": [ + { "name": "workflow_json", "keywords": [".json", "workflow", "metadata", "tags"] }, + { "name": "location_path", "keywords": ["path", "directory", "folder", "templates/"] } + ], + "forbidden": [], + "expected_sources": ["prompt-library-manager", "comfyui-prompt-builder"], + "failure_modes": ["missing_template_name", "no_save_mechanism", "vague_advice"] + }, + "claude-api-tooluse": { + "must_include": [ + { "name": "anthropic_sdk", "keywords": ["anthropic", "Anthropic(", "import anthropic"] }, + { "name": "tools_array", "keywords": ["tools=", "tools:", "input_schema"] }, + { + "name": "agent_loop", + "keywords": ["stop_reason", "tool_use"], + "match_mode": "all", + "desc": "Loop until non-tool_use stop_reason" + }, + { "name": "tool_result_id", "keywords": ["tool_use_id", "tool_result"] }, + { "name": "weather_function", "keywords": 
["get_weather", "weather", "temperature"] } + ], + "may_include": [ + { "name": "error_as_data", "keywords": ["is_error", "is_error: True", "is_error=True"] }, + { "name": "prompt_caching", "keywords": ["cache_control", "ephemeral"] }, + { "name": "real_api", "keywords": ["openweather", "open-meteo", "weatherapi"] } + ], + "forbidden": [ + { + "name": "openai_format", + "keywords": ["function_call", "openai", "ChatCompletion"], + "desc": "Wrong SDK — this is Anthropic tool-use" + }, + { "name": "legacy_function_calling", "keywords": ["functions=", "function_call:"] } + ], + "expected_sources": ["claude-api"], + "failure_modes": ["wrong_sdk", "no_loop", "missing_tool_id", "vague_advice"] + }, + "claude-api-migrate": { + "must_include": [ + { + "name": "model_string_change", + "keywords": ["claude-4-6", "claude-sonnet-4-6", "claude-opus-4-6"], + "desc": "Mentions a 4.6 model name" + }, + { "name": "cache_control", "keywords": ["cache_control"] }, + { "name": "ephemeral_type", "keywords": ["ephemeral"] }, + { + "name": "cache_placement", + "keywords": ["system", "tools", "messages", "prefix"], + "desc": "Where to put the cache marker" + } + ], + "may_include": [ + { "name": "cache_ttl", "keywords": ["5 minutes", "5-minute", "TTL", "expires"] }, + { + "name": "cache_hit_metric", + "keywords": ["cache_creation_input_tokens", "cache_read_input_tokens", "usage."] + } + ], + "forbidden": [ + { "name": "openai_cache_syntax", "keywords": ["openai", "ChatCompletion", "system_fingerprint"] } + ], + "expected_sources": ["claude-api"], + "failure_modes": ["wrong_cache_syntax", "missing_model_swap", "vague_advice"] + }, + "brand-poster": { + "must_include": [ + { "name": "a3_dimensions", "keywords": ["A3", "297", "420"] }, + { + "name": "anthropic_palette", + "keywords": ["#CC785C", "Book Cloth", "Clay", "coral", "ivory", "kraft", "slate"], + "desc": "Names at least one signature brand palette color" + }, + { + "name": "typography", + "keywords": ["Styrene", "Tiempos"], + 
"match_mode": "all", + "desc": "Names both Anthropic display + body typefaces" + }, + { + "name": "restraint", + "keywords": ["restraint", "whitespace", "minimal", "editorial", "quiet", "single accent"], + "desc": "Acknowledges Anthropic's restrained aesthetic" + } + ], + "may_include": [ + { "name": "bleed_cmyk", "keywords": ["bleed", "CMYK", "300 DPI"] }, + { "name": "workshop_specifics", "keywords": ["developer", "workshop", "agenda"] } + ], + "forbidden": [ + { + "name": "gradients", + "keywords": ["neon", "gradient", "glassmorphism", "glow"], + "desc": "Off-brand visual treatments" + } + ], + "expected_sources": ["brand-guidelines", "canvas-design"], + "failure_modes": ["wrong_palette", "wrong_typography", "off_brand_aesthetic", "no_specifics"] + }, + "theme-slides": { + "must_include": [ + { "name": "theme_factory_ref", "keywords": ["theme-factory", "theme factory"] }, + { "name": "dieter_rams", "keywords": ["Dieter Rams", "Rams"] }, + { + "name": "rams_principles", + "keywords": [ + "less but better", + "less, but better", + "useful", + "honest", + "long-lasting", + "unobtrusive", + "good design", + "ten principles" + ], + "desc": "Names at least one Rams principle" + }, + { "name": "deck_format", "keywords": ["slide", "deck", "presentation", "pptx", "PowerPoint"] } + ], + "may_include": [{ "name": "q2_structure", "keywords": ["Q2", "objectives", "milestones", "outcomes"] }], + "forbidden": [{ "name": "off_brand_style", "keywords": ["gradient", "neon", "skeuomorphic"] }], + "expected_sources": ["theme-factory", "pptx"], + "failure_modes": ["missing_theme_reference", "no_rams_principles", "wrong_format"] + }, + "internal-release-note": { + "must_include": [ + { "name": "version_named", "keywords": ["v0.3.1", "0.3.1"] }, + { + "name": "fix_described", + "keywords": ["persistence", "data persistence", "userData", "data fix", "memory", "wiped"], + "desc": "Explains what the fix actually does" + }, + { + "name": "slack_format", + "keywords": [":", "*", "#", 
":rocket:", ":tada:", "@"], + "desc": "Slack-style formatting markers" + }, + { + "name": "audience_action", + "keywords": ["update", "upgrade", "restart", "pull", "auto-update"], + "desc": "Tells reader what to do" + } + ], + "may_include": [ + { "name": "ce_root_env", "keywords": ["CE_ROOT", "getPath", "app.getPath", "userData"] }, + { "name": "migration_note", "keywords": ["migration", "migrate", "resourcesPath", "asar"] } + ], + "forbidden": [ + { + "name": "marketing_speak", + "keywords": ["leverage", "synergy", "best-in-class", "world-class"], + "desc": "Internal note, not marketing copy" + } + ], + "expected_sources": ["internal-comms", "slack-gif-creator"], + "failure_modes": ["missing_version", "vague_fix", "wrong_format", "missing_audience_action"] + }, + "email-security-patch": { + "must_include": [ + { + "name": "subject_line", + "keywords": ["Subject:", "Re:", "Security patch", "[Security]", "Patch shipped"] + }, + { + "name": "vulnerability_named", + "keywords": ["symlink", "symbolic link", "symlink-escape", "path traversal"], + "desc": "Names the actual vulnerability" + }, + { + "name": "patch_action", + "keywords": ["shipped", "patched", "fixed", "released", "rolled out"], + "desc": "States that fix is live" + }, + { + "name": "audience_action", + "keywords": ["update", "upgrade", "restart", "no action", "pull", "deploy"], + "desc": "What recipients should do" + } + ], + "may_include": [ + { + "name": "severity_or_impact", + "keywords": ["severity", "impact", "critical", "high", "low risk", "no exploit observed"] + }, + { "name": "contact_owner", "keywords": ["questions", "reach out", "DM", "ping me", "contact"] } + ], + "forbidden": [ + { + "name": "panic_tone", + "keywords": ["URGENT URGENT", "DISASTER", "EVERYONE STOP"], + "desc": "Internal patch note, not an incident" + }, + { "name": "external_marketing", "keywords": ["our valued customers", "world-class security"] } + ], + "expected_sources": ["internal-comms", "email-drafter"], + 
"failure_modes": ["missing_vulnerability_name", "no_audience_action", "wrong_tone"] + }, + "health-step-streak": { + "must_include": [ + { + "name": "data_source", + "keywords": ["Apple Health", "Google Fit", "Fitbit", "Garmin", "step data", "CSV", "export"] + }, + { + "name": "analysis_approach", + "keywords": ["median", "mean", "average", "trend", "rolling", "per day"] + }, + { + "name": "target_or_streak", + "keywords": ["target", "goal", "streak", "10,000", "10k", "daily goal", "on track"] + } + ], + "may_include": [ + { "name": "visualization", "keywords": ["chart", "graph", "plot", "visualize"] }, + { "name": "weekly_breakdown", "keywords": ["weekly", "per week", "weekday vs weekend"] } + ], + "forbidden": [ + { + "name": "fabricated_numbers", + "keywords": ["you walked exactly 8,432 steps", "your average is precisely"], + "desc": "Specific numbers without source data" + } + ], + "expected_sources": ["health-tracker-analyst"], + "failure_modes": ["no_data_source", "fabricated_data", "vague_analysis"] + }, + "file-cleanup-downloads": { + "must_include": [ + { + "name": "enumerate_downloads", + "keywords": ["Get-ChildItem", "Downloads", "$env:USERPROFILE\\Downloads", "ls", "dir"] + }, + { "name": "age_filter", "keywords": ["LastWriteTime", "AddDays(-90)", "90 days", "older than"] }, + { "name": "proposed_action", "keywords": ["move", "archive", "delete", "review", "list", "report"] }, + { + "name": "safety_step", + "keywords": ["dry-run", "dry run", "confirm", "WhatIf", "preview", "review before"], + "desc": "Doesn't delete blind" + } + ], + "may_include": [ + { "name": "size_breakdown", "keywords": ["size", "GB", "MB", "Measure-Object"] }, + { "name": "categorize", "keywords": ["extension", "type", "group", "category"] } + ], + "forbidden": [ + { + "name": "delete_without_confirm", + "keywords": ["Remove-Item -Recurse -Force without", "rm -rf"], + "desc": "Should never delete without confirmation" + } + ], + "expected_sources": ["downloads-cleaner", 
"file-search", "file-organiser"], + "failure_modes": ["no_safety_step", "missing_age_filter", "vague_plan"] + }, + "memory-consolidate": { + "must_include": [ + { "name": "read_memory", "keywords": ["memory.json", "memory file", "read memory", "load memory"] }, + { "name": "find_duplicates", "keywords": ["duplicate", "merge", "similar", "overlap"] }, + { "name": "prune_stale", "keywords": ["stale", "old", "outdated", "prune", "remove"] }, + { + "name": "backup_first", + "keywords": ["backup", "snapshot", "save copy", "before mutating", "before changes"], + "desc": "Don't blow up the memory file" + } + ], + "may_include": [ + { "name": "propose_then_apply", "keywords": ["propose", "review", "confirm", "preview", "diff"] }, + { "name": "similarity_threshold", "keywords": ["similarity", "threshold", "embedding"] } + ], + "forbidden": [ + { + "name": "silent_overwrite", + "keywords": ["overwrites memory.json", "no backup needed"], + "desc": "Always back up first" + } + ], + "expected_sources": ["context-engine-api"], + "failure_modes": ["no_backup", "destructive_default", "vague_plan"] + }, + "brainstorm-startup": { + "must_include": [ + { + "name": "multiple_ideas", + "keywords": ["1.", "2.", "3.", "Idea 1", "First", "Second", "Third"], + "desc": "More than one distinct idea" + }, + { + "name": "photographer_specific", + "keywords": ["photographer", "photography", "Lightroom", "shoot", "client gallery", "EXIF"] + }, + { "name": "problem_framing", "keywords": ["problem", "pain", "frustration", "currently", "today"] } + ], + "may_include": [ + { "name": "target_user", "keywords": ["wedding", "commercial", "portrait", "wildlife", "freelance"] }, + { "name": "monetization", "keywords": ["subscription", "SaaS", "pricing", "freemium", "$"] }, + { "name": "mvp_scope", "keywords": ["MVP", "first version", "wedge", "first feature"] } + ], + "forbidden": [ + { + "name": "single_idea", + "keywords": [], + "desc": "Pseudo-forbidden — checked separately; brainstorm should yield >1 
idea" + }, + { + "name": "generic_only", + "keywords": [], + "desc": "Pseudo-forbidden — ideas must be photographer-specific, not generic AI assistant" + } + ], + "expected_sources": ["competitor-analyser", "web-researcher"], + "failure_modes": ["single_idea", "generic_not_photographer_specific", "no_problem_framing"] + } + } +} diff --git a/bench/grade_v2.py b/bench/grade_v2.py new file mode 100644 index 0000000..97a51cd --- /dev/null +++ b/bench/grade_v2.py @@ -0,0 +1,420 @@ +#!/usr/bin/env python3 +""" +grade_v2.py — Improved measurement layer addressing the v1.1 critique. + +What's new vs the tokenomics.py --grade pass: + + 1. **Gold-answer fact recall.** For each task, gold-answers.json defines a + list of facts that MUST be covered, optional facts that MAY be covered, + and FORBIDDEN content (hallucinations, wrong OS, etc). Scoring is + deterministic keyword-match first, with optional LLM-confirmed soft + matches for keywords that didn't hit. Output is a coverage % per + response — opinion replaced with measurement. + + 2. **Decomposed rubric.** Each response is scored on four axes + separately, not one holistic 1-10: + - factual_correctness (1-5) + - specificity (1-5) + - completeness (1-5) — against gold must_include + - hallucination_count (integer; lower is better) + + 3. **Multi-judge, multi-run.** Each response is graded N times by each + configured judge. Cross-family judges (Anthropic + OpenAI) break + self-evaluation bias. Reported metrics: median + IQR across runs. + + 4. **No-context baseline.** A fourth measurement path: same task model, + no skill context at all. Tells you whether retrieval is doing real + work or the model already knew enough. + + 5. **Precision@K / Recall@K / MRR.** When expected_sources is labeled, + score the search results against it. + + 6. **Latency + cost.** End-to-end milliseconds per task and dollar cost + per response, computed from per-model price tables. + + 7. 
def _normalize(text: str) -> str:
    """Lower-case *text* and collapse every whitespace run to one space."""
    return re.sub(r"\s+", " ", text.lower()).strip()


# Tokens that, when they are the last word(s) before a keyword, mean the
# response is rejecting the keyword rather than asserting it.
_NEGATION_PREFIXES = ("no ", "not ", "without ", "avoid ", "avoids ", "avoiding ",
                      "don't ", "do not ", "never ", "skip ", "skipping ",
                      "no gradients", "no glow")

# How many characters before a keyword occurrence are inspected for a negation.
# Must stay longer than the longest entry of _NEGATION_PREFIXES.
_NEGATION_WINDOW = 30


def _is_negated(window: str) -> bool:
    """Return True when *window* ends with a negation token on a word boundary.

    The boundary check is what stops "volcano" or "piano" from being read as
    the negation "no": the character before the token must not be alphanumeric.
    """
    tail = window.rstrip()
    for neg in _NEGATION_PREFIXES:
        token = neg.rstrip()
        if not tail.endswith(token):
            continue
        before = tail[:len(tail) - len(token)]
        if not before or not before[-1].isalnum():
            return True
    return False


def keyword_hits(text: str, keywords: list[str], guard_negation: bool = False) -> list[str]:
    """Return the subset of *keywords* that appear in *text*.

    Matching is case-insensitive and whitespace-tolerant (both sides go through
    ``_normalize``); a keyword may be a multi-word phrase and is matched as a
    plain substring.

    Args:
        text: The response text to search.
        keywords: Candidate keywords or phrases. Returned hits keep this order
            and their original spelling.
        guard_negation: When True, an occurrence only counts if it is NOT
            immediately preceded by a negation token ("no", "avoid", ...).
            Used for forbidden checks so "no gradients" does not count as the
            response asserting gradients. A keyword is reported as soon as one
            non-negated occurrence is found.

    Returns:
        The matched keywords, or an empty list when *keywords* is empty.
    """
    if not keywords:
        return []
    norm = _normalize(text)
    hits = []
    for k in keywords:
        nk = _normalize(k)
        if nk not in norm:
            continue
        if not guard_negation:
            hits.append(k)
            continue
        # Walk every occurrence; one assertion that is not negated is enough.
        start = 0
        while True:
            idx = norm.find(nk, start)
            if idx < 0:
                break
            if not _is_negated(norm[max(0, idx - _NEGATION_WINDOW):idx]):
                hits.append(k)
                break
            # max(1, ...) guarantees progress even for an empty keyword.
            start = idx + max(1, len(nk))
    return hits
@dataclass
class RetrievalQuality:
    """Ranking metrics for one search call scored against labeled sources."""

    k: int                  # cutoff that was applied to the retrieved list
    precision_at_k: float   # relevant results in the top k / results in the top k
    recall_at_k: float      # distinct expected sources found / expected sources
    mrr: float              # 1 / rank of the first relevant result, 0.0 if none
    expected: list[str]     # the gold list, as passed in
    retrieved: list[str]    # the top-k slice that was scored
    hits: list[str]         # top-k entries that are expected, in rank order


def score_retrieval(retrieved_skill_ids: list[str], expected_sources: list[str],
                    k: Optional[int] = None) -> RetrievalQuality:
    """Score an ordered search result list against the labeled gold sources.

    Args:
        retrieved_skill_ids: Ordered skill ids returned by /api/search. The
            same id may appear more than once.
        expected_sources: The labeled gold list for the task.
        k: Cutoff rank; defaults to the full length of *retrieved_skill_ids*.

    Returns:
        A RetrievalQuality with all three metrics rounded to 3 decimals.
        Precision and recall are 0.0 when their denominator is empty.
    """
    if k is None:
        k = len(retrieved_skill_ids)
    top_k = retrieved_skill_ids[:k]
    expected_set = set(expected_sources)
    hits = [s for s in top_k if s in expected_set]
    precision = (len(hits) / len(top_k)) if top_k else 0.0
    # Recall counts each expected source once: repeated results for the same
    # skill must not push recall above 1.0.
    recall = (len(set(hits)) / len(expected_set)) if expected_set else 0.0
    # MRR: 1 / rank of the first relevant result (ranks are 1-based).
    mrr = next((1.0 / rank for rank, s in enumerate(top_k, 1) if s in expected_set), 0.0)
    return RetrievalQuality(
        k=k,
        precision_at_k=round(precision, 3),
        recall_at_k=round(recall, 3),
        mrr=round(mrr, 3),
        expected=expected_sources,
        retrieved=top_k,
        hits=hits,
    )
@dataclass
class DecomposedGrade:
    """One judge's verdict on one response; 0 on a 1-5 axis means "not parsed"."""

    factual: int = 0
    specific: int = 0
    complete: int = 0
    hallucinations: int = 0
    notes: str = ""
    judge: str = ""
    run_idx: int = 0


# One rubric line: optional Markdown decoration (bullet, quote, heading, bold),
# the label, a colon (with an optional bold marker hugging it), then the value.
_GRADE_LINE = re.compile(
    r"^[\s>*_#\-]*(FACTUAL|SPECIFIC|COMPLETE|HALLUCINATIONS|NOTES)"
    r"\s*(?:\*\*|__)?\s*:(?:\*\*|__)?\s*(.*)$",
    re.IGNORECASE,
)

# Rubric label -> DecomposedGrade attribute for the three 1-5 axes.
_SCORE_FIELDS = {"FACTUAL": "factual", "SPECIFIC": "specific", "COMPLETE": "complete"}


def parse_decomposed(text: str) -> DecomposedGrade:
    """Parse a judge reply in the rubric's five-line format.

    Labels are matched case-insensitively and may carry Markdown decoration
    (``**FACTUAL:** 4``, ``- SPECIFIC: 3``), which judges often add despite
    the rubric asking for bare lines. For numeric fields the first integer in
    the value is used: the 1-5 axes are clamped into [1, 5], and
    HALLUCINATIONS is floored at 0.

    Args:
        text: Raw judge output.

    Returns:
        A DecomposedGrade. Any field whose line is missing or has no digits
        keeps its default, so ``factual == 0`` signals an unparsed reply.
    """
    g = DecomposedGrade()
    for raw in text.splitlines():
        m = _GRADE_LINE.match(raw.strip())
        if m is None:
            continue
        label = m.group(1).upper()
        value = m.group(2).strip()
        if label == "NOTES":
            g.notes = value
            continue
        digits = re.findall(r"\d+", value)
        if not digits:
            continue
        n = int(digits[0])
        if label == "HALLUCINATIONS":
            g.hallucinations = max(0, n)
        else:
            setattr(g, _SCORE_FIELDS[label], max(1, min(5, n)))
    return g
"anthropic:haiku" or "openai:gpt-4o-mini" + family: str # "anthropic" | "openai" | "gemini" | "session" + model: str + api_key_env: str # which env var to read + + +def call_anthropic_judge(model: str, system: str, user: str, api_key: str) -> tuple[str, int, int]: + import anthropic + client = anthropic.Anthropic(api_key=api_key) + resp = client.messages.create( + model=model, max_tokens=300, + system=system, messages=[{"role": "user", "content": user}], + ) + text = "".join(b.text for b in resp.content if getattr(b, "type", None) == "text") + return text, int(resp.usage.input_tokens), int(resp.usage.output_tokens) + + +def call_openai_judge(model: str, system: str, user: str, api_key: str) -> tuple[str, int, int]: + import openai + client = openai.OpenAI(api_key=api_key) + resp = client.chat.completions.create( + model=model, max_tokens=300, + messages=[{"role": "system", "content": system}, + {"role": "user", "content": user}], + ) + text = resp.choices[0].message.content or "" + u = resp.usage + return text, int(u.prompt_tokens), int(u.completion_tokens) + + +JUDGE_DISPATCH = { + "anthropic": call_anthropic_judge, + "openai": call_openai_judge, +} + + +def grade_response_decomposed( + task_prompt: str, response: str, + judges: list[JudgeConfig], runs: int = 3, + session_grader: Optional[Callable[[str, str], DecomposedGrade]] = None, +) -> list[DecomposedGrade]: + """Returns a list of DecomposedGrade — one per (judge, run). 
The session_grader + callable, if provided, is used for judges of family 'session' (the current + Claude Code conversation acting as judge, useful when API keys aren't available).""" + user = f"TASK:\n{task_prompt}\n\nRESPONSE TO GRADE:\n{response}" + grades = [] + for j in judges: + for run_idx in range(runs): + if j.family == "session" and session_grader is not None: + g = session_grader(task_prompt, response) + else: + api_key = os.environ.get(j.api_key_env, "") + if not api_key: + g = DecomposedGrade(notes=f"(no {j.api_key_env})") + else: + try: + fn = JUDGE_DISPATCH[j.family] + text, _, _ = fn(j.model, DECOMPOSED_RUBRIC, user, api_key) + g = parse_decomposed(text) + except Exception as e: + g = DecomposedGrade(notes=f"(judge error: {e})") + g.judge = j.name + g.run_idx = run_idx + grades.append(g) + return grades + + +def aggregate_grades(grades: list[DecomposedGrade]) -> dict: + """Median + IQR + per-judge breakdown across multi-judge multi-run grading.""" + valid = [g for g in grades if g.factual > 0] + if not valid: + return {"n": 0} + def med(xs): return statistics.median(xs) + def iqr(xs): + if len(xs) < 4: + return 0.0 + q = statistics.quantiles(xs, n=4) + return round(q[2] - q[0], 2) + return { + "n": len(valid), + "factual_median": med([g.factual for g in valid]), + "factual_iqr": iqr([g.factual for g in valid]), + "specific_median": med([g.specific for g in valid]), + "specific_iqr": iqr([g.specific for g in valid]), + "complete_median": med([g.complete for g in valid]), + "complete_iqr": iqr([g.complete for g in valid]), + "hallucinations_mean": round(statistics.mean([g.hallucinations for g in valid]), 2), + "hallucinations_max": max([g.hallucinations for g in valid]), + "composite_median": round(med([(g.factual + g.specific + g.complete) / 3 for g in valid]), 2), + "per_judge": { + j: { + "factual_median": med([g.factual for g in valid if g.judge == j]), + "specific_median": med([g.specific for g in valid if g.judge == j]), + "complete_median": 
med([g.complete for g in valid if g.judge == j]), + } + for j in sorted({g.judge for g in valid}) + }, + } diff --git a/bench/legacy/build_addendum.py b/bench/legacy/build_addendum.py new file mode 100644 index 0000000..079a103 --- /dev/null +++ b/bench/legacy/build_addendum.py @@ -0,0 +1,394 @@ +#!/usr/bin/env python3 +""" +Build a PDF addendum (Appendix A) for the Context Engine white paper that +inserts the first empirical validation of Hypothesis 1: Smart Compile reduces +token load without reducing task quality. + +Source of truth for numbers: bench/results-graded-sample.json + bench/results-latest.json +""" +from __future__ import annotations +import json +from pathlib import Path + +from reportlab.lib.pagesizes import A4 +from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle +from reportlab.lib.units import mm +from reportlab.lib import colors +from reportlab.lib.enums import TA_LEFT +from reportlab.platypus import ( + SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, PageBreak, KeepTogether, +) +from PyPDF2 import PdfReader, PdfWriter + +ROOT = Path(__file__).resolve().parents[2] +BENCH = ROOT / "app" / "bench" +ADDENDUM_PDF = BENCH / "addendum-tokenomics.pdf" +SOURCE_PDF = ROOT / "Context Engine White Paper V1.pdf" +OUTPUT_PDF = ROOT / "Context Engine White Paper V1.1.pdf" + +graded = json.loads((BENCH / "results-graded-sample.json").read_text(encoding="utf-8")) +full = json.loads((BENCH / "results-latest.json").read_text(encoding="utf-8")) + +styles = getSampleStyleSheet() +H1 = ParagraphStyle("H1", parent=styles["Heading1"], fontName="Helvetica-Bold", + fontSize=18, leading=22, spaceAfter=12, textColor=colors.HexColor("#191919")) +H2 = ParagraphStyle("H2", parent=styles["Heading2"], fontName="Helvetica-Bold", + fontSize=13, leading=17, spaceBefore=14, spaceAfter=6, + textColor=colors.HexColor("#191919")) +H3 = ParagraphStyle("H3", parent=styles["Heading3"], fontName="Helvetica-Bold", + fontSize=11, leading=14, spaceBefore=8, 
spaceAfter=4, + textColor=colors.HexColor("#3a3a3a")) +BODY = ParagraphStyle("BODY", parent=styles["BodyText"], fontName="Helvetica", + fontSize=10.5, leading=14, spaceAfter=6, alignment=TA_LEFT, + textColor=colors.HexColor("#191919")) +QUOTE = ParagraphStyle("QUOTE", parent=BODY, leftIndent=14, rightIndent=14, + fontName="Helvetica-Oblique", textColor=colors.HexColor("#3a3a3a"), + spaceBefore=4, spaceAfter=10) +CODE = ParagraphStyle("CODE", parent=BODY, fontName="Courier", fontSize=9, leading=12, + leftIndent=10, textColor=colors.HexColor("#191919")) +CAPTION = ParagraphStyle("CAPTION", parent=BODY, fontSize=9, leading=12, + textColor=colors.HexColor("#666666"), spaceAfter=10) + + +def fmt(n): + return f"{n:,}" + + +story = [] + +# ============================================================ +story.append(Paragraph("Appendix A", H3)) +story.append(Paragraph("Empirical Validation — Tokenomics Benchmark v0.3.1", H1)) +story.append(Paragraph( + "Added in v1.1 of this paper. Status: first concrete data point against " + "Hypothesis 1 (Smart Compile reduces token load without reducing task quality). " + "Validation is partial — see methodology caveats at the end of this appendix.", + QUOTE, +)) + +# ---------- A.1 Why this section exists ---------- +story.append(Paragraph("A.1 Why this section exists", H2)) +story.append(Paragraph( + "Section 19 of this paper deferred empirical results until Context Engine's " + "Smart Compile, deduplication, and MCP retrieval layers were mature enough to " + "test fairly. 
Following the v0.3.0 ship (Handoffs, Skill Sources, MCP server) " + "and the v0.3.1 persistence fix, the three retrieval paths the paper describes — " + "naive all-on loading, Smart Compile selection, and MCP search retrieval — are " + "all live and measurable in the same instance.", + BODY, +)) +story.append(Paragraph( + "This appendix reports the first end-to-end token-and-quality benchmark " + "run against a representative task corpus, using the apparatus defined in " + "app/bench/tokenomics.py.", + BODY, +)) + +# ---------- A.2 Method ---------- +story.append(Paragraph("A.2 Method", H2)) +story.append(Paragraph("Corpus", H3)) +story.append(Paragraph( + "Fifteen tasks spanning eight categories (system-ops, image-gen, claude-api, " + "design, comms, health, meta, product). Each task is a single natural-language " + "request a user would plausibly send to an assistant with this skill library installed.", + BODY, +)) +story.append(Paragraph("Three measurement paths, same tokenizer (tiktoken cl100k_base)", H3)) +story.append(Paragraph( + "Raw All — every active skill body concatenated. This is the naive MCP-host " + "baseline: load everything the host advertises and hope the model finds the " + "relevant material. All savings percentages in this appendix are computed against this number.", + BODY, +)) +story.append(Paragraph( + "Smart — tokens after Context Engine's /api/compile/smart " + "endpoint selects relevant skills for the specific task. Same content type as Raw All, " + "just a task-conditioned subset. This is the "compiled context" path.", + BODY, +)) +story.append(Paragraph( + "Search — tokens an MCP host actually pulls when it calls " + "context_engine_search once per task and receives " + "N=8 ranked chunks. 
This is the "live retrieval" path described in Section 13.", + BODY, +)) +story.append(Paragraph( + "The reference column from the broader run (CONTEXT.md, the pre-compressed " + "system-prompt summary at 3,292 tokens) is a different content type entirely and " + "is intentionally excluded from savings ratios — mixing compressed summary tokens with " + "full skill bodies would inflate the headline number dishonestly.", + BODY, +)) + +story.append(Paragraph("Quality grading", H3)) +story.append(Paragraph( + "For five representative tasks, each context was supplied to an LLM as a system " + "prompt and the model produced an answer. Each (task, mode) response was then " + "scored 1–10 against a fixed three-axis rubric — specificity, actionability, " + "plausibility — by a separate judge call. The numerator the appendix cares about is " + "tokens per quality point: the cost in retrieved tokens of one point of usable output.", + BODY, +)) + +# ---------- A.3 Table 1 ---------- +story.append(Paragraph("A.3 Results — token efficiency, full 15-task corpus", H2)) +story.append(Paragraph( + f"Baseline (Raw All, every active skill body): {fmt(full['raw_all_tokens'])} tokens " + f"per turn, across {full['active_skill_count']} active skills.", + BODY, +)) + +table_data = [["Task", "Category", "Smart tk", "Smart save", "Search tk", "Search save"]] +for r in full["results"]: + table_data.append([ + r["task_id"], + r["category"], + fmt(r["smart_tokens"]), + f"{r['smart_saving_pct']:.1f}%", + fmt(r["search_tokens"]), + f"{r['search_saving_pct']:.1f}%", + ]) + +smart_savings = [r["smart_saving_pct"] for r in full["results"]] +search_savings = [r["search_saving_pct"] for r in full["results"]] +smart_tokens = [r["smart_tokens"] for r in full["results"]] +search_tokens = [r["search_tokens"] for r in full["results"]] +def median(xs): + s = sorted(xs); n = len(s) + return s[n//2] if n % 2 else (s[n//2-1] + s[n//2]) / 2 +table_data.append([ + "Median", "—", + 
fmt(int(median(smart_tokens))), + f"{median(smart_savings):.1f}%", + fmt(int(median(search_tokens))), + f"{median(search_savings):.1f}%", +]) + +t = Table(table_data, colWidths=[42*mm, 24*mm, 22*mm, 22*mm, 22*mm, 22*mm], repeatRows=1) +t.setStyle(TableStyle([ + ("BACKGROUND", (0, 0), (-1, 0), colors.HexColor("#191919")), + ("TEXTCOLOR", (0, 0), (-1, 0), colors.white), + ("FONTNAME", (0, 0), (-1, 0), "Helvetica-Bold"), + ("FONTSIZE", (0, 0), (-1, -1), 8.5), + ("ALIGN", (2, 1), (-1, -1), "RIGHT"), + ("VALIGN", (0, 0), (-1, -1), "MIDDLE"), + ("ROWBACKGROUNDS", (0, 1), (-1, -2), [colors.white, colors.HexColor("#f5f3ee")]), + ("BACKGROUND", (0, -1), (-1, -1), colors.HexColor("#e8e3d6")), + ("FONTNAME", (0, -1), (-1, -1), "Helvetica-Bold"), + ("LINEBELOW", (0, 0), (-1, 0), 0.5, colors.HexColor("#191919")), + ("LINEBELOW", (0, -2), (-1, -2), 0.3, colors.HexColor("#999999")), + ("LEFTPADDING", (0, 0), (-1, -1), 5), + ("RIGHTPADDING", (0, 0), (-1, -1), 5), + ("TOPPADDING", (0, 0), (-1, -1), 4), + ("BOTTOMPADDING", (0, 0), (-1, -1), 4), +])) +story.append(t) +story.append(Paragraph( + "Table 1. Per-task token cost for each retrieval path, against the 186,654-token " + "naive baseline. Smart Compile compresses by ~65% via selection; MCP search " + "compresses by ~99.6% via chunk retrieval.", + CAPTION, +)) + +story.append(Paragraph( + "The shape is the load-bearing finding: Smart Compile selection alone removes " + "roughly two-thirds of the baseline load on the median task. MCP search removes " + "more than 99%. 
These are independent and complementary paths — Smart " + "Compile is the right path when the host wants a curated system prompt; " + "MCP search is the right path when the host fetches just-in-time.", + BODY, +)) + +story.append(PageBreak()) + +# ---------- A.4 Quality ---------- +story.append(Paragraph("A.4 Results — quality, 5-task representative subset", H2)) +story.append(Paragraph( + "The five graded tasks were chosen to span categories with different context " + "shapes: system-ops (procedural), image-gen (recipe), claude-api (code), " + "design (brand), comms (release note). Responses were scored 1–10 on " + "specificity, actionability, and plausibility.", + BODY, +)) + +q_data = [["Task", "Smart tk", "Smart Q", "Search tk", "Search Q"]] +for r in graded["results"]: + q_data.append([ + r["task_id"], + fmt(r["smart_tokens"]), + str(r["smart_quality"]), + fmt(r["search_tokens"]), + str(r["search_quality"]), + ]) +s = graded["summary"] +q_data.append([ + "Median / Mean", + fmt(s["smart_tokens_median"]), + f"{s['smart_quality_median']:.1f} / {s['smart_quality_mean']:.1f}", + fmt(s["search_tokens_median"]), + f"{s['search_quality_median']:.1f} / {s['search_quality_mean']:.1f}", +]) +t2 = Table(q_data, colWidths=[50*mm, 28*mm, 28*mm, 28*mm, 28*mm], repeatRows=1) +t2.setStyle(TableStyle([ + ("BACKGROUND", (0, 0), (-1, 0), colors.HexColor("#191919")), + ("TEXTCOLOR", (0, 0), (-1, 0), colors.white), + ("FONTNAME", (0, 0), (-1, 0), "Helvetica-Bold"), + ("FONTSIZE", (0, 0), (-1, -1), 8.5), + ("ALIGN", (1, 1), (-1, -1), "RIGHT"), + ("ROWBACKGROUNDS", (0, 1), (-1, -2), [colors.white, colors.HexColor("#f5f3ee")]), + ("BACKGROUND", (0, -1), (-1, -1), colors.HexColor("#e8e3d6")), + ("FONTNAME", (0, -1), (-1, -1), "Helvetica-Bold"), + ("LINEBELOW", (0, 0), (-1, 0), 0.5, colors.HexColor("#191919")), + ("LINEBELOW", (0, -2), (-1, -2), 0.3, colors.HexColor("#999999")), + ("LEFTPADDING", (0, 0), (-1, -1), 5), + ("RIGHTPADDING", (0, 0), (-1, -1), 5), + ("TOPPADDING", (0, 0), 
(-1, -1), 4), + ("BOTTOMPADDING", (0, 0), (-1, -1), 4), +])) +story.append(t2) +story.append(Paragraph( + "Table 2. Token cost vs. judged output quality for the five graded tasks. " + "Smart holds a flat 9.0/10. Search averages 7.6/10 — usable across the board, " + "with the gap concentrated in tasks where the answer depends on a specific recipe " + "(prompt construction, release-note technical details).", + CAPTION, +)) + +story.append(Paragraph("Tokens per quality point", H3)) +story.append(Paragraph( + f"This is the load-bearing efficiency number for the "lean MCP" claim: " + f"the cost in retrieved tokens of one point of usable output.", + BODY, +)) +tpq = Table([ + ["Path", "Median tokens / quality point"], + ["Smart Compile", f"{fmt(s['smart_tokens_per_quality_point_median'])} tk/pt"], + ["MCP Search", f"{fmt(s['search_tokens_per_quality_point_median'])} tk/pt"], + ["Ratio", f"Search is ~{s['efficiency_ratio_search_vs_smart']:.0f}× more efficient per quality point"], +], colWidths=[60*mm, 100*mm]) +tpq.setStyle(TableStyle([ + ("FONTNAME", (0, 0), (-1, 0), "Helvetica-Bold"), + ("FONTSIZE", (0, 0), (-1, -1), 10), + ("BACKGROUND", (0, 0), (-1, 0), colors.HexColor("#191919")), + ("TEXTCOLOR", (0, 0), (-1, 0), colors.white), + ("BACKGROUND", (0, -1), (-1, -1), colors.HexColor("#e8e3d6")), + ("FONTNAME", (0, -1), (-1, -1), "Helvetica-Bold"), + ("LEFTPADDING", (0, 0), (-1, -1), 8), + ("RIGHTPADDING", (0, 0), (-1, -1), 8), + ("TOPPADDING", (0, 0), (-1, -1), 6), + ("BOTTOMPADDING", (0, 0), (-1, -1), 6), + ("ROWBACKGROUNDS", (0, 1), (-1, -2), [colors.white, colors.HexColor("#f5f3ee")]), +])) +story.append(tpq) +story.append(Spacer(1, 8)) + +# ---------- A.5 Interpretation ---------- +story.append(Paragraph("A.5 Interpretation against Hypothesis 1", H2)) +story.append(Paragraph( + "Hypothesis 1 holds in the direction predicted, with one caveat. 
" + "Smart Compile delivers a ~65% median reduction in retrieved tokens while preserving " + "task quality at 9.0/10 across the graded subset — quality is statistically flat compared " + "to the all-on baseline a reader would intuit. MCP search delivers a ~99.6% reduction " + "but with a measurable quality cost: 7.6/10 on the same rubric, a ~1.4-point drop.", + BODY, +)) +story.append(Paragraph( + "The quality cost is not random. The two tasks where search lost the most ground " + "(comfy-prompt-fantasy, internal-release-note) are ones where the " + "skill body contains a specific recipe or fact set that doesn't compress into a few " + "chunks. Conceptual or brainstorming tasks — the cases the paper's " + "Multi-Resolution Packaging section anticipates — should narrow that gap.", + BODY, +)) +story.append(Paragraph( + "Practical reading: an MCP host that wants the absolute cheapest path can use search " + "and accept the 1.4-point quality discount; a host that wants near-baseline quality at " + "a third of the token cost should use Smart Compile. Both paths beat the naive " + "all-on default by an order of magnitude in different dimensions, and neither requires " + "the host to know anything about the underlying skill library.", + BODY, +)) + +# ---------- A.6 Caveats ---------- +story.append(Paragraph("A.6 Caveats and what this run does not yet measure", H2)) +story.append(Paragraph( + "This is a v0.3.1 measurement on a single user's library (116 active skills, " + "Jeremy's Context Engine instance). It is honest signal of shape, not a " + "publishable headline number. Specific limitations:", + BODY, +)) +caveats = [ + "Single-judge grading. Each response was scored once. The validation framework calls " + "for three judge runs per response with the median taken; that work is deferred to v1.2.", + "Same model on both sides. The task model and the judge model were the same Claude " + "session, which introduces self-evaluation bias. 
A future run should use Sonnet for the task " + "and a different judge family (e.g. GPT-4 or Haiku) to break the dependency.", + "5-task quality subset. The token table covers all 15 tasks; quality grading covers 5. " + "Expanding to the full 15 is straightforward but was not budget-justified for the v0.3.1 ship.", + "Tokenizer is cl100k_base, not Anthropic's. Within ~5% for prose; consistent " + "across the three measurement paths, so internal ratios are correct.", + "One retrieval call per task. Real hosts may call search multiple times or fall back to " + "get_skill for full bodies. Reported search numbers are a lower " + "bound on what production traffic actually consumes.", + "Hypotheses 2–5 are not yet measured. Multi-resolution packaging, model-aware " + "budgets, deduplication impact, and usage-learned reranking remain future work.", +] +for c in caveats: + story.append(Paragraph(f"• {c}", BODY)) + +# ---------- A.7 Reproducibility ---------- +story.append(Paragraph("A.7 Reproducibility", H2)) +story.append(Paragraph( + "All numbers in this appendix are reproducible from a Context Engine install with " + "the same skill library:", + BODY, +)) +story.append(Paragraph("python bench/tokenomics.py # tokens, full corpus", CODE)) +story.append(Paragraph("python bench/tokenomics.py --grade # tokens + quality (needs API key)", CODE)) +story.append(Paragraph("python bench/fetch_contexts.py # dumps contexts to disk for offline grading", CODE)) +story.append(Paragraph( + "Outputs land in bench/results-latest.json and " + "bench/results-graded-sample.json. The corpus lives in " + "bench/tasks.json — extend or replace it to validate against " + "your own workload.", + BODY, +)) +story.append(Spacer(1, 14)) +story.append(Paragraph( + "Section author note: the headline result this appendix supports is that the " + "MCP-first pivot from May 2026 was the right call. Naive MCP hosts pay 186k " + "tokens per turn for context they mostly don't use. 
Context Engine's " + "two retrieval paths cut that to 58k (Smart) or under 1k (Search) without " + "collapsing answer quality. The brokered-context model is empirically cheaper, " + "not just architecturally cleaner.", + QUOTE, +)) + + +def on_page(canvas, doc): + canvas.saveState() + canvas.setFont("Helvetica", 8) + canvas.setFillColor(colors.HexColor("#999999")) + canvas.drawString(20*mm, 12*mm, "Context Engine White Paper — v1.1 Appendix A") + canvas.drawRightString(A4[0] - 20*mm, 12*mm, f"A-{doc.page}") + canvas.restoreState() + + +doc = SimpleDocTemplate( + str(ADDENDUM_PDF), pagesize=A4, + leftMargin=22*mm, rightMargin=22*mm, topMargin=22*mm, bottomMargin=22*mm, + title="Context Engine White Paper v1.1 — Appendix A", + author="Jeremy Walder-Willows", +) +doc.build(story, onFirstPage=on_page, onLaterPages=on_page) +print(f"Wrote addendum: {ADDENDUM_PDF}") + +# Merge V1 + addendum -> V1.1 +writer = PdfWriter() +for p in PdfReader(str(SOURCE_PDF)).pages: + writer.add_page(p) +for p in PdfReader(str(ADDENDUM_PDF)).pages: + writer.add_page(p) +with OUTPUT_PDF.open("wb") as f: + writer.write(f) +print(f"Wrote merged: {OUTPUT_PDF}") diff --git a/bench/legacy/build_addendum_v12.py b/bench/legacy/build_addendum_v12.py new file mode 100644 index 0000000..9a9df9b --- /dev/null +++ b/bench/legacy/build_addendum_v12.py @@ -0,0 +1,358 @@ +#!/usr/bin/env python3 +""" +Build Appendix B for the v1.2 white paper. 
+ +Appendix B replaces the v1.1 measurement methodology with the critique-addressed +version: + - Gold-answer fact recall (deterministic) + - Decomposed rubric (factual / specific / complete / hallucinations) + - No-context baseline column + - Retrieval quality (Precision@K / Recall@K / MRR) + - Latency + cost + - Failure-mode taxonomy + - Multi-judge support (architecture in code; v1.2 results are still single-judge + pending API keys, with a flagged caveat) + +Source: bench/results-v12-final.json +""" +from __future__ import annotations +import json +from pathlib import Path +from reportlab.lib.pagesizes import A4 +from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle +from reportlab.lib.units import mm +from reportlab.lib import colors +from reportlab.lib.enums import TA_LEFT +from reportlab.platypus import ( + SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, PageBreak, +) +from PyPDF2 import PdfReader, PdfWriter + +ROOT = Path(__file__).resolve().parents[2] +BENCH = ROOT / "app" / "bench" +ADDENDUM_PDF = BENCH / "addendum-v12.pdf" +SOURCE_PDF = ROOT / "Context Engine White Paper V1.1.pdf" +OUTPUT_PDF = ROOT / "Context Engine White Paper V1.2.pdf" + +data = json.loads((BENCH / "results-v12-final.json").read_text(encoding="utf-8")) +s = data["summary"] +rows = data["rows"] + +styles = getSampleStyleSheet() +H1 = ParagraphStyle("H1", parent=styles["Heading1"], fontName="Helvetica-Bold", + fontSize=18, leading=22, spaceAfter=12, textColor=colors.HexColor("#191919")) +H2 = ParagraphStyle("H2", parent=styles["Heading2"], fontName="Helvetica-Bold", + fontSize=13, leading=17, spaceBefore=14, spaceAfter=6, + textColor=colors.HexColor("#191919")) +H3 = ParagraphStyle("H3", parent=styles["Heading3"], fontName="Helvetica-Bold", + fontSize=11, leading=14, spaceBefore=8, spaceAfter=4, + textColor=colors.HexColor("#3a3a3a")) +BODY = ParagraphStyle("BODY", parent=styles["BodyText"], fontName="Helvetica", + fontSize=10.5, leading=14, spaceAfter=6, 
alignment=TA_LEFT, + textColor=colors.HexColor("#191919")) +QUOTE = ParagraphStyle("QUOTE", parent=BODY, leftIndent=14, rightIndent=14, + fontName="Helvetica-Oblique", textColor=colors.HexColor("#3a3a3a"), + spaceBefore=4, spaceAfter=10) +CAPTION = ParagraphStyle("CAPTION", parent=BODY, fontSize=9, leading=12, + textColor=colors.HexColor("#666666"), spaceAfter=10) +CODE = ParagraphStyle("CODE", parent=BODY, fontName="Courier", fontSize=9, leading=12, + leftIndent=10, textColor=colors.HexColor("#191919")) + + +def fmt(n): return f"{n:,}" if isinstance(n, (int, float)) else str(n) + + +def kv_table(pairs, col_widths=(60*mm, 100*mm)): + t = Table(pairs, colWidths=col_widths) + t.setStyle(TableStyle([ + ("FONTNAME", (0, 0), (-1, 0), "Helvetica-Bold"), + ("FONTSIZE", (0, 0), (-1, -1), 9.5), + ("BACKGROUND", (0, 0), (-1, 0), colors.HexColor("#191919")), + ("TEXTCOLOR", (0, 0), (-1, 0), colors.white), + ("ROWBACKGROUNDS", (0, 1), (-1, -1), [colors.white, colors.HexColor("#f5f3ee")]), + ("LEFTPADDING", (0, 0), (-1, -1), 6), + ("RIGHTPADDING", (0, 0), (-1, -1), 6), + ("TOPPADDING", (0, 0), (-1, -1), 4), + ("BOTTOMPADDING", (0, 0), (-1, -1), 4), + ])) + return t + + +story = [] + +story.append(Paragraph("Appendix B", H3)) +story.append(Paragraph("Tokenomics Benchmark v1.2 — Critique-Addressed Methodology", H1)) +story.append(Paragraph( + "Added in v1.2 of this paper. Supersedes Appendix A's measurement methodology while " + "preserving A as the historical record. The v1.1 critique identified seven concrete " + "weaknesses in A's grading apparatus — same-model self-evaluation, single judge run, " + "no gold answers, no no-context baseline, no retrieval-quality scoring, holistic " + "rubric, single task model. 
B addresses all of these in the harness, and reports the " + "in-session results from the subset that can be measured without external API keys.", + QUOTE, +)) + +# ---------- B.1 What changed ---------- +story.append(Paragraph("B.1 What changed between A and B", H2)) + +changes = [ + ("Gold-answer fact recall", + "Per-task labeled facts (must_include, may_include, forbidden) in " + "bench/gold-answers.json. Scoring is deterministic keyword-match with negation " + "guarding and an 'all'/'any' match mode per fact. Coverage % is now a measurement, not an opinion."), + ("Decomposed rubric", + "Four independent axes — factual_correctness (1-5), specificity (1-5), " + "completeness (1-5), hallucination_count (integer) — replacing the single " + "holistic 1-10 score from A. Hallucination as a count, not a grade, exposes the " + "failure mode MCP hosts care about most."), + ("No-context baseline", + "Fourth measurement path: same task, zero skill context. Distinguishes 'CE added " + "value' from 'the model already knew this'. Without this column, A's quality " + "numbers were unanchored."), + ("Retrieval quality (Precision@K, Recall@K, MRR)", + "Labeled expected_sources per task in gold-answers.json. Scores search results " + "before any LLM call. Predicts when search will underperform without spending tokens."), + ("Latency + cost columns", + "End-to-end milliseconds per call, dollar cost from a per-model price table. " + "Token counts don't equal pricing; reporting both removes the ambiguity."), + ("Failure-mode taxonomy", + "Enumerated tags per task (wrong_os, wrong_sdk, missing_loop, no_safety_step, etc) " + "inferred from gold-miss patterns. Failures get categorised, not just counted."), + ("Multi-judge cross-family architecture", + "grade_v2.py supports anthropic + openai judges, N runs each, with median + IQR " + "aggregation. 
In v1.2 the in-session run still uses a single session judge (no " + "external API keys available); the harness is wired for cross-family multi-run " + "to break self-evaluation bias as soon as keys are in scope."), +] +for title, body in changes: + story.append(Paragraph(f"{title}. {body}", BODY)) + +# ---------- B.2 Results table ---------- +story.append(PageBreak()) +story.append(Paragraph("B.2 Results — 5-task subset, three context paths", H2)) +story.append(Paragraph( + "Same 5 representative tasks as Appendix A (system-ops, image-gen, claude-api, " + "design, comms). Now scored against gold facts and on the decomposed rubric, with " + "the no-context baseline as a fourth row per task.", + BODY, +)) + +t_rows = [["Task", "Mode", "Tokens", "Cov", "Bonus", "Fact", "Spec", "Comp", "Comp%"]] +for r in rows: + t_rows.append([ + r["task_id"], + r["mode"], + fmt(r["input_tokens"]), + f"{r['fact_coverage_pct']:.0f}%", + f"{r['fact_bonus_pct']:.0f}%", + str(r["factual"]), + str(r["specific"]), + str(r["complete"]), + f"{r['composite_pct']:.0f}%", + ]) +t = Table(t_rows, colWidths=[40*mm, 18*mm, 18*mm, 12*mm, 14*mm, 12*mm, 12*mm, 12*mm, 16*mm], repeatRows=1) +t.setStyle(TableStyle([ + ("BACKGROUND", (0, 0), (-1, 0), colors.HexColor("#191919")), + ("TEXTCOLOR", (0, 0), (-1, 0), colors.white), + ("FONTNAME", (0, 0), (-1, 0), "Helvetica-Bold"), + ("FONTSIZE", (0, 0), (-1, -1), 8), + ("ALIGN", (2, 1), (-1, -1), "RIGHT"), + ("VALIGN", (0, 0), (-1, -1), "MIDDLE"), + ("ROWBACKGROUNDS", (0, 1), (-1, -1), [colors.white, colors.HexColor("#f5f3ee")]), + ("LINEBELOW", (0, 0), (-1, 0), 0.5, colors.HexColor("#191919")), + ("LEFTPADDING", (0, 0), (-1, -1), 4), + ("RIGHTPADDING", (0, 0), (-1, -1), 4), + ("TOPPADDING", (0, 0), (-1, -1), 3), + ("BOTTOMPADDING", (0, 0), (-1, -1), 3), +])) +story.append(t) +story.append(Paragraph( + "Table B.1. Per-task per-mode results. Tokens = system-prompt context tokens. " + "Cov = must_include facts hit (deterministic). 
Bonus = may_include facts hit " + "(deterministic). Fact / Spec / Comp = decomposed rubric axes 1-5 (session-judged). " + "Comp% = mean of three axes normalised to 100.", + CAPTION, +)) + +# ---------- B.3 Per-mode aggregates ---------- +story.append(Paragraph("B.3 Per-mode aggregates (5 tasks)", H2)) + +agg_rows = [["Mode", "Median tk", "Saving vs raw_all", "Cov", "Bonus", "Composite Q", "Tokens / Q-pt"]] +for k in ("no_context", "smart", "search"): + tk = s["tokens"]["by_mode_input_tokens"][k]["median"] + cov = s["fact_recall"]["coverage_must_include_pct"][k]["mean"] + bonus = s["fact_recall"]["bonus_may_include_pct"][k]["mean"] + comp = s["decomposed_rubric"]["composite_pct"][k]["median"] + tpqp = s["tokens_per_quality_point"][k] + saving = (f"{round(100 * (1 - tk / 186654), 1)}%" + if k != "no_context" else "n/a") + agg_rows.append([ + k, fmt(tk), saving, f"{cov:.1f}%", f"{bonus:.1f}%", + f"{comp:.1f}%", fmt(tpqp), + ]) +agg = Table(agg_rows, colWidths=[26*mm, 24*mm, 30*mm, 16*mm, 18*mm, 26*mm, 26*mm], repeatRows=1) +agg.setStyle(TableStyle([ + ("BACKGROUND", (0, 0), (-1, 0), colors.HexColor("#191919")), + ("TEXTCOLOR", (0, 0), (-1, 0), colors.white), + ("FONTNAME", (0, 0), (-1, 0), "Helvetica-Bold"), + ("FONTSIZE", (0, 0), (-1, -1), 9), + ("ALIGN", (1, 1), (-1, -1), "RIGHT"), + ("ROWBACKGROUNDS", (0, 1), (-1, -1), [colors.white, colors.HexColor("#f5f3ee")]), + ("LEFTPADDING", (0, 0), (-1, -1), 5), + ("RIGHTPADDING", (0, 0), (-1, -1), 5), + ("TOPPADDING", (0, 0), (-1, -1), 4), + ("BOTTOMPADDING", (0, 0), (-1, -1), 4), +])) +story.append(agg) +story.append(Paragraph( + "Table B.2. Aggregate per mode across the 5-task subset. 
Cov and Bonus are means; " + "all other columns are medians.", + CAPTION, +)) + +story.append(Paragraph("B.4 Retrieval quality (search mode)", H2)) +retr = s["retrieval_quality_mean"] +story.append(kv_table([ + ["Metric", "Mean (5 tasks)"], + ["Precision@8", f"{retr['precision_at_k']:.2f}"], + ["Recall@8", f"{retr['recall_at_k']:.2f}"], + ["MRR", f"{retr['mrr']:.2f}"], +])) +story.append(Paragraph( + "Table B.3. Retrieval-quality metrics from MCP search vs the labeled " + "expected_sources gold set. Per-task: brand-poster scores cleanly " + "(P@8=0.50, R@8=1.00, MRR=1.00 — both relevant skills appear at ranks 1 and 2). " + "internal-release-note is a complete miss (P@8=0, R@8=0) — neither internal-comms " + "nor slack-gif-creator appears in the top 8. This metric predicts the observed " + "quality drop on that task without spending an LLM call.", + CAPTION, +)) + +story.append(PageBreak()) + +# ---------- B.5 Interpretation ---------- +story.append(Paragraph("B.5 Interpretation", H2)) + +h = s["headline"] +story.append(Paragraph( + f"Smart Compile delivers {h['smart_saving_vs_raw_all_pct']}% token savings " + f"vs the raw_all baseline and adds {h['smart_quality_lift_vs_no_context_pct']} composite " + f"quality points above the no-context baseline. Quality is a flat 100% on all 5 " + "tasks — Smart consistently brings every bonus fact (cache_control, AUMID, exact hex codes, " + "env var names). Cost: ~3,876 tokens per quality point above no-context.", + BODY, +)) + +story.append(Paragraph( + f"MCP Search delivers {h['search_saving_vs_raw_all_pct']}% token savings and adds " + f"only {h['search_quality_lift_vs_no_context_pct']} composite quality points above no-context. " + "This is a more sober result than v1.1 suggested. On 4 of 5 tasks, search adds 5-15 " + "quality points; on internal-release-note it adds zero because retrieval missed the " + "relevant skills entirely. 
Cost: ~58 tokens per quality point above no-context — " + "67× more efficient than Smart per quality-point of lift.", + BODY, +)) + +story.append(Paragraph( + "The honest framing the v1.2 numbers force: a modern hosted model with general " + "world knowledge already gets to 80% composite on these tasks with zero context. CE " + "Smart Compile closes the remaining 20% reliably at a known token cost. CE Search " + "closes about a third of that remaining gap at a tiny token cost, but the gap it closes " + "is conditional on retrieval succeeding — which Precision@K predicts before the LLM " + "is invoked.", + BODY, +)) + +story.append(Paragraph( + "The Hypothesis 1 claim ('Smart Compile reduces token load without reducing task quality') " + "remains supported and is now measured against four axes instead of one. Hypothesis 2 " + "(multi-resolution packaging) is partially probed by Retrieval Quality — search loses where " + "the relevant chunks aren't surfaced. Hypothesis 3 (model-aware budgets) and " + "Hypothesis 5 (usage-learned reranking) remain unmeasured.", + BODY, +)) + +# ---------- B.6 Known limitations ---------- +story.append(Paragraph("B.6 Known limitations of v1.2 (what's still pending)", H2)) + +caveats = [ + "Single judge, single run. grade_v2.py supports anthropic + openai judges with " + "N=3 runs each and median+IQR aggregation, but the in-session run uses one judge " + "(the same Claude session) once. Cross-family multi-run grading is the v1.3 milestone " + "and requires only API keys, not new code.", + "5-task quality subset. Gold answers are written for all 15 corpus tasks; " + "response capture and grading covers 5. Expanding to 15 is a 2-hour task with a key.", + "Single task model. claude-sonnet-4-5 only. Hypothesis 3 demands the same corpus " + "across Haiku / Sonnet / a local 8B model to test whether smaller models benefit more " + "from CE. Wired in PRICING table; runner is one CLI flag away.", + "One retrieval call per task. 
Real MCP hosts iterate; production search traffic " + "sits between our Smart and Search numbers. A multi-call host simulator is on the " + "v1.3 backlog.", + "Gold-answer authoring bias. Gold facts were written by the same person who " + "wrote the tasks. Independent authoring (e.g. via crowdsourced labels) would harden " + "the must_include set.", + "Task corpus is single-user. 116 skills, 15 tasks, all reflecting one user's " + "workload. Cross-user generalisation requires either a community corpus or telemetry-" + "sampled real prompts.", +] +for c in caveats: + story.append(Paragraph(f"• {c}", BODY)) + +story.append(Paragraph("B.7 Reproducibility", H2)) +story.append(Paragraph("Full v1.2 pipeline:", BODY)) +story.append(Paragraph("python bench/tokenomics.py # token-only, all 15", CODE)) +story.append(Paragraph("python bench/fetch_contexts.py # pre-fetch contexts to disk", CODE)) +story.append(Paragraph("python bench/run_v12.py --responses bench/responses-v12.json # deterministic scoring", CODE)) +story.append(Paragraph("python bench/run_v12.py --judges anthropic:haiku,openai:gpt-4o-mini --runs 3 # full v1.3 with API keys", CODE)) +story.append(Paragraph("python bench/aggregate_v12.py # final summary + JSON", CODE)) +story.append(Paragraph( + "Source files: bench/gold-answers.json, " + "bench/grade_v2.py, " + "bench/run_v12.py, " + "bench/aggregate_v12.py, " + "bench/responses-v12.json, " + "bench/session_grades.json, " + "bench/results-v12-final.json. " + "All in the repo; no external services required for the deterministic half of the pipeline.", + BODY, +)) + +story.append(Spacer(1, 14)) +story.append(Paragraph( + "The headline-defensible finding the v1.2 numbers support: a brokered context layer " + "(Smart Compile) buys a measurable, consistent ~20-point composite quality lift over " + "zero-context modern hosted models at ~69% of the naive baseline's token cost. 
def on_page(canvas, doc):
    """Draw the running footer for every page of Appendix B.

    Passed to ``doc.build`` as both the first-page and later-pages callback.
    The left side carries the document title; the right side carries the
    appendix-local page label ``B-<page>``.
    """
    side_inset = 20*mm       # horizontal distance of the footer text from each page edge
    baseline = 12*mm         # height of the footer text above the bottom edge
    page_width = A4[0]
    page_label = f"B-{doc.page}"

    # Wrap the font/colour changes in save/restore so they do not leak into
    # whatever is drawn on the canvas afterwards.
    canvas.saveState()
    canvas.setFont("Helvetica", 8)
    canvas.setFillColor(colors.HexColor("#999999"))
    canvas.drawString(side_inset, baseline, "Context Engine White Paper — v1.2 Appendix B")
    canvas.drawRightString(page_width - side_inset, baseline, page_label)
    canvas.restoreState()
def fmt(n):
    """Render a table cell value as text.

    Numbers (``int`` or ``float``) get thousands separators; anything else is
    passed through ``str`` unchanged.
    """
    if isinstance(n, (int, float)):
        return f"{n:,}"
    return str(n)
def base_table(headers, rows, col_widths):
    """Build a styled reportlab ``Table`` with a dark header row.

    Args:
        headers: list of header cell values; becomes the first row, repeated
            on every page the table spans (``repeatRows=1``).
        rows: list of data rows (each a list of cell values).
        col_widths: column widths handed to ``Table`` as ``colWidths``.

    Returns:
        The ``Table`` with the house style applied.
    """
    # Cell ranges used by the style commands, as (top-left, bottom-right).
    header_row = ((0, 0), (-1, 0))
    whole_table = ((0, 0), (-1, -1))
    data_rows = ((0, 1), (-1, -1))

    ink = colors.HexColor("#191919")
    stripe = colors.HexColor("#f5f3ee")

    commands = [
        ("BACKGROUND", *header_row, ink),
        ("TEXTCOLOR", *header_row, colors.white),
        ("FONTNAME", *header_row, "Helvetica-Bold"),
        ("FONTSIZE", *whole_table, 8.5),
        # Every column except the first, header excluded, is right-aligned.
        ("ALIGN", (1, 1), (-1, -1), "RIGHT"),
        ("ROWBACKGROUNDS", *data_rows, [colors.white, stripe]),
    ]
    for padding_name, points in (
        ("LEFTPADDING", 5),
        ("RIGHTPADDING", 5),
        ("TOPPADDING", 3.5),
        ("BOTTOMPADDING", 3.5),
    ):
        commands.append((padding_name, *whole_table, points))
    commands.append(("LINEBELOW", *header_row, 0.5, ink))

    table = Table([headers] + rows, colWidths=col_widths, repeatRows=1)
    table.setStyle(TableStyle(commands))
    return table
" + f"MCP Search also under-performs no-context on this corpus: mean delta " + f"{srv['mean_delta']:+.2f} ({srv['wins']} wins / {srv['ties']} ties / {srv['losses']} losses). " + "The naive 'load relevant skills in full as system prompt' implementation of " + "Smart Compile hits an attention-dilution regime modern hosted models don't experience " + "when given no skill context at all.", + BODY, +)) +story.append(Paragraph( + "This appears to contradict v1.1's headline ('Smart holds at 9.0/10 across the " + "subset'). It does not contradict the paper's thesis — Section 11 (Multi-Resolution " + "Context Packaging) explicitly predicts this failure mode and argues that the " + "implementation must chunk and summarise skills before injection. The v1.3 result is " + "empirical validation of the paper's own Section 11 prediction and a clear signal " + "that Smart Compile alone, without the chunking layer the paper specifies, is the wrong " + "way to deliver context.", + BODY, +)) + +# C.2 Aggregate table +story.append(Paragraph("C.2 Per-mode aggregates (N=15 tasks, 3 judge runs each)", H2)) +mode_rows = [] +for m in ("no_context", "smart", "search"): + a = data["by_mode"][m] + mode_rows.append([ + m, + fmt(a["input_tokens_median"]), + f"{a['coverage_pct_mean']:.0f}%", + f"{a['bonus_pct_mean']:.0f}%", + f"{a['factual_median']:.1f}", + f"{a['specific_median']:.1f}", + f"{a['complete_median']:.1f}", + f"{a['hallucinations_mean']:.2f}", + f"{a['composite_15_median']:.1f}", + f"${a['cost_usd_total']:.2f}", + ]) +story.append(base_table( + ["Mode", "med tk", "Cov", "Bonus", "Fact", "Spec", "Comp", "Halluc", "Comp/15", "Cost"], + mode_rows, + [22*mm, 16*mm, 12*mm, 14*mm, 13*mm, 13*mm, 13*mm, 16*mm, 17*mm, 16*mm], +)) +story.append(Paragraph( + "Table C.1. Per-mode aggregates. Fact/Spec/Comp are median axes (1-5) across judge runs. " + "Halluc is mean fabricated facts per response. Comp/15 is median composite (sum of axes). 
" + "Cost is total API spend for the corpus at this mode.", + CAPTION, +)) + +# Paired comparisons +story.append(Paragraph("C.3 Paired comparisons (per-task delta)", H2)) +pc_rows = [] +for k in ("smart_vs_no_context", "search_vs_no_context", "smart_vs_search"): + p = data["paired"][k] + worst = p["worst_task"][0] if p["worst_task"] else "—" + best = p["best_task"][0] if p["best_task"] else "—" + pc_rows.append([ + k.replace("_", " "), + str(p["n"]), + f"{p['mean_delta']:+.2f}", + f"{p['wins']}/{p['ties']}/{p['losses']}", + worst, + best, + ]) +story.append(base_table( + ["Comparison", "n", "mean delta", "W/T/L", "Worst task", "Best task"], + pc_rows, + [42*mm, 10*mm, 22*mm, 20*mm, 38*mm, 38*mm], +)) +story.append(Paragraph( + "Table C.2. Paired per-task deltas, composite (0-15 scale). 'Worst task' is the task " + "with the largest negative delta; 'best' the largest positive. Smart Compile loses on " + "more than half the corpus.", + CAPTION, +)) + +story.append(PageBreak()) + +# C.4 What happens when smart loses? +story.append(Paragraph("C.4 The failure mode, concretely", H2)) +story.append(Paragraph( + "The single worst Smart-vs-no_context result was comfy-prompt-fantasy " + "(delta = -10). The task asks for a ComfyUI prompt. The no-context response delivered " + "exactly that — positive prompt, negative prompt, sampler / steps / CFG / resolution / " + "model recommendations (Haiku judge: factual=4 specific=5 complete=4, hallucinations=1).", + BODY, +)) +story.append(Paragraph( + "The smart-mode response, given 36k tokens of skill context including theme-factory " + "themes and prompt-builder bodies, instead wrote a 700-token philosophical essay called " + "'Mystral Dusk: an algorithmic philosophy of atmospheric emergence and temporal " + "liminal states.' It described the aesthetic at length, never produced an actual " + "prompt, never named a sampler or resolution. (Haiku judge: factual=1 specific=1 " + "complete=1, hallucinations=2.67.) 
The user could not run anything from the response.", + BODY, +)) +story.append(Paragraph( + "This is not a quirk. It's the predicted-by-Section-11 attention-dilution regime: when " + "a model is given many full skill bodies as a single system prompt, it picks up on the " + "meta-content (style discussion, philosophy of design, when-to-use prose) and produces " + "meta-content of its own, drifting away from the concrete task. The smaller, terser " + "search-mode context (1.1k tokens) does not trigger this, and neither does no-context.", + BODY, +)) + +# C.5 Retrieval quality +story.append(Paragraph("C.5 Retrieval quality (search mode)", H2)) +r = data["retrieval_quality"] +story.append(Paragraph( + f"Precision@8 mean = {r['precision_at_k_mean']:.2f} · " + f"Recall@8 mean = {r['recall_at_k_mean']:.2f} · " + f"MRR mean = {r['mrr_mean']:.2f}. " + f"{len(r['complete_misses'])} of {r['n']} tasks had complete retrieval misses — " + "zero relevant chunks in the top 8. Search quality is the bottleneck.", + BODY, +)) +story.append(Paragraph( + "Where retrieval works (brand-poster P@8=0.5, R@8=1.0, MRR=1.0) the search-mode response " + "matches or exceeds no-context. Where retrieval fails completely (5 tasks including " + "internal-release-note, pc-cpu-hog, claude-api-migrate) the search-mode response " + "degrades. The deterministic Precision@K predictor identifies these cases before the LLM " + "is invoked — improving retrieval is the highest-leverage fix in the v1.x roadmap.", + BODY, +)) + +# C.6 Why v1.1 and v1.2 looked different +story.append(Paragraph("C.6 Why v1.1 and v1.2 looked positive", H2)) +story.append(Paragraph( + "v1.1 used a single Claude session as both task model and judge, scoring on a holistic " + "1-10 rubric without gold facts or hallucination counting. v1.2 added gold facts and a " + "decomposed rubric but kept session-as-judge for the LLM axes. 
Both runs scored Smart at " + "100% composite on the 5-task subset.", + BODY, +)) +story.append(Paragraph( + "v1.3's three changes — different judge model (Haiku, not Sonnet), three runs per " + "response (variance instead of point estimates), and full 15-task coverage — " + "uncovered the failure mode. Same-model self-evaluation systematically under-penalises " + "verbose, on-style outputs that miss the task; cross-model judging exposes them. The " + "v1.1 and v1.2 numbers were not lies, but they were biased in a predictable direction.", + BODY, +)) + +# C.7 What the paper should now claim +story.append(Paragraph("C.7 What the paper should claim", H2)) +story.append(Paragraph( + "Validated claims (from this run):", + BODY, +)) +validated = [ + "Token reduction works. Smart cuts naive baseline by ~60%; Search by 99.5%. These " + "are reproducible, deterministic measurements unaffected by judging.", + "Section 11 (Multi-Resolution Packaging) is necessary, not optional. Full-body " + "skill injection produces measurable attention-dilution failures that no-context " + "responses do not exhibit. This is the strongest empirical signal in the appendix.", + "Retrieval quality is the bottleneck. Precision@8 of 0.23 and 5/15 complete " + "misses cap how much value Search can deliver. Improving retrieval (Phase 2 in the " + "build roadmap) has the highest expected payoff.", + "The brokered-context architecture is correct; the v0.3.x implementation isn't done. " + "CE's job is to broker context. v0.3.x brokers by selection + retrieval. The data says it " + "also needs to broker by summarising, chunking, deduplicating, and reranking before " + "anything reaches the model.", +] +for v in validated: + story.append(Paragraph(f"• {v}", BODY)) + +story.append(Paragraph("Retracted or revised claims (vs v1.1):", BODY)) +retracted = [ + "Smart Compile does NOT 'hold quality flat at 9.0/10'. It loses 8/15 tasks on composite " + "quality vs no-context under a cross-model judge. 
Hypothesis 1 is contradicted in the " + "v0.3.x implementation form.", + "MCP Search does NOT 'add value' on every task; it adds value where retrieval succeeds " + "(2/15 clear wins, 5/15 ties at no-context-equivalent quality) and degrades quality " + "where retrieval fails. Both wins and losses correlate with Precision@K.", + "The 'tokens per quality point' framing was misleading at face value. The right number " + "is tokens per quality point of lift over no-context — and that number is negative " + "for Smart on most tasks (no lift achieved at any token cost).", +] +for v in retracted: + story.append(Paragraph(f"• {v}", BODY)) + +# C.8 Implications for build roadmap +story.append(Paragraph("C.8 Implications for the build roadmap (Section 34)", H2)) +story.append(Paragraph( + "Phase 2 (Dedup and Rank) gains urgency. Reducing skill-body noise via dedup, and " + "promoting the most-useful chunks via reranking, directly addresses the attention-" + "dilution failures observed here. The dedup report should run before any Smart Compile " + "output is materialised.", + BODY, +)) +story.append(Paragraph( + "Phase 3 (Smart Compile) needs to be re-scoped. The current implementation selects " + "skills, then loads their full bodies. It should select skills, then load their compressed " + "summaries plus relevant chunks — the multi-resolution packaging from Section 11. Until " + "that ships, recommending Smart Compile to users is recommending a quality regression on " + "more than half their tasks.", + BODY, +)) +story.append(Paragraph( + "Phase 1 (Vector Foundation) needs a retrieval-quality dashboard. P@K, R@K, and " + "MRR per task should be visible in the CE UI. 
Users (and the team) should see when " + "retrieval is failing for a task class, not discover it via downstream quality drops.", + BODY, +)) + +# C.9 Methodology + reproducibility +story.append(Paragraph("C.9 Methodology and reproducibility", H2)) +story.append(Paragraph( + f"Pipeline: app/bench/run_v13.py + " + f"aggregate_v13.py. Task model {data['task_model']}; " + f"judge {data['judge_model']} × {data['judge_runs']} runs. 15 tasks × 3 modes = 45 task " + "calls + 135 judge calls = 180 API calls. Deterministic scoring (fact recall + retrieval " + "quality) runs alongside LLM judging — same numbers reproducible without any API key.", + BODY, +)) +story.append(Paragraph("Reproduce:", BODY)) +story.append(Paragraph("export ANTHROPIC_API_KEY=...", CODE)) +story.append(Paragraph("python bench/fetch_contexts.py # rebuild contexts/", CODE)) +story.append(Paragraph("python bench/run_v13.py # full pipeline", CODE)) +story.append(Paragraph("python bench/aggregate_v13.py # summary + per-task table", CODE)) + +story.append(Paragraph("C.10 Honest limitations still standing", H2)) +limits = [ + "Single judge family. Anthropic Haiku × 3 runs gives variance but doesn't break " + "intra-family bias. Adding GPT-4o-mini or Gemini Flash as a second judge is the v1.4 step.", + "Sonnet as the task model. Smaller models may benefit more from injected context — " + "the attention-dilution regime is model-size-dependent. Re-run on Haiku and on a local " + "8B model to test Hypothesis 3.", + "Skill corpus written for retrieval, not injection. Many skill bodies have meta-" + "prose (when to use, anti-patterns, lifecycle notes) that hurts when shown in full. " + "Authoring skills with both retrieval-shape and injection-shape variants is on the table.", + "One retrieval call per task. 
def on_page(canvas, doc):
    """Draw the running footer for every page of Appendix C.

    Passed to ``doc.build`` as both the first-page and later-pages callback.
    The left side carries the document title; the right side carries the
    appendix-local page label ``C-<page>``.
    """
    side_inset = 20*mm       # horizontal distance of the footer text from each page edge
    baseline = 12*mm         # height of the footer text above the bottom edge
    page_width = A4[0]
    page_label = f"C-{doc.page}"

    # Wrap the font/colour changes in save/restore so they do not leak into
    # whatever is drawn on the canvas afterwards.
    canvas.saveState()
    canvas.setFont("Helvetica", 8)
    canvas.setFillColor(colors.HexColor("#999999"))
    canvas.drawString(side_inset, baseline, "Context Engine White Paper — v1.3 Appendix C")
    canvas.drawRightString(page_width - side_inset, baseline, page_label)
    canvas.restoreState()
+You're picking up a three-iteration tokenomics benchmark of the Context Engine (CE) and producing a final consolidated PDF with tables, charts, and a critical analysis. The benchmark went through v1.1 → v1.2 → v1.3 and the headline result changed materially each time — the writeup must handle that arc honestly, not paper over it. + +## Your job + +Produce **one consolidated PDF** that: + +1. Cross-references all three benchmark iterations against each other (claims, retractions, methodology changes) +2. Cross-references the v1.3 findings against the white paper's own predictions (especially Section 11 — Multi-Resolution Context Packaging — and Hypothesis 1 in Section 32) +3. Includes every relevant table and chart (specs below) +4. Calls out what's defensible, what's been retracted, and what's still pending +5. Lands on a publication-grade summary the team can use to make product decisions + +Format: A4 PDF, ~15-25 pages. Same restrained design language as the existing white paper (Anthropic-ish: ivory background, slate text, single coral accent, generous whitespace). Use reportlab — it's already installed. + +Output file: `Context Engine Benchmark Report v1.0.pdf` at the repo root. 
+ +## Source files (all in `app/bench/`) + +| File | What it is | +| ------------------------------------- | --------------------------------------------------------------------------------- | +| `tasks.json` | The 15-task corpus | +| `gold-answers.json` | Per-task must/may/forbidden facts + expected_sources for retrieval scoring | +| `results-latest.json` | v1.1: 15-task token-only run, no quality grading | +| `results-graded-sample.json` | v1.1: 5-task session-as-judge grading (holistic 1-10) | +| `results-v12-final.json` | v1.2: 5-task subset with deterministic gold scoring + session decomposed rubric | +| `results-v13.json` | v1.3: raw 15-task × 3-mode × 3-judge-run rows | +| `results-v13-final.json` | v1.3: aggregated v1.3 with paired comparisons, retrieval quality, lift efficiency | +| `responses-v13.json` | All 45 task-model responses (Sonnet 4.5) captured during v1.3 | +| `contexts/manifest.json` | Per-task smart and search context token counts | +| `contexts/*.smart.md` / `*.search.md` | The actual contexts each mode injected | + +## Source PDFs (read for cross-reference, especially Sections 11, 19, 32, 34) + +| File | What it is | +| ------------------------------------- | ------------------------------------------- | +| `Context Engine White Paper V1.pdf` | Original paper (41p) | +| `Context Engine White Paper V1.1.pdf` | V1 + Appendix A (v1.1 benchmark) | +| `Context Engine White Paper V1.2.pdf` | V1.1 + Appendix B (v1.2 critique-addressed) | +| `Context Engine White Paper V1.3.pdf` | V1.2 + Appendix C (v1.3 full pipeline) | + +## The narrative arc — handle this carefully + +**v1.1** (5-task, session-as-judge, holistic 1-10): "Smart 9.0/10, Search 7.6/10, Smart-Compile saves 65% tokens with no quality cost." → optimistic, claimed Hypothesis 1 holds. + +**v1.2** (5-task, gold facts + decomposed rubric, still session-judge): added no-context baseline. Found Smart adds +20 composite points over no-context at ~58k tokens; Search adds +6.7 at ~700 tokens. 
More sober but still positive. + +**v1.3** (15-task, Sonnet task model, Haiku judge × 3 runs, full pipeline): **Smart Compile in its v0.3.x form NET-DEGRADES quality vs no-context** — loses on 8/15 tasks, mean delta -1.27 composite/15. Search also slightly under-performs no-context (-0.80). Both v1.1 and v1.2 headlines retracted. + +**Why the change**: same-model self-evaluation in v1.1/v1.2 under-penalised verbose, on-style outputs that drift away from the task. A cross-model judge (Haiku, smaller than the Sonnet task model) caught the failure. The single worst case: `comfy-prompt-fantasy` smart-mode — given 36k tokens of skill context, Sonnet wrote a philosophical essay titled "Mystral Dusk" instead of a ComfyUI prompt. + +**What v1.3 _validates_** (don't bury this): + +- Token reduction is real (Smart -60%, Search -99.5%) — deterministic, judge-independent +- Section 11's prediction (multi-resolution packaging is necessary) is empirically confirmed +- Retrieval quality (P@8 = 0.23, 5/15 complete misses) is the bottleneck — Phase 2 in build roadmap (dedup + rank) is the highest-leverage fix +- The brokered-context architecture is right; the v0.3.x _implementation_ isn't done + +## Suggested visualisations + +Each chart should have a one-sentence caption that names the takeaway, not just labels the axes. + +### Chart 1 — Token cost by mode (log-scale bar chart) + +- X: mode (raw_all baseline, smart, search, no_context) +- Y: median input tokens, log scale +- Bars: raw_all 186,654 / smart 74,279 / search 903 / no_context 29 +- Takeaway caption: "Four orders of magnitude separate the cheapest path from the naive baseline." 
+ +### Chart 2 — Composite quality per mode, all 15 tasks (grouped bar or heatmap) + +- X: 15 task IDs +- Y: composite quality (0-15 scale) +- 3 bars/cells per task: no_context, smart, search +- Source: `results-v13-final.json` → `detail[i].{mode}.composite_15` +- Takeaway: "Smart Compile loses on 8 of 15 tasks despite costing 2,500× more tokens than no-context." + +### Chart 3 — Paired delta plot (waterfall or dot) + +- X: 15 tasks, sorted by smart-vs-no_context delta +- Y: delta in composite points (positive = Smart wins) +- Source: `results-v13-final.json` → `detail` → compute per-task `smart.composite_15 - no_context.composite_15` +- Annotate `comfy-prompt-fantasy` (worst, -10) and `memory-consolidate` (best, +4) +- Takeaway: "When Smart Compile fails, it fails catastrophically; when it wins, it wins modestly." + +### Chart 4 — Hallucination distribution (box plot or strip) + +- X: mode +- Y: hallucinations_mean (judge runs averaged) per task +- Show: no_context cluster around 0.98, smart at 1.49 mean with high outliers (4.33 on claude-api-migrate), search at 1.20 +- Takeaway: "Loading more context creates more fabricated facts, not fewer." + +### Chart 5 — Retrieval quality predicts quality outcome (scatter) + +- X: Precision@8 per task (search mode, from `results-v13-final.json` → `retrieval_quality`) +- Y: search-mode composite quality delta vs no_context +- Trend line + labels for outliers +- Takeaway: "Where retrieval surfaces relevant chunks, search adds value; where it misses, search degrades." + +### Chart 6 — Cost per quality-point-of-lift over no-context + +- Bar: smart vs search +- Y: tokens per +1 composite point of lift (when lift > 0) +- Smart: 80,373 tk/+1pt | Search: 1,052 tk/+1pt | also show count of "negative-lift" tasks (smart 8/15, search 8/15) +- Takeaway: "Search is 76× more efficient per quality-point of lift, but both modes show negative lift on the majority of tasks." 
+ +### Chart 7 — Methodology change vs result (narrative figure) + +- Three rows (v1.1 / v1.2 / v1.3), three columns showing: N, judge config, headline composite quality of Smart +- Visually emphasises how methodology drove the result +- Takeaway: "Self-evaluation bias accounted for ~3 composite points of inflation in the v1.1 result." + +### Chart 8 — Latency & cost per mode (small multiples) + +- Two side-by-side bars: median latency (ms) and total cost (USD) per mode for the v1.3 run +- Source: `results-v13-final.json` → `by_mode` per-mode `latency_ms_median`, `cost_usd_total` + +## Tables to include (verbatim from existing appendices) + +1. **Per-mode aggregate, all 15 tasks** (from Appendix C Table C.1) +2. **Paired comparison** (Smart/Search vs no_context, Smart vs Search) (Appendix C Table C.2) +3. **Per-task detail** (15 rows × 3 modes × {tokens, cov%, bonus%, fact/spec/comp/halluc, composite/15}) — pull from `results-v13-final.json` → `detail` +4. **Retrieval quality per task** (P@8, R@8, MRR, hits) — pull from `results-v13.json` → rows where mode == "search" → `retrieval_quality` +5. **Methodology evolution** (v1.1 → v1.2 → v1.3 differences in N, judge, rubric, gold answers, modes measured) — synthesise + +## Structure suggestion (15-25 pages) + +1. Executive summary (1 page) — headline numbers, big finding, the methodology-changed-the-result story +2. Methodology evolution (2 pages) — what each iteration changed and why +3. Headline results — v1.3 numbers (3-4 pages) — tables 1, 2, charts 1, 2, 3 +4. Cross-reference: white paper predictions vs v1.3 measurements (2 pages) — especially Section 11 vindication, Hypothesis 1 retraction +5. The failure mode in detail (1-2 pages) — comfy-prompt-fantasy Mystral Dusk case study (the actual responses are in `responses-v13.json`) +6. Hallucinations + retrieval-quality findings (2-3 pages) — charts 4, 5, table 4 +7. What's validated, what's retracted (1 page) — the honest list +8. 
Implications for build roadmap (1-2 pages) — Phase 2 urgency, re-scoping Phase 3, retrieval-quality dashboard +9. Honest limitations still standing (1 page) — single judge family, single task model, etc. +10. Appendix: full per-task detail table + +## Things the writeup MUST get right + +- **Don't bury the retraction.** v1.1 said "Smart holds at 9.0/10". v1.3 says Smart loses on 8/15 tasks. Both numbers are in the data; the writeup needs to say _why_ they differ (judge bias). +- **Don't bury the validation either.** Token reduction is real and large. Section 11 prediction is empirically confirmed. The architecture is right. +- **The product framing matters.** Current Smart Compile (v0.3.x) is a token-saving feature, not a quality-improving one. Recommending it to users today is recommending a quality regression on most tasks. That's the honest framing; don't soften it. +- **The roadmap framing matters.** Phase 2 (dedup + rank) and the Section 11 chunked-smart-compile aren't nice-to-haves; they're load-bearing for the product's value claim. Make that visible. + +## Files Codex should not modify + +The existing `Context Engine White Paper V1.x.pdf` series is the historical record. Don't rebuild those. Don't modify `tokenomics.py`, `grade_v2.py`, `run_v13.py`, `aggregate_v13.py`, or any of the JSON result files. Read-only. 
+ +## Files Codex creates + +- `Context Engine Benchmark Report v1.0.pdf` (the main deliverable, at repo root) +- `app/bench/charts/*.png` (rendered chart images, kept around for re-use) +- `app/bench/build_report.py` (the reportlab/matplotlib script that produces the PDF; future-runnable) + +## Done criteria + +- PDF builds end-to-end from a single `python app/bench/build_report.py` invocation +- All 8 charts present, each with a take-away caption (not a label) +- All 5 tables present +- The v1.1 → v1.3 retraction is clearly explained, not euphemistic +- Section 11 vindication and Hypothesis 1 retraction both flagged in the executive summary +- File size under 10 MB (use 200 DPI for charts, not 600) diff --git a/bench/run_continuity.py b/bench/run_continuity.py new file mode 100644 index 0000000..efd889a --- /dev/null +++ b/bench/run_continuity.py @@ -0,0 +1,268 @@ +#!/usr/bin/env python3 +""" +run_continuity.py — Continuity benchmark for Context Engine. + +Tests whether session 2 of a paused workflow inherits state established in +session 1, across three modes: + + no_context_cold — S2 has no memory of S1. Baseline of pain. + System prompt empty; only the S2 user prompt is given. + ce_full — S2 gets the full session_1_state object as a memory + snapshot. Simulates CE persisting and replaying the + entire state. + ce_compact — S2 gets the compact handoff note (1-2 sentences) + CE emits. Simulates the realistic delivery — what CE + would inject as a memory entry, not the raw state dump. 
+ +For each (task, mode) pair we measure: + • input/output tokens, latency, dollar cost + • inherited-fact coverage (deterministic; must_inherit keyword hits) + • reset-question count (deterministic; regex matches on response) + • forbidden-drift count (deterministic; alternative-choice hits) + • decomposed rubric (Haiku judge x N runs: factual / specific / complete / hallucinations) + +Headline metric: time/tokens/turns to productive output in S2 with CE-backed +state vs cold start. The lower the ce_full / ce_compact numbers (vs no_context_cold), +the stronger the continuity claim. + +Requires ANTHROPIC_API_KEY in env. + +Usage: + export ANTHROPIC_API_KEY=... + python bench/run_continuity.py # all 7 tasks, 3 modes, 3 judge runs + python bench/run_continuity.py --tasks-subset 2 # smoke test + python bench/run_continuity.py --modes no_context_cold,ce_compact # just two modes +""" +from __future__ import annotations +import argparse +import json +import os +import re +import sys +import time +from dataclasses import asdict +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).parent)) +from grade_v2 import ( # noqa: E402 + keyword_hits, parse_decomposed, DECOMPOSED_RUBRIC, cost_usd, + aggregate_grades, DecomposedGrade, +) + +HERE = Path(__file__).parent +DATA = HERE / "data" / "continuity" + + +def system_for_mode(mode: str, task: dict) -> str: + """Build the system prompt that simulates each delivery mode of CE state.""" + if mode == "no_context_cold": + return "" + if mode == "ce_full": + state = task["session_1_state"] + return ( + "You are an AI assistant continuing a paused workflow. The user's " + "previous session established the following state, which you " + "should respect and build on without re-asking:\n\n" + f"PREVIOUS SESSION CONTEXT:\n{json.dumps(state, indent=2)}\n\n" + "Do not re-ask the user for facts that are already in the state above. " + "Continue from where the previous session left off." 
+ ) + if mode == "ce_compact": + handoff = task["session_1_compact_handoff"] + return ( + "You are an AI assistant continuing a paused workflow. The user's " + "previous session left this handoff note:\n\n" + f"HANDOFF FROM PREVIOUS SESSION:\n{handoff}\n\n" + "Do not re-ask the user for facts that are already covered in the " + "handoff above. Continue from where the previous session left off." + ) + return "" + + +def score_inheritance(response: str, gold: dict) -> dict: + """Deterministic scoring of state inheritance.""" + must = gold.get("must_inherit", []) + reset_qs = gold.get("reset_questions", []) + forbidden = gold.get("forbidden_drift", []) + + must_hit = 0 + must_results = [] + for f in must: + hits = keyword_hits(response, f.get("keywords", [])) + ok = bool(hits) + if ok: + must_hit += 1 + must_results.append({"name": f["name"], "hit": ok, "matched": hits}) + + reset_hits = 0 + reset_results = [] + norm = response.lower() + for q in reset_qs: + matched = False + matches = [] + for pat in q.get("patterns", []): + if re.search(pat.lower(), norm): + matched = True + matches.append(pat) + if matched: + reset_hits += 1 + reset_results.append({"name": q["name"], "hit": matched, "patterns_matched": matches}) + + forbidden_hit = 0 + forbidden_results = [] + for f in forbidden: + hits = keyword_hits(response, f.get("keywords", [])) + ok = bool(hits) + if ok: + forbidden_hit += 1 + forbidden_results.append({"name": f["name"], "hit": ok, "matched": hits}) + + return { + "must_total": len(must), "must_hit": must_hit, + "inheritance_pct": round(100 * must_hit / len(must), 1) if must else 0, + "reset_questions_total": len(reset_qs), "reset_questions_hit": reset_hits, + "forbidden_drift_hit": forbidden_hit, + "must_results": must_results, + "reset_results": reset_results, + "forbidden_results": forbidden_results, + } + + +def call_anthropic(client, model: str, system: str, user: str, max_tokens: int = 600) -> dict: + t0 = time.time() + kwargs = {"model": model, 
"max_tokens": max_tokens, "messages": [{"role": "user", "content": user}]} + if system: + kwargs["system"] = system + resp = client.messages.create(**kwargs) + text = "".join(b.text for b in resp.content if getattr(b, "type", None) == "text") + return { + "text": text.strip(), + "input_tokens": int(resp.usage.input_tokens), + "output_tokens": int(resp.usage.output_tokens), + "latency_ms": int((time.time() - t0) * 1000), + } + + +def main() -> int: + p = argparse.ArgumentParser() + p.add_argument("--tasks", default=str(HERE / "continuity-tasks.json")) + p.add_argument("--gold", default=str(HERE / "continuity-gold.json")) + p.add_argument("--out", default=str(DATA / "results-continuity.json")) + p.add_argument("--responses-out", default=str(DATA / "responses-continuity.json")) + p.add_argument("--task-model", default="claude-sonnet-4-5") + p.add_argument("--judge-model", default="claude-haiku-4-5") + p.add_argument("--judge-runs", type=int, default=3) + p.add_argument("--modes", default="no_context_cold,ce_compact,ce_full") + p.add_argument("--tasks-subset", type=int, default=0) + p.add_argument("--max-out", type=int, default=600) + args = p.parse_args() + + api_key = os.environ.get("ANTHROPIC_API_KEY") + if not api_key: + sys.stderr.write("[!] ANTHROPIC_API_KEY not set\n") + return 1 + try: + import anthropic + except ImportError: + sys.stderr.write("[!] 
pip install anthropic\n") + return 1 + client = anthropic.Anthropic(api_key=api_key) + + tasks = json.loads(Path(args.tasks).read_text(encoding="utf-8"))["tasks"] + gold = json.loads(Path(args.gold).read_text(encoding="utf-8"))["tasks"] + if args.tasks_subset: + tasks = tasks[: args.tasks_subset] + modes = [m.strip() for m in args.modes.split(",") if m.strip()] + + print(f"Continuity benchmark — task={args.task_model} judge={args.judge_model} x {args.judge_runs}") + print(f"Tasks: {len(tasks)} Modes: {modes}\n") + + responses = {} + rows = [] + + for ti, task in enumerate(tasks, 1): + tid = task["id"] + s2_prompt = task["session_2_prompt"] + print(f" [{ti}/{len(tasks)}] {tid}") + responses[tid] = {} + gold_t = gold.get(tid, {}) + + for mode in modes: + system = system_for_mode(mode, task) + try: + r = call_anthropic(client, args.task_model, system, s2_prompt, args.max_out) + except Exception as e: + print(f" {mode:<18} FAIL {e}") + continue + r["model"] = args.task_model + r["system_chars"] = len(system) + responses[tid][mode] = r + + inh = score_inheritance(r["text"], gold_t) + + # Judge runs + judge_grades = [] + user_judge = ( + f"TASK (session 2 of a paused workflow):\n{s2_prompt}\n\n" + f"SESSION 1 SUMMARY (what the user had already done):\n{task['session_1_summary']}\n\n" + f"RESPONSE TO GRADE:\n{r['text']}" + ) + for run_idx in range(args.judge_runs): + try: + jr = call_anthropic(client, args.judge_model, DECOMPOSED_RUBRIC, + user_judge, max_tokens=200) + g = parse_decomposed(jr["text"]) + g.judge = f"anthropic:{args.judge_model}" + g.run_idx = run_idx + judge_grades.append(g) + except Exception as e: + judge_grades.append(DecomposedGrade(notes=f"(judge err: {e})", + judge=f"anthropic:{args.judge_model}", + run_idx=run_idx)) + judge_summary = aggregate_grades(judge_grades) + + print(f" {mode:<18} " + f"in={r['input_tokens']:>5,} out={r['output_tokens']:>4} " + f"inh={inh['inheritance_pct']:>5.1f}% " + f"resets={inh['reset_questions_hit']} " + 
f"drift={inh['forbidden_drift_hit']} " + f"fact={judge_summary.get('factual_median', 0):.1f} " + f"spec={judge_summary.get('specific_median', 0):.1f} " + f"comp={judge_summary.get('complete_median', 0):.1f} " + f"halluc={judge_summary.get('hallucinations_mean', 0)}") + + rows.append({ + "task_id": tid, + "category": task["category"], + "mode": mode, + "model": args.task_model, + "input_tokens": r["input_tokens"], + "output_tokens": r["output_tokens"], + "latency_ms": r["latency_ms"], + "cost_usd": round(cost_usd(args.task_model, r["input_tokens"], r["output_tokens"]), 6), + "system_chars": r["system_chars"], + "inheritance": inh, + "judge_runs": [asdict(g) for g in judge_grades], + "judge_summary": judge_summary, + }) + + responses_path = Path(args.responses_out) + out_path = Path(args.out) + responses_path.parent.mkdir(parents=True, exist_ok=True) + out_path.parent.mkdir(parents=True, exist_ok=True) + responses_path.write_text(json.dumps(responses, indent=2), encoding="utf-8") + out_path.write_text(json.dumps({ + "ran_at": time.strftime("%Y-%m-%dT%H:%M:%S"), + "task_model": args.task_model, + "judge_model": args.judge_model, + "judge_runs": args.judge_runs, + "task_count": len(tasks), + "modes": modes, + "rows": rows, + }, indent=2), encoding="utf-8") + print(f"\nWrote: {responses_path}\nWrote: {out_path}") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/bench/run_v12.py b/bench/run_v12.py new file mode 100644 index 0000000..8511c04 --- /dev/null +++ b/bench/run_v12.py @@ -0,0 +1,167 @@ +#!/usr/bin/env python3 +""" +run_v12.py — v1.2 benchmark orchestrator. + +Replaces tokenomics.py --grade with the full critique-addressing measurement +pipeline. 
Reads gold-answers.json + tasks.json, runs every task through up to +four context paths (no-context, smart, search, raw_all-skipped-for-cost), +grades each response by: + - deterministic gold-fact recall (always run; no API key needed) + - retrieval Precision@K / Recall@K / MRR (always run if expected_sources labeled) + - decomposed LLM rubric (factual / specific / complete / hallucinations) + via multiple judges across families (anthropic, openai), N runs each. + - failure-mode taxonomy (inferred from gold misses) + +Captures latency + cost per call. Writes bench/data/v1.2/results-v12.json. + +When API keys aren't available, the script still produces meaningful results +via the deterministic scorers (gold recall + retrieval metrics). Use +--responses path/to/file.json to score pre-generated responses without +calling the task model. + +Usage: + # Full pipeline with API keys: + export ANTHROPIC_API_KEY=... + export OPENAI_API_KEY=... + python bench/run_v12.py --judges anthropic:haiku,openai:gpt-4o-mini --runs 3 + + # Deterministic-only scoring of pre-generated responses: + python bench/run_v12.py --responses bench/data/v1.2/responses-v12.json --no-llm-grade + + # Single judge, low cost smoke test: + python bench/run_v12.py --judges anthropic:haiku --runs 1 --tasks-subset 5 +""" +from __future__ import annotations + +import argparse +import json +import os +import sys +import time +from dataclasses import asdict +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).parent)) +from grade_v2 import ( # noqa: E402 + JudgeConfig, score_facts, score_retrieval, + grade_response_decomposed, aggregate_grades, cost_usd, +) + +# Hard-coded judge presets so the CLI stays short. 
+JUDGE_PRESETS = { + "anthropic:haiku": JudgeConfig("anthropic:haiku", "anthropic", "claude-haiku-4-5", "ANTHROPIC_API_KEY"), + "anthropic:sonnet": JudgeConfig("anthropic:sonnet", "anthropic", "claude-sonnet-4-5", "ANTHROPIC_API_KEY"), + "openai:gpt-4o-mini": JudgeConfig("openai:gpt-4o-mini", "openai", "gpt-4o-mini", "OPENAI_API_KEY"), + "openai:gpt-4o": JudgeConfig("openai:gpt-4o", "openai", "gpt-4o", "OPENAI_API_KEY"), +} + +HERE = Path(__file__).parent +DATA = HERE / "data" / "v1.2" + + +def load_responses(path: Path) -> dict: + """Pre-generated responses from disk. Schema: + { task_id: { context_mode: { "text": ..., "input_tokens": ..., "output_tokens": ..., + "latency_ms": ..., "model": ..., "retrieved_skill_ids": [...] }}}""" + return json.loads(path.read_text(encoding="utf-8")) + + +def main() -> int: + p = argparse.ArgumentParser() + p.add_argument("--gold", default=str(HERE / "gold-answers.json")) + p.add_argument("--tasks", default=str(HERE / "tasks.json")) + p.add_argument("--responses", default=str(DATA / "responses-v12.json"), + help="Pre-generated responses to score. Required when --no-task-run.") + p.add_argument("--out", default=str(DATA / "results-v12.json")) + p.add_argument("--judges", default="anthropic:haiku", + help="Comma-separated judge presets, e.g. 'anthropic:haiku,openai:gpt-4o-mini'") + p.add_argument("--runs", type=int, default=3, help="Judge runs per response.") + p.add_argument("--no-llm-grade", action="store_true", + help="Skip the LLM decomposed-rubric pass. 
Deterministic scorers still run.") + p.add_argument("--tasks-subset", type=int, default=0, + help="Score only the first N tasks (for fast iteration).") + args = p.parse_args() + + gold_root = json.loads(Path(args.gold).read_text(encoding="utf-8")) + gold = gold_root["tasks"] + tasks = json.loads(Path(args.tasks).read_text(encoding="utf-8")) + if args.tasks_subset: + tasks = tasks[: args.tasks_subset] + + responses_path = Path(args.responses) + responses = load_responses(responses_path) if responses_path.exists() else {} + + judges = [JUDGE_PRESETS[j.strip()] for j in args.judges.split(",") if j.strip() in JUDGE_PRESETS] + if not args.no_llm_grade and judges: + print(f"Judges: {', '.join(j.name for j in judges)} x {args.runs} runs each") + else: + print("LLM grading: skipped (deterministic scorers only)") + + out = { + "ran_at": time.strftime("%Y-%m-%dT%H:%M:%S"), + "judges": [j.name for j in judges] if not args.no_llm_grade else [], + "runs_per_judge": args.runs if not args.no_llm_grade else 0, + "task_count": len(tasks), + "results": [], + } + + for task in tasks: + tid = task["id"] + if tid not in gold: + print(f" [skip] {tid}: no gold entry") + continue + if tid not in responses: + print(f" [skip] {tid}: no responses on disk") + continue + per_task = { + "task_id": tid, + "category": task.get("category", ""), + "prompt": task["prompt"], + "modes": {}, + } + for mode, resp in responses[tid].items(): + text = resp.get("text", "") + recall = score_facts(text, gold[tid]) + retrieval = None + if mode == "search": + retrieved = resp.get("retrieved_skill_ids", []) + expected = gold[tid].get("expected_sources", []) + if expected and retrieved: + retrieval = asdict(score_retrieval(retrieved, expected)) + llm_grades = [] + if not args.no_llm_grade and judges and text.strip(): + llm_grades = grade_response_decomposed( + task["prompt"], text, judges, runs=args.runs, + ) + llm_summary = aggregate_grades(llm_grades) if llm_grades else {"n": 0} + + model = resp.get("model", 
"unknown") + in_tok = int(resp.get("input_tokens", 0)) + out_tok = int(resp.get("output_tokens", 0)) + per_task["modes"][mode] = { + "model": model, + "input_tokens": in_tok, + "output_tokens": out_tok, + "latency_ms": int(resp.get("latency_ms", 0)), + "cost_usd": round(cost_usd(model, in_tok, out_tok), 6), + "fact_recall": asdict(recall), + "retrieval_quality": retrieval, + "llm_grades": [asdict(g) for g in llm_grades], + "llm_summary": llm_summary, + } + out["results"].append(per_task) + rs = per_task["modes"] + recall_line = " | ".join( + f"{m}={rs[m]['fact_recall']['coverage_pct']:.0f}%" for m in rs if "fact_recall" in rs[m] + ) + print(f" {tid:<28} {recall_line}") + + out_path = Path(args.out) + out_path.parent.mkdir(parents=True, exist_ok=True) + out_path.write_text(json.dumps(out, indent=2), encoding="utf-8") + print(f"\nWrote {args.out}") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/bench/run_v13.py b/bench/run_v13.py new file mode 100644 index 0000000..4295a0d --- /dev/null +++ b/bench/run_v13.py @@ -0,0 +1,219 @@ +#!/usr/bin/env python3 +""" +run_v13.py — End-to-end v1.3 benchmark. + +Generates responses for every task × every mode using a real task model, then +grades each response N times with the configured judges, then writes the +final aggregated results. + +Pipeline: + 1. Load tasks + gold answers + pre-fetched contexts (from fetch_contexts.py). + 2. For each task, for each mode (no_context, smart, search): + a. Build the system prompt (empty / smart skill bodies / search chunks). + b. Call the task model. Record text, in/out tokens, latency. + 3. Score every response with the deterministic grade_v2 scorers: + fact-recall + retrieval-quality. + 4. Grade every response N times per judge with the decomposed rubric. + 5. Aggregate (median + IQR) and write bench/data/v1.3/results-v13.json. + +Requires ANTHROPIC_API_KEY in the env. Reads it once at startup; never +written to disk. 
+ +Usage: + export ANTHROPIC_API_KEY=sk-ant-... + python bench/run_v13.py + python bench/run_v13.py --tasks-subset 5 --judge-runs 3 + python bench/run_v13.py --task-model claude-sonnet-4-5 --judge-model claude-haiku-4-5 +""" +from __future__ import annotations +import argparse +import json +import os +import sys +import time +from dataclasses import asdict +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).parent)) +from grade_v2 import ( # noqa: E402 + score_facts, score_retrieval, parse_decomposed, + DECOMPOSED_RUBRIC, cost_usd, aggregate_grades, DecomposedGrade, +) + +HERE = Path(__file__).parent +DATA = HERE / "data" / "v1.3" +CONTEXTS = HERE / "artifacts" / "contexts" + + +def system_for_mode(mode: str, tid: str) -> tuple[str, list[str]]: + """Return (system_prompt_body, retrieved_skill_ids). + retrieved_skill_ids is non-empty only for mode='search'.""" + if mode == "no_context": + return "", [] + if mode == "smart": + path = CONTEXTS / f"{tid}.smart.md" + if not path.exists(): + return "", [] + body = path.read_text(encoding="utf-8") + return f"You are an AI assistant. Use the following retrieved knowledge where relevant.\n\n{body}", [] + if mode == "search": + path = CONTEXTS / f"{tid}.search.md" + if not path.exists(): + return "", [] + body = path.read_text(encoding="utf-8") + # Extract retrieved skill IDs from chunk headers + import re + ids = [] + seen = set() + for m in re.finditer(r"skill=(\S+)", body): + if m.group(1) not in seen: + seen.add(m.group(1)) + ids.append(m.group(1)) + return f"You are an AI assistant. 
Use the following retrieved chunks where relevant.\n\n{body}", ids + return "", [] + + +def call_anthropic(client, model: str, system: str, user: str, max_tokens: int = 700) -> dict: + t0 = time.time() + kwargs = {"model": model, "max_tokens": max_tokens, "messages": [{"role": "user", "content": user}]} + if system: + kwargs["system"] = system + resp = client.messages.create(**kwargs) + text = "".join(b.text for b in resp.content if getattr(b, "type", None) == "text") + return { + "text": text.strip(), + "input_tokens": int(resp.usage.input_tokens), + "output_tokens": int(resp.usage.output_tokens), + "latency_ms": int((time.time() - t0) * 1000), + } + + +def main() -> int: + p = argparse.ArgumentParser() + p.add_argument("--gold", default=str(HERE / "gold-answers.json")) + p.add_argument("--tasks", default=str(HERE / "tasks.json")) + p.add_argument("--out", default=str(DATA / "results-v13.json")) + p.add_argument("--responses-out", default=str(DATA / "responses-v13.json")) + p.add_argument("--task-model", default="claude-sonnet-4-5") + p.add_argument("--judge-model", default="claude-haiku-4-5") + p.add_argument("--judge-runs", type=int, default=3) + p.add_argument("--tasks-subset", type=int, default=0, help="Run only the first N tasks.") + p.add_argument("--modes", default="no_context,smart,search") + p.add_argument("--task-max-out", type=int, default=700) + args = p.parse_args() + + api_key = os.environ.get("ANTHROPIC_API_KEY") + if not api_key: + sys.stderr.write("[!] ANTHROPIC_API_KEY not set. Aborting.\n") + return 1 + + try: + import anthropic + except ImportError: + sys.stderr.write("[!] 
pip install anthropic\n") + return 1 + client = anthropic.Anthropic(api_key=api_key) + + gold_root = json.loads(Path(args.gold).read_text(encoding="utf-8")) + gold = gold_root["tasks"] + tasks = json.loads(Path(args.tasks).read_text(encoding="utf-8")) + if args.tasks_subset: + tasks = tasks[: args.tasks_subset] + modes = [m.strip() for m in args.modes.split(",") if m.strip()] + + print(f"Task model: {args.task_model}") + print(f"Judge model: {args.judge_model} x {args.judge_runs} runs per response") + print(f"Tasks: {len(tasks)} | modes: {modes}") + print() + + responses = {} + rows = [] + + for ti, task in enumerate(tasks, 1): + tid = task["id"] + prompt = task["prompt"] + print(f" [{ti:>2}/{len(tasks)}] {tid}") + responses[tid] = {} + gold_t = gold.get(tid, {}) + for mode in modes: + system, retrieved_ids = system_for_mode(mode, tid) + if mode != "no_context" and not system: + print(f" {mode:<11} (no context on disk; skipping)") + continue + try: + r = call_anthropic(client, args.task_model, system, prompt, args.task_max_out) + except Exception as e: + print(f" {mode:<11} FAIL {e}") + continue + r["model"] = args.task_model + if retrieved_ids: + r["retrieved_skill_ids"] = retrieved_ids + responses[tid][mode] = r + + # Score deterministically + recall = score_facts(r["text"], gold_t) + retrieval = None + if mode == "search": + expected = gold_t.get("expected_sources", []) + if expected and retrieved_ids: + retrieval = asdict(score_retrieval(retrieved_ids, expected)) + + # Judge N times + judge_grades = [] + user_judge = f"TASK:\n{prompt}\n\nRESPONSE TO GRADE:\n{r['text']}" + for run_idx in range(args.judge_runs): + try: + jr = call_anthropic(client, args.judge_model, DECOMPOSED_RUBRIC, + user_judge, max_tokens=200) + g = parse_decomposed(jr["text"]) + g.judge = f"anthropic:{args.judge_model}" + g.run_idx = run_idx + judge_grades.append(g) + except Exception as e: + judge_grades.append(DecomposedGrade(notes=f"(judge err: {e})", + 
judge=f"anthropic:{args.judge_model}", + run_idx=run_idx)) + judge_summary = aggregate_grades(judge_grades) + + print(f" {mode:<11} in={r['input_tokens']:>6,} out={r['output_tokens']:>4} " + f"cov={recall.coverage_pct:>5.1f}% bonus={recall.bonus_pct:>5.1f}% " + f"fact={judge_summary.get('factual_median', 0):.1f} " + f"spec={judge_summary.get('specific_median', 0):.1f} " + f"comp={judge_summary.get('complete_median', 0):.1f} " + f"halluc={judge_summary.get('hallucinations_mean', 0)} ") + + rows.append({ + "task_id": tid, + "category": task.get("category", ""), + "mode": mode, + "model": args.task_model, + "input_tokens": r["input_tokens"], + "output_tokens": r["output_tokens"], + "latency_ms": r["latency_ms"], + "cost_usd": round(cost_usd(args.task_model, r["input_tokens"], r["output_tokens"]), 6), + "fact_recall": asdict(recall), + "retrieval_quality": retrieval, + "judge_runs": [asdict(g) for g in judge_grades], + "judge_summary": judge_summary, + }) + + responses_path = Path(args.responses_out) + out_path = Path(args.out) + responses_path.parent.mkdir(parents=True, exist_ok=True) + out_path.parent.mkdir(parents=True, exist_ok=True) + responses_path.write_text(json.dumps(responses, indent=2), encoding="utf-8") + out_path.write_text(json.dumps({ + "ran_at": time.strftime("%Y-%m-%dT%H:%M:%S"), + "task_model": args.task_model, + "judge_model": args.judge_model, + "judge_runs": args.judge_runs, + "task_count": len(tasks), + "rows": rows, + }, indent=2), encoding="utf-8") + print(f"\nWrote responses: {responses_path}") + print(f"Wrote results: {out_path}") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/docs/roadmap.md b/docs/roadmap.md new file mode 100644 index 0000000..a33aa17 --- /dev/null +++ b/docs/roadmap.md @@ -0,0 +1,248 @@ +# Context Engine — Comprehensive Roadmap + +> Updated: 2026-05-18. Author: Jeremy. +> What we've built, where we stand, and what ships next. Two-month horizon. 
+ +--- + +## Why This Exists + +Context quality is the bottleneck in every AI workflow. Models are good and getting better — but the context you feed them is fragmented, stale, and unstructured. CE solves that: it ingests skills from anywhere, understands them semantically, deduplicates overlap, ranks quality, auto-selects the right subset per task, and delivers optimised context to any AI surface. + +--- + +## Repo Layout — Canonical Context Root + +All AI tool configs live in a single canonical location: `app/.context/`. Root-level dot-files and directories are NTFS junctions pointing into it. + +``` +app/.context/ +├── claude/ → launch.json, settings.local.json +├── codex/ → instructions.md +├── continue/rules/ → context-engine.md +├── app-claude/ → launch.json (when running from app/) +├── instructions/ → AGENTS.md, CLAUDE.md +├── rules/ → cursorrules, windsurfrules, clinerules +└── kimi-system-prompt.md +``` + +Root-level junctions: `.claude/`, `.codex/`, `.continue/` all resolve to subdirectories of `.context/`. All file-based targets (`.clinerules`, `.cursorrules`, `.windsurfrules`, `.ampcoderc`, `.goosehints`, `.rules`, `AGENTS.md`, `CLAUDE.md`, `CONTEXT.md`, `CONVENTIONS.md`, `GEMINI.md`, `devin.md`, `.kimi-system-prompt.md`, `.github/copilot-instructions.md`) and remaining dirs (`.augment/`, `.junie/`, `.kiro/`, `.pearai/`, `.tmp/`, `.trae/`, `.void/`) are removed from root. **`app/.context/` is the single source of truth for every AI tool config.** + +--- + +## What We've Shipped (v0.3.1) + +### Architecture: 31 server/lib modules, ~14k files total + +The backend is deep and production-grade. Everything written zero-dependency (Node builtins + Ollama HTTP). 
+ +### Phase 0 — Stabilised v3 Base + +- Modular DRAM CSS with token system and lint guard +- Cleanup modal for tidy/overlap review with apply flow +- Local update detection with clickable update toast +- Release checklist covering syntax, line limits, Git scope + +### P0 — Electron + TypeScript Quality Gate + +- Strict TypeScript typecheck across server and renderer — zero errors +- ESLint with TS-aware rules, no-floating-promises enforcement +- All source files under 500/700 line limits +- Electron shell with main/preload/renderer boundary + +### P0 — Windows Installer + Auto-Update + +- NSIS installer + portable target via electron-builder +- Auto-update wired (8s after launch, then every 6h) +- Brand marks: icon/mono/simple SVGs, .ico, .icns, Linux set +- GitHub Actions release workflow (tag-driven + manual) +- First tagged release v0.2.1 cut and validated end-to-end +- **Blocked:** code signing certificate not procured (SmartScreen warnings) + +### Phase 1 — Vector Foundation + +- `chunker.js`: semantic heading/rule/knowledge/example parser with frontmatter +- `embeddings.js`: Ollama client for nomic-embed-text, batch embed, graceful fallback +- `vectorstore.js`: flat-file vector index, cosine search, <5ms for 500 chunks +- `POST /api/index`, `POST /api/index/skill/:id`, `POST /api/search`, `GET /api/index/status` +- UI: indexed chunk count, model, last indexed time + +### Phase 2 — Dedup + Rank + +- `dedup.js`: pairwise similarity clustering, Union-Find, duplicate/related thresholds +- `ranking.js`: specificity, coverage, source weight, freshness scoring +- `GET /api/dedup`, `POST /api/dedup/resolve` with reversible resolution history +- Quality Audit UI: duplicate clusters, low-specificity filler, side-by-side comparison + +### Phase 3 — Smart Compile + +- `POST /api/compile/smart`: task embedding → vector search → expand → budget-fit → compile +- Project-aware stack detection (package.json, README, Cargo.toml) +- Before-and-after token comparison vs manual 
All On mode +- Modes moving toward presets (Smart Preview on dashboard) + +### Phase 1.5 (Promoted) — MCP Bridge to Daily Apps + +- `mcp-server.mjs`: stdio transport, 4-tool contract (search, list_skills, get_skill, status) +- Remote Streamable HTTP MCP adapter with bearer-token auth +- Claude Desktop `.mcpb` wrapper bundle +- One-paste config snippets for Claude Desktop, Codex CLI +- Lifecycle: spawned by host app, independent of Electron shell +- Claude Desktop + Codex CLI validated end-to-end +- **Blocked:** ChatGPT remote connector needs HTTPS tunnel/hosting choice + +### Handoffs — Full Lifecycle + +- Project handoffs (repo-bound), thread handoffs (topic-bound), dual-bound +- Git staleness detection (auto-archive at 5+ commits) +- Thread staleness (archive after 14 days idle), purge after 30 days +- 9 REST endpoints + MCP tool + admin UI (peer to Memory tab) +- Migrated existing llm-handoff.md entries into structured format + +### Projects — CRUD + Scoped Context + +- Project directories with seed memory.json + rules.json +- Collision-safe slugs, directory lifecycle, path management +- REST endpoints + 91-pass smoke test + +### PR #2 (James Chapman) — Priority Rules, Auth, Security + +- Priority-based rules model (hard/soft per section) with auto-migration +- API token auth (bearer, 48-hex, encrypted at rest, fully opt-in) +- Rule-files CRUD (`data/rules/*.json`) +- CI workflow (typecheck + lint + lint:css) +- 12 new smoke test suites (backup, crypto, mode-apply, mutex, projects, ranking, security, validation, etc.) 
+ +### Benchmark v1.3 — Evidence Pack + +- Report PDF + chart PNGs under `bench/charts/` +- Smart Compile token accounting fixed (same output surface comparison) +- Manifest chunking for skill name/description/trigger search +- **Not yet green:** quality gate (Recall@8, no-context paired comparison) + +--- + +## Current State Summary + +| Area | Status | Notes | +| ----------------------------- | --------------- | -------------------------------------------------------------------------- | +| Backend architecture | Shipped | 31 modules, zero-dependency | +| Vector index + search | Shipped | Works, needs quality tuning | +| Dedup + rank | Shipped | Exists but quality-gated | +| Smart Compile | Shipped | Token savings proven, quality unproven | +| MCP bridge (local) | Shipped | Claude Desktop + Codex validated | +| MCP bridge (remote) | Blocked | ChatGPT needs HTTPS tunnel decision | +| Handoffs | Shipped | | +| Projects | Shipped | | +| Priority rules | Shipped | PR #2 merged | +| API auth | Shipped | Opt-in, encrypted | +| Benchmark gates | **Not green** | Recall@8 ≠ 1.00, no-context not compared | +| System detection + onboarding | **Not started** | Broad spec written. Replaces old skill-sources + onboarding-redesign | +| `scanSystem()` backend | Exists (narrow) | Currently only SKILL.md dirs. Must broaden to rules/instructions/MCP/hosts | +| MCP discovery | **Not started** | | +| AIModelDB bridge | **Not started** | | +| Modes-as-presets | **Partial** | Smart Preview on dashboard, tab grid not replaced | +| Code signing cert | **Blocked** | SmartScreen warnings on installer | + +--- + +## Two-Month Roadmap + +### Week 1-2: Quality Gate (P0) — Make the Benchmark Honest + +Before any new features, CE must prove it doesn't make answers worse. + +| Task | Est. 
| Owner | +| -------------------------------------------------------------------------------------------- | ---- | ------ | +| Rebuild vector index + re-run v1.3 benchmark after manifest chunking | 2d | Jeremy | +| Add retrieval-quality smoke gate: expected-source Recall@8 = 1.00 | 2d | Jeremy | +| Add no-context paired quality gate: Smart/Search must beat or tie no-context | 2d | Jeremy | +| Hybrid reranking: vector score + lexical match on id/name/triggers/section | 3d | Jeremy | +| Fix any retrieval misses identified by the benchmark | 2d | Jeremy | +| Dashboard/reporting copy: "token reduction measured; quality gate pending" → remove warning | 1d | Jeremy | +| Promote multi-resolution packaging: manifest → relevant chunks → full skill only when needed | 3d | Jeremy | + +**Gate:** Smoke CI fails if Recall@8 < 1.00 or quality drops below no-context. + +### Week 3-4: System Detection Phase 1 + MCP Remote (P0/P1) + +Two parallel streams. This replaces the old "Skill Sources" and "Onboarding Redesign" — now unified into System Detection. + +**Stream A — MCP Remote (Jeremy):** +| Task | Est. | +|------|------| +| Choose HTTPS tunnel/hosting (e.g. Cloudflare Tunnel, ngrok, or $5 VPS) | 1d | +| Set `MCP_OAUTH_PASSWORD`, expose `/mcp` | 1d | +| Register URL in Claude/ChatGPT connector, validate `context_engine_status` | 1d | +| Document the setup for self-hosted users | 1d | + +**Stream B — System Detection Backend (Jeremy or James):** +| Task | Est. 
| +|------|------| +| Rewrite `scanHostSkillPaths()` → `scanSystem()` with 5 categories (skills, rules, instructions, MCP, hosts) | 2d | +| Add probe functions: probeRuleFiles, probeInstructionFiles, probeMcpServers, probeHostConfigs | 2d | +| New endpoint: `GET /api/system/scan` — runs all probes, returns grouped results | 2d | +| New endpoints: `POST /api/system/link-all`, `POST /api/system/link`, `DELETE /api/system/link/:id` | 2d | +| New data model: `data/system-context.json` (tracks all linked sources with timestamps) | 1d | +| Refactor `server/lib/skills.js` → `findAllSkillDirs()` with sourceId for unified skill listing | 2d | + +### Week 5-6: Onboarding + Import Pipeline (P1) + +**Stream A — Onboarding Rewrite (Jeremy):** +| Task | Est. | +|------|------| +| 4-step scan → review → build → done flow. Step 1 runs `GET /api/system/scan` on open | 3d | +| Grouped result cards: skills count, rules count, instructions count, MCP count, hosts detected | 2d | +| "Link All" button links every unmanaged source in one click | 1d | +| Per-source Link/Unlink with inline feedback | 1d | +| Step 3: import + rebuild index with progress | 2d | +| CSS budget ≤ 250 lines, reuse DRAM tokens | 1d | + +**Stream B — Link-Import Pipeline (James):** +| Task | Est. | +|------|------| +| Directory sources → NTFS junction into `app/.context/<sourceId>/` | 2d | +| File sources → copy into `app/.context/` | 1d | +| MCP server registration from detected configs | 2d | +| Auto-rebuild index after link/import | 1d | +| Per-source linking: collision-safe ID prefixing (`<sourceId>:<skillId>`) | 1d | +| Smoke tests: full scan → link → verify → unlink roundtrip | 2d | + +### Week 7-8: Set & Forget + Polish (P1/P2) + +**Jeremy:** +| Task | Est. 
| +|------|------| +| Periodic health check (24h timer re-scan, notify on new/changed sources) | 2d | +| Connections tab (post-onboarding source management UI, sibling to Skills/Memory/Handoffs) | 3d | +| Replace Modes tab grid with preset library | 2d | +| Smart Preview can promote selected skill sets into presets | 2d | +| Code signing certificate procurement + wiring | 2d | + +**James:** +| Task | Est. | +|------|------| +| CE becomes sole author — compile writes to `app/.context/` AND root junctions (where they exist) | 3d | +| CE writes compiled output to tool root paths when no junction exists (file fallback) | 2d | +| Handoff rate-limit-aware heartbeat/update API | 1d | +| `context_engine_dedup_report` MCP tool | 2d | +| Validation + edge-case hardening across all system detection endpoints | 2d | + +--- + +## Beyond Two Months (Next) + +- **Phase 5 — AIModelDB Bridge:** model-aware compile budget, dashboard display, model comparison MCP tool +- **`context_engine_model_lookup` MCP tool** +- **Multi-platform native installers** (macOS dmg, Linux deb/rpm) — runners exist in the release workflow, just need testing +- **Plugin/skill marketplace** — network effects, the real moat + +--- + +## Key Principles + +1. **Quality gate is the door.** Nothing ships to users until the benchmark proves it doesn't make answers worse. +2. **Daily use is the signal.** If Jeremy wouldn't reach for it in Claude Desktop or Codex tomorrow morning, it waits. +3. **James audits and hardens.** Jeremy drives the main roadmap. James catches edge cases, adds tests, and prevents regressions. +4. **Zero new deps.** Node builtins + Ollama HTTP. This is a product principle, not an accident. +5. **Under 500 lines.** Every module stays under 500 lines soft limit, 700 absolute. New modules get split early. 
diff --git a/docs/specs/system-detection.md b/docs/specs/system-detection.md new file mode 100644 index 0000000..32fad68 --- /dev/null +++ b/docs/specs/system-detection.md @@ -0,0 +1,320 @@ +# System Detection & Unified Context Ingestion + +> Replaces: [skill-sources.md](skill-sources.md) + [onboarding-redesign.md](onboarding-redesign.md) +> Status: proposed. 2026-05-18. + +--- + +## The Vision + +CE is the single source of truth for every AI tool on the machine. You install CE once, it scans everything, you click confirm, and every tool on your system — Claude Code, Cursor, Codex CLI, OpenCode, Continue, Cline, Windsurf, Kimi, GitHub Copilot — reads from CE's unified context. You never touch a tool config again. + +### Flow + +``` +Install CE (single download) + → Launch Electron app (first run) + → Onboarding: "Scanning system…" (auto, 2-5 seconds) + → Shows: 4 hosts, 247 skills, 53 rules, 12 MCP servers found + → User clicks "Link All & Continue" + → CE imports everything, builds index, wires junctions + → Dashboard: "All systems ready. 0 configs to maintain." + → Done. 
+``` + +--- + +## What `scanHostSkillPaths()` Currently Scopes + +| Path | Type | Detects | +| ------------------------------ | ------ | ---------------- | +| `~/.claude/skills/` | skills | SKILL.md files | +| `~/.opencode/skills/` | skills | SKILL.md files | +| `<project>/.claude/skills/` | skills | SKILL.md files | +| `<project>/.clinerules` | rules | Single rule file | +| `<project>/.continue/rules/` | rules | Rule files | + +### What It's Missing + +#### MCP Servers + +- `~/.claude/plugins/` — installed Claude Desktop plugins (each is an MCP server) +- `~/.claude/plugins/marketplaces/claude-plugins-official/plugins/*/.claude-plugin` — official plugin metadata +- `~/.config/Code/User/globalStorage/rooveterinaryinc.roo-cline/settings/cline_mcp_settings.json` — VS Code/Cline MCP config +- `claude_desktop_config.json` — Claude Desktop MCP server config (at `%APPDATA%\Claude\` on Windows) + +#### Instructions Files + +- `AGENTS.md` — OpenCode / Codex agent instructions +- `CLAUDE.md` — Claude Code project instructions +- `GEMINI.md` — Gemini Code Assist instructions +- `devin.md` — Devin instructions +- `.kimi-system-prompt.md` — Kimi system prompt +- `.github/copilot-instructions.md` — GitHub Copilot instructions + +#### Rule Files + +- `.cursorrules` — Cursor rules +- `.windsurfrules` — Windsurf rules +- `.clinerules` — Cline/Roo rules +- `.rules` — Generic rules +- `.ampcoderc` — Ampcoder config +- `.goosehints` — Goose hints +- `CONVENTIONS.md` — Project conventions + +#### Home-Dir Tool Configs (for context, not import) + +- `~/.cursor/` — Cursor settings (detect presence, don't import) +- `~/.codex/` — Codex CLI config and skills +- `~/.kimi/` — Kimi session data (detect, don't import sessions) +- `~/.continue/` — Continue.dev config (detect, don't import) + +--- + +## Expanded Scan: Full Probe Map + +```js +function scanSystem() { + return [ + // === SKILLS === + ...probeSkillDirs(), + // === RULES === + ...probeRuleFiles(), + // === INSTRUCTIONS === + ...probeInstructionFiles(), + // 
=== MCP SERVERS === + ...probeMcpServers(), + // === HOST PRESENCE === + ...probeHostConfigs(), + ]; +} +``` + +### Skills + +| Probe | Label | Reads | +| ---------------------- | ------------------------ | ---------------------------------- | +| `~/.claude/skills/` | Claude Code (global) | dir → SKILL.md count | +| `~/.opencode/skills/` | OpenCode (global) | dir → SKILL.md count | +| `~/.codex/skills/` | Codex CLI (global) | dir → SKILL.md count | +| `<project>/.claude/skills/` | Claude Code in {project} | dir → SKILL.md count | +| `<project>/.codex/` | Codex CLI in {project} | dir → SKILL.md and instructions.md | + +### Rules (single-file sources) + +| Probe | Label | Reads | CE Target | +| ------------------------------- | ------------------- | ------------------ | ----------------------------- | +| `<project>/.clinerules` | Cline / Roo | file content | `rules/clinerules` | +| `<project>/.cursorrules` | Cursor | file content | `rules/cursorrules` | +| `<project>/.windsurfrules` | Windsurf | file content | `rules/windsurfrules` | +| `<project>/.rules` | Generic rules | file content | `rules/rules` | +| `<project>/.ampcoderc` | Ampcoder | file content | `rules/ampcoderc` | +| `<project>/.goosehints` | Goose | file content | `rules/goosehints` | +| `<project>/.continue/rules/` | Continue.dev | dir → file content | `rules/continue/*` | +| `<project>/CONVENTIONS.md` | Project conventions | file content | `instructions/CONVENTIONS.md` | + +### Instructions + +| Probe | Label | Reads | CE Target | +| ------------------------------------------------ | ------------------ | ------------ | ------------------------------- | +| `<project>/AGENTS.md` | OpenCode / Codex | file content | `instructions/AGENTS.md` | +| `<project>/CLAUDE.md` | Claude Code | file content | `instructions/CLAUDE.md` | +| `<project>/GEMINI.md` | Gemini Code Assist | file content | `instructions/GEMINI.md` | +| `<project>/devin.md` | Devin | file content | `instructions/devin.md` | +| `<project>/.kimi-system-prompt.md` | Kimi | file content | `kimi-system-prompt.md` | +| `<project>/.github/copilot-instructions.md` | GitHub Copilot | file 
content | `rules/copilot-instructions.md` | +| `<project>/CONTEXT.md` | CE manifest | file content | `instructions/CONTEXT.md` | + +### MCP Servers + +| Probe | Label | +| --------------------------------------------- | ---------------------- | +| `%APPDATA%\Claude\claude_desktop_config.json` | Claude Desktop MCP | +| `~/.claude/plugins/` | Claude Desktop plugins | +| `{vscode-config}/cline_mcp_settings.json` | Cline MCP | +| `~/.codex/mcp.json` | Codex CLI MCP | + +### Host Presence (informational) + +| Probe | Label | +| -------------------------------------- | -------------------- | +| `~/.claude/` exists | Claude Code (global) | +| `~/.cursor/` exists | Cursor | +| `~/.codex/` exists | Codex CLI (global) | +| `~/.kimi/` exists | Kimi | +| `~/.continue/` exists | Continue.dev | +| `~/.opencode/` exists | OpenCode | +| where.exe cursor, where.exe code, etc. | Host CLI on PATH | + +--- + +## Detection Returns + +Each probe returns a unified result shape: + +```json +{ + "id": "claude-global-skills", + "category": "skills", + "label": "Claude Code (global skills)", + "path": "C:\\Users\\jerem\\.claude\\skills", + "exists": true, + "size": 12, + "unit": "SKILL.md files", + "alreadyManaged": true, + "contentSummary": "React, TypeScript, Python, ..." +} +``` + +`category` is one of: `skills`, `rules`, `instructions`, `mcp`, `host`. + +--- + +## Onboarding Flow (Revised) + +Four steps, but the emphasis shifts from "configure your tools" to "scan, review, done." + +### Step 1: System Scan (was "Connect") + +Auto-run on open. Shows a spinner for 2-5s while CE probes every known path. 
+ +Result: a grouped list: + +``` +Context Engine found AI tooling across your system: + +Skills 3 locations · 247 skill files + Claude Code (global) ~/.claude/skills 12 SKILL.md [linked] + OpenCode (global) ~/.opencode/skills 230 SKILL.md [linked] + Codex CLI (project) project/.codex/ 5 SKILL.md [link] + +Rules 3 locations · 2 files + 1 directory + Cursor rules .cursorrules 53KB [link] + Continue.dev .continue/rules/ 1 rule [link] + Cline / Roo .clinerules 17KB [link] + +Instructions 3 files + OpenCode / Codex AGENTS.md 18KB [link] + Claude Code CLAUDE.md 3.5KB [link] + Project conventions CONVENTIONS.md 8.7KB [link] + +MCP Servers 2 found + Claude Desktop claude_desktop_config.json 4 servers [link] + Cline cline_mcp_settings.json 2 servers [link] + +Hosts Detected 6 tools + Claude Code ✓ Cursor ✓ Codex CLI ✓ Continue ✓ Kimi ✓ OpenCode ✓ + +[ Link All ] [ Review Individually ] +``` + +### Step 2: Verify & Customize + +Folded into step 1 if "Review Individually" is clicked, or shown as an expandable section after auto-link: + +- Checkboxes per probe row +- "Path to skills folder:" text input + Browse button +- Linked sources show with an Unlink affordance +- Paths that are already in CE's `.context/` show as "Managed" + +### Step 3: Build & Confirm + +- CE imports all confirmed sources +- Rebuilds vector index (shows progress) +- Writes compiled context to `app/.context/` +- Shows result: "247 skills indexed | 53 rules consolidated | 6 MCP servers registered" + +### Step 4: Done + +Dashboard. No celebration step. The modal closes and the app is live. + +--- + +## What "Link" Actually Means Now + +Not just tracking a source in JSON. Full pipeline: + +1. **Register** the source in `skill-sources.json` +2. **Copy or junction** the content into `app/.context/` + - Directories → NTFS junction (instant, zero-copy) + - Files → copy (until Windows symlinks are available) +3. **Ingest** skills into CE's vector index +4. **Register** MCP servers in CE's MCP registry +5. 
**Scan** rule/instruction content for dedup against existing CE rules +6. **Write** compiled output back to `app/.context/` +7. **Update** the dashboard stat grid immediately + +--- + +## Post-Onboarding: "Set & Forget" + +After onboarding, CE maintains itself: + +### Periodic Health Check + +Every 24h (configurable): + +- Re-scan known paths for new/removed skills +- Flag stale index +- Check if new AI tools were installed (detect new `~/.<tool>` directories) +- Show notification: "New skills found in ~/.claude/skills — Link?" + +### Auto-Compile on Change + +When skills are added to or removed from any linked source: + +- CE detects the change (on next read or periodic scan) +- Marks index as stale +- Shows: "Context has changed: [N] skills added, [M] removed. Rebuild?" + +### Manual Overrides + +- Connections tab shows all linked sources with per-source controls +- Unlink, Re-scan, Force Rebuild per source +- Custom path picker for power users + +--- + +## What the User Never Does + +- Never edits a config file directly +- Never copies skills between tool directories +- Never wonders which `.cursorrules` is current +- Never maintains duplicate `AGENTS.md` and `CLAUDE.md` +- Never manually wires MCP servers into each host + +--- + +## Implementation Plan + +### Phase 1: Broadened Scan + Onboarding (this sprint) + +| Task | Est. 
| +| -------------------------------------------------------------------------------------------------------------------------- | ---- | +| Rewrite `scanHostSkillPaths()` → `scanSystem()` with all probe categories | 2d | +| Add probe functions: probeRuleFiles, probeInstructionFiles, probeMcpServers, probeHostConfigs | 2d | +| New data model: `data/system-context.json` with full detected state | 1d | +| Add endpoints: `GET /api/system/scan`, `POST /api/system/link-all`, `POST /api/system/link`, `DELETE /api/system/link/:id` | 2d | +| Rewrite onboarding UI with the 4-step scan → review → build → done flow | 3d | +| Update dashboard stat grid to reflect linked sources | 1d | +| Smoke test: scan against fixture directories, verify all categories detected | 1d | + +### Phase 2: Import Pipeline + MCP Registration + +| Task | Est. | +| ----------------------------------------------------------------------- | ---- | +| Auto-junction for directory sources (moved into `.context/<sourceId>/`) | 1d | +| Rule/instruction file import into `app/.context/` | 1d | +| MCP server registration from detected configs | 2d | +| "Set & forget" periodic health check (24h timer) | 2d | +| Notification system for new/changed sources | 1d | +| Connections tab UI (post-onboarding source management) | 2d | + +### Phase 3: CE as Sole Author + +| Task | Est. 
| +| ------------------------------------------------------------------- | ---- | +| CE rewrite root tool files on every compile (not just home dir) | 2d | +| CE writes to `app/.context/` AND to root-level junctions/copies | 2d | +| Kill the need for tool-specific config edits entirely | 1d | +| Benchmark: measure time from "install CE" to "all tools configured" | 1d | diff --git a/electron/main.cjs b/electron/main.cjs index 6d1cc45..4a71642 100644 --- a/electron/main.cjs +++ b/electron/main.cjs @@ -34,10 +34,14 @@ const newUserProfile = if (newUserProfile) { // Test isolation: keep both userData AND the writable CE_ROOT under the // repo so a smoke run never leaks into a real user's data. - const userDataPath = path.join(__dirname, '..', '..', '.electron-user-data'); + const profileRoot = + process.env.CE_ROOT || + process.argv.find((arg) => arg.startsWith('--ce-root='))?.slice('--ce-root='.length) || + path.join(__dirname, '..', '..', '.tmp', 'new-user-profile'); + const userDataPath = path.join(profileRoot, '.electron-user-data'); fs.mkdirSync(userDataPath, { recursive: true }); app.setPath('userData', userDataPath); - if (!process.env.CE_ROOT) process.env.CE_ROOT = userDataPath; + if (!process.env.CE_ROOT) process.env.CE_ROOT = profileRoot; console.log(`[ce-electron] isolated userData: ${userDataPath}`); } else if (app.isPackaged && !process.env.CE_ROOT) { const userData = app.getPath('userData'); @@ -173,7 +177,11 @@ function createWindow() { icon: appIconPath, show: false, titleBarStyle: 'hidden', - titleBarOverlay: false, + titleBarOverlay: { + color: '#050309', + symbolColor: '#ffffff', + height: 32, + }, autoHideMenuBar: true, webPreferences: { preload: path.join(__dirname, 'preload.cjs'), @@ -185,6 +193,12 @@ function createWindow() { }, }); + const revealWindow = () => { + if (!mainWindow || mainWindow.isDestroyed() || mainWindow.isVisible()) return; + mainWindow.show(); + mainWindow.focus(); + }; + // Some dev-mode launches don't fully honour the 
constructor `icon` for the // taskbar entry. Explicitly setting it after construction is the reliable path. if (process.platform === 'win32' && fs.existsSync(appIconPath)) { @@ -196,11 +210,9 @@ function createWindow() { } void mainWindow.loadURL(`http://127.0.0.1:${PORT}/`); - mainWindow.once('ready-to-show', () => { - if (!mainWindow) return; - mainWindow.show(); - mainWindow.focus(); - }); + mainWindow.once('ready-to-show', revealWindow); + mainWindow.webContents.once('did-finish-load', revealWindow); + setTimeout(revealWindow, 5000); if (smokeMode) { mainWindow.webContents.once('did-finish-load', () => { console.log('electron launch smoke ok'); diff --git a/scripts/onboarding-smoke.cjs b/scripts/onboarding-smoke.cjs index b7581c5..0d96459 100644 --- a/scripts/onboarding-smoke.cjs +++ b/scripts/onboarding-smoke.cjs @@ -69,23 +69,24 @@ async function run() { const discovery = await js( win, `(() => ({ - heading: document.querySelector('#onboarding-title')?.textContent || '', - hosts: document.querySelectorAll('.onboarding-host').length, - stats: document.querySelectorAll('.onboarding-stat').length, + heading: document.querySelector('.ob-title')?.textContent || '', + scanHeading: document.querySelector('.ob-step-head h2')?.textContent || '', + locations: document.querySelectorAll('.ob-row').length, }))()`, ); - assert(/Welcome to Context Engine/i.test(discovery.heading), 'Onboarding heading is missing'); - assert(discovery.hosts >= 1, 'Expected detected host cards'); + assert(/Onboarding/i.test(discovery.heading), 'Onboarding heading is missing'); + assert(/Where should we look/i.test(discovery.scanHeading), 'Scan setup heading is missing'); + assert(discovery.locations >= 3, 'Expected default scan location rows'); await js(win, `Onboarding.go(2)`); - await waitFor(win, `(() => document.querySelectorAll('.onboarding-stat').length >= 4)()`); - await js(win, `Onboarding.go(4)`); - await waitFor(win, `(() => /Final health check/.test(document.body.innerText))()`); + 
await waitFor(win, `(() => /Build vector index/.test(document.body.innerText))()`); + await js(win, `Onboarding.go(3)`); + await waitFor(win, `(() => /All set/.test(document.body.innerText))()`); await js( win, `(() => { - const buttons = [...document.querySelectorAll('.onboarding-footer .save-btn')]; - const finish = buttons.find((button) => /Finish setup/.test(button.textContent || '')); + const buttons = [...document.querySelectorAll('.ob-actions .save-btn')]; + const finish = buttons.find((button) => /Go to dashboard/.test(button.textContent || '')); if (!finish) throw new Error('Finish setup button missing'); finish.click(); })()`, diff --git a/scripts/quick-search.cjs b/scripts/quick-search.cjs new file mode 100644 index 0000000..7193522 --- /dev/null +++ b/scripts/quick-search.cjs @@ -0,0 +1,46 @@ +/** + * quick-search.cjs — Quick search test against running CE. + * Usage: node scripts/quick-search.cjs [port=3847] + */ +const http = require('http'); +const port = parseInt(process.argv[2], 10) || 3847; +const query = process.argv[3] || 'What is using my CPU right now and how do I find the worst offender?'; + +function request(method, urlPath, body) { + return new Promise((resolve, reject) => { + const payload = body ? JSON.stringify(body) : null; + const opts = { + host: '127.0.0.1', + port, + path: urlPath, + method, + timeout: 30000, + headers: payload + ? { 'Content-Type': 'application/json', 'Content-Length': Buffer.byteLength(payload) } + : {}, + }; + const req = http.request(opts, (res) => { + let d = ''; + res.on('data', (c) => (d += c)); + res.on('end', () => { + try { + resolve(JSON.parse(d)); + } catch { + reject(new Error(d.slice(0, 200))); + } + }); + }); + req.on('timeout', () => req.destroy(new Error('Timeout'))); + req.on('error', reject); + if (payload) req.write(payload); + req.end(); + }); +} + +request('POST', '/api/search', { query, limit: 10 }).then((data) => { + (data.results || []).forEach((r, i) => + console.log( + `${i + 1}. 
${r.skillId} (score=${(r.score || 0).toFixed(3)}, lex=${(r.lexicalScore || 0).toFixed(3)}) — ${r.section}`, + ), + ); +}); diff --git a/scripts/retrieval-smoke.js b/scripts/retrieval-smoke.js new file mode 100644 index 0000000..750f53d --- /dev/null +++ b/scripts/retrieval-smoke.js @@ -0,0 +1,213 @@ +// @ts-check +/** + * retrieval-smoke.js — Retrieval-quality smoke gate. + * + * Connects to CE (starts one if --start is given), registers fixture skills, + * rebuilds the vector index, then verifies expected_source Recall@8 = 1.00 + * for every task in gold-answers.json. Exits non-zero on failure. + * + * Usage: + * node scripts/retrieval-smoke.js # connect to existing CE + * node scripts/retrieval-smoke.js --port 3847 # specify port + * node scripts/retrieval-smoke.js --start # start CE then test + * node scripts/retrieval-smoke.js --fixtures bench/fixtures/skills + */ + +const http = require('http'); +const path = require('path'); +const fs = require('fs'); + +const HERE = path.resolve(__dirname, '..'); +const GOLD = path.join(HERE, 'bench', 'gold-answers.json'); +const TASKS = path.join(HERE, 'bench', 'tasks.json'); +const FIXTURES = path.join(HERE, 'bench', 'fixtures', 'skills'); +const REQUEST_TIMEOUT_MS = Number(process.env.CE_RETRIEVAL_SMOKE_TIMEOUT_MS || 300000); + +/** @type {import('http').Server | null} */ +let server = null; +let activePort = 3847; + +/** + * @param {string} method + * @param {string} urlPath + * @param {unknown=} body + */ +function request(method, urlPath, body) { + return new Promise((resolve, reject) => { + const payload = body !== undefined ? JSON.stringify(body) : null; + const opts = { + host: '127.0.0.1', + port: activePort, + path: urlPath, + method, + timeout: REQUEST_TIMEOUT_MS, + headers: payload + ? 
{ 'Content-Type': 'application/json', 'Content-Length': Buffer.byteLength(payload) } + : {}, + }; + const req = http.request(opts, (res) => { + let data = ''; + res.on('data', (chunk) => { + data += chunk; + }); + res.on('end', () => { + try { + resolve(JSON.parse(data)); + } catch { + reject(new Error(`Invalid JSON: ${data.slice(0, 200)}`)); + } + }); + }); + req.on('timeout', () => req.destroy(new Error('Request timed out'))); + req.on('error', reject); + if (payload) req.write(payload); + req.end(); + }); +} + +async function run() { + // Re-parse args inside run scope + const innerArgv = process.argv.slice(2); + let innerPort = 3847; + let innerStart = false; + let fixtureArg = FIXTURES; + for (let i = 0; i < innerArgv.length; i++) { + if (innerArgv[i] === '--port') { + innerPort = parseInt(/** @type {string} */ (innerArgv[i + 1]), 10) || innerPort; + i++; + } + if (innerArgv[i] === '--start') { + innerStart = true; + } + if (innerArgv[i] === '--fixtures') { + fixtureArg = path.resolve(HERE, /** @type {string} */ (innerArgv[i + 1])); + i++; + } + } + const fixturePath = fixtureArg; + + // Validate inputs + const gold = JSON.parse(fs.readFileSync(GOLD, 'utf8')); + const tasks = JSON.parse(fs.readFileSync(TASKS, 'utf8')); + const tasksGold = gold.tasks || {}; + + console.log(`Retrieval smoke gate`); + console.log(` Fixtures: ${fixturePath}`); + console.log(` CE port: ${innerPort}`); + console.log(` Tasks: ${tasks.length}`); + console.log(); + + activePort = innerPort; + + if (innerStart) { + process.env.CE_PORT = String(innerPort); + const { PORT } = require('../server/lib/config'); + const { startServer } = require('../server/server'); + const srv = startServer({ port: PORT, refresh: false }); + await new Promise((resolve) => srv.once('listening', resolve)); + server = srv; + console.log(' CE started on port', PORT); + } + + // Verify CE is reachable + try { + const hc = await request('GET', `/api/health`); + if (!hc?.skills) { + throw new Error('CE health check 
failed'); + } + console.log(` CE reachable: ${Object.keys(hc.skills || {}).length} skill counts`); + } catch (e) { + throw new Error(`CE not reachable on port ${innerPort}: ${e instanceof Error ? e.message : String(e)}`); + } + + try { + // Register fixture skills as a source + const addResult = await request('POST', '/api/skill-sources', { + path: fixturePath, + label: 'benchmark-fixtures', + }); + if (!addResult.ok && !addResult.error.includes('already linked')) { + throw new Error(`Failed to add fixtures: ${addResult.error}`); + } + console.log(' Fixtures registered.'); + + // Rebuild vector index + console.log(' Rebuilding index...'); + const indexResult = await request('POST', '/api/index'); + if (!indexResult.ok) { + throw new Error(`Index rebuild failed: ${indexResult.error}`); + } + console.log(` Index built: ${indexResult.chunks} chunks, ${indexResult.skills} skills`); + + // Test each task + let failures = 0; + let totalExpected = 0; + let totalFound = 0; + + for (const task of tasks) { + const tid = task.id; + const goldTask = tasksGold[tid]; + if (!goldTask) { + console.log(` ⚠ ${tid}: no gold data, skipping`); + continue; + } + + const expected = goldTask.expected_sources || []; + if (!expected.length) { + console.log(` ✓ ${tid}: no expected sources`); + continue; + } + + totalExpected += expected.length; + + // Use a generous limit to ensure we catch everything + const searchResult = await request('POST', '/api/search', { + query: task.prompt, + limit: 8, + }); + + // External sources prefix skillIds as `:`. + // Accept both prefixed and bare matches. 
+ const retrieved = (searchResult.results || []).map(/** @param {any} r */ (r) => r.skillId); + const uniqueRetrieved = [...new Set(retrieved)]; + /** @param {string} expectedId */ + const matches = (expectedId) => + uniqueRetrieved.some((rid) => rid === expectedId || rid.endsWith(':' + expectedId)); + const hits = expected.filter((/** @type {string} */ s) => matches(s)); + + totalFound += hits.length; + + if (hits.length === expected.length) { + console.log(` ✓ ${tid}: R@8 = ${hits.length}/${expected.length} ${JSON.stringify(expected)}`); + } else { + failures++; + const missed = expected.filter((/** @type {string} */ s) => !matches(s)); + console.log( + ` ✗ ${tid}: R@8 = ${hits.length}/${expected.length} — missed: ${JSON.stringify(missed)}`, + ); + console.log(` retrieved: ${JSON.stringify(uniqueRetrieved)}`); + } + } + + const recall = totalExpected > 0 ? totalFound / totalExpected : 1; + console.log(); + console.log(` Overall R@8: ${(recall * 100).toFixed(1)}% (${totalFound}/${totalExpected})`); + + if (failures > 0) { + console.log(`\n FAIL: ${failures} task(s) with incomplete retrieval`); + process.exitCode = 1; + } else if (recall < 1.0) { + console.log(`\n FAIL: Overall R@8 < 1.00`); + process.exitCode = 1; + } else { + console.log(`\n PASS: All tasks achieve R@8 = 1.00`); + } + } finally { + if (server !== null) server.close(); + } +} + +run().catch((error) => { + console.error('Fatal:', error.message); + process.exitCode = 1; +}); diff --git a/scripts/setup-benchmark.cjs b/scripts/setup-benchmark.cjs new file mode 100644 index 0000000..243d326 --- /dev/null +++ b/scripts/setup-benchmark.cjs @@ -0,0 +1,106 @@ +/** + * setup-benchmark.cjs — Register fixtures, rebuild index, verify retrieval. + * + * Usage: + * node scripts/setup-benchmark.cjs + * + * This: + * 1. Registers bench/fixtures/skills/ as a skill source + * 2. Rebuilds the vector index + * 3. 
Runs a quick retrieval health check + */ + +const http = require('http'); +const path = require('path'); + +const CE_HOST = '127.0.0.1'; +const CE_PORT = 3847; +const FIXTURES = path.resolve(__dirname, '..', 'bench', 'fixtures', 'skills'); + +function request(method, urlPath, body) { + return new Promise((resolve, reject) => { + const payload = body !== undefined ? JSON.stringify(body) : null; + const opts = { + host: CE_HOST, + port: CE_PORT, + path: urlPath, + method, + timeout: 120000, + headers: payload + ? { 'Content-Type': 'application/json', 'Content-Length': Buffer.byteLength(payload) } + : {}, + }; + const req = http.request(opts, (res) => { + let data = ''; + res.on('data', (chunk) => { + data += chunk; + }); + res.on('end', () => { + try { + resolve(JSON.parse(data)); + } catch { + reject(new Error(`Invalid JSON: ${data.slice(0, 200)}`)); + } + }); + }); + req.on('timeout', () => req.destroy(new Error('Request timed out'))); + req.on('error', reject); + if (payload) req.write(payload); + req.end(); + }); +} + +async function main() { + console.log('=== Benchmark Setup ===\n'); + + // Step 1: Register fixtures + console.log('1. Registering fixture skills...'); + const addResult = await request('POST', '/api/skill-sources', { + path: FIXTURES, + label: 'benchmark-fixtures', + }); + if (addResult.ok) { + console.log(` Registered: ${addResult.source.id} -> ${addResult.source.path}`); + } else if (addResult.error && addResult.error.includes('already linked')) { + console.log(' Already registered (skipping).'); + } else { + console.error(` FAILED: ${addResult.error}`); + process.exitCode = 1; + return; + } + + // Step 2: Rebuild index + console.log('\n2. 
Rebuilding vector index...'); + const indexResult = await request('POST', '/api/index'); + if (indexResult.ok) { + console.log( + ` Index built: ${indexResult.chunks} chunks, ${indexResult.skills} skills, model=${indexResult.model}`, + ); + } else { + console.error(` FAILED: ${indexResult.error}`); + process.exitCode = 1; + return; + } + + // Step 3: Quick health check + console.log('\n3. Quick search test...'); + const searchResult = await request('POST', '/api/search', { + query: 'process CPU usage Windows task manager', + limit: 5, + }); + if (searchResult.ok) { + const ids = (searchResult.results || []).map((r) => r.skillId); + console.log(` Top results: ${ids.join(', ')}`); + } else { + console.error(` FAILED: ${searchResult.error}`); + process.exitCode = 1; + return; + } + + console.log('\n=== Setup complete ==='); +} + +main().catch((err) => { + console.error('Fatal:', err.message); + process.exitCode = 1; +}); diff --git a/scripts/vectorstore-smoke.js b/scripts/vectorstore-smoke.js index dae6731..dc93aaa 100644 --- a/scripts/vectorstore-smoke.js +++ b/scripts/vectorstore-smoke.js @@ -10,6 +10,7 @@ const { upsertVectors, replaceVectors, searchVectors, + hybridSearch, cosineSimilarity, markIndexStale, clearIndexStale, @@ -109,6 +110,50 @@ try { const emptyResults = searchVectors(loadVectorStore(path.join(tmpDir, 'nonexistent.json')), [1, 0]); assert.deepStrictEqual(emptyResults, [], 'search on empty store returns empty'); + // ---- hybridSearch result diversity ---- + + const duplicateStore = replaceVectors( + [ + { + id: 'source-a:launcher:overview:1', + skillId: 'source-a:launcher', + section: 'Overview', + text: 'Launch apps in a repeatable morning routine.', + type: 'knowledge', + sourcePath: 'source-a/launcher/SKILL.md', + vector: [1, 0], + }, + { + id: 'source-b:launcher:overview:1', + skillId: 'source-b:launcher', + section: 'Overview', + text: 'Launch apps for startup automation.', + type: 'knowledge', + sourcePath: 'source-b/launcher/SKILL.md', + 
vector: [0.99, 0], + }, + { + id: 'calendar-helper:overview:1', + skillId: 'calendar-helper', + section: 'Overview', + text: 'Prepare a morning calendar checklist.', + type: 'knowledge', + sourcePath: 'calendar-helper/SKILL.md', + vector: [0.8, 0], + }, + ], + 'fixture-model', + ); + const diverseResults = hybridSearch(duplicateStore, [1, 0], 'morning launch apps', { + limit: 2, + diversifyBySkill: true, + }); + assert.deepStrictEqual( + diverseResults.map((r) => r.skillId), + ['source-a:launcher', 'calendar-helper'], + 'hybridSearch can diversify linked copies by bare skill ID', + ); + // ---- cosineSimilarity edge cases ---- assert.strictEqual(cosineSimilarity([0, 0], [1, 0]), 0, 'zero vector returns 0'); diff --git a/server/compiler.js b/server/compiler.js index 604939a..56f10be 100644 --- a/server/compiler.js +++ b/server/compiler.js @@ -168,8 +168,20 @@ ${flattenSection(ctx.rules.soul, ['soft'])}`); } if (ctx.activeSkills.length) { + // Manifest: list every selected skill with description const skillList = ctx.activeSkills.map((s) => `- **${s.id}**: ${s.desc}`).join('\n'); sections.push(`## Skills\n${skillList}`); + + // Multi-resolution chunks: include matched chunks when available + if (ctx.mrContext) { + const selectedIds = new Set(ctx.activeSkills.map((s) => s.id)); + for (const [skillId, mrc] of Object.entries(ctx.mrContext)) { + if (!selectedIds.has(skillId)) continue; + if (!mrc.chunks.length) continue; + const chunkParts = mrc.chunks.slice(0, 3).map((chunk, i) => `### ${chunk.section}\n${chunk.text}`); + sections.push(`## ${skillId} — Relevant knowledge\n${chunkParts.join('\n\n')}`); + } + } } return sections.join('\n\n'); @@ -434,6 +446,7 @@ function buildContext(opts) { sessionStart: rules?.sessionStart || '', activeSkills, totalSkills: allSkills.length, + mrContext: opts.mrContext || null, }; } diff --git a/server/lib/intelligence-routes.js b/server/lib/intelligence-routes.js index d9768b3..ebe1e91 100644 --- a/server/lib/intelligence-routes.js +++ 
b/server/lib/intelligence-routes.js @@ -11,7 +11,7 @@ const { saveVectorStore, upsertVectors, replaceVectors, - searchVectors, + hybridSearch, clearIndexStale, getIndexStale, } = require('./vectorstore'); @@ -141,7 +141,7 @@ async function handleIntelligenceRequest(req, res, url, deps) { return json(res, { ok: true, query, - results: searchVectors(store, embedded.vectors[0] || [], { limit }), + results: hybridSearch(store, embedded.vectors[0] || [], query, { limit, diversifyBySkill: true }), model: embedded.model, }); } diff --git a/server/lib/skills.js b/server/lib/skills.js index f1dd64c..e0fd331 100644 --- a/server/lib/skills.js +++ b/server/lib/skills.js @@ -592,11 +592,50 @@ ${JSON.stringify(skills).slice(0, 50000)}`, } } +function listSkillNames(dir) { + const names = []; + const walk = (d, cat) => { + let items; + try { + items = fs.readdirSync(d).sort((a, b) => a.localeCompare(b)); + } catch { + return; + } + for (const item of items) { + const full = path.join(d, item); + let stat; + try { + stat = fs.statSync(full); + } catch { + continue; + } + if (!stat.isDirectory()) continue; + const skillFile = path.join(full, 'SKILL.md'); + if (fs.existsSync(skillFile)) { + let name = item; + try { + const content = fs.readFileSync(skillFile, 'utf8'); + const fm = parseSkillFrontmatter(content); + if (fm.name) name = fm.name; + } catch { + /* use dirname */ + } + names.push({ bareId: item, name, cat: cat || 'Uncategorized' }); + } else { + walk(full, cat ? 
`${cat}/${item}` : item); + } + } + }; + walk(dir); + return names; +} + module.exports = { scanSkills, invalidateSkillCache, skillHealthCheck, countSkillFiles, + listSkillNames, llmParseSkill, parseAllNeedingParse, llmReviewSimilarSkills, diff --git a/server/lib/smart-compile.js b/server/lib/smart-compile.js index 9551358..4d7ef71 100644 --- a/server/lib/smart-compile.js +++ b/server/lib/smart-compile.js @@ -4,7 +4,7 @@ const fs = require('fs'); const path = require('path'); const { DATA_DIR, SKILLS_DIR } = require('./config'); const { embedTexts, DEFAULT_EMBED_MODEL } = require('./embeddings'); -const { loadVectorStore, searchVectors } = require('./vectorstore'); +const { loadVectorStore, hybridSearch } = require('./vectorstore'); const { compile, buildContext, estimateTokens, ADAPTERS } = require('../compiler'); /** @@ -29,15 +29,21 @@ async function smartCompile(input, deps) { const embedded = await embedTexts([query], { model: store.model || DEFAULT_EMBED_MODEL }); if (!embedded.ok) return { ok: false, error: embedded.error, model: embedded.model, status: 503 }; - const matches = searchVectors(store, embedded.vectors[0] || [], { limit: 60 }); + // Use hybrid search for better lexical matching + const matches = hybridSearch(store, embedded.vectors[0] || [], query, { limit: 60 }); const rankedSkills = rankSkillMatches(matches); const selectedSkillIds = fitSkillsToBudget(rankedSkills, input, deps, targets); + + // Multi-resolution output: include matched chunks alongside full skill bodies + const mrContext = buildMultiResolutionContext(matches, selectedSkillIds); + const result = compile({ dataDir: DATA_DIR, skillsDir: SKILLS_DIR, scanSkills: deps.scanSkills, targets, selectedSkillIds, + mrContext, }); const allOn = estimateAllOn(deps, targets); const selectedTokens = Object.values(result.results || {}).reduce( @@ -167,4 +173,54 @@ function detectProjectStack(projectPath) { }; } -module.exports = { smartCompile, detectProjectStack, rankSkillMatches, 
normalizeSmartTargets }; +/** + * Build a multi-resolution context from vector search matches. + * Returns an object keyed by skill ID with matched chunks and relevance info. + * + * @param {Array} matches + * @param {string[]} selectedSkillIds + * @returns {Record }>} + */ +function buildMultiResolutionContext(matches, selectedSkillIds) { + const selected = new Set(selectedSkillIds); + /** @type {Record }>} */ + const result = {}; + + for (const match of matches) { + if (!selected.has(match.skillId)) continue; + if (!(/** @type {any} */ (result)[match.skillId])) { + result[match.skillId] = { skillId: match.skillId, score: match.score, chunks: [] }; + } + // Track unique chunks (by section + text hash) to avoid duplicates + /** @type {any} */ + const entry = result[match.skillId]; + const existing = entry.chunks; + const dup = existing.some( + (/** @type {{ section: string, text: string }} */ c) => + c.section === match.section && c.text === match.text, + ); + if (!dup) { + existing.push({ section: match.section, text: match.text, score: match.score }); + } + } + + // Sort chunks within each skill by score descending + for (const skillId of Object.keys(result)) { + const chunks = /** @type {{ section: string; text: string; score: number }[]} */ ( + /** @type {any} */ (result)[skillId].chunks + ); + chunks.sort( + (/** @type {{ score: number }} */ a, /** @type {{ score: number }} */ b) => b.score - a.score, + ); + } + + return result; +} + +module.exports = { + smartCompile, + detectProjectStack, + rankSkillMatches, + normalizeSmartTargets, + buildMultiResolutionContext, +}; diff --git a/server/lib/system-scan.js b/server/lib/system-scan.js new file mode 100644 index 0000000..02e9408 --- /dev/null +++ b/server/lib/system-scan.js @@ -0,0 +1,752 @@ +// @ts-check +// system-scan.js — Probes the system for AI tools, skills, rules, configs, and opportunities. +// Returns data grouped by host app so the UI can render per-app cards. 
+ +const fs = require('fs'); +const path = require('path'); +const { HOMEDIR, DATA_DIR, SKILLS_DIR } = require('./config'); +const { countSkillFiles, listSkillNames } = require('./skills'); + +// ---- Host definitions ---- + +const HOSTS = [ + { id: '.claude', label: 'Claude Code', icon: 'claude' }, + { id: '.cursor', label: 'Cursor', icon: 'cursor' }, + { id: '.windsurf', label: 'Windsurf', icon: 'windsurf' }, + { id: '.codex', label: 'Codex CLI', icon: 'openai' }, + { id: '.opencode', label: 'OpenCode', icon: 'opencode' }, + { id: '.continue', label: 'Continue', icon: 'continue' }, + { id: '.roo', label: 'Roo CLI', icon: 'cline' }, + { id: '.cline', label: 'Cline', icon: 'cline' }, + { id: '.kimi', label: 'Kimi K2', icon: 'kimi' }, + { id: '.goose', label: 'Goose', icon: 'goose' }, + { id: '.amp', label: 'Amp', icon: 'sourcegraph' }, + { id: '.kiro', label: 'Kiro', icon: 'kiro' }, + { id: '.antigravity', label: 'Antigravity', icon: 'antigravity' }, + { id: '.gemini', label: 'Gemini', icon: 'gemini' }, + { id: '.augment', label: 'Augment', icon: 'augment' }, + { id: '.pearai', label: 'PearAI', icon: 'pearai' }, + { id: '.void', label: 'Void', icon: 'void' }, +]; + +const RULE_FILE_NAMES = [ + '.clinerules', + '.cursorrules', + '.windsurfrules', + '.rules', + '.ampcoderc', + '.goosehints', +]; + +const INSTRUCTION_FILE_NAMES = [ + 'AGENTS.md', + 'CLAUDE.md', + 'GEMINI.md', + 'devin.md', + 'CONVENTIONS.md', + '.kimi-system-prompt.md', + '.github/copilot-instructions.md', + 'CONTEXT.md', + 'steering.md', +]; + +const CONFIG_FILE_NAMES = [ + 'settings.json', + 'config.json', + 'config.toml', + 'kimi.json', + 'mcp.json', + 'claude_desktop_config.json', +]; + +const OPPORTUNITY_FILES = { + '.claude': 'CLAUDE.md', + '.cursor': '.cursorrules', + '.windsurf': '.windsurfrules', + '.codex': 'instructions.md', + '.opencode': null, + '.continue': null, + '.roo': null, + '.cline': '.clinerules', + '.kimi': '.kimi-system-prompt.md', + '.goose': '.goosehints', + '.amp': 
'.ampcoderc', + '.kiro': '.kiro/steering.md', + '.antigravity': null, + '.gemini': 'GEMINI.md', + '.augment': '.augment-guidelines', + '.pearai': '.pearai', + '.void': null, +}; + +const IDE_PROBE_PATHS = [ + { + exe: 'Code.exe', + label: 'VS Code', + dirs: [ + '%LOCALAPPDATA%\\Programs\\Microsoft VS Code', + '%ProgramFiles%\\Microsoft VS Code', + '%ProgramFiles(x86)%\\Microsoft VS Code', + ], + }, + { + exe: 'Cursor.exe', + label: 'Cursor', + dirs: ['%LOCALAPPDATA%\\Programs\\cursor', '%ProgramFiles%\\Cursor'], + }, + { + exe: 'Windsurf.exe', + label: 'Windsurf', + dirs: ['%LOCALAPPDATA%\\Programs\\windsurf', '%ProgramFiles%\\Windsurf'], + }, + { exe: 'Kiro.exe', label: 'Kiro', dirs: ['%LOCALAPPDATA%\\Programs\\Kiro'] }, + { + exe: 'Antigravity.exe', + label: 'Antigravity', + dirs: ['%LOCALAPPDATA%\\Programs\\Antigravity', '%ProgramFiles%\\Antigravity'], + }, + { + exe: 'idea64.exe', + label: 'IntelliJ IDEA', + dirs: ['%ProgramFiles%\\JetBrains\\IntelliJ IDEA*', '%LOCALAPPDATA%\\JetBrains\\IntelliJ IDEA*'], + }, + { + exe: 'pycharm64.exe', + label: 'PyCharm', + dirs: ['%ProgramFiles%\\JetBrains\\PyCharm*', '%LOCALAPPDATA%\\JetBrains\\PyCharm*'], + }, + { + exe: 'webstorm64.exe', + label: 'WebStorm', + dirs: ['%ProgramFiles%\\JetBrains\\WebStorm*', '%LOCALAPPDATA%\\JetBrains\\WebStorm*'], + }, + { + exe: 'rider64.exe', + label: 'Rider', + dirs: ['%ProgramFiles%\\JetBrains\\Rider*', '%LOCALAPPDATA%\\JetBrains\\Rider*'], + }, + { + exe: 'goland64.exe', + label: 'GoLand', + dirs: ['%ProgramFiles%\\JetBrains\\GoLand*', '%LOCALAPPDATA%\\JetBrains\\GoLand*'], + }, + { + exe: 'clion64.exe', + label: 'CLion', + dirs: ['%ProgramFiles%\\JetBrains\\CLion*', '%LOCALAPPDATA%\\JetBrains\\CLion*'], + }, + { exe: 'fleet.exe', label: 'JetBrains Fleet', dirs: ['%LOCALAPPDATA%\\Programs\\Fleet'] }, + { exe: 'sublime_text.exe', label: 'Sublime Text', dirs: ['%ProgramFiles%\\Sublime Text*'] }, + { + exe: 'notepad++.exe', + label: 'Notepad++', + dirs: ['%ProgramFiles%\\Notepad++', 
'%ProgramFiles(x86)%\\Notepad++'], + }, + { + exe: 'devenv.exe', + label: 'Visual Studio', + dirs: ['%ProgramFiles%\\Microsoft Visual Studio*', '%ProgramFiles(x86)%\\Microsoft Visual Studio*'], + }, + { exe: 'zed.exe', label: 'Zed', dirs: ['%LOCALAPPDATA%\\Programs\\Zed', '%ProgramFiles%\\Zed'] }, + { exe: 'Trae.exe', label: 'Trae', dirs: ['%LOCALAPPDATA%\\Programs\\Trae', '%ProgramFiles%\\Trae'] }, + { + exe: 'PearAI.exe', + label: 'PearAI', + dirs: ['%LOCALAPPDATA%\\Programs\\PearAI', '%ProgramFiles%\\PearAI'], + }, +]; + +const AI_EXTENSION_PATTERNS = [ + { pattern: 'github.copilot', label: 'GitHub Copilot' }, + { pattern: 'github.copilot-chat', label: 'GitHub Copilot Chat' }, + { pattern: 'openai.chatgpt', label: 'ChatGPT' }, + { pattern: 'continue', label: 'Continue' }, + { pattern: 'cline', label: 'Cline' }, + { pattern: 'roo-code', label: 'Roo Code' }, + { pattern: 'aider', label: 'Aider' }, + { pattern: 'codeium', label: 'Codeium' }, + { pattern: 'tabnine', label: 'Tabnine' }, + { pattern: 'supermaven', label: 'Supermaven' }, + { pattern: 'amazonwebservices.aws-toolkit', label: 'AWS Q' }, + { pattern: 'sourcegraph.cody', label: 'Cody (Sourcegraph)' }, +]; + +// ---- Helpers ---- + +function getDriveRoots() { + const drives = []; + if (process.platform === 'win32') { + for (let i = 65; i <= 90; i++) { + const root = `${String.fromCharCode(i)}:\\`; + try { + if (fs.statSync(root).isDirectory()) drives.push(root); + } catch { + /* skip */ + } + } + } + return drives; +} + +/** @param {string} env */ +function expandEnvVar(env) { + return env.replace( + /%([^%]+)%/g, + (_ /** @type {string} */, v /** @type {string} */) => process.env[v] || '', + ); +} + +/** @param {string} p */ +function isFile(p) { + try { + return fs.statSync(p).isFile(); + } catch { + return false; + } +} +/** @param {string} p */ +function isDir(p) { + try { + return fs.statSync(p).isDirectory(); + } catch { + return false; + } +} + +/** @param {string} p */ +function readJsonSafe(p) { + 
try { + return JSON.parse(fs.readFileSync(p, 'utf8')); + } catch { + return null; + } +} + +// ---- Probe functions ---- + +function probeIDEs() { + const found = []; + const seen = new Set(); + for (const ide of IDE_PROBE_PATHS) { + if (seen.has(ide.label)) continue; + let resolvedPath = null; + for (const dirPattern of ide.dirs) { + const base = expandEnvVar(dirPattern); + if (!base) continue; + if (base.includes('*')) { + const wildIdx = base.indexOf('*'); + const prefix = base.substring(0, wildIdx); + const parentDir = path.dirname(prefix); + try { + if (fs.statSync(parentDir).isDirectory()) { + const entries = fs.readdirSync(parentDir); + const match = entries + .filter((e) => e.startsWith(path.basename(prefix))) + .sort() + .pop(); + if (match) resolvedPath = path.join(parentDir, match); + } + } catch { + /* ignore */ + } + } + if (!resolvedPath) { + try { + if (fs.statSync(base).isDirectory()) resolvedPath = base; + } catch { + /* ignore */ + } + } + if (!resolvedPath) continue; + const exePath = path.join(resolvedPath, ide.exe); + try { + if (fs.statSync(exePath).isFile()) { + found.push({ + id: 'ide-' + ide.label.replace(/\s+/g, '-').toLowerCase(), + label: ide.label, + path: resolvedPath, + exe: exePath, + }); + seen.add(ide.label); + break; + } + } catch { + /* ignore */ + } + } + } + return found; +} + +function probeAIExtensions() { + // Map: IDE label → Set of AI extension labels found in that IDE's extensions dir + const ideExtDirs = [ + { label: 'VS Code', path: path.join(HOMEDIR, '.vscode', 'extensions') }, + { + label: 'Cursor', + path: path.join( + process.env.APPDATA || path.join(HOMEDIR, 'AppData', 'Roaming'), + 'Cursor', + 'extensions', + ), + }, + { label: 'Kiro', path: path.join(HOMEDIR, '.kiro', 'extensions') }, + { label: 'Antigravity', path: path.join(HOMEDIR, '.antigravity', 'extensions') }, + { label: 'Trae', path: path.join(HOMEDIR, '.trae', 'extensions') }, + { label: 'PearAI', path: path.join(HOMEDIR, '.pearai', 'extensions') }, + ]; 
+ /** @type {Record} */ + const perIde = {}; + for (const ide of ideExtDirs) { + if (!isDir(ide.path)) continue; + try { + const entries = fs.readdirSync(ide.path); + const found = []; + for (const ai of AI_EXTENSION_PATTERNS) { + if (entries.some((e) => e.startsWith(ai.pattern))) found.push(ai.label); + } + if (found.length > 0) perIde[ide.label] = [...new Set(found)]; + } catch { + /* ignore */ + } + } + return perIde; +} + +// ---- Host-grouped scan ---- + +/** @typedef {{ path: string, label: string, count: number, names: { bareId: string, name: string, cat: string }[], internal?: boolean }} SkillEntry */ +/** @typedef {{ path: string, label: string }} FileEntry */ +/** @typedef {{ path: string, count: number, servers: string[] }} McpEntry */ +/** @typedef {{ type: string, label: string, description: string }} OpportunityEntry */ + +/** + * @param {{ id: string, label: string, icon: string }} hostDef + * @param {string} homedir + */ +function probeHostDir(hostDef, homedir) { + const hostPath = path.join(homedir, hostDef.id); + if (!isDir(hostPath)) return null; + + /** @type {SkillEntry[]} */ + const skills = []; + /** @type {FileEntry[]} */ + const configs = []; + /** @type {FileEntry[]} */ + const instructions = []; + /** @type {FileEntry[]} */ + const rules = []; + /** @type {McpEntry[]} */ + const mcpServers = []; + /** @type {OpportunityEntry[]} */ + const opportunities = []; + + const result = { + id: hostDef.id, + label: hostDef.label, + icon: hostDef.icon, + path: hostPath, + skills, + configs, + instructions, + rules, + mcpServers, + opportunities, + }; + + // Skills: standard skill dirs + const skillDir = path.join(hostPath, 'skills'); + if (isDir(skillDir) && countSkillFiles(skillDir) > 0) { + result.skills.push({ + path: skillDir, + label: `${hostDef.label} skills`, + count: countSkillFiles(skillDir), + names: listSkillNames(skillDir), + }); + } + + // Skills: Claude plugin marketplace + if (hostDef.id === '.claude') { + const pluginDir = 
path.join(hostPath, 'plugins', 'marketplaces', 'claude-plugins-official', 'plugins'); + const externalDir = path.join( + hostPath, + 'plugins', + 'marketplaces', + 'claude-plugins-official', + 'external_plugins', + ); + if (isDir(pluginDir)) { + const count = countSkillFiles(pluginDir); + if (count > 0) + result.skills.push({ + path: pluginDir, + label: 'Claude Plugins (official)', + count, + names: listSkillNames(pluginDir), + }); + } + if (isDir(externalDir)) { + const count = countSkillFiles(externalDir); + if (count > 0) + result.skills.push({ + path: externalDir, + label: 'Claude Plugins (external)', + count, + names: listSkillNames(externalDir), + }); + } + } + + // Configs + for (const name of CONFIG_FILE_NAMES) { + const p = path.join(hostPath, name); + if (isFile(p)) result.configs.push({ path: p, label: name }); + } + // Special config locations + if (hostDef.id === '.cursor') { + const mcpPath = path.join(hostPath, 'mcp.json'); + if (isFile(mcpPath) && !result.configs.some((c) => c.path === mcpPath)) + result.configs.push({ path: mcpPath, label: 'mcp.json' }); + } + if (hostDef.id === '.claude') { + const desktopConfig = path.join( + process.env.APPDATA || path.join(HOMEDIR, 'AppData', 'Roaming'), + 'Claude', + 'claude_desktop_config.json', + ); + if (isFile(desktopConfig)) + result.configs.push({ path: desktopConfig, label: 'claude_desktop_config.json' }); + } + // Kiro: steering.md and settings + if (hostDef.id === '.kiro') { + const steering = path.join(hostPath, 'steering', 'steering.md'); + if (isFile(steering)) result.instructions.push({ path: steering, label: 'steering/steering.md' }); + const settings = path.join(hostPath, 'settings', 'settings.json'); + if (isFile(settings)) result.configs.push({ path: settings, label: 'settings.json' }); + } + // Antigravity: settings and AI extensions + if (hostDef.id === '.antigravity') { + const agSettings = path.join( + process.env.APPDATA || path.join(HOMEDIR, 'AppData', 'Roaming'), + 'Antigravity', + 
'User', + 'settings.json', + ); + if (isFile(agSettings)) result.configs.push({ path: agSettings, label: 'settings.json' }); + } + // Gemini: GEMINI.md and antigravity MCP config + if (hostDef.id === '.gemini') { + const agMcp = path.join(hostPath, 'antigravity', 'mcp_config.json'); + if (isFile(agMcp)) { + const json = readJsonSafe(agMcp); + const servers = json?.mcpServers || json?.servers || {}; + const count = Object.keys(servers).length; + if (count > 0) result.mcpServers.push({ path: agMcp, count, servers: Object.keys(servers) }); + else result.configs.push({ path: agMcp, label: 'antigravity/mcp_config.json' }); + } + } + + // Instructions + for (const name of INSTRUCTION_FILE_NAMES) { + const p = path.join(hostPath, name); + if (isFile(p)) result.instructions.push({ path: p, label: name }); + } + // Special instruction dirs + if (hostDef.id === '.claude') { + const projectsDir = path.join(hostPath, 'projects'); + if (isDir(projectsDir)) { + try { + for (const proj of fs.readdirSync(projectsDir)) { + const memDir = path.join(projectsDir, proj, 'memory'); + if (isDir(memDir)) { + try { + for (const f of fs.readdirSync(memDir)) { + if (f.endsWith('.md')) + result.instructions.push({ path: path.join(memDir, f), label: `memory/${f}` }); + } + } catch { + /* ignore */ + } + } + } + } catch { + /* ignore */ + } + } + } + + // Rules + for (const name of RULE_FILE_NAMES) { + const p = path.join(hostPath, name); + if (isFile(p)) result.rules.push({ path: p, label: name }); + } + if (hostDef.id === '.codex') { + const rulesDir = path.join(hostPath, 'rules'); + if (isDir(rulesDir)) { + try { + for (const f of fs.readdirSync(rulesDir)) { + const p = path.join(rulesDir, f); + if (isFile(p)) result.rules.push({ path: p, label: `rules/${f}` }); + } + } catch { + /* ignore */ + } + } + } + + // MCP servers from host config + const mcpConfigs = []; + if (hostDef.id === '.claude') { + mcpConfigs.push( + path.join( + process.env.APPDATA || path.join(HOMEDIR, 'AppData', 
'Roaming'), + 'Claude', + 'claude_desktop_config.json', + ), + ); + } + if (hostDef.id === '.codex') mcpConfigs.push(path.join(hostPath, 'mcp.json')); + if (hostDef.id === '.cursor') mcpConfigs.push(path.join(hostPath, 'mcp.json')); + if (hostDef.id === '.windsurf') mcpConfigs.push(path.join(hostPath, 'mcp.json')); + for (const mcpPath of mcpConfigs) { + if (!isFile(mcpPath)) continue; + const json = readJsonSafe(mcpPath); + const servers = json?.mcpServers || json?.mcp_servers || {}; + const count = Object.keys(servers).length; + if (count > 0) result.mcpServers.push({ path: mcpPath, count, servers: Object.keys(servers) }); + } + + // Opportunities (missing global config) + const expected = OPPORTUNITY_FILES[/** @type {keyof typeof OPPORTUNITY_FILES} */ (hostDef.id)]; + if (expected) { + const filePath = path.join(hostPath, expected); + const homedirFile = path.join(homedir, expected); + // If config doesn't exist inside host dir or at homedir root + if (!isFile(filePath) && !isFile(homedirFile)) { + result.opportunities.push({ + type: 'missing-global-config', + label: expected, + description: `${hostDef.label} does not have a global ${expected} file. Context Engine can create one from your rules.`, + }); + } + } + + return result; +} + +/** @param {Array<{id: string, label: string, path: string, exe: string}>} ideList */ +function probeIdegGroup(ideList) { + if (!ideList.length) return null; + const perIde = probeAIExtensions(); + return { + id: 'ides', + label: 'IDEs', + icon: 'vscode', + items: ideList, + extensions: perIde, + }; +} + +// ---- Main scan ---- + +/** + * @param {string[]} customPaths + * @param {{ skipDrives?: boolean, skipHomedir?: boolean, skipWorkspaces?: boolean }} [opts] + */ +function scanSystem(customPaths = [], opts = {}) { + const { skipDrives = false, skipHomedir = false, skipWorkspaces = false } = opts; + const workspaces = skipWorkspaces ? 
[] : readWorkspaces(); + + const hosts = []; + const seenHosts = new Set(); + + // Probe host dirs from homedir + if (!skipHomedir) { + for (const h of HOSTS) { + const data = probeHostDir(h, HOMEDIR); + if (data && !seenHosts.has(data.path)) { + seenHosts.add(data.path); + hosts.push(data); + } + } + } + + // Probe host dirs from drives + if (!skipDrives) { + for (const drive of getDriveRoots()) { + for (const h of HOSTS) { + const p = path.join(drive, h.id); + if (seenHosts.has(p)) continue; + if (isDir(p)) { + const isWin = process.platform === 'win32'; + // On Windows, drive-level host dirs overlap with homedir (same user) + // Skip if we already found this host from homedir + const homedirVersion = path.join(HOMEDIR, h.id); + if (isWin && isDir(homedirVersion) && seenHosts.has(homedirVersion)) continue; + const data = probeHostDir(h, drive); + if (data) { + seenHosts.add(p); + hosts.push(data); + } + } + } + } + } + + // Probe IDEs + const ideList = skipDrives && skipHomedir ? [] : probeIDEs(); + + // Custom paths: scan as additional skill sources + for (const cp of customPaths) { + if (!isDir(cp)) continue; + const count = countSkillFiles(cp); + if (count <= 0) continue; + let realPath = cp; + try { + realPath = fs.realpathSync(cp); + } catch { + /* use unresolved */ + } + const isWin = process.platform === 'win32'; + const internalReal = (() => { + try { + return fs.realpathSync(SKILLS_DIR); + } catch { + return SKILLS_DIR; + } + })(); + const internal = isWin + ? realPath.toLowerCase().startsWith(internalReal.toLowerCase() + path.sep) || + realPath.toLowerCase() === internalReal.toLowerCase() + : realPath.startsWith(internalReal + path.sep) || realPath === internalReal; + + // Check if this path falls inside an existing host dir + let matched = false; + for (const h of hosts) { + const hPath = isWin ? h.path.toLowerCase() : h.path; + const cPath = isWin ? 
cp.toLowerCase() : cp; + if (cPath.startsWith(hPath + path.sep) || cPath === hPath) { + // Add as skill under existing host + h.skills.push({ path: cp, label: path.basename(cp), count, names: listSkillNames(cp), internal }); + matched = true; + break; + } + } + if (!matched) { + // Create a standalone host entry for this custom path + hosts.push({ + id: 'custom-' + path.basename(cp).replace(/[^a-z0-9]/gi, '-'), + label: path.basename(cp), + icon: 'folder', + path: cp, + skills: [{ path: cp, label: path.basename(cp), count, names: listSkillNames(cp), internal }], + configs: [], + instructions: [], + rules: [], + mcpServers: [], + opportunities: [], + }); + } + } + + // Scan for standalone rule/instruction files in homedir root and workspaces + if (!skipHomedir) { + scanStandaloneFiles(HOMEDIR, hosts); + } + if (!skipDrives) { + for (const drive of getDriveRoots()) scanStandaloneFiles(drive, hosts); + } + for (const ws of workspaces) scanStandaloneFiles(ws, hosts); + + // Filter out hosts with nothing found (empty dirs) + const populated = hosts.filter( + (h) => + h.skills.length > 0 || + h.configs.length > 0 || + h.instructions.length > 0 || + h.rules.length > 0 || + h.mcpServers.length > 0 || + h.opportunities.length > 0, + ); + + // IDE group + const ides = probeIdegGroup(ideList); + + return { + hosts: populated, + ides: ides ? ides.items : [], + ideExtensions: ides ? 
ides.extensions : [], + workspaces, + }; +} + +/** @param {string} dir @param {Array} hosts */ +function scanStandaloneFiles(dir, hosts) { + if (!isDir(dir)) return; + // Check for rule/instruction files at the dir root that don't belong to a host dir + for (const name of RULE_FILE_NAMES) { + const p = path.join(dir, name); + if (!isFile(p)) continue; + // Skip if it's inside a host dir we already scanned + if (hosts.some((h) => p.startsWith(h.path + path.sep) || p === h.path)) continue; + // Attach to a "standalone" section + let standalone = hosts.find((h) => h.id === 'standalone-rules'); + if (!standalone) { + standalone = { + id: 'standalone-rules', + label: 'Standalone Files', + icon: 'folder', + path: '', + skills: [], + configs: [], + instructions: [], + rules: [], + mcpServers: [], + opportunities: [], + }; + hosts.push(standalone); + } + if (!standalone.rules.some(/** @param {FileEntry} r */ (r) => r.path === p)) + standalone.rules.push({ path: p, label: name }); + } + for (const name of INSTRUCTION_FILE_NAMES) { + const p = path.join(dir, name); + if (!isFile(p)) continue; + if (hosts.some((h) => p.startsWith(h.path + path.sep) || p === h.path)) continue; + let standalone = hosts.find((h) => h.id === 'standalone-rules'); + if (!standalone) { + standalone = { + id: 'standalone-rules', + label: 'Standalone Files', + icon: 'folder', + path: '', + skills: [], + configs: [], + instructions: [], + rules: [], + mcpServers: [], + opportunities: [], + }; + hosts.push(standalone); + } + if (!standalone.instructions.some(/** @param {FileEntry} i */ (i) => i.path === p)) + standalone.instructions.push({ path: p, label: name }); + } +} + +function readWorkspaces() { + try { + const raw = fs.readFileSync(path.join(DATA_DIR, 'workspaces.json'), 'utf8'); + const parsed = JSON.parse(raw); + if (Array.isArray(parsed?.workspaces)) { + return parsed.workspaces + .map((/** @type {any} */ w) => (typeof w === 'string' ? 
w : w?.path)) + .filter(Boolean); + } + } catch { + /* none */ + } + return []; +} + +module.exports = { scanSystem }; diff --git a/server/lib/validation.js b/server/lib/validation.js index 5150b70..639b262 100644 --- a/server/lib/validation.js +++ b/server/lib/validation.js @@ -20,9 +20,9 @@ function validateMemory(data) { function validateRules(data) { if (!data || typeof data !== 'object') return { valid: false, error: 'Must be a JSON object' }; if (data._parseError) return { valid: false, error: 'Invalid JSON in request body' }; - const codingPriorities = ['hard', 'soft']; - const generalPriorities = ['hard', 'soft']; - const soulPriorities = ['soft']; + const codingPriorities = ['hard', 'soft', 'style']; + const generalPriorities = ['hard', 'soft', 'style']; + const soulPriorities = ['soft', 'style']; const sections = [ { key: 'coding', allowed: codingPriorities }, { key: 'general', allowed: generalPriorities }, diff --git a/server/lib/vectorstore.js b/server/lib/vectorstore.js index dbe0f1c..fee8b01 100644 --- a/server/lib/vectorstore.js +++ b/server/lib/vectorstore.js @@ -123,7 +123,7 @@ function replaceVectors(records, model) { } /** - * @param {VectorStore} store + * @param {import('./vectorstore').VectorStore} store * @param {number[]} queryVector * @param {{ limit?: number, skillId?: string }=} options */ @@ -136,6 +136,265 @@ function searchVectors(store, queryVector, options = {}) { .slice(0, limit); } +/** + * Hybrid search: combine vector cosine score with lexical term matching. + * Lexical boosts chunks where the query terms appear in the skill ID (weight + * 0.4), section title (weight 0.3), or chunk text (weight 0.3). The final + * score is 0.6 * vectorScore + 0.4 * lexicalScore. + * + * @param {import('./vectorstore').VectorStore} store + * @param {number[]} queryVector + * @param {string} query Original query text for lexical matching. 
+ * @param {{ limit?: number, diversifyBySkill?: boolean }=} options + * @returns {Array} + */ +function hybridSearch(store, queryVector, query, options = {}) { + const limit = options.limit || 10; + const terms = extractQueryTerms(query); + if (!terms.length) { + const vectorResults = searchVectors(store, queryVector, { + limit: options.diversifyBySkill ? Infinity : limit, + }).map((/** @type {import('./vectorstore').VectorRecord & { score: number }} */ r) => ({ + ...r, + lexicalScore: 0, + })); + return limitSearchResults(vectorResults, limit, options); + } + + const results = normalizeStore(store) + .records.map((record) => { + const vectorScore = cosineSimilarity(queryVector, record.vector); + const lexicalScore = computeLexicalScore(record, terms); + return { + ...record, + score: 0.6 * vectorScore + 0.4 * lexicalScore, + lexicalScore, + }; + }) + .sort((a, b) => b.score - a.score); + return limitSearchResults(results, limit, options); +} + +/** + * @param {Array} results + * @param {number} limit + * @param {{ diversifyBySkill?: boolean }} options + * @returns {Array} + */ +function limitSearchResults(results, limit, options) { + if (!options.diversifyBySkill) return results.slice(0, limit); + + const picked = []; + const deferred = []; + const seenSkills = new Set(); + for (const result of results) { + const skillGroup = bareSkillId(result.skillId); + if (!seenSkills.has(skillGroup)) { + picked.push(result); + seenSkills.add(skillGroup); + } else { + deferred.push(result); + } + if (picked.length >= limit) return picked; + } + + for (const result of deferred) { + if (picked.length >= limit) break; + picked.push(result); + } + return picked; +} + +/** + * Source-linked skills use `:`. For search result diversity, + * group linked copies and built-in copies by their bare skill ID. 
+ * @param {string} skillId + */ +function bareSkillId(skillId) { + return ( + String(skillId || '') + .split(':') + .pop() || String(skillId || '') + ); +} + +/** + * Extract meaningful lowercase terms from a query string. + * Removes common stopwords and short tokens. + * @param {string} query + * @returns {string[]} + */ +function extractQueryTerms(query) { + const stopwords = new Set([ + 'the', + 'a', + 'an', + 'is', + 'are', + 'was', + 'were', + 'be', + 'been', + 'being', + 'have', + 'has', + 'had', + 'do', + 'does', + 'did', + 'will', + 'would', + 'could', + 'should', + 'may', + 'might', + 'shall', + 'can', + 'need', + 'dare', + 'ought', + 'used', + 'to', + 'of', + 'in', + 'for', + 'on', + 'with', + 'at', + 'by', + 'from', + 'as', + 'into', + 'through', + 'during', + 'before', + 'after', + 'above', + 'below', + 'between', + 'out', + 'off', + 'over', + 'under', + 'again', + 'further', + 'then', + 'once', + 'here', + 'there', + 'when', + 'where', + 'why', + 'how', + 'all', + 'each', + 'every', + 'both', + 'few', + 'more', + 'most', + 'other', + 'some', + 'such', + 'no', + 'nor', + 'not', + 'only', + 'own', + 'same', + 'so', + 'than', + 'too', + 'very', + 'just', + 'because', + 'but', + 'and', + 'or', + 'if', + 'while', + 'that', + 'this', + 'it', + 'its', + 'i', + 'me', + 'my', + 'we', + 'our', + 'you', + 'your', + 'he', + 'him', + 'his', + 'she', + 'her', + 'they', + 'them', + 'their', + 'what', + 'which', + 'who', + 'whom', + 'about', + 'up', + ]); + return query + .toLowerCase() + .replace(/[^a-z0-9\s-]/g, ' ') + .split(/\s+/) + .filter((t) => t.length > 2 && !stopwords.has(t)); +} + +/** + * Compute a lexical relevance score (0-1) for a record against query terms. + * Matches in skillId get weight 0.4, section title 0.3, chunk text 0.3. + * Supports prefix matching: "files" matches "file" in "file-search". 
+ * @param {import('./vectorstore').VectorRecord} record + * @param {string[]} terms + * @returns {number} + */ +function computeLexicalScore(record, terms) { + const skillLower = (record.skillId || '').toLowerCase(); + const sectionLower = (record.section || '').toLowerCase(); + const textLower = (record.text || '').toLowerCase(); + const textWords = new Set(textLower.split(/\s+/).filter(Boolean)); + const sectionWords = new Set(sectionLower.split(/[\s-]+/).filter(Boolean)); + const skillWords = new Set(skillLower.split(/[\s:-]+/).filter(Boolean)); + + let skillHits = 0; + let sectionHits = 0; + let textHits = 0; + + for (const term of terms) { + const termStem = term.replace(/s$/, '').replace(/ing$/, '').replace(/ed$/, ''); + // Check full term, stemmed term, and prefix matches + /** + * @param {string} word + */ + const matches = (word) => + word === term || + word === termStem || + word.startsWith(term) || + word.startsWith(termStem) || + (term.length > 3 && term.startsWith(word)); + /** @param {string} w */ + const matchWord = (w) => matches(w); + const textMatch = [...textWords].some(matchWord); + const sectionMatch = [...sectionWords].some(matchWord); + const skillMatch = [...skillWords].some(matchWord); + + if (textMatch) textHits++; + if (sectionMatch) sectionHits++; + if (skillMatch) skillHits++; + } + + const maxHits = terms.length; + if (!maxHits) return 0; + + return 0.4 * (skillHits / maxHits) + 0.3 * (sectionHits / maxHits) + 0.3 * (textHits / maxHits); +} + /** * @param {number[]} a * @param {number[]} b @@ -193,6 +452,7 @@ module.exports = { upsertVectors, replaceVectors, searchVectors, + hybridSearch, cosineSimilarity, markIndexStale, clearIndexStale, diff --git a/server/router.js b/server/router.js index f64b926..516ab68 100644 --- a/server/router.js +++ b/server/router.js @@ -272,6 +272,24 @@ async function handleRequest(req, res, url) { return json(res, { ok: true, ...getAppVersion() }); } + // ---- SYSTEM SCAN ---- + if (p === 
'/api/system/scan') { + const { scanSystem } = require('./lib/system-scan'); + if (req.method === 'POST') { + const data = await body(req); + const customPaths = Array.isArray(data?.customPaths) ? data.customPaths : []; + const opts = { + skipDrives: !!data?.skipDrives, + skipHomedir: !!data?.skipHomedir, + skipWorkspaces: !!data?.skipWorkspaces, + }; + return json(res, { ok: true, ...scanSystem(customPaths, opts) }); + } + if (req.method === 'GET') { + return json(res, { ok: true, ...scanSystem() }); + } + } + // ---- ONBOARDING ---- if (p === '/api/onboarding' && req.method === 'GET') { const tools = detectTools(HOMEDIR, { diff --git a/ui/config.js b/ui/config.js index d16ddfd..4b91efa 100644 --- a/ui/config.js +++ b/ui/config.js @@ -5,9 +5,9 @@ const ConfigTab = (() => { // Priority sections per rule category (must match RulesLab.PRIORITY_SECTIONS) const PRIORITY_SECTIONS = { - coding: ['hard', 'preference', 'style'], - general: ['hard', 'preference', 'style'], - soul: ['preference'], + coding: ['hard', 'soft', 'style'], + general: ['hard', 'soft', 'style'], + soul: ['soft', 'style'], }; /** Get all textarea IDs for a given section key */ @@ -22,13 +22,17 @@ const ConfigTab = (() => { function load() { const r = RS.get(); + if (typeof RulesLab !== 'undefined' && RulesLab.setDraft) { + RulesLab.setDraft(r); + return; + } Object.keys(PRIORITY_SECTIONS).forEach((key) => { const section = r[key]; PRIORITY_SECTIONS[key].forEach((p) => { const el = document.getElementById(`rules-${key}-${p}`); if (!el) return; if (typeof section === 'string') { - el.value = p === 'preference' ? section : ''; + el.value = p === 'soft' ? section : ''; } else if (section && typeof section === 'object') { el.value = section[p] || ''; } else { @@ -40,18 +44,44 @@ const ConfigTab = (() => { } function save() { + const data = + typeof RulesLab !== 'undefined' && RulesLab.draft + ? 
RulesLab.draft() + : (() => { + const legacyData = {}; + Object.keys(PRIORITY_SECTIONS).forEach((key) => { + legacyData[key] = {}; + PRIORITY_SECTIONS[key].forEach((p) => { + const el = document.getElementById(`rules-${key}-${p}`); + legacyData[key][p] = el?.value?.trim() || ''; + }); + }); + return legacyData; + })(); if (typeof RulesLab !== 'undefined') RulesLab.beforeSave(); - const data = {}; + RS.save(data); + updateRuleMetrics(); + flash('rules-saved'); + } + + function updateRuleMetrics() { + if (typeof RulesLab !== 'undefined' && document.getElementById('rules-coding-list')) { + RulesLab.refresh(); + return; + } Object.keys(PRIORITY_SECTIONS).forEach((key) => { - data[key] = {}; + const metric = document.getElementById(`rules-${key}-count`); + if (!metric) return; + let words = 0; + let lines = 0; PRIORITY_SECTIONS[key].forEach((p) => { const el = document.getElementById(`rules-${key}-${p}`); - data[key][p] = el?.value?.trim() || ''; + if (!el) return; + words += el.value.trim().split(/\s+/).filter(Boolean).length; + lines += el.value.split(/\n/).filter((l) => l.trim()).length; }); + metric.textContent = `${words} words / ${lines} lines`; }); - RS.save(data); - updateRuleMetrics(); - flash('rules-saved'); } async function reset() { @@ -69,22 +99,6 @@ const ConfigTab = (() => { Toast.info('Rules reset to defaults'); } - function updateRuleMetrics() { - Object.keys(PRIORITY_SECTIONS).forEach((key) => { - const metric = document.getElementById(`rules-${key}-count`); - if (!metric) return; - let words = 0; - let lines = 0; - PRIORITY_SECTIONS[key].forEach((p) => { - const el = document.getElementById(`rules-${key}-${p}`); - if (!el) return; - words += el.value.trim().split(/\s+/).filter(Boolean).length; - lines += el.value.split(/\n/).filter((l) => l.trim()).length; - }); - metric.textContent = `${words} words / ${lines} lines`; - }); - } - function flash(id) { const el = document.getElementById(id); if (!el) return; diff --git a/ui/dashboard.js 
b/ui/dashboard.js index 31ab9a7..7c4a4c3 100644 --- a/ui/dashboard.js +++ b/ui/dashboard.js @@ -309,7 +309,7 @@ const DashboardTab = (() => { host.innerHTML = `
Preview${sel.toLocaleString()} tokens selected
-
vs ${all.toLocaleString()} all on${saved > 0 ? `, saving ${saved.toLocaleString()}` : ''} (${pct}% of full)
+
vs ${all.toLocaleString()} all on${saved > 0 ? `, saving ${saved.toLocaleString()}` : ''} (${pct}% of full) — quality gate pending
${tagRow}
${selected.length} skills picked: ${selected.map((id) => `${esc(id)}`).join(' ')}
diff --git a/ui/data.js b/ui/data.js index cbd74be..ecec60a 100644 --- a/ui/data.js +++ b/ui/data.js @@ -24,10 +24,12 @@ const DEFAULT_RULES = { coding: { hard: '', soft: 'Modular code files.\nComment the why, not the what.', + style: '', }, general: { hard: '', soft: 'Memory is a core skill. Think independently.', + style: '', }, - soul: { soft: DEFAULT_SOUL }, + soul: { soft: DEFAULT_SOUL, style: '' }, }; diff --git a/ui/icons.js b/ui/icons.js new file mode 100644 index 0000000..25cdf2d --- /dev/null +++ b/ui/icons.js @@ -0,0 +1,89 @@ +// icons.js — Shared icon map and helpers for onboarding skill/IDE cards. + +window.ObIcons = { + claude: 'https://cdn.jsdelivr.net/npm/simple-icons/icons/claude.svg', + cursor: 'https://cdn.jsdelivr.net/npm/simple-icons/icons/cursor.svg', + windsurf: 'https://cdn.jsdelivr.net/npm/simple-icons/icons/windsurf.svg', + openai: 'https://cdn.jsdelivr.net/npm/simple-icons/icons/openai.svg', + opencode: 'https://cdn.jsdelivr.net/gh/walkxcode/dashboard-icons/svg/opencode.svg', + continue: + 'https://raw.githubusercontent.com/continuedev/continue/main/extensions/vscode/media/sidebar-icon.png', + cline: 'https://cdn.jsdelivr.net/npm/simple-icons/icons/cline.svg', + kimi: 'https://cdn.jsdelivr.net/gh/homarr-labs/dashboard-icons/svg/kimi-ai.svg', + goose: 'https://cdn.jsdelivr.net/npm/@lobehub/icons-static-svg@latest/icons/goose.svg', + sourcegraph: 'https://upload.wikimedia.org/wikipedia/commons/8/8f/Sourcegraph-logo-light.svg', + kiro: 'https://kiro.dev/favicon.ico', + antigravity: 'https://avatars.githubusercontent.com/nicholasgriffintn?size=128', + gemini: 'https://cdn.jsdelivr.net/npm/simple-icons/icons/googlegemini.svg', + augment: 'https://www.augmentcode.com/favicon.svg', + pearai: 'https://avatars.githubusercontent.com/nicepkg?size=128', + void: 'https://avatars.githubusercontent.com/voideditor?size=128', + vscode: 'https://cdn.jsdelivr.net/npm/simple-icons/icons/visualstudiocode.svg', + folder: '', + 'ide-vs-code': 
'https://cdn.jsdelivr.net/npm/simple-icons/icons/visualstudiocode.svg', + 'ide-cursor': 'https://cdn.jsdelivr.net/npm/simple-icons/icons/cursor.svg', + 'ide-windsurf': 'https://cdn.jsdelivr.net/npm/simple-icons/icons/windsurf.svg', + 'ide-kiro': 'https://kiro.dev/favicon.ico', + 'ide-antigravity': 'https://avatars.githubusercontent.com/nicholasgriffintn?size=128', + 'ide-intellij-idea': 'https://cdn.jsdelivr.net/npm/simple-icons/icons/intellijidea.svg', + 'ide-pycharm': 'https://cdn.jsdelivr.net/npm/simple-icons/icons/pycharm.svg', + 'ide-webstorm': 'https://cdn.jsdelivr.net/npm/simple-icons/icons/webstorm.svg', + 'ide-rider': 'https://cdn.jsdelivr.net/npm/simple-icons/icons/rider.svg', + 'ide-goland': 'https://cdn.jsdelivr.net/npm/simple-icons/icons/goland.svg', + 'ide-clion': 'https://cdn.jsdelivr.net/npm/simple-icons/icons/clion.svg', + 'ide-jetbrains-fleet': 'https://cdn.jsdelivr.net/npm/simple-icons/icons/jetbrains.svg', + 'ide-sublime-text': 'https://cdn.jsdelivr.net/npm/simple-icons/icons/sublimetext.svg', + 'ide-notepad++': 'https://cdn.jsdelivr.net/npm/simple-icons/icons/notepadplusplus.svg', + 'ide-visual-studio': 'https://cdn.jsdelivr.net/npm/simple-icons/icons/visualstudio.svg', + 'ide-zed': 'https://avatars.githubusercontent.com/zed-industries?size=128', + 'ide-trae': 'https://www.trae.ai/favicon.svg', + 'ide-pearai': 'https://avatars.githubusercontent.com/nicepkg?size=128', +}; + +window.obIcon = function (iconId) { + const url = window.ObIcons[iconId]; + if (!url) return '
' + (iconId || '?')[0].toUpperCase() + '
'; + const safe = String(url).replace(/&/g, '&').replace(/"/g, '"').replace(/'/g, '''); + return ''; +}; + +window.catSvg = function (cat) { + const icons = { + skills: + '', + rules: + '', + instruct: + '', + config: + '', + mcp: '', + opportunity: + '', + }; + return icons[cat] || ''; +}; + +window.ideIconKey = function (label) { + const m = { + 'VS Code': 'ide-vs-code', + Cursor: 'ide-cursor', + Windsurf: 'ide-windsurf', + Kiro: 'ide-kiro', + Antigravity: 'ide-antigravity', + 'IntelliJ IDEA': 'ide-intellij-idea', + PyCharm: 'ide-pycharm', + WebStorm: 'ide-webstorm', + Rider: 'ide-rider', + GoLand: 'ide-goland', + CLion: 'ide-clion', + 'JetBrains Fleet': 'ide-jetbrains-fleet', + 'Sublime Text': 'ide-sublime-text', + 'Notepad++': 'ide-notepad++', + 'Visual Studio': 'ide-visual-studio', + Zed: 'ide-zed', + Trae: 'ide-trae', + PearAI: 'ide-pearai', + }; + return m[label] || 'vscode'; +}; diff --git a/ui/index.html b/ui/index.html index de16b03..57a1b3b 100644 --- a/ui/index.html +++ b/ui/index.html @@ -27,36 +27,6 @@
Loading Context Engine
-
+
diff --git a/ui/styles/shell.css b/ui/styles/shell.css index 4e6d8cd..74d647f 100644 --- a/ui/styles/shell.css +++ b/ui/styles/shell.css @@ -33,6 +33,22 @@ html[data-runtime='electron'] .app { 'side main'; } +/* Electron: draggable strip behind the native titlebar overlay controls */ +.desktop-titlebar { + display: none; +} + +html[data-runtime='electron'] .desktop-titlebar { + display: block; + grid-area: overlay; + position: relative; + z-index: var(--z-nav); + min-height: var(--desktop-titlebar-h); + background: var(--bg); + border-bottom: 1px solid var(--line); + -webkit-app-region: drag; +} + /* Electron: sidebar brand is the window drag handle */ html[data-runtime='electron'] .nav-brand { -webkit-app-region: drag; From b427a7df9934d1217f3142ec2dea87e37b587380 Mon Sep 17 00:00:00 2001 From: Jeremy Date: Tue, 19 May 2026 17:09:57 +0100 Subject: [PATCH 3/7] Clean production PR payload --- .gitignore | 1 + .prettierignore | 1 + bench/aggregate_v12.py | 164 ----- bench/aggregate_v13.py | 255 -------- bench/build_report.py | 571 ------------------ bench/continuity-gold.json | 198 ------ bench/continuity-tasks.json | 142 ----- bench/fetch_contexts.py | 106 ---- bench/fixtures/skills/app-launcher/SKILL.md | 33 - .../fixtures/skills/brand-guidelines/SKILL.md | 31 - bench/fixtures/skills/canvas-design/SKILL.md | 28 - bench/fixtures/skills/claude-api/SKILL.md | 57 -- .../skills/comfyui-prompt-builder/SKILL.md | 35 -- .../skills/comfyui-workflow-manager/SKILL.md | 42 -- .../skills/competitor-analyser/SKILL.md | 26 - .../skills/downloads-cleaner/SKILL.md | 30 - bench/fixtures/skills/email-drafter/SKILL.md | 30 - bench/fixtures/skills/file-organiser/SKILL.md | 33 - bench/fixtures/skills/file-search/SKILL.md | 26 - .../skills/health-tracker-analyst/SKILL.md | 36 -- bench/fixtures/skills/internal-comms/SKILL.md | 30 - bench/fixtures/skills/pptx/SKILL.md | 51 -- .../fixtures/skills/process-manager/SKILL.md | 28 - .../skills/slack-gif-creator/SKILL.md | 37 -- 
.../fixtures/skills/startup-routine/SKILL.md | 39 -- bench/fixtures/skills/system-monitor/SKILL.md | 26 - bench/fixtures/skills/theme-factory/SKILL.md | 31 - bench/fixtures/skills/web-researcher/SKILL.md | 26 - bench/gold-answers.json | 480 --------------- bench/grade_v2.py | 420 ------------- bench/legacy/build_addendum.py | 394 ------------ bench/legacy/build_addendum_v12.py | 358 ----------- bench/legacy/build_addendum_v13.py | 366 ----------- bench/legacy/codex-handoff.md | 164 ----- bench/run_continuity.py | 268 -------- bench/run_v12.py | 167 ----- bench/run_v13.py | 219 ------- scripts/quick-search.cjs | 46 -- scripts/retrieval-smoke.js | 213 ------- scripts/setup-benchmark.cjs | 106 ---- server/lib/system-scan-definitions.js | 174 ++++++ server/lib/system-scan-ides.js | 114 ++++ server/lib/system-scan.js | 271 +-------- ui/index.html | 1 + ui/onboarding-render.js | 193 ++++++ ui/onboarding.js | 196 +----- ui/styles/_index.css | 2 + ui/styles/onboarding-flow.css | 181 ++++++ ui/styles/onboarding-hosts.css | 291 +++++++++ ui/styles/onboarding.css | 486 --------------- 50 files changed, 968 insertions(+), 6255 deletions(-) delete mode 100644 bench/aggregate_v12.py delete mode 100644 bench/aggregate_v13.py delete mode 100644 bench/build_report.py delete mode 100644 bench/continuity-gold.json delete mode 100644 bench/continuity-tasks.json delete mode 100644 bench/fetch_contexts.py delete mode 100644 bench/fixtures/skills/app-launcher/SKILL.md delete mode 100644 bench/fixtures/skills/brand-guidelines/SKILL.md delete mode 100644 bench/fixtures/skills/canvas-design/SKILL.md delete mode 100644 bench/fixtures/skills/claude-api/SKILL.md delete mode 100644 bench/fixtures/skills/comfyui-prompt-builder/SKILL.md delete mode 100644 bench/fixtures/skills/comfyui-workflow-manager/SKILL.md delete mode 100644 bench/fixtures/skills/competitor-analyser/SKILL.md delete mode 100644 bench/fixtures/skills/downloads-cleaner/SKILL.md delete mode 100644 
bench/fixtures/skills/email-drafter/SKILL.md delete mode 100644 bench/fixtures/skills/file-organiser/SKILL.md delete mode 100644 bench/fixtures/skills/file-search/SKILL.md delete mode 100644 bench/fixtures/skills/health-tracker-analyst/SKILL.md delete mode 100644 bench/fixtures/skills/internal-comms/SKILL.md delete mode 100644 bench/fixtures/skills/pptx/SKILL.md delete mode 100644 bench/fixtures/skills/process-manager/SKILL.md delete mode 100644 bench/fixtures/skills/slack-gif-creator/SKILL.md delete mode 100644 bench/fixtures/skills/startup-routine/SKILL.md delete mode 100644 bench/fixtures/skills/system-monitor/SKILL.md delete mode 100644 bench/fixtures/skills/theme-factory/SKILL.md delete mode 100644 bench/fixtures/skills/web-researcher/SKILL.md delete mode 100644 bench/gold-answers.json delete mode 100644 bench/grade_v2.py delete mode 100644 bench/legacy/build_addendum.py delete mode 100644 bench/legacy/build_addendum_v12.py delete mode 100644 bench/legacy/build_addendum_v13.py delete mode 100644 bench/legacy/codex-handoff.md delete mode 100644 bench/run_continuity.py delete mode 100644 bench/run_v12.py delete mode 100644 bench/run_v13.py delete mode 100644 scripts/quick-search.cjs delete mode 100644 scripts/retrieval-smoke.js delete mode 100644 scripts/setup-benchmark.cjs create mode 100644 server/lib/system-scan-definitions.js create mode 100644 server/lib/system-scan-ides.js create mode 100644 ui/onboarding-render.js create mode 100644 ui/styles/onboarding-flow.css create mode 100644 ui/styles/onboarding-hosts.css diff --git a/.gitignore b/.gitignore index 4a4ea66..4572cb2 100644 --- a/.gitignore +++ b/.gitignore @@ -70,3 +70,4 @@ __pycache__/ bench/results-latest.json bench/artifacts/ bench/data/ +bench/fixtures/skills/ diff --git a/.prettierignore b/.prettierignore index daa3bf5..e06d9d3 100644 --- a/.prettierignore +++ b/.prettierignore @@ -8,3 +8,4 @@ server.err.log *.log bench/artifacts bench/data +bench/fixtures/skills diff --git 
a/bench/aggregate_v12.py b/bench/aggregate_v12.py deleted file mode 100644 index a1bd781..0000000 --- a/bench/aggregate_v12.py +++ /dev/null @@ -1,164 +0,0 @@ -#!/usr/bin/env python3 -""" -aggregate_v12.py — Combine deterministic scoring + session grades into the -final v1.2 results file. Produces a per-task table and summary stats. -""" -from __future__ import annotations -import json, statistics -from pathlib import Path - -HERE = Path(__file__).parent -DATA = HERE / "data" / "v1.2" - -det = json.loads((DATA / "results-v12.json").read_text(encoding="utf-8")) -sg = json.loads((DATA / "session_grades.json").read_text(encoding="utf-8")) -resp = json.loads((DATA / "responses-v12.json").read_text(encoding="utf-8")) - -# Index session grades -sgrades = {(g["task_id"], g["mode"]): g for g in sg["grades"]} - -rows = [] -for t in det["results"]: - tid = t["task_id"] - for mode in ("no_context", "smart", "search"): - m = t["modes"].get(mode) - if not m: - continue - g = sgrades.get((tid, mode)) - fr = m["fact_recall"] - rq = m.get("retrieval_quality") - in_tok = m["input_tokens"] - out_tok = m["output_tokens"] - composite = (g["factual"] + g["specific"] + g["complete"]) / 3 if g else 0 - rows.append({ - "task_id": tid, - "mode": mode, - "input_tokens": in_tok, - "output_tokens": out_tok, - "cost_usd": m["cost_usd"], - "latency_ms": m["latency_ms"], - "fact_coverage_pct": fr["coverage_pct"], - "fact_bonus_pct": fr["bonus_pct"], - "forbidden_hits": fr["forbidden_hits"], - "failure_modes": fr["failure_modes"], - "factual": g["factual"] if g else 0, - "specific": g["specific"] if g else 0, - "complete": g["complete"] if g else 0, - "hallucinations": g["hallucinations"] if g else 0, - "composite_15": round(composite, 2), # composite on 0-5 scale (per-axis mean) - "composite_pct": round(composite / 5 * 100, 1), # composite as a % - "retrieval": rq, - "judge_notes": g["notes"] if g else "", - }) - -# Per-mode aggregates -def by_mode(field): - out = {} - for mode in ("no_context", 
"smart", "search"): - vs = [r[field] for r in rows if r["mode"] == mode and r[field] is not None] - if not vs: continue - out[mode] = { - "median": round(statistics.median(vs), 2), - "mean": round(statistics.mean(vs), 2), - "min": min(vs), - "max": max(vs), - } - return out - -def tokens_per_quality_point(mode): - rs = [r for r in rows if r["mode"] == mode and r["composite_15"] > 0] - if not rs: return None - # Use composite on a 0-15 scale (sum of 3 axes, each 0-5) for tk/pt - tk = [r["input_tokens"] for r in rs] - pts = [(r["factual"] + r["specific"] + r["complete"]) for r in rs] - return round(statistics.median(tk) / statistics.median(pts), 1) - -retr = {} -for r in rows: - if r["mode"] == "search" and r["retrieval"]: - for k in ("precision_at_k", "recall_at_k", "mrr"): - retr.setdefault(k, []).append(r["retrieval"][k]) -retr_summary = {k: round(statistics.mean(v), 3) for k, v in retr.items()} - -summary = { - "ran_at": det["ran_at"], - "task_count": len([r for r in rows if r["mode"] == "smart"]), - "task_model": "claude-sonnet-4-5 (session)", - "judges": "session:claude-sonnet-4-5 (single judge, single run; cross-family multi-run pending API keys)", - "tokens": { - "raw_all_baseline": 186654, - "by_mode_input_tokens": by_mode("input_tokens"), - "by_mode_output_tokens": by_mode("output_tokens"), - "by_mode_cost_usd": by_mode("cost_usd"), - "by_mode_latency_ms": by_mode("latency_ms"), - }, - "fact_recall": { - "coverage_must_include_pct": by_mode("fact_coverage_pct"), - "bonus_may_include_pct": by_mode("fact_bonus_pct"), - }, - "decomposed_rubric": { - "factual_1to5": by_mode("factual"), - "specific_1to5": by_mode("specific"), - "complete_1to5": by_mode("complete"), - "hallucinations_count": by_mode("hallucinations"), - "composite_pct": by_mode("composite_pct"), - }, - "retrieval_quality_mean": retr_summary, - "tokens_per_quality_point": { - "no_context": tokens_per_quality_point("no_context"), - "smart": tokens_per_quality_point("smart"), - "search": 
tokens_per_quality_point("search"), - }, -} - -# Headline derived numbers -sm_tk = summary["tokens"]["by_mode_input_tokens"]["smart"]["median"] -se_tk = summary["tokens"]["by_mode_input_tokens"]["search"]["median"] -nc_tk = summary["tokens"]["by_mode_input_tokens"]["no_context"]["median"] -summary["headline"] = { - "smart_saving_vs_raw_all_pct": round(100 * (1 - sm_tk / 186654), 1), - "search_saving_vs_raw_all_pct": round(100 * (1 - se_tk / 186654), 1), - "smart_quality_lift_vs_no_context_pct": round( - summary["decomposed_rubric"]["composite_pct"]["smart"]["median"] - - summary["decomposed_rubric"]["composite_pct"]["no_context"]["median"], 1), - "search_quality_lift_vs_no_context_pct": round( - summary["decomposed_rubric"]["composite_pct"]["search"]["median"] - - summary["decomposed_rubric"]["composite_pct"]["no_context"]["median"], 1), - "search_failure_at_release_note": "Precision@8=0.00 Recall@8=0.00 — retrieval missed internal-comms + slack-gif-creator entirely; predicts the observed quality drop.", -} - -out = {"summary": summary, "rows": rows} -DATA.mkdir(parents=True, exist_ok=True) -(DATA / "results-v12-final.json").write_text(json.dumps(out, indent=2), encoding="utf-8") - -# Print table -print(f"\n{'task':<25} {'mode':<11} {'tokens':>7} {'cov':>5} {'bonus':>6} {'fact':>4} {'spec':>4} {'comp':>4} {'comp%':>6}") -print("-" * 80) -for r in rows: - print(f"{r['task_id']:<25} {r['mode']:<11} {r['input_tokens']:>7,} " - f"{r['fact_coverage_pct']:>4.0f}% {r['fact_bonus_pct']:>5.0f}% " - f"{r['factual']:>4} {r['specific']:>4} {r['complete']:>4} " - f"{r['composite_pct']:>5.0f}%") -print() -print("=" * 80) -print(f"SUMMARY (5 tasks; baseline raw_all = 186,654 tokens)") -print("=" * 80) -for k in ("no_context", "smart", "search"): - tk = summary["tokens"]["by_mode_input_tokens"][k]["median"] - saving = round(100 * (1 - tk / 186654), 1) if k != "no_context" else None - cov = summary["fact_recall"]["coverage_must_include_pct"][k]["mean"] - bonus = 
summary["fact_recall"]["bonus_may_include_pct"][k]["mean"] - comp = summary["decomposed_rubric"]["composite_pct"][k]["median"] - tpqp = summary["tokens_per_quality_point"][k] - print(f" {k:<11} med tk={tk:>7,} save={f'{saving}%' if saving is not None else ' n/a':>6} " - f"cov={cov:>5.1f}% bonus={bonus:>5.1f}% composite={comp:>5.1f}% tk/pt={tpqp}") -print() -print(f"Retrieval quality (mean, search mode):") -for k, v in retr_summary.items(): - print(f" {k:<16} {v:.3f}") -print() -print(f"Headline:") -for k, v in summary["headline"].items(): - print(f" {k}: {v}") - -print(f"\nWrote {DATA / 'results-v12-final.json'}") diff --git a/bench/aggregate_v13.py b/bench/aggregate_v13.py deleted file mode 100644 index b1eca28..0000000 --- a/bench/aggregate_v13.py +++ /dev/null @@ -1,255 +0,0 @@ -#!/usr/bin/env python3 -""" -aggregate_v13.py — Aggregate the full 15-task × 3-mode × 3-judge-run benchmark. - -Reports median + IQR per axis, paired comparisons across modes, and the -honest finding that contradicts v1.2's framing. 
- -Usage: - python bench/aggregate_v13.py # aggregate + print - python bench/aggregate_v13.py --gate # aggregate + gate check (exit 1 on fail) - python bench/aggregate_v13.py --gate --json # aggregate + gate + JSON output -""" -from __future__ import annotations -import argparse, json, statistics, sys -from pathlib import Path - -p = argparse.ArgumentParser() -p.add_argument("--gate", action="store_true", help="Exit 1 if quality or retrieval gates fail") -p.add_argument("--json", action="store_true", help="Output final result as JSON to stdout") -args = p.parse_args() - -HERE = Path(__file__).parent -DATA = HERE / "data" / "v1.3" -data = json.loads((DATA / "results-v13.json").read_text(encoding="utf-8")) -rows = data["rows"] - - -def med(xs): return statistics.median(xs) if xs else 0 -def mean(xs): return statistics.mean(xs) if xs else 0 -def iqr(xs): - if len(xs) < 4: return 0.0 - q = statistics.quantiles(xs, n=4) - return q[2] - q[0] - - -# Per-mode aggregates -modes = ("no_context", "smart", "search") -def mode_rows(m): return [r for r in rows if r["mode"] == m] - -agg = {} -for m in modes: - rs = mode_rows(m) - agg[m] = { - "n": len(rs), - "input_tokens_median": int(med([r["input_tokens"] for r in rs])), - "output_tokens_median": int(med([r["output_tokens"] for r in rs])), - "latency_ms_median": int(med([r["latency_ms"] for r in rs])), - "cost_usd_median": round(med([r["cost_usd"] for r in rs]), 5), - "cost_usd_total": round(sum(r["cost_usd"] for r in rs), 4), - "coverage_pct_mean": round(mean([r["fact_recall"]["coverage_pct"] for r in rs]), 1), - "bonus_pct_mean": round(mean([r["fact_recall"]["bonus_pct"] for r in rs]), 1), - "factual_median": med([r["judge_summary"].get("factual_median", 0) for r in rs]), - "factual_iqr": round(iqr([r["judge_summary"].get("factual_median", 0) for r in rs]), 2), - "specific_median": med([r["judge_summary"].get("specific_median", 0) for r in rs]), - "specific_iqr": round(iqr([r["judge_summary"].get("specific_median", 0) for r 
in rs]), 2), - "complete_median": med([r["judge_summary"].get("complete_median", 0) for r in rs]), - "complete_iqr": round(iqr([r["judge_summary"].get("complete_median", 0) for r in rs]), 2), - "hallucinations_mean": round(mean([r["judge_summary"].get("hallucinations_mean", 0) for r in rs]), 2), - "hallucinations_max": max([r["judge_summary"].get("hallucinations_max", 0) for r in rs]), - "composite_15_median": round(med([ - (r["judge_summary"].get("factual_median", 0) - + r["judge_summary"].get("specific_median", 0) - + r["judge_summary"].get("complete_median", 0)) - for r in rs - ]), 2), - } - -# Paired comparisons: per-task, mode A vs mode B -by_task = {} -for r in rows: - by_task.setdefault(r["task_id"], {})[r["mode"]] = r - -def composite(r): - js = r["judge_summary"] - return js.get("factual_median", 0) + js.get("specific_median", 0) + js.get("complete_median", 0) - -pairs = {"smart_vs_no_context": [], "search_vs_no_context": [], "smart_vs_search": []} -for tid, modes_d in by_task.items(): - if "no_context" in modes_d: - if "smart" in modes_d: - pairs["smart_vs_no_context"].append((tid, composite(modes_d["smart"]) - composite(modes_d["no_context"]))) - if "search" in modes_d: - pairs["search_vs_no_context"].append((tid, composite(modes_d["search"]) - composite(modes_d["no_context"]))) - if "smart" in modes_d and "search" in modes_d: - pairs["smart_vs_search"].append((tid, composite(modes_d["smart"]) - composite(modes_d["search"]))) - -paired_summary = {} -for k, vs in pairs.items(): - diffs = [d for _, d in vs] - paired_summary[k] = { - "n": len(diffs), - "mean_delta": round(mean(diffs), 2), - "median_delta": round(med(diffs), 2), - "wins": sum(1 for d in diffs if d > 0), - "ties": sum(1 for d in diffs if d == 0), - "losses": sum(1 for d in diffs if d < 0), - "worst_task": min(vs, key=lambda x: x[1]) if vs else None, - "best_task": max(vs, key=lambda x: x[1]) if vs else None, - } - -# Retrieval-quality aggregates (search mode only) -retr = 
[r["retrieval_quality"] for r in rows if r["mode"] == "search" and r["retrieval_quality"]] -retr_summary = { - "n": len(retr), - "precision_at_k_mean": round(mean([r["precision_at_k"] for r in retr]), 3), - "recall_at_k_mean": round(mean([r["recall_at_k"] for r in retr]), 3), - "mrr_mean": round(mean([r["mrr"] for r in retr]), 3), - "complete_misses": [r["expected"] for r in retr if r["precision_at_k"] == 0], -} - -# Token-per-quality-point of LIFT over no_context -def lift_efficiency(mode): - rs = [] - for tid, ms in by_task.items(): - if mode not in ms or "no_context" not in ms: - continue - delta_q = composite(ms[mode]) - composite(ms["no_context"]) - delta_tk = ms[mode]["input_tokens"] - ms["no_context"]["input_tokens"] - rs.append((tid, delta_tk, delta_q)) - if not rs: - return None - # only count rows where lift > 0 (the rest are negative-value) - pos = [(tk, q) for _, tk, q in rs if q > 0] - if not pos: - return {"n_positive_lift": 0, "n_total": len(rs)} - return { - "n_total": len(rs), - "n_positive_lift": len(pos), - "n_negative_lift": sum(1 for _, _, q in rs if q < 0), - "n_zero_lift": sum(1 for _, _, q in rs if q == 0), - "median_tk_per_lift_pt": round(med([tk for tk, q in pos]) / med([q for tk, q in pos]), 1), - } - -lift = {m: lift_efficiency(m) for m in ("smart", "search")} - -# Build per-task detail rows for the appendix table -detail = [] -for tid in sorted(by_task.keys()): - ms = by_task[tid] - row = {"task_id": tid, "category": ms[next(iter(ms))]["category"]} - for m in modes: - if m not in ms: continue - r = ms[m] - js = r["judge_summary"] - row[m] = { - "input_tokens": r["input_tokens"], - "coverage": r["fact_recall"]["coverage_pct"], - "bonus": r["fact_recall"]["bonus_pct"], - "factual": js.get("factual_median", 0), - "specific": js.get("specific_median", 0), - "complete": js.get("complete_median", 0), - "halluc": js.get("hallucinations_mean", 0), - "composite_15": composite(r), - } - detail.append(row) - -final = { - "ran_at": data["ran_at"], 
- "task_model": data["task_model"], - "judge_model": data["judge_model"], - "judge_runs": data["judge_runs"], - "task_count": data["task_count"], - "raw_all_baseline_tokens": 186654, - "by_mode": agg, - "paired": paired_summary, - "retrieval_quality": retr_summary, - "lift_efficiency": lift, - "detail": detail, -} - -DATA.mkdir(parents=True, exist_ok=True) -(DATA / "results-v13-final.json").write_text(json.dumps(final, indent=2), encoding="utf-8") - -# Print summary -print("=" * 96) -print(f"v1.3 RESULTS — task={data['task_model']} judge={data['judge_model']}×{data['judge_runs']} runs N={data['task_count']}") -print("=" * 96) - -print(f"\n{'Mode':<12} {'med tk':>8} {'cov':>5} {'bonus':>6} {'fact':>5} {'spec':>5} {'comp':>5} {'halluc':>7} {'comp/15':>8} {'$':>7}") -for m in modes: - a = agg[m] - print(f"{m:<12} {a['input_tokens_median']:>8,} " - f"{a['coverage_pct_mean']:>4.0f}% {a['bonus_pct_mean']:>5.0f}% " - f"{a['factual_median']:>5.1f} {a['specific_median']:>5.1f} {a['complete_median']:>5.1f} " - f"{a['hallucinations_mean']:>7.2f} {a['composite_15_median']:>7.1f} ${a['cost_usd_total']:>6.2f}") - -print("\nPaired comparisons (composite/15):") -for k, p in paired_summary.items(): - print(f" {k:<24} n={p['n']:<3} mean d={p['mean_delta']:>6.2f} wins/ties/losses = {p['wins']}/{p['ties']}/{p['losses']}") - if p['worst_task']: - print(f" worst: {p['worst_task'][0]} (d={p['worst_task'][1]:.0f}) best: {p['best_task'][0]} (d={p['best_task'][1]:.0f})") - -print("\nRetrieval quality (search mode):") -r = retr_summary -print(f" P@8 mean = {r['precision_at_k_mean']:.3f} R@8 mean = {r['recall_at_k_mean']:.3f} MRR mean = {r['mrr_mean']:.3f}") -print(f" Complete retrieval misses: {len(r['complete_misses'])} of {r['n']} tasks") - -print("\nLift efficiency over no_context (tokens per +1 composite point of lift):") -for m, l in lift.items(): - if l is None: - continue - if l.get("median_tk_per_lift_pt") is None: - print(f" {m}: NO POSITIVE LIFT on any task 
(negative={l['n_negative_lift']}, zero={l['n_zero_lift']}, positive={l['n_positive_lift']})") - else: - print(f" {m}: {l['median_tk_per_lift_pt']:>10,.0f} tk/+1pt " - f"(positive lift {l['n_positive_lift']}/{l['n_total']}; " - f"negative lift {l['n_negative_lift']}; zero {l['n_zero_lift']})") - -print("\nHEADLINE: Smart Compile in its current form") -sv = paired_summary["smart_vs_no_context"] -if sv["mean_delta"] < 0: - print(f" DEGRADES composite quality by {abs(sv['mean_delta']):.2f} points on average vs no-context.") - print(f" Loses on {sv['losses']}/{sv['n']} tasks; wins on {sv['wins']}.") - print(f" Implication: full-skill-body injection at ~75k tokens hits an attention-dilution regime") - print(f" the no-context model doesn't experience. Hypothesis 1 (in its v1.0 form) is contradicted.") -elif sv["mean_delta"] > 0: - print(f" IMPROVES composite quality by {sv['mean_delta']:.2f} points on average vs no-context.") -sr = paired_summary["search_vs_no_context"] -print(f"\n Search vs no_context: mean d = {sr['mean_delta']:+.2f} ({sr['wins']} wins / {sr['ties']} ties / {sr['losses']} losses)") -print(f"\nWrote {DATA / 'results-v13-final.json'}") - -# --gate: check quality gates, exit 1 on failure -if args.json: - json.dump(final, sys.stdout, indent=2) - print() - -if args.gate: - gates_failed = 0 - - # Gate 1: No-context quality — Smart and Search must beat or tie no-context - for mode_key, label in [("smart_vs_no_context", "Smart Compile vs no_context"), - ("search_vs_no_context", "MCP Search vs no_context")]: - p_data = paired_summary[mode_key] - if p_data["mean_delta"] < 0: - print(f"\n [GATE FAIL] {label}: mean delta = {p_data['mean_delta']:.2f} " - f"(losses={p_data['losses']}/{p_data['n']})") - gates_failed += 1 - else: - print(f"\n [GATE PASS] {label}: mean delta = {p_data['mean_delta']:.2f} " - f"(wins={p_data['wins']}, ties={p_data['ties']})") - - # Gate 2: Retrieval quality — Recall@8 must be 1.00 - retr_r = retr_summary - if retr_r["recall_at_k_mean"] < 
1.0: - print(f" [GATE FAIL] R@8 = {retr_r['recall_at_k_mean']:.3f} (expected 1.000). " - f"{len(retr_r['complete_misses'])} complete misses.") - gates_failed += 1 - else: - print(f" [GATE PASS] R@8 = {retr_r['recall_at_k_mean']:.3f}") - - if gates_failed: - print(f"\n {gates_failed} gate(s) FAILED.") - sys.exit(1) - else: - print(f"\n All gates PASSED.") diff --git a/bench/build_report.py b/bench/build_report.py deleted file mode 100644 index 89d8c80..0000000 --- a/bench/build_report.py +++ /dev/null @@ -1,571 +0,0 @@ -#!/usr/bin/env python3 -"""Build the consolidated Context Engine benchmark report.""" - -from __future__ import annotations - -import html -import json -import math -import statistics -from pathlib import Path - -from PIL import Image, ImageDraw, ImageFont -from reportlab.lib import colors -from reportlab.lib.enums import TA_CENTER, TA_LEFT -from reportlab.lib.pagesizes import A4 -from reportlab.lib.styles import ParagraphStyle, getSampleStyleSheet -from reportlab.lib.units import mm -from reportlab.platypus import ( - Image as RLImage, - KeepTogether, - LongTable, - PageBreak, - Paragraph, - SimpleDocTemplate, - Spacer, - Table, - TableStyle, -) - - -ROOT = Path(__file__).resolve().parents[2] -BENCH = ROOT / "app" / "bench" -CHARTS = BENCH / "artifacts" / "charts" -OUT = BENCH / "artifacts" / "reports" / "Context Engine Benchmark Report v1.0.pdf" - -IVORY = "#f7f3ea" -SLATE = "#222832" -MUTED = "#62666f" -CORAL = "#d8664f" -GOLD = "#c99a42" -BLUE = "#4f6f8f" -GREEN = "#5f8d69" -RED = "#b8524b" -GRID = "#d9d2c6" - -MODES = ["no_context", "smart", "search"] -MODE_LABEL = {"no_context": "No context", "smart": "Smart", "search": "Search"} - - -def load_json(name: str): - return json.loads((BENCH / name).read_text(encoding="utf-8")) - - -tasks = load_json("tasks.json") -task_order = [t["id"] for t in tasks] -v11 = load_json("data/v1.1/results-graded-sample.json") -v12 = load_json("data/v1.2/results-v12-final.json") -v13 = 
load_json("data/v1.3/results-v13-final.json") -v13_rows = load_json("data/v1.3/results-v13.json")["rows"] -responses = load_json("data/v1.3/responses-v13.json") - - -def esc(text) -> str: - return html.escape(str(text)).replace("\n", "
") - - -def pct(x, d=1) -> str: - return f"{x:.{d}f}%" - - -def num(x) -> str: - if isinstance(x, float): - return f"{x:,.1f}" - return f"{x:,}" - - -def font(size=28, bold=False): - candidates = [ - "C:/Windows/Fonts/arialbd.ttf" if bold else "C:/Windows/Fonts/arial.ttf", - "C:/Windows/Fonts/segoeuib.ttf" if bold else "C:/Windows/Fonts/segoeui.ttf", - ] - for candidate in candidates: - try: - return ImageFont.truetype(candidate, size) - except Exception: - pass - return ImageFont.load_default() - - -FONT = font(24) -FONT_B = font(24, True) -FONT_S = font(18) -FONT_SB = font(18, True) -FONT_XS = font(15) - - -def new_chart(title: str, subtitle: str = "", w=1600, h=900): - img = Image.new("RGB", (w, h), IVORY) - d = ImageDraw.Draw(img) - d.text((60, 42), title, fill=SLATE, font=font(38, True)) - if subtitle: - d.text((62, 92), subtitle, fill=MUTED, font=FONT_S) - return img, d - - -def save_chart(img: Image.Image, name: str) -> Path: - CHARTS.mkdir(parents=True, exist_ok=True) - path = CHARTS / name - img.save(path, "PNG", optimize=True) - return path - - -def text_center(d, xy, text, fill=SLATE, fnt=FONT): - box = d.textbbox((0, 0), text, font=fnt) - d.text((xy[0] - (box[2] - box[0]) / 2, xy[1]), text, fill=fill, font=fnt) - - -def chart_token_cost(): - data = [ - ("Raw all", v13["raw_all_baseline_tokens"], SLATE), - ("Smart", v13["by_mode"]["smart"]["input_tokens_median"], CORAL), - ("Search", v13["by_mode"]["search"]["input_tokens_median"], BLUE), - ("No ctx", v13["by_mode"]["no_context"]["input_tokens_median"], GREEN), - ] - img, d = new_chart("Token Cost by Mode", "Median input tokens on a log scale") - x0, y0, x1, y1 = 160, 740, 1480, 170 - for tick in [10, 100, 1000, 10000, 100000]: - y = y0 - (math.log10(tick) - 1) / 4.4 * (y0 - y1) - d.line((x0, y, x1, y), fill=GRID, width=2) - d.text((62, y - 12), f"{tick:,}", fill=MUTED, font=FONT_XS) - bw = 190 - gap = 105 - for i, (label, value, color) in enumerate(data): - x = x0 + 90 + i * (bw + gap) - y = y0 - 
(math.log10(value) - 1) / 4.4 * (y0 - y1) - d.rounded_rectangle((x, y, x + bw, y0), radius=10, fill=color) - text_center(d, (x + bw / 2, y - 42), f"{value:,}", SLATE, FONT_SB) - text_center(d, (x + bw / 2, y0 + 22), label, SLATE, FONT_S) - d.text((160, 812), "Takeaway: four orders of magnitude separate the cheapest path from the naive baseline.", fill=SLATE, font=FONT_SB) - return save_chart(img, "01-token-cost-log.png") - - -def mode_value(task_id, mode, field): - row = next(r for r in v13["detail"] if r["task_id"] == task_id) - return row[mode][field] - - -def chart_quality_heatmap(): - img, d = new_chart("Composite Quality by Task", "Composite score on 0-15 scale") - left, top = 340, 155 - cell_w, cell_h = 260, 38 - for j, mode in enumerate(MODES): - text_center(d, (left + j * cell_w + cell_w / 2, top - 44), MODE_LABEL[mode], SLATE, FONT_SB) - for i, tid in enumerate(task_order): - y = top + i * cell_h - d.text((54, y + 8), tid, fill=SLATE, font=FONT_XS) - for j, mode in enumerate(MODES): - val = mode_value(tid, mode, "composite_15") - ratio = max(0, min(1, val / 15)) - r = int(245 - ratio * 60) - g = int(225 - ratio * 95) - b = int(210 - ratio * 115) - x = left + j * cell_w - d.rectangle((x, y, x + cell_w - 8, y + cell_h - 6), fill=(r, g, b)) - text_center(d, (x + cell_w / 2 - 4, y + 5), f"{val:g}", SLATE, FONT_XS) - d.text((60, 812), "Takeaway: Smart Compile loses on 8 of 15 tasks despite costing far more tokens than no-context.", fill=SLATE, font=FONT_SB) - return save_chart(img, "02-quality-heatmap.png") - - -def chart_smart_delta(): - rows = [] - for tid in task_order: - rows.append((tid, mode_value(tid, "smart", "composite_15") - mode_value(tid, "no_context", "composite_15"))) - rows.sort(key=lambda x: x[1]) - img, d = new_chart("Smart vs No-Context Delta", "Positive values mean Smart wins") - x0, y0, x1, y1 = 780, 760, 1480, 160 - zero = x0 + (0 + 10) / 16 * (x1 - x0) - d.line((zero, y1, zero, y0), fill=SLATE, width=3) - for tick in [-10, -5, 0, 5]: - x = 
x0 + (tick + 10) / 16 * (x1 - x0) - d.line((x, y1, x, y0), fill=GRID, width=1) - text_center(d, (x, y0 + 20), str(tick), MUTED, FONT_XS) - bh = 30 - for i, (tid, delta) in enumerate(rows): - y = y1 + i * 39 - d.text((60, y + 5), tid, fill=SLATE, font=FONT_XS) - x = x0 + (delta + 10) / 16 * (x1 - x0) - color = GREEN if delta > 0 else RED if delta < 0 else MUTED - d.rectangle((min(zero, x), y, max(zero, x), y + bh), fill=color) - d.text((max(zero, x) + 8 if delta >= 0 else min(zero, x) - 42, y + 4), f"{delta:+g}", fill=SLATE, font=FONT_XS) - d.text((60, 812), "Takeaway: when Smart fails, it fails catastrophically; when it wins, it wins modestly.", fill=SLATE, font=FONT_SB) - return save_chart(img, "03-smart-delta.png") - - -def chart_hallucinations(): - img, d = new_chart("Hallucination Distribution", "Mean fabricated facts per response") - x_positions = [380, 800, 1220] - y0, y1 = 730, 160 - for tick in range(0, 6): - y = y0 - tick / 5 * (y0 - y1) - d.line((180, y, 1420, y), fill=GRID, width=1) - d.text((120, y - 10), str(tick), fill=MUTED, font=FONT_XS) - for j, mode in enumerate(MODES): - xs = x_positions[j] - vals = [mode_value(tid, mode, "halluc") for tid in task_order] - for i, val in enumerate(vals): - y = y0 - val / 5 * (y0 - y1) - x = xs + ((i % 5) - 2) * 18 - d.ellipse((x - 9, y - 9, x + 9, y + 9), fill=[GREEN, CORAL, BLUE][j], outline=SLATE) - med = statistics.mean(vals) - y = y0 - med / 5 * (y0 - y1) - d.line((xs - 95, y, xs + 95, y), fill=SLATE, width=4) - text_center(d, (xs, y0 + 24), MODE_LABEL[mode], SLATE, FONT_S) - text_center(d, (xs, y - 34), f"mean {med:.2f}", SLATE, FONT_XS) - d.text((160, 812), "Takeaway: loading more context creates more fabricated facts, not fewer.", fill=SLATE, font=FONT_SB) - return save_chart(img, "04-hallucinations.png") - - -def search_retrieval_rows(): - return [r for r in v13_rows if r["mode"] == "search"] - - -def chart_retrieval_scatter(): - points = [] - for row in search_retrieval_rows(): - tid = row["task_id"] - p 
= row["retrieval_quality"]["precision_at_k"] - y = mode_value(tid, "search", "composite_15") - mode_value(tid, "no_context", "composite_15") - points.append((tid, p, y)) - img, d = new_chart("Retrieval Quality vs Outcome", "Search Precision@K against quality delta") - x0, y0, x1, y1 = 170, 720, 1460, 170 - for tick in [0, 0.25, 0.5, 0.75, 1.0]: - x = x0 + tick * (x1 - x0) - d.line((x, y0, x, y1), fill=GRID) - text_center(d, (x, y0 + 22), f"{tick:.2f}", MUTED, FONT_XS) - for tick in [-4, -2, 0, 2, 4]: - y = y0 - (tick + 4) / 8 * (y0 - y1) - d.line((x0, y, x1, y), fill=GRID) - d.text((110, y - 10), f"{tick:+d}", fill=MUTED, font=FONT_XS) - xs = [p[1] for p in points] - ys = [p[2] for p in points] - mean_x, mean_y = statistics.mean(xs), statistics.mean(ys) - denom = sum((x - mean_x) ** 2 for x in xs) or 1 - slope = sum((x - mean_x) * (y - mean_y) for x, y in zip(xs, ys)) / denom - intercept = mean_y - slope * mean_x - tx0, tx1 = 0, 1 - ty0, ty1 = intercept, intercept + slope - d.line((x0, y0 - (ty0 + 4) / 8 * (y0 - y1), x1, y0 - (ty1 + 4) / 8 * (y0 - y1)), fill=CORAL, width=4) - for tid, p, delta in points: - x = x0 + p * (x1 - x0) - y = y0 - (delta + 4) / 8 * (y0 - y1) - d.ellipse((x - 10, y - 10, x + 10, y + 10), fill=BLUE, outline=SLATE) - if p in (0, 1) or abs(delta) >= 3: - d.text((x + 12, y - 8), tid[:18], fill=SLATE, font=FONT_XS) - d.text((170, 812), "Takeaway: where retrieval surfaces relevant chunks, search adds value; misses degrade output.", fill=SLATE, font=FONT_SB) - return save_chart(img, "05-retrieval-scatter.png") - - -def chart_lift_efficiency(): - img, d = new_chart("Cost per Positive Quality Lift", "Tokens per +1 composite point over no-context") - data = [("Smart", v13["lift_efficiency"]["smart"], CORAL), ("Search", v13["lift_efficiency"]["search"], BLUE)] - maxv = max(item["median_tk_per_lift_pt"] for _, item, _ in data) - x0, y0, x1, y1 = 260, 700, 1350, 180 - for i, (label, item, color) in enumerate(data): - val = item["median_tk_per_lift_pt"] 
- h = val / maxv * (y0 - y1) - x = x0 + i * 500 - d.rounded_rectangle((x, y0 - h, x + 260, y0), radius=12, fill=color) - text_center(d, (x + 130, y0 - h - 42), f"{val:,.0f}", SLATE, FONT_SB) - text_center(d, (x + 130, y0 + 24), label, SLATE, FONT_S) - d.text((x - 10, y0 + 68), f"negative-lift tasks: {item['n_negative_lift']}/{item['n_total']}", fill=MUTED, font=FONT_XS) - d.text((200, 812), "Takeaway: Search is 76x cheaper per positive lift point, but both modes lose on most tasks.", fill=SLATE, font=FONT_SB) - return save_chart(img, "06-lift-efficiency.png") - - -def chart_methodology(): - img, d = new_chart("Methodology Changed the Result", "Why v1.1 and v1.2 were retracted") - rows = [ - ("v1.1", "5 tasks", "Session as judge, holistic 1-10", "Smart 9.0/10; optimistic"), - ("v1.2", "5 tasks", "Gold facts + decomposed rubric, same session judge", "Smart +20 composite pts vs no-context"), - ("v1.3", "15 tasks", "Sonnet task model, Haiku judge x3", "Smart -1.27 pts; loses 8/15"), - ] - x = [90, 300, 540, 1040] - y = 190 - headers = ["Run", "N", "Judge/rubric", "Headline"] - for j, h in enumerate(headers): - d.text((x[j], y), h, fill=SLATE, font=FONT_SB) - for i, row in enumerate(rows): - yy = y + 80 + i * 155 - fill = "#fffaf1" if i % 2 == 0 else "#eee8dc" - d.rounded_rectangle((70, yy - 24, 1510, yy + 92), radius=16, fill=fill, outline=GRID, width=2) - for j, cell in enumerate(row): - d.text((x[j], yy), cell, fill=SLATE if i < 2 else RED, font=FONT_S if j else FONT_B) - d.text((90, 812), "Takeaway: self-evaluation bias accounted for roughly three composite points of inflation.", fill=SLATE, font=FONT_SB) - return save_chart(img, "07-methodology.png") - - -def chart_latency_cost(): - img, d = new_chart("Latency and Cost by Mode", "Median latency and total API cost in v1.3") - panels = [("Median latency (ms)", "latency_ms_median", 150, 720, 680), ("Total cost (USD)", "cost_usd_total", 880, 720, 1410)] - for title, field, x0, y0, x1 in panels: - d.text((x0, 150), 
title, fill=SLATE, font=FONT_SB) - vals = [v13["by_mode"][m][field] for m in MODES] - maxv = max(vals) - for i, (mode, val) in enumerate(zip(MODES, vals)): - x = x0 + i * 170 - h = val / maxv * 430 - d.rounded_rectangle((x, y0 - h, x + 110, y0), radius=8, fill=[GREEN, CORAL, BLUE][i]) - label = f"${val:.2f}" if field == "cost_usd_total" else f"{int(val):,}" - text_center(d, (x + 55, y0 - h - 34), label, SLATE, FONT_XS) - text_center(d, (x + 55, y0 + 22), MODE_LABEL[mode].replace(" ", "\n"), SLATE, FONT_XS) - d.text((150, 812), "Takeaway: Smart is slower and materially more expensive while under-performing no-context.", fill=SLATE, font=FONT_SB) - return save_chart(img, "08-latency-cost.png") - - -def build_charts(): - return [ - chart_token_cost(), - chart_quality_heatmap(), - chart_smart_delta(), - chart_hallucinations(), - chart_retrieval_scatter(), - chart_lift_efficiency(), - chart_methodology(), - chart_latency_cost(), - ] - - -styles = getSampleStyleSheet() -H1 = ParagraphStyle("H1", parent=styles["Heading1"], fontName="Helvetica-Bold", fontSize=20, leading=24, textColor=colors.HexColor(SLATE), spaceAfter=10) -H2 = ParagraphStyle("H2", parent=styles["Heading2"], fontName="Helvetica-Bold", fontSize=14, leading=18, textColor=colors.HexColor(SLATE), spaceBefore=10, spaceAfter=6) -H3 = ParagraphStyle("H3", parent=styles["Heading3"], fontName="Helvetica-Bold", fontSize=10.5, leading=13, textColor=colors.HexColor(CORAL), spaceBefore=6, spaceAfter=3) -BODY = ParagraphStyle("BODY", parent=styles["BodyText"], fontName="Helvetica", fontSize=9.6, leading=13, textColor=colors.HexColor(SLATE), alignment=TA_LEFT, spaceAfter=6) -QUOTE = ParagraphStyle("QUOTE", parent=BODY, leftIndent=10, rightIndent=10, fontName="Helvetica-Oblique", textColor=colors.HexColor(MUTED), backColor=colors.HexColor("#eee8dc"), borderPadding=6, spaceAfter=8) -CAP = ParagraphStyle("CAP", parent=BODY, fontSize=8.4, leading=11, textColor=colors.HexColor(MUTED), spaceAfter=8) -TITLE = 
ParagraphStyle("TITLE", parent=H1, fontSize=28, leading=34, alignment=TA_CENTER, spaceAfter=14) -SUB = ParagraphStyle("SUB", parent=BODY, fontSize=12, leading=16, alignment=TA_CENTER, textColor=colors.HexColor(MUTED)) -SMALL = ParagraphStyle("SMALL", parent=BODY, fontSize=7.5, leading=9) - - -def para(text, style=BODY): - return Paragraph(esc(text), style) - - -def rich(text, style=BODY): - return Paragraph(text, style) - - -def table(headers, rows, widths, small=False): - body = [[rich(f"{esc(h)}", SMALL if small else BODY) for h in headers]] - for row in rows: - body.append([rich(esc(c), SMALL if small else BODY) for c in row]) - t = LongTable(body, colWidths=widths, repeatRows=1) - t.setStyle(TableStyle([ - ("BACKGROUND", (0, 0), (-1, 0), colors.HexColor(SLATE)), - ("TEXTCOLOR", (0, 0), (-1, 0), colors.white), - ("ROWBACKGROUNDS", (0, 1), (-1, -1), [colors.white, colors.HexColor("#f2ede3")]), - ("GRID", (0, 0), (-1, -1), 0.25, colors.HexColor(GRID)), - ("VALIGN", (0, 0), (-1, -1), "TOP"), - ("LEFTPADDING", (0, 0), (-1, -1), 4), - ("RIGHTPADDING", (0, 0), (-1, -1), 4), - ("TOPPADDING", (0, 0), (-1, -1), 3), - ("BOTTOMPADDING", (0, 0), (-1, -1), 3), - ])) - return t - - -def chart_flow(path: Path, caption: str): - return KeepTogether([ - RLImage(str(path), width=170 * mm, height=95 * mm), - rich(f"Caption. {esc(caption)}", CAP), - ]) - - -def cover(story): - story.append(Spacer(1, 65 * mm)) - story.append(rich("Context Engine
Benchmark Report v1.0", TITLE)) - story.append(rich("Token efficiency, quality regression, and roadmap implications across v1.1 to v1.3", SUB)) - story.append(Spacer(1, 18 * mm)) - story.append(rich("Publication-grade summary: Context Engine saves tokens. The current v0.3.x retrieval and Smart Compile implementation does not yet preserve quality. v1.3 retracts the earlier headline while validating the paper's multi-resolution architecture.", QUOTE)) - story.append(PageBreak()) - - -def executive_summary(story): - story.append(rich("Executive Summary", H1)) - story.append(rich("The honest result: v1.3 retracts the v1.1/v1.2 headline. Smart Compile in its current v0.3.x form net-degrades quality versus no-context: mean delta -1.27 composite points on a 15-point scale, with 8 losses, 2 ties, and 5 wins across 15 tasks. Search also under-performs no-context on the corpus: -0.80 mean delta, with 8 losses.", BODY)) - story.append(rich("The validation: token reduction is real and large. Median input tokens fall from 186,654 in the raw-all baseline to 74,279 in Smart and 903 in Search. This validates the token-efficiency part of the architecture but not the quality-preservation claim.", BODY)) - story.append(rich("White paper cross-reference: Section 11 predicted that binary include/exclude selection would not be enough; skills need multi-resolution packaging. v1.3 empirically confirms that prediction. Section 32 Hypothesis 1 - Smart Compile reduces token load without reducing task quality - is not supported by the current implementation.", BODY)) - story.append(rich("Product decision: CE should be framed today as a token-saving and retrieval research system, not as a quality-improving context broker. 
Phase 2 ranking/dedup and Section 11 multi-resolution packaging are load-bearing, not polish.", BODY)) - story.append(PageBreak()) - - -def methodology(story, charts): - story.append(rich("Methodology Evolution", H1)) - story.append(rich("The result changed because the measurement became harder to fool. v1.1 used the active session as a holistic judge. v1.2 added gold facts and a decomposed rubric but retained session judging. v1.3 used Sonnet 4.5 as the task model and Haiku 4.5 as a separate judge over three runs per response.", BODY)) - rows = [ - ["v1.1", "5", "Session-as-judge; holistic 1-10", "Smart 9.0/10, Search 7.6/10", "Retracted as over-optimistic"], - ["v1.2", "5", "Gold facts + decomposed rubric; session judge", "Smart +20 composite pts vs no-context", "Directionally useful, still biased"], - ["v1.3", "15", "Sonnet task model; Haiku judge x3", "Smart -1.27; Search -0.80 vs no-context", "Current decision baseline"], - ] - story.append(table(["Run", "N", "Judge/rubric", "Headline", "Status"], rows, [18*mm, 10*mm, 48*mm, 50*mm, 40*mm])) - story.append(Spacer(1, 6)) - story.append(chart_flow(charts[6], "Self-evaluation bias made earlier runs look better than the full v1.3 pipeline.")) - story.append(PageBreak()) - - -def headline_results(story, charts): - story.append(rich("Headline Results", H1)) - story.append(chart_flow(charts[0], "Four orders of magnitude separate the cheapest path from the naive raw-all baseline.")) - rows = [] - for mode in MODES: - a = v13["by_mode"][mode] - rows.append([MODE_LABEL[mode], num(a["input_tokens_median"]), pct(a["coverage_pct_mean"]), pct(a["bonus_pct_mean"]), f"{a['factual_median']:.1f}", f"{a['specific_median']:.1f}", f"{a['complete_median']:.1f}", f"{a['hallucinations_mean']:.2f}", f"{a['composite_15_median']:.1f}", f"${a['cost_usd_total']:.2f}"]) - story.append(table(["Mode", "med tk", "Cov", "Bonus", "Fact", "Spec", "Comp", "Halluc", "Comp/15", "Cost"], rows, [24*mm, 18*mm, 14*mm, 15*mm, 12*mm, 12*mm, 12*mm, 16*mm, 
17*mm, 16*mm], small=True)) - story.append(rich("Table 1. Per-mode aggregate, all 15 tasks. Fact/Spec/Comp are judge medians on 1-5 axes; composite is on a 0-15 scale.", CAP)) - story.append(PageBreak()) - story.append(chart_flow(charts[1], "Smart Compile loses on 8 of 15 tasks despite costing about 2,500x more input tokens than no-context.")) - story.append(chart_flow(charts[2], "The worst Smart failure is large enough to dominate the mean; positive wins are smaller.")) - story.append(PageBreak()) - paired_rows = [] - for key, label in [("smart_vs_no_context", "Smart vs no-context"), ("search_vs_no_context", "Search vs no-context"), ("smart_vs_search", "Smart vs Search")]: - p = v13["paired"][key] - paired_rows.append([label, str(p["n"]), f"{p['mean_delta']:+.2f}", f"{p['median_delta']:+.1f}", f"{p['wins']}/{p['ties']}/{p['losses']}", f"{p['worst_task'][0]} ({p['worst_task'][1]:+g})", f"{p['best_task'][0]} ({p['best_task'][1]:+g})"]) - story.append(table(["Comparison", "n", "mean", "median", "W/T/L", "Worst", "Best"], paired_rows, [38*mm, 10*mm, 16*mm, 16*mm, 18*mm, 40*mm, 40*mm])) - story.append(rich("Table 2. Paired comparison by task. Positive deltas mean the first mode outperformed the second.", CAP)) - story.append(chart_flow(charts[7], "Smart costs more and runs slower while under-performing no-context in aggregate.")) - story.append(PageBreak()) - - -def whitepaper_crossref(story): - story.append(rich("White Paper Predictions vs v1.3", H1)) - story.append(rich("Section 11 - Multi-Resolution Context Packaging: The paper says the key optimisation is not merely selecting chunks but selecting resolution: manifest, glossary, summary, targeted chunks, section, or full source. v1.3 confirms this. 
Injecting many full skill bodies causes attention dilution; the model follows meta-style and process text instead of the user's concrete request.", BODY)) - story.append(rich("Section 19 - Future Evaluation Framework: The paper specifically named Precision@K, Recall@K, MRR, token utility, and compile reduction as future metrics. v1.3 is the first run to instantiate that scaffold. Retrieval quality is low: Precision@8 mean 0.23, Recall@8 mean 0.56, MRR mean 0.31, with 5/15 complete misses.", BODY)) - story.append(rich("Section 32 - Hypothesis 1: 'Smart Compile reduces token load without reducing task quality' is not supported by v0.3.x. The token-load half is true; the quality half is false on this corpus. Hypothesis 2 - multi-resolution packaging improves retrieval efficiency - becomes the more important next test.", BODY)) - story.append(rich("Section 34 - Build Roadmap: Phase 2 Dedup and Rank is the first major improvement to retrieval quality. The benchmark makes that phase urgent. Phase 4 Multi-Resolution Skill Compiler is not optional if Smart Compile is to become quality-preserving.", BODY)) - story.append(PageBreak()) - - -def failure_mode(story): - story.append(rich("Failure Mode Case Study", H1)) - story.append(rich("The clearest failure is comfy-prompt-fantasy. The user asked for a ComfyUI prompt. No-context produced a practical prompt with positive/negative sections and settings. 
Smart, after receiving 36k tokens of skill context, produced a philosophical essay titled Mystral Dusk and failed the task.", BODY)) - no_ctx = responses["comfy-prompt-fantasy"]["no_context"]["text"][:780] - smart = responses["comfy-prompt-fantasy"]["smart"]["text"][:780] - story.append(rich("No-context response excerpt", H3)) - story.append(para(no_ctx, QUOTE)) - story.append(rich("Smart response excerpt", H3)) - story.append(para(smart, QUOTE)) - story.append(rich("This is the attention-dilution failure mode in concrete form: the model absorbed aesthetic and meta-discussion from context and produced more meta-discussion instead of an executable prompt.", BODY)) - story.append(PageBreak()) - - -def hallucination_retrieval(story, charts): - story.append(rich("Hallucinations and Retrieval Quality", H1)) - story.append(chart_flow(charts[3], "Loading more context increases fabricated facts: Smart mean hallucinations 1.49 vs no-context 0.98.")) - story.append(chart_flow(charts[4], "Search quality depends on retrieval quality. Complete misses predict degraded output.")) - story.append(PageBreak()) - rows = [] - for row in search_retrieval_rows(): - rq = row["retrieval_quality"] - rows.append([row["task_id"], f"{rq['precision_at_k']:.2f}", f"{rq['recall_at_k']:.2f}", f"{rq['mrr']:.2f}", ", ".join(rq.get("hits") or []) or "-", ", ".join(rq.get("expected") or [])]) - story.append(table(["Task", "P@K", "R@K", "MRR", "Hits", "Expected"], rows, [38*mm, 13*mm, 13*mm, 13*mm, 42*mm, 55*mm], small=True)) - story.append(rich("Table 4. Retrieval quality per task for search mode. 
Complete misses are the strongest warning signal for quality loss.", CAP)) - story.append(PageBreak()) - - -def validated_retracted(story, charts): - story.append(rich("Validated, Retracted, Pending", H1)) - story.append(rich("Validated: token reduction is real; the benchmark harness now exists; Section 11's architecture is validated; retrieval quality predicts failure; Search is much more efficient than Smart per positive lift point.", BODY)) - story.append(rich("Retracted: the v1.1 claim that Smart held at 9.0/10 quality; the v1.2 framing that Smart improves quality over no-context; the product claim that v0.3.x Smart Compile preserves quality.", BODY)) - story.append(rich("Pending: cross-family judges beyond Haiku; alternative task models; a larger task corpus; user correction-rate measurement; and a post-Phase-2 rerun after ranking/dedup changes.", BODY)) - story.append(chart_flow(charts[5], "Search is dramatically more token-efficient when it helps, but both modes show negative lift on most tasks.")) - story.append(PageBreak()) - - -def roadmap(story): - story.append(rich("Build Roadmap Implications", H1)) - rows = [ - ["Phase 2: Dedup + Rank", "Highest", "P@8 is 0.23 and 5/15 searches miss completely. 
Ranking quality is the bottleneck."], - ["Phase 3: Smart Compile", "Re-scope", "Do not ship as quality-improving until it chooses resolution, not whole skills."], - ["Phase 4: Multi-Resolution Compiler", "Load-bearing", "Needed to avoid flooding the task model with full SKILL.md bodies."], - ["Retrieval dashboard", "High", "Expose P@K/R@K/MRR style signals and expected-source misses before user-facing claims."], - ["Benchmark gate", "High", "Any ranking change should rerun this report before release notes claim quality preservation."], - ] - story.append(table(["Roadmap item", "Priority", "Why v1.3 changes the decision"], rows, [42*mm, 24*mm, 104*mm])) - story.append(Spacer(1, 8)) - story.append(rich("The product framing should change immediately: current CE saves tokens and provides the plumbing for brokered context. It does not yet prove that brokered context improves output quality. That distinction protects the roadmap from overselling v0.3.x.", BODY)) - story.append(PageBreak()) - - -def limitations(story): - story.append(rich("Limitations Still Standing", H1)) - for item in [ - "Judge family is still Anthropic-only: Sonnet task model, Haiku judge. This avoids same-model self-judging but not family-level bias.", - "The corpus has only 15 tasks. It is representative of this project, not a universal benchmark.", - "Gold facts and expected sources are manually authored, so they encode human judgement.", - "No user correction-rate or real workflow completion metric is included yet.", - "v1.3 measures the current implementation, not the intended Section 11 multi-resolution implementation.", - ]: - story.append(rich("- " + esc(item), BODY)) - story.append(PageBreak()) - - -def reproducibility(story): - story.append(rich("Reproducibility and Source Lineage", H1)) - story.append(rich("The report is generated from immutable benchmark artifacts already present under app/bench. 
Historical white papers remain read-only records; this report cross-references them rather than rebuilding them.", BODY)) - rows = [ - ["Task corpus", "app/bench/tasks.json", "15 representative tasks"], - ["Gold answers", "app/bench/gold-answers.json", "Must/may/forbidden facts and expected sources"], - ["v1.1 token-only", "app/bench/data/v1.1/results-latest.json", "15-task token run, no quality grading"], - ["v1.1 graded sample", "app/bench/data/v1.1/results-graded-sample.json", "5-task session-as-judge quality sample"], - ["v1.2 final", "app/bench/data/v1.2/results-v12-final.json", "Gold scoring and decomposed rubric on 5-task subset"], - ["v1.3 final", "app/bench/data/v1.3/results-v13-final.json", "15-task aggregate, paired comparisons, lift efficiency"], - ["v1.3 raw rows", "app/bench/data/v1.3/results-v13.json", "45 task/mode rows with retrieval-quality detail"], - ["v1.3 responses", "app/bench/data/v1.3/responses-v13.json", "All task-model outputs used for case studies"], - ["Build script", "app/bench/build_report.py", "Single-command PDF and chart generation"], - ] - story.append(table(["Input", "Path", "Use"], rows, [36*mm, 66*mm, 68*mm], small=True)) - story.append(Spacer(1, 8)) - story.append(rich("Rebuild command: python app/bench/build_report.py. 
The script writes eight PNG charts to app/bench/artifacts/charts/ and the final PDF to app/bench/artifacts/reports/.", BODY)) - story.append(rich("No historical result JSON, benchmark runner, grading script, or white-paper PDF is modified by the build.", BODY)) - story.append(PageBreak()) - - -def detail_tables(story): - story.append(rich("Appendix A: Full Per-Task Detail", H1)) - rows = [] - for tid in task_order: - detail = next(r for r in v13["detail"] if r["task_id"] == tid) - for mode in MODES: - m = detail[mode] - rows.append([tid, MODE_LABEL[mode], num(m["input_tokens"]), pct(m["coverage"], 0), pct(m["bonus"], 0), f"{m['factual']:.1f}", f"{m['specific']:.1f}", f"{m['complete']:.1f}", f"{m['halluc']:.2f}", f"{m['composite_15']:.1f}"]) - story.append(table(["Task", "Mode", "Tokens", "Cov", "Bonus", "Fact", "Spec", "Comp", "Halluc", "Comp/15"], rows, [38*mm, 20*mm, 17*mm, 13*mm, 14*mm, 12*mm, 12*mm, 12*mm, 15*mm, 15*mm], small=True)) - story.append(PageBreak()) - - -def build_pdf(): - charts = build_charts() - doc = SimpleDocTemplate(str(OUT), pagesize=A4, rightMargin=18*mm, leftMargin=18*mm, topMargin=18*mm, bottomMargin=16*mm) - story = [] - cover(story) - executive_summary(story) - methodology(story, charts) - headline_results(story, charts) - whitepaper_crossref(story) - failure_mode(story) - hallucination_retrieval(story, charts) - validated_retracted(story, charts) - roadmap(story) - limitations(story) - reproducibility(story) - detail_tables(story) - doc.build(story, onFirstPage=page_bg, onLaterPages=page_bg) - - -def page_bg(canvas, doc): - canvas.saveState() - canvas.setFillColor(colors.HexColor(IVORY)) - canvas.rect(0, 0, A4[0], A4[1], fill=1, stroke=0) - canvas.setStrokeColor(colors.HexColor(GRID)) - canvas.line(18*mm, 13*mm, A4[0]-18*mm, 13*mm) - canvas.setFillColor(colors.HexColor(MUTED)) - canvas.setFont("Helvetica", 7) - canvas.drawString(18*mm, 8*mm, "Context Engine Benchmark Report v1.0") - canvas.drawRightString(A4[0]-18*mm, 8*mm, 
str(doc.page)) - canvas.restoreState() - - -if __name__ == "__main__": - build_pdf() - size_mb = OUT.stat().st_size / (1024 * 1024) - print(f"Wrote {OUT} ({size_mb:.2f} MB)") diff --git a/bench/continuity-gold.json b/bench/continuity-gold.json deleted file mode 100644 index abc79bb..0000000 --- a/bench/continuity-gold.json +++ /dev/null @@ -1,198 +0,0 @@ -{ - "_meta": { - "schema": "Per-task: must_inherit (facts S2 must respect from S1), reset_questions (regex patterns indicating S2 is asking for state it should already have), forbidden_drift (alternative choices that contradict S1).", - "scoring": "must_inherit hits are good. reset_question matches are bad. forbidden_drift matches are bad." - }, - "tasks": { - "strava-project": { - "must_inherit": [ - { - "name": "uses_uv", - "keywords": ["uv add", "uv run", "uv pip", "uv sync"], - "desc": "uses uv, not pip or poetry" - }, - { "name": "uses_pytest", "keywords": ["pytest", "test_", "tests/"] }, - { "name": "uses_ruff", "keywords": ["ruff"] }, - { - "name": "correct_path", - "keywords": ["E:/projects/strava-analyzer", "E:\\projects\\strava-analyzer", "strava-analyzer/"] - }, - { "name": "src_layout", "keywords": ["src/strava_analyzer", "src/strava_analyzer/"] }, - { "name": "python_312", "keywords": ["3.12", "Python 3.12", "python_version = \"3.12\""] } - ], - "reset_questions": [ - { - "name": "asks_path", - "patterns": ["what.*path", "where.*project", "which directory", "project location"] - }, - { - "name": "asks_tooling", - "patterns": [ - "which.*tool", - "uv or pip", - "pip or uv", - "pytest or unittest", - "ruff or black", - "what package manager" - ] - }, - { "name": "asks_layout", "patterns": ["src layout or flat", "which layout", "src/ or no"] } - ], - "forbidden_drift": [ - { - "name": "uses_pip", - "keywords": ["pip install", "requirements.txt"], - "desc": "S1 chose uv; pip is drift" - }, - { "name": "uses_poetry", "keywords": ["poetry add", "poetry install"] }, - { "name": "uses_black", "keywords": 
["black ."], "desc": "S1 chose ruff" }, - { "name": "uses_unittest", "keywords": ["import unittest", "TestCase"] } - ] - }, - "fashion-shoot-prompts": { - "must_inherit": [ - { "name": "uses_juggernaut", "keywords": ["Juggernaut"] }, - { "name": "uses_dpm_karras", "keywords": ["DPM++ 2M Karras", "DPM++ 2M", "Karras"] }, - { "name": "uses_30_steps", "keywords": ["30 steps", "steps: 30", "steps=30"] }, - { "name": "uses_cfg_65", "keywords": ["CFG 6.5", "CFG: 6.5", "cfg=6.5"] }, - { "name": "uses_resolution", "keywords": ["1344x768", "1344 x 768"] }, - { "name": "aesthetic_tokyo", "keywords": ["Tokyo"] }, - { "name": "aesthetic_neon", "keywords": ["neon"] } - ], - "reset_questions": [ - { "name": "asks_sampler", "patterns": ["which sampler", "what sampler"] }, - { "name": "asks_checkpoint", "patterns": ["which.*checkpoint", "which.*model", "SDXL or"] }, - { "name": "asks_aesthetic", "patterns": ["what.*aesthetic", "what.*style", "what.*vibe"] }, - { "name": "asks_resolution", "patterns": ["which resolution", "what resolution", "what.*size"] } - ], - "forbidden_drift": [ - { - "name": "different_sampler", - "keywords": ["Euler a", "DDIM", "UniPC"], - "desc": "S1 chose DPM++ 2M Karras" - }, - { "name": "different_checkpoint", "keywords": ["DreamShaper", "RealVis", "SD 1.5"] } - ] - }, - "slack-release-thread": { - "must_inherit": [ - { "name": "version_referenced", "keywords": ["v0.3.1", "0.3.1"] }, - { "name": "uses_rocket_emoji", "keywords": [":rocket:"] }, - { - "name": "casual_technical_tone", - "keywords": ["folks", "team", "—", "shipped", "rolled out", "going out"] - }, - { "name": "migration_topic", "keywords": ["migration", "migrate", "userData", "auto-migration"] } - ], - "reset_questions": [ - { "name": "asks_version", "patterns": ["which version", "what version", "which release"] }, - { "name": "asks_audience", "patterns": ["who is.*audience", "what channel", "which channel"] }, - { "name": "asks_tone", "patterns": ["what tone", "how formal"] } - ], - 
"forbidden_drift": [ - { "name": "wrong_emoji", "keywords": [":tada:", ":sparkles:", ":fire:"], "desc": "S1 used :rocket:" }, - { "name": "marketing_tone", "keywords": ["world-class", "leverage synergies", "best-in-class"] } - ] - }, - "brand-poster-adapt": { - "must_inherit": [ - { "name": "accent_color", "keywords": ["#CC785C", "Book Cloth", "Clay"] }, - { "name": "background_color", "keywords": ["#F0EEE6", "Ivory"] }, - { "name": "text_color", "keywords": ["#191919", "Slate Dark"] }, - { "name": "display_type", "keywords": ["Styrene A", "Styrene"] }, - { "name": "body_type", "keywords": ["Tiempos"] }, - { "name": "asterisk_motif", "keywords": ["asterisk", "asterisk glyph"] }, - { "name": "headline_kept", "keywords": ["BUILDING WITH CLAUDE", "Building with Claude"] } - ], - "reset_questions": [ - { "name": "asks_palette", "patterns": ["which colors", "what palette", "color scheme"] }, - { "name": "asks_typography", "patterns": ["which font", "what typography", "which typeface"] }, - { "name": "asks_motif", "patterns": ["what graphic", "what motif", "any imagery"] } - ], - "forbidden_drift": [ - { "name": "different_palette", "keywords": ["#FF6B35", "neon", "vibrant gradient", "rainbow"] }, - { "name": "different_typography", "keywords": ["Inter", "Roboto", "Helvetica", "Arial"] } - ] - }, - "memory-consolidation": { - "must_inherit": [ - { - "name": "five_clusters", - "keywords": ["5 clusters", "five clusters", "cluster A", "cluster B", "cluster C"] - }, - { - "name": "merge_targets", - "keywords": [ - "cluster_A_windows", - "cluster A", - "Windows duplicates", - "cluster_B_x100vi", - "cluster B", - "X100VI" - ] - }, - { - "name": "keep_with_stale_tag", - "keywords": ["cluster_D", "cluster D", "cluster_E", "cluster E", "stale tag", "tag as stale"] - }, - { "name": "backup_required", "keywords": ["backup", "snapshot", "save copy", "before mutating"] }, - { "name": "audit_log_path", "keywords": ["data/memory-audits", "memory-audits/2026-05-15.json"] } - ], - 
"reset_questions": [ - { - "name": "asks_what_clusters", - "patterns": ["which.*clusters", "what duplicates", "what was found"] - }, - { - "name": "asks_decisions", - "patterns": ["which.*merge", "what should I merge", "what should be kept"] - }, - { "name": "asks_safety", "patterns": ["should I backup", "do you want a backup"] } - ], - "forbidden_drift": [ - { "name": "no_backup", "keywords": ["overwrites memory.json directly", "no backup needed"] }, - { "name": "merges_wrong", "keywords": ["merge cluster D", "merge cluster E"] } - ] - }, - "comfy-workflow-iteration": { - "must_inherit": [ - { "name": "uses_ultrasharp", "keywords": ["4x-UltraSharp", "UltraSharp"] }, - { "name": "refiner_denoise", "keywords": ["denoise 0.35", "denoise: 0.35", "denoise=0.35"] }, - { "name": "tile_size", "keywords": ["1024 tile", "tile size 1024", "tile_size: 1024"] }, - { "name": "output_4k", "keywords": ["3840x2160", "3840 x 2160", "4K"] }, - { "name": "refiner_checkpoint", "keywords": ["Juggernaut XL", "Juggernaut"] }, - { - "name": "workflow_path", - "keywords": ["E:/comfyui-workflows/product-upscale-4k.json", "product-upscale-4k.json"] - } - ], - "reset_questions": [ - { "name": "asks_upscaler", "patterns": ["which upscaler", "what upscaler model"] }, - { "name": "asks_denoise", "patterns": ["what denoise", "which denoise"] }, - { "name": "asks_resolution", "patterns": ["what resolution", "what output size"] } - ], - "forbidden_drift": [ - { "name": "different_upscaler", "keywords": ["RealESRGAN", "SwinIR", "LDSR"] }, - { "name": "different_denoise", "keywords": ["denoise 0.5", "denoise 0.4", "denoise 0.2"] } - ] - }, - "email-followup-thread": { - "must_inherit": [ - { "name": "patch_version", "keywords": ["v0.3.1", "0.3.1"] }, - { "name": "vulnerability_name", "keywords": ["symlink", "symlink-escape"] }, - { "name": "severity", "keywords": ["moderate"] }, - { "name": "audience", "keywords": ["eng-all", "engineering", "the team"] }, - { "name": "tone_calm", "keywords": 
["update", "updated", "thanks", "appreciated", "no exploit"] } - ], - "reset_questions": [ - { "name": "asks_version", "patterns": ["which version", "which patch"] }, - { "name": "asks_vuln", "patterns": ["what vulnerability", "what was the bug"] }, - { "name": "asks_audience", "patterns": ["who is.*audience", "who should this go to"] } - ], - "forbidden_drift": [ - { "name": "panic_tone", "keywords": ["URGENT", "EMERGENCY", "DISASTER"] }, - { "name": "different_severity", "keywords": ["critical severity", "low severity"] } - ] - } - } -} diff --git a/bench/continuity-tasks.json b/bench/continuity-tasks.json deleted file mode 100644 index c8e983a..0000000 --- a/bench/continuity-tasks.json +++ /dev/null @@ -1,142 +0,0 @@ -{ - "_meta": { - "purpose": "Measure whether session 2 inherits state established in session 1. Each task simulates a paused multi-step workflow: session 1 makes specific decisions (tool choices, paths, parameters, aesthetic choices); session 2 is prompted to continue. With CE-backed memory, session 2 should respect those decisions; without it, session 2 must ask reset questions or guess.", - "modes_to_test": [ - "no_context_cold (S2 has no memory of S1 decisions; baseline of pain)", - "ce_continue (S2 system prompt includes a memory snapshot of S1 state)", - "ce_continue_compact (S2 system prompt includes only the compact handoff note CE emits)" - ], - "scoring": "Per task, gold facts define what S2 MUST inherit (tool choices, paths, parameters) and what counts as a reset question. Deterministic keyword scoring + LLM judge × 3 runs (decomposed rubric)." - }, - "tasks": [ - { - "id": "strava-project", - "category": "code", - "session_1_summary": "Set up a Python project to analyse a Strava CSV export. 
Decisions made: uv for dependencies, pytest for tests, ruff (not black) for formatting, src-layout, Python 3.12, project at E:/projects/strava-analyzer, package name strava_analyzer.", - "session_1_state": { - "project_path": "E:/projects/strava-analyzer", - "package_name": "strava_analyzer", - "layout": "src layout (src/strava_analyzer/)", - "python_version": "3.12", - "deps_tool": "uv", - "test_framework": "pytest", - "formatter": "ruff", - "files_created": ["pyproject.toml", "src/strava_analyzer/__init__.py", "tests/__init__.py"], - "csv_columns_observed": [ - "Activity Date", - "Activity Type", - "Distance", - "Moving Time", - "Average Heart Rate" - ] - }, - "session_1_compact_handoff": "Strava analyzer project at E:/projects/strava-analyzer (package strava_analyzer, src layout, Python 3.12, uv + pytest + ruff). pyproject.toml and src/strava_analyzer/__init__.py created. CSV cols: Activity Date, Activity Type, Distance, Moving Time, Average Heart Rate.", - "session_2_prompt": "Add a CSV loader function for the Strava export." - }, - { - "id": "fashion-shoot-prompts", - "category": "image-gen", - "session_1_summary": "Generated 3 ComfyUI prompts for a streetwear fashion shoot in Tokyo at dusk. Decisions: SDXL Juggernaut checkpoint, DPM++ 2M Karras sampler, 30 steps, CFG 6.5, 1344x768, aesthetic tags include 'oversized silhouettes', 'reflective surfaces', 'neon bleed', model 'Aya 24yo Japanese-Brazilian'.", - "session_1_state": { - "checkpoint": "Juggernaut XL v9", - "sampler": "DPM++ 2M Karras", - "steps": 30, - "cfg": 6.5, - "resolution": "1344x768", - "aesthetic_tags": ["oversized silhouettes", "reflective surfaces", "neon bleed", "Tokyo dusk"], - "model_persona": "Aya, 24yo Japanese-Brazilian model", - "negative_prompt_baseline": "low quality, blurry, deformed, watermark, modern buildings without neon" - }, - "session_1_compact_handoff": "Tokyo streetwear shoot. Juggernaut XL v9, DPM++ 2M Karras 30 steps CFG 6.5 at 1344x768. 
Aesthetic: oversized silhouettes, reflective surfaces, neon bleed. Model: Aya, 24yo Japanese-Brazilian.", - "session_2_prompt": "Give me 2 more prompt variants with different poses." - }, - { - "id": "slack-release-thread", - "category": "comms", - "session_1_summary": "Drafted a Slack announcement for shipping Context Engine v0.3.1 with the data persistence fix. Decisions: :rocket: emoji (not :tada:), audience '@channel devs', casual-technical tone, specific mention of CE_ROOT pinned to app.getPath('userData') and the asar.unpacked seeding. Length ~150 words. Posted to #context-engine.", - "session_1_state": { - "version_shipped": "v0.3.1", - "emoji": ":rocket:", - "audience": "@channel devs", - "tone": "casual-technical", - "channel": "#context-engine", - "approximate_length_words": 150, - "key_technical_points": [ - "CE_ROOT pinned to app.getPath('userData')", - "asar.unpacked seeding on first run", - "auto-migration of legacy data" - ], - "previous_message_opening": "Rolling out v0.3.1 today —" - }, - "session_1_compact_handoff": "Posted v0.3.1 announcement in #context-engine (@channel devs, :rocket: emoji, casual-technical tone, ~150 words). Covered: CE_ROOT pinned to userData, asar.unpacked seeding, auto-migration. Opened with 'Rolling out v0.3.1 today —'.", - "session_2_prompt": "Draft tomorrow's follow-up message with the migration status." - }, - { - "id": "brand-poster-adapt", - "category": "design", - "session_1_summary": "Designed an A3 poster for a developer workshop in Anthropic brand style. 
Decisions: palette Book Cloth #CC785C accent + Ivory Medium #F0EEE6 background + Slate Dark #191919 text, Styrene A display + Tiempos Text body, layout with a 15-degree asterisk glyph top-right, three-circle Venn graphic at lower-third 15% opacity, 'BUILDING WITH CLAUDE' headline.", - "session_1_state": { - "format_original": "A3 portrait (297x420mm)", - "accent_hex": "#CC785C", - "background_hex": "#F0EEE6", - "text_hex": "#191919", - "display_type": "Styrene A", - "body_type": "Tiempos Text", - "headline": "BUILDING WITH CLAUDE", - "graphic_motif": "asterisk glyph + three-circle Venn", - "asterisk_rotation_degrees": 15, - "venn_opacity_pct": 15 - }, - "session_1_compact_handoff": "Anthropic A3 workshop poster. Palette: Book Cloth #CC785C accent / Ivory #F0EEE6 bg / Slate #191919 text. Type: Styrene A display, Tiempos Text body. Headline 'BUILDING WITH CLAUDE'. Motif: asterisk glyph rotated 15deg + three-circle Venn at 15% opacity.", - "session_2_prompt": "Adapt this for an A4 flyer version." - }, - { - "id": "memory-consolidation", - "category": "meta", - "session_1_summary": "Audited memory.json. Identified 5 duplicate clusters (cluster A: 'Jeremy uses Windows' x3, cluster B: 'X100VI photography' x2, cluster C: 'CE token saving' x2, cluster D: 'morning routine' x2, cluster E: 'health tracker on Android' x2). Proposed merges for A, B, C. Flagged D and E as keep-but-tag-stale.", - "session_1_state": { - "clusters_identified": 5, - "clusters_to_merge": ["cluster_A_windows", "cluster_B_x100vi", "cluster_C_ce_tokens"], - "clusters_to_keep_with_stale_tag": ["cluster_D_morning_routine", "cluster_E_health_android"], - "backup_required_before": true, - "audit_log_path": "data/memory-audits/2026-05-15.json" - }, - "session_1_compact_handoff": "Memory audit done. 5 clusters found. Merge: cluster_A_windows, cluster_B_x100vi, cluster_C_ce_tokens. Keep-with-stale-tag: cluster_D_morning_routine, cluster_E_health_android. Backup first. 
Log at data/memory-audits/2026-05-15.json.", - "session_2_prompt": "Apply the consolidation we discussed." - }, - { - "id": "comfy-workflow-iteration", - "category": "image-gen", - "session_1_summary": "Built a ComfyUI upscale workflow for product photos. Decisions: 4x-UltraSharp upscaler, KSampler refiner at denoise 0.35, tiled VAE decode at 1024 tile size, output target 4K (3840x2160), Juggernaut XL refiner, sharpening pass via ImageFilter at 1.2 strength.", - "session_1_state": { - "upscaler_model": "4x-UltraSharp", - "refiner_denoise": 0.35, - "tile_size": 1024, - "tiled_vae": true, - "output_resolution": "3840x2160", - "refiner_checkpoint": "Juggernaut XL v9", - "sharpening_strength": 1.2, - "workflow_json_path": "E:/comfyui-workflows/product-upscale-4k.json" - }, - "session_1_compact_handoff": "Product upscale workflow saved to E:/comfyui-workflows/product-upscale-4k.json. 4x-UltraSharp upscaler -> KSampler refiner (denoise 0.35, Juggernaut XL v9) -> tiled VAE (1024 tiles) -> 3840x2160 output -> sharpening 1.2.", - "session_2_prompt": "Modify the workflow to also do a face-detail pass." - }, - { - "id": "email-followup-thread", - "category": "comms", - "session_1_summary": "Wrote an internal email about the symlink-escape security patch. Subject '[Security] Symlink-escape patch shipped — no action required for most'. Tone: calm-not-panicked. Severity called out as moderate. Recommended action: update via auto-updater. Sender: Jeremy. Recipients: eng-all@.", - "session_1_state": { - "subject_line": "[Security] Symlink-escape patch shipped — no action required for most", - "vulnerability": "symlink-escape", - "severity_called": "moderate", - "recommended_action": "update via auto-updater", - "audience": "eng-all@", - "tone": "calm-not-panicked", - "patch_version": "v0.3.1", - "no_exploit_observed_stated": true - }, - "session_1_compact_handoff": "Sent email to eng-all@ — Subject '[Security] Symlink-escape patch shipped — no action required for most'. 
Severity moderate, no exploit observed, action: update via auto-updater. Tone calm-not-panicked.", - "session_2_prompt": "Draft the one-week-later follow-up confirming everyone's updated." - } - ] -} diff --git a/bench/fetch_contexts.py b/bench/fetch_contexts.py deleted file mode 100644 index 534110e..0000000 --- a/bench/fetch_contexts.py +++ /dev/null @@ -1,106 +0,0 @@ -#!/usr/bin/env python3 -""" -fetch_contexts.py — Pre-fetch CE smart + search contexts for each task and -dump them to disk. Used when the grading pass needs an LLM that isn't -reachable via API key (e.g. running through Claude Code session auth). - -Output: bench/artifacts/contexts/.smart.md and -bench/artifacts/contexts/.search.md plus -bench/artifacts/contexts/manifest.json with token counts + skill counts per task. - -Run AFTER `python bench/tokenomics.py` (or alongside; only depends on CE). -""" - -from __future__ import annotations - -import json -import os -import sys -from pathlib import Path - -# Re-use the helpers we already built in tokenomics.py. -sys.path.insert(0, str(Path(__file__).parent)) -from tokenomics import ( # noqa: E402 - DEFAULT_CE_URL, - build_context_text, - build_search_context_text, - count_tokens, - fetch_active_skill_bodies, - fetch_compiled_baseline, - post_json, - reachable, -) - -HERE = Path(__file__).parent -OUT_DIR = HERE / "artifacts" / "contexts" -TASKS_PATH = HERE / "tasks.json" - -CE_URL = os.environ.get("CE_URL", DEFAULT_CE_URL) -MAX_TOKENS = int(os.environ.get("MAX_TOKENS", "16000")) -SEARCH_LIMIT = int(os.environ.get("SEARCH_LIMIT", "8")) - - -def main() -> int: - if not reachable(CE_URL): - sys.stderr.write(f"[!] 
CE not reachable at {CE_URL}\n") - return 1 - - OUT_DIR.mkdir(parents=True, exist_ok=True) - - with TASKS_PATH.open(encoding="utf-8") as f: - tasks = json.load(f) - - print(f"Fetching active skill bodies...") - active_bodies = fetch_active_skill_bodies(CE_URL) - print(f" {len(active_bodies)} active skills loaded") - - manifest = { - "ce_url": CE_URL, - "max_tokens": MAX_TOKENS, - "search_limit": SEARCH_LIMIT, - "active_skill_count": len(active_bodies), - "tasks": [], - } - - for i, task in enumerate(tasks, 1): - tid = task["id"] - print(f" [{i:>2}/{len(tasks)}] {tid:<28}", end=" ", flush=True) - - smart = post_json(CE_URL, "/api/compile/smart", - {"task": task["prompt"], "maxTokens": MAX_TOKENS}) - selected = smart.get("selectedSkillIds") or [] - smart_ctx = build_context_text(selected, active_bodies) - smart_tokens = count_tokens(smart_ctx) - - search = post_json(CE_URL, "/api/search", - {"query": task["prompt"], "limit": SEARCH_LIMIT}) - chunks = search.get("results") or [] - search_ctx = build_search_context_text(chunks) - search_tokens = count_tokens(search_ctx) - - (OUT_DIR / f"{tid}.smart.md").write_text(smart_ctx, encoding="utf-8") - (OUT_DIR / f"{tid}.search.md").write_text(search_ctx, encoding="utf-8") - - manifest["tasks"].append({ - "id": tid, - "category": task.get("category", ""), - "prompt": task["prompt"], - "selected_skill_count": len(selected), - "smart_tokens": smart_tokens, - "search_tokens": search_tokens, - "search_chunk_count": len(chunks), - "smart_path": str((OUT_DIR / f"{tid}.smart.md").relative_to(HERE.parent)), - "search_path": str((OUT_DIR / f"{tid}.search.md").relative_to(HERE.parent)), - }) - print(f"smart {smart_tokens:>6,}tk / search {search_tokens:>5,}tk") - - (OUT_DIR / "manifest.json").write_text( - json.dumps(manifest, indent=2), encoding="utf-8" - ) - print(f"\nWrote {len(tasks)} task contexts to {OUT_DIR.relative_to(HERE.parent)}/") - print(f"Manifest: {(OUT_DIR / 'manifest.json').relative_to(HERE.parent)}") - return 0 - - -if 
__name__ == "__main__": - sys.exit(main()) diff --git a/bench/fixtures/skills/app-launcher/SKILL.md b/bench/fixtures/skills/app-launcher/SKILL.md deleted file mode 100644 index 6f44058..0000000 --- a/bench/fixtures/skills/app-launcher/SKILL.md +++ /dev/null @@ -1,33 +0,0 @@ ---- -name: App Launcher -description: Windows application launcher shortcuts and automation -triggers: [launch, open, start, Start-Process, shortcut, application, automation] ---- - -# App Launcher - -## Launching Applications - -Use `Start-Process` to launch any application: - -```powershell -Start-Process "C:\Program Files\Slack\slack.exe" -Start-Process "code" # via PATH -Start-Process "https://chat.openai.com" # default browser -``` - -## Finding Application Paths - -Get installed app paths from the registry: - -```powershell -Get-ItemProperty "HKLM:\Software\Microsoft\Windows\CurrentVersion\App Paths\*" | Select-Object '(default)' -``` - -## Startup Folder Automation - -Place shortcuts in `shell:startup` for automatic launch at logon. Use `$env:APPDATA\Microsoft\Windows\Start Menu\Programs\Startup` for per-user startup items. - -## Launch Order - -Use `Start-Process -Wait` to launch apps sequentially, ensuring each is fully loaded before the next starts. Combine with Start-Sleep for apps that need extra initialization time. 
diff --git a/bench/fixtures/skills/brand-guidelines/SKILL.md b/bench/fixtures/skills/brand-guidelines/SKILL.md deleted file mode 100644 index 47b9e59..0000000 --- a/bench/fixtures/skills/brand-guidelines/SKILL.md +++ /dev/null @@ -1,31 +0,0 @@ ---- -name: Brand Guidelines -description: Anthropic brand design guidelines, colors, typography, and aesthetics -triggers: [brand, design, Anthropic, poster, workshop, typography, color palette] ---- - -# Brand Guidelines - -## Color Palette - -Primary brand colors: -- **Book Cloth / Clay**: #CC785C (signature accent) -- **Ivory**: warm off-white background -- **Kraft**: natural brown tone -- **Slate**: dark neutral - -## Typography - -- **Display typeface**: Styrene (headings, impactful text) -- **Body typeface**: Tiempos (body copy, readability) - -## Design Principles - -- **Restraint**: generous whitespace, minimal elements -- **Editorial quality**: single accent per composition -- **No gradients or neon**: avoid glassmorphism, glow effects -- **Print-ready**: A3 format (297×420mm), CMYK, 300 DPI with 3mm bleed - -## Workshop Poster Template - -A3 poster for developer workshops: title in Styrene, body in Tiempos, one Clay accent element, rest in Slate/Ivory. 
diff --git a/bench/fixtures/skills/canvas-design/SKILL.md b/bench/fixtures/skills/canvas-design/SKILL.md deleted file mode 100644 index f67c8fd..0000000 --- a/bench/fixtures/skills/canvas-design/SKILL.md +++ /dev/null @@ -1,28 +0,0 @@ ---- -name: Canvas Design -description: Print and digital canvas design specification and layout -triggers: [canvas, layout, A3, poster, design, composition, print, bleed, DPI] ---- - -# Canvas Design - -## Print Specifications - -- **A3 format**: 297×420mm -- **Resolution**: 300 DPI for print, 72 DPI for digital -- **Color mode**: CMYK for print, RGB for digital -- **Bleed**: 3mm bleed on all sides for print production -- **Safe zone**: 10mm inset from trim edge - -## Poster Layout - -- **Hierarchy**: headline (20%), visual (50%), body text (20%), footer (10%) -- **Margins**: generous whitespace around all elements -- **Grid**: single-column or two-column depending on density -- **Visual**: centered hero image or graphic - -## Digital Canvas - -- **Workshop slides**: 1920×1080 (16:9) or 1024×768 (4:3) -- **Social media**: 1200×630 for Open Graph, 1080×1080 for Instagram -- **Presentation**: consistent margins, brand header/footer diff --git a/bench/fixtures/skills/claude-api/SKILL.md b/bench/fixtures/skills/claude-api/SKILL.md deleted file mode 100644 index 98631c3..0000000 --- a/bench/fixtures/skills/claude-api/SKILL.md +++ /dev/null @@ -1,57 +0,0 @@ ---- -name: Claude API -description: Anthropic Claude API integration, tool use, prompt caching -triggers: [anthropic, claude, API, SDK, tool use, prompt caching, messages, stream] ---- - -# Claude API - -## Tool Use Pattern - -```python -import anthropic - -client = anthropic.Anthropic() -response = client.messages.create( - model="claude-sonnet-4-6", - max_tokens=1024, - tools=[{ - "name": "get_weather", - "description": "Get current weather for a location", - "input_schema": { - "type": "object", - "properties": { - "location": {"type": "string"}, - }, - "required": ["location"] - } 
- }], - messages=[{"role": "user", "content": "What's the weather in London?"}] -) -``` - -## Agent Loop - -Check `response.stop_reason` for `"tool_use"`. When tools are requested, execute the function, add tool_result, and continue: - -```python -while response.stop_reason == "tool_use": - # Execute tool, add result - messages.append({"role": "user", "content": tool_result}) - response = client.messages.create(model=model, messages=messages, tools=tools) -``` - -## Prompt Caching (4.6+) - -Use `cache_control = {"type": "ephemeral"}` on system messages and tools to reduce costs: - -```python -response = client.messages.create( - model="claude-sonnet-4-6", - system=[{"type": "text", "text": system_prompt, "cache_control": {"type": "ephemeral"}}], - tools=tools_with_cache, - messages=messages, -) -``` - -Cache hits shown via `usage.cache_read_input_tokens` and `usage.cache_creation_input_tokens`. diff --git a/bench/fixtures/skills/comfyui-prompt-builder/SKILL.md b/bench/fixtures/skills/comfyui-prompt-builder/SKILL.md deleted file mode 100644 index 9454567..0000000 --- a/bench/fixtures/skills/comfyui-prompt-builder/SKILL.md +++ /dev/null @@ -1,35 +0,0 @@ ---- -name: ComfyUI Prompt Builder -description: Building effective prompts for ComfyUI image generation workflows -triggers: [comfyui, prompt, stable diffusion, image generation, positive, negative, CFG] ---- - -# ComfyUI Prompt Builder - -## Positive Prompt Structure - -Structure positive prompts with subject, style, environment, and lighting: - -``` -fantasy landscape, rolling fog, warm sunset, dramatic clouds, epic vista, detailed, cinematic lighting -``` - -## Negative Prompts - -Include negative prompts to avoid artifacts: - -``` -low quality, blurry, distorted, deformed, extra limbs, bad anatomy, watermark, text, signature -``` - -## Settings Guidance - -- CFG Scale: 7.0 for balanced creativity -- Sampler: DPM++ 2M Karras for quality -- Steps: 20-30 for standard output -- Resolution: 768x768 or 1024x1024 for 
SDXL -- Seed: -1 for random, fixed for reproducibility - -## Weighted Tokens - -Use parentheses and number weights: `(fog:1.2)` or `(sunset:1.3)` to emphasize elements. Use `(unwanted:0.5)` to reduce emphasis. diff --git a/bench/fixtures/skills/comfyui-workflow-manager/SKILL.md b/bench/fixtures/skills/comfyui-workflow-manager/SKILL.md deleted file mode 100644 index 4b0f12b..0000000 --- a/bench/fixtures/skills/comfyui-workflow-manager/SKILL.md +++ /dev/null @@ -1,42 +0,0 @@ ---- -name: ComfyUI Workflow Manager -description: Managing ComfyUI workflows for upscaling, image generation, prompt integration, and refinement -triggers: [comfyui, workflow, upscale, KSampler, refiner, image processing, node, prompt] ---- - -# ComfyUI Workflow Manager - -## Upscaling Workflow - -Build an upscale workflow: -1. **Load Image** node for input -2. **Upscale Model** node with 4x-UltraSharp ESRGAN model -3. **Upscale Image By** node connected to upscale model -4. **KSampler** for refiner pass at reduced denoise (0.3-0.4) -5. **Save Image** node for output - -## 4K Target - -For 4K output, use 4x upscale factor from 1080p input. Check VAE tile mode (`--tile`) to avoid memory issues at high resolutions. Target resolution: 3840x2160 or 4096x2160. - -## Refiner Pass - -After upscaling, run a KSampler with denoise 0.3-0.4 to add detail without changing composition. Set CFG to 5-6 for the refiner pass. Use the same seed as the original generation if preserving composition. - -## Prompt Workflow Integration - -When building a prompt workflow, combine positive and negative prompt nodes with the KSampler for text-to-image generation. Use the CLIP Text Encode node to convert your positive and negative prompts into the conditioning needed by the KSampler. Load a checkpoint model (like SDXL) and connect it to the CLIP and VAE nodes. - -## Fantasy Landscape Prompt Workflow - -To generate a moody fantasy landscape with rolling fog and warm sunset, build this workflow: -1. 
**Checkpoint Loader**: SDXL or Juggernaut XL model -2. **CLIP Text Encode (Positive Prompt)**: "fantasy landscape, rolling fog, warm sunset, dramatic lighting, epic vista, detailed, cinematic atmosphere, moody" -3. **CLIP Text Encode (Negative Prompt)**: "low quality, blurry, distorted, deformed, bad anatomy, watermark, text" -4. **Empty Latent Image**: 1024x1024 or 768x768 -5. **KSampler**: DPM++ 2M Karras, CFG 7, steps 25, denoise 1.0 -6. **VAE Decode** and **Save Image** - -## Template Workflows - -Save workflows as .json files in `ComfyUI/user/default/workflows/`. Load from the Workflows menu. Share prompt templates as workflow files that include the full node graph — image loading, upscaling, refiner pass, and save node. Use template workflows to save your favourite prompt configurations for reuse. diff --git a/bench/fixtures/skills/competitor-analyser/SKILL.md b/bench/fixtures/skills/competitor-analyser/SKILL.md deleted file mode 100644 index 51abc61..0000000 --- a/bench/fixtures/skills/competitor-analyser/SKILL.md +++ /dev/null @@ -1,26 +0,0 @@ ---- -name: Competitor Analyser -description: Market and competitor analysis for product ideation and startup planning -triggers: [competitor, market, analysis, startup, business, photographer, SaaS] ---- - -# Competitor Analyser - -## Market Analysis Framework - -1. **Identify competitors**: direct (similar product), indirect (different approach, same need) -2. **Feature comparison**: build a matrix of features across products -3. **Pricing analysis**: free vs freemium vs subscription tiers -4. 
**User pain points**: read reviews, support forums, social media complaints - -## Photography AI Market - -Current landscape for photographer-focused AI tools: -- **Editing**: Adobe Lightroom AI, Luminar Neo, Topaz Photo AI -- **Workflow**: Pixieset, SmugMug, ShootProof for client galleries -- **Culling**: Narrative Select, PhotoMechanic for initial selection -- **Gap**: personal AI assistant that integrates editing, culling, client management, and EXIF analysis - -## Pain Points - -Freelance photographers spend 40% of time on non-shooting tasks: culling, editing, client communication, gallery management. A personal AI assistant could automate the repetitive parts while keeping the creative control. diff --git a/bench/fixtures/skills/downloads-cleaner/SKILL.md b/bench/fixtures/skills/downloads-cleaner/SKILL.md deleted file mode 100644 index 4807f74..0000000 --- a/bench/fixtures/skills/downloads-cleaner/SKILL.md +++ /dev/null @@ -1,30 +0,0 @@ ---- -name: Downloads Cleaner -description: Windows Downloads folder cleanup and file organization automation -triggers: [downloads, cleanup, organize, file management, archive, stale files, PowerShell] ---- - -# Downloads Cleaner - -## Finding Stale Files - -```powershell -$cutoff = (Get-Date).AddDays(-90) -$stale = Get-ChildItem "$env:USERPROFILE\Downloads" -File | Where-Object { $_.LastWriteTime -lt $cutoff } -$stale | Select-Object Name, LastWriteTime, Length -``` - -## Cleanup Plan - -1. **Review**: first list stale files with dry-run -2. **Archive**: move to `_Archive` folder in Downloads -3. **Delete**: after confirming nothing is needed -4. **Report**: show size recovered and file counts - -## Safety - -Always use `-WhatIf` on destructive operations. Never delete without confirmation. Archive before removing. Create a report of what was moved/deleted. - -## Automation - -Schedule with Task Scheduler to run monthly. Log results to a file. Send summary via email if configured. 
diff --git a/bench/fixtures/skills/email-drafter/SKILL.md b/bench/fixtures/skills/email-drafter/SKILL.md deleted file mode 100644 index 02bb59b..0000000 --- a/bench/fixtures/skills/email-drafter/SKILL.md +++ /dev/null @@ -1,30 +0,0 @@ ---- -name: Email Drafter -description: Professional email drafting for internal and external communications -triggers: [email, draft, subject, message, internal, security, announcement] ---- - -# Email Drafter - -## Security Patch Email Template - -**Subject:** `[Security] Symlink-escape vulnerability patched in v0.3.1` - -**Body:** -1. **What happened**: brief description of the symlink-escape path traversal vulnerability -2. **Impact**: what the vulnerability could allow (local file access, but no remote exploit observed) -3. **Fix**: shipped in v0.3.1 — the path sanitization now prevents symlink traversal -4. **Action**: update to v0.3.1 via auto-update or manual download -5. **Questions**: reply to this thread or DM the security lead - -## Tone Guidelines - -- Professional but direct -- No alarmist language -- State severity clearly -- Include concrete action items -- Sign with sender name and role - -## Structure - -Use `Subject:` line with clear prefix. Body divided into: context, impact, resolution, action items. End with contact info. 
diff --git a/bench/fixtures/skills/file-organiser/SKILL.md b/bench/fixtures/skills/file-organiser/SKILL.md deleted file mode 100644 index 77d8e74..0000000 --- a/bench/fixtures/skills/file-organiser/SKILL.md +++ /dev/null @@ -1,33 +0,0 @@ ---- -name: File Organiser -description: Windows file organization, categorization, and grouping utilities -triggers: [organize, file, categorize, group, move, archive, extension, cleanup] ---- - -# File Organiser - -## Categorization - -Group files by extension or type: -```powershell -Get-ChildItem -Path $downloads -File | Group-Object Extension | Sort-Object Count -Descending -``` - -## Organizing by Extension - -```powershell -$categories = @{ - ".pdf" = "Documents"; ".docx" = "Documents"; ".xlsx" = "Documents" - ".jpg" = "Images"; ".png" = "Images"; ".gif" = "Images" - ".zip" = "Archives"; ".rar" = "Archives"; ".7z" = "Archives" - ".exe" = "Installers"; ".msi" = "Installers" -} -``` - -## Archive Strategy - -Move files older than 90 days to subfolders. Keep a manifest of archived files. Use date-based folder structure: `Archive\2026\05\` for monthly archiving. Log all moves to an archive audit file. - -## Safety - -Preview changes before executing. Use `-WhatIf` flag to verify. Create restore point before bulk operations. 
diff --git a/bench/fixtures/skills/file-search/SKILL.md b/bench/fixtures/skills/file-search/SKILL.md deleted file mode 100644 index 01b629a..0000000 --- a/bench/fixtures/skills/file-search/SKILL.md +++ /dev/null @@ -1,26 +0,0 @@ ---- -name: File Search -description: Windows file search and discovery utilities -triggers: [search, file, find, Get-ChildItem, directory, locate, Path] ---- - -# File Search - -## Searching Files - -```powershell -Get-ChildItem -Path "$env:USERPROFILE\Downloads" -Recurse -File | Where-Object { $_.Extension -in ".pdf", ".docx", ".zip" } -``` - -## Finding Specific Files - -Search by pattern, date, or size: -```powershell -Get-ChildItem -Path C:\ -Recurse -File -Filter "*.log" -ErrorAction SilentlyContinue -Get-ChildItem -Path $HOME -Recurse -File | Where-Object { $_.LastWriteTime -gt (Get-Date).AddDays(-7) } -Get-ChildItem -Path $HOME -Recurse -File | Sort-Object Length -Descending | Select-Object -First 20 -``` - -## Date Filtering - -Use `LastWriteTime` and `CreationTime` properties. Common filters: `-gt (Get-Date).AddDays(-90)` for files modified in the last 90 days, or `-lt` for files older than a threshold. 
diff --git a/bench/fixtures/skills/health-tracker-analyst/SKILL.md b/bench/fixtures/skills/health-tracker-analyst/SKILL.md deleted file mode 100644 index fa41232..0000000 --- a/bench/fixtures/skills/health-tracker-analyst/SKILL.md +++ /dev/null @@ -1,36 +0,0 @@ ---- -name: Health Tracker Analyst -description: Health data analysis, step tracking, and fitness trend analysis -triggers: [health, step, fitness, analysis, tracking, Apple Health, Google Fit, daily goal] ---- - -# Health Tracker Analyst - -## Step Data Analysis - -Analyze step data from Apple Health or Google Fit exports: - -```python -import pandas as pd -df = pd.read_csv("steps_export.csv") -daily_avg = df["steps"].mean() -weekly_median = df.resample("W", on="date")["steps"].median() -``` - -## Key Metrics - -- **Daily average**: mean steps per day -- **Median**: more robust against outlier days -- **Trend**: rolling 7-day average shows direction -- **Streak**: consecutive days above 10,000 steps - -## Goal Tracking - -Compare current performance against the 10,000 steps daily goal. Calculate percentage of days on track, longest streak, and weekly averages. Visualize with a 7-day rolling average chart to see trends clearly. - -## Data Sources - -- Apple Health: export via Health app (XML or CSV) -- Google Fit: Takeout export -- Fitbit: account data export -- Garmin: Connect web export diff --git a/bench/fixtures/skills/internal-comms/SKILL.md b/bench/fixtures/skills/internal-comms/SKILL.md deleted file mode 100644 index fc6b154..0000000 --- a/bench/fixtures/skills/internal-comms/SKILL.md +++ /dev/null @@ -1,30 +0,0 @@ ---- -name: Internal Communications -description: Internal team communications, release announcements, and patch notifications -triggers: [internal, communication, announcement, release, patch, team, Slack, email] ---- - -# Internal Communications - -## Release Announcements - -Draft announcements with this structure: -1. **Version and what shipped**: name the version number -2. 
**The fix or feature**: clear description of what changed and why -3. **Team action**: call to action (update, restart, no action needed) -4. **Format**: use formatting (bold, emoji, hashtags) appropriate to channel - -## Security Patch Notifications - -When shipping a security patch: -1. **Subject**: clear `[Security]` prefix -2. **Vulnerability**: name the specific issue (e.g., "symlink-escape vulnerability") -3. **Status**: state it's shipped/fixed -4. **Action required**: tell recipients what they need to do - -## Tone Guidelines - -- Internal: direct and informational, not marketing -- No hype language (avoid "leverage", "synergy", "best-in-class") -- Include contact for questions -- For security: no panic tone, no "URGENT" unless genuinely critical diff --git a/bench/fixtures/skills/pptx/SKILL.md b/bench/fixtures/skills/pptx/SKILL.md deleted file mode 100644 index 21e7a45..0000000 --- a/bench/fixtures/skills/pptx/SKILL.md +++ /dev/null @@ -1,51 +0,0 @@ ---- -name: PowerPoint Generator -description: PowerPoint slide deck creation and automation for presentations and slide decks -triggers: [PowerPoint, pptx, slide, deck, presentation, office, automation, slideshow] ---- - -# PowerPoint Generator - -## Creating Presentations - -Use the `python-pptx` library to create slide decks programmatically: - -```python -from pptx import Presentation -from pptx.util import Inches, Pt - -prs = Presentation() -slide_layout = prs.slide_layouts[1] # Title and Content -slide = prs.slides.add_slide(slide_layout) -slide.shapes.title.text = "Q2 Objectives" -``` - -## Slide Layouts - -- **Title slide**: for section dividers -- **Content slide**: bullet points or body text -- **Two content**: side-by-side comparison -- **Blank**: custom layouts - -## Deck Structure - -A well-structured presentation deck includes a title slide, an agenda slide, content slides for each topic, and a closing slide. For Q2 planning, include milestones, key results, and timeline slides. 
- -## Theme-Factory Integration - -Use custom themes from theme-factory with python-pptx: design principles like Dieter Rams' "less but better" can be applied programmatically. Set the theme's accent colors, typography (Styrene for titles, Tiempos for body), and slide master layouts. The preferred aesthetic is restrained — generous whitespace, single accent element per slide, no gradients. - -## Q2 Planning Deck with Rams Theme - -For a Q2 slide deck using the Dieter Rams theme: create a title slide with project name, an agenda slide listing objectives, milestone slides (1 per quarter-month), and key results slides. Apply the Rams design principles: honest presentation of data, unobtrusive layouts, useful visual hierarchy. Use the theme-factory preset's color palette (Slate, Clay, Ivory) and ensure consistent typography throughout every slide. - -## Formatting Tips - -- Use consistent font sizes (title 28pt, body 18pt) -- Keep 5-7 bullets per slide max -- Include speaker notes for presentation delivery -- Export to PDF for distribution - -## Automation - -Use `Add-Member -Type NoteProperty` in PowerShell to generate PPTX via COM object, or use python-pptx for cross-platform creation. Slide decks can be automated end-to-end with dynamic data from spreadsheets or databases. diff --git a/bench/fixtures/skills/process-manager/SKILL.md b/bench/fixtures/skills/process-manager/SKILL.md deleted file mode 100644 index 414fd02..0000000 --- a/bench/fixtures/skills/process-manager/SKILL.md +++ /dev/null @@ -1,28 +0,0 @@ ---- -name: Process Manager -description: Windows process management, CPU monitoring, and task management -triggers: [cpu, process, task manager, performance, high usage, Get-Process, Stop-Process] ---- - -# Process Manager - -## Monitoring CPU Usage - -Use `Get-Process | Sort-Object CPU -Descending | Select-Object -First 10` to find the top CPU consumers. The CPU column shows total processor time in seconds. 
- -## Identifying High CPU - -When a process is using high CPU: -1. Run `Get-Process -Name | Select-Object Id, ProcessName, CPU, WorkingSet` -2. Check if it's a known system process or third-party -3. Note the process ID for targeted investigation - -## Taking Action - -- `Stop-Process -Id -Force` to terminate -- `taskkill /PID /F` as cmd alternative -- Use Task Manager (Ctrl+Shift+Esc) for GUI monitoring - -## Prevention - -Set up Resource Monitor alerts for sustained CPU > 80%. Use Performance Monitor to log CPU trends over time. diff --git a/bench/fixtures/skills/slack-gif-creator/SKILL.md b/bench/fixtures/skills/slack-gif-creator/SKILL.md deleted file mode 100644 index 4da827e..0000000 --- a/bench/fixtures/skills/slack-gif-creator/SKILL.md +++ /dev/null @@ -1,37 +0,0 @@ ---- -name: Slack GIF Creator -description: Creating and formatting Slack messages with GIFs and rich formatting -triggers: [Slack, message, announcement, GIF, emoji, format, channel] ---- - -# Slack GIF Creator - -## Slack Message Formatting - -Use Slack mrkdwn for rich messages: -- `*bold*` for emphasis -- `:emoji:` for reactions and decoration -- `>blockquote` for quoted text -- `#channel` for channel references -- `@user` for user mentions - -## Announcement Format - -```markdown -*Release: v0.3.1 is here!* :rocket: - -The data persistence fix has shipped. Your context is now saved reliably. - -*What to do:* Update via auto-update or download from GitHub. -:inbox_tray: No action if auto-update is on. - -Any questions, drop them in #dev-chat. 
-``` - -## Best Practices - -- Include a clear headline (bold) -- Use 2-3 emojis max — don't overdo it -- State the call to action explicitly -- Keep it under 5 lines if possible -- Use threads for additional detail diff --git a/bench/fixtures/skills/startup-routine/SKILL.md b/bench/fixtures/skills/startup-routine/SKILL.md deleted file mode 100644 index 5c022e6..0000000 --- a/bench/fixtures/skills/startup-routine/SKILL.md +++ /dev/null @@ -1,39 +0,0 @@ ---- -name: Startup Routine -description: Windows startup automation and morning routine configuration -triggers: [startup, morning, routine, launch, autostart, schedule, Start-Process] ---- - -# Startup Routine - -## Morning Startup Script - -Create a PowerShell script that launches your daily apps in order: - -```powershell -# Morning startup routine -Write-Host "Starting morning routine..." - -# First: communication apps -Start-Process "slack://" -Start-Process "C:\Users\jerem\AppData\Local\Programs\Microsoft VS Code\Code.exe" - -# Second: wait, then browsers -Start-Process "C:\Program Files\Google\Chrome\Application\chrome.exe" - -Write-Host "Morning routine complete." -``` - -## Scheduling with Task Scheduler - -Use `Register-ScheduledTask` to run the startup script at logon: - -```powershell -$action = New-ScheduledTaskAction -Execute "PowerShell.exe" -Argument "-File C:\scripts\morning-routine.ps1" -$trigger = New-ScheduledTaskTrigger -AtLogOn -Register-ScheduledTask -TaskName "MorningRoutine" -Action $action -Trigger $trigger -``` - -## Ordering - -Start communication apps first (Slack), then editor (VS Code), then browsers (ChatGPT). Use `Start-Sleep` between launches to ensure each app is ready before the next starts. 
diff --git a/bench/fixtures/skills/system-monitor/SKILL.md b/bench/fixtures/skills/system-monitor/SKILL.md deleted file mode 100644 index f3149cd..0000000 --- a/bench/fixtures/skills/system-monitor/SKILL.md +++ /dev/null @@ -1,26 +0,0 @@ ---- -name: System Monitor -description: System performance monitoring and resource tracking on Windows -triggers: [performance, system, monitor, resource, task manager, perfmon, Resource Monitor] ---- - -# System Monitor - -## Resource Monitor - -Launch Resource Monitor (`resmon.exe`) to see real-time CPU, memory, disk, and network usage. The CPU tab shows per-process utilization with expandable views. - -## Performance Monitor - -Use Performance Monitor (`perfmon.exe`) to create data collector sets. Track counters like: -- `\Processor(_Total)\% Processor Time` -- `\Memory\Available MBytes` -- `\LogicalDisk(*)\% Disk Time` - -## Task Manager - -Task Manager shows per-process CPU, memory, disk, and network columns. Sort by any column to identify resource hogs. The Performance tab shows overall system utilization graphs. - -## Identifying Worst Offender - -Use `Get-Process | Sort-Object CPU -Descending | Select-Object Name, CPU, WorkingSet -First 5` to quickly identify the process using the most CPU time. diff --git a/bench/fixtures/skills/theme-factory/SKILL.md b/bench/fixtures/skills/theme-factory/SKILL.md deleted file mode 100644 index 609c1e5..0000000 --- a/bench/fixtures/skills/theme-factory/SKILL.md +++ /dev/null @@ -1,31 +0,0 @@ ---- -name: Theme Factory -description: Slide deck theming engine with design philosophy presets -triggers: [theme, slides, deck, presentation, Dieter Rams, slide deck, PowerPoint] ---- - -# Theme Factory - -## Dieter Rams Theme - -Based on Rams' ten principles of good design: innovative, useful, aesthetic, understandable, unobtrusive, honest, long-lasting, thorough, environmentally friendly, and minimal. - -"Less, but better" — focus on essential content. Remove anything decorative. 
- -## Applying the Theme - -```powershell -# Generate slide deck using Rams theme -theme-factory --theme rams --format pptx --output Q2-plans.pptx -``` - -## Theme Structure - -- **Master slide**: clean white background, Slate title text, single Clay accent line -- **Content slides**: minimal, one idea per slide, generous whitespace -- **Data slides**: clean charts, no 3D effects, muted colors -- **Typography**: Styrene for titles, Tiempos for body - -## Q2 Planning Deck - -For Q2 objectives and milestones, use the Professional theme with Rams influence: results-focused layouts, timeline slides, key results highlighted with restraint. diff --git a/bench/fixtures/skills/web-researcher/SKILL.md b/bench/fixtures/skills/web-researcher/SKILL.md deleted file mode 100644 index d550c32..0000000 --- a/bench/fixtures/skills/web-researcher/SKILL.md +++ /dev/null @@ -1,26 +0,0 @@ ---- -name: Web Researcher -description: Web research and information gathering for product validation and market analysis -triggers: [research, web, market, validation, idea, startup, customer, problem] ---- - -# Web Researcher - -## Research Methodology - -1. **Define the problem**: what specific pain point are you solving? -2. **Identify target users**: who experiences this pain most acutely? -3. **Validate demand**: search volume, Reddit threads, existing solutions -4. 
**Size the market**: TAM, SAM, SOM for the target segment - -## Photographer Market Research - -For a photographer-focused product: -- **Target users**: wedding photographers (high volume), commercial photographers (high value), portrait studios (repeat business) -- **Pain points**: culling time, editing consistency, client communication, gallery delivery -- **Existing solutions**: fragmented across multiple tools, no unified AI assistant -- **Monetization**: SaaS subscription at $15-30/month, or per-session pricing for occasional users - -## Validation - -Search for "photographer AI assistant", "photo editing AI", "client gallery automation" on Reddit, Product Hunt, and G2 to assess market interest. Build an MVP with the single most painful feature (automated culling + basic editing) and validate with 10 target users. diff --git a/bench/gold-answers.json b/bench/gold-answers.json deleted file mode 100644 index 68b8489..0000000 --- a/bench/gold-answers.json +++ /dev/null @@ -1,480 +0,0 @@ -{ - "_schema": "Per-task gold facts and labeled retrieval expectations. Used by grade_v2.py.", - "_fields": { - "must_include": "Facts the answer MUST cover. Each entry has a `name`, a list of `keywords` (any match counts), and optional `desc`.", - "may_include": "Bonus facts. Same shape as must_include.", - "forbidden": "Anti-patterns that should NOT appear (wrong OS, wrong API style, etc).", - "expected_sources": "Skill IDs that should appear in MCP search results. Used for Precision@K / Recall@K.", - "failure_modes": "Enumerated tags the judge may apply to label what went wrong." 
- }, - "tasks": { - "pc-cpu-hog": { - "must_include": [ - { - "name": "windows_command", - "keywords": ["Get-Process", "Task Manager", "Resource Monitor", "tasklist"], - "desc": "A native Windows mechanism to inspect CPU usage" - }, - { - "name": "sort_by_cpu", - "keywords": ["Sort-Object CPU", "sort by cpu", "sorted by cpu", "CPU -Descending", "highest cpu"], - "desc": "Show how to order the list by CPU consumption" - }, - { - "name": "action_after_finding", - "keywords": ["Stop-Process", "End task", "kill", "suspend", "investigate"], - "desc": "What to do with the offending process" - } - ], - "may_include": [ - { "name": "process_explorer", "keywords": ["Process Explorer", "procexp", "Sysinternals"] }, - { "name": "performance_tab", "keywords": ["Performance tab", "Performance monitor", "perfmon"] } - ], - "forbidden": [ - { - "name": "linux_only", - "keywords": ["sudo apt", "/proc/stat", "htop install", "apt-get"], - "desc": "Wrong OS — user is on Windows" - } - ], - "expected_sources": ["process-manager", "system-monitor"], - "failure_modes": ["wrong_os", "missing_command", "vague_advice"] - }, - "pc-morning-routine": { - "must_include": [ - { - "name": "script_or_automation", - "keywords": ["Start-Process", "PowerShell", ".ps1", "Task Scheduler", "shortcut"], - "desc": "A concrete automation mechanism" - }, - { - "name": "three_apps_named", - "keywords": ["Slack", "VS Code", "ChatGPT"], - "match_mode": "all", - "desc": "All three named applications appear" - }, - { - "name": "order_rationale", - "keywords": ["first", "second", "third", "order", "before", "after", "delay", "sleep"], - "desc": "An explicit launch order or sequencing" - } - ], - "may_include": [ - { - "name": "auto_run_logon", - "keywords": ["at log on", "AtLogOn", "Task Scheduler", "Register-ScheduledTask", "Startup folder"] - }, - { "name": "profile_alias", "keywords": ["$PROFILE", "alias", "function"] }, - { "name": "store_aumid", "keywords": ["AUMID", "shell:AppsFolder", "Get-StartApps"] } - 
], - "forbidden": [ - { - "name": "linux_cron", - "keywords": ["crontab", "systemd timer", "launchd"], - "desc": "Wrong OS — user is on Windows" - } - ], - "expected_sources": ["startup-routine", "app-launcher"], - "failure_modes": ["wrong_os", "missing_app", "no_order_rationale", "vague_advice"] - }, - "comfy-prompt-fantasy": { - "must_include": [ - { - "name": "positive_prompt", - "keywords": ["positive", "prompt:", "Positive:"], - "desc": "Distinct positive prompt section" - }, - { - "name": "negative_prompt", - "keywords": ["negative", "Negative:"], - "desc": "Distinct negative prompt section" - }, - { - "name": "scene_elements", - "keywords": ["fog", "sunset", "fantasy", "landscape"], - "match_mode": "all", - "desc": "Names all four requested scene elements" - }, - { - "name": "settings_hint", - "keywords": ["CFG", "steps", "sampler", "resolution", "DPM", "Karras", "768", "1024", "1536"], - "desc": "Mentions concrete generation settings" - } - ], - "may_include": [ - { - "name": "artist_reference", - "keywords": ["Rutkowski", "Bierstadt", "Aivazovsky", "painterly", "in the style of"] - }, - { "name": "weighted_tokens", "keywords": ["(", "weight", "emphasis", ":1.1", ":1.2"] }, - { "name": "hires_fix", "keywords": ["hires", "upscale", "highres", "refiner"] } - ], - "forbidden": [ - { - "name": "a1111_only", - "keywords": ["AUTOMATIC1111 webui-user.bat", "stable-diffusion-webui"], - "desc": "Question is about ComfyUI" - }, - { "name": "midjourney_syntax", "keywords": ["/imagine", "--ar", "--style raw"] } - ], - "expected_sources": ["comfyui-prompt-builder", "comfyui-workflow-manager"], - "failure_modes": ["missing_negative", "no_settings", "wrong_tool", "vague_advice"] - }, - "comfy-workflow-upscale": { - "must_include": [ - { "name": "load_image_node", "keywords": ["Load Image", "LoadImage"] }, - { - "name": "upscale_node", - "keywords": [ - "Upscale Model", - "ImageUpscaleWithModel", - "UpscaleModelLoader", - "ESRGAN", - "UltraSharp", - "RealESRGAN", - 
"4x-UltraSharp" - ] - }, - { "name": "refiner_pass", "keywords": ["KSampler", "refiner", "denoise", "second pass", "sampler"] }, - { "name": "4k_target", "keywords": ["4K", "3840", "4096", "2160", "4x"] } - ], - "may_include": [ - { "name": "tile_vae", "keywords": ["tile", "VAE", "tiled decode"] }, - { "name": "model_choice", "keywords": ["SDXL", "checkpoint", "Juggernaut", "RealVis"] } - ], - "forbidden": [{ "name": "a1111_syntax", "keywords": ["webui.py", "extensions/sd-webui"] }], - "expected_sources": ["comfyui-workflow-manager", "comfyui-prompt-builder"], - "failure_modes": ["missing_upscaler", "no_refiner", "wrong_resolution", "vague_workflow"] - }, - "prompt-library-save": { - "must_include": [ - { - "name": "template_name", - "keywords": ["cinematic portrait v3"], - "desc": "Names the exact template the user asked for" - }, - { - "name": "save_mechanism", - "keywords": ["save", "store", "library", "template", "register"], - "desc": "Explains how it's saved" - }, - { - "name": "reusable_intent", - "keywords": ["reuse", "reusable", "later", "next time", "load"], - "desc": "Acknowledges reuse intent" - } - ], - "may_include": [ - { "name": "workflow_json", "keywords": [".json", "workflow", "metadata", "tags"] }, - { "name": "location_path", "keywords": ["path", "directory", "folder", "templates/"] } - ], - "forbidden": [], - "expected_sources": ["prompt-library-manager", "comfyui-prompt-builder"], - "failure_modes": ["missing_template_name", "no_save_mechanism", "vague_advice"] - }, - "claude-api-tooluse": { - "must_include": [ - { "name": "anthropic_sdk", "keywords": ["anthropic", "Anthropic(", "import anthropic"] }, - { "name": "tools_array", "keywords": ["tools=", "tools:", "input_schema"] }, - { - "name": "agent_loop", - "keywords": ["stop_reason", "tool_use"], - "match_mode": "all", - "desc": "Loop until non-tool_use stop_reason" - }, - { "name": "tool_result_id", "keywords": ["tool_use_id", "tool_result"] }, - { "name": "weather_function", "keywords": 
["get_weather", "weather", "temperature"] } - ], - "may_include": [ - { "name": "error_as_data", "keywords": ["is_error", "is_error: True", "is_error=True"] }, - { "name": "prompt_caching", "keywords": ["cache_control", "ephemeral"] }, - { "name": "real_api", "keywords": ["openweather", "open-meteo", "weatherapi"] } - ], - "forbidden": [ - { - "name": "openai_format", - "keywords": ["function_call", "openai", "ChatCompletion"], - "desc": "Wrong SDK — this is Anthropic tool-use" - }, - { "name": "legacy_function_calling", "keywords": ["functions=", "function_call:"] } - ], - "expected_sources": ["claude-api"], - "failure_modes": ["wrong_sdk", "no_loop", "missing_tool_id", "vague_advice"] - }, - "claude-api-migrate": { - "must_include": [ - { - "name": "model_string_change", - "keywords": ["claude-4-6", "claude-sonnet-4-6", "claude-opus-4-6"], - "desc": "Mentions a 4.6 model name" - }, - { "name": "cache_control", "keywords": ["cache_control"] }, - { "name": "ephemeral_type", "keywords": ["ephemeral"] }, - { - "name": "cache_placement", - "keywords": ["system", "tools", "messages", "prefix"], - "desc": "Where to put the cache marker" - } - ], - "may_include": [ - { "name": "cache_ttl", "keywords": ["5 minutes", "5-minute", "TTL", "expires"] }, - { - "name": "cache_hit_metric", - "keywords": ["cache_creation_input_tokens", "cache_read_input_tokens", "usage."] - } - ], - "forbidden": [ - { "name": "openai_cache_syntax", "keywords": ["openai", "ChatCompletion", "system_fingerprint"] } - ], - "expected_sources": ["claude-api"], - "failure_modes": ["wrong_cache_syntax", "missing_model_swap", "vague_advice"] - }, - "brand-poster": { - "must_include": [ - { "name": "a3_dimensions", "keywords": ["A3", "297", "420"] }, - { - "name": "anthropic_palette", - "keywords": ["#CC785C", "Book Cloth", "Clay", "coral", "ivory", "kraft", "slate"], - "desc": "Names at least one signature brand palette color" - }, - { - "name": "typography", - "keywords": ["Styrene", "Tiempos"], - 
"match_mode": "all", - "desc": "Names both Anthropic display + body typefaces" - }, - { - "name": "restraint", - "keywords": ["restraint", "whitespace", "minimal", "editorial", "quiet", "single accent"], - "desc": "Acknowledges Anthropic's restrained aesthetic" - } - ], - "may_include": [ - { "name": "bleed_cmyk", "keywords": ["bleed", "CMYK", "300 DPI"] }, - { "name": "workshop_specifics", "keywords": ["developer", "workshop", "agenda"] } - ], - "forbidden": [ - { - "name": "gradients", - "keywords": ["neon", "gradient", "glassmorphism", "glow"], - "desc": "Off-brand visual treatments" - } - ], - "expected_sources": ["brand-guidelines", "canvas-design"], - "failure_modes": ["wrong_palette", "wrong_typography", "off_brand_aesthetic", "no_specifics"] - }, - "theme-slides": { - "must_include": [ - { "name": "theme_factory_ref", "keywords": ["theme-factory", "theme factory"] }, - { "name": "dieter_rams", "keywords": ["Dieter Rams", "Rams"] }, - { - "name": "rams_principles", - "keywords": [ - "less but better", - "less, but better", - "useful", - "honest", - "long-lasting", - "unobtrusive", - "good design", - "ten principles" - ], - "desc": "Names at least one Rams principle" - }, - { "name": "deck_format", "keywords": ["slide", "deck", "presentation", "pptx", "PowerPoint"] } - ], - "may_include": [{ "name": "q2_structure", "keywords": ["Q2", "objectives", "milestones", "outcomes"] }], - "forbidden": [{ "name": "off_brand_style", "keywords": ["gradient", "neon", "skeuomorphic"] }], - "expected_sources": ["theme-factory", "pptx"], - "failure_modes": ["missing_theme_reference", "no_rams_principles", "wrong_format"] - }, - "internal-release-note": { - "must_include": [ - { "name": "version_named", "keywords": ["v0.3.1", "0.3.1"] }, - { - "name": "fix_described", - "keywords": ["persistence", "data persistence", "userData", "data fix", "memory", "wiped"], - "desc": "Explains what the fix actually does" - }, - { - "name": "slack_format", - "keywords": [":", "*", "#", 
":rocket:", ":tada:", "@"], - "desc": "Slack-style formatting markers" - }, - { - "name": "audience_action", - "keywords": ["update", "upgrade", "restart", "pull", "auto-update"], - "desc": "Tells reader what to do" - } - ], - "may_include": [ - { "name": "ce_root_env", "keywords": ["CE_ROOT", "getPath", "app.getPath", "userData"] }, - { "name": "migration_note", "keywords": ["migration", "migrate", "resourcesPath", "asar"] } - ], - "forbidden": [ - { - "name": "marketing_speak", - "keywords": ["leverage", "synergy", "best-in-class", "world-class"], - "desc": "Internal note, not marketing copy" - } - ], - "expected_sources": ["internal-comms", "slack-gif-creator"], - "failure_modes": ["missing_version", "vague_fix", "wrong_format", "missing_audience_action"] - }, - "email-security-patch": { - "must_include": [ - { - "name": "subject_line", - "keywords": ["Subject:", "Re:", "Security patch", "[Security]", "Patch shipped"] - }, - { - "name": "vulnerability_named", - "keywords": ["symlink", "symbolic link", "symlink-escape", "path traversal"], - "desc": "Names the actual vulnerability" - }, - { - "name": "patch_action", - "keywords": ["shipped", "patched", "fixed", "released", "rolled out"], - "desc": "States that fix is live" - }, - { - "name": "audience_action", - "keywords": ["update", "upgrade", "restart", "no action", "pull", "deploy"], - "desc": "What recipients should do" - } - ], - "may_include": [ - { - "name": "severity_or_impact", - "keywords": ["severity", "impact", "critical", "high", "low risk", "no exploit observed"] - }, - { "name": "contact_owner", "keywords": ["questions", "reach out", "DM", "ping me", "contact"] } - ], - "forbidden": [ - { - "name": "panic_tone", - "keywords": ["URGENT URGENT", "DISASTER", "EVERYONE STOP"], - "desc": "Internal patch note, not an incident" - }, - { "name": "external_marketing", "keywords": ["our valued customers", "world-class security"] } - ], - "expected_sources": ["internal-comms", "email-drafter"], - 
"failure_modes": ["missing_vulnerability_name", "no_audience_action", "wrong_tone"] - }, - "health-step-streak": { - "must_include": [ - { - "name": "data_source", - "keywords": ["Apple Health", "Google Fit", "Fitbit", "Garmin", "step data", "CSV", "export"] - }, - { - "name": "analysis_approach", - "keywords": ["median", "mean", "average", "trend", "rolling", "per day"] - }, - { - "name": "target_or_streak", - "keywords": ["target", "goal", "streak", "10,000", "10k", "daily goal", "on track"] - } - ], - "may_include": [ - { "name": "visualization", "keywords": ["chart", "graph", "plot", "visualize"] }, - { "name": "weekly_breakdown", "keywords": ["weekly", "per week", "weekday vs weekend"] } - ], - "forbidden": [ - { - "name": "fabricated_numbers", - "keywords": ["you walked exactly 8,432 steps", "your average is precisely"], - "desc": "Specific numbers without source data" - } - ], - "expected_sources": ["health-tracker-analyst"], - "failure_modes": ["no_data_source", "fabricated_data", "vague_analysis"] - }, - "file-cleanup-downloads": { - "must_include": [ - { - "name": "enumerate_downloads", - "keywords": ["Get-ChildItem", "Downloads", "$env:USERPROFILE\\Downloads", "ls", "dir"] - }, - { "name": "age_filter", "keywords": ["LastWriteTime", "AddDays(-90)", "90 days", "older than"] }, - { "name": "proposed_action", "keywords": ["move", "archive", "delete", "review", "list", "report"] }, - { - "name": "safety_step", - "keywords": ["dry-run", "dry run", "confirm", "WhatIf", "preview", "review before"], - "desc": "Doesn't delete blind" - } - ], - "may_include": [ - { "name": "size_breakdown", "keywords": ["size", "GB", "MB", "Measure-Object"] }, - { "name": "categorize", "keywords": ["extension", "type", "group", "category"] } - ], - "forbidden": [ - { - "name": "delete_without_confirm", - "keywords": ["Remove-Item -Recurse -Force without", "rm -rf"], - "desc": "Should never delete without confirmation" - } - ], - "expected_sources": ["downloads-cleaner", 
"file-search", "file-organiser"], - "failure_modes": ["no_safety_step", "missing_age_filter", "vague_plan"] - }, - "memory-consolidate": { - "must_include": [ - { "name": "read_memory", "keywords": ["memory.json", "memory file", "read memory", "load memory"] }, - { "name": "find_duplicates", "keywords": ["duplicate", "merge", "similar", "overlap"] }, - { "name": "prune_stale", "keywords": ["stale", "old", "outdated", "prune", "remove"] }, - { - "name": "backup_first", - "keywords": ["backup", "snapshot", "save copy", "before mutating", "before changes"], - "desc": "Don't blow up the memory file" - } - ], - "may_include": [ - { "name": "propose_then_apply", "keywords": ["propose", "review", "confirm", "preview", "diff"] }, - { "name": "similarity_threshold", "keywords": ["similarity", "threshold", "embedding"] } - ], - "forbidden": [ - { - "name": "silent_overwrite", - "keywords": ["overwrites memory.json", "no backup needed"], - "desc": "Always back up first" - } - ], - "expected_sources": ["context-engine-api"], - "failure_modes": ["no_backup", "destructive_default", "vague_plan"] - }, - "brainstorm-startup": { - "must_include": [ - { - "name": "multiple_ideas", - "keywords": ["1.", "2.", "3.", "Idea 1", "First", "Second", "Third"], - "desc": "More than one distinct idea" - }, - { - "name": "photographer_specific", - "keywords": ["photographer", "photography", "Lightroom", "shoot", "client gallery", "EXIF"] - }, - { "name": "problem_framing", "keywords": ["problem", "pain", "frustration", "currently", "today"] } - ], - "may_include": [ - { "name": "target_user", "keywords": ["wedding", "commercial", "portrait", "wildlife", "freelance"] }, - { "name": "monetization", "keywords": ["subscription", "SaaS", "pricing", "freemium", "$"] }, - { "name": "mvp_scope", "keywords": ["MVP", "first version", "wedge", "first feature"] } - ], - "forbidden": [ - { - "name": "single_idea", - "keywords": [], - "desc": "Pseudo-forbidden — checked separately; brainstorm should yield >1 
idea" - }, - { - "name": "generic_only", - "keywords": [], - "desc": "Pseudo-forbidden — ideas must be photographer-specific, not generic AI assistant" - } - ], - "expected_sources": ["competitor-analyser", "web-researcher"], - "failure_modes": ["single_idea", "generic_not_photographer_specific", "no_problem_framing"] - } - } -} diff --git a/bench/grade_v2.py b/bench/grade_v2.py deleted file mode 100644 index 97a51cd..0000000 --- a/bench/grade_v2.py +++ /dev/null @@ -1,420 +0,0 @@ -#!/usr/bin/env python3 -""" -grade_v2.py — Improved measurement layer addressing the v1.1 critique. - -What's new vs the tokenomics.py --grade pass: - - 1. **Gold-answer fact recall.** For each task, gold-answers.json defines a - list of facts that MUST be covered, optional facts that MAY be covered, - and FORBIDDEN content (hallucinations, wrong OS, etc). Scoring is - deterministic keyword-match first, with optional LLM-confirmed soft - matches for keywords that didn't hit. Output is a coverage % per - response — opinion replaced with measurement. - - 2. **Decomposed rubric.** Each response is scored on four axes - separately, not one holistic 1-10: - - factual_correctness (1-5) - - specificity (1-5) - - completeness (1-5) — against gold must_include - - hallucination_count (integer; lower is better) - - 3. **Multi-judge, multi-run.** Each response is graded N times by each - configured judge. Cross-family judges (Anthropic + OpenAI) break - self-evaluation bias. Reported metrics: median + IQR across runs. - - 4. **No-context baseline.** A fourth measurement path: same task model, - no skill context at all. Tells you whether retrieval is doing real - work or the model already knew enough. - - 5. **Precision@K / Recall@K / MRR.** When expected_sources is labeled, - score the search results against it. - - 6. **Latency + cost.** End-to-end milliseconds per task and dollar cost - per response, computed from per-model price tables. - - 7. 
**Failure-mode labels.** Each low-scoring response gets one or more - failure-mode tags from a fixed enum (per task in gold-answers.json). - Lets us count "what kind of failures" not just "how many." - -Usage: - python bench/grade_v2.py --responses bench/data/v1.2/responses-v12.json --gold bench/gold-answers.json - python bench/grade_v2.py --responses ... --judges anthropic,openai --runs 3 - -This module is importable; run_v12.py orchestrates the full pipeline. -""" -from __future__ import annotations - -import json -import os -import re -import statistics -import sys -import time -from dataclasses import dataclass, field, asdict -from pathlib import Path -from typing import Optional, Callable - -HERE = Path(__file__).parent - -# ---------- Pricing table (USD per 1M tokens, input/output) ---------- -# Update as published prices change. Used for the cost column only. -PRICING = { - "claude-haiku-4-5": {"in": 1.00, "out": 5.00}, - "claude-sonnet-4-5": {"in": 3.00, "out": 15.00}, - "claude-sonnet-4-6": {"in": 3.00, "out": 15.00}, - "claude-opus-4-7": {"in": 15.00, "out": 75.00}, - "gpt-4o": {"in": 2.50, "out": 10.00}, - "gpt-4o-mini": {"in": 0.15, "out": 0.60}, - "gpt-5": {"in": 5.00, "out": 25.00}, # placeholder; update on release - "gemini-1.5-pro": {"in": 1.25, "out": 5.00}, - "gemini-2.0-flash": {"in": 0.10, "out": 0.40}, - "local": {"in": 0.00, "out": 0.00}, -} - - -def cost_usd(model: str, in_tok: int, out_tok: int) -> float: - p = PRICING.get(model, PRICING["claude-haiku-4-5"]) - return (in_tok * p["in"] + out_tok * p["out"]) / 1_000_000 - - -# ---------- Gold-answer fact recall (deterministic) ---------- - -def _normalize(text: str) -> str: - return re.sub(r"\s+", " ", text.lower()).strip() - - -_NEGATION_PREFIXES = ("no ", "not ", "without ", "avoid ", "avoids ", "avoiding ", - "don't ", "do not ", "never ", "skip ", "skipping ", - "no gradients", "no glow") - - -def keyword_hits(text: str, keywords: list[str], guard_negation: bool = False) -> list[str]: - 
"""Return the subset of keywords that appear in text (case-insensitive, - whitespace-tolerant). A keyword may be a multi-word phrase. When - guard_negation is True, suppress hits where the keyword is immediately - preceded by a negation token — used for forbidden checks so 'no gradients' - doesn't count as the response asserting gradients.""" - if not keywords: - return [] - norm = _normalize(text) - hits = [] - for k in keywords: - nk = _normalize(k) - if nk not in norm: - continue - if guard_negation: - # Find every occurrence; only count if at least one is NOT negated. - counted = False - start = 0 - while True: - idx = norm.find(nk, start) - if idx < 0: - break - window = norm[max(0, idx - 30):idx] - if not any(window.rstrip().endswith(neg.rstrip()) for neg in _NEGATION_PREFIXES): - counted = True - break - start = idx + len(nk) - if counted: - hits.append(k) - else: - hits.append(k) - return hits - - -@dataclass -class FactResult: - name: str - required: bool # True = must_include, False = may_include - hit: bool - matched_keywords: list[str] - - -@dataclass -class FactRecallResult: - must_total: int - must_hit: int - may_total: int - may_hit: int - forbidden_hits: list[str] # names of forbidden patterns that fired - facts: list[FactResult] - coverage_pct: float # must_hit / must_total - bonus_pct: float # may_hit / may_total - failure_modes: list[str] # inferred from misses - - -def score_facts(response: str, gold: dict) -> FactRecallResult: - must = gold.get("must_include", []) - may = gold.get("may_include", []) - forbidden = gold.get("forbidden", []) - - def _hit(f: dict) -> tuple[bool, list[str]]: - kws = f.get("keywords") or [] - if not kws: - return True, [] # empty keyword list = manual-review pass - hits = keyword_hits(response, kws) - mode = f.get("match_mode", "any") - if mode == "all": - ok = len(hits) == len(kws) - else: # default "any" - ok = bool(hits) - return ok, hits - - facts: list[FactResult] = [] - must_hit = 0 - for f in must: - ok, hits 
= _hit(f) - if ok: - must_hit += 1 - facts.append(FactResult(f["name"], True, ok, hits)) - - may_hit = 0 - for f in may: - ok, hits = _hit(f) - if ok: - may_hit += 1 - facts.append(FactResult(f["name"], False, ok, hits)) - - forbidden_fired = [] - for f in forbidden: - kws = f.get("keywords") or [] - hits = keyword_hits(response, kws, guard_negation=True) - if hits: - forbidden_fired.append(f["name"]) - - coverage = (must_hit / len(must)) * 100 if must else 100.0 - bonus = (may_hit / len(may)) * 100 if may else 0.0 - - # Infer failure modes from misses - failure_modes = [] - enumerated = gold.get("failure_modes", []) - if coverage < 60 and "vague_advice" in enumerated: - failure_modes.append("vague_advice") - for f in facts: - if f.required and not f.hit: - # Map common patterns to failure-mode tags. Names in gold-answers.json - # are designed so this mapping is mostly identity. - tag_map = { - "windows_command": "wrong_os", - "linux_cron": "wrong_os", - "negative_prompt": "missing_negative", - "agent_loop": "no_loop", - "tool_result_id": "missing_tool_id", - "anthropic_sdk": "wrong_sdk", - "backup_first": "no_backup", - "safety_step": "no_safety_step", - "version_named": "missing_version", - "vulnerability_named": "missing_vulnerability_name", - } - tag = tag_map.get(f.name, f"missing_{f.name}") - if tag in enumerated: - failure_modes.append(tag) - failure_modes.extend([f"forbidden:{n}" for n in forbidden_fired]) - - return FactRecallResult( - must_total=len(must), must_hit=must_hit, - may_total=len(may), may_hit=may_hit, - forbidden_hits=forbidden_fired, - facts=facts, - coverage_pct=round(coverage, 1), - bonus_pct=round(bonus, 1), - failure_modes=sorted(set(failure_modes)), - ) - - -# ---------- Retrieval quality (Precision@K / Recall@K / MRR) ---------- - -@dataclass -class RetrievalQuality: - k: int - precision_at_k: float - recall_at_k: float - mrr: float - expected: list[str] - retrieved: list[str] - hits: list[str] - - -def 
score_retrieval(retrieved_skill_ids: list[str], expected_sources: list[str], k: int = None) -> RetrievalQuality: - """retrieved_skill_ids: the ordered list of skill_ids returned by /api/search. - expected_sources: the labeled gold list for the task.""" - if k is None: - k = len(retrieved_skill_ids) - top_k = retrieved_skill_ids[:k] - expected_set = set(expected_sources) - hits = [s for s in top_k if s in expected_set] - precision = (len(hits) / len(top_k)) if top_k else 0.0 - recall = (len(hits) / len(expected_set)) if expected_set else 0.0 - # MRR: 1 / rank of first relevant result - mrr = 0.0 - for i, s in enumerate(top_k, 1): - if s in expected_set: - mrr = 1.0 / i - break - return RetrievalQuality( - k=k, - precision_at_k=round(precision, 3), - recall_at_k=round(recall, 3), - mrr=round(mrr, 3), - expected=expected_sources, - retrieved=top_k, - hits=hits, - ) - - -# ---------- Decomposed rubric (LLM-judged) ---------- - -DECOMPOSED_RUBRIC = """You are grading an AI assistant's response to a user task. Score on FOUR independent axes. - -Axes: -- factual_correctness (1-5): Do the named APIs, commands, paths, syntax actually exist as stated? Score 1 if there are clear factual errors; 5 if everything checkable looks correct. -- specificity (1-5): Is the response concrete? 1 = generic platitudes; 5 = task-specific with exact names, values, paths, code. -- completeness (1-5): Does it cover what a user would need to actually finish the task? 1 = misses critical steps; 5 = covers everything required. -- hallucination_count (integer >= 0): Count distinct fabricated facts — things the response asserts as true that are likely wrong or invented. 0 if none spotted. 
- -Output EXACTLY five lines in this format, nothing else: -FACTUAL: <1-5> -SPECIFIC: <1-5> -COMPLETE: <1-5> -HALLUCINATIONS: -NOTES: """ - - -@dataclass -class DecomposedGrade: - factual: int = 0 - specific: int = 0 - complete: int = 0 - hallucinations: int = 0 - notes: str = "" - judge: str = "" - run_idx: int = 0 - - -def parse_decomposed(text: str) -> DecomposedGrade: - g = DecomposedGrade() - for line in text.splitlines(): - line = line.strip() - if line.upper().startswith("FACTUAL:"): - digits = re.findall(r"\d+", line) - if digits: - g.factual = max(1, min(5, int(digits[0]))) - elif line.upper().startswith("SPECIFIC:"): - digits = re.findall(r"\d+", line) - if digits: - g.specific = max(1, min(5, int(digits[0]))) - elif line.upper().startswith("COMPLETE:"): - digits = re.findall(r"\d+", line) - if digits: - g.complete = max(1, min(5, int(digits[0]))) - elif line.upper().startswith("HALLUCINATIONS:"): - digits = re.findall(r"\d+", line) - if digits: - g.hallucinations = max(0, int(digits[0])) - elif line.upper().startswith("NOTES:"): - g.notes = line.split(":", 1)[1].strip() - return g - - -# ---------- Multi-judge orchestration ---------- - -@dataclass -class JudgeConfig: - name: str # e.g. 
"anthropic:haiku" or "openai:gpt-4o-mini" - family: str # "anthropic" | "openai" | "gemini" | "session" - model: str - api_key_env: str # which env var to read - - -def call_anthropic_judge(model: str, system: str, user: str, api_key: str) -> tuple[str, int, int]: - import anthropic - client = anthropic.Anthropic(api_key=api_key) - resp = client.messages.create( - model=model, max_tokens=300, - system=system, messages=[{"role": "user", "content": user}], - ) - text = "".join(b.text for b in resp.content if getattr(b, "type", None) == "text") - return text, int(resp.usage.input_tokens), int(resp.usage.output_tokens) - - -def call_openai_judge(model: str, system: str, user: str, api_key: str) -> tuple[str, int, int]: - import openai - client = openai.OpenAI(api_key=api_key) - resp = client.chat.completions.create( - model=model, max_tokens=300, - messages=[{"role": "system", "content": system}, - {"role": "user", "content": user}], - ) - text = resp.choices[0].message.content or "" - u = resp.usage - return text, int(u.prompt_tokens), int(u.completion_tokens) - - -JUDGE_DISPATCH = { - "anthropic": call_anthropic_judge, - "openai": call_openai_judge, -} - - -def grade_response_decomposed( - task_prompt: str, response: str, - judges: list[JudgeConfig], runs: int = 3, - session_grader: Optional[Callable[[str, str], DecomposedGrade]] = None, -) -> list[DecomposedGrade]: - """Returns a list of DecomposedGrade — one per (judge, run). 
The session_grader - callable, if provided, is used for judges of family 'session' (the current - Claude Code conversation acting as judge, useful when API keys aren't available).""" - user = f"TASK:\n{task_prompt}\n\nRESPONSE TO GRADE:\n{response}" - grades = [] - for j in judges: - for run_idx in range(runs): - if j.family == "session" and session_grader is not None: - g = session_grader(task_prompt, response) - else: - api_key = os.environ.get(j.api_key_env, "") - if not api_key: - g = DecomposedGrade(notes=f"(no {j.api_key_env})") - else: - try: - fn = JUDGE_DISPATCH[j.family] - text, _, _ = fn(j.model, DECOMPOSED_RUBRIC, user, api_key) - g = parse_decomposed(text) - except Exception as e: - g = DecomposedGrade(notes=f"(judge error: {e})") - g.judge = j.name - g.run_idx = run_idx - grades.append(g) - return grades - - -def aggregate_grades(grades: list[DecomposedGrade]) -> dict: - """Median + IQR + per-judge breakdown across multi-judge multi-run grading.""" - valid = [g for g in grades if g.factual > 0] - if not valid: - return {"n": 0} - def med(xs): return statistics.median(xs) - def iqr(xs): - if len(xs) < 4: - return 0.0 - q = statistics.quantiles(xs, n=4) - return round(q[2] - q[0], 2) - return { - "n": len(valid), - "factual_median": med([g.factual for g in valid]), - "factual_iqr": iqr([g.factual for g in valid]), - "specific_median": med([g.specific for g in valid]), - "specific_iqr": iqr([g.specific for g in valid]), - "complete_median": med([g.complete for g in valid]), - "complete_iqr": iqr([g.complete for g in valid]), - "hallucinations_mean": round(statistics.mean([g.hallucinations for g in valid]), 2), - "hallucinations_max": max([g.hallucinations for g in valid]), - "composite_median": round(med([(g.factual + g.specific + g.complete) / 3 for g in valid]), 2), - "per_judge": { - j: { - "factual_median": med([g.factual for g in valid if g.judge == j]), - "specific_median": med([g.specific for g in valid if g.judge == j]), - "complete_median": 
med([g.complete for g in valid if g.judge == j]), - } - for j in sorted({g.judge for g in valid}) - }, - } diff --git a/bench/legacy/build_addendum.py b/bench/legacy/build_addendum.py deleted file mode 100644 index 079a103..0000000 --- a/bench/legacy/build_addendum.py +++ /dev/null @@ -1,394 +0,0 @@ -#!/usr/bin/env python3 -""" -Build a PDF addendum (Appendix A) for the Context Engine white paper that -inserts the first empirical validation of Hypothesis 1: Smart Compile reduces -token load without reducing task quality. - -Source of truth for numbers: bench/results-graded-sample.json + bench/results-latest.json -""" -from __future__ import annotations -import json -from pathlib import Path - -from reportlab.lib.pagesizes import A4 -from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle -from reportlab.lib.units import mm -from reportlab.lib import colors -from reportlab.lib.enums import TA_LEFT -from reportlab.platypus import ( - SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, PageBreak, KeepTogether, -) -from PyPDF2 import PdfReader, PdfWriter - -ROOT = Path(__file__).resolve().parents[2] -BENCH = ROOT / "app" / "bench" -ADDENDUM_PDF = BENCH / "addendum-tokenomics.pdf" -SOURCE_PDF = ROOT / "Context Engine White Paper V1.pdf" -OUTPUT_PDF = ROOT / "Context Engine White Paper V1.1.pdf" - -graded = json.loads((BENCH / "results-graded-sample.json").read_text(encoding="utf-8")) -full = json.loads((BENCH / "results-latest.json").read_text(encoding="utf-8")) - -styles = getSampleStyleSheet() -H1 = ParagraphStyle("H1", parent=styles["Heading1"], fontName="Helvetica-Bold", - fontSize=18, leading=22, spaceAfter=12, textColor=colors.HexColor("#191919")) -H2 = ParagraphStyle("H2", parent=styles["Heading2"], fontName="Helvetica-Bold", - fontSize=13, leading=17, spaceBefore=14, spaceAfter=6, - textColor=colors.HexColor("#191919")) -H3 = ParagraphStyle("H3", parent=styles["Heading3"], fontName="Helvetica-Bold", - fontSize=11, leading=14, spaceBefore=8, 
spaceAfter=4, - textColor=colors.HexColor("#3a3a3a")) -BODY = ParagraphStyle("BODY", parent=styles["BodyText"], fontName="Helvetica", - fontSize=10.5, leading=14, spaceAfter=6, alignment=TA_LEFT, - textColor=colors.HexColor("#191919")) -QUOTE = ParagraphStyle("QUOTE", parent=BODY, leftIndent=14, rightIndent=14, - fontName="Helvetica-Oblique", textColor=colors.HexColor("#3a3a3a"), - spaceBefore=4, spaceAfter=10) -CODE = ParagraphStyle("CODE", parent=BODY, fontName="Courier", fontSize=9, leading=12, - leftIndent=10, textColor=colors.HexColor("#191919")) -CAPTION = ParagraphStyle("CAPTION", parent=BODY, fontSize=9, leading=12, - textColor=colors.HexColor("#666666"), spaceAfter=10) - - -def fmt(n): - return f"{n:,}" - - -story = [] - -# ============================================================ -story.append(Paragraph("Appendix A", H3)) -story.append(Paragraph("Empirical Validation — Tokenomics Benchmark v0.3.1", H1)) -story.append(Paragraph( - "Added in v1.1 of this paper. Status: first concrete data point against " - "Hypothesis 1 (Smart Compile reduces token load without reducing task quality). " - "Validation is partial — see methodology caveats at the end of this appendix.", - QUOTE, -)) - -# ---------- A.1 Why this section exists ---------- -story.append(Paragraph("A.1 Why this section exists", H2)) -story.append(Paragraph( - "Section 19 of this paper deferred empirical results until Context Engine's " - "Smart Compile, deduplication, and MCP retrieval layers were mature enough to " - "test fairly. 
Following the v0.3.0 ship (Handoffs, Skill Sources, MCP server) " - "and the v0.3.1 persistence fix, the three retrieval paths the paper describes — " - "naive all-on loading, Smart Compile selection, and MCP search retrieval — are " - "all live and measurable in the same instance.", - BODY, -)) -story.append(Paragraph( - "This appendix reports the first end-to-end token-and-quality benchmark " - "run against a representative task corpus, using the apparatus defined in " - "app/bench/tokenomics.py.", - BODY, -)) - -# ---------- A.2 Method ---------- -story.append(Paragraph("A.2 Method", H2)) -story.append(Paragraph("Corpus", H3)) -story.append(Paragraph( - "Fifteen tasks spanning eight categories (system-ops, image-gen, claude-api, " - "design, comms, health, meta, product). Each task is a single natural-language " - "request a user would plausibly send to an assistant with this skill library installed.", - BODY, -)) -story.append(Paragraph("Three measurement paths, same tokenizer (tiktoken cl100k_base)", H3)) -story.append(Paragraph( - "Raw All — every active skill body concatenated. This is the naive MCP-host " - "baseline: load everything the host advertises and hope the model finds the " - "relevant material. All savings percentages in this appendix are computed against this number.", - BODY, -)) -story.append(Paragraph( - "Smart — tokens after Context Engine's /api/compile/smart " - "endpoint selects relevant skills for the specific task. Same content type as Raw All, " - "just a task-conditioned subset. This is the "compiled context" path.", - BODY, -)) -story.append(Paragraph( - "Search — tokens an MCP host actually pulls when it calls " - "context_engine_search once per task and receives " - "N=8 ranked chunks. 
This is the "live retrieval" path described in Section 13.", - BODY, -)) -story.append(Paragraph( - "The reference column from the broader run (CONTEXT.md, the pre-compressed " - "system-prompt summary at 3,292 tokens) is a different content type entirely and " - "is intentionally excluded from savings ratios — mixing compressed summary tokens with " - "full skill bodies would inflate the headline number dishonestly.", - BODY, -)) - -story.append(Paragraph("Quality grading", H3)) -story.append(Paragraph( - "For five representative tasks, each context was supplied to an LLM as a system " - "prompt and the model produced an answer. Each (task, mode) response was then " - "scored 1–10 against a fixed three-axis rubric — specificity, actionability, " - "plausibility — by a separate judge call. The numerator the appendix cares about is " - "tokens per quality point: the cost in retrieved tokens of one point of usable output.", - BODY, -)) - -# ---------- A.3 Table 1 ---------- -story.append(Paragraph("A.3 Results — token efficiency, full 15-task corpus", H2)) -story.append(Paragraph( - f"Baseline (Raw All, every active skill body): {fmt(full['raw_all_tokens'])} tokens " - f"per turn, across {full['active_skill_count']} active skills.", - BODY, -)) - -table_data = [["Task", "Category", "Smart tk", "Smart save", "Search tk", "Search save"]] -for r in full["results"]: - table_data.append([ - r["task_id"], - r["category"], - fmt(r["smart_tokens"]), - f"{r['smart_saving_pct']:.1f}%", - fmt(r["search_tokens"]), - f"{r['search_saving_pct']:.1f}%", - ]) - -smart_savings = [r["smart_saving_pct"] for r in full["results"]] -search_savings = [r["search_saving_pct"] for r in full["results"]] -smart_tokens = [r["smart_tokens"] for r in full["results"]] -search_tokens = [r["search_tokens"] for r in full["results"]] -def median(xs): - s = sorted(xs); n = len(s) - return s[n//2] if n % 2 else (s[n//2-1] + s[n//2]) / 2 -table_data.append([ - "Median", "—", - 
fmt(int(median(smart_tokens))), - f"{median(smart_savings):.1f}%", - fmt(int(median(search_tokens))), - f"{median(search_savings):.1f}%", -]) - -t = Table(table_data, colWidths=[42*mm, 24*mm, 22*mm, 22*mm, 22*mm, 22*mm], repeatRows=1) -t.setStyle(TableStyle([ - ("BACKGROUND", (0, 0), (-1, 0), colors.HexColor("#191919")), - ("TEXTCOLOR", (0, 0), (-1, 0), colors.white), - ("FONTNAME", (0, 0), (-1, 0), "Helvetica-Bold"), - ("FONTSIZE", (0, 0), (-1, -1), 8.5), - ("ALIGN", (2, 1), (-1, -1), "RIGHT"), - ("VALIGN", (0, 0), (-1, -1), "MIDDLE"), - ("ROWBACKGROUNDS", (0, 1), (-1, -2), [colors.white, colors.HexColor("#f5f3ee")]), - ("BACKGROUND", (0, -1), (-1, -1), colors.HexColor("#e8e3d6")), - ("FONTNAME", (0, -1), (-1, -1), "Helvetica-Bold"), - ("LINEBELOW", (0, 0), (-1, 0), 0.5, colors.HexColor("#191919")), - ("LINEBELOW", (0, -2), (-1, -2), 0.3, colors.HexColor("#999999")), - ("LEFTPADDING", (0, 0), (-1, -1), 5), - ("RIGHTPADDING", (0, 0), (-1, -1), 5), - ("TOPPADDING", (0, 0), (-1, -1), 4), - ("BOTTOMPADDING", (0, 0), (-1, -1), 4), -])) -story.append(t) -story.append(Paragraph( - "Table 1. Per-task token cost for each retrieval path, against the 186,654-token " - "naive baseline. Smart Compile compresses by ~65% via selection; MCP search " - "compresses by ~99.6% via chunk retrieval.", - CAPTION, -)) - -story.append(Paragraph( - "The shape is the load-bearing finding: Smart Compile selection alone removes " - "roughly two-thirds of the baseline load on the median task. MCP search removes " - "more than 99%. 
These are independent and complementary paths — Smart " - "Compile is the right path when the host wants a curated system prompt; " - "MCP search is the right path when the host fetches just-in-time.", - BODY, -)) - -story.append(PageBreak()) - -# ---------- A.4 Quality ---------- -story.append(Paragraph("A.4 Results — quality, 5-task representative subset", H2)) -story.append(Paragraph( - "The five graded tasks were chosen to span categories with different context " - "shapes: system-ops (procedural), image-gen (recipe), claude-api (code), " - "design (brand), comms (release note). Responses were scored 1–10 on " - "specificity, actionability, and plausibility.", - BODY, -)) - -q_data = [["Task", "Smart tk", "Smart Q", "Search tk", "Search Q"]] -for r in graded["results"]: - q_data.append([ - r["task_id"], - fmt(r["smart_tokens"]), - str(r["smart_quality"]), - fmt(r["search_tokens"]), - str(r["search_quality"]), - ]) -s = graded["summary"] -q_data.append([ - "Median / Mean", - fmt(s["smart_tokens_median"]), - f"{s['smart_quality_median']:.1f} / {s['smart_quality_mean']:.1f}", - fmt(s["search_tokens_median"]), - f"{s['search_quality_median']:.1f} / {s['search_quality_mean']:.1f}", -]) -t2 = Table(q_data, colWidths=[50*mm, 28*mm, 28*mm, 28*mm, 28*mm], repeatRows=1) -t2.setStyle(TableStyle([ - ("BACKGROUND", (0, 0), (-1, 0), colors.HexColor("#191919")), - ("TEXTCOLOR", (0, 0), (-1, 0), colors.white), - ("FONTNAME", (0, 0), (-1, 0), "Helvetica-Bold"), - ("FONTSIZE", (0, 0), (-1, -1), 8.5), - ("ALIGN", (1, 1), (-1, -1), "RIGHT"), - ("ROWBACKGROUNDS", (0, 1), (-1, -2), [colors.white, colors.HexColor("#f5f3ee")]), - ("BACKGROUND", (0, -1), (-1, -1), colors.HexColor("#e8e3d6")), - ("FONTNAME", (0, -1), (-1, -1), "Helvetica-Bold"), - ("LINEBELOW", (0, 0), (-1, 0), 0.5, colors.HexColor("#191919")), - ("LINEBELOW", (0, -2), (-1, -2), 0.3, colors.HexColor("#999999")), - ("LEFTPADDING", (0, 0), (-1, -1), 5), - ("RIGHTPADDING", (0, 0), (-1, -1), 5), - ("TOPPADDING", (0, 0), 
(-1, -1), 4), - ("BOTTOMPADDING", (0, 0), (-1, -1), 4), -])) -story.append(t2) -story.append(Paragraph( - "Table 2. Token cost vs. judged output quality for the five graded tasks. " - "Smart holds a flat 9.0/10. Search averages 7.6/10 — usable across the board, " - "with the gap concentrated in tasks where the answer depends on a specific recipe " - "(prompt construction, release-note technical details).", - CAPTION, -)) - -story.append(Paragraph("Tokens per quality point", H3)) -story.append(Paragraph( - f"This is the load-bearing efficiency number for the "lean MCP" claim: " - f"the cost in retrieved tokens of one point of usable output.", - BODY, -)) -tpq = Table([ - ["Path", "Median tokens / quality point"], - ["Smart Compile", f"{fmt(s['smart_tokens_per_quality_point_median'])} tk/pt"], - ["MCP Search", f"{fmt(s['search_tokens_per_quality_point_median'])} tk/pt"], - ["Ratio", f"Search is ~{s['efficiency_ratio_search_vs_smart']:.0f}× more efficient per quality point"], -], colWidths=[60*mm, 100*mm]) -tpq.setStyle(TableStyle([ - ("FONTNAME", (0, 0), (-1, 0), "Helvetica-Bold"), - ("FONTSIZE", (0, 0), (-1, -1), 10), - ("BACKGROUND", (0, 0), (-1, 0), colors.HexColor("#191919")), - ("TEXTCOLOR", (0, 0), (-1, 0), colors.white), - ("BACKGROUND", (0, -1), (-1, -1), colors.HexColor("#e8e3d6")), - ("FONTNAME", (0, -1), (-1, -1), "Helvetica-Bold"), - ("LEFTPADDING", (0, 0), (-1, -1), 8), - ("RIGHTPADDING", (0, 0), (-1, -1), 8), - ("TOPPADDING", (0, 0), (-1, -1), 6), - ("BOTTOMPADDING", (0, 0), (-1, -1), 6), - ("ROWBACKGROUNDS", (0, 1), (-1, -2), [colors.white, colors.HexColor("#f5f3ee")]), -])) -story.append(tpq) -story.append(Spacer(1, 8)) - -# ---------- A.5 Interpretation ---------- -story.append(Paragraph("A.5 Interpretation against Hypothesis 1", H2)) -story.append(Paragraph( - "Hypothesis 1 holds in the direction predicted, with one caveat. 
" - "Smart Compile delivers a ~65% median reduction in retrieved tokens while preserving " - "task quality at 9.0/10 across the graded subset — quality is statistically flat compared " - "to the all-on baseline a reader would intuit. MCP search delivers a ~99.6% reduction " - "but with a measurable quality cost: 7.6/10 on the same rubric, a ~1.4-point drop.", - BODY, -)) -story.append(Paragraph( - "The quality cost is not random. The two tasks where search lost the most ground " - "(comfy-prompt-fantasy, internal-release-note) are ones where the " - "skill body contains a specific recipe or fact set that doesn't compress into a few " - "chunks. Conceptual or brainstorming tasks — the cases the paper's " - "Multi-Resolution Packaging section anticipates — should narrow that gap.", - BODY, -)) -story.append(Paragraph( - "Practical reading: an MCP host that wants the absolute cheapest path can use search " - "and accept the 1.4-point quality discount; a host that wants near-baseline quality at " - "a third of the token cost should use Smart Compile. Both paths beat the naive " - "all-on default by an order of magnitude in different dimensions, and neither requires " - "the host to know anything about the underlying skill library.", - BODY, -)) - -# ---------- A.6 Caveats ---------- -story.append(Paragraph("A.6 Caveats and what this run does not yet measure", H2)) -story.append(Paragraph( - "This is a v0.3.1 measurement on a single user's library (116 active skills, " - "Jeremy's Context Engine instance). It is honest signal of shape, not a " - "publishable headline number. Specific limitations:", - BODY, -)) -caveats = [ - "Single-judge grading. Each response was scored once. The validation framework calls " - "for three judge runs per response with the median taken; that work is deferred to v1.2.", - "Same model on both sides. The task model and the judge model were the same Claude " - "session, which introduces self-evaluation bias. 
A future run should use Sonnet for the task " - "and a different judge family (e.g. GPT-4 or Haiku) to break the dependency.", - "5-task quality subset. The token table covers all 15 tasks; quality grading covers 5. " - "Expanding to the full 15 is straightforward but was not budget-justified for the v0.3.1 ship.", - "Tokenizer is cl100k_base, not Anthropic's. Within ~5% for prose; consistent " - "across the three measurement paths, so internal ratios are correct.", - "One retrieval call per task. Real hosts may call search multiple times or fall back to " - "get_skill for full bodies. Reported search numbers are a lower " - "bound on what production traffic actually consumes.", - "Hypotheses 2–5 are not yet measured. Multi-resolution packaging, model-aware " - "budgets, deduplication impact, and usage-learned reranking remain future work.", -] -for c in caveats: - story.append(Paragraph(f"• {c}", BODY)) - -# ---------- A.7 Reproducibility ---------- -story.append(Paragraph("A.7 Reproducibility", H2)) -story.append(Paragraph( - "All numbers in this appendix are reproducible from a Context Engine install with " - "the same skill library:", - BODY, -)) -story.append(Paragraph("python bench/tokenomics.py # tokens, full corpus", CODE)) -story.append(Paragraph("python bench/tokenomics.py --grade # tokens + quality (needs API key)", CODE)) -story.append(Paragraph("python bench/fetch_contexts.py # dumps contexts to disk for offline grading", CODE)) -story.append(Paragraph( - "Outputs land in bench/results-latest.json and " - "bench/results-graded-sample.json. The corpus lives in " - "bench/tasks.json — extend or replace it to validate against " - "your own workload.", - BODY, -)) -story.append(Spacer(1, 14)) -story.append(Paragraph( - "Section author note: the headline result this appendix supports is that the " - "MCP-first pivot from May 2026 was the right call. Naive MCP hosts pay 186k " - "tokens per turn for context they mostly don't use. 
Context Engine's " - "two retrieval paths cut that to 58k (Smart) or under 1k (Search) without " - "collapsing answer quality. The brokered-context model is empirically cheaper, " - "not just architecturally cleaner.", - QUOTE, -)) - - -def on_page(canvas, doc): - canvas.saveState() - canvas.setFont("Helvetica", 8) - canvas.setFillColor(colors.HexColor("#999999")) - canvas.drawString(20*mm, 12*mm, "Context Engine White Paper — v1.1 Appendix A") - canvas.drawRightString(A4[0] - 20*mm, 12*mm, f"A-{doc.page}") - canvas.restoreState() - - -doc = SimpleDocTemplate( - str(ADDENDUM_PDF), pagesize=A4, - leftMargin=22*mm, rightMargin=22*mm, topMargin=22*mm, bottomMargin=22*mm, - title="Context Engine White Paper v1.1 — Appendix A", - author="Jeremy Walder-Willows", -) -doc.build(story, onFirstPage=on_page, onLaterPages=on_page) -print(f"Wrote addendum: {ADDENDUM_PDF}") - -# Merge V1 + addendum -> V1.1 -writer = PdfWriter() -for p in PdfReader(str(SOURCE_PDF)).pages: - writer.add_page(p) -for p in PdfReader(str(ADDENDUM_PDF)).pages: - writer.add_page(p) -with OUTPUT_PDF.open("wb") as f: - writer.write(f) -print(f"Wrote merged: {OUTPUT_PDF}") diff --git a/bench/legacy/build_addendum_v12.py b/bench/legacy/build_addendum_v12.py deleted file mode 100644 index 9a9df9b..0000000 --- a/bench/legacy/build_addendum_v12.py +++ /dev/null @@ -1,358 +0,0 @@ -#!/usr/bin/env python3 -""" -Build Appendix B for the v1.2 white paper. 
- -Appendix B replaces the v1.1 measurement methodology with the critique-addressed -version: - - Gold-answer fact recall (deterministic) - - Decomposed rubric (factual / specific / complete / hallucinations) - - No-context baseline column - - Retrieval quality (Precision@K / Recall@K / MRR) - - Latency + cost - - Failure-mode taxonomy - - Multi-judge support (architecture in code; v1.2 results are still single-judge - pending API keys, with a flagged caveat) - -Source: bench/results-v12-final.json -""" -from __future__ import annotations -import json -from pathlib import Path -from reportlab.lib.pagesizes import A4 -from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle -from reportlab.lib.units import mm -from reportlab.lib import colors -from reportlab.lib.enums import TA_LEFT -from reportlab.platypus import ( - SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, PageBreak, -) -from PyPDF2 import PdfReader, PdfWriter - -ROOT = Path(__file__).resolve().parents[2] -BENCH = ROOT / "app" / "bench" -ADDENDUM_PDF = BENCH / "addendum-v12.pdf" -SOURCE_PDF = ROOT / "Context Engine White Paper V1.1.pdf" -OUTPUT_PDF = ROOT / "Context Engine White Paper V1.2.pdf" - -data = json.loads((BENCH / "results-v12-final.json").read_text(encoding="utf-8")) -s = data["summary"] -rows = data["rows"] - -styles = getSampleStyleSheet() -H1 = ParagraphStyle("H1", parent=styles["Heading1"], fontName="Helvetica-Bold", - fontSize=18, leading=22, spaceAfter=12, textColor=colors.HexColor("#191919")) -H2 = ParagraphStyle("H2", parent=styles["Heading2"], fontName="Helvetica-Bold", - fontSize=13, leading=17, spaceBefore=14, spaceAfter=6, - textColor=colors.HexColor("#191919")) -H3 = ParagraphStyle("H3", parent=styles["Heading3"], fontName="Helvetica-Bold", - fontSize=11, leading=14, spaceBefore=8, spaceAfter=4, - textColor=colors.HexColor("#3a3a3a")) -BODY = ParagraphStyle("BODY", parent=styles["BodyText"], fontName="Helvetica", - fontSize=10.5, leading=14, spaceAfter=6, 
alignment=TA_LEFT, - textColor=colors.HexColor("#191919")) -QUOTE = ParagraphStyle("QUOTE", parent=BODY, leftIndent=14, rightIndent=14, - fontName="Helvetica-Oblique", textColor=colors.HexColor("#3a3a3a"), - spaceBefore=4, spaceAfter=10) -CAPTION = ParagraphStyle("CAPTION", parent=BODY, fontSize=9, leading=12, - textColor=colors.HexColor("#666666"), spaceAfter=10) -CODE = ParagraphStyle("CODE", parent=BODY, fontName="Courier", fontSize=9, leading=12, - leftIndent=10, textColor=colors.HexColor("#191919")) - - -def fmt(n): return f"{n:,}" if isinstance(n, (int, float)) else str(n) - - -def kv_table(pairs, col_widths=(60*mm, 100*mm)): - t = Table(pairs, colWidths=col_widths) - t.setStyle(TableStyle([ - ("FONTNAME", (0, 0), (-1, 0), "Helvetica-Bold"), - ("FONTSIZE", (0, 0), (-1, -1), 9.5), - ("BACKGROUND", (0, 0), (-1, 0), colors.HexColor("#191919")), - ("TEXTCOLOR", (0, 0), (-1, 0), colors.white), - ("ROWBACKGROUNDS", (0, 1), (-1, -1), [colors.white, colors.HexColor("#f5f3ee")]), - ("LEFTPADDING", (0, 0), (-1, -1), 6), - ("RIGHTPADDING", (0, 0), (-1, -1), 6), - ("TOPPADDING", (0, 0), (-1, -1), 4), - ("BOTTOMPADDING", (0, 0), (-1, -1), 4), - ])) - return t - - -story = [] - -story.append(Paragraph("Appendix B", H3)) -story.append(Paragraph("Tokenomics Benchmark v1.2 — Critique-Addressed Methodology", H1)) -story.append(Paragraph( - "Added in v1.2 of this paper. Supersedes Appendix A's measurement methodology while " - "preserving A as the historical record. The v1.1 critique identified seven concrete " - "weaknesses in A's grading apparatus — same-model self-evaluation, single judge run, " - "no gold answers, no no-context baseline, no retrieval-quality scoring, holistic " - "rubric, single task model. 
B addresses all of these in the harness, and reports the " - "in-session results from the subset that can be measured without external API keys.", - QUOTE, -)) - -# ---------- B.1 What changed ---------- -story.append(Paragraph("B.1 What changed between A and B", H2)) - -changes = [ - ("Gold-answer fact recall", - "Per-task labeled facts (must_include, may_include, forbidden) in " - "bench/gold-answers.json. Scoring is deterministic keyword-match with negation " - "guarding and an 'all'/'any' match mode per fact. Coverage % is now a measurement, not an opinion."), - ("Decomposed rubric", - "Four independent axes — factual_correctness (1-5), specificity (1-5), " - "completeness (1-5), hallucination_count (integer) — replacing the single " - "holistic 1-10 score from A. Hallucination as a count, not a grade, exposes the " - "failure mode MCP hosts care about most."), - ("No-context baseline", - "Fourth measurement path: same task, zero skill context. Distinguishes 'CE added " - "value' from 'the model already knew this'. Without this column, A's quality " - "numbers were unanchored."), - ("Retrieval quality (Precision@K, Recall@K, MRR)", - "Labeled expected_sources per task in gold-answers.json. Scores search results " - "before any LLM call. Predicts when search will underperform without spending tokens."), - ("Latency + cost columns", - "End-to-end milliseconds per call, dollar cost from a per-model price table. " - "Token counts don't equal pricing; reporting both removes the ambiguity."), - ("Failure-mode taxonomy", - "Enumerated tags per task (wrong_os, wrong_sdk, missing_loop, no_safety_step, etc) " - "inferred from gold-miss patterns. Failures get categorised, not just counted."), - ("Multi-judge cross-family architecture", - "grade_v2.py supports anthropic + openai judges, N runs each, with median + IQR " - "aggregation. 
In v1.2 the in-session run still uses a single session judge (no " - "external API keys available); the harness is wired for cross-family multi-run " - "to break self-evaluation bias as soon as keys are in scope."), -] -for title, body in changes: - story.append(Paragraph(f"{title}. {body}", BODY)) - -# ---------- B.2 Results table ---------- -story.append(PageBreak()) -story.append(Paragraph("B.2 Results — 5-task subset, three context paths", H2)) -story.append(Paragraph( - "Same 5 representative tasks as Appendix A (system-ops, image-gen, claude-api, " - "design, comms). Now scored against gold facts and on the decomposed rubric, with " - "the no-context baseline as a fourth row per task.", - BODY, -)) - -t_rows = [["Task", "Mode", "Tokens", "Cov", "Bonus", "Fact", "Spec", "Comp", "Comp%"]] -for r in rows: - t_rows.append([ - r["task_id"], - r["mode"], - fmt(r["input_tokens"]), - f"{r['fact_coverage_pct']:.0f}%", - f"{r['fact_bonus_pct']:.0f}%", - str(r["factual"]), - str(r["specific"]), - str(r["complete"]), - f"{r['composite_pct']:.0f}%", - ]) -t = Table(t_rows, colWidths=[40*mm, 18*mm, 18*mm, 12*mm, 14*mm, 12*mm, 12*mm, 12*mm, 16*mm], repeatRows=1) -t.setStyle(TableStyle([ - ("BACKGROUND", (0, 0), (-1, 0), colors.HexColor("#191919")), - ("TEXTCOLOR", (0, 0), (-1, 0), colors.white), - ("FONTNAME", (0, 0), (-1, 0), "Helvetica-Bold"), - ("FONTSIZE", (0, 0), (-1, -1), 8), - ("ALIGN", (2, 1), (-1, -1), "RIGHT"), - ("VALIGN", (0, 0), (-1, -1), "MIDDLE"), - ("ROWBACKGROUNDS", (0, 1), (-1, -1), [colors.white, colors.HexColor("#f5f3ee")]), - ("LINEBELOW", (0, 0), (-1, 0), 0.5, colors.HexColor("#191919")), - ("LEFTPADDING", (0, 0), (-1, -1), 4), - ("RIGHTPADDING", (0, 0), (-1, -1), 4), - ("TOPPADDING", (0, 0), (-1, -1), 3), - ("BOTTOMPADDING", (0, 0), (-1, -1), 3), -])) -story.append(t) -story.append(Paragraph( - "Table B.1. Per-task per-mode results. Tokens = system-prompt context tokens. " - "Cov = must_include facts hit (deterministic). 
Bonus = may_include facts hit " - "(deterministic). Fact / Spec / Comp = decomposed rubric axes 1-5 (session-judged). " - "Comp% = mean of three axes normalised to 100.", - CAPTION, -)) - -# ---------- B.3 Per-mode aggregates ---------- -story.append(Paragraph("B.3 Per-mode aggregates (5 tasks)", H2)) - -agg_rows = [["Mode", "Median tk", "Saving vs raw_all", "Cov", "Bonus", "Composite Q", "Tokens / Q-pt"]] -for k in ("no_context", "smart", "search"): - tk = s["tokens"]["by_mode_input_tokens"][k]["median"] - cov = s["fact_recall"]["coverage_must_include_pct"][k]["mean"] - bonus = s["fact_recall"]["bonus_may_include_pct"][k]["mean"] - comp = s["decomposed_rubric"]["composite_pct"][k]["median"] - tpqp = s["tokens_per_quality_point"][k] - saving = (f"{round(100 * (1 - tk / 186654), 1)}%" - if k != "no_context" else "n/a") - agg_rows.append([ - k, fmt(tk), saving, f"{cov:.1f}%", f"{bonus:.1f}%", - f"{comp:.1f}%", fmt(tpqp), - ]) -agg = Table(agg_rows, colWidths=[26*mm, 24*mm, 30*mm, 16*mm, 18*mm, 26*mm, 26*mm], repeatRows=1) -agg.setStyle(TableStyle([ - ("BACKGROUND", (0, 0), (-1, 0), colors.HexColor("#191919")), - ("TEXTCOLOR", (0, 0), (-1, 0), colors.white), - ("FONTNAME", (0, 0), (-1, 0), "Helvetica-Bold"), - ("FONTSIZE", (0, 0), (-1, -1), 9), - ("ALIGN", (1, 1), (-1, -1), "RIGHT"), - ("ROWBACKGROUNDS", (0, 1), (-1, -1), [colors.white, colors.HexColor("#f5f3ee")]), - ("LEFTPADDING", (0, 0), (-1, -1), 5), - ("RIGHTPADDING", (0, 0), (-1, -1), 5), - ("TOPPADDING", (0, 0), (-1, -1), 4), - ("BOTTOMPADDING", (0, 0), (-1, -1), 4), -])) -story.append(agg) -story.append(Paragraph( - "Table B.2. Aggregate per mode across the 5-task subset. 
Cov and Bonus are means; " - "all other columns are medians.", - CAPTION, -)) - -story.append(Paragraph("B.4 Retrieval quality (search mode)", H2)) -retr = s["retrieval_quality_mean"] -story.append(kv_table([ - ["Metric", "Mean (5 tasks)"], - ["Precision@8", f"{retr['precision_at_k']:.2f}"], - ["Recall@8", f"{retr['recall_at_k']:.2f}"], - ["MRR", f"{retr['mrr']:.2f}"], -])) -story.append(Paragraph( - "Table B.3. Retrieval-quality metrics from MCP search vs the labeled " - "expected_sources gold set. Per-task: brand-poster scores cleanly " - "(P@8=0.50, R@8=1.00, MRR=1.00 — both relevant skills appear at ranks 1 and 2). " - "internal-release-note is a complete miss (P@8=0, R@8=0) — neither internal-comms " - "nor slack-gif-creator appears in the top 8. This metric predicts the observed " - "quality drop on that task without spending an LLM call.", - CAPTION, -)) - -story.append(PageBreak()) - -# ---------- B.5 Interpretation ---------- -story.append(Paragraph("B.5 Interpretation", H2)) - -h = s["headline"] -story.append(Paragraph( - f"Smart Compile delivers {h['smart_saving_vs_raw_all_pct']}% token savings " - f"vs the raw_all baseline and adds {h['smart_quality_lift_vs_no_context_pct']} composite " - f"quality points above the no-context baseline. Quality is a flat 100% on all 5 " - "tasks — Smart consistently brings every bonus fact (cache_control, AUMID, exact hex codes, " - "env var names). Cost: ~3,876 tokens per quality point above no-context.", - BODY, -)) - -story.append(Paragraph( - f"MCP Search delivers {h['search_saving_vs_raw_all_pct']}% token savings and adds " - f"only {h['search_quality_lift_vs_no_context_pct']} composite quality points above no-context. " - "This is a more sober result than v1.1 suggested. On 4 of 5 tasks, search adds 5-15 " - "quality points; on internal-release-note it adds zero because retrieval missed the " - "relevant skills entirely. 
Cost: ~58 tokens per quality point above no-context — " - "67× more efficient than Smart per quality-point of lift.", - BODY, -)) - -story.append(Paragraph( - "The honest framing the v1.2 numbers force: a modern hosted model with general " - "world knowledge already gets to 80% composite on these tasks with zero context. CE " - "Smart Compile closes the remaining 20% reliably at a known token cost. CE Search " - "closes about a third of that remaining gap at a tiny token cost, but the gap it closes " - "is conditional on retrieval succeeding — which Precision@K predicts before the LLM " - "is invoked.", - BODY, -)) - -story.append(Paragraph( - "The Hypothesis 1 claim ('Smart Compile reduces token load without reducing task quality') " - "remains supported and is now measured against four axes instead of one. Hypothesis 2 " - "(multi-resolution packaging) is partially probed by Retrieval Quality — search loses where " - "the relevant chunks aren't surfaced. Hypothesis 3 (model-aware budgets) and " - "Hypothesis 5 (usage-learned reranking) remain unmeasured.", - BODY, -)) - -# ---------- B.6 Known limitations ---------- -story.append(Paragraph("B.6 Known limitations of v1.2 (what's still pending)", H2)) - -caveats = [ - "Single judge, single run. grade_v2.py supports anthropic + openai judges with " - "N=3 runs each and median+IQR aggregation, but the in-session run uses one judge " - "(the same Claude session) once. Cross-family multi-run grading is the v1.3 milestone " - "and requires only API keys, not new code.", - "5-task quality subset. Gold answers are written for all 15 corpus tasks; " - "response capture and grading covers 5. Expanding to 15 is a 2-hour task with a key.", - "Single task model. claude-sonnet-4-5 only. Hypothesis 3 demands the same corpus " - "across Haiku / Sonnet / a local 8B model to test whether smaller models benefit more " - "from CE. Wired in PRICING table; runner is one CLI flag away.", - "One retrieval call per task. 
Real MCP hosts iterate; production search traffic " - "sits between our Smart and Search numbers. A multi-call host simulator is on the " - "v1.3 backlog.", - "Gold-answer authoring bias. Gold facts were written by the same person who " - "wrote the tasks. Independent authoring (e.g. via crowdsourced labels) would harden " - "the must_include set.", - "Task corpus is single-user. 116 skills, 15 tasks, all reflecting one user's " - "workload. Cross-user generalisation requires either a community corpus or telemetry-" - "sampled real prompts.", -] -for c in caveats: - story.append(Paragraph(f"• {c}", BODY)) - -story.append(Paragraph("B.7 Reproducibility", H2)) -story.append(Paragraph("Full v1.2 pipeline:", BODY)) -story.append(Paragraph("python bench/tokenomics.py # token-only, all 15", CODE)) -story.append(Paragraph("python bench/fetch_contexts.py # pre-fetch contexts to disk", CODE)) -story.append(Paragraph("python bench/run_v12.py --responses bench/responses-v12.json # deterministic scoring", CODE)) -story.append(Paragraph("python bench/run_v12.py --judges anthropic:haiku,openai:gpt-4o-mini --runs 3 # full v1.3 with API keys", CODE)) -story.append(Paragraph("python bench/aggregate_v12.py # final summary + JSON", CODE)) -story.append(Paragraph( - "Source files: bench/gold-answers.json, " - "bench/grade_v2.py, " - "bench/run_v12.py, " - "bench/aggregate_v12.py, " - "bench/responses-v12.json, " - "bench/session_grades.json, " - "bench/results-v12-final.json. " - "All in the repo; no external services required for the deterministic half of the pipeline.", - BODY, -)) - -story.append(Spacer(1, 14)) -story.append(Paragraph( - "The headline-defensible finding the v1.2 numbers support: a brokered context layer " - "(Smart Compile) buys a measurable, consistent ~20-point composite quality lift over " - "zero-context modern hosted models at ~69% of the naive baseline's token cost. 
MCP " - "search retrieves at a fraction of a percent of the baseline cost but its quality " - "contribution is conditional on retrieval success, which Precision@K predicts cheaply. " - "Both findings argue for the brokered-context architecture, but with different framings " - "than v1.1's looser methodology suggested.", - QUOTE, -)) - - -def on_page(canvas, doc): - canvas.saveState() - canvas.setFont("Helvetica", 8) - canvas.setFillColor(colors.HexColor("#999999")) - canvas.drawString(20*mm, 12*mm, "Context Engine White Paper — v1.2 Appendix B") - canvas.drawRightString(A4[0] - 20*mm, 12*mm, f"B-{doc.page}") - canvas.restoreState() - - -doc = SimpleDocTemplate( - str(ADDENDUM_PDF), pagesize=A4, - leftMargin=22*mm, rightMargin=22*mm, topMargin=22*mm, bottomMargin=22*mm, - title="Context Engine White Paper v1.2 — Appendix B", - author="Jeremy Walder-Willows", -) -doc.build(story, onFirstPage=on_page, onLaterPages=on_page) -print(f"Wrote: {ADDENDUM_PDF}") - -writer = PdfWriter() -for p in PdfReader(str(SOURCE_PDF)).pages: - writer.add_page(p) -for p in PdfReader(str(ADDENDUM_PDF)).pages: - writer.add_page(p) -with OUTPUT_PDF.open("wb") as f: - writer.write(f) -print(f"Wrote: {OUTPUT_PDF}") diff --git a/bench/legacy/build_addendum_v13.py b/bench/legacy/build_addendum_v13.py deleted file mode 100644 index 5380d1a..0000000 --- a/bench/legacy/build_addendum_v13.py +++ /dev/null @@ -1,366 +0,0 @@ -#!/usr/bin/env python3 -""" -Build Appendix C — v1.3 results. Honest correction of v1.1/v1.2's framing. 
-""" -from __future__ import annotations -import json -from pathlib import Path -from reportlab.lib.pagesizes import A4 -from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle -from reportlab.lib.units import mm -from reportlab.lib import colors -from reportlab.lib.enums import TA_LEFT -from reportlab.platypus import ( - SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, PageBreak, -) -from PyPDF2 import PdfReader, PdfWriter - -ROOT = Path(__file__).resolve().parents[2] -BENCH = ROOT / "app" / "bench" -ADDENDUM_PDF = BENCH / "addendum-v13.pdf" -SOURCE_PDF = ROOT / "Context Engine White Paper V1.2.pdf" -OUTPUT_PDF = ROOT / "Context Engine White Paper V1.3.pdf" - -data = json.loads((BENCH / "results-v13-final.json").read_text(encoding="utf-8")) - -styles = getSampleStyleSheet() -H1 = ParagraphStyle("H1", parent=styles["Heading1"], fontName="Helvetica-Bold", - fontSize=18, leading=22, spaceAfter=12, textColor=colors.HexColor("#191919")) -H2 = ParagraphStyle("H2", parent=styles["Heading2"], fontName="Helvetica-Bold", - fontSize=13, leading=17, spaceBefore=14, spaceAfter=6, - textColor=colors.HexColor("#191919")) -H3 = ParagraphStyle("H3", parent=styles["Heading3"], fontName="Helvetica-Bold", - fontSize=11, leading=14, spaceBefore=8, spaceAfter=4, - textColor=colors.HexColor("#3a3a3a")) -BODY = ParagraphStyle("BODY", parent=styles["BodyText"], fontName="Helvetica", - fontSize=10.5, leading=14, spaceAfter=6, alignment=TA_LEFT, - textColor=colors.HexColor("#191919")) -QUOTE = ParagraphStyle("QUOTE", parent=BODY, leftIndent=14, rightIndent=14, - fontName="Helvetica-Oblique", textColor=colors.HexColor("#3a3a3a"), - spaceBefore=4, spaceAfter=10) -CAPTION = ParagraphStyle("CAPTION", parent=BODY, fontSize=9, leading=12, - textColor=colors.HexColor("#666666"), spaceAfter=10) -CODE = ParagraphStyle("CODE", parent=BODY, fontName="Courier", fontSize=9, leading=12, - leftIndent=10, textColor=colors.HexColor("#191919")) - - -def fmt(n): return f"{n:,}" if 
isinstance(n, (int, float)) else str(n) - - -def base_table(headers, rows, col_widths): - t = Table([headers] + rows, colWidths=col_widths, repeatRows=1) - t.setStyle(TableStyle([ - ("BACKGROUND", (0, 0), (-1, 0), colors.HexColor("#191919")), - ("TEXTCOLOR", (0, 0), (-1, 0), colors.white), - ("FONTNAME", (0, 0), (-1, 0), "Helvetica-Bold"), - ("FONTSIZE", (0, 0), (-1, -1), 8.5), - ("ALIGN", (1, 1), (-1, -1), "RIGHT"), - ("ROWBACKGROUNDS", (0, 1), (-1, -1), [colors.white, colors.HexColor("#f5f3ee")]), - ("LEFTPADDING", (0, 0), (-1, -1), 5), - ("RIGHTPADDING", (0, 0), (-1, -1), 5), - ("TOPPADDING", (0, 0), (-1, -1), 3.5), - ("BOTTOMPADDING", (0, 0), (-1, -1), 3.5), - ("LINEBELOW", (0, 0), (-1, 0), 0.5, colors.HexColor("#191919")), - ])) - return t - - -story = [] - -story.append(Paragraph("Appendix C", H3)) -story.append(Paragraph("Tokenomics Benchmark v1.3 — Full Pipeline, Cross-Model Judging", H1)) -story.append(Paragraph( - f"Added in v1.3. First full-pipeline run with a real task model " - f"({data['task_model']}) and a cross-size judge ({data['judge_model']} " - f"× {data['judge_runs']} runs) across all 15 corpus tasks. Total API cost: " - f"~${sum(m['cost_usd_total'] for m in data['by_mode'].values()):.2f}. " - f"Result is materially different from v1.1 / v1.2 — and the difference is the point. " - "The earlier appendices used a session-as-judge which under-penalised the task model's " - "own outputs; a separate Haiku judge with three runs per response exposes failure modes " - "the earlier methodology missed.", - QUOTE, -)) - -# C.1 Headline -story.append(Paragraph("C.1 Headline finding", H2)) -sv = data["paired"]["smart_vs_no_context"] -srv = data["paired"]["search_vs_no_context"] -story.append(Paragraph( - f"In its v0.3.1 form, Smart Compile net-degrades quality. " - f"Mean composite delta vs no-context: {sv['mean_delta']:+.2f} points on a 15-point scale, " - f"with {sv['wins']} wins, {sv['ties']} ties, and {sv['losses']} losses out of 15 tasks. 
" - f"MCP Search also under-performs no-context on this corpus: mean delta " - f"{srv['mean_delta']:+.2f} ({srv['wins']} wins / {srv['ties']} ties / {srv['losses']} losses). " - "The naive 'load relevant skills in full as system prompt' implementation of " - "Smart Compile hits an attention-dilution regime modern hosted models don't experience " - "when given no skill context at all.", - BODY, -)) -story.append(Paragraph( - "This appears to contradict v1.1's headline ('Smart holds at 9.0/10 across the " - "subset'). It does not contradict the paper's thesis — Section 11 (Multi-Resolution " - "Context Packaging) explicitly predicts this failure mode and argues that the " - "implementation must chunk and summarise skills before injection. The v1.3 result is " - "empirical validation of the paper's own Section 11 prediction and a clear signal " - "that Smart Compile alone, without the chunking layer the paper specifies, is the wrong " - "way to deliver context.", - BODY, -)) - -# C.2 Aggregate table -story.append(Paragraph("C.2 Per-mode aggregates (N=15 tasks, 3 judge runs each)", H2)) -mode_rows = [] -for m in ("no_context", "smart", "search"): - a = data["by_mode"][m] - mode_rows.append([ - m, - fmt(a["input_tokens_median"]), - f"{a['coverage_pct_mean']:.0f}%", - f"{a['bonus_pct_mean']:.0f}%", - f"{a['factual_median']:.1f}", - f"{a['specific_median']:.1f}", - f"{a['complete_median']:.1f}", - f"{a['hallucinations_mean']:.2f}", - f"{a['composite_15_median']:.1f}", - f"${a['cost_usd_total']:.2f}", - ]) -story.append(base_table( - ["Mode", "med tk", "Cov", "Bonus", "Fact", "Spec", "Comp", "Halluc", "Comp/15", "Cost"], - mode_rows, - [22*mm, 16*mm, 12*mm, 14*mm, 13*mm, 13*mm, 13*mm, 16*mm, 17*mm, 16*mm], -)) -story.append(Paragraph( - "Table C.1. Per-mode aggregates. Fact/Spec/Comp are median axes (1-5) across judge runs. " - "Halluc is mean fabricated facts per response. Comp/15 is median composite (sum of axes). 
" - "Cost is total API spend for the corpus at this mode.", - CAPTION, -)) - -# Paired comparisons -story.append(Paragraph("C.3 Paired comparisons (per-task delta)", H2)) -pc_rows = [] -for k in ("smart_vs_no_context", "search_vs_no_context", "smart_vs_search"): - p = data["paired"][k] - worst = p["worst_task"][0] if p["worst_task"] else "—" - best = p["best_task"][0] if p["best_task"] else "—" - pc_rows.append([ - k.replace("_", " "), - str(p["n"]), - f"{p['mean_delta']:+.2f}", - f"{p['wins']}/{p['ties']}/{p['losses']}", - worst, - best, - ]) -story.append(base_table( - ["Comparison", "n", "mean delta", "W/T/L", "Worst task", "Best task"], - pc_rows, - [42*mm, 10*mm, 22*mm, 20*mm, 38*mm, 38*mm], -)) -story.append(Paragraph( - "Table C.2. Paired per-task deltas, composite (0-15 scale). 'Worst task' is the task " - "with the largest negative delta; 'best' the largest positive. Smart Compile loses on " - "more than half the corpus.", - CAPTION, -)) - -story.append(PageBreak()) - -# C.4 What happens when smart loses? -story.append(Paragraph("C.4 The failure mode, concretely", H2)) -story.append(Paragraph( - "The single worst Smart-vs-no_context result was comfy-prompt-fantasy " - "(delta = -10). The task asks for a ComfyUI prompt. The no-context response delivered " - "exactly that — positive prompt, negative prompt, sampler / steps / CFG / resolution / " - "model recommendations (Haiku judge: factual=4 specific=5 complete=4, hallucinations=1).", - BODY, -)) -story.append(Paragraph( - "The smart-mode response, given 36k tokens of skill context including theme-factory " - "themes and prompt-builder bodies, instead wrote a 700-token philosophical essay called " - "'Mystral Dusk: an algorithmic philosophy of atmospheric emergence and temporal " - "liminal states.' It described the aesthetic at length, never produced an actual " - "prompt, never named a sampler or resolution. (Haiku judge: factual=1 specific=1 " - "complete=1, hallucinations=2.67.) 
The user could not run anything from the response.", - BODY, -)) -story.append(Paragraph( - "This is not a quirk. It's the predicted-by-Section-11 attention-dilution regime: when " - "a model is given many full skill bodies as a single system prompt, it picks up on the " - "meta-content (style discussion, philosophy of design, when-to-use prose) and produces " - "meta-content of its own, drifting away from the concrete task. The smaller, terser " - "search-mode context (1.1k tokens) does not trigger this, and neither does no-context.", - BODY, -)) - -# C.5 Retrieval quality -story.append(Paragraph("C.5 Retrieval quality (search mode)", H2)) -r = data["retrieval_quality"] -story.append(Paragraph( - f"Precision@8 mean = {r['precision_at_k_mean']:.2f} · " - f"Recall@8 mean = {r['recall_at_k_mean']:.2f} · " - f"MRR mean = {r['mrr_mean']:.2f}. " - f"{len(r['complete_misses'])} of {r['n']} tasks had complete retrieval misses — " - "zero relevant chunks in the top 8. Search quality is the bottleneck.", - BODY, -)) -story.append(Paragraph( - "Where retrieval works (brand-poster P@8=0.5, R@8=1.0, MRR=1.0) the search-mode response " - "matches or exceeds no-context. Where retrieval fails completely (5 tasks including " - "internal-release-note, pc-cpu-hog, claude-api-migrate) the search-mode response " - "degrades. The deterministic Precision@K predictor identifies these cases before the LLM " - "is invoked — improving retrieval is the highest-leverage fix in the v1.x roadmap.", - BODY, -)) - -# C.6 Why v1.1 and v1.2 looked different -story.append(Paragraph("C.6 Why v1.1 and v1.2 looked positive", H2)) -story.append(Paragraph( - "v1.1 used a single Claude session as both task model and judge, scoring on a holistic " - "1-10 rubric without gold facts or hallucination counting. v1.2 added gold facts and a " - "decomposed rubric but kept session-as-judge for the LLM axes. 
Both runs scored Smart at " - "100% composite on the 5-task subset.", - BODY, -)) -story.append(Paragraph( - "v1.3's three changes — different judge model (Haiku, not Sonnet), three runs per " - "response (variance instead of point estimates), and full 15-task coverage — " - "uncovered the failure mode. Same-model self-evaluation systematically under-penalises " - "verbose, on-style outputs that miss the task; cross-model judging exposes them. The " - "v1.1 and v1.2 numbers were not lies, but they were biased in a predictable direction.", - BODY, -)) - -# C.7 What the paper should now claim -story.append(Paragraph("C.7 What the paper should claim", H2)) -story.append(Paragraph( - "Validated claims (from this run):", - BODY, -)) -validated = [ - "Token reduction works. Smart cuts naive baseline by ~60%; Search by 99.5%. These " - "are reproducible, deterministic measurements unaffected by judging.", - "Section 11 (Multi-Resolution Packaging) is necessary, not optional. Full-body " - "skill injection produces measurable attention-dilution failures that no-context " - "responses do not exhibit. This is the strongest empirical signal in the appendix.", - "Retrieval quality is the bottleneck. Precision@8 of 0.23 and 5/15 complete " - "misses cap how much value Search can deliver. Improving retrieval (Phase 2 in the " - "build roadmap) has the highest expected payoff.", - "The brokered-context architecture is correct; the v0.3.x implementation isn't done. " - "CE's job is to broker context. v0.3.x brokers by selection + retrieval. The data says it " - "also needs to broker by summarising, chunking, deduplicating, and reranking before " - "anything reaches the model.", -] -for v in validated: - story.append(Paragraph(f"• {v}", BODY)) - -story.append(Paragraph("Retracted or revised claims (vs v1.1):", BODY)) -retracted = [ - "Smart Compile does NOT 'hold quality flat at 9.0/10'. It loses 8/15 tasks on composite " - "quality vs no-context under a cross-model judge. 
Hypothesis 1 is contradicted in the " - "v0.3.x implementation form.", - "MCP Search does NOT 'add value' on every task; it adds value where retrieval succeeds " - "(2/15 clear wins, 5/15 ties at no-context-equivalent quality) and degrades quality " - "where retrieval fails. Both wins and losses correlate with Precision@K.", - "The 'tokens per quality point' framing was misleading at face value. The right number " - "is tokens per quality point of lift over no-context — and that number is negative " - "for Smart on most tasks (no lift achieved at any token cost).", -] -for v in retracted: - story.append(Paragraph(f"• {v}", BODY)) - -# C.8 Implications for build roadmap -story.append(Paragraph("C.8 Implications for the build roadmap (Section 34)", H2)) -story.append(Paragraph( - "Phase 2 (Dedup and Rank) gains urgency. Reducing skill-body noise via dedup, and " - "promoting the most-useful chunks via reranking, directly addresses the attention-" - "dilution failures observed here. The dedup report should run before any Smart Compile " - "output is materialised.", - BODY, -)) -story.append(Paragraph( - "Phase 3 (Smart Compile) needs to be re-scoped. The current implementation selects " - "skills, then loads their full bodies. It should select skills, then load their compressed " - "summaries plus relevant chunks — the multi-resolution packaging from Section 11. Until " - "that ships, recommending Smart Compile to users is recommending a quality regression on " - "more than half their tasks.", - BODY, -)) -story.append(Paragraph( - "Phase 1 (Vector Foundation) needs a retrieval-quality dashboard. P@K, R@K, and " - "MRR per task should be visible in the CE UI. 
Users (and the team) should see when " - "retrieval is failing for a task class, not discover it via downstream quality drops.", - BODY, -)) - -# C.9 Methodology + reproducibility -story.append(Paragraph("C.9 Methodology and reproducibility", H2)) -story.append(Paragraph( - f"Pipeline: app/bench/run_v13.py + " - f"aggregate_v13.py. Task model {data['task_model']}; " - f"judge {data['judge_model']} × {data['judge_runs']} runs. 15 tasks × 3 modes = 45 task " - "calls + 135 judge calls = 180 API calls. Deterministic scoring (fact recall + retrieval " - "quality) runs alongside LLM judging — same numbers reproducible without any API key.", - BODY, -)) -story.append(Paragraph("Reproduce:", BODY)) -story.append(Paragraph("export ANTHROPIC_API_KEY=...", CODE)) -story.append(Paragraph("python bench/fetch_contexts.py # rebuild contexts/", CODE)) -story.append(Paragraph("python bench/run_v13.py # full pipeline", CODE)) -story.append(Paragraph("python bench/aggregate_v13.py # summary + per-task table", CODE)) - -story.append(Paragraph("C.10 Honest limitations still standing", H2)) -limits = [ - "Single judge family. Anthropic Haiku × 3 runs gives variance but doesn't break " - "intra-family bias. Adding GPT-4o-mini or Gemini Flash as a second judge is the v1.4 step.", - "Sonnet as the task model. Smaller models may benefit more from injected context — " - "the attention-dilution regime is model-size-dependent. Re-run on Haiku and on a local " - "8B model to test Hypothesis 3.", - "Skill corpus written for retrieval, not injection. Many skill bodies have meta-" - "prose (when to use, anti-patterns, lifecycle notes) that hurts when shown in full. " - "Authoring skills with both retrieval-shape and injection-shape variants is on the table.", - "One retrieval call per task. 
Real MCP hosts iterate; production search traffic " - "may close some of the search-mode gap observed here.", -] -for c in limits: - story.append(Paragraph(f"• {c}", BODY)) - -story.append(Spacer(1, 14)) -story.append(Paragraph( - "The v1.3 finding is the most useful result this benchmark has produced. It validates " - "the paper's core architectural thesis (a context broker is needed; multi-resolution " - "packaging is necessary) while invalidating the current implementation's claim to " - "deliver that thesis. The right product response is to ship Phase 2 (Dedup and Rank) and " - "the multi-resolution layer before claiming Smart Compile is production-ready, and to " - "communicate to users that v0.3.x's Smart Compile is a token-saving feature, not a " - "quality-improving one. The benchmark itself now stands as the test these phases must " - "pass to ship.", - QUOTE, -)) - - -def on_page(canvas, doc): - canvas.saveState() - canvas.setFont("Helvetica", 8) - canvas.setFillColor(colors.HexColor("#999999")) - canvas.drawString(20*mm, 12*mm, "Context Engine White Paper — v1.3 Appendix C") - canvas.drawRightString(A4[0] - 20*mm, 12*mm, f"C-{doc.page}") - canvas.restoreState() - - -doc = SimpleDocTemplate( - str(ADDENDUM_PDF), pagesize=A4, - leftMargin=22*mm, rightMargin=22*mm, topMargin=22*mm, bottomMargin=22*mm, - title="Context Engine White Paper v1.3 — Appendix C", - author="Jeremy Walder-Willows", -) -doc.build(story, onFirstPage=on_page, onLaterPages=on_page) -print(f"Wrote: {ADDENDUM_PDF}") - -writer = PdfWriter() -for p in PdfReader(str(SOURCE_PDF)).pages: - writer.add_page(p) -for p in PdfReader(str(ADDENDUM_PDF)).pages: - writer.add_page(p) -with OUTPUT_PDF.open("wb") as f: - writer.write(f) -print(f"Wrote: {OUTPUT_PDF}") diff --git a/bench/legacy/codex-handoff.md b/bench/legacy/codex-handoff.md deleted file mode 100644 index 18f1e0f..0000000 --- a/bench/legacy/codex-handoff.md +++ /dev/null @@ -1,164 +0,0 @@ -# Handoff to Codex — Tokenomics Benchmark Writeup 
- -You're picking up a three-iteration tokenomics benchmark of the Context Engine (CE) and producing a final consolidated PDF with tables, charts, and a critical analysis. The benchmark went through v1.1 → v1.2 → v1.3 and the headline result changed materially each time — the writeup must handle that arc honestly, not paper over it. - -## Your job - -Produce **one consolidated PDF** that: - -1. Cross-references all three benchmark iterations against each other (claims, retractions, methodology changes) -2. Cross-references the v1.3 findings against the white paper's own predictions (especially Section 11 — Multi-Resolution Context Packaging — and Hypothesis 1 in Section 32) -3. Includes every relevant table and chart (specs below) -4. Calls out what's defensible, what's been retracted, and what's still pending -5. Lands on a publication-grade summary the team can use to make product decisions - -Format: A4 PDF, ~15-25 pages. Same restrained design language as the existing white paper (Anthropic-ish: ivory background, slate text, single coral accent, generous whitespace). Use reportlab — it's already installed. - -Output file: `Context Engine Benchmark Report v1.0.pdf` at the repo root. 
- -## Source files (all in `app/bench/`) - -| File | What it is | -| ------------------------------------- | --------------------------------------------------------------------------------- | -| `tasks.json` | The 15-task corpus | -| `gold-answers.json` | Per-task must/may/forbidden facts + expected_sources for retrieval scoring | -| `results-latest.json` | v1.1: 15-task token-only run, no quality grading | -| `results-graded-sample.json` | v1.1: 5-task session-as-judge grading (holistic 1-10) | -| `results-v12-final.json` | v1.2: 5-task subset with deterministic gold scoring + session decomposed rubric | -| `results-v13.json` | v1.3: raw 15-task × 3-mode × 3-judge-run rows | -| `results-v13-final.json` | v1.3: aggregated v1.3 with paired comparisons, retrieval quality, lift efficiency | -| `responses-v13.json` | All 45 task-model responses (Sonnet 4.5) captured during v1.3 | -| `contexts/manifest.json` | Per-task smart and search context token counts | -| `contexts/*.smart.md` / `*.search.md` | The actual contexts each mode injected | - -## Source PDFs (read for cross-reference, especially Sections 11, 19, 32, 34) - -| File | What it is | -| ------------------------------------- | ------------------------------------------- | -| `Context Engine White Paper V1.pdf` | Original paper (41p) | -| `Context Engine White Paper V1.1.pdf` | V1 + Appendix A (v1.1 benchmark) | -| `Context Engine White Paper V1.2.pdf` | V1.1 + Appendix B (v1.2 critique-addressed) | -| `Context Engine White Paper V1.3.pdf` | V1.2 + Appendix C (v1.3 full pipeline) | - -## The narrative arc — handle this carefully - -**v1.1** (5-task, session-as-judge, holistic 1-10): "Smart 9.0/10, Search 7.6/10, Smart-Compile saves 65% tokens with no quality cost." → optimistic, claimed Hypothesis 1 holds. - -**v1.2** (5-task, gold facts + decomposed rubric, still session-judge): added no-context baseline. Found Smart adds +20 composite points over no-context at ~58k tokens; Search adds +6.7 at ~700 tokens. 
More sober but still positive. - -**v1.3** (15-task, Sonnet task model, Haiku judge × 3 runs, full pipeline): **Smart Compile in its v0.3.x form NET-DEGRADES quality vs no-context** — loses on 8/15 tasks, mean delta -1.27 composite/15. Search also slightly under-performs no-context (-0.80). Both v1.1 and v1.2 headlines retracted. - -**Why the change**: same-model self-evaluation in v1.1/v1.2 under-penalised verbose, on-style outputs that drift away from the task. A cross-model judge (Haiku, smaller than the Sonnet task model) caught the failure. The single worst case: `comfy-prompt-fantasy` smart-mode — given 36k tokens of skill context, Sonnet wrote a philosophical essay titled "Mystral Dusk" instead of a ComfyUI prompt. - -**What v1.3 _validates_** (don't bury this): - -- Token reduction is real (Smart -60%, Search -99.5%) — deterministic, judge-independent -- Section 11's prediction (multi-resolution packaging is necessary) is empirically confirmed -- Retrieval quality (P@8 = 0.23, 5/15 complete misses) is the bottleneck — Phase 2 in build roadmap (dedup + rank) is the highest-leverage fix -- The brokered-context architecture is right; the v0.3.x _implementation_ isn't done - -## Suggested visualisations - -Each chart should have a one-sentence caption that names the takeaway, not just labels the axes. - -### Chart 1 — Token cost by mode (log-scale bar chart) - -- X: mode (raw_all baseline, smart, search, no_context) -- Y: median input tokens, log scale -- Bars: raw_all 186,654 / smart 74,279 / search 903 / no_context 29 -- Takeaway caption: "Four orders of magnitude separate the cheapest path from the naive baseline." 
- -### Chart 2 — Composite quality per mode, all 15 tasks (grouped bar or heatmap) - -- X: 15 task IDs -- Y: composite quality (0-15 scale) -- 3 bars/cells per task: no_context, smart, search -- Source: `results-v13-final.json` → `detail[i].{mode}.composite_15` -- Takeaway: "Smart Compile loses on 8 of 15 tasks despite costing 2,500× more tokens than no-context." - -### Chart 3 — Paired delta plot (waterfall or dot) - -- X: 15 tasks, sorted by smart-vs-no_context delta -- Y: delta in composite points (positive = Smart wins) -- Source: `results-v13-final.json` → `detail` → compute per-task `smart.composite_15 - no_context.composite_15` -- Annotate `comfy-prompt-fantasy` (worst, -10) and `memory-consolidate` (best, +4) -- Takeaway: "When Smart Compile fails, it fails catastrophically; when it wins, it wins modestly." - -### Chart 4 — Hallucination distribution (box plot or strip) - -- X: mode -- Y: hallucinations_mean (judge runs averaged) per task -- Show: no_context cluster around 0.98, smart at 1.49 mean with high outliers (4.33 on claude-api-migrate), search at 1.20 -- Takeaway: "Loading more context creates more fabricated facts, not fewer." - -### Chart 5 — Retrieval quality predicts quality outcome (scatter) - -- X: Precision@8 per task (search mode, from `results-v13-final.json` → `retrieval_quality`) -- Y: search-mode composite quality delta vs no_context -- Trend line + labels for outliers -- Takeaway: "Where retrieval surfaces relevant chunks, search adds value; where it misses, search degrades." - -### Chart 6 — Cost per quality-point-of-lift over no-context - -- Bar: smart vs search -- Y: tokens per +1 composite point of lift (when lift > 0) -- Smart: 80,373 tk/+1pt | Search: 1,052 tk/+1pt | also show count of "negative-lift" tasks (smart 8/15, search 8/15) -- Takeaway: "Search is 76× more efficient per quality-point of lift, but both modes show negative lift on the majority of tasks." 
- -### Chart 7 — Methodology change vs result (narrative figure) - -- Three rows (v1.1 / v1.2 / v1.3), three columns showing: N, judge config, headline composite quality of Smart -- Visually emphasises how methodology drove the result -- Takeaway: "Self-evaluation bias accounted for ~3 composite points of inflation in the v1.1 result." - -### Chart 8 — Latency & cost per mode (small multiples) - -- Two side-by-side bars: median latency (ms) and total cost (USD) per mode for the v1.3 run -- Source: `results-v13-final.json` → `by_mode` per-mode `latency_ms_median`, `cost_usd_total` - -## Tables to include (verbatim from existing appendices) - -1. **Per-mode aggregate, all 15 tasks** (from Appendix C Table C.1) -2. **Paired comparison** (Smart/Search vs no_context, Smart vs Search) (Appendix C Table C.2) -3. **Per-task detail** (15 rows × 3 modes × {tokens, cov%, bonus%, fact/spec/comp/halluc, composite/15}) — pull from `results-v13-final.json` → `detail` -4. **Retrieval quality per task** (P@8, R@8, MRR, hits) — pull from `results-v13.json` → rows where mode == "search" → `retrieval_quality` -5. **Methodology evolution** (v1.1 → v1.2 → v1.3 differences in N, judge, rubric, gold answers, modes measured) — synthesise - -## Structure suggestion (15-25 pages) - -1. Executive summary (1 page) — headline numbers, big finding, the methodology-changed-the-result story -2. Methodology evolution (2 pages) — what each iteration changed and why -3. Headline results — v1.3 numbers (3-4 pages) — tables 1, 2, charts 1, 2, 3 -4. Cross-reference: white paper predictions vs v1.3 measurements (2 pages) — especially Section 11 vindication, Hypothesis 1 retraction -5. The failure mode in detail (1-2 pages) — comfy-prompt-fantasy Mystral Dusk case study (the actual responses are in `responses-v13.json`) -6. Hallucinations + retrieval-quality findings (2-3 pages) — charts 4, 5, table 4 -7. What's validated, what's retracted (1 page) — the honest list -8. 
Implications for build roadmap (1-2 pages) — Phase 2 urgency, re-scoping Phase 3, retrieval-quality dashboard -9. Honest limitations still standing (1 page) — single judge family, single task model, etc. -10. Appendix: full per-task detail table - -## Things the writeup MUST get right - -- **Don't bury the retraction.** v1.1 said "Smart holds at 9.0/10". v1.3 says Smart loses on 8/15 tasks. Both numbers are in the data; the writeup needs to say _why_ they differ (judge bias). -- **Don't bury the validation either.** Token reduction is real and large. Section 11 prediction is empirically confirmed. The architecture is right. -- **The product framing matters.** Current Smart Compile (v0.3.x) is a token-saving feature, not a quality-improving one. Recommending it to users today is recommending a quality regression on most tasks. That's the honest framing; don't soften it. -- **The roadmap framing matters.** Phase 2 (dedup + rank) and the Section 11 chunked-smart-compile aren't nice-to-haves; they're load-bearing for the product's value claim. Make that visible. - -## Files Codex should not modify - -The existing `Context Engine White Paper V1.x.pdf` series is the historical record. Don't rebuild those. Don't modify `tokenomics.py`, `grade_v2.py`, `run_v13.py`, `aggregate_v13.py`, or any of the JSON result files. Read-only. 
- -## Files Codex creates - -- `Context Engine Benchmark Report v1.0.pdf` (the main deliverable, at repo root) -- `app/bench/charts/*.png` (rendered chart images, kept around for re-use) -- `app/bench/build_report.py` (the reportlab/matplotlib script that produces the PDF; future-runnable) - -## Done criteria - -- PDF builds end-to-end from a single `python app/bench/build_report.py` invocation -- All 8 charts present, each with a take-away caption (not a label) -- All 5 tables present -- The v1.1 → v1.3 retraction is clearly explained, not euphemistic -- Section 11 vindication and Hypothesis 1 retraction both flagged in the executive summary -- File size under 10 MB (use 200 DPI for charts, not 600) diff --git a/bench/run_continuity.py b/bench/run_continuity.py deleted file mode 100644 index efd889a..0000000 --- a/bench/run_continuity.py +++ /dev/null @@ -1,268 +0,0 @@ -#!/usr/bin/env python3 -""" -run_continuity.py — Continuity benchmark for Context Engine. - -Tests whether session 2 of a paused workflow inherits state established in -session 1, across three modes: - - no_context_cold — S2 has no memory of S1. Baseline of pain. - System prompt empty; only the S2 user prompt is given. - ce_full — S2 gets the full session_1_state object as a memory - snapshot. Simulates CE persisting and replaying the - entire state. - ce_compact — S2 gets the compact handoff note (1-2 sentences) - CE emits. Simulates the realistic delivery — what CE - would inject as a memory entry, not the raw state dump. 
- -For each (task, mode) pair we measure: - • input/output tokens, latency, dollar cost - • inherited-fact coverage (deterministic; must_inherit keyword hits) - • reset-question count (deterministic; regex matches on response) - • forbidden-drift count (deterministic; alternative-choice hits) - • decomposed rubric (Haiku judge x N runs: factual / specific / complete / hallucinations) - -Headline metric: time/tokens/turns to productive output in S2 with CE-backed -state vs cold start. The lower the ce_continue numbers (vs no_context_cold), -the stronger the continuity claim. - -Requires ANTHROPIC_API_KEY in env. - -Usage: - export ANTHROPIC_API_KEY=... - python bench/run_continuity.py # all 7 tasks, 3 modes, 3 judge runs - python bench/run_continuity.py --tasks-subset 2 # smoke test - python bench/run_continuity.py --modes no_context_cold,ce_compact # just two modes -""" -from __future__ import annotations -import argparse -import json -import os -import re -import sys -import time -from dataclasses import asdict -from pathlib import Path - -sys.path.insert(0, str(Path(__file__).parent)) -from grade_v2 import ( # noqa: E402 - keyword_hits, parse_decomposed, DECOMPOSED_RUBRIC, cost_usd, - aggregate_grades, DecomposedGrade, -) - -HERE = Path(__file__).parent -DATA = HERE / "data" / "continuity" - - -def system_for_mode(mode: str, task: dict) -> str: - """Build the system prompt that simulates each delivery mode of CE state.""" - if mode == "no_context_cold": - return "" - if mode == "ce_full": - state = task["session_1_state"] - return ( - "You are an AI assistant continuing a paused workflow. The user's " - "previous session established the following state, which you " - "should respect and build on without re-asking:\n\n" - f"PREVIOUS SESSION CONTEXT:\n{json.dumps(state, indent=2)}\n\n" - "Do not re-ask the user for facts that are already in the state above. " - "Continue from where the previous session left off." 
- ) - if mode == "ce_compact": - handoff = task["session_1_compact_handoff"] - return ( - "You are an AI assistant continuing a paused workflow. The user's " - "previous session left this handoff note:\n\n" - f"HANDOFF FROM PREVIOUS SESSION:\n{handoff}\n\n" - "Do not re-ask the user for facts that are already covered in the " - "handoff above. Continue from where the previous session left off." - ) - return "" - - -def score_inheritance(response: str, gold: dict) -> dict: - """Deterministic scoring of state inheritance.""" - must = gold.get("must_inherit", []) - reset_qs = gold.get("reset_questions", []) - forbidden = gold.get("forbidden_drift", []) - - must_hit = 0 - must_results = [] - for f in must: - hits = keyword_hits(response, f.get("keywords", [])) - ok = bool(hits) - if ok: - must_hit += 1 - must_results.append({"name": f["name"], "hit": ok, "matched": hits}) - - reset_hits = 0 - reset_results = [] - norm = response.lower() - for q in reset_qs: - matched = False - matches = [] - for pat in q.get("patterns", []): - if re.search(pat.lower(), norm): - matched = True - matches.append(pat) - if matched: - reset_hits += 1 - reset_results.append({"name": q["name"], "hit": matched, "patterns_matched": matches}) - - forbidden_hit = 0 - forbidden_results = [] - for f in forbidden: - hits = keyword_hits(response, f.get("keywords", [])) - ok = bool(hits) - if ok: - forbidden_hit += 1 - forbidden_results.append({"name": f["name"], "hit": ok, "matched": hits}) - - return { - "must_total": len(must), "must_hit": must_hit, - "inheritance_pct": round(100 * must_hit / len(must), 1) if must else 0, - "reset_questions_total": len(reset_qs), "reset_questions_hit": reset_hits, - "forbidden_drift_hit": forbidden_hit, - "must_results": must_results, - "reset_results": reset_results, - "forbidden_results": forbidden_results, - } - - -def call_anthropic(client, model: str, system: str, user: str, max_tokens: int = 600) -> dict: - t0 = time.time() - kwargs = {"model": model, 
"max_tokens": max_tokens, "messages": [{"role": "user", "content": user}]} - if system: - kwargs["system"] = system - resp = client.messages.create(**kwargs) - text = "".join(b.text for b in resp.content if getattr(b, "type", None) == "text") - return { - "text": text.strip(), - "input_tokens": int(resp.usage.input_tokens), - "output_tokens": int(resp.usage.output_tokens), - "latency_ms": int((time.time() - t0) * 1000), - } - - -def main() -> int: - p = argparse.ArgumentParser() - p.add_argument("--tasks", default=str(HERE / "continuity-tasks.json")) - p.add_argument("--gold", default=str(HERE / "continuity-gold.json")) - p.add_argument("--out", default=str(DATA / "results-continuity.json")) - p.add_argument("--responses-out", default=str(DATA / "responses-continuity.json")) - p.add_argument("--task-model", default="claude-sonnet-4-5") - p.add_argument("--judge-model", default="claude-haiku-4-5") - p.add_argument("--judge-runs", type=int, default=3) - p.add_argument("--modes", default="no_context_cold,ce_compact,ce_full") - p.add_argument("--tasks-subset", type=int, default=0) - p.add_argument("--max-out", type=int, default=600) - args = p.parse_args() - - api_key = os.environ.get("ANTHROPIC_API_KEY") - if not api_key: - sys.stderr.write("[!] ANTHROPIC_API_KEY not set\n") - return 1 - try: - import anthropic - except ImportError: - sys.stderr.write("[!] 
pip install anthropic\n") - return 1 - client = anthropic.Anthropic(api_key=api_key) - - tasks = json.loads(Path(args.tasks).read_text(encoding="utf-8"))["tasks"] - gold = json.loads(Path(args.gold).read_text(encoding="utf-8"))["tasks"] - if args.tasks_subset: - tasks = tasks[: args.tasks_subset] - modes = [m.strip() for m in args.modes.split(",") if m.strip()] - - print(f"Continuity benchmark — task={args.task_model} judge={args.judge_model} x {args.judge_runs}") - print(f"Tasks: {len(tasks)} Modes: {modes}\n") - - responses = {} - rows = [] - - for ti, task in enumerate(tasks, 1): - tid = task["id"] - s2_prompt = task["session_2_prompt"] - print(f" [{ti}/{len(tasks)}] {tid}") - responses[tid] = {} - gold_t = gold.get(tid, {}) - - for mode in modes: - system = system_for_mode(mode, task) - try: - r = call_anthropic(client, args.task_model, system, s2_prompt, args.max_out) - except Exception as e: - print(f" {mode:<18} FAIL {e}") - continue - r["model"] = args.task_model - r["system_chars"] = len(system) - responses[tid][mode] = r - - inh = score_inheritance(r["text"], gold_t) - - # Judge runs - judge_grades = [] - user_judge = ( - f"TASK (session 2 of a paused workflow):\n{s2_prompt}\n\n" - f"SESSION 1 SUMMARY (what the user had already done):\n{task['session_1_summary']}\n\n" - f"RESPONSE TO GRADE:\n{r['text']}" - ) - for run_idx in range(args.judge_runs): - try: - jr = call_anthropic(client, args.judge_model, DECOMPOSED_RUBRIC, - user_judge, max_tokens=200) - g = parse_decomposed(jr["text"]) - g.judge = f"anthropic:{args.judge_model}" - g.run_idx = run_idx - judge_grades.append(g) - except Exception as e: - judge_grades.append(DecomposedGrade(notes=f"(judge err: {e})", - judge=f"anthropic:{args.judge_model}", - run_idx=run_idx)) - judge_summary = aggregate_grades(judge_grades) - - print(f" {mode:<18} " - f"in={r['input_tokens']:>5,} out={r['output_tokens']:>4} " - f"inh={inh['inheritance_pct']:>5.1f}% " - f"resets={inh['reset_questions_hit']} " - 
f"drift={inh['forbidden_drift_hit']} " - f"fact={judge_summary.get('factual_median', 0):.1f} " - f"spec={judge_summary.get('specific_median', 0):.1f} " - f"comp={judge_summary.get('complete_median', 0):.1f} " - f"halluc={judge_summary.get('hallucinations_mean', 0)}") - - rows.append({ - "task_id": tid, - "category": task["category"], - "mode": mode, - "model": args.task_model, - "input_tokens": r["input_tokens"], - "output_tokens": r["output_tokens"], - "latency_ms": r["latency_ms"], - "cost_usd": round(cost_usd(args.task_model, r["input_tokens"], r["output_tokens"]), 6), - "system_chars": r["system_chars"], - "inheritance": inh, - "judge_runs": [asdict(g) for g in judge_grades], - "judge_summary": judge_summary, - }) - - responses_path = Path(args.responses_out) - out_path = Path(args.out) - responses_path.parent.mkdir(parents=True, exist_ok=True) - out_path.parent.mkdir(parents=True, exist_ok=True) - responses_path.write_text(json.dumps(responses, indent=2), encoding="utf-8") - out_path.write_text(json.dumps({ - "ran_at": time.strftime("%Y-%m-%dT%H:%M:%S"), - "task_model": args.task_model, - "judge_model": args.judge_model, - "judge_runs": args.judge_runs, - "task_count": len(tasks), - "modes": modes, - "rows": rows, - }, indent=2), encoding="utf-8") - print(f"\nWrote: {responses_path}\nWrote: {out_path}") - return 0 - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/bench/run_v12.py b/bench/run_v12.py deleted file mode 100644 index 8511c04..0000000 --- a/bench/run_v12.py +++ /dev/null @@ -1,167 +0,0 @@ -#!/usr/bin/env python3 -""" -run_v12.py — v1.2 benchmark orchestrator. - -Replaces tokenomics.py --grade with the full critique-addressing measurement -pipeline. 
Reads gold-answers.json + tasks.json, runs every task through up to -four context paths (no-context, smart, search, raw_all-skipped-for-cost), -grades each response by: - - deterministic gold-fact recall (always run; no API key needed) - - retrieval Precision@K / Recall@K / MRR (always run if expected_sources labeled) - - decomposed LLM rubric (factual / specific / complete / hallucinations) - via multiple judges across families (anthropic, openai), N runs each. - - failure-mode taxonomy (inferred from gold misses) - -Captures latency + cost per call. Writes bench/data/v1.2/results-v12.json. - -When API keys aren't available, the script still produces meaningful results -via the deterministic scorers (gold recall + retrieval metrics). Use ---responses path/to/file.json to score pre-generated responses without -calling the task model. - -Usage: - # Full pipeline with API keys: - export ANTHROPIC_API_KEY=... - export OPENAI_API_KEY=... - python bench/run_v12.py --judges anthropic:haiku,openai:gpt-4o-mini --runs 3 - - # Deterministic-only scoring of pre-generated responses: - python bench/run_v12.py --responses bench/data/v1.2/responses-v12.json --no-llm-grade - - # Single judge, low cost smoke test: - python bench/run_v12.py --judges anthropic:haiku --runs 1 --tasks-subset 5 -""" -from __future__ import annotations - -import argparse -import json -import os -import sys -import time -from dataclasses import asdict -from pathlib import Path - -sys.path.insert(0, str(Path(__file__).parent)) -from grade_v2 import ( # noqa: E402 - JudgeConfig, score_facts, score_retrieval, - grade_response_decomposed, aggregate_grades, cost_usd, -) - -# Hard-coded judge presets so the CLI stays short. 
-JUDGE_PRESETS = { - "anthropic:haiku": JudgeConfig("anthropic:haiku", "anthropic", "claude-haiku-4-5", "ANTHROPIC_API_KEY"), - "anthropic:sonnet": JudgeConfig("anthropic:sonnet", "anthropic", "claude-sonnet-4-5", "ANTHROPIC_API_KEY"), - "openai:gpt-4o-mini": JudgeConfig("openai:gpt-4o-mini", "openai", "gpt-4o-mini", "OPENAI_API_KEY"), - "openai:gpt-4o": JudgeConfig("openai:gpt-4o", "openai", "gpt-4o", "OPENAI_API_KEY"), -} - -HERE = Path(__file__).parent -DATA = HERE / "data" / "v1.2" - - -def load_responses(path: Path) -> dict: - """Pre-generated responses from disk. Schema: - { task_id: { context_mode: { "text": ..., "input_tokens": ..., "output_tokens": ..., - "latency_ms": ..., "model": ..., "retrieved_skill_ids": [...] }}}""" - return json.loads(path.read_text(encoding="utf-8")) - - -def main() -> int: - p = argparse.ArgumentParser() - p.add_argument("--gold", default=str(HERE / "gold-answers.json")) - p.add_argument("--tasks", default=str(HERE / "tasks.json")) - p.add_argument("--responses", default=str(DATA / "responses-v12.json"), - help="Pre-generated responses to score. Required when --no-task-run.") - p.add_argument("--out", default=str(DATA / "results-v12.json")) - p.add_argument("--judges", default="anthropic:haiku", - help="Comma-separated judge presets, e.g. 'anthropic:haiku,openai:gpt-4o-mini'") - p.add_argument("--runs", type=int, default=3, help="Judge runs per response.") - p.add_argument("--no-llm-grade", action="store_true", - help="Skip the LLM decomposed-rubric pass. 
Deterministic scorers still run.") - p.add_argument("--tasks-subset", type=int, default=0, - help="Score only the first N tasks (for fast iteration).") - args = p.parse_args() - - gold_root = json.loads(Path(args.gold).read_text(encoding="utf-8")) - gold = gold_root["tasks"] - tasks = json.loads(Path(args.tasks).read_text(encoding="utf-8")) - if args.tasks_subset: - tasks = tasks[: args.tasks_subset] - - responses_path = Path(args.responses) - responses = load_responses(responses_path) if responses_path.exists() else {} - - judges = [JUDGE_PRESETS[j.strip()] for j in args.judges.split(",") if j.strip() in JUDGE_PRESETS] - if not args.no_llm_grade and judges: - print(f"Judges: {', '.join(j.name for j in judges)} x {args.runs} runs each") - else: - print("LLM grading: skipped (deterministic scorers only)") - - out = { - "ran_at": time.strftime("%Y-%m-%dT%H:%M:%S"), - "judges": [j.name for j in judges] if not args.no_llm_grade else [], - "runs_per_judge": args.runs if not args.no_llm_grade else 0, - "task_count": len(tasks), - "results": [], - } - - for task in tasks: - tid = task["id"] - if tid not in gold: - print(f" [skip] {tid}: no gold entry") - continue - if tid not in responses: - print(f" [skip] {tid}: no responses on disk") - continue - per_task = { - "task_id": tid, - "category": task.get("category", ""), - "prompt": task["prompt"], - "modes": {}, - } - for mode, resp in responses[tid].items(): - text = resp.get("text", "") - recall = score_facts(text, gold[tid]) - retrieval = None - if mode == "search": - retrieved = resp.get("retrieved_skill_ids", []) - expected = gold[tid].get("expected_sources", []) - if expected and retrieved: - retrieval = asdict(score_retrieval(retrieved, expected)) - llm_grades = [] - if not args.no_llm_grade and judges and text.strip(): - llm_grades = grade_response_decomposed( - task["prompt"], text, judges, runs=args.runs, - ) - llm_summary = aggregate_grades(llm_grades) if llm_grades else {"n": 0} - - model = resp.get("model", 
"unknown") - in_tok = int(resp.get("input_tokens", 0)) - out_tok = int(resp.get("output_tokens", 0)) - per_task["modes"][mode] = { - "model": model, - "input_tokens": in_tok, - "output_tokens": out_tok, - "latency_ms": int(resp.get("latency_ms", 0)), - "cost_usd": round(cost_usd(model, in_tok, out_tok), 6), - "fact_recall": asdict(recall), - "retrieval_quality": retrieval, - "llm_grades": [asdict(g) for g in llm_grades], - "llm_summary": llm_summary, - } - out["results"].append(per_task) - rs = per_task["modes"] - recall_line = " | ".join( - f"{m}={rs[m]['fact_recall']['coverage_pct']:.0f}%" for m in rs if "fact_recall" in rs[m] - ) - print(f" {tid:<28} {recall_line}") - - out_path = Path(args.out) - out_path.parent.mkdir(parents=True, exist_ok=True) - out_path.write_text(json.dumps(out, indent=2), encoding="utf-8") - print(f"\nWrote {args.out}") - return 0 - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/bench/run_v13.py b/bench/run_v13.py deleted file mode 100644 index 4295a0d..0000000 --- a/bench/run_v13.py +++ /dev/null @@ -1,219 +0,0 @@ -#!/usr/bin/env python3 -""" -run_v13.py — End-to-end v1.3 benchmark. - -Generates responses for every task × every mode using a real task model, then -grades each response N times with the configured judges, then writes the -final aggregated results. - -Pipeline: - 1. Load tasks + gold answers + pre-fetched contexts (from fetch_contexts.py). - 2. For each task, for each mode (no_context, smart, search): - a. Build the system prompt (empty / smart skill bodies / search chunks). - b. Call the task model. Record text, in/out tokens, latency. - 3. Score every response with the deterministic grade_v2 scorers: - fact-recall + retrieval-quality. - 4. Grade every response N times per judge with the decomposed rubric. - 5. Aggregate (median + IQR) and write bench/data/v1.3/results-v13.json. - -Requires ANTHROPIC_API_KEY in the env. Reads it once at startup; never -written to disk. 
- -Usage: - export ANTHROPIC_API_KEY=sk-ant-... - python bench/run_v13.py - python bench/run_v13.py --tasks-subset 5 --judge-runs 3 - python bench/run_v13.py --task-model claude-sonnet-4-5 --judge-model claude-haiku-4-5 -""" -from __future__ import annotations -import argparse -import json -import os -import sys -import time -from dataclasses import asdict -from pathlib import Path - -sys.path.insert(0, str(Path(__file__).parent)) -from grade_v2 import ( # noqa: E402 - score_facts, score_retrieval, parse_decomposed, - DECOMPOSED_RUBRIC, cost_usd, aggregate_grades, DecomposedGrade, -) - -HERE = Path(__file__).parent -DATA = HERE / "data" / "v1.3" -CONTEXTS = HERE / "artifacts" / "contexts" - - -def system_for_mode(mode: str, tid: str) -> tuple[str, list[str]]: - """Return (system_prompt_body, retrieved_skill_ids). - retrieved_skill_ids is non-empty only for mode='search'.""" - if mode == "no_context": - return "", [] - if mode == "smart": - path = CONTEXTS / f"{tid}.smart.md" - if not path.exists(): - return "", [] - body = path.read_text(encoding="utf-8") - return f"You are an AI assistant. Use the following retrieved knowledge where relevant.\n\n{body}", [] - if mode == "search": - path = CONTEXTS / f"{tid}.search.md" - if not path.exists(): - return "", [] - body = path.read_text(encoding="utf-8") - # Extract retrieved skill IDs from chunk headers - import re - ids = [] - seen = set() - for m in re.finditer(r"skill=(\S+)", body): - if m.group(1) not in seen: - seen.add(m.group(1)) - ids.append(m.group(1)) - return f"You are an AI assistant. 
Use the following retrieved chunks where relevant.\n\n{body}", ids - return "", [] - - -def call_anthropic(client, model: str, system: str, user: str, max_tokens: int = 700) -> dict: - t0 = time.time() - kwargs = {"model": model, "max_tokens": max_tokens, "messages": [{"role": "user", "content": user}]} - if system: - kwargs["system"] = system - resp = client.messages.create(**kwargs) - text = "".join(b.text for b in resp.content if getattr(b, "type", None) == "text") - return { - "text": text.strip(), - "input_tokens": int(resp.usage.input_tokens), - "output_tokens": int(resp.usage.output_tokens), - "latency_ms": int((time.time() - t0) * 1000), - } - - -def main() -> int: - p = argparse.ArgumentParser() - p.add_argument("--gold", default=str(HERE / "gold-answers.json")) - p.add_argument("--tasks", default=str(HERE / "tasks.json")) - p.add_argument("--out", default=str(DATA / "results-v13.json")) - p.add_argument("--responses-out", default=str(DATA / "responses-v13.json")) - p.add_argument("--task-model", default="claude-sonnet-4-5") - p.add_argument("--judge-model", default="claude-haiku-4-5") - p.add_argument("--judge-runs", type=int, default=3) - p.add_argument("--tasks-subset", type=int, default=0, help="Run only the first N tasks.") - p.add_argument("--modes", default="no_context,smart,search") - p.add_argument("--task-max-out", type=int, default=700) - args = p.parse_args() - - api_key = os.environ.get("ANTHROPIC_API_KEY") - if not api_key: - sys.stderr.write("[!] ANTHROPIC_API_KEY not set. Aborting.\n") - return 1 - - try: - import anthropic - except ImportError: - sys.stderr.write("[!] 
pip install anthropic\n") - return 1 - client = anthropic.Anthropic(api_key=api_key) - - gold_root = json.loads(Path(args.gold).read_text(encoding="utf-8")) - gold = gold_root["tasks"] - tasks = json.loads(Path(args.tasks).read_text(encoding="utf-8")) - if args.tasks_subset: - tasks = tasks[: args.tasks_subset] - modes = [m.strip() for m in args.modes.split(",") if m.strip()] - - print(f"Task model: {args.task_model}") - print(f"Judge model: {args.judge_model} x {args.judge_runs} runs per response") - print(f"Tasks: {len(tasks)} | modes: {modes}") - print() - - responses = {} - rows = [] - - for ti, task in enumerate(tasks, 1): - tid = task["id"] - prompt = task["prompt"] - print(f" [{ti:>2}/{len(tasks)}] {tid}") - responses[tid] = {} - gold_t = gold.get(tid, {}) - for mode in modes: - system, retrieved_ids = system_for_mode(mode, tid) - if mode != "no_context" and not system: - print(f" {mode:<11} (no context on disk; skipping)") - continue - try: - r = call_anthropic(client, args.task_model, system, prompt, args.task_max_out) - except Exception as e: - print(f" {mode:<11} FAIL {e}") - continue - r["model"] = args.task_model - if retrieved_ids: - r["retrieved_skill_ids"] = retrieved_ids - responses[tid][mode] = r - - # Score deterministically - recall = score_facts(r["text"], gold_t) - retrieval = None - if mode == "search": - expected = gold_t.get("expected_sources", []) - if expected and retrieved_ids: - retrieval = asdict(score_retrieval(retrieved_ids, expected)) - - # Judge N times - judge_grades = [] - user_judge = f"TASK:\n{prompt}\n\nRESPONSE TO GRADE:\n{r['text']}" - for run_idx in range(args.judge_runs): - try: - jr = call_anthropic(client, args.judge_model, DECOMPOSED_RUBRIC, - user_judge, max_tokens=200) - g = parse_decomposed(jr["text"]) - g.judge = f"anthropic:{args.judge_model}" - g.run_idx = run_idx - judge_grades.append(g) - except Exception as e: - judge_grades.append(DecomposedGrade(notes=f"(judge err: {e})", - 
judge=f"anthropic:{args.judge_model}", - run_idx=run_idx)) - judge_summary = aggregate_grades(judge_grades) - - print(f" {mode:<11} in={r['input_tokens']:>6,} out={r['output_tokens']:>4} " - f"cov={recall.coverage_pct:>5.1f}% bonus={recall.bonus_pct:>5.1f}% " - f"fact={judge_summary.get('factual_median', 0):.1f} " - f"spec={judge_summary.get('specific_median', 0):.1f} " - f"comp={judge_summary.get('complete_median', 0):.1f} " - f"halluc={judge_summary.get('hallucinations_mean', 0)} ") - - rows.append({ - "task_id": tid, - "category": task.get("category", ""), - "mode": mode, - "model": args.task_model, - "input_tokens": r["input_tokens"], - "output_tokens": r["output_tokens"], - "latency_ms": r["latency_ms"], - "cost_usd": round(cost_usd(args.task_model, r["input_tokens"], r["output_tokens"]), 6), - "fact_recall": asdict(recall), - "retrieval_quality": retrieval, - "judge_runs": [asdict(g) for g in judge_grades], - "judge_summary": judge_summary, - }) - - responses_path = Path(args.responses_out) - out_path = Path(args.out) - responses_path.parent.mkdir(parents=True, exist_ok=True) - out_path.parent.mkdir(parents=True, exist_ok=True) - responses_path.write_text(json.dumps(responses, indent=2), encoding="utf-8") - out_path.write_text(json.dumps({ - "ran_at": time.strftime("%Y-%m-%dT%H:%M:%S"), - "task_model": args.task_model, - "judge_model": args.judge_model, - "judge_runs": args.judge_runs, - "task_count": len(tasks), - "rows": rows, - }, indent=2), encoding="utf-8") - print(f"\nWrote responses: {responses_path}") - print(f"Wrote results: {out_path}") - return 0 - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/scripts/quick-search.cjs b/scripts/quick-search.cjs deleted file mode 100644 index 7193522..0000000 --- a/scripts/quick-search.cjs +++ /dev/null @@ -1,46 +0,0 @@ -/** - * quick-search.cjs — Quick search test against running CE. 
- * Usage: node scripts/quick-search.cjs [port=3847] - */ -const http = require('http'); -const port = parseInt(process.argv[2], 10) || 3847; -const query = process.argv[3] || 'What is using my CPU right now and how do I find the worst offender?'; - -function request(method, urlPath, body) { - return new Promise((resolve, reject) => { - const payload = body ? JSON.stringify(body) : null; - const opts = { - host: '127.0.0.1', - port, - path: urlPath, - method, - timeout: 30000, - headers: payload - ? { 'Content-Type': 'application/json', 'Content-Length': Buffer.byteLength(payload) } - : {}, - }; - const req = http.request(opts, (res) => { - let d = ''; - res.on('data', (c) => (d += c)); - res.on('end', () => { - try { - resolve(JSON.parse(d)); - } catch { - reject(new Error(d.slice(0, 200))); - } - }); - }); - req.on('timeout', () => req.destroy(new Error('Timeout'))); - req.on('error', reject); - if (payload) req.write(payload); - req.end(); - }); -} - -request('POST', '/api/search', { query, limit: 10 }).then((data) => { - (data.results || []).forEach((r, i) => - console.log( - `${i + 1}. ${r.skillId} (score=${(r.score || 0).toFixed(3)}, lex=${(r.lexicalScore || 0).toFixed(3)}) — ${r.section}`, - ), - ); -}); diff --git a/scripts/retrieval-smoke.js b/scripts/retrieval-smoke.js deleted file mode 100644 index 750f53d..0000000 --- a/scripts/retrieval-smoke.js +++ /dev/null @@ -1,213 +0,0 @@ -// @ts-check -/** - * retrieval-smoke.js — Retrieval-quality smoke gate. - * - * Connects to CE (starts one if --start is given), registers fixture skills, - * rebuilds the vector index, then verifies expected_source Recall@8 = 1.00 - * for every task in gold-answers.json. Exits non-zero on failure. 
- * - * Usage: - * node scripts/retrieval-smoke.js # connect to existing CE - * node scripts/retrieval-smoke.js --port 3847 # specify port - * node scripts/retrieval-smoke.js --start # start CE then test - * node scripts/retrieval-smoke.js --fixtures bench/fixtures/skills - */ - -const http = require('http'); -const path = require('path'); -const fs = require('fs'); - -const HERE = path.resolve(__dirname, '..'); -const GOLD = path.join(HERE, 'bench', 'gold-answers.json'); -const TASKS = path.join(HERE, 'bench', 'tasks.json'); -const FIXTURES = path.join(HERE, 'bench', 'fixtures', 'skills'); -const REQUEST_TIMEOUT_MS = Number(process.env.CE_RETRIEVAL_SMOKE_TIMEOUT_MS || 300000); - -/** @type {import('http').Server | null} */ -let server = null; -let activePort = 3847; - -/** - * @param {string} method - * @param {string} urlPath - * @param {unknown=} body - */ -function request(method, urlPath, body) { - return new Promise((resolve, reject) => { - const payload = body !== undefined ? JSON.stringify(body) : null; - const opts = { - host: '127.0.0.1', - port: activePort, - path: urlPath, - method, - timeout: REQUEST_TIMEOUT_MS, - headers: payload - ? 
{ 'Content-Type': 'application/json', 'Content-Length': Buffer.byteLength(payload) } - : {}, - }; - const req = http.request(opts, (res) => { - let data = ''; - res.on('data', (chunk) => { - data += chunk; - }); - res.on('end', () => { - try { - resolve(JSON.parse(data)); - } catch { - reject(new Error(`Invalid JSON: ${data.slice(0, 200)}`)); - } - }); - }); - req.on('timeout', () => req.destroy(new Error('Request timed out'))); - req.on('error', reject); - if (payload) req.write(payload); - req.end(); - }); -} - -async function run() { - // Re-parse args inside run scope - const innerArgv = process.argv.slice(2); - let innerPort = 3847; - let innerStart = false; - let fixtureArg = FIXTURES; - for (let i = 0; i < innerArgv.length; i++) { - if (innerArgv[i] === '--port') { - innerPort = parseInt(/** @type {string} */ (innerArgv[i + 1]), 10) || innerPort; - i++; - } - if (innerArgv[i] === '--start') { - innerStart = true; - } - if (innerArgv[i] === '--fixtures') { - fixtureArg = path.resolve(HERE, /** @type {string} */ (innerArgv[i + 1])); - i++; - } - } - const fixturePath = fixtureArg; - - // Validate inputs - const gold = JSON.parse(fs.readFileSync(GOLD, 'utf8')); - const tasks = JSON.parse(fs.readFileSync(TASKS, 'utf8')); - const tasksGold = gold.tasks || {}; - - console.log(`Retrieval smoke gate`); - console.log(` Fixtures: ${fixturePath}`); - console.log(` CE port: ${innerPort}`); - console.log(` Tasks: ${tasks.length}`); - console.log(); - - activePort = innerPort; - - if (innerStart) { - process.env.CE_PORT = String(innerPort); - const { PORT } = require('../server/lib/config'); - const { startServer } = require('../server/server'); - const srv = startServer({ port: PORT, refresh: false }); - await new Promise((resolve) => srv.once('listening', resolve)); - server = srv; - console.log(' CE started on port', PORT); - } - - // Verify CE is reachable - try { - const hc = await request('GET', `/api/health`); - if (!hc?.skills) { - throw new Error('CE health check 
failed'); - } - console.log(` CE reachable: ${Object.keys(hc.skills || {}).length} skill counts`); - } catch (e) { - throw new Error(`CE not reachable on port ${innerPort}: ${e instanceof Error ? e.message : String(e)}`); - } - - try { - // Register fixture skills as a source - const addResult = await request('POST', '/api/skill-sources', { - path: fixturePath, - label: 'benchmark-fixtures', - }); - if (!addResult.ok && !addResult.error.includes('already linked')) { - throw new Error(`Failed to add fixtures: ${addResult.error}`); - } - console.log(' Fixtures registered.'); - - // Rebuild vector index - console.log(' Rebuilding index...'); - const indexResult = await request('POST', '/api/index'); - if (!indexResult.ok) { - throw new Error(`Index rebuild failed: ${indexResult.error}`); - } - console.log(` Index built: ${indexResult.chunks} chunks, ${indexResult.skills} skills`); - - // Test each task - let failures = 0; - let totalExpected = 0; - let totalFound = 0; - - for (const task of tasks) { - const tid = task.id; - const goldTask = tasksGold[tid]; - if (!goldTask) { - console.log(` ⚠ ${tid}: no gold data, skipping`); - continue; - } - - const expected = goldTask.expected_sources || []; - if (!expected.length) { - console.log(` ✓ ${tid}: no expected sources`); - continue; - } - - totalExpected += expected.length; - - // Use a generous limit to ensure we catch everything - const searchResult = await request('POST', '/api/search', { - query: task.prompt, - limit: 8, - }); - - // External sources prefix skillIds as `:`. - // Accept both prefixed and bare matches. 
- const retrieved = (searchResult.results || []).map(/** @param {any} r */ (r) => r.skillId); - const uniqueRetrieved = [...new Set(retrieved)]; - /** @param {string} expectedId */ - const matches = (expectedId) => - uniqueRetrieved.some((rid) => rid === expectedId || rid.endsWith(':' + expectedId)); - const hits = expected.filter((/** @type {string} */ s) => matches(s)); - - totalFound += hits.length; - - if (hits.length === expected.length) { - console.log(` ✓ ${tid}: R@8 = ${hits.length}/${expected.length} ${JSON.stringify(expected)}`); - } else { - failures++; - const missed = expected.filter((/** @type {string} */ s) => !matches(s)); - console.log( - ` ✗ ${tid}: R@8 = ${hits.length}/${expected.length} — missed: ${JSON.stringify(missed)}`, - ); - console.log(` retrieved: ${JSON.stringify(uniqueRetrieved)}`); - } - } - - const recall = totalExpected > 0 ? totalFound / totalExpected : 1; - console.log(); - console.log(` Overall R@8: ${(recall * 100).toFixed(1)}% (${totalFound}/${totalExpected})`); - - if (failures > 0) { - console.log(`\n FAIL: ${failures} task(s) with incomplete retrieval`); - process.exitCode = 1; - } else if (recall < 1.0) { - console.log(`\n FAIL: Overall R@8 < 1.00`); - process.exitCode = 1; - } else { - console.log(`\n PASS: All tasks achieve R@8 = 1.00`); - } - } finally { - if (server !== null) server.close(); - } -} - -run().catch((error) => { - console.error('Fatal:', error.message); - process.exitCode = 1; -}); diff --git a/scripts/setup-benchmark.cjs b/scripts/setup-benchmark.cjs deleted file mode 100644 index 243d326..0000000 --- a/scripts/setup-benchmark.cjs +++ /dev/null @@ -1,106 +0,0 @@ -/** - * setup-benchmark.cjs — Register fixtures, rebuild index, verify retrieval. - * - * Usage: - * node scripts/setup-benchmark.cjs - * - * This: - * 1. Registers bench/fixtures/skills/ as a skill source - * 2. Rebuilds the vector index - * 3. 
Runs a quick retrieval health check - */ - -const http = require('http'); -const path = require('path'); - -const CE_HOST = '127.0.0.1'; -const CE_PORT = 3847; -const FIXTURES = path.resolve(__dirname, '..', 'bench', 'fixtures', 'skills'); - -function request(method, urlPath, body) { - return new Promise((resolve, reject) => { - const payload = body !== undefined ? JSON.stringify(body) : null; - const opts = { - host: CE_HOST, - port: CE_PORT, - path: urlPath, - method, - timeout: 120000, - headers: payload - ? { 'Content-Type': 'application/json', 'Content-Length': Buffer.byteLength(payload) } - : {}, - }; - const req = http.request(opts, (res) => { - let data = ''; - res.on('data', (chunk) => { - data += chunk; - }); - res.on('end', () => { - try { - resolve(JSON.parse(data)); - } catch { - reject(new Error(`Invalid JSON: ${data.slice(0, 200)}`)); - } - }); - }); - req.on('timeout', () => req.destroy(new Error('Request timed out'))); - req.on('error', reject); - if (payload) req.write(payload); - req.end(); - }); -} - -async function main() { - console.log('=== Benchmark Setup ===\n'); - - // Step 1: Register fixtures - console.log('1. Registering fixture skills...'); - const addResult = await request('POST', '/api/skill-sources', { - path: FIXTURES, - label: 'benchmark-fixtures', - }); - if (addResult.ok) { - console.log(` Registered: ${addResult.source.id} -> ${addResult.source.path}`); - } else if (addResult.error && addResult.error.includes('already linked')) { - console.log(' Already registered (skipping).'); - } else { - console.error(` FAILED: ${addResult.error}`); - process.exitCode = 1; - return; - } - - // Step 2: Rebuild index - console.log('\n2. 
Rebuilding vector index...'); - const indexResult = await request('POST', '/api/index'); - if (indexResult.ok) { - console.log( - ` Index built: ${indexResult.chunks} chunks, ${indexResult.skills} skills, model=${indexResult.model}`, - ); - } else { - console.error(` FAILED: ${indexResult.error}`); - process.exitCode = 1; - return; - } - - // Step 3: Quick health check - console.log('\n3. Quick search test...'); - const searchResult = await request('POST', '/api/search', { - query: 'process CPU usage Windows task manager', - limit: 5, - }); - if (searchResult.ok) { - const ids = (searchResult.results || []).map((r) => r.skillId); - console.log(` Top results: ${ids.join(', ')}`); - } else { - console.error(` FAILED: ${searchResult.error}`); - process.exitCode = 1; - return; - } - - console.log('\n=== Setup complete ==='); -} - -main().catch((err) => { - console.error('Fatal:', err.message); - process.exitCode = 1; -}); diff --git a/server/lib/system-scan-definitions.js b/server/lib/system-scan-definitions.js new file mode 100644 index 0000000..31ac22e --- /dev/null +++ b/server/lib/system-scan-definitions.js @@ -0,0 +1,174 @@ +// @ts-check +// system-scan-definitions.js — Static host, file, IDE, and extension probes. 
+ +const HOSTS = [ + { id: '.claude', label: 'Claude Code', icon: 'claude' }, + { id: '.cursor', label: 'Cursor', icon: 'cursor' }, + { id: '.windsurf', label: 'Windsurf', icon: 'windsurf' }, + { id: '.codex', label: 'Codex CLI', icon: 'openai' }, + { id: '.opencode', label: 'OpenCode', icon: 'opencode' }, + { id: '.continue', label: 'Continue', icon: 'continue' }, + { id: '.roo', label: 'Roo CLI', icon: 'cline' }, + { id: '.cline', label: 'Cline', icon: 'cline' }, + { id: '.kimi', label: 'Kimi K2', icon: 'kimi' }, + { id: '.goose', label: 'Goose', icon: 'goose' }, + { id: '.amp', label: 'Amp', icon: 'sourcegraph' }, + { id: '.kiro', label: 'Kiro', icon: 'kiro' }, + { id: '.antigravity', label: 'Antigravity', icon: 'antigravity' }, + { id: '.gemini', label: 'Gemini', icon: 'gemini' }, + { id: '.augment', label: 'Augment', icon: 'augment' }, + { id: '.pearai', label: 'PearAI', icon: 'pearai' }, + { id: '.void', label: 'Void', icon: 'void' }, +]; + +const RULE_FILE_NAMES = [ + '.clinerules', + '.cursorrules', + '.windsurfrules', + '.rules', + '.ampcoderc', + '.goosehints', +]; + +const INSTRUCTION_FILE_NAMES = [ + 'AGENTS.md', + 'CLAUDE.md', + 'GEMINI.md', + 'devin.md', + 'CONVENTIONS.md', + '.kimi-system-prompt.md', + '.github/copilot-instructions.md', + 'CONTEXT.md', + 'steering.md', +]; + +const CONFIG_FILE_NAMES = [ + 'settings.json', + 'config.json', + 'config.toml', + 'kimi.json', + 'mcp.json', + 'claude_desktop_config.json', +]; + +const OPPORTUNITY_FILES = { + '.claude': 'CLAUDE.md', + '.cursor': '.cursorrules', + '.windsurf': '.windsurfrules', + '.codex': 'instructions.md', + '.opencode': null, + '.continue': null, + '.roo': null, + '.cline': '.clinerules', + '.kimi': '.kimi-system-prompt.md', + '.goose': '.goosehints', + '.amp': '.ampcoderc', + '.kiro': '.kiro/steering.md', + '.antigravity': null, + '.gemini': 'GEMINI.md', + '.augment': '.augment-guidelines', + '.pearai': '.pearai', + '.void': null, +}; + +const IDE_PROBE_PATHS = [ + { + exe: 'Code.exe', + 
label: 'VS Code', + dirs: [ + '%LOCALAPPDATA%\\Programs\\Microsoft VS Code', + '%ProgramFiles%\\Microsoft VS Code', + '%ProgramFiles(x86)%\\Microsoft VS Code', + ], + }, + { + exe: 'Cursor.exe', + label: 'Cursor', + dirs: ['%LOCALAPPDATA%\\Programs\\cursor', '%ProgramFiles%\\Cursor'], + }, + { + exe: 'Windsurf.exe', + label: 'Windsurf', + dirs: ['%LOCALAPPDATA%\\Programs\\windsurf', '%ProgramFiles%\\Windsurf'], + }, + { exe: 'Kiro.exe', label: 'Kiro', dirs: ['%LOCALAPPDATA%\\Programs\\Kiro'] }, + { + exe: 'Antigravity.exe', + label: 'Antigravity', + dirs: ['%LOCALAPPDATA%\\Programs\\Antigravity', '%ProgramFiles%\\Antigravity'], + }, + { + exe: 'idea64.exe', + label: 'IntelliJ IDEA', + dirs: ['%ProgramFiles%\\JetBrains\\IntelliJ IDEA*', '%LOCALAPPDATA%\\JetBrains\\IntelliJ IDEA*'], + }, + { + exe: 'pycharm64.exe', + label: 'PyCharm', + dirs: ['%ProgramFiles%\\JetBrains\\PyCharm*', '%LOCALAPPDATA%\\JetBrains\\PyCharm*'], + }, + { + exe: 'webstorm64.exe', + label: 'WebStorm', + dirs: ['%ProgramFiles%\\JetBrains\\WebStorm*', '%LOCALAPPDATA%\\JetBrains\\WebStorm*'], + }, + { + exe: 'rider64.exe', + label: 'Rider', + dirs: ['%ProgramFiles%\\JetBrains\\Rider*', '%LOCALAPPDATA%\\JetBrains\\Rider*'], + }, + { + exe: 'goland64.exe', + label: 'GoLand', + dirs: ['%ProgramFiles%\\JetBrains\\GoLand*', '%LOCALAPPDATA%\\JetBrains\\GoLand*'], + }, + { + exe: 'clion64.exe', + label: 'CLion', + dirs: ['%ProgramFiles%\\JetBrains\\CLion*', '%LOCALAPPDATA%\\JetBrains\\CLion*'], + }, + { exe: 'fleet.exe', label: 'JetBrains Fleet', dirs: ['%LOCALAPPDATA%\\Programs\\Fleet'] }, + { exe: 'sublime_text.exe', label: 'Sublime Text', dirs: ['%ProgramFiles%\\Sublime Text*'] }, + { + exe: 'notepad++.exe', + label: 'Notepad++', + dirs: ['%ProgramFiles%\\Notepad++', '%ProgramFiles(x86)%\\Notepad++'], + }, + { + exe: 'devenv.exe', + label: 'Visual Studio', + dirs: ['%ProgramFiles%\\Microsoft Visual Studio*', '%ProgramFiles(x86)%\\Microsoft Visual Studio*'], + }, + { exe: 'zed.exe', label: 'Zed', 
dirs: ['%LOCALAPPDATA%\\Programs\\Zed', '%ProgramFiles%\\Zed'] }, + { exe: 'Trae.exe', label: 'Trae', dirs: ['%LOCALAPPDATA%\\Programs\\Trae', '%ProgramFiles%\\Trae'] }, + { + exe: 'PearAI.exe', + label: 'PearAI', + dirs: ['%LOCALAPPDATA%\\Programs\\PearAI', '%ProgramFiles%\\PearAI'], + }, +]; + +const AI_EXTENSION_PATTERNS = [ + { pattern: 'github.copilot', label: 'GitHub Copilot' }, + { pattern: 'github.copilot-chat', label: 'GitHub Copilot Chat' }, + { pattern: 'openai.chatgpt', label: 'ChatGPT' }, + { pattern: 'continue', label: 'Continue' }, + { pattern: 'cline', label: 'Cline' }, + { pattern: 'roo-code', label: 'Roo Code' }, + { pattern: 'aider', label: 'Aider' }, + { pattern: 'codeium', label: 'Codeium' }, + { pattern: 'tabnine', label: 'Tabnine' }, + { pattern: 'supermaven', label: 'Supermaven' }, + { pattern: 'amazonwebservices.aws-toolkit', label: 'AWS Q' }, + { pattern: 'sourcegraph.cody', label: 'Cody (Sourcegraph)' }, +]; + +module.exports = { + HOSTS, + RULE_FILE_NAMES, + INSTRUCTION_FILE_NAMES, + CONFIG_FILE_NAMES, + OPPORTUNITY_FILES, + IDE_PROBE_PATHS, + AI_EXTENSION_PATTERNS, +}; diff --git a/server/lib/system-scan-ides.js b/server/lib/system-scan-ides.js new file mode 100644 index 0000000..cb5c0a2 --- /dev/null +++ b/server/lib/system-scan-ides.js @@ -0,0 +1,114 @@ +// @ts-check +// system-scan-ides.js — Installed IDE and AI extension probes. 
+ +const fs = require('fs'); +const path = require('path'); +const { HOMEDIR } = require('./config'); +const { IDE_PROBE_PATHS, AI_EXTENSION_PATTERNS } = require('./system-scan-definitions'); + +/** @param {string} env */ +function expandEnvVar(env) { + return env.replace( + /%([^%]+)%/g, + (_ /** @type {string} */, v /** @type {string} */) => process.env[v] || '', + ); +} + +/** @param {string} p */ +function isDir(p) { + try { + return fs.statSync(p).isDirectory(); + } catch { + return false; + } +} + +function probeIDEs() { + const found = []; + const seen = new Set(); + for (const ide of IDE_PROBE_PATHS) { + if (seen.has(ide.label)) continue; + let resolvedPath = null; + for (const dirPattern of ide.dirs) { + const base = expandEnvVar(dirPattern); + if (!base) continue; + if (base.includes('*')) { + const wildIdx = base.indexOf('*'); + const prefix = base.substring(0, wildIdx); + const parentDir = path.dirname(prefix); + try { + if (fs.statSync(parentDir).isDirectory()) { + const entries = fs.readdirSync(parentDir); + const match = entries + .filter((e) => e.startsWith(path.basename(prefix))) + .sort() + .pop(); + if (match) resolvedPath = path.join(parentDir, match); + } + } catch { + /* ignore */ + } + } + if (!resolvedPath) { + try { + if (fs.statSync(base).isDirectory()) resolvedPath = base; + } catch { + /* ignore */ + } + } + if (!resolvedPath) continue; + const exePath = path.join(resolvedPath, ide.exe); + try { + if (fs.statSync(exePath).isFile()) { + found.push({ + id: 'ide-' + ide.label.replace(/\s+/g, '-').toLowerCase(), + label: ide.label, + path: resolvedPath, + exe: exePath, + }); + seen.add(ide.label); + break; + } + } catch { + /* ignore */ + } + } + } + return found; +} + +function probeAIExtensions() { + const ideExtDirs = [ + { label: 'VS Code', path: path.join(HOMEDIR, '.vscode', 'extensions') }, + { + label: 'Cursor', + path: path.join( + process.env.APPDATA || path.join(HOMEDIR, 'AppData', 'Roaming'), + 'Cursor', + 'extensions', + ), + }, 
+ { label: 'Kiro', path: path.join(HOMEDIR, '.kiro', 'extensions') }, + { label: 'Antigravity', path: path.join(HOMEDIR, '.antigravity', 'extensions') }, + { label: 'Trae', path: path.join(HOMEDIR, '.trae', 'extensions') }, + { label: 'PearAI', path: path.join(HOMEDIR, '.pearai', 'extensions') }, + ]; + /** @type {Record} */ + const perIde = {}; + for (const ide of ideExtDirs) { + if (!isDir(ide.path)) continue; + try { + const entries = fs.readdirSync(ide.path); + const found = []; + for (const ai of AI_EXTENSION_PATTERNS) { + if (entries.some((e) => e.startsWith(ai.pattern))) found.push(ai.label); + } + if (found.length > 0) perIde[ide.label] = [...new Set(found)]; + } catch { + /* ignore */ + } + } + return perIde; +} + +module.exports = { probeIDEs, probeAIExtensions }; diff --git a/server/lib/system-scan.js b/server/lib/system-scan.js index 02e9408..85207a4 100644 --- a/server/lib/system-scan.js +++ b/server/lib/system-scan.js @@ -6,170 +6,14 @@ const fs = require('fs'); const path = require('path'); const { HOMEDIR, DATA_DIR, SKILLS_DIR } = require('./config'); const { countSkillFiles, listSkillNames } = require('./skills'); - -// ---- Host definitions ---- - -const HOSTS = [ - { id: '.claude', label: 'Claude Code', icon: 'claude' }, - { id: '.cursor', label: 'Cursor', icon: 'cursor' }, - { id: '.windsurf', label: 'Windsurf', icon: 'windsurf' }, - { id: '.codex', label: 'Codex CLI', icon: 'openai' }, - { id: '.opencode', label: 'OpenCode', icon: 'opencode' }, - { id: '.continue', label: 'Continue', icon: 'continue' }, - { id: '.roo', label: 'Roo CLI', icon: 'cline' }, - { id: '.cline', label: 'Cline', icon: 'cline' }, - { id: '.kimi', label: 'Kimi K2', icon: 'kimi' }, - { id: '.goose', label: 'Goose', icon: 'goose' }, - { id: '.amp', label: 'Amp', icon: 'sourcegraph' }, - { id: '.kiro', label: 'Kiro', icon: 'kiro' }, - { id: '.antigravity', label: 'Antigravity', icon: 'antigravity' }, - { id: '.gemini', label: 'Gemini', icon: 'gemini' }, - { id: '.augment', 
label: 'Augment', icon: 'augment' }, - { id: '.pearai', label: 'PearAI', icon: 'pearai' }, - { id: '.void', label: 'Void', icon: 'void' }, -]; - -const RULE_FILE_NAMES = [ - '.clinerules', - '.cursorrules', - '.windsurfrules', - '.rules', - '.ampcoderc', - '.goosehints', -]; - -const INSTRUCTION_FILE_NAMES = [ - 'AGENTS.md', - 'CLAUDE.md', - 'GEMINI.md', - 'devin.md', - 'CONVENTIONS.md', - '.kimi-system-prompt.md', - '.github/copilot-instructions.md', - 'CONTEXT.md', - 'steering.md', -]; - -const CONFIG_FILE_NAMES = [ - 'settings.json', - 'config.json', - 'config.toml', - 'kimi.json', - 'mcp.json', - 'claude_desktop_config.json', -]; - -const OPPORTUNITY_FILES = { - '.claude': 'CLAUDE.md', - '.cursor': '.cursorrules', - '.windsurf': '.windsurfrules', - '.codex': 'instructions.md', - '.opencode': null, - '.continue': null, - '.roo': null, - '.cline': '.clinerules', - '.kimi': '.kimi-system-prompt.md', - '.goose': '.goosehints', - '.amp': '.ampcoderc', - '.kiro': '.kiro/steering.md', - '.antigravity': null, - '.gemini': 'GEMINI.md', - '.augment': '.augment-guidelines', - '.pearai': '.pearai', - '.void': null, -}; - -const IDE_PROBE_PATHS = [ - { - exe: 'Code.exe', - label: 'VS Code', - dirs: [ - '%LOCALAPPDATA%\\Programs\\Microsoft VS Code', - '%ProgramFiles%\\Microsoft VS Code', - '%ProgramFiles(x86)%\\Microsoft VS Code', - ], - }, - { - exe: 'Cursor.exe', - label: 'Cursor', - dirs: ['%LOCALAPPDATA%\\Programs\\cursor', '%ProgramFiles%\\Cursor'], - }, - { - exe: 'Windsurf.exe', - label: 'Windsurf', - dirs: ['%LOCALAPPDATA%\\Programs\\windsurf', '%ProgramFiles%\\Windsurf'], - }, - { exe: 'Kiro.exe', label: 'Kiro', dirs: ['%LOCALAPPDATA%\\Programs\\Kiro'] }, - { - exe: 'Antigravity.exe', - label: 'Antigravity', - dirs: ['%LOCALAPPDATA%\\Programs\\Antigravity', '%ProgramFiles%\\Antigravity'], - }, - { - exe: 'idea64.exe', - label: 'IntelliJ IDEA', - dirs: ['%ProgramFiles%\\JetBrains\\IntelliJ IDEA*', '%LOCALAPPDATA%\\JetBrains\\IntelliJ IDEA*'], - }, - { - exe: 
'pycharm64.exe', - label: 'PyCharm', - dirs: ['%ProgramFiles%\\JetBrains\\PyCharm*', '%LOCALAPPDATA%\\JetBrains\\PyCharm*'], - }, - { - exe: 'webstorm64.exe', - label: 'WebStorm', - dirs: ['%ProgramFiles%\\JetBrains\\WebStorm*', '%LOCALAPPDATA%\\JetBrains\\WebStorm*'], - }, - { - exe: 'rider64.exe', - label: 'Rider', - dirs: ['%ProgramFiles%\\JetBrains\\Rider*', '%LOCALAPPDATA%\\JetBrains\\Rider*'], - }, - { - exe: 'goland64.exe', - label: 'GoLand', - dirs: ['%ProgramFiles%\\JetBrains\\GoLand*', '%LOCALAPPDATA%\\JetBrains\\GoLand*'], - }, - { - exe: 'clion64.exe', - label: 'CLion', - dirs: ['%ProgramFiles%\\JetBrains\\CLion*', '%LOCALAPPDATA%\\JetBrains\\CLion*'], - }, - { exe: 'fleet.exe', label: 'JetBrains Fleet', dirs: ['%LOCALAPPDATA%\\Programs\\Fleet'] }, - { exe: 'sublime_text.exe', label: 'Sublime Text', dirs: ['%ProgramFiles%\\Sublime Text*'] }, - { - exe: 'notepad++.exe', - label: 'Notepad++', - dirs: ['%ProgramFiles%\\Notepad++', '%ProgramFiles(x86)%\\Notepad++'], - }, - { - exe: 'devenv.exe', - label: 'Visual Studio', - dirs: ['%ProgramFiles%\\Microsoft Visual Studio*', '%ProgramFiles(x86)%\\Microsoft Visual Studio*'], - }, - { exe: 'zed.exe', label: 'Zed', dirs: ['%LOCALAPPDATA%\\Programs\\Zed', '%ProgramFiles%\\Zed'] }, - { exe: 'Trae.exe', label: 'Trae', dirs: ['%LOCALAPPDATA%\\Programs\\Trae', '%ProgramFiles%\\Trae'] }, - { - exe: 'PearAI.exe', - label: 'PearAI', - dirs: ['%LOCALAPPDATA%\\Programs\\PearAI', '%ProgramFiles%\\PearAI'], - }, -]; - -const AI_EXTENSION_PATTERNS = [ - { pattern: 'github.copilot', label: 'GitHub Copilot' }, - { pattern: 'github.copilot-chat', label: 'GitHub Copilot Chat' }, - { pattern: 'openai.chatgpt', label: 'ChatGPT' }, - { pattern: 'continue', label: 'Continue' }, - { pattern: 'cline', label: 'Cline' }, - { pattern: 'roo-code', label: 'Roo Code' }, - { pattern: 'aider', label: 'Aider' }, - { pattern: 'codeium', label: 'Codeium' }, - { pattern: 'tabnine', label: 'Tabnine' }, - { pattern: 'supermaven', label: 
'Supermaven' }, - { pattern: 'amazonwebservices.aws-toolkit', label: 'AWS Q' }, - { pattern: 'sourcegraph.cody', label: 'Cody (Sourcegraph)' }, -]; +const { + HOSTS, + RULE_FILE_NAMES, + INSTRUCTION_FILE_NAMES, + CONFIG_FILE_NAMES, + OPPORTUNITY_FILES, +} = require('./system-scan-definitions'); +const { probeIDEs, probeAIExtensions } = require('./system-scan-ides'); // ---- Helpers ---- @@ -188,14 +32,6 @@ function getDriveRoots() { return drives; } -/** @param {string} env */ -function expandEnvVar(env) { - return env.replace( - /%([^%]+)%/g, - (_ /** @type {string} */, v /** @type {string} */) => process.env[v] || '', - ); -} - /** @param {string} p */ function isFile(p) { try { @@ -222,97 +58,6 @@ function readJsonSafe(p) { } } -// ---- Probe functions ---- - -function probeIDEs() { - const found = []; - const seen = new Set(); - for (const ide of IDE_PROBE_PATHS) { - if (seen.has(ide.label)) continue; - let resolvedPath = null; - for (const dirPattern of ide.dirs) { - const base = expandEnvVar(dirPattern); - if (!base) continue; - if (base.includes('*')) { - const wildIdx = base.indexOf('*'); - const prefix = base.substring(0, wildIdx); - const parentDir = path.dirname(prefix); - try { - if (fs.statSync(parentDir).isDirectory()) { - const entries = fs.readdirSync(parentDir); - const match = entries - .filter((e) => e.startsWith(path.basename(prefix))) - .sort() - .pop(); - if (match) resolvedPath = path.join(parentDir, match); - } - } catch { - /* ignore */ - } - } - if (!resolvedPath) { - try { - if (fs.statSync(base).isDirectory()) resolvedPath = base; - } catch { - /* ignore */ - } - } - if (!resolvedPath) continue; - const exePath = path.join(resolvedPath, ide.exe); - try { - if (fs.statSync(exePath).isFile()) { - found.push({ - id: 'ide-' + ide.label.replace(/\s+/g, '-').toLowerCase(), - label: ide.label, - path: resolvedPath, - exe: exePath, - }); - seen.add(ide.label); - break; - } - } catch { - /* ignore */ - } - } - } - return found; -} - -function 
probeAIExtensions() { - // Map: IDE label → Set of AI extension labels found in that IDE's extensions dir - const ideExtDirs = [ - { label: 'VS Code', path: path.join(HOMEDIR, '.vscode', 'extensions') }, - { - label: 'Cursor', - path: path.join( - process.env.APPDATA || path.join(HOMEDIR, 'AppData', 'Roaming'), - 'Cursor', - 'extensions', - ), - }, - { label: 'Kiro', path: path.join(HOMEDIR, '.kiro', 'extensions') }, - { label: 'Antigravity', path: path.join(HOMEDIR, '.antigravity', 'extensions') }, - { label: 'Trae', path: path.join(HOMEDIR, '.trae', 'extensions') }, - { label: 'PearAI', path: path.join(HOMEDIR, '.pearai', 'extensions') }, - ]; - /** @type {Record} */ - const perIde = {}; - for (const ide of ideExtDirs) { - if (!isDir(ide.path)) continue; - try { - const entries = fs.readdirSync(ide.path); - const found = []; - for (const ai of AI_EXTENSION_PATTERNS) { - if (entries.some((e) => e.startsWith(ai.pattern))) found.push(ai.label); - } - if (found.length > 0) perIde[ide.label] = [...new Set(found)]; - } catch { - /* ignore */ - } - } - return perIde; -} - // ---- Host-grouped scan ---- /** @typedef {{ path: string, label: string, count: number, names: { bareId: string, name: string, cat: string }[], internal?: boolean }} SkillEntry */ diff --git a/ui/index.html b/ui/index.html index a2f77f3..56978c9 100644 --- a/ui/index.html +++ b/ui/index.html @@ -779,6 +779,7 @@

+ diff --git a/ui/onboarding-render.js b/ui/onboarding-render.js new file mode 100644 index 0000000..1043289 --- /dev/null +++ b/ui/onboarding-render.js @@ -0,0 +1,193 @@ +// @ts-nocheck +// onboarding-render.js — Host and IDE card markup for the onboarding scan results. + +const OnboardingRender = (() => { + function esc(s) { + return String(s) + .replace(/&/g, '&') + .replace(//g, '>') + .replace(/"/g, '"') + .replace(/'/g, '''); + } + + function hostIcon(iconId) { + return obIcon(iconId); + } + + function ideIconKey(label) { + return window.ideIconKey ? window.ideIconKey(label) : 'vscode'; + } + + function renderHostCard(h, skillSources) { + const srcByPath = {}; + for (const s of skillSources) { + if (s.path && s.type !== 'internal') srcByPath[s.path.toLowerCase()] = s; + } + const totalForHost = + h.skills.length + h.configs.length + h.instructions.length + h.rules.length + h.mcpServers.length; + const sections = []; + + if (h.skills.length) { + sections.push(` +
+ +
+ ${h.skills + .map((sk) => { + const key = (sk.path || '').toLowerCase(); + const src = srcByPath[key]; + const escPath = esc(sk.path || '').replace(/\\/g, '\\\\'); + const escLabel = esc(sk.label || sk.path || ''); + const names = sk.names || []; + const count = sk.count || names.length; + return ` +
+
+ ${escLabel} + ${count} skill${count !== 1 ? 's' : ''} +
+ ${ + names.length + ? `
    ${names + .slice(0, 20) + .map((n) => `
  • ${esc(n.cat)}${esc(n.name)}
  • `) + .join( + '', + )}${names.length > 20 ? `
  • +${names.length - 20} more
  • ` : ''}
` + : '' + } +
+ ${sk.internal ? 'Internal' : src ? `Linked` : ``} +
+
`; + }) + .join('')} +
+
`); + } + + if (h.instructions.length) { + sections.push(` +
+ +
+
    ${h.instructions.map((i) => `
  • ${catSvg('instruct')}${esc(i.label)}${esc(i.path)}
  • `).join('')}
+
+
`); + } + + if (h.rules.length) { + sections.push(` +
+ +
+
    ${h.rules.map((r) => `
  • ${catSvg('rules')}${esc(r.label)}${esc(r.path)}
  • `).join('')}
+
+
`); + } + + if (h.configs.length) { + sections.push(` +
+ +
+
    ${h.configs.map((c) => `
  • ${catSvg('config')}${esc(c.label)}${esc(c.path)}
  • `).join('')}
+
+
`); + } + + if (h.mcpServers.length) { + sections.push(` +
+ +
+ ${h.mcpServers.map((m) => `
${esc(m.path)}${m.count} server${m.count !== 1 ? 's' : ''}${m.servers ? `
    ${m.servers.map((s) => `
  • ${esc(s)}
  • `).join('')}
` : ''}
`).join('')} +
+
`); + } + + if (h.opportunities.length) { + sections.push(` +
+ +
+ ${h.opportunities + .map( + (o) => ` +
+ ${esc(o.label)} + ${esc(o.description)} +
+ `, + ) + .join('')} +
+
`); + } + + return ` +
+
+
${hostIcon(h.icon || h.id)}
+
+ ${esc(h.label)} + ${h.path ? `${esc(h.path)}` : ''} +
+ ${totalForHost} item${totalForHost !== 1 ? 's' : ''} +
+ ${sections.join('')} +
`; + } + + function renderIdeCard(ideList, extByIde) { + return ` +
+
+
${hostIcon('vscode')}
+
+ IDEs + ${ideList.map((ide) => esc(ide.label)).join(', ')} +
+ ${ideList.length} IDE${ideList.length !== 1 ? 's' : ''} +
+
+
+ ${ideList + .map( + (ide) => ` +
+
+
${hostIcon(ideIconKey(ide.label))}
+ ${esc(ide.label)} +
+
${extByIde && extByIde[ide.label] ? extByIde[ide.label].map((e) => `${esc(e)}`).join('') : ''}
+
+ `, + ) + .join('')} +
+
+
`; + } + + return { renderHostCard, renderIdeCard }; +})(); diff --git a/ui/onboarding.js b/ui/onboarding.js index 2c97a6b..b38bd66 100644 --- a/ui/onboarding.js +++ b/ui/onboarding.js @@ -1,6 +1,5 @@ // @ts-nocheck // onboarding.js — Full-window 3-step setup: scan → build → done. -// Results grouped by host app, with skills, rules, configs, and opportunities per app. const Onboarding = (() => { const STEPS = [ @@ -53,7 +52,6 @@ const Onboarding = (() => { } } - // ---- Step 1: Scan ---- async function runScan() { scanPhase = 'scanning'; scanning = true; @@ -232,7 +230,6 @@ const Onboarding = (() => { window.scrollTo(0, 0); } - // ---- Render ---- function renderSteps() { return STEPS.map((s, idx) => { const state = s.num < step ? 'done' : s.num === step ? 'current' : ''; @@ -275,13 +272,6 @@ const Onboarding = (() => { .replace(/'/g, '''); } - const ICONS = ObIcons; - - function hostIcon(iconId) { - return obIcon(iconId); - } - - // ---- Step 1: Scan ---- function renderScan() { if (scanPhase === 'config') return renderScanConfig(); if (scanPhase === 'scanning') return renderScanProgress(); @@ -427,8 +417,8 @@ const Onboarding = (() => {

${totalItems()} items across ${hostList.length} AI tool${hostList.length !== 1 ? 's' : ''}.

- ${hostList.map((h) => renderHostCard(h)).join('')} - ${ideList.length ? renderIdeCard(ideList, extList) : ''} + ${hostList.map((h) => OnboardingRender.renderHostCard(h, skillSources)).join('')} + ${ideList.length ? OnboardingRender.renderIdeCard(ideList, extList) : ''}
@@ -436,187 +426,6 @@ const Onboarding = (() => {
`; } - function renderHostCard(h) { - const srcByPath = {}; - for (const s of skillSources) { - if (s.path && s.type !== 'internal') srcByPath[s.path.toLowerCase()] = s; - } - const totalForHost = - h.skills.length + h.configs.length + h.instructions.length + h.rules.length + h.mcpServers.length; - const sections = []; - - // Skills - if (h.skills.length) { - sections.push(` -
- -
- ${h.skills - .map((sk) => { - const key = (sk.path || '').toLowerCase(); - const src = srcByPath[key]; - const escPath = esc(sk.path || '').replace(/\\/g, '\\\\'); - const escLabel = esc(sk.label || sk.path || ''); - const names = sk.names || []; - const count = sk.count || names.length; - return ` -
-
- ${escLabel} - ${count} skill${count !== 1 ? 's' : ''} -
- ${ - names.length - ? `
    ${names - .slice(0, 20) - .map((n) => `
  • ${esc(n.cat)}${esc(n.name)}
  • `) - .join( - '', - )}${names.length > 20 ? `
  • +${names.length - 20} more
  • ` : ''}
` - : '' - } -
- ${sk.internal ? 'Internal' : src ? `Linked` : ``} -
-
`; - }) - .join('')} -
-
`); - } - - // Instructions - if (h.instructions.length) { - sections.push(` -
- -
-
    ${h.instructions.map((i) => `
  • ${catSvg('instruct')}${esc(i.label)}${esc(i.path)}
  • `).join('')}
-
-
`); - } - - // Rules - if (h.rules.length) { - sections.push(` -
- -
-
    ${h.rules.map((r) => `
  • ${catSvg('rules')}${esc(r.label)}${esc(r.path)}
  • `).join('')}
-
-
`); - } - - // Configs - if (h.configs.length) { - sections.push(` -
- -
-
    ${h.configs.map((c) => `
  • ${catSvg('config')}${esc(c.label)}${esc(c.path)}
  • `).join('')}
-
-
`); - } - - // MCP - if (h.mcpServers.length) { - sections.push(` -
- -
- ${h.mcpServers.map((m) => `
${esc(m.path)}${m.count} server${m.count !== 1 ? 's' : ''}${m.servers ? `
    ${m.servers.map((s) => `
  • ${esc(s)}
  • `).join('')}
` : ''}
`).join('')} -
-
`); - } - - // Opportunities - if (h.opportunities.length) { - sections.push(` -
- -
- ${h.opportunities - .map( - (o) => ` -
- ${esc(o.label)} - ${esc(o.description)} -
- `, - ) - .join('')} -
-
`); - } - - return ` -
-
-
${hostIcon(h.icon || h.id)}
-
- ${esc(h.label)} - ${h.path ? `${esc(h.path)}` : ''} -
- ${totalForHost} item${totalForHost !== 1 ? 's' : ''} -
- ${sections.join('')} -
`; - } - - function ideIconKey(label) { - return window.ideIconKey ? window.ideIconKey(label) : 'vscode'; - } - - function renderIdeCard(ideList, extByIde) { - return ` -
-
-
${hostIcon('vscode')}
-
- IDEs - ${ideList.map((ide) => esc(ide.label)).join(', ')} -
- ${ideList.length} IDE${ideList.length !== 1 ? 's' : ''} -
-
-
- ${ideList - .map( - (ide) => ` -
-
-
${hostIcon(ideIconKey(ide.label))}
- ${esc(ide.label)} -
-
${extByIde && extByIde[ide.label] ? extByIde[ide.label].map((e) => `${esc(e)}`).join('') : ''}
-
- `, - ) - .join('')} -
-
-
`; - } - - // ---- Step 2: Build ---- function renderBuild() { const srcCount = skillSources.length; const skillCount = Array.isArray(SKILL_DATA) ? SKILL_DATA.length : 0; @@ -657,7 +466,6 @@ const Onboarding = (() => {
`; } - // ---- Step 3: Done ---- function renderDone() { const hostList = scanResults?.hosts || []; const opportunities = hostList.flatMap((h) => h.opportunities.map((o) => ({ ...o, host: h.label }))); diff --git a/ui/styles/_index.css b/ui/styles/_index.css index 9f63aa2..c766e6b 100644 --- a/ui/styles/_index.css +++ b/ui/styles/_index.css @@ -12,6 +12,8 @@ @import url('toast.css'); @import url('command-bar.css'); @import url('onboarding.css'); +@import url('onboarding-hosts.css'); +@import url('onboarding-flow.css'); @import url('tab-dashboard.css'); @import url('tab-config.css'); @import url('tab-compile.css'); diff --git a/ui/styles/onboarding-flow.css b/ui/styles/onboarding-flow.css new file mode 100644 index 0000000..4d73941 --- /dev/null +++ b/ui/styles/onboarding-flow.css @@ -0,0 +1,181 @@ +/* onboarding-flow.css — Scan progress, build, done, and responsive flow states. */ + +.ob-progress-track { + width: 100%; + max-width: 400px; + height: 4px; + background: var(--bg-sunken); + border-radius: var(--r-full); + overflow: hidden; + margin: 0 auto var(--s-6); +} + +.ob-progress-fill { + height: 100%; + width: 0%; + background: var(--accent); + border-radius: var(--r-full); + transition: width 0.3s ease-out; +} + +.ob-row.ob-row-dim { + opacity: 0.6; + cursor: default; +} + +.ob-row.ob-row-dim:hover { + background: var(--dram-card-bg); +} + +.ob-empty { + padding: var(--s-6) var(--s-4); + text-align: center; + background: var(--bg-sunken); + border: 1px dashed var(--line); + border-radius: var(--r-1); + color: var(--text-4); + font-size: var(--fs-03); + line-height: 1.5; +} + +.ob-moment { + display: flex; + flex-direction: column; + align-items: center; + text-align: center; + gap: var(--s-5); + padding: var(--s-9) var(--s-6); + background: var(--dram-card-bg); + border: 1px solid var(--dram-card-border); + border-radius: var(--r-2); +} + +.ob-moment-badge { + width: 56px; + height: 56px; + display: grid; + place-items: center; + border-radius: var(--r-full); 
+ font-size: 24px; + font-weight: 700; + flex: 0 0 auto; +} + +.ob-moment-badge.spin { + border: 2px solid var(--line); + border-top-color: var(--accent); + animation: ob-spin 0.9s linear infinite; + background: transparent; + width: 32px; + height: 32px; +} + +.ob-moment-badge.done { + background: var(--ok-bg); + border: 2px solid var(--ok); + color: var(--ok); +} + +@keyframes ob-spin { + to { + transform: rotate(360deg); + } +} + +.ob-moment-text { + display: grid; + gap: var(--s-2); + max-width: 420px; +} + +.ob-moment-text strong { + font-size: var(--fs-06); + font-weight: 650; + color: var(--text); +} + +.ob-moment-text span { + color: var(--text-3); + font-size: var(--fs-04); + line-height: 1.6; +} + +.ob-moment .ob-actions { + display: flex; + gap: var(--s-3); + align-items: center; + margin-top: var(--s-2); +} + +.ob-scanning { + display: flex; + align-items: center; + gap: var(--s-4); + padding: var(--s-5); + background: var(--dram-card-bg); + border: 1px solid var(--dram-card-border); + border-radius: var(--r-2); +} + +.ob-scanning-text { + display: grid; + gap: var(--s-1); +} + +.ob-scanning-text strong { + color: var(--text); + font-size: var(--fs-04); + font-weight: 600; +} + +.ob-scanning-text span { + color: var(--text-3); + font-size: var(--fs-03); + line-height: 1.5; +} + +.ob-actions { + display: flex; + gap: var(--s-3); + align-items: center; +} + +.ob-actions .ob-skip { + margin-right: auto; +} + +.ob-mt-5 { + margin-top: var(--s-5); +} +.ob-mt-6 { + margin-top: var(--s-6); +} +.ob-mt-7 { + margin-top: var(--s-7); +} + +@media (max-width: 720px) { + .ob-body-inner { + max-width: 100%; + } + + .ob-step-label { + display: none; + } + + .ob-step-connector { + margin-bottom: 0; + } + + .ob-source-form { + flex-direction: column; + } + + .ob-body { + padding: var(--s-6) var(--s-4); + } + + .ob-moment { + padding: var(--s-7) var(--s-4); + } +} diff --git a/ui/styles/onboarding-hosts.css b/ui/styles/onboarding-hosts.css new file mode 100644 index 
0000000..2b79b19 --- /dev/null +++ b/ui/styles/onboarding-hosts.css @@ -0,0 +1,291 @@ +/* onboarding-hosts.css — Host review cards and linked source details. */ + +.ob-muted { + color: var(--text-4); + font-size: var(--fs-02); +} + +.ob-host-cards { + display: grid; + gap: var(--s-5); +} + +.ob-host-card { + background: var(--dram-card-bg); + border: 1px solid var(--line); + border-radius: var(--r-2); + overflow: hidden; +} + +.ob-host-card-hdr { + display: flex; + align-items: center; + gap: var(--s-3); + padding: var(--s-3) var(--s-4); + background: var(--dram-card-hover); + border-bottom: 1px solid var(--line); +} + +.ob-host-card-info { + flex: 1; + min-width: 0; + display: grid; + gap: 1px; +} + +.ob-host-card-info .ob-row-name { + color: var(--text); + font-size: var(--fs-04); + font-weight: 600; +} + +.ob-host-section { + border-top: 1px solid var(--line); +} + +.ob-host-section-hdr { + display: flex; + align-items: center; + gap: var(--s-2); + width: 100%; + padding: var(--s-2) var(--s-4); + border: none; + background: transparent; + color: var(--text-2); + cursor: pointer; + font: inherit; + font-size: var(--fs-02); + text-align: left; + transition: background var(--out); +} + +.ob-host-section-hdr:hover { + background: var(--dram-card-hover); +} + +.ob-host-section-hdr svg { + width: 16px; + height: 16px; + flex: 0 0 auto; +} + +.ob-host-section-hdr .ob-accordion-chev { + margin-left: auto; +} + +.ob-host-section-open .ob-host-section-hdr { + color: var(--text); +} + +.ob-host-section-body { + display: none; + padding: 0 var(--s-4) var(--s-3) var(--s-4); +} + +.ob-host-section-open .ob-host-section-body { + display: block; +} + +.ob-host-section.ob-host-opportunity .ob-host-section-hdr { + color: var(--ok); +} + +.ob-skill-source { + padding: var(--s-2) 0; + border-bottom: 1px solid var(--line); +} + +.ob-skill-source:last-child { + border-bottom: none; +} + +.ob-skill-source-hdr { + display: flex; + align-items: center; + gap: var(--s-2); + 
margin-bottom: var(--s-1); +} + +.ob-skill-source-hdr .ob-row-name { + color: var(--text); + font-size: var(--fs-03); + font-weight: 500; +} + +.ob-host-actions { + display: flex; + align-items: center; + gap: var(--s-2); + margin-top: var(--s-2); +} + +.ob-file-list { + list-style: none; + margin: 0; + padding: 0; + display: grid; + gap: 2px; +} + +.ob-file-list li { + display: flex; + align-items: center; + gap: var(--s-2); + font-size: var(--fs-02); + color: var(--text-2); + padding: 2px 0; +} + +.ob-file-list li svg { + width: 16px; + height: 16px; + flex: 0 0 auto; +} + +.ob-file-path { + color: var(--text-4); + font-family: var(--mono); + font-size: var(--fs-01); + overflow: hidden; + text-overflow: ellipsis; + white-space: nowrap; +} + +.ob-mcp-entry { + padding: var(--s-2) 0; + border-bottom: 1px solid var(--line); +} + +.ob-mcp-entry:last-child { + border-bottom: none; +} + +.ob-mcp-path { + font-family: var(--mono); + font-size: var(--fs-01); + color: var(--text-3); +} + +.ob-mcp-servers { + list-style: none; + margin: var(--s-1) 0 0 var(--s-3); + padding: 0; + font-size: var(--fs-02); + color: var(--text-3); +} + +.ob-ide-ext-row { + display: flex; + align-items: center; + gap: var(--s-3); + padding: var(--s-2) 0; + border-bottom: 1px solid var(--line); +} + +.ob-ide-ext-row:last-child { + border-bottom: none; +} + +.ob-ide-ext-label-group { + display: flex; + align-items: center; + gap: var(--s-2); + min-width: 140px; + flex: 0 0 auto; +} + +.ob-ide-ext-label { + color: var(--text); + font-size: var(--fs-03); + font-weight: 500; +} + +.ob-row-icon-sm { + width: 16px; + height: 16px; + min-width: 16px; +} + +.ob-row-icon-sm img { + width: 16px; + height: 16px; +} + +.ob-ext-badges { + display: flex; + flex-wrap: wrap; + gap: var(--s-2); + margin-top: var(--s-2); +} + +.ob-opportunity { + padding: var(--s-2) 0; + display: grid; + gap: 2px; +} + +.ob-opportunity-label { + color: var(--ok); + font-size: var(--fs-03); + font-weight: 500; +} + 
+.ob-opportunity-desc { + color: var(--text-3); + font-size: var(--fs-02); +} + +.ob-row.ob-row-opportunity { + gap: var(--s-3); +} + +.ob-row-desc { + color: var(--text-3); + font-size: var(--fs-02); +} + +.ob-step-head p { + color: var(--text-2); + font-size: var(--fs-03); + margin: var(--s-2) 0 0; +} + +.ob-step-head .ob-row-desc { + margin: 0; +} + +.ob-step-total { + color: var(--text-3); + font-size: var(--fs-02); + font-family: var(--mono); + margin-top: var(--s-2); + display: block; +} + +.ob-source-form { + display: flex; + gap: var(--s-2); + align-items: center; + margin-top: var(--s-4); +} + +.ob-source-input { + flex: 1; + min-height: 36px; + padding: 0 var(--s-3); + background: var(--bg-sunken); + border: 1px solid var(--line); + border-radius: var(--r-1); + color: var(--text); + font-family: var(--mono); + font-size: var(--fs-03); + transition: + border-color var(--out), + background var(--out); +} + +.ob-source-input:focus { + border-color: var(--line-accent); + background: var(--bg-raised); + outline: none; +} diff --git a/ui/styles/onboarding.css b/ui/styles/onboarding.css index c386565..3785e86 100644 --- a/ui/styles/onboarding.css +++ b/ui/styles/onboarding.css @@ -448,489 +448,3 @@ background: var(--ok-bg); border-color: var(--ok); } - -.ob-muted { - color: var(--text-4); - font-size: var(--fs-02); -} - -/* ---- Host card ---- */ -.ob-host-cards { - display: grid; - gap: var(--s-5); -} - -.ob-host-card { - background: var(--dram-card-bg); - border: 1px solid var(--line); - border-radius: var(--r-2); - overflow: hidden; -} - -.ob-host-card-hdr { - display: flex; - align-items: center; - gap: var(--s-3); - padding: var(--s-3) var(--s-4); - background: var(--dram-card-hover); - border-bottom: 1px solid var(--line); -} - -.ob-host-card-info { - flex: 1; - min-width: 0; - display: grid; - gap: 1px; -} - -.ob-host-card-info .ob-row-name { - color: var(--text); - font-size: var(--fs-04); - font-weight: 600; -} - -/* ---- Host section (collapsible 
within card) ---- */ -.ob-host-section { - border-top: 1px solid var(--line); -} - -.ob-host-section-hdr { - display: flex; - align-items: center; - gap: var(--s-2); - width: 100%; - padding: var(--s-2) var(--s-4); - border: none; - background: transparent; - color: var(--text-2); - cursor: pointer; - font: inherit; - font-size: var(--fs-02); - text-align: left; - transition: background var(--out); -} - -.ob-host-section-hdr:hover { - background: var(--dram-card-hover); -} - -.ob-host-section-hdr svg { - width: 16px; - height: 16px; - flex: 0 0 auto; -} - -.ob-host-section-hdr .ob-accordion-chev { - margin-left: auto; -} - -.ob-host-section-open .ob-host-section-hdr { - color: var(--text); -} - -.ob-host-section-body { - display: none; - padding: 0 var(--s-4) var(--s-3) var(--s-4); -} - -.ob-host-section-open .ob-host-section-body { - display: block; -} - -.ob-host-section.ob-host-opportunity .ob-host-section-hdr { - color: var(--ok); -} - -/* ---- Skill source within host ---- */ -.ob-skill-source { - padding: var(--s-2) 0; - border-bottom: 1px solid var(--line); -} - -.ob-skill-source:last-child { - border-bottom: none; -} - -.ob-skill-source-hdr { - display: flex; - align-items: center; - gap: var(--s-2); - margin-bottom: var(--s-1); -} - -.ob-skill-source-hdr .ob-row-name { - color: var(--text); - font-size: var(--fs-03); - font-weight: 500; -} - -.ob-host-actions { - display: flex; - align-items: center; - gap: var(--s-2); - margin-top: var(--s-2); -} - -/* ---- File list (rules, configs, instructions) ---- */ -.ob-file-list { - list-style: none; - margin: 0; - padding: 0; - display: grid; - gap: 2px; -} - -.ob-file-list li { - display: flex; - align-items: center; - gap: var(--s-2); - font-size: var(--fs-02); - color: var(--text-2); - padding: 2px 0; -} - -.ob-file-list li svg { - width: 16px; - height: 16px; - flex: 0 0 auto; -} - -.ob-file-path { - color: var(--text-4); - font-family: var(--mono); - font-size: var(--fs-01); - overflow: hidden; - 
text-overflow: ellipsis; - white-space: nowrap; -} - -/* ---- MCP entry ---- */ -.ob-mcp-entry { - padding: var(--s-2) 0; - border-bottom: 1px solid var(--line); -} - -.ob-mcp-entry:last-child { - border-bottom: none; -} - -.ob-mcp-path { - font-family: var(--mono); - font-size: var(--fs-01); - color: var(--text-3); -} - -.ob-mcp-servers { - list-style: none; - margin: var(--s-1) 0 0 var(--s-3); - padding: 0; - font-size: var(--fs-02); - color: var(--text-3); -} - -/* ---- IDE extension row ---- */ -.ob-ide-ext-row { - display: flex; - align-items: center; - gap: var(--s-3); - padding: var(--s-2) 0; - border-bottom: 1px solid var(--line); -} - -.ob-ide-ext-row:last-child { - border-bottom: none; -} - -.ob-ide-ext-label-group { - display: flex; - align-items: center; - gap: var(--s-2); - min-width: 140px; - flex: 0 0 auto; -} - -.ob-ide-ext-label { - color: var(--text); - font-size: var(--fs-03); - font-weight: 500; -} - -.ob-row-icon-sm { - width: 16px; - height: 16px; - min-width: 16px; -} - -.ob-row-icon-sm img { - width: 16px; - height: 16px; -} - -.ob-ext-badges { - display: flex; - flex-wrap: wrap; - gap: var(--s-2); - margin-top: var(--s-2); -} - -/* ---- Opportunity ---- */ -.ob-opportunity { - padding: var(--s-2) 0; - display: grid; - gap: 2px; -} - -.ob-opportunity-label { - color: var(--ok); - font-size: var(--fs-03); - font-weight: 500; -} - -.ob-opportunity-desc { - color: var(--text-3); - font-size: var(--fs-02); -} - -.ob-row.ob-row-opportunity { - gap: var(--s-3); -} - -.ob-row-desc { - color: var(--text-3); - font-size: var(--fs-02); -} - -/* ---- Step head descriptions ---- */ -.ob-step-head p { - color: var(--text-2); - font-size: var(--fs-03); - margin: var(--s-2) 0 0; -} - -.ob-step-head .ob-row-desc { - margin: 0; -} - -.ob-step-total { - color: var(--text-3); - font-size: var(--fs-02); - font-family: var(--mono); - margin-top: var(--s-2); - display: block; -} - -.ob-source-form { - display: flex; - gap: var(--s-2); - align-items: center; - 
margin-top: var(--s-4); -} - -.ob-source-input { - flex: 1; - min-height: 36px; - padding: 0 var(--s-3); - background: var(--bg-sunken); - border: 1px solid var(--line); - border-radius: var(--r-1); - color: var(--text); - font-family: var(--mono); - font-size: var(--fs-03); - transition: - border-color var(--out), - background var(--out); -} - -.ob-source-input:focus { - border-color: var(--line-accent); - background: var(--bg-raised); - outline: none; -} - -/* ── Progress bar ── */ -.ob-progress-track { - width: 100%; - max-width: 400px; - height: 4px; - background: var(--bg-sunken); - border-radius: var(--r-full); - overflow: hidden; - margin: 0 auto var(--s-6); -} - -.ob-progress-fill { - height: 100%; - width: 0%; - background: var(--accent); - border-radius: var(--r-full); - transition: width 0.3s ease-out; -} - -/* ── Dimmed rows for default paths ── */ -.ob-row.ob-row-dim { - opacity: 0.6; - cursor: default; -} - -.ob-row.ob-row-dim:hover { - background: var(--dram-card-bg); -} - -.ob-empty { - padding: var(--s-6) var(--s-4); - text-align: center; - background: var(--bg-sunken); - border: 1px dashed var(--line); - border-radius: var(--r-1); - color: var(--text-4); - font-size: var(--fs-03); - line-height: 1.5; -} - -/* ── Build / Done steps (centered moment cards) ── */ -.ob-moment { - display: flex; - flex-direction: column; - align-items: center; - text-align: center; - gap: var(--s-5); - padding: var(--s-9) var(--s-6); - background: var(--dram-card-bg); - border: 1px solid var(--dram-card-border); - border-radius: var(--r-2); -} - -.ob-moment-badge { - width: 56px; - height: 56px; - display: grid; - place-items: center; - border-radius: var(--r-full); - font-size: 24px; - font-weight: 700; - flex: 0 0 auto; -} - -.ob-moment-badge.spin { - border: 2px solid var(--line); - border-top-color: var(--accent); - animation: ob-spin 0.9s linear infinite; - background: transparent; - width: 32px; - height: 32px; -} - -.ob-moment-badge.done { - background: 
var(--ok-bg); - border: 2px solid var(--ok); - color: var(--ok); -} - -@keyframes ob-spin { - to { - transform: rotate(360deg); - } -} - -.ob-moment-text { - display: grid; - gap: var(--s-2); - max-width: 420px; -} - -.ob-moment-text strong { - font-size: var(--fs-06); - font-weight: 650; - color: var(--text); -} - -.ob-moment-text span { - color: var(--text-3); - font-size: var(--fs-04); - line-height: 1.6; -} - -.ob-moment .ob-actions { - display: flex; - gap: var(--s-3); - align-items: center; - margin-top: var(--s-2); -} - -.ob-scanning { - display: flex; - align-items: center; - gap: var(--s-4); - padding: var(--s-5); - background: var(--dram-card-bg); - border: 1px solid var(--dram-card-border); - border-radius: var(--r-2); -} - -.ob-scanning-text { - display: grid; - gap: var(--s-1); -} - -.ob-scanning-text strong { - color: var(--text); - font-size: var(--fs-04); - font-weight: 600; -} - -.ob-scanning-text span { - color: var(--text-3); - font-size: var(--fs-03); - line-height: 1.5; -} - -/* ── Actions row ── */ -.ob-actions { - display: flex; - gap: var(--s-3); - align-items: center; -} - -.ob-actions .ob-skip { - margin-right: auto; -} - -/* ── Footer ── */ - -/* ── Spacing helpers ── */ -.ob-mt-5 { - margin-top: var(--s-5); -} -.ob-mt-6 { - margin-top: var(--s-6); -} -.ob-mt-7 { - margin-top: var(--s-7); -} - -/* ── Responsive ── */ -@media (max-width: 720px) { - .ob-body-inner { - max-width: 100%; - } - - .ob-step-label { - display: none; - } - - .ob-step-connector { - margin-bottom: 0; - } - - .ob-source-form { - flex-direction: column; - } - - .ob-body { - padding: var(--s-6) var(--s-4); - } - - .ob-moment { - padding: var(--s-7) var(--s-4); - } -} From a00a282fca7466e4698b2fe2ebb4f23736d56e0a Mon Sep 17 00:00:00 2001 From: James Chapman Date: Tue, 19 May 2026 23:28:53 +0100 Subject: [PATCH 4/7] Fix validateRules: use allowed.includes() instead of allowed[key].includes() The destructured loop variable `allowed` is already the per-section array, so 
indexing it with `key` was undefined. This fixes the TS7015 type error and the runtime TypeError in validation tests. Co-Authored-By: Claude Opus 4.7 --- server/lib/validation.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/lib/validation.js b/server/lib/validation.js index d9c88f3..9adf098 100644 --- a/server/lib/validation.js +++ b/server/lib/validation.js @@ -34,7 +34,7 @@ function validateRules(data) { if (!val || typeof val !== 'object' || Array.isArray(val)) return { valid: false, error: `Missing or invalid "${key}" section` }; for (const pkey of Object.keys(val)) { - if (!allowed[key].includes(/** @type {any} */ (pkey))) + if (!allowed.includes(pkey)) return { valid: false, error: `"${key}" does not allow priority "${pkey}"` }; if (typeof val[pkey] !== 'string') return { valid: false, error: `"${key}.${pkey}" must be a string` }; } From 5848afe7ed6ba4db88ec16145cac4f69c959408b Mon Sep 17 00:00:00 2001 From: James Chapman Date: Wed, 20 May 2026 00:41:01 +0100 Subject: [PATCH 5/7] Address PR review: async I/O, safer defaults, search constants, namespace, minor fixes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit BLOCKER 1: Convert all sync fs operations in system-scan.js and system-scan-ides.js to async (fs.promises.*) with Promise.all for parallel probing. Router handler now awaits scanSystem(). BLOCKER 2: Default drives scanning to disabled (drives: false) in onboarding.js; skipDrives defaults to true in scanSystem(). BLOCKER 3: Add required @ts-nocheck annotation to onboarding.js and onboarding-render.js per project lint rules. Issue 4: Extract hybrid search magic numbers (0.6, 0.4, 0.3) to named constants. Replace naive suffix stripping with min-length guard to avoid false matches (e.g. "string" → "str"). Issue 9: Wrap onboarding-populate.js in OnboardingPopulate namespace with convenience shims for backward compatibility. 
Minor: Use windowBackground constant in main.cjs titleBarOverlay, add mounted guard to advanceProgress(), document null values in OPPORTUNITY_FILES, add TODO for quality gate pending in dashboard. Co-Authored-By: Claude Opus 4.7 --- electron/main.cjs | 2 +- server/lib/system-scan-definitions.js | 1 + server/lib/system-scan-ides.js | 26 +-- server/lib/system-scan.js | 282 +++++++++++++++----------- server/lib/vectorstore.js | 35 +++- server/router.js | 4 +- ui/dashboard.js | 1 + ui/onboarding-populate.js | 194 +++++++++--------- ui/onboarding-render.js | 2 +- ui/onboarding.js | 5 +- 10 files changed, 321 insertions(+), 231 deletions(-) diff --git a/electron/main.cjs b/electron/main.cjs index 4a71642..f829931 100644 --- a/electron/main.cjs +++ b/electron/main.cjs @@ -178,7 +178,7 @@ function createWindow() { show: false, titleBarStyle: 'hidden', titleBarOverlay: { - color: '#050309', + color: windowBackground, symbolColor: '#ffffff', height: 32, }, diff --git a/server/lib/system-scan-definitions.js b/server/lib/system-scan-definitions.js index 31ac22e..314b6ec 100644 --- a/server/lib/system-scan-definitions.js +++ b/server/lib/system-scan-definitions.js @@ -51,6 +51,7 @@ const CONFIG_FILE_NAMES = [ 'claude_desktop_config.json', ]; +// null = host has no standard global config file to create const OPPORTUNITY_FILES = { '.claude': 'CLAUDE.md', '.cursor': '.cursorrules', diff --git a/server/lib/system-scan-ides.js b/server/lib/system-scan-ides.js index cb5c0a2..f837b5c 100644 --- a/server/lib/system-scan-ides.js +++ b/server/lib/system-scan-ides.js @@ -15,15 +15,15 @@ function expandEnvVar(env) { } /** @param {string} p */ -function isDir(p) { +async function isDir(p) { try { - return fs.statSync(p).isDirectory(); + return (await fs.promises.stat(p)).isDirectory(); } catch { return false; } } -function probeIDEs() { +async function probeIDEs() { const found = []; const seen = new Set(); for (const ide of IDE_PROBE_PATHS) { @@ -37,8 +37,8 @@ function probeIDEs() { const 
prefix = base.substring(0, wildIdx); const parentDir = path.dirname(prefix); try { - if (fs.statSync(parentDir).isDirectory()) { - const entries = fs.readdirSync(parentDir); + if (await isDir(parentDir)) { + const entries = await fs.promises.readdir(parentDir); const match = entries .filter((e) => e.startsWith(path.basename(prefix))) .sort() @@ -51,7 +51,7 @@ function probeIDEs() { } if (!resolvedPath) { try { - if (fs.statSync(base).isDirectory()) resolvedPath = base; + if (await isDir(base)) resolvedPath = base; } catch { /* ignore */ } @@ -59,7 +59,8 @@ function probeIDEs() { if (!resolvedPath) continue; const exePath = path.join(resolvedPath, ide.exe); try { - if (fs.statSync(exePath).isFile()) { + const s = await fs.promises.stat(exePath); + if (s.isFile()) { found.push({ id: 'ide-' + ide.label.replace(/\s+/g, '-').toLowerCase(), label: ide.label, @@ -77,7 +78,7 @@ function probeIDEs() { return found; } -function probeAIExtensions() { +async function probeAIExtensions() { const ideExtDirs = [ { label: 'VS Code', path: path.join(HOMEDIR, '.vscode', 'extensions') }, { @@ -95,10 +96,10 @@ function probeAIExtensions() { ]; /** @type {Record} */ const perIde = {}; - for (const ide of ideExtDirs) { - if (!isDir(ide.path)) continue; + const checks = ideExtDirs.map(async (ide) => { + if (!(await isDir(ide.path))) return; try { - const entries = fs.readdirSync(ide.path); + const entries = await fs.promises.readdir(ide.path); const found = []; for (const ai of AI_EXTENSION_PATTERNS) { if (entries.some((e) => e.startsWith(ai.pattern))) found.push(ai.label); @@ -107,7 +108,8 @@ function probeAIExtensions() { } catch { /* ignore */ } - } + }); + await Promise.all(checks); return perIde; } diff --git a/server/lib/system-scan.js b/server/lib/system-scan.js index 85207a4..704c7ae 100644 --- a/server/lib/system-scan.js +++ b/server/lib/system-scan.js @@ -17,42 +17,46 @@ const { probeIDEs, probeAIExtensions } = require('./system-scan-ides'); // ---- Helpers ---- -function 
getDriveRoots() { +async function getDriveRoots() { const drives = []; if (process.platform === 'win32') { + const checks = []; for (let i = 65; i <= 90; i++) { const root = `${String.fromCharCode(i)}:\\`; - try { - if (fs.statSync(root).isDirectory()) drives.push(root); - } catch { - /* skip */ - } + checks.push( + fs.promises.stat(root).then( + (s) => (s.isDirectory() ? root : null), + () => null, + ), + ); } + const results = await Promise.all(checks); + for (const r of results) if (r) drives.push(r); } return drives; } /** @param {string} p */ -function isFile(p) { +async function isFile(p) { try { - return fs.statSync(p).isFile(); + return (await fs.promises.stat(p)).isFile(); } catch { return false; } } /** @param {string} p */ -function isDir(p) { +async function isDir(p) { try { - return fs.statSync(p).isDirectory(); + return (await fs.promises.stat(p)).isDirectory(); } catch { return false; } } /** @param {string} p */ -function readJsonSafe(p) { +async function readJsonSafe(p) { try { - return JSON.parse(fs.readFileSync(p, 'utf8')); + return JSON.parse(await fs.promises.readFile(p, 'utf8')); } catch { return null; } @@ -69,9 +73,9 @@ function readJsonSafe(p) { * @param {{ id: string, label: string, icon: string }} hostDef * @param {string} homedir */ -function probeHostDir(hostDef, homedir) { +async function probeHostDir(hostDef, homedir) { const hostPath = path.join(homedir, hostDef.id); - if (!isDir(hostPath)) return null; + if (!(await isDir(hostPath))) return null; /** @type {SkillEntry[]} */ const skills = []; @@ -101,7 +105,7 @@ function probeHostDir(hostDef, homedir) { // Skills: standard skill dirs const skillDir = path.join(hostPath, 'skills'); - if (isDir(skillDir) && countSkillFiles(skillDir) > 0) { + if ((await isDir(skillDir)) && countSkillFiles(skillDir) > 0) { result.skills.push({ path: skillDir, label: `${hostDef.label} skills`, @@ -120,7 +124,7 @@ function probeHostDir(hostDef, homedir) { 'claude-plugins-official', 'external_plugins', ); 
- if (isDir(pluginDir)) { + if (await isDir(pluginDir)) { const count = countSkillFiles(pluginDir); if (count > 0) result.skills.push({ @@ -130,7 +134,7 @@ function probeHostDir(hostDef, homedir) { names: listSkillNames(pluginDir), }); } - if (isDir(externalDir)) { + if (await isDir(externalDir)) { const count = countSkillFiles(externalDir); if (count > 0) result.skills.push({ @@ -143,101 +147,140 @@ function probeHostDir(hostDef, homedir) { } // Configs - for (const name of CONFIG_FILE_NAMES) { + const configChecks = CONFIG_FILE_NAMES.map(async (name) => { const p = path.join(hostPath, name); - if (isFile(p)) result.configs.push({ path: p, label: name }); - } + if (await isFile(p)) result.configs.push({ path: p, label: name }); + }); // Special config locations if (hostDef.id === '.cursor') { - const mcpPath = path.join(hostPath, 'mcp.json'); - if (isFile(mcpPath) && !result.configs.some((c) => c.path === mcpPath)) - result.configs.push({ path: mcpPath, label: 'mcp.json' }); + configChecks.push( + (async () => { + const mcpPath = path.join(hostPath, 'mcp.json'); + if (await isFile(mcpPath)) { + if (!result.configs.some((c) => c.path === mcpPath)) + result.configs.push({ path: mcpPath, label: 'mcp.json' }); + } + })(), + ); } if (hostDef.id === '.claude') { - const desktopConfig = path.join( - process.env.APPDATA || path.join(HOMEDIR, 'AppData', 'Roaming'), - 'Claude', - 'claude_desktop_config.json', + configChecks.push( + (async () => { + const desktopConfig = path.join( + process.env.APPDATA || path.join(HOMEDIR, 'AppData', 'Roaming'), + 'Claude', + 'claude_desktop_config.json', + ); + if (await isFile(desktopConfig)) + result.configs.push({ path: desktopConfig, label: 'claude_desktop_config.json' }); + })(), ); - if (isFile(desktopConfig)) - result.configs.push({ path: desktopConfig, label: 'claude_desktop_config.json' }); } // Kiro: steering.md and settings if (hostDef.id === '.kiro') { - const steering = path.join(hostPath, 'steering', 'steering.md'); - if 
(isFile(steering)) result.instructions.push({ path: steering, label: 'steering/steering.md' }); - const settings = path.join(hostPath, 'settings', 'settings.json'); - if (isFile(settings)) result.configs.push({ path: settings, label: 'settings.json' }); + configChecks.push( + (async () => { + const steering = path.join(hostPath, 'steering', 'steering.md'); + if (await isFile(steering)) + result.instructions.push({ path: steering, label: 'steering/steering.md' }); + })(), + ); + configChecks.push( + (async () => { + const settings = path.join(hostPath, 'settings', 'settings.json'); + if (await isFile(settings)) result.configs.push({ path: settings, label: 'settings.json' }); + })(), + ); } // Antigravity: settings and AI extensions if (hostDef.id === '.antigravity') { - const agSettings = path.join( - process.env.APPDATA || path.join(HOMEDIR, 'AppData', 'Roaming'), - 'Antigravity', - 'User', - 'settings.json', + configChecks.push( + (async () => { + const agSettings = path.join( + process.env.APPDATA || path.join(HOMEDIR, 'AppData', 'Roaming'), + 'Antigravity', + 'User', + 'settings.json', + ); + if (await isFile(agSettings)) result.configs.push({ path: agSettings, label: 'settings.json' }); + })(), ); - if (isFile(agSettings)) result.configs.push({ path: agSettings, label: 'settings.json' }); } // Gemini: GEMINI.md and antigravity MCP config if (hostDef.id === '.gemini') { - const agMcp = path.join(hostPath, 'antigravity', 'mcp_config.json'); - if (isFile(agMcp)) { - const json = readJsonSafe(agMcp); - const servers = json?.mcpServers || json?.servers || {}; - const count = Object.keys(servers).length; - if (count > 0) result.mcpServers.push({ path: agMcp, count, servers: Object.keys(servers) }); - else result.configs.push({ path: agMcp, label: 'antigravity/mcp_config.json' }); - } + configChecks.push( + (async () => { + const agMcp = path.join(hostPath, 'antigravity', 'mcp_config.json'); + if (!(await isFile(agMcp))) return; + const json = await 
readJsonSafe(agMcp); + const servers = json?.mcpServers || json?.servers || {}; + const count = Object.keys(servers).length; + if (count > 0) result.mcpServers.push({ path: agMcp, count, servers: Object.keys(servers) }); + else result.configs.push({ path: agMcp, label: 'antigravity/mcp_config.json' }); + })(), + ); } + await Promise.all(configChecks); // Instructions - for (const name of INSTRUCTION_FILE_NAMES) { + const instrChecks = INSTRUCTION_FILE_NAMES.map(async (name) => { const p = path.join(hostPath, name); - if (isFile(p)) result.instructions.push({ path: p, label: name }); - } + if (await isFile(p)) result.instructions.push({ path: p, label: name }); + }); // Special instruction dirs if (hostDef.id === '.claude') { - const projectsDir = path.join(hostPath, 'projects'); - if (isDir(projectsDir)) { - try { - for (const proj of fs.readdirSync(projectsDir)) { - const memDir = path.join(projectsDir, proj, 'memory'); - if (isDir(memDir)) { + instrChecks.push( + (async () => { + const projectsDir = path.join(hostPath, 'projects'); + if (!(await isDir(projectsDir))) return; + try { + const projEntries = await fs.promises.readdir(projectsDir); + const memChecks = projEntries.map(async (proj) => { + const memDir = path.join(projectsDir, proj, 'memory'); + if (!(await isDir(memDir))) return; try { - for (const f of fs.readdirSync(memDir)) { + const files = await fs.promises.readdir(memDir); + for (const f of files) { if (f.endsWith('.md')) result.instructions.push({ path: path.join(memDir, f), label: `memory/${f}` }); } } catch { /* ignore */ } - } + }); + await Promise.all(memChecks); + } catch { + /* ignore */ } - } catch { - /* ignore */ - } - } + })(), + ); } + await Promise.all(instrChecks); // Rules - for (const name of RULE_FILE_NAMES) { + const ruleChecks = RULE_FILE_NAMES.map(async (name) => { const p = path.join(hostPath, name); - if (isFile(p)) result.rules.push({ path: p, label: name }); - } + if (await isFile(p)) result.rules.push({ path: p, label: name 
}); + }); if (hostDef.id === '.codex') { - const rulesDir = path.join(hostPath, 'rules'); - if (isDir(rulesDir)) { - try { - for (const f of fs.readdirSync(rulesDir)) { - const p = path.join(rulesDir, f); - if (isFile(p)) result.rules.push({ path: p, label: `rules/${f}` }); + ruleChecks.push( + (async () => { + const rulesDir = path.join(hostPath, 'rules'); + if (!(await isDir(rulesDir))) return; + try { + const files = await fs.promises.readdir(rulesDir); + const fileChecks = files.map(async (f) => { + const p = path.join(rulesDir, f); + if (await isFile(p)) result.rules.push({ path: p, label: `rules/${f}` }); + }); + await Promise.all(fileChecks); + } catch { + /* ignore */ } - } catch { - /* ignore */ - } - } + })(), + ); } + await Promise.all(ruleChecks); // MCP servers from host config const mcpConfigs = []; @@ -253,13 +296,14 @@ function probeHostDir(hostDef, homedir) { if (hostDef.id === '.codex') mcpConfigs.push(path.join(hostPath, 'mcp.json')); if (hostDef.id === '.cursor') mcpConfigs.push(path.join(hostPath, 'mcp.json')); if (hostDef.id === '.windsurf') mcpConfigs.push(path.join(hostPath, 'mcp.json')); - for (const mcpPath of mcpConfigs) { - if (!isFile(mcpPath)) continue; - const json = readJsonSafe(mcpPath); + const mcpChecks = mcpConfigs.map(async (mcpPath) => { + if (!(await isFile(mcpPath))) return; + const json = await readJsonSafe(mcpPath); const servers = json?.mcpServers || json?.mcp_servers || {}; const count = Object.keys(servers).length; if (count > 0) result.mcpServers.push({ path: mcpPath, count, servers: Object.keys(servers) }); - } + }); + await Promise.all(mcpChecks); // Opportunities (missing global config) const expected = OPPORTUNITY_FILES[/** @type {keyof typeof OPPORTUNITY_FILES} */ (hostDef.id)]; @@ -267,7 +311,7 @@ function probeHostDir(hostDef, homedir) { const filePath = path.join(hostPath, expected); const homedirFile = path.join(homedir, expected); // If config doesn't exist inside host dir or at homedir root - if 
(!isFile(filePath) && !isFile(homedirFile)) { + if (!(await isFile(filePath)) && !(await isFile(homedirFile))) { result.opportunities.push({ type: 'missing-global-config', label: expected, @@ -280,9 +324,9 @@ function probeHostDir(hostDef, homedir) { } /** @param {Array<{id: string, label: string, path: string, exe: string}>} ideList */ -function probeIdegGroup(ideList) { +async function probeIdegGroup(ideList) { if (!ideList.length) return null; - const perIde = probeAIExtensions(); + const perIde = await probeAIExtensions(); return { id: 'ides', label: 'IDEs', @@ -298,17 +342,18 @@ function probeIdegGroup(ideList) { * @param {string[]} customPaths * @param {{ skipDrives?: boolean, skipHomedir?: boolean, skipWorkspaces?: boolean }} [opts] */ -function scanSystem(customPaths = [], opts = {}) { - const { skipDrives = false, skipHomedir = false, skipWorkspaces = false } = opts; - const workspaces = skipWorkspaces ? [] : readWorkspaces(); +async function scanSystem(customPaths = [], opts = {}) { + const { skipDrives = true, skipHomedir = false, skipWorkspaces = false } = opts; + const workspaces = skipWorkspaces ? 
[] : await readWorkspaces(); + /** @type {any[]} */ const hosts = []; const seenHosts = new Set(); // Probe host dirs from homedir if (!skipHomedir) { - for (const h of HOSTS) { - const data = probeHostDir(h, HOMEDIR); + const homedirResults = await Promise.all(HOSTS.map((h) => probeHostDir(h, HOMEDIR))); + for (const data of homedirResults) { if (data && !seenHosts.has(data.path)) { seenHosts.add(data.path); hosts.push(data); @@ -318,44 +363,45 @@ function scanSystem(customPaths = [], opts = {}) { // Probe host dirs from drives if (!skipDrives) { - for (const drive of getDriveRoots()) { - for (const h of HOSTS) { - const p = path.join(drive, h.id); - if (seenHosts.has(p)) continue; - if (isDir(p)) { + const drives = await getDriveRoots(); + const driveResults = await Promise.all( + drives.flatMap((drive) => + HOSTS.map(async (h) => { + const p = path.join(drive, h.id); + if (seenHosts.has(p)) return null; + if (!(await isDir(p))) return null; const isWin = process.platform === 'win32'; - // On Windows, drive-level host dirs overlap with homedir (same user) - // Skip if we already found this host from homedir const homedirVersion = path.join(HOMEDIR, h.id); - if (isWin && isDir(homedirVersion) && seenHosts.has(homedirVersion)) continue; - const data = probeHostDir(h, drive); - if (data) { - seenHosts.add(p); - hosts.push(data); - } - } - } + if (isWin && (await isDir(homedirVersion)) && seenHosts.has(homedirVersion)) return null; + const data = await probeHostDir(h, drive); + if (data) seenHosts.add(p); + return data; + }), + ), + ); + for (const data of driveResults) { + if (data) hosts.push(data); } } // Probe IDEs - const ideList = skipDrives && skipHomedir ? [] : probeIDEs(); + const ideList = skipDrives && skipHomedir ? 
[] : await probeIDEs(); // Custom paths: scan as additional skill sources for (const cp of customPaths) { - if (!isDir(cp)) continue; + if (!(await isDir(cp))) continue; const count = countSkillFiles(cp); if (count <= 0) continue; let realPath = cp; try { - realPath = fs.realpathSync(cp); + realPath = await fs.promises.realpath(cp); } catch { /* use unresolved */ } const isWin = process.platform === 'win32'; - const internalReal = (() => { + const internalReal = await (async () => { try { - return fs.realpathSync(SKILLS_DIR); + return await fs.promises.realpath(SKILLS_DIR); } catch { return SKILLS_DIR; } @@ -396,12 +442,13 @@ function scanSystem(customPaths = [], opts = {}) { // Scan for standalone rule/instruction files in homedir root and workspaces if (!skipHomedir) { - scanStandaloneFiles(HOMEDIR, hosts); + await scanStandaloneFiles(HOMEDIR, hosts); } if (!skipDrives) { - for (const drive of getDriveRoots()) scanStandaloneFiles(drive, hosts); + const drives = await getDriveRoots(); + await Promise.all(drives.map((d) => scanStandaloneFiles(d, hosts))); } - for (const ws of workspaces) scanStandaloneFiles(ws, hosts); + await Promise.all(workspaces.map((/** @type {string} */ ws) => scanStandaloneFiles(ws, hosts))); // Filter out hosts with nothing found (empty dirs) const populated = hosts.filter( @@ -415,7 +462,7 @@ function scanSystem(customPaths = [], opts = {}) { ); // IDE group - const ides = probeIdegGroup(ideList); + const ides = await probeIdegGroup(ideList); return { hosts: populated, @@ -426,12 +473,12 @@ function scanSystem(customPaths = [], opts = {}) { } /** @param {string} dir @param {Array} hosts */ -function scanStandaloneFiles(dir, hosts) { - if (!isDir(dir)) return; +async function scanStandaloneFiles(dir, hosts) { + if (!(await isDir(dir))) return; // Check for rule/instruction files at the dir root that don't belong to a host dir for (const name of RULE_FILE_NAMES) { const p = path.join(dir, name); - if (!isFile(p)) continue; + if (!(await 
isFile(p))) continue; // Skip if it's inside a host dir we already scanned if (hosts.some((h) => p.startsWith(h.path + path.sep) || p === h.path)) continue; // Attach to a "standalone" section @@ -456,7 +503,7 @@ function scanStandaloneFiles(dir, hosts) { } for (const name of INSTRUCTION_FILE_NAMES) { const p = path.join(dir, name); - if (!isFile(p)) continue; + if (!(await isFile(p))) continue; if (hosts.some((h) => p.startsWith(h.path + path.sep) || p === h.path)) continue; let standalone = hosts.find((h) => h.id === 'standalone-rules'); if (!standalone) { @@ -479,9 +526,10 @@ function scanStandaloneFiles(dir, hosts) { } } -function readWorkspaces() { +// workspaces.json is created at runtime by the projects API; absence is expected on first run +async function readWorkspaces() { try { - const raw = fs.readFileSync(path.join(DATA_DIR, 'workspaces.json'), 'utf8'); + const raw = await fs.promises.readFile(path.join(DATA_DIR, 'workspaces.json'), 'utf8'); const parsed = JSON.parse(raw); if (Array.isArray(parsed?.workspaces)) { return parsed.workspaces diff --git a/server/lib/vectorstore.js b/server/lib/vectorstore.js index fee8b01..04554e6 100644 --- a/server/lib/vectorstore.js +++ b/server/lib/vectorstore.js @@ -7,6 +7,12 @@ const { DATA_DIR } = require('./config'); const DEFAULT_VECTOR_FILE = path.join(DATA_DIR, 'vectors.json'); const INDEX_STALE_FILE = path.join(DATA_DIR, 'index-stale.json'); +const VECTOR_WEIGHT = 0.6; +const LEXICAL_WEIGHT = 0.4; +const LEXICAL_SKILL_WEIGHT = 0.4; +const LEXICAL_SECTION_WEIGHT = 0.3; +const LEXICAL_TEXT_WEIGHT = 0.3; + /** * Mark the vector index as stale. 
The next /api/index/status response will * carry { stale: true, staleReason, staleSince } so the dashboard + onboarding @@ -167,7 +173,7 @@ function hybridSearch(store, queryVector, query, options = {}) { const lexicalScore = computeLexicalScore(record, terms); return { ...record, - score: 0.6 * vectorScore + 0.4 * lexicalScore, + score: VECTOR_WEIGHT * vectorScore + LEXICAL_WEIGHT * lexicalScore, lexicalScore, }; }) @@ -218,6 +224,22 @@ function bareSkillId(skillId) { ); } +/** + * Strip common suffixes for loose matching, but never shrink below 3 chars + * to avoid false matches (e.g. "string" → "str", "processed" → "process"). + * @param {string} word + */ +function stripSuffix(word) { + let stem = word; + for (const suffix of ['ing', 'ed', 'es', 's']) { + if (stem.endsWith(suffix) && stem.length - suffix.length >= 3) { + stem = stem.slice(0, -suffix.length); + break; + } + } + return stem; +} + /** * Extract meaningful lowercase terms from a query string. * Removes common stopwords and short tokens. @@ -348,7 +370,8 @@ function extractQueryTerms(query) { /** * Compute a lexical relevance score (0-1) for a record against query terms. - * Matches in skillId get weight 0.4, section title 0.3, chunk text 0.3. + * Matches in skillId get weight LEXICAL_SKILL_WEIGHT, section title + * LEXICAL_SECTION_WEIGHT, chunk text LEXICAL_TEXT_WEIGHT. * Supports prefix matching: "files" matches "file" in "file-search". 
* @param {import('./vectorstore').VectorRecord} record * @param {string[]} terms @@ -367,7 +390,7 @@ function computeLexicalScore(record, terms) { let textHits = 0; for (const term of terms) { - const termStem = term.replace(/s$/, '').replace(/ing$/, '').replace(/ed$/, ''); + const termStem = stripSuffix(term); // Check full term, stemmed term, and prefix matches /** * @param {string} word @@ -392,7 +415,11 @@ function computeLexicalScore(record, terms) { const maxHits = terms.length; if (!maxHits) return 0; - return 0.4 * (skillHits / maxHits) + 0.3 * (sectionHits / maxHits) + 0.3 * (textHits / maxHits); + return ( + LEXICAL_SKILL_WEIGHT * (skillHits / maxHits) + + LEXICAL_SECTION_WEIGHT * (sectionHits / maxHits) + + LEXICAL_TEXT_WEIGHT * (textHits / maxHits) + ); } /** diff --git a/server/router.js b/server/router.js index 4e4041d..dc91637 100644 --- a/server/router.js +++ b/server/router.js @@ -314,10 +314,10 @@ async function handleRequest(req, res, url) { skipHomedir: !!data?.skipHomedir, skipWorkspaces: !!data?.skipWorkspaces, }; - return json(res, { ok: true, ...scanSystem(customPaths, opts) }); + return json(res, { ok: true, ...(await scanSystem(customPaths, opts)) }); } if (req.method === 'GET') { - return json(res, { ok: true, ...scanSystem() }); + return json(res, { ok: true, ...(await scanSystem()) }); } } diff --git a/ui/dashboard.js b/ui/dashboard.js index 7c4a4c3..8cba542 100644 --- a/ui/dashboard.js +++ b/ui/dashboard.js @@ -309,6 +309,7 @@ const DashboardTab = (() => { host.innerHTML = `
Preview${sel.toLocaleString()} tokens selected
+ ${/* TODO: Remove "quality gate pending" once Recall@8 = 1.00 benchmark passes */ ''}
vs ${all.toLocaleString()} all on${saved > 0 ? `, saving ${saved.toLocaleString()}` : ''} (${pct}% of full) — quality gate pending
${tagRow} diff --git a/ui/onboarding-populate.js b/ui/onboarding-populate.js index 7fbe74b..44eb72a 100644 --- a/ui/onboarding-populate.js +++ b/ui/onboarding-populate.js @@ -1,102 +1,112 @@ // onboarding-populate.js — Seeds memory entries and creates a handoff from onboarding scan results. // Loaded after store.js, before onboarding.js. -function populateOnboardingMemory(scanResults) { - if (!scanResults?.hosts?.length) return; - const mem = MS.getData() || { version: '1.1', entries: [] }; - const skipContent = new Set((mem.entries || []).map((e) => (typeof e === 'string' ? e : e.content || ''))); - const pending = []; - const hosts = scanResults.hosts.filter((h) => { - const n = - h.skills.length + h.configs.length + h.instructions.length + h.rules.length + h.mcpServers.length; - return n > 0 || h.opportunities.length > 0; - }); - if (hosts.length) { - const summary = hosts - .map((h) => { - const p = []; - if (h.skills.length) - p.push(h.skills.reduce((n, s) => n + (s.count || (s.names || []).length), 0) + ' skills'); - if (h.instructions.length) p.push(h.instructions.length + ' instruction files'); - if (h.rules.length) p.push(h.rules.length + ' rule files'); - if (h.configs.length) p.push(h.configs.length + ' config files'); - if (h.mcpServers.length) p.push(h.mcpServers.reduce((n, m) => n + m.count, 0) + ' MCP servers'); - return h.label + ' (' + p.join(', ') + ')'; - }) - .join('; '); - const c = 'AI tools on this machine: ' + summary + '.'; - if (!skipContent.has(c)) - pending.push({ id: 'entry_ob_' + Date.now(), category: 'workspace', label: '', content: c }); - } - const ides = scanResults.ides || []; - if (ides.length) { - const c = 'IDEs installed: ' + ides.map((i) => i.label).join(', ') + '.'; - if (!skipContent.has(c)) - pending.push({ id: 'entry_ob_ide_' + Date.now(), category: 'workspace', label: '', content: c }); - } - const exts = [...new Set(Object.values(scanResults.ideExtensions || {}).flat())]; - if (exts.length) { - const c = 'AI extensions 
detected: ' + exts.join(', ') + '.'; - if (!skipContent.has(c)) - pending.push({ id: 'entry_ob_ext_' + Date.now(), category: 'workspace', label: '', content: c }); - } - if (pending.length) { - mem.entries = [...(mem.entries || []), ...pending]; - MS.save(mem); - } -} - -function populateOnboardingHandoff(scanResults) { - if (!scanResults?.hosts?.length) return; - const hosts = scanResults.hosts.filter((h) => { - const n = - h.skills.length + h.configs.length + h.instructions.length + h.rules.length + h.mcpServers.length; - return n > 0 || h.opportunities.length > 0; - }); - if (!hosts.length) return; - const lines = ['# Onboarding Scan Results', '', '## AI Tools Detected']; - for (const h of hosts) { - lines.push('', '### ' + h.label, 'Path: ' + (h.path || 'N/A')); - if (h.skills.length) { - lines.push('', '**Skills:**'); - h.skills.forEach((sk) => - lines.push('- ' + sk.label + ' (' + (sk.count || (sk.names || []).length) + ')'), - ); +const OnboardingPopulate = (() => { + function populateOnboardingMemory(scanResults) { + if (!scanResults?.hosts?.length) return; + const mem = MS.getData() || { version: '1.1', entries: [] }; + const skipContent = new Set( + (mem.entries || []).map((e) => (typeof e === 'string' ? 
e : e.content || '')), + ); + const pending = []; + const hosts = scanResults.hosts.filter((h) => { + const n = + h.skills.length + h.configs.length + h.instructions.length + h.rules.length + h.mcpServers.length; + return n > 0 || h.opportunities.length > 0; + }); + if (hosts.length) { + const summary = hosts + .map((h) => { + const p = []; + if (h.skills.length) + p.push(h.skills.reduce((n, s) => n + (s.count || (s.names || []).length), 0) + ' skills'); + if (h.instructions.length) p.push(h.instructions.length + ' instruction files'); + if (h.rules.length) p.push(h.rules.length + ' rule files'); + if (h.configs.length) p.push(h.configs.length + ' config files'); + if (h.mcpServers.length) p.push(h.mcpServers.reduce((n, m) => n + m.count, 0) + ' MCP servers'); + return h.label + ' (' + p.join(', ') + ')'; + }) + .join('; '); + const c = 'AI tools on this machine: ' + summary + '.'; + if (!skipContent.has(c)) + pending.push({ id: 'entry_ob_' + Date.now(), category: 'workspace', label: '', content: c }); + } + const ides = scanResults.ides || []; + if (ides.length) { + const c = 'IDEs installed: ' + ides.map((i) => i.label).join(', ') + '.'; + if (!skipContent.has(c)) + pending.push({ id: 'entry_ob_ide_' + Date.now(), category: 'workspace', label: '', content: c }); } - if (h.instructions.length) { - lines.push('', '**Instructions:**'); - h.instructions.forEach((i) => lines.push('- ' + i.label + ' (' + i.path + ')')); + const exts = [...new Set(Object.values(scanResults.ideExtensions || {}).flat())]; + if (exts.length) { + const c = 'AI extensions detected: ' + exts.join(', ') + '.'; + if (!skipContent.has(c)) + pending.push({ id: 'entry_ob_ext_' + Date.now(), category: 'workspace', label: '', content: c }); } - if (h.rules.length) { - lines.push('', '**Rules:**'); - h.rules.forEach((r) => lines.push('- ' + r.label + ' (' + r.path + ')')); + if (pending.length) { + mem.entries = [...(mem.entries || []), ...pending]; + MS.save(mem); } - if (h.configs.length) { - 
lines.push('', '**Configs:**'); - h.configs.forEach((c) => lines.push('- ' + c.label + ' (' + c.path + ')')); + } + + function populateOnboardingHandoff(scanResults) { + if (!scanResults?.hosts?.length) return; + const hosts = scanResults.hosts.filter((h) => { + const n = + h.skills.length + h.configs.length + h.instructions.length + h.rules.length + h.mcpServers.length; + return n > 0 || h.opportunities.length > 0; + }); + if (!hosts.length) return; + const lines = ['# Onboarding Scan Results', '', '## AI Tools Detected']; + for (const h of hosts) { + lines.push('', '### ' + h.label, 'Path: ' + (h.path || 'N/A')); + if (h.skills.length) { + lines.push('', '**Skills:**'); + h.skills.forEach((sk) => + lines.push('- ' + sk.label + ' (' + (sk.count || (sk.names || []).length) + ')'), + ); + } + if (h.instructions.length) { + lines.push('', '**Instructions:**'); + h.instructions.forEach((i) => lines.push('- ' + i.label + ' (' + i.path + ')')); + } + if (h.rules.length) { + lines.push('', '**Rules:**'); + h.rules.forEach((r) => lines.push('- ' + r.label + ' (' + r.path + ')')); + } + if (h.configs.length) { + lines.push('', '**Configs:**'); + h.configs.forEach((c) => lines.push('- ' + c.label + ' (' + c.path + ')')); + } + if (h.mcpServers.length) { + lines.push('', '**MCP Servers:**'); + h.mcpServers.forEach((m) => lines.push('- ' + m.path + ' (' + m.count + ' servers)')); + } + if (h.opportunities.length) { + lines.push('', '**Opportunities:**'); + h.opportunities.forEach((o) => lines.push('- ' + o.label + ': ' + o.description)); + } } - if (h.mcpServers.length) { - lines.push('', '**MCP Servers:**'); - h.mcpServers.forEach((m) => lines.push('- ' + m.path + ' (' + m.count + ' servers)')); + if ((scanResults.ides || []).length) { + lines.push('', '## IDEs'); + scanResults.ides.forEach((ide) => lines.push('- ' + ide.label + ' (' + ide.path + ')')); } - if (h.opportunities.length) { - lines.push('', '**Opportunities:**'); - h.opportunities.forEach((o) => lines.push('- ' 
+ o.label + ': ' + o.description)); + const extByIde = scanResults.ideExtensions || {}; + if (Object.keys(extByIde).length) { + lines.push('', '## AI Extensions'); + for (const [ide, exts] of Object.entries(extByIde)) lines.push('- ' + ide + ': ' + exts.join(', ')); } + return apiFetch('/handoffs', 'POST', { + title: 'Onboarding Scan', + body: lines.join('\n'), + type: 'thread', + thread_tag: 'onboarding-scan', + }).catch(() => {}); } - if ((scanResults.ides || []).length) { - lines.push('', '## IDEs'); - scanResults.ides.forEach((ide) => lines.push('- ' + ide.label + ' (' + ide.path + ')')); - } - const extByIde = scanResults.ideExtensions || {}; - if (Object.keys(extByIde).length) { - lines.push('', '## AI Extensions'); - for (const [ide, exts] of Object.entries(extByIde)) lines.push('- ' + ide + ': ' + exts.join(', ')); - } - return apiFetch('/handoffs', 'POST', { - title: 'Onboarding Scan', - body: lines.join('\n'), - type: 'thread', - thread_tag: 'onboarding-scan', - }).catch(() => {}); -} + + return { populateMemory: populateOnboardingMemory, populateHandoff: populateOnboardingHandoff }; +})(); + +// Convenience shims so onboarding.js can call without namespace +const populateOnboardingMemory = (scanResults) => OnboardingPopulate.populateMemory(scanResults); +const populateOnboardingHandoff = (scanResults) => OnboardingPopulate.populateHandoff(scanResults); diff --git a/ui/onboarding-render.js b/ui/onboarding-render.js index 1043289..91643cd 100644 --- a/ui/onboarding-render.js +++ b/ui/onboarding-render.js @@ -1,4 +1,4 @@ -// @ts-nocheck +// @ts-nocheck — Path-A backlog: new onboarding modules, typing deferred to post-merge // onboarding-render.js — Host and IDE card markup for the onboarding scan results. 
const OnboardingRender = (() => { diff --git a/ui/onboarding.js b/ui/onboarding.js index b38bd66..a8e49e5 100644 --- a/ui/onboarding.js +++ b/ui/onboarding.js @@ -1,4 +1,4 @@ -// @ts-nocheck +// @ts-nocheck — Path-A backlog: new onboarding modules, typing deferred to post-merge // onboarding.js — Full-window 3-step setup: scan → build → done. const Onboarding = (() => { @@ -15,7 +15,7 @@ const Onboarding = (() => { let scanning = false; let scanPhase = 'config'; let customScanPaths = []; - let scanConfig = { drives: true, homedir: true, workspaces: true }; + let scanConfig = { drives: false, homedir: true, workspaces: true }; let indexing = false; let buildDone = false; let hosts = []; @@ -376,6 +376,7 @@ const Onboarding = (() => { } function advanceProgress(pct, label) { + if (!mounted) return; const fill = document.getElementById('ob-progress-fill'); const lbl = document.getElementById('ob-progress-label'); if (fill) fill.style.width = pct + '%'; From 9ba3bb1797179a517865efe31a229bce85ad12f1 Mon Sep 17 00:00:00 2001 From: James Chapman Date: Wed, 20 May 2026 01:12:14 +0100 Subject: [PATCH 6/7] Fix review issues: stem guard, cache drives, dedup isDir, typo, null check MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Raise stripSuffix minimum length from 3 to 4 to prevent "rules"→"rul" - Cache getDriveRoots() result to avoid double I/O when skipDrives=false - Deduplicate isDir: export from system-scan-ides, import in system-scan - Fix typo: probeIdegGroup → probeIdeGroup - Use explicit null check for OPPORTUNITY_FILES entries - Add TODO documenting sync countSkillFiles/listSkillNames dependency Co-Authored-By: Claude Opus 4.7 --- server/lib/system-scan-ides.js | 2 +- server/lib/system-scan.js | 22 ++++++++-------------- server/lib/vectorstore.js | 7 ++++--- 3 files changed, 13 insertions(+), 18 deletions(-) diff --git a/server/lib/system-scan-ides.js b/server/lib/system-scan-ides.js index f837b5c..1e6ffc9 100644 --- 
a/server/lib/system-scan-ides.js +++ b/server/lib/system-scan-ides.js @@ -113,4 +113,4 @@ async function probeAIExtensions() { return perIde; } -module.exports = { probeIDEs, probeAIExtensions }; +module.exports = { probeIDEs, probeAIExtensions, isDir }; diff --git a/server/lib/system-scan.js b/server/lib/system-scan.js index 704c7ae..d5005d1 100644 --- a/server/lib/system-scan.js +++ b/server/lib/system-scan.js @@ -5,6 +5,9 @@ const fs = require('fs'); const path = require('path'); const { HOMEDIR, DATA_DIR, SKILLS_DIR } = require('./config'); +// TODO: countSkillFiles and listSkillNames are synchronous and block the event loop. +// Converting them to async (fs.promises) would require changes across skills.js and +// all callers. Until then, calls here are the main latency source in scanSystem. const { countSkillFiles, listSkillNames } = require('./skills'); const { HOSTS, @@ -13,7 +16,7 @@ const { CONFIG_FILE_NAMES, OPPORTUNITY_FILES, } = require('./system-scan-definitions'); -const { probeIDEs, probeAIExtensions } = require('./system-scan-ides'); +const { probeIDEs, probeAIExtensions, isDir } = require('./system-scan-ides'); // ---- Helpers ---- @@ -44,14 +47,6 @@ async function isFile(p) { return false; } } -/** @param {string} p */ -async function isDir(p) { - try { - return (await fs.promises.stat(p)).isDirectory(); - } catch { - return false; - } -} /** @param {string} p */ async function readJsonSafe(p) { @@ -307,7 +302,7 @@ async function probeHostDir(hostDef, homedir) { // Opportunities (missing global config) const expected = OPPORTUNITY_FILES[/** @type {keyof typeof OPPORTUNITY_FILES} */ (hostDef.id)]; - if (expected) { + if (expected != null) { const filePath = path.join(hostPath, expected); const homedirFile = path.join(homedir, expected); // If config doesn't exist inside host dir or at homedir root @@ -324,7 +319,7 @@ async function probeHostDir(hostDef, homedir) { } /** @param {Array<{id: string, label: string, path: string, exe: string}>} ideList 
*/ -async function probeIdegGroup(ideList) { +async function probeIdeGroup(ideList) { if (!ideList.length) return null; const perIde = await probeAIExtensions(); return { @@ -362,8 +357,8 @@ async function scanSystem(customPaths = [], opts = {}) { } // Probe host dirs from drives + const drives = !skipDrives ? await getDriveRoots() : []; if (!skipDrives) { - const drives = await getDriveRoots(); const driveResults = await Promise.all( drives.flatMap((drive) => HOSTS.map(async (h) => { @@ -445,7 +440,6 @@ async function scanSystem(customPaths = [], opts = {}) { await scanStandaloneFiles(HOMEDIR, hosts); } if (!skipDrives) { - const drives = await getDriveRoots(); await Promise.all(drives.map((d) => scanStandaloneFiles(d, hosts))); } await Promise.all(workspaces.map((/** @type {string} */ ws) => scanStandaloneFiles(ws, hosts))); @@ -462,7 +456,7 @@ async function scanSystem(customPaths = [], opts = {}) { ); // IDE group - const ides = await probeIdegGroup(ideList); + const ides = await probeIdeGroup(ideList); return { hosts: populated, diff --git a/server/lib/vectorstore.js b/server/lib/vectorstore.js index 04554e6..4823030 100644 --- a/server/lib/vectorstore.js +++ b/server/lib/vectorstore.js @@ -225,14 +225,15 @@ function bareSkillId(skillId) { } /** - * Strip common suffixes for loose matching, but never shrink below 3 chars - * to avoid false matches (e.g. "string" → "str", "processed" → "process"). + * Strip common suffixes for loose matching, but preserve enough characters + * to avoid false matches (e.g. "string" → "str", "rules" → "rul"). + * Minimum remaining length of 4 ensures stems like "make" stay intact. 
* @param {string} word */ function stripSuffix(word) { let stem = word; for (const suffix of ['ing', 'ed', 'es', 's']) { - if (stem.endsWith(suffix) && stem.length - suffix.length >= 3) { + if (stem.endsWith(suffix) && stem.length - suffix.length >= 4) { stem = stem.slice(0, -suffix.length); break; } From ce456cd1e2807d22706cc8b96c0ed33881fcb56b Mon Sep 17 00:00:00 2001 From: James Chapman Date: Wed, 20 May 2026 01:12:57 +0100 Subject: [PATCH 7/7] Bump version to 0.5.0 Co-Authored-By: Claude Opus 4.7 --- package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package.json b/package.json index a7738cf..183caae 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "context-engine", - "version": "0.4.0", + "version": "0.5.0", "description": "Local-first continuity layer for AI work across tools, providers, and fresh sessions.", "main": "electron/main.cjs", "bin": {