diff --git a/tests/test_judge_seed_test_embed.py b/tests/test_judge_seed_test_embed.py new file mode 100644 index 0000000..4b37a04 --- /dev/null +++ b/tests/test_judge_seed_test_embed.py @@ -0,0 +1,88 @@ +"""The judge needs to see the per-task seed test file. + +It lives in HEAD after the seed commit, so it never appears in `task_diff`. +Without it the judge can't tell whether the assertions actually pin down the +acceptance criteria — and false-rejects on file-existence criteria. See #16. +""" + +from __future__ import annotations + +from pathlib import Path + +from tilth import loop + + +def _task(task_id: str = "T-001") -> dict: + return { + "id": task_id, + "title": "Scaffold the project", + "description": "Bootstrap the package.", + "acceptance_criteria": [ + "todo_cli/ exists", + f"tests/test_{task_id.lower().replace('-', '')}_scaffold.py exists", + ], + } + + +def _write_test_file(worktree: Path, filename: str, body: str) -> Path: + tests_dir = worktree / "tests" + tests_dir.mkdir(parents=True, exist_ok=True) + p = tests_dir / filename + p.write_text(body) + return p + + +def test_loader_returns_matching_test_file(tmp_path: Path): + _write_test_file( + tmp_path, + "test_t001_scaffold.py", + "def test_pkg_exists():\n pass\n", + ) + rel, content = loop._load_seed_test(tmp_path, "T-001") + assert rel == "tests/test_t001_scaffold.py" + assert "test_pkg_exists" in content + + +def test_loader_empty_when_no_tests_dir(tmp_path: Path): + rel, content = loop._load_seed_test(tmp_path, "T-001") + assert (rel, content) == ("", "") + + +def test_loader_empty_when_no_matching_file(tmp_path: Path): + _write_test_file(tmp_path, "test_t002_other.py", "def test_x(): pass\n") + rel, content = loop._load_seed_test(tmp_path, "T-001") + assert (rel, content) == ("", "") + + +def test_loader_truncates_oversized_content(tmp_path: Path): + body = "x" * (loop.JUDGE_TEST_FILE_MAX_CHARS + 500) + _write_test_file(tmp_path, "test_t001_big.py", body) + _, content = loop._load_seed_test(tmp_path, "T-001") + assert len(content) < len(body) + assert "truncated" in content + + +def test_judge_prompt_embeds_seed_test(tmp_path: Path): + body = "def test_pkg_exists():\n import todo_cli\n" + _write_test_file(tmp_path, "test_t001_scaffold.py", body) + msg = loop._build_judge_user_message(_task("T-001"), tmp_path, diff="+ new line\n") + assert "## Seed acceptance test" in msg + assert "tests/test_t001_scaffold.py" in msg + assert "import todo_cli" in msg + # The "already on disk, not in this diff" framing must reach the judge — + # this is the rule that resolves the #16 file-existence false reject. + assert "not in this diff" in msg + + +def test_judge_prompt_omits_section_when_no_test_file(tmp_path: Path): + msg = loop._build_judge_user_message(_task("T-001"), tmp_path, diff="+ x\n") + assert "## Seed acceptance test" not in msg + + +def test_judge_prompt_still_includes_diff_and_criteria(tmp_path: Path): + _write_test_file(tmp_path, "test_t001_scaffold.py", "def test_x(): pass\n") + msg = loop._build_judge_user_message(_task("T-001"), tmp_path, diff="+ added\n") + assert "## Acceptance criteria" in msg + assert "- todo_cli/ exists" in msg + assert "+ added" in msg + assert "Respond with strict JSON only." in msg diff --git a/tilth/loop.py b/tilth/loop.py index e216a6c..e9fb9e1 100644 --- a/tilth/loop.py +++ b/tilth/loop.py @@ -82,22 +82,34 @@ def _propose_learning_prompt() -> str: # --- judge ------------------------------------------------------------------ JUDGE_DIFF_MAX_CHARS = 12_000 +JUDGE_TEST_FILE_MAX_CHARS = 8_000 + + +def _load_seed_test(worktree: Path, task_id: str) -> tuple[str, str]: + """Return (relative_path, content) for the seed acceptance test matching + `task_id`, or ("", "") if none. Mirrors the glob used by `run_pytest` + so the judge sees exactly the file the validator just ran.""" + tests_dir = worktree / "tests" + if not tests_dir.is_dir(): + return "", "" + matches = sorted(tests_dir.glob(validators._task_test_glob(task_id))) + if not matches: + return "", "" + path = matches[0] + try: + raw = path.read_text() + except OSError: + return "", "" + if len(raw) > JUDGE_TEST_FILE_MAX_CHARS: + raw = raw[:JUDGE_TEST_FILE_MAX_CHARS] + f"\n... [truncated, total {len(raw)} chars]" + return f"tests/{path.name}", raw -def _judge_task( - task: dict[str, Any], - worktree: Path, - client: LLMClient, - session: Session, - iter_n: int, - trace_id: str, - span_id: str, -) -> tuple[bool, str]: - """Call the judge in a fresh context. Returns (accept, reasoning).""" - diff = ws.task_diff(worktree) - if len(diff) > JUDGE_DIFF_MAX_CHARS: - diff = diff[:JUDGE_DIFF_MAX_CHARS] + f"\n... [truncated, total {len(diff)} chars]" - +def _build_judge_user_message( + task: dict[str, Any], worktree: Path, diff: str +) -> str: + """Assemble the user-side judge prompt. Pure function on (task, worktree, diff) + so it can be tested without mocking the LLM client.""" parts = [ f"# Task to evaluate: {task['id']} — {task['title']}", "", @@ -106,6 +118,7 @@ def _judge_task( criteria = task.get("acceptance_criteria") or [] if criteria: parts += ["", "## Acceptance criteria"] + [f"- {c}" for c in criteria] + agents_md = memory.load_agents_md(worktree) if agents_md.strip(): parts += [ @@ -114,6 +127,24 @@ def _judge_task( "", agents_md.rstrip(), ] + + test_rel, test_content = _load_seed_test(worktree, task["id"]) + if test_content: + parts += [ + "", + "## Seed acceptance test (already on disk, not in this diff)", + "", + f"This file ({test_rel}) was committed at seed time and is what pytest " + "just ran. Any acceptance criterion referencing its existence is satisfied " + "by its presence in HEAD. Review the assertions as part of the contract — " + "if they fail to pin down a criterion, or pin down something other than " + "what the description asks for, reject the test rather than rationalise it.", + "", + "```python", + test_content.rstrip(), + "```", + ] + parts += [ "", "## Validator status", @@ -128,9 +159,26 @@ def _judge_task( "", "Respond with strict JSON only.", ] + return "\n".join(parts) + + +def _judge_task( + task: dict[str, Any], + worktree: Path, + client: LLMClient, + session: Session, + iter_n: int, + trace_id: str, + span_id: str, +) -> tuple[bool, str]: + """Call the judge in a fresh context. Returns (accept, reasoning).""" + diff = ws.task_diff(worktree) + if len(diff) > JUDGE_DIFF_MAX_CHARS: + diff = diff[:JUDGE_DIFF_MAX_CHARS] + f"\n... [truncated, total {len(diff)} chars]" + judge_messages = [ {"role": "system", "content": _judge_prompt()}, - {"role": "user", "content": "\n".join(parts)}, + {"role": "user", "content": _build_judge_user_message(task, worktree, diff)}, ] resp = client.chat(judge_messages, model=client.config.judge_model) usage = resp.get("usage") or {} diff --git a/tilth/prompts/judge.md b/tilth/prompts/judge.md index a2c5e7a..76af13d 100644 --- a/tilth/prompts/judge.md +++ b/tilth/prompts/judge.md @@ -2,28 +2,36 @@ You are an independent code reviewer judging whether a single development task w You have **no memory of how the work was done** — you see only: 1. The task description and acceptance criteria. -2. The diff that was produced. -3. The objective validator results (already passed; if they hadn't, you wouldn't be called). +2. The seed acceptance test file (already on disk; not in the diff because it was committed at seed time). +3. The diff that was produced for this task. +4. The objective validator results (already passed; if they hadn't, you wouldn't be called). -Your job is to decide whether the diff actually satisfies the task's intent and acceptance criteria — beyond just "the tests pass." +The test file and the implementation are **joint subjects of review**. Tests can be wrong. A passing test is not authoritative if it fails to pin down a criterion, or pins down something other than what the description asks for. ## How to think -Worker agents reliably skew positive when grading their own work. You exist to catch that. Common failure shapes to look for: +Worker agents reliably skew positive when grading their own work, and seed tests are not infallible. You exist to catch both. Common failure shapes to look for: -- **Tests pass but the fix is wrong** — the change satisfies the test letter but not the intent (e.g., hardcoding a value, mocking the wrong thing, deleting the failing assertion). -- **Acceptance gap** — one of the explicit acceptance criteria is not actually satisfied by the diff. +- **Acceptance gap** — one of the acceptance criteria is not actually pinned down by any assertion in the seed test, *or* not satisfied by the diff. For each criterion, name the assertion that proves it. If you can't, reject. +- **Tests pass but the fix is wrong** — the change satisfies the test letter but not the intent. Hardcoded return values, mocking the thing under test, branching on test-specific inputs, deleting or weakening the failing assertion. Read the test and the diff together. +- **Weak test** — the assertion is decorative, tautological, or tests something other than the criterion it claims to cover. (E.g. an AC about exit codes and stderr where the test only checks `returncode == 0`.) Reject the test, not the implementation. - **Half-finished work** — debug prints, TODO comments, dead code, or partial implementations left in. - **Spec violation** — the implementation works but breaks an *explicit, named* constraint from the task description, the acceptance criteria, or AGENTS.md (provided as project context when present). Soft style preferences ("we usually prefer X") are not rejectable; only explicit constraints are. +When you reject on test grounds (Weak test), say so explicitly in your reasoning — the worker is allowed to edit the seed test to address a judge-named gap, but only when you've named it. Don't invite cosmetic test edits. + +## File-existence criteria + +The seed test file is in HEAD from the seed commit and therefore does **not** appear in the diff. Any acceptance criterion phrased as "tests/ exists" is satisfied by its presence on disk — see the test-file section in this prompt. Do not reject for a missing test file when the section is present. + ## Hard rejects (no judgement call) These two are mechanical — reject without weighing other evidence: - **Empty diff → reject.** A task that produces no diff did no work in this task, regardless of whether the eventual workspace state matches the criteria. The reasoning must say "no work was performed in this task". Do not rationalise an empty diff as success because earlier work happened to leave things in the right state. -- **Scope creep → reject.** If the diff adds, modifies, or deletes any file that is not part of *this* task's acceptance criteria, reject — even when the criteria are otherwise met and the extra files look like working code. Name the specific unrelated paths in your reasoning. A future task is not justification; the worker must not pre-empt later work. +- **Scope creep → reject.** If the diff adds, modifies, or deletes any file that is not part of *this* task's acceptance criteria, reject — even when the criteria are otherwise met and the extra files look like working code. Name the specific unrelated paths in your reasoning. A future task is not justification; the worker must not pre-empt later work. (Edits to the seed test file are scope creep unless the judge has explicitly named a test-side gap on a prior iteration.) -When the diff is in scope and addresses the criteria cleanly, accept. Don't invent reasons to reject. +When the diff is in scope, the seed test pins down the criteria, and the implementation satisfies them cleanly, accept. Don't invent reasons to reject. ## How to respond @@ -36,4 +44,4 @@ Respond with **strict JSON only**, no prose around it: } ``` -If `reject`, your reasoning must point at a specific concern — name the file, the line, or the criterion that's not met. Vague rejections waste worker iterations. +If `reject`, your reasoning must point at a specific concern — name the file, the line, the assertion, or the criterion that's not met. Vague rejections waste worker iterations.