diff --git a/tests/test_event_recording.py b/tests/test_event_recording.py index a4f8525..26177c3 100644 --- a/tests/test_event_recording.py +++ b/tests/test_event_recording.py @@ -1,4 +1,5 @@ import sys +import re from pathlib import Path sys.path.insert(0, str(Path(__file__).resolve().parents[1] / "tools")) @@ -77,4 +78,7 @@ def test_phase_transcript_does_not_truncate_existing_file(tmp_path, monkeypatch) assert existing.read_text(encoding="utf-8") == "keep me\n" assert transcript.path != existing - assert transcript.path.name.startswith("last-phase-1c-no-finding-attempt-1-") + assert re.fullmatch( + r"last-phase-1c-no-finding-attempt-1-\d{8}-\d{6}-pid\d+\.jsonl", + transcript.path.name, + ) diff --git a/tests/test_findings_quality.py b/tests/test_findings_quality.py new file mode 100644 index 0000000..b156fac --- /dev/null +++ b/tests/test_findings_quality.py @@ -0,0 +1,99 @@ +from __future__ import annotations + +import sys +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).resolve().parents[1] / "tools")) + + +def _write_phase2_finding(path: Path, *, title: str, category: str, target_area: str, summary: str) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text( + "---\n" + "id: \"CC-0099\"\n" + f"title: \"{title}\"\n" + "status: \"PENDING\"\n" + "severity: \"MEDIUM\"\n" + "cvss_v4:\n vector: \"\"\n score: 0.0\n justification: \"\"\n" + "confidence: \"MEDIUM\"\n" + f"category: \"{category}\"\n" + "cwe: [\"CWE-287\"]\n" + "language: \"c\"\n" + f"target_area: \"{target_area}\"\n" + "files: [\"scheduler/ipp.c\"]\n" + "symbols: [\"ippReadIO\"]\n" + "entry_points: [\"IPP request parser\"]\n" + "sources: [\"network IPP request\"]\n" + "sinks: [\"authorization decision\"]\n" + "trust_boundary: \"remote client to scheduler\"\n" + "assets_at_risk: [\"scheduler authorization state\"]\n" + "validation:\n status: \"NOT_STARTED\"\n methods: []\n evidence_dir: \"itemdb/evidence/CC-0099\"\n summary: \"\"\n" + "exploitation:\n status: \"NOT_STARTED\"\n impact_demonstrated: \"\"\n exploit_type: \"\"\n severity_before: \"\"\n severity_after: \"\"\n artifacts_dir: \"itemdb/evidence/CC-0099/exploits\"\n summary: \"\"\n" + "created_at: \"2026-06-18\"\n" + "updated_at: \"2026-06-18\"\n" + "---\n\n" + "# Summary\n\n" + f"{summary}\n\n" + "# Target context\n\n" + "The CUPS scheduler accepts IPP requests from remote clients and maps request metadata into authorization state.\n\n" + "# Affected code\n\n" + "The affected path is `scheduler/ipp.c` in the IPP request parser and authorization handoff.\n\n" + "# Vulnerability hypothesis\n\n" + "A remote client may control the user identity attribute that reaches an authorization decision without canonicalization.\n\n" + "# Source-to-sink reasoning\n\n" + "The source is a network IPP request. The parser copies the identity attribute into scheduler request state. The sink is an authorization check that trusts that state.\n\n" + "# Attackability / trigger conditions\n\n" + "An unauthenticated remote client can send a crafted IPP request before authorization is evaluated.\n\n" + "# Impact\n\n" + "Successful exploitation could bypass authorization checks and perform scheduler operations as a more privileged user.\n\n" + "# Validation plan\n\n" + "Send crafted IPP requests with controlled identity attributes and compare the scheduler authorization outcome against a baseline request.\n\n" + "# Counter-analysis\n\n" + "Review parser normalization, authentication layers, and later authorization checks to determine whether attacker control is removed before the sink.\n\n" + "# Validation result\n\n" + "Pending.\n\n" + "# Evidence\n\n" + "Pending.\n", + encoding="utf-8", + ) + + +def test_phase2_quality_rejects_test_template_artifact(tmp_path: Path) -> None: + from findings.quality import validate_phase2_finding_quality + + finding = tmp_path / "itemdb" / "findings" / "PENDING" / "CC-0099-test-finding.md" + _write_phase2_finding( + finding, + title="Test finding to see template", + category="Test", + target_area="Testing", + summary="This is a test finding created to verify the template system. It does not represent an actual vulnerability.", + ) + + errors = validate_phase2_finding_quality(finding) + + assert any("test/template artifact" in error for error in errors), errors + assert any("not an actual target vulnerability" in error for error in errors), errors + + +def test_phase2_quality_rejects_case_insensitive_template_markers(tmp_path: Path) -> None: + from findings.quality import validate_phase2_finding_quality + + finding = tmp_path / "itemdb" / "findings" / "PENDING" / "CC-0099-case-variant.md" + _write_phase2_finding( + finding, + title="IPP identity bypass in request parser", + category="Auth", + target_area="Scheduler", + summary="A remote client may inject identity attributes.", + ) + content = finding.read_text(encoding="utf-8") + content = content.replace( + "A remote client may control the user identity attribute", + "briefly describe the suspected vulnerability. the parser accepts unvalidated identity attributes.", + ) + finding.write_text(content, encoding="utf-8") + + errors = validate_phase2_finding_quality(finding) + + assert any("contains template guidance" in error for error in errors), errors diff --git a/tests/test_gate_check.py b/tests/test_gate_check.py index 4ab9982..b4e06f1 100644 --- a/tests/test_gate_check.py +++ b/tests/test_gate_check.py @@ -127,3 +127,18 @@ def test_gate_phase_4_rejects_wrong_status(tmp_path, monkeypatch): assert exit_code == 1 finally: gates_module.ROOT = original_root + + +def test_gate_phase_3_no_pending_is_noop_success(tmp_path, capsys): + original_root = gates_module.ROOT + gates_module.ROOT = tmp_path + (tmp_path / "itemdb" / "findings" / "PENDING").mkdir(parents=True) + + try: + exit_code = gates_module.gate_phase_3() + finally: + gates_module.ROOT = original_root + + out = capsys.readouterr().out + assert exit_code == 0 + assert "nothing to review" in out diff --git a/tests/test_phase_artifacts_cli.py b/tests/test_phase_artifacts_cli.py index b92bf7a..bdba1be 100644 --- a/tests/test_phase_artifacts_cli.py +++ b/tests/test_phase_artifacts_cli.py @@ -216,3 +216,85 @@ def test_has_valid_threat_model_returns_false_when_missing(tmp_path: Path) -> No with patch("phases.artifact_checks.ROOT", tmp_path): assert not has_valid_threat_model() + + +def test_phase2_artifacts_accept_explicit_no_findings_summary(tmp_path: Path) -> None: + from phases.artifact_checks import check_phase_2_artifacts + + runs = tmp_path / "runs" + runs.mkdir(parents=True) + (runs / "phase-2-summary-2026-06-16-120000.md").write_text( + "# Findings created\n\n" + "| ID | Title | Path |\n" + "|---|---|---|\n" + "| - | None. | - |\n", + encoding="utf-8", + ) + + with patch("phases.artifact_checks.ROOT", tmp_path): + assert check_phase_2_artifacts() == [] + + +def test_phase2_artifacts_reject_stub_finding(tmp_path: Path) -> None: + from phases.artifact_checks import check_phase_2_artifacts + + runs = tmp_path / "runs" + pending = tmp_path / "itemdb" / "findings" / "PENDING" + runs.mkdir(parents=True) + pending.mkdir(parents=True) + (runs / "phase-2-summary-2026-06-16-120000.md").write_text( + "# Findings created\n\n" + "| ID | Title | Path |\n" + "|---|---|---|\n" + "| CC-0001 | Stub | itemdb/findings/PENDING/CC-0001-stub.md |\n", + encoding="utf-8", + ) + (pending / "CC-0001-stub.md").write_text( + "---\n" + "id: \"CC-0001\"\n" + "title: \"Stub\"\n" + "status: \"PENDING\"\n" + "severity: \"MEDIUM\"\n" + "cvss_v4:\n vector: \"\"\n score: 0.0\n justification: \"\"\n" + "confidence: \"LOW\"\ncategory: \"Unclassified\"\ncwe: []\nlanguage: \"unknown\"\ntarget_area: \"unknown\"\n" + "files: []\nsymbols: []\nentry_points: []\nsources: []\nsinks: []\ntrust_boundary: \"unknown\"\nassets_at_risk: []\n" + "validation:\n status: \"NOT_STARTED\"\n methods: []\n evidence_dir: \"itemdb/evidence/CC-0001\"\n summary: \"\"\n" + "exploitation:\n status: \"NOT_STARTED\"\n impact_demonstrated: \"\"\n exploit_type: \"\"\n severity_before: \"\"\n severity_after: \"\"\n artifacts_dir: \"itemdb/evidence/CC-0001/exploits\"\n summary: \"\"\n" + "created_at: \"2026-06-16\"\nupdated_at: \"2026-06-16\"\n---\n\n# Summary\n\nPending.\n", + encoding="utf-8", + ) + + with patch("phases.artifact_checks.ROOT", tmp_path): + errors = check_phase_2_artifacts() + + assert any("not a complete Phase 2 finding" in error for error in errors), errors + + +def test_phase2_artifacts_report_all_quality_errors(tmp_path: Path, monkeypatch) -> None: + from phases.artifact_checks import check_phase_2_artifacts + from findings import quality as quality_mod + + runs = tmp_path / "runs" + pending = tmp_path / "itemdb" / "findings" / "PENDING" + runs.mkdir(parents=True) + pending.mkdir(parents=True) + (runs / "phase-2-summary-2026-06-18-120000.md").write_text( + "# Findings created\n\n" + "| ID | Title | Path |\n" + "|---|---|---|\n" + "| CC-0099 | Many | itemdb/findings/PENDING/CC-0099-many-errors.md |\n", + encoding="utf-8", + ) + (pending / "CC-0099-many-errors.md").write_text("placeholder", encoding="utf-8") + monkeypatch.setattr( + quality_mod, + "validate_phase2_finding_quality", + lambda _path: [f"artifact-error-{i}" for i in range(7)], + ) + + with patch("phases.artifact_checks.ROOT", tmp_path): + errors = check_phase_2_artifacts() + + joined = "\n".join(errors) + for i in range(7): + assert f"artifact-error-{i}" in joined diff --git a/tests/test_phase_failure_state_reset.py b/tests/test_phase_failure_state_reset.py index 2442577..3c15321 100644 --- a/tests/test_phase_failure_state_reset.py +++ b/tests/test_phase_failure_state_reset.py @@ -117,6 +117,138 @@ def fake_resume_prompt(*_args, failure_details=None, **_kw): assert captured == [["stale failure from previous attempt"], None] +def test_phase_mode_terminal_stop_missing_artifacts_auto_resumes(monkeypatch): + from codecome import harness as harness_mod + from codecome import runner as runner_mod + + transcript = harness_mod.ROOT / "tmp" / "fake.jsonl" + attempts = iter([ + (0, "ses_test", _terminal_result(), transcript), + (0, "ses_test", _terminal_result(), transcript), + ]) + completion_results = iter([ + (False, ["runs/phase-2-summary*.md was not updated during this run"]), + (True, []), + ]) + captured: list[list[str] | None] = [] + prompts: list[str] = [] + + def fake_run_single_attempt(_args, _console, prompt, *_a, **_kw): + prompts.append(prompt) + return next(attempts) + + def fake_resume_prompt(*_args, failure_details=None, **_kw): + captured.append(failure_details) + return "resume prompt" + + monkeypatch.setattr(harness_mod, "ServerRunner", lambda: _FakeServerRunner()) + monkeypatch.setenv("CODECOME_MAX_ITERATION_RETRIES", "1") + monkeypatch.setattr(harness_mod, "load_prompt", lambda *_a, **_kw: "initial prompt") + monkeypatch.setattr(harness_mod, "resolve_runtime_config", lambda _agent: _FakeRuntimeConfig()) + monkeypatch.setattr(harness_mod, "configure_rendering", lambda *_a, **_kw: None) + monkeypatch.setattr(runner_mod, "_run_single_attempt", fake_run_single_attempt) + monkeypatch.setattr( + harness_mod, + "check_phase_graceful_completion", + lambda *_a, **_kw: next(completion_results), + ) + monkeypatch.setattr(harness_mod, "build_phase_resume_prompt", fake_resume_prompt) + + from findings import checks_entry + monkeypatch.setattr(checks_entry, "run_frontmatter_validation", lambda: (0, "")) + + rc = harness_mod.run_phase_mode(_args()) + + assert rc == 0 + assert prompts == ["initial prompt", "resume prompt"] + assert captured == [["runs/phase-2-summary*.md was not updated during this run"]] + + +def test_phase_mode_final_failure_prints_gate_failures(monkeypatch, capsys): + from codecome import harness as harness_mod + from codecome import runner as runner_mod + + transcript = harness_mod.ROOT / "tmp" / "fake.jsonl" + + monkeypatch.setattr(harness_mod, "ServerRunner", lambda: _FakeServerRunner()) + monkeypatch.setenv("CODECOME_MAX_ITERATION_RETRIES", "0") + monkeypatch.setattr(harness_mod, "load_prompt", lambda *_a, **_kw: "initial prompt") + monkeypatch.setattr(harness_mod, "resolve_runtime_config", lambda _agent: _FakeRuntimeConfig()) + monkeypatch.setattr(harness_mod, "configure_rendering", lambda *_a, **_kw: None) + monkeypatch.setattr( + runner_mod, + "_run_single_attempt", + lambda *_a, **_kw: (0, "ses_test", _terminal_result(), transcript), + ) + monkeypatch.setattr( + harness_mod, + "check_phase_graceful_completion", + lambda *_a, **_kw: (False, ["Invalid: CC-0015 still contains template guidance"]), + ) + + rc = harness_mod.run_phase_mode(_args()) + output = capsys.readouterr().out + + assert rc == 2 + assert "remaining gate failures" in output + assert "CC-0015 still contains template guidance" in output + + +def _budget_result() -> RunResult: + return RunResult( + last_finish_reason="length", + any_step_finish_seen=True, + step_finish_count=1, + ) + + +def test_phase_mode_budget_exhaustion_auto_resumes(monkeypatch): + from codecome import harness as harness_mod + from codecome import runner as runner_mod + + transcript = harness_mod.ROOT / "tmp" / "fake.jsonl" + attempts = iter([ + (0, "ses_test", _budget_result(), transcript), + (0, "ses_test", _budget_result(), transcript), + ]) + completion_results = iter([ + (False, ["runs/phase-2-summary*.md was not updated during this run"]), + (True, []), + ]) + captured: list[list[str] | None] = [] + prompts: list[str] = [] + + def fake_run_single_attempt(_args, _console, prompt, *_a, **_kw): + prompts.append(prompt) + return next(attempts) + + def fake_resume_prompt(*_args, failure_details=None, **_kw): + captured.append(failure_details) + return "resume prompt" + + monkeypatch.setattr(harness_mod, "ServerRunner", lambda: _FakeServerRunner()) + monkeypatch.setenv("CODECOME_MAX_ITERATION_RETRIES", "1") + monkeypatch.setattr(harness_mod, "load_prompt", lambda *_a, **_kw: "initial prompt") + monkeypatch.setattr(harness_mod, "resolve_runtime_config", lambda _agent: _FakeRuntimeConfig()) + monkeypatch.setattr(harness_mod, "configure_rendering", lambda *_a, **_kw: None) + monkeypatch.setattr(runner_mod, "_run_single_attempt", fake_run_single_attempt) + monkeypatch.setattr( + harness_mod, + "check_phase_graceful_completion", + lambda *_a, **_kw: next(completion_results), + ) + monkeypatch.setattr(harness_mod, "build_phase_resume_prompt", fake_resume_prompt) + + from findings import checks_entry + monkeypatch.setattr(checks_entry, "run_frontmatter_validation", lambda: (0, "")) + + rc = harness_mod.run_phase_mode(_args()) + + assert rc == 0 + assert prompts == ["initial prompt", "resume prompt"] + assert captured == [["runs/phase-2-summary*.md was not updated during this run"]] + + def test_phase1_subphase_does_not_reuse_previous_attempt_failures(monkeypatch): from codecome import phase_1 as p1 diff --git a/tests/test_phases_completion.py b/tests/test_phases_completion.py index 22d650a..219540e 100644 --- a/tests/test_phases_completion.py +++ b/tests/test_phases_completion.py @@ -213,8 +213,8 @@ def test_phase1_path_diagnostics_handle_notes_root_outside_root(self, tmp_path): completion_mod.SANDBOX_PLAN_PATH = orig_sandbox_plan completion_mod.ROOT = orig_root - def test_phase2_accepts_summary_with_no_new_findings(self, tmp_path): - """Phase 2 should pass when only the run summary is fresh (no new findings).""" + def test_phase2_accepts_explicit_no_findings_summary(self, tmp_path): + """Phase 2 may pass with zero findings when the summary says so.""" import os import phases.completion as completion_mod @@ -224,7 +224,13 @@ def test_phase2_accepts_summary_with_no_new_findings(self, tmp_path): completion_mod.FINDINGS_ROOT = tmp_path / "itemdb" / "findings" (tmp_path / "runs").mkdir(parents=True, exist_ok=True) summary = tmp_path / "runs" / "phase-2-summary-2026-06-05-143022.md" - summary.write_text("", encoding="utf-8") + summary.write_text( + "# Findings created\n\n" + "| ID | Title | Path |\n" + "|---|---|---|\n" + "| - | None. | - |\n", + encoding="utf-8", + ) run_start = time.time() - 60 os.utime(summary, (run_start + 60, run_start + 60)) @@ -236,6 +242,70 @@ def test_phase2_accepts_summary_with_no_new_findings(self, tmp_path): completion_mod.ROOT = orig_root completion_mod.FINDINGS_ROOT = orig_findings_root + def test_phase2_rejects_vague_no_finding_summary(self, tmp_path): + """A fresh summary without explicit no-finding wording is not enough.""" + import os + import phases.completion as completion_mod + + orig_root = completion_mod.ROOT + orig_findings_root = completion_mod.FINDINGS_ROOT + completion_mod.ROOT = tmp_path + completion_mod.FINDINGS_ROOT = tmp_path / "itemdb" / "findings" + (tmp_path / "runs").mkdir(parents=True, exist_ok=True) + summary = tmp_path / "runs" / "phase-2-summary-2026-06-05-143022.md" + summary.write_text("# Goal\n\nLooked around.\n", encoding="utf-8") + run_start = time.time() - 60 + os.utime(summary, (run_start + 60, run_start + 60)) + + try: + ok, failures = completion_mod.check_phase_graceful_completion("2", None, run_start) + assert ok is False + assert any("does not explicitly state" in f for f in failures), failures + finally: + completion_mod.ROOT = orig_root + completion_mod.FINDINGS_ROOT = orig_findings_root + + def test_phase2_rejects_fresh_stub_finding(self, tmp_path): + """A fresh template-like finding should not satisfy Phase 2 completion.""" + import os + import phases.completion as completion_mod + + orig_root = completion_mod.ROOT + orig_findings_root = completion_mod.FINDINGS_ROOT + completion_mod.ROOT = tmp_path + completion_mod.FINDINGS_ROOT = tmp_path / "itemdb" / "findings" + pending = tmp_path / "itemdb" / "findings" / "PENDING" + pending.mkdir(parents=True, exist_ok=True) + (tmp_path / "runs").mkdir(parents=True, exist_ok=True) + summary = tmp_path / "runs" / "phase-2-summary-2026-06-05-143022.md" + summary.write_text("# Findings created\n\n| ID | Title | Path |\n|---|---|---|\n| CC-0001 | Stub | itemdb/findings/PENDING/CC-0001-stub.md |\n", encoding="utf-8") + finding = pending / "CC-0001-stub.md" + finding.write_text( + "---\n" + "id: \"CC-0001\"\n" + "title: \"Stub\"\n" + "status: \"PENDING\"\n" + "severity: \"MEDIUM\"\n" + "cvss_v4:\n vector: \"\"\n score: 0.0\n justification: \"\"\n" + "confidence: \"LOW\"\ncategory: \"Unclassified\"\ncwe: []\nlanguage: \"unknown\"\ntarget_area: \"unknown\"\n" + "files: []\nsymbols: []\nentry_points: []\nsources: []\nsinks: []\ntrust_boundary: \"unknown\"\nassets_at_risk: []\n" + "validation:\n status: \"NOT_STARTED\"\n methods: []\n evidence_dir: \"itemdb/evidence/CC-0001\"\n summary: \"\"\n" + "exploitation:\n status: \"NOT_STARTED\"\n impact_demonstrated: \"\"\n exploit_type: \"\"\n severity_before: \"\"\n severity_after: \"\"\n artifacts_dir: \"itemdb/evidence/CC-0001/exploits\"\n summary: \"\"\n" + "created_at: \"2026-06-16\"\nupdated_at: \"2026-06-16\"\n---\n\n# Summary\n\nPending.\n", + encoding="utf-8", + ) + run_start = time.time() - 60 + for path in (summary, finding): + os.utime(path, (run_start + 60, run_start + 60)) + + try: + ok, failures = completion_mod.check_phase_graceful_completion("2", None, run_start) + assert ok is False + assert any("Phase 2 finding is incomplete" in f for f in failures), failures + finally: + completion_mod.ROOT = orig_root + completion_mod.FINDINGS_ROOT = orig_findings_root + def test_phase2_failure_details_mention_run_summary(self, tmp_path): """Phase 2 should report missing run summary when nothing is freshened.""" import phases.completion as completion_mod @@ -340,7 +410,45 @@ def test_resume_prompt_with_failure_details_lists_missing_artifacts(self): ) assert "Missing required artifacts:" in prompt assert "runs/phase-2-summary*.md" in prompt - assert "Fix only these missing items." in prompt + assert "Fix every listed item." in prompt + assert "authoritative completion-gate output" in prompt + assert "Do not claim validation passed" in prompt + + def test_phase2_reports_all_quality_errors_without_truncation(self, tmp_path, monkeypatch): + import os + import phases.completion as completion_mod + from findings import quality as quality_mod + + orig_root = completion_mod.ROOT + orig_findings_root = completion_mod.FINDINGS_ROOT + completion_mod.ROOT = tmp_path + completion_mod.FINDINGS_ROOT = tmp_path / "itemdb" / "findings" + pending = tmp_path / "itemdb" / "findings" / "PENDING" + pending.mkdir(parents=True, exist_ok=True) + (tmp_path / "runs").mkdir(parents=True, exist_ok=True) + summary = tmp_path / "runs" / "phase-2-summary-2026-06-18-120000.md" + finding = pending / "CC-0099-many-errors.md" + summary.write_text("# Findings created\n\n| ID | Title | Path |\n|---|---|---|\n| CC-0099 | Many | itemdb/findings/PENDING/CC-0099-many-errors.md |\n", encoding="utf-8") + finding.write_text("placeholder", encoding="utf-8") + run_start = time.time() - 60 + for path in (summary, finding): + os.utime(path, (run_start + 60, run_start + 60)) + monkeypatch.setattr( + quality_mod, + "validate_phase2_finding_quality", + lambda _path: [f"error-{i}" for i in range(7)], + ) + + try: + ok, failures = completion_mod.check_phase_graceful_completion("2", None, run_start) + finally: + completion_mod.ROOT = orig_root + completion_mod.FINDINGS_ROOT = orig_findings_root + + assert ok is False + joined = "\n".join(failures) + for i in range(7): + assert f"error-{i}" in joined def test_resume_prompt_without_failure_details_uses_generic_wording(self): """Resume prompt should use the generic reassess wording when no failure details given.""" @@ -792,7 +900,13 @@ def test_sweep_accepts_phase2_summary_sweep(self, tmp_path): completion_mod.FINDINGS_ROOT = tmp_path / "itemdb" / "findings" (tmp_path / "runs").mkdir(parents=True, exist_ok=True) summary = tmp_path / "runs" / "phase-2-summary-sweep-src-foo-php-2026-06-12-143022.md" - summary.write_text("", encoding="utf-8") + summary.write_text( + "# Findings created\n\n" + "| ID | Title | Path |\n" + "|---|---|---|\n" + "| - | None. | - |\n", + encoding="utf-8", + ) run_start = time.time() - 60 os.utime(summary, (run_start + 60, run_start + 60)) @@ -849,7 +963,8 @@ def test_sweep_resume_prompt_uses_phase2_gate(self): ], ) assert "runs/phase-2-summary*.md" in prompt - assert "Fix only these missing items." in prompt + assert "Fix every listed item." in prompt + assert "authoritative completion-gate output" in prompt class TestPhase3ChecklistMentionsRunSummary: diff --git a/tools/codecome/harness.py b/tools/codecome/harness.py index 28fc300..7e420e1 100644 --- a/tools/codecome/harness.py +++ b/tools/codecome/harness.py @@ -16,6 +16,7 @@ import os import signal import time +from datetime import datetime from pathlib import Path from typing import Any, Optional @@ -24,7 +25,7 @@ from codecome.console import build_console, _emit_fatal_error from rendering.dispatch import _get_rendering_ctx, configure_rendering, render_event from rendering.output import get_output, T -from rendering.events import _FINISH_TERMINAL_OK, _FINISH_MID_TURN, _FINISH_FAILURE +from rendering.events import _FINISH_TERMINAL_OK, _FINISH_MID_TURN, _FINISH_BUDGET, _FINISH_FAILURE from codecome.config import ROOT, resolve_color_mode, load_prompt, resolve_runtime_config from phases.completion import ( check_phase_graceful_completion, @@ -33,6 +34,65 @@ ) +def _pending_finding_count() -> int: + pending_dir = ROOT / "itemdb" / "findings" / "PENDING" + if not pending_dir.exists(): + return 0 + return sum(1 for p in pending_dir.glob("CC-*.md") if p.is_file()) + + +def _write_phase3_noop_summary() -> Path: + runs_dir = ROOT / "runs" + runs_dir.mkdir(parents=True, exist_ok=True) + now = datetime.now() + timestamp = now.strftime("%Y-%m-%d-%H%M%S") + path = runs_dir / f"phase-3-summary-{timestamp}.md" + path.write_text( + "# CodeCome Run Summary\n\n" + f"Date: {now.date().isoformat()} \n" + "Phase: counter_analysis \n" + "Agent: reviewer \n\n" + "# Goal\n\n" + "Review pending findings.\n\n" + "# Prompt\n\n" + "Phase 3 was requested, but there were no PENDING findings to review.\n\n" + "# Files read\n\n" + "- `itemdb/findings/PENDING/`\n\n" + "# Files created\n\n" + f"- `runs/{path.name}`\n\n" + "# Files modified\n\n" + "None.\n\n" + "# Findings created\n\n" + "| ID | Title | Path |\n" + "|---|---|---|\n" + "| - | None. | - |\n\n" + "# Findings moved\n\n" + "| ID | From | To | Reason |\n" + "|---|---|---|---|\n" + "| - | - | - | None. |\n\n" + "# Findings updated\n\n" + "| ID | Update summary |\n" + "|---|---|\n" + "| - | None. |\n\n" + "# Evidence created\n\n" + "None.\n\n" + "# Important observations\n\n" + "No PENDING findings were available, so Phase 3 had nothing to review.\n\n" + "# Assumptions\n\n" + "None.\n\n" + "# Open questions for the user\n\n" + "None.\n\n" + "# Re-run prompt hints\n\n" + "None.\n\n" + "# Limitations\n\n" + "No counter-analysis was performed because there were no PENDING findings.\n\n" + "# Recommended next step\n\n" + "Run Phase 2 if additional vulnerability hypotheses are needed.\n", + encoding="utf-8", + ) + return path + + def run_phase_mode(args: argparse.Namespace) -> int: """Run a single phase with auto-retry/resume. @@ -125,6 +185,14 @@ def _p1_forward_signal(signum: int, _frame: Any) -> None: if console is None: out.warn("rich is not installed; using plain structured output fallback") + if str(args.phase) == "3" and _pending_finding_count() == 0: + summary_path = _write_phase3_noop_summary() + out.separator(tone=T.SUCCESS) + out.success("Phase 3 completed successfully", symbol=True) + out.detail(" no PENDING findings; nothing to review") + out.detail(f" summary: {summary_path.relative_to(ROOT)}") + return 0 + attempt_number = 0 last_session_id: str = "" last_finish_reason: Optional[str] = None @@ -169,6 +237,7 @@ def _forward_signal(signum: int, _frame: Any) -> None: attempt_number += 1 phase_failures = [] phase_ok = False + finish_warning = None # Clear per-session dedup state so retries don't suppress updates. _reset_subagent_state() returncode, session_id, run_result, transcript_path = _run_single_attempt( @@ -226,6 +295,12 @@ def _forward_signal(signum: int, _frame: Any) -> None: "CodeCome observed a step_finish event without a finish reason, so the model/provider completion " "state is ambiguous. Treating the run as incomplete." ) + elif last_finish_reason in _FINISH_BUDGET: + finish_warning = ( + f"CodeCome observed output budget exhaustion (finish reason '{last_finish_reason}'). " + f"The model/provider stopped after {step_finish_count} completed loops before producing all " + "required artifacts. Treating the run as incomplete." + ) elif last_finish_reason in _FINISH_FAILURE: finish_warning = ( f"CodeCome observed finish reason '{last_finish_reason}', which means the model/provider stopped " @@ -251,22 +326,29 @@ def _forward_signal(signum: int, _frame: Any) -> None: if finish_warning is not None: if ( - last_finish_reason in _FINISH_MID_TURN + (last_finish_reason in _FINISH_MID_TURN or last_finish_reason in _FINISH_BUDGET) and last_permission_error is None ): phase_ok, phase_failures = check_phase_graceful_completion( args.phase, args.finding, RUN_START_TIME) - if phase_ok: - msg = ( - f"CodeCome observed a mid-turn model/provider cutoff for Phase {args.phase} after {step_finish_count} " - "completed loops, but expected durable artifacts were written during " - "the run. Treating the phase as complete enough to run validation and auto-repair." - ) - out.success(msg) - finish_warning = None - last_finish_reason = "graceful_forgiveness" + if last_finish_reason in _FINISH_MID_TURN: + if phase_ok: + msg = ( + f"CodeCome observed a mid-turn model/provider cutoff for Phase {args.phase} after {step_finish_count} " + "completed loops, but expected durable artifacts were written during " + "the run. Treating the phase as complete enough to run validation and auto-repair." + ) + out.success(msg) + finish_warning = None + last_finish_reason = "graceful_forgiveness" + else: + returncode = 2 else: - returncode = 2 + # _FINISH_BUDGET: fail only if artifacts are incomplete + if not phase_ok: + returncode = 2 + else: + finish_warning = None else: returncode = 2 @@ -315,19 +397,29 @@ def _forward_signal(signum: int, _frame: Any) -> None: out.error(msg) print(validation_output) break - break + if returncode == 0: + break if returncode == 2 and ( last_finish_reason in _FINISH_MID_TURN + or last_finish_reason in _FINISH_BUDGET or (last_finish_reason in _FINISH_TERMINAL_OK and not phase_ok) ): max_iteration_retries = int(os.environ.get("CODECOME_MAX_ITERATION_RETRIES", "3")) if iteration_retry_count < max_iteration_retries: iteration_retry_count += 1 - msg = ( - "\n[Auto-Resume] CodeCome observed an incomplete run and will resume the same " - f"session once to let the model finish the interrupted work (retry {iteration_retry_count}/{max_iteration_retries})." - ) + if last_finish_reason in _FINISH_BUDGET: + msg = ( + "\n[Auto-Resume] CodeCome observed output budget exhaustion and will resume the same " + f"session once to let the model finish the interrupted work " + f"(retry {iteration_retry_count}/{max_iteration_retries})." + ) + else: + msg = ( + "\n[Auto-Resume] CodeCome observed an incomplete run and will resume the same " + f"session once to let the model finish the interrupted work " + f"(retry {iteration_retry_count}/{max_iteration_retries})." + ) out.warn(msg) if last_session_id and last_session_id != "id": prompt = build_phase_resume_prompt( @@ -337,8 +429,9 @@ def _forward_signal(signum: int, _frame: Any) -> None: continue else: finish_warning = ( - "CodeCome correctly detected that the model/provider stopped mid-turn, but it could not determine " - "a session ID for automatic continuation. Treating the phase as incomplete." + "CodeCome correctly detected that the model/provider stopped before completing " + "the phase, but it could not determine a session ID for automatic continuation. " + "Treating the phase as incomplete." ) out.error("Could not determine session ID to resume.", strong=False) break @@ -367,6 +460,10 @@ def _forward_signal(signum: int, _frame: Any) -> None: ) if finish_warning: out.error(f" reason: {finish_warning}", strong=False) + if phase_failures: + out.error(" remaining gate failures:", strong=False) + for failure in phase_failures: + out.detail(f" - {failure}") out.detail(f" transcript: {transcript_path.relative_to(ROOT)}") out.warn( " hint: the run is likely partial; rerun the phase or " diff --git a/tools/codecome/phase_1.py b/tools/codecome/phase_1.py index 703e4fc..bb95323 100644 --- a/tools/codecome/phase_1.py +++ b/tools/codecome/phase_1.py @@ -33,6 +33,7 @@ from rendering.events import ( _FINISH_TERMINAL_OK, _FINISH_MID_TURN, + _FINISH_BUDGET, _FINISH_FAILURE, _reset_subagent_state, ) @@ -453,6 +454,12 @@ def _run_subphase( "CodeCome observed a step_finish event without a finish reason, so the model/provider completion " "state is ambiguous. Treating the run as incomplete." ) + elif last_finish_reason in _FINISH_BUDGET: + finish_warning = ( + f"CodeCome observed output budget exhaustion (finish reason '{last_finish_reason}'). " + f"The model/provider stopped after {step_finish_count} completed loops before producing all " + "required artifacts. Treating the subphase as incomplete." + ) elif last_finish_reason in _FINISH_FAILURE: finish_warning = ( f"CodeCome observed finish reason '{last_finish_reason}', which means the model/provider stopped " @@ -478,22 +485,29 @@ def _run_subphase( if finish_warning is not None: if ( - (not any_step_finish_seen or last_finish_reason in _FINISH_MID_TURN) + (last_finish_reason in _FINISH_MID_TURN or last_finish_reason in _FINISH_BUDGET) and last_permission_error is None ): phase_ok, phase_failures = check_phase_graceful_completion( phase_id, finding, subphase_start_time) - if phase_ok: - msg = ( - f"CodeCome observed an incomplete model/provider completion signal for Phase {phase_id} after " - f"{step_finish_count} completed loops, but expected durable artifacts were written during " - "the run. Treating the subphase as complete enough to run validation and auto-repair." - ) - out.success(msg) - finish_warning = None - last_finish_reason = "graceful_forgiveness" + if last_finish_reason in _FINISH_MID_TURN: + if phase_ok: + msg = ( + f"CodeCome observed an incomplete model/provider completion signal for Phase {phase_id} after " + f"{step_finish_count} completed loops, but expected durable artifacts were written during " + "the run. Treating the subphase as complete enough to run validation and auto-repair." + ) + out.success(msg) + finish_warning = None + last_finish_reason = "graceful_forgiveness" + else: + returncode = 2 else: - returncode = 2 + # _FINISH_BUDGET: fail only if artifacts are incomplete + if not phase_ok: + returncode = 2 + else: + finish_warning = None else: returncode = 2 @@ -573,15 +587,24 @@ def _run_subphase( break - if returncode == 2 and last_finish_reason in _FINISH_MID_TURN: + if returncode == 2 and ( + last_finish_reason in _FINISH_MID_TURN + or last_finish_reason in _FINISH_BUDGET + ): import os max_iteration_retries = int(os.environ.get("CODECOME_MAX_ITERATION_RETRIES", "1")) if iteration_retry_count < max_iteration_retries: iteration_retry_count += 1 - msg = ( - "\n[Auto-Resume] CodeCome observed a mid-turn model/provider cutoff and will resume the same " - f"session once to let the model finish the interrupted work (retry {iteration_retry_count}/{max_iteration_retries})." - ) + if last_finish_reason in _FINISH_BUDGET: + msg = ( + "\n[Auto-Resume] CodeCome observed output budget exhaustion and will resume the same " + f"session once to let the model finish the interrupted work (retry {iteration_retry_count}/{max_iteration_retries})." + ) + else: + msg = ( + "\n[Auto-Resume] CodeCome observed a mid-turn model/provider cutoff and will resume the same " + f"session once to let the model finish the interrupted work (retry {iteration_retry_count}/{max_iteration_retries})." + ) out.warn(msg) if last_session_id and last_session_id != "id": prompt = build_phase_resume_prompt( @@ -591,8 +614,9 @@ def _run_subphase( continue else: finish_warning = ( - "CodeCome correctly detected that the model/provider stopped mid-turn, but it could not determine " - "a session ID for automatic continuation. Treating the subphase as incomplete." + "CodeCome correctly detected that the model/provider stopped before completing " + "the phase, but it could not determine a session ID for automatic continuation. " + "Treating the subphase as incomplete." ) out.error("Could not determine session ID to resume.", strong=False) break diff --git a/tools/codecome/transcript.py b/tools/codecome/transcript.py index a3316e5..dd9ce14 100644 --- a/tools/codecome/transcript.py +++ b/tools/codecome/transcript.py @@ -70,7 +70,11 @@ def for_phase(cls, phase: str, finding: str | None) -> Transcript: counter = _ATTEMPT_COUNTER.get(key, 1) _ATTEMPT_COUNTER[key] = counter + 1 - path = _unique_transcript_path(_transcript_dir() / f"last-phase-{phase}-{finding_tag}-attempt-{counter}.jsonl") + stamp = time.strftime("%Y%m%d-%H%M%S") + path = _unique_transcript_path( + _transcript_dir() + / f"last-phase-{phase}-{finding_tag}-attempt-{counter}-{stamp}-pid{os.getpid()}.jsonl" + ) return cls(path, path.open("x", encoding="utf-8", buffering=1)) @classmethod diff --git a/tools/findings/quality.py b/tools/findings/quality.py new file mode 100644 index 0000000..ef1c490 --- /dev/null +++ b/tools/findings/quality.py @@ -0,0 +1,166 @@ +# Copyright (C) 2025-2026 Pablo Ruiz García +# SPDX-License-Identifier: GPL-3.0-or-later OR AGPL-3.0-or-later + +from __future__ import annotations + +from pathlib import Path +from typing import Any + +from findings.checks import load_sections, validate_finding +from findings.frontmatter import load_frontmatter_strict + + +PHASE2_CORE_SECTIONS = [ + "Summary", + "Target context", + "Affected code", + "Vulnerability hypothesis", + "Source-to-sink reasoning", + "Attackability / trigger conditions", + "Impact", + "Validation plan", +] + +PHASE2_REQUIRED_SECTIONS = PHASE2_CORE_SECTIONS + [ + "Counter-analysis", + "Validation result", + "Evidence", +] + +_PLACEHOLDER_TEXT = { + "", + "pending.", + "todo.", + "tbd.", + "not applicable.", +} + +_TEMPLATE_MARKERS = [ + "Briefly describe the suspected vulnerability.", + "Describe the relevant target type", + "List the relevant files", + "Explain the suspected vulnerability.", + "This section must clearly distinguish what is known from what is assumed.", + "Describe the path from attacker-controlled", + "Explain how an attacker", + "Explain the realistic security impact.", + "Describe exactly how to prove or disprove the finding.", + "Try to disprove the finding.", +] + +_NON_FINDING_MARKERS = [ + "does not represent an actual vulnerability", + "does not describe a real vulnerability", + "does not contain a vulnerability", + "no actual vulnerability", + "this is a test finding", + "test finding created to verify", + "template system itself does not contain a vulnerability", +] + + +def _is_placeholder(value: Any) -> bool: + if not isinstance(value, str): + return False + return value.strip().lower() in _PLACEHOLDER_TEXT + + +def _contains_template_marker(value: str) -> bool: + lowered = value.lower() + return any(marker.lower() in lowered for marker in _TEMPLATE_MARKERS) + + +def _contains_non_finding_marker(value: str) -> bool: + lowered = value.lower() + return any(marker in lowered for marker in _NON_FINDING_MARKERS) + + +def _list_is_populated(data: dict[str, Any], key: str) -> bool: + value = data.get(key) + return isinstance(value, list) and any(str(item).strip() for item in value) + + +def _scalar_is_populated(data: dict[str, Any], key: str) -> bool: + value = data.get(key) + if not isinstance(value, str): + return False + return value.strip().lower() not in _PLACEHOLDER_TEXT | {"unknown", "unclassified"} + + +def validate_phase2_finding_quality(path: Path) -> list[str]: + """Return Phase 2 quality errors for a candidate finding. + + This is intentionally stricter than frontmatter validation. A PENDING + finding can be syntactically valid while still being unusable as a Phase 2 + durable artifact if it is mostly an untouched template. + """ + errors: list[str] = [] + + errors.extend(validate_finding(path)) + try: + data = load_frontmatter_strict(path) + except Exception as exc: + return errors or [str(exc)] + + if data.get("status") != "PENDING": + errors.append("Phase 2 findings must remain in PENDING status") + + title = str(data.get("title", "")).strip().lower() + if title.startswith("test finding") or title.startswith("template test"): + errors.append("Phase 2 findings must describe a target vulnerability, not a test/template artifact") + + for key in ("category", "target_area"): + value = str(data.get(key, "")).strip().lower() + if value in {"test", "testing", "template"}: + errors.append(f"Phase 2 frontmatter field {key} describes a test/template artifact") + + for key in ("category", "language", "target_area", "trust_boundary"): + if not _scalar_is_populated(data, key): + errors.append(f"Phase 2 requires populated frontmatter field: {key}") + + for key in ("files", "sources", "sinks", "assets_at_risk"): + if not _list_is_populated(data, key): + errors.append(f"Phase 2 requires non-empty frontmatter list: {key}") + + if not (_list_is_populated(data, "symbols") or _list_is_populated(data, "entry_points")): + errors.append("Phase 2 requires at least one symbol or entry point") + + sections = load_sections(path) + combined_sections = "\n".join(str(value) for value in sections.values()) + if _contains_non_finding_marker(combined_sections): + errors.append("Phase 2 finding text says it is not an actual target vulnerability") + + for section in PHASE2_REQUIRED_SECTIONS: + body = sections.get(section) + if body is None: + errors.append(f"Phase 2 requires #{section} section") + continue + if section in PHASE2_CORE_SECTIONS: + if _is_placeholder(body): + errors.append(f"Phase 2 requires populated #{section} section") + elif _contains_template_marker(body): + errors.append(f"Phase 2 #{section} still contains template guidance") + + counter = sections.get("Counter-analysis", "") + if counter and not _is_placeholder(counter) and _contains_template_marker(counter): + errors.append("Phase 2 #Counter-analysis still contains template guidance") + + return errors + + +def phase2_summary_declares_no_findings(path: Path) -> bool: + try: + content = path.read_text(encoding="utf-8") + except OSError: + return False + lowered = content.lower() + if "# findings created" not in lowered: + return False + no_finding_markers = [ + "| - | none.", + "| - | none |", + "no findings created", + "no new findings", + "zero findings", + ] + return any(marker in lowered for marker in no_finding_markers) diff --git a/tools/phases/artifact_checks.py b/tools/phases/artifact_checks.py index 2b57a74..13b5a36 100644 --- a/tools/phases/artifact_checks.py +++ b/tools/phases/artifact_checks.py @@ -283,6 +283,71 @@ def _validate_risk_index() -> list[str]: return validate_file_risk_index() +# -- Phase 2 ------------------------------------------------------------------- + + +def _phase2_summaries() -> list[Path]: + runs = ROOT / "runs" + if not runs.exists(): + return [] + return sorted( + [path for path in runs.glob("phase-2-summary*.md") if path.is_file()], + key=lambda p: p.stat().st_mtime, + reverse=True, + ) + + +def _phase2_pending_findings() -> list[Path]: + pending = ROOT / "itemdb" / "findings" / "PENDING" + if not pending.exists(): + return [] + return sorted(path for path in pending.glob("CC-*.md") if path.is_file()) + + +def check_phase_2_artifacts(allow_missing_generated: bool = False) -> list[str]: + """Validate Phase 2 artifacts. + + A correct Phase 2 run may create zero findings, but that must be explicit in + the run summary. If findings exist, they must be reviewable Phase 2 artifacts + rather than untouched templates. + """ + from findings.quality import ( + phase2_summary_declares_no_findings, + validate_phase2_finding_quality, + ) + + errors: list[str] = [] + summaries = _phase2_summaries() + if not summaries: + if not allow_missing_generated: + errors.append("Missing artifact: runs/phase-2-summary*.md") + return errors + + if phase2_summary_declares_no_findings(summaries[0]): + return errors + + findings = _phase2_pending_findings() + if not findings: + errors.append( + "Phase 2 summary does not explicitly state that no findings were found, " + "and no PENDING findings exist" + ) + return errors + + for finding in findings: + quality_errors = validate_phase2_finding_quality(finding) + if quality_errors: + try: + rel = finding.relative_to(ROOT) + except ValueError: + rel = finding + errors.append( + f"{rel} is not a complete Phase 2 finding: " + + "; ".join(quality_errors) + ) + return errors + + # -- Dispatch ------------------------------------------------------------------ @@ -308,10 +373,10 @@ def check_phase_artifacts( phases_to_check: list[str] if phase == "all": - phases_to_check = ["1a", "1b", "1c"] + phases_to_check = ["1a", "1b", "1c", "2"] elif phase == "1": phases_to_check = ["1a", "1b", "1c"] - elif phase in ("2", "3", "4", "5", "6"): + elif phase in ("3", "4", "5", "6"): print(C.info(f"Phase {phase} artifact checks not yet implemented; nothing to do.")) return 0 else: @@ -364,8 +429,16 @@ def _check_phase_1c_with_ct( return check_phase_1c_artifacts(allow_missing_generated=allow_missing_generated) +def _check_phase_2_with_ct( + allow_missing_generated: bool = False, + phase_1a_start_time: float | None = None, +) -> list[str]: + return check_phase_2_artifacts(allow_missing_generated=allow_missing_generated) + + _CHECKERS = { "1a": _check_phase_1a_with_ct, "1b": _check_phase_1b_with_ct, "1c": _check_phase_1c_with_ct, + "2": _check_phase_2_with_ct, } diff --git a/tools/phases/completion.py b/tools/phases/completion.py index 11bc7f2..b0c4b4b 100644 --- a/tools/phases/completion.py +++ b/tools/phases/completion.py @@ -91,6 +91,16 @@ def _run_summary_is_fresh(phase_id: str, run_start_time: float) -> bool: return any(Path(p).stat().st_mtime >= run_start_time for p in matches) +def _fresh_run_summaries(phase_id: str, run_start_time: float) -> list[Path]: + import glob as _glob + matches = [Path(p) for p in _glob.glob(str(ROOT / "runs" / f"phase-{phase_id}-summary*.md"))] + return sorted( + [p for p in matches if p.is_file() and p.stat().st_mtime >= run_start_time], + key=lambda p: p.stat().st_mtime, + reverse=True, + ) + + def _append_run_summary_check( failures: list[str], phase_id: str, run_start_time: float ) -> None: @@ -111,6 +121,28 @@ def _iter_files(root: Path) -> Iterator[Path]: yield path +def _fresh_pending_findings(run_start_time: float) -> list[Path]: + pending_dir = FINDINGS_ROOT / "PENDING" + if not pending_dir.exists(): + return [] + return sorted( + [ + path for path in pending_dir.glob("CC-*.md") + if path.is_file() and path.stat().st_mtime >= run_start_time + ], + key=lambda p: p.stat().st_mtime, + reverse=True, + ) + + +def _latest_summary_declares_no_phase2_findings(fresh_summaries: list[Path]) -> bool: + from findings.quality import phase2_summary_declares_no_findings + + if not fresh_summaries: + return False + return phase2_summary_declares_no_findings(fresh_summaries[0]) + + def _find_finding_file(status_dir: Path, finding_id: str, run_start_time: float) -> Path | None: """Find a finding file matching the ID in the status directory. @@ -256,14 +288,34 @@ def check_phase_graceful_completion(phase: str, finding: str | None, run_start_t ) return (len(failures) == 0, failures) elif phase_key in ("2", "sweep"): - import glob as _glob - run_summaries = _glob.glob(str(ROOT / "runs" / "phase-2-summary*.md")) - summary_fresh = any(Path(p).stat().st_mtime >= run_start_time for p in run_summaries) - if not summary_fresh: + from findings.quality import validate_phase2_finding_quality + + fresh_summaries = _fresh_run_summaries("2", run_start_time) + if not fresh_summaries: failures.append( "Missing: runs/phase-2-summary*.md — run summary was not " "created or updated" ) + + fresh_findings = _fresh_pending_findings(run_start_time) + if fresh_findings: + for finding_path in fresh_findings: + quality_errors = validate_phase2_finding_quality(finding_path) + if quality_errors: + try: + rel = finding_path.relative_to(ROOT) + except ValueError: + rel = finding_path + failures.append( + f"Invalid: {rel} — Phase 2 finding is incomplete: " + + "; ".join(quality_errors) + ) + elif fresh_summaries and not _latest_summary_declares_no_phase2_findings(fresh_summaries): + failures.append( + "Missing: itemdb/findings/PENDING/ — no finding was created or updated during this run, " + "and the fresh Phase 2 summary does not explicitly state that no findings were found" + ) + return (len(failures) == 0, failures) elif phase_key == "3": import glob as _glob @@ -434,9 +486,12 @@ def _resume_opener_for_reason(reason: str) -> str: - ``"infrastructure_error"`` — harness fatal-retry path. - A finish reason from ``rendering.events._FINISH_MID_TURN`` (e.g. - ``"length"``, ``"tool_use"``) — the model/provider cut off mid-turn. - - A finish reason from ``rendering.events._FINISH_FAILURE`` — the - model/provider reported a failure finish reason. + ``"tool_use"``, ``"unknown"``) — the model/provider cut off mid-turn. + - A finish reason from ``rendering.events._FINISH_BUDGET`` (e.g. + ``"length"``, ``"max_tokens"``) — the model/provider exhausted its + output budget before completing all required artifacts. + - A finish reason from ``rendering.events._FINISH_HARD_FAILURE`` — the + model/provider reported a non-recoverable failure finish reason. - ``"graceful_forgiveness"`` — synthesized by the harness when the mid-turn cutoff happened but partial artifacts were written. - A finish reason from ``rendering.events._FINISH_TERMINAL_OK`` (e.g. @@ -444,6 +499,7 @@ def _resume_opener_for_reason(reason: str) -> str: the gate still failed, so required artifacts are missing. """ from rendering.events import ( + _FINISH_BUDGET, _FINISH_FAILURE, _FINISH_MID_TURN, _FINISH_TERMINAL_OK, @@ -456,6 +512,11 @@ def _resume_opener_for_reason(reason: str) -> str: f"Your previous run was cut off mid-turn (finish reason '{reason}') " "before completing all required artifacts." ) + if reason in _FINISH_BUDGET: + return ( + f"Your previous run exhausted its output budget (finish reason '{reason}') " + "before completing all required artifacts." + ) if reason in _FINISH_FAILURE: return ( f"Your previous run stopped with a failure finish reason '{reason}' " @@ -497,7 +558,8 @@ def build_phase_resume_prompt( lines.append(f"- {detail}") lines.append("") lines.append( - "Fix only these missing items. Do not redo completed work." + "Fix every listed item. Do not redo completed work. The list above is CodeCome's " + "authoritative completion-gate output; do not ignore any item." ) else: lines.append("") @@ -516,6 +578,10 @@ def build_phase_resume_prompt( "Before ending, verify that the required durable artifacts for " "this phase exist, are updated, and are internally consistent." ) + lines.append( + "Do not claim validation passed unless the validation tool call actually completed successfully. " + "If a validation or shell tool call fails, fix the invocation or report that validation could not be run." + ) return "\n".join(lines) diff --git a/tools/phases/gates.py b/tools/phases/gates.py index e92c684..8e88682 100644 --- a/tools/phases/gates.py +++ b/tools/phases/gates.py @@ -177,10 +177,10 @@ def gate_phase_3() -> int: nv_count = count_findings("PENDING") if nv_count == 0: - print(fail("No findings in PENDING.")) + print(warn("No findings in PENDING.")) print() - print(info("Run Phase 2 first: make phase-2")) - return 1 + print(info("Phase 3 has nothing to review and will complete as a no-op.")) + return 0 print(ok(f"{nv_count} finding(s) in PENDING.")) print() diff --git a/tools/rendering/events/__init__.py b/tools/rendering/events/__init__.py index 63bec01..706c883 100644 --- a/tools/rendering/events/__init__.py +++ b/tools/rendering/events/__init__.py @@ -15,6 +15,8 @@ EventRenderer, _FINISH_TERMINAL_OK, _FINISH_MID_TURN, + _FINISH_BUDGET, + _FINISH_HARD_FAILURE, _FINISH_FAILURE, _SUBAGENT_LAST_STATE, _reset_subagent_state, @@ -40,6 +42,8 @@ "EventRenderer", "_FINISH_TERMINAL_OK", "_FINISH_MID_TURN", + "_FINISH_BUDGET", + "_FINISH_HARD_FAILURE", "_FINISH_FAILURE", "_SUBAGENT_LAST_STATE", "_reset_subagent_state", diff --git a/tools/rendering/events/base.py b/tools/rendering/events/base.py index c5d6e9c..d810a21 100644 --- a/tools/rendering/events/base.py +++ b/tools/rendering/events/base.py @@ -27,7 +27,18 @@ # incomplete signal that warrants retry/resume handling. _FINISH_MID_TURN = {"tool-calls", "tool_use", "unknown"} -_FINISH_FAILURE = {"content-filter", "content_filter", "length", "max_tokens", "error"} +# Budget-exhaustion — the model/provider stopped because the output budget was +# consumed (token limit reached). The session is still viable; auto-resume can +# let the model continue from where it was cut off. +_FINISH_BUDGET = {"length", "max_tokens"} + +# Hard failure — the model/provider stopped with a non-recoverable error +# (safety filter, internal error). Retrying the same session is unlikely to +# succeed. +_FINISH_HARD_FAILURE = {"content-filter", "content_filter", "error"} + +# Legacy alias; kept for backward compatibility. +_FINISH_FAILURE = _FINISH_BUDGET | _FINISH_HARD_FAILURE # Per-session dedup state for subagent update events. _SUBAGENT_LAST_STATE: dict[str, tuple[dict[str, Any], float]] = {}