From c373fd3055f0b87650eea227d0017471dbdcc96a Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 30 Jun 2026 06:12:13 +0000 Subject: [PATCH] fix #307: record_test_baseline timeout is now fail-safe, not fail-open MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When record_test_baseline times out the subprocess never finishes, so baseline_failures was always [] — indistinguishable from a genuinely clean suite. Any pre-existing failure was silently treated as "not pre-existing", defeating the regression-vs-pre-existing distinction. Changes: - status is now "timed_out" (not "baseline_failures") when the subprocess times out, so the two cases have distinct values - new baseline_complete: bool field — false on timeout, true otherwise; downstream code can check this before trusting an empty baseline - list_baseline_failures propagates baseline_complete, timed_out, and emits a "warning" key when the stored baseline is incomplete - default timeout_seconds raised from 120 → 600 (10 min) to give most suites room to finish; --timeout flag still accepts an explicit value - two regression tests added: one for the timeout path, one asserting baseline_complete is true on a normal run Co-Authored-By: Claude Sonnet 4.6 Claude-Session: https://claude.ai/code/session_015ncF4ESXM8YoxbH4U9VyYY --- .map/scripts/map_step_runner.py | 28 ++++++++-- .../templates/map/scripts/map_step_runner.py | 28 ++++++++-- .../map/scripts/map_step_runner.py.jinja | 28 ++++++++-- tests/test_map_step_runner.py | 56 +++++++++++++++++++ 4 files changed, 128 insertions(+), 12 deletions(-) diff --git a/.map/scripts/map_step_runner.py b/.map/scripts/map_step_runner.py index 4ae040c5..dab0a623 100755 --- a/.map/scripts/map_step_runner.py +++ b/.map/scripts/map_step_runner.py @@ -11705,7 +11705,7 @@ def record_test_baseline( test_command: str = "", *, module_dir: str = "", - timeout_seconds: int = 120, + timeout_seconds: int = 600, ) -> dict[str, object]: """Record a pre-flight test baseline so subtasks can distinguish "this regression is mine" from "this was broken before I started". @@ -11867,9 +11867,17 @@ def record_test_baseline( if m: failures.append(m.group(1)) + if timed_out: + status = "timed_out" + elif returncode == 0: + status = "success" + else: + status = "baseline_failures" + payload: dict[str, object] = { "branch": branch_name, - "status": "success" if returncode == 0 else "baseline_failures", + "status": status, + "baseline_complete": not timed_out, "command": cmd_str, "auto_detected": bool(auto_detected_command), "module_dir": detected_module_dir, @@ -11907,14 +11915,26 @@ def list_baseline_failures(branch: str) -> dict[str, object]: failures = data.get("baseline_failures", []) if not isinstance(failures, list): failures = [] - return { + baseline_complete = data.get("baseline_complete", not data.get("timed_out", False)) + timed_out_flag = data.get("timed_out", False) + result: dict[str, object] = { "status": "success", "branch": branch_name, "command": data.get("command", ""), "returncode": data.get("returncode"), + "baseline_complete": baseline_complete, + "timed_out": timed_out_flag, "baseline_failures": failures, "recorded_at": data.get("recorded_at"), } + if timed_out_flag: + result["warning"] = ( + "Baseline timed out — baseline_failures is empty because the suite " + "did not finish, not because there were no pre-existing failures. " + "Treat this baseline as UNKNOWN, not clean. Re-run record_test_baseline " + "with a longer --timeout or a faster --command." + ) + return result def _acknowledged_diagnostics_path(branch: str) -> Path: @@ -18578,7 +18598,7 @@ def _opt_value(flag: str) -> str: baseline_branch = sys.argv[2] baseline_cmd = "" baseline_module_dir = "" - baseline_timeout = 120 + baseline_timeout = 600 if "--command" in sys.argv: c_idx = sys.argv.index("--command") if c_idx + 1 < len(sys.argv): diff --git a/src/mapify_cli/templates/map/scripts/map_step_runner.py b/src/mapify_cli/templates/map/scripts/map_step_runner.py index 4ae040c5..dab0a623 100755 --- a/src/mapify_cli/templates/map/scripts/map_step_runner.py +++ b/src/mapify_cli/templates/map/scripts/map_step_runner.py @@ -11705,7 +11705,7 @@ def record_test_baseline( test_command: str = "", *, module_dir: str = "", - timeout_seconds: int = 120, + timeout_seconds: int = 600, ) -> dict[str, object]: """Record a pre-flight test baseline so subtasks can distinguish "this regression is mine" from "this was broken before I started". @@ -11867,9 +11867,17 @@ def record_test_baseline( if m: failures.append(m.group(1)) + if timed_out: + status = "timed_out" + elif returncode == 0: + status = "success" + else: + status = "baseline_failures" + payload: dict[str, object] = { "branch": branch_name, - "status": "success" if returncode == 0 else "baseline_failures", + "status": status, + "baseline_complete": not timed_out, "command": cmd_str, "auto_detected": bool(auto_detected_command), "module_dir": detected_module_dir, @@ -11907,14 +11915,26 @@ def list_baseline_failures(branch: str) -> dict[str, object]: failures = data.get("baseline_failures", []) if not isinstance(failures, list): failures = [] - return { + baseline_complete = data.get("baseline_complete", not data.get("timed_out", False)) + timed_out_flag = data.get("timed_out", False) + result: dict[str, object] = { "status": "success", "branch": branch_name, "command": data.get("command", ""), "returncode": data.get("returncode"), + "baseline_complete": baseline_complete, + "timed_out": timed_out_flag, "baseline_failures": failures, "recorded_at": data.get("recorded_at"), } + if timed_out_flag: + result["warning"] = ( + "Baseline timed out — baseline_failures is empty because the suite " + "did not finish, not because there were no pre-existing failures. " + "Treat this baseline as UNKNOWN, not clean. Re-run record_test_baseline " + "with a longer --timeout or a faster --command." + ) + return result def _acknowledged_diagnostics_path(branch: str) -> Path: @@ -18578,7 +18598,7 @@ def _opt_value(flag: str) -> str: baseline_branch = sys.argv[2] baseline_cmd = "" baseline_module_dir = "" - baseline_timeout = 120 + baseline_timeout = 600 if "--command" in sys.argv: c_idx = sys.argv.index("--command") if c_idx + 1 < len(sys.argv): diff --git a/src/mapify_cli/templates_src/map/scripts/map_step_runner.py.jinja b/src/mapify_cli/templates_src/map/scripts/map_step_runner.py.jinja index 4ae040c5..dab0a623 100755 --- a/src/mapify_cli/templates_src/map/scripts/map_step_runner.py.jinja +++ b/src/mapify_cli/templates_src/map/scripts/map_step_runner.py.jinja @@ -11705,7 +11705,7 @@ def record_test_baseline( test_command: str = "", *, module_dir: str = "", - timeout_seconds: int = 120, + timeout_seconds: int = 600, ) -> dict[str, object]: """Record a pre-flight test baseline so subtasks can distinguish "this regression is mine" from "this was broken before I started". @@ -11867,9 +11867,17 @@ def record_test_baseline( if m: failures.append(m.group(1)) + if timed_out: + status = "timed_out" + elif returncode == 0: + status = "success" + else: + status = "baseline_failures" + payload: dict[str, object] = { "branch": branch_name, - "status": "success" if returncode == 0 else "baseline_failures", + "status": status, + "baseline_complete": not timed_out, "command": cmd_str, "auto_detected": bool(auto_detected_command), "module_dir": detected_module_dir, @@ -11907,14 +11915,26 @@ def list_baseline_failures(branch: str) -> dict[str, object]: failures = data.get("baseline_failures", []) if not isinstance(failures, list): failures = [] - return { + baseline_complete = data.get("baseline_complete", not data.get("timed_out", False)) + timed_out_flag = data.get("timed_out", False) + result: dict[str, object] = { "status": "success", "branch": branch_name, "command": data.get("command", ""), "returncode": data.get("returncode"), + "baseline_complete": baseline_complete, + "timed_out": timed_out_flag, "baseline_failures": failures, "recorded_at": data.get("recorded_at"), } + if timed_out_flag: + result["warning"] = ( + "Baseline timed out — baseline_failures is empty because the suite " + "did not finish, not because there were no pre-existing failures. " + "Treat this baseline as UNKNOWN, not clean. Re-run record_test_baseline " + "with a longer --timeout or a faster --command." + ) + return result def _acknowledged_diagnostics_path(branch: str) -> Path: @@ -18578,7 +18598,7 @@ if __name__ == "__main__": baseline_branch = sys.argv[2] baseline_cmd = "" baseline_module_dir = "" - baseline_timeout = 120 + baseline_timeout = 600 if "--command" in sys.argv: c_idx = sys.argv.index("--command") if c_idx + 1 < len(sys.argv): diff --git a/tests/test_map_step_runner.py b/tests/test_map_step_runner.py index 3fe85dfe..40e85c7f 100644 --- a/tests/test_map_step_runner.py +++ b/tests/test_map_step_runner.py @@ -5943,6 +5943,62 @@ def test_list_baseline_failures_no_baseline_path( assert report["status"] == "no_baseline" assert report["baseline_failures"] == [] + def test_baseline_timeout_is_unknown_not_clean( + self, branch_workspace, monkeypatch + ): + """Regression #307: a timed-out baseline must NOT look like a clean run. + + When the suite exceeds the timeout the subprocess never finishes, so + baseline_failures is always [] — indistinguishable from a genuinely + green suite unless the caller checks baseline_complete / timed_out. + This test verifies that both record_test_baseline and list_baseline_failures + surface the timeout as an explicit 'unknown' signal, not a clean pass. + """ + repo = branch_workspace.parents[1] + monkeypatch.setenv("CLAUDE_PROJECT_DIR", str(repo)) + + import subprocess as _subprocess + + def fake_run_timeout(cmd, **kwargs): + raise _subprocess.TimeoutExpired(cmd, kwargs.get("timeout", 1)) + + monkeypatch.setattr(map_step_runner.subprocess, "run", fake_run_timeout) + + report = map_step_runner.record_test_baseline( + "test-branch", "pytest", timeout_seconds=1 + ) + # Must NOT report as success or baseline_failures with empty list + assert report["status"] == "timed_out", ( + f"Expected 'timed_out' status, got {report['status']!r}" + ) + assert report["baseline_complete"] is False + assert report["timed_out"] is True + assert report["baseline_failures"] == [] + + # list_baseline_failures must propagate the incomplete-baseline signal + listed = map_step_runner.list_baseline_failures("test-branch") + assert listed["status"] == "success" + assert listed["baseline_complete"] is False + assert listed["timed_out"] is True + assert "warning" in listed, "list_baseline_failures must emit a warning on timed-out baseline" + assert listed["baseline_failures"] == [] + + def test_baseline_complete_true_on_normal_run( + self, branch_workspace, monkeypatch + ): + """baseline_complete is True when the suite runs to completion (#307).""" + repo = branch_workspace.parents[1] + monkeypatch.setenv("CLAUDE_PROJECT_DIR", str(repo)) + report = map_step_runner.record_test_baseline("test-branch", "true") + assert report["status"] == "success" + assert report["baseline_complete"] is True + assert report["timed_out"] is False + + listed = map_step_runner.list_baseline_failures("test-branch") + assert listed["baseline_complete"] is True + assert listed["timed_out"] is False + assert "warning" not in listed + class TestRecordSubtaskResultFilesSeparatorParsing: """Fix #2 (2026-05-26): CLI must accept --files with comma OR