From c373fd3055f0b87650eea227d0017471dbdcc96a Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Tue, 30 Jun 2026 06:12:13 +0000
Subject: [PATCH] fix #307: record_test_baseline timeout is now fail-safe, not
 fail-open
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When record_test_baseline timed out, the subprocess never finished, so
baseline_failures was always [] — indistinguishable from a genuinely
clean suite. Any pre-existing failure was then silently treated as "not
pre-existing", defeating the regression-vs-pre-existing distinction.

Changes:
- status is now "timed_out" (not "baseline_failures") when the
  subprocess times out, so the two cases have distinct values
- new baseline_complete: bool field — false on timeout, true otherwise;
  downstream code can check this before trusting an empty baseline
- list_baseline_failures propagates baseline_complete and timed_out, and
  emits a "warning" key when the stored baseline timed out
- default timeout_seconds raised from 120 s to 600 s (10 min) to give most
  suites room to finish; --timeout flag still accepts an explicit value
- two regression tests added: one for the timeout path, one asserting
  baseline_complete is true on a normal run

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_015ncF4ESXM8YoxbH4U9VyYY
---
 .map/scripts/map_step_runner.py               | 28 ++++++++--
 .../templates/map/scripts/map_step_runner.py  | 28 ++++++++--
 .../map/scripts/map_step_runner.py.jinja      | 28 ++++++++--
 tests/test_map_step_runner.py                 | 56 +++++++++++++++++++
 4 files changed, 128 insertions(+), 12 deletions(-)

diff --git a/.map/scripts/map_step_runner.py b/.map/scripts/map_step_runner.py
index 4ae040c5..dab0a623 100755
--- a/.map/scripts/map_step_runner.py
+++ b/.map/scripts/map_step_runner.py
@@ -11705,7 +11705,7 @@ def record_test_baseline(
     test_command: str = "",
     *,
     module_dir: str = "",
-    timeout_seconds: int = 120,
+    timeout_seconds: int = 600,
 ) -> dict[str, object]:
     """Record a pre-flight test baseline so subtasks can distinguish
     "this regression is mine" from "this was broken before I started".
@@ -11867,9 +11867,17 @@ def record_test_baseline(
         if m:
             failures.append(m.group(1))
 
+    if timed_out:
+        status = "timed_out"
+    elif returncode == 0:
+        status = "success"
+    else:
+        status = "baseline_failures"
+
     payload: dict[str, object] = {
         "branch": branch_name,
-        "status": "success" if returncode == 0 else "baseline_failures",
+        "status": status,
+        "baseline_complete": not timed_out,
         "command": cmd_str,
         "auto_detected": bool(auto_detected_command),
         "module_dir": detected_module_dir,
@@ -11907,14 +11915,26 @@ def list_baseline_failures(branch: str) -> dict[str, object]:
     failures = data.get("baseline_failures", [])
     if not isinstance(failures, list):
         failures = []
-    return {
+    baseline_complete = data.get("baseline_complete", not data.get("timed_out", False))
+    timed_out_flag = data.get("timed_out", False)
+    result: dict[str, object] = {
         "status": "success",
         "branch": branch_name,
         "command": data.get("command", ""),
         "returncode": data.get("returncode"),
+        "baseline_complete": baseline_complete,
+        "timed_out": timed_out_flag,
         "baseline_failures": failures,
         "recorded_at": data.get("recorded_at"),
     }
+    if timed_out_flag:
+        result["warning"] = (
+            "Baseline timed out — baseline_failures is empty because the suite "
+            "did not finish, not because there were no pre-existing failures. "
+            "Treat this baseline as UNKNOWN, not clean. Re-run record_test_baseline "
+            "with a longer --timeout or a faster --command."
+        )
+    return result
 
 
 def _acknowledged_diagnostics_path(branch: str) -> Path:
@@ -18578,7 +18598,7 @@ def _opt_value(flag: str) -> str:
         baseline_branch = sys.argv[2]
         baseline_cmd = ""
         baseline_module_dir = ""
-        baseline_timeout = 120
+        baseline_timeout = 600
         if "--command" in sys.argv:
             c_idx = sys.argv.index("--command")
             if c_idx + 1 < len(sys.argv):
diff --git a/src/mapify_cli/templates/map/scripts/map_step_runner.py b/src/mapify_cli/templates/map/scripts/map_step_runner.py
index 4ae040c5..dab0a623 100755
--- a/src/mapify_cli/templates/map/scripts/map_step_runner.py
+++ b/src/mapify_cli/templates/map/scripts/map_step_runner.py
@@ -11705,7 +11705,7 @@ def record_test_baseline(
     test_command: str = "",
     *,
     module_dir: str = "",
-    timeout_seconds: int = 120,
+    timeout_seconds: int = 600,
 ) -> dict[str, object]:
     """Record a pre-flight test baseline so subtasks can distinguish
     "this regression is mine" from "this was broken before I started".
@@ -11867,9 +11867,17 @@ def record_test_baseline(
         if m:
             failures.append(m.group(1))
 
+    if timed_out:
+        status = "timed_out"
+    elif returncode == 0:
+        status = "success"
+    else:
+        status = "baseline_failures"
+
     payload: dict[str, object] = {
         "branch": branch_name,
-        "status": "success" if returncode == 0 else "baseline_failures",
+        "status": status,
+        "baseline_complete": not timed_out,
         "command": cmd_str,
         "auto_detected": bool(auto_detected_command),
         "module_dir": detected_module_dir,
@@ -11907,14 +11915,26 @@ def list_baseline_failures(branch: str) -> dict[str, object]:
     failures = data.get("baseline_failures", [])
     if not isinstance(failures, list):
         failures = []
-    return {
+    baseline_complete = data.get("baseline_complete", not data.get("timed_out", False))
+    timed_out_flag = data.get("timed_out", False)
+    result: dict[str, object] = {
         "status": "success",
         "branch": branch_name,
         "command": data.get("command", ""),
         "returncode": data.get("returncode"),
+        "baseline_complete": baseline_complete,
+        "timed_out": timed_out_flag,
         "baseline_failures": failures,
         "recorded_at": data.get("recorded_at"),
     }
+    if timed_out_flag:
+        result["warning"] = (
+            "Baseline timed out — baseline_failures is empty because the suite "
+            "did not finish, not because there were no pre-existing failures. "
+            "Treat this baseline as UNKNOWN, not clean. Re-run record_test_baseline "
+            "with a longer --timeout or a faster --command."
+        )
+    return result
 
 
 def _acknowledged_diagnostics_path(branch: str) -> Path:
@@ -18578,7 +18598,7 @@ def _opt_value(flag: str) -> str:
         baseline_branch = sys.argv[2]
         baseline_cmd = ""
         baseline_module_dir = ""
-        baseline_timeout = 120
+        baseline_timeout = 600
         if "--command" in sys.argv:
             c_idx = sys.argv.index("--command")
             if c_idx + 1 < len(sys.argv):
diff --git a/src/mapify_cli/templates_src/map/scripts/map_step_runner.py.jinja b/src/mapify_cli/templates_src/map/scripts/map_step_runner.py.jinja
index 4ae040c5..dab0a623 100755
--- a/src/mapify_cli/templates_src/map/scripts/map_step_runner.py.jinja
+++ b/src/mapify_cli/templates_src/map/scripts/map_step_runner.py.jinja
@@ -11705,7 +11705,7 @@ def record_test_baseline(
     test_command: str = "",
     *,
     module_dir: str = "",
-    timeout_seconds: int = 120,
+    timeout_seconds: int = 600,
 ) -> dict[str, object]:
     """Record a pre-flight test baseline so subtasks can distinguish
     "this regression is mine" from "this was broken before I started".
@@ -11867,9 +11867,17 @@ def record_test_baseline(
         if m:
             failures.append(m.group(1))
 
+    if timed_out:
+        status = "timed_out"
+    elif returncode == 0:
+        status = "success"
+    else:
+        status = "baseline_failures"
+
     payload: dict[str, object] = {
         "branch": branch_name,
-        "status": "success" if returncode == 0 else "baseline_failures",
+        "status": status,
+        "baseline_complete": not timed_out,
         "command": cmd_str,
         "auto_detected": bool(auto_detected_command),
         "module_dir": detected_module_dir,
@@ -11907,14 +11915,26 @@ def list_baseline_failures(branch: str) -> dict[str, object]:
     failures = data.get("baseline_failures", [])
     if not isinstance(failures, list):
         failures = []
-    return {
+    baseline_complete = data.get("baseline_complete", not data.get("timed_out", False))
+    timed_out_flag = data.get("timed_out", False)
+    result: dict[str, object] = {
         "status": "success",
         "branch": branch_name,
         "command": data.get("command", ""),
         "returncode": data.get("returncode"),
+        "baseline_complete": baseline_complete,
+        "timed_out": timed_out_flag,
         "baseline_failures": failures,
         "recorded_at": data.get("recorded_at"),
     }
+    if timed_out_flag:
+        result["warning"] = (
+            "Baseline timed out — baseline_failures is empty because the suite "
+            "did not finish, not because there were no pre-existing failures. "
+            "Treat this baseline as UNKNOWN, not clean. Re-run record_test_baseline "
+            "with a longer --timeout or a faster --command."
+        )
+    return result
 
 
 def _acknowledged_diagnostics_path(branch: str) -> Path:
@@ -18578,7 +18598,7 @@ if __name__ == "__main__":
         baseline_branch = sys.argv[2]
         baseline_cmd = ""
         baseline_module_dir = ""
-        baseline_timeout = 120
+        baseline_timeout = 600
         if "--command" in sys.argv:
             c_idx = sys.argv.index("--command")
             if c_idx + 1 < len(sys.argv):
diff --git a/tests/test_map_step_runner.py b/tests/test_map_step_runner.py
index 3fe85dfe..40e85c7f 100644
--- a/tests/test_map_step_runner.py
+++ b/tests/test_map_step_runner.py
@@ -5943,6 +5943,62 @@ def test_list_baseline_failures_no_baseline_path(
         assert report["status"] == "no_baseline"
         assert report["baseline_failures"] == []
 
+    def test_baseline_timeout_is_unknown_not_clean(
+        self, branch_workspace, monkeypatch
+    ):
+        """Regression #307: a timed-out baseline must NOT look like a clean run.
+
+        When the suite exceeds the timeout the subprocess never finishes, so
+        baseline_failures is always [] — indistinguishable from a genuinely
+        green suite unless the caller checks baseline_complete / timed_out.
+        This test verifies that both record_test_baseline and list_baseline_failures
+        surface the timeout as an explicit 'unknown' signal, not a clean pass.
+        """
+        repo = branch_workspace.parents[1]
+        monkeypatch.setenv("CLAUDE_PROJECT_DIR", str(repo))
+
+        import subprocess as _subprocess
+
+        def fake_run_timeout(cmd, **kwargs):
+            raise _subprocess.TimeoutExpired(cmd, kwargs.get("timeout", 1))
+
+        monkeypatch.setattr(map_step_runner.subprocess, "run", fake_run_timeout)
+
+        report = map_step_runner.record_test_baseline(
+            "test-branch", "pytest", timeout_seconds=1
+        )
+        # Must NOT report as success or baseline_failures with empty list
+        assert report["status"] == "timed_out", (
+            f"Expected 'timed_out' status, got {report['status']!r}"
+        )
+        assert report["baseline_complete"] is False
+        assert report["timed_out"] is True
+        assert report["baseline_failures"] == []
+
+        # list_baseline_failures must propagate the incomplete-baseline signal
+        listed = map_step_runner.list_baseline_failures("test-branch")
+        assert listed["status"] == "success"
+        assert listed["baseline_complete"] is False
+        assert listed["timed_out"] is True
+        assert "warning" in listed, "list_baseline_failures must emit a warning on timed-out baseline"
+        assert listed["baseline_failures"] == []
+
+    def test_baseline_complete_true_on_normal_run(
+        self, branch_workspace, monkeypatch
+    ):
+        """baseline_complete is True when the suite runs to completion (#307)."""
+        repo = branch_workspace.parents[1]
+        monkeypatch.setenv("CLAUDE_PROJECT_DIR", str(repo))
+        report = map_step_runner.record_test_baseline("test-branch", "true")
+        assert report["status"] == "success"
+        assert report["baseline_complete"] is True
+        assert report["timed_out"] is False
+
+        listed = map_step_runner.list_baseline_failures("test-branch")
+        assert listed["baseline_complete"] is True
+        assert listed["timed_out"] is False
+        assert "warning" not in listed
+
 
 class TestRecordSubtaskResultFilesSeparatorParsing:
     """Fix #2 (2026-05-26): CLI must accept --files with comma OR