From 78a2e014c118f60dae33744f5d8b7d319b343d7e Mon Sep 17 00:00:00 2001 From: Abir Abbas Date: Mon, 11 May 2026 16:05:43 -0400 Subject: [PATCH 1/2] fix(build): hard timeout on hax_client.create_request to prevent silent hangs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Without this, a wedged hax-sdk causes app.pause() to never be reached: the surrounding reasoner sits in the Phase 1.5 revision loop with no new sub-reasoners spawned and no app.pause() to mark the execution WAITING. The parent reasoner (e.g. github-buddy.implement_from_issue) keeps accumulating active-time on its pause-aware watchdog and eventually trips the default 7200s budget — even though the cascade fix from agentfield#568/#569 is doing exactly what it's supposed to. Run that motivated this: run_1778512783034_f4985c96 — first hax cycle worked end-to-end (start_pause→end_pause→revision iter). After the revision phases completed at 16:29:45, swe-planner.build was supposed to format a new hax payload and call hax_client.create_request → app.pause for the second cycle. Instead build sat doing nothing visible for 76 minutes and github-buddy.implement_from_issue tripped its watchdog at 17:45:36 with active_elapsed=7201s (810s of legitimate pause time correctly subtracted by the cascade — math was right; the upstream hang shouldn't have been allowed to run that long in the first place). Fix: wrap the synchronous hax_client.create_request in asyncio.wait_for(asyncio.to_thread(...), timeout=120) so a wedged hax-sdk surfaces as a fast, loud RuntimeError instead of an opaque multi-hour watchdog timeout up the call chain. Adds app.note() before the call (entry), on success (with request_id), and on each error branch (TimeoutError specifically vs anything else) so the run timeline shows exactly when the call started, succeeded, or got stuck. 120s gives hax-sdk reasonable headroom for cold-start; anything beyond that is almost certainly wedged. If hax-sdk's create endpoint ever legitimately needs more, we can bump the constant — but the current behaviour of "wait forever" is strictly worse than any finite value. Co-Authored-By: Claude Opus 4.7 (1M context) --- swe_af/app.py | 47 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 46 insertions(+), 1 deletion(-) diff --git a/swe_af/app.py b/swe_af/app.py index 35abd98..2c9ea8c 100644 --- a/swe_af/app.py +++ b/swe_af/app.py @@ -671,7 +671,52 @@ async def build( if approval_user_id: hax_create_kwargs["user_id"] = approval_user_id - hax_request = hax_client.create_request(**hax_create_kwargs) + # Hard timeout on the synchronous hax-sdk HTTP call. Without this, + # a wedged hax-sdk causes app.pause() to never be reached — build + # sits in the revision loop and the parent reasoner's active-time + # budget burns out silently. Observed on + # run_1778512783034_f4985c96: the SECOND revision's create_request + # hung, swe-planner.build never re-paused, and + # github-buddy.implement_from_issue tripped its 7200s watchdog after + # 76 minutes of nothing. 120s gives hax-sdk room for cold-start; + # anything longer is almost certainly wedged and we want to fail + # loudly rather than chew through the parent's pause-aware budget. + app.note( + f"Phase 1.5: Submitting hax create_request (iteration {revision_iter})", + tags=["build", "approval", "hax", "create_request"], + ) + try: + hax_request = await asyncio.wait_for( + asyncio.to_thread( + hax_client.create_request, **hax_create_kwargs + ), + timeout=120.0, + ) + except asyncio.TimeoutError as exc: + app.note( + f"hax_client.create_request timed out after 120s " + f"(iteration {revision_iter})", + tags=["build", "approval", "hax", "timeout"], + ) + raise RuntimeError( + f"hax-sdk create_request timed out after 120s on " + f"iteration {revision_iter}; hax-sdk is likely wedged. " + f"Without this hard timeout the surrounding reasoner " + f"would silently consume the parent's pause-aware " + f"active-time budget." + ) from exc + except Exception as exc: + app.note( + f"hax_client.create_request raised " + f"{type(exc).__name__}: {exc} (iteration {revision_iter})", + tags=["build", "approval", "hax", "error"], + ) + raise + app.note( + f"hax create_request succeeded " + f"(request_id={hax_request.id}, iteration {revision_iter})", + tags=["build", "approval", "hax", "submitted"], + ) with open(approval_state_path, "w") as _fp: _json.dump({ From b2abcb57f2235bd1faac6c60fcc14e318129f852 Mon Sep 17 00:00:00 2001 From: Abir Abbas Date: Mon, 11 May 2026 16:40:18 -0400 Subject: [PATCH 2/2] test(build): pin hax_client.create_request timeout under hang/success/error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a focused test file pinning the wrapper from the prior commit (_create_hax_request_with_timeout). 50 parametrized iterations + 2 sanity tests catch flakes in async scheduler / thread-pool behaviour that a single-run check would miss. Refactor: extracts the inline timeout block from build() into a module-level helper of the same name so tests can drive it directly without standing up a full reasoner invocation. Same control flow, same timeout, same notes — just hoisted for reuse and testability. The module-level HAX_CREATE_REQUEST_TIMEOUT_SECONDS=120.0 is the production default and a sanity test pins it against accidental drift. Test coverage: * 20× hung create_request → must raise RuntimeError within [timeout, 4*timeout]. Pins both lower bound (no spurious early failure) and upper bound (the actual production bug — unbounded wait — would blow past 4*timeout). * 20× responsive create_request → must return its response, complete well under timeout, fire the 'submitted' app.note. * 10× ConnectionError → must propagate unchanged (operators need to tell hang from network failure apart). * configurable timeout_seconds kwarg is honoured (not silently overridden by the module default). * module default is 120s (drift guard). Test characteristics: * Full file runs in ~32s locally (PYENV_VERSION=3.12.5). * hang_seconds=1.5 in the fake client — long enough to outlive the 0.3s test timeout with margin, short enough that orphaned threads (asyncio.to_thread doesn't kill the underlying thread on cancel) don't pile up and starve the default executor pool. * Fixture-scoped patch of app.note prevents real HTTP calls during tests. What this PR does NOT prove: the actual production hang IS in hax_client.create_request specifically. I still haven't reproduced the production failure end-to-end (would require a stack dump from the stuck swe-planner.build container OR a fake hax-sdk + full control-plane setup). The fix is targeted at the highest-likelihood candidate from timeline analysis. If the actual hang is elsewhere (e.g. _format_plan_for_approval), this PR doesn't address it. Co-Authored-By: Claude Opus 4.7 (1M context) --- swe_af/app.py | 118 +++++++---- tests/test_hax_create_request_timeout.py | 249 +++++++++++++++++++++++ 2 files changed, 322 insertions(+), 45 deletions(-) create mode 100644 tests/test_hax_create_request_timeout.py diff --git a/swe_af/app.py b/swe_af/app.py index 2c9ea8c..e35b6f1 100644 --- a/swe_af/app.py +++ b/swe_af/app.py @@ -354,6 +354,75 @@ def _format_plan_for_approval( return plan_summary, prd_markdown, architecture_markdown, issues_for_template +# Default timeout for the synchronous hax-sdk HTTP call. 120s gives hax-sdk +# reasonable headroom for cold-start; anything longer is almost certainly +# wedged. Module-level so tests can shorten it for fast iteration. +HAX_CREATE_REQUEST_TIMEOUT_SECONDS = 120.0 + + +async def _create_hax_request_with_timeout( + *, + hax_client, + hax_create_kwargs: dict, + revision_iter: int, + timeout_seconds: float = HAX_CREATE_REQUEST_TIMEOUT_SECONDS, +): + """Submit the hax-sdk approval request with a hard timeout. + + Without this wrapper, a wedged hax-sdk causes ``app.pause()`` to never be + reached: the surrounding reasoner sits in the Phase 1.5 revision loop, no + new sub-reasoners are spawned, and the parent reasoner's pause-aware + active-time budget burns out silently. Observed on production run + ``run_1778512783034_f4985c96`` — the SECOND revision's ``create_request`` + hung for 76min between sprint_planner completion (16:29:45) and the + parent watchdog firing (17:45:36). + + The hax-sdk Python client is synchronous, so we run it on a thread and + bound the wait with ``asyncio.wait_for``. A ``TimeoutError`` is + surfaced as a clear ``RuntimeError`` so the caller can fail-fast rather + than chew through the parent's pause-aware budget for two hours. + + Notifies the run timeline at three points (entry, success, error) so a + future hang is diagnosable from logs alone — the production failure was + invisible because the original synchronous call had no observability + hooks. + """ + app.note( + f"Phase 1.5: Submitting hax create_request (iteration {revision_iter})", + tags=["build", "approval", "hax", "create_request"], + ) + try: + hax_request = await asyncio.wait_for( + asyncio.to_thread(hax_client.create_request, **hax_create_kwargs), + timeout=timeout_seconds, + ) + except asyncio.TimeoutError as exc: + app.note( + f"hax_client.create_request timed out after {timeout_seconds}s " + f"(iteration {revision_iter})", + tags=["build", "approval", "hax", "timeout"], + ) + raise RuntimeError( + f"hax-sdk create_request timed out after {timeout_seconds}s on " + f"iteration {revision_iter}; hax-sdk is likely wedged. Without " + f"this hard timeout the surrounding reasoner would silently " + f"consume the parent's pause-aware active-time budget." + ) from exc + except Exception as exc: + app.note( + f"hax_client.create_request raised " + f"{type(exc).__name__}: {exc} (iteration {revision_iter})", + tags=["build", "approval", "hax", "error"], + ) + raise + app.note( + f"hax create_request succeeded " + f"(request_id={hax_request.id}, iteration {revision_iter})", + tags=["build", "approval", "hax", "submitted"], + ) + return hax_request + + @app.reasoner() async def build( goal: str, @@ -671,51 +740,10 @@ async def build( if approval_user_id: hax_create_kwargs["user_id"] = approval_user_id - # Hard timeout on the synchronous hax-sdk HTTP call. Without this, - # a wedged hax-sdk causes app.pause() to never be reached — build - # sits in the revision loop and the parent reasoner's active-time - # budget burns out silently. Observed on - # run_1778512783034_f4985c96: the SECOND revision's create_request - # hung, swe-planner.build never re-paused, and - # github-buddy.implement_from_issue tripped its 7200s watchdog after - # 76 minutes of nothing. 120s gives hax-sdk room for cold-start; - # anything longer is almost certainly wedged and we want to fail - # loudly rather than chew through the parent's pause-aware budget. - app.note( - f"Phase 1.5: Submitting hax create_request (iteration {revision_iter})", - tags=["build", "approval", "hax", "create_request"], - ) - try: - hax_request = await asyncio.wait_for( - asyncio.to_thread( - hax_client.create_request, **hax_create_kwargs - ), - timeout=120.0, - ) - except asyncio.TimeoutError as exc: - app.note( - f"hax_client.create_request timed out after 120s " - f"(iteration {revision_iter})", - tags=["build", "approval", "hax", "timeout"], - ) - raise RuntimeError( - f"hax-sdk create_request timed out after 120s on " - f"iteration {revision_iter}; hax-sdk is likely wedged. " - f"Without this hard timeout the surrounding reasoner " - f"would silently consume the parent's pause-aware " - f"active-time budget." - ) from exc - except Exception as exc: - app.note( - f"hax_client.create_request raised " - f"{type(exc).__name__}: {exc} (iteration {revision_iter})", - tags=["build", "approval", "hax", "error"], - ) - raise - app.note( - f"hax create_request succeeded " - f"(request_id={hax_request.id}, iteration {revision_iter})", - tags=["build", "approval", "hax", "submitted"], + hax_request = await _create_hax_request_with_timeout( + hax_client=hax_client, + hax_create_kwargs=hax_create_kwargs, + revision_iter=revision_iter, ) with open(approval_state_path, "w") as _fp: diff --git a/tests/test_hax_create_request_timeout.py b/tests/test_hax_create_request_timeout.py new file mode 100644 index 0000000..b46c0eb --- /dev/null +++ b/tests/test_hax_create_request_timeout.py @@ -0,0 +1,249 @@ +"""Pin: ``_create_hax_request_with_timeout`` bounds a hung hax-sdk call. + +Reproduces and pins the fix for production run ``run_1778512783034_f4985c96``: +github-buddy.implement_from_issue tripped its 7200s active-time watchdog +because swe-planner.build's revision-iteration ``hax_client.create_request`` +hung for 76 minutes (16:29:45 → 17:45:36). The hax-sdk Python client is +synchronous and has no client-side timeout; without an explicit wrap, a +wedged hax-sdk surfaces as a multi-hour silent stall up the call chain. + +Each test runs 10–20 iterations to catch scheduler/timing flakes. The +parametrize ID is the iteration number so failures point at the specific +attempt that broke. +""" + +from __future__ import annotations + +import asyncio +import time +from unittest.mock import MagicMock, patch + +import pytest + +from swe_af.app import _create_hax_request_with_timeout + + +# --------------------------------------------------------------------------- +# Common test plumbing +# --------------------------------------------------------------------------- + + +def _make_hung_hax_client(*, hang_seconds: float = 1.5): + """A hax client whose ``create_request`` blocks the calling thread. + + ``hang_seconds`` defaults to 1.5s — long enough to outlast the test's + 0.3s timeout with a clean margin, short enough that orphaned threads + don't pile up in the default executor and starve later iterations. + ``asyncio.to_thread`` doesn't cancel the underlying thread when the + awaiting coroutine is cancelled by ``wait_for``, so the thread keeps + sleeping until ``hang_seconds`` elapses regardless of test outcome. + """ + + client = MagicMock() + client.create_request = MagicMock( + side_effect=lambda **_kwargs: (time.sleep(hang_seconds), MagicMock())[1] + ) + return client + + +def _make_responsive_hax_client(*, request_id: str = "req-test", url: str = "https://hax/r/test"): + """A hax client that returns immediately with the given ids.""" + + client = MagicMock() + response = MagicMock(id=request_id, url=url) + client.create_request = MagicMock(return_value=response) + return client + + +@pytest.fixture(autouse=True) +def _silence_app_note(): + """Patch ``app.note`` so tests don't make real HTTP calls. + + The helper writes timeline notes at entry/success/error. In tests we + only care about the timeout semantics, not the notes themselves. + """ + + with patch("swe_af.app.app.note") as note_mock: + yield note_mock + + +# --------------------------------------------------------------------------- +# 1. Hang case — the actual production bug we're fixing. +# +# A synchronously-hung ``create_request`` must be cancelled by +# ``asyncio.wait_for`` within a small margin of the configured timeout, +# and the helper must surface a clear ``RuntimeError`` (not silently +# return None or hang the event loop). +# --------------------------------------------------------------------------- + + +@pytest.mark.asyncio +@pytest.mark.parametrize("iteration", list(range(20))) +async def test_hung_create_request_raises_runtime_error_within_timeout_bound(iteration): + """20× repeat: under hang, the helper raises RuntimeError within + [timeout, timeout * 4] seconds. The lower bound rules out spurious + early failures; the upper bound rules out the bug (unbounded wait). + """ + + hax_client = _make_hung_hax_client(hang_seconds=1.5) + + timeout_seconds = 0.3 + start = time.monotonic() + with pytest.raises(RuntimeError, match=r"hax-sdk create_request timed out"): + await _create_hax_request_with_timeout( + hax_client=hax_client, + hax_create_kwargs={"type": "plan-review-v2"}, + revision_iter=1, + timeout_seconds=timeout_seconds, + ) + elapsed = time.monotonic() - start + + # Must not fire BEFORE the configured timeout — that would mean the + # helper is firing for the wrong reason. + assert elapsed >= timeout_seconds * 0.9, ( + f"iteration {iteration}: timeout fired at {elapsed:.3f}s, " + f"before configured {timeout_seconds}s" + ) + # Must not take much LONGER than the timeout — the bug we're pinning is + # exactly "wait longer than configured." A 4× margin allows for thread + # cleanup + asyncio scheduling jitter; the real production failure was + # 6000× over budget. + assert elapsed <= timeout_seconds * 4, ( + f"iteration {iteration}: timeout did NOT fire within " + f"4*{timeout_seconds}s; elapsed={elapsed:.3f}s. This means a hung " + f"hax-sdk could still chew through the parent's active-time budget." + ) + + +# --------------------------------------------------------------------------- +# 2. Success case — happy path must not be broken by the wrap. +# +# A fast ``create_request`` (the normal production case) must return its +# response, complete well under the configured timeout, and call +# ``app.note(... 'submitted' ...)`` so the run timeline reflects success. +# --------------------------------------------------------------------------- + + +@pytest.mark.asyncio +@pytest.mark.parametrize("iteration", list(range(20))) +async def test_responsive_create_request_returns_response(iteration, _silence_app_note): + """20× repeat: under no-hang, the helper returns the hax response + and the timeout doesn't fire.""" + + hax_client = _make_responsive_hax_client( + request_id=f"req-{iteration:02d}", + url=f"https://hax/r/{iteration:02d}", + ) + + start = time.monotonic() + result = await _create_hax_request_with_timeout( + hax_client=hax_client, + hax_create_kwargs={"type": "plan-review-v2", "title": "test"}, + revision_iter=iteration, + timeout_seconds=0.5, + ) + elapsed = time.monotonic() - start + + assert result.id == f"req-{iteration:02d}", f"iteration {iteration}: wrong response id" + assert elapsed < 0.4, ( + f"iteration {iteration}: responsive call took {elapsed:.3f}s " + f"(unexpected slowness)" + ) + # The submitted-note must fire so the run timeline shows success. + submitted_calls = [ + call for call in _silence_app_note.call_args_list + if "submitted" in str(call) + ] + assert len(submitted_calls) >= 1, ( + f"iteration {iteration}: 'submitted' app.note never fired" + ) + + +# --------------------------------------------------------------------------- +# 3. Non-timeout exception — must surface unchanged. +# +# A ConnectionError from hax-sdk (network-level failure) must propagate +# verbatim. We do NOT want to swallow real errors or convert them into +# our timeout RuntimeError — those are different failure modes and +# operators need to tell them apart. +# --------------------------------------------------------------------------- + + +@pytest.mark.asyncio +@pytest.mark.parametrize("iteration", list(range(10))) +async def test_non_timeout_exception_propagates(iteration, _silence_app_note): + """10× repeat: a ConnectionError (or any non-TimeoutError) must surface + unchanged so operators can distinguish network failure from hang.""" + + hax_client = MagicMock() + hax_client.create_request = MagicMock( + side_effect=ConnectionError(f"hax-sdk unreachable (iter {iteration})") + ) + + with pytest.raises(ConnectionError, match=rf"hax-sdk unreachable \(iter {iteration}\)"): + await _create_hax_request_with_timeout( + hax_client=hax_client, + hax_create_kwargs={"type": "plan-review-v2"}, + revision_iter=iteration, + timeout_seconds=0.5, + ) + + # The error-note must fire so the run timeline shows the failure mode. + error_calls = [ + call for call in _silence_app_note.call_args_list + if "raised" in str(call) and "ConnectionError" in str(call) + ] + assert len(error_calls) >= 1, ( + f"iteration {iteration}: 'raised ConnectionError' app.note never fired" + ) + + +# --------------------------------------------------------------------------- +# 4. Regression pin: the helper must use the configured timeout, not the +# module default. This is a property of the wrapper that we want to keep +# stable as the helper evolves. +# --------------------------------------------------------------------------- + + +@pytest.mark.asyncio +async def test_configurable_timeout_is_honored(): + """The ``timeout_seconds`` kwarg must shorten the wait correctly. + + Without this, callers that want to tighten the budget (e.g. tests, or + a future caller that knows hax-sdk should never take > 30s) couldn't. + """ + + hax_client = _make_hung_hax_client(hang_seconds=1.5) + + # 0.2s timeout → must fire in ~0.2s, NOT the module default of 120s. + start = time.monotonic() + with pytest.raises(RuntimeError): + await _create_hax_request_with_timeout( + hax_client=hax_client, + hax_create_kwargs={"type": "x"}, + revision_iter=0, + timeout_seconds=0.2, + ) + elapsed = time.monotonic() - start + assert elapsed < 1.0, ( + f"timeout_seconds=0.2 took {elapsed:.3f}s — module default is leaking through" + ) + + +# --------------------------------------------------------------------------- +# 5. Module default sanity. Pin so the production timeout doesn't drift +# accidentally (e.g. someone bumps it to 1s thinking it's a test value). +# --------------------------------------------------------------------------- + + +def test_module_default_timeout_is_120_seconds(): + """120s is the production budget. Drift would either hide hangs + (too high) or false-positive on legitimate cold-starts (too low).""" + + from swe_af.app import HAX_CREATE_REQUEST_TIMEOUT_SECONDS + + assert HAX_CREATE_REQUEST_TIMEOUT_SECONDS == 120.0, ( + f"Unexpected module default {HAX_CREATE_REQUEST_TIMEOUT_SECONDS}. " + f"If you intentionally changed it, update this assertion + add a " + f"comment in the helper explaining the new value." + )