Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
75 changes: 74 additions & 1 deletion swe_af/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -354,6 +354,75 @@ def _format_plan_for_approval(
return plan_summary, prd_markdown, architecture_markdown, issues_for_template


# Default timeout for the synchronous hax-sdk HTTP call. 120s gives hax-sdk
# reasonable headroom for cold-start; anything longer is almost certainly
# wedged. Module-level so tests can shorten it for fast iteration.
HAX_CREATE_REQUEST_TIMEOUT_SECONDS = 120.0


async def _create_hax_request_with_timeout(
*,
hax_client,
hax_create_kwargs: dict,
revision_iter: int,
timeout_seconds: float = HAX_CREATE_REQUEST_TIMEOUT_SECONDS,
):
"""Submit the hax-sdk approval request with a hard timeout.

Without this wrapper, a wedged hax-sdk causes ``app.pause()`` to never be
reached: the surrounding reasoner sits in the Phase 1.5 revision loop, no
new sub-reasoners are spawned, and the parent reasoner's pause-aware
active-time budget burns out silently. Observed on production run
``run_1778512783034_f4985c96`` — the SECOND revision's ``create_request``
hung for 76min between sprint_planner completion (16:29:45) and the
parent watchdog firing (17:45:36).

The hax-sdk Python client is synchronous, so we run it on a thread and
bound the wait with ``asyncio.wait_for``. A ``TimeoutError`` is
surfaced as a clear ``RuntimeError`` so the caller can fail-fast rather
than chew through the parent's pause-aware budget for two hours.

Notifies the run timeline at three points (entry, success, error) so a
future hang is diagnosable from logs alone — the production failure was
invisible because the original synchronous call had no observability
hooks.
"""
app.note(
f"Phase 1.5: Submitting hax create_request (iteration {revision_iter})",
tags=["build", "approval", "hax", "create_request"],
)
try:
hax_request = await asyncio.wait_for(
asyncio.to_thread(hax_client.create_request, **hax_create_kwargs),
timeout=timeout_seconds,
)
except asyncio.TimeoutError as exc:
app.note(
f"hax_client.create_request timed out after {timeout_seconds}s "
f"(iteration {revision_iter})",
tags=["build", "approval", "hax", "timeout"],
)
raise RuntimeError(
f"hax-sdk create_request timed out after {timeout_seconds}s on "
f"iteration {revision_iter}; hax-sdk is likely wedged. Without "
f"this hard timeout the surrounding reasoner would silently "
f"consume the parent's pause-aware active-time budget."
) from exc
except Exception as exc:
app.note(
f"hax_client.create_request raised "
f"{type(exc).__name__}: {exc} (iteration {revision_iter})",
tags=["build", "approval", "hax", "error"],
)
raise
app.note(
f"hax create_request succeeded "
f"(request_id={hax_request.id}, iteration {revision_iter})",
tags=["build", "approval", "hax", "submitted"],
)
return hax_request


@app.reasoner()
async def build(
goal: str,
Expand Down Expand Up @@ -671,7 +740,11 @@ async def build(
if approval_user_id:
hax_create_kwargs["user_id"] = approval_user_id

hax_request = hax_client.create_request(**hax_create_kwargs)
hax_request = await _create_hax_request_with_timeout(
hax_client=hax_client,
hax_create_kwargs=hax_create_kwargs,
revision_iter=revision_iter,
)

with open(approval_state_path, "w") as _fp:
_json.dump({
Expand Down
249 changes: 249 additions & 0 deletions tests/test_hax_create_request_timeout.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,249 @@
"""Pin: ``_create_hax_request_with_timeout`` bounds a hung hax-sdk call.

Reproduces and pins the fix for production run ``run_1778512783034_f4985c96``:
github-buddy.implement_from_issue tripped its 7200s active-time watchdog
because swe-planner.build's revision-iteration ``hax_client.create_request``
hung for 76 minutes (16:29:45 → 17:45:36). The hax-sdk Python client is
synchronous and has no client-side timeout; without an explicit wrap, a
wedged hax-sdk surfaces as a multi-hour silent stall up the call chain.

Each test runs 10–20 iterations to catch scheduler/timing flakes. The
parametrize ID is the iteration number so failures point at the specific
attempt that broke.
"""

from __future__ import annotations

import asyncio
import time
from unittest.mock import MagicMock, patch

import pytest

from swe_af.app import _create_hax_request_with_timeout


# ---------------------------------------------------------------------------
# Common test plumbing
# ---------------------------------------------------------------------------


def _make_hung_hax_client(*, hang_seconds: float = 1.5):
"""A hax client whose ``create_request`` blocks the calling thread.

``hang_seconds`` defaults to 1.5s — long enough to outlast the test's
0.3s timeout with a clean margin, short enough that orphaned threads
don't pile up in the default executor and starve later iterations.
``asyncio.to_thread`` doesn't cancel the underlying thread when the
awaiting coroutine is cancelled by ``wait_for``, so the thread keeps
sleeping until ``hang_seconds`` elapses regardless of test outcome.
"""

client = MagicMock()
client.create_request = MagicMock(
side_effect=lambda **_kwargs: (time.sleep(hang_seconds), MagicMock())[1]
)
return client


def _make_responsive_hax_client(*, request_id: str = "req-test", url: str = "https://hax/r/test"):
"""A hax client that returns immediately with the given ids."""

client = MagicMock()
response = MagicMock(id=request_id, url=url)
client.create_request = MagicMock(return_value=response)
return client


@pytest.fixture(autouse=True)
def _silence_app_note():
"""Patch ``app.note`` so tests don't make real HTTP calls.

The helper writes timeline notes at entry/success/error. In tests we
only care about the timeout semantics, not the notes themselves.
"""

with patch("swe_af.app.app.note") as note_mock:
yield note_mock


# ---------------------------------------------------------------------------
# 1. Hang case — the actual production bug we're fixing.
#
# A synchronously-hung ``create_request`` must be cancelled by
# ``asyncio.wait_for`` within a small margin of the configured timeout,
# and the helper must surface a clear ``RuntimeError`` (not silently
# return None or hang the event loop).
# ---------------------------------------------------------------------------


@pytest.mark.asyncio
@pytest.mark.parametrize("iteration", list(range(20)))
async def test_hung_create_request_raises_runtime_error_within_timeout_bound(iteration):
"""20× repeat: under hang, the helper raises RuntimeError within
[timeout, timeout * 4] seconds. The lower bound rules out spurious
early failures; the upper bound rules out the bug (unbounded wait).
"""

hax_client = _make_hung_hax_client(hang_seconds=1.5)

timeout_seconds = 0.3
start = time.monotonic()
with pytest.raises(RuntimeError, match=r"hax-sdk create_request timed out"):
await _create_hax_request_with_timeout(
hax_client=hax_client,
hax_create_kwargs={"type": "plan-review-v2"},
revision_iter=1,
timeout_seconds=timeout_seconds,
)
elapsed = time.monotonic() - start

# Must not fire BEFORE the configured timeout — that would mean the
# helper is firing for the wrong reason.
assert elapsed >= timeout_seconds * 0.9, (
f"iteration {iteration}: timeout fired at {elapsed:.3f}s, "
f"before configured {timeout_seconds}s"
)
# Must not take much LONGER than the timeout — the bug we're pinning is
# exactly "wait longer than configured." A 4× margin allows for thread
# cleanup + asyncio scheduling jitter; the real production failure was
# 6000× over budget.
assert elapsed <= timeout_seconds * 4, (
f"iteration {iteration}: timeout did NOT fire within "
f"4*{timeout_seconds}s; elapsed={elapsed:.3f}s. This means a hung "
f"hax-sdk could still chew through the parent's active-time budget."
)


# ---------------------------------------------------------------------------
# 2. Success case — happy path must not be broken by the wrap.
#
# A fast ``create_request`` (the normal production case) must return its
# response, complete well under the configured timeout, and call
# ``app.note(... 'submitted' ...)`` so the run timeline reflects success.
# ---------------------------------------------------------------------------


@pytest.mark.asyncio
@pytest.mark.parametrize("iteration", list(range(20)))
async def test_responsive_create_request_returns_response(iteration, _silence_app_note):
"""20× repeat: under no-hang, the helper returns the hax response
and the timeout doesn't fire."""

hax_client = _make_responsive_hax_client(
request_id=f"req-{iteration:02d}",
url=f"https://hax/r/{iteration:02d}",
)

start = time.monotonic()
result = await _create_hax_request_with_timeout(
hax_client=hax_client,
hax_create_kwargs={"type": "plan-review-v2", "title": "test"},
revision_iter=iteration,
timeout_seconds=0.5,
)
elapsed = time.monotonic() - start

assert result.id == f"req-{iteration:02d}", f"iteration {iteration}: wrong response id"
assert elapsed < 0.4, (
f"iteration {iteration}: responsive call took {elapsed:.3f}s "
f"(unexpected slowness)"
)
# The submitted-note must fire so the run timeline shows success.
submitted_calls = [
call for call in _silence_app_note.call_args_list
if "submitted" in str(call)
]
assert len(submitted_calls) >= 1, (
f"iteration {iteration}: 'submitted' app.note never fired"
)


# ---------------------------------------------------------------------------
# 3. Non-timeout exception — must surface unchanged.
#
# A ConnectionError from hax-sdk (network-level failure) must propagate
# verbatim. We do NOT want to swallow real errors or convert them into
# our timeout RuntimeError — those are different failure modes and
# operators need to tell them apart.
# ---------------------------------------------------------------------------


@pytest.mark.asyncio
@pytest.mark.parametrize("iteration", list(range(10)))
async def test_non_timeout_exception_propagates(iteration, _silence_app_note):
"""10× repeat: a ConnectionError (or any non-TimeoutError) must surface
unchanged so operators can distinguish network failure from hang."""

hax_client = MagicMock()
hax_client.create_request = MagicMock(
side_effect=ConnectionError(f"hax-sdk unreachable (iter {iteration})")
)

with pytest.raises(ConnectionError, match=rf"hax-sdk unreachable \(iter {iteration}\)"):
await _create_hax_request_with_timeout(
hax_client=hax_client,
hax_create_kwargs={"type": "plan-review-v2"},
revision_iter=iteration,
timeout_seconds=0.5,
)

# The error-note must fire so the run timeline shows the failure mode.
error_calls = [
call for call in _silence_app_note.call_args_list
if "raised" in str(call) and "ConnectionError" in str(call)
]
assert len(error_calls) >= 1, (
f"iteration {iteration}: 'raised ConnectionError' app.note never fired"
)


# ---------------------------------------------------------------------------
# 4. Regression pin: the helper must use the configured timeout, not the
# module default. This is a property of the wrapper that we want to keep
# stable as the helper evolves.
# ---------------------------------------------------------------------------


@pytest.mark.asyncio
async def test_configurable_timeout_is_honored():
"""The ``timeout_seconds`` kwarg must shorten the wait correctly.

Without this, callers that want to tighten the budget (e.g. tests, or
a future caller that knows hax-sdk should never take > 30s) couldn't.
"""

hax_client = _make_hung_hax_client(hang_seconds=1.5)

# 0.2s timeout → must fire in ~0.2s, NOT the module default of 120s.
start = time.monotonic()
with pytest.raises(RuntimeError):
await _create_hax_request_with_timeout(
hax_client=hax_client,
hax_create_kwargs={"type": "x"},
revision_iter=0,
timeout_seconds=0.2,
)
elapsed = time.monotonic() - start
assert elapsed < 1.0, (
f"timeout_seconds=0.2 took {elapsed:.3f}s — module default is leaking through"
)


# ---------------------------------------------------------------------------
# 5. Module default sanity. Pin so the production timeout doesn't drift
# accidentally (e.g. someone bumps it to 1s thinking it's a test value).
# ---------------------------------------------------------------------------


def test_module_default_timeout_is_120_seconds():
"""120s is the production budget. Drift would either hide hangs
(too high) or false-positive on legitimate cold-starts (too low)."""

from swe_af.app import HAX_CREATE_REQUEST_TIMEOUT_SECONDS

assert HAX_CREATE_REQUEST_TIMEOUT_SECONDS == 120.0, (
f"Unexpected module default {HAX_CREATE_REQUEST_TIMEOUT_SECONDS}. "
f"If you intentionally changed it, update this assertion + add a "
f"comment in the helper explaining the new value."
)
Loading