From e139e7d93ea0059147f5b643542b1a913084c360 Mon Sep 17 00:00:00 2001 From: eshulman2 Date: Thu, 18 Jun 2026 12:35:31 +0300 Subject: [PATCH] fix: reduce container timeout from 2 hours to 30 minutes and fix orphaned containers on timeout MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The effective container timeout was 7200 s (2 hours) despite SandboxConfig advertising 1800 s — that class was dead code. Containers that hit the Python-level asyncio.wait_for TimeoutError also leaked: process.kill() only terminated the podman run client, leaving the container running in the Podman daemon. - Set Settings.container_timeout and ContainerConfig.timeout_seconds to 1800 s (30 minutes) — overridable via FORGE_SANDBOX_TIMEOUT env var - Extract _stop_timed_out_container() and use it for both TimeoutError and CancelledError paths: podman stop → podman kill on non-zero exit or wait timeout → process.wait() with force-kill fallback Co-Authored-By: Claude Sonnet 4.6 (1M context) --- docs/developer-guide.md | 2 +- src/forge/config.py | 4 +- src/forge/sandbox/runner.py | 81 ++++++++++++------- tests/test_sandbox_runner.py | 4 +- .../sandbox/test_runner_timeout_cleanup.py | 70 ++++++++++++++++ 5 files changed, 128 insertions(+), 33 deletions(-) create mode 100644 tests/unit/sandbox/test_runner_timeout_cleanup.py diff --git a/docs/developer-guide.md b/docs/developer-guide.md index c4d8f79a..bb00f4dc 100644 --- a/docs/developer-guide.md +++ b/docs/developer-guide.md @@ -175,7 +175,7 @@ REDIS_URL=redis://localhost:6380/0 # matches docker-compose port mapping ```bash CONTAINER_IMAGE=localhost/forge-dev:latest # built with podman above -CONTAINER_TIMEOUT=7200 # 2 hours max +CONTAINER_TIMEOUT=1800 # 30 minutes max CONTAINER_MEMORY=4g CONTAINER_CPUS=2 ``` diff --git a/src/forge/config.py b/src/forge/config.py index 0a10ea17..bfb0b12e 100644 --- a/src/forge/config.py +++ b/src/forge/config.py @@ -300,8 +300,8 @@ def ignored_ci_checks(self) -> list[str]: description="Container image for task execution (local or registry URL)", ) container_timeout: int = Field( - default=7200, - description="Container execution timeout in seconds (default: 2 hours)", + default=1800, + description="Container execution timeout in seconds (default: 30 minutes)", ) container_memory: str = Field( default="4g", diff --git a/src/forge/sandbox/runner.py b/src/forge/sandbox/runner.py index 10696680..5d81afa1 100644 --- a/src/forge/sandbox/runner.py +++ b/src/forge/sandbox/runner.py @@ -59,7 +59,7 @@ class ContainerConfig: """Configuration for container execution.""" image: str = DEFAULT_IMAGE - timeout_seconds: int = 7200 # 2 hours default + timeout_seconds: int = 1800 # 30 minutes default memory_limit: str = "4g" cpu_limit: str = "2" network_mode: str = "slirp4netns" # Rootless networking @@ -314,6 +314,55 @@ def _build_podman_command( return cmd + async def _stop_timed_out_container( + self, + container_name: str, + process: asyncio.subprocess.Process, + ) -> None: + """Stop a running container and ensure the podman run process exits.""" + stop_process = await asyncio.create_subprocess_exec( + "podman", + "stop", + "-t", + "10", + container_name, + stdout=asyncio.subprocess.DEVNULL, + stderr=asyncio.subprocess.DEVNULL, + ) + + should_kill = False + try: + await asyncio.wait_for(stop_process.wait(), timeout=15.0) + if stop_process.returncode != 0: + logger.warning( + f"podman stop failed for {container_name} " + f"(exit {stop_process.returncode}), killing" + ) + should_kill = True + except TimeoutError: + logger.warning(f"Container {container_name} didn't stop, killing") + should_kill = True + + if should_kill: + kill_process = await asyncio.create_subprocess_exec( + "podman", + "kill", + container_name, + stdout=asyncio.subprocess.DEVNULL, + stderr=asyncio.subprocess.DEVNULL, + ) + try: + await asyncio.wait_for(kill_process.wait(), timeout=15.0) + except TimeoutError: + logger.warning(f"podman kill for {container_name} did not finish") + + try: + await asyncio.wait_for(process.wait(), timeout=15.0) + except TimeoutError: + logger.warning(f"podman run process for {container_name} did not exit, killing") + process.kill() + await process.wait() + async def run( self, workspace_path: Path, @@ -377,9 +426,8 @@ async def run( timeout=config.timeout_seconds + 60, # Extra buffer ) except TimeoutError: - logger.error("Container execution timed out") - process.kill() - await process.wait() + logger.error(f"Container execution timed out, stopping {container_name}") + await self._stop_timed_out_container(container_name, process) return ContainerResult( success=False, exit_code=-1, @@ -389,30 +437,7 @@ async def run( ) except asyncio.CancelledError: logger.warning(f"Container execution cancelled, stopping {container_name}") - # Stop the container via podman (more reliable than killing process) - stop_process = await asyncio.create_subprocess_exec( - "podman", - "stop", - "-t", - "10", - container_name, - stdout=asyncio.subprocess.DEVNULL, - stderr=asyncio.subprocess.DEVNULL, - ) - try: - await asyncio.wait_for(stop_process.wait(), timeout=15.0) - except TimeoutError: - logger.warning(f"Container {container_name} didn't stop, killing") - kill_process = await asyncio.create_subprocess_exec( - "podman", - "kill", - container_name, - stdout=asyncio.subprocess.DEVNULL, - stderr=asyncio.subprocess.DEVNULL, - ) - await kill_process.wait() - # Wait for the original process to finish - await process.wait() + await self._stop_timed_out_container(container_name, process) raise # Re-raise CancelledError exit_code = process.returncode or 0 diff --git a/tests/test_sandbox_runner.py b/tests/test_sandbox_runner.py index bcda80b1..e4e02c24 100644 --- a/tests/test_sandbox_runner.py +++ b/tests/test_sandbox_runner.py @@ -63,8 +63,8 @@ async def test_simple_container_run(self): async def test_container_config_defaults(self): """Test ContainerConfig has sensible defaults.""" config = ContainerConfig() - assert config.image == "forge-dev:latest" - assert config.timeout_seconds == 7200 # 2 hours + assert config.image == "localhost/forge-dev:latest" + assert config.timeout_seconds == 1800 # 30 minutes assert config.memory_limit == "4g" assert config.cpu_limit == "2" assert config.skip_tests is False diff --git a/tests/unit/sandbox/test_runner_timeout_cleanup.py b/tests/unit/sandbox/test_runner_timeout_cleanup.py new file mode 100644 index 00000000..5651134f --- /dev/null +++ b/tests/unit/sandbox/test_runner_timeout_cleanup.py @@ -0,0 +1,70 @@ +"""Tests for container runner timeout cleanup.""" + +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest + +from forge.sandbox.runner import ContainerRunner + + +def _runner_without_init() -> ContainerRunner: + return object.__new__(ContainerRunner) + + +@pytest.mark.asyncio +async def test_stop_failure_kills_container_and_waits_for_run_process() -> None: + runner = _runner_without_init() + stop_process = MagicMock() + stop_process.returncode = 1 + stop_process.wait = AsyncMock() + kill_process = MagicMock() + kill_process.wait = AsyncMock() + run_process = MagicMock() + run_process.wait = AsyncMock() + run_process.kill = MagicMock() + + with patch( + "forge.sandbox.runner.asyncio.create_subprocess_exec", + new=AsyncMock(side_effect=[stop_process, kill_process]), + ) as mock_exec: + await runner._stop_timed_out_container("forge-ticket-abc123", run_process) + + assert mock_exec.call_count == 2 + assert mock_exec.call_args_list[0].args[:3] == ("podman", "stop", "-t") + assert mock_exec.call_args_list[1].args[:2] == ("podman", "kill") + run_process.wait.assert_awaited() + run_process.kill.assert_not_called() + + +@pytest.mark.asyncio +async def test_run_process_wait_timeout_kills_run_process() -> None: + runner = _runner_without_init() + stop_process = MagicMock() + stop_process.returncode = 0 + stop_process.wait = AsyncMock() + run_process = MagicMock() + run_process.wait = AsyncMock() + run_process.kill = MagicMock() + calls = 0 + + async def fake_wait_for(awaitable, timeout): # noqa: ANN001, ARG001 + nonlocal calls + calls += 1 + if calls == 2: + awaitable.close() + raise TimeoutError + return await awaitable + + with ( + patch( + "forge.sandbox.runner.asyncio.create_subprocess_exec", + new=AsyncMock(return_value=stop_process), + ) as mock_exec, + patch("forge.sandbox.runner.asyncio.wait_for", side_effect=fake_wait_for), + ): + await runner._stop_timed_out_container("forge-ticket-abc123", run_process) + + mock_exec.assert_awaited_once() + run_process.kill.assert_called_once() + assert run_process.wait.call_count == 2 + assert run_process.wait.await_count == 1