From 3cc8f5b04f6c7241b29b6deae3d4491ece56d482 Mon Sep 17 00:00:00 2001 From: ivanbao9783 Date: Tue, 30 Jun 2026 16:32:48 +0800 Subject: [PATCH] =?UTF-8?q?refactor(swebp):=201.=20=E5=BD=92=E4=B8=80swe?= =?UTF-8?q?=E5=92=8Cswebp=E4=B8=A4=E4=B8=AA=E6=95=B0=E6=8D=AE=E9=9B=86?= =?UTF-8?q?=E7=9A=84=E9=95=9C=E5=83=8F=E6=93=8D=E4=BD=9C=EF=BC=8C=E6=B6=88?= =?UTF-8?q?=E9=99=A4=E5=86=97=E4=BD=99=E4=BB=A3=E7=A0=81=E3=80=82=202.=20?= =?UTF-8?q?=E4=BC=98=E5=8C=96swebp=E4=B8=B4=E6=97=B6=E5=AE=B9=E5=99=A8?= =?UTF-8?q?=E6=B8=85=E7=90=86=E9=80=BB=E8=BE=91=EF=BC=8C=E4=BB=A5Session?= =?UTF-8?q?=E7=B2=92=E5=BA=A6=E7=AE=A1=E7=90=86=E7=94=9F=E5=91=BD=E5=91=A8?= =?UTF-8?q?=E6=9C=9F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ais_bench/benchmark/tasks/swebench/utils.py | 27 +-- .../tasks/swebench_pro/swebench_pro_eval.py | 12 +- .../tasks/swebench_pro/swebench_pro_infer.py | 10 +- .../benchmark/tasks/swebench_pro/utils.py | 66 ++++--- tests/UT/tasks/swebp_pro/test_utils.py | 186 ++++++++++++++---- 5 files changed, 229 insertions(+), 72 deletions(-) diff --git a/ais_bench/benchmark/tasks/swebench/utils.py b/ais_bench/benchmark/tasks/swebench/utils.py index e9fa20c8..c3d828b4 100644 --- a/ais_bench/benchmark/tasks/swebench/utils.py +++ b/ais_bench/benchmark/tasks/swebench/utils.py @@ -26,7 +26,7 @@ def make_swebench_session_id() -> str: return uuid.uuid4().hex -def _merge_docker_labels(labels, session_id: str) -> dict: +def _merge_docker_labels(labels, session_id: str, label_key: str = SWEBENCH_SESSION_LABEL) -> dict: """Merge session label into Docker labels dict. Docker SDK ``containers.create/run(labels=...)`` expects a mapping @@ -43,19 +43,21 @@ def _merge_docker_labels(labels, session_id: str) -> dict: merged[k] = v else: merged = {} - merged[SWEBENCH_SESSION_LABEL] = session_id + merged[label_key] = session_id return merged class _DockerContainersWithSessionLabel: - def __init__(self, containers, session_id: str): + def __init__(self, containers, session_id: str, label_key: str = SWEBENCH_SESSION_LABEL): self._containers = containers self._session_id = session_id + self._label_key = label_key def create(self, *args, **kwargs): kwargs["labels"] = _merge_docker_labels( kwargs.get("labels"), self._session_id, + self._label_key, ) return self._containers.create(*args, **kwargs) @@ -63,6 +65,7 @@ def run(self, *args, **kwargs): kwargs["labels"] = _merge_docker_labels( kwargs.get("labels"), self._session_id, + self._label_key, ) return self._containers.run(*args, **kwargs) @@ -71,23 +74,24 @@ def __getattr__(self, name): class _DockerClientWithSessionLabel: - def __init__(self, client, session_id: str): + def __init__(self, client, session_id: str, label_key: str = SWEBENCH_SESSION_LABEL): self._client = client self.containers = _DockerContainersWithSessionLabel( client.containers, session_id, + label_key, ) def __getattr__(self, name): return getattr(self._client, name) -def add_swebench_session_label_to_docker_client(client, session_id: str): +def add_swebench_session_label_to_docker_client(client, session_id: str, label_key: str = SWEBENCH_SESSION_LABEL): """Return a Docker client wrapper that labels containers it creates.""" - return _DockerClientWithSessionLabel(client, session_id) + return _DockerClientWithSessionLabel(client, session_id, label_key) -def list_swebench_container_ids(session_id: Optional[str] = None) -> Set[str]: +def list_swebench_container_ids(session_id: Optional[str] = None, label_key: str = SWEBENCH_SESSION_LABEL) -> Set[str]: """Return Docker container IDs tagged for one SWE-bench task session.""" if not session_id: return set() @@ -100,7 +104,7 @@ def list_swebench_container_ids(session_id: Optional[str] = None) -> Set[str]: "ps", "-aq", "--filter", - f"label={SWEBENCH_SESSION_LABEL}={session_id}", + f"label={label_key}={session_id}", ], capture_output=True, text=True, @@ -123,10 +127,11 @@ def cleanup_swebench_containers( *, container_ids: Optional[Iterable[str]] = None, session_id: Optional[str] = None, + label_key: str = SWEBENCH_SESSION_LABEL, ): """Stop and remove containers created by the current SWE-bench task.""" targets = set(container_ids or []) - targets.update(list_swebench_container_ids(session_id)) + targets.update(list_swebench_container_ids(session_id, label_key)) targets = sorted(targets) if not targets: return @@ -144,11 +149,11 @@ def cleanup_swebench_containers( _logger.warning("Unexpected error removing containers", exc_info=True) -def add_swebench_session_label_to_run_args(config: dict, session_id: str) -> None: +def add_swebench_session_label_to_run_args(config: dict, session_id: str, label_key: str = SWEBENCH_SESSION_LABEL) -> None: """Add this task's Docker label to mini-swe-agent Docker run args.""" environment = config.setdefault("environment", {}) run_args = list(environment.get("run_args", ["--rm"])) - label_flag = f"{SWEBENCH_SESSION_LABEL}={session_id}" + label_flag = f"{label_key}={session_id}" if label_flag not in run_args: run_args.extend(["--label", label_flag]) environment["run_args"] = run_args diff --git a/ais_bench/benchmark/tasks/swebench_pro/swebench_pro_eval.py b/ais_bench/benchmark/tasks/swebench_pro/swebench_pro_eval.py index 89adfc9b..14cdba40 100644 --- a/ais_bench/benchmark/tasks/swebench_pro/swebench_pro_eval.py +++ b/ais_bench/benchmark/tasks/swebench_pro/swebench_pro_eval.py @@ -35,6 +35,9 @@ eval_with_docker, list_swebench_pro_images, clean_swebench_pro_images, + cleanup_swebench_pro_containers, + make_swebench_pro_session_id, + add_swebench_pro_session_label_to_docker_client, ) KEY_INSTANCE_ID = "instance_id" @@ -361,7 +364,10 @@ def run(self, task_state_manager: TaskStateManager): SWEBP_CODES.SWEBENCH_HARNESS_IMPORT_ERROR, "docker SDK is not installed. Install via 'pip install docker'" ) from e - docker_client = docker.from_env() + session_id = make_swebench_pro_session_id() + docker_client = add_swebench_pro_session_label_to_docker_client( + docker.from_env(), session_id + ) prior_images = list_swebench_pro_images(docker_client) ensure_swebench_pro_docker_images( @@ -482,9 +488,9 @@ def run_eval_with_progress(patch, instance, report_dir, scripts_dir_abs, docker_ finally: if pbar is not None: pbar.close() - + cleanup_swebench_pro_containers(session_id=session_id) + self.logger.info("All instances run.") - self.logger.info("Cleaning up SWE-bench Pro images...") clean_swebench_pro_images(docker_client, prior_images, self.logger) self.logger.info("Image cleanup completed.") diff --git a/ais_bench/benchmark/tasks/swebench_pro/swebench_pro_infer.py b/ais_bench/benchmark/tasks/swebench_pro/swebench_pro_infer.py index f8685a44..23d9d879 100644 --- a/ais_bench/benchmark/tasks/swebench_pro/swebench_pro_infer.py +++ b/ais_bench/benchmark/tasks/swebench_pro/swebench_pro_infer.py @@ -28,9 +28,11 @@ AISBenchValueError, ) from ais_bench.benchmark.tasks.swebench_pro.utils import ( + add_swebench_pro_session_label_to_run_args, cleanup_swebench_pro_containers, ensure_swebench_pro_docker_images, get_dockerhub_image_uri, + make_swebench_pro_session_id, merge_nested_dicts, build_problem_statement, sanitize_config_for_logging, @@ -282,6 +284,10 @@ def run(self, task_state_manager: TaskStateManager): base_config.setdefault("agent", {})["step_limit"] = dataset_cfg[ "step_limit" ] + + session_id = make_swebench_pro_session_id() + add_swebench_pro_session_label_to_run_args(base_config, session_id) + self.logger.info("base_config '%s'", sanitize_config_for_logging(base_config)) progress_manager, live_render_group = _make_swebench_pro_progress_manager( @@ -369,13 +375,13 @@ def run_executor(): for future in futures: if not future.running() and not future.done(): future.cancel() - cleanup_swebench_pro_containers() + cleanup_swebench_pro_containers(session_id=session_id) executor.shutdown(wait=False) raise finally: if not interrupted[0]: executor.shutdown(wait=True) - cleanup_swebench_pro_containers() + cleanup_swebench_pro_containers(session_id=session_id) if live_render_group is not None: from rich.live import Live diff --git a/ais_bench/benchmark/tasks/swebench_pro/utils.py b/ais_bench/benchmark/tasks/swebench_pro/utils.py index 5c06d6a7..d5ea11cd 100644 --- a/ais_bench/benchmark/tasks/swebench_pro/utils.py +++ b/ais_bench/benchmark/tasks/swebench_pro/utils.py @@ -1,7 +1,7 @@ import os import subprocess import re -from typing import Callable, Iterable, TypeVar +from typing import Callable, Iterable, Optional, Set, TypeVar import json import copy @@ -10,6 +10,17 @@ from ais_bench.benchmark.utils.logging.error_codes import SWEBP_CODES from ais_bench.benchmark.utils.logging.exceptions import AISBenchRuntimeError, AISBenchImportError +from ais_bench.benchmark.tasks.swebench.utils import ( + add_swebench_session_label_to_docker_client as _add_session_label_to_docker_client, + add_swebench_session_label_to_run_args as _add_session_label_to_run_args, + cleanup_swebench_containers as _cleanup_session_containers, + list_swebench_container_ids as _list_session_container_ids, + make_swebench_session_id as _make_session_id, +) + + +SWEBENCH_PRO_SESSION_LABEL = "ais_bench.swebench_pro.session" + def sanitize_config_for_logging(config: dict) -> dict: """Deep-copy a config dict and mask sensitive fields (e.g. api_key) for safe logging.""" @@ -28,28 +39,37 @@ def _mask(obj): return _mask(copy.deepcopy(config)) -def cleanup_swebench_pro_containers(): - name_filters = ["minisweagent-", "sweb.eval"] - for name_filter in name_filters: - try: - r = subprocess.run( - ["docker", "ps", "-aq", "--filter", f"name={name_filter}"], - capture_output=True, - text=True, - timeout=10, - ) - if r.returncode != 0 or not (r.stdout or "").strip(): - continue - ids = [x.strip() for x in r.stdout.strip().splitlines() if x.strip()] - if not ids: - continue - subprocess.run( - ["docker", "rm", "-f"] + ids, - capture_output=True, - timeout=30, - ) - except (FileNotFoundError, subprocess.TimeoutExpired, Exception): - pass +def make_swebench_pro_session_id() -> str: + """Generate a unique session id for one SWE-bench Pro task run.""" + return _make_session_id() + + +def add_swebench_pro_session_label_to_docker_client(client, session_id: str): + """Return a Docker client wrapper that labels containers it creates.""" + return _add_session_label_to_docker_client(client, session_id, SWEBENCH_PRO_SESSION_LABEL) + + +def list_swebench_pro_container_ids(session_id: Optional[str] = None) -> Set[str]: + """Return Docker container IDs tagged for one SWE-bench Pro task session.""" + return _list_session_container_ids(session_id, SWEBENCH_PRO_SESSION_LABEL) + + +def cleanup_swebench_pro_containers( + *, + container_ids: Optional[Iterable[str]] = None, + session_id: Optional[str] = None, +): + """Stop and remove containers created by the current SWE-bench Pro task.""" + _cleanup_session_containers( + container_ids=container_ids, + session_id=session_id, + label_key=SWEBENCH_PRO_SESSION_LABEL, + ) + + +def add_swebench_pro_session_label_to_run_args(config: dict, session_id: str) -> None: + """Add this task's Docker label to mini-swe-agent Docker run args.""" + _add_session_label_to_run_args(config, session_id, SWEBENCH_PRO_SESSION_LABEL) def list_swebench_pro_images(client) -> set[str]: diff --git a/tests/UT/tasks/swebp_pro/test_utils.py b/tests/UT/tasks/swebp_pro/test_utils.py index 292e89fd..a3b2f144 100644 --- a/tests/UT/tasks/swebp_pro/test_utils.py +++ b/tests/UT/tasks/swebp_pro/test_utils.py @@ -7,7 +7,7 @@ import shutil import subprocess from pathlib import Path -from unittest.mock import patch, MagicMock +from unittest.mock import patch, MagicMock, call PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..', '..', '..')) if PROJECT_ROOT not in sys.path: @@ -579,42 +579,162 @@ def test_no_new_images_to_clean(self): mock_logger.debug.assert_called_once() -class TestCleanupSwebenchProContainers(unittest.TestCase): +class TestSwebenchProContainerCleanup(unittest.TestCase): + """Session-scoped container cleanup tests (mirror swebench session isolation).""" + @classmethod def setUpClass(cls): cls.utils = utils_module - @patch('subprocess.run') - def test_cleanup_containers_success(self, mock_run): - mock_result1 = MagicMock() - mock_result1.returncode = 0 - mock_result1.stdout = "container1\ncontainer2\n" - mock_result2 = MagicMock() - mock_result2.returncode = 0 - mock_result2.stdout = "" - mock_run.side_effect = [mock_result1, MagicMock(), mock_result2] - - self.utils.cleanup_swebench_pro_containers() - self.assertEqual(mock_run.call_count, 3) - - @patch('subprocess.run') - def test_cleanup_no_containers(self, mock_run): - mock_result = MagicMock() - mock_result.returncode = 0 - mock_result.stdout = "" - mock_run.return_value = mock_result - - self.utils.cleanup_swebench_pro_containers() - - @patch('subprocess.run') - def test_cleanup_handles_docker_not_found(self, mock_run): - mock_run.side_effect = FileNotFoundError("docker not found") - self.utils.cleanup_swebench_pro_containers() - - @patch('subprocess.run') - def test_cleanup_handles_timeout(self, mock_run): - mock_run.side_effect = subprocess.TimeoutExpired(cmd="docker", timeout=10) - self.utils.cleanup_swebench_pro_containers() + def test_list_swebench_pro_container_ids_queries_session_label(self): + result = subprocess.CompletedProcess(args=[], returncode=0, stdout="one\ntwo\n") + + with patch.object(utils_module.subprocess, "run", return_value=result) as mock_run: + container_ids = self.utils.list_swebench_pro_container_ids("session-1") + + self.assertEqual(container_ids, {"one", "two"}) + self.assertEqual( + mock_run.call_args_list, + [ + call( + [ + "docker", + "ps", + "-aq", + "--filter", + f"label={utils_module.SWEBENCH_PRO_SESSION_LABEL}=session-1", + ], + capture_output=True, + text=True, + timeout=10, + ), + ], + ) + + def test_list_swebench_pro_container_ids_without_session_is_empty(self): + with patch.object(utils_module.subprocess, "run", MagicMock()) as mock_run: + container_ids = self.utils.list_swebench_pro_container_ids() + + self.assertEqual(container_ids, set()) + mock_run.assert_not_called() + + def test_cleanup_removes_only_session_tagged_ids(self): + # Simulate docker CLI: ``docker ps`` returns container IDs tagged with + # the session label, ``docker rm -f`` removes them. + def fake_run(cmd, **kwargs): + if "ps" in cmd: + return subprocess.CompletedProcess( + args=cmd, returncode=0, stdout="session-a\nsession-b\n", stderr="" + ) + if "rm" in cmd: + return subprocess.CompletedProcess(args=cmd, returncode=0, stdout="", stderr="") + + with patch.object(utils_module.subprocess, "run", side_effect=fake_run) as mock_run: + self.utils.cleanup_swebench_pro_containers(session_id="session-1") + + # docker ps (list) + docker rm -f (cleanup) + self.assertEqual(mock_run.call_count, 2) + rm_call = mock_run.call_args_list[1] + self.assertEqual( + rm_call, + call( + ["docker", "rm", "-f", "session-a", "session-b"], + capture_output=True, + timeout=30, + ), + ) + + def test_cleanup_without_recorded_or_session_ids_is_noop(self): + with patch.object(utils_module.subprocess, "run", MagicMock()) as mock_run: + self.utils.cleanup_swebench_pro_containers() + + mock_run.assert_not_called() + + def test_cleanup_handles_docker_not_found(self): + def fake_run(cmd, **kwargs): + if "ps" in cmd: + return subprocess.CompletedProcess( + args=cmd, returncode=0, stdout="c1\n", stderr="" + ) + if "rm" in cmd: + raise FileNotFoundError("docker not found") + + with patch.object(utils_module.subprocess, "run", side_effect=fake_run): + # Should not raise even when docker rm fails with FileNotFoundError. + self.utils.cleanup_swebench_pro_containers(session_id="session-1") + + def test_cleanup_handles_timeout(self): + def fake_run(cmd, **kwargs): + if "ps" in cmd: + return subprocess.CompletedProcess( + args=cmd, returncode=0, stdout="c1\n", stderr="" + ) + if "rm" in cmd: + raise subprocess.TimeoutExpired(cmd="docker", timeout=30) + + with patch.object(utils_module.subprocess, "run", side_effect=fake_run): + # Should not raise even when docker rm times out. + self.utils.cleanup_swebench_pro_containers(session_id="session-1") + + def test_add_session_label_to_run_args_preserves_existing_args(self): + config = {"environment": {"run_args": ["--rm", "--network", "none"]}} + + self.utils.add_swebench_pro_session_label_to_run_args(config, "session-1") + + self.assertEqual( + config["environment"]["run_args"], + [ + "--rm", + "--network", + "none", + "--label", + f"{utils_module.SWEBENCH_PRO_SESSION_LABEL}=session-1", + ], + ) + + def test_add_session_label_to_run_args_defaults_run_args(self): + config = {} + self.utils.add_swebench_pro_session_label_to_run_args(config, "session-1") + self.assertEqual( + config["environment"]["run_args"], + ["--rm", "--label", f"{utils_module.SWEBENCH_PRO_SESSION_LABEL}=session-1"], + ) + + def test_docker_client_wrapper_adds_session_label_without_changing_name(self): + client = MagicMock() + wrapped = self.utils.add_swebench_pro_session_label_to_docker_client( + client, + "session-1", + ) + + wrapped.containers.run( + "image", + name="default-name", + labels={"existing": "value"}, + ) + + client.containers.run.assert_called_once_with( + "image", + name="default-name", + labels={ + "existing": "value", + utils_module.SWEBENCH_PRO_SESSION_LABEL: "session-1", + }, + ) + + def test_docker_client_wrapper_passes_through_other_attrs(self): + client = MagicMock() + wrapped = self.utils.add_swebench_pro_session_label_to_docker_client( + client, "session-1" + ) + # ``images`` and other APIs are not containers; ensure they pass through. + _ = wrapped.images + client.images.__getattr__ # attribute exists on mock + self.assertIs(wrapped.images, client.images) + + def test_make_swebench_pro_session_id_is_unique(self): + ids = {self.utils.make_swebench_pro_session_id() for _ in range(100)} + self.assertEqual(len(ids), 100) class TestCollectOutputsLocal(unittest.TestCase):