From f207fd26f136a05c9dae9f623120cbe74d123136 Mon Sep 17 00:00:00 2001
From: Minh Vu <vuhoangminh97@gmail.com>
Date: Sat, 27 Jun 2026 12:32:27 +0200
Subject: [PATCH] Harden SWE agent test file parsing

---
 responses_api_agents/swe_agents/app.py        | 33 +++++++++++---
 .../swe_agents/tests/test_app.py              | 44 +++++++++++++++++++
 2 files changed, 71 insertions(+), 6 deletions(-)

diff --git a/responses_api_agents/swe_agents/app.py b/responses_api_agents/swe_agents/app.py
index e248038bf..3df6d683c 100644
--- a/responses_api_agents/swe_agents/app.py
+++ b/responses_api_agents/swe_agents/app.py
@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import ast
 import asyncio
 import glob
 import importlib.util
@@ -537,6 +538,49 @@ def get_run_command(self) -> ExecuteContainerCommandArgs:
 
 
 class NVInternalDatasetProcessor(BaseDatasetHarnessProcessor):
+    @staticmethod
+    def _parse_selected_test_files_to_run(raw_value: Any) -> list[str]:
+        """Decode ``selected_test_files_to_run`` metadata into a list of test file names.
+
+        A string is parsed as JSON first and, failing that, as a Python literal via
+        ``ast.literal_eval``; it is never evaluated as code. Any other value is
+        validated as-is, except that a tuple is converted to a list.
+
+        Args:
+            raw_value: Raw metadata value: a string encoding, a list, or a tuple.
+
+        Returns:
+            The decoded test file names.
+
+        Raises:
+            ValueError: If a string cannot be parsed, or the decoded value is not a
+                list (or tuple) containing only strings.
+        """
+        parsed_value = raw_value
+        if isinstance(raw_value, str):
+            parse_error = "selected_test_files_to_run must be a JSON array or Python literal list of strings"
+            try:
+                parsed_value = json.loads(raw_value)
+            except json.JSONDecodeError:
+                # Not JSON: fall back to a Python literal, e.g. a single-quoted list.
+                try:
+                    parsed_value = ast.literal_eval(raw_value)
+                except (SyntaxError, ValueError, TypeError, MemoryError, RecursionError) as exc:
+                    # literal_eval may raise any of these on malformed input; report
+                    # them uniformly as ValueError.
+                    raise ValueError(parse_error) from exc
+            except RecursionError as exc:
+                # Excessively nested JSON: reject it rather than retrying as a literal.
+                raise ValueError(parse_error) from exc
+
+        if isinstance(parsed_value, tuple):
+            parsed_value = list(parsed_value)
+
+        if not isinstance(parsed_value, list) or not all(isinstance(test_file, str) for test_file in parsed_value):
+            raise ValueError("selected_test_files_to_run must decode to a list of strings")
+
+        return parsed_value
+
     def get_run_command(self) -> ExecuteContainerCommandArgs:
         instance_dict = json.loads(self.config.problem_info["instance_dict"])
         base_dockerfile = instance_dict.get("base_dockerfile", "")
@@ -572,11 +595,9 @@ def get_run_command(self) -> ExecuteContainerCommandArgs:
             repo_cmd = repo_cmd.split("\n")[-1]
 
         # Get test files
-        test_files_str = instance_dict.get("selected_test_files_to_run", "[]")
-        if isinstance(test_files_str, str):
-            test_files = ",".join(eval(test_files_str))
-        else:
-            test_files = ",".join(test_files_str)
+        test_files = self._parse_selected_test_files_to_run(instance_dict.get("selected_test_files_to_run", "[]"))
+        test_files_arg = ",".join(test_files)
+        test_files_arg = f" {shlex.quote(test_files_arg)}" if test_files_arg else ""
 
         run_script = instance_dict["run_script.sh"]
         parsing_script = instance_dict["parsing_script.py"]
@@ -608,7 +629,7 @@ def get_run_command(self) -> ExecuteContainerCommandArgs:
 {repo_cmd}
 
 # Run tests
-bash /root/run_script.sh {test_files} > /root/stdout.log 2> /root/stderr.log || true
+bash /root/run_script.sh{test_files_arg} > /root/stdout.log 2> /root/stderr.log || true
 
 # Parse results
 python /root/parsing_script.py /root/stdout.log /root/stderr.log /root/output.json
diff --git a/responses_api_agents/swe_agents/tests/test_app.py b/responses_api_agents/swe_agents/tests/test_app.py
index d8f6bcd04..e8554097e 100644
--- a/responses_api_agents/swe_agents/tests/test_app.py
+++ b/responses_api_agents/swe_agents/tests/test_app.py
@@ -530,6 +530,38 @@ def test_get_run_command_list_test_files(self) -> None:
             result = processor.get_run_command()
             assert "test_x.py,test_y.py" in result.command
 
+    def test_get_run_command_python_literal_test_files(self) -> None:
+        """A Python-literal list string is accepted and joined with commas."""
+        literal_list = "['test_x.py', 'test_y.py']"
+        with tempfile.TemporaryDirectory() as workdir:
+            processor = self._make_processor(workdir, {"selected_test_files_to_run": literal_list})
+            command = processor.get_run_command().command
+            assert "test_x.py,test_y.py" in command
+
+    def test_get_run_command_quotes_test_files_arg(self) -> None:
+        """The joined test file argument appears single-quoted in the run command."""
+        unsafe_files = ["test_x.py", "test;touch /tmp/pwned"]
+        with tempfile.TemporaryDirectory() as workdir:
+            processor = self._make_processor(workdir, {"selected_test_files_to_run": unsafe_files})
+            command = processor.get_run_command().command
+            assert "bash /root/run_script.sh 'test_x.py,test;touch /tmp/pwned'" in command
+
+    def test_get_run_command_rejects_executable_test_files_metadata(self) -> None:
+        """A string holding a Python call expression is rejected with ValueError."""
+        payload = '__import__("os").system("touch /tmp/pwned")'
+        with tempfile.TemporaryDirectory() as workdir:
+            processor = self._make_processor(workdir, {"selected_test_files_to_run": payload})
+            with pytest.raises(ValueError, match="selected_test_files_to_run"):
+                processor.get_run_command()
+
+    def test_get_run_command_rejects_non_string_test_files(self) -> None:
+        """A JSON array containing a non-string entry is rejected with ValueError."""
+        mixed_types = '["test_x.py", 1]'
+        with tempfile.TemporaryDirectory() as workdir:
+            processor = self._make_processor(workdir, {"selected_test_files_to_run": mixed_types})
+            with pytest.raises(ValueError, match="selected_test_files_to_run"):
+                processor.get_run_command()
+
     def test_get_run_command_no_repo_cmd(self) -> None:
         with tempfile.TemporaryDirectory() as tmpdir:
             processor = self._make_processor(