From 6568d5f16db9260bd52a73aecc2fde45e32221e5 Mon Sep 17 00:00:00 2001 From: Nancy Date: Fri, 8 May 2026 09:54:06 -0700 Subject: [PATCH] Update Harbor converter runtime isolation Co-authored-by: Ryan Tan <63581031+ryantzr1@users.noreply.github.com> --- hud/cli/convert/__init__.py | 16 ++- hud/cli/convert/harbor.py | 163 ++++++++++++++++++++++++--- hud/cli/convert/tests/test_harbor.py | 98 +++++++++++++++- 3 files changed, 257 insertions(+), 20 deletions(-) diff --git a/hud/cli/convert/__init__.py b/hud/cli/convert/__init__.py index c2cfbd0eb..ca2e1cd47 100644 --- a/hud/cli/convert/__init__.py +++ b/hud/cli/convert/__init__.py @@ -13,6 +13,7 @@ import json import logging +import re import shutil from pathlib import Path @@ -36,6 +37,17 @@ # Shell script extensions that need CRLF -> LF normalization _SHELL_EXTENSIONS = frozenset({".sh", ".bash", ".zsh", ".ksh"}) +_SOURCE_CONTEXT_MARKDOWN_RE = re.compile(r"^[a-z]{2,8}-\d+(?:-\d+)*\.md$", re.IGNORECASE) + + +def _should_skip_task_data_item(path: Path) -> bool: + """Return True for source task files that should not be exposed to agents.""" + lower_name = path.name.lower() + return ( + path.name in ("environment", "solution") + or lower_name in (".dockerignore", "scoring.md") + or bool(_SOURCE_CONTEXT_MARKDOWN_RE.match(path.name)) + ) def _normalize_line_endings(directory: Path) -> None: @@ -154,8 +166,8 @@ def write_result(result: ConvertResult, output_dir: Path) -> Path: dest.mkdir(parents=True, exist_ok=True) for item in source_dir.iterdir(): - # Skip dirs that are handled by the Dockerfile or ignored - if item.name in ("environment", "solution"): + # Skip dirs handled elsewhere and files that leak scoring/source context. + if _should_skip_task_data_item(item): continue if item.is_dir(): shutil.copytree(item, dest / item.name) diff --git a/hud/cli/convert/harbor.py b/hud/cli/convert/harbor.py index dc745bc99..fbe7ac3f9 100644 --- a/hud/cli/convert/harbor.py +++ b/hud/cli/convert/harbor.py @@ -33,6 +33,7 @@ import hashlib import logging import re +import shlex import tomllib from dataclasses import dataclass from pathlib import Path # noqa: TC003 - used at runtime @@ -76,6 +77,55 @@ def _normalize_name(name: str) -> str: return normalized.strip("-") or "converted" +def _docker_instruction_name(line: str) -> str | None: + """Return the Dockerfile instruction name for *line*, if it has one.""" + stripped = line.strip() + if not stripped or stripped.startswith("#"): + return None + return stripped.split(maxsplit=1)[0].upper() + + +def _docker_instruction_value(line: str) -> str: + """Return the remainder of a Dockerfile instruction line.""" + parts = line.strip().split(maxsplit=1) + return parts[1] if len(parts) > 1 else "" + + +def _extract_workdir(content: str) -> str: + """Return the last Dockerfile WORKDIR, defaulting to /app.""" + workdir = "/app" + for line in content.splitlines(): + if _docker_instruction_name(line) != "WORKDIR": + continue + value = _docker_instruction_value(line) + if value: + workdir = value + return workdir + + +def _make_task_slug(task_id: str, used_slugs: set[str]) -> str: + """Create a stable, unique HUD task slug from a Harbor task id.""" + base = _normalize_name(task_id) + digest = hashlib.sha256(task_id.encode()).hexdigest()[:8] + + if len(base) > 100: + base = f"{base[:91].rstrip('-')}-{digest}" + + slug = base + if slug in used_slugs: + slug = f"{base[:91].rstrip('-')}-{digest}" + + counter = 2 + while slug in used_slugs: + suffix = f"{digest}-{counter}" + prefix_limit = 99 - len(suffix) + slug = f"{base[:prefix_limit].rstrip('-')}-{suffix}" + counter += 1 + + used_slugs.add(slug) + return slug + + def _find_dockerfile(env_dir: Path) -> str | None: """Read the Dockerfile from a Harbor environment directory.""" for name in ("Dockerfile", "dockerfile"): @@ -92,8 +142,20 @@ def _adapt_harbor_dockerfile(content: str) -> str: """ lines = content.splitlines() adapted: list[str] = [] + in_healthcheck_continuation = False for line in lines: stripped = line.strip().upper() + + if stripped.startswith("HEALTHCHECK "): + adapted.append(line) + in_healthcheck_continuation = line.rstrip().endswith("\\") + continue + + if in_healthcheck_continuation: + adapted.append(line) + in_healthcheck_continuation = line.rstrip().endswith("\\") + continue + if stripped.startswith(("CMD ", "CMD[", "ENTRYPOINT ", "ENTRYPOINT[")): adapted.append(f"# [harbor original] {line}") else: @@ -167,26 +229,64 @@ def _parse_task(task_dir: Path) -> HarborTask | None: import json import logging +import os import subprocess from pathlib import Path {extra_imports} from hud import Environment from hud.tools import BashTool, EditTool from hud.tools.filesystem import GlobTool, GrepTool, ListTool, ReadTool +from hud.tools.types import ToolError LOGGER = logging.getLogger(__name__) -TASKS_DIR = Path("/harbor/tasks") +TASKS_DIR = Path("/root/.hud_harbor/tasks") +AGENT_WORKDIR = os.path.expandvars({agent_workdir!r}) + + +def _set_agent_workdir() -> None: + """Put agent shell sessions in the original Harbor challenge workdir.""" + try: + os.chdir(AGENT_WORKDIR) + except FileNotFoundError: + if TASKS_DIR.exists(): + LOGGER.warning("Agent workdir does not exist: %s", AGENT_WORKDIR) + else: + LOGGER.debug("Skipping container workdir on host import: %s", AGENT_WORKDIR) + + +_set_agent_workdir() + + +def _resolve_within_base(file_path: Path, base_path: Path) -> Path: + resolved = file_path.resolve() if file_path.is_absolute() else (base_path / file_path).resolve() + try: + resolved.relative_to(base_path) + except ValueError: + raise ToolError(f"Path escapes base directory: {{file_path}}") from None + return resolved + + +class ScopedEditTool(EditTool): + """EditTool variant constrained to the task workdir.""" + + def __init__(self, base_path: str | Path) -> None: + super().__init__() + self._base_path = Path(base_path).resolve() + + def validate_path(self, command: str, path: Path) -> None: + resolved = _resolve_within_base(path, self._base_path) + super().validate_path(command, resolved) env = Environment("{env_name}") # Standard coding tools - agents interact via bash (matching Harbor's model) -env.add_tool(BashTool()) -env.add_tool(EditTool()) -env.add_tool(ReadTool()) -env.add_tool(GrepTool()) -env.add_tool(GlobTool()) -env.add_tool(ListTool()) +env.add_tool(BashTool(timeout=600.0)) +env.add_tool(ScopedEditTool(base_path=AGENT_WORKDIR)) +env.add_tool(ReadTool(base_path=AGENT_WORKDIR)) +env.add_tool(GrepTool(base_path=AGENT_WORKDIR)) +env.add_tool(GlobTool(base_path=AGENT_WORKDIR)) +env.add_tool(ListTool(base_path=AGENT_WORKDIR)) ''' @@ -208,7 +308,7 @@ async def run_task(task_id: TaskId): _SCENARIO_BODY = '''\ """Run a Harbor task by ID. - Reads /harbor/tasks//instruction.md as the prompt. + Reads the root-only task bundle's instruction.md as the prompt. After the agent works, runs tests/test.sh and parses /logs/verifier/reward.txt or reward.json for the reward. """ @@ -228,6 +328,11 @@ async def run_task(task_id: TaskId): # Ensure log output directory exists logs_dir = Path("/logs/verifier") logs_dir.mkdir(parents=True, exist_ok=True) + for reward_file in (Path("/logs/verifier/reward.txt"), Path("/logs/verifier/reward.json")): + try: + reward_file.unlink(missing_ok=True) + except OSError as exc: + LOGGER.warning("Failed to clear stale reward file %s: %s", reward_file, exc) # Harbor mounts the task's tests/ directory at /tests/ — replicate that tests_link = Path("/tests") @@ -243,7 +348,7 @@ async def run_task(task_id: TaskId): try: result = subprocess.run( ["bash", str(test_script)], - cwd="/app", + cwd=AGENT_WORKDIR if Path(AGENT_WORKDIR).is_dir() else "/app", capture_output=True, text=True, timeout={verifier_timeout}, @@ -303,6 +408,7 @@ def _build_env_py( source_path: str, task_ids: list[str], verifier_timeout: int, + agent_workdir: str, ) -> str: """Build the env.py content, adapting the scenario signature to task count.""" if len(task_ids) == 1: @@ -318,6 +424,7 @@ def _build_env_py( source_path=source_path, task_count=len(task_ids), extra_imports=extra_imports, + agent_workdir=agent_workdir, ) body = _SCENARIO_BODY.format(verifier_timeout=verifier_timeout) return header + scenario + body @@ -327,6 +434,14 @@ def _build_env_py( # Shared snippet: install uv standalone (works on any base image with curl or # apt), then use uv to bootstrap Python and sync dependencies. _HUD_LAYER = """\ +USER root +# HUD coding subprocesses run as uid/gid 1000, so let them edit the original +# challenge tree while keeping scenario-only task data outside that tree. +RUN agent_workdir={agent_workdir_shell} \\ + && eval "agent_workdir=\\"$agent_workdir\\"" \\ + && mkdir -p /workspace /app \\ + && if [ -d "$agent_workdir" ]; then chmod -R a+rwX "$agent_workdir"; fi + # ============================================================ # HUD MCP server layer # ============================================================ @@ -341,11 +456,14 @@ def _build_env_py( ENV PATH="/root/.local/bin:$PATH" COPY pyproject.toml uv.lock* ./ -RUN uv sync --frozen --no-dev --no-install-project 2>/dev/null || \\ - uv sync --no-dev --no-install-project +RUN uv sync --frozen --no-dev --no-install-project --python 3.12 2>/dev/null || \\ + uv sync --no-dev --no-install-project --python 3.12 +ENV PATH="/hud/.venv/bin:$PATH" -# Harbor task data (instructions + test scripts baked into image) -COPY tasks/ /harbor/tasks/ +# The scenario reads task data directly from a root-only bundle. The agent only +# receives the yielded prompt and task workdir files. +COPY tasks/ /root/.hud_harbor/tasks/ +RUN chown -R root:root /root/.hud_harbor && chmod -R go-rwx /root/.hud_harbor # Ensure standard directories exist and are writable at runtime # (MCP server may run as non-root; Harbor tasks expect /app writable) @@ -353,7 +471,7 @@ def _build_env_py( COPY env.py ./ -CMD ["uv", "run", "--no-project", "python", "-m", "hud", "dev", "env:env", "--stdio"] +CMD ["hud", "dev", "env:env", "--stdio"] """ DOCKERFILE_WITH_BASE_TEMPLATE = ( @@ -457,6 +575,7 @@ def convert(self, path: Path) -> ConvertResult: # Generate environments and taskset environments: list[GeneratedEnvironment] = [] taskset: list[dict[str, Any]] = [] + used_slugs: set[str] = set() base_name = f"hud-harbor-{_normalize_name(dataset_name)}" # Sort groups by size (largest first) for consistent naming @@ -470,6 +589,13 @@ def convert(self, path: Path) -> ConvertResult: rep_task = group_tasks[0] env_dir = rep_task.directory / "environment" dockerfile_content = _find_dockerfile(env_dir) if env_dir.exists() else None + agent_workdir = _extract_workdir(dockerfile_content or "") + env_cfg = rep_task.config.get("environment", {}) + if isinstance(env_cfg, dict): + configured_workdir = env_cfg.get("workdir") + if isinstance(configured_workdir, str) and configured_workdir: + agent_workdir = configured_workdir + agent_workdir_shell = shlex.quote(agent_workdir) # Extract verifier timeout from config verifier_timeout = 600 @@ -487,6 +613,7 @@ def convert(self, path: Path) -> ConvertResult: source_path=path.as_posix(), task_ids=task_ids, verifier_timeout=verifier_timeout, + agent_workdir=agent_workdir, ) # --- Generate Dockerfile.hud --- @@ -495,9 +622,12 @@ def convert(self, path: Path) -> ConvertResult: dockerfile = DOCKERFILE_WITH_BASE_TEMPLATE.format( source=env_dir.as_posix(), base_dockerfile=adapted, + agent_workdir_shell=agent_workdir_shell, ) else: - dockerfile = DOCKERFILE_FALLBACK_TEMPLATE + dockerfile = DOCKERFILE_FALLBACK_TEMPLATE.format( + agent_workdir_shell=agent_workdir_shell, + ) # --- Generate pyproject.toml --- pyproject = PYPROJECT_TEMPLATE.format(name=env_name) @@ -532,10 +662,13 @@ def convert(self, path: Path) -> ConvertResult: taskset.append( { + "slug": _make_task_slug(task.task_id, used_slugs), "env": {"name": env_name}, "scenario": f"{env_name}:run-task", "args": {"task_id": task.task_id}, "metadata": metadata, + "agent_config": {"append_setup_output": False}, + "validation": None, } ) diff --git a/hud/cli/convert/tests/test_harbor.py b/hud/cli/convert/tests/test_harbor.py index 64c6c6b2d..b69943e7b 100644 --- a/hud/cli/convert/tests/test_harbor.py +++ b/hud/cli/convert/tests/test_harbor.py @@ -19,6 +19,7 @@ from hud.cli.convert.harbor import ( HarborConverter, _adapt_harbor_dockerfile, + _extract_workdir, _find_dockerfile, _hash_directory, _is_harbor_task, @@ -85,6 +86,23 @@ def test_no_cmd_or_entrypoint(self) -> None: dockerfile = "FROM python:3.11\nRUN apt-get update" assert _adapt_harbor_dockerfile(dockerfile) == dockerfile + def test_preserves_healthcheck_cmd(self) -> None: + dockerfile = ( + "FROM python:3.11\n" + "HEALTHCHECK --interval=10s \\\n" + " CMD pg_isready -q || exit 1\n" + 'ENTRYPOINT ["/entrypoint.sh"]' + ) + result = _adapt_harbor_dockerfile(dockerfile) + assert " CMD pg_isready -q || exit 1" in result + assert '# [harbor original] ENTRYPOINT ["/entrypoint.sh"]' in result + + +class TestDockerfileExtraction: + def test_extracts_last_workdir(self) -> None: + dockerfile = "FROM python:3.11\nWORKDIR /tmp\nWORKDIR /app/src\n" + assert _extract_workdir(dockerfile) == "/app/src" + class TestHashDirectory: def test_same_content_same_hash(self, tmp_path: Path) -> None: @@ -247,6 +265,7 @@ def test_taskset_references_env(self, single_task: Path) -> None: result = self.converter.convert(single_task) entry = result.taskset[0] env_name = result.environments[0].name + assert entry["slug"] == "cancel-async-tasks" assert entry["scenario"] == f"{env_name}:run-task" assert entry["args"]["task_id"] == "cancel-async-tasks" @@ -278,6 +297,11 @@ def test_all_task_ids_present(self, dataset_same_env: Path) -> None: task_ids = {e["args"]["task_id"] for e in result.taskset} assert task_ids == {"cancel-async-tasks", "build-pmars", "chess-best-move"} + def test_all_task_slugs_present(self, dataset_same_env: Path) -> None: + result = self.converter.convert(dataset_same_env) + slugs = {e["slug"] for e in result.taskset} + assert slugs == {"cancel-async-tasks", "build-pmars", "chess-best-move"} + def test_env_name_from_dataset(self, dataset_same_env: Path) -> None: result = self.converter.convert(dataset_same_env) env = result.environments[0] @@ -481,6 +505,12 @@ def test_metadata_includes_toml_metadata(self, single_task: Path) -> None: assert meta.get("category") == "systems" assert meta.get("difficulty") == "medium" + def test_taskset_disables_agent_setup_append(self, single_task: Path) -> None: + result = self.converter.convert(single_task) + entry = result.taskset[0] + assert entry["agent_config"] == {"append_setup_output": False} + assert entry["validation"] is None + # ============================================================================ # Dockerfile generation @@ -507,7 +537,18 @@ def test_hud_layer_present(self, single_task: Path) -> None: def test_tasks_copied_into_image(self, single_task: Path) -> None: result = self.converter.convert(single_task) dockerfile = result.environments[0].dockerfile - assert "COPY tasks/ /harbor/tasks/" in dockerfile + assert "COPY tasks/ /root/.hud_harbor/tasks/" in dockerfile + + def test_task_data_hidden_from_agent_uid(self, single_task: Path) -> None: + result = self.converter.convert(single_task) + dockerfile = result.environments[0].dockerfile + assert "chmod -R go-rwx /root/.hud_harbor" in dockerfile + + def test_agent_workdir_permissions_added(self, single_task: Path) -> None: + result = self.converter.convert(single_task) + dockerfile = result.environments[0].dockerfile + assert "agent_workdir=/workspace" in dockerfile + assert 'chmod -R a+rwX "$agent_workdir"' in dockerfile def test_logs_dir_created(self, single_task: Path) -> None: result = self.converter.convert(single_task) @@ -529,12 +570,45 @@ def test_imports_present(self, single_task: Path) -> None: env_py = result.environments[0].env_py assert "from hud import Environment" in env_py assert "from hud.tools import BashTool" in env_py + assert "from hud.tools.filesystem import GlobTool, GrepTool, ListTool, ReadTool" in env_py + assert "from hud.tools.types import ToolError" in env_py def test_tools_added(self, single_task: Path) -> None: result = self.converter.convert(single_task) env_py = result.environments[0].env_py - assert "env.add_tool(BashTool())" in env_py - assert "env.add_tool(EditTool())" in env_py + assert "env.add_tool(BashTool(timeout=600.0))" in env_py + assert "env.add_tool(ScopedEditTool(base_path=AGENT_WORKDIR))" in env_py + for tool in ("GlobTool", "GrepTool", "ListTool", "ReadTool"): + assert f"env.add_tool({tool}(base_path=AGENT_WORKDIR))" in env_py + assert "class ScopedEditTool(EditTool)" in env_py + + def test_agent_workdir_set_from_dockerfile(self, single_task: Path) -> None: + result = self.converter.convert(single_task) + env_py = result.environments[0].env_py + assert "AGENT_WORKDIR = os.path.expandvars('/workspace')" in env_py + assert "_set_agent_workdir()" in env_py + assert 'cwd=AGENT_WORKDIR if Path(AGENT_WORKDIR).is_dir() else "/app"' in env_py + + def test_agent_workdir_prefers_task_config(self, tmp_path: Path) -> None: + task_toml = """ +[metadata] +category = "systems" + +[environment] +workdir = "/build/repo" + +[verifier] +timeout_sec = 120 +""" + task = make_harbor_task( + tmp_path, + "configured-workdir", + task_toml=task_toml, + dockerfile="FROM python:3.11\nWORKDIR /donotaccess\n", + ) + result = self.converter.convert(task) + env_py = result.environments[0].env_py + assert "AGENT_WORKDIR = os.path.expandvars('/build/repo')" in env_py def test_reward_parsing_logic(self, single_task: Path) -> None: result = self.converter.convert(single_task) @@ -662,6 +736,7 @@ def test_taskset_json_valid(self, single_task: Path, tmp_path: Path) -> None: assert isinstance(data, list) assert len(data) == 1 + assert data[0]["slug"] == "cancel-async-tasks" assert data[0]["args"]["task_id"] == "cancel-async-tasks" def test_task_files_copied(self, single_task: Path, tmp_path: Path) -> None: @@ -676,6 +751,23 @@ def test_task_files_copied(self, single_task: Path, tmp_path: Path) -> None: assert (task_out / "task.toml").is_file() assert (task_out / "tests" / "test.sh").is_file() + def test_scoring_and_source_context_not_copied(self, single_task: Path, tmp_path: Path) -> None: + hidden_files = ("SCORING.md", "ref-1.md", "note-12-3.md", ".dockerignore") + for file_name in hidden_files: + (single_task / file_name).write_text("hidden task context", encoding="utf-8") + (single_task / "README.md").write_text("public task context", encoding="utf-8") + + result = self.converter.convert(single_task) + out = tmp_path / "output" + write_result(result, out) + + env = result.environments[0] + task_out = out / env.name / "tasks" / "cancel-async-tasks" + + for file_name in hidden_files: + assert not (task_out / file_name).exists() + assert (task_out / "README.md").is_file() + def test_environment_dir_not_copied(self, single_task: Path, tmp_path: Path) -> None: result = self.converter.convert(single_task) out = tmp_path / "output"