Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion src/agent/opencode_agent/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,8 @@
When file or bash access is enabled, use the current working directory as the
run workspace. Write any scripts, temporary files, intermediate data, and final
artifacts there. Do not read or write files outside the current workspace.
Do not inspect parent directories, repository folders, reports, traces,
groundtruth files, previous agent outputs, or hidden evaluation artifacts.
"""
)

Expand Down Expand Up @@ -647,6 +649,11 @@ async def run(self, question: str) -> AgentResult:
cmd.append(question)

env = os.environ.copy()
# The OpenCode subprocess should not expose host-side evaluation
# output paths to file/bash tools. The Python wrapper persists the
# trajectory after OpenCode exits, using the parent process env.
env.pop("AGENT_TRAJECTORY_DIR", None)
env.pop("SCENARIOS_DATA_DIR", None)
env.update(self._env_overrides)
env["OPENCODE_CONFIG_CONTENT"] = json.dumps(self._config)
env.setdefault("OPENCODE_DISABLE_AUTOUPDATE", "true")
Expand All @@ -659,7 +666,7 @@ async def run(self, question: str) -> AgentResult:
)
proc = await asyncio.create_subprocess_exec(
*cmd,
cwd=str(_REPO_ROOT),
cwd=str(self._run_dir),
env=env,
stdout=asyncio.subprocess.PIPE,
stderr=asyncio.subprocess.PIPE,
Expand Down
181 changes: 180 additions & 1 deletion src/benchmark/scenario_suite_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,10 @@
from __future__ import annotations

import argparse
import json
import os
import re
import shutil
import subprocess
import sys
from dataclasses import dataclass
Expand All @@ -48,6 +50,7 @@ class MethodConfig:
model_id: str
extra_args: tuple[str, ...] = ()
workspace_root: Path | None = None
stage_workspace: bool = False


def model_dir_name(model_id: str) -> str:
Expand Down Expand Up @@ -142,6 +145,156 @@ def validate_groundtruth_exists(scenario_root: Path, scenario_id: str) -> None:
)


def _is_relative_to(path: Path, base: Path) -> bool:
try:
path.resolve().relative_to(base.resolve())
return True
except ValueError:
return False


def validate_workspace_root_outside_repo(workspace_root: Path, label: str) -> None:
    """Ensure a CLI workspace root does not live inside the repository.

    The path is user-expanded and resolved before the check. ``label`` names
    the offending option in the error message.

    Raises:
        ValueError: if the resolved workspace root is ``REPO_ROOT`` or a
            location beneath it.
    """
    workspace_root = workspace_root.expanduser().resolve()
    if not _is_relative_to(workspace_root, REPO_ROOT):
        # Outside the repository: nothing to reject.
        return
    raise ValueError(
        f"{label} must be outside the repository when file/bash/edit "
        f"access is enabled: {workspace_root}"
    )


def _iter_manifest_paths(value: object) -> list[str]:
"""Return path-like strings from a scenario manifest value."""
if isinstance(value, str):
return [value]
if isinstance(value, list):
paths: list[str] = []
for item in value:
paths.extend(_iter_manifest_paths(item))
return paths
if isinstance(value, dict):
paths = []
for item in value.values():
paths.extend(_iter_manifest_paths(item))
return paths
return []


def _resolve_manifest_data_path(
*,
scenario_root: Path,
scenario_dir: Path,
spec: str,
) -> Path | None:
"""Resolve a manifest data path using the same search order as init_data."""
raw = Path(spec).expanduser()
candidates = [raw] if raw.is_absolute() else [
scenario_dir / raw,
scenario_root / raw,
]

for candidate in candidates:
try:
resolved = candidate.resolve(strict=True)
except FileNotFoundError:
continue
if resolved.is_file() and _is_relative_to(resolved, scenario_root):
return resolved
return None


def _copy_into_workspace(
*,
source: Path,
scenario_root: Path,
workspace_dir: Path,
) -> Path:
relative = source.resolve().relative_to(scenario_root.resolve())
destination = workspace_dir / relative
destination.parent.mkdir(parents=True, exist_ok=True)
shutil.copy2(source, destination)
return destination


def stage_scenario_workspace(
    *,
    scenario_root: Path,
    scenario_id: str,
    workspace_dir: Path,
) -> list[Path]:
    """Create a clean run workspace with only allowed scenario input files.

    The staged workspace intentionally excludes ``groundtruth.txt`` and any
    existing reports/traces. It includes ``question.txt``, ``manifest.json``,
    and data files referenced by the manifest, plus a generated ``README.md``.

    Any existing ``workspace_dir`` is deleted and recreated, so the function
    refuses to run when that directory is, or contains, the scenario inputs.

    Args:
        scenario_root: Root directory holding all scenarios and shared data.
        scenario_id: Identifier passed to ``scenario_dir_for_id`` to locate
            the scenario directory.
        workspace_dir: Directory to (re)create and populate.

    Returns:
        The paths written into the workspace, each listed once.

    Raises:
        FileNotFoundError: if ``question.txt`` or ``manifest.json`` is missing.
        ValueError: if resetting ``workspace_dir`` would delete the scenario
            inputs.
    """
    scenario_root = scenario_root.expanduser().resolve()
    scenario_dir = scenario_dir_for_id(scenario_root, scenario_id)
    manifest_path = scenario_dir / "manifest.json"
    question_path = scenario_dir / "question.txt"

    if not question_path.exists():
        raise FileNotFoundError(
            f"Missing question file for scenario {scenario_id}: {question_path}"
        )
    if not manifest_path.exists():
        raise FileNotFoundError(
            f"Missing manifest file for scenario {scenario_id}: {manifest_path}"
        )

    workspace_dir = workspace_dir.expanduser().resolve()
    # The workspace is wiped below. If it is the scenario tree or one of its
    # ancestors, rmtree would destroy the very inputs we are about to copy.
    for protected in (scenario_root, scenario_dir):
        if _is_relative_to(protected, workspace_dir):
            raise ValueError(
                f"Refusing to reset workspace {workspace_dir}: it contains "
                f"scenario inputs at {protected}"
            )
    if workspace_dir.exists():
        shutil.rmtree(workspace_dir)
    workspace_dir.mkdir(parents=True, exist_ok=True)

    required = (question_path, manifest_path)
    copied = [
        _copy_into_workspace(
            source=source,
            scenario_root=scenario_root,
            workspace_dir=workspace_dir,
        )
        for source in required
    ]
    # Resolved sources already staged, so a file is never copied or counted
    # twice (e.g. when the manifest mentions "question.txt" itself, or when
    # two different specs resolve to the same file).
    staged_sources = {source.resolve() for source in required}

    manifest = json.loads(manifest_path.read_text(encoding="utf-8"))
    for spec in sorted(set(_iter_manifest_paths(manifest))):
        source = _resolve_manifest_data_path(
            scenario_root=scenario_root,
            scenario_dir=scenario_dir,
            spec=spec,
        )
        if source is None:
            continue
        # Case-insensitive match: on case-insensitive filesystems a spec such
        # as "Groundtruth.txt" can still open the ground-truth file.
        if source.name.casefold() == "groundtruth.txt":
            continue
        if source in staged_sources:
            continue
        staged_sources.add(source)
        copied.append(
            _copy_into_workspace(
                source=source,
                scenario_root=scenario_root,
                workspace_dir=workspace_dir,
            )
        )

    readme = workspace_dir / "README.md"
    readme.write_text(
        "\n".join(
            [
                f"# AssetOpsBench scenario {scenario_id} workspace",
                "",
                "This workspace contains only staged scenario inputs.",
                "Do not inspect files outside this directory.",
                "Ground truth, reports, and previous trajectories are intentionally excluded.",
                "",
            ]
        ),
        encoding="utf-8",
    )
    # Avoid listing README.md twice if a staged data file landed on that path.
    if readme not in copied:
        copied.append(readme)
    return copied


def reset_and_load_couchdb(scenario_id: str, scenario_root: Path, dry_run: bool) -> None:
"""Reset CouchDB and load the scenario-specific data from scenario_root."""
env = os.environ.copy()
Expand Down Expand Up @@ -172,6 +325,7 @@ def run_agent_for_scenario(
question: str,
trajectory_dir: Path,
dry_run: bool,
scenario_root: Path | None = None,
) -> None:
"""Run one scenario with one method."""
run_id = f"{method.agent_name}_{scenario_id}"
Expand All @@ -184,7 +338,22 @@ def run_agent_for_scenario(
workspace_dir = method.workspace_root / run_id
extra_args.extend(["--workspace-dir", str(workspace_dir)])
if not dry_run:
workspace_dir.mkdir(parents=True, exist_ok=True)
if method.stage_workspace:
if scenario_root is None:
raise ValueError(
"scenario_root is required when staging a run workspace"
)
staged = stage_scenario_workspace(
scenario_root=scenario_root,
scenario_id=scenario_id,
workspace_dir=workspace_dir,
)
print(
f"Staged {len(staged)} files for scenario {scenario_id} "
f"into {workspace_dir}"
)
else:
workspace_dir.mkdir(parents=True, exist_ok=True)

cmd = [
"uv",
Expand Down Expand Up @@ -308,6 +477,7 @@ def build_methods(args: argparse.Namespace) -> dict[str, MethodConfig]:
model_id=args.model_id,
extra_args=tuple(opencode_extra_args),
workspace_root=args.opencode_workspace_root,
stage_workspace=True,
),
"gemini_cli_agent": MethodConfig(
agent_name="gemini_cli_agent",
Expand Down Expand Up @@ -552,6 +722,13 @@ def main() -> None:
"--opencode-workspace-root is required when enabling OpenCode "
"files, bash, or edits"
)
if opencode_workspace_required:
try:
validate_workspace_root_outside_repo(
args.opencode_workspace_root, "--opencode-workspace-root"
)
except ValueError as exc:
parser.error(str(exc))
gemini_workspace_required = (
args.gemini_allow_files or args.gemini_allow_bash or args.gemini_allow_edit
)
Expand Down Expand Up @@ -592,6 +769,7 @@ def main() -> None:
model_id=method.model_id,
extra_args=method.extra_args,
workspace_root=method_workspace_root,
stage_workspace=method.stage_workspace,
)

if not args.dry_run:
Expand Down Expand Up @@ -623,6 +801,7 @@ def main() -> None:
scenario_id=scenario_id,
question=question,
trajectory_dir=trajectory_dir,
scenario_root=args.scenario_root,
dry_run=args.dry_run,
)
except Exception as exc:
Expand Down
Loading