Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
b6089bf
init
sdevare-nv May 6, 2026
1a1bf7a
bench: add nemo-gym provider + run_infer entry
sdevare-nv May 7, 2026
c55a4ba
bench: drop eager TUI command imports from CLI entry
sdevare-nv May 7, 2026
ab84347
bench: prefer pre-bundled opencode.js over running src/index.ts
sdevare-nv May 7, 2026
85e8e32
bench: commit empty models-snapshot stubs (force-added)
sdevare-nv May 8, 2026
7c9b883
bench: drop webfetch/websearch permission entries from per-instance c…
sdevare-nv May 10, 2026
f59e8c6
bench: accept opts.headers as either function or plain object
sdevare-nv May 10, 2026
b401866
bench: fix default-merge bug that made every fetch abort instantly
sdevare-nv May 10, 2026
a245187
bench: omit body.model when modelId is empty / 'default'
sdevare-nv May 10, 2026
06724b5
bench: enable todowrite tool for SWE-bench agent
sdevare-nv May 11, 2026
2fec780
bench: port _deep_reset_to_base_commit from nv-OpenHands
sdevare-nv May 11, 2026
1b8998a
bench: add progress echoes to deep_reset careful + nuclear passes
sdevare-nv May 11, 2026
9b3d676
bench: capture per-session trajectories + add --enable-subagents flag
sdevare-nv May 11, 2026
6495092
bench: use opencode's anthropic.txt as default system prompt
sdevare-nv May 11, 2026
d850c8f
bench: gate dynamic system env block behind OPENCODE_DISABLE_ENV_PROMPT
sdevare-nv May 11, 2026
01f0889
bench: spawn shell/bun/git via absolute paths to dodge SIF PATH quirks
sdevare-nv May 12, 2026
cdfc4b7
bench: drop spawn cwd, route chdir via shell / opencode --dir
sdevare-nv May 12, 2026
b9602b1
feat: change nudge to user message
sdevare-nv May 18, 2026
52efa89
bench: include untracked files in captured git diff
sdevare-nv Jun 4, 2026
a8f6fc9
bench: bootstrap a git repo when the SIF ships no .git
sdevare-nv Jun 10, 2026
0c088fd
bench: exit 0 deterministically when bench wrote output.jsonl
sdevare-nv Jun 10, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
118 changes: 118 additions & 0 deletions evaluation/benchmarks/swe_bench/scripts/run_infer.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
#!/usr/bin/env bash
# Bench entry script — invoked by gym's OpenCodeHarnessProcessor.get_run_command().
#
# Args (positional, must match the order in app.py's get_run_command):
# $1 COMMIT_HASH opencode commit (informational; checkout is done at setup)
# $2 AGENT agent class name (informational)
# $3 MAX_ITER max agent turns
# $4 DATASET dataset name (informational; gym already dispatched)
# $5 SPLIT dataset split (informational)
# $6 EVAL_OUTPUT_DIR where to write trajectories (relative to opencode dir)
# $7 SELECTED_ID instance_id to run
# $8 INSTANCE_DICT_PATH /root/dataset/data.jsonl (single-line JSONL)
# $9 CONFIG_FILE opencode model config JSON (written by gym)
# $10 WORKSPACE_ROOT resolved repo path inside the SIF (gym side decided)
# $11 USER_MESSAGE_PATH pre-rendered user prompt file (workspace baked in)
# $12 SYSTEM_PROMPT_PATH optional system-prompt override
#
# Environment (set by gym):
# NEMO_GYM_MODEL_SERVER_NAME proxy name on the gym head server
# NEMO_GYM_MODEL_SERVER_BASE_URL base http://host:port for the model server
# NEMO_GYM_METRICS_FPATH path to the metrics JSON to update
# NEMO_GYM_CONFIG_DICT (informational) the gym YAML config blob
# COMMAND_EXEC_TIMEOUT per-bash-command timeout in seconds
# DIVERSIFY_TOOL_NAMES optional: rename tools for RL diversity
# CAMEL_CASE_TOOL_NAMES optional: camelCase tool names

# Abort on the first failing command (-e) and treat a failure anywhere in a
# pipeline as a failure of the whole pipeline (pipefail). `-u` is not enabled,
# which is why every expansion below carries an explicit `:-` default.
set -eo pipefail

# Bind the positional arguments to named variables. Each one falls back to a
# default (possibly the empty string) when the argument is missing or empty.
COMMIT_HASH="${1:-}"
AGENT="${2:-OpenCodeAgent}"
MAX_ITER="${3:-100}"
DATASET="${4:-}"
SPLIT="${5:-test}"
EVAL_OUTPUT_DIR="${6:-evaluation/oh}"
SELECTED_ID="${7:-}"
INSTANCE_DICT_PATH="${8:-/root/dataset/data.jsonl}"
CONFIG_FILE="${9:-/tmp/oc_config.json}"
WORKSPACE_ROOT="${10:-}"
USER_MESSAGE_PATH="${11:-}"
SYSTEM_PROMPT_PATH="${12:-}"

# Required inputs. Each check prints an error and exits with its own status
# code (64-68) when the value is empty. Messages go to stdout.
if [ -z "$SELECTED_ID" ]; then
echo "ERROR: SELECTED_ID (\$7) is required."
exit 64
fi
if [ -z "$WORKSPACE_ROOT" ]; then
echo "ERROR: WORKSPACE_ROOT (\$10) is required — gym side resolves the dataset-aware repo path."
exit 65
fi
if [ -z "$USER_MESSAGE_PATH" ]; then
echo "ERROR: USER_MESSAGE_PATH (\$11) is required — gym side renders the user prompt."
exit 66
fi
# The two model-server variables come from the environment, not from argv.
if [ -z "${NEMO_GYM_MODEL_SERVER_NAME:-}" ]; then
echo "ERROR: NEMO_GYM_MODEL_SERVER_NAME not set in env."
exit 67
fi
if [ -z "${NEMO_GYM_MODEL_SERVER_BASE_URL:-}" ]; then
echo "ERROR: NEMO_GYM_MODEL_SERVER_BASE_URL not set in env."
exit 68
fi

# Resolve the opencode root directory. The script lives at
# evaluation/benchmarks/swe_bench/scripts/run_infer.sh — go up four levels.
# Both paths are canonicalised with `cd ... && pwd`, so they are absolute
# regardless of the directory the script was invoked from.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
OPENCODE_DIR="$(cd "$SCRIPT_DIR/../../../.." && pwd)"
BENCH_CLI="$OPENCODE_DIR/packages/opencode/src/bench/cli.ts"

# Preconditions for launching the bench CLI: the entry file must exist
# (exit 69) and `bun` must be resolvable on PATH (exit 70).
if [ ! -f "$BENCH_CLI" ]; then
echo "ERROR: bench cli.ts not found at $BENCH_CLI"
exit 69
fi
if ! command -v bun >/dev/null 2>&1; then
echo "ERROR: bun not on PATH (expected /opencode_setup/bun/bin/bun)"
exit 70
fi

# Make EVAL_OUTPUT_DIR absolute (relative to opencode dir).
# A value starting with "/" is used as is; anything else is joined onto
# OPENCODE_DIR. The directory is then created if it does not exist yet.
case "$EVAL_OUTPUT_DIR" in
/*) ABS_OUTPUT_DIR="$EVAL_OUTPUT_DIR" ;;
*) ABS_OUTPUT_DIR="$OPENCODE_DIR/$EVAL_OUTPUT_DIR" ;;
esac
mkdir -p "$ABS_OUTPUT_DIR"

# Echo the resolved configuration to stdout before launching.
echo "OPENCODE_DIR: $OPENCODE_DIR"
echo "BENCH_CLI: $BENCH_CLI"
echo "AGENT: $AGENT COMMIT: $COMMIT_HASH MAX_ITER: $MAX_ITER"
echo "DATASET: $DATASET SPLIT: $SPLIT SELECTED_ID: $SELECTED_ID"
echo "EVAL_OUTPUT_DIR: $ABS_OUTPUT_DIR"
echo "INSTANCE_DICT_PATH: $INSTANCE_DICT_PATH"
echo "CONFIG_FILE: $CONFIG_FILE"
echo "WORKSPACE_ROOT: $WORKSPACE_ROOT"
echo "USER_MESSAGE_PATH: $USER_MESSAGE_PATH"
echo "SYSTEM_PROMPT_PATH: $SYSTEM_PROMPT_PATH"
echo "MODEL_SERVER: $NEMO_GYM_MODEL_SERVER_NAME @ $NEMO_GYM_MODEL_SERVER_BASE_URL"

# Assemble the bench CLI invocation as an array so that each value stays a
# single argument even if it contains whitespace.
cmd=(
bun "$BENCH_CLI"
--instance-dict-path "$INSTANCE_DICT_PATH"
--output-dir "$ABS_OUTPUT_DIR"
--config "$CONFIG_FILE"
--max-turns "$MAX_ITER"
--agent-cls "$AGENT"
--dataset "$DATASET"
--split "$SPLIT"
--selected-id "$SELECTED_ID"
--workspace-root "$WORKSPACE_ROOT"
--user-message-file "$USER_MESSAGE_PATH"
)
# --system-prompt is only passed when a path was supplied as $12.
if [ -n "$SYSTEM_PROMPT_PATH" ]; then
cmd+=(--system-prompt "$SYSTEM_PROMPT_PATH")
fi
# ENABLE_SUBAGENTS is an environment switch (not listed in the header above):
# the values "1" and "true" append --enable-subagents; anything else, or
# leaving it unset, does not.
if [ "${ENABLE_SUBAGENTS:-0}" = "1" ] || [ "${ENABLE_SUBAGENTS:-}" = "true" ]; then
cmd+=(--enable-subagents)
fi

# The log line joins the array with spaces, so the original argument quoting is
# not visible in it; the exec below still passes every element verbatim.
echo "Executing: ${cmd[*]}"
# exec replaces this shell with the bun process, so nothing after this line
# runs and bun's exit status becomes the script's exit status.
exec "${cmd[@]}"
78 changes: 78 additions & 0 deletions packages/opencode/src/bench/bootstrap_repo.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
/**
* Bootstrap a git repository inside the workspace when the SIF ships a flat
* source tree without a `.git` directory.
*
* Some dataset SIFs (notably `swe-bench-ext`, and certain SWE-rebench variants)
* copy the repo contents into `/workspace/repo` (or the dataset-specific path)
* without preserving git history. Without `.git`, `runDeepReset` is a silent
* no-op (its `git rev-parse` fails under the outer `|| true`) and
* `captureGitDiff` returns "" — every rollout is recorded as `patch=0 bytes`
* regardless of what the agent did. Port of nv-OpenHands'
* `evaluation/benchmarks/swe_bench/run_infer.py:1142-1156`.
*
* If `.git` already exists, this is a no-op. Otherwise a pristine baseline
* commit is created and tagged `opencode_bench_baseline`. Callers should skip
* `runDeepReset` when this returns `{ freshInit: true }` — the dataset's
* upstream `base_commit` SHA does not exist in the fresh repo, so deep_reset
* would just fail rev-parse and noisily fall through to its nuclear pass.
*/

import { spawn } from "node:child_process"
import { existsSync } from "node:fs"
import path from "node:path"

/**
 * Locate a POSIX shell by absolute path.
 *
 * Candidates are probed in order of preference (bash before sh, `/bin` before
 * `/usr/bin`) and the first one that exists on disk wins.
 *
 * @returns the absolute path of the first existing candidate, or `null` when
 *          none of them exists.
 */
function detectShell(): string | null {
  const candidates = ["/bin/bash", "/usr/bin/bash", "/bin/sh", "/usr/bin/sh"]
  // `find` stops at the first hit, so later candidates are never probed.
  const firstExisting = candidates.find((candidate) => existsSync(candidate))
  return firstExisting === undefined ? null : firstExisting
}

/**
 * Wrap a string in single quotes so a POSIX shell reads it as one literal word.
 *
 * Inside single quotes nothing is special except the single quote itself, so
 * every embedded `'` is rewritten as `'\''`: close the quote, emit an escaped
 * quote, reopen the quote.
 *
 * @param s arbitrary text (may be empty)
 * @returns the single-quoted form of `s`
 */
function shellQuote(s: string): string {
  const escapedQuote = "'\\''"
  const body = s.split("'").join(escapedQuote)
  return "'" + body + "'"
}

/**
 * Build the shell command line that turns `workspaceRoot` into a git
 * repository holding a single baseline commit.
 *
 * The steps are chained with `&&`, so the first failing step stops the chain
 * and its status becomes the status of the whole command:
 *   1. `cd` into the workspace
 *   2. register it as a git `safe.directory` (global config)
 *   3. `git init`, then set a repo-local user name and e-mail
 *   4. stage everything and commit it (`--allow-empty` keeps an empty tree
 *      from failing the commit)
 *   5. force-create the `opencode_bench_baseline` tag on that commit
 *
 * The workspace path only ever appears in its single-quoted form. It is never
 * placed inside a double-quoted shell string, where `$`, backticks, `"` and
 * `\` in the path would be interpreted by the shell.
 *
 * @param workspaceRoot path of the directory to initialise
 * @returns a command string meant to be passed to `<shell> -c`
 */
function buildBootstrapCmd(workspaceRoot: string): string {
  const q = shellQuote(workspaceRoot)
  const steps = [
    `cd ${q}`,
    // Fixed text stays double-quoted; the path is appended as its own
    // single-quoted word so the shell prints it literally.
    `echo "[bootstrap_repo] initializing git repo at "${q}`,
    `git config --global --add safe.directory ${q}`,
    `git init -q`,
    `git config user.email 'bench@opencode.local'`,
    `git config user.name 'opencode bench'`,
    `git add -A`,
    `git commit -q --allow-empty -m 'opencode bench baseline'`,
    `git tag -f opencode_bench_baseline HEAD`,
    // `$(...)` here is shell command substitution on fixed text, not path data.
    `echo "[bootstrap_repo] done; HEAD=$(git rev-parse --short HEAD)"`,
  ]
  return steps.join(" && ")
}

/** Outcome reported by `bootstrapRepoIfMissing`. */
export interface BootstrapResult {
/**
 * `true` when the call created a fresh repository: no `.git` existed and the
 * bootstrap command was reported as successful. `false` when `.git` was
 * already present, when no shell could be found, or when the command failed.
 */
freshInit: boolean
}

/**
 * Create a baseline git repository in `workspaceRoot` when it has no `.git`.
 *
 * When `.git` already exists nothing is run. Otherwise the command produced by
 * `buildBootstrapCmd` is executed through the shell found by `detectShell`,
 * with the child's stdout and stderr forwarded to this process.
 *
 * This function never rejects: a missing shell, a spawn error, a non-zero
 * exit and termination by a signal are all logged and reported as
 * `{ freshInit: false }`.
 *
 * @param workspaceRoot directory that should contain the repository
 * @returns `{ freshInit: true }` only when the bootstrap command ran and
 *          exited with status 0
 */
export async function bootstrapRepoIfMissing(workspaceRoot: string): Promise<BootstrapResult> {
  if (existsSync(path.join(workspaceRoot, ".git"))) {
    return { freshInit: false }
  }
  const shell = detectShell()
  if (!shell) {
    console.warn(`[bench] bootstrap_repo skipped: no shell found at /bin/{bash,sh} or /usr/bin/{bash,sh}`)
    return { freshInit: false }
  }
  const cmd = buildBootstrapCmd(workspaceRoot)
  console.log(`[bench] bootstrap_repo workspace=${workspaceRoot} shell=${shell}`)
  const exitCode = await new Promise<number>((resolve) => {
    // No `cwd` is passed: the command itself `cd`s into the workspace.
    const child = spawn(shell, ["-c", cmd], {
      stdio: ["ignore", "inherit", "inherit"],
    })
    child.on("close", (code, signal) => {
      if (code === null) {
        // A null exit code means the child was terminated by a signal, so the
        // bootstrap did not complete and must not be reported as success.
        console.warn(`[bench] bootstrap_repo terminated by signal ${signal ?? "unknown"}`)
        resolve(1)
        return
      }
      resolve(code)
    })
    // `error` fires when the process could not be spawned at all. A later
    // `close` may follow, but the promise has already settled by then.
    child.on("error", (err) => {
      console.warn(`[bench] bootstrap_repo spawn error: ${err}`)
      resolve(1)
    })
  })
  console.log(`[bench] bootstrap_repo exit=${exitCode}`)
  return { freshInit: exitCode === 0 }
}
Loading