From b6089bf0e72a2b91c6a04e98a489b0f54953a1eb Mon Sep 17 00:00:00 2001 From: Sugam Devare Date: Wed, 6 May 2026 15:58:07 -0700 Subject: [PATCH 01/21] init --- .../benchmarks/swe_bench/scripts/run_infer.sh | 106 ++++ packages/opencode/src/bench/cli.ts | 389 +++++++++++++ packages/opencode/src/provider/provider.ts | 5 + .../src/provider/sdk/nemo-gym/index.ts | 62 +++ .../provider/sdk/nemo-gym/language-model.ts | 510 ++++++++++++++++++ 5 files changed, 1072 insertions(+) create mode 100755 evaluation/benchmarks/swe_bench/scripts/run_infer.sh create mode 100644 packages/opencode/src/bench/cli.ts create mode 100644 packages/opencode/src/provider/sdk/nemo-gym/index.ts create mode 100644 packages/opencode/src/provider/sdk/nemo-gym/language-model.ts diff --git a/evaluation/benchmarks/swe_bench/scripts/run_infer.sh b/evaluation/benchmarks/swe_bench/scripts/run_infer.sh new file mode 100755 index 000000000000..25bd5b001e09 --- /dev/null +++ b/evaluation/benchmarks/swe_bench/scripts/run_infer.sh @@ -0,0 +1,106 @@ +#!/usr/bin/env bash +# Bench entry script — invoked by gym's OpenCodeHarnessProcessor.get_run_command(). 
+# +# Args (positional, must match the order in app.py's get_run_command): +# $1 COMMIT_HASH opencode commit (informational; checkout is done at setup) +# $2 AGENT agent class name (informational) +# $3 MAX_ITER max agent turns +# $4 DATASET dataset name (informational; dispatch already done by gym) +# $5 SPLIT dataset split (informational) +# $6 EVAL_OUTPUT_DIR where to write trajectories (relative to opencode dir) +# $7 SELECTED_ID instance_id to run +# $8 INSTANCE_DICT_PATH /root/dataset/data.jsonl (single-line JSONL) +# $9 CONFIG_FILE opencode model config JSON (written by gym) +# $10 USER_PROMPT_PATH optional +# $11 SYSTEM_PROMPT_PATH optional +# +# Environment (set by gym): +# NEMO_GYM_MODEL_SERVER_NAME proxy name on the gym head server +# NEMO_GYM_MODEL_SERVER_BASE_URL base http://host:port for the model server +# NEMO_GYM_METRICS_FPATH path to the metrics JSON to update +# NEMO_GYM_CONFIG_DICT (informational) the gym YAML config blob +# COMMAND_EXEC_TIMEOUT per-bash-command timeout in seconds +# DIVERSIFY_TOOL_NAMES optional: rename tools for RL diversity +# CAMEL_CASE_TOOL_NAMES optional: camelCase tool names + +set -eo pipefail + +COMMIT_HASH="${1:-}" +AGENT="${2:-OpenCodeAgent}" +MAX_ITER="${3:-100}" +DATASET="${4:-}" +SPLIT="${5:-test}" +EVAL_OUTPUT_DIR="${6:-evaluation/oh}" +SELECTED_ID="${7:-}" +INSTANCE_DICT_PATH="${8:-/root/dataset/data.jsonl}" +CONFIG_FILE="${9:-/tmp/oc_config.json}" +USER_PROMPT_PATH="${10:-}" +SYSTEM_PROMPT_PATH="${11:-}" + +if [ -z "$SELECTED_ID" ]; then + echo "ERROR: SELECTED_ID (\$7) is required." + exit 64 +fi +if [ -z "${NEMO_GYM_MODEL_SERVER_NAME:-}" ]; then + echo "ERROR: NEMO_GYM_MODEL_SERVER_NAME not set in env." + exit 65 +fi +if [ -z "${NEMO_GYM_MODEL_SERVER_BASE_URL:-}" ]; then + echo "ERROR: NEMO_GYM_MODEL_SERVER_BASE_URL not set in env." + exit 66 +fi + +# Resolve the opencode root directory. The script lives at +# evaluation/benchmarks/swe_bench/scripts/run_infer.sh — go up four levels. 
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +OPENCODE_DIR="$(cd "$SCRIPT_DIR/../../../.." && pwd)" +BENCH_CLI="$OPENCODE_DIR/packages/opencode/src/bench/cli.ts" + +if [ ! -f "$BENCH_CLI" ]; then + echo "ERROR: bench cli.ts not found at $BENCH_CLI" + exit 67 +fi +if ! command -v bun >/dev/null 2>&1; then + echo "ERROR: bun not on PATH (expected /opencode_setup/bun/bin/bun)" + exit 68 +fi + +# Make EVAL_OUTPUT_DIR absolute (relative to opencode dir). +case "$EVAL_OUTPUT_DIR" in + /*) ABS_OUTPUT_DIR="$EVAL_OUTPUT_DIR" ;; + *) ABS_OUTPUT_DIR="$OPENCODE_DIR/$EVAL_OUTPUT_DIR" ;; +esac +mkdir -p "$ABS_OUTPUT_DIR" + +# Echo the resolved config for log analysis. +echo "OPENCODE_DIR: $OPENCODE_DIR" +echo "BENCH_CLI: $BENCH_CLI" +echo "AGENT: $AGENT COMMIT: $COMMIT_HASH MAX_ITER: $MAX_ITER" +echo "DATASET: $DATASET SPLIT: $SPLIT SELECTED_ID: $SELECTED_ID" +echo "EVAL_OUTPUT_DIR: $ABS_OUTPUT_DIR" +echo "INSTANCE_DICT_PATH: $INSTANCE_DICT_PATH" +echo "CONFIG_FILE: $CONFIG_FILE" +echo "USER_PROMPT_PATH: $USER_PROMPT_PATH" +echo "SYSTEM_PROMPT_PATH: $SYSTEM_PROMPT_PATH" +echo "MODEL_SERVER: $NEMO_GYM_MODEL_SERVER_NAME @ $NEMO_GYM_MODEL_SERVER_BASE_URL" + +cmd=( + bun "$BENCH_CLI" + --instance-dict-path "$INSTANCE_DICT_PATH" + --output-dir "$ABS_OUTPUT_DIR" + --config "$CONFIG_FILE" + --max-turns "$MAX_ITER" + --agent-cls "$AGENT" + --dataset "$DATASET" + --split "$SPLIT" + --selected-id "$SELECTED_ID" +) +if [ -n "$USER_PROMPT_PATH" ]; then + cmd+=(--user-prompt "$USER_PROMPT_PATH") +fi +if [ -n "$SYSTEM_PROMPT_PATH" ]; then + cmd+=(--system-prompt "$SYSTEM_PROMPT_PATH") +fi + +echo "Executing: ${cmd[*]}" +exec "${cmd[@]}" diff --git a/packages/opencode/src/bench/cli.ts b/packages/opencode/src/bench/cli.ts new file mode 100644 index 000000000000..33dbb92d6e6b --- /dev/null +++ b/packages/opencode/src/bench/cli.ts @@ -0,0 +1,389 @@ +/** + * SWE-bench bench CLI driver. + * + * Drives a single SWE-bench instance to completion using opencode's REAL + * agentic loop. 
We spawn `bun .../src/index.ts run` as a subprocess (with a + * per-instance opencode config that registers our `nemo-gym` provider, a + * SWE-bench agent, and disables compaction) and let it run to idle. + * + * Why subprocess instead of in-process Server.Default? Subprocess is the + * model the user-facing `opencode run` already uses (cli/cmd/run.ts:670–675 + * also uses an in-process fetch but the public entry is `bun .../index.ts`). + * A subprocess gives us: + * - clean process isolation per instance (matters for many parallel SIFs) + * - identical bootstrapping path to `opencode run`, so we don't drift + * - the JSON event stream on stdout for free (--format json) + * + * Trajectory capture: the nemo-gym provider (registered via this config) + * writes `/.json` per LLM call BEFORE returning. On + * exit we capture `git diff` and write `output.jsonl`. + */ + +import { promises as fs, readFileSync } from "node:fs" +import path from "node:path" +import os from "node:os" +import { spawn } from "node:child_process" + +interface CliArgs { + instanceDictPath: string + outputDir: string + config: string + maxTurns: number + agentCls: string + dataset: string + split: string + selectedId: string + userPromptPath?: string + systemPromptPath?: string +} + +function parseArgs(argv: string[]): CliArgs { + const out: Partial = { maxTurns: 100, agentCls: "OpenCodeAgent", dataset: "", split: "test" } + for (let i = 0; i < argv.length; i++) { + const a = argv[i] + const next = () => argv[++i] + switch (a) { + case "--instance-dict-path": + out.instanceDictPath = next() + break + case "--output-dir": + out.outputDir = next() + break + case "--config": + out.config = next() + break + case "--max-turns": + out.maxTurns = parseInt(next(), 10) + break + case "--agent-cls": + out.agentCls = next() + break + case "--dataset": + out.dataset = next() + break + case "--split": + out.split = next() + break + case "--selected-id": + out.selectedId = next() + break + case "--user-prompt": 
+ out.userPromptPath = next() + break + case "--system-prompt": + out.systemPromptPath = next() + break + default: + if (a.startsWith("--")) throw new Error(`Unknown flag: ${a}`) + } + } + for (const required of ["instanceDictPath", "outputDir", "config", "selectedId"] as const) { + if (!out[required]) + throw new Error(`Missing required arg --${required.replace(/[A-Z]/g, (c) => "-" + c.toLowerCase())}`) + } + return out as CliArgs +} + +interface InstanceDict { + instance_id: string + problem_statement: string + repo?: string + repo_name?: string + workspace?: string + [key: string]: unknown +} + +async function readInstance(instanceDictPath: string, selectedId: string): Promise { + const text = await fs.readFile(instanceDictPath, "utf8") + const lines = text + .split("\n") + .map((l) => l.trim()) + .filter(Boolean) + const records = lines.map((l) => JSON.parse(l) as InstanceDict) + const match = records.find((r) => r.instance_id === selectedId) ?? records[0] + if (!match) throw new Error(`No instance found in ${instanceDictPath}`) + return match +} + +function detectWorkspaceRoot(instance: InstanceDict): string { + if (instance.workspace) return instance.workspace + // SWE-bench SIFs check the repo out at /testbed by convention. + return "/testbed" +} + +function loadGymConfig(configPath: string): Record { + return JSON.parse(readFileSync(configPath, "utf8")) +} + +const DEFAULT_SYSTEM_PROMPT = `You are an autonomous software engineer fixing a known issue in a checked-out git repository. + +Work in small, deliberate steps: +1. Read the issue and explore the relevant files. +2. Reproduce the issue if applicable. +3. Edit the source to fix the issue. +4. Run the project's tests to verify the fix. +5. Iterate until the issue is resolved. + +Use the available tools (bash, edit, read, glob, grep) to investigate and act. Do NOT modify the test files unless the task explicitly says so. 
The harness will capture the final \`git diff\` of the workspace as your patch — do not commit or format the diff yourself. +` + +async function buildConfigDir(args: { + instanceId: string + workspaceRoot: string + modelName: string + baseURL: string + completionsDir: string + maxTurns: number + problemStatement: string + systemPromptPath?: string + userPromptPath?: string +}): Promise<{ tmpRoot: string; configFile: string; userPrompt: string }> { + const tmpRoot = await fs.mkdtemp(path.join(os.tmpdir(), `bench-${args.instanceId}-`)) + await fs.mkdir(tmpRoot, { recursive: true }) + + const systemPrompt = args.systemPromptPath + ? await fs.readFile(args.systemPromptPath, "utf8") + : DEFAULT_SYSTEM_PROMPT + + const userPromptTemplate = args.userPromptPath + ? await fs.readFile(args.userPromptPath, "utf8") + : `\n{{problem_statement}}\n\n\nThe workspace is at ${args.workspaceRoot}. Investigate, fix, run tests, and stop when the issue is resolved.` + + const cfg: Record = { + $schema: "https://opencode.ai/config.json", + provider: { + "nemo-gym": { + npm: "@opencode-ai/nemo-gym", + options: { + baseURL: args.baseURL, + completionsDir: args.completionsDir, + instanceId: args.instanceId, + }, + models: { + [args.modelName]: { + id: args.modelName, + name: args.modelName, + limit: { context: 131072, output: 32768 }, + tool_call: true, + temperature: true, + }, + }, + }, + }, + agent: { + "swe-bench": { + mode: "primary", + model: `nemo-gym/${args.modelName}`, + prompt: systemPrompt, + // Allow the read+write tool set; disable web/skill/task to keep the + // agent focused on local code editing. 
+ permission: { + edit: { "**": "allow" }, + bash: { "*": "allow" }, + webfetch: { "*": "deny" }, + websearch: { "*": "deny" }, + }, + tools: { + bash: true, + edit: true, + read: true, + glob: true, + grep: true, + write: true, + apply_patch: true, + webfetch: false, + websearch: false, + task: false, + skill: false, + todowrite: false, + }, + steps: args.maxTurns, + options: {}, + }, + }, + compaction: { auto: false }, + share: "manual", + } + + const configFile = path.join(tmpRoot, "opencode.jsonc") + await fs.writeFile(configFile, JSON.stringify(cfg, null, 2)) + + return { + tmpRoot, + configFile, + userPrompt: userPromptTemplate.replace(/\{\{problem_statement\}\}/g, args.problemStatement), + } +} + +function runOpencode(args: { + workspaceRoot: string + modelName: string + message: string + env: NodeJS.ProcessEnv + opencodeBin: string + agent: string +}): Promise<{ exitCode: number; stdout: string; stderr: string }> { + return new Promise((resolve) => { + const child = spawn( + "bun", + [ + args.opencodeBin, + "run", + args.message, + "--agent", + args.agent, + "--model", + `nemo-gym/${args.modelName}`, + "--format", + "json", + "--dangerously-skip-permissions", + "--dir", + args.workspaceRoot, + ], + { + cwd: args.workspaceRoot, + env: args.env, + stdio: ["ignore", "pipe", "pipe"], + }, + ) + let stdout = "" + let stderr = "" + child.stdout?.on("data", (b) => { + const chunk = b.toString("utf8") + stdout += chunk + // Forward to our stdout so the gym log captures the event stream. + process.stdout.write(chunk) + }) + child.stderr?.on("data", (b) => { + const chunk = b.toString("utf8") + stderr += chunk + process.stderr.write(chunk) + }) + child.on("close", (code) => resolve({ exitCode: code ?? 
0, stdout, stderr })) + child.on("error", (err) => { + stderr += String(err) + resolve({ exitCode: 999, stdout, stderr }) + }) + }) +} + +async function captureGitDiff(workspaceRoot: string): Promise { + return new Promise((resolve) => { + const child = spawn("git", ["-C", workspaceRoot, "diff"], { + env: { ...process.env, GIT_PAGER: "cat" }, + }) + let stdout = "" + child.stdout?.on("data", (b) => (stdout += b.toString("utf8"))) + child.on("close", () => resolve(stdout)) + child.on("error", () => resolve("")) + }) +} + +interface OutputJsonl { + instance_id: string + test_result: { git_patch: string } + metadata: { llm_config: { model: string } } + metrics: Record + error: string | null +} + +async function writeOutputJsonl(evalOutputDir: string, instanceId: string, payload: OutputJsonl): Promise { + const runDir = path.join(evalOutputDir, instanceId, "bench_run") + await fs.mkdir(runDir, { recursive: true }) + const outPath = path.join(runDir, "output.jsonl") + const tmp = `${outPath}.tmp` + await fs.writeFile(tmp, JSON.stringify(payload) + "\n") + await fs.rename(tmp, outPath) + return outPath +} + +function completionsDirFor(evalOutputDir: string, instanceId: string): string { + // Match openhands' on-host glob: /*/*/*/llm_completions//*.json + return path.join(evalOutputDir, instanceId, "bench_run", "llm_completions", instanceId) +} + +function detectOpencodeBin(): string { + // bench/cli.ts runs from packages/opencode/src/bench/. The opencode index + // entry sits at packages/opencode/src/index.ts. From this script's url we + // resolve up two levels. 
+ const here = path.dirname(new URL(import.meta.url).pathname) + return path.resolve(here, "..", "index.ts") +} + +async function main() { + const args = parseArgs(process.argv.slice(2)) + const instance = await readInstance(args.instanceDictPath, args.selectedId) + const workspaceRoot = detectWorkspaceRoot(instance) + const gymConfig = loadGymConfig(args.config) + const llmModelCfg = ((gymConfig as Record>).llm?.model ?? {}) as Record< + string, + unknown + > + const modelName = String(llmModelCfg.model ?? "unknown-model") + const baseURL = process.env.NEMO_GYM_MODEL_SERVER_BASE_URL + if (!baseURL) throw new Error("NEMO_GYM_MODEL_SERVER_BASE_URL not set in env (gym harness sets this).") + + const completionsDir = completionsDirFor(args.outputDir, instance.instance_id) + await fs.mkdir(completionsDir, { recursive: true }) + + // We pre-render the user message so opencode's prompt machinery doesn't + // need to know about SWE-bench-specific templating. + const problemStatement = (instance.problem_statement ?? "").toString() + + const { tmpRoot, configFile, userPrompt } = await buildConfigDir({ + instanceId: instance.instance_id, + workspaceRoot, + modelName, + baseURL, + completionsDir, + maxTurns: args.maxTurns, + problemStatement, + systemPromptPath: args.systemPromptPath, + userPromptPath: args.userPromptPath, + }) + + const startedAt = Date.now() + const childEnv: NodeJS.ProcessEnv = { + ...process.env, + // Run-isolated opencode state. + OPENCODE_DB: ":memory:", + OPENCODE_DATA: path.join(tmpRoot, "data"), + OPENCODE_CONFIG: configFile, + // Disable opencode's built-in plugin loaders; the bench harness doesn't need them. 
+ OPENCODE_PURE: "1", + } + + const opencodeBin = detectOpencodeBin() + const result = await runOpencode({ + workspaceRoot, + modelName, + message: userPrompt, + env: childEnv, + opencodeBin, + agent: "swe-bench", + }) + + const patch = await captureGitDiff(workspaceRoot) + const benchRunTime = (Date.now() - startedAt) / 1000 + + const error: string | null = result.exitCode === 0 ? null : `opencode_exit_${result.exitCode}` + const outPath = await writeOutputJsonl(args.outputDir, instance.instance_id, { + instance_id: instance.instance_id, + test_result: { git_patch: patch }, + metadata: { llm_config: { model: modelName } }, + metrics: { + bench_run_time: benchRunTime, + opencode_exit_code: result.exitCode, + }, + error, + }) + + console.log(`[bench] wrote ${outPath} (patch=${patch.length} bytes, error=${error ?? "none"})`) + + if (result.exitCode !== 0) process.exit(1) +} + +main().catch((err) => { + console.error(`[bench] fatal: ${err?.stack ?? err}`) + process.exit(2) +}) diff --git a/packages/opencode/src/provider/provider.ts b/packages/opencode/src/provider/provider.ts index 4013dcee36e7..dbcb319ff284 100644 --- a/packages/opencode/src/provider/provider.ts +++ b/packages/opencode/src/provider/provider.ts @@ -114,6 +114,11 @@ const BUNDLED_PROVIDERS: Record Promise<(opts: any) => BundledSDK> "gitlab-ai-provider": () => import("gitlab-ai-provider").then((m) => m.createGitLab), "@ai-sdk/github-copilot": () => import("./sdk/copilot/copilot-provider").then((m) => m.createOpenaiCompatible), "venice-ai-sdk-provider": () => import("venice-ai-sdk-provider").then((m) => m.createVenice), + // NeMo-Gym custom provider used by the SWE-bench RL rollout harness. + // Routes chat completions through the gym's vllm model server while + // threading prompt/generation token IDs into providerMetadata. The + // bench cli (`bench/cli.ts`) configures this provider per-instance. 
+ "@opencode-ai/nemo-gym": () => import("./sdk/nemo-gym/index").then((m) => m.createNemoGym), } type CustomModelLoader = (sdk: any, modelID: string, options?: Record) => Promise diff --git a/packages/opencode/src/provider/sdk/nemo-gym/index.ts b/packages/opencode/src/provider/sdk/nemo-gym/index.ts new file mode 100644 index 000000000000..0d7001b8dc28 --- /dev/null +++ b/packages/opencode/src/provider/sdk/nemo-gym/index.ts @@ -0,0 +1,62 @@ +/** + * NeMo-Gym opencode provider entry. + * + * Provider id: `nemo-gym`. Used by the bench harness for SWE-bench RL rollouts. + * Registered in `provider/provider.ts:BUNDLED_PROVIDERS`. + * + * The factory mirrors `@ai-sdk/openai-compatible`'s shape: `createNemoGym(opts)` + * returns a provider with `.languageModel(modelId)` so opencode's existing + * provider plumbing (Provider.Service.getModel) works without special-casing. + */ + +import { NemoGymLanguageModel, type NemoGymLanguageModelConfig } from "./language-model" + +export interface CreateNemoGymOptions { + /** Base URL of the gym model server (`http://host:port`). */ + baseURL: string + /** Optional name of the model server (informational; useful for logs). */ + modelServerName?: string + /** Custom request headers. */ + headers?: () => Record + /** + * Where to dump per-call llm_completions/.json files. + * Set per-instance by the bench harness; if absent, no trajectory dump. + */ + completionsDir?: string + /** instance_id to embed in trajectory dump paths/file names. */ + instanceId?: string + /** Per-call HTTP timeout in ms. */ + requestTimeoutMs?: number + /** HTTP retry count on transient errors. */ + retries?: number + /** Optional turn counter shared across all model calls in a session. */ + turnCounter?: { next(): number } + /** Optional callback invoked after each successful chat-completion. 
*/ + onCompletion?: NemoGymLanguageModelConfig["onCompletion"] +} + +export interface NemoGymProvider { + languageModel: (modelId: string) => NemoGymLanguageModel +} + +export function createNemoGym(opts: CreateNemoGymOptions): NemoGymProvider { + if (!opts.baseURL) { + throw new Error("createNemoGym: baseURL is required (e.g. http://host:port)") + } + return { + languageModel(modelId: string) { + return new NemoGymLanguageModel(modelId, { + provider: "nemo-gym", + baseURL: opts.baseURL, + modelServerName: opts.modelServerName, + headers: opts.headers, + completionsDir: opts.completionsDir, + instanceId: opts.instanceId, + requestTimeoutMs: opts.requestTimeoutMs, + retries: opts.retries, + turnCounter: opts.turnCounter, + onCompletion: opts.onCompletion, + }) + }, + } +} diff --git a/packages/opencode/src/provider/sdk/nemo-gym/language-model.ts b/packages/opencode/src/provider/sdk/nemo-gym/language-model.ts new file mode 100644 index 000000000000..0ba88a5d1edb --- /dev/null +++ b/packages/opencode/src/provider/sdk/nemo-gym/language-model.ts @@ -0,0 +1,510 @@ +/** + * NeMo-Gym LanguageModelV3 implementation. + * + * The opencode `processor.ts` agentic loop is unmodified — this is the only + * piece that swaps. Internally we POST to NeMo Gym's `/v1/chat/completions` + * non-streaming, capture token IDs (`prompt_token_ids` / `generation_token_ids` + * / `generation_log_probs`) from the response, and emit a single-shot synthetic + * stream so opencode's streaming handler is happy. + * + * Why non-streaming? RL training requires contiguous, exact token IDs across + * turns. Streaming has them drip in across SSE chunks; non-streaming returns + * them in the final response cleanly. The opencode loop doesn't notice — it + * receives all stream parts at once. + * + * Trajectory dump: every doStream call writes + * `//.json` BEFORE the stream finishes, + * so a tool crash later cannot lose this turn's token IDs. 
The shape matches + * openhands' `llm_completions//*.json` exactly so gym's + * `get_openhands_trajectory_from_completions` reads it without changes. + */ + +import { + type LanguageModelV3, + type LanguageModelV3CallOptions, + type LanguageModelV3StreamPart, + type LanguageModelV3Content, + type SharedV3ProviderMetadata, + type SharedV3Warning, +} from "@ai-sdk/provider" +import { promises as fs } from "node:fs" +import path from "node:path" +import { convertToOpenAICompatibleChatMessages } from "../copilot/chat/convert-to-openai-compatible-chat-messages" +import { prepareTools } from "../copilot/chat/openai-compatible-prepare-tools" + +// --------------------------------------------------------------------------- +// Wire types +// --------------------------------------------------------------------------- + +interface ChatRequestMessage { + role: "system" | "user" | "assistant" | "tool" + content?: string | Array | null + tool_calls?: Array<{ + id: string + type: "function" + function: { name: string; arguments: string } + }> + tool_call_id?: string + name?: string + prompt_token_ids?: number[] + generation_token_ids?: number[] + generation_log_probs?: number[] + [key: string]: unknown +} + +interface ChatResponseChoice { + index?: number + finish_reason?: string | null + message: { + role: string + content?: string | null + reasoning_text?: string | null + tool_calls?: Array<{ + id?: string + type?: string + function: { name: string; arguments: string } + }> + prompt_token_ids?: number[] + generation_token_ids?: number[] + generation_log_probs?: number[] + [key: string]: unknown + } +} + +interface ChatResponseUsage { + prompt_tokens?: number | null + completion_tokens?: number | null + total_tokens?: number | null +} + +interface ChatResponse { + id?: string + model?: string + created?: number + choices: ChatResponseChoice[] + usage?: ChatResponseUsage +} + +// --------------------------------------------------------------------------- +// Config +// 
--------------------------------------------------------------------------- + +const TOKEN_ID_FIELDS = ["prompt_token_ids", "generation_token_ids", "generation_log_probs"] as const + +export interface NemoGymLanguageModelConfig { + /** Provider id used to namespace providerMetadata. Defaults to "nemo-gym". */ + provider: string + /** Full base URL of the model server (e.g. `http://gym-host:18086`). */ + baseURL: string + /** Optional gym head-server-style model server name; informational only. */ + modelServerName?: string + /** Custom request headers (auth, etc). */ + headers?: () => Record + /** Per-call HTTP timeout in ms. */ + requestTimeoutMs?: number + /** Number of HTTP retry attempts on transient errors. */ + retries?: number + /** + * Where per-call llm_completions JSONs land. The bench harness builds this + * path; it must match what gym's host-side glob expects. If unset, no + * trajectory dump happens (useful for dev/test). + */ + completionsDir?: string + /** instance_id for the dump file naming + path. Required when completionsDir set. */ + instanceId?: string + /** Optional sink that the bench harness uses to count turns globally. */ + turnCounter?: { next(): number } + /** Optional callback fired after each successful chat completion. 
*/ + onCompletion?: (info: { + turn: number + messages: ChatRequestMessage[] + response: ChatResponse + providerSpecificFields: Record + requestParams: Record + }) => void | Promise +} + +// --------------------------------------------------------------------------- +// Implementation +// --------------------------------------------------------------------------- + +export class NemoGymLanguageModel implements LanguageModelV3 { + readonly specificationVersion = "v3" + readonly modelId: string + readonly provider: string + + private readonly cfg: NemoGymLanguageModelConfig + private cookies: Record = {} + + constructor(modelId: string, cfg: NemoGymLanguageModelConfig) { + this.modelId = modelId + this.provider = cfg.provider + this.cfg = { + requestTimeoutMs: 600_000, + retries: 3, + ...cfg, + } + } + + get supportedUrls() { + return {} as Record + } + + // The streamText path in `session/llm.ts` only calls doStream. We still + // implement doGenerate for completeness / future direct-use. + async doGenerate(options: LanguageModelV3CallOptions) { + const { warnings, messages, requestParams } = await this._buildRequestParams(options) + const { responseJson } = await this._postChat(requestParams) + + const choice = responseJson.choices[0] + if (!choice) throw new Error("nemo-gym: empty choices in response") + const msg: ChatResponseChoice["message"] = choice.message ?? ({ role: "assistant" } as ChatResponseChoice["message"]) + + const content: LanguageModelV3Content[] = [] + if (msg.content) content.push({ type: "text", text: msg.content }) + if (msg.reasoning_text) content.push({ type: "reasoning", text: msg.reasoning_text }) + if (msg.tool_calls) { + for (const tc of msg.tool_calls) { + content.push({ + type: "tool-call", + toolCallId: tc.id ?? 
`call_${Math.random().toString(36).slice(2, 10)}`, + toolName: tc.function.name, + input: tc.function.arguments, + }) + } + } + + const providerSpecificFields = this._extractProviderFields(msg) + const providerMetadata = this._buildProviderMetadata(providerSpecificFields) + + await this._dumpAndNotify({ + messages, + response: responseJson, + providerSpecificFields, + requestParams, + }) + + return { + content, + finishReason: this._mapFinishReason(choice.finish_reason ?? null), + usage: this._mapUsage(responseJson.usage), + providerMetadata, + request: { body: JSON.stringify(requestParams) }, + response: { body: responseJson }, + warnings, + } + } + + async doStream(options: LanguageModelV3CallOptions) { + const { warnings, messages, requestParams } = await this._buildRequestParams(options) + + // Fire the HTTP call eagerly so any error surfaces synchronously when the + // stream is consumed. We then synthesize parts in `start`. + const self = this + + const stream = new ReadableStream({ + async start(controller) { + controller.enqueue({ type: "stream-start", warnings }) + + try { + const { responseJson } = await self._postChat(requestParams) + + const choice = responseJson.choices[0] + if (!choice) throw new Error("nemo-gym: empty choices in response") + const msg: ChatResponseChoice["message"] = + choice.message ?? ({ role: "assistant" } as ChatResponseChoice["message"]) + + const providerSpecificFields = self._extractProviderFields(msg) + const providerMetadata = self._buildProviderMetadata(providerSpecificFields) + + // Emit response-metadata first. + controller.enqueue({ + type: "response-metadata", + id: responseJson.id, + modelId: responseJson.model, + timestamp: responseJson.created ? new Date(responseJson.created * 1000) : undefined, + }) + + // Reasoning content. 
+ if (msg.reasoning_text) { + controller.enqueue({ type: "reasoning-start", id: "reasoning-0" }) + controller.enqueue({ type: "reasoning-delta", id: "reasoning-0", delta: msg.reasoning_text }) + controller.enqueue({ type: "reasoning-end", id: "reasoning-0" }) + } + + // Text content. + if (msg.content) { + controller.enqueue({ type: "text-start", id: "txt-0" }) + controller.enqueue({ type: "text-delta", id: "txt-0", delta: msg.content }) + controller.enqueue({ type: "text-end", id: "txt-0" }) + } + + // Tool calls. + if (msg.tool_calls) { + for (const tc of msg.tool_calls) { + const tcId = tc.id ?? `call_${Math.random().toString(36).slice(2, 10)}` + controller.enqueue({ + type: "tool-input-start", + id: tcId, + toolName: tc.function.name, + }) + controller.enqueue({ + type: "tool-input-delta", + id: tcId, + delta: tc.function.arguments, + }) + controller.enqueue({ type: "tool-input-end", id: tcId }) + controller.enqueue({ + type: "tool-call", + toolCallId: tcId, + toolName: tc.function.name, + input: tc.function.arguments, + }) + } + } + + // Persist trajectory BEFORE finishing so a downstream tool crash + // cannot lose this turn's token IDs. + await self._dumpAndNotify({ + messages, + response: responseJson, + providerSpecificFields, + requestParams, + }) + + controller.enqueue({ + type: "finish", + finishReason: self._mapFinishReason(choice.finish_reason ?? null), + usage: self._mapUsage(responseJson.usage), + providerMetadata, + }) + + controller.close() + } catch (err) { + controller.enqueue({ type: "error", error: err instanceof Error ? 
err.message : String(err) }) + controller.enqueue({ + type: "finish", + finishReason: { unified: "error", raw: undefined }, + usage: self._mapUsage(undefined), + providerMetadata: {}, + }) + controller.close() + } + }, + }) + + return { + stream, + request: { body: JSON.stringify(requestParams) }, + response: {}, + } + } + + // ----------------------------------------------------------------------- + // Helpers + // ----------------------------------------------------------------------- + + private async _buildRequestParams(options: LanguageModelV3CallOptions): Promise<{ + warnings: SharedV3Warning[] + messages: ChatRequestMessage[] + tools: unknown + toolChoice: unknown + requestParams: Record + }> { + const warnings: SharedV3Warning[] = [] + // Reuse opencode's existing OpenAI-compatible message converter so all + // tool-call / multi-content shapes map identically to the rest of opencode. + const messages = convertToOpenAICompatibleChatMessages(options.prompt) as unknown as ChatRequestMessage[] + + const { tools, toolChoice, toolWarnings } = prepareTools({ + tools: options.tools, + toolChoice: options.toolChoice, + }) + warnings.push(...toolWarnings) + + // Strip token-ID fields from all assistant messages EXCEPT the most recent. + // Mirrors nemo_gym_client.py:85-97. Wire-payload dedup; the most recent + // message keeps its IDs so the server can verify continuity. 
+ { + let lastSeen = false + for (let i = messages.length - 1; i >= 0; i--) { + const m = messages[i] as Record + const hasAll = TOKEN_ID_FIELDS.every((f) => f in m) + if (lastSeen) { + for (const f of TOKEN_ID_FIELDS) delete m[f] + } else if (hasAll) { + lastSeen = true + } + } + } + + const requestParams: Record = { + model: this.modelId, + messages, + max_tokens: options.maxOutputTokens, + temperature: options.temperature, + top_p: options.topP, + stop: options.stopSequences, + seed: options.seed, + } + if (tools && (tools as unknown[]).length) requestParams.tools = tools + if (toolChoice) requestParams.tool_choice = toolChoice + + // Strip undefineds — vllm errors on null/undefined keys. + for (const k of Object.keys(requestParams)) { + if (requestParams[k] === undefined) delete requestParams[k] + } + + return { warnings, messages, tools, toolChoice, requestParams } + } + + private async _postChat(params: Record): Promise<{ responseJson: ChatResponse }> { + const url = this._urlFor("/v1/chat/completions") + const headers: Record = { + "Content-Type": "application/json", + Accept: "application/json", + } + const cfgHeaders = this.cfg.headers?.() + if (cfgHeaders) { + for (const [k, v] of Object.entries(cfgHeaders)) if (v != null) headers[k] = v + } + if (Object.keys(this.cookies).length) { + headers.Cookie = Object.entries(this.cookies) + .map(([k, v]) => `${k}=${v}`) + .join("; ") + } + + const retries = this.cfg.retries ?? 
3 + let lastErr: unknown = null + for (let attempt = 0; attempt < retries; attempt++) { + const ac = new AbortController() + const timer = setTimeout(() => ac.abort(), this.cfg.requestTimeoutMs) + try { + const res = await fetch(url, { + method: "POST", + headers, + body: JSON.stringify(params), + signal: ac.signal, + }) + clearTimeout(timer) + if (!res.ok) { + const text = await res.text().catch(() => "") + throw new Error(`NeMoGym ${url} ${res.status}: ${text.slice(0, 500)}`) + } + const setCookie = res.headers.get("set-cookie") + if (setCookie) { + for (const part of setCookie.split(/,(?=[^;]+=)/)) { + const [kv] = part.split(";") + const [k, v] = kv.split("=") + if (k && v) this.cookies[k.trim()] = v.trim() + } + } + const responseJson = (await res.json()) as ChatResponse + return { responseJson } + } catch (err) { + clearTimeout(timer) + lastErr = err + if (attempt === retries - 1) break + await new Promise((r) => setTimeout(r, 1000 * 2 ** attempt)) + } + } + throw new Error(`NeMoGym chat completions failed after ${retries} attempts: ${String(lastErr)}`) + } + + private _urlFor(p: string): string { + const base = this.cfg.baseURL.endsWith("/") ? 
this.cfg.baseURL : `${this.cfg.baseURL}/` + return new URL(p.replace(/^\//, ""), base).toString() + } + + private _extractProviderFields(msg: ChatResponseChoice["message"]): Record { + const out: Record = {} + if (Array.isArray(msg.prompt_token_ids)) { + for (const f of TOKEN_ID_FIELDS) { + const v = (msg as Record)[f] + if (v !== undefined) out[f] = v + } + } + return out + } + + private _buildProviderMetadata(providerSpecific: Record): SharedV3ProviderMetadata { + const md: SharedV3ProviderMetadata = { [this.provider]: {} } + for (const [k, v] of Object.entries(providerSpecific)) { + ;(md[this.provider] as Record)[k] = v as never + } + return md + } + + private _mapFinishReason(raw: string | null): { unified: "stop" | "length" | "tool-calls" | "error" | "other"; raw: string | undefined } { + if (!raw) return { unified: "other", raw: undefined } + switch (raw) { + case "stop": + return { unified: "stop", raw } + case "length": + return { unified: "length", raw } + case "tool_calls": + case "function_call": + return { unified: "tool-calls", raw } + default: + return { unified: "other", raw } + } + } + + private _mapUsage(raw?: ChatResponseUsage) { + return { + inputTokens: { + total: raw?.prompt_tokens ?? undefined, + noCache: raw?.prompt_tokens ?? undefined, + cacheRead: undefined, + cacheWrite: undefined, + }, + outputTokens: { + total: raw?.completion_tokens ?? undefined, + text: raw?.completion_tokens ?? undefined, + reasoning: undefined, + }, + } + } + + private async _dumpAndNotify(args: { + messages: ChatRequestMessage[] + response: ChatResponse + providerSpecificFields: Record + requestParams: Record + }) { + const turn = this.cfg.turnCounter ? 
this.cfg.turnCounter.next() : Date.now() + if (this.cfg.onCompletion) { + try { + await this.cfg.onCompletion({ turn, ...args }) + } catch (err) { + console.warn(`[nemo-gym] onCompletion hook threw: ${String(err)}`) + } + } + + if (!this.cfg.completionsDir || !this.cfg.instanceId) return + + try { + await fs.mkdir(this.cfg.completionsDir, { recursive: true }) + const turnStr = String(turn).padStart(4, "0") + const safeModel = this.modelId.replace(/\//g, "__") + const fname = `${safeModel}-${turnStr}-${Date.now()}.json` + const fpath = path.join(this.cfg.completionsDir, fname) + const kwargs: Record = {} + for (const [k, v] of Object.entries(args.requestParams)) { + if (k !== "messages") kwargs[k] = v + } + const payload = { + messages: args.messages, + response: args.response, + provider_specific_fields: args.providerSpecificFields, + kwargs, + timestamp: Date.now() / 1000, + } + const tmp = `${fpath}.tmp` + await fs.writeFile(tmp, JSON.stringify(payload)) + await fs.rename(tmp, fpath) + } catch (err) { + console.warn(`[nemo-gym] failed to dump completion: ${String(err)}`) + } + } +} From 1a1bf7a5f09b18f22eb70689cf7098c5327e78fc Mon Sep 17 00:00:00 2001 From: Sugam Devare Date: Wed, 6 May 2026 19:13:16 -0700 Subject: [PATCH 02/21] bench: add nemo-gym provider + run_infer entry --- .../benchmarks/swe_bench/scripts/run_infer.sh | 37 +++++++----- packages/opencode/src/bench/cli.ts | 56 +++++++++---------- 2 files changed, 48 insertions(+), 45 deletions(-) diff --git a/evaluation/benchmarks/swe_bench/scripts/run_infer.sh b/evaluation/benchmarks/swe_bench/scripts/run_infer.sh index 25bd5b001e09..8392a21b8b0e 100755 --- a/evaluation/benchmarks/swe_bench/scripts/run_infer.sh +++ b/evaluation/benchmarks/swe_bench/scripts/run_infer.sh @@ -5,14 +5,15 @@ # $1 COMMIT_HASH opencode commit (informational; checkout is done at setup) # $2 AGENT agent class name (informational) # $3 MAX_ITER max agent turns -# $4 DATASET dataset name (informational; dispatch already done by gym) +# 
$4 DATASET dataset name (informational; gym already dispatched) # $5 SPLIT dataset split (informational) # $6 EVAL_OUTPUT_DIR where to write trajectories (relative to opencode dir) # $7 SELECTED_ID instance_id to run # $8 INSTANCE_DICT_PATH /root/dataset/data.jsonl (single-line JSONL) # $9 CONFIG_FILE opencode model config JSON (written by gym) -# $10 USER_PROMPT_PATH optional -# $11 SYSTEM_PROMPT_PATH optional +# $10 WORKSPACE_ROOT resolved repo path inside the SIF (gym side decided) +# $11 USER_MESSAGE_PATH pre-rendered user prompt file (workspace baked in) +# $12 SYSTEM_PROMPT_PATH optional system-prompt override # # Environment (set by gym): # NEMO_GYM_MODEL_SERVER_NAME proxy name on the gym head server @@ -34,20 +35,29 @@ EVAL_OUTPUT_DIR="${6:-evaluation/oh}" SELECTED_ID="${7:-}" INSTANCE_DICT_PATH="${8:-/root/dataset/data.jsonl}" CONFIG_FILE="${9:-/tmp/oc_config.json}" -USER_PROMPT_PATH="${10:-}" -SYSTEM_PROMPT_PATH="${11:-}" +WORKSPACE_ROOT="${10:-}" +USER_MESSAGE_PATH="${11:-}" +SYSTEM_PROMPT_PATH="${12:-}" if [ -z "$SELECTED_ID" ]; then echo "ERROR: SELECTED_ID (\$7) is required." exit 64 fi +if [ -z "$WORKSPACE_ROOT" ]; then + echo "ERROR: WORKSPACE_ROOT (\$10) is required — gym side resolves the dataset-aware repo path." + exit 65 +fi +if [ -z "$USER_MESSAGE_PATH" ]; then + echo "ERROR: USER_MESSAGE_PATH (\$11) is required — gym side renders the user prompt." + exit 66 +fi if [ -z "${NEMO_GYM_MODEL_SERVER_NAME:-}" ]; then echo "ERROR: NEMO_GYM_MODEL_SERVER_NAME not set in env." - exit 65 + exit 67 fi if [ -z "${NEMO_GYM_MODEL_SERVER_BASE_URL:-}" ]; then echo "ERROR: NEMO_GYM_MODEL_SERVER_BASE_URL not set in env." - exit 66 + exit 68 fi # Resolve the opencode root directory. The script lives at @@ -58,11 +68,11 @@ BENCH_CLI="$OPENCODE_DIR/packages/opencode/src/bench/cli.ts" if [ ! -f "$BENCH_CLI" ]; then echo "ERROR: bench cli.ts not found at $BENCH_CLI" - exit 67 + exit 69 fi if ! 
command -v bun >/dev/null 2>&1; then echo "ERROR: bun not on PATH (expected /opencode_setup/bun/bin/bun)" - exit 68 + exit 70 fi # Make EVAL_OUTPUT_DIR absolute (relative to opencode dir). @@ -72,7 +82,6 @@ case "$EVAL_OUTPUT_DIR" in esac mkdir -p "$ABS_OUTPUT_DIR" -# Echo the resolved config for log analysis. echo "OPENCODE_DIR: $OPENCODE_DIR" echo "BENCH_CLI: $BENCH_CLI" echo "AGENT: $AGENT COMMIT: $COMMIT_HASH MAX_ITER: $MAX_ITER" @@ -80,7 +89,8 @@ echo "DATASET: $DATASET SPLIT: $SPLIT SELECTED_ID: $SELECTED_ID" echo "EVAL_OUTPUT_DIR: $ABS_OUTPUT_DIR" echo "INSTANCE_DICT_PATH: $INSTANCE_DICT_PATH" echo "CONFIG_FILE: $CONFIG_FILE" -echo "USER_PROMPT_PATH: $USER_PROMPT_PATH" +echo "WORKSPACE_ROOT: $WORKSPACE_ROOT" +echo "USER_MESSAGE_PATH: $USER_MESSAGE_PATH" echo "SYSTEM_PROMPT_PATH: $SYSTEM_PROMPT_PATH" echo "MODEL_SERVER: $NEMO_GYM_MODEL_SERVER_NAME @ $NEMO_GYM_MODEL_SERVER_BASE_URL" @@ -94,10 +104,9 @@ cmd=( --dataset "$DATASET" --split "$SPLIT" --selected-id "$SELECTED_ID" + --workspace-root "$WORKSPACE_ROOT" + --user-message-file "$USER_MESSAGE_PATH" ) -if [ -n "$USER_PROMPT_PATH" ]; then - cmd+=(--user-prompt "$USER_PROMPT_PATH") -fi if [ -n "$SYSTEM_PROMPT_PATH" ]; then cmd+=(--system-prompt "$SYSTEM_PROMPT_PATH") fi diff --git a/packages/opencode/src/bench/cli.ts b/packages/opencode/src/bench/cli.ts index 33dbb92d6e6b..1961a6acaf60 100644 --- a/packages/opencode/src/bench/cli.ts +++ b/packages/opencode/src/bench/cli.ts @@ -33,7 +33,10 @@ interface CliArgs { dataset: string split: string selectedId: string - userPromptPath?: string + /** Resolved repo path inside the SIF — gym side decided based on dataset_name. */ + workspaceRoot: string + /** Pre-rendered user message file (workspace_path baked in by gym). 
*/ + userMessageFile: string systemPromptPath?: string } @@ -67,8 +70,11 @@ function parseArgs(argv: string[]): CliArgs { case "--selected-id": out.selectedId = next() break - case "--user-prompt": - out.userPromptPath = next() + case "--workspace-root": + out.workspaceRoot = next() + break + case "--user-message-file": + out.userMessageFile = next() break case "--system-prompt": out.systemPromptPath = next() @@ -77,7 +83,14 @@ function parseArgs(argv: string[]): CliArgs { if (a.startsWith("--")) throw new Error(`Unknown flag: ${a}`) } } - for (const required of ["instanceDictPath", "outputDir", "config", "selectedId"] as const) { + for (const required of [ + "instanceDictPath", + "outputDir", + "config", + "selectedId", + "workspaceRoot", + "userMessageFile", + ] as const) { if (!out[required]) throw new Error(`Missing required arg --${required.replace(/[A-Z]/g, (c) => "-" + c.toLowerCase())}`) } @@ -105,12 +118,6 @@ async function readInstance(instanceDictPath: string, selectedId: string): Promi return match } -function detectWorkspaceRoot(instance: InstanceDict): string { - if (instance.workspace) return instance.workspace - // SWE-bench SIFs check the repo out at /testbed by convention. - return "/testbed" -} - function loadGymConfig(configPath: string): Record { return JSON.parse(readFileSync(configPath, "utf8")) } @@ -129,15 +136,12 @@ Use the available tools (bash, edit, read, glob, grep) to investigate and act. 
D async function buildConfigDir(args: { instanceId: string - workspaceRoot: string modelName: string baseURL: string completionsDir: string maxTurns: number - problemStatement: string systemPromptPath?: string - userPromptPath?: string -}): Promise<{ tmpRoot: string; configFile: string; userPrompt: string }> { +}): Promise<{ tmpRoot: string; configFile: string }> { const tmpRoot = await fs.mkdtemp(path.join(os.tmpdir(), `bench-${args.instanceId}-`)) await fs.mkdir(tmpRoot, { recursive: true }) @@ -145,10 +149,6 @@ async function buildConfigDir(args: { ? await fs.readFile(args.systemPromptPath, "utf8") : DEFAULT_SYSTEM_PROMPT - const userPromptTemplate = args.userPromptPath - ? await fs.readFile(args.userPromptPath, "utf8") - : `\n{{problem_statement}}\n\n\nThe workspace is at ${args.workspaceRoot}. Investigate, fix, run tests, and stop when the issue is resolved.` - const cfg: Record = { $schema: "https://opencode.ai/config.json", provider: { @@ -208,11 +208,7 @@ async function buildConfigDir(args: { const configFile = path.join(tmpRoot, "opencode.jsonc") await fs.writeFile(configFile, JSON.stringify(cfg, null, 2)) - return { - tmpRoot, - configFile, - userPrompt: userPromptTemplate.replace(/\{\{problem_statement\}\}/g, args.problemStatement), - } + return { tmpRoot, configFile } } function runOpencode(args: { @@ -313,7 +309,8 @@ function detectOpencodeBin(): string { async function main() { const args = parseArgs(process.argv.slice(2)) const instance = await readInstance(args.instanceDictPath, args.selectedId) - const workspaceRoot = detectWorkspaceRoot(instance) + // workspaceRoot is decided gym-side based on dataset_name; we use it verbatim. + const workspaceRoot = args.workspaceRoot const gymConfig = loadGymConfig(args.config) const llmModelCfg = ((gymConfig as Record>).llm?.model ?? 
{}) as Record< string, @@ -326,20 +323,17 @@ async function main() { const completionsDir = completionsDirFor(args.outputDir, instance.instance_id) await fs.mkdir(completionsDir, { recursive: true }) - // We pre-render the user message so opencode's prompt machinery doesn't - // need to know about SWE-bench-specific templating. - const problemStatement = (instance.problem_statement ?? "").toString() + // The user message is fully rendered by gym (workspace_path baked in based + // on dataset_name); we just read it as-is and pass it to opencode. + const userPrompt = await fs.readFile(args.userMessageFile, "utf8") - const { tmpRoot, configFile, userPrompt } = await buildConfigDir({ + const { tmpRoot, configFile } = await buildConfigDir({ instanceId: instance.instance_id, - workspaceRoot, modelName, baseURL, completionsDir, maxTurns: args.maxTurns, - problemStatement, systemPromptPath: args.systemPromptPath, - userPromptPath: args.userPromptPath, }) const startedAt = Date.now() From c55a4bab7d0bcd134d08298fd58bc7a16b79c0d4 Mon Sep 17 00:00:00 2001 From: Sugam Devare Date: Wed, 6 May 2026 19:23:51 -0700 Subject: [PATCH 03/21] bench: drop eager TUI command imports from CLI entry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The opencode CLI registers TUI subcommands (Attach, TuiThread) via eager imports in index.ts. Loading them transitively imports cli/cmd/tui/app.tsx, whose JSX is meant to compile against @opentui/solid (per the package tsconfig's jsxImportSource) but is incorrectly resolved against react/jsx-dev-runtime when Bun runs the un-bundled .ts file at runtime — react isn't a dep of packages/opencode, so the bench harness crashes at startup before the run command ever executes. The bench harness (packages/opencode/src/bench/cli.ts) only ever spawns \`bun src/index.ts run\` as a subprocess; it never invokes the TUI. 
Remove the imports + command registrations entirely so the cli/cmd/tui/ subtree is unreachable from the bench code path. Co-Authored-By: Claude Opus 4.7 (1M context) --- packages/opencode/src/index.ts | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/packages/opencode/src/index.ts b/packages/opencode/src/index.ts index 4c8e447041c0..444e9730d29c 100644 --- a/packages/opencode/src/index.ts +++ b/packages/opencode/src/index.ts @@ -22,8 +22,13 @@ import { McpCommand } from "./cli/cmd/mcp" import { GithubCommand } from "./cli/cmd/github" import { ExportCommand } from "./cli/cmd/export" import { ImportCommand } from "./cli/cmd/import" -import { AttachCommand } from "./cli/cmd/tui/attach" -import { TuiThreadCommand } from "./cli/cmd/tui/thread" +// TUI subcommands (Attach, TuiThread) are dropped from this build of opencode. +// The bench harness (`packages/opencode/src/bench/cli.ts`) only invokes the +// `run` command via subprocess, never the TUI; loading them eagerly here drags +// in `cli/cmd/tui/app.tsx` at startup, which JSX-compiles against +// `@opentui/solid` and trips Bun's runtime JSX resolver into looking for +// `react/jsx-dev-runtime` (a bug we hit when running the un-bundled .ts). +// Removed entirely rather than lazy-loaded — bench has no use for them. 
import { AcpCommand } from "./cli/cmd/acp" import { EOL } from "os" import { WebCommand } from "./cli/cmd/web" @@ -156,8 +161,6 @@ const cli = yargs(args) .completion("completion", "generate shell completion script") .command(AcpCommand) .command(McpCommand) - .command(TuiThreadCommand) - .command(AttachCommand) .command(RunCommand) .command(GenerateCommand) .command(DebugCommand) From ab84347402bb65b85452642083935749a8a8928a Mon Sep 17 00:00:00 2001 From: Sugam Devare Date: Wed, 6 May 2026 19:33:49 -0700 Subject: [PATCH 04/21] bench: prefer pre-bundled opencode.js over running src/index.ts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Running opencode's CLI un-bundled (`bun src/index.ts run`) triggers cascading runtime-resolution failures: tsconfig's `jsxImportSource` isn't honored for inline `.tsx` JIT compilation (we already removed the TUI commands to dodge that), and bun's isolated install layout under `node_modules/.bun/@/...` breaks `..`-relative `.mjs` imports inside packages like @anthropic-ai/sdk (the `internal/to-file.mjs` import from `core/uploads.mjs` fails to traverse the symlink the way Node does at runtime). opencode is meant to be shipped as a pre-bundled single file (their own `bin/opencode` is built the same way) — `bun build` resolves every transitive import statically and inlines the result, so runtime never has to. The companion gym `setup_scripts/opencode.sh` now invokes `bun build packages/opencode/src/index.ts --outdir .bench-build --entry-naming opencode.js`. This commit teaches `bench/cli.ts` to prefer that bundle when it exists, and falls back to `src/index.ts` for dev / when the build step hasn't run. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- packages/opencode/src/bench/cli.ts | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/packages/opencode/src/bench/cli.ts b/packages/opencode/src/bench/cli.ts index 1961a6acaf60..4d2901ebe9e9 100644 --- a/packages/opencode/src/bench/cli.ts +++ b/packages/opencode/src/bench/cli.ts @@ -19,7 +19,7 @@ * exit we capture `git diff` and write `output.jsonl`. */ -import { promises as fs, readFileSync } from "node:fs" +import { existsSync, promises as fs, readFileSync } from "node:fs" import path from "node:path" import os from "node:os" import { spawn } from "node:child_process" @@ -299,10 +299,18 @@ function completionsDirFor(evalOutputDir: string, instanceId: string): string { } function detectOpencodeBin(): string { - // bench/cli.ts runs from packages/opencode/src/bench/. The opencode index - // entry sits at packages/opencode/src/index.ts. From this script's url we - // resolve up two levels. + // Prefer the pre-bundled artifact at /.bench-build/opencode.js. + // Running un-bundled `src/index.ts` triggers cascading runtime resolution + // failures (TUI JSX runtime not honored, @anthropic-ai/sdk relative .mjs + // paths failing across the isolated install layout). The bundle inlines + // every transitive dep and is opencode's intended deployment shape. + // Falls back to src/index.ts only for dev / when setup_scripts/opencode.sh + // hasn't run. 
const here = path.dirname(new URL(import.meta.url).pathname) + // bench/cli.ts → packages/opencode/src/bench → packages/opencode/src → packages/opencode → packages → + const opencodeRoot = path.resolve(here, "..", "..", "..", "..") + const bundled = path.resolve(opencodeRoot, ".bench-build", "opencode.js") + if (existsSync(bundled)) return bundled return path.resolve(here, "..", "index.ts") } From 85e8e32179a28454744166f254ae61f6cd3196ec Mon Sep 17 00:00:00 2001 From: Sugam Devare Date: Thu, 7 May 2026 17:41:30 -0700 Subject: [PATCH 05/21] bench: commit empty models-snapshot stubs (force-added) `provider/models.ts:137` does `import("./models-snapshot.js")` for a static cache of models.dev metadata. Upstream generates this file at build time via `script/generate.ts` (network fetch from models.dev), and `.gitignore` correctly excludes it as a build artifact. For the bench harness we register a single custom provider (`@opencode-ai/nemo-gym`) in the per-instance opencode config and never consult the snapshot. But `bun build --target=bun packages/opencode/src/index.ts ...` still needs the import target to exist at static-analysis time, otherwise it errors with: error: Could not resolve: "./models-snapshot.js" Force-add empty stubs so the bundle build succeeds without running `script/generate.ts` (which needs network + adds 5+ seconds). The runtime `try:` lambda in models.ts handles an empty snapshot fine. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../opencode/src/provider/models-snapshot.d.ts | 3 +++ .../opencode/src/provider/models-snapshot.js | 18 ++++++++++++++++++ 2 files changed, 21 insertions(+) create mode 100644 packages/opencode/src/provider/models-snapshot.d.ts create mode 100644 packages/opencode/src/provider/models-snapshot.js diff --git a/packages/opencode/src/provider/models-snapshot.d.ts b/packages/opencode/src/provider/models-snapshot.d.ts new file mode 100644 index 000000000000..508ab6ee22fe --- /dev/null +++ b/packages/opencode/src/provider/models-snapshot.d.ts @@ -0,0 +1,3 @@ +// Empty stub committed for the bench harness build path. See models-snapshot.js +// for the rationale. +export declare const snapshot: Record diff --git a/packages/opencode/src/provider/models-snapshot.js b/packages/opencode/src/provider/models-snapshot.js new file mode 100644 index 000000000000..c48d54a8f72a --- /dev/null +++ b/packages/opencode/src/provider/models-snapshot.js @@ -0,0 +1,18 @@ +// @ts-nocheck +// Empty stub committed for the bench harness build path. +// +// Upstream opencode generates this file at build time via `script/generate.ts` +// (which fetches https://models.dev/api.json). For the nemo-gym bench harness +// we only register a single custom provider in the per-instance opencode +// config, so the snapshot is unused — but `bun build` still has to resolve +// `import("./models-snapshot.js")` from `provider/models.ts:137` at static +// analysis time. An empty snapshot satisfies that requirement; the runtime +// `try:` lambda in models.ts handles an empty snapshot gracefully. +// +// `.gitignore` excludes this file because upstream regenerates it. We +// force-add it on the bench branch (sdd/dev) so `bun build --target=bun +// packages/opencode/src/index.ts ...` succeeds without running generate.ts +// (which requires network access to models.dev). 
If you ever DO want real +// model metadata, run `bun run script/generate.ts` and don't commit the +// regenerated file. +export const snapshot = {} From 7c9b883e5376b8aa6bbd7fd07911efc3bebbb2bb Mon Sep 17 00:00:00 2001 From: Sugam Devare Date: Sun, 10 May 2026 15:51:55 -0700 Subject: [PATCH 06/21] bench: drop webfetch/websearch permission entries from per-instance config MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `PermissionActionConfig` accepts a glob-keyed map only for file/shell-style permissions (edit, bash). webfetch / websearch use a different shape (a single literal action), so emitting `{"*": "deny"}` for them trips the config validator at startup: Error: Configuration is invalid at .../opencode.jsonc ↳ Expected PermissionActionConfig | undefined, got {"*":"deny"} agent.swe-bench.permission.webfetch ↳ Expected PermissionActionConfig | undefined, got {"*":"deny"} agent.swe-bench.permission.websearch Both tools are already disabled via the agent's `tools:` block (webfetch: false, websearch: false), so the permission entries are redundant. Removing them lets the per-instance config load cleanly and the run command actually start. Co-Authored-By: Claude Opus 4.7 (1M context) --- packages/opencode/src/bench/cli.ts | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/packages/opencode/src/bench/cli.ts b/packages/opencode/src/bench/cli.ts index 4d2901ebe9e9..e3e5b0f23f94 100644 --- a/packages/opencode/src/bench/cli.ts +++ b/packages/opencode/src/bench/cli.ts @@ -178,10 +178,12 @@ async function buildConfigDir(args: { // Allow the read+write tool set; disable web/skill/task to keep the // agent focused on local code editing. permission: { + // Glob-keyed `PermissionActionConfig` for file/shell access. 
edit: { "**": "allow" }, bash: { "*": "allow" }, - webfetch: { "*": "deny" }, - websearch: { "*": "deny" }, + // webfetch / websearch use a different schema (single action, not + // a glob map) and we already disable them in `tools` below — no + // need for an explicit entry here. }, tools: { bash: true, From f59e8c6e7f81a6261219c72a71fb4a0a12815ea6 Mon Sep 17 00:00:00 2001 From: Sugam Devare Date: Sun, 10 May 2026 15:59:12 -0700 Subject: [PATCH 07/21] bench: accept opts.headers as either function or plain object opencode's bundled-provider loader invokes the factory with \`{ name, ...options }\` where `options` is the merged user + custom provider defaults from `provider.ts`. For some providers opencode injects an empty/static `headers: {}` into that merged options dict (distinct from upstream openai-compatible's schema where `headers` is a function `() => Record`). We were unconditionally invoking `this.cfg.headers?.()`. The optional chain only short-circuits on null/undefined, so when opencode passed a plain object it threw at session-start: Error: this.cfg.headers is not a function. (In 'this.cfg.headers?.()', 'this.cfg.headers' is an instance of Object) Branch on `typeof headers` to support both shapes: call it if it's a function, spread it directly if it's a plain object. Either way the resulting kv pairs are merged into the per-request HTTP headers. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../src/provider/sdk/nemo-gym/language-model.ts | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/packages/opencode/src/provider/sdk/nemo-gym/language-model.ts b/packages/opencode/src/provider/sdk/nemo-gym/language-model.ts index 0ba88a5d1edb..af81208bf6b2 100644 --- a/packages/opencode/src/provider/sdk/nemo-gym/language-model.ts +++ b/packages/opencode/src/provider/sdk/nemo-gym/language-model.ts @@ -364,7 +364,17 @@ export class NemoGymLanguageModel implements LanguageModelV3 { "Content-Type": "application/json", Accept: "application/json", } - const cfgHeaders = this.cfg.headers?.() + // opencode's bundled-provider loader can pass `headers` as either a + // function (matching upstream openai-compatible's schema) OR a plain + // object (when opencode injects defaults from its provider merge layer). + // Handle both — `?.()` would throw on a non-callable object. + let cfgHeaders: Record | undefined + const rawHeaders = this.cfg.headers as unknown + if (typeof rawHeaders === "function") { + cfgHeaders = (rawHeaders as () => Record)() + } else if (rawHeaders && typeof rawHeaders === "object") { + cfgHeaders = rawHeaders as Record + } if (cfgHeaders) { for (const [k, v] of Object.entries(cfgHeaders)) if (v != null) headers[k] = v } From b401866bbb7b5fcb5f6d88cd9c4344e522115402 Mon Sep 17 00:00:00 2001 From: Sugam Devare Date: Sun, 10 May 2026 16:28:27 -0700 Subject: [PATCH 08/21] bench: fix default-merge bug that made every fetch abort instantly MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The constructor merged caller config over defaults via: this.cfg = { requestTimeoutMs: 600_000, retries: 3, ...cfg } opencode's provider loader invokes the factory with options that include explicit `undefined` for optional fields. 
Our factory in `provider/sdk/nemo-gym/index.ts` faithfully passes those through: new NemoGymLanguageModel(modelId, { ..., requestTimeoutMs: opts.requestTimeoutMs, // undefined retries: opts.retries, // undefined }) The `...cfg` spread then overwrites the 600_000 / 3 defaults with `undefined`. At runtime `setTimeout(fn, undefined)` is treated as `setTimeout(fn, 0)` — so the abort timer fires before `fetch` can even hand off the request. Result: NeMoGym chat completions failed after 3 attempts: AbortError: The operation was aborted. with ~10 ms total elapsed between step_start and the error event. Swap to the spread-first, coalesce-with-`??` pattern so undefineds fall through to the defaults instead of clobbering them. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../opencode/src/provider/sdk/nemo-gym/language-model.ts | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/packages/opencode/src/provider/sdk/nemo-gym/language-model.ts b/packages/opencode/src/provider/sdk/nemo-gym/language-model.ts index af81208bf6b2..fdb1ff9e034d 100644 --- a/packages/opencode/src/provider/sdk/nemo-gym/language-model.ts +++ b/packages/opencode/src/provider/sdk/nemo-gym/language-model.ts @@ -139,10 +139,14 @@ export class NemoGymLanguageModel implements LanguageModelV3 { constructor(modelId: string, cfg: NemoGymLanguageModelConfig) { this.modelId = modelId this.provider = cfg.provider + // Spread first, then coalesce — opencode's provider loader passes + // optional fields explicitly as `undefined`, and a default-then-spread + // pattern lets those undefineds overwrite the defaults. `??` only + // replaces null/undefined, preserving any real caller-supplied value. this.cfg = { - requestTimeoutMs: 600_000, - retries: 3, ...cfg, + requestTimeoutMs: cfg.requestTimeoutMs ?? 600_000, + retries: cfg.retries ?? 
3, } } From a2451878ef61447eaf3634c05d0b4537bf03761d Mon Sep 17 00:00:00 2001 From: Sugam Devare Date: Sun, 10 May 2026 16:59:15 -0700 Subject: [PATCH 09/21] bench: omit body.model when modelId is empty / 'default' When the input row's responses_create_params.model is unset, the gym side hands opencode an empty model string. opencode's session/provider resolver then falls back to its sentinel `"default"` (or threads the empty string straight through). We were faithfully POSTing \`model: "default"\` to the gym openai_model server, which forwarded it to OpenAI: The model `default` does not exist or you do not have access to it. The gym openai_model server already has the right pattern for this: body_dict.setdefault("model", self.config.openai_model) so omitting `model` from the outbound body lets the server fill in the policy-configured model name (e.g. gpt-4.1-2025-04-14) without any client-side knowledge of what the policy actually points to. Guard the model assignment in `_buildRequestParams` to do exactly that. The companion gym-side change is in OpenCodeHarnessProcessor.get_run_command: when body.model is empty, fall back to the resolved model_server_cfg.openai_model (or .model for vllm) before writing the per-instance config, so opencode also sees a real string when it's populating the agent's model field. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../src/provider/sdk/nemo-gym/language-model.ts | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/packages/opencode/src/provider/sdk/nemo-gym/language-model.ts b/packages/opencode/src/provider/sdk/nemo-gym/language-model.ts index fdb1ff9e034d..fc940f986320 100644 --- a/packages/opencode/src/provider/sdk/nemo-gym/language-model.ts +++ b/packages/opencode/src/provider/sdk/nemo-gym/language-model.ts @@ -343,7 +343,6 @@ export class NemoGymLanguageModel implements LanguageModelV3 { } const requestParams: Record = { - model: this.modelId, messages, max_tokens: options.maxOutputTokens, temperature: options.temperature, @@ -351,6 +350,16 @@ export class NemoGymLanguageModel implements LanguageModelV3 { stop: options.stopSequences, seed: options.seed, } + // Only include `model` when the caller-supplied modelId is a real value. + // opencode's session resolver falls back to its sentinel `"default"` + // (and to empty string with some misconfigured agents) when no model is + // pinned. We DO NOT want either of those leaking through to OpenAI as a + // literal `model: "default"` — the gym openai_model server's + // `body_dict.setdefault("model", self.config.openai_model)` will fill in + // the policy-configured model name when we omit it instead. 
+ if (this.modelId && this.modelId !== "default") { + requestParams.model = this.modelId + } if (tools && (tools as unknown[]).length) requestParams.tools = tools if (toolChoice) requestParams.tool_choice = toolChoice From 06724b5a2a058f8b876084269d20c6eeebcf42a6 Mon Sep 17 00:00:00 2001 From: Sugam Devare Date: Sun, 10 May 2026 17:57:08 -0700 Subject: [PATCH 10/21] bench: enable todowrite tool for SWE-bench agent MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The todowrite tool is just an in-session planning aid — explicit multi-step task lists help the model decompose non-trivial fixes without affecting token-ID contiguity or subagent dispatch. Was disabled defensively; flipping it on. Co-Authored-By: Claude Opus 4.7 (1M context) Signed-off-by: Sugam Devare --- packages/opencode/src/bench/cli.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/opencode/src/bench/cli.ts b/packages/opencode/src/bench/cli.ts index e3e5b0f23f94..d8055570a769 100644 --- a/packages/opencode/src/bench/cli.ts +++ b/packages/opencode/src/bench/cli.ts @@ -197,7 +197,7 @@ async function buildConfigDir(args: { websearch: false, task: false, skill: false, - todowrite: false, + todowrite: true, }, steps: args.maxTurns, options: {}, From 2fec7806fe2c75c00644435dfa7dda2a28069647 Mon Sep 17 00:00:00 2001 From: Sugam Devare Date: Sun, 10 May 2026 18:29:34 -0700 Subject: [PATCH 11/21] bench: port _deep_reset_to_base_commit from nv-OpenHands Strip git history past base_commit so the agent can't reach future commits via `git checkout ` / `git log` exploration. Two-pass: careful per-ref iteration first; nuclear batch-delete fallback for monorepos with thousands of refs (e.g. datadog-agent with 5k+ release tags would time out the careful pass). Runs once in cli.ts before spawning the opencode session, scoped to the per-instance workspace, using the base_commit field from the instance JSONL. 
`|| true` at the tail so a busted git state can't kill the rollout. Co-Authored-By: Claude Opus 4.7 (1M context) Signed-off-by: Sugam Devare --- packages/opencode/src/bench/cli.ts | 5 ++ packages/opencode/src/bench/deep_reset.ts | 91 +++++++++++++++++++++++ 2 files changed, 96 insertions(+) create mode 100644 packages/opencode/src/bench/deep_reset.ts diff --git a/packages/opencode/src/bench/cli.ts b/packages/opencode/src/bench/cli.ts index d8055570a769..f3ca8995ad77 100644 --- a/packages/opencode/src/bench/cli.ts +++ b/packages/opencode/src/bench/cli.ts @@ -23,6 +23,7 @@ import { existsSync, promises as fs, readFileSync } from "node:fs" import path from "node:path" import os from "node:os" import { spawn } from "node:child_process" +import { runDeepReset } from "./deep_reset" interface CliArgs { instanceDictPath: string @@ -103,6 +104,7 @@ interface InstanceDict { repo?: string repo_name?: string workspace?: string + base_commit?: string [key: string]: unknown } @@ -357,6 +359,9 @@ async function main() { OPENCODE_PURE: "1", } + // Prune git history past base_commit so the agent can't reach future commits. + await runDeepReset(workspaceRoot, String(instance.base_commit ?? "")) + const opencodeBin = detectOpencodeBin() const result = await runOpencode({ workspaceRoot, diff --git a/packages/opencode/src/bench/deep_reset.ts b/packages/opencode/src/bench/deep_reset.ts new file mode 100644 index 000000000000..89a9c9d1834a --- /dev/null +++ b/packages/opencode/src/bench/deep_reset.ts @@ -0,0 +1,91 @@ +/** + * Strip git history past base_commit so the agent can't reach future commits. + * + * Port of nv-OpenHands' `_deep_reset_to_base_commit` + * (evaluation/benchmarks/swe_bench/run_infer.py:774). Two-pass design: + * + * - Careful pass: per-ref iteration with `git for-each-ref`. Preserves + * local branches that don't descend from base, resets branches that do, + * deletes tags/remote-tracking/stash/notes refs past base. 
+ * - Nuclear fallback: batch-delete every tag/remote/stash/notes ref + every + * local branch in two `git update-ref --stdin` calls. Microseconds + * regardless of ref count — handles monorepos with thousands of refs + * where the careful pass times out. + * + * `|| true` at the very end so a busted git state can't kill the agent run. + */ + +import { spawn } from "node:child_process" + +function carefulPass(baseCommit: string): string { + return ( + `BASE=$(git rev-parse --verify ${baseCommit}^{commit}) && ` + + `ORIG_BRANCH=$(git symbolic-ref --short -q HEAD || echo main) && ` + + `git checkout --detach "$BASE" && ` + + `git for-each-ref --format="%(refname)" refs/heads | while read -r ref; do ` + + ` tip=$(git rev-parse -q --verify "$ref^{commit}" 2>/dev/null || true); ` + + ` [ -z "$tip" ] && continue; ` + + ` if [ "$tip" != "$BASE" ] && git merge-base --is-ancestor "$BASE" "$tip"; then ` + + ` git update-ref "$ref" "$BASE"; ` + + ` fi; ` + + `done && ` + + `git for-each-ref --format="%(refname)" refs | while read -r ref; do ` + + ` case "$ref" in refs/heads/*) continue ;; esac; ` + + ` if git symbolic-ref -q "$ref" >/dev/null 2>&1; then continue; fi; ` + + ` tip=$(git rev-parse -q --verify "$ref^{commit}" 2>/dev/null || true); ` + + ` [ -z "$tip" ] && continue; ` + + ` if [ "$tip" != "$BASE" ] && git merge-base --is-ancestor "$BASE" "$tip"; then ` + + ` git update-ref -d "$ref"; ` + + ` fi; ` + + `done && ` + + `for r in $(git remote); do git remote remove "$r"; done; ` + + `gd=$(git rev-parse --git-dir) && ` + + `rm -f "$gd"/FETCH_HEAD "$gd"/ORIG_HEAD "$gd"/MERGE_HEAD "$gd"/CHERRY_PICK_HEAD ` + + `"$gd"/REVERT_HEAD "$gd"/BISECT_HEAD "$gd"/AUTO_MERGE && ` + + `git reflog expire --expire=now --expire-unreachable=now --all && ` + + `git repack -ad && git prune --expire=now && git gc --prune=now && ` + + `git checkout -B "$ORIG_BRANCH" "$BASE"` + ) +} + +function nuclearPass(baseCommit: string): string { + return ( + `BASE=$(git rev-parse --verify 
${baseCommit}^{commit}) && ` + + `ORIG_BRANCH=$(git symbolic-ref --short -q HEAD || echo main) && ` + + `git checkout --detach "$BASE" && ` + + `for r in $(git remote); do git remote remove "$r"; done; ` + + `git for-each-ref --format="delete %(refname)" refs/tags refs/remotes refs/stash refs/notes 2>/dev/null ` + + `| git update-ref --stdin; ` + + `git for-each-ref --format="delete %(refname)" refs/heads | git update-ref --stdin; ` + + `gd=$(git rev-parse --git-dir) && ` + + `rm -f "$gd"/FETCH_HEAD "$gd"/ORIG_HEAD "$gd"/MERGE_HEAD "$gd"/CHERRY_PICK_HEAD ` + + `"$gd"/REVERT_HEAD "$gd"/BISECT_HEAD "$gd"/AUTO_MERGE && ` + + `git reflog expire --expire=now --expire-unreachable=now --all && ` + + `git repack -ad && git prune --expire=now && git gc --prune=now && ` + + `git checkout -B "$ORIG_BRANCH" "$BASE"` + ) +} + +export function buildDeepResetCmd(baseCommit: string): string { + return `( ${carefulPass(baseCommit)} ) || ( ${nuclearPass(baseCommit)} ) || true` +} + +export async function runDeepReset(workspaceRoot: string, baseCommit: string): Promise { + if (!baseCommit) return + const cmd = buildDeepResetCmd(baseCommit) + console.log(`[bench] deep_reset workspace=${workspaceRoot} base=${baseCommit}`) + await new Promise((resolve) => { + const child = spawn("bash", ["-c", cmd], { + cwd: workspaceRoot, + stdio: ["ignore", "inherit", "inherit"], + }) + child.on("close", (code) => { + console.log(`[bench] deep_reset exit=${code ?? 0}`) + resolve() + }) + child.on("error", (err) => { + console.warn(`[bench] deep_reset spawn error: ${err}`) + resolve() + }) + }) +} From 1b8998ad7d0c205286f7341f1de219381090afd3 Mon Sep 17 00:00:00 2001 From: Sugam Devare Date: Sun, 10 May 2026 18:30:57 -0700 Subject: [PATCH 12/21] bench: add progress echoes to deep_reset careful + nuclear passes Mark each phase + each ref delete/reset with a `[deep_reset:careful]` or `[deep_reset:nuclear]` line so the agent log shows what happened. 
Mirrors the per-step `echo` markers in nv-OpenHands' run_infer.py deep-reset path. Co-Authored-By: Claude Opus 4.7 (1M context) Signed-off-by: Sugam Devare --- packages/opencode/src/bench/deep_reset.ts | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/packages/opencode/src/bench/deep_reset.ts b/packages/opencode/src/bench/deep_reset.ts index 89a9c9d1834a..c6dd07cda5d1 100644 --- a/packages/opencode/src/bench/deep_reset.ts +++ b/packages/opencode/src/bench/deep_reset.ts @@ -19,50 +19,65 @@ import { spawn } from "node:child_process" function carefulPass(baseCommit: string): string { return ( + `echo "[deep_reset:careful] start" && ` + `BASE=$(git rev-parse --verify ${baseCommit}^{commit}) && ` + `ORIG_BRANCH=$(git symbolic-ref --short -q HEAD || echo main) && ` + + `echo "[deep_reset:careful] base=$BASE orig_branch=$ORIG_BRANCH" && ` + `git checkout --detach "$BASE" && ` + + `echo "[deep_reset:careful] resetting local branches descending from base..." && ` + `git for-each-ref --format="%(refname)" refs/heads | while read -r ref; do ` + ` tip=$(git rev-parse -q --verify "$ref^{commit}" 2>/dev/null || true); ` + ` [ -z "$tip" ] && continue; ` + ` if [ "$tip" != "$BASE" ] && git merge-base --is-ancestor "$BASE" "$tip"; then ` + + ` echo "[deep_reset:careful] reset $ref -> $BASE"; ` + ` git update-ref "$ref" "$BASE"; ` + ` fi; ` + `done && ` + + `echo "[deep_reset:careful] deleting tags/remotes/stash/notes past base..." 
&& ` + `git for-each-ref --format="%(refname)" refs | while read -r ref; do ` + ` case "$ref" in refs/heads/*) continue ;; esac; ` + ` if git symbolic-ref -q "$ref" >/dev/null 2>&1; then continue; fi; ` + ` tip=$(git rev-parse -q --verify "$ref^{commit}" 2>/dev/null || true); ` + ` [ -z "$tip" ] && continue; ` + ` if [ "$tip" != "$BASE" ] && git merge-base --is-ancestor "$BASE" "$tip"; then ` + + ` echo "[deep_reset:careful] delete $ref"; ` + ` git update-ref -d "$ref"; ` + ` fi; ` + `done && ` + - `for r in $(git remote); do git remote remove "$r"; done; ` + + `echo "[deep_reset:careful] removing remotes + transient refs..." && ` + + `for r in $(git remote); do echo "[deep_reset:careful] rm remote $r"; git remote remove "$r"; done; ` + `gd=$(git rev-parse --git-dir) && ` + `rm -f "$gd"/FETCH_HEAD "$gd"/ORIG_HEAD "$gd"/MERGE_HEAD "$gd"/CHERRY_PICK_HEAD ` + `"$gd"/REVERT_HEAD "$gd"/BISECT_HEAD "$gd"/AUTO_MERGE && ` + + `echo "[deep_reset:careful] expiring reflog + gc..." && ` + `git reflog expire --expire=now --expire-unreachable=now --all && ` + `git repack -ad && git prune --expire=now && git gc --prune=now && ` + - `git checkout -B "$ORIG_BRANCH" "$BASE"` + `git checkout -B "$ORIG_BRANCH" "$BASE" && ` + + `echo "[deep_reset:careful] done; HEAD=$ORIG_BRANCH at $BASE"` ) } function nuclearPass(baseCommit: string): string { return ( + `echo "[deep_reset:nuclear] careful pass failed; running batch-delete fallback" && ` + `BASE=$(git rev-parse --verify ${baseCommit}^{commit}) && ` + `ORIG_BRANCH=$(git symbolic-ref --short -q HEAD || echo main) && ` + + `echo "[deep_reset:nuclear] base=$BASE orig_branch=$ORIG_BRANCH" && ` + `git checkout --detach "$BASE" && ` + - `for r in $(git remote); do git remote remove "$r"; done; ` + + `for r in $(git remote); do echo "[deep_reset:nuclear] rm remote $r"; git remote remove "$r"; done; ` + + `echo "[deep_reset:nuclear] batch-delete tags/remotes/stash/notes..." 
&& ` + `git for-each-ref --format="delete %(refname)" refs/tags refs/remotes refs/stash refs/notes 2>/dev/null ` + `| git update-ref --stdin; ` + + `echo "[deep_reset:nuclear] batch-delete local branches..." && ` + `git for-each-ref --format="delete %(refname)" refs/heads | git update-ref --stdin; ` + `gd=$(git rev-parse --git-dir) && ` + `rm -f "$gd"/FETCH_HEAD "$gd"/ORIG_HEAD "$gd"/MERGE_HEAD "$gd"/CHERRY_PICK_HEAD ` + `"$gd"/REVERT_HEAD "$gd"/BISECT_HEAD "$gd"/AUTO_MERGE && ` + + `echo "[deep_reset:nuclear] expiring reflog + gc..." && ` + `git reflog expire --expire=now --expire-unreachable=now --all && ` + `git repack -ad && git prune --expire=now && git gc --prune=now && ` + - `git checkout -B "$ORIG_BRANCH" "$BASE"` + `git checkout -B "$ORIG_BRANCH" "$BASE" && ` + + `echo "[deep_reset:nuclear] done; HEAD=$ORIG_BRANCH at $BASE"` ) } From 9b3d676695dd2e8e6fb6a6f702ba424bf0e7e250 Mon Sep 17 00:00:00 2001 From: Sugam Devare Date: Mon, 11 May 2026 10:27:17 -0700 Subject: [PATCH 13/21] bench: capture per-session trajectories + add --enable-subagents flag - language-model.ts: read sessionID/parentSessionID from `x-session-affinity` / `x-parent-session-id` request headers (set by opencode's session/llm.ts for non-opencode providers). Per-session turn counter prevents subagent dumps from clobbering the main session's. Filename includes sessionID so each session's per-turn JSONs sit side-by-side; payload now carries session_id, parent_session_id, and turn for downstream reconstruction of the agent tree. - cli.ts: new --enable-subagents flag; toggles `task: <bool>` in the per-instance opencode.jsonc. Default off. - run_infer.sh: forwards ENABLE_SUBAGENTS env -> --enable-subagents flag.
Co-Authored-By: Claude Opus 4.7 (1M context) Signed-off-by: Sugam Devare --- .../benchmarks/swe_bench/scripts/run_infer.sh | 3 ++ packages/opencode/src/bench/cli.ts | 17 +++++++- .../provider/sdk/nemo-gym/language-model.ts | 42 ++++++++++++++++--- 3 files changed, 54 insertions(+), 8 deletions(-) diff --git a/evaluation/benchmarks/swe_bench/scripts/run_infer.sh b/evaluation/benchmarks/swe_bench/scripts/run_infer.sh index 8392a21b8b0e..2edf91b721cc 100755 --- a/evaluation/benchmarks/swe_bench/scripts/run_infer.sh +++ b/evaluation/benchmarks/swe_bench/scripts/run_infer.sh @@ -110,6 +110,9 @@ cmd=( if [ -n "$SYSTEM_PROMPT_PATH" ]; then cmd+=(--system-prompt "$SYSTEM_PROMPT_PATH") fi +if [ "${ENABLE_SUBAGENTS:-0}" = "1" ] || [ "${ENABLE_SUBAGENTS:-}" = "true" ]; then + cmd+=(--enable-subagents) +fi echo "Executing: ${cmd[*]}" exec "${cmd[@]}" diff --git a/packages/opencode/src/bench/cli.ts b/packages/opencode/src/bench/cli.ts index f3ca8995ad77..f86afd9c311b 100644 --- a/packages/opencode/src/bench/cli.ts +++ b/packages/opencode/src/bench/cli.ts @@ -39,10 +39,18 @@ interface CliArgs { /** Pre-rendered user message file (workspace_path baked in by gym). */ userMessageFile: string systemPromptPath?: string + /** Enable opencode's `task` tool (spawns subagent sessions). 
*/ + enableSubagents: boolean } function parseArgs(argv: string[]): CliArgs { - const out: Partial = { maxTurns: 100, agentCls: "OpenCodeAgent", dataset: "", split: "test" } + const out: Partial = { + maxTurns: 100, + agentCls: "OpenCodeAgent", + dataset: "", + split: "test", + enableSubagents: false, + } for (let i = 0; i < argv.length; i++) { const a = argv[i] const next = () => argv[++i] @@ -80,6 +88,9 @@ function parseArgs(argv: string[]): CliArgs { case "--system-prompt": out.systemPromptPath = next() break + case "--enable-subagents": + out.enableSubagents = true + break default: if (a.startsWith("--")) throw new Error(`Unknown flag: ${a}`) } @@ -143,6 +154,7 @@ async function buildConfigDir(args: { completionsDir: string maxTurns: number systemPromptPath?: string + enableSubagents: boolean }): Promise<{ tmpRoot: string; configFile: string }> { const tmpRoot = await fs.mkdtemp(path.join(os.tmpdir(), `bench-${args.instanceId}-`)) await fs.mkdir(tmpRoot, { recursive: true }) @@ -197,7 +209,7 @@ async function buildConfigDir(args: { apply_patch: true, webfetch: false, websearch: false, - task: false, + task: args.enableSubagents, skill: false, todowrite: true, }, @@ -346,6 +358,7 @@ async function main() { completionsDir, maxTurns: args.maxTurns, systemPromptPath: args.systemPromptPath, + enableSubagents: args.enableSubagents, }) const startedAt = Date.now() diff --git a/packages/opencode/src/provider/sdk/nemo-gym/language-model.ts b/packages/opencode/src/provider/sdk/nemo-gym/language-model.ts index fc940f986320..6800fc568798 100644 --- a/packages/opencode/src/provider/sdk/nemo-gym/language-model.ts +++ b/packages/opencode/src/provider/sdk/nemo-gym/language-model.ts @@ -135,14 +135,14 @@ export class NemoGymLanguageModel implements LanguageModelV3 { private readonly cfg: NemoGymLanguageModelConfig private cookies: Record = {} + // Per-session turn counter. 
opencode's session header is `x-session-affinity`; + // subagents spawned via the task tool get their own sessionID, so keeping + // a Map keeps their dump filenames from clobbering the main session's. + private readonly turnCounters: Map = new Map() constructor(modelId: string, cfg: NemoGymLanguageModelConfig) { this.modelId = modelId this.provider = cfg.provider - // Spread first, then coalesce — opencode's provider loader passes - // optional fields explicitly as `undefined`, and a default-then-spread - // pattern lets those undefineds overwrite the defaults. `??` only - // replaces null/undefined, preserving any real caller-supplied value. this.cfg = { ...cfg, requestTimeoutMs: cfg.requestTimeoutMs ?? 600_000, @@ -150,6 +150,25 @@ export class NemoGymLanguageModel implements LanguageModelV3 { } } + private _nextTurn(sessionID: string): number { + const n = (this.turnCounters.get(sessionID) ?? -1) + 1 + this.turnCounters.set(sessionID, n) + return n + } + + private _sessionFromHeaders(headers: unknown): { sessionID: string; parentSessionID: string | undefined } { + let sid = "" + let pid: string | undefined + if (headers && typeof headers === "object") { + const h = headers as Record + const v = h["x-session-affinity"] ?? h["X-Session-Affinity"] + if (typeof v === "string") sid = v + const p = h["x-parent-session-id"] ?? h["X-Parent-Session-Id"] + if (typeof p === "string") pid = p + } + return { sessionID: sid || "main", parentSessionID: pid } + } + get supportedUrls() { return {} as Record } @@ -158,6 +177,7 @@ export class NemoGymLanguageModel implements LanguageModelV3 { // implement doGenerate for completeness / future direct-use. 
async doGenerate(options: LanguageModelV3CallOptions) { const { warnings, messages, requestParams } = await this._buildRequestParams(options) + const session = this._sessionFromHeaders(options.headers) const { responseJson } = await this._postChat(requestParams) const choice = responseJson.choices[0] @@ -186,6 +206,7 @@ export class NemoGymLanguageModel implements LanguageModelV3 { response: responseJson, providerSpecificFields, requestParams, + session, }) return { @@ -201,6 +222,7 @@ export class NemoGymLanguageModel implements LanguageModelV3 { async doStream(options: LanguageModelV3CallOptions) { const { warnings, messages, requestParams } = await this._buildRequestParams(options) + const session = this._sessionFromHeaders(options.headers) // Fire the HTTP call eagerly so any error surfaces synchronously when the // stream is consumed. We then synthesize parts in `start`. @@ -274,6 +296,7 @@ export class NemoGymLanguageModel implements LanguageModelV3 { response: responseJson, providerSpecificFields, requestParams, + session, }) controller.enqueue({ @@ -494,8 +517,9 @@ export class NemoGymLanguageModel implements LanguageModelV3 { response: ChatResponse providerSpecificFields: Record requestParams: Record + session: { sessionID: string; parentSessionID: string | undefined } }) { - const turn = this.cfg.turnCounter ? this.cfg.turnCounter.next() : Date.now() + const turn = this._nextTurn(args.session.sessionID) if (this.cfg.onCompletion) { try { await this.cfg.onCompletion({ turn, ...args }) @@ -510,7 +534,10 @@ export class NemoGymLanguageModel implements LanguageModelV3 { await fs.mkdir(this.cfg.completionsDir, { recursive: true }) const turnStr = String(turn).padStart(4, "0") const safeModel = this.modelId.replace(/\//g, "__") - const fname = `${safeModel}-${turnStr}-${Date.now()}.json` + // sessionID is part of the filename so subagent dumps don't clobber the + // main session's. Sanitized for filesystem safety. 
+ const safeSession = args.session.sessionID.replace(/[^A-Za-z0-9_-]/g, "_") + const fname = `${safeModel}-${safeSession}-${turnStr}-${Date.now()}.json` const fpath = path.join(this.cfg.completionsDir, fname) const kwargs: Record = {} for (const [k, v] of Object.entries(args.requestParams)) { @@ -521,6 +548,9 @@ export class NemoGymLanguageModel implements LanguageModelV3 { response: args.response, provider_specific_fields: args.providerSpecificFields, kwargs, + session_id: args.session.sessionID, + parent_session_id: args.session.parentSessionID ?? null, + turn, timestamp: Date.now() / 1000, } const tmp = `${fpath}.tmp` From 6495092f9b636c15917f653a4306fe645b3898a9 Mon Sep 17 00:00:00 2001 From: Sugam Devare Date: Mon, 11 May 2026 13:53:20 -0700 Subject: [PATCH 14/21] bench: use opencode's anthropic.txt as default system prompt Replaces the 12-line bench-local DEFAULT_SYSTEM_PROMPT with the real anthropic system prompt opencode ships (`session/prompt/anthropic.txt`, ~105 lines), plus a short SWE-bench addendum (don't commit/format the diff, harness captures git diff as the patch, don't edit tests). `--system-prompt <path>` override still wins when supplied. Co-Authored-By: Claude Opus 4.7 (1M context) Signed-off-by: Sugam Devare --- packages/opencode/src/bench/cli.ts | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/packages/opencode/src/bench/cli.ts b/packages/opencode/src/bench/cli.ts index f86afd9c311b..aa569b3a221f 100644 --- a/packages/opencode/src/bench/cli.ts +++ b/packages/opencode/src/bench/cli.ts @@ -24,6 +24,9 @@ import path from "node:path" import os from "node:os" import { spawn } from "node:child_process" import { runDeepReset } from "./deep_reset" +// opencode's built-in anthropic system prompt — Bun bundles .txt as a string. +// Used as the default when no --system-prompt override is passed.
+import PROMPT_ANTHROPIC from "../session/prompt/anthropic.txt" interface CliArgs { instanceDictPath: string @@ -135,18 +138,18 @@ function loadGymConfig(configPath: string): Record { return JSON.parse(readFileSync(configPath, "utf8")) } -const DEFAULT_SYSTEM_PROMPT = `You are an autonomous software engineer fixing a known issue in a checked-out git repository. +// Default is opencode's built-in anthropic system prompt + a short SWE-bench +// addendum (workspace is git-tracked, harness captures git diff as the patch, +// don't commit/format the diff yourself, don't modify the test files). +const SWE_BENCH_ADDENDUM = ` -Work in small, deliberate steps: -1. Read the issue and explore the relevant files. -2. Reproduce the issue if applicable. -3. Edit the source to fix the issue. -4. Run the project's tests to verify the fix. -5. Iterate until the issue is resolved. +# SWE-bench harness context -Use the available tools (bash, edit, read, glob, grep) to investigate and act. Do NOT modify the test files unless the task explicitly says so. The harness will capture the final \`git diff\` of the workspace as your patch — do not commit or format the diff yourself. +You are running inside a SWE-bench evaluation harness on a checked-out git repository. The harness will capture the final \`git diff\` of the workspace as your patch — do not commit, push, or format the diff yourself. Do NOT modify the test files unless the task explicitly says so. Stop calling tools once you are confident the issue is fully resolved. 
` +const DEFAULT_SYSTEM_PROMPT = PROMPT_ANTHROPIC + SWE_BENCH_ADDENDUM + async function buildConfigDir(args: { instanceId: string modelName: string From d850c8fd717494b155221c3caa49ab33505262f2 Mon Sep 17 00:00:00 2001 From: Sugam Devare Date: Mon, 11 May 2026 14:45:16 -0700 Subject: [PATCH 15/21] bench: gate dynamic system env block behind OPENCODE_DISABLE_ENV_PROMPT MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The env block in session/system.ts (cwd, worktree, platform, today's date) is appended to the system prompt every turn. The `Today's date` field uses `new Date().toDateString()` — across a midnight rollover the system message shifts, which breaks the RL prompt-token-prefix invariant (turn[N+1].prompt_token_ids must extend turn[N]'s). Bench mode sets OPENCODE_DISABLE_ENV_PROMPT=1 in the child env so the environment() effect short-circuits to []. Normal `opencode run` is unaffected. Co-Authored-By: Claude Opus 4.7 (1M context) Signed-off-by: Sugam Devare --- packages/opencode/src/bench/cli.ts | 4 ++++ packages/opencode/src/session/system.ts | 5 +++++ 2 files changed, 9 insertions(+) diff --git a/packages/opencode/src/bench/cli.ts b/packages/opencode/src/bench/cli.ts index aa569b3a221f..3635bd1e6336 100644 --- a/packages/opencode/src/bench/cli.ts +++ b/packages/opencode/src/bench/cli.ts @@ -373,6 +373,10 @@ async function main() { OPENCODE_CONFIG: configFile, // Disable opencode's built-in plugin loaders; the bench harness doesn't need them. OPENCODE_PURE: "1", + // Skip the dynamic env block (working dir + Today's date) in the system + // prompt — keeps the RL prompt-token prefix invariant stable across turns + // (a midnight rollover would otherwise shift `Today's date: ...`). + OPENCODE_DISABLE_ENV_PROMPT: "1", } // Prune git history past base_commit so the agent can't reach future commits. 
diff --git a/packages/opencode/src/session/system.ts b/packages/opencode/src/session/system.ts index 06c71fa7dbdd..acde90b448d7 100644 --- a/packages/opencode/src/session/system.ts +++ b/packages/opencode/src/session/system.ts @@ -46,6 +46,11 @@ export const layer = Layer.effect( return Service.of({ environment: Effect.fn("SystemPrompt.environment")(function* (model: Provider.Model) { + // Bench / RL mode: skip the dynamic env block entirely. It includes + // `new Date().toDateString()` which would shift prompt tokens across a + // midnight rollover and break the RL contiguity invariant + // (prompt_token_ids[N+1] must extend prompt_token_ids[N]). + if (process.env.OPENCODE_DISABLE_ENV_PROMPT === "1") return [] const ctx = yield* InstanceState.context return [ [ From 01f0889a96cbb63c17ee54e70a52f7063864830b Mon Sep 17 00:00:00 2001 From: Sugam Devare Date: Mon, 11 May 2026 18:02:29 -0700 Subject: [PATCH 16/21] bench: spawn shell/bun/git via absolute paths to dodge SIF PATH quirks Some apptainer images ENOENT on bare program names through Bun's posix_spawn (libuv falls back to PATH-less execv on some kernels / musl builds). Symptoms in the wild: [bench] deep_reset spawn error: ENOENT posix_spawn 'bash' [bench] wrote ... error=opencode_exit_999 # bun spawn also ENOENT Fixes: - deep_reset.ts: probe /bin/bash, /usr/bin/bash, /bin/sh, /usr/bin/sh and spawn the first that exists. The script is POSIX-compatible so /bin/sh works fine when bash is absent (minimal/distroless SIFs). Skip deep_reset with a warning if no shell is found. - cli.ts runOpencode: use process.execPath (the currently-running bun binary) instead of bare "bun". - cli.ts captureGitDiff: probe /usr/bin/git, /bin/git, /usr/local/bin/git with bare "git" as last-ditch fallback. 
Co-Authored-By: Claude Opus 4.7 (1M context) Signed-off-by: Sugam Devare --- packages/opencode/src/bench/cli.ts | 18 ++++++++++++++++-- packages/opencode/src/bench/deep_reset.ts | 22 ++++++++++++++++++++-- 2 files changed, 36 insertions(+), 4 deletions(-) diff --git a/packages/opencode/src/bench/cli.ts b/packages/opencode/src/bench/cli.ts index 3635bd1e6336..759bc753638b 100644 --- a/packages/opencode/src/bench/cli.ts +++ b/packages/opencode/src/bench/cli.ts @@ -230,6 +230,16 @@ async function buildConfigDir(args: { return { tmpRoot, configFile } } +// Some SIFs ship with bare PATH lookups that ENOENT on bare program names +// through Bun's posix_spawn. Resolve to an absolute path up front for any +// binary we shell out to. +function detectBin(candidates: string[]): string | null { + for (const p of candidates) { + if (existsSync(p)) return p + } + return null +} + function runOpencode(args: { workspaceRoot: string modelName: string @@ -238,9 +248,12 @@ function runOpencode(args: { opencodeBin: string agent: string }): Promise<{ exitCode: number; stdout: string; stderr: string }> { + // Use the same bun binary that's currently running — guaranteed to exist + // and avoids PATH lookup quirks under Bun's posix_spawn. + const bunPath = process.execPath return new Promise((resolve) => { const child = spawn( - "bun", + bunPath, [ args.opencodeBin, "run", @@ -283,8 +296,9 @@ function runOpencode(args: { } async function captureGitDiff(workspaceRoot: string): Promise { + const gitPath = detectBin(["/usr/bin/git", "/bin/git", "/usr/local/bin/git"]) ?? 
"git" return new Promise((resolve) => { - const child = spawn("git", ["-C", workspaceRoot, "diff"], { + const child = spawn(gitPath, ["-C", workspaceRoot, "diff"], { env: { ...process.env, GIT_PAGER: "cat" }, }) let stdout = "" diff --git a/packages/opencode/src/bench/deep_reset.ts b/packages/opencode/src/bench/deep_reset.ts index c6dd07cda5d1..77167e1530c6 100644 --- a/packages/opencode/src/bench/deep_reset.ts +++ b/packages/opencode/src/bench/deep_reset.ts @@ -16,6 +16,19 @@ */ import { spawn } from "node:child_process" +import { existsSync } from "node:fs" + +// Some SIFs are minimal and ship without `bash` on PATH, or Bun's posix_spawn +// doesn't fall back to PATH lookup the way `execvp` does — either way, +// spawn("bash", ...) ENOENTs. Probe absolute paths up front; the deep-reset +// script uses only POSIX features, so /bin/sh is a safe fallback if bash is +// absent. +function detectShell(): string | null { + for (const p of ["/bin/bash", "/usr/bin/bash", "/bin/sh", "/usr/bin/sh"]) { + if (existsSync(p)) return p + } + return null +} function carefulPass(baseCommit: string): string { return ( @@ -87,10 +100,15 @@ export function buildDeepResetCmd(baseCommit: string): string { export async function runDeepReset(workspaceRoot: string, baseCommit: string): Promise { if (!baseCommit) return + const shell = detectShell() + if (!shell) { + console.warn(`[bench] deep_reset skipped: no shell found at /bin/{bash,sh} or /usr/bin/{bash,sh}`) + return + } const cmd = buildDeepResetCmd(baseCommit) - console.log(`[bench] deep_reset workspace=${workspaceRoot} base=${baseCommit}`) + console.log(`[bench] deep_reset workspace=${workspaceRoot} base=${baseCommit} shell=${shell}`) await new Promise((resolve) => { - const child = spawn("bash", ["-c", cmd], { + const child = spawn(shell, ["-c", cmd], { cwd: workspaceRoot, stdio: ["ignore", "inherit", "inherit"], }) From cdfc4b76aa08939cf581b578e5c0308808b80140 Mon Sep 17 00:00:00 2001 From: Sugam Devare Date: Mon, 11 May 2026 
18:09:51 -0700 Subject: [PATCH 17/21] bench: drop spawn cwd, route chdir via shell / opencode --dir The previous fix moved to absolute binary paths but `/bin/bash` still ENOENTs through Bun's posix_spawn whenever the spawn passes a `cwd` option on minimal apptainer images (e.g., SWE-bench's astropy SIF). Root cause: libc lacks `posix_spawn_file_actions_addchdir_np`, so libuv's cwd-handling fallback fails the spawn outright instead of falling back to fork+chdir+exec. Workaround: don't pass `cwd` to spawn at all. - deep_reset.ts: prepend `cd <workspace> && ` to the shell script itself (shellQuote handles paths with spaces / special chars). - cli.ts runOpencode: drop `cwd: workspaceRoot` from the bun spawn. Opencode's existing `--dir <workspace>` flag changes its own working directory, so spawn-level cwd was redundant anyway. - captureGitDiff already uses `git -C <workspace>` (no cwd). Co-Authored-By: Claude Opus 4.7 (1M context) Signed-off-by: Sugam Devare --- packages/opencode/src/bench/cli.ts | 5 ++++- packages/opencode/src/bench/deep_reset.ts | 11 +++++++++-- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/packages/opencode/src/bench/cli.ts b/packages/opencode/src/bench/cli.ts index 759bc753638b..8c9643b61b96 100644 --- a/packages/opencode/src/bench/cli.ts +++ b/packages/opencode/src/bench/cli.ts @@ -252,6 +252,10 @@ function runOpencode(args: { // and avoids PATH lookup quirks under Bun's posix_spawn. const bunPath = process.execPath return new Promise((resolve) => { + // Don't set spawn's `cwd` — Bun's posix_spawn on some minimal apptainer + // images ENOENTs whenever cwd is set (libc lacks addchdir_np). Opencode's + // own `--dir <workspace>` flag changes the working directory + // internally, so we don't need spawn-level cwd.
const child = spawn( bunPath, [ @@ -269,7 +273,6 @@ function runOpencode(args: { args.workspaceRoot, ], { - cwd: args.workspaceRoot, env: args.env, stdio: ["ignore", "pipe", "pipe"], }, diff --git a/packages/opencode/src/bench/deep_reset.ts b/packages/opencode/src/bench/deep_reset.ts index 77167e1530c6..543815f454a5 100644 --- a/packages/opencode/src/bench/deep_reset.ts +++ b/packages/opencode/src/bench/deep_reset.ts @@ -98,6 +98,10 @@ export function buildDeepResetCmd(baseCommit: string): string { return `( ${carefulPass(baseCommit)} ) || ( ${nuclearPass(baseCommit)} ) || true` } +function shellQuote(s: string): string { + return `'${s.replace(/'/g, `'\\''`)}'` +} + export async function runDeepReset(workspaceRoot: string, baseCommit: string): Promise { if (!baseCommit) return const shell = detectShell() @@ -105,11 +109,14 @@ export async function runDeepReset(workspaceRoot: string, baseCommit: string): P console.warn(`[bench] deep_reset skipped: no shell found at /bin/{bash,sh} or /usr/bin/{bash,sh}`) return } - const cmd = buildDeepResetCmd(baseCommit) + // Bake `cd ` into the shell script instead of passing the `cwd` + // option to spawn(). On some minimal apptainer images Bun's posix_spawn + // ENOENTs whenever a `cwd` is set (libc lacks addchdir_np extension); routing + // the chdir through the shell sidesteps that entirely. 
+ const cmd = `cd ${shellQuote(workspaceRoot)} && ` + buildDeepResetCmd(baseCommit) console.log(`[bench] deep_reset workspace=${workspaceRoot} base=${baseCommit} shell=${shell}`) await new Promise((resolve) => { const child = spawn(shell, ["-c", cmd], { - cwd: workspaceRoot, stdio: ["ignore", "inherit", "inherit"], }) child.on("close", (code) => { From b9602b100aefee5d8ba03bfdd57858e3955bc1a3 Mon Sep 17 00:00:00 2001 From: Sugam Devare Date: Mon, 18 May 2026 14:17:35 -0700 Subject: [PATCH 18/21] feat: change nudge to user message --- packages/opencode/src/session/prompt.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/opencode/src/session/prompt.ts b/packages/opencode/src/session/prompt.ts index fef8c438366c..2e53938e04e9 100644 --- a/packages/opencode/src/session/prompt.ts +++ b/packages/opencode/src/session/prompt.ts @@ -1581,7 +1581,7 @@ NOTE: At any point in time through this workflow you should feel free to ask the sessionID, parentSessionID: session.parentID, system, - messages: [...modelMsgs, ...(isLastStep ? [{ role: "assistant" as const, content: MAX_STEPS }] : [])], + messages: [...modelMsgs, ...(isLastStep ? [{ role: "user" as const, content: MAX_STEPS }] : [])], tools, model, toolChoice: format.type === "json_schema" ? "required" : undefined, From 52efa897091fcfa50dd56aa9d13930b7240a9e08 Mon Sep 17 00:00:00 2001 From: Sugam Devare Date: Thu, 4 Jun 2026 12:57:07 -0700 Subject: [PATCH 19/21] bench: include untracked files in captured git diff MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Plain `git -C diff` only reports changes to tracked files. Every new file the agent creates via the `write` tool stays untracked, so it was silently dropped from the captured patch and the rollout was recorded as `patch_exists=false` / reward 0 even though the agent ran to completion (`reason: "stop"`, bench `exit=0`). 
In a recent SWE-bench batch (swebench_results_1780596228320_bcd81ba7), 8 of the 21 zero-byte patches were exactly this case — 7/8 had `write` calls to brand-new paths and never staged. Example trajectory: denoland-deno-8408-agentic wrote a 6499-byte std/encoding/csv_stringify.ts from scratch, type-checked, ran a functional test, and stopped. git_patch came back "" because the file was untracked. Fix: mark untracked files as intent-to-add (`git add -AN`) so they appear in `git diff` without being committed. Worktree-style diff is preserved so the SWE-bench evaluator's `git apply` still works. Also pass `--binary` so any binary artifacts the agent produces aren't silently truncated by the textual diff path. The system-prompt addendum already tells the model not to commit or push, which is still correct after this fix — the harness now picks up unstaged new files on its own. Co-Authored-By: Claude Opus 4.7 (1M context) Signed-off-by: Sugam Devare --- packages/opencode/src/bench/cli.ts | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/packages/opencode/src/bench/cli.ts b/packages/opencode/src/bench/cli.ts index 8c9643b61b96..d0dc383dcdd4 100644 --- a/packages/opencode/src/bench/cli.ts +++ b/packages/opencode/src/bench/cli.ts @@ -300,15 +300,21 @@ function runOpencode(args: { async function captureGitDiff(workspaceRoot: string): Promise { const gitPath = detectBin(["/usr/bin/git", "/bin/git", "/usr/local/bin/git"]) ?? 
"git" - return new Promise((resolve) => { - const child = spawn(gitPath, ["-C", workspaceRoot, "diff"], { - env: { ...process.env, GIT_PAGER: "cat" }, + const runGit = (args: string[], capture: boolean): Promise<string> => + new Promise<string>((resolve) => { + const child = spawn(gitPath, ["-C", workspaceRoot, ...args], { + env: { ...process.env, GIT_PAGER: "cat" }, + }) + let stdout = "" + if (capture) child.stdout?.on("data", (b) => (stdout += b.toString("utf8"))) + child.on("close", () => resolve(stdout)) + child.on("error", () => resolve("")) }) - let stdout = "" - child.stdout?.on("data", (b) => (stdout += b.toString("utf8"))) - child.on("close", () => resolve(stdout)) - child.on("error", () => resolve("")) - }) + // Mark untracked files as intent-to-add so newly-created files appear in + // `git diff` without being committed. Plain `git diff` only shows changes + // to tracked files, which silently drops new-file patches the agent wrote. + await runGit(["add", "-AN"], false) + return runGit(["diff", "--binary"], true) } interface OutputJsonl { From a8f6fc98d6196c998d3a03c3d96a6877e4c6c61e Mon Sep 17 00:00:00 2001 From: Sugam Devare Date: Wed, 10 Jun 2026 13:56:38 -0700 Subject: [PATCH 20/21] bench: bootstrap a git repo when the SIF ships no .git MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some dataset SIFs (notably `swe-bench-ext`, and certain SWE-rebench variants) copy the repo into the workspace as a flat source tree with no `.git` directory. Without one: * runDeepReset's first `git rev-parse` fails, the outer `|| true` swallows the error, and deep_reset is a silent no-op. * captureGitDiff runs `git -C <workspaceRoot> diff` against a non-repo; git emits `fatal: not a git repository` to stderr and exits with empty stdout, so the function returns "". Net effect: every rollout for those SIFs lands as `patch=0 bytes` regardless of how many files the agent edited. 
Recent symptom — in swebench_results_1781123430109_686771d1 (swe-bench-ext, 37 instances), every patch was zero bytes despite trajectories with up to 30 successful `edit` calls on the workspace. `cd /workspace/repo && git status` in the same SIF returns `fatal: not a git repository`. Port of nv-OpenHands' run_infer.py:1142-1156 swe-bench-ext baseline: detect the missing `.git`, init a local repo, snapshot the pristine tree as `opencode_bench_baseline`, and skip deep_reset (the upstream base_commit SHA doesn't exist in the fresh repo, so deep_reset would just fall through to its nuclear pass and noisily no-op anyway). This composes with 52efa8970 (`git add -AN && git diff --binary`) — the prior commit handled `.git`-present-but-untracked-write-files; this commit handles `.git`-missing-entirely. Co-Authored-By: Claude Opus 4.7 (1M context) Signed-off-by: Sugam Devare --- packages/opencode/src/bench/bootstrap_repo.ts | 78 +++++++++++++++++++ packages/opencode/src/bench/cli.ts | 14 +++- 2 files changed, 91 insertions(+), 1 deletion(-) create mode 100644 packages/opencode/src/bench/bootstrap_repo.ts diff --git a/packages/opencode/src/bench/bootstrap_repo.ts b/packages/opencode/src/bench/bootstrap_repo.ts new file mode 100644 index 000000000000..82c9c61a2353 --- /dev/null +++ b/packages/opencode/src/bench/bootstrap_repo.ts @@ -0,0 +1,78 @@ +/** + * Bootstrap a git repository inside the workspace when the SIF ships a flat + * source tree without a `.git` directory. + * + * Some dataset SIFs (notably `swe-bench-ext`, and certain SWE-rebench variants) + * copy the repo contents into `/workspace/repo` (or the dataset-specific path) + * without preserving git history. Without `.git`, `runDeepReset` is a silent + * no-op (its `git rev-parse` fails under the outer `|| true`) and + * `captureGitDiff` returns "" — every rollout is recorded as `patch=0 bytes` + * regardless of what the agent did. Port of nv-OpenHands' + * `evaluation/benchmarks/swe_bench/run_infer.py:1142-1156`. 
+ * + * If `.git` already exists, this is a no-op. Otherwise a pristine baseline + * commit is created and tagged `opencode_bench_baseline`. Callers should skip + * `runDeepReset` when this returns `{ freshInit: true }` — the dataset's + * upstream `base_commit` SHA does not exist in the fresh repo, so deep_reset + * would just fail rev-parse and noisily fall through to its nuclear pass. + */ + +import { spawn } from "node:child_process" +import { existsSync } from "node:fs" +import path from "node:path" + +function detectShell(): string | null { + for (const p of ["/bin/bash", "/usr/bin/bash", "/bin/sh", "/usr/bin/sh"]) { + if (existsSync(p)) return p + } + return null +} + +function shellQuote(s: string): string { + return `'${s.replace(/'/g, `'\\''`)}'` +} + +function buildBootstrapCmd(workspaceRoot: string): string { + const q = shellQuote(workspaceRoot) + return ( + `cd ${q} && ` + + `echo "[bootstrap_repo] initializing git repo at ${workspaceRoot}" && ` + + `git config --global --add safe.directory ${q} && ` + + `git init -q && ` + + `git config user.email 'bench@opencode.local' && ` + + `git config user.name 'opencode bench' && ` + + `git add -A && ` + + `git commit -q --allow-empty -m 'opencode bench baseline' && ` + + `git tag -f opencode_bench_baseline HEAD && ` + + `echo "[bootstrap_repo] done; HEAD=$(git rev-parse --short HEAD)"` + ) +} + +export interface BootstrapResult { + freshInit: boolean +} + +export async function bootstrapRepoIfMissing(workspaceRoot: string): Promise<BootstrapResult> { + if (existsSync(path.join(workspaceRoot, ".git"))) { + return { freshInit: false } + } + const shell = detectShell() + if (!shell) { + console.warn(`[bench] bootstrap_repo skipped: no shell found at /bin/{bash,sh} or /usr/bin/{bash,sh}`) + return { freshInit: false } + } + const cmd = buildBootstrapCmd(workspaceRoot) + console.log(`[bench] bootstrap_repo workspace=${workspaceRoot} shell=${shell}`) + const exitCode = await new Promise<number>((resolve) => { + const child = spawn(shell, 
["-c", cmd], { + stdio: ["ignore", "inherit", "inherit"], + }) + child.on("close", (code) => resolve(code ?? 0)) + child.on("error", (err) => { + console.warn(`[bench] bootstrap_repo spawn error: ${err}`) + resolve(1) + }) + }) + console.log(`[bench] bootstrap_repo exit=${exitCode}`) + return { freshInit: exitCode === 0 } +} diff --git a/packages/opencode/src/bench/cli.ts b/packages/opencode/src/bench/cli.ts index d0dc383dcdd4..e4ba969d7dde 100644 --- a/packages/opencode/src/bench/cli.ts +++ b/packages/opencode/src/bench/cli.ts @@ -24,6 +24,7 @@ import path from "node:path" import os from "node:os" import { spawn } from "node:child_process" import { runDeepReset } from "./deep_reset" +import { bootstrapRepoIfMissing } from "./bootstrap_repo" // opencode's built-in anthropic system prompt — Bun bundles .txt as a string. // Used as the default when no --system-prompt override is passed. import PROMPT_ANTHROPIC from "../session/prompt/anthropic.txt" @@ -402,8 +403,19 @@ async function main() { OPENCODE_DISABLE_ENV_PROMPT: "1", } + // Bootstrap a git repo if the SIF shipped a flat source tree (swe-bench-ext + // and some SWE-rebench variants). Without this, captureGitDiff returns "" + // and every patch is recorded as 0 bytes. + const { freshInit } = await bootstrapRepoIfMissing(workspaceRoot) + // Prune git history past base_commit so the agent can't reach future commits. - await runDeepReset(workspaceRoot, String(instance.base_commit ?? "")) + // Skip when we just freshly initialized: the dataset's upstream base_commit + // SHA doesn't exist in our local repo, so deep_reset would just fail + // rev-parse and fall through to its nuclear pass. The fresh `HEAD` is + // already the correct baseline (also tagged `opencode_bench_baseline`). + if (!freshInit) { + await runDeepReset(workspaceRoot, String(instance.base_commit ?? 
"")) + } const opencodeBin = detectOpencodeBin() const result = await runOpencode({ From 0c088fd18b5ef6ff9b2a949ac2a41204b1ac8046 Mon Sep 17 00:00:00 2001 From: Sugam Devare Date: Wed, 10 Jun 2026 14:58:35 -0700 Subject: [PATCH 21/21] bench: exit 0 deterministically when bench wrote output.jsonl MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Falling off the end of main() with no explicit process.exit() left Bun to drain the event loop on its own, which produced a flaky exit=1 even on successful runs that wrote a valid output.jsonl with `error=none`. The residual handles seem to come from the spawned opencode subprocess (piped stdio that hasn't been explicitly closed) and from opencode's sqlite migration code — both stay registered long enough that Bun treats the runtime drain as a non-clean exit. Symptom in DeepSeek-V4-Flash_opencode_0-0-12711152.out (SWE-rebench-V2 train shard): three instances logged [bench] wrote .../bench_run/output.jsonl (patch=N bytes, error=none) with N=2001, 9046, ... — `error=none` means `result.exitCode === 0`, so opencode itself exited cleanly and the patch was captured. But apptainer exited 1, gym's runner raised `RuntimeError("Command failed with return code 1")`, and the rollout was discarded as a failure even though the bench succeeded. Fix: explicitly `process.exit(result.exitCode === 0 ? 0 : 1)` after the [bench] wrote log line. Deterministic, mirrors opencode's exit code, and doesn't depend on the event-loop drain heuristic. 
Co-Authored-By: Claude Opus 4.7 (1M context) Signed-off-by: Sugam Devare --- packages/opencode/src/bench/cli.ts | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/packages/opencode/src/bench/cli.ts b/packages/opencode/src/bench/cli.ts index e4ba969d7dde..707c98c12a2a 100644 --- a/packages/opencode/src/bench/cli.ts +++ b/packages/opencode/src/bench/cli.ts @@ -444,7 +444,13 @@ async function main() { console.log(`[bench] wrote ${outPath} (patch=${patch.length} bytes, error=${error ?? "none"})`) - if (result.exitCode !== 0) process.exit(1) + // Mirror opencode's exit code explicitly. Falling off the end of main() and + // letting Bun drain the event loop produced a flaky exit=1 even when the + // bench wrote output.jsonl cleanly (sqlite migration handles, residual + // child-stdio pipes from the opencode subprocess). Gym's runner treats any + // non-zero apptainer exit as `Agent command failed` and discards the + // already-written patch, so we MUST exit 0 deterministically on success. + process.exit(result.exitCode === 0 ? 0 : 1) } main().catch((err) => {