diff --git a/evaluation/benchmarks/swe_bench/scripts/run_infer.sh b/evaluation/benchmarks/swe_bench/scripts/run_infer.sh
new file mode 100755
index 000000000000..2edf91b721cc
--- /dev/null
+++ b/evaluation/benchmarks/swe_bench/scripts/run_infer.sh
@@ -0,0 +1,123 @@
+#!/usr/bin/env bash
+# Bench entry script — invoked by gym's OpenCodeHarnessProcessor.get_run_command().
+#
+# Args (positional, must match the order in app.py's get_run_command):
+#   $1  COMMIT_HASH         opencode commit (informational; checkout is done at setup)
+#   $2  AGENT               agent class name (informational)
+#   $3  MAX_ITER            max agent turns (positive integer)
+#   $4  DATASET             dataset name (informational; gym already dispatched)
+#   $5  SPLIT               dataset split (informational)
+#   $6  EVAL_OUTPUT_DIR     where to write trajectories (relative to opencode dir)
+#   $7  SELECTED_ID         instance_id to run
+#   $8  INSTANCE_DICT_PATH  /root/dataset/data.jsonl (single-line JSONL)
+#   $9  CONFIG_FILE         opencode model config JSON (written by gym)
+#   $10 WORKSPACE_ROOT      resolved repo path inside the SIF (gym side decided)
+#   $11 USER_MESSAGE_PATH   pre-rendered user prompt file (workspace baked in)
+#   $12 SYSTEM_PROMPT_PATH  optional system-prompt override
+#
+# Environment (set by gym):
+#   NEMO_GYM_MODEL_SERVER_NAME      proxy name on the gym head server
+#   NEMO_GYM_MODEL_SERVER_BASE_URL  base http://host:port for the model server
+#   NEMO_GYM_METRICS_FPATH          path to the metrics JSON to update
+#   NEMO_GYM_CONFIG_DICT            (informational) the gym YAML config blob
+#   COMMAND_EXEC_TIMEOUT            per-bash-command timeout in seconds
+#   DIVERSIFY_TOOL_NAMES            optional: rename tools for RL diversity
+#   CAMEL_CASE_TOOL_NAMES           optional: camelCase tool names
+#   ENABLE_SUBAGENTS                optional: "1" or "true" adds --enable-subagents
+
+set -eo pipefail
+
+COMMIT_HASH="${1:-}"
+AGENT="${2:-OpenCodeAgent}"
+MAX_ITER="${3:-100}"
+DATASET="${4:-}"
+SPLIT="${5:-test}"
+EVAL_OUTPUT_DIR="${6:-evaluation/oh}"
+SELECTED_ID="${7:-}"
+INSTANCE_DICT_PATH="${8:-/root/dataset/data.jsonl}"
+CONFIG_FILE="${9:-/tmp/oc_config.json}"
+WORKSPACE_ROOT="${10:-}"
+USER_MESSAGE_PATH="${11:-}"
+SYSTEM_PROMPT_PATH="${12:-}"
+
+# Diagnostics go to stderr so they never mix with the event stream on stdout.
+if [ -z "$SELECTED_ID" ]; then
+  echo "ERROR: SELECTED_ID (\$7) is required." >&2
+  exit 64
+fi
+if [ -z "$WORKSPACE_ROOT" ]; then
+  echo "ERROR: WORKSPACE_ROOT (\$10) is required — gym side resolves the dataset-aware repo path." >&2
+  exit 65
+fi
+if [ -z "$USER_MESSAGE_PATH" ]; then
+  echo "ERROR: USER_MESSAGE_PATH (\$11) is required — gym side renders the user prompt." >&2
+  exit 66
+fi
+if [ -z "${NEMO_GYM_MODEL_SERVER_NAME:-}" ]; then
+  echo "ERROR: NEMO_GYM_MODEL_SERVER_NAME not set in env." >&2
+  exit 67
+fi
+if [ -z "${NEMO_GYM_MODEL_SERVER_BASE_URL:-}" ]; then
+  echo "ERROR: NEMO_GYM_MODEL_SERVER_BASE_URL not set in env." >&2
+  exit 68
+fi
+# Fail here rather than handing a non-numeric turn limit to the bench CLI.
+if ! [[ "$MAX_ITER" =~ ^[1-9][0-9]*$ ]]; then
+  echo "ERROR: MAX_ITER (\$3) must be a positive integer (got '$MAX_ITER')." >&2
+  exit 71
+fi
+
+# Resolve the opencode root directory. The script lives at
+# evaluation/benchmarks/swe_bench/scripts/run_infer.sh — go up four levels.
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+OPENCODE_DIR="$(cd "$SCRIPT_DIR/../../../.." && pwd)"
+BENCH_CLI="$OPENCODE_DIR/packages/opencode/src/bench/cli.ts"
+
+if [ ! -f "$BENCH_CLI" ]; then
+  echo "ERROR: bench cli.ts not found at $BENCH_CLI" >&2
+  exit 69
+fi
+if ! command -v bun >/dev/null 2>&1; then
+  echo "ERROR: bun not on PATH (expected /opencode_setup/bun/bin/bun)" >&2
+  exit 70
+fi
+
+# Make EVAL_OUTPUT_DIR absolute (relative to opencode dir).
+case "$EVAL_OUTPUT_DIR" in
+  /*) ABS_OUTPUT_DIR="$EVAL_OUTPUT_DIR" ;;
+  *) ABS_OUTPUT_DIR="$OPENCODE_DIR/$EVAL_OUTPUT_DIR" ;;
+esac
+mkdir -p "$ABS_OUTPUT_DIR"
+
+echo "OPENCODE_DIR: $OPENCODE_DIR"
+echo "BENCH_CLI: $BENCH_CLI"
+echo "AGENT: $AGENT COMMIT: $COMMIT_HASH MAX_ITER: $MAX_ITER"
+echo "DATASET: $DATASET SPLIT: $SPLIT SELECTED_ID: $SELECTED_ID"
+echo "EVAL_OUTPUT_DIR: $ABS_OUTPUT_DIR"
+echo "INSTANCE_DICT_PATH: $INSTANCE_DICT_PATH"
+echo "CONFIG_FILE: $CONFIG_FILE"
+echo "WORKSPACE_ROOT: $WORKSPACE_ROOT"
+echo "USER_MESSAGE_PATH: $USER_MESSAGE_PATH"
+echo "SYSTEM_PROMPT_PATH: $SYSTEM_PROMPT_PATH"
+echo "MODEL_SERVER: $NEMO_GYM_MODEL_SERVER_NAME @ $NEMO_GYM_MODEL_SERVER_BASE_URL"
+
+cmd=(
+  bun "$BENCH_CLI"
+  --instance-dict-path "$INSTANCE_DICT_PATH"
+  --output-dir "$ABS_OUTPUT_DIR"
+  --config "$CONFIG_FILE"
+  --max-turns "$MAX_ITER"
+  --agent-cls "$AGENT"
+  --dataset "$DATASET"
+  --split "$SPLIT"
+  --selected-id "$SELECTED_ID"
+  --workspace-root "$WORKSPACE_ROOT"
+  --user-message-file "$USER_MESSAGE_PATH"
+)
+if [ -n "$SYSTEM_PROMPT_PATH" ]; then
+  cmd+=(--system-prompt "$SYSTEM_PROMPT_PATH")
+fi
+if [ "${ENABLE_SUBAGENTS:-0}" = "1" ] || [ "${ENABLE_SUBAGENTS:-}" = "true" ]; then
+  cmd+=(--enable-subagents)
+fi
+
+echo "Executing: ${cmd[*]}"
+exec "${cmd[@]}"
diff --git a/packages/opencode/src/bench/bootstrap_repo.ts b/packages/opencode/src/bench/bootstrap_repo.ts
new file mode 100644
index 000000000000..82c9c61a2353
--- /dev/null
+++ b/packages/opencode/src/bench/bootstrap_repo.ts
@@ -0,0 +1,78 @@
+/**
+ * Bootstrap a git repository inside the workspace when the SIF ships a flat
+ * source tree without a `.git` directory.
+ *
+ * Some dataset SIFs (notably `swe-bench-ext`, and certain SWE-rebench variants)
+ * copy the repo contents into `/workspace/repo` (or the dataset-specific path)
+ * without preserving git history.
+ * Without `.git`, `runDeepReset` is a silent no-op (its `git rev-parse` fails
+ * under the outer `|| true`) and `captureGitDiff` returns "" — every rollout is
+ * recorded as `patch=0 bytes` regardless of what the agent did. Port of
+ * nv-OpenHands' `evaluation/benchmarks/swe_bench/run_infer.py:1142-1156`.
+ *
+ * If `.git` already exists, this is a no-op. Otherwise a pristine baseline
+ * commit is created and tagged `opencode_bench_baseline`. Callers should skip
+ * `runDeepReset` when this returns `{ freshInit: true }` — the dataset's
+ * upstream `base_commit` SHA does not exist in the fresh repo, so deep_reset
+ * would just fail rev-parse and noisily fall through to its nuclear pass.
+ */
+
+import { spawn } from "node:child_process"
+import { existsSync } from "node:fs"
+import path from "node:path"
+
+/** Return the first shell that exists at a known absolute path, or null. */
+function detectShell(): string | null {
+  for (const p of ["/bin/bash", "/usr/bin/bash", "/bin/sh", "/usr/bin/sh"]) {
+    if (existsSync(p)) return p
+  }
+  return null
+}
+
+/** Wrap `s` in single quotes for a POSIX shell, escaping embedded single quotes. */
+function shellQuote(s: string): string {
+  return `'${s.replace(/'/g, `'\\''`)}'`
+}
+
+/**
+ * Build the shell script that turns `workspaceRoot` into a git repo with one
+ * baseline commit tagged `opencode_bench_baseline`.
+ */
+function buildBootstrapCmd(workspaceRoot: string): string {
+  const q = shellQuote(workspaceRoot)
+  return (
+    `cd ${q} && ` +
+    // The path is appended as a single-quoted word: inside the double quotes
+    // a `$(...)` or backtick in the path would be executed by the shell.
+    `echo "[bootstrap_repo] initializing git repo at "${q} && ` +
+    `git config --global --add safe.directory ${q} && ` +
+    `git init -q && ` +
+    `git config user.email 'bench@opencode.local' && ` +
+    `git config user.name 'opencode bench' && ` +
+    `git add -A && ` +
+    `git commit -q --allow-empty -m 'opencode bench baseline' && ` +
+    `git tag -f opencode_bench_baseline HEAD && ` +
+    `echo "[bootstrap_repo] done; HEAD=$(git rev-parse --short HEAD)"`
+  )
+}
+
+export interface BootstrapResult {
+  /** True only when a new repo and baseline commit were created by this call. */
+  freshInit: boolean
+}
+
+/**
+ * Create a baseline git repo in `workspaceRoot` if it has no `.git` entry.
+ *
+ * @returns `{ freshInit: true }` when the bootstrap script exited 0;
+ *   `{ freshInit: false }` when `.git` already exists, no shell was found, or
+ *   the script failed.
+ */
+export async function bootstrapRepoIfMissing(workspaceRoot: string): Promise<BootstrapResult> {
+  if (existsSync(path.join(workspaceRoot, ".git"))) {
+    return { freshInit: false }
+  }
+  const shell = detectShell()
+  if (!shell) {
+    console.warn(`[bench] bootstrap_repo skipped: no shell found at /bin/{bash,sh} or /usr/bin/{bash,sh}`)
+    return { freshInit: false }
+  }
+  const cmd = buildBootstrapCmd(workspaceRoot)
+  console.log(`[bench] bootstrap_repo workspace=${workspaceRoot} shell=${shell}`)
+  const exitCode = await new Promise<number>((resolve) => {
+    const child = spawn(shell, ["-c", cmd], {
+      stdio: ["ignore", "inherit", "inherit"],
+    })
+    // `code` is null when the shell was killed by a signal; that is a
+    // failure, not a successful bootstrap.
+    child.on("close", (code) => resolve(code ?? 1))
+    child.on("error", (err) => {
+      console.warn(`[bench] bootstrap_repo spawn error: ${err}`)
+      resolve(1)
+    })
+  })
+  console.log(`[bench] bootstrap_repo exit=${exitCode}`)
+  return { freshInit: exitCode === 0 }
+}
diff --git a/packages/opencode/src/bench/cli.ts b/packages/opencode/src/bench/cli.ts
new file mode 100644
index 000000000000..707c98c12a2a
--- /dev/null
+++ b/packages/opencode/src/bench/cli.ts
@@ -0,0 +1,459 @@
+/**
+ * SWE-bench bench CLI driver.
+ *
+ * Drives a single SWE-bench instance to completion using opencode's REAL
+ * agentic loop. We spawn `bun .../src/index.ts run` as a subprocess (with a
+ * per-instance opencode config that registers our `nemo-gym` provider, a
+ * SWE-bench agent, and disables compaction) and let it run to idle.
+ *
+ * Why subprocess instead of in-process Server.Default? Subprocess is the
+ * model the user-facing `opencode run` already uses (cli/cmd/run.ts:670–675
+ * also uses an in-process fetch but the public entry is `bun .../index.ts`).
+ * A subprocess gives us:
+ *   - clean process isolation per instance (matters for many parallel SIFs)
+ *   - identical bootstrapping path to `opencode run`, so we don't drift
+ *   - the JSON event stream on stdout for free (--format json)
+ *
+ * Trajectory capture: the nemo-gym provider (registered via this config)
+ * writes one `<completionsDir>/*.json` file per LLM call BEFORE returning. On
+ * exit we capture `git diff` and write `output.jsonl`.
+ */
+
+import { existsSync, promises as fs, readFileSync } from "node:fs"
+import path from "node:path"
+import os from "node:os"
+import { spawn } from "node:child_process"
+import { fileURLToPath } from "node:url"
+import { runDeepReset } from "./deep_reset"
+import { bootstrapRepoIfMissing } from "./bootstrap_repo"
+// opencode's built-in anthropic system prompt — Bun bundles .txt as a string.
+// Used as the default when no --system-prompt override is passed.
+import PROMPT_ANTHROPIC from "../session/prompt/anthropic.txt"
+
+/** Parsed command-line options for one bench run (see `parseArgs`). */
+interface CliArgs {
+  instanceDictPath: string
+  outputDir: string
+  config: string
+  maxTurns: number
+  agentCls: string
+  dataset: string
+  split: string
+  selectedId: string
+  /** Resolved repo path inside the SIF — gym side decided based on dataset_name. */
+  workspaceRoot: string
+  /** Pre-rendered user message file (workspace_path baked in by gym). */
+  userMessageFile: string
+  systemPromptPath?: string
+  /** Enable opencode's `task` tool (spawns subagent sessions). */
+  enableSubagents: boolean
+}
+
+/**
+ * Parse `--flag value` pairs from `argv` into a `CliArgs`.
+ *
+ * @throws Error on an unknown `--flag`, a `--max-turns` value that is not a
+ *   positive integer, or a missing required argument.
+ */
+function parseArgs(argv: string[]): CliArgs {
+  const out: Partial<CliArgs> = {
+    maxTurns: 100,
+    agentCls: "OpenCodeAgent",
+    dataset: "",
+    split: "test",
+    enableSubagents: false,
+  }
+  for (let i = 0; i < argv.length; i++) {
+    const a = argv[i]
+    const next = () => argv[++i]
+    switch (a) {
+      case "--instance-dict-path":
+        out.instanceDictPath = next()
+        break
+      case "--output-dir":
+        out.outputDir = next()
+        break
+      case "--config":
+        out.config = next()
+        break
+      case "--max-turns": {
+        // Reject garbage up front: a NaN here would be serialized as
+        // `"steps": null` in the generated opencode config.
+        const raw = next()
+        const turns = Number.parseInt(raw ?? "", 10)
+        if (!Number.isInteger(turns) || turns <= 0) {
+          throw new Error(`--max-turns must be a positive integer, got ${JSON.stringify(raw)}`)
+        }
+        out.maxTurns = turns
+        break
+      }
+      case "--agent-cls":
+        out.agentCls = next()
+        break
+      case "--dataset":
+        out.dataset = next()
+        break
+      case "--split":
+        out.split = next()
+        break
+      case "--selected-id":
+        out.selectedId = next()
+        break
+      case "--workspace-root":
+        out.workspaceRoot = next()
+        break
+      case "--user-message-file":
+        out.userMessageFile = next()
+        break
+      case "--system-prompt":
+        out.systemPromptPath = next()
+        break
+      case "--enable-subagents":
+        out.enableSubagents = true
+        break
+      default:
+        if (a.startsWith("--")) throw new Error(`Unknown flag: ${a}`)
+    }
+  }
+  for (const required of [
+    "instanceDictPath",
+    "outputDir",
+    "config",
+    "selectedId",
+    "workspaceRoot",
+    "userMessageFile",
+  ] as const) {
+    if (!out[required])
+      throw new Error(`Missing required arg --${required.replace(/[A-Z]/g, (c) => "-" + c.toLowerCase())}`)
+  }
+  return out as CliArgs
+}
+
+/** One dataset record as read from the instance JSONL file. */
+interface InstanceDict {
+  instance_id: string
+  problem_statement: string
+  repo?: string
+  repo_name?: string
+  workspace?: string
+  base_commit?: string
+  [key: string]: unknown
+}
+
+/**
+ * Load the record for `selectedId` from a JSONL file.
+ *
+ * When no record has a matching `instance_id` the first record is used; that
+ * fallback is logged rather than applied silently, because the run's output
+ * is then written under the fallback record's id.
+ *
+ * @throws Error if the file contains no records.
+ */
+async function readInstance(instanceDictPath: string, selectedId: string): Promise<InstanceDict> {
+  const text = await fs.readFile(instanceDictPath, "utf8")
+  const lines = text
+    .split("\n")
+    .map((l) => l.trim())
+    .filter(Boolean)
+  const records = lines.map((l) => JSON.parse(l) as InstanceDict)
+  const exact = records.find((r) => r.instance_id === selectedId)
+  const match = exact ?? records[0]
+  if (!match) throw new Error(`No instance found in ${instanceDictPath}`)
+  if (!exact) {
+    console.warn(`[bench] instance_id ${selectedId} not found in ${instanceDictPath}; using ${match.instance_id}`)
+  }
+  return match
+}
+
+/** Read and JSON-parse the gym-written model config file. */
+function loadGymConfig(configPath: string): Record<string, unknown> {
+  return JSON.parse(readFileSync(configPath, "utf8"))
+}
+
+// Default is opencode's built-in anthropic system prompt + a short SWE-bench
+// addendum (workspace is git-tracked, harness captures git diff as the patch,
+// don't commit/format the diff yourself, don't modify the test files).
+const SWE_BENCH_ADDENDUM = `
+
+# SWE-bench harness context
+
+You are running inside a SWE-bench evaluation harness on a checked-out git repository. The harness will capture the final \`git diff\` of the workspace as your patch — do not commit, push, or format the diff yourself. Do NOT modify the test files unless the task explicitly says so. Stop calling tools once you are confident the issue is fully resolved.
+`
+
+const DEFAULT_SYSTEM_PROMPT = PROMPT_ANTHROPIC + SWE_BENCH_ADDENDUM
+
+/**
+ * Create a per-instance temp directory containing an `opencode.jsonc` that
+ * registers the `nemo-gym` provider and the `swe-bench` agent.
+ *
+ * @returns the temp directory and the path of the written config file.
+ */
+async function buildConfigDir(args: {
+  instanceId: string
+  modelName: string
+  baseURL: string
+  completionsDir: string
+  maxTurns: number
+  systemPromptPath?: string
+  enableSubagents: boolean
+}): Promise<{ tmpRoot: string; configFile: string }> {
+  const tmpRoot = await fs.mkdtemp(path.join(os.tmpdir(), `bench-${args.instanceId}-`))
+  await fs.mkdir(tmpRoot, { recursive: true })
+
+  const systemPrompt = args.systemPromptPath
+    ? await fs.readFile(args.systemPromptPath, "utf8")
+    : DEFAULT_SYSTEM_PROMPT
+
+  const cfg: Record<string, unknown> = {
+    $schema: "https://opencode.ai/config.json",
+    provider: {
+      "nemo-gym": {
+        npm: "@opencode-ai/nemo-gym",
+        options: {
+          baseURL: args.baseURL,
+          completionsDir: args.completionsDir,
+          instanceId: args.instanceId,
+        },
+        models: {
+          [args.modelName]: {
+            id: args.modelName,
+            name: args.modelName,
+            limit: { context: 131072, output: 32768 },
+            tool_call: true,
+            temperature: true,
+          },
+        },
+      },
+    },
+    agent: {
+      "swe-bench": {
+        mode: "primary",
+        model: `nemo-gym/${args.modelName}`,
+        prompt: systemPrompt,
+        // Allow the read+write tool set; disable web/skill/task to keep the
+        // agent focused on local code editing.
+        permission: {
+          // Glob-keyed `PermissionActionConfig` for file/shell access.
+          edit: { "**": "allow" },
+          bash: { "*": "allow" },
+          // webfetch / websearch use a different schema (single action, not
+          // a glob map) and we already disable them in `tools` below — no
+          // need for an explicit entry here.
+        },
+        tools: {
+          bash: true,
+          edit: true,
+          read: true,
+          glob: true,
+          grep: true,
+          write: true,
+          apply_patch: true,
+          webfetch: false,
+          websearch: false,
+          task: args.enableSubagents,
+          skill: false,
+          todowrite: true,
+        },
+        steps: args.maxTurns,
+        options: {},
+      },
+    },
+    compaction: { auto: false },
+    share: "manual",
+  }
+
+  const configFile = path.join(tmpRoot, "opencode.jsonc")
+  await fs.writeFile(configFile, JSON.stringify(cfg, null, 2))
+
+  return { tmpRoot, configFile }
+}
+
+// Some SIFs ship with bare PATH lookups that ENOENT on bare program names
+// through Bun's posix_spawn. Resolve to an absolute path up front for any
+// binary we shell out to.
+function detectBin(candidates: string[]): string | null {
+  for (const p of candidates) {
+    if (existsSync(p)) return p
+  }
+  return null
+}
+
+/**
+ * Run `opencode run` as a subprocess and wait for it to close.
+ *
+ * stdout/stderr are forwarded to this process and also returned. `exitCode`
+ * is 999 if the process could not be spawned and 1 if it was killed by a
+ * signal.
+ */
+function runOpencode(args: {
+  workspaceRoot: string
+  modelName: string
+  message: string
+  env: NodeJS.ProcessEnv
+  opencodeBin: string
+  agent: string
+}): Promise<{ exitCode: number; stdout: string; stderr: string }> {
+  // Use the same bun binary that's currently running — guaranteed to exist
+  // and avoids PATH lookup quirks under Bun's posix_spawn.
+  const bunPath = process.execPath
+  return new Promise((resolve) => {
+    // Don't set spawn's `cwd` — Bun's posix_spawn on some minimal apptainer
+    // images ENOENTs whenever cwd is set (libc lacks addchdir_np). Opencode's
+    // own `--dir <path>` flag changes the working directory
+    // internally, so we don't need spawn-level cwd.
+    const child = spawn(
+      bunPath,
+      [
+        args.opencodeBin,
+        "run",
+        args.message,
+        "--agent",
+        args.agent,
+        "--model",
+        `nemo-gym/${args.modelName}`,
+        "--format",
+        "json",
+        "--dangerously-skip-permissions",
+        "--dir",
+        args.workspaceRoot,
+      ],
+      {
+        env: args.env,
+        stdio: ["ignore", "pipe", "pipe"],
+      },
+    )
+    let stdout = ""
+    let stderr = ""
+    child.stdout?.on("data", (b) => {
+      const chunk = b.toString("utf8")
+      stdout += chunk
+      // Forward to our stdout so the gym log captures the event stream.
+      process.stdout.write(chunk)
+    })
+    child.stderr?.on("data", (b) => {
+      const chunk = b.toString("utf8")
+      stderr += chunk
+      process.stderr.write(chunk)
+    })
+    // `code` is null when the child was terminated by a signal; report that
+    // as a failure instead of letting it read as a clean exit 0.
+    child.on("close", (code) => resolve({ exitCode: code ?? 1, stdout, stderr }))
+    child.on("error", (err) => {
+      stderr += String(err)
+      resolve({ exitCode: 999, stdout, stderr })
+    })
+  })
+}
+
+/**
+ * Return `git diff --binary` of the workspace, including untracked files.
+ *
+ * Best-effort: resolves to "" if git cannot be spawned.
+ */
+async function captureGitDiff(workspaceRoot: string): Promise<string> {
+  const gitPath = detectBin(["/usr/bin/git", "/bin/git", "/usr/local/bin/git"]) ?? "git"
+  const runGit = (args: string[], capture: boolean): Promise<string> =>
+    new Promise((resolve) => {
+      const child = spawn(gitPath, ["-C", workspaceRoot, ...args], {
+        env: { ...process.env, GIT_PAGER: "cat" },
+        // Never leave a pipe unread: a child that fills an unconsumed pipe
+        // blocks. stderr goes to our log; stdout is piped only when wanted.
+        stdio: ["ignore", capture ? "pipe" : "ignore", "inherit"],
+      })
+      // Decode once at the end so a multi-byte UTF-8 sequence split across
+      // two chunks is not corrupted in the patch.
+      const chunks: Buffer[] = []
+      child.stdout?.on("data", (b: Buffer) => chunks.push(b))
+      child.on("close", () => resolve(Buffer.concat(chunks).toString("utf8")))
+      child.on("error", () => resolve(""))
+    })
+  // Mark untracked files as intent-to-add so newly-created files appear in
+  // `git diff` without being committed. Plain `git diff` only shows changes
+  // to tracked files, which silently drops new-file patches the agent wrote.
+  await runGit(["add", "-AN"], false)
+  return runGit(["diff", "--binary"], true)
+}
+
+/** Shape of the single record written to `output.jsonl`. */
+interface OutputJsonl {
+  instance_id: string
+  test_result: { git_patch: string }
+  metadata: { llm_config: { model: string } }
+  metrics: Record<string, unknown>
+  error: string | null
+}
+
+/**
+ * Write `payload` to `<evalOutputDir>/<instanceId>/bench_run/output.jsonl`
+ * via a temp file and rename, and return the final path.
+ */
+async function writeOutputJsonl(evalOutputDir: string, instanceId: string, payload: OutputJsonl): Promise<string> {
+  const runDir = path.join(evalOutputDir, instanceId, "bench_run")
+  await fs.mkdir(runDir, { recursive: true })
+  const outPath = path.join(runDir, "output.jsonl")
+  const tmp = `${outPath}.tmp`
+  await fs.writeFile(tmp, JSON.stringify(payload) + "\n")
+  await fs.rename(tmp, outPath)
+  return outPath
+}
+
+function completionsDirFor(evalOutputDir: string, instanceId: string): string {
+  // Match openhands' on-host glob, which ends in
+  // `llm_completions/<instance_id>/*.json`.
+  return path.join(evalOutputDir, instanceId, "bench_run", "llm_completions", instanceId)
+}
+
+function detectOpencodeBin(): string {
+  // Prefer the pre-bundled artifact at <opencode root>/.bench-build/opencode.js.
+  // Running un-bundled `src/index.ts` triggers cascading runtime resolution
+  // failures (TUI JSX runtime not honored, @anthropic-ai/sdk relative .mjs
+  // paths failing across the isolated install layout). The bundle inlines
+  // every transitive dep and is opencode's intended deployment shape.
+  // Falls back to src/index.ts only for dev / when setup_scripts/opencode.sh
+  // hasn't run.
+  // fileURLToPath decodes percent-escapes (e.g. spaces) that `URL.pathname` keeps.
+  const here = path.dirname(fileURLToPath(import.meta.url))
+  // bench/cli.ts → packages/opencode/src/bench → packages/opencode/src → packages/opencode → packages → <opencode root>
+  const opencodeRoot = path.resolve(here, "..", "..", "..", "..")
+  const bundled = path.resolve(opencodeRoot, ".bench-build", "opencode.js")
+  if (existsSync(bundled)) return bundled
+  return path.resolve(here, "..", "index.ts")
+}
+
+/**
+ * Run one instance end to end: read inputs, write the per-instance config,
+ * prepare the git workspace, run opencode, capture the diff, write
+ * `output.jsonl`, then exit 0 on success or 1 on a non-zero opencode exit.
+ */
+async function main() {
+  const args = parseArgs(process.argv.slice(2))
+  const instance = await readInstance(args.instanceDictPath, args.selectedId)
+  // workspaceRoot is decided gym-side based on dataset_name; we use it verbatim.
+  const workspaceRoot = args.workspaceRoot
+  const gymConfig = loadGymConfig(args.config)
+  const llmModelCfg = ((gymConfig as Record<string, Record<string, unknown>>).llm?.model ?? {}) as Record<
+    string,
+    unknown
+  >
+  const modelName = String(llmModelCfg.model ?? "unknown-model")
+  const baseURL = process.env.NEMO_GYM_MODEL_SERVER_BASE_URL
+  if (!baseURL) throw new Error("NEMO_GYM_MODEL_SERVER_BASE_URL not set in env (gym harness sets this).")
+
+  const completionsDir = completionsDirFor(args.outputDir, instance.instance_id)
+  await fs.mkdir(completionsDir, { recursive: true })
+
+  // The user message is fully rendered by gym (workspace_path baked in based
+  // on dataset_name); we just read it as-is and pass it to opencode.
+  const userPrompt = await fs.readFile(args.userMessageFile, "utf8")
+
+  const { tmpRoot, configFile } = await buildConfigDir({
+    instanceId: instance.instance_id,
+    modelName,
+    baseURL,
+    completionsDir,
+    maxTurns: args.maxTurns,
+    systemPromptPath: args.systemPromptPath,
+    enableSubagents: args.enableSubagents,
+  })
+
+  const startedAt = Date.now()
+  const childEnv: NodeJS.ProcessEnv = {
+    ...process.env,
+    // Run-isolated opencode state.
+    OPENCODE_DB: ":memory:",
+    OPENCODE_DATA: path.join(tmpRoot, "data"),
+    OPENCODE_CONFIG: configFile,
+    // Disable opencode's built-in plugin loaders; the bench harness doesn't need them.
+    OPENCODE_PURE: "1",
+    // Skip the dynamic env block (working dir + Today's date) in the system
+    // prompt — keeps the RL prompt-token prefix invariant stable across turns
+    // (a midnight rollover would otherwise shift `Today's date: ...`).
+    OPENCODE_DISABLE_ENV_PROMPT: "1",
+  }
+
+  // Bootstrap a git repo if the SIF shipped a flat source tree (swe-bench-ext
+  // and some SWE-rebench variants). Without this, captureGitDiff returns ""
+  // and every patch is recorded as 0 bytes.
+  const { freshInit } = await bootstrapRepoIfMissing(workspaceRoot)
+
+  // Prune git history past base_commit so the agent can't reach future commits.
+  // Skip when we just freshly initialized: the dataset's upstream base_commit
+  // SHA doesn't exist in our local repo, so deep_reset would just fail
+  // rev-parse and fall through to its nuclear pass. The fresh `HEAD` is
+  // already the correct baseline (also tagged `opencode_bench_baseline`).
+  if (!freshInit) {
+    await runDeepReset(workspaceRoot, String(instance.base_commit ?? ""))
+  }
+
+  const opencodeBin = detectOpencodeBin()
+  const result = await runOpencode({
+    workspaceRoot,
+    modelName,
+    message: userPrompt,
+    env: childEnv,
+    opencodeBin,
+    agent: "swe-bench",
+  })
+
+  const patch = await captureGitDiff(workspaceRoot)
+  const benchRunTime = (Date.now() - startedAt) / 1000
+
+  const error: string | null = result.exitCode === 0 ? null : `opencode_exit_${result.exitCode}`
+  const outPath = await writeOutputJsonl(args.outputDir, instance.instance_id, {
+    instance_id: instance.instance_id,
+    test_result: { git_patch: patch },
+    metadata: { llm_config: { model: modelName } },
+    metrics: {
+      bench_run_time: benchRunTime,
+      opencode_exit_code: result.exitCode,
+    },
+    error,
+  })
+
+  console.log(`[bench] wrote ${outPath} (patch=${patch.length} bytes, error=${error ?? "none"})`)
+
+  // Mirror opencode's exit code explicitly. Falling off the end of main() and
+  // letting Bun drain the event loop produced a flaky exit=1 even when the
+  // bench wrote output.jsonl cleanly (sqlite migration handles, residual
+  // child-stdio pipes from the opencode subprocess). Gym's runner treats any
+  // non-zero apptainer exit as `Agent command failed` and discards the
+  // already-written patch, so we MUST exit 0 deterministically on success.
+  process.exit(result.exitCode === 0 ? 0 : 1)
+}
+
+main().catch((err) => {
+  console.error(`[bench] fatal: ${err?.stack ?? err}`)
+  process.exit(2)
+})
diff --git a/packages/opencode/src/bench/deep_reset.ts b/packages/opencode/src/bench/deep_reset.ts
new file mode 100644
index 000000000000..543815f454a5
--- /dev/null
+++ b/packages/opencode/src/bench/deep_reset.ts
@@ -0,0 +1,131 @@
+/**
+ * Strip git history past base_commit so the agent can't reach future commits.
+ *
+ * Port of nv-OpenHands' `_deep_reset_to_base_commit`
+ * (evaluation/benchmarks/swe_bench/run_infer.py:774). Two-pass design:
+ *
+ *   - Careful pass: per-ref iteration with `git for-each-ref`. Preserves
+ *     local branches that don't descend from base, resets branches that do,
+ *     deletes tags/remote-tracking/stash/notes refs past base.
+ *   - Nuclear fallback: batch-delete every tag/remote/stash/notes ref + every
+ *     local branch in two `git update-ref --stdin` calls. Microseconds
+ *     regardless of ref count — handles monorepos with thousands of refs
+ *     where the careful pass times out.
+ *
+ * `|| true` at the very end so a busted git state can't kill the agent run.
+ */
+
+import { spawn } from "node:child_process"
+import { existsSync } from "node:fs"
+
+// Some SIFs are minimal and ship without `bash` on PATH, or Bun's posix_spawn
+// doesn't fall back to PATH lookup the way `execvp` does — either way,
+// spawn("bash", ...) ENOENTs. Probe absolute paths up front; the deep-reset
+// script uses only POSIX features, so /bin/sh is a safe fallback if bash is
+// absent.
+function detectShell(): string | null {
+  for (const p of ["/bin/bash", "/usr/bin/bash", "/bin/sh", "/usr/bin/sh"]) {
+    if (existsSync(p)) return p
+  }
+  return null
+}
+
+/**
+ * Shell script for the per-ref pass: reset local branches that descend from
+ * the base commit, delete other refs past it, drop remotes and transient
+ * refs, then expire the reflog and gc.
+ */
+function carefulPass(baseCommit: string): string {
+  // Quote the revision: it comes from the dataset record and is spliced into
+  // a `sh -c` script, so an unquoted value could inject shell syntax.
+  const rev = shellQuote(baseCommit + "^{commit}")
+  return (
+    `echo "[deep_reset:careful] start" && ` +
+    `BASE=$(git rev-parse --verify ${rev}) && ` +
+    `ORIG_BRANCH=$(git symbolic-ref --short -q HEAD || echo main) && ` +
+    `echo "[deep_reset:careful] base=$BASE orig_branch=$ORIG_BRANCH" && ` +
+    `git checkout --detach "$BASE" && ` +
+    `echo "[deep_reset:careful] resetting local branches descending from base..." && ` +
+    `git for-each-ref --format="%(refname)" refs/heads | while read -r ref; do ` +
+    ` tip=$(git rev-parse -q --verify "$ref^{commit}" 2>/dev/null || true); ` +
+    ` [ -z "$tip" ] && continue; ` +
+    ` if [ "$tip" != "$BASE" ] && git merge-base --is-ancestor "$BASE" "$tip"; then ` +
+    ` echo "[deep_reset:careful] reset $ref -> $BASE"; ` +
+    ` git update-ref "$ref" "$BASE"; ` +
+    ` fi; ` +
+    `done && ` +
+    `echo "[deep_reset:careful] deleting tags/remotes/stash/notes past base..." && ` +
+    `git for-each-ref --format="%(refname)" refs | while read -r ref; do ` +
+    ` case "$ref" in refs/heads/*) continue ;; esac; ` +
+    ` if git symbolic-ref -q "$ref" >/dev/null 2>&1; then continue; fi; ` +
+    ` tip=$(git rev-parse -q --verify "$ref^{commit}" 2>/dev/null || true); ` +
+    ` [ -z "$tip" ] && continue; ` +
+    ` if [ "$tip" != "$BASE" ] && git merge-base --is-ancestor "$BASE" "$tip"; then ` +
+    ` echo "[deep_reset:careful] delete $ref"; ` +
+    ` git update-ref -d "$ref"; ` +
+    ` fi; ` +
+    `done && ` +
+    `echo "[deep_reset:careful] removing remotes + transient refs..." && ` +
+    `for r in $(git remote); do echo "[deep_reset:careful] rm remote $r"; git remote remove "$r"; done; ` +
+    `gd=$(git rev-parse --git-dir) && ` +
+    `rm -f "$gd"/FETCH_HEAD "$gd"/ORIG_HEAD "$gd"/MERGE_HEAD "$gd"/CHERRY_PICK_HEAD ` +
+    `"$gd"/REVERT_HEAD "$gd"/BISECT_HEAD "$gd"/AUTO_MERGE && ` +
+    `echo "[deep_reset:careful] expiring reflog + gc..." && ` +
+    `git reflog expire --expire=now --expire-unreachable=now --all && ` +
+    `git repack -ad && git prune --expire=now && git gc --prune=now && ` +
+    `git checkout -B "$ORIG_BRANCH" "$BASE" && ` +
+    `echo "[deep_reset:careful] done; HEAD=$ORIG_BRANCH at $BASE"`
+  )
+}
+
+/**
+ * Shell script for the fallback pass: batch-delete all tag/remote/stash/notes
+ * refs and all local branches, then recreate the original branch at base.
+ */
+function nuclearPass(baseCommit: string): string {
+  // Same quoting as the careful pass; see the note there.
+  const rev = shellQuote(baseCommit + "^{commit}")
+  return (
+    `echo "[deep_reset:nuclear] careful pass failed; running batch-delete fallback" && ` +
+    `BASE=$(git rev-parse --verify ${rev}) && ` +
+    `ORIG_BRANCH=$(git symbolic-ref --short -q HEAD || echo main) && ` +
+    `echo "[deep_reset:nuclear] base=$BASE orig_branch=$ORIG_BRANCH" && ` +
+    `git checkout --detach "$BASE" && ` +
+    `for r in $(git remote); do echo "[deep_reset:nuclear] rm remote $r"; git remote remove "$r"; done; ` +
+    `echo "[deep_reset:nuclear] batch-delete tags/remotes/stash/notes..." && ` +
+    `git for-each-ref --format="delete %(refname)" refs/tags refs/remotes refs/stash refs/notes 2>/dev/null ` +
+    `| git update-ref --stdin; ` +
+    `echo "[deep_reset:nuclear] batch-delete local branches..." && ` +
+    `git for-each-ref --format="delete %(refname)" refs/heads | git update-ref --stdin; ` +
+    `gd=$(git rev-parse --git-dir) && ` +
+    `rm -f "$gd"/FETCH_HEAD "$gd"/ORIG_HEAD "$gd"/MERGE_HEAD "$gd"/CHERRY_PICK_HEAD ` +
+    `"$gd"/REVERT_HEAD "$gd"/BISECT_HEAD "$gd"/AUTO_MERGE && ` +
+    `echo "[deep_reset:nuclear] expiring reflog + gc..." && ` +
+    `git reflog expire --expire=now --expire-unreachable=now --all && ` +
+    `git repack -ad && git prune --expire=now && git gc --prune=now && ` +
+    `git checkout -B "$ORIG_BRANCH" "$BASE" && ` +
+    `echo "[deep_reset:nuclear] done; HEAD=$ORIG_BRANCH at $BASE"`
+  )
+}
+
+/** Combine both passes: careful first, nuclear on failure, never a non-zero exit. */
+export function buildDeepResetCmd(baseCommit: string): string {
+  return `( ${carefulPass(baseCommit)} ) || ( ${nuclearPass(baseCommit)} ) || true`
+}
+
+/** Wrap `s` in single quotes for a POSIX shell, escaping embedded single quotes. */
+function shellQuote(s: string): string {
+  return `'${s.replace(/'/g, `'\\''`)}'`
+}
+
+/**
+ * Run the deep-reset script in `workspaceRoot`. Best-effort: returns without
+ * doing anything when `baseCommit` is empty or no shell is found, and never
+ * rejects on a failed script or spawn error.
+ */
+export async function runDeepReset(workspaceRoot: string, baseCommit: string): Promise<void> {
+  if (!baseCommit) return
+  const shell = detectShell()
+  if (!shell) {
+    console.warn(`[bench] deep_reset skipped: no shell found at /bin/{bash,sh} or /usr/bin/{bash,sh}`)
+    return
+  }
+  // Bake `cd <workspaceRoot>` into the shell script instead of passing the `cwd`
+  // option to spawn(). On some minimal apptainer images Bun's posix_spawn
+  // ENOENTs whenever a `cwd` is set (libc lacks addchdir_np extension); routing
+  // the chdir through the shell sidesteps that entirely.
+  const cmd = `cd ${shellQuote(workspaceRoot)} && ` + buildDeepResetCmd(baseCommit)
+  console.log(`[bench] deep_reset workspace=${workspaceRoot} base=${baseCommit} shell=${shell}`)
+  await new Promise<void>((resolve) => {
+    const child = spawn(shell, ["-c", cmd], {
+      stdio: ["ignore", "inherit", "inherit"],
+    })
+    child.on("close", (code, signal) => {
+      // `code` is null when the shell was killed by a signal; log the signal
+      // name instead of a misleading 0.
+      console.log(`[bench] deep_reset exit=${code ?? signal}`)
+      resolve()
+    })
+    child.on("error", (err) => {
+      console.warn(`[bench] deep_reset spawn error: ${err}`)
+      resolve()
+    })
+  })
+}
diff --git a/packages/opencode/src/index.ts b/packages/opencode/src/index.ts
index 4c8e447041c0..444e9730d29c 100644
--- a/packages/opencode/src/index.ts
+++ b/packages/opencode/src/index.ts
@@ -22,8 +22,13 @@ import { McpCommand } from "./cli/cmd/mcp"
 import { GithubCommand } from "./cli/cmd/github"
 import { ExportCommand } from "./cli/cmd/export"
 import { ImportCommand } from "./cli/cmd/import"
-import { AttachCommand } from "./cli/cmd/tui/attach"
-import { TuiThreadCommand } from "./cli/cmd/tui/thread"
+// TUI subcommands (Attach, TuiThread) are dropped from this build of opencode.
+// The bench harness (`packages/opencode/src/bench/cli.ts`) only invokes the
+// `run` command via subprocess, never the TUI; loading them eagerly here drags
+// in `cli/cmd/tui/app.tsx` at startup, which JSX-compiles against
+// `@opentui/solid` and trips Bun's runtime JSX resolver into looking for
+// `react/jsx-dev-runtime` (a bug we hit when running the un-bundled .ts).
+// Removed entirely rather than lazy-loaded — bench has no use for them.
 import { AcpCommand } from "./cli/cmd/acp"
 import { EOL } from "os"
 import { WebCommand } from "./cli/cmd/web"
@@ -156,8 +161,6 @@ const cli = yargs(args)
   .completion("completion", "generate shell completion script")
   .command(AcpCommand)
   .command(McpCommand)
-  .command(TuiThreadCommand)
-  .command(AttachCommand)
   .command(RunCommand)
   .command(GenerateCommand)
   .command(DebugCommand)
diff --git a/packages/opencode/src/provider/models-snapshot.d.ts b/packages/opencode/src/provider/models-snapshot.d.ts
new file mode 100644
index 000000000000..508ab6ee22fe
--- /dev/null
+++ b/packages/opencode/src/provider/models-snapshot.d.ts
@@ -0,0 +1,3 @@
+// Empty stub committed for the bench harness build path. See models-snapshot.js
+// for the rationale.
+export declare const snapshot: Record<string, unknown>
diff --git a/packages/opencode/src/provider/models-snapshot.js b/packages/opencode/src/provider/models-snapshot.js
new file mode 100644
index 000000000000..c48d54a8f72a
--- /dev/null
+++ b/packages/opencode/src/provider/models-snapshot.js
@@ -0,0 +1,18 @@
+// @ts-nocheck
+// Empty stub committed for the bench harness build path.
+//
+// Upstream opencode generates this file at build time via `script/generate.ts`
+// (which fetches https://models.dev/api.json). For the nemo-gym bench harness
+// we only register a single custom provider in the per-instance opencode
+// config, so the snapshot is unused — but `bun build` still has to resolve
+// `import("./models-snapshot.js")` from `provider/models.ts:137` at static
+// analysis time. An empty snapshot satisfies that requirement; the runtime
+// `try:` lambda in models.ts handles an empty snapshot gracefully.
+//
+// `.gitignore` excludes this file because upstream regenerates it. We
+// force-add it on the bench branch (sdd/dev) so `bun build --target=bun
+// packages/opencode/src/index.ts ...` succeeds without running generate.ts
+// (which requires network access to models.dev). If you ever DO want real
+// model metadata, run `bun run script/generate.ts` and don't commit the
+// regenerated file.
+export const snapshot = {} diff --git a/packages/opencode/src/provider/provider.ts b/packages/opencode/src/provider/provider.ts index 4013dcee36e7..dbcb319ff284 100644 --- a/packages/opencode/src/provider/provider.ts +++ b/packages/opencode/src/provider/provider.ts @@ -114,6 +114,11 @@ const BUNDLED_PROVIDERS: Record Promise<(opts: any) => BundledSDK> "gitlab-ai-provider": () => import("gitlab-ai-provider").then((m) => m.createGitLab), "@ai-sdk/github-copilot": () => import("./sdk/copilot/copilot-provider").then((m) => m.createOpenaiCompatible), "venice-ai-sdk-provider": () => import("venice-ai-sdk-provider").then((m) => m.createVenice), + // NeMo-Gym custom provider used by the SWE-bench RL rollout harness. + // Routes chat completions through the gym's vllm model server while + // threading prompt/generation token IDs into providerMetadata. The + // bench cli (`bench/cli.ts`) configures this provider per-instance. + "@opencode-ai/nemo-gym": () => import("./sdk/nemo-gym/index").then((m) => m.createNemoGym), } type CustomModelLoader = (sdk: any, modelID: string, options?: Record) => Promise diff --git a/packages/opencode/src/provider/sdk/nemo-gym/index.ts b/packages/opencode/src/provider/sdk/nemo-gym/index.ts new file mode 100644 index 000000000000..0d7001b8dc28 --- /dev/null +++ b/packages/opencode/src/provider/sdk/nemo-gym/index.ts @@ -0,0 +1,62 @@ +/** + * NeMo-Gym opencode provider entry. + * + * Provider id: `nemo-gym`. Used by the bench harness for SWE-bench RL rollouts. + * Registered in `provider/provider.ts:BUNDLED_PROVIDERS`. + * + * The factory mirrors `@ai-sdk/openai-compatible`'s shape: `createNemoGym(opts)` + * returns a provider with `.languageModel(modelId)` so opencode's existing + * provider plumbing (Provider.Service.getModel) works without special-casing. 
import { NemoGymLanguageModel, type NemoGymLanguageModelConfig } from "./language-model"

/** Options accepted by {@link createNemoGym}. Every field is forwarded to the language model config. */
export interface CreateNemoGymOptions {
  /** Base URL of the gym model server (`http://host:port`). */
  baseURL: string
  /** Optional name of the model server (informational; useful for logs). */
  modelServerName?: string
  /** Custom request headers. */
  headers?: () => Record<string, string>
  /**
   * Directory that receives the per-call llm_completions JSON files.
   * Set per-instance by the bench harness; if absent, no trajectory dump.
   */
  completionsDir?: string
  /** instance_id associated with the trajectory dump. */
  instanceId?: string
  /** Per-call HTTP timeout in ms. */
  requestTimeoutMs?: number
  /** HTTP retry count on transient errors. */
  retries?: number
  /** Optional turn counter shared across all model calls in a session. */
  turnCounter?: { next(): number }
  /** Optional callback invoked after each successful chat-completion. */
  onCompletion?: NemoGymLanguageModelConfig["onCompletion"]
}

/** Minimal provider surface: a factory for language models keyed by model id. */
export interface NemoGymProvider {
  languageModel: (modelId: string) => NemoGymLanguageModel
}

/**
 * Builds the NeMo-Gym provider.
 *
 * @param opts Provider options; `baseURL` is mandatory.
 * @returns A provider whose `languageModel(modelId)` creates a
 *   `NemoGymLanguageModel` configured from `opts` with provider id `"nemo-gym"`.
 * @throws Error when `baseURL` is missing, not a string, or only whitespace.
 */
export function createNemoGym(opts: CreateNemoGymOptions): NemoGymProvider {
  // Options may originate from an untyped config blob, so validate at runtime
  // rather than trusting the static type. A blank string would otherwise only
  // fail later, when the first request URL is built.
  if (typeof opts?.baseURL !== "string" || opts.baseURL.trim() === "") {
    throw new Error("createNemoGym: baseURL is required (e.g. http://host:port)")
  }
  return {
    languageModel(modelId: string) {
      // `opts` is read at call time, so every model sees the current values.
      return new NemoGymLanguageModel(modelId, {
        provider: "nemo-gym",
        baseURL: opts.baseURL,
        modelServerName: opts.modelServerName,
        headers: opts.headers,
        completionsDir: opts.completionsDir,
        instanceId: opts.instanceId,
        requestTimeoutMs: opts.requestTimeoutMs,
        retries: opts.retries,
        turnCounter: opts.turnCounter,
        onCompletion: opts.onCompletion,
      })
    },
  }
}
/**
 * NeMo-Gym LanguageModelV3 implementation.
 *
 * Each call POSTs a non-streaming request to the gym model server's
 * `/v1/chat/completions`, captures the token-ID fields (`prompt_token_ids`,
 * `generation_token_ids`, `generation_log_probs`) from the response message,
 * and - for `doStream` - replays the finished response as a synthetic stream.
 *
 * When `completionsDir` and `instanceId` are configured, every completed call
 * is written to a JSON file in `completionsDir` BEFORE the `finish` part is
 * emitted, so a later tool crash cannot lose that turn's token IDs.
 */

import {
  type LanguageModelV3,
  type LanguageModelV3CallOptions,
  type LanguageModelV3StreamPart,
  type LanguageModelV3Content,
  type SharedV3ProviderMetadata,
  type SharedV3Warning,
} from "@ai-sdk/provider"
import { promises as fs } from "node:fs"
import path from "node:path"
import { convertToOpenAICompatibleChatMessages } from "../copilot/chat/convert-to-openai-compatible-chat-messages"
import { prepareTools } from "../copilot/chat/openai-compatible-prepare-tools"

// ---------------------------------------------------------------------------
// Wire types
// ---------------------------------------------------------------------------

/** One message of the outgoing chat request (OpenAI-compatible plus token-ID extras). */
interface ChatRequestMessage {
  role: "system" | "user" | "assistant" | "tool"
  content?: string | Array<Record<string, unknown>> | null
  tool_calls?: Array<{
    id: string
    type: "function"
    function: { name: string; arguments: string }
  }>
  tool_call_id?: string
  name?: string
  prompt_token_ids?: number[]
  generation_token_ids?: number[]
  generation_log_probs?: number[]
  [key: string]: unknown
}

/** One entry of `choices` in the chat response. */
interface ChatResponseChoice {
  index?: number
  finish_reason?: string | null
  message: {
    role: string
    content?: string | null
    reasoning_text?: string | null
    tool_calls?: Array<{
      id?: string
      type?: string
      function: { name: string; arguments: string }
    }>
    prompt_token_ids?: number[]
    generation_token_ids?: number[]
    generation_log_probs?: number[]
    [key: string]: unknown
  }
}

/** Token accounting block of the chat response. */
interface ChatResponseUsage {
  prompt_tokens?: number | null
  completion_tokens?: number | null
  total_tokens?: number | null
}

/** Body returned by `/v1/chat/completions`. Parsed from JSON without validation. */
interface ChatResponse {
  id?: string
  model?: string
  created?: number
  choices: ChatResponseChoice[]
  usage?: ChatResponseUsage
}

// ---------------------------------------------------------------------------
// Config
// ---------------------------------------------------------------------------

/** Message fields that carry token IDs / log-probs. */
const TOKEN_ID_FIELDS = ["prompt_token_ids", "generation_token_ids", "generation_log_probs"] as const

/** Header values; null/undefined entries are skipped when the request is built. */
type HeaderValues = Record<string, string | undefined>

/** Headers may be supplied as a factory or as a plain object; both are handled at runtime. */
type HeaderSource = HeaderValues | (() => HeaderValues)

/** Session identity derived from the per-call headers. */
interface SessionRef {
  sessionID: string
  parentSessionID: string | undefined
}

export interface NemoGymLanguageModelConfig {
  /** Provider id used to namespace providerMetadata. */
  provider: string
  /** Full base URL of the model server (e.g. `http://gym-host:18086`). */
  baseURL: string
  /** Optional gym head-server-style model server name; informational only. */
  modelServerName?: string
  /** Custom request headers (auth, etc), as a factory or a plain object. */
  headers?: HeaderSource
  /** Per-call HTTP timeout in ms. Defaults to 600000. */
  requestTimeoutMs?: number
  /** Number of HTTP attempts per call. Defaults to 3; values below 1 are treated as 1. */
  retries?: number
  /**
   * Where per-call llm_completions JSONs land. If unset, no trajectory dump
   * happens (useful for dev/test).
   */
  completionsDir?: string
  /** instance_id of the run. The dump is skipped unless this is set alongside completionsDir. */
  instanceId?: string
  /**
   * Optional external turn counter.
   * NOTE(review): accepted but not consulted by this class - turns are counted
   * per session internally. Confirm whether the harness relies on it.
   */
  turnCounter?: { next(): number }
  /** Optional callback fired after each successful chat completion. */
  onCompletion?: (info: {
    turn: number
    messages: ChatRequestMessage[]
    response: ChatResponse
    providerSpecificFields: Record<string, unknown>
    requestParams: Record<string, unknown>
  }) => void | Promise<void>
}

// ---------------------------------------------------------------------------
// Implementation
// ---------------------------------------------------------------------------

export class NemoGymLanguageModel implements LanguageModelV3 {
  readonly specificationVersion = "v3"
  readonly modelId: string
  readonly provider: string

  private readonly cfg: NemoGymLanguageModelConfig
  /** Cookies captured from `Set-Cookie` responses and replayed on later requests. */
  private cookies: Record<string, string> = {}
  /**
   * Turn counter per session id, so dumps from different sessions (keyed by the
   * `x-session-affinity` header) get independent turn numbers.
   */
  private readonly turnCounters: Map<string, number> = new Map()

  /**
   * @param modelId Model name sent as `model` (omitted when empty or `"default"`).
   * @param cfg Provider configuration; timeout and retry defaults are filled in here.
   */
  constructor(modelId: string, cfg: NemoGymLanguageModelConfig) {
    this.modelId = modelId
    this.provider = cfg.provider
    this.cfg = {
      ...cfg,
      requestTimeoutMs: cfg.requestTimeoutMs ?? 600_000,
      retries: cfg.retries ?? 3,
    }
  }

  /** Returns the next zero-based turn number for `sessionID` and records it. */
  private _nextTurn(sessionID: string): number {
    const n = (this.turnCounters.get(sessionID) ?? -1) + 1
    this.turnCounters.set(sessionID, n)
    return n
  }

  /**
   * Reads the session id (`x-session-affinity`) and parent session id
   * (`x-parent-session-id`) from the per-call headers. A missing or empty
   * session id falls back to `"main"`.
   */
  private _sessionFromHeaders(headers: unknown): SessionRef {
    let sid = ""
    let pid: string | undefined
    if (headers && typeof headers === "object") {
      const h = headers as Record<string, unknown>
      const v = h["x-session-affinity"] ?? h["X-Session-Affinity"]
      if (typeof v === "string") sid = v
      const p = h["x-parent-session-id"] ?? h["X-Parent-Session-Id"]
      if (typeof p === "string") pid = p
    }
    return { sessionID: sid || "main", parentSessionID: pid }
  }

  get supportedUrls() {
    return {} as Record<string, RegExp[]>
  }

  /**
   * Non-streaming generation: one chat-completion request, mapped to content
   * parts (text, reasoning, tool calls) plus usage and provider metadata.
   *
   * @throws Error when the request fails on every attempt or the response has no choices.
   */
  async doGenerate(options: LanguageModelV3CallOptions) {
    const { warnings, messages, requestParams } = await this._buildRequestParams(options)
    const session = this._sessionFromHeaders(options.headers)
    const { responseJson } = await this._postChat(requestParams)
    const { choice, message } = this._firstChoice(responseJson)

    const content: LanguageModelV3Content[] = []
    if (message.content) content.push({ type: "text", text: message.content })
    if (message.reasoning_text) content.push({ type: "reasoning", text: message.reasoning_text })
    for (const tc of message.tool_calls ?? []) {
      content.push({
        type: "tool-call",
        toolCallId: this._toolCallId(tc.id),
        toolName: tc.function.name,
        input: tc.function.arguments,
      })
    }

    const providerSpecificFields = this._extractProviderFields(message)
    const providerMetadata = this._buildProviderMetadata(providerSpecificFields)

    await this._dumpAndNotify({
      messages,
      response: responseJson,
      providerSpecificFields,
      requestParams,
      session,
    })

    return {
      content,
      finishReason: this._mapFinishReason(choice.finish_reason ?? null),
      usage: this._mapUsage(responseJson.usage),
      providerMetadata,
      request: { body: JSON.stringify(requestParams) },
      response: { body: responseJson },
      warnings,
    }
  }

  /**
   * Streaming facade over the non-streaming endpoint. The HTTP request is
   * issued when the stream starts; the complete response is then replayed as
   * `response-metadata`, reasoning, text, tool-call and `finish` parts. Any
   * failure is reported in-band as an `error` part followed by an error
   * `finish`, never as a rejected promise.
   */
  async doStream(options: LanguageModelV3CallOptions) {
    const { warnings, messages, requestParams } = await this._buildRequestParams(options)
    const session = this._sessionFromHeaders(options.headers)

    const stream = new ReadableStream<LanguageModelV3StreamPart>({
      // Arrow function keeps `this` bound to the model instance.
      start: async (controller) => {
        controller.enqueue({ type: "stream-start", warnings })

        try {
          const { responseJson } = await this._postChat(requestParams)
          const { choice, message } = this._firstChoice(responseJson)

          const providerSpecificFields = this._extractProviderFields(message)
          const providerMetadata = this._buildProviderMetadata(providerSpecificFields)

          // Emit response-metadata first.
          controller.enqueue({
            type: "response-metadata",
            id: responseJson.id,
            modelId: responseJson.model,
            // `created` is in seconds; Date expects milliseconds.
            timestamp: responseJson.created ? new Date(responseJson.created * 1000) : undefined,
          })

          // Reasoning content.
          if (message.reasoning_text) {
            controller.enqueue({ type: "reasoning-start", id: "reasoning-0" })
            controller.enqueue({ type: "reasoning-delta", id: "reasoning-0", delta: message.reasoning_text })
            controller.enqueue({ type: "reasoning-end", id: "reasoning-0" })
          }

          // Text content.
          if (message.content) {
            controller.enqueue({ type: "text-start", id: "txt-0" })
            controller.enqueue({ type: "text-delta", id: "txt-0", delta: message.content })
            controller.enqueue({ type: "text-end", id: "txt-0" })
          }

          // Tool calls: full arguments arrive at once, so each call is one delta.
          for (const tc of message.tool_calls ?? []) {
            const tcId = this._toolCallId(tc.id)
            controller.enqueue({ type: "tool-input-start", id: tcId, toolName: tc.function.name })
            controller.enqueue({ type: "tool-input-delta", id: tcId, delta: tc.function.arguments })
            controller.enqueue({ type: "tool-input-end", id: tcId })
            controller.enqueue({
              type: "tool-call",
              toolCallId: tcId,
              toolName: tc.function.name,
              input: tc.function.arguments,
            })
          }

          // Persist trajectory BEFORE finishing so a downstream tool crash
          // cannot lose this turn's token IDs.
          await this._dumpAndNotify({
            messages,
            response: responseJson,
            providerSpecificFields,
            requestParams,
            session,
          })

          controller.enqueue({
            type: "finish",
            finishReason: this._mapFinishReason(choice.finish_reason ?? null),
            usage: this._mapUsage(responseJson.usage),
            providerMetadata,
          })

          controller.close()
        } catch (err) {
          controller.enqueue({ type: "error", error: err instanceof Error ? err.message : String(err) })
          controller.enqueue({
            type: "finish",
            finishReason: { unified: "error", raw: undefined },
            usage: this._mapUsage(undefined),
            providerMetadata: {},
          })
          controller.close()
        }
      },
    })

    return {
      stream,
      request: { body: JSON.stringify(requestParams) },
      response: {},
    }
  }

  // -----------------------------------------------------------------------
  // Helpers
  // -----------------------------------------------------------------------

  /**
   * Returns the first choice and its message.
   *
   * The response is unvalidated JSON, so `choices` may be absent at runtime
   * despite its declared type; that case reports the same error as an empty array.
   *
   * @throws Error when there is no first choice.
   */
  private _firstChoice(response: ChatResponse): {
    choice: ChatResponseChoice
    message: ChatResponseChoice["message"]
  } {
    const choice = response.choices?.[0]
    if (!choice) throw new Error("nemo-gym: empty choices in response")
    const message: ChatResponseChoice["message"] = choice.message ?? { role: "assistant" }
    return { choice, message }
  }

  /** Uses the server-provided tool-call id, or a random `call_…` id when none was sent. */
  private _toolCallId(id: string | undefined): string {
    return id ?? `call_${Math.random().toString(36).slice(2, 10)}`
  }

  /**
   * Removes token-ID fields from every message older than the most recent
   * message that carries all of them. That newest carrier keeps its IDs.
   * Mutates `messages` in place.
   */
  private _stripStaleTokenIds(messages: ChatRequestMessage[]): void {
    let newestSeen = false
    for (let i = messages.length - 1; i >= 0; i--) {
      const m = messages[i]
      if (!m) continue
      if (newestSeen) {
        for (const f of TOKEN_ID_FIELDS) delete m[f]
      } else if (TOKEN_ID_FIELDS.every((f) => f in m)) {
        newestSeen = true
      }
    }
  }

  /**
   * Converts the call options into the JSON body for `/v1/chat/completions`.
   *
   * @returns The converted messages, prepared tools/tool choice, any tool
   *   warnings, and `requestParams` with undefined-valued keys removed.
   */
  private async _buildRequestParams(options: LanguageModelV3CallOptions): Promise<{
    warnings: SharedV3Warning[]
    messages: ChatRequestMessage[]
    tools: unknown
    toolChoice: unknown
    requestParams: Record<string, unknown>
  }> {
    const warnings: SharedV3Warning[] = []
    // Reuse opencode's existing OpenAI-compatible message converter so all
    // tool-call / multi-content shapes map identically to the rest of opencode.
    const messages = convertToOpenAICompatibleChatMessages(options.prompt) as unknown as ChatRequestMessage[]

    const { tools, toolChoice, toolWarnings } = prepareTools({
      tools: options.tools,
      toolChoice: options.toolChoice,
    })
    warnings.push(...toolWarnings)

    // Wire-payload dedup: only the newest token-ID carrier keeps its IDs.
    this._stripStaleTokenIds(messages)

    const requestParams: Record<string, unknown> = {
      messages,
      max_tokens: options.maxOutputTokens,
      temperature: options.temperature,
      top_p: options.topP,
      stop: options.stopSequences,
      seed: options.seed,
    }
    // Send `model` only for a real model id. An empty id or the `"default"`
    // sentinel is omitted so the server can fill in its configured model.
    if (this.modelId && this.modelId !== "default") {
      requestParams.model = this.modelId
    }
    if (Array.isArray(tools) && tools.length > 0) requestParams.tools = tools
    if (toolChoice) requestParams.tool_choice = toolChoice

    // Drop keys whose value is undefined.
    for (const k of Object.keys(requestParams)) {
      if (requestParams[k] === undefined) delete requestParams[k]
    }

    return { warnings, messages, tools, toolChoice, requestParams }
  }

  /** Builds the request headers: JSON defaults, configured headers, then stored cookies. */
  private _requestHeaders(): Record<string, string> {
    const headers: Record<string, string> = {
      "Content-Type": "application/json",
      Accept: "application/json",
    }
    // `headers` may be a factory or a plain object; calling a non-function would throw.
    const source = this.cfg.headers
    const extra = typeof source === "function" ? source() : source
    if (extra && typeof extra === "object") {
      for (const [k, v] of Object.entries(extra)) if (v != null) headers[k] = v
    }
    if (Object.keys(this.cookies).length) {
      headers.Cookie = Object.entries(this.cookies)
        .map(([k, v]) => `${k}=${v}`)
        .join("; ")
    }
    return headers
  }

  /**
   * Stores the name/value pair of each cookie in a combined `Set-Cookie` value.
   * Attributes after the first `;` are ignored.
   */
  private _rememberCookies(setCookie: string | null): void {
    if (!setCookie) return
    // Split only on commas that start a new `name=` pair, so commas inside an
    // `Expires=Wed, 21 Oct ...` attribute do not break a cookie apart.
    for (const part of setCookie.split(/,(?=[^;]+=)/)) {
      const pair = part.split(";", 1)[0] ?? ""
      // Split on the FIRST `=` only: cookie values may themselves contain `=`
      // (base64 padding, signed values) and must be replayed unmodified.
      const eq = pair.indexOf("=")
      if (eq <= 0) continue
      const name = pair.slice(0, eq).trim()
      const value = pair.slice(eq + 1).trim()
      if (name && value) this.cookies[name] = value
    }
  }

  /**
   * Performs a single HTTP attempt. The timeout stays armed until the body has
   * been read, so a stalled response body is aborted as well.
   *
   * @throws Error on a non-2xx status; fetch/abort/JSON errors propagate as-is.
   */
  private async _attemptChat(url: string, headers: Record<string, string>, body: string): Promise<ChatResponse> {
    const ac = new AbortController()
    const timer = setTimeout(() => ac.abort(), this.cfg.requestTimeoutMs)
    try {
      const res = await fetch(url, { method: "POST", headers, body, signal: ac.signal })
      if (!res.ok) {
        const text = await res.text().catch(() => "")
        throw new Error(`NeMoGym ${url} ${res.status}: ${text.slice(0, 500)}`)
      }
      this._rememberCookies(res.headers.get("set-cookie"))
      return (await res.json()) as ChatResponse
    } finally {
      clearTimeout(timer)
    }
  }

  /**
   * POSTs `params` to `/v1/chat/completions`, retrying every failure with
   * exponential backoff (1s, 2s, 4s, ...).
   *
   * @throws Error once all attempts have failed; the message carries the last error.
   */
  private async _postChat(params: Record<string, unknown>): Promise<{ responseJson: ChatResponse }> {
    const url = this._urlFor("/v1/chat/completions")
    const headers = this._requestHeaders()
    const body = JSON.stringify(params)

    // Always try at least once: a configured 0 / negative / NaN must not
    // silently skip the request.
    const configured = this.cfg.retries ?? 3
    const attempts = Number.isFinite(configured) ? Math.max(1, Math.floor(configured)) : 3

    let lastErr: unknown = null
    for (let attempt = 0; attempt < attempts; attempt++) {
      try {
        return { responseJson: await this._attemptChat(url, headers, body) }
      } catch (err) {
        lastErr = err
        if (attempt < attempts - 1) {
          await new Promise((r) => setTimeout(r, 1000 * 2 ** attempt))
        }
      }
    }
    throw new Error(`NeMoGym chat completions failed after ${attempts} attempts: ${String(lastErr)}`)
  }

  /** Joins `p` onto the base URL, keeping any path prefix the base URL has. */
  private _urlFor(p: string): string {
    const base = this.cfg.baseURL.endsWith("/") ? this.cfg.baseURL : `${this.cfg.baseURL}/`
    return new URL(p.replace(/^\//, ""), base).toString()
  }

  /**
   * Copies the token-ID fields present on the response message. Returns an
   * empty object unless `prompt_token_ids` is an array.
   */
  private _extractProviderFields(msg: ChatResponseChoice["message"]): Record<string, unknown> {
    const out: Record<string, unknown> = {}
    if (Array.isArray(msg.prompt_token_ids)) {
      for (const f of TOKEN_ID_FIELDS) {
        const v = msg[f]
        if (v !== undefined) out[f] = v
      }
    }
    return out
  }

  /** Wraps the provider-specific fields under this model's provider id. */
  private _buildProviderMetadata(providerSpecific: Record<string, unknown>): SharedV3ProviderMetadata {
    const md: SharedV3ProviderMetadata = { [this.provider]: {} }
    const bucket = md[this.provider] as Record<string, unknown>
    for (const [k, v] of Object.entries(providerSpecific)) {
      bucket[k] = v
    }
    return md
  }

  /** Maps the wire `finish_reason` to the unified finish reason; missing or unknown values become `"other"`. */
  private _mapFinishReason(raw: string | null): {
    unified: "stop" | "length" | "tool-calls" | "error" | "other"
    raw: string | undefined
  } {
    if (!raw) return { unified: "other", raw: undefined }
    switch (raw) {
      case "stop":
        return { unified: "stop", raw }
      case "length":
        return { unified: "length", raw }
      case "tool_calls":
      case "function_call":
        return { unified: "tool-calls", raw }
      default:
        return { unified: "other", raw }
    }
  }

  /** Maps wire usage to the usage shape returned to the caller; absent or null counts become `undefined`. */
  private _mapUsage(raw?: ChatResponseUsage) {
    return {
      inputTokens: {
        total: raw?.prompt_tokens ?? undefined,
        noCache: raw?.prompt_tokens ?? undefined,
        cacheRead: undefined,
        cacheWrite: undefined,
      },
      outputTokens: {
        total: raw?.completion_tokens ?? undefined,
        text: raw?.completion_tokens ?? undefined,
        reasoning: undefined,
      },
    }
  }

  /**
   * Advances the session's turn counter, fires `onCompletion`, then writes the
   * trajectory JSON when both `completionsDir` and `instanceId` are set.
   * Best-effort by design: hook and filesystem errors are logged, not rethrown,
   * so a failed dump never fails the model call.
   */
  private async _dumpAndNotify(args: {
    messages: ChatRequestMessage[]
    response: ChatResponse
    providerSpecificFields: Record<string, unknown>
    requestParams: Record<string, unknown>
    session: SessionRef
  }) {
    const turn = this._nextTurn(args.session.sessionID)
    if (this.cfg.onCompletion) {
      try {
        await this.cfg.onCompletion({ turn, ...args })
      } catch (err) {
        console.warn(`[nemo-gym] onCompletion hook threw: ${String(err)}`)
      }
    }

    // NOTE(review): instanceId only gates the dump; it is not part of the file name.
    if (!this.cfg.completionsDir || !this.cfg.instanceId) return

    try {
      await fs.mkdir(this.cfg.completionsDir, { recursive: true })
      const turnStr = String(turn).padStart(4, "0")
      const safeModel = this.modelId.replace(/\//g, "__")
      // sessionID is part of the filename so dumps from different sessions do
      // not clobber each other. Sanitized for filesystem safety.
      const safeSession = args.session.sessionID.replace(/[^A-Za-z0-9_-]/g, "_")
      const fname = `${safeModel}-${safeSession}-${turnStr}-${Date.now()}.json`
      const fpath = path.join(this.cfg.completionsDir, fname)
      // Everything except `messages` is recorded as the call's kwargs.
      const kwargs: Record<string, unknown> = {}
      for (const [k, v] of Object.entries(args.requestParams)) {
        if (k !== "messages") kwargs[k] = v
      }
      const payload = {
        messages: args.messages,
        response: args.response,
        provider_specific_fields: args.providerSpecificFields,
        kwargs,
        session_id: args.session.sessionID,
        parent_session_id: args.session.parentSessionID ?? null,
        turn,
        timestamp: Date.now() / 1000,
      }
      // Write to a temp file, then rename, so readers never see a partial JSON.
      const tmp = `${fpath}.tmp`
      await fs.writeFile(tmp, JSON.stringify(payload))
      await fs.rename(tmp, fpath)
    } catch (err) {
      console.warn(`[nemo-gym] failed to dump completion: ${String(err)}`)
    }
  }
}