sdevare-nv · sdevare-nv · Mar 3, 2026 · Feb 20, 2026 · Feb 20, 2026 · Feb 20, 2026
diff --git a/evaluation/benchmarks/swe_bench/run_infer.py b/evaluation/benchmarks/swe_bench/run_infer.py
@@ -32,6 +32,7 @@
     codeact_user_response,
     codex_user_response,
     opencode_user_response,
+    terminus_2_user_response,
     get_default_sandbox_config_for_eval,
     get_metrics,
     get_openhands_config_for_eval,
@@ -108,6 +109,7 @@ def set_dataset_type(dataset_name: str) -> str:
     'CodeActAgent': codeact_user_response,
     'OpenCodeAgent': opencode_user_response,
     'CodexAgent': codex_user_response,
+    'Terminus2Agent': terminus_2_user_response,
 }
 
 

diff --git a/evaluation/utils/shared.py b/evaluation/utils/shared.py
@@ -249,6 +249,54 @@ def codex_user_response(
     return msg
 
 
+def terminus_2_user_response(
+    state: State,
+    encapsulate_solution: bool = False,
+    try_parse: Callable[[Action], str] | None = None,
+) -> str:
+    encaps_str = (
+        (
+            'Your final answer MUST be encapsulated within <solution> and </solution>.\n'
+            'For example: The answer to the question is <solution> 42 </solution>.\n'
+        )
+        if encapsulate_solution
+        else ''
+    )
+    msg = (
+        'Please continue working on the task. '
+        'Analyze the terminal output and issue the next batch of commands.\n'
+        'When the task is fully complete, set "task_complete": true in your JSON response.\n'
+        f'{encaps_str}'
+        'IMPORTANT: YOU SHOULD NEVER ASK FOR HUMAN HELP.\n'
+    )
+
+    if state.history:
+        if try_parse is not None:
+            last_action = next(
+                (
+                    event
+                    for event in reversed(state.history)
+                    if isinstance(event, Action)
+                ),
+                None,
+            )
+            ans = try_parse(last_action)
+            if ans is not None:
+                return '/exit'
+
+        user_msgs = [
+            event
+            for event in state.history
+            if isinstance(event, MessageAction) and event.source == 'user'
+        ]
+        if len(user_msgs) >= 2:
+            return (
+                msg
+                + 'If you want to give up, set "task_complete": true in your JSON response.\n'
+            )
+    return msg
+
+
 def cleanup():
     print('Cleaning up child processes...')
     for process in mp.active_children():

diff --git a/openhands/agenthub/__init__.py b/openhands/agenthub/__init__.py
@@ -11,6 +11,7 @@
     loc_agent,
     opencode_agent,
     readonly_agent,
+    terminus_2_agent,
     visualbrowsing_agent,
 )
 from openhands.controller.agent import Agent  # noqa: E402
@@ -25,4 +26,5 @@
     'loc_agent',
     'opencode_agent',
     'codex_agent',
+    'terminus_2_agent',
 ]
diff --git a/openhands/agenthub/terminus_2_agent/README.md b/openhands/agenthub/terminus_2_agent/README.md
@@ -0,0 +1,248 @@
+# Terminus-2 Agent
+
+The Terminus-2 Agent is a keystroke-based terminal agent ported from the [terminal-bench](https://github.com/laude-institute/terminal-bench) project. Unlike function-calling agents (CodeAct, OpenCode, Codex), it communicates with the LLM using structured JSON responses and interacts with the terminal by sending raw keystrokes and receiving screen capture output.
+
+## Overview
+
+Terminus-2 is designed around a fundamentally different interaction model than other OpenHands agents:
+
+- **No function calling** -- The LLM outputs a raw JSON object instead of tool calls. A dedicated parser extracts structured commands from the response.
+- **Keystroke-based terminal interaction** -- Instead of running shell commands and collecting stdout/stderr, the agent sends raw keystrokes (including tmux-style special keys like `C-c`, `C-d`) and receives the full terminal screen state back.
+- **Batch command execution** -- Each LLM response can contain multiple commands that are executed sequentially before the next LLM call.
+
+## Architecture
+
+```
+                +-----------+
+                |   LLM     |
+                +-----+-----+
+                      |
+          JSON response (text)
+                      |
+                +-----v-----+
+                |  JSON      |
+                |  Parser    |
+                +-----+-----+
+                      |
+          List[ParsedCommand]
+                      |
+        +-------------v--------------+
+        |     Terminus2Agent         |
+        |  (step loop, confirmation) |
+        +-------------+--------------+
+                      |
+       Terminus2CmdRunAction (per command)
+                      |
+        +-------------v--------------+
+        |  ActionExecutionServer     |
+        |  (keystroke execution)     |
+        +-------------+--------------+
+                      |
+       Terminus2CmdOutputObservation
+                      |
+              (terminal screen state)
+```
+
+### Agent Step Cycle
+
+Each call to `step()`:
+
+1. If there are pending actions queued from a previous LLM call, return the next one.
+2. Otherwise, build a conversation message list from the event history.
+3. Call the LLM (with up to 3 retries on parse errors).
+4. Parse the JSON response to extract commands.
+5. Queue a `Terminus2CmdRunAction` for each command.
+6. Return the first action from the queue.
+
+### JSON Response Format
+
+The LLM is expected to respond with a JSON object:
+
+```json
+{
+  "analysis": "What I observe in the terminal output and what has been done so far.",
+  "plan": "My plan for the next steps and what each command will accomplish.",
+  "commands": [
+    {
+      "keystrokes": "ls -la\n",
+      "duration": 0.1
+    },
+    {
+      "keystrokes": "cd project\n",
+      "duration": 0.1
+    }
+  ],
+  "task_complete": false
+}
+```
+
+| Field | Required | Type | Description |
+|-------|----------|------|-------------|
+| `analysis` | Yes | string | Analysis of the current terminal state |
+| `plan` | Yes | string | Plan for the next steps |
+| `commands` | Yes | array | Array of command objects to execute |
+| `task_complete` | No | boolean | Whether the task is finished (default: `false`) |
+
+Each command object:
+
+| Field | Required | Type | Default | Description |
+|-------|----------|------|---------|-------------|
+| `keystrokes` | Yes | string | -- | Exact keystrokes to send to the terminal |
+| `duration` | No | float | 1.0 | Seconds to wait before capturing output (capped at 60) |
+
+### Keystrokes
+
+The `keystrokes` field is sent **verbatim** to the terminal:
+
+- Commands should end with `\n` to execute (e.g., `"ls -la\n"`)
+- Special key sequences use tmux-style escapes:
+  - `C-c` -- Ctrl+C (send SIGINT)
+  - `C-d` -- Ctrl+D (send EOF)
+- Empty keystrokes (`""`) with a duration can be used to poll/wait for output
+- Multiple commands in the same batch are executed sequentially
+
+### Duration Guidelines
+
+| Command type | Recommended duration |
+|-------------|---------------------|
+| Immediate (`cd`, `ls`, `echo`, `cat`) | 0.1s |
+| Standard (`gcc`, `find`, `rustc`) | 1.0s |
+| Slow (`make`, `pip install`, `wget`) | 5.0-30.0s |
+| Polling (wait for output) | 10.0s |
+
+It is better to set a shorter duration and poll again than to set a long one. The maximum allowed duration is 60 seconds.
+
+## Key Features
+
+### JSON Parser with Auto-Correction
+
+The `TerminusJSONPlainParser` handles common LLM formatting mistakes:
+
+- **Incomplete JSON** -- Adds missing closing braces when the response is truncated
+- **Mixed content** -- Extracts JSON from responses that contain extra text before/after
+- **Markdown code fences** -- Handles JSON wrapped in `` ```json ``` `` blocks
+- **Field validation** -- Checks required fields, types, and correct field order
+- **Warnings** -- Non-fatal issues (missing duration, unknown fields, wrong order) are reported as warnings rather than errors
+
+### Double Confirmation for Task Completion
+
+To prevent premature task completion:
+
+1. First `"task_complete": true` -- Triggers a confirmation prompt: *"Are you sure you want to mark the task as complete?"*
+2. Second consecutive `"task_complete": true` -- Actually completes the task via `AgentFinishAction`
+3. If the LLM does *not* set `task_complete` after a confirmation prompt, the pending completion is reset.
+
+### Output Truncation
+
+Terminal output is truncated to 10KB to prevent context window overflow. When truncation occurs, the first and last 5KB are preserved with a marker indicating how many bytes were omitted from the middle.
+
+### Timeout Handling
+
+When a command exceeds its duration, the agent sends a timeout message to the LLM explaining that the command may still be running and showing the current terminal state. The LLM can then decide to wait longer (empty keystrokes with a duration), cancel the command (`C-c`), or proceed.
+
+## File Structure
+
+```
+openhands/agenthub/terminus_2_agent/
+    __init__.py                      # Agent registration
+    terminus_2_agent.py              # Main agent class
+    terminus_json_plain_parser.py    # JSON response parser
+    README.md                        # This file
+    prompts/
+        system_prompt.j2             # System prompt with JSON format spec
+        system_prompt_long_horizon.j2 # Extended prompt for long tasks
+        additional_info.j2           # Repository/runtime info template
+        microagent_info.j2           # Microagent trigger info template
+        user_prompt.j2               # Initial user message template
+```
+
+### Supporting files in other directories
+
+```
+openhands/events/action/terminus_2.py          # Terminus2CmdRunAction
+openhands/events/observation/terminus_2.py     # Terminus2CmdOutputObservation
+openhands/core/schema/action.py                # TERMINUS_2_CMD_RUN enum
+openhands/core/schema/observation.py           # TERMINUS_2_CMD_OUTPUT enum
+tests/unit/agenthub/test_terminus_2_parser.py  # Parser tests (36)
+tests/unit/agenthub/test_terminus_2_agent.py   # Agent tests (22)
+tests/unit/agenthub/test_terminus_2_action_observation.py  # Serialization tests (27)
+```
+
+## Usage
+
+### Quick Start
+
+To use the Terminus-2 agent in code:
+
+```python
+from openhands.agenthub.terminus_2_agent import Terminus2Agent
+from openhands.core.config import AgentConfig
+from openhands.llm.llm_registry import LLMRegistry
+
+config = AgentConfig(agent_name='Terminus2Agent')
+agent = Terminus2Agent(config, LLMRegistry())
+```
+
+Or via configuration:
+
+```yaml
+agent:
+  name: Terminus2Agent
+```
+
+### Evaluation and Benchmarking
+
+To run SWE-bench evaluations with Terminus2Agent:
+
+```bash
+poetry run python evaluation/benchmarks/swe_bench/run_infer.py \
+  --agent-cls Terminus2Agent \
+  --llm-config your_model_config \
+  --max-iterations 50 \
+  --dataset princeton-nlp/SWE-bench_Lite \
+  --split test
+```
+
+### Custom Prompts
+
+Override the system prompt via config:
+
+```yaml
+agent:
+  name: Terminus2Agent
+  system_prompt_path: /path/to/custom/system_prompt.j2
+```
+
+Or override the entire prompt directory:
+
+```yaml
+agent:
+  name: Terminus2Agent
+  custom_prompt_dir: /path/to/custom/prompts/
+```
+
+## Comparison to Other Agents
+
+| Feature | CodeActAgent | OpenCodeAgent | Terminus2Agent |
+|---------|--------------|---------------|----------------|
+| LLM interface | Function calling | Function calling | Raw JSON parsing |
+| Terminal interaction | Command + stdout | Command + stdout | Keystrokes + screen capture |
+| Batch commands | Single per turn | Single per turn | Multiple per turn |
+| Special keys (Ctrl+C) | Via bash | Via bash | Native (`C-c`) |
+| File operations | Tools | Tools | Via terminal commands |
+| Task completion | `finish` tool | `finish` tool | `task_complete` field + double confirmation |
+| Parse error recovery | N/A (function calling) | N/A (function calling) | Auto-fix + retry (up to 3 attempts) |
+| Output format | Structured (exit code, stdout, stderr) | Structured | Full terminal screen state |
+
+### When to Use Terminus-2
+
+Terminus-2 is best suited for:
+
+- **Terminal-centric tasks** where seeing the full screen state matters (interactive programs, TUI applications, vim, etc.)
+- **Models without function calling support** that can reliably produce JSON
+- **Benchmarks** that measure terminal interaction fidelity (e.g., terminal-bench)
+- **Tasks requiring special key sequences** (Ctrl+C to cancel, Ctrl+D for EOF, interactive prompts)
+
+### Provenance
+
+This agent was ported from the standalone Terminus-2 implementation in `terminal-bench`. The original agent used tmux sessions for terminal interaction; this OpenHands port adapts the same logic to work with OpenHands' `BashSession` runtime while preserving the JSON-based LLM interaction model, the parser with auto-correction, and the double-confirmation completion flow.
diff --git a/openhands/agenthub/terminus_2_agent/__init__.py b/openhands/agenthub/terminus_2_agent/__init__.py
@@ -0,0 +1,4 @@
+from openhands.agenthub.terminus_2_agent.terminus_2_agent import Terminus2Agent
+from openhands.controller.agent import Agent
+
+Agent.register('Terminus2Agent', Terminus2Agent)  # register the class under the name passed as --agent-cls / agent.name
diff --git a/openhands/agenthub/terminus_2_agent/prompts/additional_info.j2 b/openhands/agenthub/terminus_2_agent/prompts/additional_info.j2
@@ -0,0 +1,52 @@
+{% if repository_info %}
+<REPOSITORY_INFO>
+At the user's request, repository {{ repository_info.repo_name }} has been cloned to {{ repository_info.repo_directory }} in the current working directory.
+{% if repository_info.branch_name %}The repository has been checked out to branch "{{ repository_info.branch_name }}".
+
+IMPORTANT: You should work within the current branch "{{ repository_info.branch_name }}" unless:
+    1. the user explicitly instructs otherwise
+    2. the current branch is "main", "master", or another default branch where direct pushes may be unsafe
+{% endif %}
+</REPOSITORY_INFO>
+{% endif %}
+{% if repository_instructions -%}
+<REPOSITORY_INSTRUCTIONS>
+{{ repository_instructions }}
+</REPOSITORY_INSTRUCTIONS>
+{% endif %}
+{% if runtime_info -%}
+<RUNTIME_INFORMATION>
+{% if runtime_info.working_dir %}
+The current working directory is {{ runtime_info.working_dir }}
+{% endif %}
+{% if runtime_info.available_hosts %}
+The user has access to the following hosts for accessing a web application,
+each of which has a corresponding port:
+{% for host, port in runtime_info.available_hosts.items() -%}
+* {{ host }} (port {{ port }})
+{% endfor %}
+When starting a web server, use the corresponding ports. You should also
+set any options to allow iframes and CORS requests, and allow the server to
+be accessed from any host (e.g. 0.0.0.0).
+{% endif %}
+{% if runtime_info.additional_agent_instructions %}
+{{ runtime_info.additional_agent_instructions }}
+{% endif %}
+{% if runtime_info.custom_secrets_descriptions %}
+<CUSTOM_SECRETS>
+You have access to the following environment variables
+{% for secret_name, secret_description in runtime_info.custom_secrets_descriptions.items() %}
+* **${{ secret_name }}**: {{ secret_description }}
+{% endfor %}
+</CUSTOM_SECRETS>
+{% endif %}
+{% if runtime_info.date %}
+Today's date is {{ runtime_info.date }} (UTC).
+{% endif %}
+</RUNTIME_INFORMATION>
+{% if conversation_instructions and conversation_instructions.content -%}
+<CONVERSATION_INSTRUCTIONS>
+{{ conversation_instructions.content }}
+</CONVERSATION_INSTRUCTIONS>
+{% endif %}
+{% endif %}