From 2a90b7c1469f4ef277d9cb831fffe55984a2dd2e Mon Sep 17 00:00:00 2001
From: Sugam Devare <sdevare@nvidia.com>
Date: Thu, 19 Feb 2026 21:51:40 -0800
Subject: [PATCH 1/9] feat: terminus init

---
 evaluation/benchmarks/swe_bench/run_infer.py  |   2 +
 evaluation/utils/shared.py                    |  48 ++
 openhands/agenthub/__init__.py                |   2 +
 .../agenthub/terminus_2_agent/__init__.py     |   4 +
 .../prompts/additional_info.j2                |  52 +++
 .../prompts/microagent_info.j2                |   8 +
 .../terminus_2_agent/prompts/system_prompt.j2 |  51 +++
 .../prompts/system_prompt_long_horizon.j2     |  12 +
 .../terminus_2_agent/prompts/user_prompt.j2   |   5 +
 .../terminus_2_agent/terminus_2_agent.py      | 365 +++++++++++++++
 .../terminus_json_plain_parser.py             | 329 ++++++++++++++
 openhands/core/config/agent_config.py         |  22 +-
 openhands/core/schema/action.py               |   4 +
 openhands/core/schema/observation.py          |   4 +
 openhands/events/action/__init__.py           |   3 +
 openhands/events/action/terminus_2.py         |  41 ++
 openhands/events/observation/__init__.py      |   3 +
 openhands/events/observation/terminus_2.py    |  33 ++
 openhands/events/serialization/action.py      |   3 +
 openhands/events/serialization/observation.py |   3 +
 openhands/memory/conversation_memory.py       |   9 +
 openhands/runtime/action_execution_server.py  |  67 +++
 .../test_terminus_2_action_observation.py     | 282 ++++++++++++
 tests/unit/agenthub/test_terminus_2_agent.py  | 330 ++++++++++++++
 tests/unit/agenthub/test_terminus_2_parser.py | 416 ++++++++++++++++++
 25 files changed, 2087 insertions(+), 11 deletions(-)
 create mode 100644 openhands/agenthub/terminus_2_agent/__init__.py
 create mode 100644 openhands/agenthub/terminus_2_agent/prompts/additional_info.j2
 create mode 100644 openhands/agenthub/terminus_2_agent/prompts/microagent_info.j2
 create mode 100644 openhands/agenthub/terminus_2_agent/prompts/system_prompt.j2
 create mode 100644 openhands/agenthub/terminus_2_agent/prompts/system_prompt_long_horizon.j2
 create mode 100644 openhands/agenthub/terminus_2_agent/prompts/user_prompt.j2
 create mode 100644 openhands/agenthub/terminus_2_agent/terminus_2_agent.py
 create mode 100644 openhands/agenthub/terminus_2_agent/terminus_json_plain_parser.py
 create mode 100644 openhands/events/action/terminus_2.py
 create mode 100644 openhands/events/observation/terminus_2.py
 create mode 100644 tests/unit/agenthub/test_terminus_2_action_observation.py
 create mode 100644 tests/unit/agenthub/test_terminus_2_agent.py
 create mode 100644 tests/unit/agenthub/test_terminus_2_parser.py
diff --git a/evaluation/benchmarks/swe_bench/run_infer.py b/evaluation/benchmarks/swe_bench/run_infer.py
index dfb71464159c..9b456f5730aa 100644
--- a/evaluation/benchmarks/swe_bench/run_infer.py
+++ b/evaluation/benchmarks/swe_bench/run_infer.py
@@ -32,6 +32,7 @@
     codeact_user_response,
     codex_user_response,
     opencode_user_response,
+    terminus_2_user_response,
     get_default_sandbox_config_for_eval,
     get_metrics,
     get_openhands_config_for_eval,
@@ -108,6 +109,7 @@ def set_dataset_type(dataset_name: str) -> str:
     'CodeActAgent': codeact_user_response,
     'OpenCodeAgent': opencode_user_response,
     'CodexAgent': codex_user_response,
+    'Terminus2Agent': terminus_2_user_response,
 }
 
 
diff --git a/evaluation/utils/shared.py b/evaluation/utils/shared.py
index 9faebf20089c..7972cb6850bb 100644
--- a/evaluation/utils/shared.py
+++ b/evaluation/utils/shared.py
@@ -249,6 +249,54 @@ def codex_user_response(
     return msg
 
 
+def terminus_2_user_response(
+    state: State,
+    encapsulate_solution: bool = False,
+    try_parse: Callable[[Action], str] | None = None,
+) -> str:
+    """Produce the simulated user reply for a Terminus-2 evaluation run.
+
+    Returns '/exit' once ``try_parse`` extracts an answer from the most recent
+    action in the history; otherwise a prompt telling the agent to continue,
+    extended with a give-up hint after at least two user messages.
+    """
+    encaps_str = (
+        (
+            'Your final answer MUST be encapsulated within <solution> and </solution>.\n'
+            'For example: The answer to the question is <solution> 42 </solution>.\n'
+        )
+        if encapsulate_solution
+        else ''
+    )
+    msg = (
+        'Please continue working on the task. '
+        'Analyze the terminal output and issue the next batch of commands.\n'
+        'When the task is fully complete, set "task_complete": true in your JSON response.\n'
+        f'{encaps_str}'
+        'IMPORTANT: YOU SHOULD NEVER ASK FOR HUMAN HELP.\n'
+    )
+    if not state.history:
+        return msg
+    if try_parse is not None:
+        actions = (e for e in reversed(state.history) if isinstance(e, Action))
+        last_action = next(actions, None)
+        # A history without any Action has nothing to parse; skip the parser.
+        if last_action is not None and try_parse(last_action) is not None:
+            return '/exit'
+
+    user_msgs = [
+        event
+        for event in state.history
+        if isinstance(event, MessageAction) and event.source == 'user'
+    ]
+    if len(user_msgs) >= 2:
+        return (
+            msg
+            + 'If you want to give up, set "task_complete": true in your JSON response.\n'
+        )
+    return msg
+
+
 def cleanup():
     print('Cleaning up child processes...')
     for process in mp.active_children():
diff --git a/openhands/agenthub/__init__.py b/openhands/agenthub/__init__.py
index 73bb54f2e6f9..f652ee6d466e 100644
--- a/openhands/agenthub/__init__.py
+++ b/openhands/agenthub/__init__.py
@@ -11,6 +11,7 @@
     loc_agent,
     opencode_agent,
     readonly_agent,
+    terminus_2_agent,
     visualbrowsing_agent,
 )
 from openhands.controller.agent import Agent  # noqa: E402
@@ -25,4 +26,5 @@
     'loc_agent',
     'opencode_agent',
     'codex_agent',
+    'terminus_2_agent',
 ]
diff --git a/openhands/agenthub/terminus_2_agent/__init__.py b/openhands/agenthub/terminus_2_agent/__init__.py
new file mode 100644
index 000000000000..5c625358c53b
--- /dev/null
+++ b/openhands/agenthub/terminus_2_agent/__init__.py
@@ -0,0 +1,4 @@
+from openhands.agenthub.terminus_2_agent.terminus_2_agent import Terminus2Agent
+from openhands.controller.agent import Agent
+
+Agent.register('Terminus2Agent', Terminus2Agent)
diff --git a/openhands/agenthub/terminus_2_agent/prompts/additional_info.j2 b/openhands/agenthub/terminus_2_agent/prompts/additional_info.j2
new file mode 100644
index 000000000000..d80a36338ca4
--- /dev/null
+++ b/openhands/agenthub/terminus_2_agent/prompts/additional_info.j2
@@ -0,0 +1,52 @@
+{% if repository_info %}
+<REPOSITORY_INFO>
+At the user's request, repository {{ repository_info.repo_name }} has been cloned to {{ repository_info.repo_directory }} in the current working directory.
+{% if repository_info.branch_name %}The repository has been checked out to branch "{{ repository_info.branch_name }}".
+
+IMPORTANT: You should work within the current branch "{{ repository_info.branch_name }}" unless:
+    1. the user explicitly instructs otherwise
+    2. the current branch is "main", "master", or another default branch where direct pushes may be unsafe
+{% endif %}
+</REPOSITORY_INFO>
+{% endif %}
+{% if repository_instructions -%}
+<REPOSITORY_INSTRUCTIONS>
+{{ repository_instructions }}
+</REPOSITORY_INSTRUCTIONS>
+{% endif %}
+{% if runtime_info -%}
+<RUNTIME_INFORMATION>
+{% if runtime_info.working_dir %}
+The current working directory is {{ runtime_info.working_dir }}
+{% endif %}
+{% if runtime_info.available_hosts %}
+The user has access to the following hosts for accessing a web application,
+each of which has a corresponding port:
+{% for host, port in runtime_info.available_hosts.items() -%}
+* {{ host }} (port {{ port }})
+{% endfor %}
+When starting a web server, use the corresponding ports. You should also
+set any options to allow iframes and CORS requests, and allow the server to
+be accessed from any host (e.g. 0.0.0.0).
+{% endif %}
+{% if runtime_info.additional_agent_instructions %}
+{{ runtime_info.additional_agent_instructions }}
+{% endif %}
+{% if runtime_info.custom_secrets_descriptions %}
+<CUSTOM_SECRETS>
+You have access to the following environment variables
+{% for secret_name, secret_description in runtime_info.custom_secrets_descriptions.items() %}
+* **${{ secret_name }}**: {{ secret_description }}
+{% endfor %}
+</CUSTOM_SECRETS>
+{% endif %}
+{% if runtime_info.date %}
+Today's date is {{ runtime_info.date }} (UTC).
+{% endif %}
+</RUNTIME_INFORMATION>
+{% if conversation_instructions and conversation_instructions.content -%}
+<CONVERSATION_INSTRUCTIONS>
+{{ conversation_instructions.content }}
+</CONVERSATION_INSTRUCTIONS>
+{% endif %}
+{% endif %}
diff --git a/openhands/agenthub/terminus_2_agent/prompts/microagent_info.j2 b/openhands/agenthub/terminus_2_agent/prompts/microagent_info.j2
new file mode 100644
index 000000000000..264828fbe206
--- /dev/null
+++ b/openhands/agenthub/terminus_2_agent/prompts/microagent_info.j2
@@ -0,0 +1,8 @@
+{% for agent_info in triggered_agents %}
+<EXTRA_INFO>
+The following information has been included based on a keyword match for "{{ agent_info.trigger }}".
+It may or may not be relevant to the user's request.
+
+{{ agent_info.content }}
+</EXTRA_INFO>
+{% endfor %}
diff --git a/openhands/agenthub/terminus_2_agent/prompts/system_prompt.j2 b/openhands/agenthub/terminus_2_agent/prompts/system_prompt.j2
new file mode 100644
index 000000000000..8743f5078e25
--- /dev/null
+++ b/openhands/agenthub/terminus_2_agent/prompts/system_prompt.j2
@@ -0,0 +1,51 @@
+You are an AI assistant tasked with solving command-line tasks in a Linux environment. You will be given a task description and the output from previously executed commands. Your goal is to solve the task by providing batches of shell commands.
+
+Format your response as JSON with the following structure:
+
+{
+  "analysis": "Analyze the current state based on the terminal output provided. What do you see? What has been accomplished? What still needs to be done?",
+  "plan": "Describe your plan for the next steps. What commands will you run and why? Be specific about what you expect each command to accomplish.",
+  "commands": [
+    {
+      "keystrokes": "ls -la\n",
+      "duration": 0.1
+    },
+    {
+      "keystrokes": "cd project\n",
+      "duration": 0.1
+    }
+  ],
+  "task_complete": true
+}
+
+Required fields:
+- "analysis": Your analysis of the current situation
+- "plan": Your plan for the next steps
+- "commands": Array of command objects to execute
+
+Optional fields:
+- "task_complete": Boolean indicating if the task is complete (defaults to false if not present)
+
+Command object structure:
+- "keystrokes": String containing the exact keystrokes to send to the terminal (required)
+- "duration": Number of seconds to wait for the command to complete before the next command will be executed (defaults to 1.0 if not present)
+
+IMPORTANT: The text inside "keystrokes" will be used completely verbatim as keystrokes. Write commands exactly as you want them sent to the terminal:
+- Most bash commands should end with a newline (\n) to cause them to execute
+- For special key sequences, use tmux-style escape sequences:
+  - C-c for Ctrl+C
+  - C-d for Ctrl+D
+
+The "duration" attribute specifies the number of seconds to wait for the command to complete (default: 1.0) before the next command will be executed. On immediate tasks (e.g., cd, ls, echo, cat) set a duration of 0.1 seconds. On commands (e.g., gcc, find, rustc) set a duration of 1.0 seconds. On slow commands (e.g., make, python3 [long running script], wget [file]) set an appropriate duration as you determine necessary.
+
+It is better to set a smaller duration than a longer duration. It is always possible to wait again if the prior output has not finished, by running {"keystrokes": "", "duration": 10.0} on subsequent requests to wait longer. Never wait longer than 60 seconds; prefer to poll to see intermediate result status.
+
+Important notes:
+- Each command's keystrokes are sent exactly as written to the terminal
+- Do not include extra whitespace before or after the keystrokes unless it's part of the intended command
+- Extra text before or after the JSON will generate warnings but be tolerated
+- The JSON must be valid - use proper escaping for quotes and special characters within strings
+- Commands array can be empty if you want to wait without taking action
+
+{% include 'additional_info.j2' %}
+{% include 'microagent_info.j2' %}
diff --git a/openhands/agenthub/terminus_2_agent/prompts/system_prompt_long_horizon.j2 b/openhands/agenthub/terminus_2_agent/prompts/system_prompt_long_horizon.j2
new file mode 100644
index 000000000000..4c61f6d104dd
--- /dev/null
+++ b/openhands/agenthub/terminus_2_agent/prompts/system_prompt_long_horizon.j2
@@ -0,0 +1,12 @@
+{% include 'system_prompt.j2' %}
+
+<LONG_HORIZON_TASK_GUIDANCE>
+This task may require many steps to complete. Keep these guidelines in mind:
+
+1. **Be methodical**: Break the problem into smaller sub-tasks and solve them one at a time.
+2. **Verify each step**: After running a command, check the output to confirm it succeeded before moving on.
+3. **Keep track of progress**: In your analysis, maintain a clear picture of what has been done and what remains.
+4. **Handle errors gracefully**: If a command fails, analyze why and adjust your approach.
+5. **Avoid redundant work**: If you've already verified something works, don't re-verify unnecessarily.
+6. **Use efficient commands**: Prefer commands that give you the most information in one step.
+</LONG_HORIZON_TASK_GUIDANCE>
diff --git a/openhands/agenthub/terminus_2_agent/prompts/user_prompt.j2 b/openhands/agenthub/terminus_2_agent/prompts/user_prompt.j2
new file mode 100644
index 000000000000..8d78b425ce67
--- /dev/null
+++ b/openhands/agenthub/terminus_2_agent/prompts/user_prompt.j2
@@ -0,0 +1,5 @@
+Task Description:
+{{ task }}
+
+Current terminal state:
+{{ terminal_state }}
diff --git a/openhands/agenthub/terminus_2_agent/terminus_2_agent.py b/openhands/agenthub/terminus_2_agent/terminus_2_agent.py
new file mode 100644
index 000000000000..b37777a9345a
--- /dev/null
+++ b/openhands/agenthub/terminus_2_agent/terminus_2_agent.py
@@ -0,0 +1,365 @@
+"""Terminus-2 Agent for OpenHands.
+
+A keystroke-based terminal agent that sends raw keystrokes to a terminal session
+and receives screen capture output. Uses JSON-formatted LLM responses with
+analysis, plan, and commands fields.
+"""
+
+import os
+from collections import deque
+from typing import TYPE_CHECKING
+
+from openhands.llm.llm_registry import LLMRegistry
+
+if TYPE_CHECKING:
+    from openhands.events.action import Action
+
+from openhands.agenthub.terminus_2_agent.terminus_json_plain_parser import (
+    ParsedCommand,
+    TerminusJSONPlainParser,
+)
+from openhands.controller.agent import Agent
+from openhands.controller.state.state import State
+from openhands.core.config import AgentConfig
+from openhands.core.logger import openhands_logger as logger
+from openhands.core.message import Message, TextContent
+from openhands.events.action import AgentFinishAction, AgentThinkAction, MessageAction
+from openhands.events.action.terminus_2 import Terminus2CmdRunAction
+from openhands.events.event import Event, EventSource
+from openhands.events.observation.error import ErrorObservation
+from openhands.events.observation.terminus_2 import Terminus2CmdOutputObservation
+from openhands.memory.condenser import Condenser
+from openhands.memory.condenser.condenser import Condensation, View
+from openhands.runtime.plugins import PluginRequirement
+from openhands.utils.prompt import PromptManager
+
+
+MAX_OUTPUT_BYTES = 10000  # default byte limit used by _limit_output_length
+MAX_LLM_RETRY = 3  # LLM attempts per step before giving up on parse errors
+
+TIMEOUT_TEMPLATE = (
+    'Previous command:\n{command}\n\n'
+    'The previous command timed out after {timeout_sec} seconds\n\n'
+    'It is possible that the command is not yet finished executing. '
+    'If that is the case, then do nothing. It is also possible that you '
+    'have entered an interactive shell and should continue sending '
+    'keystrokes as normal.\n\n'
+    'Here is the current state of the terminal:\n\n{terminal_state}'
+)
+
+COMPLETION_CONFIRMATION = (
+    'Current terminal state:\n{terminal_state}\n\n'
+    'Are you sure you want to mark the task as complete? '
+    'This will trigger your solution to be graded and you won\'t be able to '
+    'make any further corrections. If so, include "task_complete": true '
+    'in your JSON response again.'
+)
+
+
+class Terminus2Agent(Agent):
+    """
+    The Terminus-2 Agent sends raw keystrokes to a terminal session and receives
+    screen capture output, using JSON-formatted LLM responses.
+
+    Unlike function-calling agents (CodeAct, OpenCode), Terminus-2 parses the
+    LLM's raw text response as JSON with fields: analysis, plan, commands, and
+    optionally task_complete.
+
+    Key features:
+    - Keystroke-based terminal interaction (tmux-style)
+    - JSON response parsing with auto-correction
+    - Double confirmation for task completion
+    - Output truncation to 10KB
+    - Duration-based command timeouts
+    """
+
+    VERSION = '1.0'
+    sandbox_plugins: list[PluginRequirement] = []
+
+    def __init__(self, config: AgentConfig, llm_registry: LLMRegistry) -> None:
+        """Set up the action queue, parser, run flags, condenser and routed LLM."""
+        super().__init__(config, llm_registry)
+        self.parser = TerminusJSONPlainParser()
+        self.pending_actions: deque['Action'] = deque()
+        self._conversation_messages: list[dict[str, str]] = []
+        self._pending_completion = False
+        self._needs_llm_call = True
+        self.condenser = Condenser.from_config(self.config.condenser, llm_registry)
+        self.llm = self.llm_registry.get_router(self.config)
+
+    @property
+    def prompt_manager(self) -> PromptManager:
+        """Lazily build and cache the PromptManager for this agent's templates."""
+        if self._prompt_manager is not None:
+            return self._prompt_manager
+
+        cfg = self.config
+        prompt_dir = cfg.custom_prompt_dir or os.path.join(
+            os.path.dirname(__file__), 'prompts'
+        )
+        overrides = {}
+        if cfg.system_prompt_path:
+            overrides['system_prompt.j2'] = cfg.system_prompt_path
+        if cfg.system_prompt_long_horizon_path:
+            overrides['system_prompt_long_horizon.j2'] = (
+                cfg.system_prompt_long_horizon_path
+            )
+
+        self._prompt_manager = PromptManager(
+            prompt_dir=prompt_dir,
+            system_prompt_filename=cfg.resolved_system_prompt_filename,
+            template_overrides=overrides or None,
+        )
+        return self._prompt_manager
+
+    def reset(self) -> None:
+        """Reset base-agent state and clear this agent's per-run bookkeeping."""
+        super().reset()
+        self.pending_actions.clear()
+        self._pending_completion, self._needs_llm_call = False, True
+        self._conversation_messages = []
+
+    def step(self, state: State) -> 'Action':
+        """Return the next action for the controller to execute.
+
+        Queued keystroke actions are drained first; the LLM is asked for a
+        new batch of commands only once the queue is empty.
+        """
+        if self.pending_actions:
+            return self.pending_actions.popleft()
+
+        last_user = state.get_last_user_message()
+        if last_user and last_user.content.strip() == '/exit':
+            return AgentFinishAction()
+
+        history: list[Event] = []
+        match self.condenser.condensed_history(state):
+            case View(events=events):
+                history = events
+            case Condensation(action=condensation_action):
+                return condensation_action
+
+        prompt = self._build_messages(history, state)
+        commands, done = self._call_llm_and_parse(prompt)
+
+        if done:
+            confirmed = self._pending_completion
+            self._pending_completion = True
+            # NOTE(review): a completion without commands finishes at once,
+            # skipping the confirmation round; confirm this is intended.
+            if confirmed or not commands:
+                return AgentFinishAction(thought='Task completed (confirmed)')
+        else:
+            self._pending_completion = False
+
+        if not commands:
+            return AgentThinkAction(thought='No commands to execute, waiting for next input')
+
+        # Queue the whole batch, then hand back its first command.
+        self.pending_actions.extend(
+            Terminus2CmdRunAction(
+                keystrokes=cmd.keystrokes,
+                duration=min(cmd.duration, 60),
+            )
+            for cmd in commands
+        )
+
+        return self.pending_actions.popleft()
+
+    def _build_messages(
+        self, condensed_history: list[Event], state: State
+    ) -> list[Message]:
+        """Build the conversation messages from event history.
+
+        Converts the event stream into a user/assistant message sequence:
+        - System message: JSON format instructions from the prompt template
+        - First user message: task instruction + initial terminal state
+          (rendered via user_prompt.j2)
+        - Subsequent turns: assistant = LLM JSON response, user = terminal output
+        """
+        messages: list[Message] = []
+
+        system_prompt = self.prompt_manager.get_system_message()
+        messages.append(Message(role='system', content=[TextContent(text=system_prompt)]))
+
+        initial_user_msg = self._find_initial_user_message(condensed_history)
+        if initial_user_msg:
+            initial_terminal = self._find_initial_terminal_state(condensed_history)
+            first_user_text = self.prompt_manager.user_template.render(
+                task=initial_user_msg,
+                terminal_state=initial_terminal,
+            ).strip()
+            messages.append(
+                Message(role='user', content=[TextContent(text=first_user_text)])
+            )
+
+        batch_observations: list[str] = []
+        last_timed_out = False
+        last_keystrokes = ''
+
+        for event in condensed_history:
+            if isinstance(event, MessageAction):
+                if event.source == EventSource.USER:
+                    continue
+                elif event.source == EventSource.AGENT:
+                    if batch_observations:
+                        terminal_output = batch_observations[-1]
+                        user_text = self._format_terminal_output(
+                            terminal_output, last_timed_out, last_keystrokes
+                        )
+                        messages.append(
+                            Message(role='user', content=[TextContent(text=user_text)])
+                        )
+                        batch_observations = []
+                        last_timed_out = False
+
+                    messages.append(
+                        Message(
+                            role='assistant',
+                            content=[TextContent(text=event.content)],
+                        )
+                    )
+
+            elif isinstance(event, Terminus2CmdRunAction):
+                # Flush pending output first: it belongs to the previous command.
+                if event.thought:
+                    if batch_observations:
+                        terminal_output = batch_observations[-1]
+                        user_text = self._format_terminal_output(
+                            terminal_output, last_timed_out, last_keystrokes
+                        )
+                        messages.append(
+                            Message(role='user', content=[TextContent(text=user_text)])
+                        )
+                        batch_observations = []
+                        last_timed_out = False
+                    messages.append(
+                        Message(
+                            role='assistant',
+                            content=[TextContent(text=event.thought)],
+                        )
+                    )
+                last_keystrokes = event.keystrokes
+
+            elif isinstance(event, Terminus2CmdOutputObservation):
+                batch_observations.append(event.terminal_state)
+                last_timed_out = event.timed_out
+
+            elif isinstance(event, ErrorObservation):
+                batch_observations.append(f'ERROR: {event.content}')
+
+            elif isinstance(event, AgentThinkAction):
+                pass
+
+        if batch_observations:
+            terminal_output = batch_observations[-1]
+            user_text = self._format_terminal_output(
+                terminal_output, last_timed_out, last_keystrokes
+            )
+            messages.append(
+                Message(role='user', content=[TextContent(text=user_text)])
+            )
+
+        if self._pending_completion:
+            last_terminal = batch_observations[-1] if batch_observations else ''
+            confirmation = COMPLETION_CONFIRMATION.format(terminal_state=last_terminal)
+            messages.append(
+                Message(role='user', content=[TextContent(text=confirmation)])
+            )
+
+        return messages
+
+    def _find_initial_user_message(self, events: list[Event]) -> str | None:
+        """Return the task instruction: the first user message's content, or None."""
+        messages = (e for e in events if isinstance(e, MessageAction))
+        from_user = (m.content for m in messages if m.source == EventSource.USER)
+        # The earliest match wins; later user messages are ignored here.
+        return next(from_user, None)
+
+    def _find_initial_terminal_state(self, events: list[Event]) -> str:
+        """Return the terminal state of the earliest terminal observation.
+
+        Fills {{ terminal_state }} in user_prompt.j2 for the first user
+        message; '' when the history holds no terminal observation yet.
+        """
+        observations = (
+            e for e in events if isinstance(e, Terminus2CmdOutputObservation)
+        )
+        first = next(observations, None)
+        return '' if first is None else first.terminal_state
+
+    def _format_terminal_output(
+        self, terminal_output: str, timed_out: bool, keystrokes: str
+    ) -> str:
+        """Render terminal output as the text of the next user message."""
+        limited = self._limit_output_length(terminal_output)
+        if not timed_out:
+            return limited
+        # Timed-out commands get the explanatory timeout wrapper.
+        return TIMEOUT_TEMPLATE.format(
+            command=keystrokes, timeout_sec=60, terminal_state=limited
+        )
+
+    def _call_llm_and_parse(
+        self, messages: list[Message]
+    ) -> tuple[list[ParsedCommand], bool]:
+        """Query the LLM until its reply parses, up to MAX_LLM_RETRY attempts.
+
+        Each reply is appended to ``messages`` as an assistant turn; a reply
+        that fails to parse is followed by a user turn describing the error.
+
+        Returns:
+            The parsed commands (durations capped at 60) and the
+            task-complete flag, or ``([], False)`` when every attempt fails.
+        """
+        for attempt in range(1, MAX_LLM_RETRY + 1):
+            response = self.llm.completion(messages=messages)
+            reply = response.choices[0].message.content or ''
+            logger.debug(f'Terminus-2 LLM response (attempt {attempt}): {reply[:200]}...')
+            messages.append(
+                Message(role='assistant', content=[TextContent(text=reply)])
+            )
+
+            parsed = self.parser.parse_response(reply)
+            if not parsed.error:
+                if parsed.warning:
+                    logger.info(f'Terminus-2 parse warnings: {parsed.warning}')
+                capped = [
+                    ParsedCommand(keystrokes=c.keystrokes, duration=min(c.duration, 60))
+                    for c in parsed.commands
+                ]
+                return capped, parsed.is_task_complete
+
+            # Parsing failed: tell the model what went wrong and retry.
+            logger.warning(f'Terminus-2 parse error (attempt {attempt}): {parsed.error}')
+            parts = [f'Previous response had parsing errors:\nERROR: {parsed.error}']
+            if parsed.warning:
+                parts.append(f'\nWARNINGS: {parsed.warning}')
+            parts.append('\n\nPlease fix these issues and provide a proper JSON response.')
+            messages.append(
+                Message(role='user', content=[TextContent(text=''.join(parts))])
+            )
+
+        logger.error('Terminus-2: exhausted LLM retries due to parse errors')
+        return [], False
+
+    @staticmethod
+    def _limit_output_length(output: str, max_bytes: int = MAX_OUTPUT_BYTES) -> str:
+        """Truncate output longer than ``max_bytes`` UTF-8 bytes to head and tail.
+
+        Output within the limit is returned unchanged. Otherwise the first and
+        last ``max_bytes // 2`` bytes are kept (partial characters at the cuts
+        are dropped) around a marker stating how many bytes were omitted.
+        """
+        output_bytes = output.encode('utf-8')  # encode once, reuse below
+        if len(output_bytes) <= max_bytes:
+            return output
+        half = max_bytes // 2
+        head = output_bytes[:half].decode('utf-8', errors='ignore')
+        # Explicit start index: ``[-half:]`` keeps everything when half is 0.
+        tail = output_bytes[len(output_bytes) - half :].decode('utf-8', errors='ignore')
+        omitted_bytes = len(output_bytes) - len(head.encode()) - len(tail.encode())
+        return (
+            f'{head}\n[... output limited to {max_bytes} bytes; '
+            f'{omitted_bytes} interior bytes omitted ...]\n{tail}'
+        )
diff --git a/openhands/agenthub/terminus_2_agent/terminus_json_plain_parser.py b/openhands/agenthub/terminus_2_agent/terminus_json_plain_parser.py
new file mode 100644
index 000000000000..354d205ae8bb
--- /dev/null
+++ b/openhands/agenthub/terminus_2_agent/terminus_json_plain_parser.py
@@ -0,0 +1,329 @@
+"""Parser for Terminus-2 JSON plain response format.
+
+Ported from terminal-bench's terminus_json_plain_parser.py for use within
+the OpenHands Terminus-2 agent.
+"""
+
+import json
+import re
+from dataclasses import dataclass
+from typing import List
+
+
+@dataclass
+class ParsedCommand:
+    keystrokes: str  # keystroke text taken unchanged from the command's JSON entry
+    duration: float  # wait time attached to the command (parser default is 1.0)
+
+
+@dataclass
+class ParseResult:
+    commands: List[ParsedCommand]  # parsed commands; empty when parsing failed
+    is_task_complete: bool  # derived from the optional 'task_complete' field
+    error: str  # '' on success, otherwise a human-readable parse error
+    warning: str  # '' or a '- '-bulleted, newline-separated list of warnings
+
+
+class TerminusJSONPlainParser:
+    """Parse the JSON object an LLM returns in the terminus JSON plain format."""
+
+    def __init__(self):
+        self.required_fields = ['analysis', 'plan', 'commands']  # all must be present
+
+    def parse_response(self, response: str) -> ParseResult:
+        """Parse an LLM reply, retrying with auto-fixes if the first parse fails.
+
+        Args:
+            response: The full LLM response string
+
+        Returns:
+            The first error-free ParseResult (tagged with an AUTO-CORRECTED
+            warning when a fix was needed), else the original failed result.
+        """
+        initial = self._try_parse_response(response)
+        if not initial.error:
+            return initial
+
+        # Every fixer is applied to the ORIGINAL text; fixes are never stacked.
+        for description, fixer in self._get_auto_fixes():
+            candidate, changed = fixer(response, initial.error)
+            if not changed:
+                continue
+
+            reparsed = self._try_parse_response(candidate)
+            if reparsed.error != '':
+                continue
+
+            note = f'AUTO-CORRECTED: {description} - please fix this in future responses'
+            reparsed.warning = self._combine_warnings(note, reparsed.warning)
+            return reparsed
+
+        return initial
+
+    def _try_parse_response(self, response: str) -> ParseResult:
+        """Run one parse attempt over ``response`` without applying auto-fixes.
+
+        Steps: extract the first balanced JSON object, decode it, validate the
+        required structure, then parse the commands. A failure at any step
+        yields a ParseResult with ``error`` set and no commands. The one
+        exception: when the task is marked complete, a command error is
+        downgraded to a warning so completion is still reported.
+        """
+        warnings: List[str] = []
+
+        def fail(message: str) -> ParseResult:
+            # Warnings gathered so far ride along as a '- ' bulleted list.
+            bullets = '- ' + '\n- '.join(warnings) if warnings else ''
+            return ParseResult([], False, message, bullets)
+
+        json_content, extra_text_warnings = self._extract_json_content(response)
+        warnings.extend(extra_text_warnings)
+        if not json_content:
+            return fail('No valid JSON found in response')
+
+        try:
+            parsed_data = json.loads(json_content)
+        except json.JSONDecodeError as e:
+            error_msg = f'Invalid JSON: {str(e)}'
+            # Include the offending text, truncated so feedback stays short.
+            if len(json_content) < 200:
+                error_msg += f' | Content: {repr(json_content)}'
+            else:
+                error_msg += f' | Content preview: {repr(json_content[:100])}...'
+            return fail(error_msg)
+
+        validation_error = self._validate_json_structure(
+            parsed_data, json_content, warnings
+        )
+        if validation_error:
+            return fail(validation_error)
+
+        # Normalise 'task_complete' to a real bool: accept string spellings
+        # (ignoring case and surrounding whitespace) and map null, numbers or
+        # any other JSON value through ordinary truthiness.
+        is_complete = parsed_data.get('task_complete', False)
+        if isinstance(is_complete, str):
+            is_complete = is_complete.strip().lower() in ('true', '1', 'yes')
+        is_complete = bool(is_complete)
+
+        commands_data = parsed_data.get('commands', [])
+        commands, parse_error = self._parse_commands(commands_data, warnings)
+        if parse_error and not is_complete:
+            return fail(parse_error)
+        if parse_error:
+            # Task is complete: report the command problem as a warning only.
+            warnings.append(parse_error)
+            commands = []
+
+        bullets = '- ' + '\n- '.join(warnings) if warnings else ''
+        return ParseResult(commands, is_complete, '', bullets)
+
+    def _extract_json_content(self, response: str) -> tuple[str, List[str]]:
+        """Return the first balanced top-level ``{...}`` span in ``response``.
+
+        Braces inside double-quoted strings are ignored and a backslash
+        escapes the next character. Surrounding text is allowed but adds a
+        warning per side. Returns ``(json_text, warnings)``; ``json_text``
+        is '' (with one explanatory warning) when no object is found.
+        """
+        warnings: List[str] = []
+
+        json_start = -1
+        json_end = -1
+        depth = 0
+        in_string = False
+        escape_next = False
+
+        for i, char in enumerate(response):
+            if escape_next:
+                escape_next = False
+            elif char == '\\':
+                escape_next = True
+            elif char == '"':
+                in_string = not in_string
+            elif in_string:
+                continue
+            elif char == '{':
+                if depth == 0:
+                    json_start = i
+                depth += 1
+            elif char == '}' and depth > 0:
+                # A stray '}' before any '{' is skipped; letting depth go
+                # negative would hide every object that follows it.
+                depth -= 1
+                if depth == 0:
+                    json_end = i + 1
+                    break
+
+        if json_start == -1 or json_end == -1:
+            return '', ['No valid JSON object found']
+
+        if response[:json_start].strip():
+            warnings.append('Extra text detected before JSON object')
+        if response[json_end:].strip():
+            warnings.append('Extra text detected after JSON object')
+
+        return response[json_start:json_end], warnings
+
+    def _validate_json_structure(
+        self, data: dict, json_content: str, warnings: List[str]
+    ) -> str:
+        """Check the decoded payload's shape; return '' if usable, else an error.
+
+        Hard failures (returned): payload is not an object, a required field
+        is absent, or 'commands' is not a list. Soft problems (wrong types
+        for 'analysis', 'plan', 'task_complete', or out-of-order fields) are
+        appended to ``warnings``, which is mutated in place.
+        """
+        if not isinstance(data, dict):
+            return 'Response must be a JSON object'
+
+        absent = [name for name in self.required_fields if name not in data]
+        if absent:
+            return f"Missing required fields: {', '.join(absent)}"
+
+        # Type problems in the text fields are reported but never fatal.
+        for text_field in ('analysis', 'plan'):
+            if not isinstance(data.get(text_field, ''), str):
+                warnings.append(f"Field '{text_field}' should be a string")
+
+        if not isinstance(data.get('commands', []), list):
+            return "Field 'commands' must be an array"
+
+        self._check_field_order(data, json_content, warnings)
+
+        completion_flag = data.get('task_complete')
+        if not (completion_flag is None or isinstance(completion_flag, (bool, str))):
+            warnings.append("Field 'task_complete' should be a boolean or string")
+
+        return ''
+
+    def _parse_commands(
+        self, commands_data: List[dict], warnings: List[str]
+    ) -> tuple[List[ParsedCommand], str]:
+        """Convert the decoded 'commands' array into ParsedCommand objects.
+
+        Returns ``(commands, '')`` on success, or ``([], error)`` at the first
+        entry that is not an object or lacks a string 'keystrokes'. Non-fatal
+        findings (duration problems, unknown fields, missing trailing newline)
+        are appended to ``warnings``, which is mutated in place.
+        """
+        known_fields = {'keystrokes', 'duration'}
+        commands: List[ParsedCommand] = []
+
+        for i, cmd_data in enumerate(commands_data):
+            label = f'Command {i + 1}'
+            if not isinstance(cmd_data, dict):
+                return [], f'{label} must be an object'
+            if 'keystrokes' not in cmd_data:
+                return [], f"{label} missing required 'keystrokes' field"
+            keystrokes = cmd_data['keystrokes']
+            if not isinstance(keystrokes, str):
+                return [], f"{label} 'keystrokes' must be a string"
+
+            duration = cmd_data.get('duration', 1.0)
+            if 'duration' not in cmd_data:
+                warnings.append(f'{label}: Missing duration field, using default 1.0')
+            elif (
+                isinstance(duration, bool)  # bool is an int subclass; not a time
+                or not isinstance(duration, (int, float))
+                or not 0 <= duration < float('inf')  # negative, NaN, Infinity
+            ):
+                warnings.append(f'{label}: Invalid duration value, using default 1.0')
+                duration = 1.0
+
+            # Sorted so the warning text is deterministic across runs.
+            unknown_fields = sorted(set(cmd_data) - known_fields)
+            if unknown_fields:
+                warnings.append(f"{label}: Unknown fields: {', '.join(unknown_fields)}")
+
+            if i < len(commands_data) - 1 and not keystrokes.endswith('\n'):
+                warnings.append(
+                    f'{label} should end with newline when followed '
+                    'by another command. Otherwise the two commands will be '
+                    'concatenated together on the same line.'
+                )
+
+            commands.append(ParsedCommand(keystrokes, float(duration)))
+
+        return commands, ''
+
+    def _get_auto_fixes(self):
+        """Return ``(description, fixer)`` pairs in the order they are tried."""
+        # The description is surfaced in the AUTO-CORRECTED warning text.
+        brace_fix = (
+            'Fixed incomplete JSON by adding missing closing brace',
+            self._fix_incomplete_json,
+        )
+        mixed_fix = ('Extracted JSON from mixed content', self._fix_mixed_content)
+        return [brace_fix, mixed_fix]
+
+    def _fix_incomplete_json(self, response: str, error: str) -> tuple[str, bool]:
+        """Append one '}' per unmatched '{' when ``error`` names a JSON failure.
+
+        Braces are counted over the raw text, including any inside strings.
+        Returns ``(text, was_changed)``.
+        """
+        triggers = ('Invalid JSON', 'Expecting', 'Unterminated', 'No valid JSON found')
+        if not any(marker in error for marker in triggers):
+            return response, False
+        unclosed = response.count('{') - response.count('}')
+        if unclosed <= 0:
+            return response, False
+        return response + '}' * unclosed, True
+
+    def _fix_mixed_content(self, response: str, error: str) -> tuple[str, bool]:
+        """Return the first ``{...}`` fragment that decodes as JSON, if any.
+
+        One level of nested braces is matched; ``error`` is accepted but unused.
+        """
+        json_pattern = r'\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\}'
+        for fragment in re.findall(json_pattern, response, re.DOTALL):
+            try:
+                json.loads(fragment)
+            except json.JSONDecodeError:
+                continue
+            return fragment, True
+        return response, False
+
+    def _combine_warnings(self, auto_warning: str, existing_warning: str) -> str:
+        """Prepend the auto-fix note, as a '- ' bullet, to any existing warnings."""
+        headline = f'- {auto_warning}'
+        if not existing_warning:
+            return headline
+        return f'{headline}\n{existing_warning}'
+
+    def _check_field_order(
+        self, data: dict, response: str, warnings: List[str]
+    ) -> None:
+        """Warn when analysis, plan and commands are not in that order.
+
+        The order is taken from ``data`` itself: a dict keeps its keys in
+        insertion order, and json.loads inserts them in document order.
+        Searching the raw text instead would also match same-named keys
+        inside nested objects and produce false warnings.
+
+        Args:
+            data: Decoded top-level JSON object.
+            response: Raw JSON text. Unused; retained so existing callers
+                keep working.
+            warnings: Receives at most one ordering warning (mutated in place).
+        """
+        expected_order = ['analysis', 'plan', 'commands']
+
+        # Only top-level keys take part; unrelated keys are ignored.
+        actual_order = [key for key in data if key in expected_order]
+
+        # With fewer than two of the fields present there is no order to check.
+        if len(actual_order) < 2:
+            return
+
+        expected_present = [f for f in expected_order if f in data]
+
+        if actual_order != expected_present:
+            actual_str = ' → '.join(actual_order)
+            expected_str = ' → '.join(expected_present)
+            warnings.append(
+                f'Fields appear in wrong order. Found: {actual_str}, '
+                f'expected: {expected_str}'
+            )
diff --git a/openhands/core/config/agent_config.py b/openhands/core/config/agent_config.py
index 713e1ad842e6..e87349a80161 100644
--- a/openhands/core/config/agent_config.py
+++ b/openhands/core/config/agent_config.py
@@ -19,7 +19,7 @@ class AgentConfig(BaseModel):
     """The name of the llm config to use. If specified, this will override global llm config."""
     classpath: str | None = Field(default=None)
     """The classpath of the agent to use. To be used for custom agents that are not defined in the openhands.agenthub package."""
-    system_prompt_filename: str = Field(default='system_prompt.j2')
+    system_prompt_filename: str = Field(default="system_prompt.j2")
     """Filename of the system prompt template file within the agent's prompt directory. Defaults to 'system_prompt.j2'."""
     custom_prompt_dir: str | None = Field(default=None)
     """Optional custom directory containing prompt templates. If set, overrides the agent's default prompt directory."""
@@ -71,7 +71,7 @@ class AgentConfig(BaseModel):
     runtime: str | None = Field(default=None)
     """Runtime type (e.g., 'docker', 'local', 'cli') used for runtime-specific tool behavior."""
 
-    model_config = ConfigDict(extra='forbid')
+    model_config = ConfigDict(extra="forbid")
 
     @property
     def resolved_system_prompt_filename(self) -> str:
@@ -80,8 +80,8 @@ def resolved_system_prompt_filename(self) -> str:
         When enable_plan_mode is True, automatically uses the long horizon system prompt
         unless a custom system_prompt_filename was explicitly set (not the default).
         """
-        if self.enable_plan_mode and self.system_prompt_filename == 'system_prompt.j2':
-            return 'system_prompt_long_horizon.j2'
+        if self.enable_plan_mode and self.system_prompt_filename == "system_prompt.j2":
+            return "system_prompt_long_horizon.j2"
         return self.system_prompt_filename
 
     @classmethod
@@ -119,29 +119,29 @@ def from_toml_section(cls, data: dict) -> dict[str, AgentConfig]:
         # Try to create the base config
         try:
             base_config = cls.model_validate(base_data)
-            agent_mapping['agent'] = base_config
+            agent_mapping["agent"] = base_config
         except ValidationError as e:
-            logger.warning(f'Invalid base agent configuration: {e}. Using defaults.')
+            logger.warning(f"Invalid base agent configuration: {e}. Using defaults.")
             # If base config fails, create a default one
             base_config = cls()
             # Still add it to the mapping
-            agent_mapping['agent'] = base_config
+            agent_mapping["agent"] = base_config
 
         # Process each custom section independently
         for name, overrides in custom_sections.items():
             try:
                 # Merge base config with overrides
                 merged = {**base_config.model_dump(), **overrides}
-                if merged.get('classpath'):
+                if merged.get("classpath"):
                     # if an explicit classpath is given, try to load it and look up its config model class
                     from openhands.controller.agent import Agent
 
                     try:
-                        agent_cls = get_impl(Agent, merged.get('classpath'))
+                        agent_cls = get_impl(Agent, merged.get("classpath"))
                         custom_config = agent_cls.config_model.model_validate(merged)
                     except Exception as e:
                         logger.warning(
-                            f'Failed to load custom agent class [{merged.get("classpath")}]: {e}. Using default config model.'
+                            f"Failed to load custom agent class [{merged.get('classpath')}]: {e}. Using default config model."
                         )
                         custom_config = cls.model_validate(merged)
                 else:
@@ -156,7 +156,7 @@ def from_toml_section(cls, data: dict) -> dict[str, AgentConfig]:
                 agent_mapping[name] = custom_config
             except ValidationError as e:
                 logger.warning(
-                    f'Invalid agent configuration for [{name}]: {e}. This section will be skipped.'
+                    f"Invalid agent configuration for [{name}]: {e}. This section will be skipped."
                 )
                 # Skip this custom section but continue with others
                 continue
diff --git a/openhands/core/schema/action.py b/openhands/core/schema/action.py
index 331bd7e47398..9379f22c9fec 100644
--- a/openhands/core/schema/action.py
+++ b/openhands/core/schema/action.py
@@ -147,3 +147,7 @@ class ActionType(str, Enum):
 
     CODEX_UPDATE_PLAN = 'codex_update_plan'
     """Updates the task plan with steps and statuses."""
+
+    # Terminus-2-style actions
+    TERMINUS_2_CMD_RUN = 'terminus_2_cmd_run'
+    """Sends raw keystrokes to a terminal session and captures the resulting screen state."""
diff --git a/openhands/core/schema/observation.py b/openhands/core/schema/observation.py
index 51626358a045..1dc4cfd8c641 100644
--- a/openhands/core/schema/observation.py
+++ b/openhands/core/schema/observation.py
@@ -99,3 +99,7 @@ class ObservationType(str, Enum):
 
     CODEX_UPDATE_PLAN = 'codex_update_plan'
     """Result of updating the task plan."""
+
+    # Terminus-2-style observations
+    TERMINUS_2_CMD_OUTPUT = 'terminus_2_cmd_output'
+    """Result of sending keystrokes to a terminal session, containing captured screen state."""
diff --git a/openhands/events/action/__init__.py b/openhands/events/action/__init__.py
index 2936a9315a2c..799fd2000f25 100644
--- a/openhands/events/action/__init__.py
+++ b/openhands/events/action/__init__.py
@@ -42,6 +42,7 @@
     CodexReadFileAction,
     CodexUpdatePlanAction,
 )
+from openhands.events.action.terminus_2 import Terminus2CmdRunAction
 
 __all__ = [
     'Action',
@@ -83,4 +84,6 @@
     'CodexGrepFilesAction',
     'CodexApplyPatchAction',
     'CodexUpdatePlanAction',
+    # Terminus-2-style actions
+    'Terminus2CmdRunAction',
 ]
diff --git a/openhands/events/action/terminus_2.py b/openhands/events/action/terminus_2.py
new file mode 100644
index 000000000000..1973527b3885
--- /dev/null
+++ b/openhands/events/action/terminus_2.py
@@ -0,0 +1,41 @@
+"""Terminus-2 action classes for keystroke-based terminal interaction.
+
+Terminus-2 sends raw keystrokes to a terminal session (tmux-style) and captures
+the resulting screen state, rather than running commands and collecting stdout.
+"""
+
+from dataclasses import dataclass
+from typing import ClassVar
+
+from openhands.core.schema import ActionType
+from openhands.events.action.action import Action, ActionSecurityRisk
+
+
+@dataclass
+class Terminus2CmdRunAction(Action):
+    """Action that sends keystrokes to a terminal session (tmux-style).
+
+    End a command with '\\n' so that it executes. Two special key names
+    follow tmux conventions:
+      - C-c for Ctrl+C
+      - C-d for Ctrl+D
+
+    Attributes:
+        keystrokes: The exact keystrokes to send to the terminal.
+        duration: Seconds to wait before the output is captured
+            (default 1.0); the runtime caps it at 60.
+    """
+
+    keystrokes: str
+    duration: float = 1.0
+    thought: str = ''
+    action: str = ActionType.TERMINUS_2_CMD_RUN
+    runnable: ClassVar[bool] = True
+    security_risk: ActionSecurityRisk = ActionSecurityRisk.UNKNOWN
+
+    @property
+    def message(self) -> str:
+        """One-line summary: newlines escaped, keystrokes clipped to 60 chars."""
+        shown = self.keystrokes.replace('\n', '\\n')
+        shown = shown if len(shown) <= 60 else f'{shown[:57]}...'
+        return f'Sending keystrokes: {shown} (wait {self.duration}s)'
diff --git a/openhands/events/observation/__init__.py b/openhands/events/observation/__init__.py
index c5ba04b1dbf0..7b2bc0220dff 100644
--- a/openhands/events/observation/__init__.py
+++ b/openhands/events/observation/__init__.py
@@ -36,6 +36,7 @@
     CodexApplyPatchObservation,
     CodexUpdatePlanObservation,
 )
+from openhands.events.observation.terminus_2 import Terminus2CmdOutputObservation
 from openhands.events.observation.reject import UserRejectObservation
 from openhands.events.observation.success import SuccessObservation
 from openhands.events.observation.task_tracking import TaskTrackingObservation
@@ -71,4 +72,6 @@
     # Codex-style observations
     'CodexApplyPatchObservation',
     'CodexUpdatePlanObservation',
+    # Terminus-2-style observations
+    'Terminus2CmdOutputObservation',
 ]
diff --git a/openhands/events/observation/terminus_2.py b/openhands/events/observation/terminus_2.py
new file mode 100644
index 000000000000..243d724e0724
--- /dev/null
+++ b/openhands/events/observation/terminus_2.py
@@ -0,0 +1,33 @@
+"""Terminus-2 observation classes for terminal screen capture output."""
+
+from dataclasses import dataclass
+
+from openhands.core.schema import ObservationType
+from openhands.events.observation.observation import Observation
+
+
+@dataclass
+class Terminus2CmdOutputObservation(Observation):
+    """Terminal state captured after a Terminus-2 keystroke action ran.
+
+    Where CmdOutputObservation carries stdout/stderr, this observation holds
+    the captured terminal content, following tmux-style screen semantics.
+
+    Attributes:
+        terminal_state: Terminal content captured after execution.
+        timed_out: True when the command timed out before completing.
+        command_keystrokes: The keystrokes that were sent, kept for reference.
+    """
+
+    terminal_state: str = ''
+    timed_out: bool = False
+    command_keystrokes: str = ''
+    observation: str = ObservationType.TERMINUS_2_CMD_OUTPUT
+
+    @property
+    def message(self) -> str:
+        """One-line summary naming the (clipped) keystrokes and any timeout."""
+        shown = self.command_keystrokes.replace('\n', '\\n')
+        shown = shown if len(shown) <= 60 else f'{shown[:57]}...'
+        timeout_note = ' (timed out)' if self.timed_out else ''
+        return f'Terminal output after: {shown}{timeout_note}'
diff --git a/openhands/events/serialization/action.py b/openhands/events/serialization/action.py
index b0df3d7601ce..b5a8c4f22566 100644
--- a/openhands/events/serialization/action.py
+++ b/openhands/events/serialization/action.py
@@ -45,6 +45,7 @@
     CodexReadFileAction,
     CodexUpdatePlanAction,
 )
+from openhands.events.action.terminus_2 import Terminus2CmdRunAction
 
 actions = (
     NullAction,
@@ -84,6 +85,8 @@
     CodexGrepFilesAction,
     CodexApplyPatchAction,
     CodexUpdatePlanAction,
+    # Terminus-2-style actions
+    Terminus2CmdRunAction,
 )
 
 ACTION_TYPE_TO_CLASS = {action_class.action: action_class for action_class in actions}  # type: ignore[attr-defined]
diff --git a/openhands/events/serialization/observation.py b/openhands/events/serialization/observation.py
index f97bde682c14..62d354d2895c 100644
--- a/openhands/events/serialization/observation.py
+++ b/openhands/events/serialization/observation.py
@@ -39,6 +39,7 @@
     CodexApplyPatchObservation,
     CodexUpdatePlanObservation,
 )
+from openhands.events.observation.terminus_2 import Terminus2CmdOutputObservation
 from openhands.events.observation.reject import UserRejectObservation
 from openhands.events.observation.success import SuccessObservation
 from openhands.events.observation.task_tracking import TaskTrackingObservation
@@ -70,6 +71,8 @@
     # Codex-style observations
     CodexApplyPatchObservation,
     CodexUpdatePlanObservation,
+    # Terminus-2-style observations
+    Terminus2CmdOutputObservation,
 )
 
 OBSERVATION_TYPE_TO_CLASS = {
diff --git a/openhands/memory/conversation_memory.py b/openhands/memory/conversation_memory.py
index 6031840cee01..85f69ae58b12 100644
--- a/openhands/memory/conversation_memory.py
+++ b/openhands/memory/conversation_memory.py
@@ -39,6 +39,7 @@
     CodexReadFileAction,
     CodexUpdatePlanAction,
 )
+from openhands.events.action.terminus_2 import Terminus2CmdRunAction
 from openhands.events.action.message import SystemMessageAction
 from openhands.events.event import Event, RecallType
 from openhands.events.observation import (
@@ -64,6 +65,7 @@
     CodexApplyPatchObservation,
     CodexUpdatePlanObservation,
 )
+from openhands.events.observation.terminus_2 import Terminus2CmdOutputObservation
 from openhands.events.observation.agent import (
     MicroagentKnowledge,
     RecallObservation,
@@ -275,6 +277,8 @@ def _process_action(
                 CodexGrepFilesAction,
                 CodexApplyPatchAction,
                 CodexUpdatePlanAction,
+                # Terminus-2-style actions
+                Terminus2CmdRunAction,
             ),
         ) or (isinstance(action, CmdRunAction) and action.source == 'agent'):
             tool_metadata = action.tool_call_metadata
@@ -621,6 +625,11 @@ def _process_observation(
         elif isinstance(obs, CodexUpdatePlanObservation):
             text = truncate_content(obs.content, max_message_chars)
             message = Message(role='user', content=[TextContent(text=text)])
+        elif isinstance(obs, Terminus2CmdOutputObservation):
+            text = truncate_content(
+                obs.terminal_state or obs.content, max_message_chars
+            )
+            message = Message(role='user', content=[TextContent(text=text)])
         elif isinstance(obs, LoopDetectionObservation):
             # LoopRecovery should not be observed by llm, handled internally.
             return []
diff --git a/openhands/runtime/action_execution_server.py b/openhands/runtime/action_execution_server.py
index b3e42b7f11ae..595fa741f906 100644
--- a/openhands/runtime/action_execution_server.py
+++ b/openhands/runtime/action_execution_server.py
@@ -70,6 +70,7 @@
     CodexReadFileAction,
     CodexUpdatePlanAction,
 )
+from openhands.events.action.terminus_2 import Terminus2CmdRunAction
 from openhands.events.event import FileEditSource, FileReadSource
 from openhands.events.observation import (
     CmdOutputObservation,
@@ -91,6 +92,7 @@
     CodexApplyPatchObservation,
     CodexUpdatePlanObservation,
 )
+from openhands.events.observation.terminus_2 import Terminus2CmdOutputObservation
 from openhands.events.serialization import event_from_dict, event_to_dict
 from openhands.runtime.browser import browse
 from openhands.runtime.browser.browser_env import BrowserEnv
@@ -2219,6 +2221,71 @@ async def codex_update_plan(self, action: CodexUpdatePlanAction) -> Observation:
             logger.exception(f'Error updating plan: {e}')
             return ErrorObservation(f'Failed to update plan: {str(e)}')
 
+    async def terminus_2_cmd_run(
+        self, action: Terminus2CmdRunAction
+    ) -> Terminus2CmdOutputObservation | ErrorObservation:
+        """Execute a Terminus-2 keystroke action through the bash session.
+
+        Dispatch on the keystrokes (surrounding whitespace ignored):
+          - 'C-c' / 'C-d' are passed to the bash session as that special key.
+          - Blank keystrokes only wait for the requested duration.
+          - Anything else runs as a command, trailing newlines stripped.
+
+        Args:
+            action: Carries the keystrokes and the wait duration in seconds,
+                which is capped at 60 here.
+
+        Returns:
+            A Terminus2CmdOutputObservation whose ``content`` and
+            ``terminal_state`` both hold the session output, or an
+            ErrorObservation if the session is missing or execution raises.
+        """
+        try:
+            bash_session = self.bash_session
+            if bash_session is None:
+                # Explicit check rather than ``assert``: an AssertionError
+                # would reach the caller as an empty error message.
+                return ErrorObservation('Bash session is not initialized')
+
+            keystrokes = action.keystrokes
+            stripped = keystrokes.strip()
+            duration = min(action.duration, 60)
+
+            def observe(text: str, timed_out: bool = False):
+                # One constructor for every branch keeps the fields in sync.
+                return Terminus2CmdOutputObservation(
+                    content=text,
+                    terminal_state=text,
+                    timed_out=timed_out,
+                    command_keystrokes=keystrokes,
+                )
+
+            if not stripped:
+                import asyncio as _asyncio
+
+                await _asyncio.sleep(duration)
+                return observe(f'[waited {duration:.1f}s]')
+
+            if stripped in ('C-c', 'C-d'):
+                cmd_action = CmdRunAction(command=stripped)
+                cmd_action.set_hard_timeout(duration, blocking=False)
+                obs = await call_sync_from_async(bash_session.execute, cmd_action)
+                return observe(obs.content)
+
+            cmd_action = CmdRunAction(command=keystrokes.rstrip('\n'))
+            # The hard timeout leaves 10s of headroom over the requested wait.
+            cmd_action.set_hard_timeout(duration + 10, blocking=False)
+            obs = await call_sync_from_async(bash_session.execute, cmd_action)
+
+            # An exit_code of -1 in the metadata is treated as a timeout.
+            metadata = getattr(obs, 'metadata', None)
+            timed_out = bool(metadata) and getattr(metadata, 'exit_code', 0) == -1
+            return observe(obs.content, timed_out)
+        except Exception as e:
+            # Any failure becomes an ErrorObservation instead of propagating.
+            logger.exception(f'Error executing Terminus-2 keystrokes: {e}')
+            return ErrorObservation(str(e))
+
     async def browse(self, action: BrowseURLAction) -> Observation:
         if self.browser is None:
             return ErrorObservation(
diff --git a/tests/unit/agenthub/test_terminus_2_action_observation.py b/tests/unit/agenthub/test_terminus_2_action_observation.py
new file mode 100644
index 000000000000..599cfb64e74f
--- /dev/null
+++ b/tests/unit/agenthub/test_terminus_2_action_observation.py
@@ -0,0 +1,282 @@
+"""Unit tests for Terminus-2 action and observation serialization/deserialization.
+
+Tests that Terminus-2 actions and observations can be properly serialized to dict
+and deserialized back to action/observation objects.
+"""
+
+import pytest
+
+from openhands.core.schema import ActionType, ObservationType
+from openhands.events.action.terminus_2 import Terminus2CmdRunAction
+from openhands.events.observation.terminus_2 import Terminus2CmdOutputObservation
+from openhands.events.serialization import event_from_dict, event_to_dict
+
+
+# ==============================================================================
+# Terminus2CmdRunAction Serialization Tests
+# ==============================================================================
+
+
+class TestTerminus2CmdRunActionSerialization:
+    """Round-trip, message, and type checks for Terminus2CmdRunAction events."""
+
+    def test_serialize_basic(self):
+        action = Terminus2CmdRunAction(keystrokes='ls -la\n')
+        serialized = event_to_dict(action)
+
+        assert serialized['action'] == ActionType.TERMINUS_2_CMD_RUN
+        assert serialized['args']['keystrokes'] == 'ls -la\n'
+        assert serialized['args']['duration'] == 1.0  # duration not passed above
+
+    def test_serialize_with_duration(self):
+        action = Terminus2CmdRunAction(keystrokes='make\n', duration=30.0)
+        serialized = event_to_dict(action)
+
+        assert serialized['args']['keystrokes'] == 'make\n'
+        assert serialized['args']['duration'] == 30.0
+
+    def test_serialize_with_thought(self):
+        action = Terminus2CmdRunAction(
+            keystrokes='ls\n',
+            duration=0.1,
+            thought='Listing directory contents',
+        )
+        serialized = event_to_dict(action)
+
+        assert serialized['args']['thought'] == 'Listing directory contents'
+
+    def test_serialize_special_keys(self):
+        action = Terminus2CmdRunAction(keystrokes='C-c', duration=0.1)
+        serialized = event_to_dict(action)
+
+        assert serialized['args']['keystrokes'] == 'C-c'
+
+    def test_serialize_empty_keystrokes(self):
+        action = Terminus2CmdRunAction(keystrokes='', duration=10.0)
+        serialized = event_to_dict(action)
+
+        assert serialized['args']['keystrokes'] == ''
+        assert serialized['args']['duration'] == 10.0
+
+    def test_deserialize_basic(self):
+        data = {
+            'id': 1,
+            'action': ActionType.TERMINUS_2_CMD_RUN,
+            'args': {
+                'keystrokes': 'ls -la\n',
+                'duration': 0.1,
+                'thought': '',
+            },
+        }
+        action = event_from_dict(data)
+
+        assert isinstance(action, Terminus2CmdRunAction)
+        assert action.keystrokes == 'ls -la\n'
+        assert action.duration == 0.1
+
+    def test_deserialize_with_all_params(self):
+        data = {
+            'id': 2,
+            'action': ActionType.TERMINUS_2_CMD_RUN,
+            'args': {
+                'keystrokes': 'make build\n',
+                'duration': 30.0,
+                'thought': 'Building project',
+            },
+        }
+        action = event_from_dict(data)
+
+        assert isinstance(action, Terminus2CmdRunAction)
+        assert action.keystrokes == 'make build\n'
+        assert action.duration == 30.0
+        assert action.thought == 'Building project'
+
+    def test_roundtrip_serialization(self):
+        original = Terminus2CmdRunAction(
+            keystrokes='cd /tmp && ls\n',
+            duration=2.5,
+            thought='Navigate and list',
+        )
+        serialized = event_to_dict(original)
+        restored = event_from_dict(serialized)
+
+        assert isinstance(restored, Terminus2CmdRunAction)
+        assert restored.keystrokes == original.keystrokes
+        assert restored.duration == original.duration
+        assert restored.thought == original.thought
+
+    def test_action_type_is_correct(self):
+        action = Terminus2CmdRunAction(keystrokes='test\n')
+        assert action.action == ActionType.TERMINUS_2_CMD_RUN
+        assert action.action == 'terminus_2_cmd_run'
+
+    def test_message_property(self):
+        action = Terminus2CmdRunAction(keystrokes='ls -la\n', duration=0.1)
+        msg = action.message
+        assert 'ls -la' in msg
+        assert '0.1s' in msg
+
+    def test_message_truncation(self):
+        long_cmd = 'a' * 100 + '\n'  # 101 characters of keystrokes
+        action = Terminus2CmdRunAction(keystrokes=long_cmd, duration=1.0)
+        msg = action.message
+        assert '...' in msg
+
+
+# ==============================================================================
+# Terminus2CmdOutputObservation Serialization Tests
+# ==============================================================================
+
+
+class TestTerminus2CmdOutputObservationSerialization:
+    """Serialization, message, and default checks for Terminus2CmdOutputObservation."""
+
+    def test_serialize_basic(self):
+        obs = Terminus2CmdOutputObservation(
+            content='output text',
+            terminal_state='$ ls\nfile1.txt\nfile2.txt\n$',
+        )
+        serialized = event_to_dict(obs)
+
+        assert serialized['observation'] == ObservationType.TERMINUS_2_CMD_OUTPUT
+        assert serialized['content'] == 'output text'
+        assert serialized['extras']['terminal_state'] == '$ ls\nfile1.txt\nfile2.txt\n$'
+
+    def test_serialize_with_timeout(self):
+        obs = Terminus2CmdOutputObservation(
+            content='timed out',
+            terminal_state='$ make\ncompiling...',
+            timed_out=True,
+            command_keystrokes='make\n',
+        )
+        serialized = event_to_dict(obs)
+
+        assert serialized['extras']['timed_out'] is True
+        assert serialized['extras']['command_keystrokes'] == 'make\n'
+
+    def test_deserialize_basic(self):
+        data = {
+            'id': 1,
+            'observation': ObservationType.TERMINUS_2_CMD_OUTPUT,
+            'content': 'terminal output',
+            'extras': {
+                'terminal_state': '$ ls\nfiles...',
+                'timed_out': False,
+                'command_keystrokes': 'ls\n',
+            },
+        }
+        obs = event_from_dict(data)
+
+        assert isinstance(obs, Terminus2CmdOutputObservation)
+        assert obs.terminal_state == '$ ls\nfiles...'
+        assert obs.timed_out is False
+        assert obs.command_keystrokes == 'ls\n'
+
+    def test_deserialize_timed_out(self):
+        data = {
+            'id': 2,
+            'observation': ObservationType.TERMINUS_2_CMD_OUTPUT,
+            'content': 'partial output',
+            'extras': {
+                'terminal_state': 'compiling...',
+                'timed_out': True,
+                'command_keystrokes': 'make\n',
+            },
+        }
+        obs = event_from_dict(data)
+
+        assert isinstance(obs, Terminus2CmdOutputObservation)
+        assert obs.timed_out is True
+
+    def test_roundtrip_serialization(self):
+        original = Terminus2CmdOutputObservation(
+            content='full output',
+            terminal_state='$ ls -la\ntotal 8\nfile1.txt\n$',
+            timed_out=False,
+            command_keystrokes='ls -la\n',
+        )
+        serialized = event_to_dict(original)
+        restored = event_from_dict(serialized)
+
+        assert isinstance(restored, Terminus2CmdOutputObservation)
+        assert restored.terminal_state == original.terminal_state
+        assert restored.timed_out == original.timed_out
+        assert restored.command_keystrokes == original.command_keystrokes
+        assert restored.content == original.content
+
+    def test_observation_type_is_correct(self):
+        obs = Terminus2CmdOutputObservation(content='test')
+        assert obs.observation == ObservationType.TERMINUS_2_CMD_OUTPUT
+        assert obs.observation == 'terminus_2_cmd_output'
+
+    def test_message_property(self):
+        obs = Terminus2CmdOutputObservation(
+            content='output',
+            command_keystrokes='ls\n',
+        )
+        msg = obs.message
+        assert 'ls' in msg
+
+    def test_message_with_timeout(self):
+        obs = Terminus2CmdOutputObservation(
+            content='output',
+            command_keystrokes='make\n',
+            timed_out=True,
+        )
+        msg = obs.message
+        assert 'timed out' in msg
+
+    def test_default_values(self):
+        obs = Terminus2CmdOutputObservation(content='test')  # only content given
+        assert obs.terminal_state == ''
+        assert obs.timed_out is False
+        assert obs.command_keystrokes == ''
+
+
+# ==============================================================================
+# Schema Type Tests
+# ==============================================================================
+
+
+class TestSchemaTypes:
+    """Tests that the schema enums and serialization maps know the Terminus-2 types."""
+
+    def test_action_type_exists(self):
+        assert hasattr(ActionType, 'TERMINUS_2_CMD_RUN')
+        assert ActionType.TERMINUS_2_CMD_RUN == 'terminus_2_cmd_run'
+
+    def test_observation_type_exists(self):
+        assert hasattr(ObservationType, 'TERMINUS_2_CMD_OUTPUT')
+        assert ObservationType.TERMINUS_2_CMD_OUTPUT == 'terminus_2_cmd_output'
+
+    def test_action_type_in_serialization_map(self):
+        from openhands.events.serialization.action import ACTION_TYPE_TO_CLASS
+        assert 'terminus_2_cmd_run' in ACTION_TYPE_TO_CLASS
+        assert ACTION_TYPE_TO_CLASS['terminus_2_cmd_run'] == Terminus2CmdRunAction
+
+    def test_observation_type_in_serialization_map(self):
+        from openhands.events.serialization.observation import OBSERVATION_TYPE_TO_CLASS
+        assert 'terminus_2_cmd_output' in OBSERVATION_TYPE_TO_CLASS
+        assert OBSERVATION_TYPE_TO_CLASS['terminus_2_cmd_output'] == Terminus2CmdOutputObservation
+
+
+# ==============================================================================
+# Import Tests
+# ==============================================================================
+
+
+class TestImports:
+    """Tests that the new types are importable and the agent is registered."""
+
+    def test_import_action_from_events(self):
+        from openhands.events.action import Terminus2CmdRunAction
+        assert Terminus2CmdRunAction is not None
+
+    def test_import_observation_from_events(self):
+        from openhands.events.observation import Terminus2CmdOutputObservation
+        assert Terminus2CmdOutputObservation is not None
+
+    def test_agent_registration(self):
+        from openhands.controller.agent import Agent
+        import openhands.agenthub.terminus_2_agent  # noqa: F401
+        assert 'Terminus2Agent' in Agent._registry
diff --git a/tests/unit/agenthub/test_terminus_2_agent.py b/tests/unit/agenthub/test_terminus_2_agent.py
new file mode 100644
index 000000000000..077010ec44c5
--- /dev/null
+++ b/tests/unit/agenthub/test_terminus_2_agent.py
@@ -0,0 +1,330 @@
+"""Unit tests for the Terminus-2 Agent.
+
+Tests the Terminus2Agent's message building, output truncation,
+and core agent logic.
+"""
+
+import pytest
+
+from openhands.agenthub.terminus_2_agent.terminus_2_agent import (
+    COMPLETION_CONFIRMATION,
+    TIMEOUT_TEMPLATE,
+    Terminus2Agent,
+)
+from openhands.agenthub.terminus_2_agent.terminus_json_plain_parser import (
+    ParsedCommand,
+    TerminusJSONPlainParser,
+)
+from openhands.events.action import AgentFinishAction, MessageAction
+from openhands.events.action.terminus_2 import Terminus2CmdRunAction
+from openhands.events.event import EventSource
+from openhands.events.observation.terminus_2 import Terminus2CmdOutputObservation
+
+
+# ==============================================================================
+# Output Truncation Tests
+# ==============================================================================
+
+
+class TestOutputTruncation:
+    """Tests for Terminus2Agent._limit_output_length (byte-limited truncation)."""
+
+    def test_short_output_not_truncated(self):
+        output = 'short output'
+        result = Terminus2Agent._limit_output_length(output, max_bytes=10000)
+        assert result == output
+
+    def test_long_output_truncated(self):
+        output = 'x' * 20000
+        result = Terminus2Agent._limit_output_length(output, max_bytes=10000)
+        assert len(result.encode('utf-8')) < len(output.encode('utf-8'))
+        assert 'output limited to 10000 bytes' in result
+        assert 'interior bytes omitted' in result
+
+    def test_exact_limit_not_truncated(self):
+        output = 'x' * 10000  # exactly max_bytes when encoded (ASCII)
+        result = Terminus2Agent._limit_output_length(output, max_bytes=10000)
+        assert result == output
+
+    def test_truncation_preserves_start_and_end(self):
+        output = 'START' + 'x' * 20000 + 'END'
+        result = Terminus2Agent._limit_output_length(output, max_bytes=1000)
+        assert result.startswith('START')
+        assert result.endswith('END')
+
+    def test_unicode_truncation(self):
+        output = '\u00e9' * 10000  # Each char is 2 bytes in UTF-8
+        result = Terminus2Agent._limit_output_length(output, max_bytes=5000)
+        assert 'output limited to 5000 bytes' in result
+
+    def test_custom_max_bytes(self):
+        output = 'a' * 500
+        result = Terminus2Agent._limit_output_length(output, max_bytes=200)
+        assert 'output limited to 200 bytes' in result
+
+    def test_empty_output(self):
+        result = Terminus2Agent._limit_output_length('', max_bytes=10000)
+        assert result == ''
+
+
+# ==============================================================================
+# Template Tests
+# ==============================================================================
+
+
+class TestTemplates:
+    """Tests for the TIMEOUT_TEMPLATE and COMPLETION_CONFIRMATION format strings."""
+
+    def test_timeout_template_formatting(self):
+        result = TIMEOUT_TEMPLATE.format(
+            command='make build\n',
+            timeout_sec=30,
+            terminal_state='$ make build\ncompiling...',
+        )
+        assert 'make build' in result
+        assert '30 seconds' in result
+        assert 'compiling...' in result
+        assert 'timed out' in result
+
+    def test_completion_confirmation_formatting(self):
+        result = COMPLETION_CONFIRMATION.format(
+            terminal_state='$ echo done\ndone\n$',
+        )
+        assert 'echo done' in result
+        assert 'task_complete' in result
+        assert 'graded' in result
+
+
+# ==============================================================================
+# Parser Integration Tests
+# ==============================================================================
+
+
+class TestParserIntegration:
+    """Parses representative LLM responses: commands, completion, wait, and Ctrl+C."""
+
+    @pytest.fixture
+    def parser(self):
+        return TerminusJSONPlainParser()
+
+    def test_typical_response_flow(self, parser):
+        """Simulate a typical multi-command response."""
+        response = '''{
+            "analysis": "I need to set up the project. The directory is empty.",
+            "plan": "1. Create a project directory. 2. Initialize it.",
+            "commands": [
+                {"keystrokes": "mkdir myproject\\n", "duration": 0.1},
+                {"keystrokes": "cd myproject\\n", "duration": 0.1},
+                {"keystrokes": "git init\\n", "duration": 1.0}
+            ]
+        }'''
+        result = parser.parse_response(response)
+
+        assert result.error == ''
+        assert len(result.commands) == 3
+        assert result.commands[0].keystrokes == 'mkdir myproject\n'
+        assert result.commands[1].keystrokes == 'cd myproject\n'
+        assert result.commands[2].keystrokes == 'git init\n'
+        assert result.commands[2].duration == 1.0
+        assert result.is_task_complete is False
+
+    def test_completion_response(self, parser):
+        """Simulate a task completion response."""
+        response = '''{
+            "analysis": "All tests pass. The implementation is complete.",
+            "plan": "Mark the task as complete.",
+            "commands": [],
+            "task_complete": true
+        }'''
+        result = parser.parse_response(response)
+
+        assert result.error == ''
+        assert result.is_task_complete is True
+        assert len(result.commands) == 0
+
+    def test_wait_response(self, parser):
+        """Simulate a wait-for-output response."""
+        response = '''{
+            "analysis": "The build is still running.",
+            "plan": "Wait for the build to finish.",
+            "commands": [
+                {"keystrokes": "", "duration": 10.0}
+            ]
+        }'''
+        result = parser.parse_response(response)
+
+        assert result.error == ''
+        assert len(result.commands) == 1
+        assert result.commands[0].keystrokes == ''
+        assert result.commands[0].duration == 10.0
+
+    def test_ctrl_c_response(self, parser):
+        """Simulate sending Ctrl+C to cancel a running process."""
+        response = '''{
+            "analysis": "The process appears to be stuck.",
+            "plan": "Send Ctrl+C to cancel it.",
+            "commands": [
+                {"keystrokes": "C-c", "duration": 0.1}
+            ]
+        }'''
+        result = parser.parse_response(response)
+
+        assert result.error == ''
+        assert len(result.commands) == 1
+        assert result.commands[0].keystrokes == 'C-c'
+
+
+# ==============================================================================
+# Message Building Tests
+# ==============================================================================
+
+
+class TestMessageBuilding:
+    """Message-building checks that replay the logic locally, not via the agent."""
+
+    def test_find_initial_user_message(self):
+        """Test extraction of initial user message from events."""
+        msg = MessageAction(content='Fix the bug in module X')
+        msg._source = EventSource.USER
+        # Collect matches first so the test cannot pass vacuously on no match.
+        found = [
+            event.content
+            for event in [msg]
+            if isinstance(event, MessageAction) and event.source == EventSource.USER
+        ]
+        assert found == ['Fix the bug in module X']
+
+    def test_terminus_2_action_batch_creation(self):
+        """Test that commands are correctly converted to actions."""
+        commands = [
+            ParsedCommand(keystrokes='ls -la\n', duration=0.1),
+            ParsedCommand(keystrokes='cat file.txt\n', duration=0.5),
+        ]
+
+        actions = [
+            Terminus2CmdRunAction(
+                keystrokes=cmd.keystrokes,
+                duration=min(cmd.duration, 60),
+            )
+            for cmd in commands
+        ]
+
+        assert len(actions) == 2
+        assert actions[0].keystrokes == 'ls -la\n'
+        assert actions[0].duration == 0.1
+        assert actions[1].keystrokes == 'cat file.txt\n'
+        assert actions[1].duration == 0.5
+
+    def test_duration_capped_at_60(self):
+        """Test that duration is capped at 60 seconds."""
+        cmd = ParsedCommand(keystrokes='sleep 100\n', duration=100.0)
+        action = Terminus2CmdRunAction(
+            keystrokes=cmd.keystrokes,
+            duration=min(cmd.duration, 60),
+        )
+        assert action.duration == 60
+
+
+# ==============================================================================
+# Double Confirmation Tests
+# ==============================================================================
+
+
+class TestDoubleConfirmation:
+    """Tests for the double-confirmation task completion logic.
+
+    The full agent can't be instantiated without an LLM registry, so these
+    tests exercise ``_step``, a local model of the logic pattern, rather than
+    ``Terminus2Agent`` itself.
+    """
+
+    @staticmethod
+    def _step(pending, is_task_complete):
+        """Apply one response to the modelled confirmation state.
+
+        Args:
+            pending: Whether a completion confirmation is already outstanding.
+            is_task_complete: Whether the response set ``task_complete``.
+
+        Returns:
+            A ``(pending, action)`` tuple; ``action`` is ``'finish'``,
+            ``'confirm'`` or ``'continue'``.
+        """
+        if not is_task_complete:
+            # Any non-complete response cancels an outstanding confirmation.
+            return False, 'continue'
+        if pending:
+            # Second consecutive completion claim: finish.
+            return True, 'finish'
+        # First completion claim: ask for confirmation.
+        return True, 'confirm'
+
+    def test_pending_completion_flag_initial(self):
+        """Verify the first task_complete only requests confirmation."""
+        pending, action = self._step(pending=False, is_task_complete=True)
+
+        assert pending is True
+        assert action == 'confirm'
+
+    def test_pending_completion_second_time(self):
+        """Verify second task_complete triggers finish."""
+        # pending=True: already set from the first confirmation.
+        _, action = self._step(pending=True, is_task_complete=True)
+
+        assert action == 'finish'
+
+    def test_pending_completion_reset_on_not_complete(self):
+        """Verify pending is reset when task_complete is False."""
+        pending, action = self._step(pending=True, is_task_complete=False)
+
+        assert pending is False
+        assert action == 'continue'
+
+    def test_confirmation_must_be_consecutive(self):
+        """Verify an intervening non-complete response restarts confirmation."""
+        pending, actions = False, []
+        for is_task_complete in (True, False, True, True):
+            pending, action = self._step(pending, is_task_complete)
+            actions.append(action)
+
+        assert actions == ['confirm', 'continue', 'confirm', 'finish']
+
+
+# ==============================================================================
+# Observation Handling Tests
+# ==============================================================================
+
+
+class TestObservationHandling:
+    """Field-access and timeout-message checks for Terminus2CmdOutputObservation."""
+
+    def test_observation_content_extraction(self):
+        obs = Terminus2CmdOutputObservation(
+            content='$ ls\nfile1.txt\nfile2.txt\n$',
+            terminal_state='$ ls\nfile1.txt\nfile2.txt\n$',
+            timed_out=False,
+            command_keystrokes='ls\n',
+        )
+
+        assert obs.terminal_state == '$ ls\nfile1.txt\nfile2.txt\n$'
+        assert obs.timed_out is False
+        assert obs.command_keystrokes == 'ls\n'
+
+    def test_timed_out_observation(self):
+        obs = Terminus2CmdOutputObservation(
+            content='partial output...',
+            terminal_state='partial output...',
+            timed_out=True,
+            command_keystrokes='make\n',
+        )
+
+        assert obs.timed_out is True
+        assert 'timed out' in obs.message
+
+    def test_observation_with_empty_terminal_state(self):
+        obs = Terminus2CmdOutputObservation(
+            content='',
+            terminal_state='',
+        )
+        assert obs.terminal_state == ''
+        assert obs.content == ''
diff --git a/tests/unit/agenthub/test_terminus_2_parser.py b/tests/unit/agenthub/test_terminus_2_parser.py
new file mode 100644
index 000000000000..37bfab637e85
--- /dev/null
+++ b/tests/unit/agenthub/test_terminus_2_parser.py
@@ -0,0 +1,416 @@
+"""Unit tests for Terminus-2 JSON plain parser.
+
+Tests the TerminusJSONPlainParser for parsing LLM responses into
+structured commands, including auto-correction and validation.
+"""
+
+import pytest
+
+from openhands.agenthub.terminus_2_agent.terminus_json_plain_parser import (
+    ParsedCommand,
+    ParseResult,
+    TerminusJSONPlainParser,
+)
+
+
+@pytest.fixture
+def parser():
+    return TerminusJSONPlainParser()  # default fixture scope: built per test
+
+
+# ==============================================================================
+# Basic Parsing Tests
+# ==============================================================================
+
+
+class TestBasicParsing:
+    """Tests for parsing well-formed JSON responses and the task_complete flag."""
+
+    def test_parse_valid_response(self, parser):
+        response = '''{
+            "analysis": "Looking at the directory",
+            "plan": "List files and check structure",
+            "commands": [
+                {"keystrokes": "ls -la\\n", "duration": 0.1}
+            ]
+        }'''
+        result = parser.parse_response(response)
+
+        assert result.error == ''
+        assert len(result.commands) == 1
+        assert result.commands[0].keystrokes == 'ls -la\n'
+        assert result.commands[0].duration == 0.1
+        assert result.is_task_complete is False
+
+    def test_parse_multiple_commands(self, parser):
+        response = '''{
+            "analysis": "Need to navigate and list",
+            "plan": "cd then ls",
+            "commands": [
+                {"keystrokes": "cd /tmp\\n", "duration": 0.1},
+                {"keystrokes": "ls -la\\n", "duration": 0.1}
+            ]
+        }'''
+        result = parser.parse_response(response)
+
+        assert result.error == ''
+        assert len(result.commands) == 2
+        assert result.commands[0].keystrokes == 'cd /tmp\n'
+        assert result.commands[1].keystrokes == 'ls -la\n'
+
+    def test_parse_task_complete(self, parser):
+        response = '''{
+            "analysis": "Task is done",
+            "plan": "Mark complete",
+            "commands": [],
+            "task_complete": true
+        }'''
+        result = parser.parse_response(response)
+
+        assert result.error == ''
+        assert result.is_task_complete is True
+        assert len(result.commands) == 0
+
+    def test_parse_task_complete_string(self, parser):
+        response = '''{
+            "analysis": "Done",
+            "plan": "Finish",
+            "commands": [],
+            "task_complete": "true"
+        }'''
+        result = parser.parse_response(response)
+
+        assert result.error == ''
+        assert result.is_task_complete is True  # the string "true" also counts
+
+    def test_parse_task_not_complete(self, parser):
+        response = '''{
+            "analysis": "Still working",
+            "plan": "Continue",
+            "commands": [],
+            "task_complete": false
+        }'''
+        result = parser.parse_response(response)
+
+        assert result.error == ''
+        assert result.is_task_complete is False
+
+    def test_parse_no_task_complete_field(self, parser):
+        """task_complete defaults to False when not present."""
+        response = '''{
+            "analysis": "Working",
+            "plan": "Continue",
+            "commands": []
+        }'''
+        result = parser.parse_response(response)
+
+        assert result.error == ''
+        assert result.is_task_complete is False
+
+    def test_parse_empty_commands(self, parser):
+        response = '''{
+            "analysis": "Waiting for output",
+            "plan": "Do nothing",
+            "commands": []
+        }'''
+        result = parser.parse_response(response)
+
+        assert result.error == ''
+        assert len(result.commands) == 0
+
+
+# ==============================================================================
+# Error Handling Tests
+# ==============================================================================
+
+
+class TestErrorHandling:
+    """Tests that malformed or incomplete responses are reported via result.error."""
+
+    def test_no_json_found(self, parser):
+        result = parser.parse_response('This is not JSON at all')
+        assert 'No valid JSON found' in result.error
+
+    def test_invalid_json(self, parser):
+        result = parser.parse_response('{"analysis": "test", "plan": broken}')
+        assert result.error != ''
+
+    def test_missing_required_field_analysis(self, parser):
+        response = '''{
+            "plan": "Do something",
+            "commands": []
+        }'''
+        result = parser.parse_response(response)
+        assert 'Missing required fields' in result.error
+        assert 'analysis' in result.error
+
+    def test_missing_required_field_plan(self, parser):
+        response = '''{
+            "analysis": "Something",
+            "commands": []
+        }'''
+        result = parser.parse_response(response)
+        assert 'Missing required fields' in result.error
+        assert 'plan' in result.error
+
+    def test_missing_required_field_commands(self, parser):
+        response = '''{
+            "analysis": "Something",
+            "plan": "Do it"
+        }'''
+        result = parser.parse_response(response)
+        assert 'Missing required fields' in result.error
+        assert 'commands' in result.error
+
+    def test_commands_not_array(self, parser):
+        response = '''{
+            "analysis": "Something",
+            "plan": "Do it",
+            "commands": "not an array"
+        }'''
+        result = parser.parse_response(response)
+        assert "must be an array" in result.error
+
+    def test_command_not_object(self, parser):
+        response = '''{
+            "analysis": "Something",
+            "plan": "Do it",
+            "commands": ["not an object"]
+        }'''
+        result = parser.parse_response(response)
+        assert 'must be an object' in result.error
+
+    def test_command_missing_keystrokes(self, parser):
+        response = '''{
+            "analysis": "Something",
+            "plan": "Do it",
+            "commands": [{"duration": 1.0}]
+        }'''
+        result = parser.parse_response(response)
+        assert "missing required 'keystrokes' field" in result.error
+
+    def test_command_keystrokes_not_string(self, parser):
+        response = '''{
+            "analysis": "Something",
+            "plan": "Do it",
+            "commands": [{"keystrokes": 123}]
+        }'''
+        result = parser.parse_response(response)
+        assert "'keystrokes' must be a string" in result.error
+
+    def test_not_json_object(self, parser):
+        response = '["not", "an", "object"]'  # valid JSON, but a top-level array
+        assert parser.parse_response(response).error != ''
+
+
+# ==============================================================================
+# Warning Tests
+# ==============================================================================
+
+
+class TestWarnings:
+    """Tests that recoverable issues produce warnings without setting an error."""
+
+    def test_extra_text_before_json(self, parser):
+        response = 'Here is my response:\n{"analysis": "a", "plan": "b", "commands": []}'
+        result = parser.parse_response(response)
+
+        assert result.error == ''
+        assert 'Extra text detected before JSON' in result.warning
+
+    def test_extra_text_after_json(self, parser):
+        response = '{"analysis": "a", "plan": "b", "commands": []}\nDone!'
+        result = parser.parse_response(response)
+
+        assert result.error == ''
+        assert 'Extra text detected after JSON' in result.warning
+
+    def test_missing_duration_warning(self, parser):
+        response = '''{
+            "analysis": "a",
+            "plan": "b",
+            "commands": [{"keystrokes": "ls\\n"}]
+        }'''
+        result = parser.parse_response(response)
+
+        assert result.error == ''
+        assert 'Missing duration field' in result.warning
+        assert result.commands[0].duration == 1.0  # falls back to 1.0
+
+    def test_invalid_duration_type_warning(self, parser):
+        response = '''{
+            "analysis": "a",
+            "plan": "b",
+            "commands": [{"keystrokes": "ls\\n", "duration": "fast"}]
+        }'''
+        result = parser.parse_response(response)
+
+        assert result.error == ''
+        assert 'Invalid duration value' in result.warning
+        assert result.commands[0].duration == 1.0  # falls back to 1.0
+
+    def test_unknown_fields_warning(self, parser):
+        response = '''{
+            "analysis": "a",
+            "plan": "b",
+            "commands": [{"keystrokes": "ls\\n", "duration": 0.1, "extra_field": "x"}]
+        }'''
+        result = parser.parse_response(response)
+
+        assert result.error == ''
+        assert 'Unknown fields' in result.warning
+
+    def test_no_newline_between_commands_warning(self, parser):
+        response = '''{
+            "analysis": "a",
+            "plan": "b",
+            "commands": [
+                {"keystrokes": "echo hello", "duration": 0.1},
+                {"keystrokes": "ls\\n", "duration": 0.1}
+            ]
+        }'''
+        result = parser.parse_response(response)
+
+        assert result.error == ''
+        assert 'should end with newline' in result.warning
+
+    def test_wrong_field_order_warning(self, parser):
+        response = '''{
+            "commands": [],
+            "analysis": "a",
+            "plan": "b"
+        }'''
+        result = parser.parse_response(response)
+
+        assert result.error == ''
+        assert 'wrong order' in result.warning
+
+
+# ==============================================================================
+# Auto-Fix Tests
+# ==============================================================================
+
+
+class TestAutoFixes:
+    """Tests for auto-correction of malformed responses."""
+
+    def test_fix_incomplete_json(self, parser):
+        response = '{"analysis": "a", "plan": "b", "commands": [{"keystrokes": "ls\\n", "duration": 0.1}]'
+        result = parser.parse_response(response)
+
+        assert result.error == ''
+        assert 'AUTO-CORRECTED' in result.warning
+        assert len(result.commands) == 1
+
+    def test_fix_mixed_content(self, parser):
+        response = 'Here is my analysis:\n{"analysis": "a", "plan": "b", "commands": []}\nEnd of response'
+        result = parser.parse_response(response)
+
+        assert result.error == ''
+
+    def test_deeply_incomplete_json(self, parser):
+        response = '{"analysis": "a", "plan": "b", "commands": [{"keystrokes": "ls\\n"'
+        result = parser.parse_response(response)
+        # May or may not fix - just ensure no crash
+        assert isinstance(result, ParseResult)
+
+
+# ==============================================================================
+# Edge Cases
+# ==============================================================================
+
+
+class TestEdgeCases:
+    """Tests for edge cases and special scenarios."""
+
+    def test_special_characters_in_keystrokes(self, parser):
+        response = '''{
+            "analysis": "a",
+            "plan": "b",
+            "commands": [{"keystrokes": "echo \\"hello world\\"\\n", "duration": 0.1}]
+        }'''
+        result = parser.parse_response(response)
+
+        assert result.error == ''
+        assert result.commands[0].keystrokes == 'echo "hello world"\n'
+
+    def test_ctrl_c_keystrokes(self, parser):
+        response = '''{
+            "analysis": "a",
+            "plan": "b",
+            "commands": [{"keystrokes": "C-c", "duration": 0.1}]
+        }'''
+        result = parser.parse_response(response)
+
+        assert result.error == ''
+        assert result.commands[0].keystrokes == 'C-c'
+
+    def test_empty_keystrokes(self, parser):
+        response = '''{
+            "analysis": "a",
+            "plan": "b",
+            "commands": [{"keystrokes": "", "duration": 10.0}]
+        }'''
+        result = parser.parse_response(response)
+
+        assert result.error == ''
+        assert result.commands[0].keystrokes == ''
+        assert result.commands[0].duration == 10.0
+
+    def test_empty_response(self, parser):
+        result = parser.parse_response('')
+        assert result.error != ''
+
+    def test_task_complete_with_parse_error_becomes_warning(self, parser):
+        """When task_complete is true, command parse errors become warnings."""
+        response = '''{
+            "analysis": "Done",
+            "plan": "Finish",
+            "commands": [{"not_keystrokes": "x"}],
+            "task_complete": true
+        }'''
+        result = parser.parse_response(response)
+
+        assert result.error == ''
+        assert result.is_task_complete is True
+        assert len(result.commands) == 0
+
+    def test_large_number_of_commands(self, parser):
+        commands = ', '.join(
+            [f'{{"keystrokes": "echo {i}\\n", "duration": 0.1}}' for i in range(50)]
+        )
+        response = f'{{"analysis": "a", "plan": "b", "commands": [{commands}]}}'
+        result = parser.parse_response(response)
+
+        assert result.error == ''
+        assert len(result.commands) == 50
+
+    def test_duration_integer_cast(self, parser):
+        response = '''{
+            "analysis": "a",
+            "plan": "b",
+            "commands": [{"keystrokes": "ls\\n", "duration": 5}]
+        }'''
+        result = parser.parse_response(response)
+
+        assert result.error == ''
+        assert result.commands[0].duration == 5.0
+        assert isinstance(result.commands[0].duration, float)
+
+    def test_json_with_markdown_code_fence(self, parser):
+        """Common LLM mistake: wrapping JSON in code fences."""
+        response = '```json\n{"analysis": "a", "plan": "b", "commands": []}\n```'
+        result = parser.parse_response(response)
+
+        assert result.error == ''
+
+    def test_task_complete_string_variants(self, parser):
+        for value in ['true', 'True', 'TRUE', '1', 'yes', 'Yes']:
+            response = f'{{"analysis": "a", "plan": "b", "commands": [], "task_complete": "{value}"}}'
+            result = parser.parse_response(response)
+            assert result.is_task_complete is True, f'Failed for value: {value}'
+
+        for value in ['false', 'False', '0', 'no']:
+            response = f'{{"analysis": "a", "plan": "b", "commands": [], "task_complete": "{value}"}}'
+            result = parser.parse_response(response)
+            assert result.is_task_complete is False, f'Failed for value: {value}'

From 41c8c1ae3ba33a37485a34e7e1fc180b307f4ca2 Mon Sep 17 00:00:00 2001
From: Sugam Devare <sdevare@nvidia.com>
Date: Thu, 19 Feb 2026 22:05:17 -0800
Subject: [PATCH 2/9] fix: user prompt

---
 openhands/agenthub/terminus_2_agent/README.md | 248 ++++++++++++++++++
 .../terminus_2_agent/prompts/user_prompt.j2   |   4 -
 .../terminus_2_agent/terminus_2_agent.py      |  18 +-
 3 files changed, 260 insertions(+), 10 deletions(-)
 create mode 100644 openhands/agenthub/terminus_2_agent/README.md

diff --git a/openhands/agenthub/terminus_2_agent/README.md b/openhands/agenthub/terminus_2_agent/README.md
new file mode 100644
index 000000000000..db52b05ebf49
--- /dev/null
+++ b/openhands/agenthub/terminus_2_agent/README.md
@@ -0,0 +1,248 @@
+# Terminus-2 Agent
+
+The Terminus-2 Agent is a keystroke-based terminal agent ported from the [terminal-bench](https://github.com/laude-institute/terminal-bench) project. Unlike function-calling agents (CodeAct, OpenCode, Codex), it communicates with the LLM using structured JSON responses and interacts with the terminal by sending raw keystrokes and receiving screen capture output.
+
+## Overview
+
+Terminus-2 is designed around a fundamentally different interaction model than other OpenHands agents:
+
+- **No function calling** -- The LLM outputs a raw JSON object instead of tool calls. A dedicated parser extracts structured commands from the response.
+- **Keystroke-based terminal interaction** -- Instead of running shell commands and collecting stdout/stderr, the agent sends raw keystrokes (including tmux-style special keys like `C-c`, `C-d`) and receives the full terminal screen state back.
+- **Batch command execution** -- Each LLM response can contain multiple commands that are executed sequentially before the next LLM call.
+
+## Architecture
+
+```
+                +-----------+
+                |   LLM     |
+                +-----+-----+
+                      |
+          JSON response (text)
+                      |
+                +-----v-----+
+                |  JSON      |
+                |  Parser    |
+                +-----+-----+
+                      |
+          List[ParsedCommand]
+                      |
+        +-------------v--------------+
+        |     Terminus2Agent         |
+        |  (step loop, confirmation) |
+        +-------------+--------------+
+                      |
+       Terminus2CmdRunAction (per command)
+                      |
+        +-------------v--------------+
+        |  ActionExecutionServer     |
+        |  (keystroke execution)     |
+        +-------------+--------------+
+                      |
+       Terminus2CmdOutputObservation
+                      |
+              (terminal screen state)
+```
+
+### Agent Step Cycle
+
+Each call to `step()`:
+
+1. If there are pending actions queued from a previous LLM call, return the next one.
+2. Otherwise, build a conversation message list from the event history.
+3. Call the LLM, retrying when the response cannot be parsed (up to 3 attempts in total).
+4. Parse the JSON response to extract commands.
+5. Queue a `Terminus2CmdRunAction` for each command.
+6. Return the first action from the queue.
+
+### JSON Response Format
+
+The LLM is expected to respond with a JSON object:
+
+```json
+{
+  "analysis": "What I observe in the terminal output and what has been done so far.",
+  "plan": "My plan for the next steps and what each command will accomplish.",
+  "commands": [
+    {
+      "keystrokes": "ls -la\n",
+      "duration": 0.1
+    },
+    {
+      "keystrokes": "cd project\n",
+      "duration": 0.1
+    }
+  ],
+  "task_complete": false
+}
+```
+
+| Field | Required | Type | Description |
+|-------|----------|------|-------------|
+| `analysis` | Yes | string | Analysis of the current terminal state |
+| `plan` | Yes | string | Plan for the next steps |
+| `commands` | Yes | array | Array of command objects to execute |
+| `task_complete` | No | boolean | Whether the task is finished (default: `false`) |
+
+Each command object:
+
+| Field | Required | Type | Default | Description |
+|-------|----------|------|---------|-------------|
+| `keystrokes` | Yes | string | -- | Exact keystrokes to send to the terminal |
+| `duration` | No | float | 1.0 | Seconds to wait before capturing output (capped at 60) |
+
+### Keystrokes
+
+The `keystrokes` field is sent **verbatim** to the terminal:
+
+- Commands should end with `\n` to execute (e.g., `"ls -la\n"`)
+- Special key sequences use tmux-style escapes:
+  - `C-c` -- Ctrl+C (send SIGINT)
+  - `C-d` -- Ctrl+D (send EOF)
+- Empty keystrokes (`""`) with a duration can be used to poll/wait for output
+- Multiple commands in the same batch are executed sequentially
+
+### Duration Guidelines
+
+| Command type | Recommended duration |
+|-------------|---------------------|
+| Immediate (`cd`, `ls`, `echo`, `cat`) | 0.1s |
+| Standard (`gcc`, `find`, `rustc`) | 1.0s |
+| Slow (`make`, `pip install`, `wget`) | 5.0-30.0s |
+| Polling (wait for output) | 10.0s |
+
+It is better to set a shorter duration and poll again than to set a long one. The maximum allowed duration is 60 seconds.
+
+## Key Features
+
+### JSON Parser with Auto-Correction
+
+The `TerminusJSONPlainParser` handles common LLM formatting mistakes:
+
+- **Incomplete JSON** -- Adds missing closing braces when the response is truncated
+- **Mixed content** -- Extracts JSON from responses that contain extra text before/after
+- **Markdown code fences** -- Handles JSON wrapped in `` ```json ``` `` blocks
+- **Field validation** -- Checks required fields, types, and correct field order
+- **Warnings** -- Non-fatal issues (missing duration, unknown fields, wrong order) are reported as warnings rather than errors
+
+### Double Confirmation for Task Completion
+
+To prevent premature task completion:
+
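+1. First `"task_complete": true` -- If the response also contains commands, they are executed and a confirmation prompt follows: *"Are you sure you want to mark the task as complete?"* (a first `"task_complete": true` with an empty `commands` list finishes the task immediately)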
+2. Second consecutive `"task_complete": true` -- Actually completes the task via `AgentFinishAction`
+3. If the LLM does *not* set `task_complete` after a confirmation prompt, the pending completion is reset.
+
+### Output Truncation
+
+Terminal output is truncated to 10KB to prevent context window overflow. When truncation occurs, the first and last 5KB are preserved with a marker indicating how many bytes were omitted from the middle.
+
+### Timeout Handling
+
+When a command exceeds its duration, the agent sends a timeout message to the LLM explaining that the command may still be running and showing the current terminal state. The LLM can then decide to wait longer (empty keystrokes with a duration), cancel the command (`C-c`), or proceed.
+
+## File Structure
+
+```
+openhands/agenthub/terminus_2_agent/
+    __init__.py                      # Agent registration
+    terminus_2_agent.py              # Main agent class
+    terminus_json_plain_parser.py    # JSON response parser
+    README.md                        # This file
+    prompts/
+        system_prompt.j2             # System prompt with JSON format spec
+        system_prompt_long_horizon.j2 # Extended prompt for long tasks
+        additional_info.j2           # Repository/runtime info template
+        microagent_info.j2           # Microagent trigger info template
+        user_prompt.j2               # Initial user message template
+```
+
+### Supporting files in other directories
+
+```
+openhands/events/action/terminus_2.py          # Terminus2CmdRunAction
+openhands/events/observation/terminus_2.py     # Terminus2CmdOutputObservation
+openhands/core/schema/action.py                # TERMINUS_2_CMD_RUN enum
+openhands/core/schema/observation.py           # TERMINUS_2_CMD_OUTPUT enum
+tests/unit/agenthub/test_terminus_2_parser.py  # Parser tests (36)
+tests/unit/agenthub/test_terminus_2_agent.py   # Agent tests (22)
+tests/unit/agenthub/test_terminus_2_action_observation.py  # Serialization tests (27)
+```
+
+## Usage
+
+### Quick Start
+
+To use the Terminus-2 agent in code:
+
+```python
+from openhands.agenthub.terminus_2_agent.terminus_2_agent import Terminus2Agent
+from openhands.core.config import AgentConfig
+from openhands.llm.llm_registry import LLMRegistry
+config = AgentConfig(agent_name='Terminus2Agent')
+llm_registry = LLMRegistry()
+agent = Terminus2Agent(config, llm_registry)
+```
+
+Or via configuration:
+
+```yaml
+agent:
+  name: Terminus2Agent
+```
+
+### Evaluation and Benchmarking
+
+To run SWE-bench evaluations with Terminus2Agent:
+
+```bash
+poetry run python evaluation/benchmarks/swe_bench/run_infer.py \
+  --agent-cls Terminus2Agent \
+  --llm-config your_model_config \
+  --max-iterations 50 \
+  --dataset princeton-nlp/SWE-bench_Lite \
+  --split test
+```
+
+### Custom Prompts
+
+Override the system prompt via config:
+
+```yaml
+agent:
+  name: Terminus2Agent
+  system_prompt_path: /path/to/custom/system_prompt.j2
+```
+
+Or override the entire prompt directory:
+
+```yaml
+agent:
+  name: Terminus2Agent
+  custom_prompt_dir: /path/to/custom/prompts/
+```
+
+## Comparison to Other Agents
+
+| Feature | CodeActAgent | OpenCodeAgent | Terminus2Agent |
+|---------|--------------|---------------|----------------|
+| LLM interface | Function calling | Function calling | Raw JSON parsing |
+| Terminal interaction | Command + stdout | Command + stdout | Keystrokes + screen capture |
+| Batch commands | Single per turn | Single per turn | Multiple per turn |
+| Special keys (Ctrl+C) | Via bash | Via bash | Native (`C-c`) |
+| File operations | Tools | Tools | Via terminal commands |
+| Task completion | `finish` tool | `finish` tool | `task_complete` field + double confirmation |
+| Parse error recovery | N/A (function calling) | N/A (function calling) | Auto-fix + retry (up to 3 attempts) |
+| Output format | Structured (exit code, stdout, stderr) | Structured | Full terminal screen state |
+
+### When to Use Terminus-2
+
+Terminus-2 is best suited for:
+
+- **Terminal-centric tasks** where seeing the full screen state matters (interactive programs, TUI applications, vim, etc.)
+- **Models without function calling support** that can reliably produce JSON
+- **Benchmarks** that measure terminal interaction fidelity (e.g., terminal-bench)
+- **Tasks requiring special key sequences** (Ctrl+C to cancel, Ctrl+D for EOF, interactive prompts)
+
+### Provenance
+
+This agent was ported from the standalone Terminus-2 implementation in `terminal-bench`. The original agent used tmux sessions for terminal interaction; this OpenHands port adapts the same logic to work with OpenHands' `BashSession` runtime while preserving the JSON-based LLM interaction model, the parser with auto-correction, and the double-confirmation completion flow.
diff --git a/openhands/agenthub/terminus_2_agent/prompts/user_prompt.j2 b/openhands/agenthub/terminus_2_agent/prompts/user_prompt.j2
index 8d78b425ce67..8b137891791f 100644
--- a/openhands/agenthub/terminus_2_agent/prompts/user_prompt.j2
+++ b/openhands/agenthub/terminus_2_agent/prompts/user_prompt.j2
@@ -1,5 +1 @@
-Task Description:
-{{ task }}
 
-Current terminal state:
-{{ terminal_state }}
diff --git a/openhands/agenthub/terminus_2_agent/terminus_2_agent.py b/openhands/agenthub/terminus_2_agent/terminus_2_agent.py
index b37777a9345a..89a2708f23ea 100644
--- a/openhands/agenthub/terminus_2_agent/terminus_2_agent.py
+++ b/openhands/agenthub/terminus_2_agent/terminus_2_agent.py
@@ -173,8 +173,11 @@ def _build_messages(
 
         Converts the event stream into a user/assistant message sequence:
         - System message: JSON format instructions from the prompt template
-        - First user message: task instruction + initial terminal state
-          (rendered via user_prompt.j2)
+        - First user message: the instruction content (from get_instruction /
+          INSTRUCTION_TEMPLATE_PATH) used directly, with initial terminal state
+          appended if available. This keeps the agent consistent with CodeAct/
+          Codex/OpenCode which pass the MessageAction.content through unchanged,
+          so that INSTRUCTION_TEMPLATE_PATH overrides work without double-wrapping.
         - Subsequent turns: assistant = LLM JSON response, user = terminal output
         """
         messages: list[Message] = []
@@ -185,10 +188,13 @@ def _build_messages(
         initial_user_msg = self._find_initial_user_message(condensed_history)
         if initial_user_msg:
             initial_terminal = self._find_initial_terminal_state(condensed_history)
-            first_user_text = self.prompt_manager.user_template.render(
-                task=initial_user_msg,
-                terminal_state=initial_terminal,
-            ).strip()
+            if initial_terminal:
+                first_user_text = (
+                    f'{initial_user_msg}\n\n'
+                    f'Current terminal state:\n{initial_terminal}'
+                )
+            else:
+                first_user_text = initial_user_msg
             messages.append(
                 Message(role='user', content=[TextContent(text=first_user_text)])
             )

From 5c3116ffe38f9f7c0b223677f427b136ec706b17 Mon Sep 17 00:00:00 2001
From: Sugam Devare <sdevare@nvidia.com>
Date: Thu, 19 Feb 2026 22:18:58 -0800
Subject: [PATCH 3/9] feat: reg action

---
 .../agenthub/terminus_2_agent/terminus_2_agent.py    | 12 ++++++++++++
 .../impl/action_execution/action_execution_client.py |  4 ++++
 2 files changed, 16 insertions(+)

diff --git a/openhands/agenthub/terminus_2_agent/terminus_2_agent.py b/openhands/agenthub/terminus_2_agent/terminus_2_agent.py
index 89a2708f23ea..9e2f436ec404 100644
--- a/openhands/agenthub/terminus_2_agent/terminus_2_agent.py
+++ b/openhands/agenthub/terminus_2_agent/terminus_2_agent.py
@@ -119,9 +119,18 @@ def reset(self) -> None:
         self._conversation_messages = []
         self._needs_llm_call = True
 
+    def _has_terminal_observation(self, events: list[Event]) -> bool:
+        """Check if any Terminus2CmdOutputObservation exists in the event history."""
+        return any(isinstance(e, Terminus2CmdOutputObservation) for e in events)
+
     def step(self, state: State) -> 'Action':
         """Performs one step of the Terminus-2 agent.
 
+        On the very first step (before any terminal observations exist), sends a
+        no-op action to capture the initial terminal screen. This mirrors the
+        original Terminus-2 behavior of capturing tmux state before the first
+        LLM call.
+
         Returns pending actions from the queue, or calls the LLM to get
         new commands when the queue is empty.
         """
@@ -139,6 +148,9 @@ def step(self, state: State) -> 'Action':
             case Condensation(action=condensation_action):
                 return condensation_action
 
+        if not self._has_terminal_observation(condensed_history):
+            return Terminus2CmdRunAction(keystrokes='', duration=0.5)
+
         messages = self._build_messages(condensed_history, state)
 
         commands, is_task_complete = self._call_llm_and_parse(messages)
diff --git a/openhands/runtime/impl/action_execution/action_execution_client.py b/openhands/runtime/impl/action_execution/action_execution_client.py
index 4b7936289ba7..ecde9ac404a0 100644
--- a/openhands/runtime/impl/action_execution/action_execution_client.py
+++ b/openhands/runtime/impl/action_execution/action_execution_client.py
@@ -52,6 +52,7 @@
 from openhands.events.action.action import Action
 from openhands.events.action.files import FileEditSource
 from openhands.events.action.mcp import MCPAction
+from openhands.events.action.terminus_2 import Terminus2CmdRunAction
 from openhands.events.observation import (
     AgentThinkObservation,
     ErrorObservation,
@@ -438,6 +439,9 @@ def codex_apply_patch(self, action: CodexApplyPatchAction) -> Observation:
     def codex_update_plan(self, action: CodexUpdatePlanAction) -> Observation:
         return self.send_action_for_execution(action)
 
+    def terminus_2_cmd_run(self, action: Terminus2CmdRunAction) -> Observation:
+        return self.send_action_for_execution(action)
+
     def get_mcp_config(
         self, extra_stdio_servers: list[MCPStdioServerConfig] | None = None
     ) -> MCPConfig:

From 42ad23b350c95c789d87b36f4c12055febad2af2 Mon Sep 17 00:00:00 2001
From: Sugam Devare <sdevare@nvidia.com>
Date: Thu, 19 Feb 2026 22:31:19 -0800
Subject: [PATCH 4/9] fix: traj formation

---
 .../terminus_2_agent/terminus_2_agent.py      | 44 +++++----
 openhands/runtime/action_execution_server.py  | 93 ++++++++++++-------
 2 files changed, 84 insertions(+), 53 deletions(-)

diff --git a/openhands/agenthub/terminus_2_agent/terminus_2_agent.py b/openhands/agenthub/terminus_2_agent/terminus_2_agent.py
index 9e2f436ec404..2d3ab6bccea5 100644
--- a/openhands/agenthub/terminus_2_agent/terminus_2_agent.py
+++ b/openhands/agenthub/terminus_2_agent/terminus_2_agent.py
@@ -153,14 +153,13 @@ def step(self, state: State) -> 'Action':
 
         messages = self._build_messages(condensed_history, state)
 
-        commands, is_task_complete = self._call_llm_and_parse(messages)
+        commands, is_task_complete, response_text = self._call_llm_and_parse(messages)
 
         if is_task_complete:
             if self._pending_completion:
                 return AgentFinishAction(thought='Task completed (confirmed)')
             else:
                 self._pending_completion = True
-                # Execute any accompanying commands, then ask for confirmation next round
                 if not commands:
                     return AgentFinishAction(thought='Task completed (confirmed)')
         else:
@@ -169,10 +168,11 @@ def step(self, state: State) -> 'Action':
         if not commands:
             return AgentThinkAction(thought='No commands to execute, waiting for next input')
 
-        for cmd in commands:
+        for i, cmd in enumerate(commands):
             action = Terminus2CmdRunAction(
                 keystrokes=cmd.keystrokes,
                 duration=min(cmd.duration, 60),
+                thought=response_text if i == 0 else '',
             )
             self.pending_actions.append(action)
 
@@ -198,12 +198,13 @@ def _build_messages(
         messages.append(Message(role='system', content=[TextContent(text=system_prompt)]))
 
         initial_user_msg = self._find_initial_user_message(condensed_history)
+        initial_terminal_event = self._find_initial_terminal_event(condensed_history)
+
         if initial_user_msg:
-            initial_terminal = self._find_initial_terminal_state(condensed_history)
-            if initial_terminal:
+            if initial_terminal_event is not None:
                 first_user_text = (
                     f'{initial_user_msg}\n\n'
-                    f'Current terminal state:\n{initial_terminal}'
+                    f'Current terminal state:\n{initial_terminal_event.terminal_state}'
                 )
             else:
                 first_user_text = initial_user_msg
@@ -260,6 +261,8 @@ def _build_messages(
                     )
 
             elif isinstance(event, Terminus2CmdOutputObservation):
+                if event is initial_terminal_event:
+                    continue
                 batch_observations.append(event.terminal_state)
                 last_timed_out = event.timed_out
 
@@ -294,17 +297,19 @@ def _find_initial_user_message(self, events: list[Event]) -> str | None:
                 return event.content
         return None
 
-    def _find_initial_terminal_state(self, events: list[Event]) -> str:
-        """Find the first terminal output from the event history.
+    def _find_initial_terminal_event(
+        self, events: list[Event]
+    ) -> Terminus2CmdOutputObservation | None:
+        """Find the first Terminus2CmdOutputObservation in the event history.
 
-        This is used to populate the {{ terminal_state }} variable in
-        user_prompt.j2 for the initial user message. If no terminal
-        observation has been recorded yet, returns an empty string.
+        Returns the event object itself (not just the string) so that
+        _build_messages can skip it in the loop via identity comparison,
+        avoiding duplication with the first user message.
         """
         for event in events:
             if isinstance(event, Terminus2CmdOutputObservation):
-                return event.terminal_state
-        return ''
+                return event
+        return None
 
     def _format_terminal_output(
         self, terminal_output: str, timed_out: bool, keystrokes: str
@@ -320,8 +325,13 @@ def _format_terminal_output(
 
     def _call_llm_and_parse(
         self, messages: list[Message]
-    ) -> tuple[list[ParsedCommand], bool]:
-        """Call the LLM and parse the JSON response, with retry on parse errors."""
+    ) -> tuple[list[ParsedCommand], bool, str]:
+        """Call the LLM and parse the JSON response, with retry on parse errors.
+
+        Returns (commands, is_task_complete, response_text) where response_text
+        is the raw LLM output that must be stored on the first action's thought
+        field so _build_messages can reconstruct the assistant turn later.
+        """
         for attempt in range(MAX_LLM_RETRY):
             params: dict = {
                 'messages': messages,
@@ -356,10 +366,10 @@ def _call_llm_and_parse(
                 ParsedCommand(keystrokes=cmd.keystrokes, duration=min(cmd.duration, 60))
                 for cmd in result.commands
             ]
-            return commands, result.is_task_complete
+            return commands, result.is_task_complete, response_text
 
         logger.error('Terminus-2: exhausted LLM retries due to parse errors')
-        return [], False
+        return [], False, ''
 
     @staticmethod
     def _limit_output_length(output: str, max_bytes: int = MAX_OUTPUT_BYTES) -> str:
diff --git a/openhands/runtime/action_execution_server.py b/openhands/runtime/action_execution_server.py
index 595fa741f906..14210454a867 100644
--- a/openhands/runtime/action_execution_server.py
+++ b/openhands/runtime/action_execution_server.py
@@ -2221,14 +2221,39 @@ async def codex_update_plan(self, action: CodexUpdatePlanAction) -> Observation:
             logger.exception(f'Error updating plan: {e}')
             return ErrorObservation(f'Failed to update plan: {str(e)}')
 
+    def _format_terminal_screen(
+        self, obs: CmdOutputObservation, command: str
+    ) -> str:
+        """Format a CmdOutputObservation to look like a tmux capture-pane screen.
+
+        Produces output like:
+            root@hostname:/app# ls -la
+            total 775
+            drwxr-xr-x  3 root root  3 Sep 13 17:59 .
+            ...
+            root@hostname:/app#
+        """
+        meta = obs.metadata
+        username = meta.username or 'root'
+        hostname = meta.hostname or 'sandbox'
+        cwd = meta.working_dir or '/'
+        suffix = '#' if username == 'root' else '$'
+        prompt = f'{username}@{hostname}:{cwd}{suffix} '
+
+        lines = [f'{prompt}{command}']
+        if obs.content.strip():
+            lines.append(obs.content)
+        lines.append(prompt)
+        return '\n'.join(lines)
+
     async def terminus_2_cmd_run(
         self, action: Terminus2CmdRunAction
     ) -> Terminus2CmdOutputObservation | ErrorObservation:
         """Execute Terminus-2 keystroke action via BashSession.
 
         Converts keystrokes to a command, executes via the bash session,
-        and returns the terminal output as a screen capture observation.
-        Handles special tmux-style key sequences (C-c, C-d).
+        and returns the terminal output formatted like a tmux screen capture
+        (prompt + command echo + output + next prompt).
         """
         try:
             bash_session = self.bash_session
@@ -2237,51 +2262,47 @@ async def terminus_2_cmd_run(
             keystrokes = action.keystrokes
             duration = min(action.duration, 60)
 
-            if keystrokes.strip() == 'C-c':
-                cmd_action = CmdRunAction(command='C-c')
-                cmd_action.set_hard_timeout(duration, blocking=False)
+            if keystrokes == '' or keystrokes.strip() == '':
+                cmd_action = CmdRunAction(command='pwd')
+                cmd_action.set_hard_timeout(duration + 5, blocking=False)
                 obs = await call_sync_from_async(bash_session.execute, cmd_action)
+                terminal_state = self._format_terminal_screen(obs, 'pwd')
                 return Terminus2CmdOutputObservation(
-                    content=obs.content,
-                    terminal_state=obs.content,
+                    content=terminal_state,
+                    terminal_state=terminal_state,
                     timed_out=False,
                     command_keystrokes=keystrokes,
                 )
-            elif keystrokes.strip() == 'C-d':
-                cmd_action = CmdRunAction(command='C-d')
-                cmd_action.set_hard_timeout(duration, blocking=False)
+
+            if keystrokes.strip() in ('C-c', 'C-d'):
+                special_key = keystrokes.strip()
+                cmd_action = CmdRunAction(command=special_key)
+                cmd_action.set_hard_timeout(duration + 5, blocking=False)
                 obs = await call_sync_from_async(bash_session.execute, cmd_action)
+                terminal_state = self._format_terminal_screen(obs, f'^{"C" if special_key == "C-c" else "D"}')
                 return Terminus2CmdOutputObservation(
-                    content=obs.content,
-                    terminal_state=obs.content,
-                    timed_out=False,
-                    command_keystrokes=keystrokes,
-                )
-            elif keystrokes == '' or keystrokes.strip() == '':
-                import asyncio as _asyncio
-                await _asyncio.sleep(duration)
-                return Terminus2CmdOutputObservation(
-                    content='[waited {:.1f}s]'.format(duration),
-                    terminal_state='[waited {:.1f}s]'.format(duration),
+                    content=terminal_state,
+                    terminal_state=terminal_state,
                     timed_out=False,
                     command_keystrokes=keystrokes,
                 )
-            else:
-                command = keystrokes.rstrip('\n')
-                cmd_action = CmdRunAction(command=command)
-                cmd_action.set_hard_timeout(duration + 10, blocking=False)
-                obs = await call_sync_from_async(bash_session.execute, cmd_action)
 
-                timed_out = False
-                if hasattr(obs, 'metadata') and obs.metadata:
-                    timed_out = getattr(obs.metadata, 'exit_code', 0) == -1
-
-                return Terminus2CmdOutputObservation(
-                    content=obs.content,
-                    terminal_state=obs.content,
-                    timed_out=timed_out,
-                    command_keystrokes=keystrokes,
-                )
+            command = keystrokes.rstrip('\n')
+            cmd_action = CmdRunAction(command=command)
+            cmd_action.set_hard_timeout(duration + 10, blocking=False)
+            obs = await call_sync_from_async(bash_session.execute, cmd_action)
+
+            timed_out = False
+            if hasattr(obs, 'metadata') and obs.metadata:
+                timed_out = getattr(obs.metadata, 'exit_code', 0) == -1
+
+            terminal_state = self._format_terminal_screen(obs, command)
+            return Terminus2CmdOutputObservation(
+                content=terminal_state,
+                terminal_state=terminal_state,
+                timed_out=timed_out,
+                command_keystrokes=keystrokes,
+            )
         except Exception as e:
             logger.exception(f'Error executing Terminus-2 keystrokes: {e}')
             return ErrorObservation(str(e))

From 177ac7d02dca328cfde014814c19edccf35cd887 Mon Sep 17 00:00:00 2001
From: Sugam Devare <sdevare@nvidia.com>
Date: Thu, 19 Feb 2026 22:43:40 -0800
Subject: [PATCH 5/9] feat: add prefix based on state

---
 .../terminus_2_agent/terminus_2_agent.py      |   6 +-
 openhands/runtime/action_execution_server.py  |  19 +-
 tests/unit/agenthub/test_terminus_2_agent.py  | 636 +++++++++++++++++-
 3 files changed, 646 insertions(+), 15 deletions(-)

diff --git a/openhands/agenthub/terminus_2_agent/terminus_2_agent.py b/openhands/agenthub/terminus_2_agent/terminus_2_agent.py
index 2d3ab6bccea5..17dcf1c0bd53 100644
--- a/openhands/agenthub/terminus_2_agent/terminus_2_agent.py
+++ b/openhands/agenthub/terminus_2_agent/terminus_2_agent.py
@@ -202,10 +202,8 @@ def _build_messages(
 
         if initial_user_msg:
             if initial_terminal_event is not None:
-                first_user_text = (
-                    f'{initial_user_msg}\n\n'
-                    f'Current terminal state:\n{initial_terminal_event.terminal_state}'
-                )
+                terminal_text = initial_terminal_event.terminal_state
+                first_user_text = f'{initial_user_msg}\n\n{terminal_text}'
             else:
                 first_user_text = initial_user_msg
             messages.append(
diff --git a/openhands/runtime/action_execution_server.py b/openhands/runtime/action_execution_server.py
index 14210454a867..1a8007976adf 100644
--- a/openhands/runtime/action_execution_server.py
+++ b/openhands/runtime/action_execution_server.py
@@ -2252,8 +2252,11 @@ async def terminus_2_cmd_run(
         """Execute Terminus-2 keystroke action via BashSession.
 
         Converts keystrokes to a command, executes via the bash session,
-        and returns the terminal output formatted like a tmux screen capture
-        (prompt + command echo + output + next prompt).
+        and returns terminal output formatted like the original Terminus-2
+        tmux capture with appropriate prefix:
+        - "Current Terminal Screen:" for initial captures (empty keystrokes)
+          and timed-out commands
+        - "New Terminal Output:" for normal command output and special keys
         """
         try:
             bash_session = self.bash_session
@@ -2266,7 +2269,8 @@ async def terminus_2_cmd_run(
                 cmd_action = CmdRunAction(command='pwd')
                 cmd_action.set_hard_timeout(duration + 5, blocking=False)
                 obs = await call_sync_from_async(bash_session.execute, cmd_action)
-                terminal_state = self._format_terminal_screen(obs, 'pwd')
+                screen = self._format_terminal_screen(obs, 'pwd')
+                terminal_state = f'Current Terminal Screen:\n{screen}'
                 return Terminus2CmdOutputObservation(
                     content=terminal_state,
                     terminal_state=terminal_state,
@@ -2279,7 +2283,8 @@ async def terminus_2_cmd_run(
                 cmd_action = CmdRunAction(command=special_key)
                 cmd_action.set_hard_timeout(duration + 5, blocking=False)
                 obs = await call_sync_from_async(bash_session.execute, cmd_action)
-                terminal_state = self._format_terminal_screen(obs, f'^{"C" if special_key == "C-c" else "D"}')
+                screen = self._format_terminal_screen(obs, f'^{"C" if special_key == "C-c" else "D"}')
+                terminal_state = f'New Terminal Output:\n{screen}'
                 return Terminus2CmdOutputObservation(
                     content=terminal_state,
                     terminal_state=terminal_state,
@@ -2296,7 +2301,11 @@ async def terminus_2_cmd_run(
             if hasattr(obs, 'metadata') and obs.metadata:
                 timed_out = getattr(obs.metadata, 'exit_code', 0) == -1
 
-            terminal_state = self._format_terminal_screen(obs, command)
+            screen = self._format_terminal_screen(obs, command)
+            if timed_out:
+                terminal_state = f'Current Terminal Screen:\n{screen}'
+            else:
+                terminal_state = f'New Terminal Output:\n{screen}'
             return Terminus2CmdOutputObservation(
                 content=terminal_state,
                 terminal_state=terminal_state,
diff --git a/tests/unit/agenthub/test_terminus_2_agent.py b/tests/unit/agenthub/test_terminus_2_agent.py
index 077010ec44c5..9ed8622e6b72 100644
--- a/tests/unit/agenthub/test_terminus_2_agent.py
+++ b/tests/unit/agenthub/test_terminus_2_agent.py
@@ -1,6 +1,7 @@
 """Unit tests for the Terminus-2 Agent.
 
 Tests the Terminus2Agent's message building, output truncation,
+conversation history reconstruction, initial terminal capture,
 and core agent logic.
 """
 
@@ -15,9 +16,10 @@
     ParsedCommand,
     TerminusJSONPlainParser,
 )
-from openhands.events.action import AgentFinishAction, MessageAction
+from openhands.events.action import AgentFinishAction, AgentThinkAction, MessageAction
 from openhands.events.action.terminus_2 import Terminus2CmdRunAction
 from openhands.events.event import EventSource
+from openhands.events.observation.error import ErrorObservation
 from openhands.events.observation.terminus_2 import Terminus2CmdOutputObservation
 
 
@@ -188,7 +190,6 @@ def test_find_initial_user_message(self):
         msg._source = EventSource.USER
 
         events = [msg]
-        # Use static-like approach
         for event in events:
             if isinstance(event, MessageAction) and event.source == EventSource.USER:
                 assert event.content == 'Fix the bug in module X'
@@ -235,11 +236,8 @@ class TestDoubleConfirmation:
 
     def test_pending_completion_flag_initial(self):
         """Verify the flag starts as False."""
-        # We can't instantiate the full agent without LLM registry,
-        # but we can test the logic pattern
         pending = False
 
-        # First task_complete=true
         is_task_complete = True
         if is_task_complete:
             if pending:
@@ -256,7 +254,7 @@ def test_pending_completion_flag_initial(self):
 
     def test_pending_completion_second_time(self):
         """Verify second task_complete triggers finish."""
-        pending = True  # Already set from first confirmation
+        pending = True
 
         is_task_complete = True
         if is_task_complete:
@@ -328,3 +326,629 @@ def test_observation_with_empty_terminal_state(self):
         )
         assert obs.terminal_state == ''
         assert obs.content == ''
+
+
+# ==============================================================================
+# _has_terminal_observation Tests
+# ==============================================================================
+
+
+class TestHasTerminalObservation:
+    """Tests for the _has_terminal_observation helper."""
+
+    def test_empty_events(self):
+        assert Terminus2Agent._has_terminal_observation(None, []) is False
+
+    def test_only_user_message(self):
+        msg = MessageAction(content='task')
+        msg._source = EventSource.USER
+        assert Terminus2Agent._has_terminal_observation(None, [msg]) is False
+
+    def test_has_observation(self):
+        obs = Terminus2CmdOutputObservation(
+            content='output', terminal_state='root@host:/# pwd\n/\nroot@host:/# '
+        )
+        assert Terminus2Agent._has_terminal_observation(None, [obs]) is True
+
+    def test_observation_after_other_events(self):
+        msg = MessageAction(content='task')
+        msg._source = EventSource.USER
+        action = Terminus2CmdRunAction(keystrokes='', duration=0.5)
+        obs = Terminus2CmdOutputObservation(
+            content='output', terminal_state='root@host:/# '
+        )
+        assert Terminus2Agent._has_terminal_observation(None, [msg, action, obs]) is True
+
+
+# ==============================================================================
+# _find_initial_terminal_event Tests
+# ==============================================================================
+
+
+class TestFindInitialTerminalEvent:
+    """Tests for _find_initial_terminal_event returning the event object."""
+
+    def test_returns_none_on_empty(self):
+        result = Terminus2Agent._find_initial_terminal_event(None, [])
+        assert result is None
+
+    def test_returns_none_when_no_observations(self):
+        msg = MessageAction(content='task')
+        msg._source = EventSource.USER
+        result = Terminus2Agent._find_initial_terminal_event(None, [msg])
+        assert result is None
+
+    def test_returns_first_observation_object(self):
+        obs1 = Terminus2CmdOutputObservation(
+            content='first', terminal_state='screen1'
+        )
+        obs2 = Terminus2CmdOutputObservation(
+            content='second', terminal_state='screen2'
+        )
+        result = Terminus2Agent._find_initial_terminal_event(None, [obs1, obs2])
+        assert result is obs1
+        assert result is not obs2
+
+    def test_identity_comparison_works(self):
+        """The returned event should be the exact same object for identity checks."""
+        obs = Terminus2CmdOutputObservation(
+            content='output', terminal_state='root@host:/# '
+        )
+        msg = MessageAction(content='task')
+        msg._source = EventSource.USER
+        events = [msg, obs]
+        result = Terminus2Agent._find_initial_terminal_event(None, events)
+        assert result is obs
+
+
+# ==============================================================================
+# Response Text Storage Tests (thought field on first action)
+# ==============================================================================
+
+
+class TestResponseTextStorage:
+    """Tests that LLM response text is stored on the first action's thought field."""
+
+    def test_first_action_gets_thought(self):
+        """When creating actions from commands, only the first gets the response text."""
+        response_text = '{"analysis":"test","plan":"test","commands":[{"keystrokes":"ls\\n","duration":0.1},{"keystrokes":"pwd\\n","duration":0.1}]}'
+        commands = [
+            ParsedCommand(keystrokes='ls\n', duration=0.1),
+            ParsedCommand(keystrokes='pwd\n', duration=0.1),
+        ]
+
+        actions = []
+        for i, cmd in enumerate(commands):
+            action = Terminus2CmdRunAction(
+                keystrokes=cmd.keystrokes,
+                duration=min(cmd.duration, 60),
+                thought=response_text if i == 0 else '',
+            )
+            actions.append(action)
+
+        assert actions[0].thought == response_text
+        assert actions[1].thought == ''
+
+    def test_single_command_gets_thought(self):
+        response_text = '{"analysis":"x","plan":"x","commands":[{"keystrokes":"ls\\n"}]}'
+        action = Terminus2CmdRunAction(
+            keystrokes='ls\n',
+            duration=0.1,
+            thought=response_text,
+        )
+        assert action.thought == response_text
+
+    def test_empty_response_stored_as_empty(self):
+        action = Terminus2CmdRunAction(
+            keystrokes='ls\n',
+            duration=0.1,
+            thought='',
+        )
+        assert action.thought == ''
+
+
+# ==============================================================================
+# Conversation History Reconstruction Tests
+# ==============================================================================
+
+
+class TestConversationHistoryReconstruction:
+    """Tests that _build_messages reconstructs full conversation history
+    from the event stream, including assistant turns from action.thought.
+
+    These tests simulate what _build_messages does by processing events
+    using the same algorithm, verifying the message sequence is correct.
+    """
+
+    def _simulate_build_messages(self, events):
+        """Simulate the core _build_messages loop logic to verify message ordering.
+
+        Returns a list of (role, text) tuples representing the conversation.
+        This mirrors the algorithm in Terminus2Agent._build_messages.
+        """
+        messages = []
+        messages.append(('system', 'system_prompt'))
+
+        initial_user_msg = None
+        initial_terminal_event = None
+        for event in events:
+            if isinstance(event, MessageAction) and event.source == EventSource.USER:
+                initial_user_msg = event.content
+                break
+        for event in events:
+            if isinstance(event, Terminus2CmdOutputObservation):
+                initial_terminal_event = event
+                break
+
+        if initial_user_msg:
+            if initial_terminal_event is not None:
+                terminal_text = initial_terminal_event.terminal_state
+                first_text = f'{initial_user_msg}\n\n{terminal_text}'
+            else:
+                first_text = initial_user_msg
+            messages.append(('user', first_text))
+
+        batch_observations = []
+        for event in events:
+            if isinstance(event, MessageAction):
+                if event.source == EventSource.USER:
+                    continue
+                elif event.source == EventSource.AGENT:
+                    if batch_observations:
+                        messages.append(('user', batch_observations[-1]))
+                        batch_observations = []
+                    messages.append(('assistant', event.content))
+
+            elif isinstance(event, Terminus2CmdRunAction):
+                if event.thought:
+                    if batch_observations:
+                        messages.append(('user', batch_observations[-1]))
+                        batch_observations = []
+                    messages.append(('assistant', event.thought))
+
+            elif isinstance(event, Terminus2CmdOutputObservation):
+                if event is initial_terminal_event:
+                    continue
+                batch_observations.append(event.terminal_state)
+
+            elif isinstance(event, ErrorObservation):
+                batch_observations.append(f'ERROR: {event.content}')
+
+        if batch_observations:
+            messages.append(('user', batch_observations[-1]))
+
+        return messages
+
+    def test_initial_state_only(self):
+        """First LLM call: system + user(task+terminal screen)."""
+        user_msg = MessageAction(content='Fix the bug')
+        user_msg._source = EventSource.USER
+        noop = Terminus2CmdRunAction(keystrokes='', duration=0.5)
+        initial_obs = Terminus2CmdOutputObservation(
+            content='Current Terminal Screen:\nroot@host:/app# pwd\n/app\nroot@host:/app# ',
+            terminal_state='Current Terminal Screen:\nroot@host:/app# pwd\n/app\nroot@host:/app# ',
+        )
+
+        events = [user_msg, noop, initial_obs]
+        msgs = self._simulate_build_messages(events)
+
+        assert len(msgs) == 2  # system + user
+        assert msgs[0][0] == 'system'
+        assert msgs[1][0] == 'user'
+        assert 'Fix the bug' in msgs[1][1]
+        assert 'Current Terminal Screen:' in msgs[1][1]
+        assert 'root@host:/app#' in msgs[1][1]
+
+    def test_one_round_trip(self):
+        """After first LLM call: system + user(task+terminal) + assistant + user(output)."""
+        llm_response = '{"analysis":"a","plan":"p","commands":[{"keystrokes":"ls\\n","duration":0.1}]}'
+
+        user_msg = MessageAction(content='Fix the bug')
+        user_msg._source = EventSource.USER
+        noop = Terminus2CmdRunAction(keystrokes='', duration=0.5)
+        initial_obs = Terminus2CmdOutputObservation(
+            content='Current Terminal Screen:\nroot@host:/app# ',
+            terminal_state='Current Terminal Screen:\nroot@host:/app# ',
+        )
+        cmd_action = Terminus2CmdRunAction(
+            keystrokes='ls\n', duration=0.1, thought=llm_response
+        )
+        cmd_obs = Terminus2CmdOutputObservation(
+            content='New Terminal Output:\nroot@host:/app# ls\nfile.py\nroot@host:/app# ',
+            terminal_state='New Terminal Output:\nroot@host:/app# ls\nfile.py\nroot@host:/app# ',
+        )
+
+        events = [user_msg, noop, initial_obs, cmd_action, cmd_obs]
+        msgs = self._simulate_build_messages(events)
+
+        assert len(msgs) == 4  # system, user, assistant, user
+        assert msgs[0][0] == 'system'
+        assert msgs[1][0] == 'user'
+        assert msgs[2][0] == 'assistant'
+        assert msgs[2][1] == llm_response
+        assert msgs[3][0] == 'user'
+        assert 'New Terminal Output:' in msgs[3][1]
+        assert 'file.py' in msgs[3][1]
+
+    def test_multi_command_batch(self):
+        """Multiple commands from one LLM call: only last observation becomes user message."""
+        llm_response = '{"analysis":"a","plan":"p","commands":[{"keystrokes":"ls\\n"},{"keystrokes":"pwd\\n"}]}'
+
+        user_msg = MessageAction(content='Task')
+        user_msg._source = EventSource.USER
+        noop = Terminus2CmdRunAction(keystrokes='', duration=0.5)
+        initial_obs = Terminus2CmdOutputObservation(
+            content='Current Terminal Screen:\nprompt',
+            terminal_state='Current Terminal Screen:\nprompt',
+        )
+        cmd1 = Terminus2CmdRunAction(keystrokes='ls\n', duration=0.1, thought=llm_response)
+        obs1 = Terminus2CmdOutputObservation(
+            content='New Terminal Output:\nls output',
+            terminal_state='New Terminal Output:\nls output',
+        )
+        cmd2 = Terminus2CmdRunAction(keystrokes='pwd\n', duration=0.1, thought='')
+        obs2 = Terminus2CmdOutputObservation(
+            content='New Terminal Output:\npwd output',
+            terminal_state='New Terminal Output:\npwd output',
+        )
+
+        events = [user_msg, noop, initial_obs, cmd1, obs1, cmd2, obs2]
+        msgs = self._simulate_build_messages(events)
+
+        assert len(msgs) == 4  # system, user, assistant, user
+        assert msgs[2][0] == 'assistant'
+        assert msgs[2][1] == llm_response
+        assert msgs[3][0] == 'user'
+        assert 'pwd output' in msgs[3][1]
+
+    def test_two_round_trips(self):
+        """Two LLM calls produce: sys, user, asst, user, asst, user."""
+        resp1 = '{"analysis":"a","plan":"p","commands":[{"keystrokes":"ls\\n"}]}'
+        resp2 = '{"analysis":"b","plan":"q","commands":[{"keystrokes":"cat f\\n"}]}'
+
+        user_msg = MessageAction(content='Task')
+        user_msg._source = EventSource.USER
+        noop = Terminus2CmdRunAction(keystrokes='', duration=0.5)
+        initial_obs = Terminus2CmdOutputObservation(
+            content='Current Terminal Screen:\nprompt',
+            terminal_state='Current Terminal Screen:\nprompt',
+        )
+        cmd1 = Terminus2CmdRunAction(keystrokes='ls\n', duration=0.1, thought=resp1)
+        obs1 = Terminus2CmdOutputObservation(
+            content='New Terminal Output:\nls result',
+            terminal_state='New Terminal Output:\nls result',
+        )
+        cmd2 = Terminus2CmdRunAction(keystrokes='cat f\n', duration=0.1, thought=resp2)
+        obs2 = Terminus2CmdOutputObservation(
+            content='New Terminal Output:\nfile content',
+            terminal_state='New Terminal Output:\nfile content',
+        )
+
+        events = [user_msg, noop, initial_obs, cmd1, obs1, cmd2, obs2]
+        msgs = self._simulate_build_messages(events)
+
+        assert len(msgs) == 6  # system, user, asst, user, asst, user
+        roles = [m[0] for m in msgs]
+        assert roles == ['system', 'user', 'assistant', 'user', 'assistant', 'user']
+        assert msgs[2][1] == resp1
+        assert 'ls result' in msgs[3][1]
+        assert msgs[4][1] == resp2
+        assert 'file content' in msgs[5][1]
+
+    def test_initial_observation_not_duplicated(self):
+        """The initial terminal observation should NOT appear as a separate user message."""
+        user_msg = MessageAction(content='Task')
+        user_msg._source = EventSource.USER
+        noop = Terminus2CmdRunAction(keystrokes='', duration=0.5)
+        initial_obs = Terminus2CmdOutputObservation(
+            content='Current Terminal Screen:\nINITIAL_SCREEN',
+            terminal_state='Current Terminal Screen:\nINITIAL_SCREEN',
+        )
+
+        events = [user_msg, noop, initial_obs]
+        msgs = self._simulate_build_messages(events)
+
+        user_messages = [m[1] for m in msgs if m[0] == 'user']
+        assert len(user_messages) == 1
+        assert 'Current Terminal Screen:' in user_messages[0]
+        assert 'INITIAL_SCREEN' in user_messages[0]
+
+    def test_initial_observation_not_duplicated_after_first_llm_call(self):
+        """After one LLM round, initial screen should only appear in first user msg."""
+        resp = '{"analysis":"a","plan":"p","commands":[{"keystrokes":"ls\\n"}]}'
+        user_msg = MessageAction(content='Task')
+        user_msg._source = EventSource.USER
+        noop = Terminus2CmdRunAction(keystrokes='', duration=0.5)
+        initial_obs = Terminus2CmdOutputObservation(
+            content='Current Terminal Screen:\nINITIAL_SCREEN',
+            terminal_state='Current Terminal Screen:\nINITIAL_SCREEN',
+        )
+        cmd = Terminus2CmdRunAction(keystrokes='ls\n', duration=0.1, thought=resp)
+        obs = Terminus2CmdOutputObservation(
+            content='New Terminal Output:\nls output',
+            terminal_state='New Terminal Output:\nls output',
+        )
+
+        events = [user_msg, noop, initial_obs, cmd, obs]
+        msgs = self._simulate_build_messages(events)
+
+        user_messages = [m[1] for m in msgs if m[0] == 'user']
+        assert len(user_messages) == 2
+        assert 'INITIAL_SCREEN' in user_messages[0]
+        assert 'INITIAL_SCREEN' not in user_messages[1]
+        assert 'New Terminal Output:' in user_messages[1]
+
+    def test_no_initial_terminal_state(self):
+        """When no terminal observation exists, first user message is just the task."""
+        user_msg = MessageAction(content='Task description')
+        user_msg._source = EventSource.USER
+
+        events = [user_msg]
+        msgs = self._simulate_build_messages(events)
+
+        assert len(msgs) == 2  # system + user
+        assert msgs[1][1] == 'Task description'
+
+    def test_error_observation_in_batch(self):
+        """ErrorObservation should be included in batch_observations."""
+        user_msg = MessageAction(content='Task')
+        user_msg._source = EventSource.USER
+        initial_obs = Terminus2CmdOutputObservation(
+            content='Current Terminal Screen:\nprompt',
+            terminal_state='Current Terminal Screen:\nprompt',
+        )
+        resp = '{"analysis":"a","plan":"p","commands":[{"keystrokes":"bad_cmd\\n"}]}'
+        cmd = Terminus2CmdRunAction(keystrokes='bad_cmd\n', duration=0.1, thought=resp)
+        err = ErrorObservation(content='command failed')
+
+        events = [user_msg, initial_obs, cmd, err]
+        msgs = self._simulate_build_messages(events)
+
+        user_messages = [m[1] for m in msgs if m[0] == 'user']
+        assert any('ERROR: command failed' in m for m in user_messages)
+
+    def test_alternating_roles_no_consecutive_same_role(self):
+        """After system, messages should alternate user/assistant (no consecutive same role)."""
+        resp = '{"analysis":"a","plan":"p","commands":[{"keystrokes":"ls\\n"}]}'
+        user_msg = MessageAction(content='Task')
+        user_msg._source = EventSource.USER
+        noop = Terminus2CmdRunAction(keystrokes='', duration=0.5)
+        initial_obs = Terminus2CmdOutputObservation(
+            content='Current Terminal Screen:\nprompt',
+            terminal_state='Current Terminal Screen:\nprompt',
+        )
+        cmd = Terminus2CmdRunAction(keystrokes='ls\n', duration=0.1, thought=resp)
+        obs = Terminus2CmdOutputObservation(
+            content='New Terminal Output:\noutput',
+            terminal_state='New Terminal Output:\noutput',
+        )
+
+        events = [user_msg, noop, initial_obs, cmd, obs]
+        msgs = self._simulate_build_messages(events)
+
+        roles = [m[0] for m in msgs]
+        assert roles[0] == 'system'
+        for i in range(2, len(roles)):
+            assert roles[i] != roles[i - 1], (
+                f'Consecutive same role at {i}: {roles}'
+            )
+
+
+# ==============================================================================
+# Initial Terminal Capture Tests
+# ==============================================================================
+
+
+class TestInitialTerminalCapture:
+    """Tests for the no-op action sent on the first step to capture terminal state."""
+
+    def test_noop_action_has_empty_keystrokes(self):
+        action = Terminus2CmdRunAction(keystrokes='', duration=0.5)
+        assert action.keystrokes == ''
+        assert action.duration == 0.5
+
+    def test_noop_action_is_runnable(self):
+        action = Terminus2CmdRunAction(keystrokes='', duration=0.5)
+        assert action.runnable is True
+
+
+# ==============================================================================
+# Action Execution Client Dispatch Tests
+# ==============================================================================
+
+
+class TestActionExecutionClientDispatch:
+    """Tests that the ActionExecutionClient has the terminus_2_cmd_run method."""
+
+    def test_client_has_terminus_2_method(self):
+        from openhands.runtime.impl.action_execution.action_execution_client import (
+            ActionExecutionClient,
+        )
+        assert hasattr(ActionExecutionClient, 'terminus_2_cmd_run')
+
+    def test_client_method_is_callable(self):
+        from openhands.runtime.impl.action_execution.action_execution_client import (
+            ActionExecutionClient,
+        )
+        assert callable(getattr(ActionExecutionClient, 'terminus_2_cmd_run'))
+
+
+# ==============================================================================
+# Terminal Screen Formatting Tests
+# ==============================================================================
+
+
+class TestTerminalScreenFormatting:
+    """Tests for _format_terminal_screen logic in the action execution server.
+
+    Since ActionExecutor has heavy dependencies (FastAPI, BashSession, etc.),
+    we re-implement the pure formatting logic here to test it in isolation.
+    This mirrors ActionExecutor._format_terminal_screen exactly.
+    """
+
+    @staticmethod
+    def _format_terminal_screen(obs, command):
+        """Pure-function copy of ActionExecutor._format_terminal_screen."""
+        meta = obs.metadata
+        username = meta.username or 'root'
+        hostname = meta.hostname or 'sandbox'
+        cwd = meta.working_dir or '/'
+        suffix = '#' if username == 'root' else '$'
+        prompt = f'{username}@{hostname}:{cwd}{suffix} '
+
+        lines = [f'{prompt}{command}']
+        if obs.content.strip():
+            lines.append(obs.content)
+        lines.append(prompt)
+        return '\n'.join(lines)
+
+    def _make_obs(self, content, username=None, hostname=None, working_dir=None):
+        from openhands.events.observation.commands import (
+            CmdOutputMetadata,
+            CmdOutputObservation,
+        )
+        metadata = CmdOutputMetadata(
+            exit_code=0,
+            username=username,
+            hostname=hostname,
+            working_dir=working_dir,
+        )
+        return CmdOutputObservation(
+            content=content,
+            command='test',
+            metadata=metadata,
+        )
+
+    def test_basic_formatting(self):
+        obs = self._make_obs(
+            'file1.txt\nfile2.txt',
+            username='root',
+            hostname='abc123',
+            working_dir='/app',
+        )
+        result = self._format_terminal_screen(obs, 'ls')
+
+        assert result.startswith('root@abc123:/app# ls')
+        assert 'file1.txt' in result
+        assert 'file2.txt' in result
+        assert result.endswith('root@abc123:/app# ')
+
+    def test_root_user_gets_hash_prompt(self):
+        obs = self._make_obs('', username='root', hostname='h', working_dir='/')
+        result = self._format_terminal_screen(obs, 'pwd')
+        assert 'root@h:/# pwd' in result
+
+    def test_non_root_user_gets_dollar_prompt(self):
+        obs = self._make_obs('', username='developer', hostname='h', working_dir='/home')
+        result = self._format_terminal_screen(obs, 'pwd')
+        assert 'developer@h:/home$ pwd' in result
+
+    def test_empty_content_no_extra_lines(self):
+        obs = self._make_obs('', username='root', hostname='h', working_dir='/')
+        result = self._format_terminal_screen(obs, 'true')
+        lines = result.split('\n')
+        assert len(lines) == 2
+        assert lines[0] == 'root@h:/# true'
+        assert lines[1] == 'root@h:/# '
+
+    def test_multiline_output(self):
+        obs = self._make_obs(
+            'line1\nline2\nline3',
+            username='root',
+            hostname='box',
+            working_dir='/tmp',
+        )
+        result = self._format_terminal_screen(obs, 'cat file')
+        lines = result.split('\n')
+        assert lines[0] == 'root@box:/tmp# cat file'
+        assert lines[1] == 'line1'
+        assert lines[2] == 'line2'
+        assert lines[3] == 'line3'
+        assert lines[4] == 'root@box:/tmp# '
+
+    def test_defaults_when_metadata_missing(self):
+        obs = self._make_obs('output', username=None, hostname=None, working_dir=None)
+        result = self._format_terminal_screen(obs, 'echo hi')
+        assert result.startswith('root@sandbox:/#')
+        assert 'output' in result
+
+    def test_special_key_ctrl_c_display(self):
+        obs = self._make_obs('', username='root', hostname='h', working_dir='/app')
+        result = self._format_terminal_screen(obs, '^C')
+        assert 'root@h:/app# ^C' in result
+
+    def test_whitespace_only_content_treated_as_empty(self):
+        obs = self._make_obs('   \n  \n  ', username='root', hostname='h', working_dir='/')
+        result = self._format_terminal_screen(obs, 'true')
+        lines = result.split('\n')
+        assert len(lines) == 2
+
+    def test_prompt_appears_at_end(self):
+        obs = self._make_obs(
+            'some output',
+            username='root',
+            hostname='container',
+            working_dir='/workspace',
+        )
+        result = self._format_terminal_screen(obs, 'echo hi')
+        assert result.endswith('root@container:/workspace# ')
+
+    def test_long_command_preserved(self):
+        long_cmd = 'find / -name "*.py" -exec grep -l "import os" {} \\;'
+        obs = self._make_obs('result', username='root', hostname='h', working_dir='/')
+        result = self._format_terminal_screen(obs, long_cmd)
+        assert long_cmd in result.split('\n')[0]
+
+
+# ==============================================================================
+# Terminal Output Prefix Tests
+# ==============================================================================
+
+
+class TestTerminalOutputPrefixes:
+    """Pins the prefix format the server is expected to add to terminal output.
+
+    These tests assert on hand-built strings only; as in the original Terminus-2:
+    - "Current Terminal Screen:" for initial captures and timed-out commands
+    - "New Terminal Output:" for normal command execution output
+    """
+
+    def test_initial_capture_gets_current_screen_prefix(self):
+        """Empty keystrokes (initial capture) should use 'Current Terminal Screen:' prefix."""
+        terminal_state = 'Current Terminal Screen:\nroot@host:/app# pwd\n/app\nroot@host:/app# '
+        assert terminal_state.startswith('Current Terminal Screen:')
+        assert 'root@host:/app#' in terminal_state
+
+    def test_normal_command_gets_new_output_prefix(self):
+        """Regular command output should use 'New Terminal Output:' prefix."""
+        terminal_state = 'New Terminal Output:\nroot@host:/app# ls\nfile.py\nroot@host:/app# '
+        assert terminal_state.startswith('New Terminal Output:')
+        assert 'file.py' in terminal_state
+
+    def test_timed_out_command_gets_current_screen_prefix(self):
+        """Timed-out commands should use 'Current Terminal Screen:' prefix."""
+        terminal_state = 'Current Terminal Screen:\nroot@host:/app# sleep 100\n'
+        assert terminal_state.startswith('Current Terminal Screen:')
+
+    def test_prefix_followed_by_newline_then_content(self):
+        """Prefix should be followed by newline then the actual screen content."""
+        screen = 'root@host:/app# ls\nfile.py\nroot@host:/app# '
+        prefixed = f'New Terminal Output:\n{screen}'
+        parts = prefixed.split('\n', 1)
+        assert parts[0] == 'New Terminal Output:'
+        assert parts[1] == screen
+
+    def test_initial_message_includes_prefix_from_terminal_state(self):
+        """When building initial user message, the terminal_state already has the prefix."""
+        task = 'Fix the bug in main.py'
+        terminal_state = 'Current Terminal Screen:\nroot@host:/app# '
+        initial_msg = f'{task}\n\n{terminal_state}'
+        assert 'Current Terminal Screen:' in initial_msg
+        assert 'Fix the bug' in initial_msg
+
+    def test_subsequent_output_includes_prefix(self):
+        """Subsequent terminal observations have their prefix baked in."""
+        terminal_state = 'New Terminal Output:\nroot@host:/app# echo hello\nhello\nroot@host:/app# '
+        assert terminal_state.startswith('New Terminal Output:')
+        content_after_prefix = terminal_state[len('New Terminal Output:\n'):]
+        assert content_after_prefix.startswith('root@host:/app#')

From d7d1e0e931b955f6bdda073e1332d636b5cdb49c Mon Sep 17 00:00:00 2001
From: Sugam Devare <sdevare@nvidia.com>
Date: Thu, 19 Feb 2026 22:54:43 -0800
Subject: [PATCH 6/9] feat: terminate confirmation

---
 .../terminus_2_agent/terminus_2_agent.py      |  12 +-
 tests/unit/agenthub/test_terminus_2_agent.py  | 127 ++++++++++++++++++
 2 files changed, 134 insertions(+), 5 deletions(-)

diff --git a/openhands/agenthub/terminus_2_agent/terminus_2_agent.py b/openhands/agenthub/terminus_2_agent/terminus_2_agent.py
index 17dcf1c0bd53..5378fb97b381 100644
--- a/openhands/agenthub/terminus_2_agent/terminus_2_agent.py
+++ b/openhands/agenthub/terminus_2_agent/terminus_2_agent.py
@@ -160,14 +160,9 @@ def step(self, state: State) -> 'Action':
                 return AgentFinishAction(thought='Task completed (confirmed)')
             else:
                 self._pending_completion = True
-                if not commands:
-                    return AgentFinishAction(thought='Task completed (confirmed)')
         else:
             self._pending_completion = False
 
-        if not commands:
-            return AgentThinkAction(thought='No commands to execute, waiting for next input')
-
         for i, cmd in enumerate(commands):
             action = Terminus2CmdRunAction(
                 keystrokes=cmd.keystrokes,
@@ -176,6 +171,13 @@ def step(self, state: State) -> 'Action':
             )
             self.pending_actions.append(action)
 
+        if not self.pending_actions:
+            if self._pending_completion:
+                return Terminus2CmdRunAction(
+                    keystrokes='', duration=0.5, thought=response_text
+                )
+            return AgentThinkAction(thought='No commands to execute, waiting for next input')
+
         return self.pending_actions.popleft()
 
     def _build_messages(
diff --git a/tests/unit/agenthub/test_terminus_2_agent.py b/tests/unit/agenthub/test_terminus_2_agent.py
index 9ed8622e6b72..4a8363f8744c 100644
--- a/tests/unit/agenthub/test_terminus_2_agent.py
+++ b/tests/unit/agenthub/test_terminus_2_agent.py
@@ -287,6 +287,133 @@ def test_pending_completion_reset_on_not_complete(self):
         assert pending is False
         assert action == 'continue'
 
+    def test_first_task_complete_with_no_commands_returns_noop(self):
+        """First task_complete=True with empty commands should return a no-op
+        action (to capture terminal state) rather than finishing immediately."""
+        from collections import deque
+
+        # NOTE(review): replays the branch structure of Terminus2Agent.step on
+        # local variables; the agent itself is never invoked by this test.
+        pending_completion = False
+        pending_actions: deque = deque()
+        commands: list = []
+        is_task_complete = True
+
+        if is_task_complete:
+            if pending_completion:
+                result = 'finish'
+            else:
+                pending_completion = True
+                result = None
+        else:
+            pending_completion = False
+            result = None
+
+        pending_actions.extend(commands)
+
+        if result is None and not pending_actions:
+            if pending_completion:
+                result = 'noop_for_confirmation'
+            else:
+                result = 'think'
+
+        assert pending_completion is True
+        assert result == 'noop_for_confirmation'
+
+    def test_first_task_complete_with_commands_queues_normally(self):
+        """First task_complete=True with commands should queue them normally."""
+        from collections import deque
+
+        pending_completion = False
+        pending_actions: deque = deque()
+        is_task_complete = True
+        commands = [
+            ParsedCommand(keystrokes='ls\n', duration=0.1),
+        ]
+
+        if is_task_complete:
+            if pending_completion:
+                result = 'finish'
+            else:
+                pending_completion = True
+                result = None
+        else:
+            pending_completion = False
+            result = None
+
+        # Queue every parsed command, as step() does before popping the first one.
+        pending_actions.extend(commands)
+
+        if result is None and not pending_actions:
+            if pending_completion:
+                result = 'noop_for_confirmation'
+            else:
+                result = 'think'
+        elif result is None:
+            result = 'pop_pending'
+
+        assert pending_completion is True
+        assert len(pending_actions) == 1
+        assert result == 'pop_pending'
+
+    def test_confirmation_message_appended_when_pending(self):
+        """With completion pending, the last message is a user-role confirmation prompt (flow mirrored locally)."""
+        user_msg = MessageAction(content='Task')
+        user_msg._source = EventSource.USER
+        noop = Terminus2CmdRunAction(keystrokes='', duration=0.5)
+        initial_obs = Terminus2CmdOutputObservation(
+            content='Current Terminal Screen:\nroot@host:/app# ',
+            terminal_state='Current Terminal Screen:\nroot@host:/app# ',
+        )
+        resp = '{"analysis":"done","plan":"done","commands":[],"task_complete":true}'
+        noop2 = Terminus2CmdRunAction(keystrokes='', duration=0.5, thought=resp)
+        confirm_obs = Terminus2CmdOutputObservation(
+            content='Current Terminal Screen:\nroot@host:/app# ',
+            terminal_state='Current Terminal Screen:\nroot@host:/app# ',
+        )
+
+        events = [user_msg, noop, initial_obs, noop2, confirm_obs]
+
+        pending_completion = True
+
+        messages = []
+        messages.append(('system', 'system_prompt'))
+
+        initial_terminal_event = initial_obs  # folded into the first user message below, so the loop skips it
+        first_text = f'{user_msg.content}\n\n{initial_terminal_event.terminal_state}'
+        messages.append(('user', first_text))
+
+        batch_observations: list[str] = []
+        for event in events:
+            if isinstance(event, Terminus2CmdRunAction):
+                if event.thought:
+                    if batch_observations:
+                        messages.append(('user', batch_observations[-1]))
+                        batch_observations = []
+                    messages.append(('assistant', event.thought))
+            elif isinstance(event, Terminus2CmdOutputObservation):
+                if event is initial_terminal_event:
+                    continue
+                batch_observations.append(event.terminal_state)
+
+        if batch_observations:
+            messages.append(('user', batch_observations[-1]))
+
+        if pending_completion:
+            last_terminal = batch_observations[-1] if batch_observations else ''
+            confirmation = (
+                f'Current terminal state:\n{last_terminal}\n\n'
+                'Are you sure you want to mark the task as complete? '
+                "This will trigger your solution to be graded and you won't be able to "
+                'make any further corrections. If so, include "task_complete": true '
+                'in your JSON response again.'
+            )
+            messages.append(('user', confirmation))
+
+        assert any('Are you sure you want to mark the task as complete?' in m[1] for m in messages)
+        assert messages[-1][0] == 'user'
+        assert 'task_complete' in messages[-1][1]
+
 
 # ==============================================================================
 # Observation Handling Tests

From bc6f071f00f4bdea843a8fc4d1df319ebbfdab69 Mon Sep 17 00:00:00 2001
From: Sugam Devare <sdevare@nvidia.com>
Date: Thu, 19 Feb 2026 23:14:58 -0800
Subject: [PATCH 7/9] fix: duplicate user msg

---
 .../terminus_2_agent/terminus_2_agent.py      | 26 ++++++++++++-------
 tests/unit/agenthub/test_terminus_2_agent.py  | 21 +++++++++++----
 2 files changed, 32 insertions(+), 15 deletions(-)

diff --git a/openhands/agenthub/terminus_2_agent/terminus_2_agent.py b/openhands/agenthub/terminus_2_agent/terminus_2_agent.py
index 5378fb97b381..e84750765990 100644
--- a/openhands/agenthub/terminus_2_agent/terminus_2_agent.py
+++ b/openhands/agenthub/terminus_2_agent/terminus_2_agent.py
@@ -274,16 +274,22 @@ def _build_messages(
 
         if batch_observations:
             terminal_output = batch_observations[-1]
-            user_text = self._format_terminal_output(
-                terminal_output, last_timed_out, last_keystrokes
-            )
-            messages.append(
-                Message(role='user', content=[TextContent(text=user_text)])
-            )
-
-        if self._pending_completion:
-            last_terminal = batch_observations[-1] if batch_observations else ''
-            confirmation = COMPLETION_CONFIRMATION.format(terminal_state=last_terminal)
+            if self._pending_completion:
+                confirmation = COMPLETION_CONFIRMATION.format(
+                    terminal_state=terminal_output
+                )
+                messages.append(
+                    Message(role='user', content=[TextContent(text=confirmation)])
+                )
+            else:
+                user_text = self._format_terminal_output(
+                    terminal_output, last_timed_out, last_keystrokes
+                )
+                messages.append(
+                    Message(role='user', content=[TextContent(text=user_text)])
+                )
+        elif self._pending_completion:
+            confirmation = COMPLETION_CONFIRMATION.format(terminal_state='')
             messages.append(
                 Message(role='user', content=[TextContent(text=confirmation)])
             )
diff --git a/tests/unit/agenthub/test_terminus_2_agent.py b/tests/unit/agenthub/test_terminus_2_agent.py
index 4a8363f8744c..b58515b6c9ba 100644
--- a/tests/unit/agenthub/test_terminus_2_agent.py
+++ b/tests/unit/agenthub/test_terminus_2_agent.py
@@ -397,12 +397,21 @@ def test_confirmation_message_appended_when_pending(self):
                 batch_observations.append(event.terminal_state)
 
         if batch_observations:
-            messages.append(('user', batch_observations[-1]))
-
-        if pending_completion:
-            last_terminal = batch_observations[-1] if batch_observations else ''
+            terminal_output = batch_observations[-1]
+            if pending_completion:
+                confirmation = (
+                    f'Current terminal state:\n{terminal_output}\n\n'
+                    'Are you sure you want to mark the task as complete? '
+                    "This will trigger your solution to be graded and you won't be able to "
+                    'make any further corrections. If so, include "task_complete": true '
+                    'in your JSON response again.'
+                )
+                messages.append(('user', confirmation))
+            else:
+                messages.append(('user', terminal_output))
+        elif pending_completion:
             confirmation = (
-                f'Current terminal state:\n{last_terminal}\n\n'
+                'Current terminal state:\n\n\n'
                 'Are you sure you want to mark the task as complete? '
                 "This will trigger your solution to be graded and you won't be able to "
                 'make any further corrections. If so, include "task_complete": true '
@@ -413,6 +422,8 @@ def test_confirmation_message_appended_when_pending(self):
         assert any('Are you sure you want to mark the task as complete?' in m[1] for m in messages)
         assert messages[-1][0] == 'user'
         assert 'task_complete' in messages[-1][1]
+        user_messages = [m for m in messages if m[0] == 'user']
+        assert len(user_messages) == 2  # initial task + confirmation (NOT three)
 
 
 # ==============================================================================

From f5daa8127768abdf3888895d54ad9ea2dd050331 Mon Sep 17 00:00:00 2001
From: Sugam Devare <sdevare@nvidia.com>
Date: Fri, 20 Feb 2026 09:17:05 -0800
Subject: [PATCH 8/9] feat: multiple command output

---
 .../terminus_2_agent/terminus_2_agent.py      |  44 +++++-
 openhands/runtime/action_execution_server.py  |  32 ++--
 tests/unit/agenthub/test_terminus_2_agent.py  | 142 ++++++++++++++++--
 3 files changed, 188 insertions(+), 30 deletions(-)

diff --git a/openhands/agenthub/terminus_2_agent/terminus_2_agent.py b/openhands/agenthub/terminus_2_agent/terminus_2_agent.py
index e84750765990..bc210627625d 100644
--- a/openhands/agenthub/terminus_2_agent/terminus_2_agent.py
+++ b/openhands/agenthub/terminus_2_agent/terminus_2_agent.py
@@ -222,7 +222,9 @@ def _build_messages(
                     continue
                 elif event.source == EventSource.AGENT:
                     if batch_observations:
-                        terminal_output = batch_observations[-1]
+                        terminal_output = self._combine_observations(
+                            batch_observations
+                        )
                         user_text = self._format_terminal_output(
                             terminal_output, last_timed_out, last_keystrokes
                         )
@@ -243,7 +245,9 @@ def _build_messages(
                 last_keystrokes = event.keystrokes
                 if event.thought:
                     if batch_observations:
-                        terminal_output = batch_observations[-1]
+                        terminal_output = self._combine_observations(
+                            batch_observations
+                        )
                         user_text = self._format_terminal_output(
                             terminal_output, last_timed_out, last_keystrokes
                         )
@@ -273,7 +277,7 @@ def _build_messages(
                 pass
 
         if batch_observations:
-            terminal_output = batch_observations[-1]
+            terminal_output = self._combine_observations(batch_observations)
             if self._pending_completion:
                 confirmation = COMPLETION_CONFIRMATION.format(
                     terminal_state=terminal_output
@@ -317,6 +321,40 @@ def _find_initial_terminal_event(
                 return event
         return None
 
+    _NEW_OUTPUT_PREFIX = 'New Terminal Output:\n'
+    _SCREEN_PREFIX = 'Current Terminal Screen:\n'
+
+    @staticmethod
+    def _combine_observations(observations: list[str]) -> str:
+        """Merge the terminal observations of one command batch into one text.
+
+        Zero observations give '' and a single observation is returned as-is.
+        Otherwise the 'New Terminal Output:' / 'Current Terminal Screen:' prefix
+        is stripped from each observation, the remainders are newline-joined,
+        and one prefix is put back in front: that of the last observation that
+        carried one ('New Terminal Output:' if none did).
+        """
+        if len(observations) < 2:
+            return observations[0] if observations else ''
+
+        known_prefixes = (
+            Terminus2Agent._NEW_OUTPUT_PREFIX,
+            Terminus2Agent._SCREEN_PREFIX,
+        )
+        leading = known_prefixes[0]
+        bodies: list[str] = []
+        for text in observations:
+            # The two prefixes cannot both match, so at most one is found.
+            matched = next((p for p in known_prefixes if text.startswith(p)), None)
+            if matched is None:
+                # Unprefixed text (e.g. an error) is kept whole.
+                bodies.append(text)
+            else:
+                leading = matched
+                bodies.append(text[len(matched):])
+
+        return leading + '\n'.join(bodies)
+
     def _format_terminal_output(
         self, terminal_output: str, timed_out: bool, keystrokes: str
     ) -> str:
diff --git a/openhands/runtime/action_execution_server.py b/openhands/runtime/action_execution_server.py
index 1a8007976adf..97eb2eb9f458 100644
--- a/openhands/runtime/action_execution_server.py
+++ b/openhands/runtime/action_execution_server.py
@@ -2222,28 +2222,33 @@ async def codex_update_plan(self, action: CodexUpdatePlanAction) -> Observation:
             return ErrorObservation(f'Failed to update plan: {str(e)}')
 
     def _format_terminal_screen(
-        self, obs: CmdOutputObservation, command: str
+        self, obs: CmdOutputObservation, command: str, pre_cwd: str | None = None
     ) -> str:
         """Format a CmdOutputObservation to look like a tmux capture-pane screen.
 
+        The first line is the prompt for pre_cwd (the directory before execution)
+        plus the command; the last line is the prompt for the post-execution
+        working_dir from metadata, as in a real terminal after ``cd /app/src``.
+        A pre_cwd of None or '' falls back to working_dir for both prompts.
+
         Produces output like:
-            root@hostname:/app# ls -la
-            total 775
-            drwxr-xr-x  3 root root  3 Sep 13 17:59 .
-            ...
-            root@hostname:/app#
+            root@hostname:/app# cd /app/src
+            root@hostname:/app/src#
         """
         meta = obs.metadata
         username = meta.username or 'root'
         hostname = meta.hostname or 'sandbox'
-        cwd = meta.working_dir or '/'
+        post_cwd = meta.working_dir or '/'  # '/' when metadata carries no working_dir
         suffix = '#' if username == 'root' else '$'
-        prompt = f'{username}@{hostname}:{cwd}{suffix} '
 
-        lines = [f'{prompt}{command}']
+        before_cwd = pre_cwd if pre_cwd else post_cwd  # None/'' -> reuse the post-exec cwd
+        pre_prompt = f'{username}@{hostname}:{before_cwd}{suffix} '
+        post_prompt = f'{username}@{hostname}:{post_cwd}{suffix} '
+
+        lines = [f'{pre_prompt}{command}']
         if obs.content.strip():
             lines.append(obs.content)
-        lines.append(prompt)
+        lines.append(post_prompt)  # trailing bare prompt closes the screen
         return '\n'.join(lines)
 
     async def terminus_2_cmd_run(
@@ -2264,12 +2269,13 @@ async def terminus_2_cmd_run(
 
             keystrokes = action.keystrokes
             duration = min(action.duration, 60)
+            pre_cwd = bash_session.cwd
 
             if keystrokes == '' or keystrokes.strip() == '':
                 cmd_action = CmdRunAction(command='pwd')
                 cmd_action.set_hard_timeout(duration + 5, blocking=False)
                 obs = await call_sync_from_async(bash_session.execute, cmd_action)
-                screen = self._format_terminal_screen(obs, 'pwd')
+                screen = self._format_terminal_screen(obs, 'pwd', pre_cwd)
                 terminal_state = f'Current Terminal Screen:\n{screen}'
                 return Terminus2CmdOutputObservation(
                     content=terminal_state,
@@ -2283,7 +2289,7 @@ async def terminus_2_cmd_run(
                 cmd_action = CmdRunAction(command=special_key)
                 cmd_action.set_hard_timeout(duration + 5, blocking=False)
                 obs = await call_sync_from_async(bash_session.execute, cmd_action)
-                screen = self._format_terminal_screen(obs, f'^{"C" if special_key == "C-c" else "D"}')
+                screen = self._format_terminal_screen(obs, f'^{"C" if special_key == "C-c" else "D"}', pre_cwd)
                 terminal_state = f'New Terminal Output:\n{screen}'
                 return Terminus2CmdOutputObservation(
                     content=terminal_state,
@@ -2301,7 +2307,7 @@ async def terminus_2_cmd_run(
             if hasattr(obs, 'metadata') and obs.metadata:
                 timed_out = getattr(obs.metadata, 'exit_code', 0) == -1
 
-            screen = self._format_terminal_screen(obs, command)
+            screen = self._format_terminal_screen(obs, command, pre_cwd)
             if timed_out:
                 terminal_state = f'Current Terminal Screen:\n{screen}'
             else:
diff --git a/tests/unit/agenthub/test_terminus_2_agent.py b/tests/unit/agenthub/test_terminus_2_agent.py
index b58515b6c9ba..9b4fb3f0a1b6 100644
--- a/tests/unit/agenthub/test_terminus_2_agent.py
+++ b/tests/unit/agenthub/test_terminus_2_agent.py
@@ -633,14 +633,20 @@ def _simulate_build_messages(self, events):
                     continue
                 elif event.source == EventSource.AGENT:
                     if batch_observations:
-                        messages.append(('user', batch_observations[-1]))
+                        combined = Terminus2Agent._combine_observations(
+                            batch_observations
+                        )
+                        messages.append(('user', combined))
                         batch_observations = []
                     messages.append(('assistant', event.content))
 
             elif isinstance(event, Terminus2CmdRunAction):
                 if event.thought:
                     if batch_observations:
-                        messages.append(('user', batch_observations[-1]))
+                        combined = Terminus2Agent._combine_observations(
+                            batch_observations
+                        )
+                        messages.append(('user', combined))
                         batch_observations = []
                     messages.append(('assistant', event.thought))
 
@@ -653,7 +659,8 @@ def _simulate_build_messages(self, events):
                 batch_observations.append(f'ERROR: {event.content}')
 
         if batch_observations:
-            messages.append(('user', batch_observations[-1]))
+            combined = Terminus2Agent._combine_observations(batch_observations)
+            messages.append(('user', combined))
 
         return messages
 
@@ -709,7 +716,7 @@ def test_one_round_trip(self):
         assert 'file.py' in msgs[3][1]
 
     def test_multi_command_batch(self):
-        """Multiple commands from one LLM call: only last observation becomes user message."""
+        """Multiple commands from one LLM call: ALL observations are combined into one user message."""
         llm_response = '{"analysis":"a","plan":"p","commands":[{"keystrokes":"ls\\n"},{"keystrokes":"pwd\\n"}]}'
 
         user_msg = MessageAction(content='Task')
@@ -721,13 +728,13 @@ def test_multi_command_batch(self):
         )
         cmd1 = Terminus2CmdRunAction(keystrokes='ls\n', duration=0.1, thought=llm_response)
         obs1 = Terminus2CmdOutputObservation(
-            content='New Terminal Output:\nls output',
-            terminal_state='New Terminal Output:\nls output',
+            content='New Terminal Output:\nroot@host:/app# ls\nfile.py\nroot@host:/app# ',
+            terminal_state='New Terminal Output:\nroot@host:/app# ls\nfile.py\nroot@host:/app# ',
         )
         cmd2 = Terminus2CmdRunAction(keystrokes='pwd\n', duration=0.1, thought='')
         obs2 = Terminus2CmdOutputObservation(
-            content='New Terminal Output:\npwd output',
-            terminal_state='New Terminal Output:\npwd output',
+            content='New Terminal Output:\nroot@host:/app# pwd\n/app\nroot@host:/app# ',
+            terminal_state='New Terminal Output:\nroot@host:/app# pwd\n/app\nroot@host:/app# ',
         )
 
         events = [user_msg, noop, initial_obs, cmd1, obs1, cmd2, obs2]
@@ -737,7 +744,11 @@ def test_multi_command_batch(self):
         assert msgs[2][0] == 'assistant'
         assert msgs[2][1] == llm_response
         assert msgs[3][0] == 'user'
-        assert 'pwd output' in msgs[3][1]
+        assert 'ls' in msgs[3][1]
+        assert 'file.py' in msgs[3][1]
+        assert 'pwd' in msgs[3][1]
+        assert '/app' in msgs[3][1]
+        assert msgs[3][1].count('New Terminal Output:') == 1
 
     def test_two_round_trips(self):
         """Two LLM calls produce: sys, user, asst, user, asst, user."""
@@ -925,19 +936,22 @@ class TestTerminalScreenFormatting:
     """
 
     @staticmethod
-    def _format_terminal_screen(obs, command):
+    def _format_terminal_screen(obs, command, pre_cwd=None):
         """Pure-function copy of ActionExecutor._format_terminal_screen."""
         meta = obs.metadata
         username = meta.username or 'root'
         hostname = meta.hostname or 'sandbox'
-        cwd = meta.working_dir or '/'
+        post_cwd = meta.working_dir or '/'
         suffix = '#' if username == 'root' else '$'
-        prompt = f'{username}@{hostname}:{cwd}{suffix} '
 
-        lines = [f'{prompt}{command}']
+        before_cwd = pre_cwd if pre_cwd else post_cwd  # no pre_cwd: reuse the post-exec cwd
+        pre_prompt = f'{username}@{hostname}:{before_cwd}{suffix} '
+        post_prompt = f'{username}@{hostname}:{post_cwd}{suffix} '
+
+        lines = [f'{pre_prompt}{command}']
         if obs.content.strip():
             lines.append(obs.content)
-        lines.append(prompt)
+        lines.append(post_prompt)  # last line is always a bare prompt
         return '\n'.join(lines)
 
     def _make_obs(self, content, username=None, hostname=None, working_dir=None):
@@ -1037,6 +1051,36 @@ def test_long_command_preserved(self):
         result = self._format_terminal_screen(obs, long_cmd)
         assert long_cmd in result.split('\n')[0]
 
+    def test_cd_pre_cwd_differs_from_post_cwd(self):
+        """A cd is echoed after the old-cwd prompt and followed by the new-cwd prompt."""
+        after_cd = self._make_obs(
+            '', username='root', hostname='host', working_dir='/app/src'
+        )
+        screen = self._format_terminal_screen(after_cd, 'cd /app/src', pre_cwd='/app')
+        echoed, trailing_prompt = screen.split('\n')[:2]
+        assert echoed == 'root@host:/app# cd /app/src'
+        assert trailing_prompt == 'root@host:/app/src# '
+
+    def test_no_pre_cwd_uses_post_cwd_for_both(self):
+        """Omitting pre_cwd makes both prompts show the post-execution cwd."""
+        after_cd = self._make_obs(
+            '', username='root', hostname='h', working_dir='/new'
+        )
+        screen = self._format_terminal_screen(after_cd, 'cd /new')
+        echoed, trailing_prompt = screen.split('\n')[:2]
+        assert echoed == 'root@h:/new# cd /new'
+        assert trailing_prompt == 'root@h:/new# '
+
+    def test_non_cd_command_same_cwd(self):
+        """When the cwd does not change, the first and last prompts are the same."""
+        listing = self._make_obs(
+            'file.py', username='root', hostname='h', working_dir='/app'
+        )
+        screen = self._format_terminal_screen(listing, 'ls', pre_cwd='/app')
+        first_row, *_, last_row = screen.split('\n')
+        assert first_row == 'root@h:/app# ls'
+        assert last_row == 'root@h:/app# '
+
 
 # ==============================================================================
 # Terminal Output Prefix Tests
@@ -1090,3 +1134,73 @@ def test_subsequent_output_includes_prefix(self):
         assert terminal_state.startswith('New Terminal Output:')
         content_after_prefix = terminal_state[len('New Terminal Output:\n'):]
         assert content_after_prefix.startswith('root@host:/app#')
+
+
+# ==============================================================================
+# Batch Observation Combination Tests
+# ==============================================================================
+
+
+class TestCombineObservations:
+    """Tests for Terminus2Agent._combine_observations, which merges the terminal
+    outputs of one command batch into a single user message. Multi-observation
+    cases assert the exact combined string: prefix, join order and separator.
+    """
+
+    def test_single_observation_returned_as_is(self):
+        """A one-element batch is passed through without modification."""
+        obs = ['New Terminal Output:\nroot@h:/# ls\nfile.py\nroot@h:/# ']
+        assert Terminus2Agent._combine_observations(obs) == obs[0]
+
+    def test_empty_list_returns_empty_string(self):
+        """No observations yield the empty string."""
+        assert Terminus2Agent._combine_observations([]) == ''
+
+    def test_two_observations_combined_under_single_prefix(self):
+        """Exact output: one prefix, then both screens joined by a newline."""
+        screen1 = 'root@h:/app# ls\nfile.py\nroot@h:/app# '
+        screen2 = 'root@h:/app# pwd\n/app\nroot@h:/app# '
+        result = Terminus2Agent._combine_observations(
+            [f'New Terminal Output:\n{screen1}', f'New Terminal Output:\n{screen2}']
+        )
+        # Equality (not substring checks): '/app' also occurs in every prompt,
+        # so `'/app' in result` would pass even if pwd's output were dropped.
+        assert result == f'New Terminal Output:\n{screen1}\n{screen2}'
+
+    def test_three_observations_all_content_present(self):
+        """Screens keep their original order and nothing is dropped."""
+        screens = [
+            'root@h:/# ls -l\ntotal 4\nroot@h:/# ',
+            'root@h:/# ls *.py\nscript.py\nroot@h:/# ',
+            'root@h:/# grep foo .\n./match\nroot@h:/# ',
+        ]
+        result = Terminus2Agent._combine_observations(
+            [f'New Terminal Output:\n{s}' for s in screens]
+        )
+        assert result == 'New Terminal Output:\n' + '\n'.join(screens)
+
+    def test_mixed_prefixes_uses_last(self):
+        """If last observation was a timeout (Current Terminal Screen:), use that prefix."""
+        obs1 = 'New Terminal Output:\nroot@h:/# ls\nfile.py\nroot@h:/# '
+        obs2 = 'Current Terminal Screen:\nroot@h:/# sleep 100\n'
+        result = Terminus2Agent._combine_observations([obs1, obs2])
+        assert result == (
+            'Current Terminal Screen:\n'
+            'root@h:/# ls\nfile.py\nroot@h:/# \nroot@h:/# sleep 100\n'
+        )
+
+    def test_no_prefix_observations_preserved(self):
+        """Observations without a recognized prefix are included as-is."""
+        obs1 = 'some raw output'
+        obs2 = 'New Terminal Output:\nroot@h:/# pwd\n/\nroot@h:/# '
+        result = Terminus2Agent._combine_observations([obs1, obs2])
+        expected = 'New Terminal Output:\nsome raw output\nroot@h:/# pwd\n/\nroot@h:/# '
+        assert result == expected
+
+    def test_error_mixed_with_observations(self):
+        """ERROR observations (no prefix) are combined with normal observations."""
+        obs1 = 'New Terminal Output:\nroot@h:/# ls\nfile.py\nroot@h:/# '
+        obs2 = 'ERROR: command failed'
+        result = Terminus2Agent._combine_observations([obs1, obs2])
+        expected = 'New Terminal Output:\nroot@h:/# ls\nfile.py\nroot@h:/# \nERROR: command failed'
+        assert result == expected

From 0fa3e476d6d488a9cd8a431ff423d0945c9cdc95 Mon Sep 17 00:00:00 2001
From: Sugam Devare <sdevare@nvidia.com>
Date: Fri, 20 Feb 2026 09:31:26 -0800
Subject: [PATCH 9/9] fix: keystroke bug

---
 openhands/agenthub/terminus_2_agent/terminus_2_agent.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/openhands/agenthub/terminus_2_agent/terminus_2_agent.py b/openhands/agenthub/terminus_2_agent/terminus_2_agent.py
index bc210627625d..eef91a744a62 100644
--- a/openhands/agenthub/terminus_2_agent/terminus_2_agent.py
+++ b/openhands/agenthub/terminus_2_agent/terminus_2_agent.py
@@ -242,7 +242,6 @@ def _build_messages(
                     )
 
             elif isinstance(event, Terminus2CmdRunAction):
-                last_keystrokes = event.keystrokes
                 if event.thought:
                     if batch_observations:
                         terminal_output = self._combine_observations(
@@ -263,6 +262,7 @@ def _build_messages(
                             content=[TextContent(text=event.thought)],
                         )
                     )
+                last_keystrokes = event.keystrokes
 
             elif isinstance(event, Terminus2CmdOutputObservation):
                 if event is initial_terminal_event: