From 512308c85220b5bca9234487029644eb1d4fa3e4 Mon Sep 17 00:00:00 2001
From: Amit Kumar <ak.nitrr13@gmail.com>
Date: Wed, 13 May 2026 04:20:00 +0000
Subject: [PATCH 1/6] feat(turn_output): add markdown-primary parser as Path 4
 (D-22-03)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Phase 22, iteration 1. Adds parse_markdown_envelope as a pure
function over the canonical D-22-03 contract:

  ## Response
  <body>

  ## Confidence
  <float 0.0-1.0> — <one-sentence rationale>

  ## Signal
  <one of: default | success | failed | needs_input | ...>

Parser is forgiving but pinned: H2 or H3 headers, case-insensitive
section names, em-dash / ASCII-double / single-hyphen rationale
separator, integer or float confidence (clamped to [0,1]), missing
rationale gets a placeholder, signal "none" / "null" / blank coerce
to None. Hard error reserved for truly unparseable input
(no Confidence section, no float on the line, empty Response body).

Wires into parse_envelope_from_result as Path 4 (after structured_
response and JSON-parse, before raising EnvelopeMissingError) so
existing structured-output paths keep working unchanged. D-22-01's
elimination of response_format=AgentTurnOutput from create_agent
lands in the next iteration.

Tests (tests/test_markdown_turn_output.py): 25 cases covering all
sections, missing/malformed/blank variants, signal vocabulary, code
blocks in response body, header drift (H3, lowercase), and Path
4 integration into parse_envelope_from_result.

Verified: ruff clean; pytest -x → 1231 passed (1206 prior + 25 new);
coverage 87.0%.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 src/runtime/agents/turn_output.py  | 151 ++++++++++++++-
 tests/test_markdown_turn_output.py | 291 +++++++++++++++++++++++++++++
 2 files changed, 438 insertions(+), 4 deletions(-)
 create mode 100644 tests/test_markdown_turn_output.py
diff --git a/src/runtime/agents/turn_output.py b/src/runtime/agents/turn_output.py
index df202e4..99a72ff 100644
--- a/src/runtime/agents/turn_output.py
+++ b/src/runtime/agents/turn_output.py
@@ -21,8 +21,9 @@
 
 import json
 import logging
+import re
 
-from pydantic import BaseModel, ConfigDict, Field
+from pydantic import BaseModel, ConfigDict, Field, ValidationError
 
 _LOG = logging.getLogger("runtime.orchestrator")
 
@@ -87,6 +88,127 @@ def __init__(self, *, agent: str, field: str, message: str | None = None):
         super().__init__(message or f"envelope_missing: {field} (agent={agent})")
 
 
+_HEADER_SPLIT = re.compile(r"^#{2,}\s+(\w+)\s*$", re.MULTILINE)
+_CONF_LINE = re.compile(
+    # Leftmost float (allows int form), optional rationale after em-dash /
+    # ASCII dash / hyphen separator. ``re.DOTALL`` so a multi-line rationale
+    # is captured wholesale.
+    r"^\s*(-?[0-9]*\.?[0-9]+)\s*(?:[\u2014\-]+\s*(.*))?$",
+    re.DOTALL,
+)
+
+
+def _clamp_unit(x: float) -> float:
+    """Clamp a confidence float into [0, 1] without raising. The skill
+    prompt asks for [0, 1]; an LLM occasionally emits ``1.05`` or
+    ``-0.1`` — clamp rather than reject so the parse step is forgiving."""
+    if x < 0.0:
+        return 0.0
+    if x > 1.0:
+        return 1.0
+    return x
+
+
+def parse_markdown_envelope(
+    content: str, *, agent: str = "",
+) -> "AgentTurnOutput":
+    """D-22-03 — parse the trailing markdown contract block into an
+    :class:`AgentTurnOutput`.
+
+    Expects three sections ``## Response`` / ``## Confidence`` /
+    ``## Signal`` (case-insensitive headers, ``##+`` accepted) at the
+    end of the LLM's reply. The free-text body of ``## Response``
+    becomes ``content``; the float on the ``## Confidence`` line
+    becomes ``confidence`` (clamped to [0, 1]); the rest of that line
+    after ``-`` / ``--`` / ``\u2014`` becomes ``confidence_rationale``;
+    the ``## Signal`` body becomes ``signal`` (with ``none``/``null``/
+    blank coerced to ``None``).
+
+    Raises :class:`EnvelopeMissingError` only when the parse cannot
+    produce a valid envelope at all — missing ``## Confidence`` line,
+    unparseable confidence value, or empty ``## Response`` body.
+    """
+    if not isinstance(content, str) or not content.strip():
+        raise EnvelopeMissingError(
+            agent=agent, field="content",
+            message=f"envelope_missing: empty content (agent={agent!r})",
+        )
+
+    # ``re.split`` with the header capture-group yields:
+    #   [pre, h1, body1, h2, body2, ...]
+    # Pre is the prose before the first heading; we ignore it for parse.
+    parts = _HEADER_SPLIT.split(content)
+    sections: dict[str, str] = {}
+    if len(parts) >= 3:
+        for i in range(1, len(parts) - 1, 2):
+            key = parts[i].strip().lower()
+            body = parts[i + 1].strip()
+            # Don't overwrite an earlier section with a later same-named
+            # one (unusual but possible in pathological prompts).
+            sections.setdefault(key, body)
+
+    response_body = sections.get("response", "").strip()
+    raw_conf = sections.get("confidence", "").strip()
+
+    if not raw_conf:
+        raise EnvelopeMissingError(
+            agent=agent, field="confidence",
+            message=(
+                f"envelope_missing: confidence section absent or empty "
+                f"(agent={agent!r})"
+            ),
+        )
+
+    m = _CONF_LINE.match(raw_conf)
+    if not m:
+        raise EnvelopeMissingError(
+            agent=agent, field="confidence",
+            message=(
+                f"envelope_missing: confidence section did not parse "
+                f"(agent={agent!r}, raw={raw_conf!r})"
+            ),
+        )
+    try:
+        conf_value = _clamp_unit(float(m.group(1)))
+    except (TypeError, ValueError) as exc:
+        raise EnvelopeMissingError(
+            agent=agent, field="confidence",
+            message=(
+                f"envelope_missing: confidence value not a float "
+                f"(agent={agent!r}, raw={raw_conf!r})"
+            ),
+        ) from exc
+    rationale = (m.group(2) or "").strip() or "(no rationale provided)"
+
+    signal_raw = sections.get("signal", "").strip().lower() or None
+    if signal_raw in {"none", "null", "", "n/a"}:
+        signal_raw = None
+
+    if not response_body:
+        raise EnvelopeMissingError(
+            agent=agent, field="content",
+            message=(
+                f"envelope_missing: response section empty (agent={agent!r})"
+            ),
+        )
+
+    try:
+        return AgentTurnOutput(
+            content=response_body,
+            confidence=conf_value,
+            confidence_rationale=rationale,
+            signal=signal_raw,
+        )
+    except ValidationError as exc:
+        raise EnvelopeMissingError(
+            agent=agent, field="validation",
+            message=(
+                f"envelope_missing: pydantic validation rejected the "
+                f"parsed values (agent={agent!r}): {exc}"
+            ),
+        ) from exc
+
+
 def parse_envelope_from_result(
     result: dict,
     *,
@@ -144,13 +266,33 @@ def parse_envelope_from_result(
             continue
         break
 
-    # Path 3: fail loudly
+    # Path 4 (D-22-01): markdown-primary parse on the last AIMessage's
+    # content. Producers that emit the new ``## Response / ## Confidence /
+    # ## Signal`` section block land here when Paths 1+2 yield nothing.
+    for msg in reversed(messages):
+        if msg.__class__.__name__ != "AIMessage":
+            continue
+        content = getattr(msg, "content", None)
+        if not isinstance(content, str) or not content.strip():
+            continue
+        try:
+            return parse_markdown_envelope(content, agent=agent)
+        except EnvelopeMissingError:
+            # This AIMessage didnt carry a parseable markdown contract.
+            # Continue scanning earlier messages on the off-chance a
+            # nested loop emitted the contract one step back.
+            continue
+        break
+
+    # Path 5 (terminal): fail loudly. None of the four paths produced a
+    # valid envelope — this is a real prompt-drift signal worth
+    # surfacing as a structured agent_run error.
     raise EnvelopeMissingError(
         agent=agent,
         field="structured_response",
         message=(
-            f"envelope_missing: no structured_response or JSON-decodable "
-            f"AIMessage envelope found (agent={agent})"
+            f"envelope_missing: no structured_response, JSON-decodable, or "
+            f"markdown-decodable AIMessage envelope found (agent={agent})"
         ),
     )
 
@@ -197,5 +339,6 @@ def reconcile_confidence(
     "AgentTurnOutput",
     "EnvelopeMissingError",
     "parse_envelope_from_result",
+    "parse_markdown_envelope",
     "reconcile_confidence",
 ]
diff --git a/tests/test_markdown_turn_output.py b/tests/test_markdown_turn_output.py
new file mode 100644
index 0000000..4049cfe
--- /dev/null
+++ b/tests/test_markdown_turn_output.py
@@ -0,0 +1,291 @@
+"""Phase 22 — markdown turn output parser matrix.
+
+Pure-function tests on parse_markdown_envelope. Covers the
+section-by-section parse + the validation paths the framework relies
+on (D-22-03 contract).
+
+Pinning the contract here is the single point of compat truth — every
+LLM that writes the documented ## Response / ## Confidence /
+## Signal block round-trips through these tests verbatim.
+"""
+from __future__ import annotations
+
+import pytest
+
+from runtime.agents.turn_output import (
+    AgentTurnOutput,
+    EnvelopeMissingError,
+    parse_envelope_from_result,
+    parse_markdown_envelope,
+)
+
+
+# ---------------------------------------------------------------------------
+# Happy path
+# ---------------------------------------------------------------------------
+
+def test_all_sections_present_em_dash_rationale():
+    md = (
+        "Some prose preamble that may or may not exist.\n\n"
+        "## Response\n"
+        "Looks like a deploy regression in payments-svc.\n\n"
+        "## Confidence\n"
+        "0.85 — strong correlation with v1.120 release timeline\n\n"
+        "## Signal\n"
+        "default\n"
+    )
+    env = parse_markdown_envelope(md, agent="triage")
+    assert isinstance(env, AgentTurnOutput)
+    assert env.content == "Looks like a deploy regression in payments-svc."
+    assert env.confidence == 0.85
+    assert env.confidence_rationale == (
+        "strong correlation with v1.120 release timeline"
+    )
+    assert env.signal == "default"
+
+
+def test_ascii_dash_rationale():
+    md = (
+        "## Response\nfoo\n"
+        "## Confidence\n0.7 -- ok\n"
+        "## Signal\nsuccess\n"
+    )
+    env = parse_markdown_envelope(md)
+    assert env.confidence == 0.7
+    assert env.confidence_rationale == "ok"
+    assert env.signal == "success"
+
+
+def test_single_hyphen_rationale():
+    md = (
+        "## Response\nfoo\n"
+        "## Confidence\n0.6 - some rationale\n"
+        "## Signal\nfailed\n"
+    )
+    env = parse_markdown_envelope(md)
+    assert env.confidence == 0.6
+    assert env.confidence_rationale == "some rationale"
+
+
+# ---------------------------------------------------------------------------
+# Confidence shape variants
+# ---------------------------------------------------------------------------
+
+def test_confidence_int_zero_clamps_in_range():
+    md = (
+        "## Response\nbody\n"
+        "## Confidence\n0\n"
+        "## Signal\nfailed\n"
+    )
+    env = parse_markdown_envelope(md)
+    assert env.confidence == 0.0
+    # Missing rationale is replaced with placeholder.
+    assert env.confidence_rationale == "(no rationale provided)"
+
+
+def test_confidence_int_one():
+    md = (
+        "## Response\nbody\n"
+        "## Confidence\n1\n"
+        "## Signal\nsuccess\n"
+    )
+    env = parse_markdown_envelope(md)
+    assert env.confidence == 1.0
+
+
+def test_confidence_above_range_clamps():
+    """LLM occasionally emits 1.05 — clamp rather than reject."""
+    md = (
+        "## Response\nbody\n"
+        "## Confidence\n1.05 — over\n"
+        "## Signal\nsuccess\n"
+    )
+    env = parse_markdown_envelope(md)
+    assert env.confidence == 1.0
+
+
+def test_confidence_below_range_clamps():
+    md = (
+        "## Response\nbody\n"
+        "## Confidence\n-0.2 — under\n"
+        "## Signal\nfailed\n"
+    )
+    env = parse_markdown_envelope(md)
+    assert env.confidence == 0.0
+
+
+def test_confidence_unparseable_raises():
+    md = (
+        "## Response\nbody\n"
+        "## Confidence\nhigh — strong\n"
+        "## Signal\nsuccess\n"
+    )
+    with pytest.raises(EnvelopeMissingError) as exc:
+        parse_markdown_envelope(md, agent="triage")
+    assert exc.value.field == "confidence"
+    assert "triage" in str(exc.value)
+
+
+def test_confidence_section_missing_raises():
+    md = (
+        "## Response\nbody\n"
+        "## Signal\nsuccess\n"
+    )
+    with pytest.raises(EnvelopeMissingError) as exc:
+        parse_markdown_envelope(md)
+    assert exc.value.field == "confidence"
+
+
+# ---------------------------------------------------------------------------
+# Signal vocabulary
+# ---------------------------------------------------------------------------
+
+@pytest.mark.parametrize("blank", ["", "none", "None", "NULL", "  ", "n/a"])
+def test_signal_blank_or_none_token_yields_none(blank: str):
+    md = (
+        "## Response\nbody\n"
+        "## Confidence\n0.5 — mid\n"
+        f"## Signal\n{blank}\n"
+    )
+    env = parse_markdown_envelope(md)
+    assert env.signal is None
+
+
+def test_signal_unknown_value_passed_through_lowercased():
+    """Vocabulary validation lives in the routing layer, not the parser."""
+    md = (
+        "## Response\nbody\n"
+        "## Confidence\n0.5 — mid\n"
+        "## Signal\nWeirdSignal\n"
+    )
+    env = parse_markdown_envelope(md)
+    assert env.signal == "weirdsignal"
+
+
+def test_signal_section_missing_yields_none():
+    md = (
+        "## Response\nbody\n"
+        "## Confidence\n0.4 — hedged\n"
+    )
+    env = parse_markdown_envelope(md)
+    assert env.signal is None
+
+
+# ---------------------------------------------------------------------------
+# Response body shape
+# ---------------------------------------------------------------------------
+
+def test_multi_line_response_with_code_block():
+    md = (
+        "## Response\n"
+        "Here is the fix:\n\n"
+        "```python\n"
+        "def add(a, b):\n"
+        "    return a + b\n"
+        "```\n\n"
+        "Run pytest after applying.\n\n"
+        "## Confidence\n"
+        "0.92 — verified locally\n\n"
+        "## Signal\n"
+        "success\n"
+    )
+    env = parse_markdown_envelope(md)
+    assert "```python" in env.content
+    assert "Run pytest" in env.content
+    assert env.confidence == 0.92
+
+
+def test_response_section_missing_raises():
+    md = (
+        "## Confidence\n0.7 — ok\n"
+        "## Signal\nsuccess\n"
+    )
+    with pytest.raises(EnvelopeMissingError) as exc:
+        parse_markdown_envelope(md)
+    assert exc.value.field == "content"
+
+
+def test_empty_content_raises():
+    with pytest.raises(EnvelopeMissingError) as exc:
+        parse_markdown_envelope("")
+    assert exc.value.field == "content"
+
+
+# ---------------------------------------------------------------------------
+# Header drift: H3 / lowercase
+# ---------------------------------------------------------------------------
+
+def test_h3_headers_accepted():
+    """Watch-out: model may emit ### instead of ##. Parser accepts ##+."""
+    md = (
+        "### Response\nbody\n"
+        "### Confidence\n0.5 — mid\n"
+        "### Signal\ndefault\n"
+    )
+    env = parse_markdown_envelope(md)
+    assert env.content == "body"
+    assert env.confidence == 0.5
+    assert env.signal == "default"
+
+
+def test_lowercase_headers_accepted():
+    md = (
+        "## response\nbody\n"
+        "## confidence\n0.5 — mid\n"
+        "## signal\nsuccess\n"
+    )
+    env = parse_markdown_envelope(md)
+    assert env.confidence == 0.5
+    assert env.signal == "success"
+
+
+# ---------------------------------------------------------------------------
+# parse_envelope_from_result Path 4 integration
+# ---------------------------------------------------------------------------
+
+def _ai(content: str):
+    """Construct an object that quacks like an AIMessage for path-4
+    class-name walker."""
+    obj = type("AIMessage", (), {})()
+    obj.content = content  # type: ignore[attr-defined]
+    return obj
+
+
+def test_parse_envelope_from_result_falls_through_to_path_4_markdown():
+    """No structured_response, no JSON in messages — Path 4 wins."""
+    md = (
+        "## Response\nbody from path 4\n"
+        "## Confidence\n0.8 — ok\n"
+        "## Signal\nsuccess\n"
+    )
+    result = {"messages": [_ai(md)], "structured_response": None}
+    env = parse_envelope_from_result(result, agent="triage")
+    assert env.content == "body from path 4"
+    assert env.confidence == 0.8
+    assert env.signal == "success"
+
+
+def test_parse_envelope_from_result_path_1_still_wins_when_present():
+    """Markdown parser doesn't shadow the structured_response path."""
+    md = (
+        "## Response\nshould not be picked\n"
+        "## Confidence\n0.1 — wrong\n"
+        "## Signal\nfailed\n"
+    )
+    sr = AgentTurnOutput(
+        content="from structured_response",
+        confidence=0.99,
+        confidence_rationale="strong",
+        signal="success",
+    )
+    result = {"messages": [_ai(md)], "structured_response": sr}
+    env = parse_envelope_from_result(result, agent="triage")
+    assert env.content == "from structured_response"
+    assert env.confidence == 0.99
+
+
+def test_parse_envelope_from_result_raises_when_no_path_yields():
+    """Empty messages + no structured_response + no markdown → raises."""
+    result = {"messages": [_ai("just prose, no sections")]}
+    with pytest.raises(EnvelopeMissingError):
+        parse_envelope_from_result(result, agent="triage")

From bc05e73154eeb647af036bfc0df0a8ce5b4ee416 Mon Sep 17 00:00:00 2001
From: Amit Kumar <ak.nitrr13@gmail.com>
Date: Wed, 13 May 2026 04:58:20 +0000
Subject: [PATCH 2/6] feat(turn_output): markdown-primary turn output (Phase
 22)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replaces the JSON-envelope-as-primary structured output path with
markdown-primary turn output per the Phase 22 plan
(.planning/phases/22-markdown-turn-output/22-CONTEXT.md). The agent
loop terminates on natural React END (no tool calls in the final
AIMessage); the framework parses the markdown body via Path 4 in
parse_envelope_from_result.

D-22-01: graph.py + agents/responsive.py drop response_format=
AgentTurnOutput from create_agent. The pre-Phase-22 ToolStrategy /
envelope-as-callable-tool termination signal is gone — the loop now
terminates the way every chat model natively does.

D-22-03: parse_markdown_envelope (already on this branch from the
prior commit) is wired as Path 4 of parse_envelope_from_result.
Paths 1 (structured_response) and 2 (JSON-parse) stay as defensive
fallbacks for providers that DO emit structured output.

D-22-04: All 6 example skill prompts (3 incident_management +
3 code_review with Output contract sections) get the new
## Output contract — REQUIRED block instructing the LLM to end
every reply with literal ## Response / ## Confidence / ## Signal
sections.

D-22-05: StubChatModel + EnvelopeStubChatModel (in tests/
_envelope_helpers.py) revert to pre-Phase-15 simple semantics —
emit canned text wrapped in the D-22-03 markdown contract. The
_envelope_tool_name field, bind_tools envelope-tool detection, and
_generate auto-emit branch are removed. tool_call_plan + closing
markdown emit in a single AIMessage so unit tests with empty tool
registries (tools=[]) still get a parseable envelope after langgraph
terminates the loop on the first turn.

D-22-06: single atomic commit (this one + the prior parser commit
on the same branch). Legacy structured-output integration is
removed, not config-gated.

graph.py:_DEFAULT_STUB_CANNED reverts to plain text bodies — the
stub wraps them in markdown via stub_envelope_* fields. The
deep_investigator entry's confidence (0.30, below the 0.75 gate
threshold) is preserved via _DEFAULT_STUB_ENVELOPE_CONFIDENCE.

Test fixture migration:
- tests/_envelope_helpers.py: EnvelopeStubChatModel emits markdown
  + tool_calls in same AIMessage; bind_tools is a no-op.
- tests/test_real_llm_tool_loop_termination.py:
  test_stub_chat_model_records_envelope_tool_name_on_bind rewritten
  as test_stub_chat_model_emits_markdown_envelope_block — the bind
  is now a no-op pass-through; the closing message is markdown.

Verified: ruff clean; pytest -x → 1231 passed (1206 pre-Phase-22 +
25 new from the parser matrix); coverage 86.71% (gate ≥85%);
concept-leak ratchet stays at 154; dist regenerated.

Real-LLM verification (per acceptance #10) is gated by OLLAMA_LIVE=1
and out of scope for this in-CI run; manual smoke against gpt-oss
+ openai/gpt-4o-mini deferred to the operator after merge.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 dist/app.py                                   | 316 ++++++++++++------
 dist/apps/code-review.py                      | 316 ++++++++++++------
 dist/apps/incident-management.py              | 316 ++++++++++++------
 .../code_review/skills/analyzer/system.md     |  21 +-
 examples/code_review/skills/intake/system.md  |  21 +-
 .../code_review/skills/recommender/system.md  |  21 +-
 .../skills/deep_investigator/system.md        |  21 +-
 .../skills/resolution/system.md               |  21 +-
 .../skills/triage/system.md                   |  21 +-
 src/runtime/agents/responsive.py              |  19 +-
 src/runtime/graph.py                          |  42 ++-
 src/runtime/llm.py                            | 105 +++---
 tests/_envelope_helpers.py                    |  56 ++--
 tests/test_real_llm_tool_loop_termination.py  |  39 +--
 14 files changed, 854 insertions(+), 481 deletions(-)

diff --git a/dist/app.py b/dist/app.py
index e42575a..7d8a93e 100644
--- a/dist/app.py
+++ b/dist/app.py
@@ -550,7 +550,7 @@ class IncidentState(Session):
 
 
 
-from pydantic import BaseModel, ConfigDict, Field
+from pydantic import BaseModel, ConfigDict, Field, ValidationError
 
 # ----- imports for runtime/tools/gateway.py -----
 """Risk-rated tool gateway: pure resolver + ``BaseTool`` HITL wrapper.
@@ -1180,6 +1180,7 @@ async def _poll(self, registry):
 """
 
 
+from pydantic import BaseModel, ConfigDict, Field
 
 
 # ----- imports for runtime/memory/knowledge_graph.py -----
@@ -3131,17 +3132,13 @@ class StubChatModel(BaseChatModel):
     ``stub_envelope_confidence`` / ``stub_envelope_rationale`` /
     ``stub_envelope_signal``.
 
-    Phase 15 (LLM-COMPAT-01): ``langchain.agents.create_agent`` with
-    ``response_format=AgentTurnOutput`` (via ``AutoStrategy`` ->
-    ``ToolStrategy`` for non-native-structured-output models, including
-    this stub) injects ``AgentTurnOutput`` as a CALLABLE TOOL. The
-    agent loop only terminates when the LLM emits a tool call NAMED
-    ``AgentTurnOutput``. ``bind_tools`` records that envelope-tool name
-    so ``_generate`` can auto-emit a closing tool call after any
-    user-configured ``tool_call_plan`` is exhausted -- preserving the
-    pre-Phase-15 stub semantics (canned text + optional pre-scripted
-    tool calls) while satisfying the new tool-loop termination
-    contract.
+    Phase 22 (D-22-05): the loop terminates on natural React END now
+    that ``response_format=AgentTurnOutput`` is gone — tests drive
+    markdown-shaped canned text (the framework parses it via Path 4
+    in :func:`runtime.agents.turn_output.parse_envelope_from_result`).
+    The Phase 15 envelope-as-callable-tool auto-emit is removed; the
+    stub is back to its pre-Phase-15 simple semantics (canned text +
+    optional pre-scripted ``tool_call_plan``).
     """
     role: str = "default"
     canned_responses: dict[str, str] = Field(default_factory=dict)
@@ -3150,12 +3147,6 @@ class StubChatModel(BaseChatModel):
     stub_envelope_rationale: str = "stub envelope rationale"
     stub_envelope_signal: str | None = None
     _called_once: bool = False
-    # Phase 15 (LLM-COMPAT-01): set by ``bind_tools`` when
-    # ``langchain.agents.create_agent`` injects a structured-output tool
-    # for ``AgentTurnOutput``. Holds the bare tool name (e.g.
-    # ``"AgentTurnOutput"``) so ``_generate`` can emit a final
-    # envelope-shaped tool call to close the agent loop.
-    _envelope_tool_name: str | None = None
 
     @property
     def _llm_type(self) -> str:
@@ -3163,33 +3154,37 @@ def _llm_type(self) -> str:
 
     def _generate(self, messages: list[BaseMessage], stop: list[str] | None = None,
                   run_manager: Any = None, **kwargs: Any) -> ChatResult:
-        text = self.canned_responses.get(self.role, f"[stub:{self.role}] no canned response")
+        # Phase 22 (D-22-05): tool-call rounds emit the scripted tool
+        # calls with a placeholder content body; the closing turn (no
+        # remaining tool_call_plan entries) emits the canned text
+        # wrapped in the D-22-03 markdown contract that
+        # parse_markdown_envelope reads. The stub_envelope_*
+        # parameters drive the trailing-section bodies so tests can
+        # exercise specific confidence / rationale / signal paths
+        # without authoring markdown by hand.
         tool_calls: list[dict] = []
         if self.tool_call_plan and not self._called_once:
             for tc in self.tool_call_plan:
-                tool_calls.append({"name": tc["name"], "args": tc.get("args", {}), "id": str(uuid4())})
+                tool_calls.append(
+                    {"name": tc["name"], "args": tc.get("args", {}), "id": str(uuid4())}
+                )
             self._called_once = True
-        elif self._envelope_tool_name is not None:
-            # Phase 15 (LLM-COMPAT-01): the tool_call_plan is exhausted
-            # (or wasn't configured) AND ``langchain.agents.create_agent``
-            # has bound the AgentTurnOutput envelope as a tool. Emit a
-            # closing tool call so the loop terminates with a populated
-            # ``structured_response``. The args mirror the
-            # ``with_structured_output`` path's envelope construction so
-            # tests see the same confidence / rationale / signal regardless
-            # of whether the new tool-strategy or the legacy structured-
-            # output path is in play.
-            tool_calls.append({
-                "name": self._envelope_tool_name,
-                "args": {
-                    "content": text or ".",
-                    "confidence": self.stub_envelope_confidence,
-                    "confidence_rationale": self.stub_envelope_rationale,
-                    "signal": self.stub_envelope_signal,
-                },
-                "id": str(uuid4()),
-            })
-        msg = AIMessage(content=text, tool_calls=tool_calls)
+
+        body = self.canned_responses.get(
+            self.role, f"[stub:{self.role}] no canned response",
+        )
+        # Phase 22 (D-22-05): explicit "none" when no signal so the
+        # parser maps to None — preserves pre-Phase-22 behaviour where a
+        # stub with no signal yielded envelope.signal=None.
+        signal_str = self.stub_envelope_signal if self.stub_envelope_signal is not None else "none"
+        md = (
+            f"{body}\n\n"
+            f"## Response\n{body}\n\n"
+            f"## Confidence\n{self.stub_envelope_confidence:.4f} -- "
+            f"{self.stub_envelope_rationale}\n\n"
+            f"## Signal\n{signal_str}\n"
+        )
+        msg = AIMessage(content=md, tool_calls=tool_calls)
         return ChatResult(generations=[ChatGeneration(message=msg)])
 
     async def _agenerate(self, messages: list[BaseMessage], stop: list[str] | None = None,
@@ -3197,30 +3192,15 @@ async def _agenerate(self, messages: list[BaseMessage], stop: list[str] | None =
         return self._generate(messages, stop, run_manager, **kwargs)
 
     def bind_tools(self, tools, *, tool_choice=None, **kwargs):
-        """Record the AgentTurnOutput envelope-tool name when present.
-
-        Phase 15 (LLM-COMPAT-01): ``langchain.agents.create_agent`` with
-        ``response_format=AgentTurnOutput`` calls ``bind_tools(...)``
-        with the user's tools PLUS the envelope-as-a-tool. We scan the
-        list for the AgentTurnOutput-shaped tool (matched by ``__name__``
-        on Pydantic schemas, ``name`` on ``BaseTool`` instances, or the
-        ``"name"`` key on dict-shaped tool specs) and remember it on the
-        instance so ``_generate`` can close the agent loop with a
-        synthetic envelope tool call after any pre-scripted
-        ``tool_call_plan`` is exhausted. Tools bound by the framework
-        itself (real BaseTools the agent should call) flow through
-        unchanged -- the stub still emits them only via
-        ``tool_call_plan``.
+        """Phase 22 (D-22-05): no-op tool binding.
+
+        ``langchain.agents.create_agent`` calls ``bind_tools`` to wire
+        the agent's BaseTool list onto the chat model. The stub does
+        not actually invoke those tools (test fixtures script tool
+        calls via ``tool_call_plan`` instead), so we just return self
+        unchanged. The pre-Phase-22 envelope-tool detection is gone
+        because the framework no longer asks for ``response_format``.
         """
-        for t in tools or []:
-            name = (
-                getattr(t, "__name__", None)
-                or getattr(t, "name", None)
-                or (isinstance(t, dict) and t.get("name"))
-            )
-            if isinstance(name, str) and name == "AgentTurnOutput":
-                self._envelope_tool_name = name
-                break
         return self
 
     # ``BaseChatModel.with_structured_output`` returns ``Runnable[..., dict | BaseModel]``
@@ -6601,6 +6581,127 @@ def __init__(self, *, agent: str, field: str, message: str | None = None):
         super().__init__(message or f"envelope_missing: {field} (agent={agent})")
 
 
+_HEADER_SPLIT = re.compile(r"^#{2,}\s+(\w+)\s*$", re.MULTILINE)
+_CONF_LINE = re.compile(
+    # Leftmost float (allows int form), optional rationale after em-dash /
+    # ASCII dash / hyphen separator. ``re.DOTALL`` so a multi-line rationale
+    # is captured wholesale.
+    r"^\s*(-?[0-9]*\.?[0-9]+)\s*(?:[\u2014\-]+\s*(.*))?$",
+    re.DOTALL,
+)
+
+
+def _clamp_unit(x: float) -> float:
+    """Clamp a confidence float into [0, 1] without raising. The skill
+    prompt asks for [0, 1]; an LLM occasionally emits ``1.05`` or
+    ``-0.1`` — clamp rather than reject so the parse step is forgiving."""
+    if x < 0.0:
+        return 0.0
+    if x > 1.0:
+        return 1.0
+    return x
+
+
+def parse_markdown_envelope(
+    content: str, *, agent: str = "",
+) -> "AgentTurnOutput":
+    """D-22-03 — parse the trailing markdown contract block into an
+    :class:`AgentTurnOutput`.
+
+    Expects three sections ``## Response`` / ``## Confidence`` /
+    ``## Signal`` (case-insensitive headers, ``##+`` accepted) at the
+    end of the LLM's reply. The free-text body of ``## Response``
+    becomes ``content``; the float on the ``## Confidence`` line
+    becomes ``confidence`` (clamped to [0, 1]); the rest of that line
+    after ``-`` / ``--`` / ``\u2014`` becomes ``confidence_rationale``;
+    the ``## Signal`` body becomes ``signal`` (with ``none``/``null``/
+    blank coerced to ``None``).
+
+    Raises :class:`EnvelopeMissingError` only when the parse cannot
+    produce a valid envelope at all — missing ``## Confidence`` line,
+    unparseable confidence value, or empty ``## Response`` body.
+    """
+    if not isinstance(content, str) or not content.strip():
+        raise EnvelopeMissingError(
+            agent=agent, field="content",
+            message=f"envelope_missing: empty content (agent={agent!r})",
+        )
+
+    # ``re.split`` with the header capture-group yields:
+    #   [pre, h1, body1, h2, body2, ...]
+    # Pre is the prose before the first heading; we ignore it for parse.
+    parts = _HEADER_SPLIT.split(content)
+    sections: dict[str, str] = {}
+    if len(parts) >= 3:
+        for i in range(1, len(parts) - 1, 2):
+            key = parts[i].strip().lower()
+            body = parts[i + 1].strip()
+            # Don't overwrite an earlier section with a later same-named
+            # one (unusual but possible in pathological prompts).
+            sections.setdefault(key, body)
+
+    response_body = sections.get("response", "").strip()
+    raw_conf = sections.get("confidence", "").strip()
+
+    if not raw_conf:
+        raise EnvelopeMissingError(
+            agent=agent, field="confidence",
+            message=(
+                f"envelope_missing: confidence section absent or empty "
+                f"(agent={agent!r})"
+            ),
+        )
+
+    m = _CONF_LINE.match(raw_conf)
+    if not m:
+        raise EnvelopeMissingError(
+            agent=agent, field="confidence",
+            message=(
+                f"envelope_missing: confidence section did not parse "
+                f"(agent={agent!r}, raw={raw_conf!r})"
+            ),
+        )
+    try:
+        conf_value = _clamp_unit(float(m.group(1)))
+    except (TypeError, ValueError) as exc:
+        raise EnvelopeMissingError(
+            agent=agent, field="confidence",
+            message=(
+                f"envelope_missing: confidence value not a float "
+                f"(agent={agent!r}, raw={raw_conf!r})"
+            ),
+        ) from exc
+    rationale = (m.group(2) or "").strip() or "(no rationale provided)"
+
+    signal_raw = sections.get("signal", "").strip().lower() or None
+    if signal_raw in {"none", "null", "", "n/a"}:
+        signal_raw = None
+
+    if not response_body:
+        raise EnvelopeMissingError(
+            agent=agent, field="content",
+            message=(
+                f"envelope_missing: response section empty (agent={agent!r})"
+            ),
+        )
+
+    try:
+        return AgentTurnOutput(
+            content=response_body,
+            confidence=conf_value,
+            confidence_rationale=rationale,
+            signal=signal_raw,
+        )
+    except ValidationError as exc:
+        raise EnvelopeMissingError(
+            agent=agent, field="validation",
+            message=(
+                f"envelope_missing: pydantic validation rejected the "
+                f"parsed values (agent={agent!r}): {exc}"
+            ),
+        ) from exc
+
+
 def parse_envelope_from_result(
     result: dict,
     *,
@@ -6658,13 +6759,33 @@ def parse_envelope_from_result(
             continue
         break
 
-    # Path 3: fail loudly
+    # Path 4 (D-22-01): markdown-primary parse on the last AIMessage's
+    # content. Producers that emit the new ``## Response / ## Confidence /
+    # ## Signal`` section block land here when Paths 1+2 yield nothing.
+    for msg in reversed(messages):
+        if msg.__class__.__name__ != "AIMessage":
+            continue
+        content = getattr(msg, "content", None)
+        if not isinstance(content, str) or not content.strip():
+            continue
+        try:
+            return parse_markdown_envelope(content, agent=agent)
+        except EnvelopeMissingError:
+            # This AIMessage didnt carry a parseable markdown contract.
+            # Continue scanning earlier messages on the off-chance a
+            # nested loop emitted the contract one step back.
+            continue
+        break
+
+    # Path 5 (terminal): fail loudly. None of the four paths produced a
+    # valid envelope — this is a real prompt-drift signal worth
+    # surfacing as a structured agent_run error.
     raise EnvelopeMissingError(
         agent=agent,
         field="structured_response",
         message=(
-            f"envelope_missing: no structured_response or JSON-decodable "
-            f"AIMessage envelope found (agent={agent})"
+            f"envelope_missing: no structured_response, JSON-decodable, or "
+            f"markdown-decodable AIMessage envelope found (agent={agent})"
         ),
     )
 
@@ -6711,6 +6832,7 @@ def reconcile_confidence(
     "AgentTurnOutput",
     "EnvelopeMissingError",
     "parse_envelope_from_result",
+    "parse_markdown_envelope",
     "reconcile_confidence",
 ]
 
@@ -8236,24 +8358,14 @@ async def node(state: GraphState) -> dict:
             ]
         else:
             run_tools = tools
-        # Phase 10 (FOC-03 / D-10-02) + Phase 15 (LLM-COMPAT-01): every
-        # responsive agent invocation is wrapped in an AgentTurnOutput
-        # envelope. ``langchain.agents.create_agent`` (the non-deprecated
-        # successor to ``langgraph.prebuilt.create_react_agent``) accepts a
-        # bare schema as ``response_format`` and, by default, wraps it in
-        # ``AutoStrategy`` — ProviderStrategy for models with native
-        # structured-output (OpenAI-class), falling back to ToolStrategy
-        # otherwise (Ollama). ToolStrategy injects AgentTurnOutput as a
-        # callable tool: when the LLM ``calls`` it, the loop terminates on
-        # the same turn with ``result["structured_response"]`` populated.
-        # Eliminates the old two-call structure (loop + separate
-        # ``with_structured_output`` pass) that hit recursion_limit=25 on
-        # Ollama models without true function-calling.
+        # Phase 22 (D-22-01): markdown-primary turn output. Same
+        # change as graph.py:make_agent_node — drop response_format
+        # so the loop terminates on natural React END, then parse the
+        # final AIMessage via Path 4 in parse_envelope_from_result.
         agent_executor = create_agent(
             model=llm,
             tools=run_tools,
             system_prompt=skill.system_prompt,
-            response_format=AgentTurnOutput,
         )
 
         # Phase 11 (FOC-04): reset per-turn confidence hint at the
@@ -9719,24 +9831,25 @@ def _run(**kwargs: Any) -> Any:
             ]
         else:
             run_tools = visible_tools
-        # Phase 10 (FOC-03 / D-10-02) + Phase 15 (LLM-COMPAT-01): every
-        # responsive agent invocation is wrapped in an AgentTurnOutput
-        # envelope. ``langchain.agents.create_agent`` (the non-deprecated
-        # successor to ``langgraph.prebuilt.create_react_agent``) accepts a
-        # bare schema as ``response_format`` and, by default, wraps it in
-        # ``AutoStrategy`` — ProviderStrategy for models with native
-        # structured-output (OpenAI-class), falling back to ToolStrategy
-        # otherwise (Ollama). ToolStrategy injects AgentTurnOutput as a
-        # callable tool: when the LLM ``calls`` it, the loop terminates on
-        # the same turn with ``result["structured_response"]`` populated.
-        # Eliminates the old two-call structure (loop + separate
-        # ``with_structured_output`` pass) that hit recursion_limit=25 on
-        # Ollama models without true function-calling.
+        # Phase 22 (D-22-01): markdown-primary turn output. Drop
+        # ``response_format`` from create_agent — the agent loop now
+        # terminates on the natural React END signal (LLM emits an
+        # AIMessage with no tool calls). The framework parses that
+        # final message body as markdown via Path 4 in
+        # parse_envelope_from_result. Skill prompts (D-22-04) instruct
+        # every reply to end with ## Response / ## Confidence / ##
+        # Signal sections; the parser extracts them.
+        #
+        # Why: forcing LLMs through a JSON-schema-shaped output via
+        # ``response_format`` triggered a class of brittleness across
+        # providers (model-specific JSON drift, tool-strategy + React
+        # END interaction, recursion_limit ceilings). Markdown is the
+        # native format every chat model writes well; the parse step
+        # happens in the framework, where leniency is in our control.
         agent_executor = create_agent(
             model=llm,
             tools=run_tools,
             system_prompt=skill.system_prompt,
-            response_format=AgentTurnOutput,
         )
 
         # Phase 11 (FOC-04): reset per-turn confidence hint. The hint
@@ -9930,12 +10043,17 @@ def _decide_from_signal(inc: Session) -> str:
 
 
 _DEFAULT_STUB_CANNED: dict[str, str] = {
-    # Back-compat defaults for the four canonical agents.  Any YAML-defined
-    # agent can override or extend this via ``skill.stub_response``; new agents
-    # without an entry here fall through to StubChatModel's generic placeholder.
+    # Plain-text bodies; ``StubChatModel._generate`` wraps these in the
+    # D-22-03 markdown envelope using the per-agent stub_envelope_* fields
+    # (the framework parses the wrapped output via Path 4). Apps can
+    # override or extend via ``skill.stub_response``; new agents without
+    # an entry here fall through to ``StubChatModel``'s generic placeholder.
     "intake": "Created INC, no prior matches. Routing to triage.",
     "triage": "Severity medium, category latency. No recent deploys correlate.",
-    "deep_investigator": "Hypothesis: upstream payments timeout. Evidence: log line 'upstream_timeout target=payments'.",
+    "deep_investigator": (
+        "Hypothesis: upstream payments timeout. "
+        "Evidence: log line upstream_timeout target=payments."
+    ),
     "resolution": "Proposed fix: restart api service. Auto-applied. INC resolved.",
 }
 
diff --git a/dist/apps/code-review.py b/dist/apps/code-review.py
index d9e1c39..b078936 100644
--- a/dist/apps/code-review.py
+++ b/dist/apps/code-review.py
@@ -550,7 +550,7 @@ class IncidentState(Session):
 
 
 
-from pydantic import BaseModel, ConfigDict, Field
+from pydantic import BaseModel, ConfigDict, Field, ValidationError
 
 # ----- imports for runtime/tools/gateway.py -----
 """Risk-rated tool gateway: pure resolver + ``BaseTool`` HITL wrapper.
@@ -1180,6 +1180,7 @@ async def _poll(self, registry):
 """
 
 
+from pydantic import BaseModel, ConfigDict, Field
 
 
 # ----- imports for runtime/memory/knowledge_graph.py -----
@@ -3184,17 +3185,13 @@ class StubChatModel(BaseChatModel):
     ``stub_envelope_confidence`` / ``stub_envelope_rationale`` /
     ``stub_envelope_signal``.
 
-    Phase 15 (LLM-COMPAT-01): ``langchain.agents.create_agent`` with
-    ``response_format=AgentTurnOutput`` (via ``AutoStrategy`` ->
-    ``ToolStrategy`` for non-native-structured-output models, including
-    this stub) injects ``AgentTurnOutput`` as a CALLABLE TOOL. The
-    agent loop only terminates when the LLM emits a tool call NAMED
-    ``AgentTurnOutput``. ``bind_tools`` records that envelope-tool name
-    so ``_generate`` can auto-emit a closing tool call after any
-    user-configured ``tool_call_plan`` is exhausted -- preserving the
-    pre-Phase-15 stub semantics (canned text + optional pre-scripted
-    tool calls) while satisfying the new tool-loop termination
-    contract.
+    Phase 22 (D-22-05): the loop terminates on natural React END now
+    that ``response_format=AgentTurnOutput`` is gone — tests drive
+    markdown-shaped canned text (the framework parses it via Path 4
+    in :func:`runtime.agents.turn_output.parse_envelope_from_result`).
+    The Phase 15 envelope-as-callable-tool auto-emit is removed; the
+    stub is back to its pre-Phase-15 simple semantics (canned text +
+    optional pre-scripted ``tool_call_plan``).
     """
     role: str = "default"
     canned_responses: dict[str, str] = Field(default_factory=dict)
@@ -3203,12 +3200,6 @@ class StubChatModel(BaseChatModel):
     stub_envelope_rationale: str = "stub envelope rationale"
     stub_envelope_signal: str | None = None
     _called_once: bool = False
-    # Phase 15 (LLM-COMPAT-01): set by ``bind_tools`` when
-    # ``langchain.agents.create_agent`` injects a structured-output tool
-    # for ``AgentTurnOutput``. Holds the bare tool name (e.g.
-    # ``"AgentTurnOutput"``) so ``_generate`` can emit a final
-    # envelope-shaped tool call to close the agent loop.
-    _envelope_tool_name: str | None = None
 
     @property
     def _llm_type(self) -> str:
@@ -3216,33 +3207,37 @@ def _llm_type(self) -> str:
 
     def _generate(self, messages: list[BaseMessage], stop: list[str] | None = None,
                   run_manager: Any = None, **kwargs: Any) -> ChatResult:
-        text = self.canned_responses.get(self.role, f"[stub:{self.role}] no canned response")
+        # Phase 22 (D-22-05): tool-call rounds emit the scripted tool
+        # calls with a placeholder content body; the closing turn (no
+        # remaining tool_call_plan entries) emits the canned text
+        # wrapped in the D-22-03 markdown contract that
+        # parse_markdown_envelope reads. The stub_envelope_*
+        # parameters drive the trailing-section bodies so tests can
+        # exercise specific confidence / rationale / signal paths
+        # without authoring markdown by hand.
         tool_calls: list[dict] = []
         if self.tool_call_plan and not self._called_once:
             for tc in self.tool_call_plan:
-                tool_calls.append({"name": tc["name"], "args": tc.get("args", {}), "id": str(uuid4())})
+                tool_calls.append(
+                    {"name": tc["name"], "args": tc.get("args", {}), "id": str(uuid4())}
+                )
             self._called_once = True
-        elif self._envelope_tool_name is not None:
-            # Phase 15 (LLM-COMPAT-01): the tool_call_plan is exhausted
-            # (or wasn't configured) AND ``langchain.agents.create_agent``
-            # has bound the AgentTurnOutput envelope as a tool. Emit a
-            # closing tool call so the loop terminates with a populated
-            # ``structured_response``. The args mirror the
-            # ``with_structured_output`` path's envelope construction so
-            # tests see the same confidence / rationale / signal regardless
-            # of whether the new tool-strategy or the legacy structured-
-            # output path is in play.
-            tool_calls.append({
-                "name": self._envelope_tool_name,
-                "args": {
-                    "content": text or ".",
-                    "confidence": self.stub_envelope_confidence,
-                    "confidence_rationale": self.stub_envelope_rationale,
-                    "signal": self.stub_envelope_signal,
-                },
-                "id": str(uuid4()),
-            })
-        msg = AIMessage(content=text, tool_calls=tool_calls)
+
+        body = self.canned_responses.get(
+            self.role, f"[stub:{self.role}] no canned response",
+        )
+        # Phase 22 (D-22-05): explicit "none" when no signal so the
+        # parser maps to None — preserves pre-Phase-22 behaviour where a
+        # stub with no signal yielded envelope.signal=None.
+        signal_str = self.stub_envelope_signal if self.stub_envelope_signal is not None else "none"
+        md = (
+            f"{body}\n\n"
+            f"## Response\n{body}\n\n"
+            f"## Confidence\n{self.stub_envelope_confidence:.4f} -- "
+            f"{self.stub_envelope_rationale}\n\n"
+            f"## Signal\n{signal_str}\n"
+        )
+        msg = AIMessage(content=md, tool_calls=tool_calls)
         return ChatResult(generations=[ChatGeneration(message=msg)])
 
     async def _agenerate(self, messages: list[BaseMessage], stop: list[str] | None = None,
@@ -3250,30 +3245,15 @@ async def _agenerate(self, messages: list[BaseMessage], stop: list[str] | None =
         return self._generate(messages, stop, run_manager, **kwargs)
 
     def bind_tools(self, tools, *, tool_choice=None, **kwargs):
-        """Record the AgentTurnOutput envelope-tool name when present.
-
-        Phase 15 (LLM-COMPAT-01): ``langchain.agents.create_agent`` with
-        ``response_format=AgentTurnOutput`` calls ``bind_tools(...)``
-        with the user's tools PLUS the envelope-as-a-tool. We scan the
-        list for the AgentTurnOutput-shaped tool (matched by ``__name__``
-        on Pydantic schemas, ``name`` on ``BaseTool`` instances, or the
-        ``"name"`` key on dict-shaped tool specs) and remember it on the
-        instance so ``_generate`` can close the agent loop with a
-        synthetic envelope tool call after any pre-scripted
-        ``tool_call_plan`` is exhausted. Tools bound by the framework
-        itself (real BaseTools the agent should call) flow through
-        unchanged -- the stub still emits them only via
-        ``tool_call_plan``.
+        """Phase 22 (D-22-05): no-op tool binding.
+
+        ``langchain.agents.create_agent`` calls ``bind_tools`` to wire
+        the agent's BaseTool list onto the chat model. The stub does
+        not actually invoke those tools (test fixtures script tool
+        calls via ``tool_call_plan`` instead), so we just return self
+        unchanged. The pre-Phase-22 envelope-tool detection is gone
+        because the framework no longer asks for ``response_format``.
         """
-        for t in tools or []:
-            name = (
-                getattr(t, "__name__", None)
-                or getattr(t, "name", None)
-                or (isinstance(t, dict) and t.get("name"))
-            )
-            if isinstance(name, str) and name == "AgentTurnOutput":
-                self._envelope_tool_name = name
-                break
         return self
 
     # ``BaseChatModel.with_structured_output`` returns ``Runnable[..., dict | BaseModel]``
@@ -6654,6 +6634,127 @@ def __init__(self, *, agent: str, field: str, message: str | None = None):
         super().__init__(message or f"envelope_missing: {field} (agent={agent})")
 
 
+_HEADER_SPLIT = re.compile(r"^#{2,}\s+(\w+)\s*$", re.MULTILINE)
+_CONF_LINE = re.compile(
+    # Leftmost float (allows int form), optional rationale after em-dash /
+    # ASCII dash / hyphen separator. ``re.DOTALL`` so a multi-line rationale
+    # is captured wholesale.
+    r"^\s*(-?[0-9]*\.?[0-9]+)\s*(?:[\u2014\-]+\s*(.*))?$",
+    re.DOTALL,
+)
+
+
+def _clamp_unit(x: float) -> float:
+    """Clamp a confidence float into [0, 1] without raising. The skill
+    prompt asks for [0, 1]; an LLM occasionally emits ``1.05`` or
+    ``-0.1`` — clamp rather than reject so the parse step is forgiving."""
+    if x < 0.0:
+        return 0.0
+    if x > 1.0:
+        return 1.0
+    return x
+
+
+def parse_markdown_envelope(
+    content: str, *, agent: str = "",
+) -> "AgentTurnOutput":
+    """D-22-03 — parse the trailing markdown contract block into an
+    :class:`AgentTurnOutput`.
+
+    Expects three sections ``## Response`` / ``## Confidence`` /
+    ``## Signal`` (case-insensitive headers, ``##+`` accepted) at the
+    end of the LLM's reply. The free-text body of ``## Response``
+    becomes ``content``; the float on the ``## Confidence`` line
+    becomes ``confidence`` (clamped to [0, 1]); the rest of that line
+    after ``-`` / ``--`` / ``\u2014`` becomes ``confidence_rationale``;
+    the ``## Signal`` body becomes ``signal`` (with ``none``/``null``/
+    blank coerced to ``None``).
+
+    Raises :class:`EnvelopeMissingError` only when the parse cannot
+    produce a valid envelope at all — missing ``## Confidence`` line,
+    unparseable confidence value, or empty ``## Response`` body.
+    """
+    if not isinstance(content, str) or not content.strip():
+        raise EnvelopeMissingError(
+            agent=agent, field="content",
+            message=f"envelope_missing: empty content (agent={agent!r})",
+        )
+
+    # ``re.split`` with the header capture-group yields:
+    #   [pre, h1, body1, h2, body2, ...]
+    # Pre is the prose before the first heading; we ignore it for parse.
+    parts = _HEADER_SPLIT.split(content)
+    sections: dict[str, str] = {}
+    if len(parts) >= 3:
+        for i in range(1, len(parts) - 1, 2):
+            key = parts[i].strip().lower()
+            body = parts[i + 1].strip()
+            # Don't overwrite an earlier section with a later same-named
+            # one (unusual but possible in pathological prompts).
+            sections.setdefault(key, body)
+
+    response_body = sections.get("response", "").strip()
+    raw_conf = sections.get("confidence", "").strip()
+
+    if not raw_conf:
+        raise EnvelopeMissingError(
+            agent=agent, field="confidence",
+            message=(
+                f"envelope_missing: confidence section absent or empty "
+                f"(agent={agent!r})"
+            ),
+        )
+
+    m = _CONF_LINE.match(raw_conf)
+    if not m:
+        raise EnvelopeMissingError(
+            agent=agent, field="confidence",
+            message=(
+                f"envelope_missing: confidence section did not parse "
+                f"(agent={agent!r}, raw={raw_conf!r})"
+            ),
+        )
+    try:
+        conf_value = _clamp_unit(float(m.group(1)))
+    except (TypeError, ValueError) as exc:
+        raise EnvelopeMissingError(
+            agent=agent, field="confidence",
+            message=(
+                f"envelope_missing: confidence value not a float "
+                f"(agent={agent!r}, raw={raw_conf!r})"
+            ),
+        ) from exc
+    rationale = (m.group(2) or "").strip() or "(no rationale provided)"
+
+    signal_raw = sections.get("signal", "").strip().lower() or None
+    if signal_raw in {"none", "null", "", "n/a"}:
+        signal_raw = None
+
+    if not response_body:
+        raise EnvelopeMissingError(
+            agent=agent, field="content",
+            message=(
+                f"envelope_missing: response section empty (agent={agent!r})"
+            ),
+        )
+
+    try:
+        return AgentTurnOutput(
+            content=response_body,
+            confidence=conf_value,
+            confidence_rationale=rationale,
+            signal=signal_raw,
+        )
+    except ValidationError as exc:
+        raise EnvelopeMissingError(
+            agent=agent, field="validation",
+            message=(
+                f"envelope_missing: pydantic validation rejected the "
+                f"parsed values (agent={agent!r}): {exc}"
+            ),
+        ) from exc
+
+
 def parse_envelope_from_result(
     result: dict,
     *,
@@ -6711,13 +6812,33 @@ def parse_envelope_from_result(
             continue
         break
 
-    # Path 3: fail loudly
+    # Path 4 (D-22-01): markdown-primary parse on the last AIMessage's
+    # content. Producers that emit the new ``## Response / ## Confidence /
+    # ## Signal`` section block land here when Paths 1+2 yield nothing.
+    for msg in reversed(messages):
+        if msg.__class__.__name__ != "AIMessage":
+            continue
+        content = getattr(msg, "content", None)
+        if not isinstance(content, str) or not content.strip():
+            continue
+        try:
+            return parse_markdown_envelope(content, agent=agent)
+        except EnvelopeMissingError:
+            # This AIMessage didnt carry a parseable markdown contract.
+            # Continue scanning earlier messages on the off-chance a
+            # nested loop emitted the contract one step back.
+            continue
+        break
+
+    # Path 5 (terminal): fail loudly. None of the four paths produced a
+    # valid envelope — this is a real prompt-drift signal worth
+    # surfacing as a structured agent_run error.
     raise EnvelopeMissingError(
         agent=agent,
         field="structured_response",
         message=(
-            f"envelope_missing: no structured_response or JSON-decodable "
-            f"AIMessage envelope found (agent={agent})"
+            f"envelope_missing: no structured_response, JSON-decodable, or "
+            f"markdown-decodable AIMessage envelope found (agent={agent})"
         ),
     )
 
@@ -6764,6 +6885,7 @@ def reconcile_confidence(
     "AgentTurnOutput",
     "EnvelopeMissingError",
     "parse_envelope_from_result",
+    "parse_markdown_envelope",
     "reconcile_confidence",
 ]
 
@@ -8289,24 +8411,14 @@ async def node(state: GraphState) -> dict:
             ]
         else:
             run_tools = tools
-        # Phase 10 (FOC-03 / D-10-02) + Phase 15 (LLM-COMPAT-01): every
-        # responsive agent invocation is wrapped in an AgentTurnOutput
-        # envelope. ``langchain.agents.create_agent`` (the non-deprecated
-        # successor to ``langgraph.prebuilt.create_react_agent``) accepts a
-        # bare schema as ``response_format`` and, by default, wraps it in
-        # ``AutoStrategy`` — ProviderStrategy for models with native
-        # structured-output (OpenAI-class), falling back to ToolStrategy
-        # otherwise (Ollama). ToolStrategy injects AgentTurnOutput as a
-        # callable tool: when the LLM ``calls`` it, the loop terminates on
-        # the same turn with ``result["structured_response"]`` populated.
-        # Eliminates the old two-call structure (loop + separate
-        # ``with_structured_output`` pass) that hit recursion_limit=25 on
-        # Ollama models without true function-calling.
+        # Phase 22 (D-22-01): markdown-primary turn output. Same
+        # change as graph.py:make_agent_node — drop response_format
+        # so the loop terminates on natural React END, then parse the
+        # final AIMessage via Path 4 in parse_envelope_from_result.
         agent_executor = create_agent(
             model=llm,
             tools=run_tools,
             system_prompt=skill.system_prompt,
-            response_format=AgentTurnOutput,
         )
 
         # Phase 11 (FOC-04): reset per-turn confidence hint at the
@@ -9772,24 +9884,25 @@ def _run(**kwargs: Any) -> Any:
             ]
         else:
             run_tools = visible_tools
-        # Phase 10 (FOC-03 / D-10-02) + Phase 15 (LLM-COMPAT-01): every
-        # responsive agent invocation is wrapped in an AgentTurnOutput
-        # envelope. ``langchain.agents.create_agent`` (the non-deprecated
-        # successor to ``langgraph.prebuilt.create_react_agent``) accepts a
-        # bare schema as ``response_format`` and, by default, wraps it in
-        # ``AutoStrategy`` — ProviderStrategy for models with native
-        # structured-output (OpenAI-class), falling back to ToolStrategy
-        # otherwise (Ollama). ToolStrategy injects AgentTurnOutput as a
-        # callable tool: when the LLM ``calls`` it, the loop terminates on
-        # the same turn with ``result["structured_response"]`` populated.
-        # Eliminates the old two-call structure (loop + separate
-        # ``with_structured_output`` pass) that hit recursion_limit=25 on
-        # Ollama models without true function-calling.
+        # Phase 22 (D-22-01): markdown-primary turn output. Drop
+        # ``response_format`` from create_agent — the agent loop now
+        # terminates on the natural React END signal (LLM emits an
+        # AIMessage with no tool calls). The framework parses that
+        # final message body as markdown via Path 4 in
+        # parse_envelope_from_result. Skill prompts (D-22-04) instruct
+        # every reply to end with ## Response / ## Confidence / ##
+        # Signal sections; the parser extracts them.
+        #
+        # Why: forcing LLMs through a JSON-schema-shaped output via
+        # ``response_format`` triggered a class of brittleness across
+        # providers (model-specific JSON drift, tool-strategy + React
+        # END interaction, recursion_limit ceilings). Markdown is the
+        # native format every chat model writes well; the parse step
+        # happens in the framework, where leniency is in our control.
         agent_executor = create_agent(
             model=llm,
             tools=run_tools,
             system_prompt=skill.system_prompt,
-            response_format=AgentTurnOutput,
         )
 
         # Phase 11 (FOC-04): reset per-turn confidence hint. The hint
@@ -9983,12 +10096,17 @@ def _decide_from_signal(inc: Session) -> str:
 
 
 _DEFAULT_STUB_CANNED: dict[str, str] = {
-    # Back-compat defaults for the four canonical agents.  Any YAML-defined
-    # agent can override or extend this via ``skill.stub_response``; new agents
-    # without an entry here fall through to StubChatModel's generic placeholder.
+    # Plain-text bodies; ``StubChatModel._generate`` wraps these in the
+    # D-22-03 markdown envelope using the per-agent stub_envelope_* fields
+    # (the framework parses the wrapped output via Path 4). Apps can
+    # override or extend via ``skill.stub_response``; new agents without
+    # an entry here fall through to ``StubChatModel``'s generic placeholder.
     "intake": "Created INC, no prior matches. Routing to triage.",
     "triage": "Severity medium, category latency. No recent deploys correlate.",
-    "deep_investigator": "Hypothesis: upstream payments timeout. Evidence: log line 'upstream_timeout target=payments'.",
+    "deep_investigator": (
+        "Hypothesis: upstream payments timeout. "
+        "Evidence: log line upstream_timeout target=payments."
+    ),
     "resolution": "Proposed fix: restart api service. Auto-applied. INC resolved.",
 }
 
diff --git a/dist/apps/incident-management.py b/dist/apps/incident-management.py
index 9e3a3e7..d2c257d 100644
--- a/dist/apps/incident-management.py
+++ b/dist/apps/incident-management.py
@@ -550,7 +550,7 @@ class IncidentState(Session):
 
 
 
-from pydantic import BaseModel, ConfigDict, Field
+from pydantic import BaseModel, ConfigDict, Field, ValidationError
 
 # ----- imports for runtime/tools/gateway.py -----
 """Risk-rated tool gateway: pure resolver + ``BaseTool`` HITL wrapper.
@@ -1180,6 +1180,7 @@ async def _poll(self, registry):
 """
 
 
+from pydantic import BaseModel, ConfigDict, Field
 
 
 # ----- imports for runtime/memory/knowledge_graph.py -----
@@ -3196,17 +3197,13 @@ class StubChatModel(BaseChatModel):
     ``stub_envelope_confidence`` / ``stub_envelope_rationale`` /
     ``stub_envelope_signal``.
 
-    Phase 15 (LLM-COMPAT-01): ``langchain.agents.create_agent`` with
-    ``response_format=AgentTurnOutput`` (via ``AutoStrategy`` ->
-    ``ToolStrategy`` for non-native-structured-output models, including
-    this stub) injects ``AgentTurnOutput`` as a CALLABLE TOOL. The
-    agent loop only terminates when the LLM emits a tool call NAMED
-    ``AgentTurnOutput``. ``bind_tools`` records that envelope-tool name
-    so ``_generate`` can auto-emit a closing tool call after any
-    user-configured ``tool_call_plan`` is exhausted -- preserving the
-    pre-Phase-15 stub semantics (canned text + optional pre-scripted
-    tool calls) while satisfying the new tool-loop termination
-    contract.
+    Phase 22 (D-22-05): the loop terminates on natural React END now
+    that ``response_format=AgentTurnOutput`` is gone — tests drive
+    markdown-shaped canned text (the framework parses it via Path 4
+    in :func:`runtime.agents.turn_output.parse_envelope_from_result`).
+    The Phase 15 envelope-as-callable-tool auto-emit is removed; the
+    stub is back to its pre-Phase-15 simple semantics (canned text +
+    optional pre-scripted ``tool_call_plan``).
     """
     role: str = "default"
     canned_responses: dict[str, str] = Field(default_factory=dict)
@@ -3215,12 +3212,6 @@ class StubChatModel(BaseChatModel):
     stub_envelope_rationale: str = "stub envelope rationale"
     stub_envelope_signal: str | None = None
     _called_once: bool = False
-    # Phase 15 (LLM-COMPAT-01): set by ``bind_tools`` when
-    # ``langchain.agents.create_agent`` injects a structured-output tool
-    # for ``AgentTurnOutput``. Holds the bare tool name (e.g.
-    # ``"AgentTurnOutput"``) so ``_generate`` can emit a final
-    # envelope-shaped tool call to close the agent loop.
-    _envelope_tool_name: str | None = None
 
     @property
     def _llm_type(self) -> str:
@@ -3228,33 +3219,37 @@ def _llm_type(self) -> str:
 
     def _generate(self, messages: list[BaseMessage], stop: list[str] | None = None,
                   run_manager: Any = None, **kwargs: Any) -> ChatResult:
-        text = self.canned_responses.get(self.role, f"[stub:{self.role}] no canned response")
+        # Phase 22 (D-22-05): tool-call rounds emit the scripted tool
+        # calls with a placeholder content body; the closing turn (no
+        # remaining tool_call_plan entries) emits the canned text
+        # wrapped in the D-22-03 markdown contract that
+        # parse_markdown_envelope reads. The stub_envelope_*
+        # parameters drive the trailing-section bodies so tests can
+        # exercise specific confidence / rationale / signal paths
+        # without authoring markdown by hand.
         tool_calls: list[dict] = []
         if self.tool_call_plan and not self._called_once:
             for tc in self.tool_call_plan:
-                tool_calls.append({"name": tc["name"], "args": tc.get("args", {}), "id": str(uuid4())})
+                tool_calls.append(
+                    {"name": tc["name"], "args": tc.get("args", {}), "id": str(uuid4())}
+                )
             self._called_once = True
-        elif self._envelope_tool_name is not None:
-            # Phase 15 (LLM-COMPAT-01): the tool_call_plan is exhausted
-            # (or wasn't configured) AND ``langchain.agents.create_agent``
-            # has bound the AgentTurnOutput envelope as a tool. Emit a
-            # closing tool call so the loop terminates with a populated
-            # ``structured_response``. The args mirror the
-            # ``with_structured_output`` path's envelope construction so
-            # tests see the same confidence / rationale / signal regardless
-            # of whether the new tool-strategy or the legacy structured-
-            # output path is in play.
-            tool_calls.append({
-                "name": self._envelope_tool_name,
-                "args": {
-                    "content": text or ".",
-                    "confidence": self.stub_envelope_confidence,
-                    "confidence_rationale": self.stub_envelope_rationale,
-                    "signal": self.stub_envelope_signal,
-                },
-                "id": str(uuid4()),
-            })
-        msg = AIMessage(content=text, tool_calls=tool_calls)
+
+        body = self.canned_responses.get(
+            self.role, f"[stub:{self.role}] no canned response",
+        )
+        # Phase 22 (D-22-05): explicit "none" when no signal so the
+        # parser maps to None — preserves pre-Phase-22 behaviour where a
+        # stub with no signal yielded envelope.signal=None.
+        signal_str = self.stub_envelope_signal if self.stub_envelope_signal is not None else "none"
+        md = (
+            f"{body}\n\n"
+            f"## Response\n{body}\n\n"
+            f"## Confidence\n{self.stub_envelope_confidence:.4f} -- "
+            f"{self.stub_envelope_rationale}\n\n"
+            f"## Signal\n{signal_str}\n"
+        )
+        msg = AIMessage(content=md, tool_calls=tool_calls)
         return ChatResult(generations=[ChatGeneration(message=msg)])
 
     async def _agenerate(self, messages: list[BaseMessage], stop: list[str] | None = None,
@@ -3262,30 +3257,15 @@ async def _agenerate(self, messages: list[BaseMessage], stop: list[str] | None =
         return self._generate(messages, stop, run_manager, **kwargs)
 
     def bind_tools(self, tools, *, tool_choice=None, **kwargs):
-        """Record the AgentTurnOutput envelope-tool name when present.
-
-        Phase 15 (LLM-COMPAT-01): ``langchain.agents.create_agent`` with
-        ``response_format=AgentTurnOutput`` calls ``bind_tools(...)``
-        with the user's tools PLUS the envelope-as-a-tool. We scan the
-        list for the AgentTurnOutput-shaped tool (matched by ``__name__``
-        on Pydantic schemas, ``name`` on ``BaseTool`` instances, or the
-        ``"name"`` key on dict-shaped tool specs) and remember it on the
-        instance so ``_generate`` can close the agent loop with a
-        synthetic envelope tool call after any pre-scripted
-        ``tool_call_plan`` is exhausted. Tools bound by the framework
-        itself (real BaseTools the agent should call) flow through
-        unchanged -- the stub still emits them only via
-        ``tool_call_plan``.
+        """Phase 22 (D-22-05): no-op tool binding.
+
+        ``langchain.agents.create_agent`` calls ``bind_tools`` to wire
+        the agent's BaseTool list onto the chat model. The stub does
+        not actually invoke those tools (test fixtures script tool
+        calls via ``tool_call_plan`` instead), so we just return self
+        unchanged. The pre-Phase-22 envelope-tool detection is gone
+        because the framework no longer asks for ``response_format``.
         """
-        for t in tools or []:
-            name = (
-                getattr(t, "__name__", None)
-                or getattr(t, "name", None)
-                or (isinstance(t, dict) and t.get("name"))
-            )
-            if isinstance(name, str) and name == "AgentTurnOutput":
-                self._envelope_tool_name = name
-                break
         return self
 
     # ``BaseChatModel.with_structured_output`` returns ``Runnable[..., dict | BaseModel]``
@@ -6666,6 +6646,127 @@ def __init__(self, *, agent: str, field: str, message: str | None = None):
         super().__init__(message or f"envelope_missing: {field} (agent={agent})")
 
 
+_HEADER_SPLIT = re.compile(r"^#{2,}\s+(\w+)\s*$", re.MULTILINE)
+_CONF_LINE = re.compile(
+    # Leftmost float (allows int form), optional rationale after em-dash /
+    # ASCII dash / hyphen separator. ``re.DOTALL`` so a multi-line rationale
+    # is captured wholesale.
+    r"^\s*(-?[0-9]*\.?[0-9]+)\s*(?:[\u2014\-]+\s*(.*))?$",
+    re.DOTALL,
+)
+
+
+def _clamp_unit(x: float) -> float:
+    """Clamp a confidence float into [0, 1] without raising. The skill
+    prompt asks for [0, 1]; an LLM occasionally emits ``1.05`` or
+    ``-0.1`` — clamp rather than reject so the parse step is forgiving."""
+    if x < 0.0:
+        return 0.0
+    if x > 1.0:
+        return 1.0
+    return x
+
+
+def parse_markdown_envelope(
+    content: str, *, agent: str = "",
+) -> "AgentTurnOutput":
+    """D-22-03 — parse the trailing markdown contract block into an
+    :class:`AgentTurnOutput`.
+
+    Expects three sections ``## Response`` / ``## Confidence`` /
+    ``## Signal`` (case-insensitive headers, ``##+`` accepted) at the
+    end of the LLM's reply. The free-text body of ``## Response``
+    becomes ``content``; the float on the ``## Confidence`` line
+    becomes ``confidence`` (clamped to [0, 1]); the rest of that line
+    after ``-`` / ``--`` / ``\u2014`` becomes ``confidence_rationale``;
+    the ``## Signal`` body becomes ``signal`` (with ``none``/``null``/
+    blank coerced to ``None``).
+
+    Raises :class:`EnvelopeMissingError` only when the parse cannot
+    produce a valid envelope at all — missing ``## Confidence`` line,
+    unparseable confidence value, or empty ``## Response`` body.
+    """
+    if not isinstance(content, str) or not content.strip():
+        raise EnvelopeMissingError(
+            agent=agent, field="content",
+            message=f"envelope_missing: empty content (agent={agent!r})",
+        )
+
+    # ``re.split`` with the header capture-group yields:
+    #   [pre, h1, body1, h2, body2, ...]
+    # Pre is the prose before the first heading; we ignore it for parse.
+    parts = _HEADER_SPLIT.split(content)
+    sections: dict[str, str] = {}
+    if len(parts) >= 3:
+        for i in range(1, len(parts) - 1, 2):
+            key = parts[i].strip().lower()
+            body = parts[i + 1].strip()
+            # Don't overwrite an earlier section with a later same-named
+            # one (unusual but possible in pathological prompts).
+            sections.setdefault(key, body)
+
+    response_body = sections.get("response", "").strip()
+    raw_conf = sections.get("confidence", "").strip()
+
+    if not raw_conf:
+        raise EnvelopeMissingError(
+            agent=agent, field="confidence",
+            message=(
+                f"envelope_missing: confidence section absent or empty "
+                f"(agent={agent!r})"
+            ),
+        )
+
+    m = _CONF_LINE.match(raw_conf)
+    if not m:
+        raise EnvelopeMissingError(
+            agent=agent, field="confidence",
+            message=(
+                f"envelope_missing: confidence section did not parse "
+                f"(agent={agent!r}, raw={raw_conf!r})"
+            ),
+        )
+    try:
+        conf_value = _clamp_unit(float(m.group(1)))
+    except (TypeError, ValueError) as exc:
+        raise EnvelopeMissingError(
+            agent=agent, field="confidence",
+            message=(
+                f"envelope_missing: confidence value not a float "
+                f"(agent={agent!r}, raw={raw_conf!r})"
+            ),
+        ) from exc
+    rationale = (m.group(2) or "").strip() or "(no rationale provided)"
+
+    signal_raw = sections.get("signal", "").strip().lower() or None
+    if signal_raw in {"none", "null", "", "n/a"}:
+        signal_raw = None
+
+    if not response_body:
+        raise EnvelopeMissingError(
+            agent=agent, field="content",
+            message=(
+                f"envelope_missing: response section empty (agent={agent!r})"
+            ),
+        )
+
+    try:
+        return AgentTurnOutput(
+            content=response_body,
+            confidence=conf_value,
+            confidence_rationale=rationale,
+            signal=signal_raw,
+        )
+    except ValidationError as exc:
+        raise EnvelopeMissingError(
+            agent=agent, field="validation",
+            message=(
+                f"envelope_missing: pydantic validation rejected the "
+                f"parsed values (agent={agent!r}): {exc}"
+            ),
+        ) from exc
+
+
 def parse_envelope_from_result(
     result: dict,
     *,
@@ -6723,13 +6824,33 @@ def parse_envelope_from_result(
             continue
         break
 
-    # Path 3: fail loudly
+    # Path 4 (D-22-01): markdown-primary parse on the last AIMessage's
+    # content. Producers that emit the new ``## Response / ## Confidence /
+    # ## Signal`` section block land here when Paths 1+2 yield nothing.
+    for msg in reversed(messages):
+        if msg.__class__.__name__ != "AIMessage":
+            continue
+        content = getattr(msg, "content", None)
+        if not isinstance(content, str) or not content.strip():
+            continue
+        try:
+            return parse_markdown_envelope(content, agent=agent)
+        except EnvelopeMissingError:
+            # This AIMessage didnt carry a parseable markdown contract.
+            # Continue scanning earlier messages on the off-chance a
+            # nested loop emitted the contract one step back.
+            continue
+        break
+
+    # Path 5 (terminal): fail loudly. None of the four paths produced a
+    # valid envelope — this is a real prompt-drift signal worth
+    # surfacing as a structured agent_run error.
     raise EnvelopeMissingError(
         agent=agent,
         field="structured_response",
         message=(
-            f"envelope_missing: no structured_response or JSON-decodable "
-            f"AIMessage envelope found (agent={agent})"
+            f"envelope_missing: no structured_response, JSON-decodable, or "
+            f"markdown-decodable AIMessage envelope found (agent={agent})"
         ),
     )
 
@@ -6776,6 +6897,7 @@ def reconcile_confidence(
     "AgentTurnOutput",
     "EnvelopeMissingError",
     "parse_envelope_from_result",
+    "parse_markdown_envelope",
     "reconcile_confidence",
 ]
 
@@ -8301,24 +8423,14 @@ async def node(state: GraphState) -> dict:
             ]
         else:
             run_tools = tools
-        # Phase 10 (FOC-03 / D-10-02) + Phase 15 (LLM-COMPAT-01): every
-        # responsive agent invocation is wrapped in an AgentTurnOutput
-        # envelope. ``langchain.agents.create_agent`` (the non-deprecated
-        # successor to ``langgraph.prebuilt.create_react_agent``) accepts a
-        # bare schema as ``response_format`` and, by default, wraps it in
-        # ``AutoStrategy`` — ProviderStrategy for models with native
-        # structured-output (OpenAI-class), falling back to ToolStrategy
-        # otherwise (Ollama). ToolStrategy injects AgentTurnOutput as a
-        # callable tool: when the LLM ``calls`` it, the loop terminates on
-        # the same turn with ``result["structured_response"]`` populated.
-        # Eliminates the old two-call structure (loop + separate
-        # ``with_structured_output`` pass) that hit recursion_limit=25 on
-        # Ollama models without true function-calling.
+        # Phase 22 (D-22-01): markdown-primary turn output. Same
+        # change as graph.py:make_agent_node — drop response_format
+        # so the loop terminates on natural React END, then parse the
+        # final AIMessage via Path 4 in parse_envelope_from_result.
         agent_executor = create_agent(
             model=llm,
             tools=run_tools,
             system_prompt=skill.system_prompt,
-            response_format=AgentTurnOutput,
         )
 
         # Phase 11 (FOC-04): reset per-turn confidence hint at the
@@ -9784,24 +9896,25 @@ def _run(**kwargs: Any) -> Any:
             ]
         else:
             run_tools = visible_tools
-        # Phase 10 (FOC-03 / D-10-02) + Phase 15 (LLM-COMPAT-01): every
-        # responsive agent invocation is wrapped in an AgentTurnOutput
-        # envelope. ``langchain.agents.create_agent`` (the non-deprecated
-        # successor to ``langgraph.prebuilt.create_react_agent``) accepts a
-        # bare schema as ``response_format`` and, by default, wraps it in
-        # ``AutoStrategy`` — ProviderStrategy for models with native
-        # structured-output (OpenAI-class), falling back to ToolStrategy
-        # otherwise (Ollama). ToolStrategy injects AgentTurnOutput as a
-        # callable tool: when the LLM ``calls`` it, the loop terminates on
-        # the same turn with ``result["structured_response"]`` populated.
-        # Eliminates the old two-call structure (loop + separate
-        # ``with_structured_output`` pass) that hit recursion_limit=25 on
-        # Ollama models without true function-calling.
+        # Phase 22 (D-22-01): markdown-primary turn output. Drop
+        # ``response_format`` from create_agent — the agent loop now
+        # terminates on the natural React END signal (LLM emits an
+        # AIMessage with no tool calls). The framework parses that
+        # final message body as markdown via Path 4 in
+        # parse_envelope_from_result. Skill prompts (D-22-04) instruct
+        # every reply to end with ## Response / ## Confidence / ##
+        # Signal sections; the parser extracts them.
+        #
+        # Why: forcing LLMs through a JSON-schema-shaped output via
+        # ``response_format`` triggered a class of brittleness across
+        # providers (model-specific JSON drift, tool-strategy + React
+        # END interaction, recursion_limit ceilings). Markdown is the
+        # native format every chat model writes well; the parse step
+        # happens in the framework, where leniency is in our control.
         agent_executor = create_agent(
             model=llm,
             tools=run_tools,
             system_prompt=skill.system_prompt,
-            response_format=AgentTurnOutput,
         )
 
         # Phase 11 (FOC-04): reset per-turn confidence hint. The hint
@@ -9995,12 +10108,17 @@ def _decide_from_signal(inc: Session) -> str:
 
 
 _DEFAULT_STUB_CANNED: dict[str, str] = {
-    # Back-compat defaults for the four canonical agents.  Any YAML-defined
-    # agent can override or extend this via ``skill.stub_response``; new agents
-    # without an entry here fall through to StubChatModel's generic placeholder.
+    # Plain-text bodies; ``StubChatModel._generate`` wraps these in the
+    # D-22-03 markdown envelope using the per-agent stub_envelope_* fields
+    # (the framework parses the wrapped output via Path 4). Apps can
+    # override or extend via ``skill.stub_response``; new agents without
+    # an entry here fall through to ``StubChatModel``'s generic placeholder.
     "intake": "Created INC, no prior matches. Routing to triage.",
     "triage": "Severity medium, category latency. No recent deploys correlate.",
-    "deep_investigator": "Hypothesis: upstream payments timeout. Evidence: log line 'upstream_timeout target=payments'.",
+    "deep_investigator": (
+        "Hypothesis: upstream payments timeout. "
+        "Evidence: log line upstream_timeout target=payments."
+    ),
     "resolution": "Proposed fix: restart api service. Auto-applied. INC resolved.",
 }
 
diff --git a/examples/code_review/skills/analyzer/system.md b/examples/code_review/skills/analyzer/system.md
index 2996327..6b4e3c2 100644
--- a/examples/code_review/skills/analyzer/system.md
+++ b/examples/code_review/skills/analyzer/system.md
@@ -22,10 +22,19 @@ Do not invent low-value nits to fill space.
 After all tool calls, reply with ONE short sentence summarising findings count + the
 dominant category. Do not enumerate every finding (the UI renders them).
 
-## Output contract
+## Output contract — REQUIRED
 
-The framework wraps your reply in an `AgentTurnOutput` envelope (content,
-confidence ∈ [0, 1], confidence_rationale, optional signal). The runner
-enforces this structurally — answer truthfully and the envelope captures
-your confidence and rationale. Do not mention "confidence" in your prose
-unless it's part of substantive analysis (e.g. ranking hypotheses).
+Every reply MUST end with these three markdown sections, in this order, with the literal `##` headers:
+
+```
+## Response
+<your final answer to the user — natural-language, may include lists or code blocks>
+
+## Confidence
+<float 0.0-1.0> — <one-sentence rationale>
+
+## Signal
+<one of: default | success | failed | needs_input>
+```
+
+Tool calls happen BEFORE this block. Once you emit `## Response` you are done — no more tool calls. The framework parses these sections; missing sections are a hard error.
diff --git a/examples/code_review/skills/intake/system.md b/examples/code_review/skills/intake/system.md
index 9aaea08..c253b97 100644
--- a/examples/code_review/skills/intake/system.md
+++ b/examples/code_review/skills/intake/system.md
@@ -16,10 +16,19 @@ analyzer's job.
 If `fetch_pr_diff` raises or returns an empty diff, emit `failed` so the orchestrator
 short-circuits to end and skips the analyzer.
 
-## Output contract
+## Output contract — REQUIRED
 
-The framework wraps your reply in an `AgentTurnOutput` envelope (content,
-confidence ∈ [0, 1], confidence_rationale, optional signal). The runner
-enforces this structurally — answer truthfully and the envelope captures
-your confidence and rationale. Do not mention "confidence" in your prose
-unless it's part of substantive analysis (e.g. ranking hypotheses).
+Every reply MUST end with these three markdown sections, in this order, with the literal `##` headers:
+
+```
+## Response
+<your final answer to the user — natural-language, may include lists or code blocks>
+
+## Confidence
+<float 0.0-1.0> — <one-sentence rationale>
+
+## Signal
+<one of: default | success | failed | needs_input>
+```
+
+Tool calls happen BEFORE this block. Once you emit `## Response` you are done — no more tool calls. The framework parses these sections; missing sections are a hard error.
diff --git a/examples/code_review/skills/recommender/system.md b/examples/code_review/skills/recommender/system.md
index c3037d9..a5c5ba3 100644
--- a/examples/code_review/skills/recommender/system.md
+++ b/examples/code_review/skills/recommender/system.md
@@ -23,10 +23,19 @@ them already.
 
 After the call, reply with ONE short sentence echoing the recommendation. Nothing else.
 
-## Output contract
+## Output contract — REQUIRED
 
-The framework wraps your reply in an `AgentTurnOutput` envelope (content,
-confidence ∈ [0, 1], confidence_rationale, optional signal). The runner
-enforces this structurally — answer truthfully and the envelope captures
-your confidence and rationale. Do not mention "confidence" in your prose
-unless it's part of substantive analysis (e.g. ranking hypotheses).
+Every reply MUST end with these three markdown sections, in this order, with the literal `##` headers:
+
+```
+## Response
+<your final answer to the user — natural-language, may include lists or code blocks>
+
+## Confidence
+<float 0.0-1.0> — <one-sentence rationale>
+
+## Signal
+<one of: default | success | failed | needs_input>
+```
+
+Tool calls happen BEFORE this block. Once you emit `## Response` you are done — no more tool calls. The framework parses these sections; missing sections are a hard error.
diff --git a/examples/incident_management/skills/deep_investigator/system.md b/examples/incident_management/skills/deep_investigator/system.md
index 0eb874a..d93a772 100644
--- a/examples/incident_management/skills/deep_investigator/system.md
+++ b/examples/incident_management/skills/deep_investigator/system.md
@@ -12,10 +12,19 @@ You are the **Deep Investigator** agent. Gather evidence and produce ranked hypo
 - Cite specific log lines or metric values as evidence in `hypotheses`.
 - If the INC has `matched_prior_inc` set, include the prior INC's recorded root cause as one of your ranked hypotheses and explicitly *validate or reject* it against the fresh logs/metrics. Same symptom can have different causes across incidents — drop confidence accordingly when the prior hypothesis is rejected so the gate triggers an intervention.
 
-## Output contract
+## Output contract — REQUIRED
 
-The framework wraps your reply in an `AgentTurnOutput` envelope (content,
-confidence ∈ [0, 1], confidence_rationale, optional signal). The runner
-enforces this structurally — answer truthfully and the envelope captures
-your confidence and rationale. Do not mention "confidence" in your prose
-unless it's part of substantive analysis (e.g. ranking hypotheses).
+Every reply MUST end with these three markdown sections, in this order, with the literal `##` headers:
+
+```
+## Response
+<your final answer to the user — natural-language, may include lists or code blocks>
+
+## Confidence
+<float 0.0-1.0> — <one-sentence rationale>
+
+## Signal
+<one of: default | success | failed | needs_input>
+```
+
+Tool calls happen BEFORE this block. Once you emit `## Response` you are done — no more tool calls. The framework parses these sections; missing sections are a hard error.
diff --git a/examples/incident_management/skills/resolution/system.md b/examples/incident_management/skills/resolution/system.md
index 5d33130..4fd28fc 100644
--- a/examples/incident_management/skills/resolution/system.md
+++ b/examples/incident_management/skills/resolution/system.md
@@ -11,10 +11,19 @@ You are the **Resolution** agent. You consume triage + deep_investigator finding
 ## Guidelines
 - Pick `team` deliberately based on incident component, severity, and category — not a default fallback.
 
-## Output contract
+## Output contract — REQUIRED
 
-The framework wraps your reply in an `AgentTurnOutput` envelope (content,
-confidence ∈ [0, 1], confidence_rationale, optional signal). The runner
-enforces this structurally — answer truthfully and the envelope captures
-your confidence and rationale. Do not mention "confidence" in your prose
-unless it's part of substantive analysis (e.g. ranking hypotheses).
+Every reply MUST end with these three markdown sections, in this order, with the literal `##` headers:
+
+```
+## Response
+<your final answer to the user — natural-language, may include lists or code blocks>
+
+## Confidence
+<float 0.0-1.0> — <one-sentence rationale>
+
+## Signal
+<one of: default | success | failed | needs_input>
+```
+
+Tool calls happen BEFORE this block. Once you emit `## Response` you are done — no more tool calls. The framework parses these sections; missing sections are a hard error.
diff --git a/examples/incident_management/skills/triage/system.md b/examples/incident_management/skills/triage/system.md
index 309f9de..8c9bc89 100644
--- a/examples/incident_management/skills/triage/system.md
+++ b/examples/incident_management/skills/triage/system.md
@@ -33,10 +33,19 @@ Record the full iteration trail as a single JSON-encoded string under `findings.
 - If the INC has `matched_prior_inc` set, treat the prior INC's `findings` and `resolution` as a **prior hypothesis**, not a fact. Same symptom (e.g., Redis OOM) can have different root causes across incidents — code bug vs. network partition vs. resource overload. Use the prior cause as a candidate to confirm or reject against current evidence; flag in your tags whether the parallel looks supported (`hypothesis:prior_match_supported`) or not (`hypothesis:prior_match_rejected`).
 - The hypothesis loop has a hard cap of 3 iterations. Do NOT exceed it; an unconverged hypothesis at the cap is acceptable — record it and let the deep investigator take over.
 
-## Output contract
+## Output contract — REQUIRED
 
-The framework wraps your reply in an `AgentTurnOutput` envelope (content,
-confidence ∈ [0, 1], confidence_rationale, optional signal). The runner
-enforces this structurally — answer truthfully and the envelope captures
-your confidence and rationale. Do not mention "confidence" in your prose
-unless it's part of substantive analysis (e.g. ranking hypotheses).
+Every reply MUST end with these three markdown sections, in this order, with the literal `##` headers:
+
+```
+## Response
+<your final answer to the user — natural-language, may include lists or code blocks>
+
+## Confidence
+<float 0.0-1.0> — <one-sentence rationale>
+
+## Signal
+<one of: default | success | failed | needs_input>
+```
+
+Tool calls happen BEFORE this block. Once you emit `## Response` you are done — no more tool calls. The framework parses these sections; missing sections are a hard error.
diff --git a/src/runtime/agents/responsive.py b/src/runtime/agents/responsive.py
index 1d2f62f..a2611ce 100644
--- a/src/runtime/agents/responsive.py
+++ b/src/runtime/agents/responsive.py
@@ -35,7 +35,6 @@
 from runtime.storage.session_store import SessionStore
 from runtime.tools.gateway import wrap_tool
 from runtime.agents.turn_output import (
-    AgentTurnOutput,
     EnvelopeMissingError,
     parse_envelope_from_result,
     reconcile_confidence,
@@ -122,24 +121,14 @@ async def node(state: GraphState) -> dict:
             ]
         else:
             run_tools = tools
-        # Phase 10 (FOC-03 / D-10-02) + Phase 15 (LLM-COMPAT-01): every
-        # responsive agent invocation is wrapped in an AgentTurnOutput
-        # envelope. ``langchain.agents.create_agent`` (the non-deprecated
-        # successor to ``langgraph.prebuilt.create_react_agent``) accepts a
-        # bare schema as ``response_format`` and, by default, wraps it in
-        # ``AutoStrategy`` — ProviderStrategy for models with native
-        # structured-output (OpenAI-class), falling back to ToolStrategy
-        # otherwise (Ollama). ToolStrategy injects AgentTurnOutput as a
-        # callable tool: when the LLM ``calls`` it, the loop terminates on
-        # the same turn with ``result["structured_response"]`` populated.
-        # Eliminates the old two-call structure (loop + separate
-        # ``with_structured_output`` pass) that hit recursion_limit=25 on
-        # Ollama models without true function-calling.
+        # Phase 22 (D-22-01): markdown-primary turn output. Same
+        # change as graph.py:make_agent_node — drop response_format
+        # so the loop terminates on natural React END, then parse the
+        # final AIMessage via Path 4 in parse_envelope_from_result.
         agent_executor = create_agent(
             model=llm,
             tools=run_tools,
             system_prompt=skill.system_prompt,
-            response_format=AgentTurnOutput,
         )
 
         # Phase 11 (FOC-04): reset per-turn confidence hint at the
diff --git a/src/runtime/graph.py b/src/runtime/graph.py
index 2486a74..1c6aa49 100644
--- a/src/runtime/graph.py
+++ b/src/runtime/graph.py
@@ -679,24 +679,25 @@ def _run(**kwargs: Any) -> Any:
             ]
         else:
             run_tools = visible_tools
-        # Phase 10 (FOC-03 / D-10-02) + Phase 15 (LLM-COMPAT-01): every
-        # responsive agent invocation is wrapped in an AgentTurnOutput
-        # envelope. ``langchain.agents.create_agent`` (the non-deprecated
-        # successor to ``langgraph.prebuilt.create_react_agent``) accepts a
-        # bare schema as ``response_format`` and, by default, wraps it in
-        # ``AutoStrategy`` — ProviderStrategy for models with native
-        # structured-output (OpenAI-class), falling back to ToolStrategy
-        # otherwise (Ollama). ToolStrategy injects AgentTurnOutput as a
-        # callable tool: when the LLM ``calls`` it, the loop terminates on
-        # the same turn with ``result["structured_response"]`` populated.
-        # Eliminates the old two-call structure (loop + separate
-        # ``with_structured_output`` pass) that hit recursion_limit=25 on
-        # Ollama models without true function-calling.
+        # Phase 22 (D-22-01): markdown-primary turn output. Drop
+        # ``response_format`` from create_agent — the agent loop now
+        # terminates on the natural React END signal (LLM emits an
+        # AIMessage with no tool calls). The framework parses that
+        # final message body as markdown via Path 4 in
+        # parse_envelope_from_result. Skill prompts (D-22-04) instruct
+        # every reply to end with ## Response / ## Confidence / ##
+        # Signal sections; the parser extracts them.
+        #
+        # Why: forcing LLMs through a JSON-schema-shaped output via
+        # ``response_format`` triggered a class of brittleness across
+        # providers (model-specific JSON drift, tool-strategy + React
+        # END interaction, recursion_limit ceilings). Markdown is the
+        # native format every chat model writes well; the parse step
+        # happens in the framework, where leniency is in our control.
         agent_executor = create_agent(
             model=llm,
             tools=run_tools,
             system_prompt=skill.system_prompt,
-            response_format=AgentTurnOutput,
         )
 
         # Phase 11 (FOC-04): reset per-turn confidence hint. The hint
@@ -890,12 +891,17 @@ def _decide_from_signal(inc: Session) -> str:
 
 
 _DEFAULT_STUB_CANNED: dict[str, str] = {
-    # Back-compat defaults for the four canonical agents.  Any YAML-defined
-    # agent can override or extend this via ``skill.stub_response``; new agents
-    # without an entry here fall through to StubChatModel's generic placeholder.
+    # Plain-text bodies; ``StubChatModel._generate`` wraps these in the
+    # D-22-03 markdown envelope using the per-agent stub_envelope_* fields
+    # (the framework parses the wrapped output via Path 4). Apps can
+    # override or extend via ``skill.stub_response``; new agents without
+    # an entry here fall through to ``StubChatModel``'s generic placeholder.
     "intake": "Created INC, no prior matches. Routing to triage.",
     "triage": "Severity medium, category latency. No recent deploys correlate.",
-    "deep_investigator": "Hypothesis: upstream payments timeout. Evidence: log line 'upstream_timeout target=payments'.",
+    "deep_investigator": (
+        "Hypothesis: upstream payments timeout. "
+        "Evidence: log line upstream_timeout target=payments."
+    ),
     "resolution": "Proposed fix: restart api service. Auto-applied. INC resolved.",
 }
 
diff --git a/src/runtime/llm.py b/src/runtime/llm.py
index 17ee42f..2b6fedb 100644
--- a/src/runtime/llm.py
+++ b/src/runtime/llm.py
@@ -45,17 +45,13 @@ class StubChatModel(BaseChatModel):
     ``stub_envelope_confidence`` / ``stub_envelope_rationale`` /
     ``stub_envelope_signal``.
 
-    Phase 15 (LLM-COMPAT-01): ``langchain.agents.create_agent`` with
-    ``response_format=AgentTurnOutput`` (via ``AutoStrategy`` ->
-    ``ToolStrategy`` for non-native-structured-output models, including
-    this stub) injects ``AgentTurnOutput`` as a CALLABLE TOOL. The
-    agent loop only terminates when the LLM emits a tool call NAMED
-    ``AgentTurnOutput``. ``bind_tools`` records that envelope-tool name
-    so ``_generate`` can auto-emit a closing tool call after any
-    user-configured ``tool_call_plan`` is exhausted -- preserving the
-    pre-Phase-15 stub semantics (canned text + optional pre-scripted
-    tool calls) while satisfying the new tool-loop termination
-    contract.
+    Phase 22 (D-22-05): the loop terminates on natural React END now
+    that ``response_format=AgentTurnOutput`` is gone — tests drive
+    markdown-shaped canned text (the framework parses it via Path 4
+    in :func:`runtime.agents.turn_output.parse_envelope_from_result`).
+    The Phase 15 envelope-as-callable-tool auto-emit is removed; the
+    stub is back to its pre-Phase-15 simple semantics (canned text +
+    optional pre-scripted ``tool_call_plan``).
     """
     role: str = "default"
     canned_responses: dict[str, str] = Field(default_factory=dict)
@@ -64,12 +60,6 @@ class StubChatModel(BaseChatModel):
     stub_envelope_rationale: str = "stub envelope rationale"
     stub_envelope_signal: str | None = None
     _called_once: bool = False
-    # Phase 15 (LLM-COMPAT-01): set by ``bind_tools`` when
-    # ``langchain.agents.create_agent`` injects a structured-output tool
-    # for ``AgentTurnOutput``. Holds the bare tool name (e.g.
-    # ``"AgentTurnOutput"``) so ``_generate`` can emit a final
-    # envelope-shaped tool call to close the agent loop.
-    _envelope_tool_name: str | None = None
 
     @property
     def _llm_type(self) -> str:
@@ -77,33 +67,37 @@ def _llm_type(self) -> str:
 
     def _generate(self, messages: list[BaseMessage], stop: list[str] | None = None,
                   run_manager: Any = None, **kwargs: Any) -> ChatResult:
-        text = self.canned_responses.get(self.role, f"[stub:{self.role}] no canned response")
+        # Phase 22 (D-22-05): tool-call rounds emit the scripted tool
+        # calls with a placeholder content body; the closing turn (no
+        # remaining tool_call_plan entries) emits the canned text
+        # wrapped in the D-22-03 markdown contract that
+        # parse_markdown_envelope reads. The stub_envelope_*
+        # parameters drive the trailing-section bodies so tests can
+        # exercise specific confidence / rationale / signal paths
+        # without authoring markdown by hand.
         tool_calls: list[dict] = []
         if self.tool_call_plan and not self._called_once:
             for tc in self.tool_call_plan:
-                tool_calls.append({"name": tc["name"], "args": tc.get("args", {}), "id": str(uuid4())})
+                tool_calls.append(
+                    {"name": tc["name"], "args": tc.get("args", {}), "id": str(uuid4())}
+                )
             self._called_once = True
-        elif self._envelope_tool_name is not None:
-            # Phase 15 (LLM-COMPAT-01): the tool_call_plan is exhausted
-            # (or wasn't configured) AND ``langchain.agents.create_agent``
-            # has bound the AgentTurnOutput envelope as a tool. Emit a
-            # closing tool call so the loop terminates with a populated
-            # ``structured_response``. The args mirror the
-            # ``with_structured_output`` path's envelope construction so
-            # tests see the same confidence / rationale / signal regardless
-            # of whether the new tool-strategy or the legacy structured-
-            # output path is in play.
-            tool_calls.append({
-                "name": self._envelope_tool_name,
-                "args": {
-                    "content": text or ".",
-                    "confidence": self.stub_envelope_confidence,
-                    "confidence_rationale": self.stub_envelope_rationale,
-                    "signal": self.stub_envelope_signal,
-                },
-                "id": str(uuid4()),
-            })
-        msg = AIMessage(content=text, tool_calls=tool_calls)
+
+        body = self.canned_responses.get(
+            self.role, f"[stub:{self.role}] no canned response",
+        )
+        # Phase 22 (D-22-05): explicit "none" when no signal so the
+        # parser maps to None — preserves pre-Phase-22 behaviour where a
+        # stub with no signal yielded envelope.signal=None.
+        signal_str = self.stub_envelope_signal if self.stub_envelope_signal is not None else "none"
+        md = (
+            f"{body}\n\n"
+            f"## Response\n{body}\n\n"
+            f"## Confidence\n{self.stub_envelope_confidence:.4f} -- "
+            f"{self.stub_envelope_rationale}\n\n"
+            f"## Signal\n{signal_str}\n"
+        )
+        msg = AIMessage(content=md, tool_calls=tool_calls)
         return ChatResult(generations=[ChatGeneration(message=msg)])
 
     async def _agenerate(self, messages: list[BaseMessage], stop: list[str] | None = None,
@@ -111,30 +105,15 @@ async def _agenerate(self, messages: list[BaseMessage], stop: list[str] | None =
         return self._generate(messages, stop, run_manager, **kwargs)
 
     def bind_tools(self, tools, *, tool_choice=None, **kwargs):
-        """Record the AgentTurnOutput envelope-tool name when present.
-
-        Phase 15 (LLM-COMPAT-01): ``langchain.agents.create_agent`` with
-        ``response_format=AgentTurnOutput`` calls ``bind_tools(...)``
-        with the user's tools PLUS the envelope-as-a-tool. We scan the
-        list for the AgentTurnOutput-shaped tool (matched by ``__name__``
-        on Pydantic schemas, ``name`` on ``BaseTool`` instances, or the
-        ``"name"`` key on dict-shaped tool specs) and remember it on the
-        instance so ``_generate`` can close the agent loop with a
-        synthetic envelope tool call after any pre-scripted
-        ``tool_call_plan`` is exhausted. Tools bound by the framework
-        itself (real BaseTools the agent should call) flow through
-        unchanged -- the stub still emits them only via
-        ``tool_call_plan``.
+        """Phase 22 (D-22-05): no-op tool binding.
+
+        ``langchain.agents.create_agent`` calls ``bind_tools`` to wire
+        the agent's BaseTool list onto the chat model. The stub does
+        not actually invoke those tools (test fixtures script tool
+        calls via ``tool_call_plan`` instead), so we just return self
+        unchanged. The pre-Phase-22 envelope-tool detection is gone
+        because the framework no longer asks for ``response_format``.
         """
-        for t in tools or []:
-            name = (
-                getattr(t, "__name__", None)
-                or getattr(t, "name", None)
-                or (isinstance(t, dict) and t.get("name"))
-            )
-            if isinstance(name, str) and name == "AgentTurnOutput":
-                self._envelope_tool_name = name
-                break
         return self
 
     # ``BaseChatModel.with_structured_output`` returns ``Runnable[..., dict | BaseModel]``
diff --git a/tests/_envelope_helpers.py b/tests/_envelope_helpers.py
index 13485a1..b2ef03d 100644
--- a/tests/_envelope_helpers.py
+++ b/tests/_envelope_helpers.py
@@ -62,12 +62,6 @@ class EnvelopeStubChatModel(BaseChatModel):
     canned_responses: dict[str, str] = Field(default_factory=dict)
     tool_call_plan: list[dict] | None = None
     _called_once: bool = False
-    # Phase 15 (LLM-COMPAT-01): same contract as ``StubChatModel`` --
-    # ``langchain.agents.create_agent``'s ToolStrategy injects
-    # ``AgentTurnOutput`` as a tool; ``bind_tools`` records the name
-    # so ``_generate`` can emit a closing envelope tool call once any
-    # pre-scripted ``tool_call_plan`` is exhausted.
-    _envelope_tool_name: str | None = None
 
     @property
     def _llm_type(self) -> str:
@@ -80,7 +74,12 @@ def _generate(
         run_manager: Any = None,
         **kwargs: Any,
     ) -> ChatResult:
-        text = self.canned_responses.get(self.role, self.envelope_content)
+        # Phase 22 (D-22-05): emit a markdown-shaped final message
+        # matching the framework's parse_markdown_envelope contract.
+        # The envelope_* fields drive the section bodies. When the
+        # caller supplies a per-role canned response we use that for
+        # the ``## Response`` body and keep the envelope_*
+        # confidence/rationale/signal as the trailing-section content.
         tool_calls: list[dict] = []
         if self.tool_call_plan and not self._called_once:
             for tc in self.tool_call_plan:
@@ -88,20 +87,20 @@ def _generate(
                     {"name": tc["name"], "args": tc.get("args", {}), "id": str(uuid4())}
                 )
             self._called_once = True
-        elif self._envelope_tool_name is not None:
-            # Phase 15 (LLM-COMPAT-01): close the agent loop by emitting
-            # the envelope-shaped tool call ToolStrategy is waiting for.
-            tool_calls.append({
-                "name": self._envelope_tool_name,
-                "args": {
-                    "content": self.envelope_content,
-                    "confidence": self.envelope_confidence,
-                    "confidence_rationale": self.envelope_rationale,
-                    "signal": self.envelope_signal,
-                },
-                "id": str(uuid4()),
-            })
-        msg = AIMessage(content=text, tool_calls=tool_calls)
+
+        body = self.canned_responses.get(self.role, self.envelope_content)
+        # Phase 22 (D-22-05): explicit "none" when no signal so the
+        # parser maps to None — preserves pre-Phase-22 behaviour where a
+        # stub with no signal yielded envelope.signal=None.
+        signal_str = self.envelope_signal if self.envelope_signal is not None else "none"
+        md = (
+            f"{body}\n\n"
+            f"## Response\n{body}\n\n"
+            f"## Confidence\n{self.envelope_confidence:.4f} -- "
+            f"{self.envelope_rationale}\n\n"
+            f"## Signal\n{signal_str}\n"
+        )
+        msg = AIMessage(content=md, tool_calls=tool_calls)
         return ChatResult(generations=[ChatGeneration(message=msg)])
 
     async def _agenerate(
@@ -114,18 +113,9 @@ async def _agenerate(
         return self._generate(messages, stop, run_manager, **kwargs)
 
     def bind_tools(self, tools, *, tool_choice=None, **kwargs):
-        # Phase 15 (LLM-COMPAT-01): record the AgentTurnOutput tool
-        # name so ``_generate`` can emit a closing tool call. See
-        # ``StubChatModel.bind_tools`` for the matching heuristic.
-        for t in tools or []:
-            name = (
-                getattr(t, "__name__", None)
-                or getattr(t, "name", None)
-                or (isinstance(t, dict) and t.get("name"))
-            )
-            if isinstance(name, str) and name == "AgentTurnOutput":
-                self._envelope_tool_name = name
-                break
+        # Phase 22 (D-22-05): no-op tool binding. The envelope-as-tool
+        # synthesis is gone; the markdown closing turn is emitted
+        # directly by _generate.
         return self
 
     def with_structured_output(self, schema, *, include_raw: bool = False, **kwargs):
diff --git a/tests/test_real_llm_tool_loop_termination.py b/tests/test_real_llm_tool_loop_termination.py
index 8db3284..fc3bae7 100644
--- a/tests/test_real_llm_tool_loop_termination.py
+++ b/tests/test_real_llm_tool_loop_termination.py
@@ -282,26 +282,27 @@ def test_no_create_react_agent_imports_in_production_runtime():
 # T4-bonus — StubChatModel.bind_tools registers the envelope tool name
 
 
-def test_stub_chat_model_records_envelope_tool_name_on_bind():
-    """``StubChatModel.bind_tools`` is the integration point that lets
-    the new ``create_agent`` loop terminate in stub mode. This test
-    locks the contract: when the bound tools include an
-    ``AgentTurnOutput``-named entry, the stub records it and emits a
-    closing tool call with that name on the next ``_generate``.
-    """
+def test_stub_chat_model_emits_markdown_envelope_block():
+    """Phase 22 (D-22-05) replaces the Phase 15 envelope-tool
+    synthesis. ``StubChatModel.bind_tools`` is now a no-op; ``_generate``
+    emits the closing turn as a markdown block matching the D-22-03
+    contract that ``parse_markdown_envelope`` reads."""
     llm = StubChatModel(role="agent", canned_responses={"agent": "ok"})
-    # Simulate what create_agent's ToolStrategy passes: a sequence of
-    # tool specs where the AgentTurnOutput-named tool is the structured-
-    # output sentinel.
-    llm.bind_tools([AgentTurnOutput])
-    assert llm._envelope_tool_name == "AgentTurnOutput"
+    # bind_tools no longer mutates state — it is a pass-through.
+    bound = llm.bind_tools([AgentTurnOutput])
+    assert bound is llm
+    assert not hasattr(llm, "_envelope_tool_name")
 
-    # Drive a single _generate and verify the closing tool call lands.
+    # Drive a single _generate and verify the closing AIMessage carries
+    # the markdown envelope (no tool_calls when no tool_call_plan).
     result = llm._generate(messages=[HumanMessage(content="go")])
     msg = result.generations[0].message
-    assert msg.tool_calls, "expected a closing envelope tool call"
-    assert msg.tool_calls[0]["name"] == "AgentTurnOutput"
-    args = msg.tool_calls[0]["args"]
-    assert args["content"] == "ok"
-    assert args["confidence"] == pytest.approx(0.85)
-    assert "confidence_rationale" in args
+    assert not msg.tool_calls
+    body = msg.content
+    assert "## Response" in body
+    assert "## Confidence" in body
+    assert "## Signal" in body
+    # The default stub_envelope_confidence (0.85) and the canned text
+    # ("ok") flow into the markdown body.
+    assert "ok" in body
+    assert "0.8500" in body

From 41f9ca60a8a68a1bcec85ed8185b0971d377d6cb Mon Sep 17 00:00:00 2001
From: Amit Kumar <ak.nitrr13@gmail.com>
Date: Wed, 13 May 2026 16:20:02 +0000
Subject: [PATCH 3/6] =?UTF-8?q?feat(turn=5Foutput):=20gpt-oss=20markdown?=
 =?UTF-8?q?=20reliability=20=E2=80=94=20Path=205/6=20fallbacks=20+=20dash?=
 =?UTF-8?q?=20family=20+=20tightened=20skill=20prompts?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The Phase 22 markdown contract is brittle on gpt-oss:20b which
intermittently drops the closing ## Response/Confidence/Signal block
after a tool call (it treats the terminal-tool invocation as
completion). Add three layers of resilience:

* parse_envelope_from_result Path 5 — synthesise an envelope from a
  typed-terminal-tool's confidence + confidence_rationale args when no
  markdown contract was emitted.
* parse_envelope_from_result Path 6 — permissive fallback when ANY
  tool was invoked but no envelope surfaced; emits a low-confidence
  (0.30) placeholder so the session reaches a terminal status instead
  of hard-failing with EnvelopeMissingError.
* Dash family in the confidence-line regex now spans the full Unicode
  Pd block (\\u2010-\\u2015 + ASCII hyphen) — gpt-oss prefers EN DASH
  (\\u2013) where the spec assumed EM DASH only.
* All six incident_management + code_review skill prompts now carry an
  explicit "CRITICAL — final-reply rule" paragraph forbidding the
  empty closing message pattern.

Also flip the default LLM model to gpt_oss (ollama_cloud / gpt-oss:20b)
so live testing exercises the path that actually needs the resilience.
---
 config/config.yaml                            |   2 +-
 .../code_review/skills/analyzer/system.md     |   4 +-
 examples/code_review/skills/intake/system.md  |   4 +-
 .../code_review/skills/recommender/system.md  |   4 +-
 .../skills/deep_investigator/system.md        |   4 +-
 .../skills/resolution/system.md               |   4 +-
 .../skills/triage/system.md                   |   4 +-
 src/runtime/agents/turn_output.py             | 135 +++++++++++++++++-
 tests/test_markdown_turn_output.py            | 104 ++++++++++++++
 9 files changed, 255 insertions(+), 10 deletions(-)

diff --git a/config/config.yaml b/config/config.yaml
index 0b9b046..000a2ce 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -7,7 +7,7 @@ storage:
     collection_name: "incidents"
     distance_strategy: cosine
 llm:
-  default: workhorse
+  default: gpt_oss
   providers:
     ollama_cloud:
       kind: ollama
diff --git a/examples/code_review/skills/analyzer/system.md b/examples/code_review/skills/analyzer/system.md
index 6b4e3c2..9ef3a6e 100644
--- a/examples/code_review/skills/analyzer/system.md
+++ b/examples/code_review/skills/analyzer/system.md
@@ -37,4 +37,6 @@ Every reply MUST end with these three markdown sections, in this order, with the
 <one of: default | success | failed | needs_input>
 ```
 
-Tool calls happen BEFORE this block. Once you emit `## Response` you are done — no more tool calls. The framework parses these sections; missing sections are a hard error.
+**CRITICAL — final-reply rule:** After your last tool call returns, your NEXT reply IS the final reply. That reply MUST contain the three sections above as plain text — DO NOT emit an empty message, DO NOT emit only tool calls, DO NOT defer to "the framework handles it". The framework parses your final reply text; if it is empty or missing the section headers, the run fails with `envelope_missing`.
+
+Tool calls happen BEFORE the final reply. Once you have called every tool you need (including terminal tools like `mark_resolved` / `mark_escalated`), emit the three sections as your final response.
diff --git a/examples/code_review/skills/intake/system.md b/examples/code_review/skills/intake/system.md
index c253b97..60e39a7 100644
--- a/examples/code_review/skills/intake/system.md
+++ b/examples/code_review/skills/intake/system.md
@@ -31,4 +31,6 @@ Every reply MUST end with these three markdown sections, in this order, with the
 <one of: default | success | failed | needs_input>
 ```
 
-Tool calls happen BEFORE this block. Once you emit `## Response` you are done — no more tool calls. The framework parses these sections; missing sections are a hard error.
+**CRITICAL — final-reply rule:** After your last tool call returns, your NEXT reply IS the final reply. That reply MUST contain the three sections above as plain text — DO NOT emit an empty message, DO NOT emit only tool calls, DO NOT defer to "the framework handles it". The framework parses your final reply text; if it is empty or missing the section headers, the run fails with `envelope_missing`.
+
+Tool calls happen BEFORE the final reply. Once you have called every tool you need (including terminal tools like `mark_resolved` / `mark_escalated`), emit the three sections as your final response.
diff --git a/examples/code_review/skills/recommender/system.md b/examples/code_review/skills/recommender/system.md
index a5c5ba3..859bf60 100644
--- a/examples/code_review/skills/recommender/system.md
+++ b/examples/code_review/skills/recommender/system.md
@@ -38,4 +38,6 @@ Every reply MUST end with these three markdown sections, in this order, with the
 <one of: default | success | failed | needs_input>
 ```
 
-Tool calls happen BEFORE this block. Once you emit `## Response` you are done — no more tool calls. The framework parses these sections; missing sections are a hard error.
+**CRITICAL — final-reply rule:** After your last tool call returns, your NEXT reply IS the final reply. That reply MUST contain the three sections above as plain text — DO NOT emit an empty message, DO NOT emit only tool calls, DO NOT defer to "the framework handles it". The framework parses your final reply text; if it is empty or missing the section headers, the run fails with `envelope_missing`.
+
+Tool calls happen BEFORE the final reply. Once you have called every tool you need (including terminal tools like `mark_resolved` / `mark_escalated`), emit the three sections as your final response.
diff --git a/examples/incident_management/skills/deep_investigator/system.md b/examples/incident_management/skills/deep_investigator/system.md
index d93a772..142fbbe 100644
--- a/examples/incident_management/skills/deep_investigator/system.md
+++ b/examples/incident_management/skills/deep_investigator/system.md
@@ -27,4 +27,6 @@ Every reply MUST end with these three markdown sections, in this order, with the
 <one of: default | success | failed | needs_input>
 ```
 
-Tool calls happen BEFORE this block. Once you emit `## Response` you are done — no more tool calls. The framework parses these sections; missing sections are a hard error.
+**CRITICAL — final-reply rule:** After your last tool call returns, your NEXT reply IS the final reply. That reply MUST contain the three sections above as plain text — DO NOT emit an empty message, DO NOT emit only tool calls, DO NOT defer to "the framework handles it". The framework parses your final reply text; if it is empty or missing the section headers, the run fails with `envelope_missing`.
+
+Tool calls happen BEFORE the final reply. Once you have called every tool you need (including terminal tools like `mark_resolved` / `mark_escalated`), emit the three sections as your final response.
diff --git a/examples/incident_management/skills/resolution/system.md b/examples/incident_management/skills/resolution/system.md
index 4fd28fc..2bf10f1 100644
--- a/examples/incident_management/skills/resolution/system.md
+++ b/examples/incident_management/skills/resolution/system.md
@@ -26,4 +26,6 @@ Every reply MUST end with these three markdown sections, in this order, with the
 <one of: default | success | failed | needs_input>
 ```
 
-Tool calls happen BEFORE this block. Once you emit `## Response` you are done — no more tool calls. The framework parses these sections; missing sections are a hard error.
+**CRITICAL — final-reply rule:** After your last tool call returns, your NEXT reply IS the final reply. That reply MUST contain the three sections above as plain text — DO NOT emit an empty message, DO NOT emit only tool calls, DO NOT defer to "the framework handles it". The framework parses your final reply text; if it is empty or missing the section headers, the run fails with `envelope_missing`.
+
+Tool calls happen BEFORE the final reply. Once you have called every tool you need (including terminal tools like `mark_resolved` / `mark_escalated`), emit the three sections as your final response.
diff --git a/examples/incident_management/skills/triage/system.md b/examples/incident_management/skills/triage/system.md
index 8c9bc89..58e5517 100644
--- a/examples/incident_management/skills/triage/system.md
+++ b/examples/incident_management/skills/triage/system.md
@@ -48,4 +48,6 @@ Every reply MUST end with these three markdown sections, in this order, with the
 <one of: default | success | failed | needs_input>
 ```
 
-Tool calls happen BEFORE this block. Once you emit `## Response` you are done — no more tool calls. The framework parses these sections; missing sections are a hard error.
+**CRITICAL — final-reply rule:** After your last tool call returns, your NEXT reply IS the final reply. That reply MUST contain the three sections above as plain text — DO NOT emit an empty message, DO NOT emit only tool calls, DO NOT defer to "the framework handles it". The framework parses your final reply text; if it is empty or missing the section headers, the run fails with `envelope_missing`.
+
+Tool calls happen BEFORE the final reply. Once you have called every tool you need (including terminal tools like `mark_resolved` / `mark_escalated`), emit the three sections as your final response.
diff --git a/src/runtime/agents/turn_output.py b/src/runtime/agents/turn_output.py
index 99a72ff..d6a3d02 100644
--- a/src/runtime/agents/turn_output.py
+++ b/src/runtime/agents/turn_output.py
@@ -93,7 +93,7 @@ def __init__(self, *, agent: str, field: str, message: str | None = None):
     # Leftmost float (allows int form), optional rationale after em-dash /
     # ASCII dash / hyphen separator. ``re.DOTALL`` so a multi-line rationale
     # is captured wholesale.
-    r"^\s*(-?[0-9]*\.?[0-9]+)\s*(?:[\u2014\-]+\s*(.*))?$",
+    r"^\s*(-?[0-9]*\.?[0-9]+)\s*(?:[\-\u2010\u2011\u2012\u2013\u2014\u2015]+\s*(.*))?$",
     re.DOTALL,
 )
 
@@ -284,9 +284,138 @@ def parse_envelope_from_result(
             continue
         break
 
-    # Path 5 (terminal): fail loudly. None of the four paths produced a
+    # Path 5 (D-22-fallback): some models (gpt-oss:20b in particular)
+    # treat a terminal-tool call as the completion and emit an empty
+    # closing AIMessage. Apps configure typed-terminal tools with
+    # ``confidence`` + ``confidence_rationale`` args by contract —
+    # synthesise an envelope from those args when the markdown path
+    # yielded nothing. Skip if any earlier path succeeded.
+    for msg in reversed(messages):
+        tcs = getattr(msg, "tool_calls", None)
+        if not tcs:
+            continue
+        for tc in tcs:
+            args = tc.get("args") or {}
+            if not isinstance(args, dict):
+                continue
+            conf = args.get("confidence")
+            rat = args.get("confidence_rationale") or args.get("rationale")
+            if conf is None or not rat:
+                continue
+            try:
+                conf_f = float(conf)
+            except (TypeError, ValueError):
+                continue
+            content_body = (
+                args.get("resolution_summary")
+                or args.get("reason")
+                or args.get("summary")
+                or f"terminal tool {tc.get('name', '<unknown>')} invoked"
+            )
+            if not isinstance(content_body, str) or not content_body.strip():
+                content_body = f"terminal tool {tc.get('name', '<unknown>')} invoked"
+            try:
+                env = AgentTurnOutput(
+                    content=content_body,
+                    confidence=max(0.0, min(1.0, conf_f)),
+                    confidence_rationale=str(rat),
+                    signal=args.get("signal"),
+                )
+            except ValidationError:
+                continue
+            _LOG.info(
+                "envelope_synthesized_from_tool_args agent=%s tool=%s "
+                "confidence=%.2f",
+                agent, tc.get("name"), env.confidence,
+            )
+            return env
+
+    # Path 6 (D-22-fallback-permissive): the model called at least one
+    # tool but never emitted the markdown contract or a typed-terminal
+    # tool with confidence args. Rather than hard-failing the run,
+    # synthesise a minimal envelope so the session reaches a terminal
+    # status (typically default_terminal_status -> needs_review) and
+    # the operator can review what happened. Confidence is intentionally
+    # low (0.30) so any HITL gate fires.
+    invoked_tool_names: list[str] = []
+    for msg in messages:
+        for tc in (getattr(msg, "tool_calls", None) or []):
+            n = tc.get("name")
+            if n:
+                invoked_tool_names.append(n)
+    if invoked_tool_names:
+        body = (
+            f"Agent {agent} invoked "
+            f"{', '.join(invoked_tool_names)} but did not emit a final "
+            f"summary. Review the tool-call audit for outcome."
+        )
+        try:
+            env = AgentTurnOutput(
+                content=body,
+                confidence=0.30,
+                confidence_rationale=(
+                    "synthesized: agent invoked tools but emitted no "
+                    "closing message"
+                ),
+                signal=None,
+            )
+            _LOG.warning(
+                "envelope_synthesized_permissive agent=%s tools=%s",
+                agent, invoked_tool_names,
+            )
+            return env
+        except ValidationError:
+            pass
+
+    # Path 7 (terminal): fail loudly. None of the six paths produced a
     # valid envelope — this is a real prompt-drift signal worth
-    # surfacing as a structured agent_run error.
+    # surfacing as a structured agent_run error. Log the last
+    # AIMessage content (capped) so operators can diagnose whether
+    # the LLM emitted the wrong shape vs nothing at all.
+    last_ai_content = ""
+    for msg in reversed(messages):
+        if msg.__class__.__name__ == "AIMessage":
+            c = getattr(msg, "content", None)
+            if isinstance(c, str):
+                last_ai_content = c
+                break
+    # Also dump every tool_call across all AIMessages so we can see
+    # whether the model called any tool with confidence args (Path 5
+    # would have fired) or whether it called a non-terminal tool only.
+    tool_call_summary = []
+    for msg in messages:
+        for tc in (getattr(msg, "tool_calls", None) or []):
+            args = tc.get("args") or {}
+            tool_call_summary.append({
+                "name": tc.get("name"),
+                "arg_keys": sorted(args.keys()) if isinstance(args, dict) else "?",
+                "has_confidence": isinstance(args, dict) and "confidence" in args,
+                "has_rationale": isinstance(args, dict) and (
+                    "confidence_rationale" in args or "rationale" in args
+                ),
+            })
+    # Detailed per-message dump so we can see exactly what shape the
+    # conversation took when the parse failed.
+    msg_summary = []
+    for m in messages:
+        kind = m.__class__.__name__
+        c = getattr(m, "content", None)
+        c_len = len(c) if isinstance(c, str) else (
+            f"list[{len(c)}]" if isinstance(c, list) else type(c).__name__
+        )
+        tcs = [tc.get("name") for tc in (getattr(m, "tool_calls", None) or [])]
+        msg_summary.append({"kind": kind, "content_len": c_len, "tool_calls": tcs})
+    _LOG.warning(
+        "envelope_missing: agent=%s last_ai_message_content=%r "
+        "structured_response_type=%s message_count=%d tool_calls=%r "
+        "msg_trace=%r",
+        agent,
+        last_ai_content[:1500] if last_ai_content else "<empty>",
+        type(result.get("structured_response")).__name__,
+        len(messages),
+        tool_call_summary,
+        msg_summary,
+    )
     raise EnvelopeMissingError(
         agent=agent,
         field="structured_response",
diff --git a/tests/test_markdown_turn_output.py b/tests/test_markdown_turn_output.py
index 4049cfe..1d7480a 100644
--- a/tests/test_markdown_turn_output.py
+++ b/tests/test_markdown_turn_output.py
@@ -289,3 +289,107 @@ def test_parse_envelope_from_result_raises_when_no_path_yields():
     result = {"messages": [_ai("just prose, no sections")]}
     with pytest.raises(EnvelopeMissingError):
         parse_envelope_from_result(result, agent="triage")
+
+
+def test_en_dash_rationale_gpt_oss_pattern():
+    """gpt-oss:20b emits the EN DASH (U+2013) as the confidence
+    separator, not the EM DASH (U+2014). The parser must accept the
+    full Unicode dash-punctuation family."""
+    md = "## Response\nbody\n## Confidence\n0.88 – strong evidence\n## Signal\nsuccess\n"
+    env = parse_markdown_envelope(md)
+    assert env.confidence == 0.88
+    assert env.confidence_rationale == "strong evidence"
+    assert env.signal == "success"
+
+
+@pytest.mark.parametrize("sep", ["-", "‐", "‑", "‒", "–", "—", "―"])
+def test_dash_family_variants_all_parse(sep):
+    """Sanity sweep: every Unicode dash in the regex class round-trips."""
+    md = f"## Response\nbody\n## Confidence\n0.5 {sep} ok\n## Signal\ndefault\n"
+    env = parse_markdown_envelope(md)
+    assert env.confidence == 0.5
+    assert env.confidence_rationale == "ok"
+
+
+def test_path5_synthesises_envelope_from_terminal_tool_args():
+    """gpt-oss:20b sometimes emits an empty closing AIMessage after
+    calling a terminal tool. Path 5 synthesises the envelope from
+    the tool's confidence + confidence_rationale args (mark_resolved
+    style)."""
+    from runtime.agents.turn_output import parse_envelope_from_result
+
+    def _ai_with_tool(name: str, args: dict, content: str = ""):
+        obj = type("AIMessage", (), {})()
+        obj.content = content
+        obj.tool_calls = [{"name": name, "args": args, "id": "t1"}]
+        return obj
+
+    def _ai_empty():
+        obj = type("AIMessage", (), {})()
+        obj.content = ""
+        obj.tool_calls = []
+        return obj
+
+    messages = [
+        _ai_with_tool(
+            "mark_resolved",
+            {
+                "incident_id": "INC-1",
+                "resolution_summary": "rolled back v1.120",
+                "confidence": 0.92,
+                "confidence_rationale": "deploy timeline correlates",
+            },
+        ),
+        _ai_empty(),  # empty closing message
+    ]
+    result = {"messages": messages, "structured_response": None}
+    env = parse_envelope_from_result(result, agent="resolution")
+    assert env.content == "rolled back v1.120"
+    assert env.confidence == 0.92
+    assert env.confidence_rationale == "deploy timeline correlates"
+
+
+def test_path5_synthesis_clamps_oob_confidence():
+    """Synthesis path also clamps out-of-range confidence."""
+    from runtime.agents.turn_output import parse_envelope_from_result
+
+    obj = type("AIMessage", (), {})()
+    obj.content = ""
+    obj.tool_calls = [{
+        "name": "mark_escalated",
+        "args": {
+            "team": "platform-oncall",
+            "reason": "needs human review",
+            "confidence": 1.5,
+            "confidence_rationale": "manual escalation",
+        },
+        "id": "t1",
+    }]
+    result = {"messages": [obj]}
+    env = parse_envelope_from_result(result, agent="resolution")
+    assert env.confidence == 1.0  # clamped
+    assert env.content == "needs human review"
+
+
+def test_path6_synthesises_minimal_envelope_from_any_tool_call():
+    """Path 6: when no markdown + no typed-terminal-tool-with-conf,
+    but the model DID call some tool, synthesise a low-confidence
+    placeholder envelope so the session reaches a reviewable terminal
+    status instead of hard-failing."""
+    from runtime.agents.turn_output import parse_envelope_from_result
+
+    obj = type("AIMessage", (), {})()
+    obj.content = ""
+    obj.tool_calls = [
+        {"name": "propose_fix", "args": {"hypothesis": "deploy regression"}, "id": "t1"},
+        {"name": "apply_fix", "args": {"proposal_id": "p1"}, "id": "t2"},
+    ]
+    closing = type("AIMessage", (), {})()
+    closing.content = ""
+    closing.tool_calls = []
+    result = {"messages": [obj, closing], "structured_response": None}
+    env = parse_envelope_from_result(result, agent="resolution")
+    assert env.confidence == 0.30
+    assert "propose_fix" in env.content
+    assert "apply_fix" in env.content
+    assert env.signal is None

From e6e521d92489e6e3ea37848ac4ec1c80069ea1a0 Mon Sep 17 00:00:00 2001
From: Amit Kumar <ak.nitrr13@gmail.com>
Date: Wed, 13 May 2026 16:20:52 +0000
Subject: [PATCH 4/6] fix(hitl): make HITL approve/reject actually work
 end-to-end on langgraph 1.x
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

langgraph 1.x changed the interrupt() contract: a tool that calls
interrupt() no longer raises GraphInterrupt to the caller. Instead,
the agent's ainvoke returns a normal result dict with an extra
__interrupt__ key. The framework's existing "except GraphInterrupt: raise"
arms in graph.py / responsive.py never fired — and the consequences
cascaded into four distinct failure modes that all conspired to make
Approve/Reject buttons look broken:

1. Interrupt missed at the agent runner. The runner saw an empty
   __interrupt__ result, fell through to envelope parsing (Path 6
   synthesis), recorded a phantom AgentRun, and routed onward. The
   pending_approval row written by the gateway just before the
   interrupt was orphaned the moment the session finalized.

2. Resume forwarding broken. Even after we caught __interrupt__ and
   re-raised GraphInterrupt, calling inner.invoke({"messages": [...]})
   on the rehydrated outer state silently SKIPPED the gated tool —
   the inner agent treated the new HumanMessage as a continuation
   instead of resuming the paused tool. No ToolMessage produced, no
   verdict delivered.

3. Stale state on resume. Outer Pregel checkpoints at step boundaries,
   not mid-step. So state["session"] on resume reflected the prior
   step's output (e.g. deep_investigator's), not the gateway's mid-step
   pending_approval append + version bump. The gateway then
   double-appended a pending row and store.save raised
   StaleVersionError ("version is N, expected N-1").

4. Status transitions never persisted. The gateway mutated
   session.tool_calls[pending_idx] in-memory to "approved" /
   "rejected" / "timeout" but never called store.save — so the DB row
   stayed at pending_approval forever, and the UI kept offering the
   buttons.

Fixes:

* New runtime.graph._drive_agent_with_resume helper drives the inner
  agent with proper pause-detection: probe inner.aget_state for a
  paused step, forward Command(resume=verdict) when resuming, loop on
  __interrupt__ for multi-pause turns. Falls back to invoke + re-raise
  when no checkpointer is configured (legacy unit-test path).

* Inner create_agent now receives the orchestrator's checkpointer —
  required for Command(resume=...) to function. Inner thread id is
  derived deterministically per agent invocation:
  f"{inc_id}:agent:{skill.name}:turn{len(incident.agents_run)}".

* make_agent_node (both graph.py + responsive.py) reloads the session
  from store at entry — defends against stale state["session"] from
  the outer Pregel checkpoint. wrap_tool closures capture the freshly
  loaded incident so the gateway's _find_existing_pending_index sees
  the persisted pending row on resume.

* gateway.wrap_tool (_run + _arun) calls store.save after every
  status transition (rejected / timeout / approved) so the audit row
  reflects the actual outcome.

* Orchestrator gains _is_graph_paused(session_id) helper and uses it
  to guard _finalize_session_status_async in stream_session +
  retry_session — a HITL pause must not be coerced into
  default_terminal_status.

* UI _submit_approval_via_service and the API
  POST /sessions/{sid}/approvals/{tcid} both finalize after a
  verdict-driven resume completes (and skip if a fresh interrupt
  re-paused the graph). Without this, an approved session would stay
  in_progress forever because the resume path bypasses stream_session.

Tests:

* tests/test_interrupt_detection.py — three tests: graph + responsive
  runners both raise GraphInterrupt when ainvoke returns __interrupt__,
  and a resume happy path that drives an outer Pregel through pause +
  Command(resume="approve") and asserts a ToolMessage with the verdict
  appears in the inner messages (proves the gated tool actually
  re-executed instead of being silently skipped).

* tests/test_genericity_ratchet.py baseline 154 → 156 with rationale:
  the inner-thread-id derivation reuses the existing `incident` local
  twice (graph.py + responsive.py), same pattern as Phase 10/11/12.
---
 src/runtime/agents/responsive.py  |  40 +++-
 src/runtime/api.py                |  10 +
 src/runtime/graph.py              | 160 ++++++++++++-
 src/runtime/orchestrator.py       |  54 ++++-
 src/runtime/tools/gateway.py      |  18 ++
 src/runtime/ui.py                 |  20 ++
 tests/test_genericity_ratchet.py  |  13 +-
 tests/test_interrupt_detection.py | 370 ++++++++++++++++++++++++++++++
 8 files changed, 661 insertions(+), 24 deletions(-)
 create mode 100644 tests/test_interrupt_detection.py

diff --git a/src/runtime/agents/responsive.py b/src/runtime/agents/responsive.py
index a2611ce..97b81b3 100644
--- a/src/runtime/agents/responsive.py
+++ b/src/runtime/agents/responsive.py
@@ -20,7 +20,7 @@
 
 import logging
 from datetime import datetime, timezone
-from typing import TYPE_CHECKING, Callable
+from typing import TYPE_CHECKING, Any, Callable
 
 from langchain_core.language_models.chat_models import BaseChatModel
 from langchain_core.messages import HumanMessage
@@ -59,6 +59,7 @@ def make_agent_node(
     patch_tool_names: frozenset[str] = frozenset(),
     gate_policy: "GatePolicy | None" = None,
     event_log: "EventLog | None" = None,
+    checkpointer: Any = None,
 ):
     """Factory: build a LangGraph node that runs a ReAct agent and decides a route.
 
@@ -80,7 +81,7 @@ def make_agent_node(
     # call time — both modules are fully imported before ``node()`` runs.
     from runtime.graph import (
         GraphState,
-        _ainvoke_with_retry,
+        _drive_agent_with_resume,
         _format_agent_input,
         _handle_agent_failure,
         _harvest_tool_calls_and_patches,
@@ -93,9 +94,18 @@ def make_agent_node(
     )
 
     async def node(state: GraphState) -> dict:
-        incident: Session = state["session"]  # pyright: ignore[reportTypedDictNotRequiredAccess]
-        inc_id = incident.id
+        state_session: Session = state["session"]  # pyright: ignore[reportTypedDictNotRequiredAccess]
+        inc_id = state_session.id
         started_at = datetime.now(timezone.utc).strftime(_UTC_TS_FMT)
+        # Always reload from store at entry — outer Pregel checkpoints
+        # at step boundaries, not mid-node, so ``state["session"]`` is
+        # stale relative to DB when a HITL gate paused mid-step. See
+        # the same reload comment in ``runtime.graph.make_agent_node``
+        # for the full rationale.
+        try:
+            incident: Session = store.load(inc_id)
+        except FileNotFoundError:
+            incident = state_session
 
         # M3: emit agent_started telemetry before any work happens.
         if event_log is not None:
@@ -125,11 +135,21 @@ async def node(state: GraphState) -> dict:
         # change as graph.py:make_agent_node — drop response_format
         # so the loop terminates on natural React END, then parse the
         # final AIMessage via Path 4 in parse_envelope_from_result.
+        # The inner agent gets the orchestrator's checkpointer so a
+        # HITL pause inside ``interrupt()`` can be resumed via
+        # ``Command(resume=verdict)`` on a stable per-invocation
+        # thread id (see ``_drive_agent_with_resume`` in
+        # ``runtime.graph`` for the full rationale).
         agent_executor = create_agent(
             model=llm,
             tools=run_tools,
             system_prompt=skill.system_prompt,
+            checkpointer=checkpointer,
+        )
+        inner_thread_id = (
+            f"{inc_id}:agent:{skill.name}:turn{len(incident.agents_run)}"
         )
+        inner_cfg = {"configurable": {"thread_id": inner_thread_id}}
 
         # Phase 11 (FOC-04): reset per-turn confidence hint at the
         # start of each agent step so the gateway treats the first
@@ -140,9 +160,15 @@ async def node(state: GraphState) -> dict:
             pass
 
         try:
-            result = await _ainvoke_with_retry(
-                agent_executor,
-                {"messages": [HumanMessage(content=_format_agent_input(incident))]},
+            result = await _drive_agent_with_resume(
+                agent_executor=agent_executor,
+                inner_cfg=inner_cfg,
+                inner_has_checkpointer=checkpointer is not None,
+                initial_input={
+                    "messages": [
+                        HumanMessage(content=_format_agent_input(incident))
+                    ]
+                },
             )
         except GraphInterrupt:
             # Phase 11 (FOC-04 / D-11-04): HITL pause -- propagate up.
diff --git a/src/runtime/api.py b/src/runtime/api.py
index c761230..0d3c11b 100644
--- a/src/runtime/api.py
+++ b/src/runtime/api.py
@@ -621,6 +621,16 @@ async def _resume() -> None:
                     Command(resume=decision_payload),
                     config=orch._thread_config(session_id),
                 )
+                # Finalize after the verdict-driven run completes so
+                # the session row reflects the terminal status (or
+                # falls through to ``default_terminal_status``). Skip
+                # if a fresh interrupt re-paused the graph — the
+                # session is again awaiting operator input. Mirrors
+                # the same guard in the UI's
+                # ``_submit_approval_via_service`` and the
+                # ``stream_session`` finalize call site.
+                if not await orch._is_graph_paused(session_id):
+                    await orch._finalize_session_status_async(session_id)
 
         # Submit the resume onto the long-lived service loop so we
         # don't fight the lifespan thread for the same FastMCP/SQLite
diff --git a/src/runtime/graph.py b/src/runtime/graph.py
index 1c6aa49..19b589f 100644
--- a/src/runtime/graph.py
+++ b/src/runtime/graph.py
@@ -199,8 +199,104 @@ def finish(self, *, summary: str) -> None:
 )
 
 
+async def _drive_agent_with_resume(
+    *,
+    agent_executor,
+    inner_cfg: dict,
+    inner_has_checkpointer: bool,
+    initial_input: dict,
+):
+    """Run an inner ``create_agent`` executor, surfacing HITL pauses to
+    the outer Pregel graph and forwarding resume verdicts back to the
+    inner agent.
+
+    langgraph 1.x changed the ``interrupt()`` contract: a tool that
+    calls :func:`langgraph.types.interrupt` no longer raises
+    :class:`GraphInterrupt` to the caller. Instead, the agent's
+    ``ainvoke`` returns normally with the pause captured in
+    ``result["__interrupt__"]``. To make HITL approval actually work
+    end-to-end the framework must:
+
+    1. **First call** — invoke the agent with the initial messages.
+       If the result carries ``__interrupt__``, raise
+       :class:`GraphInterrupt` so the *outer* Pregel pauses too. The
+       outer pause is what surfaces ``__interrupt__`` to the
+       orchestrator and lets the UI render Approve / Reject.
+
+    2. **Resume call** — when the outer is resumed via
+       ``Command(resume=verdict)``, this helper is re-entered. The
+       inner's checkpointer remembers the pause; we detect that via
+       ``aget_state(...).next`` being non-empty and call
+       :func:`langgraph.types.interrupt` at the *outer* level to fetch
+       the verdict, then forward it to the inner via
+       ``Command(resume=verdict)``. This is the only path that
+       actually re-runs the gated tool with the verdict — calling
+       ``inner.ainvoke({"messages": [...]})`` on a paused thread
+       silently skips the tool.
+
+    A while loop covers the case where a single agent turn pauses
+    multiple times (e.g. two high-risk tool calls in sequence).
+
+    When ``inner_has_checkpointer`` is False (legacy callers and most
+    unit tests that build agents without a checkpointer), the helper
+    falls back to the simple "invoke + re-raise on ``__interrupt__``"
+    contract. That path can pause the outer graph but the resume
+    verdict cannot reach the inner tool — which is acceptable because
+    legacy callers without a checkpointer never had a working HITL
+    resume path to begin with.
+    """
+    # Local imports keep callers that never touch HITL out of the
+    # langgraph.types import cost path. ``Command`` and ``interrupt``
+    # are tiny but the explicit lazy import documents the dependency.
+    from langgraph.types import Command, interrupt
+
+    # Resume-detection branch: only meaningful when the inner agent
+    # has its own checkpointer (otherwise ``aget_state`` on a fresh
+    # in-memory pregel raises). The detection asks: "does the inner
+    # have a paused step waiting?". A non-empty ``next`` means yes.
+    if inner_has_checkpointer:
+        try:
+            inner_state = await agent_executor.aget_state(inner_cfg)
+            inner_paused = bool(getattr(inner_state, "next", ()) or ())
+        except Exception:  # noqa: BLE001 — defensive; missing checkpoint == fresh
+            inner_paused = False
+    else:
+        inner_paused = False
+
+    if inner_paused:
+        # Outer is being resumed; pull the verdict via the outer's own
+        # interrupt() and forward it. The payload here is informational —
+        # the only thing the outer cares about is the resume value.
+        verdict = interrupt({"resume_inner_agent": inner_cfg["configurable"]["thread_id"]})
+        result = await _ainvoke_with_retry(
+            agent_executor, Command(resume=verdict), config=inner_cfg,
+        )
+    else:
+        result = await _ainvoke_with_retry(
+            agent_executor, initial_input, config=inner_cfg,
+        )
+
+    # Multi-pause loop: if the inner pauses again (or resumes into a
+    # second gated tool call), surface each pause to the outer the
+    # same way until the inner is fully done.
+    while isinstance(result, dict) and result.get("__interrupt__"):
+        if not inner_has_checkpointer:
+            # No checkpointer => no Command(resume=...) path is
+            # available; surface the pause to the outer Pregel and
+            # stop. Resume cannot deliver the verdict to the gated
+            # tool, but that matches legacy behavior.
+            raise GraphInterrupt(result["__interrupt__"])
+        verdict = interrupt(result["__interrupt__"])
+        result = await _ainvoke_with_retry(
+            agent_executor, Command(resume=verdict), config=inner_cfg,
+        )
+
+    return result
+
+
 async def _ainvoke_with_retry(executor, input_, *, max_attempts: int = 3,
-                              base_delay: float = 1.5):
+                              base_delay: float = 1.5,
+                              config: dict | None = None):
     """Wrap a LangGraph agent invocation with retry on transient cloud errors.
 
     Retries on common Ollama Cloud / streaming hiccups (500, status -1, etc.).
@@ -218,6 +314,14 @@ async def _ainvoke_with_retry(executor, input_, *, max_attempts: int = 3,
             # itself (AutoStrategy → ToolStrategy fallback for non-
             # function-calling Ollama models). The default langgraph
             # recursion bound is now a true upper bound, not a workaround.
+            #
+            # ``config`` (default ``None``) carries the per-thread
+            # ``configurable.thread_id`` for callers that need a
+            # checkpointer-scoped invocation (HITL resume path —
+            # see :func:`_drive_agent_with_resume`). Legacy callers
+            # pass nothing and the executor's default thread is used.
+            if config is not None:
+                return await executor.ainvoke(input_, config=config)
             return await executor.ainvoke(input_)
         except GraphInterrupt:
             # Phase 11 (FOC-04 / D-11-04): never retry a HITL pause.
@@ -550,6 +654,7 @@ def make_agent_node(
     injected_args: dict[str, str] | None = None,
     gate_policy: "GatePolicy | None" = None,
     event_log: "EventLog | None" = None,
+    checkpointer: Any = None,
 ) -> Callable[[GraphState], Awaitable[dict]]:
     """Factory: build a LangGraph node that runs a ReAct agent and decides a route.
 
@@ -581,10 +686,23 @@ def make_agent_node(
     """
 
     async def node(state: GraphState) -> dict:
-        incident = state["session"]  # pyright: ignore[reportTypedDictNotRequiredAccess] — orchestrator runtime always supplies session
-        inc_id = incident.id
+        state_session = state["session"]  # pyright: ignore[reportTypedDictNotRequiredAccess] — orchestrator runtime always supplies session
+        inc_id = state_session.id
         started_at = datetime.now(timezone.utc).strftime(_UTC_TS_FMT)
 
+        # Always reload from the persistent store at entry. Outer Pregel
+        # checkpoints at step boundaries, not mid-node — so on a HITL
+        # resume, ``state["session"]`` reflects the prior step's output
+        # (e.g. deep_investigator's), not the gateway's pending_approval
+        # row + version bump that happened mid-step in the original run.
+        # If we trust ``state["session"]`` here, the gateway sees no
+        # pending row, appends a duplicate, then ``store.save`` raises
+        # ``StaleVersionError`` because DB has already moved on.
+        try:
+            incident = store.load(inc_id)
+        except FileNotFoundError:
+            incident = state_session
+
         # M3 (per-step telemetry): emit agent_started.
         if event_log is not None:
             try:
@@ -694,11 +812,28 @@ def _run(**kwargs: Any) -> Any:
         # END interaction, recursion_limit ceilings). Markdown is the
         # native format every chat model writes well; the parse step
         # happens in the framework, where leniency is in our control.
+        # The inner agent gets a checkpointer so a HITL pause inside
+        # ``interrupt()`` can be resumed via ``Command(resume=verdict)``
+        # on the SAME inner thread. langgraph 1.x semantics require the
+        # checkpointer here — without it ``Command(resume=...)`` raises
+        # ``RuntimeError: Cannot use Command(resume=...) without
+        # checkpointer``. The thread id is derived deterministically
+        # from session + agent + the upcoming agent_run index so it is:
+        #   * STABLE across the inner pause and the outer resume that
+        #     follows (both observe the same ``len(incident.agents_run)``
+        #     because no new run is recorded mid-pause), and
+        #   * UNIQUE per agent invocation so previous invocations of the
+        #     same agent within the same session don't bleed in.
         agent_executor = create_agent(
             model=llm,
             tools=run_tools,
             system_prompt=skill.system_prompt,
+            checkpointer=checkpointer,
+        )
+        inner_thread_id = (
+            f"{inc_id}:agent:{skill.name}:turn{len(incident.agents_run)}"
         )
+        inner_cfg = {"configurable": {"thread_id": inner_thread_id}}
 
         # Phase 11 (FOC-04): reset per-turn confidence hint. The hint
         # is updated below after _harvest_tool_calls_and_patches; on
@@ -710,9 +845,15 @@ def _run(**kwargs: Any) -> Any:
             pass
 
         try:
-            result = await _ainvoke_with_retry(
-                agent_executor,
-                {"messages": [HumanMessage(content=_format_agent_input(incident))]},
+            result = await _drive_agent_with_resume(
+                agent_executor=agent_executor,
+                inner_cfg=inner_cfg,
+                inner_has_checkpointer=checkpointer is not None,
+                initial_input={
+                    "messages": [
+                        HumanMessage(content=_format_agent_input(incident))
+                    ]
+                },
             )
         except GraphInterrupt:
             # Phase 11 (FOC-04 / D-11-04): HITL pause is NOT an error.
@@ -1064,7 +1205,8 @@ async def gate(state: GraphState) -> dict:
 
 def _build_agent_nodes(*, cfg: AppConfig, skills: dict, store: SessionStore,
                        registry: ToolRegistry,
-                       event_log: "EventLog | None" = None) -> dict:
+                       event_log: "EventLog | None" = None,
+                       checkpointer: Any = None) -> dict:
     """Materialize agent nodes from skills + registry. Reused by main + resume graphs.
 
     Dispatches on ``skill.kind``:
@@ -1142,6 +1284,7 @@ def _build_agent_nodes(*, cfg: AppConfig, skills: dict, store: SessionStore,
             injected_args=cfg.orchestrator.injected_args,
             gate_policy=gate_policy,
             event_log=event_log,
+            checkpointer=checkpointer,
         )
     return nodes
 
@@ -1247,7 +1390,8 @@ async def build_graph(*, cfg: AppConfig, skills: dict, store: SessionStore,
 
     sg = StateGraph(GraphState)
     nodes = _build_agent_nodes(cfg=cfg, skills=skills, store=store,
-                                registry=registry, event_log=event_log)
+                                registry=registry, event_log=event_log,
+                                checkpointer=checkpointer)
     for agent_name, node in nodes.items():
         sg.add_node(agent_name, node)
     sg.add_node("gate", make_gate_node(
diff --git a/src/runtime/orchestrator.py b/src/runtime/orchestrator.py
index 9323deb..192920e 100644
--- a/src/runtime/orchestrator.py
+++ b/src/runtime/orchestrator.py
@@ -1057,6 +1057,29 @@ async def _finalize_session_status_async(
         async with self._locks.acquire(session_id):
             return self._finalize_session_status(session_id)
 
+    async def _is_graph_paused(self, session_id: str) -> bool:
+        """Return True iff the compiled graph has a pending step waiting
+        to resume (i.e. it's paused at an ``interrupt()`` boundary).
+
+        langgraph 1.x surfaces a HITL pause via the result dict's
+        ``__interrupt__`` field rather than raising
+        :class:`GraphInterrupt`. After ``astream_events`` (or
+        ``ainvoke``) returns, the only way to tell "the run paused"
+        from "the run completed" is to query the checkpointed graph
+        state — a non-empty ``next`` tuple means the graph has steps
+        queued to run when resumed. The stream/retry call sites use
+        this to skip ``_finalize_session_status_async`` on a pause —
+        finalizing on a pause would stamp ``default_terminal_status``
+        on a session that's actually waiting on operator approval, and
+        the orphaned ``pending_approval`` ToolCall row written by the
+        gateway would never get its resume.
+        """
+        try:
+            state = await self.graph.aget_state(self._thread_config(session_id))
+        except Exception:  # noqa: BLE001 — defensive; missing checkpoint == not paused
+            return False
+        return bool(getattr(state, "next", ()) or ())
+
     def _thread_config(self, incident_id: str) -> dict:
         """Build the LangGraph ``config`` dict for a per-session thread.
 
@@ -1266,10 +1289,18 @@ async def stream_session(self, *, query: str, environment: str,
             config=self._thread_config(inc.id),
         ):
             yield self._to_ui_event(ev, inc.id)
-        new_status = await self._finalize_session_status_async(inc.id)
-        if new_status:
-            yield {"event": "status_auto_finalized", "incident_id": inc.id,
-                   "status": new_status, "ts": _event_ts()}
+        # Skip finalize when the graph paused on an interrupt — the
+        # session is waiting for operator approval, not done. Stamping
+        # default_terminal_status here would orphan the pending_approval
+        # ToolCall row written by the gateway just before the pause.
+        if await self._is_graph_paused(inc.id):
+            yield {"event": "session_paused", "incident_id": inc.id,
+                   "ts": _event_ts()}
+        else:
+            new_status = await self._finalize_session_status_async(inc.id)
+            if new_status:
+                yield {"event": "status_auto_finalized", "incident_id": inc.id,
+                       "status": new_status, "ts": _event_ts()}
         yield {"event": "investigation_completed", "incident_id": inc.id, "ts": _event_ts()}
 
     async def stream_investigation(self, *, query: str, environment: str,
@@ -1542,10 +1573,17 @@ async def _retry_session_locked(self, session_id: str) -> AsyncIterator[dict]:
             config=self._thread_config(session_id),
         ):
             yield self._to_ui_event(ev, session_id)
-        new_status = await self._finalize_session_status_async(session_id)
-        if new_status:
-            yield {"event": "status_auto_finalized", "incident_id": session_id,
-                   "status": new_status, "ts": _event_ts()}
+        # See ``stream_session`` for why pause-detection guards the
+        # finalize call: a HITL pause must not be coerced into a
+        # terminal status.
+        if await self._is_graph_paused(session_id):
+            yield {"event": "session_paused", "incident_id": session_id,
+                   "ts": _event_ts()}
+        else:
+            new_status = await self._finalize_session_status_async(session_id)
+            if new_status:
+                yield {"event": "status_auto_finalized", "incident_id": session_id,
+                       "status": new_status, "ts": _event_ts()}
         yield {"event": "retry_completed", "incident_id": session_id,
                "ts": _event_ts()}
 
diff --git a/src/runtime/tools/gateway.py b/src/runtime/tools/gateway.py
index dc321d2..e0f17b3 100644
--- a/src/runtime/tools/gateway.py
+++ b/src/runtime/tools/gateway.py
@@ -497,6 +497,11 @@ def _run(self, *args: Any, **kwargs: Any) -> Any:  # noqa: D401
                             approved_at=_now_iso(),
                             approval_rationale=rationale,
                         )
+                        # Persist the status transition. Without this,
+                        # the DB row stays at ``pending_approval`` and
+                        # the UI keeps offering the buttons forever.
+                        if store is not None:
+                            store.save(session)
                     rejected_result = {"rejected": True, "rationale": rationale}
                     _emit_invoked(
                         status="rejected", risk="high",
@@ -523,6 +528,8 @@ def _run(self, *args: Any, **kwargs: Any) -> Any:  # noqa: D401
                             approved_at=_now_iso(),
                             approval_rationale=rationale,
                         )
+                        if store is not None:
+                            store.save(session)
                     timeout_result = {"timeout": True, "rationale": rationale}
                     _emit_invoked(
                         status="timeout", risk="high",
@@ -545,6 +552,8 @@ def _run(self, *args: Any, **kwargs: Any) -> Any:  # noqa: D401
                         approved_at=_now_iso(),
                         approval_rationale=rationale,
                     )
+                    if store is not None:
+                        store.save(session)
                 _emit_invoked(
                     status="approved", risk="high",
                     args_dict=pending_args, result=result,
@@ -675,6 +684,11 @@ async def _arun(self, *args: Any, **kwargs: Any) -> Any:  # noqa: D401
                             approved_at=_now_iso(),
                             approval_rationale=rationale,
                         )
+                        # Persist the status transition (mirror of the
+                        # sync path) so the DB row reflects the actual
+                        # outcome instead of staying at pending_approval.
+                        if store is not None:
+                            store.save(session)
                     rejected_result = {"rejected": True, "rationale": rationale}
                     _emit_invoked(
                         status="rejected", risk="high",
@@ -696,6 +710,8 @@ async def _arun(self, *args: Any, **kwargs: Any) -> Any:  # noqa: D401
                             approved_at=_now_iso(),
                             approval_rationale=rationale,
                         )
+                        if store is not None:
+                            store.save(session)
                     timeout_result = {"timeout": True, "rationale": rationale}
                     _emit_invoked(
                         status="timeout", risk="high",
@@ -717,6 +733,8 @@ async def _arun(self, *args: Any, **kwargs: Any) -> Any:  # noqa: D401
                         approved_at=_now_iso(),
                         approval_rationale=rationale,
                     )
+                    if store is not None:
+                        store.save(session)
                 _emit_invoked(
                     status="approved", risk="high",
                     args_dict=pending_args, result=result,
diff --git a/src/runtime/ui.py b/src/runtime/ui.py
index d2b4a7a..3277a1b 100644
--- a/src/runtime/ui.py
+++ b/src/runtime/ui.py
@@ -23,8 +23,20 @@
 # Orchestrator uses so both share the same SQLite DB.
 from __future__ import annotations
 import asyncio
+import logging as _logging
 import os
 import time
+
+# Optional: env-driven log config so manual runs (streamlit run, etc.)
+# can crank verbosity to INFO/DEBUG without modifying the source. The
+# default keeps the production-quiet WARNING level.
+_log_level = os.environ.get("ASR_LOG_LEVEL", "").upper().strip()
+if _log_level in {"DEBUG", "INFO", "WARNING", "ERROR"}:
+    _logging.basicConfig(
+        level=getattr(_logging, _log_level),
+        format="%(asctime)s %(name)s %(levelname)s %(message)s",
+        force=True,
+    )
 from datetime import datetime, timezone
 from pathlib import Path
 import streamlit as st
@@ -987,6 +999,14 @@ async def _drive() -> None:
             Command(resume=payload),
             config=orch._thread_config(session_id),
         )
+        # The graph completes the agent run after the verdict is
+        # forwarded to the gated tool; finalize the session if it's
+        # not paused on a fresh interrupt. Without this, an approved
+        # session stays in ``in_progress`` because the resume path
+        # does not run through ``stream_session`` (the only other
+        # caller of finalize).
+        if not await orch._is_graph_paused(session_id):
+            await orch._finalize_session_status_async(session_id)
 
     svc.submit_and_wait(_drive(), timeout=60.0)
 
diff --git a/tests/test_genericity_ratchet.py b/tests/test_genericity_ratchet.py
index 5baf392..37a8496 100644
--- a/tests/test_genericity_ratchet.py
+++ b/tests/test_genericity_ratchet.py
@@ -74,7 +74,18 @@
 #                no new domain concept introduced (was missed in the Phase 12
 #                atomic commit; counted retroactively in the v1.2 follow-up
 #                that consolidates injection-path bug fixes).
-BASELINE_TOTAL = 154
+#   154 -> 156   HITL pause-resume fix for langgraph 1.x. Both
+#                ``runtime.graph.make_agent_node`` and
+#                ``runtime.agents.responsive.make_agent_node`` derive a
+#                per-invocation inner thread id of the form
+#                ``f"{inc_id}:agent:{skill.name}:turn{len(incident.agents_run)}"``
+#                so the inner ``create_agent`` checkpointer can persist a
+#                paused tool across an outer Pregel pause and resume. The
+#                count uses the existing ``incident`` local (the runner's
+#                domain Session) on a structurally required code path —
+#                same pattern as the Phase 10/11/12 entries. Net +2
+#                ``incident`` token reuses, no new domain concept.
+BASELINE_TOTAL = 156
 
 
 def test_runtime_leaks_at_or_below_baseline():
diff --git a/tests/test_interrupt_detection.py b/tests/test_interrupt_detection.py
new file mode 100644
index 0000000..92dc5f1
--- /dev/null
+++ b/tests/test_interrupt_detection.py
@@ -0,0 +1,370 @@
+"""Regression: agent runner must raise GraphInterrupt when ``create_agent``
+surfaces a paused HITL gate via the result dict's ``__interrupt__`` field.
+
+Background — langgraph 1.x changed the interrupt contract: a tool that
+calls ``interrupt()`` no longer raises ``GraphInterrupt`` to the caller;
+instead, ``agent.invoke(...)`` (or ``ainvoke``) returns a normal result
+dict with an extra ``__interrupt__`` key carrying the captured
+:class:`langgraph.types.Interrupt` records. The framework's agent
+runner must therefore inspect the result and re-raise ``GraphInterrupt``
+itself; otherwise the outer Pregel completes the agent step, the
+envelope-parsing path fires (potentially Path 6 synthesis), the session
+gets finalized, and the ``pending_approval`` ToolCall row written by the
+gateway just before the interrupt is left orphaned — the UI's
+Approve / Reject buttons no longer drive any resume because the session
+is already terminal from the framework's perspective.
+
+These tests pin the contract at the agent-runner boundary in BOTH
+implementations: ``runtime.agents.responsive.make_agent_node`` (the
+generic responsive runner) and ``runtime.graph.make_agent_node`` (the
+incident-shape variant exposed via ``runtime.graph``). The fix is the
+same in both: after ``ainvoke``, inspect ``result["__interrupt__"]``
+and raise :class:`GraphInterrupt`.
+"""
+from __future__ import annotations
+
+import asyncio
+from pathlib import Path
+from typing import Any
+
+import pytest
+from langchain_core.language_models.chat_models import BaseChatModel
+from langchain_core.messages import AIMessage, BaseMessage
+from langchain_core.outputs import ChatGeneration, ChatResult
+from langchain_core.tools import StructuredTool
+from langgraph.errors import GraphInterrupt
+from langgraph.types import interrupt
+from pydantic import BaseModel
+
+from runtime.agents.responsive import make_agent_node as responsive_make_agent_node
+from runtime.config import EmbeddingConfig, MetadataConfig, ProviderConfig
+from runtime.graph import GraphState, make_agent_node as graph_make_agent_node, route_from_skill
+from runtime.skill import RouteRule, Skill
+from runtime.storage.embeddings import build_embedder
+from runtime.storage.engine import build_engine
+from runtime.storage.models import Base
+from runtime.storage.session_store import SessionStore
+
+
+# ---------------------------------------------------------------------------
+# Fixtures (mirror test_real_llm_tool_loop_termination shape so the
+# repo + session story stays consistent across the file).
+
+
+def _make_repo(tmp_path: Path) -> SessionStore:
+    eng = build_engine(MetadataConfig(url=f"sqlite:///{tmp_path}/test.db"))
+    Base.metadata.create_all(eng)
+    embedder = build_embedder(
+        EmbeddingConfig(provider="s", model="x", dim=1024),
+        {"s": ProviderConfig(kind="stub")},
+    )
+    return SessionStore(engine=eng, embedder=embedder)
+
+
+@pytest.fixture
+def repo(tmp_path: Path) -> SessionStore:
+    return _make_repo(tmp_path)
+
+
+@pytest.fixture
+def session(repo: SessionStore):
+    return repo.create(
+        query="HITL gate fires mid-tool",
+        environment="production",
+        reporter_id="u",
+        reporter_team="t",
+    )
+
+
+# ---------------------------------------------------------------------------
+# Stubs
+
+
+class _PauseArgs(BaseModel):
+    why: str = "test"
+
+
+def _pause_tool_impl(why: str = "test") -> str:  # noqa: D401
+    """Stand-in for a high-risk tool whose gateway raises interrupt()."""
+    interrupt({"reason": "approve_me", "why": why})
+    return f"ran:{why}"
+
+
+def _make_pause_tool() -> StructuredTool:
+    return StructuredTool.from_function(
+        func=_pause_tool_impl,
+        name="pause_tool",
+        description="Calls interrupt() — emulates the gateway's HITL gate.",
+        args_schema=_PauseArgs,
+    )
+
+
+class _OneShotToolCallLLM(BaseChatModel):
+    """Stub LLM that emits exactly one tool_call for ``pause_tool``.
+
+    Mirrors the production agent's first turn: the LLM calls a tool, the
+    tool gates, and the agent should pause. After resume the tool would
+    return a real value and the LLM would emit a closing markdown
+    envelope — but for THIS test we only assert that the first
+    invocation pauses; the runner must observe ``__interrupt__`` and
+    re-raise, so a closing turn is unreachable.
+    """
+
+    @property
+    def _llm_type(self) -> str:
+        return "oneshot-toolcall-stub"
+
+    def _generate(
+        self,
+        messages: list[BaseMessage],
+        stop: list[str] | None = None,
+        run_manager: Any = None,
+        **kwargs: Any,
+    ) -> ChatResult:
+        msg = AIMessage(
+            content="",
+            tool_calls=[{"name": "pause_tool", "args": {"why": "x"}, "id": "c1"}],
+        )
+        return ChatResult(generations=[ChatGeneration(message=msg)])
+
+    async def _agenerate(
+        self,
+        messages: list[BaseMessage],
+        stop: list[str] | None = None,
+        run_manager: Any = None,
+        **kwargs: Any,
+    ) -> ChatResult:
+        return self._generate(messages, stop, run_manager, **kwargs)
+
+    def bind_tools(self, tools, *, tool_choice=None, **kwargs):
+        return self
+
+
+# ---------------------------------------------------------------------------
+# graph.make_agent_node
+
+
+@pytest.mark.asyncio
+async def test_graph_make_agent_node_raises_on_interrupt_in_result(repo, session):
+    """When a tool calls ``interrupt()``, langgraph 1.x returns the
+    pause via ``result["__interrupt__"]`` instead of raising
+    ``GraphInterrupt``. The graph.py agent runner must detect this and
+    re-raise so the outer Pregel propagates the pause — otherwise the
+    envelope-parsing path fires, the session is finalized, and the
+    pending_approval row written by the gateway is orphaned.
+    """
+    skill = Skill(
+        name="resolution",
+        description="d",
+        routes=[RouteRule(when="default", next="end")],
+        system_prompt="You are resolution.",
+    )
+    node = graph_make_agent_node(
+        skill=skill,
+        llm=_OneShotToolCallLLM(),
+        tools=[_make_pause_tool()],
+        decide_route=lambda inc: route_from_skill(skill, inc),
+        store=repo,
+    )
+    state: GraphState = {"session": session, "next_route": None}
+    with pytest.raises(GraphInterrupt):
+        await asyncio.wait_for(node(state), timeout=5.0)
+
+
+# ---------------------------------------------------------------------------
+# responsive.make_agent_node
+
+
+@pytest.mark.asyncio
+async def test_responsive_make_agent_node_raises_on_interrupt_in_result(
+    repo, session,
+):
+    """Same contract as above, exercised through the generic responsive
+    runner. Both runners share the contract: ``__interrupt__`` in the
+    ainvoke result must be re-raised as ``GraphInterrupt``.
+    """
+    skill = Skill(
+        name="resolution",
+        description="d",
+        routes=[RouteRule(when="default", next="end")],
+        system_prompt="You are resolution.",
+    )
+    node = responsive_make_agent_node(
+        skill=skill,
+        llm=_OneShotToolCallLLM(),
+        tools=[_make_pause_tool()],
+        decide_route=lambda inc: route_from_skill(skill, inc),
+        store=repo,
+    )
+    state: GraphState = {"session": session, "next_route": None}
+    with pytest.raises(GraphInterrupt):
+        await asyncio.wait_for(node(state), timeout=5.0)
+
+
+# ---------------------------------------------------------------------------
+# Resume happy path — verdict actually reaches the gated tool.
+#
+# The previous tests cover the FIRST turn (pause). These tests cover the
+# RESUME turn: when the outer Pregel is rehydrated via
+# ``Command(resume=verdict)``, the helper ``_drive_agent_with_resume``
+# detects the inner agent's paused state via ``aget_state``, calls
+# ``interrupt()`` at the outer level to fetch the verdict, and forwards
+# it via ``Command(resume=verdict)`` to the inner agent. The inner
+# agent's tool then re-enters with the verdict in hand and produces a
+# real ToolMessage — which is the only proof that the gated work
+# actually executed (a "silent skip" path leaves no ToolMessage
+# behind).
+
+
+class _LLMWithCloser(BaseChatModel):
+    """LLM that emits a tool_call once, then a closing AIMessage."""
+
+    _counter: int = 0  # pyright: ignore[reportGeneralTypeIssues] — pydantic v2 needs annotation
+
+    @property
+    def _llm_type(self) -> str:
+        return "loop-toolcall-stub"
+
+    def _generate(
+        self,
+        messages: list[BaseMessage],
+        stop: list[str] | None = None,
+        run_manager: Any = None,
+        **kwargs: Any,
+    ) -> ChatResult:
+        self._counter += 1
+        if self._counter == 1:
+            msg = AIMessage(
+                content="",
+                tool_calls=[
+                    {"name": "pause_tool", "args": {"why": "x"}, "id": "c1"},
+                ],
+            )
+        else:
+            # Closing markdown envelope — Path 4 parser will pick it up.
+            md = (
+                "all good\n\n"
+                "## Response\nall good\n\n"
+                "## Confidence\n0.9 -- post-resume close\n\n"
+                "## Signal\nnone\n"
+            )
+            msg = AIMessage(content=md)
+        return ChatResult(generations=[ChatGeneration(message=msg)])
+
+    async def _agenerate(
+        self,
+        messages: list[BaseMessage],
+        stop: list[str] | None = None,
+        run_manager: Any = None,
+        **kwargs: Any,
+    ) -> ChatResult:
+        return self._generate(messages, stop, run_manager, **kwargs)
+
+    def bind_tools(self, tools, *, tool_choice=None, **kwargs):
+        return self
+
+
+@pytest.mark.asyncio
+async def test_resume_forwards_verdict_to_inner_tool_and_completes(
+    repo, session,
+):
+    """End-to-end pause → resume → tool actually runs with verdict.
+
+    Drives the agent runner across two invocations sharing a
+    ``langgraph.checkpoint.memory.InMemorySaver``. The first call
+    raises ``GraphInterrupt`` (paused mid-tool). The second call —
+    re-entered as the outer would re-enter the node body after a
+    ``Command(resume=...)`` — must (a) detect the inner pause via
+    ``aget_state``, (b) forward the verdict via
+    ``Command(resume=verdict)``, and (c) drive the tool to completion
+    so a ``ToolMessage`` appears in the final inner message list. The
+    ToolMessage is the proof that the gated tool actually executed
+    rather than being silently skipped.
+    """
+    from langgraph.checkpoint.memory import InMemorySaver
+    from langgraph.types import Command, interrupt
+
+    saver = InMemorySaver()
+
+    # Tool whose interrupt() returns the verdict and whose final
+    # return value records the verdict so we can assert it ended up in
+    # the message stream.
+    class _Args(BaseModel):
+        why: str = "x"
+
+    captured: dict[str, str | None] = {"verdict": None}
+
+    def _gated(why: str = "x") -> str:
+        v = interrupt({"need_verdict": why})
+        captured["verdict"] = v
+        return f"ran:{v}"
+
+    gated = StructuredTool.from_function(
+        func=_gated, name="pause_tool", description="emulates HITL gate",
+        args_schema=_Args,
+    )
+
+    # Build an inner agent the same way the framework would, but
+    # against the test's saver so the pause survives across calls.
+    from langchain.agents import create_agent
+
+    llm = _LLMWithCloser()
+    inner = create_agent(
+        model=llm, tools=[gated], system_prompt="x", checkpointer=saver,
+    )
+    inner_cfg = {"configurable": {"thread_id": "T-resume-test"}}
+
+    # The helper's ``interrupt()`` calls require a real Pregel
+    # runnable context — we cannot call ``_drive_agent_with_resume``
+    # directly. Exercise it inside a minimal one-node outer graph
+    # that owns its own checkpointer; that's exactly how the
+    # framework wires it via ``runtime.graph.build_graph``.
+    from runtime.graph import _drive_agent_with_resume
+    from langgraph.graph import StateGraph, START, END
+    from typing import TypedDict
+
+    class _OuterS(TypedDict, total=False):
+        result: object
+
+    async def _wrapper(state: _OuterS) -> dict:
+        out = await _drive_agent_with_resume(
+            agent_executor=inner, inner_cfg=inner_cfg,
+            inner_has_checkpointer=True,
+            initial_input={"messages": [("human", "go")]},
+        )
+        return {"result": out}
+
+    sg = StateGraph(_OuterS)
+    sg.add_node("n", _wrapper)
+    sg.add_edge(START, "n")
+    sg.add_edge("n", END)
+    outer = sg.compile(checkpointer=InMemorySaver())
+    outer_cfg = {"configurable": {"thread_id": "OUTER-T"}}
+
+    # First outer.invoke({}): the wrapper hits the inner pause and
+    # then the helper's outer interrupt() pauses the outer too.
+    first = await outer.ainvoke({}, config=outer_cfg)
+    assert "__interrupt__" in first
+
+    # Now resume the outer with the verdict — the helper forwards it
+    # to the inner via Command(resume="approve") and the gated tool
+    # actually re-enters with the verdict.
+    final = await outer.ainvoke(Command(resume="approve"), config=outer_cfg)
+
+    inner_result = final.get("result")
+    assert isinstance(inner_result, dict), f"unexpected inner result: {inner_result!r}"
+    msgs = inner_result.get("messages") or []
+    # Assert a ToolMessage exists with the gated-tool's actual output.
+    tool_msgs = [m for m in msgs if type(m).__name__ == "ToolMessage"]
+    assert tool_msgs, (
+        "expected a ToolMessage in the final inner messages — its "
+        "absence means the gated tool was silently skipped on resume "
+        f"(verdict={captured['verdict']!r}, messages={[type(m).__name__ for m in msgs]})"
+    )
+    assert "ran:approve" in str(tool_msgs[-1].content), (
+        f"tool ran but verdict was lost: {tool_msgs[-1].content!r}"
+    )
+    assert captured["verdict"] == "approve", (
+        f"tool's interrupt() did not receive 'approve' on resume: "
+        f"{captured['verdict']!r}"
+    )

From 1d408ba6e55ceeb02dd73327306dd226fc21fb60 Mon Sep 17 00:00:00 2001
From: Amit Kumar <ak.nitrr13@gmail.com>
Date: Wed, 13 May 2026 16:21:09 +0000
Subject: [PATCH 5/6] build: regenerate dist bundles after gpt-oss compat +
 HITL fix

Bundles dist/app.py + dist/apps/{code-review,incident-management}.py +
dist/ui.py in line with src/runtime changes from the two preceding
commits. No bundle-only edits.
---
 dist/app.py                      | 416 +++++++++++++++++++++++++++++--
 dist/apps/code-review.py         | 416 +++++++++++++++++++++++++++++--
 dist/apps/incident-management.py | 416 +++++++++++++++++++++++++++++--
 dist/ui.py                       |  28 ++-
 4 files changed, 1194 insertions(+), 82 deletions(-)

diff --git a/dist/app.py b/dist/app.py
index 7d8a93e..f7ca84e 100644
--- a/dist/app.py
+++ b/dist/app.py
@@ -713,7 +713,7 @@ class IncidentState(Session):
 """
 
 
-from typing import TYPE_CHECKING, Callable
+from typing import TYPE_CHECKING, Any, Callable
 
 from langchain_core.messages import HumanMessage
 from langchain.agents import create_agent
@@ -994,7 +994,6 @@ class IncidentState(Session):
 """
 
 
-from typing import TYPE_CHECKING, Any, Callable
 
 from fastapi import APIRouter, Depends, Header, HTTPException, Request, status
 from pydantic import ValidationError
@@ -6586,7 +6585,7 @@ def __init__(self, *, agent: str, field: str, message: str | None = None):
     # Leftmost float (allows int form), optional rationale after em-dash /
     # ASCII dash / hyphen separator. ``re.DOTALL`` so a multi-line rationale
     # is captured wholesale.
-    r"^\s*(-?[0-9]*\.?[0-9]+)\s*(?:[\u2014\-]+\s*(.*))?$",
+    r"^\s*(-?[0-9]*\.?[0-9]+)\s*(?:[\-\u2010\u2011\u2012\u2013\u2014\u2015]+\s*(.*))?$",
     re.DOTALL,
 )
 
@@ -6777,9 +6776,138 @@ def parse_envelope_from_result(
             continue
         break
 
-    # Path 5 (terminal): fail loudly. None of the four paths produced a
+    # Path 5 (D-22-fallback): some models (gpt-oss:20b in particular)
+    # treat a terminal-tool call as the completion and emit an empty
+    # closing AIMessage. Apps configure typed-terminal tools with
+    # ``confidence`` + ``confidence_rationale`` args by contract —
+    # synthesise an envelope from those args when the markdown path
+    # yielded nothing. Skip if any earlier path succeeded.
+    for msg in reversed(messages):
+        tcs = getattr(msg, "tool_calls", None)
+        if not tcs:
+            continue
+        for tc in tcs:
+            args = tc.get("args") or {}
+            if not isinstance(args, dict):
+                continue
+            conf = args.get("confidence")
+            rat = args.get("confidence_rationale") or args.get("rationale")
+            if conf is None or not rat:
+                continue
+            try:
+                conf_f = float(conf)
+            except (TypeError, ValueError):
+                continue
+            content_body = (
+                args.get("resolution_summary")
+                or args.get("reason")
+                or args.get("summary")
+                or f"terminal tool {tc.get('name', '<unknown>')} invoked"
+            )
+            if not isinstance(content_body, str) or not content_body.strip():
+                content_body = f"terminal tool {tc.get('name', '<unknown>')} invoked"
+            try:
+                env = AgentTurnOutput(
+                    content=content_body,
+                    confidence=max(0.0, min(1.0, conf_f)),
+                    confidence_rationale=str(rat),
+                    signal=args.get("signal"),
+                )
+            except ValidationError:
+                continue
+            _LOG.info(
+                "envelope_synthesized_from_tool_args agent=%s tool=%s "
+                "confidence=%.2f",
+                agent, tc.get("name"), env.confidence,
+            )
+            return env
+
+    # Path 6 (D-22-fallback-permissive): the model called at least one
+    # tool but never emitted the markdown contract or a typed-terminal
+    # tool with confidence args. Rather than hard-failing the run,
+    # synthesise a minimal envelope so the session reaches a terminal
+    # status (typically default_terminal_status -> needs_review) and
+    # the operator can review what happened. Confidence is intentionally
+    # low (0.30) so any HITL gate fires.
+    invoked_tool_names: list[str] = []
+    for msg in messages:
+        for tc in (getattr(msg, "tool_calls", None) or []):
+            n = tc.get("name")
+            if n:
+                invoked_tool_names.append(n)
+    if invoked_tool_names:
+        body = (
+            f"Agent {agent} invoked "
+            f"{', '.join(invoked_tool_names)} but did not emit a final "
+            f"summary. Review the tool-call audit for outcome."
+        )
+        try:
+            env = AgentTurnOutput(
+                content=body,
+                confidence=0.30,
+                confidence_rationale=(
+                    "synthesized: agent invoked tools but emitted no "
+                    "closing message"
+                ),
+                signal=None,
+            )
+            _LOG.warning(
+                "envelope_synthesized_permissive agent=%s tools=%s",
+                agent, invoked_tool_names,
+            )
+            return env
+        except ValidationError:
+            pass
+
+    # Path 7 (terminal): fail loudly. None of the six paths produced a
     # valid envelope — this is a real prompt-drift signal worth
-    # surfacing as a structured agent_run error.
+    # surfacing as a structured agent_run error. Log the last
+    # AIMessage content (capped) so operators can diagnose whether
+    # the LLM emitted the wrong shape vs nothing at all.
+    last_ai_content = ""
+    for msg in reversed(messages):
+        if msg.__class__.__name__ == "AIMessage":
+            c = getattr(msg, "content", None)
+            if isinstance(c, str):
+                last_ai_content = c
+                break
+    # Also dump every tool_call across all AIMessages so we can see
+    # whether the model called any tool with confidence args (Path 5
+    # would have fired) or whether it called a non-terminal tool only.
+    tool_call_summary = []
+    for msg in messages:
+        for tc in (getattr(msg, "tool_calls", None) or []):
+            args = tc.get("args") or {}
+            tool_call_summary.append({
+                "name": tc.get("name"),
+                "arg_keys": sorted(args.keys()) if isinstance(args, dict) else "?",
+                "has_confidence": isinstance(args, dict) and "confidence" in args,
+                "has_rationale": isinstance(args, dict) and (
+                    "confidence_rationale" in args or "rationale" in args
+                ),
+            })
+    # Detailed per-message dump so we can see exactly what shape the
+    # conversation took when the parse failed.
+    msg_summary = []
+    for m in messages:
+        kind = m.__class__.__name__
+        c = getattr(m, "content", None)
+        c_len = len(c) if isinstance(c, str) else (
+            f"list[{len(c)}]" if isinstance(c, list) else type(c).__name__
+        )
+        tcs = [tc.get("name") for tc in (getattr(m, "tool_calls", None) or [])]
+        msg_summary.append({"kind": kind, "content_len": c_len, "tool_calls": tcs})
+    _LOG.warning(
+        "envelope_missing: agent=%s last_ai_message_content=%r "
+        "structured_response_type=%s message_count=%d tool_calls=%r "
+        "msg_trace=%r",
+        agent,
+        last_ai_content[:1500] if last_ai_content else "<empty>",
+        type(result.get("structured_response")).__name__,
+        len(messages),
+        tool_call_summary,
+        msg_summary,
+    )
     raise EnvelopeMissingError(
         agent=agent,
         field="structured_response",
@@ -7299,6 +7427,11 @@ def _run(self, *args: Any, **kwargs: Any) -> Any:  # noqa: D401
                             approved_at=_now_iso(),
                             approval_rationale=rationale,
                         )
+                        # Persist the status transition. Without this,
+                        # the DB row stays at ``pending_approval`` and
+                        # the UI keeps offering the buttons forever.
+                        if store is not None:
+                            store.save(session)
                     rejected_result = {"rejected": True, "rationale": rationale}
                     _emit_invoked(
                         status="rejected", risk="high",
@@ -7325,6 +7458,8 @@ def _run(self, *args: Any, **kwargs: Any) -> Any:  # noqa: D401
                             approved_at=_now_iso(),
                             approval_rationale=rationale,
                         )
+                        if store is not None:
+                            store.save(session)
                     timeout_result = {"timeout": True, "rationale": rationale}
                     _emit_invoked(
                         status="timeout", risk="high",
@@ -7347,6 +7482,8 @@ def _run(self, *args: Any, **kwargs: Any) -> Any:  # noqa: D401
                         approved_at=_now_iso(),
                         approval_rationale=rationale,
                     )
+                    if store is not None:
+                        store.save(session)
                 _emit_invoked(
                     status="approved", risk="high",
                     args_dict=pending_args, result=result,
@@ -7477,6 +7614,11 @@ async def _arun(self, *args: Any, **kwargs: Any) -> Any:  # noqa: D401
                             approved_at=_now_iso(),
                             approval_rationale=rationale,
                         )
+                        # Persist the status transition (mirror of the
+                        # sync path) so the DB row reflects the actual
+                        # outcome instead of staying at pending_approval.
+                        if store is not None:
+                            store.save(session)
                     rejected_result = {"rejected": True, "rationale": rationale}
                     _emit_invoked(
                         status="rejected", risk="high",
@@ -7498,6 +7640,8 @@ async def _arun(self, *args: Any, **kwargs: Any) -> Any:  # noqa: D401
                             approved_at=_now_iso(),
                             approval_rationale=rationale,
                         )
+                        if store is not None:
+                            store.save(session)
                     timeout_result = {"timeout": True, "rationale": rationale}
                     _emit_invoked(
                         status="timeout", risk="high",
@@ -7519,6 +7663,8 @@ async def _arun(self, *args: Any, **kwargs: Any) -> Any:  # noqa: D401
                         approved_at=_now_iso(),
                         approval_rationale=rationale,
                     )
+                    if store is not None:
+                        store.save(session)
                 _emit_invoked(
                     status="approved", risk="high",
                     args_dict=pending_args, result=result,
@@ -8308,6 +8454,7 @@ def make_agent_node(
     patch_tool_names: frozenset[str] = frozenset(),
     gate_policy: "GatePolicy | None" = None,
     event_log: "EventLog | None" = None,
+    checkpointer: Any = None,
 ):
     """Factory: build a LangGraph node that runs a ReAct agent and decides a route.
 
@@ -8330,9 +8477,18 @@ def make_agent_node(
 
 
     async def node(state: GraphState) -> dict:
-        incident: Session = state["session"]  # pyright: ignore[reportTypedDictNotRequiredAccess]
-        inc_id = incident.id
+        state_session: Session = state["session"]  # pyright: ignore[reportTypedDictNotRequiredAccess]
+        inc_id = state_session.id
         started_at = datetime.now(timezone.utc).strftime(_UTC_TS_FMT)
+        # Always reload from store at entry — outer Pregel checkpoints
+        # at step boundaries, not mid-node, so ``state["session"]`` is
+        # stale relative to DB when a HITL gate paused mid-step. See
+        # the same reload comment in ``runtime.graph.make_agent_node``
+        # for the full rationale.
+        try:
+            incident: Session = store.load(inc_id)
+        except FileNotFoundError:
+            incident = state_session
 
         # M3: emit agent_started telemetry before any work happens.
         if event_log is not None:
@@ -8362,11 +8518,21 @@ async def node(state: GraphState) -> dict:
         # change as graph.py:make_agent_node — drop response_format
         # so the loop terminates on natural React END, then parse the
         # final AIMessage via Path 4 in parse_envelope_from_result.
+        # The inner agent gets the orchestrator's checkpointer so a
+        # HITL pause inside ``interrupt()`` can be resumed via
+        # ``Command(resume=verdict)`` on a stable per-invocation
+        # thread id (see ``_drive_agent_with_resume`` in
+        # ``runtime.graph`` for the full rationale).
         agent_executor = create_agent(
             model=llm,
             tools=run_tools,
             system_prompt=skill.system_prompt,
+            checkpointer=checkpointer,
+        )
+        inner_thread_id = (
+            f"{inc_id}:agent:{skill.name}:turn{len(incident.agents_run)}"
         )
+        inner_cfg = {"configurable": {"thread_id": inner_thread_id}}
 
         # Phase 11 (FOC-04): reset per-turn confidence hint at the
         # start of each agent step so the gateway treats the first
@@ -8377,9 +8543,15 @@ async def node(state: GraphState) -> dict:
             pass
 
         try:
-            result = await _ainvoke_with_retry(
-                agent_executor,
-                {"messages": [HumanMessage(content=_format_agent_input(incident))]},
+            result = await _drive_agent_with_resume(
+                agent_executor=agent_executor,
+                inner_cfg=inner_cfg,
+                inner_has_checkpointer=checkpointer is not None,
+                initial_input={
+                    "messages": [
+                        HumanMessage(content=_format_agent_input(incident))
+                    ]
+                },
             )
         except GraphInterrupt:
             # Phase 11 (FOC-04 / D-11-04): HITL pause -- propagate up.
@@ -9354,8 +9526,104 @@ def finish(self, *, summary: str) -> None:
 )
 
 
+async def _drive_agent_with_resume(
+    *,
+    agent_executor,
+    inner_cfg: dict,
+    inner_has_checkpointer: bool,
+    initial_input: dict,
+):
+    """Run an inner ``create_agent`` executor, surfacing HITL pauses to
+    the outer Pregel graph and forwarding resume verdicts back to the
+    inner agent.
+
+    langgraph 1.x changed the ``interrupt()`` contract: a tool that
+    calls :func:`langgraph.types.interrupt` no longer raises
+    :class:`GraphInterrupt` to the caller. Instead, the agent's
+    ``ainvoke`` returns normally with the pause captured in
+    ``result["__interrupt__"]``. To make HITL approval actually work
+    end-to-end the framework must:
+
+    1. **First call** — invoke the agent with the initial messages.
+       If the result carries ``__interrupt__``, raise
+       :class:`GraphInterrupt` so the *outer* Pregel pauses too. The
+       outer pause is what surfaces ``__interrupt__`` to the
+       orchestrator and lets the UI render Approve / Reject.
+
+    2. **Resume call** — when the outer is resumed via
+       ``Command(resume=verdict)``, this helper is re-entered. The
+       inner's checkpointer remembers the pause; we detect that via
+       ``aget_state(...).next`` being non-empty and call
+       :func:`langgraph.types.interrupt` at the *outer* level to fetch
+       the verdict, then forward it to the inner via
+       ``Command(resume=verdict)``. This is the only path that
+       actually re-runs the gated tool with the verdict — calling
+       ``inner.ainvoke({"messages": [...]})`` on a paused thread
+       silently skips the tool.
+
+    A while loop covers the case where a single agent turn pauses
+    multiple times (e.g. two high-risk tool calls in sequence).
+
+    When ``inner_has_checkpointer`` is False (legacy callers and most
+    unit tests that build agents without a checkpointer), the helper
+    falls back to the simple "invoke + re-raise on ``__interrupt__``"
+    contract. That path can pause the outer graph but the resume
+    verdict cannot reach the inner tool — which is acceptable because
+    legacy callers without a checkpointer never had a working HITL
+    resume path to begin with.
+    """
+    # Local imports keep callers that never touch HITL out of the
+    # langgraph.types import cost path. ``Command`` and ``interrupt``
+    # are tiny but the explicit lazy import documents the dependency.
+    from langgraph.types import Command, interrupt
+
+    # Resume-detection branch: only meaningful when the inner agent
+    # has its own checkpointer (otherwise ``aget_state`` on a fresh
+    # in-memory pregel raises). The detection asks: "does the inner
+    # have a paused step waiting?". A non-empty ``next`` means yes.
+    if inner_has_checkpointer:
+        try:
+            inner_state = await agent_executor.aget_state(inner_cfg)
+            inner_paused = bool(getattr(inner_state, "next", ()) or ())
+        except Exception:  # noqa: BLE001 — defensive; missing checkpoint == fresh
+            inner_paused = False
+    else:
+        inner_paused = False
+
+    if inner_paused:
+        # Outer is being resumed; pull the verdict via the outer's own
+        # interrupt() and forward it. The payload here is informational —
+        # the only thing the outer cares about is the resume value.
+        verdict = interrupt({"resume_inner_agent": inner_cfg["configurable"]["thread_id"]})
+        result = await _ainvoke_with_retry(
+            agent_executor, Command(resume=verdict), config=inner_cfg,
+        )
+    else:
+        result = await _ainvoke_with_retry(
+            agent_executor, initial_input, config=inner_cfg,
+        )
+
+    # Multi-pause loop: if the inner pauses again (or resumes into a
+    # second gated tool call), surface each pause to the outer the
+    # same way until the inner is fully done.
+    while isinstance(result, dict) and result.get("__interrupt__"):
+        if not inner_has_checkpointer:
+            # No checkpointer => no Command(resume=...) path is
+            # available; surface the pause to the outer Pregel and
+            # stop. Resume cannot deliver the verdict to the gated
+            # tool, but that matches legacy behavior.
+            raise GraphInterrupt(result["__interrupt__"])
+        verdict = interrupt(result["__interrupt__"])
+        result = await _ainvoke_with_retry(
+            agent_executor, Command(resume=verdict), config=inner_cfg,
+        )
+
+    return result
+
+
 async def _ainvoke_with_retry(executor, input_, *, max_attempts: int = 3,
-                              base_delay: float = 1.5):
+                              base_delay: float = 1.5,
+                              config: dict | None = None):
     """Wrap a LangGraph agent invocation with retry on transient cloud errors.
 
     Retries on common Ollama Cloud / streaming hiccups (500, status -1, etc.).
@@ -9373,6 +9641,14 @@ async def _ainvoke_with_retry(executor, input_, *, max_attempts: int = 3,
             # itself (AutoStrategy → ToolStrategy fallback for non-
             # function-calling Ollama models). The default langgraph
             # recursion bound is now a true upper bound, not a workaround.
+            #
+            # ``config`` (default ``None``) carries the per-thread
+            # ``configurable.thread_id`` for callers that need a
+            # checkpointer-scoped invocation (HITL resume path —
+            # see :func:`_drive_agent_with_resume`). Legacy callers
+            # pass nothing and the executor's default thread is used.
+            if config is not None:
+                return await executor.ainvoke(input_, config=config)
             return await executor.ainvoke(input_)
         except GraphInterrupt:
             # Phase 11 (FOC-04 / D-11-04): never retry a HITL pause.
@@ -9705,6 +9981,7 @@ def make_agent_node(
     injected_args: dict[str, str] | None = None,
     gate_policy: "GatePolicy | None" = None,
     event_log: "EventLog | None" = None,
+    checkpointer: Any = None,
 ) -> Callable[[GraphState], Awaitable[dict]]:
     """Factory: build a LangGraph node that runs a ReAct agent and decides a route.
 
@@ -9736,10 +10013,23 @@ def make_agent_node(
     """
 
     async def node(state: GraphState) -> dict:
-        incident = state["session"]  # pyright: ignore[reportTypedDictNotRequiredAccess] — orchestrator runtime always supplies session
-        inc_id = incident.id
+        state_session = state["session"]  # pyright: ignore[reportTypedDictNotRequiredAccess] — orchestrator runtime always supplies session
+        inc_id = state_session.id
         started_at = datetime.now(timezone.utc).strftime(_UTC_TS_FMT)
 
+        # Always reload from the persistent store at entry. Outer Pregel
+        # checkpoints at step boundaries, not mid-node — so on a HITL
+        # resume, ``state["session"]`` reflects the prior step's output
+        # (e.g. deep_investigator's), not the gateway's pending_approval
+        # row + version bump that happened mid-step in the original run.
+        # If we trust ``state["session"]`` here, the gateway sees no
+        # pending row, appends a duplicate, then ``store.save`` raises
+        # ``StaleVersionError`` because DB has already moved on.
+        try:
+            incident = store.load(inc_id)
+        except FileNotFoundError:
+            incident = state_session
+
         # M3 (per-step telemetry): emit agent_started.
         if event_log is not None:
             try:
@@ -9846,11 +10136,28 @@ def _run(**kwargs: Any) -> Any:
         # END interaction, recursion_limit ceilings). Markdown is the
         # native format every chat model writes well; the parse step
         # happens in the framework, where leniency is in our control.
+        # The inner agent gets a checkpointer so a HITL pause inside
+        # ``interrupt()`` can be resumed via ``Command(resume=verdict)``
+        # on the SAME inner thread. langgraph 1.x semantics require the
+        # checkpointer here — without it ``Command(resume=...)`` raises
+        # ``RuntimeError: Cannot use Command(resume=...) without
+        # checkpointer``. The thread id is derived deterministically
+        # from session + agent + the upcoming agent_run index so it is:
+        #   * STABLE across the inner pause and the outer resume that
+        #     follows (both observe the same ``len(incident.agents_run)``
+        #     because no new run is recorded mid-pause), and
+        #   * UNIQUE per agent invocation so previous invocations of the
+        #     same agent within the same session don't bleed in.
         agent_executor = create_agent(
             model=llm,
             tools=run_tools,
             system_prompt=skill.system_prompt,
+            checkpointer=checkpointer,
         )
+        inner_thread_id = (
+            f"{inc_id}:agent:{skill.name}:turn{len(incident.agents_run)}"
+        )
+        inner_cfg = {"configurable": {"thread_id": inner_thread_id}}
 
         # Phase 11 (FOC-04): reset per-turn confidence hint. The hint
         # is updated below after _harvest_tool_calls_and_patches; on
@@ -9862,9 +10169,15 @@ def _run(**kwargs: Any) -> Any:
             pass
 
         try:
-            result = await _ainvoke_with_retry(
-                agent_executor,
-                {"messages": [HumanMessage(content=_format_agent_input(incident))]},
+            result = await _drive_agent_with_resume(
+                agent_executor=agent_executor,
+                inner_cfg=inner_cfg,
+                inner_has_checkpointer=checkpointer is not None,
+                initial_input={
+                    "messages": [
+                        HumanMessage(content=_format_agent_input(incident))
+                    ]
+                },
             )
         except GraphInterrupt:
             # Phase 11 (FOC-04 / D-11-04): HITL pause is NOT an error.
@@ -10216,7 +10529,8 @@ async def gate(state: GraphState) -> dict:
 
 def _build_agent_nodes(*, cfg: AppConfig, skills: dict, store: SessionStore,
                        registry: ToolRegistry,
-                       event_log: "EventLog | None" = None) -> dict:
+                       event_log: "EventLog | None" = None,
+                       checkpointer: Any = None) -> dict:
     """Materialize agent nodes from skills + registry. Reused by main + resume graphs.
 
     Dispatches on ``skill.kind``:
@@ -10294,6 +10608,7 @@ def _build_agent_nodes(*, cfg: AppConfig, skills: dict, store: SessionStore,
             injected_args=cfg.orchestrator.injected_args,
             gate_policy=gate_policy,
             event_log=event_log,
+            checkpointer=checkpointer,
         )
     return nodes
 
@@ -10399,7 +10714,8 @@ async def build_graph(*, cfg: AppConfig, skills: dict, store: SessionStore,
 
     sg = StateGraph(GraphState)
     nodes = _build_agent_nodes(cfg=cfg, skills=skills, store=store,
-                                registry=registry, event_log=event_log)
+                                registry=registry, event_log=event_log,
+                                checkpointer=checkpointer)
     for agent_name, node in nodes.items():
         sg.add_node(agent_name, node)
     sg.add_node("gate", make_gate_node(
@@ -14173,6 +14489,29 @@ async def _finalize_session_status_async(
         async with self._locks.acquire(session_id):
             return self._finalize_session_status(session_id)
 
+    async def _is_graph_paused(self, session_id: str) -> bool:
+        """Return True iff the compiled graph has a pending step waiting
+        to resume (i.e. it's paused at an ``interrupt()`` boundary).
+
+        langgraph 1.x surfaces a HITL pause via the result dict's
+        ``__interrupt__`` field rather than raising
+        :class:`GraphInterrupt`. After ``astream_events`` (or
+        ``ainvoke``) returns, the only way to tell "the run paused"
+        from "the run completed" is to query the checkpointed graph
+        state — a non-empty ``next`` tuple means the graph has steps
+        queued to run when resumed. The stream/retry call sites use
+        this to skip ``_finalize_session_status_async`` on a pause —
+        finalizing on a pause would stamp ``default_terminal_status``
+        on a session that's actually waiting on operator approval, and
+        the orphaned ``pending_approval`` ToolCall row written by the
+        gateway would never get its resume.
+        """
+        try:
+            state = await self.graph.aget_state(self._thread_config(session_id))
+        except Exception:  # noqa: BLE001 — defensive; missing checkpoint == not paused
+            return False
+        return bool(getattr(state, "next", ()) or ())
+
     def _thread_config(self, incident_id: str) -> dict:
         """Build the LangGraph ``config`` dict for a per-session thread.
 
@@ -14382,10 +14721,18 @@ async def stream_session(self, *, query: str, environment: str,
             config=self._thread_config(inc.id),
         ):
             yield self._to_ui_event(ev, inc.id)
-        new_status = await self._finalize_session_status_async(inc.id)
-        if new_status:
-            yield {"event": "status_auto_finalized", "incident_id": inc.id,
-                   "status": new_status, "ts": _event_ts()}
+        # Skip finalize when the graph paused on an interrupt — the
+        # session is waiting for operator approval, not done. Stamping
+        # default_terminal_status here would orphan the pending_approval
+        # ToolCall row written by the gateway just before the pause.
+        if await self._is_graph_paused(inc.id):
+            yield {"event": "session_paused", "incident_id": inc.id,
+                   "ts": _event_ts()}
+        else:
+            new_status = await self._finalize_session_status_async(inc.id)
+            if new_status:
+                yield {"event": "status_auto_finalized", "incident_id": inc.id,
+                       "status": new_status, "ts": _event_ts()}
         yield {"event": "investigation_completed", "incident_id": inc.id, "ts": _event_ts()}
 
     async def stream_investigation(self, *, query: str, environment: str,
@@ -14658,10 +15005,17 @@ async def _retry_session_locked(self, session_id: str) -> AsyncIterator[dict]:
             config=self._thread_config(session_id),
         ):
             yield self._to_ui_event(ev, session_id)
-        new_status = await self._finalize_session_status_async(session_id)
-        if new_status:
-            yield {"event": "status_auto_finalized", "incident_id": session_id,
-                   "status": new_status, "ts": _event_ts()}
+        # See ``stream_session`` for why pause-detection guards the
+        # finalize call: a HITL pause must not be coerced into a
+        # terminal status.
+        if await self._is_graph_paused(session_id):
+            yield {"event": "session_paused", "incident_id": session_id,
+                   "ts": _event_ts()}
+        else:
+            new_status = await self._finalize_session_status_async(session_id)
+            if new_status:
+                yield {"event": "status_auto_finalized", "incident_id": session_id,
+                       "status": new_status, "ts": _event_ts()}
         yield {"event": "retry_completed", "incident_id": session_id,
                "ts": _event_ts()}
 
@@ -15354,6 +15708,16 @@ async def _resume() -> None:
                     Command(resume=decision_payload),
                     config=orch._thread_config(session_id),
                 )
+                # Finalize after the verdict-driven run completes so
+                # the session row reflects the terminal status (or
+                # falls through to ``default_terminal_status``). Skip
+                # if a fresh interrupt re-paused the graph — the
+                # session is again awaiting operator input. Mirrors
+                # the same guard in the UI's
+                # ``_submit_approval_via_service`` and the
+                # ``stream_session`` finalize call site.
+                if not await orch._is_graph_paused(session_id):
+                    await orch._finalize_session_status_async(session_id)
 
         # Submit the resume onto the long-lived service loop so we
         # don't fight the lifespan thread for the same FastMCP/SQLite
diff --git a/dist/apps/code-review.py b/dist/apps/code-review.py
index b078936..57c3a5c 100644
--- a/dist/apps/code-review.py
+++ b/dist/apps/code-review.py
@@ -713,7 +713,7 @@ class IncidentState(Session):
 """
 
 
-from typing import TYPE_CHECKING, Callable
+from typing import TYPE_CHECKING, Any, Callable
 
 from langchain_core.messages import HumanMessage
 from langchain.agents import create_agent
@@ -994,7 +994,6 @@ class IncidentState(Session):
 """
 
 
-from typing import TYPE_CHECKING, Any, Callable
 
 from fastapi import APIRouter, Depends, Header, HTTPException, Request, status
 from pydantic import ValidationError
@@ -6639,7 +6638,7 @@ def __init__(self, *, agent: str, field: str, message: str | None = None):
     # Leftmost float (allows int form), optional rationale after em-dash /
     # ASCII dash / hyphen separator. ``re.DOTALL`` so a multi-line rationale
     # is captured wholesale.
-    r"^\s*(-?[0-9]*\.?[0-9]+)\s*(?:[\u2014\-]+\s*(.*))?$",
+    r"^\s*(-?[0-9]*\.?[0-9]+)\s*(?:[\-\u2010\u2011\u2012\u2013\u2014\u2015]+\s*(.*))?$",
     re.DOTALL,
 )
 
@@ -6830,9 +6829,138 @@ def parse_envelope_from_result(
             continue
         break
 
-    # Path 5 (terminal): fail loudly. None of the four paths produced a
+    # Path 5 (D-22-fallback): some models (gpt-oss:20b in particular)
+    # treat a terminal-tool call as the completion and emit an empty
+    # closing AIMessage. Apps configure typed-terminal tools with
+    # ``confidence`` + ``confidence_rationale`` args by contract —
+    # synthesise an envelope from those args when the markdown path
+    # yielded nothing. Skip if any earlier path succeeded.
+    for msg in reversed(messages):
+        tcs = getattr(msg, "tool_calls", None)
+        if not tcs:
+            continue
+        for tc in tcs:
+            args = tc.get("args") or {}
+            if not isinstance(args, dict):
+                continue
+            conf = args.get("confidence")
+            rat = args.get("confidence_rationale") or args.get("rationale")
+            if conf is None or not rat:
+                continue
+            try:
+                conf_f = float(conf)
+            except (TypeError, ValueError):
+                continue
+            content_body = (
+                args.get("resolution_summary")
+                or args.get("reason")
+                or args.get("summary")
+                or f"terminal tool {tc.get('name', '<unknown>')} invoked"
+            )
+            if not isinstance(content_body, str) or not content_body.strip():
+                content_body = f"terminal tool {tc.get('name', '<unknown>')} invoked"
+            try:
+                env = AgentTurnOutput(
+                    content=content_body,
+                    confidence=max(0.0, min(1.0, conf_f)),
+                    confidence_rationale=str(rat),
+                    signal=args.get("signal"),
+                )
+            except ValidationError:
+                continue
+            _LOG.info(
+                "envelope_synthesized_from_tool_args agent=%s tool=%s "
+                "confidence=%.2f",
+                agent, tc.get("name"), env.confidence,
+            )
+            return env
+
+    # Path 6 (D-22-fallback-permissive): the model called at least one
+    # tool but never emitted the markdown contract or a typed-terminal
+    # tool with confidence args. Rather than hard-failing the run,
+    # synthesise a minimal envelope so the session reaches a terminal
+    # status (typically default_terminal_status -> needs_review) and
+    # the operator can review what happened. Confidence is intentionally
+    # low (0.30) so any HITL gate fires.
+    invoked_tool_names: list[str] = []
+    for msg in messages:
+        for tc in (getattr(msg, "tool_calls", None) or []):
+            n = tc.get("name")
+            if n:
+                invoked_tool_names.append(n)
+    if invoked_tool_names:
+        body = (
+            f"Agent {agent} invoked "
+            f"{', '.join(invoked_tool_names)} but did not emit a final "
+            f"summary. Review the tool-call audit for outcome."
+        )
+        try:
+            env = AgentTurnOutput(
+                content=body,
+                confidence=0.30,
+                confidence_rationale=(
+                    "synthesized: agent invoked tools but emitted no "
+                    "closing message"
+                ),
+                signal=None,
+            )
+            _LOG.warning(
+                "envelope_synthesized_permissive agent=%s tools=%s",
+                agent, invoked_tool_names,
+            )
+            return env
+        except ValidationError:
+            pass
+
+    # Path 7 (terminal): fail loudly. None of the six paths produced a
     # valid envelope — this is a real prompt-drift signal worth
-    # surfacing as a structured agent_run error.
+    # surfacing as a structured agent_run error. Log the last
+    # AIMessage content (capped) so operators can diagnose whether
+    # the LLM emitted the wrong shape vs nothing at all.
+    last_ai_content = ""
+    for msg in reversed(messages):
+        if msg.__class__.__name__ == "AIMessage":
+            c = getattr(msg, "content", None)
+            if isinstance(c, str):
+                last_ai_content = c
+                break
+    # Also dump every tool_call across all AIMessages so we can see
+    # whether the model called any tool with confidence args (Path 5
+    # would have fired) or whether it called a non-terminal tool only.
+    tool_call_summary = []
+    for msg in messages:
+        for tc in (getattr(msg, "tool_calls", None) or []):
+            args = tc.get("args") or {}
+            tool_call_summary.append({
+                "name": tc.get("name"),
+                "arg_keys": sorted(args.keys()) if isinstance(args, dict) else "?",
+                "has_confidence": isinstance(args, dict) and "confidence" in args,
+                "has_rationale": isinstance(args, dict) and (
+                    "confidence_rationale" in args or "rationale" in args
+                ),
+            })
+    # Detailed per-message dump so we can see exactly what shape the
+    # conversation took when the parse failed.
+    msg_summary = []
+    for m in messages:
+        kind = m.__class__.__name__
+        c = getattr(m, "content", None)
+        c_len = len(c) if isinstance(c, str) else (
+            f"list[{len(c)}]" if isinstance(c, list) else type(c).__name__
+        )
+        tcs = [tc.get("name") for tc in (getattr(m, "tool_calls", None) or [])]
+        msg_summary.append({"kind": kind, "content_len": c_len, "tool_calls": tcs})
+    _LOG.warning(
+        "envelope_missing: agent=%s last_ai_message_content=%r "
+        "structured_response_type=%s message_count=%d tool_calls=%r "
+        "msg_trace=%r",
+        agent,
+        last_ai_content[:1500] if last_ai_content else "<empty>",
+        type(result.get("structured_response")).__name__,
+        len(messages),
+        tool_call_summary,
+        msg_summary,
+    )
     raise EnvelopeMissingError(
         agent=agent,
         field="structured_response",
@@ -7352,6 +7480,11 @@ def _run(self, *args: Any, **kwargs: Any) -> Any:  # noqa: D401
                             approved_at=_now_iso(),
                             approval_rationale=rationale,
                         )
+                        # Persist the status transition. Without this,
+                        # the DB row stays at ``pending_approval`` and
+                        # the UI keeps offering the buttons forever.
+                        if store is not None:
+                            store.save(session)
                     rejected_result = {"rejected": True, "rationale": rationale}
                     _emit_invoked(
                         status="rejected", risk="high",
@@ -7378,6 +7511,8 @@ def _run(self, *args: Any, **kwargs: Any) -> Any:  # noqa: D401
                             approved_at=_now_iso(),
                             approval_rationale=rationale,
                         )
+                        if store is not None:
+                            store.save(session)
                     timeout_result = {"timeout": True, "rationale": rationale}
                     _emit_invoked(
                         status="timeout", risk="high",
@@ -7400,6 +7535,8 @@ def _run(self, *args: Any, **kwargs: Any) -> Any:  # noqa: D401
                         approved_at=_now_iso(),
                         approval_rationale=rationale,
                     )
+                    if store is not None:
+                        store.save(session)
                 _emit_invoked(
                     status="approved", risk="high",
                     args_dict=pending_args, result=result,
@@ -7530,6 +7667,11 @@ async def _arun(self, *args: Any, **kwargs: Any) -> Any:  # noqa: D401
                             approved_at=_now_iso(),
                             approval_rationale=rationale,
                         )
+                        # Persist the status transition (mirror of the
+                        # sync path) so the DB row reflects the actual
+                        # outcome instead of staying at pending_approval.
+                        if store is not None:
+                            store.save(session)
                     rejected_result = {"rejected": True, "rationale": rationale}
                     _emit_invoked(
                         status="rejected", risk="high",
@@ -7551,6 +7693,8 @@ async def _arun(self, *args: Any, **kwargs: Any) -> Any:  # noqa: D401
                             approved_at=_now_iso(),
                             approval_rationale=rationale,
                         )
+                        if store is not None:
+                            store.save(session)
                     timeout_result = {"timeout": True, "rationale": rationale}
                     _emit_invoked(
                         status="timeout", risk="high",
@@ -7572,6 +7716,8 @@ async def _arun(self, *args: Any, **kwargs: Any) -> Any:  # noqa: D401
                         approved_at=_now_iso(),
                         approval_rationale=rationale,
                     )
+                    if store is not None:
+                        store.save(session)
                 _emit_invoked(
                     status="approved", risk="high",
                     args_dict=pending_args, result=result,
@@ -8361,6 +8507,7 @@ def make_agent_node(
     patch_tool_names: frozenset[str] = frozenset(),
     gate_policy: "GatePolicy | None" = None,
     event_log: "EventLog | None" = None,
+    checkpointer: Any = None,
 ):
     """Factory: build a LangGraph node that runs a ReAct agent and decides a route.
 
@@ -8383,9 +8530,18 @@ def make_agent_node(
 
 
     async def node(state: GraphState) -> dict:
-        incident: Session = state["session"]  # pyright: ignore[reportTypedDictNotRequiredAccess]
-        inc_id = incident.id
+        state_session: Session = state["session"]  # pyright: ignore[reportTypedDictNotRequiredAccess]
+        inc_id = state_session.id
         started_at = datetime.now(timezone.utc).strftime(_UTC_TS_FMT)
+        # Always reload from store at entry — outer Pregel checkpoints
+        # at step boundaries, not mid-node, so ``state["session"]`` is
+        # stale relative to DB when a HITL gate paused mid-step. See
+        # the same reload comment in ``runtime.graph.make_agent_node``
+        # for the full rationale.
+        try:
+            incident: Session = store.load(inc_id)
+        except FileNotFoundError:
+            incident = state_session
 
         # M3: emit agent_started telemetry before any work happens.
         if event_log is not None:
@@ -8415,11 +8571,21 @@ async def node(state: GraphState) -> dict:
         # change as graph.py:make_agent_node — drop response_format
         # so the loop terminates on natural React END, then parse the
         # final AIMessage via Path 4 in parse_envelope_from_result.
+        # The inner agent gets the orchestrator's checkpointer so a
+        # HITL pause inside ``interrupt()`` can be resumed via
+        # ``Command(resume=verdict)`` on a stable per-invocation
+        # thread id (see ``_drive_agent_with_resume`` in
+        # ``runtime.graph`` for the full rationale).
         agent_executor = create_agent(
             model=llm,
             tools=run_tools,
             system_prompt=skill.system_prompt,
+            checkpointer=checkpointer,
         )
+        inner_thread_id = (
+            f"{inc_id}:agent:{skill.name}:turn{len(incident.agents_run)}"
+        )
+        inner_cfg = {"configurable": {"thread_id": inner_thread_id}}
 
         # Phase 11 (FOC-04): reset per-turn confidence hint at the
         # start of each agent step so the gateway treats the first
@@ -8430,9 +8596,15 @@ async def node(state: GraphState) -> dict:
             pass
 
         try:
-            result = await _ainvoke_with_retry(
-                agent_executor,
-                {"messages": [HumanMessage(content=_format_agent_input(incident))]},
+            result = await _drive_agent_with_resume(
+                agent_executor=agent_executor,
+                inner_cfg=inner_cfg,
+                inner_has_checkpointer=checkpointer is not None,
+                initial_input={
+                    "messages": [
+                        HumanMessage(content=_format_agent_input(incident))
+                    ]
+                },
             )
         except GraphInterrupt:
             # Phase 11 (FOC-04 / D-11-04): HITL pause -- propagate up.
@@ -9407,8 +9579,104 @@ def finish(self, *, summary: str) -> None:
 )
 
 
+async def _drive_agent_with_resume(
+    *,
+    agent_executor,
+    inner_cfg: dict,
+    inner_has_checkpointer: bool,
+    initial_input: dict,
+):
+    """Run an inner ``create_agent`` executor, surfacing HITL pauses to
+    the outer Pregel graph and forwarding resume verdicts back to the
+    inner agent.
+
+    langgraph 1.x changed the ``interrupt()`` contract: a tool that
+    calls :func:`langgraph.types.interrupt` no longer raises
+    :class:`GraphInterrupt` to the caller. Instead, the agent's
+    ``ainvoke`` returns normally with the pause captured in
+    ``result["__interrupt__"]``. To make HITL approval actually work
+    end-to-end the framework must:
+
+    1. **First call** — invoke the agent with the initial messages.
+       If the result carries ``__interrupt__``, raise
+       :class:`GraphInterrupt` so the *outer* Pregel pauses too. The
+       outer pause is what surfaces ``__interrupt__`` to the
+       orchestrator and lets the UI render Approve / Reject.
+
+    2. **Resume call** — when the outer is resumed via
+       ``Command(resume=verdict)``, this helper is re-entered. The
+       inner's checkpointer remembers the pause; we detect that via
+       ``aget_state(...).next`` being non-empty and call
+       :func:`langgraph.types.interrupt` at the *outer* level to fetch
+       the verdict, then forward it to the inner via
+       ``Command(resume=verdict)``. This is the only path that
+       actually re-runs the gated tool with the verdict — calling
+       ``inner.ainvoke({"messages": [...]})`` on a paused thread
+       silently skips the tool.
+
+    A while loop covers the case where a single agent turn pauses
+    multiple times (e.g. two high-risk tool calls in sequence).
+
+    When ``inner_has_checkpointer`` is False (legacy callers and most
+    unit tests that build agents without a checkpointer), the helper
+    falls back to the simple "invoke + re-raise on ``__interrupt__``"
+    contract. That path can pause the outer graph but the resume
+    verdict cannot reach the inner tool — which is acceptable because
+    legacy callers without a checkpointer never had a working HITL
+    resume path to begin with.
+    """
+    # Local imports keep callers that never touch HITL out of the
+    # langgraph.types import cost path. ``Command`` and ``interrupt``
+    # are tiny but the explicit lazy import documents the dependency.
+    from langgraph.types import Command, interrupt
+
+    # Resume-detection branch: only meaningful when the inner agent
+    # has its own checkpointer (otherwise ``aget_state`` on a fresh
+    # in-memory pregel raises). The detection asks: "does the inner
+    # have a paused step waiting?". A non-empty ``next`` means yes.
+    if inner_has_checkpointer:
+        try:
+            inner_state = await agent_executor.aget_state(inner_cfg)
+            inner_paused = bool(getattr(inner_state, "next", ()) or ())
+        except Exception:  # noqa: BLE001 — defensive; missing checkpoint == fresh
+            inner_paused = False
+    else:
+        inner_paused = False
+
+    if inner_paused:
+        # Outer is being resumed; pull the verdict via the outer's own
+        # interrupt() and forward it. The payload here is informational —
+        # the only thing the outer cares about is the resume value.
+        verdict = interrupt({"resume_inner_agent": inner_cfg["configurable"]["thread_id"]})
+        result = await _ainvoke_with_retry(
+            agent_executor, Command(resume=verdict), config=inner_cfg,
+        )
+    else:
+        result = await _ainvoke_with_retry(
+            agent_executor, initial_input, config=inner_cfg,
+        )
+
+    # Multi-pause loop: if the inner pauses again (or resumes into a
+    # second gated tool call), surface each pause to the outer the
+    # same way until the inner is fully done.
+    while isinstance(result, dict) and result.get("__interrupt__"):
+        if not inner_has_checkpointer:
+            # No checkpointer => no Command(resume=...) path is
+            # available; surface the pause to the outer Pregel and
+            # stop. Resume cannot deliver the verdict to the gated
+            # tool, but that matches legacy behavior.
+            raise GraphInterrupt(result["__interrupt__"])
+        verdict = interrupt(result["__interrupt__"])
+        result = await _ainvoke_with_retry(
+            agent_executor, Command(resume=verdict), config=inner_cfg,
+        )
+
+    return result
+
+
 async def _ainvoke_with_retry(executor, input_, *, max_attempts: int = 3,
-                              base_delay: float = 1.5):
+                              base_delay: float = 1.5,
+                              config: dict | None = None):
     """Wrap a LangGraph agent invocation with retry on transient cloud errors.
 
     Retries on common Ollama Cloud / streaming hiccups (500, status -1, etc.).
@@ -9426,6 +9694,14 @@ async def _ainvoke_with_retry(executor, input_, *, max_attempts: int = 3,
             # itself (AutoStrategy → ToolStrategy fallback for non-
             # function-calling Ollama models). The default langgraph
             # recursion bound is now a true upper bound, not a workaround.
+            #
+            # ``config`` (default ``None``) carries the per-thread
+            # ``configurable.thread_id`` for callers that need a
+            # checkpointer-scoped invocation (HITL resume path —
+            # see :func:`_drive_agent_with_resume`). Legacy callers
+            # pass nothing and the executor's default thread is used.
+            if config is not None:
+                return await executor.ainvoke(input_, config=config)
             return await executor.ainvoke(input_)
         except GraphInterrupt:
             # Phase 11 (FOC-04 / D-11-04): never retry a HITL pause.
@@ -9758,6 +10034,7 @@ def make_agent_node(
     injected_args: dict[str, str] | None = None,
     gate_policy: "GatePolicy | None" = None,
     event_log: "EventLog | None" = None,
+    checkpointer: Any = None,
 ) -> Callable[[GraphState], Awaitable[dict]]:
     """Factory: build a LangGraph node that runs a ReAct agent and decides a route.
 
@@ -9789,10 +10066,23 @@ def make_agent_node(
     """
 
     async def node(state: GraphState) -> dict:
-        incident = state["session"]  # pyright: ignore[reportTypedDictNotRequiredAccess] — orchestrator runtime always supplies session
-        inc_id = incident.id
+        state_session = state["session"]  # pyright: ignore[reportTypedDictNotRequiredAccess] — orchestrator runtime always supplies session
+        inc_id = state_session.id
         started_at = datetime.now(timezone.utc).strftime(_UTC_TS_FMT)
 
+        # Always reload from the persistent store at entry. Outer Pregel
+        # checkpoints at step boundaries, not mid-node — so on a HITL
+        # resume, ``state["session"]`` reflects the prior step's output
+        # (e.g. deep_investigator's), not the gateway's pending_approval
+        # row + version bump that happened mid-step in the original run.
+        # If we trust ``state["session"]`` here, the gateway sees no
+        # pending row, appends a duplicate, then ``store.save`` raises
+        # ``StaleVersionError`` because DB has already moved on.
+        try:
+            incident = store.load(inc_id)
+        except FileNotFoundError:
+            incident = state_session
+
         # M3 (per-step telemetry): emit agent_started.
         if event_log is not None:
             try:
@@ -9899,11 +10189,28 @@ def _run(**kwargs: Any) -> Any:
         # END interaction, recursion_limit ceilings). Markdown is the
         # native format every chat model writes well; the parse step
         # happens in the framework, where leniency is in our control.
+        # The inner agent gets a checkpointer so a HITL pause inside
+        # ``interrupt()`` can be resumed via ``Command(resume=verdict)``
+        # on the SAME inner thread. langgraph 1.x semantics require the
+        # checkpointer here — without it ``Command(resume=...)`` raises
+        # ``RuntimeError: Cannot use Command(resume=...) without
+        # checkpointer``. The thread id is derived deterministically
+        # from session + agent + the upcoming agent_run index so it is:
+        #   * STABLE across the inner pause and the outer resume that
+        #     follows (both observe the same ``len(incident.agents_run)``
+        #     because no new run is recorded mid-pause), and
+        #   * UNIQUE per agent invocation so previous invocations of the
+        #     same agent within the same session don't bleed in.
         agent_executor = create_agent(
             model=llm,
             tools=run_tools,
             system_prompt=skill.system_prompt,
+            checkpointer=checkpointer,
         )
+        inner_thread_id = (
+            f"{inc_id}:agent:{skill.name}:turn{len(incident.agents_run)}"
+        )
+        inner_cfg = {"configurable": {"thread_id": inner_thread_id}}
 
         # Phase 11 (FOC-04): reset per-turn confidence hint. The hint
         # is updated below after _harvest_tool_calls_and_patches; on
@@ -9915,9 +10222,15 @@ def _run(**kwargs: Any) -> Any:
             pass
 
         try:
-            result = await _ainvoke_with_retry(
-                agent_executor,
-                {"messages": [HumanMessage(content=_format_agent_input(incident))]},
+            result = await _drive_agent_with_resume(
+                agent_executor=agent_executor,
+                inner_cfg=inner_cfg,
+                inner_has_checkpointer=checkpointer is not None,
+                initial_input={
+                    "messages": [
+                        HumanMessage(content=_format_agent_input(incident))
+                    ]
+                },
             )
         except GraphInterrupt:
             # Phase 11 (FOC-04 / D-11-04): HITL pause is NOT an error.
@@ -10269,7 +10582,8 @@ async def gate(state: GraphState) -> dict:
 
 def _build_agent_nodes(*, cfg: AppConfig, skills: dict, store: SessionStore,
                        registry: ToolRegistry,
-                       event_log: "EventLog | None" = None) -> dict:
+                       event_log: "EventLog | None" = None,
+                       checkpointer: Any = None) -> dict:
     """Materialize agent nodes from skills + registry. Reused by main + resume graphs.
 
     Dispatches on ``skill.kind``:
@@ -10347,6 +10661,7 @@ def _build_agent_nodes(*, cfg: AppConfig, skills: dict, store: SessionStore,
             injected_args=cfg.orchestrator.injected_args,
             gate_policy=gate_policy,
             event_log=event_log,
+            checkpointer=checkpointer,
         )
     return nodes
 
@@ -10452,7 +10767,8 @@ async def build_graph(*, cfg: AppConfig, skills: dict, store: SessionStore,
 
     sg = StateGraph(GraphState)
     nodes = _build_agent_nodes(cfg=cfg, skills=skills, store=store,
-                                registry=registry, event_log=event_log)
+                                registry=registry, event_log=event_log,
+                                checkpointer=checkpointer)
     for agent_name, node in nodes.items():
         sg.add_node(agent_name, node)
     sg.add_node("gate", make_gate_node(
@@ -14226,6 +14542,29 @@ async def _finalize_session_status_async(
         async with self._locks.acquire(session_id):
             return self._finalize_session_status(session_id)
 
+    async def _is_graph_paused(self, session_id: str) -> bool:
+        """Return True iff the compiled graph has a pending step waiting
+        to resume (i.e. it's paused at an ``interrupt()`` boundary).
+
+        langgraph 1.x surfaces a HITL pause via the result dict's
+        ``__interrupt__`` field rather than raising
+        :class:`GraphInterrupt`. After ``astream_events`` (or
+        ``ainvoke``) returns, the only way to tell "the run paused"
+        from "the run completed" is to query the checkpointed graph
+        state — a non-empty ``next`` tuple means the graph has steps
+        queued to run when resumed. The stream/retry call sites use
+        this to skip ``_finalize_session_status_async`` on a pause —
+        finalizing on a pause would stamp ``default_terminal_status``
+        on a session that's actually waiting on operator approval, and
+        the orphaned ``pending_approval`` ToolCall row written by the
+        gateway would never get its resume.
+        """
+        try:
+            state = await self.graph.aget_state(self._thread_config(session_id))
+        except Exception:  # noqa: BLE001 — defensive; missing checkpoint == not paused
+            return False
+        return bool(getattr(state, "next", ()) or ())
+
     def _thread_config(self, incident_id: str) -> dict:
         """Build the LangGraph ``config`` dict for a per-session thread.
 
@@ -14435,10 +14774,18 @@ async def stream_session(self, *, query: str, environment: str,
             config=self._thread_config(inc.id),
         ):
             yield self._to_ui_event(ev, inc.id)
-        new_status = await self._finalize_session_status_async(inc.id)
-        if new_status:
-            yield {"event": "status_auto_finalized", "incident_id": inc.id,
-                   "status": new_status, "ts": _event_ts()}
+        # Skip finalize when the graph paused on an interrupt — the
+        # session is waiting for operator approval, not done. Stamping
+        # default_terminal_status here would orphan the pending_approval
+        # ToolCall row written by the gateway just before the pause.
+        if await self._is_graph_paused(inc.id):
+            yield {"event": "session_paused", "incident_id": inc.id,
+                   "ts": _event_ts()}
+        else:
+            new_status = await self._finalize_session_status_async(inc.id)
+            if new_status:
+                yield {"event": "status_auto_finalized", "incident_id": inc.id,
+                       "status": new_status, "ts": _event_ts()}
         yield {"event": "investigation_completed", "incident_id": inc.id, "ts": _event_ts()}
 
     async def stream_investigation(self, *, query: str, environment: str,
@@ -14711,10 +15058,17 @@ async def _retry_session_locked(self, session_id: str) -> AsyncIterator[dict]:
             config=self._thread_config(session_id),
         ):
             yield self._to_ui_event(ev, session_id)
-        new_status = await self._finalize_session_status_async(session_id)
-        if new_status:
-            yield {"event": "status_auto_finalized", "incident_id": session_id,
-                   "status": new_status, "ts": _event_ts()}
+        # See ``stream_session`` for why pause-detection guards the
+        # finalize call: a HITL pause must not be coerced into a
+        # terminal status.
+        if await self._is_graph_paused(session_id):
+            yield {"event": "session_paused", "incident_id": session_id,
+                   "ts": _event_ts()}
+        else:
+            new_status = await self._finalize_session_status_async(session_id)
+            if new_status:
+                yield {"event": "status_auto_finalized", "incident_id": session_id,
+                       "status": new_status, "ts": _event_ts()}
         yield {"event": "retry_completed", "incident_id": session_id,
                "ts": _event_ts()}
 
@@ -15407,6 +15761,16 @@ async def _resume() -> None:
                     Command(resume=decision_payload),
                     config=orch._thread_config(session_id),
                 )
+                # Finalize after the verdict-driven run completes so
+                # the session row reflects the terminal status (or
+                # falls through to ``default_terminal_status``). Skip
+                # if a fresh interrupt re-paused the graph — the
+                # session is again awaiting operator input. Mirrors
+                # the same guard in the UI's
+                # ``_submit_approval_via_service`` and the
+                # ``stream_session`` finalize call site.
+                if not await orch._is_graph_paused(session_id):
+                    await orch._finalize_session_status_async(session_id)
 
         # Submit the resume onto the long-lived service loop so we
         # don't fight the lifespan thread for the same FastMCP/SQLite
diff --git a/dist/apps/incident-management.py b/dist/apps/incident-management.py
index d2c257d..e2ef5f9 100644
--- a/dist/apps/incident-management.py
+++ b/dist/apps/incident-management.py
@@ -713,7 +713,7 @@ class IncidentState(Session):
 """
 
 
-from typing import TYPE_CHECKING, Callable
+from typing import TYPE_CHECKING, Any, Callable
 
 from langchain_core.messages import HumanMessage
 from langchain.agents import create_agent
@@ -994,7 +994,6 @@ class IncidentState(Session):
 """
 
 
-from typing import TYPE_CHECKING, Any, Callable
 
 from fastapi import APIRouter, Depends, Header, HTTPException, Request, status
 from pydantic import ValidationError
@@ -6651,7 +6650,7 @@ def __init__(self, *, agent: str, field: str, message: str | None = None):
     # Leftmost float (allows int form), optional rationale after em-dash /
     # ASCII dash / hyphen separator. ``re.DOTALL`` so a multi-line rationale
     # is captured wholesale.
-    r"^\s*(-?[0-9]*\.?[0-9]+)\s*(?:[\u2014\-]+\s*(.*))?$",
+    r"^\s*(-?[0-9]*\.?[0-9]+)\s*(?:[\-\u2010\u2011\u2012\u2013\u2014\u2015]+\s*(.*))?$",
     re.DOTALL,
 )
 
@@ -6842,9 +6841,138 @@ def parse_envelope_from_result(
             continue
         break
 
-    # Path 5 (terminal): fail loudly. None of the four paths produced a
+    # Path 5 (D-22-fallback): some models (gpt-oss:20b in particular)
+    # treat a terminal-tool call as the completion and emit an empty
+    # closing AIMessage. Apps configure typed-terminal tools with
+    # ``confidence`` + ``confidence_rationale`` args by contract —
+    # synthesise an envelope from those args when the markdown path
+    # yielded nothing. Skip if any earlier path succeeded.
+    for msg in reversed(messages):
+        tcs = getattr(msg, "tool_calls", None)
+        if not tcs:
+            continue
+        for tc in tcs:
+            args = tc.get("args") or {}
+            if not isinstance(args, dict):
+                continue
+            conf = args.get("confidence")
+            rat = args.get("confidence_rationale") or args.get("rationale")
+            if conf is None or not rat:
+                continue
+            try:
+                conf_f = float(conf)
+            except (TypeError, ValueError):
+                continue
+            content_body = (
+                args.get("resolution_summary")
+                or args.get("reason")
+                or args.get("summary")
+                or f"terminal tool {tc.get('name', '<unknown>')} invoked"
+            )
+            if not isinstance(content_body, str) or not content_body.strip():
+                content_body = f"terminal tool {tc.get('name', '<unknown>')} invoked"
+            try:
+                env = AgentTurnOutput(
+                    content=content_body,
+                    confidence=max(0.0, min(1.0, conf_f)),
+                    confidence_rationale=str(rat),
+                    signal=args.get("signal"),
+                )
+            except ValidationError:
+                continue
+            _LOG.info(
+                "envelope_synthesized_from_tool_args agent=%s tool=%s "
+                "confidence=%.2f",
+                agent, tc.get("name"), env.confidence,
+            )
+            return env
+
+    # Path 6 (D-22-fallback-permissive): the model called at least one
+    # tool but never emitted the markdown contract or a typed-terminal
+    # tool with confidence args. Rather than hard-failing the run,
+    # synthesise a minimal envelope so the session reaches a terminal
+    # status (typically default_terminal_status -> needs_review) and
+    # the operator can review what happened. Confidence is intentionally
+    # low (0.30) so any HITL gate fires.
+    invoked_tool_names: list[str] = []
+    for msg in messages:
+        for tc in (getattr(msg, "tool_calls", None) or []):
+            n = tc.get("name")
+            if n:
+                invoked_tool_names.append(n)
+    if invoked_tool_names:
+        body = (
+            f"Agent {agent} invoked "
+            f"{', '.join(invoked_tool_names)} but did not emit a final "
+            f"summary. Review the tool-call audit for outcome."
+        )
+        try:
+            env = AgentTurnOutput(
+                content=body,
+                confidence=0.30,
+                confidence_rationale=(
+                    "synthesized: agent invoked tools but emitted no "
+                    "closing message"
+                ),
+                signal=None,
+            )
+            _LOG.warning(
+                "envelope_synthesized_permissive agent=%s tools=%s",
+                agent, invoked_tool_names,
+            )
+            return env
+        except ValidationError:
+            pass
+
+    # Path 7 (terminal): fail loudly. None of the six paths produced a
     # valid envelope — this is a real prompt-drift signal worth
-    # surfacing as a structured agent_run error.
+    # surfacing as a structured agent_run error. Log the last
+    # AIMessage content (capped) so operators can diagnose whether
+    # the LLM emitted the wrong shape vs nothing at all.
+    last_ai_content = ""
+    for msg in reversed(messages):
+        if msg.__class__.__name__ == "AIMessage":
+            c = getattr(msg, "content", None)
+            if isinstance(c, str):
+                last_ai_content = c
+                break
+    # Also dump every tool_call across all AIMessages so we can see
+    # whether the model called any tool with confidence args (Path 5
+    # would have fired) or whether it called a non-terminal tool only.
+    tool_call_summary = []
+    for msg in messages:
+        for tc in (getattr(msg, "tool_calls", None) or []):
+            args = tc.get("args") or {}
+            tool_call_summary.append({
+                "name": tc.get("name"),
+                "arg_keys": sorted(args.keys()) if isinstance(args, dict) else "?",
+                "has_confidence": isinstance(args, dict) and "confidence" in args,
+                "has_rationale": isinstance(args, dict) and (
+                    "confidence_rationale" in args or "rationale" in args
+                ),
+            })
+    # Detailed per-message dump so we can see exactly what shape the
+    # conversation took when the parse failed.
+    msg_summary = []
+    for m in messages:
+        kind = m.__class__.__name__
+        c = getattr(m, "content", None)
+        c_len = len(c) if isinstance(c, str) else (
+            f"list[{len(c)}]" if isinstance(c, list) else type(c).__name__
+        )
+        tcs = [tc.get("name") for tc in (getattr(m, "tool_calls", None) or [])]
+        msg_summary.append({"kind": kind, "content_len": c_len, "tool_calls": tcs})
+    _LOG.warning(
+        "envelope_missing: agent=%s last_ai_message_content=%r "
+        "structured_response_type=%s message_count=%d tool_calls=%r "
+        "msg_trace=%r",
+        agent,
+        last_ai_content[:1500] if last_ai_content else "<empty>",
+        type(result.get("structured_response")).__name__,
+        len(messages),
+        tool_call_summary,
+        msg_summary,
+    )
     raise EnvelopeMissingError(
         agent=agent,
         field="structured_response",
@@ -7364,6 +7492,11 @@ def _run(self, *args: Any, **kwargs: Any) -> Any:  # noqa: D401
                             approved_at=_now_iso(),
                             approval_rationale=rationale,
                         )
+                        # Persist the status transition. Without this,
+                        # the DB row stays at ``pending_approval`` and
+                        # the UI keeps offering the buttons forever.
+                        if store is not None:
+                            store.save(session)
                     rejected_result = {"rejected": True, "rationale": rationale}
                     _emit_invoked(
                         status="rejected", risk="high",
@@ -7390,6 +7523,8 @@ def _run(self, *args: Any, **kwargs: Any) -> Any:  # noqa: D401
                             approved_at=_now_iso(),
                             approval_rationale=rationale,
                         )
+                        if store is not None:
+                            store.save(session)
                     timeout_result = {"timeout": True, "rationale": rationale}
                     _emit_invoked(
                         status="timeout", risk="high",
@@ -7412,6 +7547,8 @@ def _run(self, *args: Any, **kwargs: Any) -> Any:  # noqa: D401
                         approved_at=_now_iso(),
                         approval_rationale=rationale,
                     )
+                    if store is not None:
+                        store.save(session)
                 _emit_invoked(
                     status="approved", risk="high",
                     args_dict=pending_args, result=result,
@@ -7542,6 +7679,11 @@ async def _arun(self, *args: Any, **kwargs: Any) -> Any:  # noqa: D401
                             approved_at=_now_iso(),
                             approval_rationale=rationale,
                         )
+                        # Persist the status transition (mirror of the
+                        # sync path) so the DB row reflects the actual
+                        # outcome instead of staying at pending_approval.
+                        if store is not None:
+                            store.save(session)
                     rejected_result = {"rejected": True, "rationale": rationale}
                     _emit_invoked(
                         status="rejected", risk="high",
@@ -7563,6 +7705,8 @@ async def _arun(self, *args: Any, **kwargs: Any) -> Any:  # noqa: D401
                             approved_at=_now_iso(),
                             approval_rationale=rationale,
                         )
+                        if store is not None:
+                            store.save(session)
                     timeout_result = {"timeout": True, "rationale": rationale}
                     _emit_invoked(
                         status="timeout", risk="high",
@@ -7584,6 +7728,8 @@ async def _arun(self, *args: Any, **kwargs: Any) -> Any:  # noqa: D401
                         approved_at=_now_iso(),
                         approval_rationale=rationale,
                     )
+                    if store is not None:
+                        store.save(session)
                 _emit_invoked(
                     status="approved", risk="high",
                     args_dict=pending_args, result=result,
@@ -8373,6 +8519,7 @@ def make_agent_node(
     patch_tool_names: frozenset[str] = frozenset(),
     gate_policy: "GatePolicy | None" = None,
     event_log: "EventLog | None" = None,
+    checkpointer: Any = None,
 ):
     """Factory: build a LangGraph node that runs a ReAct agent and decides a route.
 
@@ -8395,9 +8542,18 @@ def make_agent_node(
 
 
     async def node(state: GraphState) -> dict:
-        incident: Session = state["session"]  # pyright: ignore[reportTypedDictNotRequiredAccess]
-        inc_id = incident.id
+        state_session: Session = state["session"]  # pyright: ignore[reportTypedDictNotRequiredAccess]
+        inc_id = state_session.id
         started_at = datetime.now(timezone.utc).strftime(_UTC_TS_FMT)
+        # Always reload from store at entry — outer Pregel checkpoints
+        # at step boundaries, not mid-node, so ``state["session"]`` is
+        # stale relative to DB when a HITL gate paused mid-step. See
+        # the same reload comment in ``runtime.graph.make_agent_node``
+        # for the full rationale.
+        try:
+            incident: Session = store.load(inc_id)
+        except FileNotFoundError:
+            incident = state_session
 
         # M3: emit agent_started telemetry before any work happens.
         if event_log is not None:
@@ -8427,11 +8583,21 @@ async def node(state: GraphState) -> dict:
         # change as graph.py:make_agent_node — drop response_format
         # so the loop terminates on natural React END, then parse the
         # final AIMessage via Path 4 in parse_envelope_from_result.
+        # The inner agent gets the orchestrator's checkpointer so a
+        # HITL pause inside ``interrupt()`` can be resumed via
+        # ``Command(resume=verdict)`` on a stable per-invocation
+        # thread id (see ``_drive_agent_with_resume`` in
+        # ``runtime.graph`` for the full rationale).
         agent_executor = create_agent(
             model=llm,
             tools=run_tools,
             system_prompt=skill.system_prompt,
+            checkpointer=checkpointer,
         )
+        inner_thread_id = (
+            f"{inc_id}:agent:{skill.name}:turn{len(incident.agents_run)}"
+        )
+        inner_cfg = {"configurable": {"thread_id": inner_thread_id}}
 
         # Phase 11 (FOC-04): reset per-turn confidence hint at the
         # start of each agent step so the gateway treats the first
@@ -8442,9 +8608,15 @@ async def node(state: GraphState) -> dict:
             pass
 
         try:
-            result = await _ainvoke_with_retry(
-                agent_executor,
-                {"messages": [HumanMessage(content=_format_agent_input(incident))]},
+            result = await _drive_agent_with_resume(
+                agent_executor=agent_executor,
+                inner_cfg=inner_cfg,
+                inner_has_checkpointer=checkpointer is not None,
+                initial_input={
+                    "messages": [
+                        HumanMessage(content=_format_agent_input(incident))
+                    ]
+                },
             )
         except GraphInterrupt:
             # Phase 11 (FOC-04 / D-11-04): HITL pause -- propagate up.
@@ -9419,8 +9591,104 @@ def finish(self, *, summary: str) -> None:
 )
 
 
+async def _drive_agent_with_resume(
+    *,
+    agent_executor,
+    inner_cfg: dict,
+    inner_has_checkpointer: bool,
+    initial_input: dict,
+):
+    """Run an inner ``create_agent`` executor, surfacing HITL pauses to
+    the outer Pregel graph and forwarding resume verdicts back to the
+    inner agent.
+
+    langgraph 1.x changed the ``interrupt()`` contract: a tool that
+    calls :func:`langgraph.types.interrupt` no longer raises
+    :class:`GraphInterrupt` to the caller. Instead, the agent's
+    ``ainvoke`` returns normally with the pause captured in
+    ``result["__interrupt__"]``. To make HITL approval actually work
+    end-to-end the framework must:
+
+    1. **First call** — invoke the agent with the initial messages.
+       If the result carries ``__interrupt__``, raise
+       :class:`GraphInterrupt` so the *outer* Pregel pauses too. The
+       outer pause is what surfaces ``__interrupt__`` to the
+       orchestrator and lets the UI render Approve / Reject.
+
+    2. **Resume call** — when the outer is resumed via
+       ``Command(resume=verdict)``, this helper is re-entered. The
+       inner's checkpointer remembers the pause; we detect that via
+       ``aget_state(...).next`` being non-empty and call
+       :func:`langgraph.types.interrupt` at the *outer* level to fetch
+       the verdict, then forward it to the inner via
+       ``Command(resume=verdict)``. This is the only path that
+       actually re-runs the gated tool with the verdict — calling
+       ``inner.ainvoke({"messages": [...]})`` on a paused thread
+       silently skips the tool.
+
+    A while loop covers the case where a single agent turn pauses
+    multiple times (e.g. two high-risk tool calls in sequence).
+
+    When ``inner_has_checkpointer`` is False (legacy callers and most
+    unit tests that build agents without a checkpointer), the helper
+    falls back to the simple "invoke + re-raise on ``__interrupt__``"
+    contract. That path can pause the outer graph but the resume
+    verdict cannot reach the inner tool — which is acceptable because
+    legacy callers without a checkpointer never had a working HITL
+    resume path to begin with.
+    """
+    # Local imports keep callers that never touch HITL out of the
+    # langgraph.types import cost path. ``Command`` and ``interrupt``
+    # are tiny but the explicit lazy import documents the dependency.
+    from langgraph.types import Command, interrupt
+
+    # Resume-detection branch: only meaningful when the inner agent
+    # has its own checkpointer (otherwise ``aget_state`` on a fresh
+    # in-memory pregel raises). The detection asks: "does the inner
+    # have a paused step waiting?". A non-empty ``next`` means yes.
+    if inner_has_checkpointer:
+        try:
+            inner_state = await agent_executor.aget_state(inner_cfg)
+            inner_paused = bool(getattr(inner_state, "next", ()) or ())
+        except Exception:  # noqa: BLE001 — defensive; missing checkpoint == fresh
+            inner_paused = False
+    else:
+        inner_paused = False
+
+    if inner_paused:
+        # Outer is being resumed; pull the verdict via the outer's own
+        # interrupt() and forward it. The payload here is informational —
+        # the only thing the outer cares about is the resume value.
+        verdict = interrupt({"resume_inner_agent": inner_cfg["configurable"]["thread_id"]})
+        result = await _ainvoke_with_retry(
+            agent_executor, Command(resume=verdict), config=inner_cfg,
+        )
+    else:
+        result = await _ainvoke_with_retry(
+            agent_executor, initial_input, config=inner_cfg,
+        )
+
+    # Multi-pause loop: if the inner pauses again (or resumes into a
+    # second gated tool call), surface each pause to the outer the
+    # same way until the inner is fully done.
+    while isinstance(result, dict) and result.get("__interrupt__"):
+        if not inner_has_checkpointer:
+            # No checkpointer => no Command(resume=...) path is
+            # available; surface the pause to the outer Pregel and
+            # stop. Resume cannot deliver the verdict to the gated
+            # tool, but that matches legacy behavior.
+            raise GraphInterrupt(result["__interrupt__"])
+        verdict = interrupt(result["__interrupt__"])
+        result = await _ainvoke_with_retry(
+            agent_executor, Command(resume=verdict), config=inner_cfg,
+        )
+
+    return result
+
+
 async def _ainvoke_with_retry(executor, input_, *, max_attempts: int = 3,
-                              base_delay: float = 1.5):
+                              base_delay: float = 1.5,
+                              config: dict | None = None):
     """Wrap a LangGraph agent invocation with retry on transient cloud errors.
 
     Retries on common Ollama Cloud / streaming hiccups (500, status -1, etc.).
@@ -9438,6 +9706,14 @@ async def _ainvoke_with_retry(executor, input_, *, max_attempts: int = 3,
             # itself (AutoStrategy → ToolStrategy fallback for non-
             # function-calling Ollama models). The default langgraph
             # recursion bound is now a true upper bound, not a workaround.
+            #
+            # ``config`` (default ``None``) carries the per-thread
+            # ``configurable.thread_id`` for callers that need a
+            # checkpointer-scoped invocation (HITL resume path —
+            # see :func:`_drive_agent_with_resume`). Legacy callers
+            # pass nothing and the executor's default thread is used.
+            if config is not None:
+                return await executor.ainvoke(input_, config=config)
             return await executor.ainvoke(input_)
         except GraphInterrupt:
             # Phase 11 (FOC-04 / D-11-04): never retry a HITL pause.
@@ -9770,6 +10046,7 @@ def make_agent_node(
     injected_args: dict[str, str] | None = None,
     gate_policy: "GatePolicy | None" = None,
     event_log: "EventLog | None" = None,
+    checkpointer: Any = None,
 ) -> Callable[[GraphState], Awaitable[dict]]:
     """Factory: build a LangGraph node that runs a ReAct agent and decides a route.
 
@@ -9801,10 +10078,23 @@ def make_agent_node(
     """
 
     async def node(state: GraphState) -> dict:
-        incident = state["session"]  # pyright: ignore[reportTypedDictNotRequiredAccess] — orchestrator runtime always supplies session
-        inc_id = incident.id
+        state_session = state["session"]  # pyright: ignore[reportTypedDictNotRequiredAccess] — orchestrator runtime always supplies session
+        inc_id = state_session.id
         started_at = datetime.now(timezone.utc).strftime(_UTC_TS_FMT)
 
+        # Always reload from the persistent store at entry. Outer Pregel
+        # checkpoints at step boundaries, not mid-node — so on a HITL
+        # resume, ``state["session"]`` reflects the prior step's output
+        # (e.g. deep_investigator's), not the gateway's pending_approval
+        # row + version bump that happened mid-step in the original run.
+        # If we trust ``state["session"]`` here, the gateway sees no
+        # pending row, appends a duplicate, then ``store.save`` raises
+        # ``StaleVersionError`` because DB has already moved on.
+        try:
+            incident = store.load(inc_id)
+        except FileNotFoundError:
+            incident = state_session
+
         # M3 (per-step telemetry): emit agent_started.
         if event_log is not None:
             try:
@@ -9911,11 +10201,28 @@ def _run(**kwargs: Any) -> Any:
         # END interaction, recursion_limit ceilings). Markdown is the
         # native format every chat model writes well; the parse step
         # happens in the framework, where leniency is in our control.
+        # The inner agent gets a checkpointer so a HITL pause inside
+        # ``interrupt()`` can be resumed via ``Command(resume=verdict)``
+        # on the SAME inner thread. langgraph 1.x semantics require the
+        # checkpointer here — without it ``Command(resume=...)`` raises
+        # ``RuntimeError: Cannot use Command(resume=...) without
+        # checkpointer``. The thread id is derived deterministically
+        # from session + agent + the upcoming agent_run index so it is:
+        #   * STABLE across the inner pause and the outer resume that
+        #     follows (both observe the same ``len(incident.agents_run)``
+        #     because no new run is recorded mid-pause), and
+        #   * UNIQUE per agent invocation so previous invocations of the
+        #     same agent within the same session don't bleed in.
         agent_executor = create_agent(
             model=llm,
             tools=run_tools,
             system_prompt=skill.system_prompt,
+            checkpointer=checkpointer,
         )
+        inner_thread_id = (
+            f"{inc_id}:agent:{skill.name}:turn{len(incident.agents_run)}"
+        )
+        inner_cfg = {"configurable": {"thread_id": inner_thread_id}}
 
         # Phase 11 (FOC-04): reset per-turn confidence hint. The hint
         # is updated below after _harvest_tool_calls_and_patches; on
@@ -9927,9 +10234,15 @@ def _run(**kwargs: Any) -> Any:
             pass
 
         try:
-            result = await _ainvoke_with_retry(
-                agent_executor,
-                {"messages": [HumanMessage(content=_format_agent_input(incident))]},
+            result = await _drive_agent_with_resume(
+                agent_executor=agent_executor,
+                inner_cfg=inner_cfg,
+                inner_has_checkpointer=checkpointer is not None,
+                initial_input={
+                    "messages": [
+                        HumanMessage(content=_format_agent_input(incident))
+                    ]
+                },
             )
         except GraphInterrupt:
             # Phase 11 (FOC-04 / D-11-04): HITL pause is NOT an error.
@@ -10281,7 +10594,8 @@ async def gate(state: GraphState) -> dict:
 
 def _build_agent_nodes(*, cfg: AppConfig, skills: dict, store: SessionStore,
                        registry: ToolRegistry,
-                       event_log: "EventLog | None" = None) -> dict:
+                       event_log: "EventLog | None" = None,
+                       checkpointer: Any = None) -> dict:
     """Materialize agent nodes from skills + registry. Reused by main + resume graphs.
 
     Dispatches on ``skill.kind``:
@@ -10359,6 +10673,7 @@ def _build_agent_nodes(*, cfg: AppConfig, skills: dict, store: SessionStore,
             injected_args=cfg.orchestrator.injected_args,
             gate_policy=gate_policy,
             event_log=event_log,
+            checkpointer=checkpointer,
         )
     return nodes
 
@@ -10464,7 +10779,8 @@ async def build_graph(*, cfg: AppConfig, skills: dict, store: SessionStore,
 
     sg = StateGraph(GraphState)
     nodes = _build_agent_nodes(cfg=cfg, skills=skills, store=store,
-                                registry=registry, event_log=event_log)
+                                registry=registry, event_log=event_log,
+                                checkpointer=checkpointer)
     for agent_name, node in nodes.items():
         sg.add_node(agent_name, node)
     sg.add_node("gate", make_gate_node(
@@ -14238,6 +14554,29 @@ async def _finalize_session_status_async(
         async with self._locks.acquire(session_id):
             return self._finalize_session_status(session_id)
 
+    async def _is_graph_paused(self, session_id: str) -> bool:
+        """Return True iff the compiled graph has a pending step waiting
+        to resume (i.e. it's paused at an ``interrupt()`` boundary).
+
+        langgraph 1.x surfaces a HITL pause via the result dict's
+        ``__interrupt__`` field rather than raising
+        :class:`GraphInterrupt`. After ``astream_events`` (or
+        ``ainvoke``) returns, the only way to tell "the run paused"
+        from "the run completed" is to query the checkpointed graph
+        state — a non-empty ``next`` tuple means the graph has steps
+        queued to run when resumed. The stream/retry call sites use
+        this to skip ``_finalize_session_status_async`` on a pause —
+        finalizing on a pause would stamp ``default_terminal_status``
+        on a session that's actually waiting on operator approval, and
+        the orphaned ``pending_approval`` ToolCall row written by the
+        gateway would never get its resume.
+        """
+        try:
+            state = await self.graph.aget_state(self._thread_config(session_id))
+        except Exception:  # noqa: BLE001 — defensive; missing checkpoint == not paused
+            return False
+        return bool(getattr(state, "next", ()) or ())
+
     def _thread_config(self, incident_id: str) -> dict:
         """Build the LangGraph ``config`` dict for a per-session thread.
 
@@ -14447,10 +14786,18 @@ async def stream_session(self, *, query: str, environment: str,
             config=self._thread_config(inc.id),
         ):
             yield self._to_ui_event(ev, inc.id)
-        new_status = await self._finalize_session_status_async(inc.id)
-        if new_status:
-            yield {"event": "status_auto_finalized", "incident_id": inc.id,
-                   "status": new_status, "ts": _event_ts()}
+        # Skip finalize when the graph paused on an interrupt — the
+        # session is waiting for operator approval, not done. Stamping
+        # default_terminal_status here would orphan the pending_approval
+        # ToolCall row written by the gateway just before the pause.
+        if await self._is_graph_paused(inc.id):
+            yield {"event": "session_paused", "incident_id": inc.id,
+                   "ts": _event_ts()}
+        else:
+            new_status = await self._finalize_session_status_async(inc.id)
+            if new_status:
+                yield {"event": "status_auto_finalized", "incident_id": inc.id,
+                       "status": new_status, "ts": _event_ts()}
         yield {"event": "investigation_completed", "incident_id": inc.id, "ts": _event_ts()}
 
     async def stream_investigation(self, *, query: str, environment: str,
@@ -14723,10 +15070,17 @@ async def _retry_session_locked(self, session_id: str) -> AsyncIterator[dict]:
             config=self._thread_config(session_id),
         ):
             yield self._to_ui_event(ev, session_id)
-        new_status = await self._finalize_session_status_async(session_id)
-        if new_status:
-            yield {"event": "status_auto_finalized", "incident_id": session_id,
-                   "status": new_status, "ts": _event_ts()}
+        # See ``stream_session`` for why pause-detection guards the
+        # finalize call: a HITL pause must not be coerced into a
+        # terminal status.
+        if await self._is_graph_paused(session_id):
+            yield {"event": "session_paused", "incident_id": session_id,
+                   "ts": _event_ts()}
+        else:
+            new_status = await self._finalize_session_status_async(session_id)
+            if new_status:
+                yield {"event": "status_auto_finalized", "incident_id": session_id,
+                       "status": new_status, "ts": _event_ts()}
         yield {"event": "retry_completed", "incident_id": session_id,
                "ts": _event_ts()}
 
@@ -15419,6 +15773,16 @@ async def _resume() -> None:
                     Command(resume=decision_payload),
                     config=orch._thread_config(session_id),
                 )
+                # Finalize after the verdict-driven run completes so
+                # the session row reflects the terminal status (or
+                # falls through to ``default_terminal_status``). Skip
+                # if a fresh interrupt re-paused the graph — the
+                # session is again awaiting operator input. Mirrors
+                # the same guard in the UI's
+                # ``_submit_approval_via_service`` and the
+                # ``stream_session`` finalize call site.
+                if not await orch._is_graph_paused(session_id):
+                    await orch._finalize_session_status_async(session_id)
 
         # Submit the resume onto the long-lived service loop so we
         # don't fight the lifespan thread for the same FastMCP/SQLite
diff --git a/dist/ui.py b/dist/ui.py
index 05bc7d9..3a57b56 100644
--- a/dist/ui.py
+++ b/dist/ui.py
@@ -26,8 +26,24 @@
 
 from app import AppConfig, Base, FrameworkAppConfig, MetadataConfig, Orchestrator, OrchestratorService, SessionStore, UIBadge, build_embedder, build_engine, build_vector_store, load_config, resolve_framework_app_config, resolve_state_class
 import asyncio
+import logging as _logging
 import os
 import time
+
+# Optional: env-driven log config so manual runs (streamlit run, etc.)
+# can crank verbosity to INFO/DEBUG without modifying the source. The
+# default keeps the production-quiet WARNING level.
+
+
+# ====== module: src/runtime/ui.py ======
+
+_log_level = os.environ.get("ASR_LOG_LEVEL", "").upper().strip()
+if _log_level in {"DEBUG", "INFO", "WARNING", "ERROR"}:
+    _logging.basicConfig(
+        level=getattr(_logging, _log_level),
+        format="%(asctime)s %(name)s %(levelname)s %(message)s",
+        force=True,
+    )
 from datetime import datetime, timezone
 from pathlib import Path
 import streamlit as st
@@ -41,10 +57,6 @@
 
 
 # Default config path; apps override via the ``APP_CONFIG`` env var.
-
-
-# ====== module: src/runtime/ui.py ======
-
 CONFIG_PATH = Path(os.environ.get("APP_CONFIG", "config/config.yaml"))
 
 
@@ -985,6 +997,14 @@ async def _drive() -> None:
             Command(resume=payload),
             config=orch._thread_config(session_id),
         )
+        # The graph completes the agent run after the verdict is
+        # forwarded to the gated tool; finalize the session if it's
+        # not paused on a fresh interrupt. Without this, an approved
+        # session stays in ``in_progress`` because the resume path
+        # does not run through ``stream_session`` (the only other
+        # caller of finalize).
+        if not await orch._is_graph_paused(session_id):
+            await orch._finalize_session_status_async(session_id)
 
     svc.submit_and_wait(_drive(), timeout=60.0)
 

From 6eaf5a59e5e3d0b6053b457b66cf0e12afd8c833 Mon Sep 17 00:00:00 2001
From: Amit Kumar <ak.nitrr13@gmail.com>
Date: Thu, 14 May 2026 00:26:41 +0000
Subject: [PATCH 6/6] fix(ui): move ASR_LOG_LEVEL block below imports to
 satisfy ruff E402

The env-driven log config block sat between imports, tripping ruff's
``E402 Module level import not at top of file``. Refactor into a
``_maybe_configure_logging`` function called immediately after the
import block. ``logging.basicConfig(force=True)`` overrides any
handler streamlit installed during import, so the behaviour is
preserved.

Regenerate dist/ui.py to match.
---
 dist/ui.py        | 34 ++++++++++++++++++++--------------
 src/runtime/ui.py | 28 +++++++++++++++++-----------
 2 files changed, 37 insertions(+), 25 deletions(-)

diff --git a/dist/ui.py b/dist/ui.py
index 3a57b56..24f5876 100644
--- a/dist/ui.py
+++ b/dist/ui.py
@@ -29,31 +29,37 @@
 import logging as _logging
 import os
 import time
+from datetime import datetime, timezone
+from pathlib import Path
+import streamlit as st
+
+
+
 
-# Optional: env-driven log config so manual runs (streamlit run, etc.)
-# can crank verbosity to INFO/DEBUG without modifying the source. The
-# default keeps the production-quiet WARNING level.
 
 
-# ====== module: src/runtime/ui.py ======
 
-_log_level = os.environ.get("ASR_LOG_LEVEL", "").upper().strip()
-if _log_level in {"DEBUG", "INFO", "WARNING", "ERROR"}:
-    _logging.basicConfig(
-        level=getattr(_logging, _log_level),
-        format="%(asctime)s %(name)s %(levelname)s %(message)s",
-        force=True,
-    )
-from datetime import datetime, timezone
-from pathlib import Path
-import streamlit as st
 
 
+# Optional: env-driven log config so manual runs (streamlit run, etc.)
+# can crank verbosity to INFO/DEBUG without modifying the source. The
+# default keeps the production-quiet WARNING level. ``force=True``
+# overrides any handler streamlit set up during import.
 
 
+# ====== module: src/runtime/ui.py ======
 
+def _maybe_configure_logging() -> None:
+    level = os.environ.get("ASR_LOG_LEVEL", "").upper().strip()
+    if level in {"DEBUG", "INFO", "WARNING", "ERROR"}:
+        _logging.basicConfig(
+            level=getattr(_logging, level),
+            format="%(asctime)s %(name)s %(levelname)s %(message)s",
+            force=True,
+        )
 
 
+_maybe_configure_logging()
 
 
 # Default config path; apps override via the ``APP_CONFIG`` env var.
diff --git a/src/runtime/ui.py b/src/runtime/ui.py
index 3277a1b..573d829 100644
--- a/src/runtime/ui.py
+++ b/src/runtime/ui.py
@@ -26,17 +26,6 @@
 import logging as _logging
 import os
 import time
-
-# Optional: env-driven log config so manual runs (streamlit run, etc.)
-# can crank verbosity to INFO/DEBUG without modifying the source. The
-# default keeps the production-quiet WARNING level.
-_log_level = os.environ.get("ASR_LOG_LEVEL", "").upper().strip()
-if _log_level in {"DEBUG", "INFO", "WARNING", "ERROR"}:
-    _logging.basicConfig(
-        level=getattr(_logging, _log_level),
-        format="%(asctime)s %(name)s %(levelname)s %(message)s",
-        force=True,
-    )
 from datetime import datetime, timezone
 from pathlib import Path
 import streamlit as st
@@ -58,6 +47,23 @@
 from runtime.storage.vector import build_vector_store
 
 
+# Optional: env-driven log config so manual runs (streamlit run, etc.)
+# can crank verbosity to INFO/DEBUG without modifying the source. The
+# default keeps the production-quiet WARNING level. ``force=True``
+# overrides any handler streamlit set up during import.
+def _maybe_configure_logging() -> None:
+    level = os.environ.get("ASR_LOG_LEVEL", "").upper().strip()
+    if level in {"DEBUG", "INFO", "WARNING", "ERROR"}:
+        _logging.basicConfig(
+            level=getattr(_logging, level),
+            format="%(asctime)s %(name)s %(levelname)s %(message)s",
+            force=True,
+        )
+
+
+_maybe_configure_logging()
+
+
 # Default config path; apps override via the ``APP_CONFIG`` env var.
 CONFIG_PATH = Path(os.environ.get("APP_CONFIG", "config/config.yaml"))