From 512308c85220b5bca9234487029644eb1d4fa3e4 Mon Sep 17 00:00:00 2001 From: Amit Kumar Date: Wed, 13 May 2026 04:20:00 +0000 Subject: [PATCH 1/6] feat(turn_output): add markdown-primary parser as Path 4 (D-22-03) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 22, iteration 1. Adds parse_markdown_envelope as a pure function over the canonical D-22-03 contract: ## Response ## Confidence ## Signal Parser is forgiving but pinned: H2 or H3 headers, case-insensitive section names, em-dash / ASCII-double / single-hyphen rationale separator, integer or float confidence (clamped to [0,1]), missing rationale gets a placeholder, signal "none" / "null" / blank coerce to None. Hard error reserved for truly unparseable input (no Confidence section, no float on the line, empty Response body). Wires into parse_envelope_from_result as Path 4 (after structured_ response and JSON-parse, before raising EnvelopeMissingError) so existing structured-output paths keep working unchanged. D-22-01's elimination of response_format=AgentTurnOutput from create_agent lands in the next iteration. Tests (tests/test_markdown_turn_output.py): 25 cases covering all sections, missing/malformed/blank variants, signal vocabulary, code blocks in response body, header drift (H3, lowercase), and Path 4 integration into parse_envelope_from_result. Verified: ruff clean; pytest -x → 1231 passed (1206 prior + 25 new); coverage 87.0%. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/runtime/agents/turn_output.py | 151 ++++++++++++++- tests/test_markdown_turn_output.py | 291 +++++++++++++++++++++++++++++ 2 files changed, 438 insertions(+), 4 deletions(-) create mode 100644 tests/test_markdown_turn_output.py diff --git a/src/runtime/agents/turn_output.py b/src/runtime/agents/turn_output.py index df202e4..99a72ff 100644 --- a/src/runtime/agents/turn_output.py +++ b/src/runtime/agents/turn_output.py @@ -21,8 +21,9 @@ import json import logging +import re -from pydantic import BaseModel, ConfigDict, Field +from pydantic import BaseModel, ConfigDict, Field, ValidationError _LOG = logging.getLogger("runtime.orchestrator") @@ -87,6 +88,127 @@ def __init__(self, *, agent: str, field: str, message: str | None = None): super().__init__(message or f"envelope_missing: {field} (agent={agent})") +_HEADER_SPLIT = re.compile(r"^#{2,}\s+(\w+)\s*$", re.MULTILINE) +_CONF_LINE = re.compile( + # Leftmost float (allows int form), optional rationale after em-dash / + # ASCII dash / hyphen separator. ``re.DOTALL`` so a multi-line rationale + # is captured wholesale. + r"^\s*(-?[0-9]*\.?[0-9]+)\s*(?:[\u2014\-]+\s*(.*))?$", + re.DOTALL, +) + + +def _clamp_unit(x: float) -> float: + """Clamp a confidence float into [0, 1] without raising. The skill + prompt asks for [0, 1]; an LLM occasionally emits ``1.05`` or + ``-0.1`` — clamp rather than reject so the parse step is forgiving.""" + if x < 0.0: + return 0.0 + if x > 1.0: + return 1.0 + return x + + +def parse_markdown_envelope( + content: str, *, agent: str = "", +) -> "AgentTurnOutput": + """D-22-03 — parse the trailing markdown contract block into an + :class:`AgentTurnOutput`. + + Expects three sections ``## Response`` / ``## Confidence`` / + ``## Signal`` (case-insensitive headers, ``##+`` accepted) at the + end of the LLM's reply. The free-text body of ``## Response`` + becomes ``content``; the float on the ``## Confidence`` line + becomes ``confidence`` (clamped to [0, 1]); the rest of that line + after ``-`` / ``--`` / ``\u2014`` becomes ``confidence_rationale``; + the ``## Signal`` body becomes ``signal`` (with ``none``/``null``/ + blank coerced to ``None``). + + Raises :class:`EnvelopeMissingError` only when the parse cannot + produce a valid envelope at all — missing ``## Confidence`` line, + unparseable confidence value, or empty ``## Response`` body. + """ + if not isinstance(content, str) or not content.strip(): + raise EnvelopeMissingError( + agent=agent, field="content", + message=f"envelope_missing: empty content (agent={agent!r})", + ) + + # ``re.split`` with the header capture-group yields: + # [pre, h1, body1, h2, body2, ...] + # Pre is the prose before the first heading; we ignore it for parse. + parts = _HEADER_SPLIT.split(content) + sections: dict[str, str] = {} + if len(parts) >= 3: + for i in range(1, len(parts) - 1, 2): + key = parts[i].strip().lower() + body = parts[i + 1].strip() + # Don't overwrite an earlier section with a later same-named + # one (unusual but possible in pathological prompts). + sections.setdefault(key, body) + + response_body = sections.get("response", "").strip() + raw_conf = sections.get("confidence", "").strip() + + if not raw_conf: + raise EnvelopeMissingError( + agent=agent, field="confidence", + message=( + f"envelope_missing: confidence section absent or empty " + f"(agent={agent!r})" + ), + ) + + m = _CONF_LINE.match(raw_conf) + if not m: + raise EnvelopeMissingError( + agent=agent, field="confidence", + message=( + f"envelope_missing: confidence section did not parse " + f"(agent={agent!r}, raw={raw_conf!r})" + ), + ) + try: + conf_value = _clamp_unit(float(m.group(1))) + except (TypeError, ValueError) as exc: + raise EnvelopeMissingError( + agent=agent, field="confidence", + message=( + f"envelope_missing: confidence value not a float " + f"(agent={agent!r}, raw={raw_conf!r})" + ), + ) from exc + rationale = (m.group(2) or "").strip() or "(no rationale provided)" + + signal_raw = sections.get("signal", "").strip().lower() or None + if signal_raw in {"none", "null", "", "n/a"}: + signal_raw = None + + if not response_body: + raise EnvelopeMissingError( + agent=agent, field="content", + message=( + f"envelope_missing: response section empty (agent={agent!r})" + ), + ) + + try: + return AgentTurnOutput( + content=response_body, + confidence=conf_value, + confidence_rationale=rationale, + signal=signal_raw, + ) + except ValidationError as exc: + raise EnvelopeMissingError( + agent=agent, field="validation", + message=( + f"envelope_missing: pydantic validation rejected the " + f"parsed values (agent={agent!r}): {exc}" + ), + ) from exc + + def parse_envelope_from_result( result: dict, *, @@ -144,13 +266,33 @@ def parse_envelope_from_result( continue break - # Path 3: fail loudly + # Path 4 (D-22-01): markdown-primary parse on the last AIMessage's + # content. Producers that emit the new ``## Response / ## Confidence / + # ## Signal`` section block land here when Paths 1+2 yield nothing. + for msg in reversed(messages): + if msg.__class__.__name__ != "AIMessage": + continue + content = getattr(msg, "content", None) + if not isinstance(content, str) or not content.strip(): + continue + try: + return parse_markdown_envelope(content, agent=agent) + except EnvelopeMissingError: + # This AIMessage didnt carry a parseable markdown contract. + # Continue scanning earlier messages on the off-chance a + # nested loop emitted the contract one step back. + continue + break + + # Path 5 (terminal): fail loudly. None of the four paths produced a + # valid envelope — this is a real prompt-drift signal worth + # surfacing as a structured agent_run error. raise EnvelopeMissingError( agent=agent, field="structured_response", message=( - f"envelope_missing: no structured_response or JSON-decodable " - f"AIMessage envelope found (agent={agent})" + f"envelope_missing: no structured_response, JSON-decodable, or " + f"markdown-decodable AIMessage envelope found (agent={agent})" ), ) @@ -197,5 +339,6 @@ def reconcile_confidence( "AgentTurnOutput", "EnvelopeMissingError", "parse_envelope_from_result", + "parse_markdown_envelope", "reconcile_confidence", ] diff --git a/tests/test_markdown_turn_output.py b/tests/test_markdown_turn_output.py new file mode 100644 index 0000000..4049cfe --- /dev/null +++ b/tests/test_markdown_turn_output.py @@ -0,0 +1,291 @@ +"""Phase 22 — markdown turn output parser matrix. + +Pure-function tests on parse_markdown_envelope. Covers the +section-by-section parse + the validation paths the framework relies +on (D-22-03 contract). + +Pinning the contract here is the single point of compat truth — every +LLM that writes the documented ## Response / ## Confidence / +## Signal block round-trips through these tests verbatim. +""" +from __future__ import annotations + +import pytest + +from runtime.agents.turn_output import ( + AgentTurnOutput, + EnvelopeMissingError, + parse_envelope_from_result, + parse_markdown_envelope, +) + + +# --------------------------------------------------------------------------- +# Happy path +# --------------------------------------------------------------------------- + +def test_all_sections_present_em_dash_rationale(): + md = ( + "Some prose preamble that may or may not exist.\n\n" + "## Response\n" + "Looks like a deploy regression in payments-svc.\n\n" + "## Confidence\n" + "0.85 — strong correlation with v1.120 release timeline\n\n" + "## Signal\n" + "default\n" + ) + env = parse_markdown_envelope(md, agent="triage") + assert isinstance(env, AgentTurnOutput) + assert env.content == "Looks like a deploy regression in payments-svc." + assert env.confidence == 0.85 + assert env.confidence_rationale == ( + "strong correlation with v1.120 release timeline" + ) + assert env.signal == "default" + + +def test_ascii_dash_rationale(): + md = ( + "## Response\nfoo\n" + "## Confidence\n0.7 -- ok\n" + "## Signal\nsuccess\n" + ) + env = parse_markdown_envelope(md) + assert env.confidence == 0.7 + assert env.confidence_rationale == "ok" + assert env.signal == "success" + + +def test_single_hyphen_rationale(): + md = ( + "## Response\nfoo\n" + "## Confidence\n0.6 - some rationale\n" + "## Signal\nfailed\n" + ) + env = parse_markdown_envelope(md) + assert env.confidence == 0.6 + assert env.confidence_rationale == "some rationale" + + +# --------------------------------------------------------------------------- +# Confidence shape variants +# --------------------------------------------------------------------------- + +def test_confidence_int_zero_clamps_in_range(): + md = ( + "## Response\nbody\n" + "## Confidence\n0\n" + "## Signal\nfailed\n" + ) + env = parse_markdown_envelope(md) + assert env.confidence == 0.0 + # Missing rationale is replaced with placeholder. + assert env.confidence_rationale == "(no rationale provided)" + + +def test_confidence_int_one(): + md = ( + "## Response\nbody\n" + "## Confidence\n1\n" + "## Signal\nsuccess\n" + ) + env = parse_markdown_envelope(md) + assert env.confidence == 1.0 + + +def test_confidence_above_range_clamps(): + """LLM occasionally emits 1.05 — clamp rather than reject.""" + md = ( + "## Response\nbody\n" + "## Confidence\n1.05 — over\n" + "## Signal\nsuccess\n" + ) + env = parse_markdown_envelope(md) + assert env.confidence == 1.0 + + +def test_confidence_below_range_clamps(): + md = ( + "## Response\nbody\n" + "## Confidence\n-0.2 — under\n" + "## Signal\nfailed\n" + ) + env = parse_markdown_envelope(md) + assert env.confidence == 0.0 + + +def test_confidence_unparseable_raises(): + md = ( + "## Response\nbody\n" + "## Confidence\nhigh — strong\n" + "## Signal\nsuccess\n" + ) + with pytest.raises(EnvelopeMissingError) as exc: + parse_markdown_envelope(md, agent="triage") + assert exc.value.field == "confidence" + assert "triage" in str(exc.value) + + +def test_confidence_section_missing_raises(): + md = ( + "## Response\nbody\n" + "## Signal\nsuccess\n" + ) + with pytest.raises(EnvelopeMissingError) as exc: + parse_markdown_envelope(md) + assert exc.value.field == "confidence" + + +# --------------------------------------------------------------------------- +# Signal vocabulary +# --------------------------------------------------------------------------- + +@pytest.mark.parametrize("blank", ["", "none", "None", "NULL", " ", "n/a"]) +def test_signal_blank_or_none_token_yields_none(blank: str): + md = ( + "## Response\nbody\n" + "## Confidence\n0.5 — mid\n" + f"## Signal\n{blank}\n" + ) + env = parse_markdown_envelope(md) + assert env.signal is None + + +def test_signal_unknown_value_passed_through_lowercased(): + """Vocabulary validation lives in the routing layer, not the parser.""" + md = ( + "## Response\nbody\n" + "## Confidence\n0.5 — mid\n" + "## Signal\nWeirdSignal\n" + ) + env = parse_markdown_envelope(md) + assert env.signal == "weirdsignal" + + +def test_signal_section_missing_yields_none(): + md = ( + "## Response\nbody\n" + "## Confidence\n0.4 — hedged\n" + ) + env = parse_markdown_envelope(md) + assert env.signal is None + + +# --------------------------------------------------------------------------- +# Response body shape +# --------------------------------------------------------------------------- + +def test_multi_line_response_with_code_block(): + md = ( + "## Response\n" + "Here is the fix:\n\n" + "```python\n" + "def add(a, b):\n" + " return a + b\n" + "```\n\n" + "Run pytest after applying.\n\n" + "## Confidence\n" + "0.92 — verified locally\n\n" + "## Signal\n" + "success\n" + ) + env = parse_markdown_envelope(md) + assert "```python" in env.content + assert "Run pytest" in env.content + assert env.confidence == 0.92 + + +def test_response_section_missing_raises(): + md = ( + "## Confidence\n0.7 — ok\n" + "## Signal\nsuccess\n" + ) + with pytest.raises(EnvelopeMissingError) as exc: + parse_markdown_envelope(md) + assert exc.value.field == "content" + + +def test_empty_content_raises(): + with pytest.raises(EnvelopeMissingError) as exc: + parse_markdown_envelope("") + assert exc.value.field == "content" + + +# --------------------------------------------------------------------------- +# Header drift: H3 / lowercase +# --------------------------------------------------------------------------- + +def test_h3_headers_accepted(): + """Watch-out: model may emit ### instead of ##. Parser accepts ##+.""" + md = ( + "### Response\nbody\n" + "### Confidence\n0.5 — mid\n" + "### Signal\ndefault\n" + ) + env = parse_markdown_envelope(md) + assert env.content == "body" + assert env.confidence == 0.5 + assert env.signal == "default" + + +def test_lowercase_headers_accepted(): + md = ( + "## response\nbody\n" + "## confidence\n0.5 — mid\n" + "## signal\nsuccess\n" + ) + env = parse_markdown_envelope(md) + assert env.confidence == 0.5 + assert env.signal == "success" + + +# --------------------------------------------------------------------------- +# parse_envelope_from_result Path 4 integration +# --------------------------------------------------------------------------- + +def _ai(content: str): + """Construct an object that quacks like an AIMessage for path-4 + class-name walker.""" + obj = type("AIMessage", (), {})() + obj.content = content # type: ignore[attr-defined] + return obj + + +def test_parse_envelope_from_result_falls_through_to_path_4_markdown(): + """No structured_response, no JSON in messages — Path 4 wins.""" + md = ( + "## Response\nbody from path 4\n" + "## Confidence\n0.8 — ok\n" + "## Signal\nsuccess\n" + ) + result = {"messages": [_ai(md)], "structured_response": None} + env = parse_envelope_from_result(result, agent="triage") + assert env.content == "body from path 4" + assert env.confidence == 0.8 + assert env.signal == "success" + + +def test_parse_envelope_from_result_path_1_still_wins_when_present(): + """Markdown parser doesn't shadow the structured_response path.""" + md = ( + "## Response\nshould not be picked\n" + "## Confidence\n0.1 — wrong\n" + "## Signal\nfailed\n" + ) + sr = AgentTurnOutput( + content="from structured_response", + confidence=0.99, + confidence_rationale="strong", + signal="success", + ) + result = {"messages": [_ai(md)], "structured_response": sr} + env = parse_envelope_from_result(result, agent="triage") + assert env.content == "from structured_response" + assert env.confidence == 0.99 + + +def test_parse_envelope_from_result_raises_when_no_path_yields(): + """Empty messages + no structured_response + no markdown → raises.""" + result = {"messages": [_ai("just prose, no sections")]} + with pytest.raises(EnvelopeMissingError): + parse_envelope_from_result(result, agent="triage") From bc05e73154eeb647af036bfc0df0a8ce5b4ee416 Mon Sep 17 00:00:00 2001 From: Amit Kumar Date: Wed, 13 May 2026 04:58:20 +0000 Subject: [PATCH 2/6] feat(turn_output): markdown-primary turn output (Phase 22) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replaces the JSON-envelope-as-primary structured output path with markdown-primary turn output per the Phase 22 plan (.planning/phases/22-markdown-turn-output/22-CONTEXT.md). The agent loop terminates on natural React END (no tool calls in the final AIMessage); the framework parses the markdown body via Path 4 in parse_envelope_from_result. D-22-01: graph.py + agents/responsive.py drop response_format= AgentTurnOutput from create_agent. The pre-Phase-22 ToolStrategy / envelope-as-callable-tool termination signal is gone — the loop now terminates the way every chat model natively does. D-22-03: parse_markdown_envelope (already on this branch from the prior commit) is wired as Path 4 of parse_envelope_from_result. Paths 1 (structured_response) and 2 (JSON-parse) stay as defensive fallbacks for providers that DO emit structured output. D-22-04: All 6 example skill prompts (3 incident_management + 3 code_review with Output contract sections) get the new ## Output contract — REQUIRED block instructing the LLM to end every reply with literal ## Response / ## Confidence / ## Signal sections. D-22-05: StubChatModel + EnvelopeStubChatModel (in tests/ _envelope_helpers.py) revert to pre-Phase-15 simple semantics — emit canned text wrapped in the D-22-03 markdown contract. The _envelope_tool_name field, bind_tools envelope-tool detection, and _generate auto-emit branch are removed. tool_call_plan + closing markdown emit in a single AIMessage so unit tests with empty tool registries (tools=[]) still get a parseable envelope after langgraph terminates the loop on the first turn. D-22-06: single atomic commit (this one + the prior parser commit on the same branch). Legacy structured-output integration is removed, not config-gated. graph.py:_DEFAULT_STUB_CANNED reverts to plain text bodies — the stub wraps them in markdown via stub_envelope_* fields. The deep_investigator entry's confidence (0.30, below the 0.75 gate threshold) is preserved via _DEFAULT_STUB_ENVELOPE_CONFIDENCE. Test fixture migration: - tests/_envelope_helpers.py: EnvelopeStubChatModel emits markdown + tool_calls in same AIMessage; bind_tools is a no-op. - tests/test_real_llm_tool_loop_termination.py: test_stub_chat_model_records_envelope_tool_name_on_bind rewritten as test_stub_chat_model_emits_markdown_envelope_block — the bind is now a no-op pass-through; the closing message is markdown. Verified: ruff clean; pytest -x → 1231 passed (1206 pre-Phase-22 + 25 new from the parser matrix); coverage 86.71% (gate ≥85%); concept-leak ratchet stays at 154; dist regenerated. Real-LLM verification (per acceptance #10) is gated by OLLAMA_LIVE=1 and out of scope for this in-CI run; manual smoke against gpt-oss + openai/gpt-4o-mini deferred to the operator after merge. Co-Authored-By: Claude Opus 4.7 (1M context) --- dist/app.py | 316 ++++++++++++------ dist/apps/code-review.py | 316 ++++++++++++------ dist/apps/incident-management.py | 316 ++++++++++++------ .../code_review/skills/analyzer/system.md | 21 +- examples/code_review/skills/intake/system.md | 21 +- .../code_review/skills/recommender/system.md | 21 +- .../skills/deep_investigator/system.md | 21 +- .../skills/resolution/system.md | 21 +- .../skills/triage/system.md | 21 +- src/runtime/agents/responsive.py | 19 +- src/runtime/graph.py | 42 ++- src/runtime/llm.py | 105 +++--- tests/_envelope_helpers.py | 56 ++-- tests/test_real_llm_tool_loop_termination.py | 39 +-- 14 files changed, 854 insertions(+), 481 deletions(-) diff --git a/dist/app.py b/dist/app.py index e42575a..7d8a93e 100644 --- a/dist/app.py +++ b/dist/app.py @@ -550,7 +550,7 @@ class IncidentState(Session): -from pydantic import BaseModel, ConfigDict, Field +from pydantic import BaseModel, ConfigDict, Field, ValidationError # ----- imports for runtime/tools/gateway.py ----- """Risk-rated tool gateway: pure resolver + ``BaseTool`` HITL wrapper. @@ -1180,6 +1180,7 @@ async def _poll(self, registry): """ +from pydantic import BaseModel, ConfigDict, Field # ----- imports for runtime/memory/knowledge_graph.py ----- @@ -3131,17 +3132,13 @@ class StubChatModel(BaseChatModel): ``stub_envelope_confidence`` / ``stub_envelope_rationale`` / ``stub_envelope_signal``. - Phase 15 (LLM-COMPAT-01): ``langchain.agents.create_agent`` with - ``response_format=AgentTurnOutput`` (via ``AutoStrategy`` -> - ``ToolStrategy`` for non-native-structured-output models, including - this stub) injects ``AgentTurnOutput`` as a CALLABLE TOOL. The - agent loop only terminates when the LLM emits a tool call NAMED - ``AgentTurnOutput``. ``bind_tools`` records that envelope-tool name - so ``_generate`` can auto-emit a closing tool call after any - user-configured ``tool_call_plan`` is exhausted -- preserving the - pre-Phase-15 stub semantics (canned text + optional pre-scripted - tool calls) while satisfying the new tool-loop termination - contract. + Phase 22 (D-22-05): the loop terminates on natural React END now + that ``response_format=AgentTurnOutput`` is gone — tests drive + markdown-shaped canned text (the framework parses it via Path 4 + in :func:`runtime.agents.turn_output.parse_envelope_from_result`). + The Phase 15 envelope-as-callable-tool auto-emit is removed; the + stub is back to its pre-Phase-15 simple semantics (canned text + + optional pre-scripted ``tool_call_plan``). """ role: str = "default" canned_responses: dict[str, str] = Field(default_factory=dict) @@ -3150,12 +3147,6 @@ class StubChatModel(BaseChatModel): stub_envelope_rationale: str = "stub envelope rationale" stub_envelope_signal: str | None = None _called_once: bool = False - # Phase 15 (LLM-COMPAT-01): set by ``bind_tools`` when - # ``langchain.agents.create_agent`` injects a structured-output tool - # for ``AgentTurnOutput``. Holds the bare tool name (e.g. - # ``"AgentTurnOutput"``) so ``_generate`` can emit a final - # envelope-shaped tool call to close the agent loop. - _envelope_tool_name: str | None = None @property def _llm_type(self) -> str: @@ -3163,33 +3154,37 @@ def _llm_type(self) -> str: def _generate(self, messages: list[BaseMessage], stop: list[str] | None = None, run_manager: Any = None, **kwargs: Any) -> ChatResult: - text = self.canned_responses.get(self.role, f"[stub:{self.role}] no canned response") + # Phase 22 (D-22-05): tool-call rounds emit the scripted tool + # calls with a placeholder content body; the closing turn (no + # remaining tool_call_plan entries) emits the canned text + # wrapped in the D-22-03 markdown contract that + # parse_markdown_envelope reads. The stub_envelope_* + # parameters drive the trailing-section bodies so tests can + # exercise specific confidence / rationale / signal paths + # without authoring markdown by hand. tool_calls: list[dict] = [] if self.tool_call_plan and not self._called_once: for tc in self.tool_call_plan: - tool_calls.append({"name": tc["name"], "args": tc.get("args", {}), "id": str(uuid4())}) + tool_calls.append( + {"name": tc["name"], "args": tc.get("args", {}), "id": str(uuid4())} + ) self._called_once = True - elif self._envelope_tool_name is not None: - # Phase 15 (LLM-COMPAT-01): the tool_call_plan is exhausted - # (or wasn't configured) AND ``langchain.agents.create_agent`` - # has bound the AgentTurnOutput envelope as a tool. Emit a - # closing tool call so the loop terminates with a populated - # ``structured_response``. The args mirror the - # ``with_structured_output`` path's envelope construction so - # tests see the same confidence / rationale / signal regardless - # of whether the new tool-strategy or the legacy structured- - # output path is in play. - tool_calls.append({ - "name": self._envelope_tool_name, - "args": { - "content": text or ".", - "confidence": self.stub_envelope_confidence, - "confidence_rationale": self.stub_envelope_rationale, - "signal": self.stub_envelope_signal, - }, - "id": str(uuid4()), - }) - msg = AIMessage(content=text, tool_calls=tool_calls) + + body = self.canned_responses.get( + self.role, f"[stub:{self.role}] no canned response", + ) + # Phase 22 (D-22-05): explicit "none" when no signal so the + # parser maps to None — preserves pre-Phase-22 behaviour where a + # stub with no signal yielded envelope.signal=None. + signal_str = self.stub_envelope_signal if self.stub_envelope_signal is not None else "none" + md = ( + f"{body}\n\n" + f"## Response\n{body}\n\n" + f"## Confidence\n{self.stub_envelope_confidence:.4f} -- " + f"{self.stub_envelope_rationale}\n\n" + f"## Signal\n{signal_str}\n" + ) + msg = AIMessage(content=md, tool_calls=tool_calls) return ChatResult(generations=[ChatGeneration(message=msg)]) async def _agenerate(self, messages: list[BaseMessage], stop: list[str] | None = None, @@ -3197,30 +3192,15 @@ async def _agenerate(self, messages: list[BaseMessage], stop: list[str] | None = return self._generate(messages, stop, run_manager, **kwargs) def bind_tools(self, tools, *, tool_choice=None, **kwargs): - """Record the AgentTurnOutput envelope-tool name when present. - - Phase 15 (LLM-COMPAT-01): ``langchain.agents.create_agent`` with - ``response_format=AgentTurnOutput`` calls ``bind_tools(...)`` - with the user's tools PLUS the envelope-as-a-tool. We scan the - list for the AgentTurnOutput-shaped tool (matched by ``__name__`` - on Pydantic schemas, ``name`` on ``BaseTool`` instances, or the - ``"name"`` key on dict-shaped tool specs) and remember it on the - instance so ``_generate`` can close the agent loop with a - synthetic envelope tool call after any pre-scripted - ``tool_call_plan`` is exhausted. Tools bound by the framework - itself (real BaseTools the agent should call) flow through - unchanged -- the stub still emits them only via - ``tool_call_plan``. + """Phase 22 (D-22-05): no-op tool binding. + + ``langchain.agents.create_agent`` calls ``bind_tools`` to wire + the agent's BaseTool list onto the chat model. The stub does + not actually invoke those tools (test fixtures script tool + calls via ``tool_call_plan`` instead), so we just return self + unchanged. The pre-Phase-22 envelope-tool detection is gone + because the framework no longer asks for ``response_format``. """ - for t in tools or []: - name = ( - getattr(t, "__name__", None) - or getattr(t, "name", None) - or (isinstance(t, dict) and t.get("name")) - ) - if isinstance(name, str) and name == "AgentTurnOutput": - self._envelope_tool_name = name - break return self # ``BaseChatModel.with_structured_output`` returns ``Runnable[..., dict | BaseModel]`` @@ -6601,6 +6581,127 @@ def __init__(self, *, agent: str, field: str, message: str | None = None): super().__init__(message or f"envelope_missing: {field} (agent={agent})") +_HEADER_SPLIT = re.compile(r"^#{2,}\s+(\w+)\s*$", re.MULTILINE) +_CONF_LINE = re.compile( + # Leftmost float (allows int form), optional rationale after em-dash / + # ASCII dash / hyphen separator. ``re.DOTALL`` so a multi-line rationale + # is captured wholesale. + r"^\s*(-?[0-9]*\.?[0-9]+)\s*(?:[\u2014\-]+\s*(.*))?$", + re.DOTALL, +) + + +def _clamp_unit(x: float) -> float: + """Clamp a confidence float into [0, 1] without raising. The skill + prompt asks for [0, 1]; an LLM occasionally emits ``1.05`` or + ``-0.1`` — clamp rather than reject so the parse step is forgiving.""" + if x < 0.0: + return 0.0 + if x > 1.0: + return 1.0 + return x + + +def parse_markdown_envelope( + content: str, *, agent: str = "", +) -> "AgentTurnOutput": + """D-22-03 — parse the trailing markdown contract block into an + :class:`AgentTurnOutput`. + + Expects three sections ``## Response`` / ``## Confidence`` / + ``## Signal`` (case-insensitive headers, ``##+`` accepted) at the + end of the LLM's reply. The free-text body of ``## Response`` + becomes ``content``; the float on the ``## Confidence`` line + becomes ``confidence`` (clamped to [0, 1]); the rest of that line + after ``-`` / ``--`` / ``\u2014`` becomes ``confidence_rationale``; + the ``## Signal`` body becomes ``signal`` (with ``none``/``null``/ + blank coerced to ``None``). + + Raises :class:`EnvelopeMissingError` only when the parse cannot + produce a valid envelope at all — missing ``## Confidence`` line, + unparseable confidence value, or empty ``## Response`` body. + """ + if not isinstance(content, str) or not content.strip(): + raise EnvelopeMissingError( + agent=agent, field="content", + message=f"envelope_missing: empty content (agent={agent!r})", + ) + + # ``re.split`` with the header capture-group yields: + # [pre, h1, body1, h2, body2, ...] + # Pre is the prose before the first heading; we ignore it for parse. + parts = _HEADER_SPLIT.split(content) + sections: dict[str, str] = {} + if len(parts) >= 3: + for i in range(1, len(parts) - 1, 2): + key = parts[i].strip().lower() + body = parts[i + 1].strip() + # Don't overwrite an earlier section with a later same-named + # one (unusual but possible in pathological prompts). + sections.setdefault(key, body) + + response_body = sections.get("response", "").strip() + raw_conf = sections.get("confidence", "").strip() + + if not raw_conf: + raise EnvelopeMissingError( + agent=agent, field="confidence", + message=( + f"envelope_missing: confidence section absent or empty " + f"(agent={agent!r})" + ), + ) + + m = _CONF_LINE.match(raw_conf) + if not m: + raise EnvelopeMissingError( + agent=agent, field="confidence", + message=( + f"envelope_missing: confidence section did not parse " + f"(agent={agent!r}, raw={raw_conf!r})" + ), + ) + try: + conf_value = _clamp_unit(float(m.group(1))) + except (TypeError, ValueError) as exc: + raise EnvelopeMissingError( + agent=agent, field="confidence", + message=( + f"envelope_missing: confidence value not a float " + f"(agent={agent!r}, raw={raw_conf!r})" + ), + ) from exc + rationale = (m.group(2) or "").strip() or "(no rationale provided)" + + signal_raw = sections.get("signal", "").strip().lower() or None + if signal_raw in {"none", "null", "", "n/a"}: + signal_raw = None + + if not response_body: + raise EnvelopeMissingError( + agent=agent, field="content", + message=( + f"envelope_missing: response section empty (agent={agent!r})" + ), + ) + + try: + return AgentTurnOutput( + content=response_body, + confidence=conf_value, + confidence_rationale=rationale, + signal=signal_raw, + ) + except ValidationError as exc: + raise EnvelopeMissingError( + agent=agent, field="validation", + message=( + f"envelope_missing: pydantic validation rejected the " + f"parsed values (agent={agent!r}): {exc}" + ), + ) from exc + + def parse_envelope_from_result( result: dict, *, @@ -6658,13 +6759,33 @@ def parse_envelope_from_result( continue break - # Path 3: fail loudly + # Path 4 (D-22-01): markdown-primary parse on the last AIMessage's + # content. Producers that emit the new ``## Response / ## Confidence / + # ## Signal`` section block land here when Paths 1+2 yield nothing. + for msg in reversed(messages): + if msg.__class__.__name__ != "AIMessage": + continue + content = getattr(msg, "content", None) + if not isinstance(content, str) or not content.strip(): + continue + try: + return parse_markdown_envelope(content, agent=agent) + except EnvelopeMissingError: + # This AIMessage didnt carry a parseable markdown contract. + # Continue scanning earlier messages on the off-chance a + # nested loop emitted the contract one step back. + continue + break + + # Path 5 (terminal): fail loudly. None of the four paths produced a + # valid envelope — this is a real prompt-drift signal worth + # surfacing as a structured agent_run error. raise EnvelopeMissingError( agent=agent, field="structured_response", message=( - f"envelope_missing: no structured_response or JSON-decodable " - f"AIMessage envelope found (agent={agent})" + f"envelope_missing: no structured_response, JSON-decodable, or " + f"markdown-decodable AIMessage envelope found (agent={agent})" ), ) @@ -6711,6 +6832,7 @@ def reconcile_confidence( "AgentTurnOutput", "EnvelopeMissingError", "parse_envelope_from_result", + "parse_markdown_envelope", "reconcile_confidence", ] @@ -8236,24 +8358,14 @@ async def node(state: GraphState) -> dict: ] else: run_tools = tools - # Phase 10 (FOC-03 / D-10-02) + Phase 15 (LLM-COMPAT-01): every - # responsive agent invocation is wrapped in an AgentTurnOutput - # envelope. ``langchain.agents.create_agent`` (the non-deprecated - # successor to ``langgraph.prebuilt.create_react_agent``) accepts a - # bare schema as ``response_format`` and, by default, wraps it in - # ``AutoStrategy`` — ProviderStrategy for models with native - # structured-output (OpenAI-class), falling back to ToolStrategy - # otherwise (Ollama). ToolStrategy injects AgentTurnOutput as a - # callable tool: when the LLM ``calls`` it, the loop terminates on - # the same turn with ``result["structured_response"]`` populated. - # Eliminates the old two-call structure (loop + separate - # ``with_structured_output`` pass) that hit recursion_limit=25 on - # Ollama models without true function-calling. + # Phase 22 (D-22-01): markdown-primary turn output. Same + # change as graph.py:make_agent_node — drop response_format + # so the loop terminates on natural React END, then parse the + # final AIMessage via Path 4 in parse_envelope_from_result. agent_executor = create_agent( model=llm, tools=run_tools, system_prompt=skill.system_prompt, - response_format=AgentTurnOutput, ) # Phase 11 (FOC-04): reset per-turn confidence hint at the @@ -9719,24 +9831,25 @@ def _run(**kwargs: Any) -> Any: ] else: run_tools = visible_tools - # Phase 10 (FOC-03 / D-10-02) + Phase 15 (LLM-COMPAT-01): every - # responsive agent invocation is wrapped in an AgentTurnOutput - # envelope. ``langchain.agents.create_agent`` (the non-deprecated - # successor to ``langgraph.prebuilt.create_react_agent``) accepts a - # bare schema as ``response_format`` and, by default, wraps it in - # ``AutoStrategy`` — ProviderStrategy for models with native - # structured-output (OpenAI-class), falling back to ToolStrategy - # otherwise (Ollama). ToolStrategy injects AgentTurnOutput as a - # callable tool: when the LLM ``calls`` it, the loop terminates on - # the same turn with ``result["structured_response"]`` populated. - # Eliminates the old two-call structure (loop + separate - # ``with_structured_output`` pass) that hit recursion_limit=25 on - # Ollama models without true function-calling. + # Phase 22 (D-22-01): markdown-primary turn output. Drop + # ``response_format`` from create_agent — the agent loop now + # terminates on the natural React END signal (LLM emits an + # AIMessage with no tool calls). The framework parses that + # final message body as markdown via Path 4 in + # parse_envelope_from_result. Skill prompts (D-22-04) instruct + # every reply to end with ## Response / ## Confidence / ## + # Signal sections; the parser extracts them. + # + # Why: forcing LLMs through a JSON-schema-shaped output via + # ``response_format`` triggered a class of brittleness across + # providers (model-specific JSON drift, tool-strategy + React + # END interaction, recursion_limit ceilings). Markdown is the + # native format every chat model writes well; the parse step + # happens in the framework, where leniency is in our control. agent_executor = create_agent( model=llm, tools=run_tools, system_prompt=skill.system_prompt, - response_format=AgentTurnOutput, ) # Phase 11 (FOC-04): reset per-turn confidence hint. The hint @@ -9930,12 +10043,17 @@ def _decide_from_signal(inc: Session) -> str: _DEFAULT_STUB_CANNED: dict[str, str] = { - # Back-compat defaults for the four canonical agents. Any YAML-defined - # agent can override or extend this via ``skill.stub_response``; new agents - # without an entry here fall through to StubChatModel's generic placeholder. + # Plain-text bodies; ``StubChatModel._generate`` wraps these in the + # D-22-03 markdown envelope using the per-agent stub_envelope_* fields + # (the framework parses the wrapped output via Path 4). Apps can + # override or extend via ``skill.stub_response``; new agents without + # an entry here fall through to ``StubChatModel``'s generic placeholder. "intake": "Created INC, no prior matches. Routing to triage.", "triage": "Severity medium, category latency. No recent deploys correlate.", - "deep_investigator": "Hypothesis: upstream payments timeout. Evidence: log line 'upstream_timeout target=payments'.", + "deep_investigator": ( + "Hypothesis: upstream payments timeout. " + "Evidence: log line upstream_timeout target=payments." + ), "resolution": "Proposed fix: restart api service. Auto-applied. INC resolved.", } diff --git a/dist/apps/code-review.py b/dist/apps/code-review.py index d9e1c39..b078936 100644 --- a/dist/apps/code-review.py +++ b/dist/apps/code-review.py @@ -550,7 +550,7 @@ class IncidentState(Session): -from pydantic import BaseModel, ConfigDict, Field +from pydantic import BaseModel, ConfigDict, Field, ValidationError # ----- imports for runtime/tools/gateway.py ----- """Risk-rated tool gateway: pure resolver + ``BaseTool`` HITL wrapper. @@ -1180,6 +1180,7 @@ async def _poll(self, registry): """ +from pydantic import BaseModel, ConfigDict, Field # ----- imports for runtime/memory/knowledge_graph.py ----- @@ -3184,17 +3185,13 @@ class StubChatModel(BaseChatModel): ``stub_envelope_confidence`` / ``stub_envelope_rationale`` / ``stub_envelope_signal``. - Phase 15 (LLM-COMPAT-01): ``langchain.agents.create_agent`` with - ``response_format=AgentTurnOutput`` (via ``AutoStrategy`` -> - ``ToolStrategy`` for non-native-structured-output models, including - this stub) injects ``AgentTurnOutput`` as a CALLABLE TOOL. The - agent loop only terminates when the LLM emits a tool call NAMED - ``AgentTurnOutput``. ``bind_tools`` records that envelope-tool name - so ``_generate`` can auto-emit a closing tool call after any - user-configured ``tool_call_plan`` is exhausted -- preserving the - pre-Phase-15 stub semantics (canned text + optional pre-scripted - tool calls) while satisfying the new tool-loop termination - contract. + Phase 22 (D-22-05): the loop terminates on natural React END now + that ``response_format=AgentTurnOutput`` is gone — tests drive + markdown-shaped canned text (the framework parses it via Path 4 + in :func:`runtime.agents.turn_output.parse_envelope_from_result`). + The Phase 15 envelope-as-callable-tool auto-emit is removed; the + stub is back to its pre-Phase-15 simple semantics (canned text + + optional pre-scripted ``tool_call_plan``). """ role: str = "default" canned_responses: dict[str, str] = Field(default_factory=dict) @@ -3203,12 +3200,6 @@ class StubChatModel(BaseChatModel): stub_envelope_rationale: str = "stub envelope rationale" stub_envelope_signal: str | None = None _called_once: bool = False - # Phase 15 (LLM-COMPAT-01): set by ``bind_tools`` when - # ``langchain.agents.create_agent`` injects a structured-output tool - # for ``AgentTurnOutput``. Holds the bare tool name (e.g. - # ``"AgentTurnOutput"``) so ``_generate`` can emit a final - # envelope-shaped tool call to close the agent loop. - _envelope_tool_name: str | None = None @property def _llm_type(self) -> str: @@ -3216,33 +3207,37 @@ def _llm_type(self) -> str: def _generate(self, messages: list[BaseMessage], stop: list[str] | None = None, run_manager: Any = None, **kwargs: Any) -> ChatResult: - text = self.canned_responses.get(self.role, f"[stub:{self.role}] no canned response") + # Phase 22 (D-22-05): tool-call rounds emit the scripted tool + # calls with a placeholder content body; the closing turn (no + # remaining tool_call_plan entries) emits the canned text + # wrapped in the D-22-03 markdown contract that + # parse_markdown_envelope reads. The stub_envelope_* + # parameters drive the trailing-section bodies so tests can + # exercise specific confidence / rationale / signal paths + # without authoring markdown by hand. tool_calls: list[dict] = [] if self.tool_call_plan and not self._called_once: for tc in self.tool_call_plan: - tool_calls.append({"name": tc["name"], "args": tc.get("args", {}), "id": str(uuid4())}) + tool_calls.append( + {"name": tc["name"], "args": tc.get("args", {}), "id": str(uuid4())} + ) self._called_once = True - elif self._envelope_tool_name is not None: - # Phase 15 (LLM-COMPAT-01): the tool_call_plan is exhausted - # (or wasn't configured) AND ``langchain.agents.create_agent`` - # has bound the AgentTurnOutput envelope as a tool. Emit a - # closing tool call so the loop terminates with a populated - # ``structured_response``. The args mirror the - # ``with_structured_output`` path's envelope construction so - # tests see the same confidence / rationale / signal regardless - # of whether the new tool-strategy or the legacy structured- - # output path is in play. - tool_calls.append({ - "name": self._envelope_tool_name, - "args": { - "content": text or ".", - "confidence": self.stub_envelope_confidence, - "confidence_rationale": self.stub_envelope_rationale, - "signal": self.stub_envelope_signal, - }, - "id": str(uuid4()), - }) - msg = AIMessage(content=text, tool_calls=tool_calls) + + body = self.canned_responses.get( + self.role, f"[stub:{self.role}] no canned response", + ) + # Phase 22 (D-22-05): explicit "none" when no signal so the + # parser maps to None — preserves pre-Phase-22 behaviour where a + # stub with no signal yielded envelope.signal=None. + signal_str = self.stub_envelope_signal if self.stub_envelope_signal is not None else "none" + md = ( + f"{body}\n\n" + f"## Response\n{body}\n\n" + f"## Confidence\n{self.stub_envelope_confidence:.4f} -- " + f"{self.stub_envelope_rationale}\n\n" + f"## Signal\n{signal_str}\n" + ) + msg = AIMessage(content=md, tool_calls=tool_calls) return ChatResult(generations=[ChatGeneration(message=msg)]) async def _agenerate(self, messages: list[BaseMessage], stop: list[str] | None = None, @@ -3250,30 +3245,15 @@ async def _agenerate(self, messages: list[BaseMessage], stop: list[str] | None = return self._generate(messages, stop, run_manager, **kwargs) def bind_tools(self, tools, *, tool_choice=None, **kwargs): - """Record the AgentTurnOutput envelope-tool name when present. - - Phase 15 (LLM-COMPAT-01): ``langchain.agents.create_agent`` with - ``response_format=AgentTurnOutput`` calls ``bind_tools(...)`` - with the user's tools PLUS the envelope-as-a-tool. We scan the - list for the AgentTurnOutput-shaped tool (matched by ``__name__`` - on Pydantic schemas, ``name`` on ``BaseTool`` instances, or the - ``"name"`` key on dict-shaped tool specs) and remember it on the - instance so ``_generate`` can close the agent loop with a - synthetic envelope tool call after any pre-scripted - ``tool_call_plan`` is exhausted. Tools bound by the framework - itself (real BaseTools the agent should call) flow through - unchanged -- the stub still emits them only via - ``tool_call_plan``. + """Phase 22 (D-22-05): no-op tool binding. + + ``langchain.agents.create_agent`` calls ``bind_tools`` to wire + the agent's BaseTool list onto the chat model. The stub does + not actually invoke those tools (test fixtures script tool + calls via ``tool_call_plan`` instead), so we just return self + unchanged. The pre-Phase-22 envelope-tool detection is gone + because the framework no longer asks for ``response_format``. """ - for t in tools or []: - name = ( - getattr(t, "__name__", None) - or getattr(t, "name", None) - or (isinstance(t, dict) and t.get("name")) - ) - if isinstance(name, str) and name == "AgentTurnOutput": - self._envelope_tool_name = name - break return self # ``BaseChatModel.with_structured_output`` returns ``Runnable[..., dict | BaseModel]`` @@ -6654,6 +6634,127 @@ def __init__(self, *, agent: str, field: str, message: str | None = None): super().__init__(message or f"envelope_missing: {field} (agent={agent})") +_HEADER_SPLIT = re.compile(r"^#{2,}\s+(\w+)\s*$", re.MULTILINE) +_CONF_LINE = re.compile( + # Leftmost float (allows int form), optional rationale after em-dash / + # ASCII dash / hyphen separator. ``re.DOTALL`` so a multi-line rationale + # is captured wholesale. + r"^\s*(-?[0-9]*\.?[0-9]+)\s*(?:[\u2014\-]+\s*(.*))?$", + re.DOTALL, +) + + +def _clamp_unit(x: float) -> float: + """Clamp a confidence float into [0, 1] without raising. The skill + prompt asks for [0, 1]; an LLM occasionally emits ``1.05`` or + ``-0.1`` — clamp rather than reject so the parse step is forgiving.""" + if x < 0.0: + return 0.0 + if x > 1.0: + return 1.0 + return x + + +def parse_markdown_envelope( + content: str, *, agent: str = "", +) -> "AgentTurnOutput": + """D-22-03 — parse the trailing markdown contract block into an + :class:`AgentTurnOutput`. + + Expects three sections ``## Response`` / ``## Confidence`` / + ``## Signal`` (case-insensitive headers, ``##+`` accepted) at the + end of the LLM's reply. The free-text body of ``## Response`` + becomes ``content``; the float on the ``## Confidence`` line + becomes ``confidence`` (clamped to [0, 1]); the rest of that line + after ``-`` / ``--`` / ``\u2014`` becomes ``confidence_rationale``; + the ``## Signal`` body becomes ``signal`` (with ``none``/``null``/ + blank coerced to ``None``). + + Raises :class:`EnvelopeMissingError` only when the parse cannot + produce a valid envelope at all — missing ``## Confidence`` line, + unparseable confidence value, or empty ``## Response`` body. + """ + if not isinstance(content, str) or not content.strip(): + raise EnvelopeMissingError( + agent=agent, field="content", + message=f"envelope_missing: empty content (agent={agent!r})", + ) + + # ``re.split`` with the header capture-group yields: + # [pre, h1, body1, h2, body2, ...] + # Pre is the prose before the first heading; we ignore it for parse. + parts = _HEADER_SPLIT.split(content) + sections: dict[str, str] = {} + if len(parts) >= 3: + for i in range(1, len(parts) - 1, 2): + key = parts[i].strip().lower() + body = parts[i + 1].strip() + # Don't overwrite an earlier section with a later same-named + # one (unusual but possible in pathological prompts). + sections.setdefault(key, body) + + response_body = sections.get("response", "").strip() + raw_conf = sections.get("confidence", "").strip() + + if not raw_conf: + raise EnvelopeMissingError( + agent=agent, field="confidence", + message=( + f"envelope_missing: confidence section absent or empty " + f"(agent={agent!r})" + ), + ) + + m = _CONF_LINE.match(raw_conf) + if not m: + raise EnvelopeMissingError( + agent=agent, field="confidence", + message=( + f"envelope_missing: confidence section did not parse " + f"(agent={agent!r}, raw={raw_conf!r})" + ), + ) + try: + conf_value = _clamp_unit(float(m.group(1))) + except (TypeError, ValueError) as exc: + raise EnvelopeMissingError( + agent=agent, field="confidence", + message=( + f"envelope_missing: confidence value not a float " + f"(agent={agent!r}, raw={raw_conf!r})" + ), + ) from exc + rationale = (m.group(2) or "").strip() or "(no rationale provided)" + + signal_raw = sections.get("signal", "").strip().lower() or None + if signal_raw in {"none", "null", "", "n/a"}: + signal_raw = None + + if not response_body: + raise EnvelopeMissingError( + agent=agent, field="content", + message=( + f"envelope_missing: response section empty (agent={agent!r})" + ), + ) + + try: + return AgentTurnOutput( + content=response_body, + confidence=conf_value, + confidence_rationale=rationale, + signal=signal_raw, + ) + except ValidationError as exc: + raise EnvelopeMissingError( + agent=agent, field="validation", + message=( + f"envelope_missing: pydantic validation rejected the " + f"parsed values (agent={agent!r}): {exc}" + ), + ) from exc + + def parse_envelope_from_result( result: dict, *, @@ -6711,13 +6812,33 @@ def parse_envelope_from_result( continue break - # Path 3: fail loudly + # Path 4 (D-22-01): markdown-primary parse on the last AIMessage's + # content. Producers that emit the new ``## Response / ## Confidence / + # ## Signal`` section block land here when Paths 1+2 yield nothing. + for msg in reversed(messages): + if msg.__class__.__name__ != "AIMessage": + continue + content = getattr(msg, "content", None) + if not isinstance(content, str) or not content.strip(): + continue + try: + return parse_markdown_envelope(content, agent=agent) + except EnvelopeMissingError: + # This AIMessage didnt carry a parseable markdown contract. + # Continue scanning earlier messages on the off-chance a + # nested loop emitted the contract one step back. + continue + break + + # Path 5 (terminal): fail loudly. None of the four paths produced a + # valid envelope — this is a real prompt-drift signal worth + # surfacing as a structured agent_run error. raise EnvelopeMissingError( agent=agent, field="structured_response", message=( - f"envelope_missing: no structured_response or JSON-decodable " - f"AIMessage envelope found (agent={agent})" + f"envelope_missing: no structured_response, JSON-decodable, or " + f"markdown-decodable AIMessage envelope found (agent={agent})" ), ) @@ -6764,6 +6885,7 @@ def reconcile_confidence( "AgentTurnOutput", "EnvelopeMissingError", "parse_envelope_from_result", + "parse_markdown_envelope", "reconcile_confidence", ] @@ -8289,24 +8411,14 @@ async def node(state: GraphState) -> dict: ] else: run_tools = tools - # Phase 10 (FOC-03 / D-10-02) + Phase 15 (LLM-COMPAT-01): every - # responsive agent invocation is wrapped in an AgentTurnOutput - # envelope. ``langchain.agents.create_agent`` (the non-deprecated - # successor to ``langgraph.prebuilt.create_react_agent``) accepts a - # bare schema as ``response_format`` and, by default, wraps it in - # ``AutoStrategy`` — ProviderStrategy for models with native - # structured-output (OpenAI-class), falling back to ToolStrategy - # otherwise (Ollama). ToolStrategy injects AgentTurnOutput as a - # callable tool: when the LLM ``calls`` it, the loop terminates on - # the same turn with ``result["structured_response"]`` populated. - # Eliminates the old two-call structure (loop + separate - # ``with_structured_output`` pass) that hit recursion_limit=25 on - # Ollama models without true function-calling. + # Phase 22 (D-22-01): markdown-primary turn output. Same + # change as graph.py:make_agent_node — drop response_format + # so the loop terminates on natural React END, then parse the + # final AIMessage via Path 4 in parse_envelope_from_result. agent_executor = create_agent( model=llm, tools=run_tools, system_prompt=skill.system_prompt, - response_format=AgentTurnOutput, ) # Phase 11 (FOC-04): reset per-turn confidence hint at the @@ -9772,24 +9884,25 @@ def _run(**kwargs: Any) -> Any: ] else: run_tools = visible_tools - # Phase 10 (FOC-03 / D-10-02) + Phase 15 (LLM-COMPAT-01): every - # responsive agent invocation is wrapped in an AgentTurnOutput - # envelope. ``langchain.agents.create_agent`` (the non-deprecated - # successor to ``langgraph.prebuilt.create_react_agent``) accepts a - # bare schema as ``response_format`` and, by default, wraps it in - # ``AutoStrategy`` — ProviderStrategy for models with native - # structured-output (OpenAI-class), falling back to ToolStrategy - # otherwise (Ollama). ToolStrategy injects AgentTurnOutput as a - # callable tool: when the LLM ``calls`` it, the loop terminates on - # the same turn with ``result["structured_response"]`` populated. - # Eliminates the old two-call structure (loop + separate - # ``with_structured_output`` pass) that hit recursion_limit=25 on - # Ollama models without true function-calling. + # Phase 22 (D-22-01): markdown-primary turn output. Drop + # ``response_format`` from create_agent — the agent loop now + # terminates on the natural React END signal (LLM emits an + # AIMessage with no tool calls). The framework parses that + # final message body as markdown via Path 4 in + # parse_envelope_from_result. Skill prompts (D-22-04) instruct + # every reply to end with ## Response / ## Confidence / ## + # Signal sections; the parser extracts them. + # + # Why: forcing LLMs through a JSON-schema-shaped output via + # ``response_format`` triggered a class of brittleness across + # providers (model-specific JSON drift, tool-strategy + React + # END interaction, recursion_limit ceilings). Markdown is the + # native format every chat model writes well; the parse step + # happens in the framework, where leniency is in our control. agent_executor = create_agent( model=llm, tools=run_tools, system_prompt=skill.system_prompt, - response_format=AgentTurnOutput, ) # Phase 11 (FOC-04): reset per-turn confidence hint. The hint @@ -9983,12 +10096,17 @@ def _decide_from_signal(inc: Session) -> str: _DEFAULT_STUB_CANNED: dict[str, str] = { - # Back-compat defaults for the four canonical agents. Any YAML-defined - # agent can override or extend this via ``skill.stub_response``; new agents - # without an entry here fall through to StubChatModel's generic placeholder. + # Plain-text bodies; ``StubChatModel._generate`` wraps these in the + # D-22-03 markdown envelope using the per-agent stub_envelope_* fields + # (the framework parses the wrapped output via Path 4). Apps can + # override or extend via ``skill.stub_response``; new agents without + # an entry here fall through to ``StubChatModel``'s generic placeholder. "intake": "Created INC, no prior matches. Routing to triage.", "triage": "Severity medium, category latency. No recent deploys correlate.", - "deep_investigator": "Hypothesis: upstream payments timeout. Evidence: log line 'upstream_timeout target=payments'.", + "deep_investigator": ( + "Hypothesis: upstream payments timeout. " + "Evidence: log line upstream_timeout target=payments." + ), "resolution": "Proposed fix: restart api service. Auto-applied. INC resolved.", } diff --git a/dist/apps/incident-management.py b/dist/apps/incident-management.py index 9e3a3e7..d2c257d 100644 --- a/dist/apps/incident-management.py +++ b/dist/apps/incident-management.py @@ -550,7 +550,7 @@ class IncidentState(Session): -from pydantic import BaseModel, ConfigDict, Field +from pydantic import BaseModel, ConfigDict, Field, ValidationError # ----- imports for runtime/tools/gateway.py ----- """Risk-rated tool gateway: pure resolver + ``BaseTool`` HITL wrapper. @@ -1180,6 +1180,7 @@ async def _poll(self, registry): """ +from pydantic import BaseModel, ConfigDict, Field # ----- imports for runtime/memory/knowledge_graph.py ----- @@ -3196,17 +3197,13 @@ class StubChatModel(BaseChatModel): ``stub_envelope_confidence`` / ``stub_envelope_rationale`` / ``stub_envelope_signal``. - Phase 15 (LLM-COMPAT-01): ``langchain.agents.create_agent`` with - ``response_format=AgentTurnOutput`` (via ``AutoStrategy`` -> - ``ToolStrategy`` for non-native-structured-output models, including - this stub) injects ``AgentTurnOutput`` as a CALLABLE TOOL. The - agent loop only terminates when the LLM emits a tool call NAMED - ``AgentTurnOutput``. ``bind_tools`` records that envelope-tool name - so ``_generate`` can auto-emit a closing tool call after any - user-configured ``tool_call_plan`` is exhausted -- preserving the - pre-Phase-15 stub semantics (canned text + optional pre-scripted - tool calls) while satisfying the new tool-loop termination - contract. + Phase 22 (D-22-05): the loop terminates on natural React END now + that ``response_format=AgentTurnOutput`` is gone — tests drive + markdown-shaped canned text (the framework parses it via Path 4 + in :func:`runtime.agents.turn_output.parse_envelope_from_result`). + The Phase 15 envelope-as-callable-tool auto-emit is removed; the + stub is back to its pre-Phase-15 simple semantics (canned text + + optional pre-scripted ``tool_call_plan``). """ role: str = "default" canned_responses: dict[str, str] = Field(default_factory=dict) @@ -3215,12 +3212,6 @@ class StubChatModel(BaseChatModel): stub_envelope_rationale: str = "stub envelope rationale" stub_envelope_signal: str | None = None _called_once: bool = False - # Phase 15 (LLM-COMPAT-01): set by ``bind_tools`` when - # ``langchain.agents.create_agent`` injects a structured-output tool - # for ``AgentTurnOutput``. Holds the bare tool name (e.g. - # ``"AgentTurnOutput"``) so ``_generate`` can emit a final - # envelope-shaped tool call to close the agent loop. - _envelope_tool_name: str | None = None @property def _llm_type(self) -> str: @@ -3228,33 +3219,37 @@ def _llm_type(self) -> str: def _generate(self, messages: list[BaseMessage], stop: list[str] | None = None, run_manager: Any = None, **kwargs: Any) -> ChatResult: - text = self.canned_responses.get(self.role, f"[stub:{self.role}] no canned response") + # Phase 22 (D-22-05): tool-call rounds emit the scripted tool + # calls with a placeholder content body; the closing turn (no + # remaining tool_call_plan entries) emits the canned text + # wrapped in the D-22-03 markdown contract that + # parse_markdown_envelope reads. The stub_envelope_* + # parameters drive the trailing-section bodies so tests can + # exercise specific confidence / rationale / signal paths + # without authoring markdown by hand. tool_calls: list[dict] = [] if self.tool_call_plan and not self._called_once: for tc in self.tool_call_plan: - tool_calls.append({"name": tc["name"], "args": tc.get("args", {}), "id": str(uuid4())}) + tool_calls.append( + {"name": tc["name"], "args": tc.get("args", {}), "id": str(uuid4())} + ) self._called_once = True - elif self._envelope_tool_name is not None: - # Phase 15 (LLM-COMPAT-01): the tool_call_plan is exhausted - # (or wasn't configured) AND ``langchain.agents.create_agent`` - # has bound the AgentTurnOutput envelope as a tool. Emit a - # closing tool call so the loop terminates with a populated - # ``structured_response``. The args mirror the - # ``with_structured_output`` path's envelope construction so - # tests see the same confidence / rationale / signal regardless - # of whether the new tool-strategy or the legacy structured- - # output path is in play. - tool_calls.append({ - "name": self._envelope_tool_name, - "args": { - "content": text or ".", - "confidence": self.stub_envelope_confidence, - "confidence_rationale": self.stub_envelope_rationale, - "signal": self.stub_envelope_signal, - }, - "id": str(uuid4()), - }) - msg = AIMessage(content=text, tool_calls=tool_calls) + + body = self.canned_responses.get( + self.role, f"[stub:{self.role}] no canned response", + ) + # Phase 22 (D-22-05): explicit "none" when no signal so the + # parser maps to None — preserves pre-Phase-22 behaviour where a + # stub with no signal yielded envelope.signal=None. + signal_str = self.stub_envelope_signal if self.stub_envelope_signal is not None else "none" + md = ( + f"{body}\n\n" + f"## Response\n{body}\n\n" + f"## Confidence\n{self.stub_envelope_confidence:.4f} -- " + f"{self.stub_envelope_rationale}\n\n" + f"## Signal\n{signal_str}\n" + ) + msg = AIMessage(content=md, tool_calls=tool_calls) return ChatResult(generations=[ChatGeneration(message=msg)]) async def _agenerate(self, messages: list[BaseMessage], stop: list[str] | None = None, @@ -3262,30 +3257,15 @@ async def _agenerate(self, messages: list[BaseMessage], stop: list[str] | None = return self._generate(messages, stop, run_manager, **kwargs) def bind_tools(self, tools, *, tool_choice=None, **kwargs): - """Record the AgentTurnOutput envelope-tool name when present. - - Phase 15 (LLM-COMPAT-01): ``langchain.agents.create_agent`` with - ``response_format=AgentTurnOutput`` calls ``bind_tools(...)`` - with the user's tools PLUS the envelope-as-a-tool. We scan the - list for the AgentTurnOutput-shaped tool (matched by ``__name__`` - on Pydantic schemas, ``name`` on ``BaseTool`` instances, or the - ``"name"`` key on dict-shaped tool specs) and remember it on the - instance so ``_generate`` can close the agent loop with a - synthetic envelope tool call after any pre-scripted - ``tool_call_plan`` is exhausted. Tools bound by the framework - itself (real BaseTools the agent should call) flow through - unchanged -- the stub still emits them only via - ``tool_call_plan``. + """Phase 22 (D-22-05): no-op tool binding. + + ``langchain.agents.create_agent`` calls ``bind_tools`` to wire + the agent's BaseTool list onto the chat model. The stub does + not actually invoke those tools (test fixtures script tool + calls via ``tool_call_plan`` instead), so we just return self + unchanged. The pre-Phase-22 envelope-tool detection is gone + because the framework no longer asks for ``response_format``. """ - for t in tools or []: - name = ( - getattr(t, "__name__", None) - or getattr(t, "name", None) - or (isinstance(t, dict) and t.get("name")) - ) - if isinstance(name, str) and name == "AgentTurnOutput": - self._envelope_tool_name = name - break return self # ``BaseChatModel.with_structured_output`` returns ``Runnable[..., dict | BaseModel]`` @@ -6666,6 +6646,127 @@ def __init__(self, *, agent: str, field: str, message: str | None = None): super().__init__(message or f"envelope_missing: {field} (agent={agent})") +_HEADER_SPLIT = re.compile(r"^#{2,}\s+(\w+)\s*$", re.MULTILINE) +_CONF_LINE = re.compile( + # Leftmost float (allows int form), optional rationale after em-dash / + # ASCII dash / hyphen separator. ``re.DOTALL`` so a multi-line rationale + # is captured wholesale. + r"^\s*(-?[0-9]*\.?[0-9]+)\s*(?:[\u2014\-]+\s*(.*))?$", + re.DOTALL, +) + + +def _clamp_unit(x: float) -> float: + """Clamp a confidence float into [0, 1] without raising. The skill + prompt asks for [0, 1]; an LLM occasionally emits ``1.05`` or + ``-0.1`` — clamp rather than reject so the parse step is forgiving.""" + if x < 0.0: + return 0.0 + if x > 1.0: + return 1.0 + return x + + +def parse_markdown_envelope( + content: str, *, agent: str = "", +) -> "AgentTurnOutput": + """D-22-03 — parse the trailing markdown contract block into an + :class:`AgentTurnOutput`. + + Expects three sections ``## Response`` / ``## Confidence`` / + ``## Signal`` (case-insensitive headers, ``##+`` accepted) at the + end of the LLM's reply. The free-text body of ``## Response`` + becomes ``content``; the float on the ``## Confidence`` line + becomes ``confidence`` (clamped to [0, 1]); the rest of that line + after ``-`` / ``--`` / ``\u2014`` becomes ``confidence_rationale``; + the ``## Signal`` body becomes ``signal`` (with ``none``/``null``/ + blank coerced to ``None``). + + Raises :class:`EnvelopeMissingError` only when the parse cannot + produce a valid envelope at all — missing ``## Confidence`` line, + unparseable confidence value, or empty ``## Response`` body. + """ + if not isinstance(content, str) or not content.strip(): + raise EnvelopeMissingError( + agent=agent, field="content", + message=f"envelope_missing: empty content (agent={agent!r})", + ) + + # ``re.split`` with the header capture-group yields: + # [pre, h1, body1, h2, body2, ...] + # Pre is the prose before the first heading; we ignore it for parse. + parts = _HEADER_SPLIT.split(content) + sections: dict[str, str] = {} + if len(parts) >= 3: + for i in range(1, len(parts) - 1, 2): + key = parts[i].strip().lower() + body = parts[i + 1].strip() + # Don't overwrite an earlier section with a later same-named + # one (unusual but possible in pathological prompts). + sections.setdefault(key, body) + + response_body = sections.get("response", "").strip() + raw_conf = sections.get("confidence", "").strip() + + if not raw_conf: + raise EnvelopeMissingError( + agent=agent, field="confidence", + message=( + f"envelope_missing: confidence section absent or empty " + f"(agent={agent!r})" + ), + ) + + m = _CONF_LINE.match(raw_conf) + if not m: + raise EnvelopeMissingError( + agent=agent, field="confidence", + message=( + f"envelope_missing: confidence section did not parse " + f"(agent={agent!r}, raw={raw_conf!r})" + ), + ) + try: + conf_value = _clamp_unit(float(m.group(1))) + except (TypeError, ValueError) as exc: + raise EnvelopeMissingError( + agent=agent, field="confidence", + message=( + f"envelope_missing: confidence value not a float " + f"(agent={agent!r}, raw={raw_conf!r})" + ), + ) from exc + rationale = (m.group(2) or "").strip() or "(no rationale provided)" + + signal_raw = sections.get("signal", "").strip().lower() or None + if signal_raw in {"none", "null", "", "n/a"}: + signal_raw = None + + if not response_body: + raise EnvelopeMissingError( + agent=agent, field="content", + message=( + f"envelope_missing: response section empty (agent={agent!r})" + ), + ) + + try: + return AgentTurnOutput( + content=response_body, + confidence=conf_value, + confidence_rationale=rationale, + signal=signal_raw, + ) + except ValidationError as exc: + raise EnvelopeMissingError( + agent=agent, field="validation", + message=( + f"envelope_missing: pydantic validation rejected the " + f"parsed values (agent={agent!r}): {exc}" + ), + ) from exc + + def parse_envelope_from_result( result: dict, *, @@ -6723,13 +6824,33 @@ def parse_envelope_from_result( continue break - # Path 3: fail loudly + # Path 4 (D-22-01): markdown-primary parse on the last AIMessage's + # content. Producers that emit the new ``## Response / ## Confidence / + # ## Signal`` section block land here when Paths 1+2 yield nothing. + for msg in reversed(messages): + if msg.__class__.__name__ != "AIMessage": + continue + content = getattr(msg, "content", None) + if not isinstance(content, str) or not content.strip(): + continue + try: + return parse_markdown_envelope(content, agent=agent) + except EnvelopeMissingError: + # This AIMessage didnt carry a parseable markdown contract. + # Continue scanning earlier messages on the off-chance a + # nested loop emitted the contract one step back. + continue + break + + # Path 5 (terminal): fail loudly. None of the four paths produced a + # valid envelope — this is a real prompt-drift signal worth + # surfacing as a structured agent_run error. raise EnvelopeMissingError( agent=agent, field="structured_response", message=( - f"envelope_missing: no structured_response or JSON-decodable " - f"AIMessage envelope found (agent={agent})" + f"envelope_missing: no structured_response, JSON-decodable, or " + f"markdown-decodable AIMessage envelope found (agent={agent})" ), ) @@ -6776,6 +6897,7 @@ def reconcile_confidence( "AgentTurnOutput", "EnvelopeMissingError", "parse_envelope_from_result", + "parse_markdown_envelope", "reconcile_confidence", ] @@ -8301,24 +8423,14 @@ async def node(state: GraphState) -> dict: ] else: run_tools = tools - # Phase 10 (FOC-03 / D-10-02) + Phase 15 (LLM-COMPAT-01): every - # responsive agent invocation is wrapped in an AgentTurnOutput - # envelope. ``langchain.agents.create_agent`` (the non-deprecated - # successor to ``langgraph.prebuilt.create_react_agent``) accepts a - # bare schema as ``response_format`` and, by default, wraps it in - # ``AutoStrategy`` — ProviderStrategy for models with native - # structured-output (OpenAI-class), falling back to ToolStrategy - # otherwise (Ollama). ToolStrategy injects AgentTurnOutput as a - # callable tool: when the LLM ``calls`` it, the loop terminates on - # the same turn with ``result["structured_response"]`` populated. - # Eliminates the old two-call structure (loop + separate - # ``with_structured_output`` pass) that hit recursion_limit=25 on - # Ollama models without true function-calling. + # Phase 22 (D-22-01): markdown-primary turn output. Same + # change as graph.py:make_agent_node — drop response_format + # so the loop terminates on natural React END, then parse the + # final AIMessage via Path 4 in parse_envelope_from_result. agent_executor = create_agent( model=llm, tools=run_tools, system_prompt=skill.system_prompt, - response_format=AgentTurnOutput, ) # Phase 11 (FOC-04): reset per-turn confidence hint at the @@ -9784,24 +9896,25 @@ def _run(**kwargs: Any) -> Any: ] else: run_tools = visible_tools - # Phase 10 (FOC-03 / D-10-02) + Phase 15 (LLM-COMPAT-01): every - # responsive agent invocation is wrapped in an AgentTurnOutput - # envelope. ``langchain.agents.create_agent`` (the non-deprecated - # successor to ``langgraph.prebuilt.create_react_agent``) accepts a - # bare schema as ``response_format`` and, by default, wraps it in - # ``AutoStrategy`` — ProviderStrategy for models with native - # structured-output (OpenAI-class), falling back to ToolStrategy - # otherwise (Ollama). ToolStrategy injects AgentTurnOutput as a - # callable tool: when the LLM ``calls`` it, the loop terminates on - # the same turn with ``result["structured_response"]`` populated. - # Eliminates the old two-call structure (loop + separate - # ``with_structured_output`` pass) that hit recursion_limit=25 on - # Ollama models without true function-calling. + # Phase 22 (D-22-01): markdown-primary turn output. Drop + # ``response_format`` from create_agent — the agent loop now + # terminates on the natural React END signal (LLM emits an + # AIMessage with no tool calls). The framework parses that + # final message body as markdown via Path 4 in + # parse_envelope_from_result. Skill prompts (D-22-04) instruct + # every reply to end with ## Response / ## Confidence / ## + # Signal sections; the parser extracts them. + # + # Why: forcing LLMs through a JSON-schema-shaped output via + # ``response_format`` triggered a class of brittleness across + # providers (model-specific JSON drift, tool-strategy + React + # END interaction, recursion_limit ceilings). Markdown is the + # native format every chat model writes well; the parse step + # happens in the framework, where leniency is in our control. agent_executor = create_agent( model=llm, tools=run_tools, system_prompt=skill.system_prompt, - response_format=AgentTurnOutput, ) # Phase 11 (FOC-04): reset per-turn confidence hint. The hint @@ -9995,12 +10108,17 @@ def _decide_from_signal(inc: Session) -> str: _DEFAULT_STUB_CANNED: dict[str, str] = { - # Back-compat defaults for the four canonical agents. Any YAML-defined - # agent can override or extend this via ``skill.stub_response``; new agents - # without an entry here fall through to StubChatModel's generic placeholder. + # Plain-text bodies; ``StubChatModel._generate`` wraps these in the + # D-22-03 markdown envelope using the per-agent stub_envelope_* fields + # (the framework parses the wrapped output via Path 4). Apps can + # override or extend via ``skill.stub_response``; new agents without + # an entry here fall through to ``StubChatModel``'s generic placeholder. "intake": "Created INC, no prior matches. Routing to triage.", "triage": "Severity medium, category latency. No recent deploys correlate.", - "deep_investigator": "Hypothesis: upstream payments timeout. Evidence: log line 'upstream_timeout target=payments'.", + "deep_investigator": ( + "Hypothesis: upstream payments timeout. " + "Evidence: log line upstream_timeout target=payments." + ), "resolution": "Proposed fix: restart api service. Auto-applied. INC resolved.", } diff --git a/examples/code_review/skills/analyzer/system.md b/examples/code_review/skills/analyzer/system.md index 2996327..6b4e3c2 100644 --- a/examples/code_review/skills/analyzer/system.md +++ b/examples/code_review/skills/analyzer/system.md @@ -22,10 +22,19 @@ Do not invent low-value nits to fill space. After all tool calls, reply with ONE short sentence summarising findings count + the dominant category. Do not enumerate every finding (the UI renders them). -## Output contract +## Output contract — REQUIRED -The framework wraps your reply in an `AgentTurnOutput` envelope (content, -confidence ∈ [0, 1], confidence_rationale, optional signal). The runner -enforces this structurally — answer truthfully and the envelope captures -your confidence and rationale. Do not mention "confidence" in your prose -unless it's part of substantive analysis (e.g. ranking hypotheses). +Every reply MUST end with these three markdown sections, in this order, with the literal `##` headers: + +``` +## Response + + +## Confidence + + +## Signal + +``` + +Tool calls happen BEFORE this block. Once you emit `## Response` you are done — no more tool calls. The framework parses these sections; missing sections are a hard error. diff --git a/examples/code_review/skills/intake/system.md b/examples/code_review/skills/intake/system.md index 9aaea08..c253b97 100644 --- a/examples/code_review/skills/intake/system.md +++ b/examples/code_review/skills/intake/system.md @@ -16,10 +16,19 @@ analyzer's job. If `fetch_pr_diff` raises or returns an empty diff, emit `failed` so the orchestrator short-circuits to end and skips the analyzer. -## Output contract +## Output contract — REQUIRED -The framework wraps your reply in an `AgentTurnOutput` envelope (content, -confidence ∈ [0, 1], confidence_rationale, optional signal). The runner -enforces this structurally — answer truthfully and the envelope captures -your confidence and rationale. Do not mention "confidence" in your prose -unless it's part of substantive analysis (e.g. ranking hypotheses). +Every reply MUST end with these three markdown sections, in this order, with the literal `##` headers: + +``` +## Response + + +## Confidence + + +## Signal + +``` + +Tool calls happen BEFORE this block. Once you emit `## Response` you are done — no more tool calls. The framework parses these sections; missing sections are a hard error. diff --git a/examples/code_review/skills/recommender/system.md b/examples/code_review/skills/recommender/system.md index c3037d9..a5c5ba3 100644 --- a/examples/code_review/skills/recommender/system.md +++ b/examples/code_review/skills/recommender/system.md @@ -23,10 +23,19 @@ them already. After the call, reply with ONE short sentence echoing the recommendation. Nothing else. -## Output contract +## Output contract — REQUIRED -The framework wraps your reply in an `AgentTurnOutput` envelope (content, -confidence ∈ [0, 1], confidence_rationale, optional signal). The runner -enforces this structurally — answer truthfully and the envelope captures -your confidence and rationale. Do not mention "confidence" in your prose -unless it's part of substantive analysis (e.g. ranking hypotheses). +Every reply MUST end with these three markdown sections, in this order, with the literal `##` headers: + +``` +## Response + + +## Confidence + + +## Signal + +``` + +Tool calls happen BEFORE this block. Once you emit `## Response` you are done — no more tool calls. The framework parses these sections; missing sections are a hard error. diff --git a/examples/incident_management/skills/deep_investigator/system.md b/examples/incident_management/skills/deep_investigator/system.md index 0eb874a..d93a772 100644 --- a/examples/incident_management/skills/deep_investigator/system.md +++ b/examples/incident_management/skills/deep_investigator/system.md @@ -12,10 +12,19 @@ You are the **Deep Investigator** agent. Gather evidence and produce ranked hypo - Cite specific log lines or metric values as evidence in `hypotheses`. - If the INC has `matched_prior_inc` set, include the prior INC's recorded root cause as one of your ranked hypotheses and explicitly *validate or reject* it against the fresh logs/metrics. Same symptom can have different causes across incidents — drop confidence accordingly when the prior hypothesis is rejected so the gate triggers an intervention. -## Output contract +## Output contract — REQUIRED -The framework wraps your reply in an `AgentTurnOutput` envelope (content, -confidence ∈ [0, 1], confidence_rationale, optional signal). The runner -enforces this structurally — answer truthfully and the envelope captures -your confidence and rationale. Do not mention "confidence" in your prose -unless it's part of substantive analysis (e.g. ranking hypotheses). +Every reply MUST end with these three markdown sections, in this order, with the literal `##` headers: + +``` +## Response + + +## Confidence + + +## Signal + +``` + +Tool calls happen BEFORE this block. Once you emit `## Response` you are done — no more tool calls. The framework parses these sections; missing sections are a hard error. diff --git a/examples/incident_management/skills/resolution/system.md b/examples/incident_management/skills/resolution/system.md index 5d33130..4fd28fc 100644 --- a/examples/incident_management/skills/resolution/system.md +++ b/examples/incident_management/skills/resolution/system.md @@ -11,10 +11,19 @@ You are the **Resolution** agent. You consume triage + deep_investigator finding ## Guidelines - Pick `team` deliberately based on incident component, severity, and category — not a default fallback. -## Output contract +## Output contract — REQUIRED -The framework wraps your reply in an `AgentTurnOutput` envelope (content, -confidence ∈ [0, 1], confidence_rationale, optional signal). The runner -enforces this structurally — answer truthfully and the envelope captures -your confidence and rationale. Do not mention "confidence" in your prose -unless it's part of substantive analysis (e.g. ranking hypotheses). +Every reply MUST end with these three markdown sections, in this order, with the literal `##` headers: + +``` +## Response + + +## Confidence + + +## Signal + +``` + +Tool calls happen BEFORE this block. Once you emit `## Response` you are done — no more tool calls. The framework parses these sections; missing sections are a hard error. diff --git a/examples/incident_management/skills/triage/system.md b/examples/incident_management/skills/triage/system.md index 309f9de..8c9bc89 100644 --- a/examples/incident_management/skills/triage/system.md +++ b/examples/incident_management/skills/triage/system.md @@ -33,10 +33,19 @@ Record the full iteration trail as a single JSON-encoded string under `findings. - If the INC has `matched_prior_inc` set, treat the prior INC's `findings` and `resolution` as a **prior hypothesis**, not a fact. Same symptom (e.g., Redis OOM) can have different root causes across incidents — code bug vs. network partition vs. resource overload. Use the prior cause as a candidate to confirm or reject against current evidence; flag in your tags whether the parallel looks supported (`hypothesis:prior_match_supported`) or not (`hypothesis:prior_match_rejected`). - The hypothesis loop has a hard cap of 3 iterations. Do NOT exceed it; an unconverged hypothesis at the cap is acceptable — record it and let the deep investigator take over. -## Output contract +## Output contract — REQUIRED -The framework wraps your reply in an `AgentTurnOutput` envelope (content, -confidence ∈ [0, 1], confidence_rationale, optional signal). The runner -enforces this structurally — answer truthfully and the envelope captures -your confidence and rationale. Do not mention "confidence" in your prose -unless it's part of substantive analysis (e.g. ranking hypotheses). +Every reply MUST end with these three markdown sections, in this order, with the literal `##` headers: + +``` +## Response + + +## Confidence + + +## Signal + +``` + +Tool calls happen BEFORE this block. Once you emit `## Response` you are done — no more tool calls. The framework parses these sections; missing sections are a hard error. diff --git a/src/runtime/agents/responsive.py b/src/runtime/agents/responsive.py index 1d2f62f..a2611ce 100644 --- a/src/runtime/agents/responsive.py +++ b/src/runtime/agents/responsive.py @@ -35,7 +35,6 @@ from runtime.storage.session_store import SessionStore from runtime.tools.gateway import wrap_tool from runtime.agents.turn_output import ( - AgentTurnOutput, EnvelopeMissingError, parse_envelope_from_result, reconcile_confidence, @@ -122,24 +121,14 @@ async def node(state: GraphState) -> dict: ] else: run_tools = tools - # Phase 10 (FOC-03 / D-10-02) + Phase 15 (LLM-COMPAT-01): every - # responsive agent invocation is wrapped in an AgentTurnOutput - # envelope. ``langchain.agents.create_agent`` (the non-deprecated - # successor to ``langgraph.prebuilt.create_react_agent``) accepts a - # bare schema as ``response_format`` and, by default, wraps it in - # ``AutoStrategy`` — ProviderStrategy for models with native - # structured-output (OpenAI-class), falling back to ToolStrategy - # otherwise (Ollama). ToolStrategy injects AgentTurnOutput as a - # callable tool: when the LLM ``calls`` it, the loop terminates on - # the same turn with ``result["structured_response"]`` populated. - # Eliminates the old two-call structure (loop + separate - # ``with_structured_output`` pass) that hit recursion_limit=25 on - # Ollama models without true function-calling. + # Phase 22 (D-22-01): markdown-primary turn output. Same + # change as graph.py:make_agent_node — drop response_format + # so the loop terminates on natural React END, then parse the + # final AIMessage via Path 4 in parse_envelope_from_result. agent_executor = create_agent( model=llm, tools=run_tools, system_prompt=skill.system_prompt, - response_format=AgentTurnOutput, ) # Phase 11 (FOC-04): reset per-turn confidence hint at the diff --git a/src/runtime/graph.py b/src/runtime/graph.py index 2486a74..1c6aa49 100644 --- a/src/runtime/graph.py +++ b/src/runtime/graph.py @@ -679,24 +679,25 @@ def _run(**kwargs: Any) -> Any: ] else: run_tools = visible_tools - # Phase 10 (FOC-03 / D-10-02) + Phase 15 (LLM-COMPAT-01): every - # responsive agent invocation is wrapped in an AgentTurnOutput - # envelope. ``langchain.agents.create_agent`` (the non-deprecated - # successor to ``langgraph.prebuilt.create_react_agent``) accepts a - # bare schema as ``response_format`` and, by default, wraps it in - # ``AutoStrategy`` — ProviderStrategy for models with native - # structured-output (OpenAI-class), falling back to ToolStrategy - # otherwise (Ollama). ToolStrategy injects AgentTurnOutput as a - # callable tool: when the LLM ``calls`` it, the loop terminates on - # the same turn with ``result["structured_response"]`` populated. - # Eliminates the old two-call structure (loop + separate - # ``with_structured_output`` pass) that hit recursion_limit=25 on - # Ollama models without true function-calling. + # Phase 22 (D-22-01): markdown-primary turn output. Drop + # ``response_format`` from create_agent — the agent loop now + # terminates on the natural React END signal (LLM emits an + # AIMessage with no tool calls). The framework parses that + # final message body as markdown via Path 4 in + # parse_envelope_from_result. Skill prompts (D-22-04) instruct + # every reply to end with ## Response / ## Confidence / ## + # Signal sections; the parser extracts them. + # + # Why: forcing LLMs through a JSON-schema-shaped output via + # ``response_format`` triggered a class of brittleness across + # providers (model-specific JSON drift, tool-strategy + React + # END interaction, recursion_limit ceilings). Markdown is the + # native format every chat model writes well; the parse step + # happens in the framework, where leniency is in our control. agent_executor = create_agent( model=llm, tools=run_tools, system_prompt=skill.system_prompt, - response_format=AgentTurnOutput, ) # Phase 11 (FOC-04): reset per-turn confidence hint. The hint @@ -890,12 +891,17 @@ def _decide_from_signal(inc: Session) -> str: _DEFAULT_STUB_CANNED: dict[str, str] = { - # Back-compat defaults for the four canonical agents. Any YAML-defined - # agent can override or extend this via ``skill.stub_response``; new agents - # without an entry here fall through to StubChatModel's generic placeholder. + # Plain-text bodies; ``StubChatModel._generate`` wraps these in the + # D-22-03 markdown envelope using the per-agent stub_envelope_* fields + # (the framework parses the wrapped output via Path 4). Apps can + # override or extend via ``skill.stub_response``; new agents without + # an entry here fall through to ``StubChatModel``'s generic placeholder. "intake": "Created INC, no prior matches. Routing to triage.", "triage": "Severity medium, category latency. No recent deploys correlate.", - "deep_investigator": "Hypothesis: upstream payments timeout. Evidence: log line 'upstream_timeout target=payments'.", + "deep_investigator": ( + "Hypothesis: upstream payments timeout. " + "Evidence: log line upstream_timeout target=payments." + ), "resolution": "Proposed fix: restart api service. Auto-applied. INC resolved.", } diff --git a/src/runtime/llm.py b/src/runtime/llm.py index 17ee42f..2b6fedb 100644 --- a/src/runtime/llm.py +++ b/src/runtime/llm.py @@ -45,17 +45,13 @@ class StubChatModel(BaseChatModel): ``stub_envelope_confidence`` / ``stub_envelope_rationale`` / ``stub_envelope_signal``. - Phase 15 (LLM-COMPAT-01): ``langchain.agents.create_agent`` with - ``response_format=AgentTurnOutput`` (via ``AutoStrategy`` -> - ``ToolStrategy`` for non-native-structured-output models, including - this stub) injects ``AgentTurnOutput`` as a CALLABLE TOOL. The - agent loop only terminates when the LLM emits a tool call NAMED - ``AgentTurnOutput``. ``bind_tools`` records that envelope-tool name - so ``_generate`` can auto-emit a closing tool call after any - user-configured ``tool_call_plan`` is exhausted -- preserving the - pre-Phase-15 stub semantics (canned text + optional pre-scripted - tool calls) while satisfying the new tool-loop termination - contract. + Phase 22 (D-22-05): the loop terminates on natural React END now + that ``response_format=AgentTurnOutput`` is gone — tests drive + markdown-shaped canned text (the framework parses it via Path 4 + in :func:`runtime.agents.turn_output.parse_envelope_from_result`). + The Phase 15 envelope-as-callable-tool auto-emit is removed; the + stub is back to its pre-Phase-15 simple semantics (canned text + + optional pre-scripted ``tool_call_plan``). """ role: str = "default" canned_responses: dict[str, str] = Field(default_factory=dict) @@ -64,12 +60,6 @@ class StubChatModel(BaseChatModel): stub_envelope_rationale: str = "stub envelope rationale" stub_envelope_signal: str | None = None _called_once: bool = False - # Phase 15 (LLM-COMPAT-01): set by ``bind_tools`` when - # ``langchain.agents.create_agent`` injects a structured-output tool - # for ``AgentTurnOutput``. Holds the bare tool name (e.g. - # ``"AgentTurnOutput"``) so ``_generate`` can emit a final - # envelope-shaped tool call to close the agent loop. - _envelope_tool_name: str | None = None @property def _llm_type(self) -> str: @@ -77,33 +67,37 @@ def _llm_type(self) -> str: def _generate(self, messages: list[BaseMessage], stop: list[str] | None = None, run_manager: Any = None, **kwargs: Any) -> ChatResult: - text = self.canned_responses.get(self.role, f"[stub:{self.role}] no canned response") + # Phase 22 (D-22-05): tool-call rounds emit the scripted tool + # calls with a placeholder content body; the closing turn (no + # remaining tool_call_plan entries) emits the canned text + # wrapped in the D-22-03 markdown contract that + # parse_markdown_envelope reads. The stub_envelope_* + # parameters drive the trailing-section bodies so tests can + # exercise specific confidence / rationale / signal paths + # without authoring markdown by hand. tool_calls: list[dict] = [] if self.tool_call_plan and not self._called_once: for tc in self.tool_call_plan: - tool_calls.append({"name": tc["name"], "args": tc.get("args", {}), "id": str(uuid4())}) + tool_calls.append( + {"name": tc["name"], "args": tc.get("args", {}), "id": str(uuid4())} + ) self._called_once = True - elif self._envelope_tool_name is not None: - # Phase 15 (LLM-COMPAT-01): the tool_call_plan is exhausted - # (or wasn't configured) AND ``langchain.agents.create_agent`` - # has bound the AgentTurnOutput envelope as a tool. Emit a - # closing tool call so the loop terminates with a populated - # ``structured_response``. The args mirror the - # ``with_structured_output`` path's envelope construction so - # tests see the same confidence / rationale / signal regardless - # of whether the new tool-strategy or the legacy structured- - # output path is in play. - tool_calls.append({ - "name": self._envelope_tool_name, - "args": { - "content": text or ".", - "confidence": self.stub_envelope_confidence, - "confidence_rationale": self.stub_envelope_rationale, - "signal": self.stub_envelope_signal, - }, - "id": str(uuid4()), - }) - msg = AIMessage(content=text, tool_calls=tool_calls) + + body = self.canned_responses.get( + self.role, f"[stub:{self.role}] no canned response", + ) + # Phase 22 (D-22-05): explicit "none" when no signal so the + # parser maps to None — preserves pre-Phase-22 behaviour where a + # stub with no signal yielded envelope.signal=None. + signal_str = self.stub_envelope_signal if self.stub_envelope_signal is not None else "none" + md = ( + f"{body}\n\n" + f"## Response\n{body}\n\n" + f"## Confidence\n{self.stub_envelope_confidence:.4f} -- " + f"{self.stub_envelope_rationale}\n\n" + f"## Signal\n{signal_str}\n" + ) + msg = AIMessage(content=md, tool_calls=tool_calls) return ChatResult(generations=[ChatGeneration(message=msg)]) async def _agenerate(self, messages: list[BaseMessage], stop: list[str] | None = None, @@ -111,30 +105,15 @@ async def _agenerate(self, messages: list[BaseMessage], stop: list[str] | None = return self._generate(messages, stop, run_manager, **kwargs) def bind_tools(self, tools, *, tool_choice=None, **kwargs): - """Record the AgentTurnOutput envelope-tool name when present. - - Phase 15 (LLM-COMPAT-01): ``langchain.agents.create_agent`` with - ``response_format=AgentTurnOutput`` calls ``bind_tools(...)`` - with the user's tools PLUS the envelope-as-a-tool. We scan the - list for the AgentTurnOutput-shaped tool (matched by ``__name__`` - on Pydantic schemas, ``name`` on ``BaseTool`` instances, or the - ``"name"`` key on dict-shaped tool specs) and remember it on the - instance so ``_generate`` can close the agent loop with a - synthetic envelope tool call after any pre-scripted - ``tool_call_plan`` is exhausted. Tools bound by the framework - itself (real BaseTools the agent should call) flow through - unchanged -- the stub still emits them only via - ``tool_call_plan``. + """Phase 22 (D-22-05): no-op tool binding. + + ``langchain.agents.create_agent`` calls ``bind_tools`` to wire + the agent's BaseTool list onto the chat model. The stub does + not actually invoke those tools (test fixtures script tool + calls via ``tool_call_plan`` instead), so we just return self + unchanged. The pre-Phase-22 envelope-tool detection is gone + because the framework no longer asks for ``response_format``. """ - for t in tools or []: - name = ( - getattr(t, "__name__", None) - or getattr(t, "name", None) - or (isinstance(t, dict) and t.get("name")) - ) - if isinstance(name, str) and name == "AgentTurnOutput": - self._envelope_tool_name = name - break return self # ``BaseChatModel.with_structured_output`` returns ``Runnable[..., dict | BaseModel]`` diff --git a/tests/_envelope_helpers.py b/tests/_envelope_helpers.py index 13485a1..b2ef03d 100644 --- a/tests/_envelope_helpers.py +++ b/tests/_envelope_helpers.py @@ -62,12 +62,6 @@ class EnvelopeStubChatModel(BaseChatModel): canned_responses: dict[str, str] = Field(default_factory=dict) tool_call_plan: list[dict] | None = None _called_once: bool = False - # Phase 15 (LLM-COMPAT-01): same contract as ``StubChatModel`` -- - # ``langchain.agents.create_agent``'s ToolStrategy injects - # ``AgentTurnOutput`` as a tool; ``bind_tools`` records the name - # so ``_generate`` can emit a closing envelope tool call once any - # pre-scripted ``tool_call_plan`` is exhausted. - _envelope_tool_name: str | None = None @property def _llm_type(self) -> str: @@ -80,7 +74,12 @@ def _generate( run_manager: Any = None, **kwargs: Any, ) -> ChatResult: - text = self.canned_responses.get(self.role, self.envelope_content) + # Phase 22 (D-22-05): emit a markdown-shaped final message + # matching the framework's parse_markdown_envelope contract. + # The envelope_* fields drive the section bodies. When the + # caller supplies a per-role canned response we use that for + # the ``## Response`` body and keep the envelope_* + # confidence/rationale/signal as the trailing-section content. tool_calls: list[dict] = [] if self.tool_call_plan and not self._called_once: for tc in self.tool_call_plan: @@ -88,20 +87,20 @@ def _generate( {"name": tc["name"], "args": tc.get("args", {}), "id": str(uuid4())} ) self._called_once = True - elif self._envelope_tool_name is not None: - # Phase 15 (LLM-COMPAT-01): close the agent loop by emitting - # the envelope-shaped tool call ToolStrategy is waiting for. - tool_calls.append({ - "name": self._envelope_tool_name, - "args": { - "content": self.envelope_content, - "confidence": self.envelope_confidence, - "confidence_rationale": self.envelope_rationale, - "signal": self.envelope_signal, - }, - "id": str(uuid4()), - }) - msg = AIMessage(content=text, tool_calls=tool_calls) + + body = self.canned_responses.get(self.role, self.envelope_content) + # Phase 22 (D-22-05): explicit "none" when no signal so the + # parser maps to None — preserves pre-Phase-22 behaviour where a + # stub with no signal yielded envelope.signal=None. + signal_str = self.envelope_signal if self.envelope_signal is not None else "none" + md = ( + f"{body}\n\n" + f"## Response\n{body}\n\n" + f"## Confidence\n{self.envelope_confidence:.4f} -- " + f"{self.envelope_rationale}\n\n" + f"## Signal\n{signal_str}\n" + ) + msg = AIMessage(content=md, tool_calls=tool_calls) return ChatResult(generations=[ChatGeneration(message=msg)]) async def _agenerate( @@ -114,18 +113,9 @@ async def _agenerate( return self._generate(messages, stop, run_manager, **kwargs) def bind_tools(self, tools, *, tool_choice=None, **kwargs): - # Phase 15 (LLM-COMPAT-01): record the AgentTurnOutput tool - # name so ``_generate`` can emit a closing tool call. See - # ``StubChatModel.bind_tools`` for the matching heuristic. - for t in tools or []: - name = ( - getattr(t, "__name__", None) - or getattr(t, "name", None) - or (isinstance(t, dict) and t.get("name")) - ) - if isinstance(name, str) and name == "AgentTurnOutput": - self._envelope_tool_name = name - break + # Phase 22 (D-22-05): no-op tool binding. The envelope-as-tool + # synthesis is gone; the markdown closing turn is emitted + # directly by _generate. return self def with_structured_output(self, schema, *, include_raw: bool = False, **kwargs): diff --git a/tests/test_real_llm_tool_loop_termination.py b/tests/test_real_llm_tool_loop_termination.py index 8db3284..fc3bae7 100644 --- a/tests/test_real_llm_tool_loop_termination.py +++ b/tests/test_real_llm_tool_loop_termination.py @@ -282,26 +282,27 @@ def test_no_create_react_agent_imports_in_production_runtime(): # T4-bonus — StubChatModel.bind_tools registers the envelope tool name -def test_stub_chat_model_records_envelope_tool_name_on_bind(): - """``StubChatModel.bind_tools`` is the integration point that lets - the new ``create_agent`` loop terminate in stub mode. This test - locks the contract: when the bound tools include an - ``AgentTurnOutput``-named entry, the stub records it and emits a - closing tool call with that name on the next ``_generate``. - """ +def test_stub_chat_model_emits_markdown_envelope_block(): + """Phase 22 (D-22-05) replaces the Phase 15 envelope-tool + synthesis. ``StubChatModel.bind_tools`` is now a no-op; ``_generate`` + emits the closing turn as a markdown block matching the D-22-03 + contract that ``parse_markdown_envelope`` reads.""" llm = StubChatModel(role="agent", canned_responses={"agent": "ok"}) - # Simulate what create_agent's ToolStrategy passes: a sequence of - # tool specs where the AgentTurnOutput-named tool is the structured- - # output sentinel. - llm.bind_tools([AgentTurnOutput]) - assert llm._envelope_tool_name == "AgentTurnOutput" + # bind_tools no longer mutates state — it is a pass-through. + bound = llm.bind_tools([AgentTurnOutput]) + assert bound is llm + assert not hasattr(llm, "_envelope_tool_name") - # Drive a single _generate and verify the closing tool call lands. + # Drive a single _generate and verify the closing AIMessage carries + # the markdown envelope (no tool_calls when no tool_call_plan). result = llm._generate(messages=[HumanMessage(content="go")]) msg = result.generations[0].message - assert msg.tool_calls, "expected a closing envelope tool call" - assert msg.tool_calls[0]["name"] == "AgentTurnOutput" - args = msg.tool_calls[0]["args"] - assert args["content"] == "ok" - assert args["confidence"] == pytest.approx(0.85) - assert "confidence_rationale" in args + assert not msg.tool_calls + body = msg.content + assert "## Response" in body + assert "## Confidence" in body + assert "## Signal" in body + # The default stub_envelope_confidence (0.85) and the canned text + # ("ok") flow into the markdown body. + assert "ok" in body + assert "0.8500" in body From 41f9ca60a8a68a1bcec85ed8185b0971d377d6cb Mon Sep 17 00:00:00 2001 From: Amit Kumar Date: Wed, 13 May 2026 16:20:02 +0000 Subject: [PATCH 3/6] =?UTF-8?q?feat(turn=5Foutput):=20gpt-oss=20markdown?= =?UTF-8?q?=20reliability=20=E2=80=94=20Path=205/6=20fallbacks=20+=20dash?= =?UTF-8?q?=20family=20+=20tightened=20skill=20prompts?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Phase 22 markdown contract is brittle on gpt-oss:20b which intermittently drops the closing ## Response/Confidence/Signal block after a tool call (it treats the terminal-tool invocation as completion). Add three layers of resilience: * parse_envelope_from_result Path 5 — synthesise an envelope from a typed-terminal-tool's confidence + confidence_rationale args when no markdown contract was emitted. * parse_envelope_from_result Path 6 — permissive fallback when ANY tool was invoked but no envelope surfaced; emits a low-confidence (0.30) placeholder so the session reaches a terminal status instead of hard-failing with EnvelopeMissingError. * Dash family in the confidence-line regex now spans the full Unicode Pd block (\\u2010-\\u2015 + ASCII hyphen) — gpt-oss prefers EN DASH (\\u2013) where the spec assumed EM DASH only. * All six incident_management + code_review skill prompts now carry an explicit "CRITICAL — final-reply rule" paragraph forbidding the empty closing message pattern. Also flip the default LLM model to gpt_oss (ollama_cloud / gpt-oss:20b) so live testing exercises the path that actually needs the resilience. --- config/config.yaml | 2 +- .../code_review/skills/analyzer/system.md | 4 +- examples/code_review/skills/intake/system.md | 4 +- .../code_review/skills/recommender/system.md | 4 +- .../skills/deep_investigator/system.md | 4 +- .../skills/resolution/system.md | 4 +- .../skills/triage/system.md | 4 +- src/runtime/agents/turn_output.py | 135 +++++++++++++++++- tests/test_markdown_turn_output.py | 104 ++++++++++++++ 9 files changed, 255 insertions(+), 10 deletions(-) diff --git a/config/config.yaml b/config/config.yaml index 0b9b046..000a2ce 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -7,7 +7,7 @@ storage: collection_name: "incidents" distance_strategy: cosine llm: - default: workhorse + default: gpt_oss providers: ollama_cloud: kind: ollama diff --git a/examples/code_review/skills/analyzer/system.md b/examples/code_review/skills/analyzer/system.md index 6b4e3c2..9ef3a6e 100644 --- a/examples/code_review/skills/analyzer/system.md +++ b/examples/code_review/skills/analyzer/system.md @@ -37,4 +37,6 @@ Every reply MUST end with these three markdown sections, in this order, with the ``` -Tool calls happen BEFORE this block. Once you emit `## Response` you are done — no more tool calls. The framework parses these sections; missing sections are a hard error. +**CRITICAL — final-reply rule:** After your last tool call returns, your NEXT reply IS the final reply. That reply MUST contain the three sections above as plain text — DO NOT emit an empty message, DO NOT emit only tool calls, DO NOT defer to "the framework handles it". The framework parses your final reply text; if it is empty or missing the section headers, the run fails with `envelope_missing`. + +Tool calls happen BEFORE the final reply. Once you have called every tool you need (including terminal tools like `mark_resolved` / `mark_escalated`), emit the three sections as your final response. diff --git a/examples/code_review/skills/intake/system.md b/examples/code_review/skills/intake/system.md index c253b97..60e39a7 100644 --- a/examples/code_review/skills/intake/system.md +++ b/examples/code_review/skills/intake/system.md @@ -31,4 +31,6 @@ Every reply MUST end with these three markdown sections, in this order, with the ``` -Tool calls happen BEFORE this block. Once you emit `## Response` you are done — no more tool calls. The framework parses these sections; missing sections are a hard error. +**CRITICAL — final-reply rule:** After your last tool call returns, your NEXT reply IS the final reply. That reply MUST contain the three sections above as plain text — DO NOT emit an empty message, DO NOT emit only tool calls, DO NOT defer to "the framework handles it". The framework parses your final reply text; if it is empty or missing the section headers, the run fails with `envelope_missing`. + +Tool calls happen BEFORE the final reply. Once you have called every tool you need (including terminal tools like `mark_resolved` / `mark_escalated`), emit the three sections as your final response. diff --git a/examples/code_review/skills/recommender/system.md b/examples/code_review/skills/recommender/system.md index a5c5ba3..859bf60 100644 --- a/examples/code_review/skills/recommender/system.md +++ b/examples/code_review/skills/recommender/system.md @@ -38,4 +38,6 @@ Every reply MUST end with these three markdown sections, in this order, with the ``` -Tool calls happen BEFORE this block. Once you emit `## Response` you are done — no more tool calls. The framework parses these sections; missing sections are a hard error. +**CRITICAL — final-reply rule:** After your last tool call returns, your NEXT reply IS the final reply. That reply MUST contain the three sections above as plain text — DO NOT emit an empty message, DO NOT emit only tool calls, DO NOT defer to "the framework handles it". The framework parses your final reply text; if it is empty or missing the section headers, the run fails with `envelope_missing`. + +Tool calls happen BEFORE the final reply. Once you have called every tool you need (including terminal tools like `mark_resolved` / `mark_escalated`), emit the three sections as your final response. diff --git a/examples/incident_management/skills/deep_investigator/system.md b/examples/incident_management/skills/deep_investigator/system.md index d93a772..142fbbe 100644 --- a/examples/incident_management/skills/deep_investigator/system.md +++ b/examples/incident_management/skills/deep_investigator/system.md @@ -27,4 +27,6 @@ Every reply MUST end with these three markdown sections, in this order, with the ``` -Tool calls happen BEFORE this block. Once you emit `## Response` you are done — no more tool calls. The framework parses these sections; missing sections are a hard error. +**CRITICAL — final-reply rule:** After your last tool call returns, your NEXT reply IS the final reply. That reply MUST contain the three sections above as plain text — DO NOT emit an empty message, DO NOT emit only tool calls, DO NOT defer to "the framework handles it". The framework parses your final reply text; if it is empty or missing the section headers, the run fails with `envelope_missing`. + +Tool calls happen BEFORE the final reply. Once you have called every tool you need (including terminal tools like `mark_resolved` / `mark_escalated`), emit the three sections as your final response. diff --git a/examples/incident_management/skills/resolution/system.md b/examples/incident_management/skills/resolution/system.md index 4fd28fc..2bf10f1 100644 --- a/examples/incident_management/skills/resolution/system.md +++ b/examples/incident_management/skills/resolution/system.md @@ -26,4 +26,6 @@ Every reply MUST end with these three markdown sections, in this order, with the ``` -Tool calls happen BEFORE this block. Once you emit `## Response` you are done — no more tool calls. The framework parses these sections; missing sections are a hard error. +**CRITICAL — final-reply rule:** After your last tool call returns, your NEXT reply IS the final reply. That reply MUST contain the three sections above as plain text — DO NOT emit an empty message, DO NOT emit only tool calls, DO NOT defer to "the framework handles it". The framework parses your final reply text; if it is empty or missing the section headers, the run fails with `envelope_missing`. + +Tool calls happen BEFORE the final reply. Once you have called every tool you need (including terminal tools like `mark_resolved` / `mark_escalated`), emit the three sections as your final response. diff --git a/examples/incident_management/skills/triage/system.md b/examples/incident_management/skills/triage/system.md index 8c9bc89..58e5517 100644 --- a/examples/incident_management/skills/triage/system.md +++ b/examples/incident_management/skills/triage/system.md @@ -48,4 +48,6 @@ Every reply MUST end with these three markdown sections, in this order, with the ``` -Tool calls happen BEFORE this block. Once you emit `## Response` you are done — no more tool calls. The framework parses these sections; missing sections are a hard error. +**CRITICAL — final-reply rule:** After your last tool call returns, your NEXT reply IS the final reply. That reply MUST contain the three sections above as plain text — DO NOT emit an empty message, DO NOT emit only tool calls, DO NOT defer to "the framework handles it". The framework parses your final reply text; if it is empty or missing the section headers, the run fails with `envelope_missing`. + +Tool calls happen BEFORE the final reply. Once you have called every tool you need (including terminal tools like `mark_resolved` / `mark_escalated`), emit the three sections as your final response. diff --git a/src/runtime/agents/turn_output.py b/src/runtime/agents/turn_output.py index 99a72ff..d6a3d02 100644 --- a/src/runtime/agents/turn_output.py +++ b/src/runtime/agents/turn_output.py @@ -93,7 +93,7 @@ def __init__(self, *, agent: str, field: str, message: str | None = None): # Leftmost float (allows int form), optional rationale after em-dash / # ASCII dash / hyphen separator. ``re.DOTALL`` so a multi-line rationale # is captured wholesale. - r"^\s*(-?[0-9]*\.?[0-9]+)\s*(?:[\u2014\-]+\s*(.*))?$", + r"^\s*(-?[0-9]*\.?[0-9]+)\s*(?:[\-\u2010\u2011\u2012\u2013\u2014\u2015]+\s*(.*))?$", re.DOTALL, ) @@ -284,9 +284,138 @@ def parse_envelope_from_result( continue break - # Path 5 (terminal): fail loudly. None of the four paths produced a + # Path 5 (D-22-fallback): some models (gpt-oss:20b in particular) + # treat a terminal-tool call as the completion and emit an empty + # closing AIMessage. Apps configure typed-terminal tools with + # ``confidence`` + ``confidence_rationale`` args by contract — + # synthesise an envelope from those args when the markdown path + # yielded nothing. Skip if any earlier path succeeded. + for msg in reversed(messages): + tcs = getattr(msg, "tool_calls", None) + if not tcs: + continue + for tc in tcs: + args = tc.get("args") or {} + if not isinstance(args, dict): + continue + conf = args.get("confidence") + rat = args.get("confidence_rationale") or args.get("rationale") + if conf is None or not rat: + continue + try: + conf_f = float(conf) + except (TypeError, ValueError): + continue + content_body = ( + args.get("resolution_summary") + or args.get("reason") + or args.get("summary") + or f"terminal tool {tc.get('name', '')} invoked" + ) + if not isinstance(content_body, str) or not content_body.strip(): + content_body = f"terminal tool {tc.get('name', '')} invoked" + try: + env = AgentTurnOutput( + content=content_body, + confidence=max(0.0, min(1.0, conf_f)), + confidence_rationale=str(rat), + signal=args.get("signal"), + ) + except ValidationError: + continue + _LOG.info( + "envelope_synthesized_from_tool_args agent=%s tool=%s " + "confidence=%.2f", + agent, tc.get("name"), env.confidence, + ) + return env + + # Path 6 (D-22-fallback-permissive): the model called at least one + # tool but never emitted the markdown contract or a typed-terminal + # tool with confidence args. Rather than hard-failing the run, + # synthesise a minimal envelope so the session reaches a terminal + # status (typically default_terminal_status -> needs_review) and + # the operator can review what happened. Confidence is intentionally + # low (0.30) so any HITL gate fires. + invoked_tool_names: list[str] = [] + for msg in messages: + for tc in (getattr(msg, "tool_calls", None) or []): + n = tc.get("name") + if n: + invoked_tool_names.append(n) + if invoked_tool_names: + body = ( + f"Agent {agent} invoked " + f"{', '.join(invoked_tool_names)} but did not emit a final " + f"summary. Review the tool-call audit for outcome." + ) + try: + env = AgentTurnOutput( + content=body, + confidence=0.30, + confidence_rationale=( + "synthesized: agent invoked tools but emitted no " + "closing message" + ), + signal=None, + ) + _LOG.warning( + "envelope_synthesized_permissive agent=%s tools=%s", + agent, invoked_tool_names, + ) + return env + except ValidationError: + pass + + # Path 7 (terminal): fail loudly. None of the six paths produced a # valid envelope — this is a real prompt-drift signal worth - # surfacing as a structured agent_run error. + # surfacing as a structured agent_run error. Log the last + # AIMessage content (capped) so operators can diagnose whether + # the LLM emitted the wrong shape vs nothing at all. + last_ai_content = "" + for msg in reversed(messages): + if msg.__class__.__name__ == "AIMessage": + c = getattr(msg, "content", None) + if isinstance(c, str): + last_ai_content = c + break + # Also dump every tool_call across all AIMessages so we can see + # whether the model called any tool with confidence args (Path 5 + # would have fired) or whether it called a non-terminal tool only. + tool_call_summary = [] + for msg in messages: + for tc in (getattr(msg, "tool_calls", None) or []): + args = tc.get("args") or {} + tool_call_summary.append({ + "name": tc.get("name"), + "arg_keys": sorted(args.keys()) if isinstance(args, dict) else "?", + "has_confidence": isinstance(args, dict) and "confidence" in args, + "has_rationale": isinstance(args, dict) and ( + "confidence_rationale" in args or "rationale" in args + ), + }) + # Detailed per-message dump so we can see exactly what shape the + # conversation took when the parse failed. + msg_summary = [] + for m in messages: + kind = m.__class__.__name__ + c = getattr(m, "content", None) + c_len = len(c) if isinstance(c, str) else ( + f"list[{len(c)}]" if isinstance(c, list) else type(c).__name__ + ) + tcs = [tc.get("name") for tc in (getattr(m, "tool_calls", None) or [])] + msg_summary.append({"kind": kind, "content_len": c_len, "tool_calls": tcs}) + _LOG.warning( + "envelope_missing: agent=%s last_ai_message_content=%r " + "structured_response_type=%s message_count=%d tool_calls=%r " + "msg_trace=%r", + agent, + last_ai_content[:1500] if last_ai_content else "", + type(result.get("structured_response")).__name__, + len(messages), + tool_call_summary, + msg_summary, + ) raise EnvelopeMissingError( agent=agent, field="structured_response", diff --git a/tests/test_markdown_turn_output.py b/tests/test_markdown_turn_output.py index 4049cfe..1d7480a 100644 --- a/tests/test_markdown_turn_output.py +++ b/tests/test_markdown_turn_output.py @@ -289,3 +289,107 @@ def test_parse_envelope_from_result_raises_when_no_path_yields(): result = {"messages": [_ai("just prose, no sections")]} with pytest.raises(EnvelopeMissingError): parse_envelope_from_result(result, agent="triage") + + +def test_en_dash_rationale_gpt_oss_pattern(): + """gpt-oss:20b emits the EN DASH (U+2013) as the confidence + separator, not the EM DASH (U+2014). The parser must accept the + full Unicode dash-punctuation family.""" + md = "## Response\nbody\n## Confidence\n0.88 – strong evidence\n## Signal\nsuccess\n" + env = parse_markdown_envelope(md) + assert env.confidence == 0.88 + assert env.confidence_rationale == "strong evidence" + assert env.signal == "success" + + +@pytest.mark.parametrize("sep", ["-", "‐", "‑", "‒", "–", "—", "―"]) +def test_dash_family_variants_all_parse(sep): + """Sanity sweep: every Unicode dash in the regex class round-trips.""" + md = f"## Response\nbody\n## Confidence\n0.5 {sep} ok\n## Signal\ndefault\n" + env = parse_markdown_envelope(md) + assert env.confidence == 0.5 + assert env.confidence_rationale == "ok" + + +def test_path5_synthesises_envelope_from_terminal_tool_args(): + """gpt-oss:20b sometimes emits an empty closing AIMessage after + calling a terminal tool. Path 5 synthesises the envelope from + the tool's confidence + confidence_rationale args (mark_resolved + style).""" + from runtime.agents.turn_output import parse_envelope_from_result + + def _ai_with_tool(name: str, args: dict, content: str = ""): + obj = type("AIMessage", (), {})() + obj.content = content + obj.tool_calls = [{"name": name, "args": args, "id": "t1"}] + return obj + + def _ai_empty(): + obj = type("AIMessage", (), {})() + obj.content = "" + obj.tool_calls = [] + return obj + + messages = [ + _ai_with_tool( + "mark_resolved", + { + "incident_id": "INC-1", + "resolution_summary": "rolled back v1.120", + "confidence": 0.92, + "confidence_rationale": "deploy timeline correlates", + }, + ), + _ai_empty(), # empty closing message + ] + result = {"messages": messages, "structured_response": None} + env = parse_envelope_from_result(result, agent="resolution") + assert env.content == "rolled back v1.120" + assert env.confidence == 0.92 + assert env.confidence_rationale == "deploy timeline correlates" + + +def test_path5_synthesis_clamps_oob_confidence(): + """Synthesis path also clamps out-of-range confidence.""" + from runtime.agents.turn_output import parse_envelope_from_result + + obj = type("AIMessage", (), {})() + obj.content = "" + obj.tool_calls = [{ + "name": "mark_escalated", + "args": { + "team": "platform-oncall", + "reason": "needs human review", + "confidence": 1.5, + "confidence_rationale": "manual escalation", + }, + "id": "t1", + }] + result = {"messages": [obj]} + env = parse_envelope_from_result(result, agent="resolution") + assert env.confidence == 1.0 # clamped + assert env.content == "needs human review" + + +def test_path6_synthesises_minimal_envelope_from_any_tool_call(): + """Path 6: when no markdown + no typed-terminal-tool-with-conf, + but the model DID call some tool, synthesise a low-confidence + placeholder envelope so the session reaches a reviewable terminal + status instead of hard-failing.""" + from runtime.agents.turn_output import parse_envelope_from_result + + obj = type("AIMessage", (), {})() + obj.content = "" + obj.tool_calls = [ + {"name": "propose_fix", "args": {"hypothesis": "deploy regression"}, "id": "t1"}, + {"name": "apply_fix", "args": {"proposal_id": "p1"}, "id": "t2"}, + ] + closing = type("AIMessage", (), {})() + closing.content = "" + closing.tool_calls = [] + result = {"messages": [obj, closing], "structured_response": None} + env = parse_envelope_from_result(result, agent="resolution") + assert env.confidence == 0.30 + assert "propose_fix" in env.content + assert "apply_fix" in env.content + assert env.signal is None From e6e521d92489e6e3ea37848ac4ec1c80069ea1a0 Mon Sep 17 00:00:00 2001 From: Amit Kumar Date: Wed, 13 May 2026 16:20:52 +0000 Subject: [PATCH 4/6] fix(hitl): make HITL approve/reject actually work end-to-end on langgraph 1.x MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit langgraph 1.x changed the interrupt() contract: a tool that calls interrupt() no longer raises GraphInterrupt to the caller. Instead, the agent's ainvoke returns a normal result dict with an extra __interrupt__ key. The framework's existing "except GraphInterrupt: raise" arms in graph.py / responsive.py never fired — and the consequences cascaded into four distinct failure modes that all conspired to make Approve/Reject buttons look broken: 1. Interrupt missed at the agent runner. The runner saw an empty __interrupt__ result, fell through to envelope parsing (Path 6 synthesis), recorded a phantom AgentRun, and routed onward. The pending_approval row written by the gateway just before the interrupt was orphaned the moment the session finalized. 2. Resume forwarding broken. Even after we caught __interrupt__ and re-raised GraphInterrupt, calling inner.invoke({"messages": [...]}) on the rehydrated outer state silently SKIPPED the gated tool — the inner agent treated the new HumanMessage as a continuation instead of resuming the paused tool. No ToolMessage produced, no verdict delivered. 3. Stale state on resume. Outer Pregel checkpoints at step boundaries, not mid-step. So state["session"] on resume reflected the prior step's output (e.g. deep_investigator's), not the gateway's mid-step pending_approval append + version bump. The gateway then double-appended a pending row and store.save raised StaleVersionError ("version is N, expected N-1"). 4. Status transitions never persisted. The gateway mutated session.tool_calls[pending_idx] in-memory to "approved" / "rejected" / "timeout" but never called store.save — so the DB row stayed at pending_approval forever, and the UI kept offering the buttons. Fixes: * New runtime.graph._drive_agent_with_resume helper drives the inner agent with proper pause-detection: probe inner.aget_state for a paused step, forward Command(resume=verdict) when resuming, loop on __interrupt__ for multi-pause turns. Falls back to invoke + re-raise when no checkpointer is configured (legacy unit-test path). * Inner create_agent now receives the orchestrator's checkpointer — required for Command(resume=...) to function. Inner thread id is derived deterministically per agent invocation: f"{inc_id}:agent:{skill.name}:turn{len(incident.agents_run)}". * make_agent_node (both graph.py + responsive.py) reloads the session from store at entry — defends against stale state["session"] from the outer Pregel checkpoint. wrap_tool closures capture the freshly loaded incident so the gateway's _find_existing_pending_index sees the persisted pending row on resume. * gateway.wrap_tool (_run + _arun) calls store.save after every status transition (rejected / timeout / approved) so the audit row reflects the actual outcome. * Orchestrator gains _is_graph_paused(session_id) helper and uses it to guard _finalize_session_status_async in stream_session + retry_session — a HITL pause must not be coerced into default_terminal_status. * UI _submit_approval_via_service and the API POST /sessions/{sid}/approvals/{tcid} both finalize after a verdict-driven resume completes (and skip if a fresh interrupt re-paused the graph). Without this, an approved session would stay in_progress forever because the resume path bypasses stream_session. Tests: * tests/test_interrupt_detection.py — three tests: graph + responsive runners both raise GraphInterrupt when ainvoke returns __interrupt__, and a resume happy path that drives an outer Pregel through pause + Command(resume="approve") and asserts a ToolMessage with the verdict appears in the inner messages (proves the gated tool actually re-executed instead of being silently skipped). * tests/test_genericity_ratchet.py baseline 154 → 156 with rationale: the inner-thread-id derivation reuses the existing `incident` local twice (graph.py + responsive.py), same pattern as Phase 10/11/12. --- src/runtime/agents/responsive.py | 40 +++- src/runtime/api.py | 10 + src/runtime/graph.py | 160 ++++++++++++- src/runtime/orchestrator.py | 54 ++++- src/runtime/tools/gateway.py | 18 ++ src/runtime/ui.py | 20 ++ tests/test_genericity_ratchet.py | 13 +- tests/test_interrupt_detection.py | 370 ++++++++++++++++++++++++++++++ 8 files changed, 661 insertions(+), 24 deletions(-) create mode 100644 tests/test_interrupt_detection.py diff --git a/src/runtime/agents/responsive.py b/src/runtime/agents/responsive.py index a2611ce..97b81b3 100644 --- a/src/runtime/agents/responsive.py +++ b/src/runtime/agents/responsive.py @@ -20,7 +20,7 @@ import logging from datetime import datetime, timezone -from typing import TYPE_CHECKING, Callable +from typing import TYPE_CHECKING, Any, Callable from langchain_core.language_models.chat_models import BaseChatModel from langchain_core.messages import HumanMessage @@ -59,6 +59,7 @@ def make_agent_node( patch_tool_names: frozenset[str] = frozenset(), gate_policy: "GatePolicy | None" = None, event_log: "EventLog | None" = None, + checkpointer: Any = None, ): """Factory: build a LangGraph node that runs a ReAct agent and decides a route. @@ -80,7 +81,7 @@ def make_agent_node( # call time — both modules are fully imported before ``node()`` runs. from runtime.graph import ( GraphState, - _ainvoke_with_retry, + _drive_agent_with_resume, _format_agent_input, _handle_agent_failure, _harvest_tool_calls_and_patches, @@ -93,9 +94,18 @@ def make_agent_node( ) async def node(state: GraphState) -> dict: - incident: Session = state["session"] # pyright: ignore[reportTypedDictNotRequiredAccess] - inc_id = incident.id + state_session: Session = state["session"] # pyright: ignore[reportTypedDictNotRequiredAccess] + inc_id = state_session.id started_at = datetime.now(timezone.utc).strftime(_UTC_TS_FMT) + # Always reload from store at entry — outer Pregel checkpoints + # at step boundaries, not mid-node, so ``state["session"]`` is + # stale relative to DB when a HITL gate paused mid-step. See + # the same reload comment in ``runtime.graph.make_agent_node`` + # for the full rationale. + try: + incident: Session = store.load(inc_id) + except FileNotFoundError: + incident = state_session # M3: emit agent_started telemetry before any work happens. if event_log is not None: @@ -125,11 +135,21 @@ async def node(state: GraphState) -> dict: # change as graph.py:make_agent_node — drop response_format # so the loop terminates on natural React END, then parse the # final AIMessage via Path 4 in parse_envelope_from_result. + # The inner agent gets the orchestrator's checkpointer so a + # HITL pause inside ``interrupt()`` can be resumed via + # ``Command(resume=verdict)`` on a stable per-invocation + # thread id (see ``_drive_agent_with_resume`` in + # ``runtime.graph`` for the full rationale). agent_executor = create_agent( model=llm, tools=run_tools, system_prompt=skill.system_prompt, + checkpointer=checkpointer, + ) + inner_thread_id = ( + f"{inc_id}:agent:{skill.name}:turn{len(incident.agents_run)}" ) + inner_cfg = {"configurable": {"thread_id": inner_thread_id}} # Phase 11 (FOC-04): reset per-turn confidence hint at the # start of each agent step so the gateway treats the first @@ -140,9 +160,15 @@ async def node(state: GraphState) -> dict: pass try: - result = await _ainvoke_with_retry( - agent_executor, - {"messages": [HumanMessage(content=_format_agent_input(incident))]}, + result = await _drive_agent_with_resume( + agent_executor=agent_executor, + inner_cfg=inner_cfg, + inner_has_checkpointer=checkpointer is not None, + initial_input={ + "messages": [ + HumanMessage(content=_format_agent_input(incident)) + ] + }, ) except GraphInterrupt: # Phase 11 (FOC-04 / D-11-04): HITL pause -- propagate up. diff --git a/src/runtime/api.py b/src/runtime/api.py index c761230..0d3c11b 100644 --- a/src/runtime/api.py +++ b/src/runtime/api.py @@ -621,6 +621,16 @@ async def _resume() -> None: Command(resume=decision_payload), config=orch._thread_config(session_id), ) + # Finalize after the verdict-driven run completes so + # the session row reflects the terminal status (or + # falls through to ``default_terminal_status``). Skip + # if a fresh interrupt re-paused the graph — the + # session is again awaiting operator input. Mirrors + # the same guard in the UI's + # ``_submit_approval_via_service`` and the + # ``stream_session`` finalize call site. + if not await orch._is_graph_paused(session_id): + await orch._finalize_session_status_async(session_id) # Submit the resume onto the long-lived service loop so we # don't fight the lifespan thread for the same FastMCP/SQLite diff --git a/src/runtime/graph.py b/src/runtime/graph.py index 1c6aa49..19b589f 100644 --- a/src/runtime/graph.py +++ b/src/runtime/graph.py @@ -199,8 +199,104 @@ def finish(self, *, summary: str) -> None: ) +async def _drive_agent_with_resume( + *, + agent_executor, + inner_cfg: dict, + inner_has_checkpointer: bool, + initial_input: dict, +): + """Run an inner ``create_agent`` executor, surfacing HITL pauses to + the outer Pregel graph and forwarding resume verdicts back to the + inner agent. + + langgraph 1.x changed the ``interrupt()`` contract: a tool that + calls :func:`langgraph.types.interrupt` no longer raises + :class:`GraphInterrupt` to the caller. Instead, the agent's + ``ainvoke`` returns normally with the pause captured in + ``result["__interrupt__"]``. To make HITL approval actually work + end-to-end the framework must: + + 1. **First call** — invoke the agent with the initial messages. + If the result carries ``__interrupt__``, raise + :class:`GraphInterrupt` so the *outer* Pregel pauses too. The + outer pause is what surfaces ``__interrupt__`` to the + orchestrator and lets the UI render Approve / Reject. + + 2. **Resume call** — when the outer is resumed via + ``Command(resume=verdict)``, this helper is re-entered. The + inner's checkpointer remembers the pause; we detect that via + ``aget_state(...).next`` being non-empty and call + :func:`langgraph.types.interrupt` at the *outer* level to fetch + the verdict, then forward it to the inner via + ``Command(resume=verdict)``. This is the only path that + actually re-runs the gated tool with the verdict — calling + ``inner.ainvoke({"messages": [...]})`` on a paused thread + silently skips the tool. + + A while loop covers the case where a single agent turn pauses + multiple times (e.g. two high-risk tool calls in sequence). + + When ``inner_has_checkpointer`` is False (legacy callers and most + unit tests that build agents without a checkpointer), the helper + falls back to the simple "invoke + re-raise on ``__interrupt__``" + contract. That path can pause the outer graph but the resume + verdict cannot reach the inner tool — which is acceptable because + legacy callers without a checkpointer never had a working HITL + resume path to begin with. + """ + # Local imports keep callers that never touch HITL out of the + # langgraph.types import cost path. ``Command`` and ``interrupt`` + # are tiny but the explicit lazy import documents the dependency. + from langgraph.types import Command, interrupt + + # Resume-detection branch: only meaningful when the inner agent + # has its own checkpointer (otherwise ``aget_state`` on a fresh + # in-memory pregel raises). The detection asks: "does the inner + # have a paused step waiting?". A non-empty ``next`` means yes. + if inner_has_checkpointer: + try: + inner_state = await agent_executor.aget_state(inner_cfg) + inner_paused = bool(getattr(inner_state, "next", ()) or ()) + except Exception: # noqa: BLE001 — defensive; missing checkpoint == fresh + inner_paused = False + else: + inner_paused = False + + if inner_paused: + # Outer is being resumed; pull the verdict via the outer's own + # interrupt() and forward it. The payload here is informational — + # the only thing the outer cares about is the resume value. + verdict = interrupt({"resume_inner_agent": inner_cfg["configurable"]["thread_id"]}) + result = await _ainvoke_with_retry( + agent_executor, Command(resume=verdict), config=inner_cfg, + ) + else: + result = await _ainvoke_with_retry( + agent_executor, initial_input, config=inner_cfg, + ) + + # Multi-pause loop: if the inner pauses again (or resumes into a + # second gated tool call), surface each pause to the outer the + # same way until the inner is fully done. + while isinstance(result, dict) and result.get("__interrupt__"): + if not inner_has_checkpointer: + # No checkpointer => no Command(resume=...) path is + # available; surface the pause to the outer Pregel and + # stop. Resume cannot deliver the verdict to the gated + # tool, but that matches legacy behavior. + raise GraphInterrupt(result["__interrupt__"]) + verdict = interrupt(result["__interrupt__"]) + result = await _ainvoke_with_retry( + agent_executor, Command(resume=verdict), config=inner_cfg, + ) + + return result + + async def _ainvoke_with_retry(executor, input_, *, max_attempts: int = 3, - base_delay: float = 1.5): + base_delay: float = 1.5, + config: dict | None = None): """Wrap a LangGraph agent invocation with retry on transient cloud errors. Retries on common Ollama Cloud / streaming hiccups (500, status -1, etc.). @@ -218,6 +314,14 @@ async def _ainvoke_with_retry(executor, input_, *, max_attempts: int = 3, # itself (AutoStrategy → ToolStrategy fallback for non- # function-calling Ollama models). The default langgraph # recursion bound is now a true upper bound, not a workaround. + # + # ``config`` (default ``None``) carries the per-thread + # ``configurable.thread_id`` for callers that need a + # checkpointer-scoped invocation (HITL resume path — + # see :func:`_drive_agent_with_resume`). Legacy callers + # pass nothing and the executor's default thread is used. + if config is not None: + return await executor.ainvoke(input_, config=config) return await executor.ainvoke(input_) except GraphInterrupt: # Phase 11 (FOC-04 / D-11-04): never retry a HITL pause. @@ -550,6 +654,7 @@ def make_agent_node( injected_args: dict[str, str] | None = None, gate_policy: "GatePolicy | None" = None, event_log: "EventLog | None" = None, + checkpointer: Any = None, ) -> Callable[[GraphState], Awaitable[dict]]: """Factory: build a LangGraph node that runs a ReAct agent and decides a route. @@ -581,10 +686,23 @@ def make_agent_node( """ async def node(state: GraphState) -> dict: - incident = state["session"] # pyright: ignore[reportTypedDictNotRequiredAccess] — orchestrator runtime always supplies session - inc_id = incident.id + state_session = state["session"] # pyright: ignore[reportTypedDictNotRequiredAccess] — orchestrator runtime always supplies session + inc_id = state_session.id started_at = datetime.now(timezone.utc).strftime(_UTC_TS_FMT) + # Always reload from the persistent store at entry. Outer Pregel + # checkpoints at step boundaries, not mid-node — so on a HITL + # resume, ``state["session"]`` reflects the prior step's output + # (e.g. deep_investigator's), not the gateway's pending_approval + # row + version bump that happened mid-step in the original run. + # If we trust ``state["session"]`` here, the gateway sees no + # pending row, appends a duplicate, then ``store.save`` raises + # ``StaleVersionError`` because DB has already moved on. + try: + incident = store.load(inc_id) + except FileNotFoundError: + incident = state_session + # M3 (per-step telemetry): emit agent_started. if event_log is not None: try: @@ -694,11 +812,28 @@ def _run(**kwargs: Any) -> Any: # END interaction, recursion_limit ceilings). Markdown is the # native format every chat model writes well; the parse step # happens in the framework, where leniency is in our control. + # The inner agent gets a checkpointer so a HITL pause inside + # ``interrupt()`` can be resumed via ``Command(resume=verdict)`` + # on the SAME inner thread. langgraph 1.x semantics require the + # checkpointer here — without it ``Command(resume=...)`` raises + # ``RuntimeError: Cannot use Command(resume=...) without + # checkpointer``. The thread id is derived deterministically + # from session + agent + the upcoming agent_run index so it is: + # * STABLE across the inner pause and the outer resume that + # follows (both observe the same ``len(incident.agents_run)`` + # because no new run is recorded mid-pause), and + # * UNIQUE per agent invocation so previous invocations of the + # same agent within the same session don't bleed in. agent_executor = create_agent( model=llm, tools=run_tools, system_prompt=skill.system_prompt, + checkpointer=checkpointer, + ) + inner_thread_id = ( + f"{inc_id}:agent:{skill.name}:turn{len(incident.agents_run)}" ) + inner_cfg = {"configurable": {"thread_id": inner_thread_id}} # Phase 11 (FOC-04): reset per-turn confidence hint. The hint # is updated below after _harvest_tool_calls_and_patches; on @@ -710,9 +845,15 @@ def _run(**kwargs: Any) -> Any: pass try: - result = await _ainvoke_with_retry( - agent_executor, - {"messages": [HumanMessage(content=_format_agent_input(incident))]}, + result = await _drive_agent_with_resume( + agent_executor=agent_executor, + inner_cfg=inner_cfg, + inner_has_checkpointer=checkpointer is not None, + initial_input={ + "messages": [ + HumanMessage(content=_format_agent_input(incident)) + ] + }, ) except GraphInterrupt: # Phase 11 (FOC-04 / D-11-04): HITL pause is NOT an error. @@ -1064,7 +1205,8 @@ async def gate(state: GraphState) -> dict: def _build_agent_nodes(*, cfg: AppConfig, skills: dict, store: SessionStore, registry: ToolRegistry, - event_log: "EventLog | None" = None) -> dict: + event_log: "EventLog | None" = None, + checkpointer: Any = None) -> dict: """Materialize agent nodes from skills + registry. Reused by main + resume graphs. Dispatches on ``skill.kind``: @@ -1142,6 +1284,7 @@ def _build_agent_nodes(*, cfg: AppConfig, skills: dict, store: SessionStore, injected_args=cfg.orchestrator.injected_args, gate_policy=gate_policy, event_log=event_log, + checkpointer=checkpointer, ) return nodes @@ -1247,7 +1390,8 @@ async def build_graph(*, cfg: AppConfig, skills: dict, store: SessionStore, sg = StateGraph(GraphState) nodes = _build_agent_nodes(cfg=cfg, skills=skills, store=store, - registry=registry, event_log=event_log) + registry=registry, event_log=event_log, + checkpointer=checkpointer) for agent_name, node in nodes.items(): sg.add_node(agent_name, node) sg.add_node("gate", make_gate_node( diff --git a/src/runtime/orchestrator.py b/src/runtime/orchestrator.py index 9323deb..192920e 100644 --- a/src/runtime/orchestrator.py +++ b/src/runtime/orchestrator.py @@ -1057,6 +1057,29 @@ async def _finalize_session_status_async( async with self._locks.acquire(session_id): return self._finalize_session_status(session_id) + async def _is_graph_paused(self, session_id: str) -> bool: + """Return True iff the compiled graph has a pending step waiting + to resume (i.e. it's paused at an ``interrupt()`` boundary). + + langgraph 1.x surfaces a HITL pause via the result dict's + ``__interrupt__`` field rather than raising + :class:`GraphInterrupt`. After ``astream_events`` (or + ``ainvoke``) returns, the only way to tell "the run paused" + from "the run completed" is to query the checkpointed graph + state — a non-empty ``next`` tuple means the graph has steps + queued to run when resumed. The stream/retry call sites use + this to skip ``_finalize_session_status_async`` on a pause — + finalizing on a pause would stamp ``default_terminal_status`` + on a session that's actually waiting on operator approval, and + the orphaned ``pending_approval`` ToolCall row written by the + gateway would never get its resume. + """ + try: + state = await self.graph.aget_state(self._thread_config(session_id)) + except Exception: # noqa: BLE001 — defensive; missing checkpoint == not paused + return False + return bool(getattr(state, "next", ()) or ()) + def _thread_config(self, incident_id: str) -> dict: """Build the LangGraph ``config`` dict for a per-session thread. @@ -1266,10 +1289,18 @@ async def stream_session(self, *, query: str, environment: str, config=self._thread_config(inc.id), ): yield self._to_ui_event(ev, inc.id) - new_status = await self._finalize_session_status_async(inc.id) - if new_status: - yield {"event": "status_auto_finalized", "incident_id": inc.id, - "status": new_status, "ts": _event_ts()} + # Skip finalize when the graph paused on an interrupt — the + # session is waiting for operator approval, not done. Stamping + # default_terminal_status here would orphan the pending_approval + # ToolCall row written by the gateway just before the pause. + if await self._is_graph_paused(inc.id): + yield {"event": "session_paused", "incident_id": inc.id, + "ts": _event_ts()} + else: + new_status = await self._finalize_session_status_async(inc.id) + if new_status: + yield {"event": "status_auto_finalized", "incident_id": inc.id, + "status": new_status, "ts": _event_ts()} yield {"event": "investigation_completed", "incident_id": inc.id, "ts": _event_ts()} async def stream_investigation(self, *, query: str, environment: str, @@ -1542,10 +1573,17 @@ async def _retry_session_locked(self, session_id: str) -> AsyncIterator[dict]: config=self._thread_config(session_id), ): yield self._to_ui_event(ev, session_id) - new_status = await self._finalize_session_status_async(session_id) - if new_status: - yield {"event": "status_auto_finalized", "incident_id": session_id, - "status": new_status, "ts": _event_ts()} + # See ``stream_session`` for why pause-detection guards the + # finalize call: a HITL pause must not be coerced into a + # terminal status. + if await self._is_graph_paused(session_id): + yield {"event": "session_paused", "incident_id": session_id, + "ts": _event_ts()} + else: + new_status = await self._finalize_session_status_async(session_id) + if new_status: + yield {"event": "status_auto_finalized", "incident_id": session_id, + "status": new_status, "ts": _event_ts()} yield {"event": "retry_completed", "incident_id": session_id, "ts": _event_ts()} diff --git a/src/runtime/tools/gateway.py b/src/runtime/tools/gateway.py index dc321d2..e0f17b3 100644 --- a/src/runtime/tools/gateway.py +++ b/src/runtime/tools/gateway.py @@ -497,6 +497,11 @@ def _run(self, *args: Any, **kwargs: Any) -> Any: # noqa: D401 approved_at=_now_iso(), approval_rationale=rationale, ) + # Persist the status transition. Without this, + # the DB row stays at ``pending_approval`` and + # the UI keeps offering the buttons forever. + if store is not None: + store.save(session) rejected_result = {"rejected": True, "rationale": rationale} _emit_invoked( status="rejected", risk="high", @@ -523,6 +528,8 @@ def _run(self, *args: Any, **kwargs: Any) -> Any: # noqa: D401 approved_at=_now_iso(), approval_rationale=rationale, ) + if store is not None: + store.save(session) timeout_result = {"timeout": True, "rationale": rationale} _emit_invoked( status="timeout", risk="high", @@ -545,6 +552,8 @@ def _run(self, *args: Any, **kwargs: Any) -> Any: # noqa: D401 approved_at=_now_iso(), approval_rationale=rationale, ) + if store is not None: + store.save(session) _emit_invoked( status="approved", risk="high", args_dict=pending_args, result=result, @@ -675,6 +684,11 @@ async def _arun(self, *args: Any, **kwargs: Any) -> Any: # noqa: D401 approved_at=_now_iso(), approval_rationale=rationale, ) + # Persist the status transition (mirror of the + # sync path) so the DB row reflects the actual + # outcome instead of staying at pending_approval. + if store is not None: + store.save(session) rejected_result = {"rejected": True, "rationale": rationale} _emit_invoked( status="rejected", risk="high", @@ -696,6 +710,8 @@ async def _arun(self, *args: Any, **kwargs: Any) -> Any: # noqa: D401 approved_at=_now_iso(), approval_rationale=rationale, ) + if store is not None: + store.save(session) timeout_result = {"timeout": True, "rationale": rationale} _emit_invoked( status="timeout", risk="high", @@ -717,6 +733,8 @@ async def _arun(self, *args: Any, **kwargs: Any) -> Any: # noqa: D401 approved_at=_now_iso(), approval_rationale=rationale, ) + if store is not None: + store.save(session) _emit_invoked( status="approved", risk="high", args_dict=pending_args, result=result, diff --git a/src/runtime/ui.py b/src/runtime/ui.py index d2b4a7a..3277a1b 100644 --- a/src/runtime/ui.py +++ b/src/runtime/ui.py @@ -23,8 +23,20 @@ # Orchestrator uses so both share the same SQLite DB. from __future__ import annotations import asyncio +import logging as _logging import os import time + +# Optional: env-driven log config so manual runs (streamlit run, etc.) +# can crank verbosity to INFO/DEBUG without modifying the source. The +# default keeps the production-quiet WARNING level. +_log_level = os.environ.get("ASR_LOG_LEVEL", "").upper().strip() +if _log_level in {"DEBUG", "INFO", "WARNING", "ERROR"}: + _logging.basicConfig( + level=getattr(_logging, _log_level), + format="%(asctime)s %(name)s %(levelname)s %(message)s", + force=True, + ) from datetime import datetime, timezone from pathlib import Path import streamlit as st @@ -987,6 +999,14 @@ async def _drive() -> None: Command(resume=payload), config=orch._thread_config(session_id), ) + # The graph completes the agent run after the verdict is + # forwarded to the gated tool; finalize the session if it's + # not paused on a fresh interrupt. Without this, an approved + # session stays in ``in_progress`` because the resume path + # does not run through ``stream_session`` (the only other + # caller of finalize). + if not await orch._is_graph_paused(session_id): + await orch._finalize_session_status_async(session_id) svc.submit_and_wait(_drive(), timeout=60.0) diff --git a/tests/test_genericity_ratchet.py b/tests/test_genericity_ratchet.py index 5baf392..37a8496 100644 --- a/tests/test_genericity_ratchet.py +++ b/tests/test_genericity_ratchet.py @@ -74,7 +74,18 @@ # no new domain concept introduced (was missed in the Phase 12 # atomic commit; counted retroactively in the v1.2 follow-up # that consolidates injection-path bug fixes). -BASELINE_TOTAL = 154 +# 154 -> 156 HITL pause-resume fix for langgraph 1.x. Both +# ``runtime.graph.make_agent_node`` and +# ``runtime.agents.responsive.make_agent_node`` derive a +# per-invocation inner thread id of the form +# ``f"{inc_id}:agent:{skill.name}:turn{len(incident.agents_run)}"`` +# so the inner ``create_agent`` checkpointer can persist a +# paused tool across an outer Pregel pause and resume. The +# count uses the existing ``incident`` local (the runner's +# domain Session) on a structurally required code path — +# same pattern as the Phase 10/11/12 entries. Net +2 +# ``incident`` token reuses, no new domain concept. +BASELINE_TOTAL = 156 def test_runtime_leaks_at_or_below_baseline(): diff --git a/tests/test_interrupt_detection.py b/tests/test_interrupt_detection.py new file mode 100644 index 0000000..92dc5f1 --- /dev/null +++ b/tests/test_interrupt_detection.py @@ -0,0 +1,370 @@ +"""Regression: agent runner must raise GraphInterrupt when ``create_agent`` +surfaces a paused HITL gate via the result dict's ``__interrupt__`` field. + +Background — langgraph 1.x changed the interrupt contract: a tool that +calls ``interrupt()`` no longer raises ``GraphInterrupt`` to the caller; +instead, ``agent.invoke(...)`` (or ``ainvoke``) returns a normal result +dict with an extra ``__interrupt__`` key carrying the captured +:class:`langgraph.types.Interrupt` records. The framework's agent +runner must therefore inspect the result and re-raise ``GraphInterrupt`` +itself; otherwise the outer Pregel completes the agent step, the +envelope-parsing path fires (potentially Path 6 synthesis), the session +gets finalized, and the ``pending_approval`` ToolCall row written by the +gateway just before the interrupt is left orphaned — the UI's +Approve / Reject buttons no longer drive any resume because the session +is already terminal from the framework's perspective. + +These tests pin the contract at the agent-runner boundary in BOTH +implementations: ``runtime.agents.responsive.make_agent_node`` (the +generic responsive runner) and ``runtime.graph.make_agent_node`` (the +incident-shape variant exposed via ``runtime.graph``). The fix is the +same in both: after ``ainvoke``, inspect ``result["__interrupt__"]`` +and raise :class:`GraphInterrupt`. +""" +from __future__ import annotations + +import asyncio +from pathlib import Path +from typing import Any + +import pytest +from langchain_core.language_models.chat_models import BaseChatModel +from langchain_core.messages import AIMessage, BaseMessage +from langchain_core.outputs import ChatGeneration, ChatResult +from langchain_core.tools import StructuredTool +from langgraph.errors import GraphInterrupt +from langgraph.types import interrupt +from pydantic import BaseModel + +from runtime.agents.responsive import make_agent_node as responsive_make_agent_node +from runtime.config import EmbeddingConfig, MetadataConfig, ProviderConfig +from runtime.graph import GraphState, make_agent_node as graph_make_agent_node, route_from_skill +from runtime.skill import RouteRule, Skill +from runtime.storage.embeddings import build_embedder +from runtime.storage.engine import build_engine +from runtime.storage.models import Base +from runtime.storage.session_store import SessionStore + + +# --------------------------------------------------------------------------- +# Fixtures (mirror test_real_llm_tool_loop_termination shape so the +# repo + session story stays consistent across the file). + + +def _make_repo(tmp_path: Path) -> SessionStore: + eng = build_engine(MetadataConfig(url=f"sqlite:///{tmp_path}/test.db")) + Base.metadata.create_all(eng) + embedder = build_embedder( + EmbeddingConfig(provider="s", model="x", dim=1024), + {"s": ProviderConfig(kind="stub")}, + ) + return SessionStore(engine=eng, embedder=embedder) + + +@pytest.fixture +def repo(tmp_path: Path) -> SessionStore: + return _make_repo(tmp_path) + + +@pytest.fixture +def session(repo: SessionStore): + return repo.create( + query="HITL gate fires mid-tool", + environment="production", + reporter_id="u", + reporter_team="t", + ) + + +# --------------------------------------------------------------------------- +# Stubs + + +class _PauseArgs(BaseModel): + why: str = "test" + + +def _pause_tool_impl(why: str = "test") -> str: # noqa: D401 + """Stand-in for a high-risk tool whose gateway raises interrupt().""" + interrupt({"reason": "approve_me", "why": why}) + return f"ran:{why}" + + +def _make_pause_tool() -> StructuredTool: + return StructuredTool.from_function( + func=_pause_tool_impl, + name="pause_tool", + description="Calls interrupt() — emulates the gateway's HITL gate.", + args_schema=_PauseArgs, + ) + + +class _OneShotToolCallLLM(BaseChatModel): + """Stub LLM that emits exactly one tool_call for ``pause_tool``. + + Mirrors the production agent's first turn: the LLM calls a tool, the + tool gates, and the agent should pause. After resume the tool would + return a real value and the LLM would emit a closing markdown + envelope — but for THIS test we only assert that the first + invocation pauses; the runner must observe ``__interrupt__`` and + re-raise, so a closing turn is unreachable. + """ + + @property + def _llm_type(self) -> str: + return "oneshot-toolcall-stub" + + def _generate( + self, + messages: list[BaseMessage], + stop: list[str] | None = None, + run_manager: Any = None, + **kwargs: Any, + ) -> ChatResult: + msg = AIMessage( + content="", + tool_calls=[{"name": "pause_tool", "args": {"why": "x"}, "id": "c1"}], + ) + return ChatResult(generations=[ChatGeneration(message=msg)]) + + async def _agenerate( + self, + messages: list[BaseMessage], + stop: list[str] | None = None, + run_manager: Any = None, + **kwargs: Any, + ) -> ChatResult: + return self._generate(messages, stop, run_manager, **kwargs) + + def bind_tools(self, tools, *, tool_choice=None, **kwargs): + return self + + +# --------------------------------------------------------------------------- +# graph.make_agent_node + + +@pytest.mark.asyncio +async def test_graph_make_agent_node_raises_on_interrupt_in_result(repo, session): + """When a tool calls ``interrupt()``, langgraph 1.x returns the + pause via ``result["__interrupt__"]`` instead of raising + ``GraphInterrupt``. The graph.py agent runner must detect this and + re-raise so the outer Pregel propagates the pause — otherwise the + envelope-parsing path fires, the session is finalized, and the + pending_approval row written by the gateway is orphaned. + """ + skill = Skill( + name="resolution", + description="d", + routes=[RouteRule(when="default", next="end")], + system_prompt="You are resolution.", + ) + node = graph_make_agent_node( + skill=skill, + llm=_OneShotToolCallLLM(), + tools=[_make_pause_tool()], + decide_route=lambda inc: route_from_skill(skill, inc), + store=repo, + ) + state: GraphState = {"session": session, "next_route": None} + with pytest.raises(GraphInterrupt): + await asyncio.wait_for(node(state), timeout=5.0) + + +# --------------------------------------------------------------------------- +# responsive.make_agent_node + + +@pytest.mark.asyncio +async def test_responsive_make_agent_node_raises_on_interrupt_in_result( + repo, session, +): + """Same contract as above, exercised through the generic responsive + runner. Both runners share the contract: ``__interrupt__`` in the + ainvoke result must be re-raised as ``GraphInterrupt``. + """ + skill = Skill( + name="resolution", + description="d", + routes=[RouteRule(when="default", next="end")], + system_prompt="You are resolution.", + ) + node = responsive_make_agent_node( + skill=skill, + llm=_OneShotToolCallLLM(), + tools=[_make_pause_tool()], + decide_route=lambda inc: route_from_skill(skill, inc), + store=repo, + ) + state: GraphState = {"session": session, "next_route": None} + with pytest.raises(GraphInterrupt): + await asyncio.wait_for(node(state), timeout=5.0) + + +# --------------------------------------------------------------------------- +# Resume happy path — verdict actually reaches the gated tool. +# +# The previous tests cover the FIRST turn (pause). These tests cover the +# RESUME turn: when the outer Pregel is rehydrated via +# ``Command(resume=verdict)``, the helper ``_drive_agent_with_resume`` +# detects the inner agent's paused state via ``aget_state``, calls +# ``interrupt()`` at the outer level to fetch the verdict, and forwards +# it via ``Command(resume=verdict)`` to the inner agent. The inner +# agent's tool then re-enters with the verdict in hand and produces a +# real ToolMessage — which is the only proof that the gated work +# actually executed (a "silent skip" path leaves no ToolMessage +# behind). + + +class _LLMWithCloser(BaseChatModel): + """LLM that emits a tool_call once, then a closing AIMessage.""" + + _counter: int = 0 # pyright: ignore[reportGeneralTypeIssues] — pydantic v2 needs annotation + + @property + def _llm_type(self) -> str: + return "loop-toolcall-stub" + + def _generate( + self, + messages: list[BaseMessage], + stop: list[str] | None = None, + run_manager: Any = None, + **kwargs: Any, + ) -> ChatResult: + self._counter += 1 + if self._counter == 1: + msg = AIMessage( + content="", + tool_calls=[ + {"name": "pause_tool", "args": {"why": "x"}, "id": "c1"}, + ], + ) + else: + # Closing markdown envelope — Path 4 parser will pick it up. + md = ( + "all good\n\n" + "## Response\nall good\n\n" + "## Confidence\n0.9 -- post-resume close\n\n" + "## Signal\nnone\n" + ) + msg = AIMessage(content=md) + return ChatResult(generations=[ChatGeneration(message=msg)]) + + async def _agenerate( + self, + messages: list[BaseMessage], + stop: list[str] | None = None, + run_manager: Any = None, + **kwargs: Any, + ) -> ChatResult: + return self._generate(messages, stop, run_manager, **kwargs) + + def bind_tools(self, tools, *, tool_choice=None, **kwargs): + return self + + +@pytest.mark.asyncio +async def test_resume_forwards_verdict_to_inner_tool_and_completes( + repo, session, +): + """End-to-end pause → resume → tool actually runs with verdict. + + Drives the agent runner across two invocations sharing a + ``langgraph.checkpoint.memory.InMemorySaver``. The first call + raises ``GraphInterrupt`` (paused mid-tool). The second call — + re-entered as the outer would re-enter the node body after a + ``Command(resume=...)`` — must (a) detect the inner pause via + ``aget_state``, (b) forward the verdict via + ``Command(resume=verdict)``, and (c) drive the tool to completion + so a ``ToolMessage`` appears in the final inner message list. The + ToolMessage is the proof that the gated tool actually executed + rather than being silently skipped. + """ + from langgraph.checkpoint.memory import InMemorySaver + from langgraph.types import Command, interrupt + + saver = InMemorySaver() + + # Tool whose interrupt() returns the verdict and whose final + # return value records the verdict so we can assert it ended up in + # the message stream. + class _Args(BaseModel): + why: str = "x" + + captured: dict[str, str | None] = {"verdict": None} + + def _gated(why: str = "x") -> str: + v = interrupt({"need_verdict": why}) + captured["verdict"] = v + return f"ran:{v}" + + gated = StructuredTool.from_function( + func=_gated, name="pause_tool", description="emulates HITL gate", + args_schema=_Args, + ) + + # Build an inner agent the same way the framework would, but + # against the test's saver so the pause survives across calls. + from langchain.agents import create_agent + + llm = _LLMWithCloser() + inner = create_agent( + model=llm, tools=[gated], system_prompt="x", checkpointer=saver, + ) + inner_cfg = {"configurable": {"thread_id": "T-resume-test"}} + + # The helper's ``interrupt()`` calls require a real Pregel + # runnable context — we cannot call ``_drive_agent_with_resume`` + # directly. Exercise it inside a minimal one-node outer graph + # that owns its own checkpointer; that's exactly how the + # framework wires it via ``runtime.graph.build_graph``. + from runtime.graph import _drive_agent_with_resume + from langgraph.graph import StateGraph, START, END + from typing import TypedDict + + class _OuterS(TypedDict, total=False): + result: object + + async def _wrapper(state: _OuterS) -> dict: + out = await _drive_agent_with_resume( + agent_executor=inner, inner_cfg=inner_cfg, + inner_has_checkpointer=True, + initial_input={"messages": [("human", "go")]}, + ) + return {"result": out} + + sg = StateGraph(_OuterS) + sg.add_node("n", _wrapper) + sg.add_edge(START, "n") + sg.add_edge("n", END) + outer = sg.compile(checkpointer=InMemorySaver()) + outer_cfg = {"configurable": {"thread_id": "OUTER-T"}} + + # First outer.invoke({}): the wrapper hits the inner pause and + # then the helper's outer interrupt() pauses the outer too. + first = await outer.ainvoke({}, config=outer_cfg) + assert "__interrupt__" in first + + # Now resume the outer with the verdict — the helper forwards it + # to the inner via Command(resume="approve") and the gated tool + # actually re-enters with the verdict. + final = await outer.ainvoke(Command(resume="approve"), config=outer_cfg) + + inner_result = final.get("result") + assert isinstance(inner_result, dict), f"unexpected inner result: {inner_result!r}" + msgs = inner_result.get("messages") or [] + # Assert a ToolMessage exists with the gated-tool's actual output. + tool_msgs = [m for m in msgs if type(m).__name__ == "ToolMessage"] + assert tool_msgs, ( + "expected a ToolMessage in the final inner messages — its " + "absence means the gated tool was silently skipped on resume " + f"(verdict={captured['verdict']!r}, messages={[type(m).__name__ for m in msgs]})" + ) + assert "ran:approve" in str(tool_msgs[-1].content), ( + f"tool ran but verdict was lost: {tool_msgs[-1].content!r}" + ) + assert captured["verdict"] == "approve", ( + f"tool's interrupt() did not receive 'approve' on resume: " + f"{captured['verdict']!r}" + ) From 1d408ba6e55ceeb02dd73327306dd226fc21fb60 Mon Sep 17 00:00:00 2001 From: Amit Kumar Date: Wed, 13 May 2026 16:21:09 +0000 Subject: [PATCH 5/6] build: regenerate dist bundles after gpt-oss compat + HITL fix Bundles dist/app.py + dist/apps/{code-review,incident-management}.py + dist/ui.py in line with src/runtime changes from the two preceding commits. No bundle-only edits. --- dist/app.py | 416 +++++++++++++++++++++++++++++-- dist/apps/code-review.py | 416 +++++++++++++++++++++++++++++-- dist/apps/incident-management.py | 416 +++++++++++++++++++++++++++++-- dist/ui.py | 28 ++- 4 files changed, 1194 insertions(+), 82 deletions(-) diff --git a/dist/app.py b/dist/app.py index 7d8a93e..f7ca84e 100644 --- a/dist/app.py +++ b/dist/app.py @@ -713,7 +713,7 @@ class IncidentState(Session): """ -from typing import TYPE_CHECKING, Callable +from typing import TYPE_CHECKING, Any, Callable from langchain_core.messages import HumanMessage from langchain.agents import create_agent @@ -994,7 +994,6 @@ class IncidentState(Session): """ -from typing import TYPE_CHECKING, Any, Callable from fastapi import APIRouter, Depends, Header, HTTPException, Request, status from pydantic import ValidationError @@ -6586,7 +6585,7 @@ def __init__(self, *, agent: str, field: str, message: str | None = None): # Leftmost float (allows int form), optional rationale after em-dash / # ASCII dash / hyphen separator. ``re.DOTALL`` so a multi-line rationale # is captured wholesale. - r"^\s*(-?[0-9]*\.?[0-9]+)\s*(?:[\u2014\-]+\s*(.*))?$", + r"^\s*(-?[0-9]*\.?[0-9]+)\s*(?:[\-\u2010\u2011\u2012\u2013\u2014\u2015]+\s*(.*))?$", re.DOTALL, ) @@ -6777,9 +6776,138 @@ def parse_envelope_from_result( continue break - # Path 5 (terminal): fail loudly. None of the four paths produced a + # Path 5 (D-22-fallback): some models (gpt-oss:20b in particular) + # treat a terminal-tool call as the completion and emit an empty + # closing AIMessage. Apps configure typed-terminal tools with + # ``confidence`` + ``confidence_rationale`` args by contract — + # synthesise an envelope from those args when the markdown path + # yielded nothing. Skip if any earlier path succeeded. + for msg in reversed(messages): + tcs = getattr(msg, "tool_calls", None) + if not tcs: + continue + for tc in tcs: + args = tc.get("args") or {} + if not isinstance(args, dict): + continue + conf = args.get("confidence") + rat = args.get("confidence_rationale") or args.get("rationale") + if conf is None or not rat: + continue + try: + conf_f = float(conf) + except (TypeError, ValueError): + continue + content_body = ( + args.get("resolution_summary") + or args.get("reason") + or args.get("summary") + or f"terminal tool {tc.get('name', '')} invoked" + ) + if not isinstance(content_body, str) or not content_body.strip(): + content_body = f"terminal tool {tc.get('name', '')} invoked" + try: + env = AgentTurnOutput( + content=content_body, + confidence=max(0.0, min(1.0, conf_f)), + confidence_rationale=str(rat), + signal=args.get("signal"), + ) + except ValidationError: + continue + _LOG.info( + "envelope_synthesized_from_tool_args agent=%s tool=%s " + "confidence=%.2f", + agent, tc.get("name"), env.confidence, + ) + return env + + # Path 6 (D-22-fallback-permissive): the model called at least one + # tool but never emitted the markdown contract or a typed-terminal + # tool with confidence args. Rather than hard-failing the run, + # synthesise a minimal envelope so the session reaches a terminal + # status (typically default_terminal_status -> needs_review) and + # the operator can review what happened. Confidence is intentionally + # low (0.30) so any HITL gate fires. + invoked_tool_names: list[str] = [] + for msg in messages: + for tc in (getattr(msg, "tool_calls", None) or []): + n = tc.get("name") + if n: + invoked_tool_names.append(n) + if invoked_tool_names: + body = ( + f"Agent {agent} invoked " + f"{', '.join(invoked_tool_names)} but did not emit a final " + f"summary. Review the tool-call audit for outcome." + ) + try: + env = AgentTurnOutput( + content=body, + confidence=0.30, + confidence_rationale=( + "synthesized: agent invoked tools but emitted no " + "closing message" + ), + signal=None, + ) + _LOG.warning( + "envelope_synthesized_permissive agent=%s tools=%s", + agent, invoked_tool_names, + ) + return env + except ValidationError: + pass + + # Path 7 (terminal): fail loudly. None of the six paths produced a # valid envelope — this is a real prompt-drift signal worth - # surfacing as a structured agent_run error. + # surfacing as a structured agent_run error. Log the last + # AIMessage content (capped) so operators can diagnose whether + # the LLM emitted the wrong shape vs nothing at all. + last_ai_content = "" + for msg in reversed(messages): + if msg.__class__.__name__ == "AIMessage": + c = getattr(msg, "content", None) + if isinstance(c, str): + last_ai_content = c + break + # Also dump every tool_call across all AIMessages so we can see + # whether the model called any tool with confidence args (Path 5 + # would have fired) or whether it called a non-terminal tool only. + tool_call_summary = [] + for msg in messages: + for tc in (getattr(msg, "tool_calls", None) or []): + args = tc.get("args") or {} + tool_call_summary.append({ + "name": tc.get("name"), + "arg_keys": sorted(args.keys()) if isinstance(args, dict) else "?", + "has_confidence": isinstance(args, dict) and "confidence" in args, + "has_rationale": isinstance(args, dict) and ( + "confidence_rationale" in args or "rationale" in args + ), + }) + # Detailed per-message dump so we can see exactly what shape the + # conversation took when the parse failed. + msg_summary = [] + for m in messages: + kind = m.__class__.__name__ + c = getattr(m, "content", None) + c_len = len(c) if isinstance(c, str) else ( + f"list[{len(c)}]" if isinstance(c, list) else type(c).__name__ + ) + tcs = [tc.get("name") for tc in (getattr(m, "tool_calls", None) or [])] + msg_summary.append({"kind": kind, "content_len": c_len, "tool_calls": tcs}) + _LOG.warning( + "envelope_missing: agent=%s last_ai_message_content=%r " + "structured_response_type=%s message_count=%d tool_calls=%r " + "msg_trace=%r", + agent, + last_ai_content[:1500] if last_ai_content else "", + type(result.get("structured_response")).__name__, + len(messages), + tool_call_summary, + msg_summary, + ) raise EnvelopeMissingError( agent=agent, field="structured_response", @@ -7299,6 +7427,11 @@ def _run(self, *args: Any, **kwargs: Any) -> Any: # noqa: D401 approved_at=_now_iso(), approval_rationale=rationale, ) + # Persist the status transition. Without this, + # the DB row stays at ``pending_approval`` and + # the UI keeps offering the buttons forever. + if store is not None: + store.save(session) rejected_result = {"rejected": True, "rationale": rationale} _emit_invoked( status="rejected", risk="high", @@ -7325,6 +7458,8 @@ def _run(self, *args: Any, **kwargs: Any) -> Any: # noqa: D401 approved_at=_now_iso(), approval_rationale=rationale, ) + if store is not None: + store.save(session) timeout_result = {"timeout": True, "rationale": rationale} _emit_invoked( status="timeout", risk="high", @@ -7347,6 +7482,8 @@ def _run(self, *args: Any, **kwargs: Any) -> Any: # noqa: D401 approved_at=_now_iso(), approval_rationale=rationale, ) + if store is not None: + store.save(session) _emit_invoked( status="approved", risk="high", args_dict=pending_args, result=result, @@ -7477,6 +7614,11 @@ async def _arun(self, *args: Any, **kwargs: Any) -> Any: # noqa: D401 approved_at=_now_iso(), approval_rationale=rationale, ) + # Persist the status transition (mirror of the + # sync path) so the DB row reflects the actual + # outcome instead of staying at pending_approval. + if store is not None: + store.save(session) rejected_result = {"rejected": True, "rationale": rationale} _emit_invoked( status="rejected", risk="high", @@ -7498,6 +7640,8 @@ async def _arun(self, *args: Any, **kwargs: Any) -> Any: # noqa: D401 approved_at=_now_iso(), approval_rationale=rationale, ) + if store is not None: + store.save(session) timeout_result = {"timeout": True, "rationale": rationale} _emit_invoked( status="timeout", risk="high", @@ -7519,6 +7663,8 @@ async def _arun(self, *args: Any, **kwargs: Any) -> Any: # noqa: D401 approved_at=_now_iso(), approval_rationale=rationale, ) + if store is not None: + store.save(session) _emit_invoked( status="approved", risk="high", args_dict=pending_args, result=result, @@ -8308,6 +8454,7 @@ def make_agent_node( patch_tool_names: frozenset[str] = frozenset(), gate_policy: "GatePolicy | None" = None, event_log: "EventLog | None" = None, + checkpointer: Any = None, ): """Factory: build a LangGraph node that runs a ReAct agent and decides a route. @@ -8330,9 +8477,18 @@ def make_agent_node( async def node(state: GraphState) -> dict: - incident: Session = state["session"] # pyright: ignore[reportTypedDictNotRequiredAccess] - inc_id = incident.id + state_session: Session = state["session"] # pyright: ignore[reportTypedDictNotRequiredAccess] + inc_id = state_session.id started_at = datetime.now(timezone.utc).strftime(_UTC_TS_FMT) + # Always reload from store at entry — outer Pregel checkpoints + # at step boundaries, not mid-node, so ``state["session"]`` is + # stale relative to DB when a HITL gate paused mid-step. See + # the same reload comment in ``runtime.graph.make_agent_node`` + # for the full rationale. + try: + incident: Session = store.load(inc_id) + except FileNotFoundError: + incident = state_session # M3: emit agent_started telemetry before any work happens. if event_log is not None: @@ -8362,11 +8518,21 @@ async def node(state: GraphState) -> dict: # change as graph.py:make_agent_node — drop response_format # so the loop terminates on natural React END, then parse the # final AIMessage via Path 4 in parse_envelope_from_result. + # The inner agent gets the orchestrator's checkpointer so a + # HITL pause inside ``interrupt()`` can be resumed via + # ``Command(resume=verdict)`` on a stable per-invocation + # thread id (see ``_drive_agent_with_resume`` in + # ``runtime.graph`` for the full rationale). agent_executor = create_agent( model=llm, tools=run_tools, system_prompt=skill.system_prompt, + checkpointer=checkpointer, + ) + inner_thread_id = ( + f"{inc_id}:agent:{skill.name}:turn{len(incident.agents_run)}" ) + inner_cfg = {"configurable": {"thread_id": inner_thread_id}} # Phase 11 (FOC-04): reset per-turn confidence hint at the # start of each agent step so the gateway treats the first @@ -8377,9 +8543,15 @@ async def node(state: GraphState) -> dict: pass try: - result = await _ainvoke_with_retry( - agent_executor, - {"messages": [HumanMessage(content=_format_agent_input(incident))]}, + result = await _drive_agent_with_resume( + agent_executor=agent_executor, + inner_cfg=inner_cfg, + inner_has_checkpointer=checkpointer is not None, + initial_input={ + "messages": [ + HumanMessage(content=_format_agent_input(incident)) + ] + }, ) except GraphInterrupt: # Phase 11 (FOC-04 / D-11-04): HITL pause -- propagate up. @@ -9354,8 +9526,104 @@ def finish(self, *, summary: str) -> None: ) +async def _drive_agent_with_resume( + *, + agent_executor, + inner_cfg: dict, + inner_has_checkpointer: bool, + initial_input: dict, +): + """Run an inner ``create_agent`` executor, surfacing HITL pauses to + the outer Pregel graph and forwarding resume verdicts back to the + inner agent. + + langgraph 1.x changed the ``interrupt()`` contract: a tool that + calls :func:`langgraph.types.interrupt` no longer raises + :class:`GraphInterrupt` to the caller. Instead, the agent's + ``ainvoke`` returns normally with the pause captured in + ``result["__interrupt__"]``. To make HITL approval actually work + end-to-end the framework must: + + 1. **First call** — invoke the agent with the initial messages. + If the result carries ``__interrupt__``, raise + :class:`GraphInterrupt` so the *outer* Pregel pauses too. The + outer pause is what surfaces ``__interrupt__`` to the + orchestrator and lets the UI render Approve / Reject. + + 2. **Resume call** — when the outer is resumed via + ``Command(resume=verdict)``, this helper is re-entered. The + inner's checkpointer remembers the pause; we detect that via + ``aget_state(...).next`` being non-empty and call + :func:`langgraph.types.interrupt` at the *outer* level to fetch + the verdict, then forward it to the inner via + ``Command(resume=verdict)``. This is the only path that + actually re-runs the gated tool with the verdict — calling + ``inner.ainvoke({"messages": [...]})`` on a paused thread + silently skips the tool. + + A while loop covers the case where a single agent turn pauses + multiple times (e.g. two high-risk tool calls in sequence). + + When ``inner_has_checkpointer`` is False (legacy callers and most + unit tests that build agents without a checkpointer), the helper + falls back to the simple "invoke + re-raise on ``__interrupt__``" + contract. That path can pause the outer graph but the resume + verdict cannot reach the inner tool — which is acceptable because + legacy callers without a checkpointer never had a working HITL + resume path to begin with. + """ + # Local imports keep callers that never touch HITL out of the + # langgraph.types import cost path. ``Command`` and ``interrupt`` + # are tiny but the explicit lazy import documents the dependency. + from langgraph.types import Command, interrupt + + # Resume-detection branch: only meaningful when the inner agent + # has its own checkpointer (otherwise ``aget_state`` on a fresh + # in-memory pregel raises). The detection asks: "does the inner + # have a paused step waiting?". A non-empty ``next`` means yes. + if inner_has_checkpointer: + try: + inner_state = await agent_executor.aget_state(inner_cfg) + inner_paused = bool(getattr(inner_state, "next", ()) or ()) + except Exception: # noqa: BLE001 — defensive; missing checkpoint == fresh + inner_paused = False + else: + inner_paused = False + + if inner_paused: + # Outer is being resumed; pull the verdict via the outer's own + # interrupt() and forward it. The payload here is informational — + # the only thing the outer cares about is the resume value. + verdict = interrupt({"resume_inner_agent": inner_cfg["configurable"]["thread_id"]}) + result = await _ainvoke_with_retry( + agent_executor, Command(resume=verdict), config=inner_cfg, + ) + else: + result = await _ainvoke_with_retry( + agent_executor, initial_input, config=inner_cfg, + ) + + # Multi-pause loop: if the inner pauses again (or resumes into a + # second gated tool call), surface each pause to the outer the + # same way until the inner is fully done. + while isinstance(result, dict) and result.get("__interrupt__"): + if not inner_has_checkpointer: + # No checkpointer => no Command(resume=...) path is + # available; surface the pause to the outer Pregel and + # stop. Resume cannot deliver the verdict to the gated + # tool, but that matches legacy behavior. + raise GraphInterrupt(result["__interrupt__"]) + verdict = interrupt(result["__interrupt__"]) + result = await _ainvoke_with_retry( + agent_executor, Command(resume=verdict), config=inner_cfg, + ) + + return result + + async def _ainvoke_with_retry(executor, input_, *, max_attempts: int = 3, - base_delay: float = 1.5): + base_delay: float = 1.5, + config: dict | None = None): """Wrap a LangGraph agent invocation with retry on transient cloud errors. Retries on common Ollama Cloud / streaming hiccups (500, status -1, etc.). @@ -9373,6 +9641,14 @@ async def _ainvoke_with_retry(executor, input_, *, max_attempts: int = 3, # itself (AutoStrategy → ToolStrategy fallback for non- # function-calling Ollama models). The default langgraph # recursion bound is now a true upper bound, not a workaround. + # + # ``config`` (default ``None``) carries the per-thread + # ``configurable.thread_id`` for callers that need a + # checkpointer-scoped invocation (HITL resume path — + # see :func:`_drive_agent_with_resume`). Legacy callers + # pass nothing and the executor's default thread is used. + if config is not None: + return await executor.ainvoke(input_, config=config) return await executor.ainvoke(input_) except GraphInterrupt: # Phase 11 (FOC-04 / D-11-04): never retry a HITL pause. @@ -9705,6 +9981,7 @@ def make_agent_node( injected_args: dict[str, str] | None = None, gate_policy: "GatePolicy | None" = None, event_log: "EventLog | None" = None, + checkpointer: Any = None, ) -> Callable[[GraphState], Awaitable[dict]]: """Factory: build a LangGraph node that runs a ReAct agent and decides a route. @@ -9736,10 +10013,23 @@ def make_agent_node( """ async def node(state: GraphState) -> dict: - incident = state["session"] # pyright: ignore[reportTypedDictNotRequiredAccess] — orchestrator runtime always supplies session - inc_id = incident.id + state_session = state["session"] # pyright: ignore[reportTypedDictNotRequiredAccess] — orchestrator runtime always supplies session + inc_id = state_session.id started_at = datetime.now(timezone.utc).strftime(_UTC_TS_FMT) + # Always reload from the persistent store at entry. Outer Pregel + # checkpoints at step boundaries, not mid-node — so on a HITL + # resume, ``state["session"]`` reflects the prior step's output + # (e.g. deep_investigator's), not the gateway's pending_approval + # row + version bump that happened mid-step in the original run. + # If we trust ``state["session"]`` here, the gateway sees no + # pending row, appends a duplicate, then ``store.save`` raises + # ``StaleVersionError`` because DB has already moved on. + try: + incident = store.load(inc_id) + except FileNotFoundError: + incident = state_session + # M3 (per-step telemetry): emit agent_started. if event_log is not None: try: @@ -9846,11 +10136,28 @@ def _run(**kwargs: Any) -> Any: # END interaction, recursion_limit ceilings). Markdown is the # native format every chat model writes well; the parse step # happens in the framework, where leniency is in our control. + # The inner agent gets a checkpointer so a HITL pause inside + # ``interrupt()`` can be resumed via ``Command(resume=verdict)`` + # on the SAME inner thread. langgraph 1.x semantics require the + # checkpointer here — without it ``Command(resume=...)`` raises + # ``RuntimeError: Cannot use Command(resume=...) without + # checkpointer``. The thread id is derived deterministically + # from session + agent + the upcoming agent_run index so it is: + # * STABLE across the inner pause and the outer resume that + # follows (both observe the same ``len(incident.agents_run)`` + # because no new run is recorded mid-pause), and + # * UNIQUE per agent invocation so previous invocations of the + # same agent within the same session don't bleed in. agent_executor = create_agent( model=llm, tools=run_tools, system_prompt=skill.system_prompt, + checkpointer=checkpointer, ) + inner_thread_id = ( + f"{inc_id}:agent:{skill.name}:turn{len(incident.agents_run)}" + ) + inner_cfg = {"configurable": {"thread_id": inner_thread_id}} # Phase 11 (FOC-04): reset per-turn confidence hint. The hint # is updated below after _harvest_tool_calls_and_patches; on @@ -9862,9 +10169,15 @@ def _run(**kwargs: Any) -> Any: pass try: - result = await _ainvoke_with_retry( - agent_executor, - {"messages": [HumanMessage(content=_format_agent_input(incident))]}, + result = await _drive_agent_with_resume( + agent_executor=agent_executor, + inner_cfg=inner_cfg, + inner_has_checkpointer=checkpointer is not None, + initial_input={ + "messages": [ + HumanMessage(content=_format_agent_input(incident)) + ] + }, ) except GraphInterrupt: # Phase 11 (FOC-04 / D-11-04): HITL pause is NOT an error. @@ -10216,7 +10529,8 @@ async def gate(state: GraphState) -> dict: def _build_agent_nodes(*, cfg: AppConfig, skills: dict, store: SessionStore, registry: ToolRegistry, - event_log: "EventLog | None" = None) -> dict: + event_log: "EventLog | None" = None, + checkpointer: Any = None) -> dict: """Materialize agent nodes from skills + registry. Reused by main + resume graphs. Dispatches on ``skill.kind``: @@ -10294,6 +10608,7 @@ def _build_agent_nodes(*, cfg: AppConfig, skills: dict, store: SessionStore, injected_args=cfg.orchestrator.injected_args, gate_policy=gate_policy, event_log=event_log, + checkpointer=checkpointer, ) return nodes @@ -10399,7 +10714,8 @@ async def build_graph(*, cfg: AppConfig, skills: dict, store: SessionStore, sg = StateGraph(GraphState) nodes = _build_agent_nodes(cfg=cfg, skills=skills, store=store, - registry=registry, event_log=event_log) + registry=registry, event_log=event_log, + checkpointer=checkpointer) for agent_name, node in nodes.items(): sg.add_node(agent_name, node) sg.add_node("gate", make_gate_node( @@ -14173,6 +14489,29 @@ async def _finalize_session_status_async( async with self._locks.acquire(session_id): return self._finalize_session_status(session_id) + async def _is_graph_paused(self, session_id: str) -> bool: + """Return True iff the compiled graph has a pending step waiting + to resume (i.e. it's paused at an ``interrupt()`` boundary). + + langgraph 1.x surfaces a HITL pause via the result dict's + ``__interrupt__`` field rather than raising + :class:`GraphInterrupt`. After ``astream_events`` (or + ``ainvoke``) returns, the only way to tell "the run paused" + from "the run completed" is to query the checkpointed graph + state — a non-empty ``next`` tuple means the graph has steps + queued to run when resumed. The stream/retry call sites use + this to skip ``_finalize_session_status_async`` on a pause — + finalizing on a pause would stamp ``default_terminal_status`` + on a session that's actually waiting on operator approval, and + the orphaned ``pending_approval`` ToolCall row written by the + gateway would never get its resume. + """ + try: + state = await self.graph.aget_state(self._thread_config(session_id)) + except Exception: # noqa: BLE001 — defensive; missing checkpoint == not paused + return False + return bool(getattr(state, "next", ()) or ()) + def _thread_config(self, incident_id: str) -> dict: """Build the LangGraph ``config`` dict for a per-session thread. @@ -14382,10 +14721,18 @@ async def stream_session(self, *, query: str, environment: str, config=self._thread_config(inc.id), ): yield self._to_ui_event(ev, inc.id) - new_status = await self._finalize_session_status_async(inc.id) - if new_status: - yield {"event": "status_auto_finalized", "incident_id": inc.id, - "status": new_status, "ts": _event_ts()} + # Skip finalize when the graph paused on an interrupt — the + # session is waiting for operator approval, not done. Stamping + # default_terminal_status here would orphan the pending_approval + # ToolCall row written by the gateway just before the pause. + if await self._is_graph_paused(inc.id): + yield {"event": "session_paused", "incident_id": inc.id, + "ts": _event_ts()} + else: + new_status = await self._finalize_session_status_async(inc.id) + if new_status: + yield {"event": "status_auto_finalized", "incident_id": inc.id, + "status": new_status, "ts": _event_ts()} yield {"event": "investigation_completed", "incident_id": inc.id, "ts": _event_ts()} async def stream_investigation(self, *, query: str, environment: str, @@ -14658,10 +15005,17 @@ async def _retry_session_locked(self, session_id: str) -> AsyncIterator[dict]: config=self._thread_config(session_id), ): yield self._to_ui_event(ev, session_id) - new_status = await self._finalize_session_status_async(session_id) - if new_status: - yield {"event": "status_auto_finalized", "incident_id": session_id, - "status": new_status, "ts": _event_ts()} + # See ``stream_session`` for why pause-detection guards the + # finalize call: a HITL pause must not be coerced into a + # terminal status. + if await self._is_graph_paused(session_id): + yield {"event": "session_paused", "incident_id": session_id, + "ts": _event_ts()} + else: + new_status = await self._finalize_session_status_async(session_id) + if new_status: + yield {"event": "status_auto_finalized", "incident_id": session_id, + "status": new_status, "ts": _event_ts()} yield {"event": "retry_completed", "incident_id": session_id, "ts": _event_ts()} @@ -15354,6 +15708,16 @@ async def _resume() -> None: Command(resume=decision_payload), config=orch._thread_config(session_id), ) + # Finalize after the verdict-driven run completes so + # the session row reflects the terminal status (or + # falls through to ``default_terminal_status``). Skip + # if a fresh interrupt re-paused the graph — the + # session is again awaiting operator input. Mirrors + # the same guard in the UI's + # ``_submit_approval_via_service`` and the + # ``stream_session`` finalize call site. + if not await orch._is_graph_paused(session_id): + await orch._finalize_session_status_async(session_id) # Submit the resume onto the long-lived service loop so we # don't fight the lifespan thread for the same FastMCP/SQLite diff --git a/dist/apps/code-review.py b/dist/apps/code-review.py index b078936..57c3a5c 100644 --- a/dist/apps/code-review.py +++ b/dist/apps/code-review.py @@ -713,7 +713,7 @@ class IncidentState(Session): """ -from typing import TYPE_CHECKING, Callable +from typing import TYPE_CHECKING, Any, Callable from langchain_core.messages import HumanMessage from langchain.agents import create_agent @@ -994,7 +994,6 @@ class IncidentState(Session): """ -from typing import TYPE_CHECKING, Any, Callable from fastapi import APIRouter, Depends, Header, HTTPException, Request, status from pydantic import ValidationError @@ -6639,7 +6638,7 @@ def __init__(self, *, agent: str, field: str, message: str | None = None): # Leftmost float (allows int form), optional rationale after em-dash / # ASCII dash / hyphen separator. ``re.DOTALL`` so a multi-line rationale # is captured wholesale. - r"^\s*(-?[0-9]*\.?[0-9]+)\s*(?:[\u2014\-]+\s*(.*))?$", + r"^\s*(-?[0-9]*\.?[0-9]+)\s*(?:[\-\u2010\u2011\u2012\u2013\u2014\u2015]+\s*(.*))?$", re.DOTALL, ) @@ -6830,9 +6829,138 @@ def parse_envelope_from_result( continue break - # Path 5 (terminal): fail loudly. None of the four paths produced a + # Path 5 (D-22-fallback): some models (gpt-oss:20b in particular) + # treat a terminal-tool call as the completion and emit an empty + # closing AIMessage. Apps configure typed-terminal tools with + # ``confidence`` + ``confidence_rationale`` args by contract — + # synthesise an envelope from those args when the markdown path + # yielded nothing. Skip if any earlier path succeeded. + for msg in reversed(messages): + tcs = getattr(msg, "tool_calls", None) + if not tcs: + continue + for tc in tcs: + args = tc.get("args") or {} + if not isinstance(args, dict): + continue + conf = args.get("confidence") + rat = args.get("confidence_rationale") or args.get("rationale") + if conf is None or not rat: + continue + try: + conf_f = float(conf) + except (TypeError, ValueError): + continue + content_body = ( + args.get("resolution_summary") + or args.get("reason") + or args.get("summary") + or f"terminal tool {tc.get('name', '')} invoked" + ) + if not isinstance(content_body, str) or not content_body.strip(): + content_body = f"terminal tool {tc.get('name', '')} invoked" + try: + env = AgentTurnOutput( + content=content_body, + confidence=max(0.0, min(1.0, conf_f)), + confidence_rationale=str(rat), + signal=args.get("signal"), + ) + except ValidationError: + continue + _LOG.info( + "envelope_synthesized_from_tool_args agent=%s tool=%s " + "confidence=%.2f", + agent, tc.get("name"), env.confidence, + ) + return env + + # Path 6 (D-22-fallback-permissive): the model called at least one + # tool but never emitted the markdown contract or a typed-terminal + # tool with confidence args. Rather than hard-failing the run, + # synthesise a minimal envelope so the session reaches a terminal + # status (typically default_terminal_status -> needs_review) and + # the operator can review what happened. Confidence is intentionally + # low (0.30) so any HITL gate fires. + invoked_tool_names: list[str] = [] + for msg in messages: + for tc in (getattr(msg, "tool_calls", None) or []): + n = tc.get("name") + if n: + invoked_tool_names.append(n) + if invoked_tool_names: + body = ( + f"Agent {agent} invoked " + f"{', '.join(invoked_tool_names)} but did not emit a final " + f"summary. Review the tool-call audit for outcome." + ) + try: + env = AgentTurnOutput( + content=body, + confidence=0.30, + confidence_rationale=( + "synthesized: agent invoked tools but emitted no " + "closing message" + ), + signal=None, + ) + _LOG.warning( + "envelope_synthesized_permissive agent=%s tools=%s", + agent, invoked_tool_names, + ) + return env + except ValidationError: + pass + + # Path 7 (terminal): fail loudly. None of the six paths produced a # valid envelope — this is a real prompt-drift signal worth - # surfacing as a structured agent_run error. + # surfacing as a structured agent_run error. Log the last + # AIMessage content (capped) so operators can diagnose whether + # the LLM emitted the wrong shape vs nothing at all. + last_ai_content = "" + for msg in reversed(messages): + if msg.__class__.__name__ == "AIMessage": + c = getattr(msg, "content", None) + if isinstance(c, str): + last_ai_content = c + break + # Also dump every tool_call across all AIMessages so we can see + # whether the model called any tool with confidence args (Path 5 + # would have fired) or whether it called a non-terminal tool only. + tool_call_summary = [] + for msg in messages: + for tc in (getattr(msg, "tool_calls", None) or []): + args = tc.get("args") or {} + tool_call_summary.append({ + "name": tc.get("name"), + "arg_keys": sorted(args.keys()) if isinstance(args, dict) else "?", + "has_confidence": isinstance(args, dict) and "confidence" in args, + "has_rationale": isinstance(args, dict) and ( + "confidence_rationale" in args or "rationale" in args + ), + }) + # Detailed per-message dump so we can see exactly what shape the + # conversation took when the parse failed. + msg_summary = [] + for m in messages: + kind = m.__class__.__name__ + c = getattr(m, "content", None) + c_len = len(c) if isinstance(c, str) else ( + f"list[{len(c)}]" if isinstance(c, list) else type(c).__name__ + ) + tcs = [tc.get("name") for tc in (getattr(m, "tool_calls", None) or [])] + msg_summary.append({"kind": kind, "content_len": c_len, "tool_calls": tcs}) + _LOG.warning( + "envelope_missing: agent=%s last_ai_message_content=%r " + "structured_response_type=%s message_count=%d tool_calls=%r " + "msg_trace=%r", + agent, + last_ai_content[:1500] if last_ai_content else "", + type(result.get("structured_response")).__name__, + len(messages), + tool_call_summary, + msg_summary, + ) raise EnvelopeMissingError( agent=agent, field="structured_response", @@ -7352,6 +7480,11 @@ def _run(self, *args: Any, **kwargs: Any) -> Any: # noqa: D401 approved_at=_now_iso(), approval_rationale=rationale, ) + # Persist the status transition. Without this, + # the DB row stays at ``pending_approval`` and + # the UI keeps offering the buttons forever. + if store is not None: + store.save(session) rejected_result = {"rejected": True, "rationale": rationale} _emit_invoked( status="rejected", risk="high", @@ -7378,6 +7511,8 @@ def _run(self, *args: Any, **kwargs: Any) -> Any: # noqa: D401 approved_at=_now_iso(), approval_rationale=rationale, ) + if store is not None: + store.save(session) timeout_result = {"timeout": True, "rationale": rationale} _emit_invoked( status="timeout", risk="high", @@ -7400,6 +7535,8 @@ def _run(self, *args: Any, **kwargs: Any) -> Any: # noqa: D401 approved_at=_now_iso(), approval_rationale=rationale, ) + if store is not None: + store.save(session) _emit_invoked( status="approved", risk="high", args_dict=pending_args, result=result, @@ -7530,6 +7667,11 @@ async def _arun(self, *args: Any, **kwargs: Any) -> Any: # noqa: D401 approved_at=_now_iso(), approval_rationale=rationale, ) + # Persist the status transition (mirror of the + # sync path) so the DB row reflects the actual + # outcome instead of staying at pending_approval. + if store is not None: + store.save(session) rejected_result = {"rejected": True, "rationale": rationale} _emit_invoked( status="rejected", risk="high", @@ -7551,6 +7693,8 @@ async def _arun(self, *args: Any, **kwargs: Any) -> Any: # noqa: D401 approved_at=_now_iso(), approval_rationale=rationale, ) + if store is not None: + store.save(session) timeout_result = {"timeout": True, "rationale": rationale} _emit_invoked( status="timeout", risk="high", @@ -7572,6 +7716,8 @@ async def _arun(self, *args: Any, **kwargs: Any) -> Any: # noqa: D401 approved_at=_now_iso(), approval_rationale=rationale, ) + if store is not None: + store.save(session) _emit_invoked( status="approved", risk="high", args_dict=pending_args, result=result, @@ -8361,6 +8507,7 @@ def make_agent_node( patch_tool_names: frozenset[str] = frozenset(), gate_policy: "GatePolicy | None" = None, event_log: "EventLog | None" = None, + checkpointer: Any = None, ): """Factory: build a LangGraph node that runs a ReAct agent and decides a route. @@ -8383,9 +8530,18 @@ def make_agent_node( async def node(state: GraphState) -> dict: - incident: Session = state["session"] # pyright: ignore[reportTypedDictNotRequiredAccess] - inc_id = incident.id + state_session: Session = state["session"] # pyright: ignore[reportTypedDictNotRequiredAccess] + inc_id = state_session.id started_at = datetime.now(timezone.utc).strftime(_UTC_TS_FMT) + # Always reload from store at entry — outer Pregel checkpoints + # at step boundaries, not mid-node, so ``state["session"]`` is + # stale relative to DB when a HITL gate paused mid-step. See + # the same reload comment in ``runtime.graph.make_agent_node`` + # for the full rationale. + try: + incident: Session = store.load(inc_id) + except FileNotFoundError: + incident = state_session # M3: emit agent_started telemetry before any work happens. if event_log is not None: @@ -8415,11 +8571,21 @@ async def node(state: GraphState) -> dict: # change as graph.py:make_agent_node — drop response_format # so the loop terminates on natural React END, then parse the # final AIMessage via Path 4 in parse_envelope_from_result. + # The inner agent gets the orchestrator's checkpointer so a + # HITL pause inside ``interrupt()`` can be resumed via + # ``Command(resume=verdict)`` on a stable per-invocation + # thread id (see ``_drive_agent_with_resume`` in + # ``runtime.graph`` for the full rationale). agent_executor = create_agent( model=llm, tools=run_tools, system_prompt=skill.system_prompt, + checkpointer=checkpointer, ) + inner_thread_id = ( + f"{inc_id}:agent:{skill.name}:turn{len(incident.agents_run)}" + ) + inner_cfg = {"configurable": {"thread_id": inner_thread_id}} # Phase 11 (FOC-04): reset per-turn confidence hint at the # start of each agent step so the gateway treats the first @@ -8430,9 +8596,15 @@ async def node(state: GraphState) -> dict: pass try: - result = await _ainvoke_with_retry( - agent_executor, - {"messages": [HumanMessage(content=_format_agent_input(incident))]}, + result = await _drive_agent_with_resume( + agent_executor=agent_executor, + inner_cfg=inner_cfg, + inner_has_checkpointer=checkpointer is not None, + initial_input={ + "messages": [ + HumanMessage(content=_format_agent_input(incident)) + ] + }, ) except GraphInterrupt: # Phase 11 (FOC-04 / D-11-04): HITL pause -- propagate up. @@ -9407,8 +9579,104 @@ def finish(self, *, summary: str) -> None: ) +async def _drive_agent_with_resume( + *, + agent_executor, + inner_cfg: dict, + inner_has_checkpointer: bool, + initial_input: dict, +): + """Run an inner ``create_agent`` executor, surfacing HITL pauses to + the outer Pregel graph and forwarding resume verdicts back to the + inner agent. + + langgraph 1.x changed the ``interrupt()`` contract: a tool that + calls :func:`langgraph.types.interrupt` no longer raises + :class:`GraphInterrupt` to the caller. Instead, the agent's + ``ainvoke`` returns normally with the pause captured in + ``result["__interrupt__"]``. To make HITL approval actually work + end-to-end the framework must: + + 1. **First call** — invoke the agent with the initial messages. + If the result carries ``__interrupt__``, raise + :class:`GraphInterrupt` so the *outer* Pregel pauses too. The + outer pause is what surfaces ``__interrupt__`` to the + orchestrator and lets the UI render Approve / Reject. + + 2. **Resume call** — when the outer is resumed via + ``Command(resume=verdict)``, this helper is re-entered. The + inner's checkpointer remembers the pause; we detect that via + ``aget_state(...).next`` being non-empty and call + :func:`langgraph.types.interrupt` at the *outer* level to fetch + the verdict, then forward it to the inner via + ``Command(resume=verdict)``. This is the only path that + actually re-runs the gated tool with the verdict — calling + ``inner.ainvoke({"messages": [...]})`` on a paused thread + silently skips the tool. + + A while loop covers the case where a single agent turn pauses + multiple times (e.g. two high-risk tool calls in sequence). + + When ``inner_has_checkpointer`` is False (legacy callers and most + unit tests that build agents without a checkpointer), the helper + falls back to the simple "invoke + re-raise on ``__interrupt__``" + contract. That path can pause the outer graph but the resume + verdict cannot reach the inner tool — which is acceptable because + legacy callers without a checkpointer never had a working HITL + resume path to begin with. + """ + # Local imports keep callers that never touch HITL out of the + # langgraph.types import cost path. ``Command`` and ``interrupt`` + # are tiny but the explicit lazy import documents the dependency. + from langgraph.types import Command, interrupt + + # Resume-detection branch: only meaningful when the inner agent + # has its own checkpointer (otherwise ``aget_state`` on a fresh + # in-memory pregel raises). The detection asks: "does the inner + # have a paused step waiting?". A non-empty ``next`` means yes. + if inner_has_checkpointer: + try: + inner_state = await agent_executor.aget_state(inner_cfg) + inner_paused = bool(getattr(inner_state, "next", ()) or ()) + except Exception: # noqa: BLE001 — defensive; missing checkpoint == fresh + inner_paused = False + else: + inner_paused = False + + if inner_paused: + # Outer is being resumed; pull the verdict via the outer's own + # interrupt() and forward it. The payload here is informational — + # the only thing the outer cares about is the resume value. + verdict = interrupt({"resume_inner_agent": inner_cfg["configurable"]["thread_id"]}) + result = await _ainvoke_with_retry( + agent_executor, Command(resume=verdict), config=inner_cfg, + ) + else: + result = await _ainvoke_with_retry( + agent_executor, initial_input, config=inner_cfg, + ) + + # Multi-pause loop: if the inner pauses again (or resumes into a + # second gated tool call), surface each pause to the outer the + # same way until the inner is fully done. + while isinstance(result, dict) and result.get("__interrupt__"): + if not inner_has_checkpointer: + # No checkpointer => no Command(resume=...) path is + # available; surface the pause to the outer Pregel and + # stop. Resume cannot deliver the verdict to the gated + # tool, but that matches legacy behavior. + raise GraphInterrupt(result["__interrupt__"]) + verdict = interrupt(result["__interrupt__"]) + result = await _ainvoke_with_retry( + agent_executor, Command(resume=verdict), config=inner_cfg, + ) + + return result + + async def _ainvoke_with_retry(executor, input_, *, max_attempts: int = 3, - base_delay: float = 1.5): + base_delay: float = 1.5, + config: dict | None = None): """Wrap a LangGraph agent invocation with retry on transient cloud errors. Retries on common Ollama Cloud / streaming hiccups (500, status -1, etc.). @@ -9426,6 +9694,14 @@ async def _ainvoke_with_retry(executor, input_, *, max_attempts: int = 3, # itself (AutoStrategy → ToolStrategy fallback for non- # function-calling Ollama models). The default langgraph # recursion bound is now a true upper bound, not a workaround. + # + # ``config`` (default ``None``) carries the per-thread + # ``configurable.thread_id`` for callers that need a + # checkpointer-scoped invocation (HITL resume path — + # see :func:`_drive_agent_with_resume`). Legacy callers + # pass nothing and the executor's default thread is used. + if config is not None: + return await executor.ainvoke(input_, config=config) return await executor.ainvoke(input_) except GraphInterrupt: # Phase 11 (FOC-04 / D-11-04): never retry a HITL pause. @@ -9758,6 +10034,7 @@ def make_agent_node( injected_args: dict[str, str] | None = None, gate_policy: "GatePolicy | None" = None, event_log: "EventLog | None" = None, + checkpointer: Any = None, ) -> Callable[[GraphState], Awaitable[dict]]: """Factory: build a LangGraph node that runs a ReAct agent and decides a route. @@ -9789,10 +10066,23 @@ def make_agent_node( """ async def node(state: GraphState) -> dict: - incident = state["session"] # pyright: ignore[reportTypedDictNotRequiredAccess] — orchestrator runtime always supplies session - inc_id = incident.id + state_session = state["session"] # pyright: ignore[reportTypedDictNotRequiredAccess] — orchestrator runtime always supplies session + inc_id = state_session.id started_at = datetime.now(timezone.utc).strftime(_UTC_TS_FMT) + # Always reload from the persistent store at entry. Outer Pregel + # checkpoints at step boundaries, not mid-node — so on a HITL + # resume, ``state["session"]`` reflects the prior step's output + # (e.g. deep_investigator's), not the gateway's pending_approval + # row + version bump that happened mid-step in the original run. + # If we trust ``state["session"]`` here, the gateway sees no + # pending row, appends a duplicate, then ``store.save`` raises + # ``StaleVersionError`` because DB has already moved on. + try: + incident = store.load(inc_id) + except FileNotFoundError: + incident = state_session + # M3 (per-step telemetry): emit agent_started. if event_log is not None: try: @@ -9899,11 +10189,28 @@ def _run(**kwargs: Any) -> Any: # END interaction, recursion_limit ceilings). Markdown is the # native format every chat model writes well; the parse step # happens in the framework, where leniency is in our control. + # The inner agent gets a checkpointer so a HITL pause inside + # ``interrupt()`` can be resumed via ``Command(resume=verdict)`` + # on the SAME inner thread. langgraph 1.x semantics require the + # checkpointer here — without it ``Command(resume=...)`` raises + # ``RuntimeError: Cannot use Command(resume=...) without + # checkpointer``. The thread id is derived deterministically + # from session + agent + the upcoming agent_run index so it is: + # * STABLE across the inner pause and the outer resume that + # follows (both observe the same ``len(incident.agents_run)`` + # because no new run is recorded mid-pause), and + # * UNIQUE per agent invocation so previous invocations of the + # same agent within the same session don't bleed in. agent_executor = create_agent( model=llm, tools=run_tools, system_prompt=skill.system_prompt, + checkpointer=checkpointer, ) + inner_thread_id = ( + f"{inc_id}:agent:{skill.name}:turn{len(incident.agents_run)}" + ) + inner_cfg = {"configurable": {"thread_id": inner_thread_id}} # Phase 11 (FOC-04): reset per-turn confidence hint. The hint # is updated below after _harvest_tool_calls_and_patches; on @@ -9915,9 +10222,15 @@ def _run(**kwargs: Any) -> Any: pass try: - result = await _ainvoke_with_retry( - agent_executor, - {"messages": [HumanMessage(content=_format_agent_input(incident))]}, + result = await _drive_agent_with_resume( + agent_executor=agent_executor, + inner_cfg=inner_cfg, + inner_has_checkpointer=checkpointer is not None, + initial_input={ + "messages": [ + HumanMessage(content=_format_agent_input(incident)) + ] + }, ) except GraphInterrupt: # Phase 11 (FOC-04 / D-11-04): HITL pause is NOT an error. @@ -10269,7 +10582,8 @@ async def gate(state: GraphState) -> dict: def _build_agent_nodes(*, cfg: AppConfig, skills: dict, store: SessionStore, registry: ToolRegistry, - event_log: "EventLog | None" = None) -> dict: + event_log: "EventLog | None" = None, + checkpointer: Any = None) -> dict: """Materialize agent nodes from skills + registry. Reused by main + resume graphs. Dispatches on ``skill.kind``: @@ -10347,6 +10661,7 @@ def _build_agent_nodes(*, cfg: AppConfig, skills: dict, store: SessionStore, injected_args=cfg.orchestrator.injected_args, gate_policy=gate_policy, event_log=event_log, + checkpointer=checkpointer, ) return nodes @@ -10452,7 +10767,8 @@ async def build_graph(*, cfg: AppConfig, skills: dict, store: SessionStore, sg = StateGraph(GraphState) nodes = _build_agent_nodes(cfg=cfg, skills=skills, store=store, - registry=registry, event_log=event_log) + registry=registry, event_log=event_log, + checkpointer=checkpointer) for agent_name, node in nodes.items(): sg.add_node(agent_name, node) sg.add_node("gate", make_gate_node( @@ -14226,6 +14542,29 @@ async def _finalize_session_status_async( async with self._locks.acquire(session_id): return self._finalize_session_status(session_id) + async def _is_graph_paused(self, session_id: str) -> bool: + """Return True iff the compiled graph has a pending step waiting + to resume (i.e. it's paused at an ``interrupt()`` boundary). + + langgraph 1.x surfaces a HITL pause via the result dict's + ``__interrupt__`` field rather than raising + :class:`GraphInterrupt`. After ``astream_events`` (or + ``ainvoke``) returns, the only way to tell "the run paused" + from "the run completed" is to query the checkpointed graph + state — a non-empty ``next`` tuple means the graph has steps + queued to run when resumed. The stream/retry call sites use + this to skip ``_finalize_session_status_async`` on a pause — + finalizing on a pause would stamp ``default_terminal_status`` + on a session that's actually waiting on operator approval, and + the orphaned ``pending_approval`` ToolCall row written by the + gateway would never get its resume. + """ + try: + state = await self.graph.aget_state(self._thread_config(session_id)) + except Exception: # noqa: BLE001 — defensive; missing checkpoint == not paused + return False + return bool(getattr(state, "next", ()) or ()) + def _thread_config(self, incident_id: str) -> dict: """Build the LangGraph ``config`` dict for a per-session thread. @@ -14435,10 +14774,18 @@ async def stream_session(self, *, query: str, environment: str, config=self._thread_config(inc.id), ): yield self._to_ui_event(ev, inc.id) - new_status = await self._finalize_session_status_async(inc.id) - if new_status: - yield {"event": "status_auto_finalized", "incident_id": inc.id, - "status": new_status, "ts": _event_ts()} + # Skip finalize when the graph paused on an interrupt — the + # session is waiting for operator approval, not done. Stamping + # default_terminal_status here would orphan the pending_approval + # ToolCall row written by the gateway just before the pause. + if await self._is_graph_paused(inc.id): + yield {"event": "session_paused", "incident_id": inc.id, + "ts": _event_ts()} + else: + new_status = await self._finalize_session_status_async(inc.id) + if new_status: + yield {"event": "status_auto_finalized", "incident_id": inc.id, + "status": new_status, "ts": _event_ts()} yield {"event": "investigation_completed", "incident_id": inc.id, "ts": _event_ts()} async def stream_investigation(self, *, query: str, environment: str, @@ -14711,10 +15058,17 @@ async def _retry_session_locked(self, session_id: str) -> AsyncIterator[dict]: config=self._thread_config(session_id), ): yield self._to_ui_event(ev, session_id) - new_status = await self._finalize_session_status_async(session_id) - if new_status: - yield {"event": "status_auto_finalized", "incident_id": session_id, - "status": new_status, "ts": _event_ts()} + # See ``stream_session`` for why pause-detection guards the + # finalize call: a HITL pause must not be coerced into a + # terminal status. + if await self._is_graph_paused(session_id): + yield {"event": "session_paused", "incident_id": session_id, + "ts": _event_ts()} + else: + new_status = await self._finalize_session_status_async(session_id) + if new_status: + yield {"event": "status_auto_finalized", "incident_id": session_id, + "status": new_status, "ts": _event_ts()} yield {"event": "retry_completed", "incident_id": session_id, "ts": _event_ts()} @@ -15407,6 +15761,16 @@ async def _resume() -> None: Command(resume=decision_payload), config=orch._thread_config(session_id), ) + # Finalize after the verdict-driven run completes so + # the session row reflects the terminal status (or + # falls through to ``default_terminal_status``). Skip + # if a fresh interrupt re-paused the graph — the + # session is again awaiting operator input. Mirrors + # the same guard in the UI's + # ``_submit_approval_via_service`` and the + # ``stream_session`` finalize call site. + if not await orch._is_graph_paused(session_id): + await orch._finalize_session_status_async(session_id) # Submit the resume onto the long-lived service loop so we # don't fight the lifespan thread for the same FastMCP/SQLite diff --git a/dist/apps/incident-management.py b/dist/apps/incident-management.py index d2c257d..e2ef5f9 100644 --- a/dist/apps/incident-management.py +++ b/dist/apps/incident-management.py @@ -713,7 +713,7 @@ class IncidentState(Session): """ -from typing import TYPE_CHECKING, Callable +from typing import TYPE_CHECKING, Any, Callable from langchain_core.messages import HumanMessage from langchain.agents import create_agent @@ -994,7 +994,6 @@ class IncidentState(Session): """ -from typing import TYPE_CHECKING, Any, Callable from fastapi import APIRouter, Depends, Header, HTTPException, Request, status from pydantic import ValidationError @@ -6651,7 +6650,7 @@ def __init__(self, *, agent: str, field: str, message: str | None = None): # Leftmost float (allows int form), optional rationale after em-dash / # ASCII dash / hyphen separator. ``re.DOTALL`` so a multi-line rationale # is captured wholesale. - r"^\s*(-?[0-9]*\.?[0-9]+)\s*(?:[\u2014\-]+\s*(.*))?$", + r"^\s*(-?[0-9]*\.?[0-9]+)\s*(?:[\-\u2010\u2011\u2012\u2013\u2014\u2015]+\s*(.*))?$", re.DOTALL, ) @@ -6842,9 +6841,138 @@ def parse_envelope_from_result( continue break - # Path 5 (terminal): fail loudly. None of the four paths produced a + # Path 5 (D-22-fallback): some models (gpt-oss:20b in particular) + # treat a terminal-tool call as the completion and emit an empty + # closing AIMessage. Apps configure typed-terminal tools with + # ``confidence`` + ``confidence_rationale`` args by contract — + # synthesise an envelope from those args when the markdown path + # yielded nothing. Skip if any earlier path succeeded. + for msg in reversed(messages): + tcs = getattr(msg, "tool_calls", None) + if not tcs: + continue + for tc in tcs: + args = tc.get("args") or {} + if not isinstance(args, dict): + continue + conf = args.get("confidence") + rat = args.get("confidence_rationale") or args.get("rationale") + if conf is None or not rat: + continue + try: + conf_f = float(conf) + except (TypeError, ValueError): + continue + content_body = ( + args.get("resolution_summary") + or args.get("reason") + or args.get("summary") + or f"terminal tool {tc.get('name', '')} invoked" + ) + if not isinstance(content_body, str) or not content_body.strip(): + content_body = f"terminal tool {tc.get('name', '')} invoked" + try: + env = AgentTurnOutput( + content=content_body, + confidence=max(0.0, min(1.0, conf_f)), + confidence_rationale=str(rat), + signal=args.get("signal"), + ) + except ValidationError: + continue + _LOG.info( + "envelope_synthesized_from_tool_args agent=%s tool=%s " + "confidence=%.2f", + agent, tc.get("name"), env.confidence, + ) + return env + + # Path 6 (D-22-fallback-permissive): the model called at least one + # tool but never emitted the markdown contract or a typed-terminal + # tool with confidence args. Rather than hard-failing the run, + # synthesise a minimal envelope so the session reaches a terminal + # status (typically default_terminal_status -> needs_review) and + # the operator can review what happened. Confidence is intentionally + # low (0.30) so any HITL gate fires. + invoked_tool_names: list[str] = [] + for msg in messages: + for tc in (getattr(msg, "tool_calls", None) or []): + n = tc.get("name") + if n: + invoked_tool_names.append(n) + if invoked_tool_names: + body = ( + f"Agent {agent} invoked " + f"{', '.join(invoked_tool_names)} but did not emit a final " + f"summary. Review the tool-call audit for outcome." + ) + try: + env = AgentTurnOutput( + content=body, + confidence=0.30, + confidence_rationale=( + "synthesized: agent invoked tools but emitted no " + "closing message" + ), + signal=None, + ) + _LOG.warning( + "envelope_synthesized_permissive agent=%s tools=%s", + agent, invoked_tool_names, + ) + return env + except ValidationError: + pass + + # Path 7 (terminal): fail loudly. None of the six paths produced a # valid envelope — this is a real prompt-drift signal worth - # surfacing as a structured agent_run error. + # surfacing as a structured agent_run error. Log the last + # AIMessage content (capped) so operators can diagnose whether + # the LLM emitted the wrong shape vs nothing at all. + last_ai_content = "" + for msg in reversed(messages): + if msg.__class__.__name__ == "AIMessage": + c = getattr(msg, "content", None) + if isinstance(c, str): + last_ai_content = c + break + # Also dump every tool_call across all AIMessages so we can see + # whether the model called any tool with confidence args (Path 5 + # would have fired) or whether it called a non-terminal tool only. + tool_call_summary = [] + for msg in messages: + for tc in (getattr(msg, "tool_calls", None) or []): + args = tc.get("args") or {} + tool_call_summary.append({ + "name": tc.get("name"), + "arg_keys": sorted(args.keys()) if isinstance(args, dict) else "?", + "has_confidence": isinstance(args, dict) and "confidence" in args, + "has_rationale": isinstance(args, dict) and ( + "confidence_rationale" in args or "rationale" in args + ), + }) + # Detailed per-message dump so we can see exactly what shape the + # conversation took when the parse failed. + msg_summary = [] + for m in messages: + kind = m.__class__.__name__ + c = getattr(m, "content", None) + c_len = len(c) if isinstance(c, str) else ( + f"list[{len(c)}]" if isinstance(c, list) else type(c).__name__ + ) + tcs = [tc.get("name") for tc in (getattr(m, "tool_calls", None) or [])] + msg_summary.append({"kind": kind, "content_len": c_len, "tool_calls": tcs}) + _LOG.warning( + "envelope_missing: agent=%s last_ai_message_content=%r " + "structured_response_type=%s message_count=%d tool_calls=%r " + "msg_trace=%r", + agent, + last_ai_content[:1500] if last_ai_content else "", + type(result.get("structured_response")).__name__, + len(messages), + tool_call_summary, + msg_summary, + ) raise EnvelopeMissingError( agent=agent, field="structured_response", @@ -7364,6 +7492,11 @@ def _run(self, *args: Any, **kwargs: Any) -> Any: # noqa: D401 approved_at=_now_iso(), approval_rationale=rationale, ) + # Persist the status transition. Without this, + # the DB row stays at ``pending_approval`` and + # the UI keeps offering the buttons forever. + if store is not None: + store.save(session) rejected_result = {"rejected": True, "rationale": rationale} _emit_invoked( status="rejected", risk="high", @@ -7390,6 +7523,8 @@ def _run(self, *args: Any, **kwargs: Any) -> Any: # noqa: D401 approved_at=_now_iso(), approval_rationale=rationale, ) + if store is not None: + store.save(session) timeout_result = {"timeout": True, "rationale": rationale} _emit_invoked( status="timeout", risk="high", @@ -7412,6 +7547,8 @@ def _run(self, *args: Any, **kwargs: Any) -> Any: # noqa: D401 approved_at=_now_iso(), approval_rationale=rationale, ) + if store is not None: + store.save(session) _emit_invoked( status="approved", risk="high", args_dict=pending_args, result=result, @@ -7542,6 +7679,11 @@ async def _arun(self, *args: Any, **kwargs: Any) -> Any: # noqa: D401 approved_at=_now_iso(), approval_rationale=rationale, ) + # Persist the status transition (mirror of the + # sync path) so the DB row reflects the actual + # outcome instead of staying at pending_approval. + if store is not None: + store.save(session) rejected_result = {"rejected": True, "rationale": rationale} _emit_invoked( status="rejected", risk="high", @@ -7563,6 +7705,8 @@ async def _arun(self, *args: Any, **kwargs: Any) -> Any: # noqa: D401 approved_at=_now_iso(), approval_rationale=rationale, ) + if store is not None: + store.save(session) timeout_result = {"timeout": True, "rationale": rationale} _emit_invoked( status="timeout", risk="high", @@ -7584,6 +7728,8 @@ async def _arun(self, *args: Any, **kwargs: Any) -> Any: # noqa: D401 approved_at=_now_iso(), approval_rationale=rationale, ) + if store is not None: + store.save(session) _emit_invoked( status="approved", risk="high", args_dict=pending_args, result=result, @@ -8373,6 +8519,7 @@ def make_agent_node( patch_tool_names: frozenset[str] = frozenset(), gate_policy: "GatePolicy | None" = None, event_log: "EventLog | None" = None, + checkpointer: Any = None, ): """Factory: build a LangGraph node that runs a ReAct agent and decides a route. @@ -8395,9 +8542,18 @@ def make_agent_node( async def node(state: GraphState) -> dict: - incident: Session = state["session"] # pyright: ignore[reportTypedDictNotRequiredAccess] - inc_id = incident.id + state_session: Session = state["session"] # pyright: ignore[reportTypedDictNotRequiredAccess] + inc_id = state_session.id started_at = datetime.now(timezone.utc).strftime(_UTC_TS_FMT) + # Always reload from store at entry — outer Pregel checkpoints + # at step boundaries, not mid-node, so ``state["session"]`` is + # stale relative to DB when a HITL gate paused mid-step. See + # the same reload comment in ``runtime.graph.make_agent_node`` + # for the full rationale. + try: + incident: Session = store.load(inc_id) + except FileNotFoundError: + incident = state_session # M3: emit agent_started telemetry before any work happens. if event_log is not None: @@ -8427,11 +8583,21 @@ async def node(state: GraphState) -> dict: # change as graph.py:make_agent_node — drop response_format # so the loop terminates on natural React END, then parse the # final AIMessage via Path 4 in parse_envelope_from_result. + # The inner agent gets the orchestrator's checkpointer so a + # HITL pause inside ``interrupt()`` can be resumed via + # ``Command(resume=verdict)`` on a stable per-invocation + # thread id (see ``_drive_agent_with_resume`` in + # ``runtime.graph`` for the full rationale). agent_executor = create_agent( model=llm, tools=run_tools, system_prompt=skill.system_prompt, + checkpointer=checkpointer, ) + inner_thread_id = ( + f"{inc_id}:agent:{skill.name}:turn{len(incident.agents_run)}" + ) + inner_cfg = {"configurable": {"thread_id": inner_thread_id}} # Phase 11 (FOC-04): reset per-turn confidence hint at the # start of each agent step so the gateway treats the first @@ -8442,9 +8608,15 @@ async def node(state: GraphState) -> dict: pass try: - result = await _ainvoke_with_retry( - agent_executor, - {"messages": [HumanMessage(content=_format_agent_input(incident))]}, + result = await _drive_agent_with_resume( + agent_executor=agent_executor, + inner_cfg=inner_cfg, + inner_has_checkpointer=checkpointer is not None, + initial_input={ + "messages": [ + HumanMessage(content=_format_agent_input(incident)) + ] + }, ) except GraphInterrupt: # Phase 11 (FOC-04 / D-11-04): HITL pause -- propagate up. @@ -9419,8 +9591,104 @@ def finish(self, *, summary: str) -> None: ) +async def _drive_agent_with_resume( + *, + agent_executor, + inner_cfg: dict, + inner_has_checkpointer: bool, + initial_input: dict, +): + """Run an inner ``create_agent`` executor, surfacing HITL pauses to + the outer Pregel graph and forwarding resume verdicts back to the + inner agent. + + langgraph 1.x changed the ``interrupt()`` contract: a tool that + calls :func:`langgraph.types.interrupt` no longer raises + :class:`GraphInterrupt` to the caller. Instead, the agent's + ``ainvoke`` returns normally with the pause captured in + ``result["__interrupt__"]``. To make HITL approval actually work + end-to-end the framework must: + + 1. **First call** — invoke the agent with the initial messages. + If the result carries ``__interrupt__``, raise + :class:`GraphInterrupt` so the *outer* Pregel pauses too. The + outer pause is what surfaces ``__interrupt__`` to the + orchestrator and lets the UI render Approve / Reject. + + 2. **Resume call** — when the outer is resumed via + ``Command(resume=verdict)``, this helper is re-entered. The + inner's checkpointer remembers the pause; we detect that via + ``aget_state(...).next`` being non-empty and call + :func:`langgraph.types.interrupt` at the *outer* level to fetch + the verdict, then forward it to the inner via + ``Command(resume=verdict)``. This is the only path that + actually re-runs the gated tool with the verdict — calling + ``inner.ainvoke({"messages": [...]})`` on a paused thread + silently skips the tool. + + A while loop covers the case where a single agent turn pauses + multiple times (e.g. two high-risk tool calls in sequence). + + When ``inner_has_checkpointer`` is False (legacy callers and most + unit tests that build agents without a checkpointer), the helper + falls back to the simple "invoke + re-raise on ``__interrupt__``" + contract. That path can pause the outer graph but the resume + verdict cannot reach the inner tool — which is acceptable because + legacy callers without a checkpointer never had a working HITL + resume path to begin with. + """ + # Local imports keep callers that never touch HITL out of the + # langgraph.types import cost path. ``Command`` and ``interrupt`` + # are tiny but the explicit lazy import documents the dependency. + from langgraph.types import Command, interrupt + + # Resume-detection branch: only meaningful when the inner agent + # has its own checkpointer (otherwise ``aget_state`` on a fresh + # in-memory pregel raises). The detection asks: "does the inner + # have a paused step waiting?". A non-empty ``next`` means yes. + if inner_has_checkpointer: + try: + inner_state = await agent_executor.aget_state(inner_cfg) + inner_paused = bool(getattr(inner_state, "next", ()) or ()) + except Exception: # noqa: BLE001 — defensive; missing checkpoint == fresh + inner_paused = False + else: + inner_paused = False + + if inner_paused: + # Outer is being resumed; pull the verdict via the outer's own + # interrupt() and forward it. The payload here is informational — + # the only thing the outer cares about is the resume value. + verdict = interrupt({"resume_inner_agent": inner_cfg["configurable"]["thread_id"]}) + result = await _ainvoke_with_retry( + agent_executor, Command(resume=verdict), config=inner_cfg, + ) + else: + result = await _ainvoke_with_retry( + agent_executor, initial_input, config=inner_cfg, + ) + + # Multi-pause loop: if the inner pauses again (or resumes into a + # second gated tool call), surface each pause to the outer the + # same way until the inner is fully done. + while isinstance(result, dict) and result.get("__interrupt__"): + if not inner_has_checkpointer: + # No checkpointer => no Command(resume=...) path is + # available; surface the pause to the outer Pregel and + # stop. Resume cannot deliver the verdict to the gated + # tool, but that matches legacy behavior. + raise GraphInterrupt(result["__interrupt__"]) + verdict = interrupt(result["__interrupt__"]) + result = await _ainvoke_with_retry( + agent_executor, Command(resume=verdict), config=inner_cfg, + ) + + return result + + async def _ainvoke_with_retry(executor, input_, *, max_attempts: int = 3, - base_delay: float = 1.5): + base_delay: float = 1.5, + config: dict | None = None): """Wrap a LangGraph agent invocation with retry on transient cloud errors. Retries on common Ollama Cloud / streaming hiccups (500, status -1, etc.). @@ -9438,6 +9706,14 @@ async def _ainvoke_with_retry(executor, input_, *, max_attempts: int = 3, # itself (AutoStrategy → ToolStrategy fallback for non- # function-calling Ollama models). The default langgraph # recursion bound is now a true upper bound, not a workaround. + # + # ``config`` (default ``None``) carries the per-thread + # ``configurable.thread_id`` for callers that need a + # checkpointer-scoped invocation (HITL resume path — + # see :func:`_drive_agent_with_resume`). Legacy callers + # pass nothing and the executor's default thread is used. + if config is not None: + return await executor.ainvoke(input_, config=config) return await executor.ainvoke(input_) except GraphInterrupt: # Phase 11 (FOC-04 / D-11-04): never retry a HITL pause. @@ -9770,6 +10046,7 @@ def make_agent_node( injected_args: dict[str, str] | None = None, gate_policy: "GatePolicy | None" = None, event_log: "EventLog | None" = None, + checkpointer: Any = None, ) -> Callable[[GraphState], Awaitable[dict]]: """Factory: build a LangGraph node that runs a ReAct agent and decides a route. @@ -9801,10 +10078,23 @@ def make_agent_node( """ async def node(state: GraphState) -> dict: - incident = state["session"] # pyright: ignore[reportTypedDictNotRequiredAccess] — orchestrator runtime always supplies session - inc_id = incident.id + state_session = state["session"] # pyright: ignore[reportTypedDictNotRequiredAccess] — orchestrator runtime always supplies session + inc_id = state_session.id started_at = datetime.now(timezone.utc).strftime(_UTC_TS_FMT) + # Always reload from the persistent store at entry. Outer Pregel + # checkpoints at step boundaries, not mid-node — so on a HITL + # resume, ``state["session"]`` reflects the prior step's output + # (e.g. deep_investigator's), not the gateway's pending_approval + # row + version bump that happened mid-step in the original run. + # If we trust ``state["session"]`` here, the gateway sees no + # pending row, appends a duplicate, then ``store.save`` raises + # ``StaleVersionError`` because DB has already moved on. + try: + incident = store.load(inc_id) + except FileNotFoundError: + incident = state_session + # M3 (per-step telemetry): emit agent_started. if event_log is not None: try: @@ -9911,11 +10201,28 @@ def _run(**kwargs: Any) -> Any: # END interaction, recursion_limit ceilings). Markdown is the # native format every chat model writes well; the parse step # happens in the framework, where leniency is in our control. + # The inner agent gets a checkpointer so a HITL pause inside + # ``interrupt()`` can be resumed via ``Command(resume=verdict)`` + # on the SAME inner thread. langgraph 1.x semantics require the + # checkpointer here — without it ``Command(resume=...)`` raises + # ``RuntimeError: Cannot use Command(resume=...) without + # checkpointer``. The thread id is derived deterministically + # from session + agent + the upcoming agent_run index so it is: + # * STABLE across the inner pause and the outer resume that + # follows (both observe the same ``len(incident.agents_run)`` + # because no new run is recorded mid-pause), and + # * UNIQUE per agent invocation so previous invocations of the + # same agent within the same session don't bleed in. agent_executor = create_agent( model=llm, tools=run_tools, system_prompt=skill.system_prompt, + checkpointer=checkpointer, ) + inner_thread_id = ( + f"{inc_id}:agent:{skill.name}:turn{len(incident.agents_run)}" + ) + inner_cfg = {"configurable": {"thread_id": inner_thread_id}} # Phase 11 (FOC-04): reset per-turn confidence hint. The hint # is updated below after _harvest_tool_calls_and_patches; on @@ -9927,9 +10234,15 @@ def _run(**kwargs: Any) -> Any: pass try: - result = await _ainvoke_with_retry( - agent_executor, - {"messages": [HumanMessage(content=_format_agent_input(incident))]}, + result = await _drive_agent_with_resume( + agent_executor=agent_executor, + inner_cfg=inner_cfg, + inner_has_checkpointer=checkpointer is not None, + initial_input={ + "messages": [ + HumanMessage(content=_format_agent_input(incident)) + ] + }, ) except GraphInterrupt: # Phase 11 (FOC-04 / D-11-04): HITL pause is NOT an error. @@ -10281,7 +10594,8 @@ async def gate(state: GraphState) -> dict: def _build_agent_nodes(*, cfg: AppConfig, skills: dict, store: SessionStore, registry: ToolRegistry, - event_log: "EventLog | None" = None) -> dict: + event_log: "EventLog | None" = None, + checkpointer: Any = None) -> dict: """Materialize agent nodes from skills + registry. Reused by main + resume graphs. Dispatches on ``skill.kind``: @@ -10359,6 +10673,7 @@ def _build_agent_nodes(*, cfg: AppConfig, skills: dict, store: SessionStore, injected_args=cfg.orchestrator.injected_args, gate_policy=gate_policy, event_log=event_log, + checkpointer=checkpointer, ) return nodes @@ -10464,7 +10779,8 @@ async def build_graph(*, cfg: AppConfig, skills: dict, store: SessionStore, sg = StateGraph(GraphState) nodes = _build_agent_nodes(cfg=cfg, skills=skills, store=store, - registry=registry, event_log=event_log) + registry=registry, event_log=event_log, + checkpointer=checkpointer) for agent_name, node in nodes.items(): sg.add_node(agent_name, node) sg.add_node("gate", make_gate_node( @@ -14238,6 +14554,29 @@ async def _finalize_session_status_async( async with self._locks.acquire(session_id): return self._finalize_session_status(session_id) + async def _is_graph_paused(self, session_id: str) -> bool: + """Return True iff the compiled graph has a pending step waiting + to resume (i.e. it's paused at an ``interrupt()`` boundary). + + langgraph 1.x surfaces a HITL pause via the result dict's + ``__interrupt__`` field rather than raising + :class:`GraphInterrupt`. After ``astream_events`` (or + ``ainvoke``) returns, the only way to tell "the run paused" + from "the run completed" is to query the checkpointed graph + state — a non-empty ``next`` tuple means the graph has steps + queued to run when resumed. The stream/retry call sites use + this to skip ``_finalize_session_status_async`` on a pause — + finalizing on a pause would stamp ``default_terminal_status`` + on a session that's actually waiting on operator approval, and + the orphaned ``pending_approval`` ToolCall row written by the + gateway would never get its resume. + """ + try: + state = await self.graph.aget_state(self._thread_config(session_id)) + except Exception: # noqa: BLE001 — defensive; missing checkpoint == not paused + return False + return bool(getattr(state, "next", ()) or ()) + def _thread_config(self, incident_id: str) -> dict: """Build the LangGraph ``config`` dict for a per-session thread. @@ -14447,10 +14786,18 @@ async def stream_session(self, *, query: str, environment: str, config=self._thread_config(inc.id), ): yield self._to_ui_event(ev, inc.id) - new_status = await self._finalize_session_status_async(inc.id) - if new_status: - yield {"event": "status_auto_finalized", "incident_id": inc.id, - "status": new_status, "ts": _event_ts()} + # Skip finalize when the graph paused on an interrupt — the + # session is waiting for operator approval, not done. Stamping + # default_terminal_status here would orphan the pending_approval + # ToolCall row written by the gateway just before the pause. + if await self._is_graph_paused(inc.id): + yield {"event": "session_paused", "incident_id": inc.id, + "ts": _event_ts()} + else: + new_status = await self._finalize_session_status_async(inc.id) + if new_status: + yield {"event": "status_auto_finalized", "incident_id": inc.id, + "status": new_status, "ts": _event_ts()} yield {"event": "investigation_completed", "incident_id": inc.id, "ts": _event_ts()} async def stream_investigation(self, *, query: str, environment: str, @@ -14723,10 +15070,17 @@ async def _retry_session_locked(self, session_id: str) -> AsyncIterator[dict]: config=self._thread_config(session_id), ): yield self._to_ui_event(ev, session_id) - new_status = await self._finalize_session_status_async(session_id) - if new_status: - yield {"event": "status_auto_finalized", "incident_id": session_id, - "status": new_status, "ts": _event_ts()} + # See ``stream_session`` for why pause-detection guards the + # finalize call: a HITL pause must not be coerced into a + # terminal status. + if await self._is_graph_paused(session_id): + yield {"event": "session_paused", "incident_id": session_id, + "ts": _event_ts()} + else: + new_status = await self._finalize_session_status_async(session_id) + if new_status: + yield {"event": "status_auto_finalized", "incident_id": session_id, + "status": new_status, "ts": _event_ts()} yield {"event": "retry_completed", "incident_id": session_id, "ts": _event_ts()} @@ -15419,6 +15773,16 @@ async def _resume() -> None: Command(resume=decision_payload), config=orch._thread_config(session_id), ) + # Finalize after the verdict-driven run completes so + # the session row reflects the terminal status (or + # falls through to ``default_terminal_status``). Skip + # if a fresh interrupt re-paused the graph — the + # session is again awaiting operator input. Mirrors + # the same guard in the UI's + # ``_submit_approval_via_service`` and the + # ``stream_session`` finalize call site. + if not await orch._is_graph_paused(session_id): + await orch._finalize_session_status_async(session_id) # Submit the resume onto the long-lived service loop so we # don't fight the lifespan thread for the same FastMCP/SQLite diff --git a/dist/ui.py b/dist/ui.py index 05bc7d9..3a57b56 100644 --- a/dist/ui.py +++ b/dist/ui.py @@ -26,8 +26,24 @@ from app import AppConfig, Base, FrameworkAppConfig, MetadataConfig, Orchestrator, OrchestratorService, SessionStore, UIBadge, build_embedder, build_engine, build_vector_store, load_config, resolve_framework_app_config, resolve_state_class import asyncio +import logging as _logging import os import time + +# Optional: env-driven log config so manual runs (streamlit run, etc.) +# can crank verbosity to INFO/DEBUG without modifying the source. The +# default keeps the production-quiet WARNING level. + + +# ====== module: src/runtime/ui.py ====== + +_log_level = os.environ.get("ASR_LOG_LEVEL", "").upper().strip() +if _log_level in {"DEBUG", "INFO", "WARNING", "ERROR"}: + _logging.basicConfig( + level=getattr(_logging, _log_level), + format="%(asctime)s %(name)s %(levelname)s %(message)s", + force=True, + ) from datetime import datetime, timezone from pathlib import Path import streamlit as st @@ -41,10 +57,6 @@ # Default config path; apps override via the ``APP_CONFIG`` env var. - - -# ====== module: src/runtime/ui.py ====== - CONFIG_PATH = Path(os.environ.get("APP_CONFIG", "config/config.yaml")) @@ -985,6 +997,14 @@ async def _drive() -> None: Command(resume=payload), config=orch._thread_config(session_id), ) + # The graph completes the agent run after the verdict is + # forwarded to the gated tool; finalize the session if it's + # not paused on a fresh interrupt. Without this, an approved + # session stays in ``in_progress`` because the resume path + # does not run through ``stream_session`` (the only other + # caller of finalize). + if not await orch._is_graph_paused(session_id): + await orch._finalize_session_status_async(session_id) svc.submit_and_wait(_drive(), timeout=60.0) From 6eaf5a59e5e3d0b6053b457b66cf0e12afd8c833 Mon Sep 17 00:00:00 2001 From: Amit Kumar Date: Thu, 14 May 2026 00:26:41 +0000 Subject: [PATCH 6/6] fix(ui): move ASR_LOG_LEVEL block below imports to satisfy ruff E402 The env-driven log config block sat between imports, tripping ruff's ``E402 Module level import not at top of file``. Refactor into a ``_maybe_configure_logging`` function called immediately after the import block. ``logging.basicConfig(force=True)`` overrides any handler streamlit installed during import, so the behaviour is preserved. Regenerate dist/ui.py to match. --- dist/ui.py | 34 ++++++++++++++++++++-------------- src/runtime/ui.py | 28 +++++++++++++++++----------- 2 files changed, 37 insertions(+), 25 deletions(-) diff --git a/dist/ui.py b/dist/ui.py index 3a57b56..24f5876 100644 --- a/dist/ui.py +++ b/dist/ui.py @@ -29,31 +29,37 @@ import logging as _logging import os import time +from datetime import datetime, timezone +from pathlib import Path +import streamlit as st + + + -# Optional: env-driven log config so manual runs (streamlit run, etc.) -# can crank verbosity to INFO/DEBUG without modifying the source. The -# default keeps the production-quiet WARNING level. -# ====== module: src/runtime/ui.py ====== -_log_level = os.environ.get("ASR_LOG_LEVEL", "").upper().strip() -if _log_level in {"DEBUG", "INFO", "WARNING", "ERROR"}: - _logging.basicConfig( - level=getattr(_logging, _log_level), - format="%(asctime)s %(name)s %(levelname)s %(message)s", - force=True, - ) -from datetime import datetime, timezone -from pathlib import Path -import streamlit as st +# Optional: env-driven log config so manual runs (streamlit run, etc.) +# can crank verbosity to INFO/DEBUG without modifying the source. The +# default keeps the production-quiet WARNING level. ``force=True`` +# overrides any handler streamlit set up during import. +# ====== module: src/runtime/ui.py ====== +def _maybe_configure_logging() -> None: + level = os.environ.get("ASR_LOG_LEVEL", "").upper().strip() + if level in {"DEBUG", "INFO", "WARNING", "ERROR"}: + _logging.basicConfig( + level=getattr(_logging, level), + format="%(asctime)s %(name)s %(levelname)s %(message)s", + force=True, + ) +_maybe_configure_logging() # Default config path; apps override via the ``APP_CONFIG`` env var. diff --git a/src/runtime/ui.py b/src/runtime/ui.py index 3277a1b..573d829 100644 --- a/src/runtime/ui.py +++ b/src/runtime/ui.py @@ -26,17 +26,6 @@ import logging as _logging import os import time - -# Optional: env-driven log config so manual runs (streamlit run, etc.) -# can crank verbosity to INFO/DEBUG without modifying the source. The -# default keeps the production-quiet WARNING level. -_log_level = os.environ.get("ASR_LOG_LEVEL", "").upper().strip() -if _log_level in {"DEBUG", "INFO", "WARNING", "ERROR"}: - _logging.basicConfig( - level=getattr(_logging, _log_level), - format="%(asctime)s %(name)s %(levelname)s %(message)s", - force=True, - ) from datetime import datetime, timezone from pathlib import Path import streamlit as st @@ -58,6 +47,23 @@ from runtime.storage.vector import build_vector_store +# Optional: env-driven log config so manual runs (streamlit run, etc.) +# can crank verbosity to INFO/DEBUG without modifying the source. The +# default keeps the production-quiet WARNING level. ``force=True`` +# overrides any handler streamlit set up during import. +def _maybe_configure_logging() -> None: + level = os.environ.get("ASR_LOG_LEVEL", "").upper().strip() + if level in {"DEBUG", "INFO", "WARNING", "ERROR"}: + _logging.basicConfig( + level=getattr(_logging, level), + format="%(asctime)s %(name)s %(levelname)s %(message)s", + force=True, + ) + + +_maybe_configure_logging() + + # Default config path; apps override via the ``APP_CONFIG`` env var. CONFIG_PATH = Path(os.environ.get("APP_CONFIG", "config/config.yaml"))