diff --git a/config/config.yaml b/config/config.yaml index 0b9b046..000a2ce 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -7,7 +7,7 @@ storage: collection_name: "incidents" distance_strategy: cosine llm: - default: workhorse + default: gpt_oss providers: ollama_cloud: kind: ollama diff --git a/dist/app.py b/dist/app.py index e42575a..f7ca84e 100644 --- a/dist/app.py +++ b/dist/app.py @@ -550,7 +550,7 @@ class IncidentState(Session): -from pydantic import BaseModel, ConfigDict, Field +from pydantic import BaseModel, ConfigDict, Field, ValidationError # ----- imports for runtime/tools/gateway.py ----- """Risk-rated tool gateway: pure resolver + ``BaseTool`` HITL wrapper. @@ -713,7 +713,7 @@ class IncidentState(Session): """ -from typing import TYPE_CHECKING, Callable +from typing import TYPE_CHECKING, Any, Callable from langchain_core.messages import HumanMessage from langchain.agents import create_agent @@ -994,7 +994,6 @@ class IncidentState(Session): """ -from typing import TYPE_CHECKING, Any, Callable from fastapi import APIRouter, Depends, Header, HTTPException, Request, status from pydantic import ValidationError @@ -1180,6 +1179,7 @@ async def _poll(self, registry): """ +from pydantic import BaseModel, ConfigDict, Field # ----- imports for runtime/memory/knowledge_graph.py ----- @@ -3131,17 +3131,13 @@ class StubChatModel(BaseChatModel): ``stub_envelope_confidence`` / ``stub_envelope_rationale`` / ``stub_envelope_signal``. - Phase 15 (LLM-COMPAT-01): ``langchain.agents.create_agent`` with - ``response_format=AgentTurnOutput`` (via ``AutoStrategy`` -> - ``ToolStrategy`` for non-native-structured-output models, including - this stub) injects ``AgentTurnOutput`` as a CALLABLE TOOL. The - agent loop only terminates when the LLM emits a tool call NAMED - ``AgentTurnOutput``. ``bind_tools`` records that envelope-tool name - so ``_generate`` can auto-emit a closing tool call after any - user-configured ``tool_call_plan`` is exhausted -- preserving the - pre-Phase-15 stub semantics (canned text + optional pre-scripted - tool calls) while satisfying the new tool-loop termination - contract. + Phase 22 (D-22-05): the loop terminates on natural React END now + that ``response_format=AgentTurnOutput`` is gone — tests drive + markdown-shaped canned text (the framework parses it via Path 4 + in :func:`runtime.agents.turn_output.parse_envelope_from_result`). + The Phase 15 envelope-as-callable-tool auto-emit is removed; the + stub is back to its pre-Phase-15 simple semantics (canned text + + optional pre-scripted ``tool_call_plan``). """ role: str = "default" canned_responses: dict[str, str] = Field(default_factory=dict) @@ -3150,12 +3146,6 @@ class StubChatModel(BaseChatModel): stub_envelope_rationale: str = "stub envelope rationale" stub_envelope_signal: str | None = None _called_once: bool = False - # Phase 15 (LLM-COMPAT-01): set by ``bind_tools`` when - # ``langchain.agents.create_agent`` injects a structured-output tool - # for ``AgentTurnOutput``. Holds the bare tool name (e.g. - # ``"AgentTurnOutput"``) so ``_generate`` can emit a final - # envelope-shaped tool call to close the agent loop. - _envelope_tool_name: str | None = None @property def _llm_type(self) -> str: @@ -3163,33 +3153,37 @@ def _llm_type(self) -> str: def _generate(self, messages: list[BaseMessage], stop: list[str] | None = None, run_manager: Any = None, **kwargs: Any) -> ChatResult: - text = self.canned_responses.get(self.role, f"[stub:{self.role}] no canned response") + # Phase 22 (D-22-05): tool-call rounds emit the scripted tool + # calls with a placeholder content body; the closing turn (no + # remaining tool_call_plan entries) emits the canned text + # wrapped in the D-22-03 markdown contract that + # parse_markdown_envelope reads. The stub_envelope_* + # parameters drive the trailing-section bodies so tests can + # exercise specific confidence / rationale / signal paths + # without authoring markdown by hand. tool_calls: list[dict] = [] if self.tool_call_plan and not self._called_once: for tc in self.tool_call_plan: - tool_calls.append({"name": tc["name"], "args": tc.get("args", {}), "id": str(uuid4())}) + tool_calls.append( + {"name": tc["name"], "args": tc.get("args", {}), "id": str(uuid4())} + ) self._called_once = True - elif self._envelope_tool_name is not None: - # Phase 15 (LLM-COMPAT-01): the tool_call_plan is exhausted - # (or wasn't configured) AND ``langchain.agents.create_agent`` - # has bound the AgentTurnOutput envelope as a tool. Emit a - # closing tool call so the loop terminates with a populated - # ``structured_response``. The args mirror the - # ``with_structured_output`` path's envelope construction so - # tests see the same confidence / rationale / signal regardless - # of whether the new tool-strategy or the legacy structured- - # output path is in play. - tool_calls.append({ - "name": self._envelope_tool_name, - "args": { - "content": text or ".", - "confidence": self.stub_envelope_confidence, - "confidence_rationale": self.stub_envelope_rationale, - "signal": self.stub_envelope_signal, - }, - "id": str(uuid4()), - }) - msg = AIMessage(content=text, tool_calls=tool_calls) + + body = self.canned_responses.get( + self.role, f"[stub:{self.role}] no canned response", + ) + # Phase 22 (D-22-05): explicit "none" when no signal so the + # parser maps to None — preserves pre-Phase-22 behaviour where a + # stub with no signal yielded envelope.signal=None. + signal_str = self.stub_envelope_signal if self.stub_envelope_signal is not None else "none" + md = ( + f"{body}\n\n" + f"## Response\n{body}\n\n" + f"## Confidence\n{self.stub_envelope_confidence:.4f} -- " + f"{self.stub_envelope_rationale}\n\n" + f"## Signal\n{signal_str}\n" + ) + msg = AIMessage(content=md, tool_calls=tool_calls) return ChatResult(generations=[ChatGeneration(message=msg)]) async def _agenerate(self, messages: list[BaseMessage], stop: list[str] | None = None, @@ -3197,30 +3191,15 @@ async def _agenerate(self, messages: list[BaseMessage], stop: list[str] | None = return self._generate(messages, stop, run_manager, **kwargs) def bind_tools(self, tools, *, tool_choice=None, **kwargs): - """Record the AgentTurnOutput envelope-tool name when present. - - Phase 15 (LLM-COMPAT-01): ``langchain.agents.create_agent`` with - ``response_format=AgentTurnOutput`` calls ``bind_tools(...)`` - with the user's tools PLUS the envelope-as-a-tool. We scan the - list for the AgentTurnOutput-shaped tool (matched by ``__name__`` - on Pydantic schemas, ``name`` on ``BaseTool`` instances, or the - ``"name"`` key on dict-shaped tool specs) and remember it on the - instance so ``_generate`` can close the agent loop with a - synthetic envelope tool call after any pre-scripted - ``tool_call_plan`` is exhausted. Tools bound by the framework - itself (real BaseTools the agent should call) flow through - unchanged -- the stub still emits them only via - ``tool_call_plan``. + """Phase 22 (D-22-05): no-op tool binding. + + ``langchain.agents.create_agent`` calls ``bind_tools`` to wire + the agent's BaseTool list onto the chat model. The stub does + not actually invoke those tools (test fixtures script tool + calls via ``tool_call_plan`` instead), so we just return self + unchanged. The pre-Phase-22 envelope-tool detection is gone + because the framework no longer asks for ``response_format``. """ - for t in tools or []: - name = ( - getattr(t, "__name__", None) - or getattr(t, "name", None) - or (isinstance(t, dict) and t.get("name")) - ) - if isinstance(name, str) and name == "AgentTurnOutput": - self._envelope_tool_name = name - break return self # ``BaseChatModel.with_structured_output`` returns ``Runnable[..., dict | BaseModel]`` @@ -6601,6 +6580,127 @@ def __init__(self, *, agent: str, field: str, message: str | None = None): super().__init__(message or f"envelope_missing: {field} (agent={agent})") +_HEADER_SPLIT = re.compile(r"^#{2,}\s+(\w+)\s*$", re.MULTILINE) +_CONF_LINE = re.compile( + # Leftmost float (allows int form), optional rationale after em-dash / + # ASCII dash / hyphen separator. ``re.DOTALL`` so a multi-line rationale + # is captured wholesale. + r"^\s*(-?[0-9]*\.?[0-9]+)\s*(?:[\-\u2010\u2011\u2012\u2013\u2014\u2015]+\s*(.*))?$", + re.DOTALL, +) + + +def _clamp_unit(x: float) -> float: + """Clamp a confidence float into [0, 1] without raising. The skill + prompt asks for [0, 1]; an LLM occasionally emits ``1.05`` or + ``-0.1`` — clamp rather than reject so the parse step is forgiving.""" + if x < 0.0: + return 0.0 + if x > 1.0: + return 1.0 + return x + + +def parse_markdown_envelope( + content: str, *, agent: str = "", +) -> "AgentTurnOutput": + """D-22-03 — parse the trailing markdown contract block into an + :class:`AgentTurnOutput`. + + Expects three sections ``## Response`` / ``## Confidence`` / + ``## Signal`` (case-insensitive headers, ``##+`` accepted) at the + end of the LLM's reply. The free-text body of ``## Response`` + becomes ``content``; the float on the ``## Confidence`` line + becomes ``confidence`` (clamped to [0, 1]); the rest of that line + after ``-`` / ``--`` / ``\u2014`` becomes ``confidence_rationale``; + the ``## Signal`` body becomes ``signal`` (with ``none``/``null``/ + blank coerced to ``None``). + + Raises :class:`EnvelopeMissingError` only when the parse cannot + produce a valid envelope at all — missing ``## Confidence`` line, + unparseable confidence value, or empty ``## Response`` body. + """ + if not isinstance(content, str) or not content.strip(): + raise EnvelopeMissingError( + agent=agent, field="content", + message=f"envelope_missing: empty content (agent={agent!r})", + ) + + # ``re.split`` with the header capture-group yields: + # [pre, h1, body1, h2, body2, ...] + # Pre is the prose before the first heading; we ignore it for parse. + parts = _HEADER_SPLIT.split(content) + sections: dict[str, str] = {} + if len(parts) >= 3: + for i in range(1, len(parts) - 1, 2): + key = parts[i].strip().lower() + body = parts[i + 1].strip() + # Don't overwrite an earlier section with a later same-named + # one (unusual but possible in pathological prompts). + sections.setdefault(key, body) + + response_body = sections.get("response", "").strip() + raw_conf = sections.get("confidence", "").strip() + + if not raw_conf: + raise EnvelopeMissingError( + agent=agent, field="confidence", + message=( + f"envelope_missing: confidence section absent or empty " + f"(agent={agent!r})" + ), + ) + + m = _CONF_LINE.match(raw_conf) + if not m: + raise EnvelopeMissingError( + agent=agent, field="confidence", + message=( + f"envelope_missing: confidence section did not parse " + f"(agent={agent!r}, raw={raw_conf!r})" + ), + ) + try: + conf_value = _clamp_unit(float(m.group(1))) + except (TypeError, ValueError) as exc: + raise EnvelopeMissingError( + agent=agent, field="confidence", + message=( + f"envelope_missing: confidence value not a float " + f"(agent={agent!r}, raw={raw_conf!r})" + ), + ) from exc + rationale = (m.group(2) or "").strip() or "(no rationale provided)" + + signal_raw = sections.get("signal", "").strip().lower() or None + if signal_raw in {"none", "null", "", "n/a"}: + signal_raw = None + + if not response_body: + raise EnvelopeMissingError( + agent=agent, field="content", + message=( + f"envelope_missing: response section empty (agent={agent!r})" + ), + ) + + try: + return AgentTurnOutput( + content=response_body, + confidence=conf_value, + confidence_rationale=rationale, + signal=signal_raw, + ) + except ValidationError as exc: + raise EnvelopeMissingError( + agent=agent, field="validation", + message=( + f"envelope_missing: pydantic validation rejected the " + f"parsed values (agent={agent!r}): {exc}" + ), + ) from exc + + def parse_envelope_from_result( result: dict, *, @@ -6658,13 +6758,162 @@ def parse_envelope_from_result( continue break - # Path 3: fail loudly + # Path 4 (D-22-01): markdown-primary parse on the last AIMessage's + # content. Producers that emit the new ``## Response / ## Confidence / + # ## Signal`` section block land here when Paths 1+2 yield nothing. + for msg in reversed(messages): + if msg.__class__.__name__ != "AIMessage": + continue + content = getattr(msg, "content", None) + if not isinstance(content, str) or not content.strip(): + continue + try: + return parse_markdown_envelope(content, agent=agent) + except EnvelopeMissingError: + # This AIMessage didnt carry a parseable markdown contract. + # Continue scanning earlier messages on the off-chance a + # nested loop emitted the contract one step back. + continue + break + + # Path 5 (D-22-fallback): some models (gpt-oss:20b in particular) + # treat a terminal-tool call as the completion and emit an empty + # closing AIMessage. Apps configure typed-terminal tools with + # ``confidence`` + ``confidence_rationale`` args by contract — + # synthesise an envelope from those args when the markdown path + # yielded nothing. Skip if any earlier path succeeded. + for msg in reversed(messages): + tcs = getattr(msg, "tool_calls", None) + if not tcs: + continue + for tc in tcs: + args = tc.get("args") or {} + if not isinstance(args, dict): + continue + conf = args.get("confidence") + rat = args.get("confidence_rationale") or args.get("rationale") + if conf is None or not rat: + continue + try: + conf_f = float(conf) + except (TypeError, ValueError): + continue + content_body = ( + args.get("resolution_summary") + or args.get("reason") + or args.get("summary") + or f"terminal tool {tc.get('name', '')} invoked" + ) + if not isinstance(content_body, str) or not content_body.strip(): + content_body = f"terminal tool {tc.get('name', '')} invoked" + try: + env = AgentTurnOutput( + content=content_body, + confidence=max(0.0, min(1.0, conf_f)), + confidence_rationale=str(rat), + signal=args.get("signal"), + ) + except ValidationError: + continue + _LOG.info( + "envelope_synthesized_from_tool_args agent=%s tool=%s " + "confidence=%.2f", + agent, tc.get("name"), env.confidence, + ) + return env + + # Path 6 (D-22-fallback-permissive): the model called at least one + # tool but never emitted the markdown contract or a typed-terminal + # tool with confidence args. Rather than hard-failing the run, + # synthesise a minimal envelope so the session reaches a terminal + # status (typically default_terminal_status -> needs_review) and + # the operator can review what happened. Confidence is intentionally + # low (0.30) so any HITL gate fires. + invoked_tool_names: list[str] = [] + for msg in messages: + for tc in (getattr(msg, "tool_calls", None) or []): + n = tc.get("name") + if n: + invoked_tool_names.append(n) + if invoked_tool_names: + body = ( + f"Agent {agent} invoked " + f"{', '.join(invoked_tool_names)} but did not emit a final " + f"summary. Review the tool-call audit for outcome." + ) + try: + env = AgentTurnOutput( + content=body, + confidence=0.30, + confidence_rationale=( + "synthesized: agent invoked tools but emitted no " + "closing message" + ), + signal=None, + ) + _LOG.warning( + "envelope_synthesized_permissive agent=%s tools=%s", + agent, invoked_tool_names, + ) + return env + except ValidationError: + pass + + # Path 7 (terminal): fail loudly. None of the six paths produced a + # valid envelope — this is a real prompt-drift signal worth + # surfacing as a structured agent_run error. Log the last + # AIMessage content (capped) so operators can diagnose whether + # the LLM emitted the wrong shape vs nothing at all. + last_ai_content = "" + for msg in reversed(messages): + if msg.__class__.__name__ == "AIMessage": + c = getattr(msg, "content", None) + if isinstance(c, str): + last_ai_content = c + break + # Also dump every tool_call across all AIMessages so we can see + # whether the model called any tool with confidence args (Path 5 + # would have fired) or whether it called a non-terminal tool only. + tool_call_summary = [] + for msg in messages: + for tc in (getattr(msg, "tool_calls", None) or []): + args = tc.get("args") or {} + tool_call_summary.append({ + "name": tc.get("name"), + "arg_keys": sorted(args.keys()) if isinstance(args, dict) else "?", + "has_confidence": isinstance(args, dict) and "confidence" in args, + "has_rationale": isinstance(args, dict) and ( + "confidence_rationale" in args or "rationale" in args + ), + }) + # Detailed per-message dump so we can see exactly what shape the + # conversation took when the parse failed. + msg_summary = [] + for m in messages: + kind = m.__class__.__name__ + c = getattr(m, "content", None) + c_len = len(c) if isinstance(c, str) else ( + f"list[{len(c)}]" if isinstance(c, list) else type(c).__name__ + ) + tcs = [tc.get("name") for tc in (getattr(m, "tool_calls", None) or [])] + msg_summary.append({"kind": kind, "content_len": c_len, "tool_calls": tcs}) + _LOG.warning( + "envelope_missing: agent=%s last_ai_message_content=%r " + "structured_response_type=%s message_count=%d tool_calls=%r " + "msg_trace=%r", + agent, + last_ai_content[:1500] if last_ai_content else "", + type(result.get("structured_response")).__name__, + len(messages), + tool_call_summary, + msg_summary, + ) raise EnvelopeMissingError( agent=agent, field="structured_response", message=( - f"envelope_missing: no structured_response or JSON-decodable " - f"AIMessage envelope found (agent={agent})" + f"envelope_missing: no structured_response, JSON-decodable, or " + f"markdown-decodable AIMessage envelope found (agent={agent})" ), ) @@ -6711,6 +6960,7 @@ def reconcile_confidence( "AgentTurnOutput", "EnvelopeMissingError", "parse_envelope_from_result", + "parse_markdown_envelope", "reconcile_confidence", ] @@ -7177,6 +7427,11 @@ def _run(self, *args: Any, **kwargs: Any) -> Any: # noqa: D401 approved_at=_now_iso(), approval_rationale=rationale, ) + # Persist the status transition. Without this, + # the DB row stays at ``pending_approval`` and + # the UI keeps offering the buttons forever. + if store is not None: + store.save(session) rejected_result = {"rejected": True, "rationale": rationale} _emit_invoked( status="rejected", risk="high", @@ -7203,6 +7458,8 @@ def _run(self, *args: Any, **kwargs: Any) -> Any: # noqa: D401 approved_at=_now_iso(), approval_rationale=rationale, ) + if store is not None: + store.save(session) timeout_result = {"timeout": True, "rationale": rationale} _emit_invoked( status="timeout", risk="high", @@ -7225,6 +7482,8 @@ def _run(self, *args: Any, **kwargs: Any) -> Any: # noqa: D401 approved_at=_now_iso(), approval_rationale=rationale, ) + if store is not None: + store.save(session) _emit_invoked( status="approved", risk="high", args_dict=pending_args, result=result, @@ -7355,6 +7614,11 @@ async def _arun(self, *args: Any, **kwargs: Any) -> Any: # noqa: D401 approved_at=_now_iso(), approval_rationale=rationale, ) + # Persist the status transition (mirror of the + # sync path) so the DB row reflects the actual + # outcome instead of staying at pending_approval. + if store is not None: + store.save(session) rejected_result = {"rejected": True, "rationale": rationale} _emit_invoked( status="rejected", risk="high", @@ -7376,6 +7640,8 @@ async def _arun(self, *args: Any, **kwargs: Any) -> Any: # noqa: D401 approved_at=_now_iso(), approval_rationale=rationale, ) + if store is not None: + store.save(session) timeout_result = {"timeout": True, "rationale": rationale} _emit_invoked( status="timeout", risk="high", @@ -7397,6 +7663,8 @@ async def _arun(self, *args: Any, **kwargs: Any) -> Any: # noqa: D401 approved_at=_now_iso(), approval_rationale=rationale, ) + if store is not None: + store.save(session) _emit_invoked( status="approved", risk="high", args_dict=pending_args, result=result, @@ -8186,6 +8454,7 @@ def make_agent_node( patch_tool_names: frozenset[str] = frozenset(), gate_policy: "GatePolicy | None" = None, event_log: "EventLog | None" = None, + checkpointer: Any = None, ): """Factory: build a LangGraph node that runs a ReAct agent and decides a route. @@ -8208,9 +8477,18 @@ def make_agent_node( async def node(state: GraphState) -> dict: - incident: Session = state["session"] # pyright: ignore[reportTypedDictNotRequiredAccess] - inc_id = incident.id + state_session: Session = state["session"] # pyright: ignore[reportTypedDictNotRequiredAccess] + inc_id = state_session.id started_at = datetime.now(timezone.utc).strftime(_UTC_TS_FMT) + # Always reload from store at entry — outer Pregel checkpoints + # at step boundaries, not mid-node, so ``state["session"]`` is + # stale relative to DB when a HITL gate paused mid-step. See + # the same reload comment in ``runtime.graph.make_agent_node`` + # for the full rationale. + try: + incident: Session = store.load(inc_id) + except FileNotFoundError: + incident = state_session # M3: emit agent_started telemetry before any work happens. if event_log is not None: @@ -8236,25 +8514,25 @@ async def node(state: GraphState) -> dict: ] else: run_tools = tools - # Phase 10 (FOC-03 / D-10-02) + Phase 15 (LLM-COMPAT-01): every - # responsive agent invocation is wrapped in an AgentTurnOutput - # envelope. ``langchain.agents.create_agent`` (the non-deprecated - # successor to ``langgraph.prebuilt.create_react_agent``) accepts a - # bare schema as ``response_format`` and, by default, wraps it in - # ``AutoStrategy`` — ProviderStrategy for models with native - # structured-output (OpenAI-class), falling back to ToolStrategy - # otherwise (Ollama). ToolStrategy injects AgentTurnOutput as a - # callable tool: when the LLM ``calls`` it, the loop terminates on - # the same turn with ``result["structured_response"]`` populated. - # Eliminates the old two-call structure (loop + separate - # ``with_structured_output`` pass) that hit recursion_limit=25 on - # Ollama models without true function-calling. + # Phase 22 (D-22-01): markdown-primary turn output. Same + # change as graph.py:make_agent_node — drop response_format + # so the loop terminates on natural React END, then parse the + # final AIMessage via Path 4 in parse_envelope_from_result. + # The inner agent gets the orchestrator's checkpointer so a + # HITL pause inside ``interrupt()`` can be resumed via + # ``Command(resume=verdict)`` on a stable per-invocation + # thread id (see ``_drive_agent_with_resume`` in + # ``runtime.graph`` for the full rationale). agent_executor = create_agent( model=llm, tools=run_tools, system_prompt=skill.system_prompt, - response_format=AgentTurnOutput, + checkpointer=checkpointer, ) + inner_thread_id = ( + f"{inc_id}:agent:{skill.name}:turn{len(incident.agents_run)}" + ) + inner_cfg = {"configurable": {"thread_id": inner_thread_id}} # Phase 11 (FOC-04): reset per-turn confidence hint at the # start of each agent step so the gateway treats the first @@ -8265,9 +8543,15 @@ async def node(state: GraphState) -> dict: pass try: - result = await _ainvoke_with_retry( - agent_executor, - {"messages": [HumanMessage(content=_format_agent_input(incident))]}, + result = await _drive_agent_with_resume( + agent_executor=agent_executor, + inner_cfg=inner_cfg, + inner_has_checkpointer=checkpointer is not None, + initial_input={ + "messages": [ + HumanMessage(content=_format_agent_input(incident)) + ] + }, ) except GraphInterrupt: # Phase 11 (FOC-04 / D-11-04): HITL pause -- propagate up. @@ -9242,8 +9526,104 @@ def finish(self, *, summary: str) -> None: ) +async def _drive_agent_with_resume( + *, + agent_executor, + inner_cfg: dict, + inner_has_checkpointer: bool, + initial_input: dict, +): + """Run an inner ``create_agent`` executor, surfacing HITL pauses to + the outer Pregel graph and forwarding resume verdicts back to the + inner agent. + + langgraph 1.x changed the ``interrupt()`` contract: a tool that + calls :func:`langgraph.types.interrupt` no longer raises + :class:`GraphInterrupt` to the caller. Instead, the agent's + ``ainvoke`` returns normally with the pause captured in + ``result["__interrupt__"]``. To make HITL approval actually work + end-to-end the framework must: + + 1. **First call** — invoke the agent with the initial messages. + If the result carries ``__interrupt__``, raise + :class:`GraphInterrupt` so the *outer* Pregel pauses too. The + outer pause is what surfaces ``__interrupt__`` to the + orchestrator and lets the UI render Approve / Reject. + + 2. **Resume call** — when the outer is resumed via + ``Command(resume=verdict)``, this helper is re-entered. The + inner's checkpointer remembers the pause; we detect that via + ``aget_state(...).next`` being non-empty and call + :func:`langgraph.types.interrupt` at the *outer* level to fetch + the verdict, then forward it to the inner via + ``Command(resume=verdict)``. This is the only path that + actually re-runs the gated tool with the verdict — calling + ``inner.ainvoke({"messages": [...]})`` on a paused thread + silently skips the tool. + + A while loop covers the case where a single agent turn pauses + multiple times (e.g. two high-risk tool calls in sequence). + + When ``inner_has_checkpointer`` is False (legacy callers and most + unit tests that build agents without a checkpointer), the helper + falls back to the simple "invoke + re-raise on ``__interrupt__``" + contract. That path can pause the outer graph but the resume + verdict cannot reach the inner tool — which is acceptable because + legacy callers without a checkpointer never had a working HITL + resume path to begin with. + """ + # Local imports keep callers that never touch HITL out of the + # langgraph.types import cost path. ``Command`` and ``interrupt`` + # are tiny but the explicit lazy import documents the dependency. + from langgraph.types import Command, interrupt + + # Resume-detection branch: only meaningful when the inner agent + # has its own checkpointer (otherwise ``aget_state`` on a fresh + # in-memory pregel raises). The detection asks: "does the inner + # have a paused step waiting?". A non-empty ``next`` means yes. + if inner_has_checkpointer: + try: + inner_state = await agent_executor.aget_state(inner_cfg) + inner_paused = bool(getattr(inner_state, "next", ()) or ()) + except Exception: # noqa: BLE001 — defensive; missing checkpoint == fresh + inner_paused = False + else: + inner_paused = False + + if inner_paused: + # Outer is being resumed; pull the verdict via the outer's own + # interrupt() and forward it. The payload here is informational — + # the only thing the outer cares about is the resume value. + verdict = interrupt({"resume_inner_agent": inner_cfg["configurable"]["thread_id"]}) + result = await _ainvoke_with_retry( + agent_executor, Command(resume=verdict), config=inner_cfg, + ) + else: + result = await _ainvoke_with_retry( + agent_executor, initial_input, config=inner_cfg, + ) + + # Multi-pause loop: if the inner pauses again (or resumes into a + # second gated tool call), surface each pause to the outer the + # same way until the inner is fully done. + while isinstance(result, dict) and result.get("__interrupt__"): + if not inner_has_checkpointer: + # No checkpointer => no Command(resume=...) path is + # available; surface the pause to the outer Pregel and + # stop. Resume cannot deliver the verdict to the gated + # tool, but that matches legacy behavior. + raise GraphInterrupt(result["__interrupt__"]) + verdict = interrupt(result["__interrupt__"]) + result = await _ainvoke_with_retry( + agent_executor, Command(resume=verdict), config=inner_cfg, + ) + + return result + + async def _ainvoke_with_retry(executor, input_, *, max_attempts: int = 3, - base_delay: float = 1.5): + base_delay: float = 1.5, + config: dict | None = None): """Wrap a LangGraph agent invocation with retry on transient cloud errors. Retries on common Ollama Cloud / streaming hiccups (500, status -1, etc.). @@ -9261,6 +9641,14 @@ async def _ainvoke_with_retry(executor, input_, *, max_attempts: int = 3, # itself (AutoStrategy → ToolStrategy fallback for non- # function-calling Ollama models). The default langgraph # recursion bound is now a true upper bound, not a workaround. + # + # ``config`` (default ``None``) carries the per-thread + # ``configurable.thread_id`` for callers that need a + # checkpointer-scoped invocation (HITL resume path — + # see :func:`_drive_agent_with_resume`). Legacy callers + # pass nothing and the executor's default thread is used. + if config is not None: + return await executor.ainvoke(input_, config=config) return await executor.ainvoke(input_) except GraphInterrupt: # Phase 11 (FOC-04 / D-11-04): never retry a HITL pause. @@ -9593,6 +9981,7 @@ def make_agent_node( injected_args: dict[str, str] | None = None, gate_policy: "GatePolicy | None" = None, event_log: "EventLog | None" = None, + checkpointer: Any = None, ) -> Callable[[GraphState], Awaitable[dict]]: """Factory: build a LangGraph node that runs a ReAct agent and decides a route. @@ -9624,10 +10013,23 @@ def make_agent_node( """ async def node(state: GraphState) -> dict: - incident = state["session"] # pyright: ignore[reportTypedDictNotRequiredAccess] — orchestrator runtime always supplies session - inc_id = incident.id + state_session = state["session"] # pyright: ignore[reportTypedDictNotRequiredAccess] — orchestrator runtime always supplies session + inc_id = state_session.id started_at = datetime.now(timezone.utc).strftime(_UTC_TS_FMT) + # Always reload from the persistent store at entry. Outer Pregel + # checkpoints at step boundaries, not mid-node — so on a HITL + # resume, ``state["session"]`` reflects the prior step's output + # (e.g. deep_investigator's), not the gateway's pending_approval + # row + version bump that happened mid-step in the original run. + # If we trust ``state["session"]`` here, the gateway sees no + # pending row, appends a duplicate, then ``store.save`` raises + # ``StaleVersionError`` because DB has already moved on. + try: + incident = store.load(inc_id) + except FileNotFoundError: + incident = state_session + # M3 (per-step telemetry): emit agent_started. if event_log is not None: try: @@ -9719,25 +10121,43 @@ def _run(**kwargs: Any) -> Any: ] else: run_tools = visible_tools - # Phase 10 (FOC-03 / D-10-02) + Phase 15 (LLM-COMPAT-01): every - # responsive agent invocation is wrapped in an AgentTurnOutput - # envelope. ``langchain.agents.create_agent`` (the non-deprecated - # successor to ``langgraph.prebuilt.create_react_agent``) accepts a - # bare schema as ``response_format`` and, by default, wraps it in - # ``AutoStrategy`` — ProviderStrategy for models with native - # structured-output (OpenAI-class), falling back to ToolStrategy - # otherwise (Ollama). ToolStrategy injects AgentTurnOutput as a - # callable tool: when the LLM ``calls`` it, the loop terminates on - # the same turn with ``result["structured_response"]`` populated. - # Eliminates the old two-call structure (loop + separate - # ``with_structured_output`` pass) that hit recursion_limit=25 on - # Ollama models without true function-calling. + # Phase 22 (D-22-01): markdown-primary turn output. Drop + # ``response_format`` from create_agent — the agent loop now + # terminates on the natural React END signal (LLM emits an + # AIMessage with no tool calls). The framework parses that + # final message body as markdown via Path 4 in + # parse_envelope_from_result. Skill prompts (D-22-04) instruct + # every reply to end with ## Response / ## Confidence / ## + # Signal sections; the parser extracts them. + # + # Why: forcing LLMs through a JSON-schema-shaped output via + # ``response_format`` triggered a class of brittleness across + # providers (model-specific JSON drift, tool-strategy + React + # END interaction, recursion_limit ceilings). Markdown is the + # native format every chat model writes well; the parse step + # happens in the framework, where leniency is in our control. + # The inner agent gets a checkpointer so a HITL pause inside + # ``interrupt()`` can be resumed via ``Command(resume=verdict)`` + # on the SAME inner thread. langgraph 1.x semantics require the + # checkpointer here — without it ``Command(resume=...)`` raises + # ``RuntimeError: Cannot use Command(resume=...) without + # checkpointer``. The thread id is derived deterministically + # from session + agent + the upcoming agent_run index so it is: + # * STABLE across the inner pause and the outer resume that + # follows (both observe the same ``len(incident.agents_run)`` + # because no new run is recorded mid-pause), and + # * UNIQUE per agent invocation so previous invocations of the + # same agent within the same session don't bleed in. agent_executor = create_agent( model=llm, tools=run_tools, system_prompt=skill.system_prompt, - response_format=AgentTurnOutput, + checkpointer=checkpointer, + ) + inner_thread_id = ( + f"{inc_id}:agent:{skill.name}:turn{len(incident.agents_run)}" ) + inner_cfg = {"configurable": {"thread_id": inner_thread_id}} # Phase 11 (FOC-04): reset per-turn confidence hint. The hint # is updated below after _harvest_tool_calls_and_patches; on @@ -9749,9 +10169,15 @@ def _run(**kwargs: Any) -> Any: pass try: - result = await _ainvoke_with_retry( - agent_executor, - {"messages": [HumanMessage(content=_format_agent_input(incident))]}, + result = await _drive_agent_with_resume( + agent_executor=agent_executor, + inner_cfg=inner_cfg, + inner_has_checkpointer=checkpointer is not None, + initial_input={ + "messages": [ + HumanMessage(content=_format_agent_input(incident)) + ] + }, ) except GraphInterrupt: # Phase 11 (FOC-04 / D-11-04): HITL pause is NOT an error. @@ -9930,12 +10356,17 @@ def _decide_from_signal(inc: Session) -> str: _DEFAULT_STUB_CANNED: dict[str, str] = { - # Back-compat defaults for the four canonical agents. Any YAML-defined - # agent can override or extend this via ``skill.stub_response``; new agents - # without an entry here fall through to StubChatModel's generic placeholder. + # Plain-text bodies; ``StubChatModel._generate`` wraps these in the + # D-22-03 markdown envelope using the per-agent stub_envelope_* fields + # (the framework parses the wrapped output via Path 4). Apps can + # override or extend via ``skill.stub_response``; new agents without + # an entry here fall through to ``StubChatModel``'s generic placeholder. "intake": "Created INC, no prior matches. Routing to triage.", "triage": "Severity medium, category latency. No recent deploys correlate.", - "deep_investigator": "Hypothesis: upstream payments timeout. Evidence: log line 'upstream_timeout target=payments'.", + "deep_investigator": ( + "Hypothesis: upstream payments timeout. " + "Evidence: log line upstream_timeout target=payments." + ), "resolution": "Proposed fix: restart api service. Auto-applied. INC resolved.", } @@ -10098,7 +10529,8 @@ async def gate(state: GraphState) -> dict: def _build_agent_nodes(*, cfg: AppConfig, skills: dict, store: SessionStore, registry: ToolRegistry, - event_log: "EventLog | None" = None) -> dict: + event_log: "EventLog | None" = None, + checkpointer: Any = None) -> dict: """Materialize agent nodes from skills + registry. Reused by main + resume graphs. Dispatches on ``skill.kind``: @@ -10176,6 +10608,7 @@ def _build_agent_nodes(*, cfg: AppConfig, skills: dict, store: SessionStore, injected_args=cfg.orchestrator.injected_args, gate_policy=gate_policy, event_log=event_log, + checkpointer=checkpointer, ) return nodes @@ -10281,7 +10714,8 @@ async def build_graph(*, cfg: AppConfig, skills: dict, store: SessionStore, sg = StateGraph(GraphState) nodes = _build_agent_nodes(cfg=cfg, skills=skills, store=store, - registry=registry, event_log=event_log) + registry=registry, event_log=event_log, + checkpointer=checkpointer) for agent_name, node in nodes.items(): sg.add_node(agent_name, node) sg.add_node("gate", make_gate_node( @@ -14055,6 +14489,29 @@ async def _finalize_session_status_async( async with self._locks.acquire(session_id): return self._finalize_session_status(session_id) + async def _is_graph_paused(self, session_id: str) -> bool: + """Return True iff the compiled graph has a pending step waiting + to resume (i.e. it's paused at an ``interrupt()`` boundary). + + langgraph 1.x surfaces a HITL pause via the result dict's + ``__interrupt__`` field rather than raising + :class:`GraphInterrupt`. After ``astream_events`` (or + ``ainvoke``) returns, the only way to tell "the run paused" + from "the run completed" is to query the checkpointed graph + state — a non-empty ``next`` tuple means the graph has steps + queued to run when resumed. The stream/retry call sites use + this to skip ``_finalize_session_status_async`` on a pause — + finalizing on a pause would stamp ``default_terminal_status`` + on a session that's actually waiting on operator approval, and + the orphaned ``pending_approval`` ToolCall row written by the + gateway would never get its resume. + """ + try: + state = await self.graph.aget_state(self._thread_config(session_id)) + except Exception: # noqa: BLE001 — defensive; missing checkpoint == not paused + return False + return bool(getattr(state, "next", ()) or ()) + def _thread_config(self, incident_id: str) -> dict: """Build the LangGraph ``config`` dict for a per-session thread. @@ -14264,10 +14721,18 @@ async def stream_session(self, *, query: str, environment: str, config=self._thread_config(inc.id), ): yield self._to_ui_event(ev, inc.id) - new_status = await self._finalize_session_status_async(inc.id) - if new_status: - yield {"event": "status_auto_finalized", "incident_id": inc.id, - "status": new_status, "ts": _event_ts()} + # Skip finalize when the graph paused on an interrupt — the + # session is waiting for operator approval, not done. Stamping + # default_terminal_status here would orphan the pending_approval + # ToolCall row written by the gateway just before the pause. + if await self._is_graph_paused(inc.id): + yield {"event": "session_paused", "incident_id": inc.id, + "ts": _event_ts()} + else: + new_status = await self._finalize_session_status_async(inc.id) + if new_status: + yield {"event": "status_auto_finalized", "incident_id": inc.id, + "status": new_status, "ts": _event_ts()} yield {"event": "investigation_completed", "incident_id": inc.id, "ts": _event_ts()} async def stream_investigation(self, *, query: str, environment: str, @@ -14540,10 +15005,17 @@ async def _retry_session_locked(self, session_id: str) -> AsyncIterator[dict]: config=self._thread_config(session_id), ): yield self._to_ui_event(ev, session_id) - new_status = await self._finalize_session_status_async(session_id) - if new_status: - yield {"event": "status_auto_finalized", "incident_id": session_id, - "status": new_status, "ts": _event_ts()} + # See ``stream_session`` for why pause-detection guards the + # finalize call: a HITL pause must not be coerced into a + # terminal status. + if await self._is_graph_paused(session_id): + yield {"event": "session_paused", "incident_id": session_id, + "ts": _event_ts()} + else: + new_status = await self._finalize_session_status_async(session_id) + if new_status: + yield {"event": "status_auto_finalized", "incident_id": session_id, + "status": new_status, "ts": _event_ts()} yield {"event": "retry_completed", "incident_id": session_id, "ts": _event_ts()} @@ -15236,6 +15708,16 @@ async def _resume() -> None: Command(resume=decision_payload), config=orch._thread_config(session_id), ) + # Finalize after the verdict-driven run completes so + # the session row reflects the terminal status (or + # falls through to ``default_terminal_status``). Skip + # if a fresh interrupt re-paused the graph — the + # session is again awaiting operator input. Mirrors + # the same guard in the UI's + # ``_submit_approval_via_service`` and the + # ``stream_session`` finalize call site. + if not await orch._is_graph_paused(session_id): + await orch._finalize_session_status_async(session_id) # Submit the resume onto the long-lived service loop so we # don't fight the lifespan thread for the same FastMCP/SQLite diff --git a/dist/apps/code-review.py b/dist/apps/code-review.py index d9e1c39..57c3a5c 100644 --- a/dist/apps/code-review.py +++ b/dist/apps/code-review.py @@ -550,7 +550,7 @@ class IncidentState(Session): -from pydantic import BaseModel, ConfigDict, Field +from pydantic import BaseModel, ConfigDict, Field, ValidationError # ----- imports for runtime/tools/gateway.py ----- """Risk-rated tool gateway: pure resolver + ``BaseTool`` HITL wrapper. @@ -713,7 +713,7 @@ class IncidentState(Session): """ -from typing import TYPE_CHECKING, Callable +from typing import TYPE_CHECKING, Any, Callable from langchain_core.messages import HumanMessage from langchain.agents import create_agent @@ -994,7 +994,6 @@ class IncidentState(Session): """ -from typing import TYPE_CHECKING, Any, Callable from fastapi import APIRouter, Depends, Header, HTTPException, Request, status from pydantic import ValidationError @@ -1180,6 +1179,7 @@ async def _poll(self, registry): """ +from pydantic import BaseModel, ConfigDict, Field # ----- imports for runtime/memory/knowledge_graph.py ----- @@ -3184,17 +3184,13 @@ class StubChatModel(BaseChatModel): ``stub_envelope_confidence`` / ``stub_envelope_rationale`` / ``stub_envelope_signal``. - Phase 15 (LLM-COMPAT-01): ``langchain.agents.create_agent`` with - ``response_format=AgentTurnOutput`` (via ``AutoStrategy`` -> - ``ToolStrategy`` for non-native-structured-output models, including - this stub) injects ``AgentTurnOutput`` as a CALLABLE TOOL. The - agent loop only terminates when the LLM emits a tool call NAMED - ``AgentTurnOutput``. ``bind_tools`` records that envelope-tool name - so ``_generate`` can auto-emit a closing tool call after any - user-configured ``tool_call_plan`` is exhausted -- preserving the - pre-Phase-15 stub semantics (canned text + optional pre-scripted - tool calls) while satisfying the new tool-loop termination - contract. + Phase 22 (D-22-05): the loop terminates on natural React END now + that ``response_format=AgentTurnOutput`` is gone — tests drive + markdown-shaped canned text (the framework parses it via Path 4 + in :func:`runtime.agents.turn_output.parse_envelope_from_result`). + The Phase 15 envelope-as-callable-tool auto-emit is removed; the + stub is back to its pre-Phase-15 simple semantics (canned text + + optional pre-scripted ``tool_call_plan``). """ role: str = "default" canned_responses: dict[str, str] = Field(default_factory=dict) @@ -3203,12 +3199,6 @@ class StubChatModel(BaseChatModel): stub_envelope_rationale: str = "stub envelope rationale" stub_envelope_signal: str | None = None _called_once: bool = False - # Phase 15 (LLM-COMPAT-01): set by ``bind_tools`` when - # ``langchain.agents.create_agent`` injects a structured-output tool - # for ``AgentTurnOutput``. Holds the bare tool name (e.g. - # ``"AgentTurnOutput"``) so ``_generate`` can emit a final - # envelope-shaped tool call to close the agent loop. - _envelope_tool_name: str | None = None @property def _llm_type(self) -> str: @@ -3216,33 +3206,37 @@ def _llm_type(self) -> str: def _generate(self, messages: list[BaseMessage], stop: list[str] | None = None, run_manager: Any = None, **kwargs: Any) -> ChatResult: - text = self.canned_responses.get(self.role, f"[stub:{self.role}] no canned response") + # Phase 22 (D-22-05): tool-call rounds emit the scripted tool + # calls with a placeholder content body; the closing turn (no + # remaining tool_call_plan entries) emits the canned text + # wrapped in the D-22-03 markdown contract that + # parse_markdown_envelope reads. The stub_envelope_* + # parameters drive the trailing-section bodies so tests can + # exercise specific confidence / rationale / signal paths + # without authoring markdown by hand. tool_calls: list[dict] = [] if self.tool_call_plan and not self._called_once: for tc in self.tool_call_plan: - tool_calls.append({"name": tc["name"], "args": tc.get("args", {}), "id": str(uuid4())}) + tool_calls.append( + {"name": tc["name"], "args": tc.get("args", {}), "id": str(uuid4())} + ) self._called_once = True - elif self._envelope_tool_name is not None: - # Phase 15 (LLM-COMPAT-01): the tool_call_plan is exhausted - # (or wasn't configured) AND ``langchain.agents.create_agent`` - # has bound the AgentTurnOutput envelope as a tool. Emit a - # closing tool call so the loop terminates with a populated - # ``structured_response``. The args mirror the - # ``with_structured_output`` path's envelope construction so - # tests see the same confidence / rationale / signal regardless - # of whether the new tool-strategy or the legacy structured- - # output path is in play. - tool_calls.append({ - "name": self._envelope_tool_name, - "args": { - "content": text or ".", - "confidence": self.stub_envelope_confidence, - "confidence_rationale": self.stub_envelope_rationale, - "signal": self.stub_envelope_signal, - }, - "id": str(uuid4()), - }) - msg = AIMessage(content=text, tool_calls=tool_calls) + + body = self.canned_responses.get( + self.role, f"[stub:{self.role}] no canned response", + ) + # Phase 22 (D-22-05): explicit "none" when no signal so the + # parser maps to None — preserves pre-Phase-22 behaviour where a + # stub with no signal yielded envelope.signal=None. + signal_str = self.stub_envelope_signal if self.stub_envelope_signal is not None else "none" + md = ( + f"{body}\n\n" + f"## Response\n{body}\n\n" + f"## Confidence\n{self.stub_envelope_confidence:.4f} -- " + f"{self.stub_envelope_rationale}\n\n" + f"## Signal\n{signal_str}\n" + ) + msg = AIMessage(content=md, tool_calls=tool_calls) return ChatResult(generations=[ChatGeneration(message=msg)]) async def _agenerate(self, messages: list[BaseMessage], stop: list[str] | None = None, @@ -3250,30 +3244,15 @@ async def _agenerate(self, messages: list[BaseMessage], stop: list[str] | None = return self._generate(messages, stop, run_manager, **kwargs) def bind_tools(self, tools, *, tool_choice=None, **kwargs): - """Record the AgentTurnOutput envelope-tool name when present. - - Phase 15 (LLM-COMPAT-01): ``langchain.agents.create_agent`` with - ``response_format=AgentTurnOutput`` calls ``bind_tools(...)`` - with the user's tools PLUS the envelope-as-a-tool. We scan the - list for the AgentTurnOutput-shaped tool (matched by ``__name__`` - on Pydantic schemas, ``name`` on ``BaseTool`` instances, or the - ``"name"`` key on dict-shaped tool specs) and remember it on the - instance so ``_generate`` can close the agent loop with a - synthetic envelope tool call after any pre-scripted - ``tool_call_plan`` is exhausted. Tools bound by the framework - itself (real BaseTools the agent should call) flow through - unchanged -- the stub still emits them only via - ``tool_call_plan``. + """Phase 22 (D-22-05): no-op tool binding. + + ``langchain.agents.create_agent`` calls ``bind_tools`` to wire + the agent's BaseTool list onto the chat model. The stub does + not actually invoke those tools (test fixtures script tool + calls via ``tool_call_plan`` instead), so we just return self + unchanged. The pre-Phase-22 envelope-tool detection is gone + because the framework no longer asks for ``response_format``. """ - for t in tools or []: - name = ( - getattr(t, "__name__", None) - or getattr(t, "name", None) - or (isinstance(t, dict) and t.get("name")) - ) - if isinstance(name, str) and name == "AgentTurnOutput": - self._envelope_tool_name = name - break return self # ``BaseChatModel.with_structured_output`` returns ``Runnable[..., dict | BaseModel]`` @@ -6654,6 +6633,127 @@ def __init__(self, *, agent: str, field: str, message: str | None = None): super().__init__(message or f"envelope_missing: {field} (agent={agent})") +_HEADER_SPLIT = re.compile(r"^#{2,}\s+(\w+)\s*$", re.MULTILINE) +_CONF_LINE = re.compile( + # Leftmost float (allows int form), optional rationale after em-dash / + # ASCII dash / hyphen separator. ``re.DOTALL`` so a multi-line rationale + # is captured wholesale. + r"^\s*(-?[0-9]*\.?[0-9]+)\s*(?:[\-\u2010\u2011\u2012\u2013\u2014\u2015]+\s*(.*))?$", + re.DOTALL, +) + + +def _clamp_unit(x: float) -> float: + """Clamp a confidence float into [0, 1] without raising. The skill + prompt asks for [0, 1]; an LLM occasionally emits ``1.05`` or + ``-0.1`` — clamp rather than reject so the parse step is forgiving.""" + if x < 0.0: + return 0.0 + if x > 1.0: + return 1.0 + return x + + +def parse_markdown_envelope( + content: str, *, agent: str = "", +) -> "AgentTurnOutput": + """D-22-03 — parse the trailing markdown contract block into an + :class:`AgentTurnOutput`. + + Expects three sections ``## Response`` / ``## Confidence`` / + ``## Signal`` (case-insensitive headers, ``##+`` accepted) at the + end of the LLM's reply. The free-text body of ``## Response`` + becomes ``content``; the float on the ``## Confidence`` line + becomes ``confidence`` (clamped to [0, 1]); the rest of that line + after ``-`` / ``--`` / ``\u2014`` becomes ``confidence_rationale``; + the ``## Signal`` body becomes ``signal`` (with ``none``/``null``/ + blank coerced to ``None``). + + Raises :class:`EnvelopeMissingError` only when the parse cannot + produce a valid envelope at all — missing ``## Confidence`` line, + unparseable confidence value, or empty ``## Response`` body. + """ + if not isinstance(content, str) or not content.strip(): + raise EnvelopeMissingError( + agent=agent, field="content", + message=f"envelope_missing: empty content (agent={agent!r})", + ) + + # ``re.split`` with the header capture-group yields: + # [pre, h1, body1, h2, body2, ...] + # Pre is the prose before the first heading; we ignore it for parse. + parts = _HEADER_SPLIT.split(content) + sections: dict[str, str] = {} + if len(parts) >= 3: + for i in range(1, len(parts) - 1, 2): + key = parts[i].strip().lower() + body = parts[i + 1].strip() + # Don't overwrite an earlier section with a later same-named + # one (unusual but possible in pathological prompts). + sections.setdefault(key, body) + + response_body = sections.get("response", "").strip() + raw_conf = sections.get("confidence", "").strip() + + if not raw_conf: + raise EnvelopeMissingError( + agent=agent, field="confidence", + message=( + f"envelope_missing: confidence section absent or empty " + f"(agent={agent!r})" + ), + ) + + m = _CONF_LINE.match(raw_conf) + if not m: + raise EnvelopeMissingError( + agent=agent, field="confidence", + message=( + f"envelope_missing: confidence section did not parse " + f"(agent={agent!r}, raw={raw_conf!r})" + ), + ) + try: + conf_value = _clamp_unit(float(m.group(1))) + except (TypeError, ValueError) as exc: + raise EnvelopeMissingError( + agent=agent, field="confidence", + message=( + f"envelope_missing: confidence value not a float " + f"(agent={agent!r}, raw={raw_conf!r})" + ), + ) from exc + rationale = (m.group(2) or "").strip() or "(no rationale provided)" + + signal_raw = sections.get("signal", "").strip().lower() or None + if signal_raw in {"none", "null", "", "n/a"}: + signal_raw = None + + if not response_body: + raise EnvelopeMissingError( + agent=agent, field="content", + message=( + f"envelope_missing: response section empty (agent={agent!r})" + ), + ) + + try: + return AgentTurnOutput( + content=response_body, + confidence=conf_value, + confidence_rationale=rationale, + signal=signal_raw, + ) + except ValidationError as exc: + raise EnvelopeMissingError( + agent=agent, field="validation", + message=( + f"envelope_missing: pydantic validation rejected the " + f"parsed values (agent={agent!r}): {exc}" + ), + ) from exc + + def parse_envelope_from_result( result: dict, *, @@ -6711,13 +6811,162 @@ def parse_envelope_from_result( continue break - # Path 3: fail loudly + # Path 4 (D-22-01): markdown-primary parse on the last AIMessage's + # content. Producers that emit the new ``## Response / ## Confidence / + # ## Signal`` section block land here when Paths 1+2 yield nothing. + for msg in reversed(messages): + if msg.__class__.__name__ != "AIMessage": + continue + content = getattr(msg, "content", None) + if not isinstance(content, str) or not content.strip(): + continue + try: + return parse_markdown_envelope(content, agent=agent) + except EnvelopeMissingError: + # This AIMessage didnt carry a parseable markdown contract. + # Continue scanning earlier messages on the off-chance a + # nested loop emitted the contract one step back. + continue + break + + # Path 5 (D-22-fallback): some models (gpt-oss:20b in particular) + # treat a terminal-tool call as the completion and emit an empty + # closing AIMessage. Apps configure typed-terminal tools with + # ``confidence`` + ``confidence_rationale`` args by contract — + # synthesise an envelope from those args when the markdown path + # yielded nothing. Skip if any earlier path succeeded. + for msg in reversed(messages): + tcs = getattr(msg, "tool_calls", None) + if not tcs: + continue + for tc in tcs: + args = tc.get("args") or {} + if not isinstance(args, dict): + continue + conf = args.get("confidence") + rat = args.get("confidence_rationale") or args.get("rationale") + if conf is None or not rat: + continue + try: + conf_f = float(conf) + except (TypeError, ValueError): + continue + content_body = ( + args.get("resolution_summary") + or args.get("reason") + or args.get("summary") + or f"terminal tool {tc.get('name', '')} invoked" + ) + if not isinstance(content_body, str) or not content_body.strip(): + content_body = f"terminal tool {tc.get('name', '')} invoked" + try: + env = AgentTurnOutput( + content=content_body, + confidence=max(0.0, min(1.0, conf_f)), + confidence_rationale=str(rat), + signal=args.get("signal"), + ) + except ValidationError: + continue + _LOG.info( + "envelope_synthesized_from_tool_args agent=%s tool=%s " + "confidence=%.2f", + agent, tc.get("name"), env.confidence, + ) + return env + + # Path 6 (D-22-fallback-permissive): the model called at least one + # tool but never emitted the markdown contract or a typed-terminal + # tool with confidence args. Rather than hard-failing the run, + # synthesise a minimal envelope so the session reaches a terminal + # status (typically default_terminal_status -> needs_review) and + # the operator can review what happened. Confidence is intentionally + # low (0.30) so any HITL gate fires. + invoked_tool_names: list[str] = [] + for msg in messages: + for tc in (getattr(msg, "tool_calls", None) or []): + n = tc.get("name") + if n: + invoked_tool_names.append(n) + if invoked_tool_names: + body = ( + f"Agent {agent} invoked " + f"{', '.join(invoked_tool_names)} but did not emit a final " + f"summary. Review the tool-call audit for outcome." + ) + try: + env = AgentTurnOutput( + content=body, + confidence=0.30, + confidence_rationale=( + "synthesized: agent invoked tools but emitted no " + "closing message" + ), + signal=None, + ) + _LOG.warning( + "envelope_synthesized_permissive agent=%s tools=%s", + agent, invoked_tool_names, + ) + return env + except ValidationError: + pass + + # Path 7 (terminal): fail loudly. None of the six paths produced a + # valid envelope — this is a real prompt-drift signal worth + # surfacing as a structured agent_run error. Log the last + # AIMessage content (capped) so operators can diagnose whether + # the LLM emitted the wrong shape vs nothing at all. + last_ai_content = "" + for msg in reversed(messages): + if msg.__class__.__name__ == "AIMessage": + c = getattr(msg, "content", None) + if isinstance(c, str): + last_ai_content = c + break + # Also dump every tool_call across all AIMessages so we can see + # whether the model called any tool with confidence args (Path 5 + # would have fired) or whether it called a non-terminal tool only. + tool_call_summary = [] + for msg in messages: + for tc in (getattr(msg, "tool_calls", None) or []): + args = tc.get("args") or {} + tool_call_summary.append({ + "name": tc.get("name"), + "arg_keys": sorted(args.keys()) if isinstance(args, dict) else "?", + "has_confidence": isinstance(args, dict) and "confidence" in args, + "has_rationale": isinstance(args, dict) and ( + "confidence_rationale" in args or "rationale" in args + ), + }) + # Detailed per-message dump so we can see exactly what shape the + # conversation took when the parse failed. + msg_summary = [] + for m in messages: + kind = m.__class__.__name__ + c = getattr(m, "content", None) + c_len = len(c) if isinstance(c, str) else ( + f"list[{len(c)}]" if isinstance(c, list) else type(c).__name__ + ) + tcs = [tc.get("name") for tc in (getattr(m, "tool_calls", None) or [])] + msg_summary.append({"kind": kind, "content_len": c_len, "tool_calls": tcs}) + _LOG.warning( + "envelope_missing: agent=%s last_ai_message_content=%r " + "structured_response_type=%s message_count=%d tool_calls=%r " + "msg_trace=%r", + agent, + last_ai_content[:1500] if last_ai_content else "", + type(result.get("structured_response")).__name__, + len(messages), + tool_call_summary, + msg_summary, + ) raise EnvelopeMissingError( agent=agent, field="structured_response", message=( - f"envelope_missing: no structured_response or JSON-decodable " - f"AIMessage envelope found (agent={agent})" + f"envelope_missing: no structured_response, JSON-decodable, or " + f"markdown-decodable AIMessage envelope found (agent={agent})" ), ) @@ -6764,6 +7013,7 @@ def reconcile_confidence( "AgentTurnOutput", "EnvelopeMissingError", "parse_envelope_from_result", + "parse_markdown_envelope", "reconcile_confidence", ] @@ -7230,6 +7480,11 @@ def _run(self, *args: Any, **kwargs: Any) -> Any: # noqa: D401 approved_at=_now_iso(), approval_rationale=rationale, ) + # Persist the status transition. Without this, + # the DB row stays at ``pending_approval`` and + # the UI keeps offering the buttons forever. + if store is not None: + store.save(session) rejected_result = {"rejected": True, "rationale": rationale} _emit_invoked( status="rejected", risk="high", @@ -7256,6 +7511,8 @@ def _run(self, *args: Any, **kwargs: Any) -> Any: # noqa: D401 approved_at=_now_iso(), approval_rationale=rationale, ) + if store is not None: + store.save(session) timeout_result = {"timeout": True, "rationale": rationale} _emit_invoked( status="timeout", risk="high", @@ -7278,6 +7535,8 @@ def _run(self, *args: Any, **kwargs: Any) -> Any: # noqa: D401 approved_at=_now_iso(), approval_rationale=rationale, ) + if store is not None: + store.save(session) _emit_invoked( status="approved", risk="high", args_dict=pending_args, result=result, @@ -7408,6 +7667,11 @@ async def _arun(self, *args: Any, **kwargs: Any) -> Any: # noqa: D401 approved_at=_now_iso(), approval_rationale=rationale, ) + # Persist the status transition (mirror of the + # sync path) so the DB row reflects the actual + # outcome instead of staying at pending_approval. + if store is not None: + store.save(session) rejected_result = {"rejected": True, "rationale": rationale} _emit_invoked( status="rejected", risk="high", @@ -7429,6 +7693,8 @@ async def _arun(self, *args: Any, **kwargs: Any) -> Any: # noqa: D401 approved_at=_now_iso(), approval_rationale=rationale, ) + if store is not None: + store.save(session) timeout_result = {"timeout": True, "rationale": rationale} _emit_invoked( status="timeout", risk="high", @@ -7450,6 +7716,8 @@ async def _arun(self, *args: Any, **kwargs: Any) -> Any: # noqa: D401 approved_at=_now_iso(), approval_rationale=rationale, ) + if store is not None: + store.save(session) _emit_invoked( status="approved", risk="high", args_dict=pending_args, result=result, @@ -8239,6 +8507,7 @@ def make_agent_node( patch_tool_names: frozenset[str] = frozenset(), gate_policy: "GatePolicy | None" = None, event_log: "EventLog | None" = None, + checkpointer: Any = None, ): """Factory: build a LangGraph node that runs a ReAct agent and decides a route. @@ -8261,9 +8530,18 @@ def make_agent_node( async def node(state: GraphState) -> dict: - incident: Session = state["session"] # pyright: ignore[reportTypedDictNotRequiredAccess] - inc_id = incident.id + state_session: Session = state["session"] # pyright: ignore[reportTypedDictNotRequiredAccess] + inc_id = state_session.id started_at = datetime.now(timezone.utc).strftime(_UTC_TS_FMT) + # Always reload from store at entry — outer Pregel checkpoints + # at step boundaries, not mid-node, so ``state["session"]`` is + # stale relative to DB when a HITL gate paused mid-step. See + # the same reload comment in ``runtime.graph.make_agent_node`` + # for the full rationale. + try: + incident: Session = store.load(inc_id) + except FileNotFoundError: + incident = state_session # M3: emit agent_started telemetry before any work happens. if event_log is not None: @@ -8289,25 +8567,25 @@ async def node(state: GraphState) -> dict: ] else: run_tools = tools - # Phase 10 (FOC-03 / D-10-02) + Phase 15 (LLM-COMPAT-01): every - # responsive agent invocation is wrapped in an AgentTurnOutput - # envelope. ``langchain.agents.create_agent`` (the non-deprecated - # successor to ``langgraph.prebuilt.create_react_agent``) accepts a - # bare schema as ``response_format`` and, by default, wraps it in - # ``AutoStrategy`` — ProviderStrategy for models with native - # structured-output (OpenAI-class), falling back to ToolStrategy - # otherwise (Ollama). ToolStrategy injects AgentTurnOutput as a - # callable tool: when the LLM ``calls`` it, the loop terminates on - # the same turn with ``result["structured_response"]`` populated. - # Eliminates the old two-call structure (loop + separate - # ``with_structured_output`` pass) that hit recursion_limit=25 on - # Ollama models without true function-calling. + # Phase 22 (D-22-01): markdown-primary turn output. Same + # change as graph.py:make_agent_node — drop response_format + # so the loop terminates on natural React END, then parse the + # final AIMessage via Path 4 in parse_envelope_from_result. + # The inner agent gets the orchestrator's checkpointer so a + # HITL pause inside ``interrupt()`` can be resumed via + # ``Command(resume=verdict)`` on a stable per-invocation + # thread id (see ``_drive_agent_with_resume`` in + # ``runtime.graph`` for the full rationale). agent_executor = create_agent( model=llm, tools=run_tools, system_prompt=skill.system_prompt, - response_format=AgentTurnOutput, + checkpointer=checkpointer, ) + inner_thread_id = ( + f"{inc_id}:agent:{skill.name}:turn{len(incident.agents_run)}" + ) + inner_cfg = {"configurable": {"thread_id": inner_thread_id}} # Phase 11 (FOC-04): reset per-turn confidence hint at the # start of each agent step so the gateway treats the first @@ -8318,9 +8596,15 @@ async def node(state: GraphState) -> dict: pass try: - result = await _ainvoke_with_retry( - agent_executor, - {"messages": [HumanMessage(content=_format_agent_input(incident))]}, + result = await _drive_agent_with_resume( + agent_executor=agent_executor, + inner_cfg=inner_cfg, + inner_has_checkpointer=checkpointer is not None, + initial_input={ + "messages": [ + HumanMessage(content=_format_agent_input(incident)) + ] + }, ) except GraphInterrupt: # Phase 11 (FOC-04 / D-11-04): HITL pause -- propagate up. @@ -9295,8 +9579,104 @@ def finish(self, *, summary: str) -> None: ) +async def _drive_agent_with_resume( + *, + agent_executor, + inner_cfg: dict, + inner_has_checkpointer: bool, + initial_input: dict, +): + """Run an inner ``create_agent`` executor, surfacing HITL pauses to + the outer Pregel graph and forwarding resume verdicts back to the + inner agent. + + langgraph 1.x changed the ``interrupt()`` contract: a tool that + calls :func:`langgraph.types.interrupt` no longer raises + :class:`GraphInterrupt` to the caller. Instead, the agent's + ``ainvoke`` returns normally with the pause captured in + ``result["__interrupt__"]``. To make HITL approval actually work + end-to-end the framework must: + + 1. **First call** — invoke the agent with the initial messages. + If the result carries ``__interrupt__``, raise + :class:`GraphInterrupt` so the *outer* Pregel pauses too. The + outer pause is what surfaces ``__interrupt__`` to the + orchestrator and lets the UI render Approve / Reject. + + 2. **Resume call** — when the outer is resumed via + ``Command(resume=verdict)``, this helper is re-entered. The + inner's checkpointer remembers the pause; we detect that via + ``aget_state(...).next`` being non-empty and call + :func:`langgraph.types.interrupt` at the *outer* level to fetch + the verdict, then forward it to the inner via + ``Command(resume=verdict)``. This is the only path that + actually re-runs the gated tool with the verdict — calling + ``inner.ainvoke({"messages": [...]})`` on a paused thread + silently skips the tool. + + A while loop covers the case where a single agent turn pauses + multiple times (e.g. two high-risk tool calls in sequence). + + When ``inner_has_checkpointer`` is False (legacy callers and most + unit tests that build agents without a checkpointer), the helper + falls back to the simple "invoke + re-raise on ``__interrupt__``" + contract. That path can pause the outer graph but the resume + verdict cannot reach the inner tool — which is acceptable because + legacy callers without a checkpointer never had a working HITL + resume path to begin with. + """ + # Local imports keep callers that never touch HITL out of the + # langgraph.types import cost path. ``Command`` and ``interrupt`` + # are tiny but the explicit lazy import documents the dependency. + from langgraph.types import Command, interrupt + + # Resume-detection branch: only meaningful when the inner agent + # has its own checkpointer (otherwise ``aget_state`` on a fresh + # in-memory pregel raises). The detection asks: "does the inner + # have a paused step waiting?". A non-empty ``next`` means yes. + if inner_has_checkpointer: + try: + inner_state = await agent_executor.aget_state(inner_cfg) + inner_paused = bool(getattr(inner_state, "next", ()) or ()) + except Exception: # noqa: BLE001 — defensive; missing checkpoint == fresh + inner_paused = False + else: + inner_paused = False + + if inner_paused: + # Outer is being resumed; pull the verdict via the outer's own + # interrupt() and forward it. The payload here is informational — + # the only thing the outer cares about is the resume value. + verdict = interrupt({"resume_inner_agent": inner_cfg["configurable"]["thread_id"]}) + result = await _ainvoke_with_retry( + agent_executor, Command(resume=verdict), config=inner_cfg, + ) + else: + result = await _ainvoke_with_retry( + agent_executor, initial_input, config=inner_cfg, + ) + + # Multi-pause loop: if the inner pauses again (or resumes into a + # second gated tool call), surface each pause to the outer the + # same way until the inner is fully done. + while isinstance(result, dict) and result.get("__interrupt__"): + if not inner_has_checkpointer: + # No checkpointer => no Command(resume=...) path is + # available; surface the pause to the outer Pregel and + # stop. Resume cannot deliver the verdict to the gated + # tool, but that matches legacy behavior. + raise GraphInterrupt(result["__interrupt__"]) + verdict = interrupt(result["__interrupt__"]) + result = await _ainvoke_with_retry( + agent_executor, Command(resume=verdict), config=inner_cfg, + ) + + return result + + async def _ainvoke_with_retry(executor, input_, *, max_attempts: int = 3, - base_delay: float = 1.5): + base_delay: float = 1.5, + config: dict | None = None): """Wrap a LangGraph agent invocation with retry on transient cloud errors. Retries on common Ollama Cloud / streaming hiccups (500, status -1, etc.). @@ -9314,6 +9694,14 @@ async def _ainvoke_with_retry(executor, input_, *, max_attempts: int = 3, # itself (AutoStrategy → ToolStrategy fallback for non- # function-calling Ollama models). The default langgraph # recursion bound is now a true upper bound, not a workaround. + # + # ``config`` (default ``None``) carries the per-thread + # ``configurable.thread_id`` for callers that need a + # checkpointer-scoped invocation (HITL resume path — + # see :func:`_drive_agent_with_resume`). Legacy callers + # pass nothing and the executor's default thread is used. + if config is not None: + return await executor.ainvoke(input_, config=config) return await executor.ainvoke(input_) except GraphInterrupt: # Phase 11 (FOC-04 / D-11-04): never retry a HITL pause. @@ -9646,6 +10034,7 @@ def make_agent_node( injected_args: dict[str, str] | None = None, gate_policy: "GatePolicy | None" = None, event_log: "EventLog | None" = None, + checkpointer: Any = None, ) -> Callable[[GraphState], Awaitable[dict]]: """Factory: build a LangGraph node that runs a ReAct agent and decides a route. @@ -9677,10 +10066,23 @@ def make_agent_node( """ async def node(state: GraphState) -> dict: - incident = state["session"] # pyright: ignore[reportTypedDictNotRequiredAccess] — orchestrator runtime always supplies session - inc_id = incident.id + state_session = state["session"] # pyright: ignore[reportTypedDictNotRequiredAccess] — orchestrator runtime always supplies session + inc_id = state_session.id started_at = datetime.now(timezone.utc).strftime(_UTC_TS_FMT) + # Always reload from the persistent store at entry. Outer Pregel + # checkpoints at step boundaries, not mid-node — so on a HITL + # resume, ``state["session"]`` reflects the prior step's output + # (e.g. deep_investigator's), not the gateway's pending_approval + # row + version bump that happened mid-step in the original run. + # If we trust ``state["session"]`` here, the gateway sees no + # pending row, appends a duplicate, then ``store.save`` raises + # ``StaleVersionError`` because DB has already moved on. + try: + incident = store.load(inc_id) + except FileNotFoundError: + incident = state_session + # M3 (per-step telemetry): emit agent_started. if event_log is not None: try: @@ -9772,25 +10174,43 @@ def _run(**kwargs: Any) -> Any: ] else: run_tools = visible_tools - # Phase 10 (FOC-03 / D-10-02) + Phase 15 (LLM-COMPAT-01): every - # responsive agent invocation is wrapped in an AgentTurnOutput - # envelope. ``langchain.agents.create_agent`` (the non-deprecated - # successor to ``langgraph.prebuilt.create_react_agent``) accepts a - # bare schema as ``response_format`` and, by default, wraps it in - # ``AutoStrategy`` — ProviderStrategy for models with native - # structured-output (OpenAI-class), falling back to ToolStrategy - # otherwise (Ollama). ToolStrategy injects AgentTurnOutput as a - # callable tool: when the LLM ``calls`` it, the loop terminates on - # the same turn with ``result["structured_response"]`` populated. - # Eliminates the old two-call structure (loop + separate - # ``with_structured_output`` pass) that hit recursion_limit=25 on - # Ollama models without true function-calling. + # Phase 22 (D-22-01): markdown-primary turn output. Drop + # ``response_format`` from create_agent — the agent loop now + # terminates on the natural React END signal (LLM emits an + # AIMessage with no tool calls). The framework parses that + # final message body as markdown via Path 4 in + # parse_envelope_from_result. Skill prompts (D-22-04) instruct + # every reply to end with ## Response / ## Confidence / ## + # Signal sections; the parser extracts them. + # + # Why: forcing LLMs through a JSON-schema-shaped output via + # ``response_format`` triggered a class of brittleness across + # providers (model-specific JSON drift, tool-strategy + React + # END interaction, recursion_limit ceilings). Markdown is the + # native format every chat model writes well; the parse step + # happens in the framework, where leniency is in our control. + # The inner agent gets a checkpointer so a HITL pause inside + # ``interrupt()`` can be resumed via ``Command(resume=verdict)`` + # on the SAME inner thread. langgraph 1.x semantics require the + # checkpointer here — without it ``Command(resume=...)`` raises + # ``RuntimeError: Cannot use Command(resume=...) without + # checkpointer``. The thread id is derived deterministically + # from session + agent + the upcoming agent_run index so it is: + # * STABLE across the inner pause and the outer resume that + # follows (both observe the same ``len(incident.agents_run)`` + # because no new run is recorded mid-pause), and + # * UNIQUE per agent invocation so previous invocations of the + # same agent within the same session don't bleed in. agent_executor = create_agent( model=llm, tools=run_tools, system_prompt=skill.system_prompt, - response_format=AgentTurnOutput, + checkpointer=checkpointer, ) + inner_thread_id = ( + f"{inc_id}:agent:{skill.name}:turn{len(incident.agents_run)}" + ) + inner_cfg = {"configurable": {"thread_id": inner_thread_id}} # Phase 11 (FOC-04): reset per-turn confidence hint. The hint # is updated below after _harvest_tool_calls_and_patches; on @@ -9802,9 +10222,15 @@ def _run(**kwargs: Any) -> Any: pass try: - result = await _ainvoke_with_retry( - agent_executor, - {"messages": [HumanMessage(content=_format_agent_input(incident))]}, + result = await _drive_agent_with_resume( + agent_executor=agent_executor, + inner_cfg=inner_cfg, + inner_has_checkpointer=checkpointer is not None, + initial_input={ + "messages": [ + HumanMessage(content=_format_agent_input(incident)) + ] + }, ) except GraphInterrupt: # Phase 11 (FOC-04 / D-11-04): HITL pause is NOT an error. @@ -9983,12 +10409,17 @@ def _decide_from_signal(inc: Session) -> str: _DEFAULT_STUB_CANNED: dict[str, str] = { - # Back-compat defaults for the four canonical agents. Any YAML-defined - # agent can override or extend this via ``skill.stub_response``; new agents - # without an entry here fall through to StubChatModel's generic placeholder. + # Plain-text bodies; ``StubChatModel._generate`` wraps these in the + # D-22-03 markdown envelope using the per-agent stub_envelope_* fields + # (the framework parses the wrapped output via Path 4). Apps can + # override or extend via ``skill.stub_response``; new agents without + # an entry here fall through to ``StubChatModel``'s generic placeholder. "intake": "Created INC, no prior matches. Routing to triage.", "triage": "Severity medium, category latency. No recent deploys correlate.", - "deep_investigator": "Hypothesis: upstream payments timeout. Evidence: log line 'upstream_timeout target=payments'.", + "deep_investigator": ( + "Hypothesis: upstream payments timeout. " + "Evidence: log line upstream_timeout target=payments." + ), "resolution": "Proposed fix: restart api service. Auto-applied. INC resolved.", } @@ -10151,7 +10582,8 @@ async def gate(state: GraphState) -> dict: def _build_agent_nodes(*, cfg: AppConfig, skills: dict, store: SessionStore, registry: ToolRegistry, - event_log: "EventLog | None" = None) -> dict: + event_log: "EventLog | None" = None, + checkpointer: Any = None) -> dict: """Materialize agent nodes from skills + registry. Reused by main + resume graphs. Dispatches on ``skill.kind``: @@ -10229,6 +10661,7 @@ def _build_agent_nodes(*, cfg: AppConfig, skills: dict, store: SessionStore, injected_args=cfg.orchestrator.injected_args, gate_policy=gate_policy, event_log=event_log, + checkpointer=checkpointer, ) return nodes @@ -10334,7 +10767,8 @@ async def build_graph(*, cfg: AppConfig, skills: dict, store: SessionStore, sg = StateGraph(GraphState) nodes = _build_agent_nodes(cfg=cfg, skills=skills, store=store, - registry=registry, event_log=event_log) + registry=registry, event_log=event_log, + checkpointer=checkpointer) for agent_name, node in nodes.items(): sg.add_node(agent_name, node) sg.add_node("gate", make_gate_node( @@ -14108,6 +14542,29 @@ async def _finalize_session_status_async( async with self._locks.acquire(session_id): return self._finalize_session_status(session_id) + async def _is_graph_paused(self, session_id: str) -> bool: + """Return True iff the compiled graph has a pending step waiting + to resume (i.e. it's paused at an ``interrupt()`` boundary). + + langgraph 1.x surfaces a HITL pause via the result dict's + ``__interrupt__`` field rather than raising + :class:`GraphInterrupt`. After ``astream_events`` (or + ``ainvoke``) returns, the only way to tell "the run paused" + from "the run completed" is to query the checkpointed graph + state — a non-empty ``next`` tuple means the graph has steps + queued to run when resumed. The stream/retry call sites use + this to skip ``_finalize_session_status_async`` on a pause — + finalizing on a pause would stamp ``default_terminal_status`` + on a session that's actually waiting on operator approval, and + the orphaned ``pending_approval`` ToolCall row written by the + gateway would never get its resume. + """ + try: + state = await self.graph.aget_state(self._thread_config(session_id)) + except Exception: # noqa: BLE001 — defensive; missing checkpoint == not paused + return False + return bool(getattr(state, "next", ()) or ()) + def _thread_config(self, incident_id: str) -> dict: """Build the LangGraph ``config`` dict for a per-session thread. @@ -14317,10 +14774,18 @@ async def stream_session(self, *, query: str, environment: str, config=self._thread_config(inc.id), ): yield self._to_ui_event(ev, inc.id) - new_status = await self._finalize_session_status_async(inc.id) - if new_status: - yield {"event": "status_auto_finalized", "incident_id": inc.id, - "status": new_status, "ts": _event_ts()} + # Skip finalize when the graph paused on an interrupt — the + # session is waiting for operator approval, not done. Stamping + # default_terminal_status here would orphan the pending_approval + # ToolCall row written by the gateway just before the pause. + if await self._is_graph_paused(inc.id): + yield {"event": "session_paused", "incident_id": inc.id, + "ts": _event_ts()} + else: + new_status = await self._finalize_session_status_async(inc.id) + if new_status: + yield {"event": "status_auto_finalized", "incident_id": inc.id, + "status": new_status, "ts": _event_ts()} yield {"event": "investigation_completed", "incident_id": inc.id, "ts": _event_ts()} async def stream_investigation(self, *, query: str, environment: str, @@ -14593,10 +15058,17 @@ async def _retry_session_locked(self, session_id: str) -> AsyncIterator[dict]: config=self._thread_config(session_id), ): yield self._to_ui_event(ev, session_id) - new_status = await self._finalize_session_status_async(session_id) - if new_status: - yield {"event": "status_auto_finalized", "incident_id": session_id, - "status": new_status, "ts": _event_ts()} + # See ``stream_session`` for why pause-detection guards the + # finalize call: a HITL pause must not be coerced into a + # terminal status. + if await self._is_graph_paused(session_id): + yield {"event": "session_paused", "incident_id": session_id, + "ts": _event_ts()} + else: + new_status = await self._finalize_session_status_async(session_id) + if new_status: + yield {"event": "status_auto_finalized", "incident_id": session_id, + "status": new_status, "ts": _event_ts()} yield {"event": "retry_completed", "incident_id": session_id, "ts": _event_ts()} @@ -15289,6 +15761,16 @@ async def _resume() -> None: Command(resume=decision_payload), config=orch._thread_config(session_id), ) + # Finalize after the verdict-driven run completes so + # the session row reflects the terminal status (or + # falls through to ``default_terminal_status``). Skip + # if a fresh interrupt re-paused the graph — the + # session is again awaiting operator input. Mirrors + # the same guard in the UI's + # ``_submit_approval_via_service`` and the + # ``stream_session`` finalize call site. + if not await orch._is_graph_paused(session_id): + await orch._finalize_session_status_async(session_id) # Submit the resume onto the long-lived service loop so we # don't fight the lifespan thread for the same FastMCP/SQLite diff --git a/dist/apps/incident-management.py b/dist/apps/incident-management.py index 9e3a3e7..e2ef5f9 100644 --- a/dist/apps/incident-management.py +++ b/dist/apps/incident-management.py @@ -550,7 +550,7 @@ class IncidentState(Session): -from pydantic import BaseModel, ConfigDict, Field +from pydantic import BaseModel, ConfigDict, Field, ValidationError # ----- imports for runtime/tools/gateway.py ----- """Risk-rated tool gateway: pure resolver + ``BaseTool`` HITL wrapper. @@ -713,7 +713,7 @@ class IncidentState(Session): """ -from typing import TYPE_CHECKING, Callable +from typing import TYPE_CHECKING, Any, Callable from langchain_core.messages import HumanMessage from langchain.agents import create_agent @@ -994,7 +994,6 @@ class IncidentState(Session): """ -from typing import TYPE_CHECKING, Any, Callable from fastapi import APIRouter, Depends, Header, HTTPException, Request, status from pydantic import ValidationError @@ -1180,6 +1179,7 @@ async def _poll(self, registry): """ +from pydantic import BaseModel, ConfigDict, Field # ----- imports for runtime/memory/knowledge_graph.py ----- @@ -3196,17 +3196,13 @@ class StubChatModel(BaseChatModel): ``stub_envelope_confidence`` / ``stub_envelope_rationale`` / ``stub_envelope_signal``. - Phase 15 (LLM-COMPAT-01): ``langchain.agents.create_agent`` with - ``response_format=AgentTurnOutput`` (via ``AutoStrategy`` -> - ``ToolStrategy`` for non-native-structured-output models, including - this stub) injects ``AgentTurnOutput`` as a CALLABLE TOOL. The - agent loop only terminates when the LLM emits a tool call NAMED - ``AgentTurnOutput``. ``bind_tools`` records that envelope-tool name - so ``_generate`` can auto-emit a closing tool call after any - user-configured ``tool_call_plan`` is exhausted -- preserving the - pre-Phase-15 stub semantics (canned text + optional pre-scripted - tool calls) while satisfying the new tool-loop termination - contract. + Phase 22 (D-22-05): the loop terminates on natural React END now + that ``response_format=AgentTurnOutput`` is gone — tests drive + markdown-shaped canned text (the framework parses it via Path 4 + in :func:`runtime.agents.turn_output.parse_envelope_from_result`). + The Phase 15 envelope-as-callable-tool auto-emit is removed; the + stub is back to its pre-Phase-15 simple semantics (canned text + + optional pre-scripted ``tool_call_plan``). """ role: str = "default" canned_responses: dict[str, str] = Field(default_factory=dict) @@ -3215,12 +3211,6 @@ class StubChatModel(BaseChatModel): stub_envelope_rationale: str = "stub envelope rationale" stub_envelope_signal: str | None = None _called_once: bool = False - # Phase 15 (LLM-COMPAT-01): set by ``bind_tools`` when - # ``langchain.agents.create_agent`` injects a structured-output tool - # for ``AgentTurnOutput``. Holds the bare tool name (e.g. - # ``"AgentTurnOutput"``) so ``_generate`` can emit a final - # envelope-shaped tool call to close the agent loop. - _envelope_tool_name: str | None = None @property def _llm_type(self) -> str: @@ -3228,33 +3218,37 @@ def _llm_type(self) -> str: def _generate(self, messages: list[BaseMessage], stop: list[str] | None = None, run_manager: Any = None, **kwargs: Any) -> ChatResult: - text = self.canned_responses.get(self.role, f"[stub:{self.role}] no canned response") + # Phase 22 (D-22-05): tool-call rounds emit the scripted tool + # calls with a placeholder content body; the closing turn (no + # remaining tool_call_plan entries) emits the canned text + # wrapped in the D-22-03 markdown contract that + # parse_markdown_envelope reads. The stub_envelope_* + # parameters drive the trailing-section bodies so tests can + # exercise specific confidence / rationale / signal paths + # without authoring markdown by hand. tool_calls: list[dict] = [] if self.tool_call_plan and not self._called_once: for tc in self.tool_call_plan: - tool_calls.append({"name": tc["name"], "args": tc.get("args", {}), "id": str(uuid4())}) + tool_calls.append( + {"name": tc["name"], "args": tc.get("args", {}), "id": str(uuid4())} + ) self._called_once = True - elif self._envelope_tool_name is not None: - # Phase 15 (LLM-COMPAT-01): the tool_call_plan is exhausted - # (or wasn't configured) AND ``langchain.agents.create_agent`` - # has bound the AgentTurnOutput envelope as a tool. Emit a - # closing tool call so the loop terminates with a populated - # ``structured_response``. The args mirror the - # ``with_structured_output`` path's envelope construction so - # tests see the same confidence / rationale / signal regardless - # of whether the new tool-strategy or the legacy structured- - # output path is in play. - tool_calls.append({ - "name": self._envelope_tool_name, - "args": { - "content": text or ".", - "confidence": self.stub_envelope_confidence, - "confidence_rationale": self.stub_envelope_rationale, - "signal": self.stub_envelope_signal, - }, - "id": str(uuid4()), - }) - msg = AIMessage(content=text, tool_calls=tool_calls) + + body = self.canned_responses.get( + self.role, f"[stub:{self.role}] no canned response", + ) + # Phase 22 (D-22-05): explicit "none" when no signal so the + # parser maps to None — preserves pre-Phase-22 behaviour where a + # stub with no signal yielded envelope.signal=None. + signal_str = self.stub_envelope_signal if self.stub_envelope_signal is not None else "none" + md = ( + f"{body}\n\n" + f"## Response\n{body}\n\n" + f"## Confidence\n{self.stub_envelope_confidence:.4f} -- " + f"{self.stub_envelope_rationale}\n\n" + f"## Signal\n{signal_str}\n" + ) + msg = AIMessage(content=md, tool_calls=tool_calls) return ChatResult(generations=[ChatGeneration(message=msg)]) async def _agenerate(self, messages: list[BaseMessage], stop: list[str] | None = None, @@ -3262,30 +3256,15 @@ async def _agenerate(self, messages: list[BaseMessage], stop: list[str] | None = return self._generate(messages, stop, run_manager, **kwargs) def bind_tools(self, tools, *, tool_choice=None, **kwargs): - """Record the AgentTurnOutput envelope-tool name when present. - - Phase 15 (LLM-COMPAT-01): ``langchain.agents.create_agent`` with - ``response_format=AgentTurnOutput`` calls ``bind_tools(...)`` - with the user's tools PLUS the envelope-as-a-tool. We scan the - list for the AgentTurnOutput-shaped tool (matched by ``__name__`` - on Pydantic schemas, ``name`` on ``BaseTool`` instances, or the - ``"name"`` key on dict-shaped tool specs) and remember it on the - instance so ``_generate`` can close the agent loop with a - synthetic envelope tool call after any pre-scripted - ``tool_call_plan`` is exhausted. Tools bound by the framework - itself (real BaseTools the agent should call) flow through - unchanged -- the stub still emits them only via - ``tool_call_plan``. + """Phase 22 (D-22-05): no-op tool binding. + + ``langchain.agents.create_agent`` calls ``bind_tools`` to wire + the agent's BaseTool list onto the chat model. The stub does + not actually invoke those tools (test fixtures script tool + calls via ``tool_call_plan`` instead), so we just return self + unchanged. The pre-Phase-22 envelope-tool detection is gone + because the framework no longer asks for ``response_format``. """ - for t in tools or []: - name = ( - getattr(t, "__name__", None) - or getattr(t, "name", None) - or (isinstance(t, dict) and t.get("name")) - ) - if isinstance(name, str) and name == "AgentTurnOutput": - self._envelope_tool_name = name - break return self # ``BaseChatModel.with_structured_output`` returns ``Runnable[..., dict | BaseModel]`` @@ -6666,6 +6645,127 @@ def __init__(self, *, agent: str, field: str, message: str | None = None): super().__init__(message or f"envelope_missing: {field} (agent={agent})") +_HEADER_SPLIT = re.compile(r"^#{2,}\s+(\w+)\s*$", re.MULTILINE) +_CONF_LINE = re.compile( + # Leftmost float (allows int form), optional rationale after em-dash / + # ASCII dash / hyphen separator. ``re.DOTALL`` so a multi-line rationale + # is captured wholesale. + r"^\s*(-?[0-9]*\.?[0-9]+)\s*(?:[\-\u2010\u2011\u2012\u2013\u2014\u2015]+\s*(.*))?$", + re.DOTALL, +) + + +def _clamp_unit(x: float) -> float: + """Clamp a confidence float into [0, 1] without raising. The skill + prompt asks for [0, 1]; an LLM occasionally emits ``1.05`` or + ``-0.1`` — clamp rather than reject so the parse step is forgiving.""" + if x < 0.0: + return 0.0 + if x > 1.0: + return 1.0 + return x + + +def parse_markdown_envelope( + content: str, *, agent: str = "", +) -> "AgentTurnOutput": + """D-22-03 — parse the trailing markdown contract block into an + :class:`AgentTurnOutput`. + + Expects three sections ``## Response`` / ``## Confidence`` / + ``## Signal`` (case-insensitive headers, ``##+`` accepted) at the + end of the LLM's reply. The free-text body of ``## Response`` + becomes ``content``; the float on the ``## Confidence`` line + becomes ``confidence`` (clamped to [0, 1]); the rest of that line + after ``-`` / ``--`` / ``\u2014`` becomes ``confidence_rationale``; + the ``## Signal`` body becomes ``signal`` (with ``none``/``null``/ + blank coerced to ``None``). + + Raises :class:`EnvelopeMissingError` only when the parse cannot + produce a valid envelope at all — missing ``## Confidence`` line, + unparseable confidence value, or empty ``## Response`` body. + """ + if not isinstance(content, str) or not content.strip(): + raise EnvelopeMissingError( + agent=agent, field="content", + message=f"envelope_missing: empty content (agent={agent!r})", + ) + + # ``re.split`` with the header capture-group yields: + # [pre, h1, body1, h2, body2, ...] + # Pre is the prose before the first heading; we ignore it for parse. + parts = _HEADER_SPLIT.split(content) + sections: dict[str, str] = {} + if len(parts) >= 3: + for i in range(1, len(parts) - 1, 2): + key = parts[i].strip().lower() + body = parts[i + 1].strip() + # Don't overwrite an earlier section with a later same-named + # one (unusual but possible in pathological prompts). + sections.setdefault(key, body) + + response_body = sections.get("response", "").strip() + raw_conf = sections.get("confidence", "").strip() + + if not raw_conf: + raise EnvelopeMissingError( + agent=agent, field="confidence", + message=( + f"envelope_missing: confidence section absent or empty " + f"(agent={agent!r})" + ), + ) + + m = _CONF_LINE.match(raw_conf) + if not m: + raise EnvelopeMissingError( + agent=agent, field="confidence", + message=( + f"envelope_missing: confidence section did not parse " + f"(agent={agent!r}, raw={raw_conf!r})" + ), + ) + try: + conf_value = _clamp_unit(float(m.group(1))) + except (TypeError, ValueError) as exc: + raise EnvelopeMissingError( + agent=agent, field="confidence", + message=( + f"envelope_missing: confidence value not a float " + f"(agent={agent!r}, raw={raw_conf!r})" + ), + ) from exc + rationale = (m.group(2) or "").strip() or "(no rationale provided)" + + signal_raw = sections.get("signal", "").strip().lower() or None + if signal_raw in {"none", "null", "", "n/a"}: + signal_raw = None + + if not response_body: + raise EnvelopeMissingError( + agent=agent, field="content", + message=( + f"envelope_missing: response section empty (agent={agent!r})" + ), + ) + + try: + return AgentTurnOutput( + content=response_body, + confidence=conf_value, + confidence_rationale=rationale, + signal=signal_raw, + ) + except ValidationError as exc: + raise EnvelopeMissingError( + agent=agent, field="validation", + message=( + f"envelope_missing: pydantic validation rejected the " + f"parsed values (agent={agent!r}): {exc}" + ), + ) from exc + + def parse_envelope_from_result( result: dict, *, @@ -6723,13 +6823,162 @@ def parse_envelope_from_result( continue break - # Path 3: fail loudly + # Path 4 (D-22-01): markdown-primary parse on the last AIMessage's + # content. Producers that emit the new ``## Response / ## Confidence / + # ## Signal`` section block land here when Paths 1+2 yield nothing. + for msg in reversed(messages): + if msg.__class__.__name__ != "AIMessage": + continue + content = getattr(msg, "content", None) + if not isinstance(content, str) or not content.strip(): + continue + try: + return parse_markdown_envelope(content, agent=agent) + except EnvelopeMissingError: + # This AIMessage didnt carry a parseable markdown contract. + # Continue scanning earlier messages on the off-chance a + # nested loop emitted the contract one step back. + continue + break + + # Path 5 (D-22-fallback): some models (gpt-oss:20b in particular) + # treat a terminal-tool call as the completion and emit an empty + # closing AIMessage. Apps configure typed-terminal tools with + # ``confidence`` + ``confidence_rationale`` args by contract — + # synthesise an envelope from those args when the markdown path + # yielded nothing. Skip if any earlier path succeeded. + for msg in reversed(messages): + tcs = getattr(msg, "tool_calls", None) + if not tcs: + continue + for tc in tcs: + args = tc.get("args") or {} + if not isinstance(args, dict): + continue + conf = args.get("confidence") + rat = args.get("confidence_rationale") or args.get("rationale") + if conf is None or not rat: + continue + try: + conf_f = float(conf) + except (TypeError, ValueError): + continue + content_body = ( + args.get("resolution_summary") + or args.get("reason") + or args.get("summary") + or f"terminal tool {tc.get('name', '')} invoked" + ) + if not isinstance(content_body, str) or not content_body.strip(): + content_body = f"terminal tool {tc.get('name', '')} invoked" + try: + env = AgentTurnOutput( + content=content_body, + confidence=max(0.0, min(1.0, conf_f)), + confidence_rationale=str(rat), + signal=args.get("signal"), + ) + except ValidationError: + continue + _LOG.info( + "envelope_synthesized_from_tool_args agent=%s tool=%s " + "confidence=%.2f", + agent, tc.get("name"), env.confidence, + ) + return env + + # Path 6 (D-22-fallback-permissive): the model called at least one + # tool but never emitted the markdown contract or a typed-terminal + # tool with confidence args. Rather than hard-failing the run, + # synthesise a minimal envelope so the session reaches a terminal + # status (typically default_terminal_status -> needs_review) and + # the operator can review what happened. Confidence is intentionally + # low (0.30) so any HITL gate fires. + invoked_tool_names: list[str] = [] + for msg in messages: + for tc in (getattr(msg, "tool_calls", None) or []): + n = tc.get("name") + if n: + invoked_tool_names.append(n) + if invoked_tool_names: + body = ( + f"Agent {agent} invoked " + f"{', '.join(invoked_tool_names)} but did not emit a final " + f"summary. Review the tool-call audit for outcome." + ) + try: + env = AgentTurnOutput( + content=body, + confidence=0.30, + confidence_rationale=( + "synthesized: agent invoked tools but emitted no " + "closing message" + ), + signal=None, + ) + _LOG.warning( + "envelope_synthesized_permissive agent=%s tools=%s", + agent, invoked_tool_names, + ) + return env + except ValidationError: + pass + + # Path 7 (terminal): fail loudly. None of the six paths produced a + # valid envelope — this is a real prompt-drift signal worth + # surfacing as a structured agent_run error. Log the last + # AIMessage content (capped) so operators can diagnose whether + # the LLM emitted the wrong shape vs nothing at all. + last_ai_content = "" + for msg in reversed(messages): + if msg.__class__.__name__ == "AIMessage": + c = getattr(msg, "content", None) + if isinstance(c, str): + last_ai_content = c + break + # Also dump every tool_call across all AIMessages so we can see + # whether the model called any tool with confidence args (Path 5 + # would have fired) or whether it called a non-terminal tool only. + tool_call_summary = [] + for msg in messages: + for tc in (getattr(msg, "tool_calls", None) or []): + args = tc.get("args") or {} + tool_call_summary.append({ + "name": tc.get("name"), + "arg_keys": sorted(args.keys()) if isinstance(args, dict) else "?", + "has_confidence": isinstance(args, dict) and "confidence" in args, + "has_rationale": isinstance(args, dict) and ( + "confidence_rationale" in args or "rationale" in args + ), + }) + # Detailed per-message dump so we can see exactly what shape the + # conversation took when the parse failed. + msg_summary = [] + for m in messages: + kind = m.__class__.__name__ + c = getattr(m, "content", None) + c_len = len(c) if isinstance(c, str) else ( + f"list[{len(c)}]" if isinstance(c, list) else type(c).__name__ + ) + tcs = [tc.get("name") for tc in (getattr(m, "tool_calls", None) or [])] + msg_summary.append({"kind": kind, "content_len": c_len, "tool_calls": tcs}) + _LOG.warning( + "envelope_missing: agent=%s last_ai_message_content=%r " + "structured_response_type=%s message_count=%d tool_calls=%r " + "msg_trace=%r", + agent, + last_ai_content[:1500] if last_ai_content else "", + type(result.get("structured_response")).__name__, + len(messages), + tool_call_summary, + msg_summary, + ) raise EnvelopeMissingError( agent=agent, field="structured_response", message=( - f"envelope_missing: no structured_response or JSON-decodable " - f"AIMessage envelope found (agent={agent})" + f"envelope_missing: no structured_response, JSON-decodable, or " + f"markdown-decodable AIMessage envelope found (agent={agent})" ), ) @@ -6776,6 +7025,7 @@ def reconcile_confidence( "AgentTurnOutput", "EnvelopeMissingError", "parse_envelope_from_result", + "parse_markdown_envelope", "reconcile_confidence", ] @@ -7242,6 +7492,11 @@ def _run(self, *args: Any, **kwargs: Any) -> Any: # noqa: D401 approved_at=_now_iso(), approval_rationale=rationale, ) + # Persist the status transition. Without this, + # the DB row stays at ``pending_approval`` and + # the UI keeps offering the buttons forever. + if store is not None: + store.save(session) rejected_result = {"rejected": True, "rationale": rationale} _emit_invoked( status="rejected", risk="high", @@ -7268,6 +7523,8 @@ def _run(self, *args: Any, **kwargs: Any) -> Any: # noqa: D401 approved_at=_now_iso(), approval_rationale=rationale, ) + if store is not None: + store.save(session) timeout_result = {"timeout": True, "rationale": rationale} _emit_invoked( status="timeout", risk="high", @@ -7290,6 +7547,8 @@ def _run(self, *args: Any, **kwargs: Any) -> Any: # noqa: D401 approved_at=_now_iso(), approval_rationale=rationale, ) + if store is not None: + store.save(session) _emit_invoked( status="approved", risk="high", args_dict=pending_args, result=result, @@ -7420,6 +7679,11 @@ async def _arun(self, *args: Any, **kwargs: Any) -> Any: # noqa: D401 approved_at=_now_iso(), approval_rationale=rationale, ) + # Persist the status transition (mirror of the + # sync path) so the DB row reflects the actual + # outcome instead of staying at pending_approval. + if store is not None: + store.save(session) rejected_result = {"rejected": True, "rationale": rationale} _emit_invoked( status="rejected", risk="high", @@ -7441,6 +7705,8 @@ async def _arun(self, *args: Any, **kwargs: Any) -> Any: # noqa: D401 approved_at=_now_iso(), approval_rationale=rationale, ) + if store is not None: + store.save(session) timeout_result = {"timeout": True, "rationale": rationale} _emit_invoked( status="timeout", risk="high", @@ -7462,6 +7728,8 @@ async def _arun(self, *args: Any, **kwargs: Any) -> Any: # noqa: D401 approved_at=_now_iso(), approval_rationale=rationale, ) + if store is not None: + store.save(session) _emit_invoked( status="approved", risk="high", args_dict=pending_args, result=result, @@ -8251,6 +8519,7 @@ def make_agent_node( patch_tool_names: frozenset[str] = frozenset(), gate_policy: "GatePolicy | None" = None, event_log: "EventLog | None" = None, + checkpointer: Any = None, ): """Factory: build a LangGraph node that runs a ReAct agent and decides a route. @@ -8273,9 +8542,18 @@ def make_agent_node( async def node(state: GraphState) -> dict: - incident: Session = state["session"] # pyright: ignore[reportTypedDictNotRequiredAccess] - inc_id = incident.id + state_session: Session = state["session"] # pyright: ignore[reportTypedDictNotRequiredAccess] + inc_id = state_session.id started_at = datetime.now(timezone.utc).strftime(_UTC_TS_FMT) + # Always reload from store at entry — outer Pregel checkpoints + # at step boundaries, not mid-node, so ``state["session"]`` is + # stale relative to DB when a HITL gate paused mid-step. See + # the same reload comment in ``runtime.graph.make_agent_node`` + # for the full rationale. + try: + incident: Session = store.load(inc_id) + except FileNotFoundError: + incident = state_session # M3: emit agent_started telemetry before any work happens. if event_log is not None: @@ -8301,25 +8579,25 @@ async def node(state: GraphState) -> dict: ] else: run_tools = tools - # Phase 10 (FOC-03 / D-10-02) + Phase 15 (LLM-COMPAT-01): every - # responsive agent invocation is wrapped in an AgentTurnOutput - # envelope. ``langchain.agents.create_agent`` (the non-deprecated - # successor to ``langgraph.prebuilt.create_react_agent``) accepts a - # bare schema as ``response_format`` and, by default, wraps it in - # ``AutoStrategy`` — ProviderStrategy for models with native - # structured-output (OpenAI-class), falling back to ToolStrategy - # otherwise (Ollama). ToolStrategy injects AgentTurnOutput as a - # callable tool: when the LLM ``calls`` it, the loop terminates on - # the same turn with ``result["structured_response"]`` populated. - # Eliminates the old two-call structure (loop + separate - # ``with_structured_output`` pass) that hit recursion_limit=25 on - # Ollama models without true function-calling. + # Phase 22 (D-22-01): markdown-primary turn output. Same + # change as graph.py:make_agent_node — drop response_format + # so the loop terminates on natural React END, then parse the + # final AIMessage via Path 4 in parse_envelope_from_result. + # The inner agent gets the orchestrator's checkpointer so a + # HITL pause inside ``interrupt()`` can be resumed via + # ``Command(resume=verdict)`` on a stable per-invocation + # thread id (see ``_drive_agent_with_resume`` in + # ``runtime.graph`` for the full rationale). agent_executor = create_agent( model=llm, tools=run_tools, system_prompt=skill.system_prompt, - response_format=AgentTurnOutput, + checkpointer=checkpointer, + ) + inner_thread_id = ( + f"{inc_id}:agent:{skill.name}:turn{len(incident.agents_run)}" ) + inner_cfg = {"configurable": {"thread_id": inner_thread_id}} # Phase 11 (FOC-04): reset per-turn confidence hint at the # start of each agent step so the gateway treats the first @@ -8330,9 +8608,15 @@ async def node(state: GraphState) -> dict: pass try: - result = await _ainvoke_with_retry( - agent_executor, - {"messages": [HumanMessage(content=_format_agent_input(incident))]}, + result = await _drive_agent_with_resume( + agent_executor=agent_executor, + inner_cfg=inner_cfg, + inner_has_checkpointer=checkpointer is not None, + initial_input={ + "messages": [ + HumanMessage(content=_format_agent_input(incident)) + ] + }, ) except GraphInterrupt: # Phase 11 (FOC-04 / D-11-04): HITL pause -- propagate up. @@ -9307,8 +9591,104 @@ def finish(self, *, summary: str) -> None: ) +async def _drive_agent_with_resume( + *, + agent_executor, + inner_cfg: dict, + inner_has_checkpointer: bool, + initial_input: dict, +): + """Run an inner ``create_agent`` executor, surfacing HITL pauses to + the outer Pregel graph and forwarding resume verdicts back to the + inner agent. + + langgraph 1.x changed the ``interrupt()`` contract: a tool that + calls :func:`langgraph.types.interrupt` no longer raises + :class:`GraphInterrupt` to the caller. Instead, the agent's + ``ainvoke`` returns normally with the pause captured in + ``result["__interrupt__"]``. To make HITL approval actually work + end-to-end the framework must: + + 1. **First call** — invoke the agent with the initial messages. + If the result carries ``__interrupt__``, raise + :class:`GraphInterrupt` so the *outer* Pregel pauses too. The + outer pause is what surfaces ``__interrupt__`` to the + orchestrator and lets the UI render Approve / Reject. + + 2. **Resume call** — when the outer is resumed via + ``Command(resume=verdict)``, this helper is re-entered. The + inner's checkpointer remembers the pause; we detect that via + ``aget_state(...).next`` being non-empty and call + :func:`langgraph.types.interrupt` at the *outer* level to fetch + the verdict, then forward it to the inner via + ``Command(resume=verdict)``. This is the only path that + actually re-runs the gated tool with the verdict — calling + ``inner.ainvoke({"messages": [...]})`` on a paused thread + silently skips the tool. + + A while loop covers the case where a single agent turn pauses + multiple times (e.g. two high-risk tool calls in sequence). + + When ``inner_has_checkpointer`` is False (legacy callers and most + unit tests that build agents without a checkpointer), the helper + falls back to the simple "invoke + re-raise on ``__interrupt__``" + contract. That path can pause the outer graph but the resume + verdict cannot reach the inner tool — which is acceptable because + legacy callers without a checkpointer never had a working HITL + resume path to begin with. + """ + # Local imports keep callers that never touch HITL out of the + # langgraph.types import cost path. ``Command`` and ``interrupt`` + # are tiny but the explicit lazy import documents the dependency. + from langgraph.types import Command, interrupt + + # Resume-detection branch: only meaningful when the inner agent + # has its own checkpointer (otherwise ``aget_state`` on a fresh + # in-memory pregel raises). The detection asks: "does the inner + # have a paused step waiting?". A non-empty ``next`` means yes. + if inner_has_checkpointer: + try: + inner_state = await agent_executor.aget_state(inner_cfg) + inner_paused = bool(getattr(inner_state, "next", ()) or ()) + except Exception: # noqa: BLE001 — defensive; missing checkpoint == fresh + inner_paused = False + else: + inner_paused = False + + if inner_paused: + # Outer is being resumed; pull the verdict via the outer's own + # interrupt() and forward it. The payload here is informational — + # the only thing the outer cares about is the resume value. + verdict = interrupt({"resume_inner_agent": inner_cfg["configurable"]["thread_id"]}) + result = await _ainvoke_with_retry( + agent_executor, Command(resume=verdict), config=inner_cfg, + ) + else: + result = await _ainvoke_with_retry( + agent_executor, initial_input, config=inner_cfg, + ) + + # Multi-pause loop: if the inner pauses again (or resumes into a + # second gated tool call), surface each pause to the outer the + # same way until the inner is fully done. + while isinstance(result, dict) and result.get("__interrupt__"): + if not inner_has_checkpointer: + # No checkpointer => no Command(resume=...) path is + # available; surface the pause to the outer Pregel and + # stop. Resume cannot deliver the verdict to the gated + # tool, but that matches legacy behavior. + raise GraphInterrupt(result["__interrupt__"]) + verdict = interrupt(result["__interrupt__"]) + result = await _ainvoke_with_retry( + agent_executor, Command(resume=verdict), config=inner_cfg, + ) + + return result + + async def _ainvoke_with_retry(executor, input_, *, max_attempts: int = 3, - base_delay: float = 1.5): + base_delay: float = 1.5, + config: dict | None = None): """Wrap a LangGraph agent invocation with retry on transient cloud errors. Retries on common Ollama Cloud / streaming hiccups (500, status -1, etc.). @@ -9326,6 +9706,14 @@ async def _ainvoke_with_retry(executor, input_, *, max_attempts: int = 3, # itself (AutoStrategy → ToolStrategy fallback for non- # function-calling Ollama models). The default langgraph # recursion bound is now a true upper bound, not a workaround. + # + # ``config`` (default ``None``) carries the per-thread + # ``configurable.thread_id`` for callers that need a + # checkpointer-scoped invocation (HITL resume path — + # see :func:`_drive_agent_with_resume`). Legacy callers + # pass nothing and the executor's default thread is used. + if config is not None: + return await executor.ainvoke(input_, config=config) return await executor.ainvoke(input_) except GraphInterrupt: # Phase 11 (FOC-04 / D-11-04): never retry a HITL pause. @@ -9658,6 +10046,7 @@ def make_agent_node( injected_args: dict[str, str] | None = None, gate_policy: "GatePolicy | None" = None, event_log: "EventLog | None" = None, + checkpointer: Any = None, ) -> Callable[[GraphState], Awaitable[dict]]: """Factory: build a LangGraph node that runs a ReAct agent and decides a route. @@ -9689,10 +10078,23 @@ def make_agent_node( """ async def node(state: GraphState) -> dict: - incident = state["session"] # pyright: ignore[reportTypedDictNotRequiredAccess] — orchestrator runtime always supplies session - inc_id = incident.id + state_session = state["session"] # pyright: ignore[reportTypedDictNotRequiredAccess] — orchestrator runtime always supplies session + inc_id = state_session.id started_at = datetime.now(timezone.utc).strftime(_UTC_TS_FMT) + # Always reload from the persistent store at entry. Outer Pregel + # checkpoints at step boundaries, not mid-node — so on a HITL + # resume, ``state["session"]`` reflects the prior step's output + # (e.g. deep_investigator's), not the gateway's pending_approval + # row + version bump that happened mid-step in the original run. + # If we trust ``state["session"]`` here, the gateway sees no + # pending row, appends a duplicate, then ``store.save`` raises + # ``StaleVersionError`` because DB has already moved on. + try: + incident = store.load(inc_id) + except FileNotFoundError: + incident = state_session + # M3 (per-step telemetry): emit agent_started. if event_log is not None: try: @@ -9784,25 +10186,43 @@ def _run(**kwargs: Any) -> Any: ] else: run_tools = visible_tools - # Phase 10 (FOC-03 / D-10-02) + Phase 15 (LLM-COMPAT-01): every - # responsive agent invocation is wrapped in an AgentTurnOutput - # envelope. ``langchain.agents.create_agent`` (the non-deprecated - # successor to ``langgraph.prebuilt.create_react_agent``) accepts a - # bare schema as ``response_format`` and, by default, wraps it in - # ``AutoStrategy`` — ProviderStrategy for models with native - # structured-output (OpenAI-class), falling back to ToolStrategy - # otherwise (Ollama). ToolStrategy injects AgentTurnOutput as a - # callable tool: when the LLM ``calls`` it, the loop terminates on - # the same turn with ``result["structured_response"]`` populated. - # Eliminates the old two-call structure (loop + separate - # ``with_structured_output`` pass) that hit recursion_limit=25 on - # Ollama models without true function-calling. + # Phase 22 (D-22-01): markdown-primary turn output. Drop + # ``response_format`` from create_agent — the agent loop now + # terminates on the natural React END signal (LLM emits an + # AIMessage with no tool calls). The framework parses that + # final message body as markdown via Path 4 in + # parse_envelope_from_result. Skill prompts (D-22-04) instruct + # every reply to end with ## Response / ## Confidence / ## + # Signal sections; the parser extracts them. + # + # Why: forcing LLMs through a JSON-schema-shaped output via + # ``response_format`` triggered a class of brittleness across + # providers (model-specific JSON drift, tool-strategy + React + # END interaction, recursion_limit ceilings). Markdown is the + # native format every chat model writes well; the parse step + # happens in the framework, where leniency is in our control. + # The inner agent gets a checkpointer so a HITL pause inside + # ``interrupt()`` can be resumed via ``Command(resume=verdict)`` + # on the SAME inner thread. langgraph 1.x semantics require the + # checkpointer here — without it ``Command(resume=...)`` raises + # ``RuntimeError: Cannot use Command(resume=...) without + # checkpointer``. The thread id is derived deterministically + # from session + agent + the upcoming agent_run index so it is: + # * STABLE across the inner pause and the outer resume that + # follows (both observe the same ``len(incident.agents_run)`` + # because no new run is recorded mid-pause), and + # * UNIQUE per agent invocation so previous invocations of the + # same agent within the same session don't bleed in. agent_executor = create_agent( model=llm, tools=run_tools, system_prompt=skill.system_prompt, - response_format=AgentTurnOutput, + checkpointer=checkpointer, ) + inner_thread_id = ( + f"{inc_id}:agent:{skill.name}:turn{len(incident.agents_run)}" + ) + inner_cfg = {"configurable": {"thread_id": inner_thread_id}} # Phase 11 (FOC-04): reset per-turn confidence hint. The hint # is updated below after _harvest_tool_calls_and_patches; on @@ -9814,9 +10234,15 @@ def _run(**kwargs: Any) -> Any: pass try: - result = await _ainvoke_with_retry( - agent_executor, - {"messages": [HumanMessage(content=_format_agent_input(incident))]}, + result = await _drive_agent_with_resume( + agent_executor=agent_executor, + inner_cfg=inner_cfg, + inner_has_checkpointer=checkpointer is not None, + initial_input={ + "messages": [ + HumanMessage(content=_format_agent_input(incident)) + ] + }, ) except GraphInterrupt: # Phase 11 (FOC-04 / D-11-04): HITL pause is NOT an error. @@ -9995,12 +10421,17 @@ def _decide_from_signal(inc: Session) -> str: _DEFAULT_STUB_CANNED: dict[str, str] = { - # Back-compat defaults for the four canonical agents. Any YAML-defined - # agent can override or extend this via ``skill.stub_response``; new agents - # without an entry here fall through to StubChatModel's generic placeholder. + # Plain-text bodies; ``StubChatModel._generate`` wraps these in the + # D-22-03 markdown envelope using the per-agent stub_envelope_* fields + # (the framework parses the wrapped output via Path 4). Apps can + # override or extend via ``skill.stub_response``; new agents without + # an entry here fall through to ``StubChatModel``'s generic placeholder. "intake": "Created INC, no prior matches. Routing to triage.", "triage": "Severity medium, category latency. No recent deploys correlate.", - "deep_investigator": "Hypothesis: upstream payments timeout. Evidence: log line 'upstream_timeout target=payments'.", + "deep_investigator": ( + "Hypothesis: upstream payments timeout. " + "Evidence: log line upstream_timeout target=payments." + ), "resolution": "Proposed fix: restart api service. Auto-applied. INC resolved.", } @@ -10163,7 +10594,8 @@ async def gate(state: GraphState) -> dict: def _build_agent_nodes(*, cfg: AppConfig, skills: dict, store: SessionStore, registry: ToolRegistry, - event_log: "EventLog | None" = None) -> dict: + event_log: "EventLog | None" = None, + checkpointer: Any = None) -> dict: """Materialize agent nodes from skills + registry. Reused by main + resume graphs. Dispatches on ``skill.kind``: @@ -10241,6 +10673,7 @@ def _build_agent_nodes(*, cfg: AppConfig, skills: dict, store: SessionStore, injected_args=cfg.orchestrator.injected_args, gate_policy=gate_policy, event_log=event_log, + checkpointer=checkpointer, ) return nodes @@ -10346,7 +10779,8 @@ async def build_graph(*, cfg: AppConfig, skills: dict, store: SessionStore, sg = StateGraph(GraphState) nodes = _build_agent_nodes(cfg=cfg, skills=skills, store=store, - registry=registry, event_log=event_log) + registry=registry, event_log=event_log, + checkpointer=checkpointer) for agent_name, node in nodes.items(): sg.add_node(agent_name, node) sg.add_node("gate", make_gate_node( @@ -14120,6 +14554,29 @@ async def _finalize_session_status_async( async with self._locks.acquire(session_id): return self._finalize_session_status(session_id) + async def _is_graph_paused(self, session_id: str) -> bool: + """Return True iff the compiled graph has a pending step waiting + to resume (i.e. it's paused at an ``interrupt()`` boundary). + + langgraph 1.x surfaces a HITL pause via the result dict's + ``__interrupt__`` field rather than raising + :class:`GraphInterrupt`. After ``astream_events`` (or + ``ainvoke``) returns, the only way to tell "the run paused" + from "the run completed" is to query the checkpointed graph + state — a non-empty ``next`` tuple means the graph has steps + queued to run when resumed. The stream/retry call sites use + this to skip ``_finalize_session_status_async`` on a pause — + finalizing on a pause would stamp ``default_terminal_status`` + on a session that's actually waiting on operator approval, and + the orphaned ``pending_approval`` ToolCall row written by the + gateway would never get its resume. + """ + try: + state = await self.graph.aget_state(self._thread_config(session_id)) + except Exception: # noqa: BLE001 — defensive; missing checkpoint == not paused + return False + return bool(getattr(state, "next", ()) or ()) + def _thread_config(self, incident_id: str) -> dict: """Build the LangGraph ``config`` dict for a per-session thread. @@ -14329,10 +14786,18 @@ async def stream_session(self, *, query: str, environment: str, config=self._thread_config(inc.id), ): yield self._to_ui_event(ev, inc.id) - new_status = await self._finalize_session_status_async(inc.id) - if new_status: - yield {"event": "status_auto_finalized", "incident_id": inc.id, - "status": new_status, "ts": _event_ts()} + # Skip finalize when the graph paused on an interrupt — the + # session is waiting for operator approval, not done. Stamping + # default_terminal_status here would orphan the pending_approval + # ToolCall row written by the gateway just before the pause. + if await self._is_graph_paused(inc.id): + yield {"event": "session_paused", "incident_id": inc.id, + "ts": _event_ts()} + else: + new_status = await self._finalize_session_status_async(inc.id) + if new_status: + yield {"event": "status_auto_finalized", "incident_id": inc.id, + "status": new_status, "ts": _event_ts()} yield {"event": "investigation_completed", "incident_id": inc.id, "ts": _event_ts()} async def stream_investigation(self, *, query: str, environment: str, @@ -14605,10 +15070,17 @@ async def _retry_session_locked(self, session_id: str) -> AsyncIterator[dict]: config=self._thread_config(session_id), ): yield self._to_ui_event(ev, session_id) - new_status = await self._finalize_session_status_async(session_id) - if new_status: - yield {"event": "status_auto_finalized", "incident_id": session_id, - "status": new_status, "ts": _event_ts()} + # See ``stream_session`` for why pause-detection guards the + # finalize call: a HITL pause must not be coerced into a + # terminal status. + if await self._is_graph_paused(session_id): + yield {"event": "session_paused", "incident_id": session_id, + "ts": _event_ts()} + else: + new_status = await self._finalize_session_status_async(session_id) + if new_status: + yield {"event": "status_auto_finalized", "incident_id": session_id, + "status": new_status, "ts": _event_ts()} yield {"event": "retry_completed", "incident_id": session_id, "ts": _event_ts()} @@ -15301,6 +15773,16 @@ async def _resume() -> None: Command(resume=decision_payload), config=orch._thread_config(session_id), ) + # Finalize after the verdict-driven run completes so + # the session row reflects the terminal status (or + # falls through to ``default_terminal_status``). Skip + # if a fresh interrupt re-paused the graph — the + # session is again awaiting operator input. Mirrors + # the same guard in the UI's + # ``_submit_approval_via_service`` and the + # ``stream_session`` finalize call site. + if not await orch._is_graph_paused(session_id): + await orch._finalize_session_status_async(session_id) # Submit the resume onto the long-lived service loop so we # don't fight the lifespan thread for the same FastMCP/SQLite diff --git a/dist/ui.py b/dist/ui.py index 05bc7d9..24f5876 100644 --- a/dist/ui.py +++ b/dist/ui.py @@ -26,6 +26,7 @@ from app import AppConfig, Base, FrameworkAppConfig, MetadataConfig, Orchestrator, OrchestratorService, SessionStore, UIBadge, build_embedder, build_engine, build_vector_store, load_config, resolve_framework_app_config, resolve_state_class import asyncio +import logging as _logging import os import time from datetime import datetime, timezone @@ -40,11 +41,28 @@ -# Default config path; apps override via the ``APP_CONFIG`` env var. +# Optional: env-driven log config so manual runs (streamlit run, etc.) +# can crank verbosity to INFO/DEBUG without modifying the source. The +# default keeps the production-quiet WARNING level. ``force=True`` +# overrides any handler streamlit set up during import. # ====== module: src/runtime/ui.py ====== +def _maybe_configure_logging() -> None: + level = os.environ.get("ASR_LOG_LEVEL", "").upper().strip() + if level in {"DEBUG", "INFO", "WARNING", "ERROR"}: + _logging.basicConfig( + level=getattr(_logging, level), + format="%(asctime)s %(name)s %(levelname)s %(message)s", + force=True, + ) + + +_maybe_configure_logging() + + +# Default config path; apps override via the ``APP_CONFIG`` env var. CONFIG_PATH = Path(os.environ.get("APP_CONFIG", "config/config.yaml")) @@ -985,6 +1003,14 @@ async def _drive() -> None: Command(resume=payload), config=orch._thread_config(session_id), ) + # The graph completes the agent run after the verdict is + # forwarded to the gated tool; finalize the session if it's + # not paused on a fresh interrupt. Without this, an approved + # session stays in ``in_progress`` because the resume path + # does not run through ``stream_session`` (the only other + # caller of finalize). + if not await orch._is_graph_paused(session_id): + await orch._finalize_session_status_async(session_id) svc.submit_and_wait(_drive(), timeout=60.0) diff --git a/examples/code_review/skills/analyzer/system.md b/examples/code_review/skills/analyzer/system.md index 2996327..9ef3a6e 100644 --- a/examples/code_review/skills/analyzer/system.md +++ b/examples/code_review/skills/analyzer/system.md @@ -22,10 +22,21 @@ Do not invent low-value nits to fill space. After all tool calls, reply with ONE short sentence summarising findings count + the dominant category. Do not enumerate every finding (the UI renders them). -## Output contract +## Output contract — REQUIRED -The framework wraps your reply in an `AgentTurnOutput` envelope (content, -confidence ∈ [0, 1], confidence_rationale, optional signal). The runner -enforces this structurally — answer truthfully and the envelope captures -your confidence and rationale. Do not mention "confidence" in your prose -unless it's part of substantive analysis (e.g. ranking hypotheses). +Every reply MUST end with these three markdown sections, in this order, with the literal `##` headers: + +``` +## Response + + +## Confidence + + +## Signal + +``` + +**CRITICAL — final-reply rule:** After your last tool call returns, your NEXT reply IS the final reply. That reply MUST contain the three sections above as plain text — DO NOT emit an empty message, DO NOT emit only tool calls, DO NOT defer to "the framework handles it". The framework parses your final reply text; if it is empty or missing the section headers, the run fails with `envelope_missing`. + +Tool calls happen BEFORE the final reply. Once you have called every tool you need (including terminal tools like `mark_resolved` / `mark_escalated`), emit the three sections as your final response. diff --git a/examples/code_review/skills/intake/system.md b/examples/code_review/skills/intake/system.md index 9aaea08..60e39a7 100644 --- a/examples/code_review/skills/intake/system.md +++ b/examples/code_review/skills/intake/system.md @@ -16,10 +16,21 @@ analyzer's job. If `fetch_pr_diff` raises or returns an empty diff, emit `failed` so the orchestrator short-circuits to end and skips the analyzer. -## Output contract +## Output contract — REQUIRED -The framework wraps your reply in an `AgentTurnOutput` envelope (content, -confidence ∈ [0, 1], confidence_rationale, optional signal). The runner -enforces this structurally — answer truthfully and the envelope captures -your confidence and rationale. Do not mention "confidence" in your prose -unless it's part of substantive analysis (e.g. ranking hypotheses). +Every reply MUST end with these three markdown sections, in this order, with the literal `##` headers: + +``` +## Response + + +## Confidence + + +## Signal + +``` + +**CRITICAL — final-reply rule:** After your last tool call returns, your NEXT reply IS the final reply. That reply MUST contain the three sections above as plain text — DO NOT emit an empty message, DO NOT emit only tool calls, DO NOT defer to "the framework handles it". The framework parses your final reply text; if it is empty or missing the section headers, the run fails with `envelope_missing`. + +Tool calls happen BEFORE the final reply. Once you have called every tool you need (including terminal tools like `mark_resolved` / `mark_escalated`), emit the three sections as your final response. diff --git a/examples/code_review/skills/recommender/system.md b/examples/code_review/skills/recommender/system.md index c3037d9..859bf60 100644 --- a/examples/code_review/skills/recommender/system.md +++ b/examples/code_review/skills/recommender/system.md @@ -23,10 +23,21 @@ them already. After the call, reply with ONE short sentence echoing the recommendation. Nothing else. -## Output contract +## Output contract — REQUIRED -The framework wraps your reply in an `AgentTurnOutput` envelope (content, -confidence ∈ [0, 1], confidence_rationale, optional signal). The runner -enforces this structurally — answer truthfully and the envelope captures -your confidence and rationale. Do not mention "confidence" in your prose -unless it's part of substantive analysis (e.g. ranking hypotheses). +Every reply MUST end with these three markdown sections, in this order, with the literal `##` headers: + +``` +## Response + + +## Confidence + + +## Signal + +``` + +**CRITICAL — final-reply rule:** After your last tool call returns, your NEXT reply IS the final reply. That reply MUST contain the three sections above as plain text — DO NOT emit an empty message, DO NOT emit only tool calls, DO NOT defer to "the framework handles it". The framework parses your final reply text; if it is empty or missing the section headers, the run fails with `envelope_missing`. + +Tool calls happen BEFORE the final reply. Once you have called every tool you need (including terminal tools like `mark_resolved` / `mark_escalated`), emit the three sections as your final response. diff --git a/examples/incident_management/skills/deep_investigator/system.md b/examples/incident_management/skills/deep_investigator/system.md index 0eb874a..142fbbe 100644 --- a/examples/incident_management/skills/deep_investigator/system.md +++ b/examples/incident_management/skills/deep_investigator/system.md @@ -12,10 +12,21 @@ You are the **Deep Investigator** agent. Gather evidence and produce ranked hypo - Cite specific log lines or metric values as evidence in `hypotheses`. - If the INC has `matched_prior_inc` set, include the prior INC's recorded root cause as one of your ranked hypotheses and explicitly *validate or reject* it against the fresh logs/metrics. Same symptom can have different causes across incidents — drop confidence accordingly when the prior hypothesis is rejected so the gate triggers an intervention. -## Output contract +## Output contract — REQUIRED -The framework wraps your reply in an `AgentTurnOutput` envelope (content, -confidence ∈ [0, 1], confidence_rationale, optional signal). The runner -enforces this structurally — answer truthfully and the envelope captures -your confidence and rationale. Do not mention "confidence" in your prose -unless it's part of substantive analysis (e.g. ranking hypotheses). +Every reply MUST end with these three markdown sections, in this order, with the literal `##` headers: + +``` +## Response + + +## Confidence + + +## Signal + +``` + +**CRITICAL — final-reply rule:** After your last tool call returns, your NEXT reply IS the final reply. That reply MUST contain the three sections above as plain text — DO NOT emit an empty message, DO NOT emit only tool calls, DO NOT defer to "the framework handles it". The framework parses your final reply text; if it is empty or missing the section headers, the run fails with `envelope_missing`. + +Tool calls happen BEFORE the final reply. Once you have called every tool you need (including terminal tools like `mark_resolved` / `mark_escalated`), emit the three sections as your final response. diff --git a/examples/incident_management/skills/resolution/system.md b/examples/incident_management/skills/resolution/system.md index 5d33130..2bf10f1 100644 --- a/examples/incident_management/skills/resolution/system.md +++ b/examples/incident_management/skills/resolution/system.md @@ -11,10 +11,21 @@ You are the **Resolution** agent. You consume triage + deep_investigator finding ## Guidelines - Pick `team` deliberately based on incident component, severity, and category — not a default fallback. -## Output contract +## Output contract — REQUIRED -The framework wraps your reply in an `AgentTurnOutput` envelope (content, -confidence ∈ [0, 1], confidence_rationale, optional signal). The runner -enforces this structurally — answer truthfully and the envelope captures -your confidence and rationale. Do not mention "confidence" in your prose -unless it's part of substantive analysis (e.g. ranking hypotheses). +Every reply MUST end with these three markdown sections, in this order, with the literal `##` headers: + +``` +## Response + + +## Confidence + + +## Signal + +``` + +**CRITICAL — final-reply rule:** After your last tool call returns, your NEXT reply IS the final reply. That reply MUST contain the three sections above as plain text — DO NOT emit an empty message, DO NOT emit only tool calls, DO NOT defer to "the framework handles it". The framework parses your final reply text; if it is empty or missing the section headers, the run fails with `envelope_missing`. + +Tool calls happen BEFORE the final reply. Once you have called every tool you need (including terminal tools like `mark_resolved` / `mark_escalated`), emit the three sections as your final response. diff --git a/examples/incident_management/skills/triage/system.md b/examples/incident_management/skills/triage/system.md index 309f9de..58e5517 100644 --- a/examples/incident_management/skills/triage/system.md +++ b/examples/incident_management/skills/triage/system.md @@ -33,10 +33,21 @@ Record the full iteration trail as a single JSON-encoded string under `findings. - If the INC has `matched_prior_inc` set, treat the prior INC's `findings` and `resolution` as a **prior hypothesis**, not a fact. Same symptom (e.g., Redis OOM) can have different root causes across incidents — code bug vs. network partition vs. resource overload. Use the prior cause as a candidate to confirm or reject against current evidence; flag in your tags whether the parallel looks supported (`hypothesis:prior_match_supported`) or not (`hypothesis:prior_match_rejected`). - The hypothesis loop has a hard cap of 3 iterations. Do NOT exceed it; an unconverged hypothesis at the cap is acceptable — record it and let the deep investigator take over. -## Output contract +## Output contract — REQUIRED -The framework wraps your reply in an `AgentTurnOutput` envelope (content, -confidence ∈ [0, 1], confidence_rationale, optional signal). The runner -enforces this structurally — answer truthfully and the envelope captures -your confidence and rationale. Do not mention "confidence" in your prose -unless it's part of substantive analysis (e.g. ranking hypotheses). +Every reply MUST end with these three markdown sections, in this order, with the literal `##` headers: + +``` +## Response + + +## Confidence + + +## Signal + +``` + +**CRITICAL — final-reply rule:** After your last tool call returns, your NEXT reply IS the final reply. That reply MUST contain the three sections above as plain text — DO NOT emit an empty message, DO NOT emit only tool calls, DO NOT defer to "the framework handles it". The framework parses your final reply text; if it is empty or missing the section headers, the run fails with `envelope_missing`. + +Tool calls happen BEFORE the final reply. Once you have called every tool you need (including terminal tools like `mark_resolved` / `mark_escalated`), emit the three sections as your final response. diff --git a/src/runtime/agents/responsive.py b/src/runtime/agents/responsive.py index 1d2f62f..97b81b3 100644 --- a/src/runtime/agents/responsive.py +++ b/src/runtime/agents/responsive.py @@ -20,7 +20,7 @@ import logging from datetime import datetime, timezone -from typing import TYPE_CHECKING, Callable +from typing import TYPE_CHECKING, Any, Callable from langchain_core.language_models.chat_models import BaseChatModel from langchain_core.messages import HumanMessage @@ -35,7 +35,6 @@ from runtime.storage.session_store import SessionStore from runtime.tools.gateway import wrap_tool from runtime.agents.turn_output import ( - AgentTurnOutput, EnvelopeMissingError, parse_envelope_from_result, reconcile_confidence, @@ -60,6 +59,7 @@ def make_agent_node( patch_tool_names: frozenset[str] = frozenset(), gate_policy: "GatePolicy | None" = None, event_log: "EventLog | None" = None, + checkpointer: Any = None, ): """Factory: build a LangGraph node that runs a ReAct agent and decides a route. @@ -81,7 +81,7 @@ def make_agent_node( # call time — both modules are fully imported before ``node()`` runs. from runtime.graph import ( GraphState, - _ainvoke_with_retry, + _drive_agent_with_resume, _format_agent_input, _handle_agent_failure, _harvest_tool_calls_and_patches, @@ -94,9 +94,18 @@ def make_agent_node( ) async def node(state: GraphState) -> dict: - incident: Session = state["session"] # pyright: ignore[reportTypedDictNotRequiredAccess] - inc_id = incident.id + state_session: Session = state["session"] # pyright: ignore[reportTypedDictNotRequiredAccess] + inc_id = state_session.id started_at = datetime.now(timezone.utc).strftime(_UTC_TS_FMT) + # Always reload from store at entry — outer Pregel checkpoints + # at step boundaries, not mid-node, so ``state["session"]`` is + # stale relative to DB when a HITL gate paused mid-step. See + # the same reload comment in ``runtime.graph.make_agent_node`` + # for the full rationale. + try: + incident: Session = store.load(inc_id) + except FileNotFoundError: + incident = state_session # M3: emit agent_started telemetry before any work happens. if event_log is not None: @@ -122,25 +131,25 @@ async def node(state: GraphState) -> dict: ] else: run_tools = tools - # Phase 10 (FOC-03 / D-10-02) + Phase 15 (LLM-COMPAT-01): every - # responsive agent invocation is wrapped in an AgentTurnOutput - # envelope. ``langchain.agents.create_agent`` (the non-deprecated - # successor to ``langgraph.prebuilt.create_react_agent``) accepts a - # bare schema as ``response_format`` and, by default, wraps it in - # ``AutoStrategy`` — ProviderStrategy for models with native - # structured-output (OpenAI-class), falling back to ToolStrategy - # otherwise (Ollama). ToolStrategy injects AgentTurnOutput as a - # callable tool: when the LLM ``calls`` it, the loop terminates on - # the same turn with ``result["structured_response"]`` populated. - # Eliminates the old two-call structure (loop + separate - # ``with_structured_output`` pass) that hit recursion_limit=25 on - # Ollama models without true function-calling. + # Phase 22 (D-22-01): markdown-primary turn output. Same + # change as graph.py:make_agent_node — drop response_format + # so the loop terminates on natural React END, then parse the + # final AIMessage via Path 4 in parse_envelope_from_result. + # The inner agent gets the orchestrator's checkpointer so a + # HITL pause inside ``interrupt()`` can be resumed via + # ``Command(resume=verdict)`` on a stable per-invocation + # thread id (see ``_drive_agent_with_resume`` in + # ``runtime.graph`` for the full rationale). agent_executor = create_agent( model=llm, tools=run_tools, system_prompt=skill.system_prompt, - response_format=AgentTurnOutput, + checkpointer=checkpointer, + ) + inner_thread_id = ( + f"{inc_id}:agent:{skill.name}:turn{len(incident.agents_run)}" ) + inner_cfg = {"configurable": {"thread_id": inner_thread_id}} # Phase 11 (FOC-04): reset per-turn confidence hint at the # start of each agent step so the gateway treats the first @@ -151,9 +160,15 @@ async def node(state: GraphState) -> dict: pass try: - result = await _ainvoke_with_retry( - agent_executor, - {"messages": [HumanMessage(content=_format_agent_input(incident))]}, + result = await _drive_agent_with_resume( + agent_executor=agent_executor, + inner_cfg=inner_cfg, + inner_has_checkpointer=checkpointer is not None, + initial_input={ + "messages": [ + HumanMessage(content=_format_agent_input(incident)) + ] + }, ) except GraphInterrupt: # Phase 11 (FOC-04 / D-11-04): HITL pause -- propagate up. diff --git a/src/runtime/agents/turn_output.py b/src/runtime/agents/turn_output.py index df202e4..d6a3d02 100644 --- a/src/runtime/agents/turn_output.py +++ b/src/runtime/agents/turn_output.py @@ -21,8 +21,9 @@ import json import logging +import re -from pydantic import BaseModel, ConfigDict, Field +from pydantic import BaseModel, ConfigDict, Field, ValidationError _LOG = logging.getLogger("runtime.orchestrator") @@ -87,6 +88,127 @@ def __init__(self, *, agent: str, field: str, message: str | None = None): super().__init__(message or f"envelope_missing: {field} (agent={agent})") +_HEADER_SPLIT = re.compile(r"^#{2,}\s+(\w+)\s*$", re.MULTILINE) +_CONF_LINE = re.compile( + # Leftmost float (allows int form), optional rationale after em-dash / + # ASCII dash / hyphen separator. ``re.DOTALL`` so a multi-line rationale + # is captured wholesale. + r"^\s*(-?[0-9]*\.?[0-9]+)\s*(?:[\-\u2010\u2011\u2012\u2013\u2014\u2015]+\s*(.*))?$", + re.DOTALL, +) + + +def _clamp_unit(x: float) -> float: + """Clamp a confidence float into [0, 1] without raising. The skill + prompt asks for [0, 1]; an LLM occasionally emits ``1.05`` or + ``-0.1`` — clamp rather than reject so the parse step is forgiving.""" + if x < 0.0: + return 0.0 + if x > 1.0: + return 1.0 + return x + + +def parse_markdown_envelope( + content: str, *, agent: str = "", +) -> "AgentTurnOutput": + """D-22-03 — parse the trailing markdown contract block into an + :class:`AgentTurnOutput`. + + Expects three sections ``## Response`` / ``## Confidence`` / + ``## Signal`` (case-insensitive headers, ``##+`` accepted) at the + end of the LLM's reply. The free-text body of ``## Response`` + becomes ``content``; the float on the ``## Confidence`` line + becomes ``confidence`` (clamped to [0, 1]); the rest of that line + after ``-`` / ``--`` / ``\u2014`` becomes ``confidence_rationale``; + the ``## Signal`` body becomes ``signal`` (with ``none``/``null``/ + blank coerced to ``None``). + + Raises :class:`EnvelopeMissingError` only when the parse cannot + produce a valid envelope at all — missing ``## Confidence`` line, + unparseable confidence value, or empty ``## Response`` body. + """ + if not isinstance(content, str) or not content.strip(): + raise EnvelopeMissingError( + agent=agent, field="content", + message=f"envelope_missing: empty content (agent={agent!r})", + ) + + # ``re.split`` with the header capture-group yields: + # [pre, h1, body1, h2, body2, ...] + # Pre is the prose before the first heading; we ignore it for parse. + parts = _HEADER_SPLIT.split(content) + sections: dict[str, str] = {} + if len(parts) >= 3: + for i in range(1, len(parts) - 1, 2): + key = parts[i].strip().lower() + body = parts[i + 1].strip() + # Don't overwrite an earlier section with a later same-named + # one (unusual but possible in pathological prompts). + sections.setdefault(key, body) + + response_body = sections.get("response", "").strip() + raw_conf = sections.get("confidence", "").strip() + + if not raw_conf: + raise EnvelopeMissingError( + agent=agent, field="confidence", + message=( + f"envelope_missing: confidence section absent or empty " + f"(agent={agent!r})" + ), + ) + + m = _CONF_LINE.match(raw_conf) + if not m: + raise EnvelopeMissingError( + agent=agent, field="confidence", + message=( + f"envelope_missing: confidence section did not parse " + f"(agent={agent!r}, raw={raw_conf!r})" + ), + ) + try: + conf_value = _clamp_unit(float(m.group(1))) + except (TypeError, ValueError) as exc: + raise EnvelopeMissingError( + agent=agent, field="confidence", + message=( + f"envelope_missing: confidence value not a float " + f"(agent={agent!r}, raw={raw_conf!r})" + ), + ) from exc + rationale = (m.group(2) or "").strip() or "(no rationale provided)" + + signal_raw = sections.get("signal", "").strip().lower() or None + if signal_raw in {"none", "null", "", "n/a"}: + signal_raw = None + + if not response_body: + raise EnvelopeMissingError( + agent=agent, field="content", + message=( + f"envelope_missing: response section empty (agent={agent!r})" + ), + ) + + try: + return AgentTurnOutput( + content=response_body, + confidence=conf_value, + confidence_rationale=rationale, + signal=signal_raw, + ) + except ValidationError as exc: + raise EnvelopeMissingError( + agent=agent, field="validation", + message=( + f"envelope_missing: pydantic validation rejected the " + f"parsed values (agent={agent!r}): {exc}" + ), + ) from exc + + def parse_envelope_from_result( result: dict, *, @@ -144,13 +266,162 @@ def parse_envelope_from_result( continue break - # Path 3: fail loudly + # Path 4 (D-22-01): markdown-primary parse on the last AIMessage's + # content. Producers that emit the new ``## Response / ## Confidence / + # ## Signal`` section block land here when Paths 1+2 yield nothing. + for msg in reversed(messages): + if msg.__class__.__name__ != "AIMessage": + continue + content = getattr(msg, "content", None) + if not isinstance(content, str) or not content.strip(): + continue + try: + return parse_markdown_envelope(content, agent=agent) + except EnvelopeMissingError: + # This AIMessage didnt carry a parseable markdown contract. + # Continue scanning earlier messages on the off-chance a + # nested loop emitted the contract one step back. + continue + break + + # Path 5 (D-22-fallback): some models (gpt-oss:20b in particular) + # treat a terminal-tool call as the completion and emit an empty + # closing AIMessage. Apps configure typed-terminal tools with + # ``confidence`` + ``confidence_rationale`` args by contract — + # synthesise an envelope from those args when the markdown path + # yielded nothing. Skip if any earlier path succeeded. + for msg in reversed(messages): + tcs = getattr(msg, "tool_calls", None) + if not tcs: + continue + for tc in tcs: + args = tc.get("args") or {} + if not isinstance(args, dict): + continue + conf = args.get("confidence") + rat = args.get("confidence_rationale") or args.get("rationale") + if conf is None or not rat: + continue + try: + conf_f = float(conf) + except (TypeError, ValueError): + continue + content_body = ( + args.get("resolution_summary") + or args.get("reason") + or args.get("summary") + or f"terminal tool {tc.get('name', '')} invoked" + ) + if not isinstance(content_body, str) or not content_body.strip(): + content_body = f"terminal tool {tc.get('name', '')} invoked" + try: + env = AgentTurnOutput( + content=content_body, + confidence=max(0.0, min(1.0, conf_f)), + confidence_rationale=str(rat), + signal=args.get("signal"), + ) + except ValidationError: + continue + _LOG.info( + "envelope_synthesized_from_tool_args agent=%s tool=%s " + "confidence=%.2f", + agent, tc.get("name"), env.confidence, + ) + return env + + # Path 6 (D-22-fallback-permissive): the model called at least one + # tool but never emitted the markdown contract or a typed-terminal + # tool with confidence args. Rather than hard-failing the run, + # synthesise a minimal envelope so the session reaches a terminal + # status (typically default_terminal_status -> needs_review) and + # the operator can review what happened. Confidence is intentionally + # low (0.30) so any HITL gate fires. + invoked_tool_names: list[str] = [] + for msg in messages: + for tc in (getattr(msg, "tool_calls", None) or []): + n = tc.get("name") + if n: + invoked_tool_names.append(n) + if invoked_tool_names: + body = ( + f"Agent {agent} invoked " + f"{', '.join(invoked_tool_names)} but did not emit a final " + f"summary. Review the tool-call audit for outcome." + ) + try: + env = AgentTurnOutput( + content=body, + confidence=0.30, + confidence_rationale=( + "synthesized: agent invoked tools but emitted no " + "closing message" + ), + signal=None, + ) + _LOG.warning( + "envelope_synthesized_permissive agent=%s tools=%s", + agent, invoked_tool_names, + ) + return env + except ValidationError: + pass + + # Path 7 (terminal): fail loudly. None of the six paths produced a + # valid envelope — this is a real prompt-drift signal worth + # surfacing as a structured agent_run error. Log the last + # AIMessage content (capped) so operators can diagnose whether + # the LLM emitted the wrong shape vs nothing at all. + last_ai_content = "" + for msg in reversed(messages): + if msg.__class__.__name__ == "AIMessage": + c = getattr(msg, "content", None) + if isinstance(c, str): + last_ai_content = c + break + # Also dump every tool_call across all AIMessages so we can see + # whether the model called any tool with confidence args (Path 5 + # would have fired) or whether it called a non-terminal tool only. + tool_call_summary = [] + for msg in messages: + for tc in (getattr(msg, "tool_calls", None) or []): + args = tc.get("args") or {} + tool_call_summary.append({ + "name": tc.get("name"), + "arg_keys": sorted(args.keys()) if isinstance(args, dict) else "?", + "has_confidence": isinstance(args, dict) and "confidence" in args, + "has_rationale": isinstance(args, dict) and ( + "confidence_rationale" in args or "rationale" in args + ), + }) + # Detailed per-message dump so we can see exactly what shape the + # conversation took when the parse failed. + msg_summary = [] + for m in messages: + kind = m.__class__.__name__ + c = getattr(m, "content", None) + c_len = len(c) if isinstance(c, str) else ( + f"list[{len(c)}]" if isinstance(c, list) else type(c).__name__ + ) + tcs = [tc.get("name") for tc in (getattr(m, "tool_calls", None) or [])] + msg_summary.append({"kind": kind, "content_len": c_len, "tool_calls": tcs}) + _LOG.warning( + "envelope_missing: agent=%s last_ai_message_content=%r " + "structured_response_type=%s message_count=%d tool_calls=%r " + "msg_trace=%r", + agent, + last_ai_content[:1500] if last_ai_content else "", + type(result.get("structured_response")).__name__, + len(messages), + tool_call_summary, + msg_summary, + ) raise EnvelopeMissingError( agent=agent, field="structured_response", message=( - f"envelope_missing: no structured_response or JSON-decodable " - f"AIMessage envelope found (agent={agent})" + f"envelope_missing: no structured_response, JSON-decodable, or " + f"markdown-decodable AIMessage envelope found (agent={agent})" ), ) @@ -197,5 +468,6 @@ def reconcile_confidence( "AgentTurnOutput", "EnvelopeMissingError", "parse_envelope_from_result", + "parse_markdown_envelope", "reconcile_confidence", ] diff --git a/src/runtime/api.py b/src/runtime/api.py index c761230..0d3c11b 100644 --- a/src/runtime/api.py +++ b/src/runtime/api.py @@ -621,6 +621,16 @@ async def _resume() -> None: Command(resume=decision_payload), config=orch._thread_config(session_id), ) + # Finalize after the verdict-driven run completes so + # the session row reflects the terminal status (or + # falls through to ``default_terminal_status``). Skip + # if a fresh interrupt re-paused the graph — the + # session is again awaiting operator input. Mirrors + # the same guard in the UI's + # ``_submit_approval_via_service`` and the + # ``stream_session`` finalize call site. + if not await orch._is_graph_paused(session_id): + await orch._finalize_session_status_async(session_id) # Submit the resume onto the long-lived service loop so we # don't fight the lifespan thread for the same FastMCP/SQLite diff --git a/src/runtime/graph.py b/src/runtime/graph.py index 2486a74..19b589f 100644 --- a/src/runtime/graph.py +++ b/src/runtime/graph.py @@ -199,8 +199,104 @@ def finish(self, *, summary: str) -> None: ) +async def _drive_agent_with_resume( + *, + agent_executor, + inner_cfg: dict, + inner_has_checkpointer: bool, + initial_input: dict, +): + """Run an inner ``create_agent`` executor, surfacing HITL pauses to + the outer Pregel graph and forwarding resume verdicts back to the + inner agent. + + langgraph 1.x changed the ``interrupt()`` contract: a tool that + calls :func:`langgraph.types.interrupt` no longer raises + :class:`GraphInterrupt` to the caller. Instead, the agent's + ``ainvoke`` returns normally with the pause captured in + ``result["__interrupt__"]``. To make HITL approval actually work + end-to-end the framework must: + + 1. **First call** — invoke the agent with the initial messages. + If the result carries ``__interrupt__``, raise + :class:`GraphInterrupt` so the *outer* Pregel pauses too. The + outer pause is what surfaces ``__interrupt__`` to the + orchestrator and lets the UI render Approve / Reject. + + 2. **Resume call** — when the outer is resumed via + ``Command(resume=verdict)``, this helper is re-entered. The + inner's checkpointer remembers the pause; we detect that via + ``aget_state(...).next`` being non-empty and call + :func:`langgraph.types.interrupt` at the *outer* level to fetch + the verdict, then forward it to the inner via + ``Command(resume=verdict)``. This is the only path that + actually re-runs the gated tool with the verdict — calling + ``inner.ainvoke({"messages": [...]})`` on a paused thread + silently skips the tool. + + A while loop covers the case where a single agent turn pauses + multiple times (e.g. two high-risk tool calls in sequence). + + When ``inner_has_checkpointer`` is False (legacy callers and most + unit tests that build agents without a checkpointer), the helper + falls back to the simple "invoke + re-raise on ``__interrupt__``" + contract. That path can pause the outer graph but the resume + verdict cannot reach the inner tool — which is acceptable because + legacy callers without a checkpointer never had a working HITL + resume path to begin with. + """ + # Local imports keep callers that never touch HITL out of the + # langgraph.types import cost path. ``Command`` and ``interrupt`` + # are tiny but the explicit lazy import documents the dependency. + from langgraph.types import Command, interrupt + + # Resume-detection branch: only meaningful when the inner agent + # has its own checkpointer (otherwise ``aget_state`` on a fresh + # in-memory pregel raises). The detection asks: "does the inner + # have a paused step waiting?". A non-empty ``next`` means yes. + if inner_has_checkpointer: + try: + inner_state = await agent_executor.aget_state(inner_cfg) + inner_paused = bool(getattr(inner_state, "next", ()) or ()) + except Exception: # noqa: BLE001 — defensive; missing checkpoint == fresh + inner_paused = False + else: + inner_paused = False + + if inner_paused: + # Outer is being resumed; pull the verdict via the outer's own + # interrupt() and forward it. The payload here is informational — + # the only thing the outer cares about is the resume value. + verdict = interrupt({"resume_inner_agent": inner_cfg["configurable"]["thread_id"]}) + result = await _ainvoke_with_retry( + agent_executor, Command(resume=verdict), config=inner_cfg, + ) + else: + result = await _ainvoke_with_retry( + agent_executor, initial_input, config=inner_cfg, + ) + + # Multi-pause loop: if the inner pauses again (or resumes into a + # second gated tool call), surface each pause to the outer the + # same way until the inner is fully done. + while isinstance(result, dict) and result.get("__interrupt__"): + if not inner_has_checkpointer: + # No checkpointer => no Command(resume=...) path is + # available; surface the pause to the outer Pregel and + # stop. Resume cannot deliver the verdict to the gated + # tool, but that matches legacy behavior. + raise GraphInterrupt(result["__interrupt__"]) + verdict = interrupt(result["__interrupt__"]) + result = await _ainvoke_with_retry( + agent_executor, Command(resume=verdict), config=inner_cfg, + ) + + return result + + async def _ainvoke_with_retry(executor, input_, *, max_attempts: int = 3, - base_delay: float = 1.5): + base_delay: float = 1.5, + config: dict | None = None): """Wrap a LangGraph agent invocation with retry on transient cloud errors. Retries on common Ollama Cloud / streaming hiccups (500, status -1, etc.). @@ -218,6 +314,14 @@ async def _ainvoke_with_retry(executor, input_, *, max_attempts: int = 3, # itself (AutoStrategy → ToolStrategy fallback for non- # function-calling Ollama models). The default langgraph # recursion bound is now a true upper bound, not a workaround. + # + # ``config`` (default ``None``) carries the per-thread + # ``configurable.thread_id`` for callers that need a + # checkpointer-scoped invocation (HITL resume path — + # see :func:`_drive_agent_with_resume`). Legacy callers + # pass nothing and the executor's default thread is used. + if config is not None: + return await executor.ainvoke(input_, config=config) return await executor.ainvoke(input_) except GraphInterrupt: # Phase 11 (FOC-04 / D-11-04): never retry a HITL pause. @@ -550,6 +654,7 @@ def make_agent_node( injected_args: dict[str, str] | None = None, gate_policy: "GatePolicy | None" = None, event_log: "EventLog | None" = None, + checkpointer: Any = None, ) -> Callable[[GraphState], Awaitable[dict]]: """Factory: build a LangGraph node that runs a ReAct agent and decides a route. @@ -581,10 +686,23 @@ def make_agent_node( """ async def node(state: GraphState) -> dict: - incident = state["session"] # pyright: ignore[reportTypedDictNotRequiredAccess] — orchestrator runtime always supplies session - inc_id = incident.id + state_session = state["session"] # pyright: ignore[reportTypedDictNotRequiredAccess] — orchestrator runtime always supplies session + inc_id = state_session.id started_at = datetime.now(timezone.utc).strftime(_UTC_TS_FMT) + # Always reload from the persistent store at entry. Outer Pregel + # checkpoints at step boundaries, not mid-node — so on a HITL + # resume, ``state["session"]`` reflects the prior step's output + # (e.g. deep_investigator's), not the gateway's pending_approval + # row + version bump that happened mid-step in the original run. + # If we trust ``state["session"]`` here, the gateway sees no + # pending row, appends a duplicate, then ``store.save`` raises + # ``StaleVersionError`` because DB has already moved on. + try: + incident = store.load(inc_id) + except FileNotFoundError: + incident = state_session + # M3 (per-step telemetry): emit agent_started. if event_log is not None: try: @@ -679,25 +797,43 @@ def _run(**kwargs: Any) -> Any: ] else: run_tools = visible_tools - # Phase 10 (FOC-03 / D-10-02) + Phase 15 (LLM-COMPAT-01): every - # responsive agent invocation is wrapped in an AgentTurnOutput - # envelope. ``langchain.agents.create_agent`` (the non-deprecated - # successor to ``langgraph.prebuilt.create_react_agent``) accepts a - # bare schema as ``response_format`` and, by default, wraps it in - # ``AutoStrategy`` — ProviderStrategy for models with native - # structured-output (OpenAI-class), falling back to ToolStrategy - # otherwise (Ollama). ToolStrategy injects AgentTurnOutput as a - # callable tool: when the LLM ``calls`` it, the loop terminates on - # the same turn with ``result["structured_response"]`` populated. - # Eliminates the old two-call structure (loop + separate - # ``with_structured_output`` pass) that hit recursion_limit=25 on - # Ollama models without true function-calling. + # Phase 22 (D-22-01): markdown-primary turn output. Drop + # ``response_format`` from create_agent — the agent loop now + # terminates on the natural React END signal (LLM emits an + # AIMessage with no tool calls). The framework parses that + # final message body as markdown via Path 4 in + # parse_envelope_from_result. Skill prompts (D-22-04) instruct + # every reply to end with ## Response / ## Confidence / ## + # Signal sections; the parser extracts them. + # + # Why: forcing LLMs through a JSON-schema-shaped output via + # ``response_format`` triggered a class of brittleness across + # providers (model-specific JSON drift, tool-strategy + React + # END interaction, recursion_limit ceilings). Markdown is the + # native format every chat model writes well; the parse step + # happens in the framework, where leniency is in our control. + # The inner agent gets a checkpointer so a HITL pause inside + # ``interrupt()`` can be resumed via ``Command(resume=verdict)`` + # on the SAME inner thread. langgraph 1.x semantics require the + # checkpointer here — without it ``Command(resume=...)`` raises + # ``RuntimeError: Cannot use Command(resume=...) without + # checkpointer``. The thread id is derived deterministically + # from session + agent + the upcoming agent_run index so it is: + # * STABLE across the inner pause and the outer resume that + # follows (both observe the same ``len(incident.agents_run)`` + # because no new run is recorded mid-pause), and + # * UNIQUE per agent invocation so previous invocations of the + # same agent within the same session don't bleed in. agent_executor = create_agent( model=llm, tools=run_tools, system_prompt=skill.system_prompt, - response_format=AgentTurnOutput, + checkpointer=checkpointer, + ) + inner_thread_id = ( + f"{inc_id}:agent:{skill.name}:turn{len(incident.agents_run)}" ) + inner_cfg = {"configurable": {"thread_id": inner_thread_id}} # Phase 11 (FOC-04): reset per-turn confidence hint. The hint # is updated below after _harvest_tool_calls_and_patches; on @@ -709,9 +845,15 @@ def _run(**kwargs: Any) -> Any: pass try: - result = await _ainvoke_with_retry( - agent_executor, - {"messages": [HumanMessage(content=_format_agent_input(incident))]}, + result = await _drive_agent_with_resume( + agent_executor=agent_executor, + inner_cfg=inner_cfg, + inner_has_checkpointer=checkpointer is not None, + initial_input={ + "messages": [ + HumanMessage(content=_format_agent_input(incident)) + ] + }, ) except GraphInterrupt: # Phase 11 (FOC-04 / D-11-04): HITL pause is NOT an error. @@ -890,12 +1032,17 @@ def _decide_from_signal(inc: Session) -> str: _DEFAULT_STUB_CANNED: dict[str, str] = { - # Back-compat defaults for the four canonical agents. Any YAML-defined - # agent can override or extend this via ``skill.stub_response``; new agents - # without an entry here fall through to StubChatModel's generic placeholder. + # Plain-text bodies; ``StubChatModel._generate`` wraps these in the + # D-22-03 markdown envelope using the per-agent stub_envelope_* fields + # (the framework parses the wrapped output via Path 4). Apps can + # override or extend via ``skill.stub_response``; new agents without + # an entry here fall through to ``StubChatModel``'s generic placeholder. "intake": "Created INC, no prior matches. Routing to triage.", "triage": "Severity medium, category latency. No recent deploys correlate.", - "deep_investigator": "Hypothesis: upstream payments timeout. Evidence: log line 'upstream_timeout target=payments'.", + "deep_investigator": ( + "Hypothesis: upstream payments timeout. " + "Evidence: log line upstream_timeout target=payments." + ), "resolution": "Proposed fix: restart api service. Auto-applied. INC resolved.", } @@ -1058,7 +1205,8 @@ async def gate(state: GraphState) -> dict: def _build_agent_nodes(*, cfg: AppConfig, skills: dict, store: SessionStore, registry: ToolRegistry, - event_log: "EventLog | None" = None) -> dict: + event_log: "EventLog | None" = None, + checkpointer: Any = None) -> dict: """Materialize agent nodes from skills + registry. Reused by main + resume graphs. Dispatches on ``skill.kind``: @@ -1136,6 +1284,7 @@ def _build_agent_nodes(*, cfg: AppConfig, skills: dict, store: SessionStore, injected_args=cfg.orchestrator.injected_args, gate_policy=gate_policy, event_log=event_log, + checkpointer=checkpointer, ) return nodes @@ -1241,7 +1390,8 @@ async def build_graph(*, cfg: AppConfig, skills: dict, store: SessionStore, sg = StateGraph(GraphState) nodes = _build_agent_nodes(cfg=cfg, skills=skills, store=store, - registry=registry, event_log=event_log) + registry=registry, event_log=event_log, + checkpointer=checkpointer) for agent_name, node in nodes.items(): sg.add_node(agent_name, node) sg.add_node("gate", make_gate_node( diff --git a/src/runtime/llm.py b/src/runtime/llm.py index 17ee42f..2b6fedb 100644 --- a/src/runtime/llm.py +++ b/src/runtime/llm.py @@ -45,17 +45,13 @@ class StubChatModel(BaseChatModel): ``stub_envelope_confidence`` / ``stub_envelope_rationale`` / ``stub_envelope_signal``. - Phase 15 (LLM-COMPAT-01): ``langchain.agents.create_agent`` with - ``response_format=AgentTurnOutput`` (via ``AutoStrategy`` -> - ``ToolStrategy`` for non-native-structured-output models, including - this stub) injects ``AgentTurnOutput`` as a CALLABLE TOOL. The - agent loop only terminates when the LLM emits a tool call NAMED - ``AgentTurnOutput``. ``bind_tools`` records that envelope-tool name - so ``_generate`` can auto-emit a closing tool call after any - user-configured ``tool_call_plan`` is exhausted -- preserving the - pre-Phase-15 stub semantics (canned text + optional pre-scripted - tool calls) while satisfying the new tool-loop termination - contract. + Phase 22 (D-22-05): the loop terminates on natural React END now + that ``response_format=AgentTurnOutput`` is gone — tests drive + markdown-shaped canned text (the framework parses it via Path 4 + in :func:`runtime.agents.turn_output.parse_envelope_from_result`). + The Phase 15 envelope-as-callable-tool auto-emit is removed; the + stub is back to its pre-Phase-15 simple semantics (canned text + + optional pre-scripted ``tool_call_plan``). """ role: str = "default" canned_responses: dict[str, str] = Field(default_factory=dict) @@ -64,12 +60,6 @@ class StubChatModel(BaseChatModel): stub_envelope_rationale: str = "stub envelope rationale" stub_envelope_signal: str | None = None _called_once: bool = False - # Phase 15 (LLM-COMPAT-01): set by ``bind_tools`` when - # ``langchain.agents.create_agent`` injects a structured-output tool - # for ``AgentTurnOutput``. Holds the bare tool name (e.g. - # ``"AgentTurnOutput"``) so ``_generate`` can emit a final - # envelope-shaped tool call to close the agent loop. - _envelope_tool_name: str | None = None @property def _llm_type(self) -> str: @@ -77,33 +67,37 @@ def _llm_type(self) -> str: def _generate(self, messages: list[BaseMessage], stop: list[str] | None = None, run_manager: Any = None, **kwargs: Any) -> ChatResult: - text = self.canned_responses.get(self.role, f"[stub:{self.role}] no canned response") + # Phase 22 (D-22-05): tool-call rounds emit the scripted tool + # calls with a placeholder content body; the closing turn (no + # remaining tool_call_plan entries) emits the canned text + # wrapped in the D-22-03 markdown contract that + # parse_markdown_envelope reads. The stub_envelope_* + # parameters drive the trailing-section bodies so tests can + # exercise specific confidence / rationale / signal paths + # without authoring markdown by hand. tool_calls: list[dict] = [] if self.tool_call_plan and not self._called_once: for tc in self.tool_call_plan: - tool_calls.append({"name": tc["name"], "args": tc.get("args", {}), "id": str(uuid4())}) + tool_calls.append( + {"name": tc["name"], "args": tc.get("args", {}), "id": str(uuid4())} + ) self._called_once = True - elif self._envelope_tool_name is not None: - # Phase 15 (LLM-COMPAT-01): the tool_call_plan is exhausted - # (or wasn't configured) AND ``langchain.agents.create_agent`` - # has bound the AgentTurnOutput envelope as a tool. Emit a - # closing tool call so the loop terminates with a populated - # ``structured_response``. The args mirror the - # ``with_structured_output`` path's envelope construction so - # tests see the same confidence / rationale / signal regardless - # of whether the new tool-strategy or the legacy structured- - # output path is in play. - tool_calls.append({ - "name": self._envelope_tool_name, - "args": { - "content": text or ".", - "confidence": self.stub_envelope_confidence, - "confidence_rationale": self.stub_envelope_rationale, - "signal": self.stub_envelope_signal, - }, - "id": str(uuid4()), - }) - msg = AIMessage(content=text, tool_calls=tool_calls) + + body = self.canned_responses.get( + self.role, f"[stub:{self.role}] no canned response", + ) + # Phase 22 (D-22-05): explicit "none" when no signal so the + # parser maps to None — preserves pre-Phase-22 behaviour where a + # stub with no signal yielded envelope.signal=None. + signal_str = self.stub_envelope_signal if self.stub_envelope_signal is not None else "none" + md = ( + f"{body}\n\n" + f"## Response\n{body}\n\n" + f"## Confidence\n{self.stub_envelope_confidence:.4f} -- " + f"{self.stub_envelope_rationale}\n\n" + f"## Signal\n{signal_str}\n" + ) + msg = AIMessage(content=md, tool_calls=tool_calls) return ChatResult(generations=[ChatGeneration(message=msg)]) async def _agenerate(self, messages: list[BaseMessage], stop: list[str] | None = None, @@ -111,30 +105,15 @@ async def _agenerate(self, messages: list[BaseMessage], stop: list[str] | None = return self._generate(messages, stop, run_manager, **kwargs) def bind_tools(self, tools, *, tool_choice=None, **kwargs): - """Record the AgentTurnOutput envelope-tool name when present. - - Phase 15 (LLM-COMPAT-01): ``langchain.agents.create_agent`` with - ``response_format=AgentTurnOutput`` calls ``bind_tools(...)`` - with the user's tools PLUS the envelope-as-a-tool. We scan the - list for the AgentTurnOutput-shaped tool (matched by ``__name__`` - on Pydantic schemas, ``name`` on ``BaseTool`` instances, or the - ``"name"`` key on dict-shaped tool specs) and remember it on the - instance so ``_generate`` can close the agent loop with a - synthetic envelope tool call after any pre-scripted - ``tool_call_plan`` is exhausted. Tools bound by the framework - itself (real BaseTools the agent should call) flow through - unchanged -- the stub still emits them only via - ``tool_call_plan``. + """Phase 22 (D-22-05): no-op tool binding. + + ``langchain.agents.create_agent`` calls ``bind_tools`` to wire + the agent's BaseTool list onto the chat model. The stub does + not actually invoke those tools (test fixtures script tool + calls via ``tool_call_plan`` instead), so we just return self + unchanged. The pre-Phase-22 envelope-tool detection is gone + because the framework no longer asks for ``response_format``. """ - for t in tools or []: - name = ( - getattr(t, "__name__", None) - or getattr(t, "name", None) - or (isinstance(t, dict) and t.get("name")) - ) - if isinstance(name, str) and name == "AgentTurnOutput": - self._envelope_tool_name = name - break return self # ``BaseChatModel.with_structured_output`` returns ``Runnable[..., dict | BaseModel]`` diff --git a/src/runtime/orchestrator.py b/src/runtime/orchestrator.py index 9323deb..192920e 100644 --- a/src/runtime/orchestrator.py +++ b/src/runtime/orchestrator.py @@ -1057,6 +1057,29 @@ async def _finalize_session_status_async( async with self._locks.acquire(session_id): return self._finalize_session_status(session_id) + async def _is_graph_paused(self, session_id: str) -> bool: + """Return True iff the compiled graph has a pending step waiting + to resume (i.e. it's paused at an ``interrupt()`` boundary). + + langgraph 1.x surfaces a HITL pause via the result dict's + ``__interrupt__`` field rather than raising + :class:`GraphInterrupt`. After ``astream_events`` (or + ``ainvoke``) returns, the only way to tell "the run paused" + from "the run completed" is to query the checkpointed graph + state — a non-empty ``next`` tuple means the graph has steps + queued to run when resumed. The stream/retry call sites use + this to skip ``_finalize_session_status_async`` on a pause — + finalizing on a pause would stamp ``default_terminal_status`` + on a session that's actually waiting on operator approval, and + the orphaned ``pending_approval`` ToolCall row written by the + gateway would never get its resume. + """ + try: + state = await self.graph.aget_state(self._thread_config(session_id)) + except Exception: # noqa: BLE001 — defensive; missing checkpoint == not paused + return False + return bool(getattr(state, "next", ()) or ()) + def _thread_config(self, incident_id: str) -> dict: """Build the LangGraph ``config`` dict for a per-session thread. @@ -1266,10 +1289,18 @@ async def stream_session(self, *, query: str, environment: str, config=self._thread_config(inc.id), ): yield self._to_ui_event(ev, inc.id) - new_status = await self._finalize_session_status_async(inc.id) - if new_status: - yield {"event": "status_auto_finalized", "incident_id": inc.id, - "status": new_status, "ts": _event_ts()} + # Skip finalize when the graph paused on an interrupt — the + # session is waiting for operator approval, not done. Stamping + # default_terminal_status here would orphan the pending_approval + # ToolCall row written by the gateway just before the pause. + if await self._is_graph_paused(inc.id): + yield {"event": "session_paused", "incident_id": inc.id, + "ts": _event_ts()} + else: + new_status = await self._finalize_session_status_async(inc.id) + if new_status: + yield {"event": "status_auto_finalized", "incident_id": inc.id, + "status": new_status, "ts": _event_ts()} yield {"event": "investigation_completed", "incident_id": inc.id, "ts": _event_ts()} async def stream_investigation(self, *, query: str, environment: str, @@ -1542,10 +1573,17 @@ async def _retry_session_locked(self, session_id: str) -> AsyncIterator[dict]: config=self._thread_config(session_id), ): yield self._to_ui_event(ev, session_id) - new_status = await self._finalize_session_status_async(session_id) - if new_status: - yield {"event": "status_auto_finalized", "incident_id": session_id, - "status": new_status, "ts": _event_ts()} + # See ``stream_session`` for why pause-detection guards the + # finalize call: a HITL pause must not be coerced into a + # terminal status. + if await self._is_graph_paused(session_id): + yield {"event": "session_paused", "incident_id": session_id, + "ts": _event_ts()} + else: + new_status = await self._finalize_session_status_async(session_id) + if new_status: + yield {"event": "status_auto_finalized", "incident_id": session_id, + "status": new_status, "ts": _event_ts()} yield {"event": "retry_completed", "incident_id": session_id, "ts": _event_ts()} diff --git a/src/runtime/tools/gateway.py b/src/runtime/tools/gateway.py index dc321d2..e0f17b3 100644 --- a/src/runtime/tools/gateway.py +++ b/src/runtime/tools/gateway.py @@ -497,6 +497,11 @@ def _run(self, *args: Any, **kwargs: Any) -> Any: # noqa: D401 approved_at=_now_iso(), approval_rationale=rationale, ) + # Persist the status transition. Without this, + # the DB row stays at ``pending_approval`` and + # the UI keeps offering the buttons forever. + if store is not None: + store.save(session) rejected_result = {"rejected": True, "rationale": rationale} _emit_invoked( status="rejected", risk="high", @@ -523,6 +528,8 @@ def _run(self, *args: Any, **kwargs: Any) -> Any: # noqa: D401 approved_at=_now_iso(), approval_rationale=rationale, ) + if store is not None: + store.save(session) timeout_result = {"timeout": True, "rationale": rationale} _emit_invoked( status="timeout", risk="high", @@ -545,6 +552,8 @@ def _run(self, *args: Any, **kwargs: Any) -> Any: # noqa: D401 approved_at=_now_iso(), approval_rationale=rationale, ) + if store is not None: + store.save(session) _emit_invoked( status="approved", risk="high", args_dict=pending_args, result=result, @@ -675,6 +684,11 @@ async def _arun(self, *args: Any, **kwargs: Any) -> Any: # noqa: D401 approved_at=_now_iso(), approval_rationale=rationale, ) + # Persist the status transition (mirror of the + # sync path) so the DB row reflects the actual + # outcome instead of staying at pending_approval. + if store is not None: + store.save(session) rejected_result = {"rejected": True, "rationale": rationale} _emit_invoked( status="rejected", risk="high", @@ -696,6 +710,8 @@ async def _arun(self, *args: Any, **kwargs: Any) -> Any: # noqa: D401 approved_at=_now_iso(), approval_rationale=rationale, ) + if store is not None: + store.save(session) timeout_result = {"timeout": True, "rationale": rationale} _emit_invoked( status="timeout", risk="high", @@ -717,6 +733,8 @@ async def _arun(self, *args: Any, **kwargs: Any) -> Any: # noqa: D401 approved_at=_now_iso(), approval_rationale=rationale, ) + if store is not None: + store.save(session) _emit_invoked( status="approved", risk="high", args_dict=pending_args, result=result, diff --git a/src/runtime/ui.py b/src/runtime/ui.py index d2b4a7a..573d829 100644 --- a/src/runtime/ui.py +++ b/src/runtime/ui.py @@ -23,6 +23,7 @@ # Orchestrator uses so both share the same SQLite DB. from __future__ import annotations import asyncio +import logging as _logging import os import time from datetime import datetime, timezone @@ -46,6 +47,23 @@ from runtime.storage.vector import build_vector_store +# Optional: env-driven log config so manual runs (streamlit run, etc.) +# can crank verbosity to INFO/DEBUG without modifying the source. The +# default keeps the production-quiet WARNING level. ``force=True`` +# overrides any handler streamlit set up during import. +def _maybe_configure_logging() -> None: + level = os.environ.get("ASR_LOG_LEVEL", "").upper().strip() + if level in {"DEBUG", "INFO", "WARNING", "ERROR"}: + _logging.basicConfig( + level=getattr(_logging, level), + format="%(asctime)s %(name)s %(levelname)s %(message)s", + force=True, + ) + + +_maybe_configure_logging() + + # Default config path; apps override via the ``APP_CONFIG`` env var. CONFIG_PATH = Path(os.environ.get("APP_CONFIG", "config/config.yaml")) @@ -987,6 +1005,14 @@ async def _drive() -> None: Command(resume=payload), config=orch._thread_config(session_id), ) + # The graph completes the agent run after the verdict is + # forwarded to the gated tool; finalize the session if it's + # not paused on a fresh interrupt. Without this, an approved + # session stays in ``in_progress`` because the resume path + # does not run through ``stream_session`` (the only other + # caller of finalize). + if not await orch._is_graph_paused(session_id): + await orch._finalize_session_status_async(session_id) svc.submit_and_wait(_drive(), timeout=60.0) diff --git a/tests/_envelope_helpers.py b/tests/_envelope_helpers.py index 13485a1..b2ef03d 100644 --- a/tests/_envelope_helpers.py +++ b/tests/_envelope_helpers.py @@ -62,12 +62,6 @@ class EnvelopeStubChatModel(BaseChatModel): canned_responses: dict[str, str] = Field(default_factory=dict) tool_call_plan: list[dict] | None = None _called_once: bool = False - # Phase 15 (LLM-COMPAT-01): same contract as ``StubChatModel`` -- - # ``langchain.agents.create_agent``'s ToolStrategy injects - # ``AgentTurnOutput`` as a tool; ``bind_tools`` records the name - # so ``_generate`` can emit a closing envelope tool call once any - # pre-scripted ``tool_call_plan`` is exhausted. - _envelope_tool_name: str | None = None @property def _llm_type(self) -> str: @@ -80,7 +74,12 @@ def _generate( run_manager: Any = None, **kwargs: Any, ) -> ChatResult: - text = self.canned_responses.get(self.role, self.envelope_content) + # Phase 22 (D-22-05): emit a markdown-shaped final message + # matching the framework's parse_markdown_envelope contract. + # The envelope_* fields drive the section bodies. When the + # caller supplies a per-role canned response we use that for + # the ``## Response`` body and keep the envelope_* + # confidence/rationale/signal as the trailing-section content. tool_calls: list[dict] = [] if self.tool_call_plan and not self._called_once: for tc in self.tool_call_plan: @@ -88,20 +87,20 @@ def _generate( {"name": tc["name"], "args": tc.get("args", {}), "id": str(uuid4())} ) self._called_once = True - elif self._envelope_tool_name is not None: - # Phase 15 (LLM-COMPAT-01): close the agent loop by emitting - # the envelope-shaped tool call ToolStrategy is waiting for. - tool_calls.append({ - "name": self._envelope_tool_name, - "args": { - "content": self.envelope_content, - "confidence": self.envelope_confidence, - "confidence_rationale": self.envelope_rationale, - "signal": self.envelope_signal, - }, - "id": str(uuid4()), - }) - msg = AIMessage(content=text, tool_calls=tool_calls) + + body = self.canned_responses.get(self.role, self.envelope_content) + # Phase 22 (D-22-05): explicit "none" when no signal so the + # parser maps to None — preserves pre-Phase-22 behaviour where a + # stub with no signal yielded envelope.signal=None. + signal_str = self.envelope_signal if self.envelope_signal is not None else "none" + md = ( + f"{body}\n\n" + f"## Response\n{body}\n\n" + f"## Confidence\n{self.envelope_confidence:.4f} -- " + f"{self.envelope_rationale}\n\n" + f"## Signal\n{signal_str}\n" + ) + msg = AIMessage(content=md, tool_calls=tool_calls) return ChatResult(generations=[ChatGeneration(message=msg)]) async def _agenerate( @@ -114,18 +113,9 @@ async def _agenerate( return self._generate(messages, stop, run_manager, **kwargs) def bind_tools(self, tools, *, tool_choice=None, **kwargs): - # Phase 15 (LLM-COMPAT-01): record the AgentTurnOutput tool - # name so ``_generate`` can emit a closing tool call. See - # ``StubChatModel.bind_tools`` for the matching heuristic. - for t in tools or []: - name = ( - getattr(t, "__name__", None) - or getattr(t, "name", None) - or (isinstance(t, dict) and t.get("name")) - ) - if isinstance(name, str) and name == "AgentTurnOutput": - self._envelope_tool_name = name - break + # Phase 22 (D-22-05): no-op tool binding. The envelope-as-tool + # synthesis is gone; the markdown closing turn is emitted + # directly by _generate. return self def with_structured_output(self, schema, *, include_raw: bool = False, **kwargs): diff --git a/tests/test_genericity_ratchet.py b/tests/test_genericity_ratchet.py index 5baf392..37a8496 100644 --- a/tests/test_genericity_ratchet.py +++ b/tests/test_genericity_ratchet.py @@ -74,7 +74,18 @@ # no new domain concept introduced (was missed in the Phase 12 # atomic commit; counted retroactively in the v1.2 follow-up # that consolidates injection-path bug fixes). -BASELINE_TOTAL = 154 +# 154 -> 156 HITL pause-resume fix for langgraph 1.x. Both +# ``runtime.graph.make_agent_node`` and +# ``runtime.agents.responsive.make_agent_node`` derive a +# per-invocation inner thread id of the form +# ``f"{inc_id}:agent:{skill.name}:turn{len(incident.agents_run)}"`` +# so the inner ``create_agent`` checkpointer can persist a +# paused tool across an outer Pregel pause and resume. The +# count uses the existing ``incident`` local (the runner's +# domain Session) on a structurally required code path — +# same pattern as the Phase 10/11/12 entries. Net +2 +# ``incident`` token reuses, no new domain concept. +BASELINE_TOTAL = 156 def test_runtime_leaks_at_or_below_baseline(): diff --git a/tests/test_interrupt_detection.py b/tests/test_interrupt_detection.py new file mode 100644 index 0000000..92dc5f1 --- /dev/null +++ b/tests/test_interrupt_detection.py @@ -0,0 +1,370 @@ +"""Regression: agent runner must raise GraphInterrupt when ``create_agent`` +surfaces a paused HITL gate via the result dict's ``__interrupt__`` field. + +Background — langgraph 1.x changed the interrupt contract: a tool that +calls ``interrupt()`` no longer raises ``GraphInterrupt`` to the caller; +instead, ``agent.invoke(...)`` (or ``ainvoke``) returns a normal result +dict with an extra ``__interrupt__`` key carrying the captured +:class:`langgraph.types.Interrupt` records. The framework's agent +runner must therefore inspect the result and re-raise ``GraphInterrupt`` +itself; otherwise the outer Pregel completes the agent step, the +envelope-parsing path fires (potentially Path 6 synthesis), the session +gets finalized, and the ``pending_approval`` ToolCall row written by the +gateway just before the interrupt is left orphaned — the UI's +Approve / Reject buttons no longer drive any resume because the session +is already terminal from the framework's perspective. + +These tests pin the contract at the agent-runner boundary in BOTH +implementations: ``runtime.agents.responsive.make_agent_node`` (the +generic responsive runner) and ``runtime.graph.make_agent_node`` (the +incident-shape variant exposed via ``runtime.graph``). The fix is the +same in both: after ``ainvoke``, inspect ``result["__interrupt__"]`` +and raise :class:`GraphInterrupt`. +""" +from __future__ import annotations + +import asyncio +from pathlib import Path +from typing import Any + +import pytest +from langchain_core.language_models.chat_models import BaseChatModel +from langchain_core.messages import AIMessage, BaseMessage +from langchain_core.outputs import ChatGeneration, ChatResult +from langchain_core.tools import StructuredTool +from langgraph.errors import GraphInterrupt +from langgraph.types import interrupt +from pydantic import BaseModel + +from runtime.agents.responsive import make_agent_node as responsive_make_agent_node +from runtime.config import EmbeddingConfig, MetadataConfig, ProviderConfig +from runtime.graph import GraphState, make_agent_node as graph_make_agent_node, route_from_skill +from runtime.skill import RouteRule, Skill +from runtime.storage.embeddings import build_embedder +from runtime.storage.engine import build_engine +from runtime.storage.models import Base +from runtime.storage.session_store import SessionStore + + +# --------------------------------------------------------------------------- +# Fixtures (mirror test_real_llm_tool_loop_termination shape so the +# repo + session story stays consistent across the file). + + +def _make_repo(tmp_path: Path) -> SessionStore: + eng = build_engine(MetadataConfig(url=f"sqlite:///{tmp_path}/test.db")) + Base.metadata.create_all(eng) + embedder = build_embedder( + EmbeddingConfig(provider="s", model="x", dim=1024), + {"s": ProviderConfig(kind="stub")}, + ) + return SessionStore(engine=eng, embedder=embedder) + + +@pytest.fixture +def repo(tmp_path: Path) -> SessionStore: + return _make_repo(tmp_path) + + +@pytest.fixture +def session(repo: SessionStore): + return repo.create( + query="HITL gate fires mid-tool", + environment="production", + reporter_id="u", + reporter_team="t", + ) + + +# --------------------------------------------------------------------------- +# Stubs + + +class _PauseArgs(BaseModel): + why: str = "test" + + +def _pause_tool_impl(why: str = "test") -> str: # noqa: D401 + """Stand-in for a high-risk tool whose gateway raises interrupt().""" + interrupt({"reason": "approve_me", "why": why}) + return f"ran:{why}" + + +def _make_pause_tool() -> StructuredTool: + return StructuredTool.from_function( + func=_pause_tool_impl, + name="pause_tool", + description="Calls interrupt() — emulates the gateway's HITL gate.", + args_schema=_PauseArgs, + ) + + +class _OneShotToolCallLLM(BaseChatModel): + """Stub LLM that emits exactly one tool_call for ``pause_tool``. + + Mirrors the production agent's first turn: the LLM calls a tool, the + tool gates, and the agent should pause. After resume the tool would + return a real value and the LLM would emit a closing markdown + envelope — but for THIS test we only assert that the first + invocation pauses; the runner must observe ``__interrupt__`` and + re-raise, so a closing turn is unreachable. + """ + + @property + def _llm_type(self) -> str: + return "oneshot-toolcall-stub" + + def _generate( + self, + messages: list[BaseMessage], + stop: list[str] | None = None, + run_manager: Any = None, + **kwargs: Any, + ) -> ChatResult: + msg = AIMessage( + content="", + tool_calls=[{"name": "pause_tool", "args": {"why": "x"}, "id": "c1"}], + ) + return ChatResult(generations=[ChatGeneration(message=msg)]) + + async def _agenerate( + self, + messages: list[BaseMessage], + stop: list[str] | None = None, + run_manager: Any = None, + **kwargs: Any, + ) -> ChatResult: + return self._generate(messages, stop, run_manager, **kwargs) + + def bind_tools(self, tools, *, tool_choice=None, **kwargs): + return self + + +# --------------------------------------------------------------------------- +# graph.make_agent_node + + +@pytest.mark.asyncio +async def test_graph_make_agent_node_raises_on_interrupt_in_result(repo, session): + """When a tool calls ``interrupt()``, langgraph 1.x returns the + pause via ``result["__interrupt__"]`` instead of raising + ``GraphInterrupt``. The graph.py agent runner must detect this and + re-raise so the outer Pregel propagates the pause — otherwise the + envelope-parsing path fires, the session is finalized, and the + pending_approval row written by the gateway is orphaned. + """ + skill = Skill( + name="resolution", + description="d", + routes=[RouteRule(when="default", next="end")], + system_prompt="You are resolution.", + ) + node = graph_make_agent_node( + skill=skill, + llm=_OneShotToolCallLLM(), + tools=[_make_pause_tool()], + decide_route=lambda inc: route_from_skill(skill, inc), + store=repo, + ) + state: GraphState = {"session": session, "next_route": None} + with pytest.raises(GraphInterrupt): + await asyncio.wait_for(node(state), timeout=5.0) + + +# --------------------------------------------------------------------------- +# responsive.make_agent_node + + +@pytest.mark.asyncio +async def test_responsive_make_agent_node_raises_on_interrupt_in_result( + repo, session, +): + """Same contract as above, exercised through the generic responsive + runner. Both runners share the contract: ``__interrupt__`` in the + ainvoke result must be re-raised as ``GraphInterrupt``. + """ + skill = Skill( + name="resolution", + description="d", + routes=[RouteRule(when="default", next="end")], + system_prompt="You are resolution.", + ) + node = responsive_make_agent_node( + skill=skill, + llm=_OneShotToolCallLLM(), + tools=[_make_pause_tool()], + decide_route=lambda inc: route_from_skill(skill, inc), + store=repo, + ) + state: GraphState = {"session": session, "next_route": None} + with pytest.raises(GraphInterrupt): + await asyncio.wait_for(node(state), timeout=5.0) + + +# --------------------------------------------------------------------------- +# Resume happy path — verdict actually reaches the gated tool. +# +# The previous tests cover the FIRST turn (pause). These tests cover the +# RESUME turn: when the outer Pregel is rehydrated via +# ``Command(resume=verdict)``, the helper ``_drive_agent_with_resume`` +# detects the inner agent's paused state via ``aget_state``, calls +# ``interrupt()`` at the outer level to fetch the verdict, and forwards +# it via ``Command(resume=verdict)`` to the inner agent. The inner +# agent's tool then re-enters with the verdict in hand and produces a +# real ToolMessage — which is the only proof that the gated work +# actually executed (a "silent skip" path leaves no ToolMessage +# behind). + + +class _LLMWithCloser(BaseChatModel): + """LLM that emits a tool_call once, then a closing AIMessage.""" + + _counter: int = 0 # pyright: ignore[reportGeneralTypeIssues] — pydantic v2 needs annotation + + @property + def _llm_type(self) -> str: + return "loop-toolcall-stub" + + def _generate( + self, + messages: list[BaseMessage], + stop: list[str] | None = None, + run_manager: Any = None, + **kwargs: Any, + ) -> ChatResult: + self._counter += 1 + if self._counter == 1: + msg = AIMessage( + content="", + tool_calls=[ + {"name": "pause_tool", "args": {"why": "x"}, "id": "c1"}, + ], + ) + else: + # Closing markdown envelope — Path 4 parser will pick it up. + md = ( + "all good\n\n" + "## Response\nall good\n\n" + "## Confidence\n0.9 -- post-resume close\n\n" + "## Signal\nnone\n" + ) + msg = AIMessage(content=md) + return ChatResult(generations=[ChatGeneration(message=msg)]) + + async def _agenerate( + self, + messages: list[BaseMessage], + stop: list[str] | None = None, + run_manager: Any = None, + **kwargs: Any, + ) -> ChatResult: + return self._generate(messages, stop, run_manager, **kwargs) + + def bind_tools(self, tools, *, tool_choice=None, **kwargs): + return self + + +@pytest.mark.asyncio +async def test_resume_forwards_verdict_to_inner_tool_and_completes( + repo, session, +): + """End-to-end pause → resume → tool actually runs with verdict. + + Drives the agent runner across two invocations sharing a + ``langgraph.checkpoint.memory.InMemorySaver``. The first call + raises ``GraphInterrupt`` (paused mid-tool). The second call — + re-entered as the outer would re-enter the node body after a + ``Command(resume=...)`` — must (a) detect the inner pause via + ``aget_state``, (b) forward the verdict via + ``Command(resume=verdict)``, and (c) drive the tool to completion + so a ``ToolMessage`` appears in the final inner message list. The + ToolMessage is the proof that the gated tool actually executed + rather than being silently skipped. + """ + from langgraph.checkpoint.memory import InMemorySaver + from langgraph.types import Command, interrupt + + saver = InMemorySaver() + + # Tool whose interrupt() returns the verdict and whose final + # return value records the verdict so we can assert it ended up in + # the message stream. + class _Args(BaseModel): + why: str = "x" + + captured: dict[str, str | None] = {"verdict": None} + + def _gated(why: str = "x") -> str: + v = interrupt({"need_verdict": why}) + captured["verdict"] = v + return f"ran:{v}" + + gated = StructuredTool.from_function( + func=_gated, name="pause_tool", description="emulates HITL gate", + args_schema=_Args, + ) + + # Build an inner agent the same way the framework would, but + # against the test's saver so the pause survives across calls. + from langchain.agents import create_agent + + llm = _LLMWithCloser() + inner = create_agent( + model=llm, tools=[gated], system_prompt="x", checkpointer=saver, + ) + inner_cfg = {"configurable": {"thread_id": "T-resume-test"}} + + # The helper's ``interrupt()`` calls require a real Pregel + # runnable context — we cannot call ``_drive_agent_with_resume`` + # directly. Exercise it inside a minimal one-node outer graph + # that owns its own checkpointer; that's exactly how the + # framework wires it via ``runtime.graph.build_graph``. + from runtime.graph import _drive_agent_with_resume + from langgraph.graph import StateGraph, START, END + from typing import TypedDict + + class _OuterS(TypedDict, total=False): + result: object + + async def _wrapper(state: _OuterS) -> dict: + out = await _drive_agent_with_resume( + agent_executor=inner, inner_cfg=inner_cfg, + inner_has_checkpointer=True, + initial_input={"messages": [("human", "go")]}, + ) + return {"result": out} + + sg = StateGraph(_OuterS) + sg.add_node("n", _wrapper) + sg.add_edge(START, "n") + sg.add_edge("n", END) + outer = sg.compile(checkpointer=InMemorySaver()) + outer_cfg = {"configurable": {"thread_id": "OUTER-T"}} + + # First outer.invoke({}): the wrapper hits the inner pause and + # then the helper's outer interrupt() pauses the outer too. + first = await outer.ainvoke({}, config=outer_cfg) + assert "__interrupt__" in first + + # Now resume the outer with the verdict — the helper forwards it + # to the inner via Command(resume="approve") and the gated tool + # actually re-enters with the verdict. + final = await outer.ainvoke(Command(resume="approve"), config=outer_cfg) + + inner_result = final.get("result") + assert isinstance(inner_result, dict), f"unexpected inner result: {inner_result!r}" + msgs = inner_result.get("messages") or [] + # Assert a ToolMessage exists with the gated-tool's actual output. + tool_msgs = [m for m in msgs if type(m).__name__ == "ToolMessage"] + assert tool_msgs, ( + "expected a ToolMessage in the final inner messages — its " + "absence means the gated tool was silently skipped on resume " + f"(verdict={captured['verdict']!r}, messages={[type(m).__name__ for m in msgs]})" + ) + assert "ran:approve" in str(tool_msgs[-1].content), ( + f"tool ran but verdict was lost: {tool_msgs[-1].content!r}" + ) + assert captured["verdict"] == "approve", ( + f"tool's interrupt() did not receive 'approve' on resume: " + f"{captured['verdict']!r}" + ) diff --git a/tests/test_markdown_turn_output.py b/tests/test_markdown_turn_output.py new file mode 100644 index 0000000..1d7480a --- /dev/null +++ b/tests/test_markdown_turn_output.py @@ -0,0 +1,395 @@ +"""Phase 22 — markdown turn output parser matrix. + +Pure-function tests on parse_markdown_envelope. Covers the +section-by-section parse + the validation paths the framework relies +on (D-22-03 contract). + +Pinning the contract here is the single point of compat truth — every +LLM that writes the documented ## Response / ## Confidence / +## Signal block round-trips through these tests verbatim. +""" +from __future__ import annotations + +import pytest + +from runtime.agents.turn_output import ( + AgentTurnOutput, + EnvelopeMissingError, + parse_envelope_from_result, + parse_markdown_envelope, +) + + +# --------------------------------------------------------------------------- +# Happy path +# --------------------------------------------------------------------------- + +def test_all_sections_present_em_dash_rationale(): + md = ( + "Some prose preamble that may or may not exist.\n\n" + "## Response\n" + "Looks like a deploy regression in payments-svc.\n\n" + "## Confidence\n" + "0.85 — strong correlation with v1.120 release timeline\n\n" + "## Signal\n" + "default\n" + ) + env = parse_markdown_envelope(md, agent="triage") + assert isinstance(env, AgentTurnOutput) + assert env.content == "Looks like a deploy regression in payments-svc." + assert env.confidence == 0.85 + assert env.confidence_rationale == ( + "strong correlation with v1.120 release timeline" + ) + assert env.signal == "default" + + +def test_ascii_dash_rationale(): + md = ( + "## Response\nfoo\n" + "## Confidence\n0.7 -- ok\n" + "## Signal\nsuccess\n" + ) + env = parse_markdown_envelope(md) + assert env.confidence == 0.7 + assert env.confidence_rationale == "ok" + assert env.signal == "success" + + +def test_single_hyphen_rationale(): + md = ( + "## Response\nfoo\n" + "## Confidence\n0.6 - some rationale\n" + "## Signal\nfailed\n" + ) + env = parse_markdown_envelope(md) + assert env.confidence == 0.6 + assert env.confidence_rationale == "some rationale" + + +# --------------------------------------------------------------------------- +# Confidence shape variants +# --------------------------------------------------------------------------- + +def test_confidence_int_zero_clamps_in_range(): + md = ( + "## Response\nbody\n" + "## Confidence\n0\n" + "## Signal\nfailed\n" + ) + env = parse_markdown_envelope(md) + assert env.confidence == 0.0 + # Missing rationale is replaced with placeholder. + assert env.confidence_rationale == "(no rationale provided)" + + +def test_confidence_int_one(): + md = ( + "## Response\nbody\n" + "## Confidence\n1\n" + "## Signal\nsuccess\n" + ) + env = parse_markdown_envelope(md) + assert env.confidence == 1.0 + + +def test_confidence_above_range_clamps(): + """LLM occasionally emits 1.05 — clamp rather than reject.""" + md = ( + "## Response\nbody\n" + "## Confidence\n1.05 — over\n" + "## Signal\nsuccess\n" + ) + env = parse_markdown_envelope(md) + assert env.confidence == 1.0 + + +def test_confidence_below_range_clamps(): + md = ( + "## Response\nbody\n" + "## Confidence\n-0.2 — under\n" + "## Signal\nfailed\n" + ) + env = parse_markdown_envelope(md) + assert env.confidence == 0.0 + + +def test_confidence_unparseable_raises(): + md = ( + "## Response\nbody\n" + "## Confidence\nhigh — strong\n" + "## Signal\nsuccess\n" + ) + with pytest.raises(EnvelopeMissingError) as exc: + parse_markdown_envelope(md, agent="triage") + assert exc.value.field == "confidence" + assert "triage" in str(exc.value) + + +def test_confidence_section_missing_raises(): + md = ( + "## Response\nbody\n" + "## Signal\nsuccess\n" + ) + with pytest.raises(EnvelopeMissingError) as exc: + parse_markdown_envelope(md) + assert exc.value.field == "confidence" + + +# --------------------------------------------------------------------------- +# Signal vocabulary +# --------------------------------------------------------------------------- + +@pytest.mark.parametrize("blank", ["", "none", "None", "NULL", " ", "n/a"]) +def test_signal_blank_or_none_token_yields_none(blank: str): + md = ( + "## Response\nbody\n" + "## Confidence\n0.5 — mid\n" + f"## Signal\n{blank}\n" + ) + env = parse_markdown_envelope(md) + assert env.signal is None + + +def test_signal_unknown_value_passed_through_lowercased(): + """Vocabulary validation lives in the routing layer, not the parser.""" + md = ( + "## Response\nbody\n" + "## Confidence\n0.5 — mid\n" + "## Signal\nWeirdSignal\n" + ) + env = parse_markdown_envelope(md) + assert env.signal == "weirdsignal" + + +def test_signal_section_missing_yields_none(): + md = ( + "## Response\nbody\n" + "## Confidence\n0.4 — hedged\n" + ) + env = parse_markdown_envelope(md) + assert env.signal is None + + +# --------------------------------------------------------------------------- +# Response body shape +# --------------------------------------------------------------------------- + +def test_multi_line_response_with_code_block(): + md = ( + "## Response\n" + "Here is the fix:\n\n" + "```python\n" + "def add(a, b):\n" + " return a + b\n" + "```\n\n" + "Run pytest after applying.\n\n" + "## Confidence\n" + "0.92 — verified locally\n\n" + "## Signal\n" + "success\n" + ) + env = parse_markdown_envelope(md) + assert "```python" in env.content + assert "Run pytest" in env.content + assert env.confidence == 0.92 + + +def test_response_section_missing_raises(): + md = ( + "## Confidence\n0.7 — ok\n" + "## Signal\nsuccess\n" + ) + with pytest.raises(EnvelopeMissingError) as exc: + parse_markdown_envelope(md) + assert exc.value.field == "content" + + +def test_empty_content_raises(): + with pytest.raises(EnvelopeMissingError) as exc: + parse_markdown_envelope("") + assert exc.value.field == "content" + + +# --------------------------------------------------------------------------- +# Header drift: H3 / lowercase +# --------------------------------------------------------------------------- + +def test_h3_headers_accepted(): + """Watch-out: model may emit ### instead of ##. Parser accepts ##+.""" + md = ( + "### Response\nbody\n" + "### Confidence\n0.5 — mid\n" + "### Signal\ndefault\n" + ) + env = parse_markdown_envelope(md) + assert env.content == "body" + assert env.confidence == 0.5 + assert env.signal == "default" + + +def test_lowercase_headers_accepted(): + md = ( + "## response\nbody\n" + "## confidence\n0.5 — mid\n" + "## signal\nsuccess\n" + ) + env = parse_markdown_envelope(md) + assert env.confidence == 0.5 + assert env.signal == "success" + + +# --------------------------------------------------------------------------- +# parse_envelope_from_result Path 4 integration +# --------------------------------------------------------------------------- + +def _ai(content: str): + """Construct an object that quacks like an AIMessage for path-4 + class-name walker.""" + obj = type("AIMessage", (), {})() + obj.content = content # type: ignore[attr-defined] + return obj + + +def test_parse_envelope_from_result_falls_through_to_path_4_markdown(): + """No structured_response, no JSON in messages — Path 4 wins.""" + md = ( + "## Response\nbody from path 4\n" + "## Confidence\n0.8 — ok\n" + "## Signal\nsuccess\n" + ) + result = {"messages": [_ai(md)], "structured_response": None} + env = parse_envelope_from_result(result, agent="triage") + assert env.content == "body from path 4" + assert env.confidence == 0.8 + assert env.signal == "success" + + +def test_parse_envelope_from_result_path_1_still_wins_when_present(): + """Markdown parser doesn't shadow the structured_response path.""" + md = ( + "## Response\nshould not be picked\n" + "## Confidence\n0.1 — wrong\n" + "## Signal\nfailed\n" + ) + sr = AgentTurnOutput( + content="from structured_response", + confidence=0.99, + confidence_rationale="strong", + signal="success", + ) + result = {"messages": [_ai(md)], "structured_response": sr} + env = parse_envelope_from_result(result, agent="triage") + assert env.content == "from structured_response" + assert env.confidence == 0.99 + + +def test_parse_envelope_from_result_raises_when_no_path_yields(): + """Empty messages + no structured_response + no markdown → raises.""" + result = {"messages": [_ai("just prose, no sections")]} + with pytest.raises(EnvelopeMissingError): + parse_envelope_from_result(result, agent="triage") + + +def test_en_dash_rationale_gpt_oss_pattern(): + """gpt-oss:20b emits the EN DASH (U+2013) as the confidence + separator, not the EM DASH (U+2014). The parser must accept the + full Unicode dash-punctuation family.""" + md = "## Response\nbody\n## Confidence\n0.88 – strong evidence\n## Signal\nsuccess\n" + env = parse_markdown_envelope(md) + assert env.confidence == 0.88 + assert env.confidence_rationale == "strong evidence" + assert env.signal == "success" + + +@pytest.mark.parametrize("sep", ["-", "‐", "‑", "‒", "–", "—", "―"]) +def test_dash_family_variants_all_parse(sep): + """Sanity sweep: every Unicode dash in the regex class round-trips.""" + md = f"## Response\nbody\n## Confidence\n0.5 {sep} ok\n## Signal\ndefault\n" + env = parse_markdown_envelope(md) + assert env.confidence == 0.5 + assert env.confidence_rationale == "ok" + + +def test_path5_synthesises_envelope_from_terminal_tool_args(): + """gpt-oss:20b sometimes emits an empty closing AIMessage after + calling a terminal tool. Path 5 synthesises the envelope from + the tool's confidence + confidence_rationale args (mark_resolved + style).""" + from runtime.agents.turn_output import parse_envelope_from_result + + def _ai_with_tool(name: str, args: dict, content: str = ""): + obj = type("AIMessage", (), {})() + obj.content = content + obj.tool_calls = [{"name": name, "args": args, "id": "t1"}] + return obj + + def _ai_empty(): + obj = type("AIMessage", (), {})() + obj.content = "" + obj.tool_calls = [] + return obj + + messages = [ + _ai_with_tool( + "mark_resolved", + { + "incident_id": "INC-1", + "resolution_summary": "rolled back v1.120", + "confidence": 0.92, + "confidence_rationale": "deploy timeline correlates", + }, + ), + _ai_empty(), # empty closing message + ] + result = {"messages": messages, "structured_response": None} + env = parse_envelope_from_result(result, agent="resolution") + assert env.content == "rolled back v1.120" + assert env.confidence == 0.92 + assert env.confidence_rationale == "deploy timeline correlates" + + +def test_path5_synthesis_clamps_oob_confidence(): + """Synthesis path also clamps out-of-range confidence.""" + from runtime.agents.turn_output import parse_envelope_from_result + + obj = type("AIMessage", (), {})() + obj.content = "" + obj.tool_calls = [{ + "name": "mark_escalated", + "args": { + "team": "platform-oncall", + "reason": "needs human review", + "confidence": 1.5, + "confidence_rationale": "manual escalation", + }, + "id": "t1", + }] + result = {"messages": [obj]} + env = parse_envelope_from_result(result, agent="resolution") + assert env.confidence == 1.0 # clamped + assert env.content == "needs human review" + + +def test_path6_synthesises_minimal_envelope_from_any_tool_call(): + """Path 6: when no markdown + no typed-terminal-tool-with-conf, + but the model DID call some tool, synthesise a low-confidence + placeholder envelope so the session reaches a reviewable terminal + status instead of hard-failing.""" + from runtime.agents.turn_output import parse_envelope_from_result + + obj = type("AIMessage", (), {})() + obj.content = "" + obj.tool_calls = [ + {"name": "propose_fix", "args": {"hypothesis": "deploy regression"}, "id": "t1"}, + {"name": "apply_fix", "args": {"proposal_id": "p1"}, "id": "t2"}, + ] + closing = type("AIMessage", (), {})() + closing.content = "" + closing.tool_calls = [] + result = {"messages": [obj, closing], "structured_response": None} + env = parse_envelope_from_result(result, agent="resolution") + assert env.confidence == 0.30 + assert "propose_fix" in env.content + assert "apply_fix" in env.content + assert env.signal is None diff --git a/tests/test_real_llm_tool_loop_termination.py b/tests/test_real_llm_tool_loop_termination.py index 8db3284..fc3bae7 100644 --- a/tests/test_real_llm_tool_loop_termination.py +++ b/tests/test_real_llm_tool_loop_termination.py @@ -282,26 +282,27 @@ def test_no_create_react_agent_imports_in_production_runtime(): # T4-bonus — StubChatModel.bind_tools registers the envelope tool name -def test_stub_chat_model_records_envelope_tool_name_on_bind(): - """``StubChatModel.bind_tools`` is the integration point that lets - the new ``create_agent`` loop terminate in stub mode. This test - locks the contract: when the bound tools include an - ``AgentTurnOutput``-named entry, the stub records it and emits a - closing tool call with that name on the next ``_generate``. - """ +def test_stub_chat_model_emits_markdown_envelope_block(): + """Phase 22 (D-22-05) replaces the Phase 15 envelope-tool + synthesis. ``StubChatModel.bind_tools`` is now a no-op; ``_generate`` + emits the closing turn as a markdown block matching the D-22-03 + contract that ``parse_markdown_envelope`` reads.""" llm = StubChatModel(role="agent", canned_responses={"agent": "ok"}) - # Simulate what create_agent's ToolStrategy passes: a sequence of - # tool specs where the AgentTurnOutput-named tool is the structured- - # output sentinel. - llm.bind_tools([AgentTurnOutput]) - assert llm._envelope_tool_name == "AgentTurnOutput" + # bind_tools no longer mutates state — it is a pass-through. + bound = llm.bind_tools([AgentTurnOutput]) + assert bound is llm + assert not hasattr(llm, "_envelope_tool_name") - # Drive a single _generate and verify the closing tool call lands. + # Drive a single _generate and verify the closing AIMessage carries + # the markdown envelope (no tool_calls when no tool_call_plan). result = llm._generate(messages=[HumanMessage(content="go")]) msg = result.generations[0].message - assert msg.tool_calls, "expected a closing envelope tool call" - assert msg.tool_calls[0]["name"] == "AgentTurnOutput" - args = msg.tool_calls[0]["args"] - assert args["content"] == "ok" - assert args["confidence"] == pytest.approx(0.85) - assert "confidence_rationale" in args + assert not msg.tool_calls + body = msg.content + assert "## Response" in body + assert "## Confidence" in body + assert "## Signal" in body + # The default stub_envelope_confidence (0.85) and the canned text + # ("ok") flow into the markdown body. + assert "ok" in body + assert "0.8500" in body