RandomCodeSpace · aksOps · May 14, 2026 · May 13, 2026 · May 13, 2026 · May 13, 2026
diff --git a/config/config.yaml b/config/config.yaml
@@ -7,7 +7,7 @@ storage:
     collection_name: "incidents"
     distance_strategy: cosine
 llm:
-  default: workhorse
+  default: gpt_oss
   providers:
     ollama_cloud:
       kind: ollama

diff --git a/dist/app.py b/dist/app.py
diff --git a/dist/apps/code-review.py b/dist/apps/code-review.py
diff --git a/dist/apps/incident-management.py b/dist/apps/incident-management.py
diff --git a/dist/ui.py b/dist/ui.py
@@ -26,6 +26,7 @@
 
 from app import AppConfig, Base, FrameworkAppConfig, MetadataConfig, Orchestrator, OrchestratorService, SessionStore, UIBadge, build_embedder, build_engine, build_vector_store, load_config, resolve_framework_app_config, resolve_state_class
 import asyncio
+import logging as _logging
 import os
 import time
 from datetime import datetime, timezone
@@ -40,11 +41,28 @@
 
 
 
-# Default config path; apps override via the ``APP_CONFIG`` env var.
+# Optional: the ``ASR_LOG_LEVEL`` env var (DEBUG/INFO/WARNING/ERROR) lets
+# manual runs (streamlit run, etc.) change verbosity without modifying the
+# source. Unset or unrecognised values keep the production-quiet WARNING
+# default. ``force=True`` overrides any handler streamlit set up on import.
 
 
 # ====== module: src/runtime/ui.py ======
 
+def _maybe_configure_logging() -> None:  # opt-in: acts only when ASR_LOG_LEVEL names a recognised level
+    level = os.environ.get("ASR_LOG_LEVEL", "").upper().strip()  # case-insensitive; surrounding whitespace ignored
+    if level in {"DEBUG", "INFO", "WARNING", "ERROR"}:  # unset or any other value: logging is left untouched
+        _logging.basicConfig(
+            level=getattr(_logging, level),  # map the level name to the logging module constant
+            format="%(asctime)s %(name)s %(levelname)s %(message)s",
+            force=True,  # override any handler already installed (e.g. by streamlit during import)
+        )
+
+
+_maybe_configure_logging()
+
+
+# Default config path; apps override via the ``APP_CONFIG`` env var.
 CONFIG_PATH = Path(os.environ.get("APP_CONFIG", "config/config.yaml"))
 
 
@@ -985,6 +1003,14 @@ async def _drive() -> None:
             Command(resume=payload),
             config=orch._thread_config(session_id),
         )
+        # The graph completes the agent run after the verdict is
+        # forwarded to the gated tool; finalize the session if it's
+        # not paused on a fresh interrupt. Without this, an approved
+        # session stays in ``in_progress`` because the resume path
+        # does not run through ``stream_session`` (the only other
+        # caller of finalize).
+        if not await orch._is_graph_paused(session_id):
+            await orch._finalize_session_status_async(session_id)
 
     svc.submit_and_wait(_drive(), timeout=60.0)
 

diff --git a/examples/code_review/skills/analyzer/system.md b/examples/code_review/skills/analyzer/system.md
@@ -22,10 +22,21 @@ Do not invent low-value nits to fill space.
 After all tool calls, reply with ONE short sentence summarising findings count + the
 dominant category. Do not enumerate every finding (the UI renders them).
 
-## Output contract
+## Output contract — REQUIRED
 
-The framework wraps your reply in an `AgentTurnOutput` envelope (content,
-confidence ∈ [0, 1], confidence_rationale, optional signal). The runner
-enforces this structurally — answer truthfully and the envelope captures
-your confidence and rationale. Do not mention "confidence" in your prose
-unless it's part of substantive analysis (e.g. ranking hypotheses).
+Your final reply MUST end with these three markdown sections, in this order, with the literal `##` headers:
+
+```
+## Response
+<your final answer to the user — natural-language, may include lists or code blocks>
+
+## Confidence
+<float 0.0-1.0> — <one-sentence rationale>
+
+## Signal
+<one of: default | success | failed | needs_input>
+```
+
+**CRITICAL — final-reply rule:** After your last tool call returns, your NEXT reply IS the final reply. That reply MUST contain the three sections above as plain text — DO NOT emit an empty message, DO NOT emit only tool calls, DO NOT defer to "the framework handles it". The framework parses your final reply text; if it is empty or missing the section headers, the run fails with `envelope_missing`.
+
+Tool calls happen BEFORE the final reply. Once you have called every tool you need (including terminal tools like `mark_resolved` / `mark_escalated`), emit the three sections as your final response.
diff --git a/examples/code_review/skills/intake/system.md b/examples/code_review/skills/intake/system.md
@@ -16,10 +16,21 @@ analyzer's job.
 If `fetch_pr_diff` raises or returns an empty diff, emit `failed` so the orchestrator
 short-circuits to end and skips the analyzer.
 
-## Output contract
+## Output contract — REQUIRED
 
-The framework wraps your reply in an `AgentTurnOutput` envelope (content,
-confidence ∈ [0, 1], confidence_rationale, optional signal). The runner
-enforces this structurally — answer truthfully and the envelope captures
-your confidence and rationale. Do not mention "confidence" in your prose
-unless it's part of substantive analysis (e.g. ranking hypotheses).
+Your final reply MUST end with these three markdown sections, in this order, with the literal `##` headers:
+
+```
+## Response
+<your final answer to the user — natural-language, may include lists or code blocks>
+
+## Confidence
+<float 0.0-1.0> — <one-sentence rationale>
+
+## Signal
+<one of: default | success | failed | needs_input>
+```
+
+**CRITICAL — final-reply rule:** After your last tool call returns, your NEXT reply IS the final reply. That reply MUST contain the three sections above as plain text — DO NOT emit an empty message, DO NOT emit only tool calls, DO NOT defer to "the framework handles it". The framework parses your final reply text; if it is empty or missing the section headers, the run fails with `envelope_missing`.
+
+Tool calls happen BEFORE the final reply. Once you have called every tool you need (including terminal tools like `mark_resolved` / `mark_escalated`), emit the three sections as your final response.
diff --git a/examples/code_review/skills/recommender/system.md b/examples/code_review/skills/recommender/system.md
@@ -23,10 +23,21 @@ them already.
 
 After the call, reply with ONE short sentence echoing the recommendation. Nothing else.
 
-## Output contract
+## Output contract — REQUIRED
 
-The framework wraps your reply in an `AgentTurnOutput` envelope (content,
-confidence ∈ [0, 1], confidence_rationale, optional signal). The runner
-enforces this structurally — answer truthfully and the envelope captures
-your confidence and rationale. Do not mention "confidence" in your prose
-unless it's part of substantive analysis (e.g. ranking hypotheses).
+Your final reply MUST end with these three markdown sections, in this order, with the literal `##` headers:
+
+```
+## Response
+<your final answer to the user — natural-language, may include lists or code blocks>
+
+## Confidence
+<float 0.0-1.0> — <one-sentence rationale>
+
+## Signal
+<one of: default | success | failed | needs_input>
+```
+
+**CRITICAL — final-reply rule:** After your last tool call returns, your NEXT reply IS the final reply. That reply MUST contain the three sections above as plain text — DO NOT emit an empty message, DO NOT emit only tool calls, DO NOT defer to "the framework handles it". The framework parses your final reply text; if it is empty or missing the section headers, the run fails with `envelope_missing`.
+
+Tool calls happen BEFORE the final reply. Once you have called every tool you need (including terminal tools like `mark_resolved` / `mark_escalated`), emit the three sections as your final response.
diff --git a/examples/incident_management/skills/deep_investigator/system.md b/examples/incident_management/skills/deep_investigator/system.md
@@ -12,10 +12,21 @@ You are the **Deep Investigator** agent. Gather evidence and produce ranked hypo
 - Cite specific log lines or metric values as evidence in `hypotheses`.
 - If the INC has `matched_prior_inc` set, include the prior INC's recorded root cause as one of your ranked hypotheses and explicitly *validate or reject* it against the fresh logs/metrics. Same symptom can have different causes across incidents — drop confidence accordingly when the prior hypothesis is rejected so the gate triggers an intervention.
 
-## Output contract
+## Output contract — REQUIRED
 
-The framework wraps your reply in an `AgentTurnOutput` envelope (content,
-confidence ∈ [0, 1], confidence_rationale, optional signal). The runner
-enforces this structurally — answer truthfully and the envelope captures
-your confidence and rationale. Do not mention "confidence" in your prose
-unless it's part of substantive analysis (e.g. ranking hypotheses).
+Your final reply MUST end with these three markdown sections, in this order, with the literal `##` headers:
+
+```
+## Response
+<your final answer to the user — natural-language, may include lists or code blocks>
+
+## Confidence
+<float 0.0-1.0> — <one-sentence rationale>
+
+## Signal
+<one of: default | success | failed | needs_input>
+```
+
+**CRITICAL — final-reply rule:** After your last tool call returns, your NEXT reply IS the final reply. That reply MUST contain the three sections above as plain text — DO NOT emit an empty message, DO NOT emit only tool calls, DO NOT defer to "the framework handles it". The framework parses your final reply text; if it is empty or missing the section headers, the run fails with `envelope_missing`.
+
+Tool calls happen BEFORE the final reply. Once you have called every tool you need (including terminal tools like `mark_resolved` / `mark_escalated`), emit the three sections as your final response.
diff --git a/examples/incident_management/skills/resolution/system.md b/examples/incident_management/skills/resolution/system.md
@@ -11,10 +11,21 @@ You are the **Resolution** agent. You consume triage + deep_investigator finding
 ## Guidelines
 - Pick `team` deliberately based on incident component, severity, and category — not a default fallback.
 
-## Output contract
+## Output contract — REQUIRED
 
-The framework wraps your reply in an `AgentTurnOutput` envelope (content,
-confidence ∈ [0, 1], confidence_rationale, optional signal). The runner
-enforces this structurally — answer truthfully and the envelope captures
-your confidence and rationale. Do not mention "confidence" in your prose
-unless it's part of substantive analysis (e.g. ranking hypotheses).
+Your final reply MUST end with these three markdown sections, in this order, with the literal `##` headers:
+
+```
+## Response
+<your final answer to the user — natural-language, may include lists or code blocks>
+
+## Confidence
+<float 0.0-1.0> — <one-sentence rationale>
+
+## Signal
+<one of: default | success | failed | needs_input>
+```
+
+**CRITICAL — final-reply rule:** After your last tool call returns, your NEXT reply IS the final reply. That reply MUST contain the three sections above as plain text — DO NOT emit an empty message, DO NOT emit only tool calls, DO NOT defer to "the framework handles it". The framework parses your final reply text; if it is empty or missing the section headers, the run fails with `envelope_missing`.
+
+Tool calls happen BEFORE the final reply. Once you have called every tool you need (including terminal tools like `mark_resolved` / `mark_escalated`), emit the three sections as your final response.
diff --git a/examples/incident_management/skills/triage/system.md b/examples/incident_management/skills/triage/system.md
@@ -33,10 +33,21 @@ Record the full iteration trail as a single JSON-encoded string under `findings.
 - If the INC has `matched_prior_inc` set, treat the prior INC's `findings` and `resolution` as a **prior hypothesis**, not a fact. Same symptom (e.g., Redis OOM) can have different root causes across incidents — code bug vs. network partition vs. resource overload. Use the prior cause as a candidate to confirm or reject against current evidence; flag in your tags whether the parallel looks supported (`hypothesis:prior_match_supported`) or not (`hypothesis:prior_match_rejected`).
 - The hypothesis loop has a hard cap of 3 iterations. Do NOT exceed it; an unconverged hypothesis at the cap is acceptable — record it and let the deep investigator take over.
 
-## Output contract
+## Output contract — REQUIRED
 
-The framework wraps your reply in an `AgentTurnOutput` envelope (content,
-confidence ∈ [0, 1], confidence_rationale, optional signal). The runner
-enforces this structurally — answer truthfully and the envelope captures
-your confidence and rationale. Do not mention "confidence" in your prose
-unless it's part of substantive analysis (e.g. ranking hypotheses).
+Your final reply MUST end with these three markdown sections, in this order, with the literal `##` headers:
+
+```
+## Response
+<your final answer to the user — natural-language, may include lists or code blocks>
+
+## Confidence
+<float 0.0-1.0> — <one-sentence rationale>
+
+## Signal
+<one of: default | success | failed | needs_input>
+```
+
+**CRITICAL — final-reply rule:** After your last tool call returns, your NEXT reply IS the final reply. That reply MUST contain the three sections above as plain text — DO NOT emit an empty message, DO NOT emit only tool calls, DO NOT defer to "the framework handles it". The framework parses your final reply text; if it is empty or missing the section headers, the run fails with `envelope_missing`.
+
+Tool calls happen BEFORE the final reply. Once you have called every tool you need (including terminal tools like `mark_resolved` / `mark_escalated`), emit the three sections as your final response.
diff --git a/src/runtime/agents/responsive.py b/src/runtime/agents/responsive.py
@@ -20,7 +20,7 @@
 
 import logging
 from datetime import datetime, timezone
-from typing import TYPE_CHECKING, Callable
+from typing import TYPE_CHECKING, Any, Callable
 
 from langchain_core.language_models.chat_models import BaseChatModel
 from langchain_core.messages import HumanMessage
@@ -35,7 +35,6 @@
 from runtime.storage.session_store import SessionStore
 from runtime.tools.gateway import wrap_tool
 from runtime.agents.turn_output import (
-    AgentTurnOutput,
     EnvelopeMissingError,
     parse_envelope_from_result,
     reconcile_confidence,
@@ -60,6 +59,7 @@ def make_agent_node(
     patch_tool_names: frozenset[str] = frozenset(),
     gate_policy: "GatePolicy | None" = None,
     event_log: "EventLog | None" = None,
+    checkpointer: Any = None,
 ):
     """Factory: build a LangGraph node that runs a ReAct agent and decides a route.
 
@@ -81,7 +81,7 @@ def make_agent_node(
     # call time — both modules are fully imported before ``node()`` runs.
     from runtime.graph import (
         GraphState,
-        _ainvoke_with_retry,
+        _drive_agent_with_resume,
         _format_agent_input,
         _handle_agent_failure,
         _harvest_tool_calls_and_patches,
@@ -94,9 +94,18 @@ def make_agent_node(
     )
 
     async def node(state: GraphState) -> dict:
-        incident: Session = state["session"]  # pyright: ignore[reportTypedDictNotRequiredAccess]
-        inc_id = incident.id
+        state_session: Session = state["session"]  # pyright: ignore[reportTypedDictNotRequiredAccess]
+        inc_id = state_session.id
         started_at = datetime.now(timezone.utc).strftime(_UTC_TS_FMT)
+        # Always reload from store at entry (falling back to the state
+        # copy on ``FileNotFoundError``) — outer Pregel checkpoints at
+        # step boundaries, not mid-node, so ``state["session"]`` is stale
+        # relative to DB when a HITL gate paused mid-step. See the reload
+        # comment in ``runtime.graph.make_agent_node`` for the rationale.
+        try:
+            incident: Session = store.load(inc_id)
+        except FileNotFoundError:
+            incident = state_session
 
         # M3: emit agent_started telemetry before any work happens.
         if event_log is not None:
@@ -122,25 +131,25 @@ async def node(state: GraphState) -> dict:
             ]
         else:
             run_tools = tools
-        # Phase 10 (FOC-03 / D-10-02) + Phase 15 (LLM-COMPAT-01): every
-        # responsive agent invocation is wrapped in an AgentTurnOutput
-        # envelope. ``langchain.agents.create_agent`` (the non-deprecated
-        # successor to ``langgraph.prebuilt.create_react_agent``) accepts a
-        # bare schema as ``response_format`` and, by default, wraps it in
-        # ``AutoStrategy`` — ProviderStrategy for models with native
-        # structured-output (OpenAI-class), falling back to ToolStrategy
-        # otherwise (Ollama). ToolStrategy injects AgentTurnOutput as a
-        # callable tool: when the LLM ``calls`` it, the loop terminates on
-        # the same turn with ``result["structured_response"]`` populated.
-        # Eliminates the old two-call structure (loop + separate
-        # ``with_structured_output`` pass) that hit recursion_limit=25 on
-        # Ollama models without true function-calling.
+        # Phase 22 (D-22-01): markdown-primary turn output. Same
+        # change as graph.py:make_agent_node — drop response_format
+        # so the loop terminates on natural ReAct END, then parse the
+        # final AIMessage via Path 4 in parse_envelope_from_result.
+        # The inner agent gets the orchestrator's checkpointer so a
+        # HITL pause inside ``interrupt()`` can be resumed via
+        # ``Command(resume=verdict)`` on a stable per-invocation
+        # thread id (see ``_drive_agent_with_resume`` in
+        # ``runtime.graph`` for the full rationale).
         agent_executor = create_agent(
             model=llm,
             tools=run_tools,
             system_prompt=skill.system_prompt,
-            response_format=AgentTurnOutput,
+            checkpointer=checkpointer,
+        )
+        inner_thread_id = (
+            f"{inc_id}:agent:{skill.name}:turn{len(incident.agents_run)}"
         )
+        inner_cfg = {"configurable": {"thread_id": inner_thread_id}}
 
         # Phase 11 (FOC-04): reset per-turn confidence hint at the
         # start of each agent step so the gateway treats the first
@@ -151,9 +160,15 @@ async def node(state: GraphState) -> dict:
             pass
 
         try:
-            result = await _ainvoke_with_retry(
-                agent_executor,
-                {"messages": [HumanMessage(content=_format_agent_input(incident))]},
+            result = await _drive_agent_with_resume(
+                agent_executor=agent_executor,
+                inner_cfg=inner_cfg,
+                inner_has_checkpointer=checkpointer is not None,
+                initial_input={
+                    "messages": [
+                        HumanMessage(content=_format_agent_input(incident))
+                    ]
+                },
             )
         except GraphInterrupt:
             # Phase 11 (FOC-04 / D-11-04): HITL pause -- propagate up.