Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ storage:
collection_name: "incidents"
distance_strategy: cosine
llm:
default: workhorse
default: gpt_oss
providers:
ollama_cloud:
kind: ollama
Expand Down
726 changes: 604 additions & 122 deletions dist/app.py

Large diffs are not rendered by default.

726 changes: 604 additions & 122 deletions dist/apps/code-review.py

Large diffs are not rendered by default.

726 changes: 604 additions & 122 deletions dist/apps/incident-management.py

Large diffs are not rendered by default.

28 changes: 27 additions & 1 deletion dist/ui.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@

from app import AppConfig, Base, FrameworkAppConfig, MetadataConfig, Orchestrator, OrchestratorService, SessionStore, UIBadge, build_embedder, build_engine, build_vector_store, load_config, resolve_framework_app_config, resolve_state_class
import asyncio
import logging as _logging
import os
import time
from datetime import datetime, timezone
Expand All @@ -40,11 +41,28 @@



# Default config path; apps override via the ``APP_CONFIG`` env var.
# Optional: env-driven log config so manual runs (``streamlit run``, etc.)
# can raise verbosity to INFO/DEBUG via the ``ASR_LOG_LEVEL`` env var
# without modifying the source. When the variable is unset or not a
# recognised level, logging is left untouched, which keeps the
# production-quiet WARNING default. ``force=True`` overrides any handler
# streamlit set up during import.


# ====== module: src/runtime/ui.py ======

def _maybe_configure_logging() -> None:
    """Configure root logging from the ``ASR_LOG_LEVEL`` environment variable.

    The value is matched case-insensitively, ignoring surrounding
    whitespace. When it names a standard level (``DEBUG``, ``INFO``,
    ``WARNING``, ``ERROR`` or ``CRITICAL``) the root logger is
    reconfigured at that level with a timestamped format. Any other
    value, including unset or empty, leaves logging untouched.
    """
    level = os.environ.get("ASR_LOG_LEVEL", "").strip().upper()
    if level in {"DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"}:
        # force=True replaces handlers already attached to the root
        # logger, so the requested level actually takes effect even if
        # something configured logging earlier during import.
        _logging.basicConfig(
            level=getattr(_logging, level),
            format="%(asctime)s %(name)s %(levelname)s %(message)s",
            force=True,
        )


# Apply the env-driven log level at import time, before the rest of this
# module's top-level code runs.
_maybe_configure_logging()


# Default config path; apps override via the ``APP_CONFIG`` env var.
CONFIG_PATH = Path(os.environ.get("APP_CONFIG", "config/config.yaml"))


Expand Down Expand Up @@ -985,6 +1003,14 @@ async def _drive() -> None:
Command(resume=payload),
config=orch._thread_config(session_id),
)
# The graph completes the agent run after the verdict is
# forwarded to the gated tool; finalize the session if it's
# not paused on a fresh interrupt. Without this, an approved
# session stays in ``in_progress`` because the resume path
# does not run through ``stream_session`` (the only other
# caller of finalize).
if not await orch._is_graph_paused(session_id):
await orch._finalize_session_status_async(session_id)

svc.submit_and_wait(_drive(), timeout=60.0)

Expand Down
23 changes: 17 additions & 6 deletions examples/code_review/skills/analyzer/system.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,21 @@ Do not invent low-value nits to fill space.
After all tool calls, reply with ONE short sentence summarising the findings count and the
dominant category, placed under the `## Response` header of the output contract below.
Do not enumerate every finding (the UI renders them).

## Output contract
## Output contract — REQUIRED

The framework wraps your reply in an `AgentTurnOutput` envelope (content,
confidence ∈ [0, 1], confidence_rationale, optional signal). The runner
enforces this structurally — answer truthfully and the envelope captures
your confidence and rationale. Do not mention "confidence" in your prose
unless it's part of substantive analysis (e.g. ranking hypotheses).
Your final reply MUST end with these three markdown sections, in this order, with the literal `##` headers:

```
## Response
<your final answer to the user — natural-language, may include lists or code blocks>

## Confidence
<float 0.0-1.0> — <one-sentence rationale>

## Signal
<one of: default | success | failed | needs_input>
```

**CRITICAL — final-reply rule:** After your last tool call returns, your NEXT reply IS the final reply. That reply MUST contain the three sections above as plain text — DO NOT emit an empty message, DO NOT emit only tool calls, DO NOT defer to "the framework handles it". The framework parses your final reply text; if it is empty or missing the section headers, the run fails with `envelope_missing`.

Tool calls happen BEFORE the final reply. Once you have called every tool you need (including terminal tools like `mark_resolved` / `mark_escalated`), emit the three sections as your final response.
23 changes: 17 additions & 6 deletions examples/code_review/skills/intake/system.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,21 @@ analyzer's job.
If `fetch_pr_diff` raises or returns an empty diff, emit `failed` so the orchestrator
short-circuits to end and skips the analyzer.

## Output contract
## Output contract — REQUIRED

The framework wraps your reply in an `AgentTurnOutput` envelope (content,
confidence ∈ [0, 1], confidence_rationale, optional signal). The runner
enforces this structurally — answer truthfully and the envelope captures
your confidence and rationale. Do not mention "confidence" in your prose
unless it's part of substantive analysis (e.g. ranking hypotheses).
Your final reply MUST end with these three markdown sections, in this order, with the literal `##` headers:

```
## Response
<your final answer to the user — natural-language, may include lists or code blocks>

## Confidence
<float 0.0-1.0> — <one-sentence rationale>

## Signal
<one of: default | success | failed | needs_input>
```

**CRITICAL — final-reply rule:** After your last tool call returns, your NEXT reply IS the final reply. That reply MUST contain the three sections above as plain text — DO NOT emit an empty message, DO NOT emit only tool calls, DO NOT defer to "the framework handles it". The framework parses your final reply text; if it is empty or missing the section headers, the run fails with `envelope_missing`.

Tool calls happen BEFORE the final reply. Once you have called every tool you need (including terminal tools like `mark_resolved` / `mark_escalated`), emit the three sections as your final response.
23 changes: 17 additions & 6 deletions examples/code_review/skills/recommender/system.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,21 @@ them already.

After the call, reply with ONE short sentence echoing the recommendation, placed under the `## Response` header of the output contract below. Add nothing else to that section.

## Output contract
## Output contract — REQUIRED

The framework wraps your reply in an `AgentTurnOutput` envelope (content,
confidence ∈ [0, 1], confidence_rationale, optional signal). The runner
enforces this structurally — answer truthfully and the envelope captures
your confidence and rationale. Do not mention "confidence" in your prose
unless it's part of substantive analysis (e.g. ranking hypotheses).
Your final reply MUST end with these three markdown sections, in this order, with the literal `##` headers:

```
## Response
<your final answer to the user — natural-language, may include lists or code blocks>

## Confidence
<float 0.0-1.0> — <one-sentence rationale>

## Signal
<one of: default | success | failed | needs_input>
```

**CRITICAL — final-reply rule:** After your last tool call returns, your NEXT reply IS the final reply. That reply MUST contain the three sections above as plain text — DO NOT emit an empty message, DO NOT emit only tool calls, DO NOT defer to "the framework handles it". The framework parses your final reply text; if it is empty or missing the section headers, the run fails with `envelope_missing`.

Tool calls happen BEFORE the final reply. Once you have called every tool you need (including terminal tools like `mark_resolved` / `mark_escalated`), emit the three sections as your final response.
23 changes: 17 additions & 6 deletions examples/incident_management/skills/deep_investigator/system.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,21 @@ You are the **Deep Investigator** agent. Gather evidence and produce ranked hypo
- Cite specific log lines or metric values as evidence in `hypotheses`.
- If the INC has `matched_prior_inc` set, include the prior INC's recorded root cause as one of your ranked hypotheses and explicitly *validate or reject* it against the fresh logs/metrics. Same symptom can have different causes across incidents — drop confidence accordingly when the prior hypothesis is rejected so the gate triggers an intervention.

## Output contract
## Output contract — REQUIRED

The framework wraps your reply in an `AgentTurnOutput` envelope (content,
confidence ∈ [0, 1], confidence_rationale, optional signal). The runner
enforces this structurally — answer truthfully and the envelope captures
your confidence and rationale. Do not mention "confidence" in your prose
unless it's part of substantive analysis (e.g. ranking hypotheses).
Your final reply MUST end with these three markdown sections, in this order, with the literal `##` headers:

```
## Response
<your final answer to the user — natural-language, may include lists or code blocks>

## Confidence
<float 0.0-1.0> — <one-sentence rationale>

## Signal
<one of: default | success | failed | needs_input>
```

**CRITICAL — final-reply rule:** After your last tool call returns, your NEXT reply IS the final reply. That reply MUST contain the three sections above as plain text — DO NOT emit an empty message, DO NOT emit only tool calls, DO NOT defer to "the framework handles it". The framework parses your final reply text; if it is empty or missing the section headers, the run fails with `envelope_missing`.

Tool calls happen BEFORE the final reply. Once you have called every tool you need (including terminal tools like `mark_resolved` / `mark_escalated`), emit the three sections as your final response.
23 changes: 17 additions & 6 deletions examples/incident_management/skills/resolution/system.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,21 @@ You are the **Resolution** agent. You consume triage + deep_investigator finding
## Guidelines
- Pick `team` deliberately based on incident component, severity, and category — not a default fallback.

## Output contract
## Output contract — REQUIRED

The framework wraps your reply in an `AgentTurnOutput` envelope (content,
confidence ∈ [0, 1], confidence_rationale, optional signal). The runner
enforces this structurally — answer truthfully and the envelope captures
your confidence and rationale. Do not mention "confidence" in your prose
unless it's part of substantive analysis (e.g. ranking hypotheses).
Your final reply MUST end with these three markdown sections, in this order, with the literal `##` headers:

```
## Response
<your final answer to the user — natural-language, may include lists or code blocks>

## Confidence
<float 0.0-1.0> — <one-sentence rationale>

## Signal
<one of: default | success | failed | needs_input>
```

**CRITICAL — final-reply rule:** After your last tool call returns, your NEXT reply IS the final reply. That reply MUST contain the three sections above as plain text — DO NOT emit an empty message, DO NOT emit only tool calls, DO NOT defer to "the framework handles it". The framework parses your final reply text; if it is empty or missing the section headers, the run fails with `envelope_missing`.

Tool calls happen BEFORE the final reply. Once you have called every tool you need (including terminal tools like `mark_resolved` / `mark_escalated`), emit the three sections as your final response.
23 changes: 17 additions & 6 deletions examples/incident_management/skills/triage/system.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,10 +33,21 @@ Record the full iteration trail as a single JSON-encoded string under `findings.
- If the INC has `matched_prior_inc` set, treat the prior INC's `findings` and `resolution` as a **prior hypothesis**, not a fact. Same symptom (e.g., Redis OOM) can have different root causes across incidents — code bug vs. network partition vs. resource overload. Use the prior cause as a candidate to confirm or reject against current evidence; flag in your tags whether the parallel looks supported (`hypothesis:prior_match_supported`) or not (`hypothesis:prior_match_rejected`).
- The hypothesis loop has a hard cap of 3 iterations. Do NOT exceed it; an unconverged hypothesis at the cap is acceptable — record it and let the deep investigator take over.

## Output contract
## Output contract — REQUIRED

The framework wraps your reply in an `AgentTurnOutput` envelope (content,
confidence ∈ [0, 1], confidence_rationale, optional signal). The runner
enforces this structurally — answer truthfully and the envelope captures
your confidence and rationale. Do not mention "confidence" in your prose
unless it's part of substantive analysis (e.g. ranking hypotheses).
Your final reply MUST end with these three markdown sections, in this order, with the literal `##` headers:

```
## Response
<your final answer to the user — natural-language, may include lists or code blocks>

## Confidence
<float 0.0-1.0> — <one-sentence rationale>

## Signal
<one of: default | success | failed | needs_input>
```

**CRITICAL — final-reply rule:** After your last tool call returns, your NEXT reply IS the final reply. That reply MUST contain the three sections above as plain text — DO NOT emit an empty message, DO NOT emit only tool calls, DO NOT defer to "the framework handles it". The framework parses your final reply text; if it is empty or missing the section headers, the run fails with `envelope_missing`.

Tool calls happen BEFORE the final reply. Once you have called every tool you need (including terminal tools like `mark_resolved` / `mark_escalated`), emit the three sections as your final response.
59 changes: 37 additions & 22 deletions src/runtime/agents/responsive.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@

import logging
from datetime import datetime, timezone
from typing import TYPE_CHECKING, Callable
from typing import TYPE_CHECKING, Any, Callable

from langchain_core.language_models.chat_models import BaseChatModel
from langchain_core.messages import HumanMessage
Expand All @@ -35,7 +35,6 @@
from runtime.storage.session_store import SessionStore
from runtime.tools.gateway import wrap_tool
from runtime.agents.turn_output import (
AgentTurnOutput,
EnvelopeMissingError,
parse_envelope_from_result,
reconcile_confidence,
Expand All @@ -60,6 +59,7 @@ def make_agent_node(
patch_tool_names: frozenset[str] = frozenset(),
gate_policy: "GatePolicy | None" = None,
event_log: "EventLog | None" = None,
checkpointer: Any = None,
):
"""Factory: build a LangGraph node that runs a ReAct agent and decides a route.

Expand All @@ -81,7 +81,7 @@ def make_agent_node(
# call time — both modules are fully imported before ``node()`` runs.
from runtime.graph import (
GraphState,
_ainvoke_with_retry,
_drive_agent_with_resume,
_format_agent_input,
_handle_agent_failure,
_harvest_tool_calls_and_patches,
Expand All @@ -94,9 +94,18 @@ def make_agent_node(
)

async def node(state: GraphState) -> dict:
incident: Session = state["session"] # pyright: ignore[reportTypedDictNotRequiredAccess]
inc_id = incident.id
state_session: Session = state["session"] # pyright: ignore[reportTypedDictNotRequiredAccess]
inc_id = state_session.id
started_at = datetime.now(timezone.utc).strftime(_UTC_TS_FMT)
# Always reload from store at entry — outer Pregel checkpoints
# at step boundaries, not mid-node, so ``state["session"]`` is
# stale relative to DB when a HITL gate paused mid-step. See
# the same reload comment in ``runtime.graph.make_agent_node``
# for the full rationale.
try:
incident: Session = store.load(inc_id)
except FileNotFoundError:
incident = state_session

# M3: emit agent_started telemetry before any work happens.
if event_log is not None:
Expand All @@ -122,25 +131,25 @@ async def node(state: GraphState) -> dict:
]
else:
run_tools = tools
# Phase 10 (FOC-03 / D-10-02) + Phase 15 (LLM-COMPAT-01): every
# responsive agent invocation is wrapped in an AgentTurnOutput
# envelope. ``langchain.agents.create_agent`` (the non-deprecated
# successor to ``langgraph.prebuilt.create_react_agent``) accepts a
# bare schema as ``response_format`` and, by default, wraps it in
# ``AutoStrategy`` — ProviderStrategy for models with native
# structured-output (OpenAI-class), falling back to ToolStrategy
# otherwise (Ollama). ToolStrategy injects AgentTurnOutput as a
# callable tool: when the LLM ``calls`` it, the loop terminates on
# the same turn with ``result["structured_response"]`` populated.
# Eliminates the old two-call structure (loop + separate
# ``with_structured_output`` pass) that hit recursion_limit=25 on
# Ollama models without true function-calling.
# Phase 22 (D-22-01): markdown-primary turn output. Same
# change as graph.py:make_agent_node — drop response_format
# so the loop terminates on the natural ReAct END, then parse the
# final AIMessage via Path 4 in parse_envelope_from_result.
# The inner agent gets the orchestrator's checkpointer so a
# HITL pause inside ``interrupt()`` can be resumed via
# ``Command(resume=verdict)`` on a stable per-invocation
# thread id (see ``_drive_agent_with_resume`` in
# ``runtime.graph`` for the full rationale).
agent_executor = create_agent(
model=llm,
tools=run_tools,
system_prompt=skill.system_prompt,
response_format=AgentTurnOutput,
checkpointer=checkpointer,
)
inner_thread_id = (
f"{inc_id}:agent:{skill.name}:turn{len(incident.agents_run)}"
)
inner_cfg = {"configurable": {"thread_id": inner_thread_id}}

# Phase 11 (FOC-04): reset per-turn confidence hint at the
# start of each agent step so the gateway treats the first
Expand All @@ -151,9 +160,15 @@ async def node(state: GraphState) -> dict:
pass

try:
result = await _ainvoke_with_retry(
agent_executor,
{"messages": [HumanMessage(content=_format_agent_input(incident))]},
result = await _drive_agent_with_resume(
agent_executor=agent_executor,
inner_cfg=inner_cfg,
inner_has_checkpointer=checkpointer is not None,
initial_input={
"messages": [
HumanMessage(content=_format_agent_input(incident))
]
},
)
except GraphInterrupt:
# Phase 11 (FOC-04 / D-11-04): HITL pause -- propagate up.
Expand Down
Loading
Loading