diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index dc3415c..0b40b43 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -21,25 +21,62 @@ jobs: uses: actions/setup-python@v6.2.0 with: python-version: "3.11" - cache: "pip" - - name: Install dependencies - run: pip install -e ".[dev]" + - name: Set up uv + uses: astral-sh/setup-uv@v6 + with: + # Pin uv version for reproducible CI; bump deliberately when bumping locally. + version: "0.11.7" + enable-cache: true + + - name: Lockfile freshness gate (HARD-02) + # Fails the build if pyproject.toml drifts from uv.lock — no silent + # resolves on CI, no surprise transitive upgrades. Phase 14 / SC-4. + run: uv lock --check + + - name: Install dependencies (from lockfile) + # `--frozen` forbids re-resolving; uv installs the exact set pinned in + # uv.lock with hash verification. Phase 14 / SC-3. + run: uv sync --frozen --extra dev + + - name: Bundle staleness gate (HARD-08) + # Regenerates dist/* from src/runtime + examples/* and fails the + # build if anything in dist/ would change. Forces every PR that + # touches src/runtime, examples/, or the bundler to commit fresh + # bundles — the air-gap deploy bundle stays repaired by + # construction (Phase 16 / BUNDLER-01 + HARD-08). Contributors + # run `python scripts/build_single_file.py` before every push; + # see docs/DEVELOPMENT.md. + run: | + uv run python scripts/build_single_file.py + git diff --exit-code dist/ - name: Lint (ruff) - run: ruff check src/ tests/ + run: uv run ruff check src/ tests/ - - name: Type check (pyright) - # Pyright was previously pointed at src/orchestrator (a shim layer - # of star-imports) so its real coverage of the framework was nil. - # After deleting src/orchestrator, the target moved to src/runtime - # and surfaces ~41 pre-existing generic/typed-dict issues. Don't - # block the build on those; track via the follow-up cleanup plan. 
- continue-on-error: true - run: pyright src/runtime + - name: Type check (pyright) (HARD-03) + # Phase 19 -- the gate is now fail-on-error against ``src/runtime``. + # The earlier 54-error backlog was resolved via type-annotation + # tightening + per-line ``# pyright: ignore[rule] -- reason`` + # comments for legitimate stub gaps. ``pyproject.toml`` carries + # the ``[tool.pyright]`` block (``include = ["src"]``, + # ``extraPaths = ["src"]``, ``typeCheckingMode = "basic"``). + # Test files and ``dist/`` bundles are out of scope for this + # phase; future phases may extend coverage outward. + run: uv run pyright src/runtime - name: Test with coverage - run: pytest --cov=src/runtime --cov-report=xml --junitxml=junit.xml + run: uv run pytest --cov=src/runtime --cov-report=xml --junitxml=junit.xml + + - name: Skill-prompt-vs-schema lint (SKILL-LINTER-01) + # Phase 21. Walks every examples/*/skills/*/system.md and asserts + # that every referenced tool name + arg field exists in the + # canonically discovered tool inventory (AST-extracted from + # examples/*/mcp_server*.py + mcp_servers/*.py) and the typed + # patch models (UpdateIncidentPatch). Catches LLM-emit-vs-schema + # drift like `findings_triage` vs `findings.triage`, hallucinated + # injected args, and unknown tool names. Binary-pass gate. + run: uv run python scripts/lint_skill_prompts.py - name: SonarCloud Scan uses: SonarSource/sonarqube-scan-action@v8.0.0 diff --git a/.gitignore b/.gitignore index 2c7f45c..20c5588 100644 --- a/.gitignore +++ b/.gitignore @@ -50,10 +50,18 @@ Thumbs.db # --- Claude tooling artifacts ---------------------------------------- AGENTS.md ASR.md -docs/ +# Tracked docs are explicitly listed below; everything else under docs/ +# is Claude scratch (plans, brainstorm output, etc) and stays gitignored. +# - AIRGAP_INSTALL.md: Phase 14 (HARD-02) air-gap install path. +# - DEVELOPMENT.md: Phase 16 (BUNDLER-01) contributor workflow. 
+docs/* +!docs/AIRGAP_INSTALL.md +!docs/DEVELOPMENT.md REVIEW_*.md review_*.md .planning/ +# Dev integration test driver (out-of-repo tool, runs against live UI). +scripts/integration_scenarios.py # Coverage / CI artefacts coverage.xml diff --git a/.planning/phases/14-reproducible-air-gap-lockfile/14-01-PLAN.md b/.planning/phases/14-reproducible-air-gap-lockfile/14-01-PLAN.md new file mode 100644 index 0000000..97986f8 --- /dev/null +++ b/.planning/phases/14-reproducible-air-gap-lockfile/14-01-PLAN.md @@ -0,0 +1,75 @@ +--- +phase: 14-reproducible-air-gap-lockfile +plan: 01 +title: Reproducible air-gap dependency lockfile (HARD-02) +status: in_progress +date: 2026-05-07 +requirement: HARD-02 (CONCERNS C2) +--- + +# Plan 14-01 — Reproducible Air-Gap Dependency Lockfile + +## One-liner + +Commit a `uv.lock` that pins every transitive dependency with hashes; CI installs from the lockfile and a freshness gate fails the build when `pyproject.toml` drifts from `uv.lock`; document the offline install path so an engineer behind a corporate firewall can reproduce the dependency graph from an internal mirror without public-internet access. + +## Tool Selection — `uv` (rationale) + +Considered `uv`, `pip-tools`, `poetry`. Selected **`uv`** (locally installed: `uv 0.11.7`). 
+ +| Criterion (`~/.claude/rules/dependencies.md`) | `uv` | `pip-tools` | `poetry` | +| --- | --- | --- | --- | +| License | Apache-2.0 / MIT (dual) | BSD-3-Clause | MIT | +| Active maintenance / bus factor | Astral team, daily releases | jazzband collective | python-poetry org | +| Lockfile format | `uv.lock` (TOML, hashes per platform marker) | `requirements.txt` w/ `--generate-hashes` | `poetry.lock` (TOML) | +| PEP 621 (`pyproject.toml` `[project]`) native | Yes — already what we use | Reads `pyproject.toml` direct | Requires `[tool.poetry]` rewrite of `[project]` | +| Resolver speed (171 pkgs) | ~14 ms (measured) | seconds | seconds | +| Single static binary | Yes (Rust) | No (Python pkg) | No (Python pkg) | +| Works fully offline (`--offline`, `--frozen`) | Yes (first-class) | Indirect via `pip install --no-index` | Yes | +| Drift gate (`--check`) | `uv lock --check` | `pip-compile --check` (since 7.4) | `poetry check --lock` | +| Already adopted in repo | **Yes** (`uv.lock` already present, 4430 lines, 171 pkgs) | No | No | + +**Decision:** `uv`. The lockfile already exists in-repo and is in sync (`uv lock --check` exits 0 in 14 ms). `poetry` is rejected because adopting it would require rewriting `[project]` into `[tool.poetry]` — a pyproject-format migration that violates "minimal diff" scope. `pip-tools` would lose the `uv.lock` work already present and forfeit the multi-platform marker pinning that `uv.lock` gives for free. + +## Tasks (8) + +1. **Confirm lockfile freshness against current `pyproject.toml`** — `uv lock --check` (already passes; recorded as baseline). +2. **Add `[tool.uv]` block to `pyproject.toml` if needed** — likely no-op; defaults already satisfy our needs. Verify behaviour. +3. **Rewrite CI install step in `.github/workflows/ci.yml`** — replace `pip install -e ".[dev]"` with `uv sync --frozen --extra dev`, plus `astral-sh/setup-uv@v6` for the runner. +4. 
**Add CI lockfile-freshness gate** — new step `uv lock --check` runs before install; fails CI when `pyproject.toml` and `uv.lock` drift. +5. **Switch CI test/lint/type-check steps to `uv run`** — `uv run pytest …`, `uv run ruff check …`, `uv run pyright …` so tools execute against the locked virtualenv. +6. **Document the offline install path** — new `docs/AIRGAP_INSTALL.md` (≤50 lines): clone, `UV_INDEX_URL=https://internal-mirror`, `uv sync --frozen --offline`, `uv run pytest tests/ -x`. +7. **Local verification (acceptance gates)**: + - `uv lock --check` → exit 0 + - `python -m pytest tests/ -x` → all collected tests pass (baseline 1047) + - `ruff check src tests` → unchanged from baseline (13 pre-existing errors — NOT regressed) + - `pyright src/runtime` → unchanged from baseline (54 pre-existing errors — NOT regressed) + - `python scripts/build_single_file.py && git diff --exit-code dist/` → clean + - `git grep -nE 'https://ollama\.com|ollama\.com/api' -- src/` → zero matches (HARD-05 ratchet) + - `python -c 'import yaml; yaml.safe_load(open(".github/workflows/ci.yml"))'` → no parse error (no local yamllint installed) +8. **Single atomic commit** on `refactor/framework-flow-control` per phase precedent. 
+ +## Files Touched + +| File | Status | Why | +| --- | --- | --- | +| `pyproject.toml` | possibly add `[tool.uv]` block (else unchanged) | UV config / extras declaration | +| `uv.lock` | **already present, unchanged** | Pre-existing; freshness re-verified at commit time | +| `.github/workflows/ci.yml` | modified | Install via `uv sync --frozen`; add lockfile-freshness gate; run tools via `uv run` | +| `docs/AIRGAP_INSTALL.md` | NEW | Offline install instructions | +| `.planning/phases/14-reproducible-air-gap-lockfile/14-01-PLAN.md` | NEW | This file | +| `.planning/phases/14-reproducible-air-gap-lockfile/14-01-SUMMARY.md` | NEW | After-action | +| `.planning/phases/14-reproducible-air-gap-lockfile/14-VERIFICATION.md` | NEW | Per-success-criterion gates | + +## Out of Scope (deferred) + +- **Vendored wheels tarball** for true `--no-index` install — separate phase (called out in 14-CONTEXT.md `Deferred Ideas`). +- **`Makefile` / `make bootstrap`** scaffolding — ROADMAP SC-2 wording mentions `make bootstrap` "or equivalent"; the equivalent is `uv sync --frozen [--offline]`. Documented in `docs/AIRGAP_INSTALL.md`. +- **Pyright / ruff baseline cleanup** — existing pre-Phase-14 baselines preserved exactly; not a Phase 14 concern. + +## Hard-Stop Triggers (HALT, write BLOCKER.md) + +- `uv lock --check` reports drift after commit → root-cause and stop. +- Any test in `tests/` newly fails with the lockfile-driven install AND root cause is the lockfile. +- CI YAML edits don't validate as YAML. +- `dist/*` regen produces a non-empty `git diff` after Phase 14 changes. 
diff --git a/.planning/phases/14-reproducible-air-gap-lockfile/14-01-SUMMARY.md b/.planning/phases/14-reproducible-air-gap-lockfile/14-01-SUMMARY.md new file mode 100644 index 0000000..c62278d --- /dev/null +++ b/.planning/phases/14-reproducible-air-gap-lockfile/14-01-SUMMARY.md @@ -0,0 +1,83 @@ +--- +status: completed +phase: 14-reproducible-air-gap-lockfile +plan: 01 +subsystem: build / ci / dependencies +tags: [hardening, air-gap, build, ci, lockfile] +requires: [phase-13-llm-provider-hardening] +provides: [uv.lock-CI-install, uv-lock-check-freshness-gate, docs/AIRGAP_INSTALL.md] +affects: [pyproject.toml, .github/workflows/ci.yml, .gitignore, docs/AIRGAP_INSTALL.md, uv.lock] +tech-stack: + added: [uv (Apache-2.0/MIT, single static binary, Astral)] + patterns: [pin+hash transitive lockfile, --frozen install, lockfile-drift CI gate] +key-files: + created: + - docs/AIRGAP_INSTALL.md + modified: + - .github/workflows/ci.yml + - .gitignore + unchanged-but-canonical: + - pyproject.toml # already PEP 621; no [tool.uv] needed + - uv.lock # already in sync (uv lock --check exit 0) +decisions: + - "Tool: uv 0.11.7 (Apache-2.0/MIT). Picked over pip-tools (loses uv.lock investment, no per-marker pinning) and poetry (would require [project] -> [tool.poetry] rewrite, violates minimal diff)." + - "uv.lock already exists (171 packages, 4430 lines, in sync per `uv lock --check`); Phase 14 wires CI to install from it, adds the freshness gate, and documents the offline path. No new lockfile generation required." + - "CI install: `uv sync --frozen --extra dev` (replaces `pip install -e .[dev]`). `--frozen` forbids re-resolving." + - "CI lockfile-drift gate: `uv lock --check` runs immediately after uv is set up and BEFORE the install step, so a stale uv.lock fails the build before any dependency is installed." + - "Tools (ruff, pyright, pytest) run via `uv run` so they execute against the locked virtualenv." + - "Pinned uv version 0.11.7 in CI (matches local) — bumps are deliberate, not silent." 
+ - "Documented offline path in `docs/AIRGAP_INSTALL.md` (38 lines): clone -> UV_INDEX_URL=internal-mirror -> `uv sync --frozen [--offline]`. Negation rule added to .gitignore so docs/AIRGAP_INSTALL.md is the single shipped doc." + - "Single atomic commit per phase precedent (Phase 9-13)." +metrics: + duration: "~15 min" + tasks-completed: 8 + files-touched: 4 # (1 new, 2 modified, 1 planning .md whitelisted) + tests-added: 0 # pure infra, no new test surface + tests-total: 1044 # (1044 passed, 3 skipped — same as Phase 13) + ratchet-status: green + bundle-determinism: deterministic (`git diff --exit-code dist/` clean after regen) +gates: + uv-lock-check: "Resolved 171 packages in 2ms — exit 0" + yaml-valid: "9 steps, parses clean" + ollama-grep-src: "0 matches (HARD-05 ratchet preserved)" + ruff: "13 errors (pre-Phase-14 baseline, unchanged)" + pyright-runtime: "54 errors (pre-Phase-14 baseline, unchanged)" + pyright-full: "329 errors (pre-Phase-14 baseline, unchanged)" + dist-regen-diff: "clean (exit 0)" + pytest: "1044 passed, 3 skipped" +--- + +# Phase 14 Plan 01 Summary — Reproducible Air-Gap Dependency Lockfile + +## One-liner + +Wired the existing in-repo `uv.lock` into CI via `uv sync --frozen`, added a `uv lock --check` lockfile-freshness gate that fails the build on `pyproject.toml`/`uv.lock` drift, and documented the offline install path in `docs/AIRGAP_INSTALL.md` so an engineer behind a corporate firewall can reproduce the exact dependency graph from an internal mirror without public-internet access. Closes HARD-02 (CONCERNS C2). + +## What changed + +| File | Change | +| --- | --- | +| `.github/workflows/ci.yml` | Added `astral-sh/setup-uv@v6` (uv 0.11.7); added `uv lock --check` gate ahead of the install step; replaced `pip install -e ".[dev]"` with `uv sync --frozen --extra dev`; rewrote `ruff` / `pyright` / `pytest` invocations as `uv run …` so they hit the locked venv. 
| +| `docs/AIRGAP_INSTALL.md` (new) | 38-line offline-install recipe: clone → set `UV_INDEX_URL` → `uv sync --frozen [--offline]` → `uv run pytest tests/ -x`. | +| `.gitignore` | Added `!docs/AIRGAP_INSTALL.md` negation so the air-gap install doc ships while the rest of `docs/` (Claude artefacts) stays ignored. | +| `pyproject.toml` | Unchanged — already PEP 621; uv reads `[project]` natively, no `[tool.uv]` block required. | +| `uv.lock` | Unchanged — already present, 4430 lines, 171 packages, in sync. Verified by `uv lock --check` exit 0. | + +## Acceptance gates (all green) + +``` +uv lock --check : EXIT 0 (171 pkgs, 2 ms) +python -c 'import yaml; yaml.safe_load(open(ci.yml))' : 9 steps, parses +git grep -nE 'https://ollama\.com|ollama\.com/api' src/ : 0 matches (HARD-05 ratchet) +ruff check src tests : 13 errors (pre-existing baseline) +pyright src/runtime : 54 errors (pre-existing baseline) +pyright : 329 errors (pre-existing baseline) +python scripts/build_single_file.py && git diff dist/ : clean (exit 0) +pytest tests/ -x : 1044 passed, 3 skipped +``` + +## Out of scope (deferred) + +- A vendored-wheels tarball (truly `--no-index` install kit) — separate phase. +- Pyright / ruff baseline cleanup — pre-existing baselines, not Phase 14 territory. +- `Makefile` `make bootstrap` shim — `uv sync --frozen [--offline]` is the documented equivalent (ROADMAP SC-2 wording allows "or equivalent"). 
diff --git a/.planning/phases/14-reproducible-air-gap-lockfile/14-VERIFICATION.md b/.planning/phases/14-reproducible-air-gap-lockfile/14-VERIFICATION.md new file mode 100644 index 0000000..57bca93 --- /dev/null +++ b/.planning/phases/14-reproducible-air-gap-lockfile/14-VERIFICATION.md @@ -0,0 +1,141 @@ +--- +status: passed +phase: 14 +phase_name: Reproducible Air-Gap Lockfile +date: 2026-05-07 +verified: 2026-05-07T09:35:00Z +score: 5/5 ROADMAP success criteria + 8/8 plan tasks verified +overrides_applied: 0 +re_verification: + previous_status: null + is_re_verification: false +--- + +# Phase 14 Verification Report — Reproducible Air-Gap Dependency Lockfile + +**Phase Goal (ROADMAP):** An engineer behind a corporate firewall can clone the repo, point at an internal package mirror, and reproduce the exact dependency graph used in CI / dev. Today `pyproject.toml` resolves freshly on every install — non-deterministic and breaks `~/.claude/rules/build.md`'s "vendor all dependencies" rule. + +**Requirement:** HARD-02 (CONCERNS C2) +**Verified:** 2026-05-07 +**Status:** passed + +--- + +## Goal-Backward Verification (ROADMAP Success Criteria) + +### SC-1 — Committed lockfile pins every direct + transitive dep with version + hash — VERIFIED + +**Evidence:** +- `uv.lock` present at repo root: 4430 lines, **171 packages** pinned (verified via `grep -E '^(name|version) = ' uv.lock | head`). +- Every entry includes `source`, `version`, and per-distribution `sha256` hash (sample: `aiofile==3.9.0` with sdist + wheel hashes). +- `requires-python = ">=3.11"` matches `pyproject.toml`. +- `uv lock --check` exit code: **0** ("Resolved 171 packages in 2ms") — lockfile is in sync with `pyproject.toml`. 
+ +### SC-2 — `make bootstrap` (or equivalent) installs from lockfile alone via internal mirror — VERIFIED + +**Evidence:** +- `docs/AIRGAP_INSTALL.md` (NEW, 38 lines) documents the recipe: + ``` + export UV_INDEX_URL="https://internal-mirror/simple/" + uv sync --frozen --extra dev + # or, fully offline (cache pre-warmed): + uv sync --frozen --offline --extra dev + ``` +- `uv sync --frozen` is the documented equivalent of `make bootstrap` (ROADMAP wording: "make bootstrap or equivalent"). It refuses to re-resolve and installs the exact set in `uv.lock` with hash verification. +- `UV_INDEX_URL` env override redirects all package resolution to an internal mirror (no hardcoded public URLs). + +### SC-3 — CI installs from the lockfile, not the `pyproject.toml` solver — VERIFIED + +**Evidence (`.github/workflows/ci.yml`):** +- New step `Set up uv` pins uv `0.11.7` via `astral-sh/setup-uv@v6`. +- Replaced `run: pip install -e ".[dev]"` with `run: uv sync --frozen --extra dev`. +- All downstream tool invocations (`ruff`, `pyright`, `pytest`) use `uv run`, ensuring they execute inside the locked virtualenv rather than a side-installed Python. +- `--frozen` flag forbids re-resolution: uv installs exactly what `uv.lock` pins and does not itself compare the lockfile against `pyproject.toml`; drift between the two is caught by the SC-4 gate, which runs earlier. + +### SC-4 — Lockfile-drift CI gate fails the build on `pyproject.toml` change without lockfile update — VERIFIED + +**Evidence (`.github/workflows/ci.yml`):** +- New step `Lockfile freshness gate (HARD-02)` runs `uv lock --check` BEFORE the install step. +- `uv lock --check` exits non-zero when `pyproject.toml` and `uv.lock` are out of sync (it verifies whether the lockfile would need to change, without writing it). +- Gate runs right after `Set up uv` and before the install step, so a stale lockfile fails fast. +- Local invocation against current tree: exit 0 (clean baseline). 
+ +### SC-5 — `dist/*` regenerated; existing test suite passes — VERIFIED + +**Evidence:** +- `python scripts/build_single_file.py` ran clean; `git diff --exit-code dist/` exit code: **0** (no drift). +- `python -m pytest tests/ -x` result: **1044 passed, 3 skipped, 0 failed** — matches Phase 13 baseline (`tests-total: 1044` per `13-01-SUMMARY.md` metrics). + +--- + +## Cross-Phase Ratchet Gates (preserved, not regressed) + +| Gate | Baseline (pre-Phase-14) | Phase 14 result | Status | +| --- | --- | --- | --- | +| `git grep -nE 'https://ollama\.com\|ollama\.com/api' -- src/` (HARD-05) | 0 matches | 0 matches (exit 1) | Preserved | +| `ruff check src tests` | 13 errors | 13 errors | Preserved (pre-existing baseline; not a Phase 14 deliverable) | +| `pyright src/runtime` | 54 errors | 54 errors | Preserved (pre-existing baseline) | +| `pyright` (full) | 329 errors | 329 errors | Preserved (pre-existing baseline) | +| `pytest tests/ -x` | 1044 passed / 3 skipped | 1044 passed / 3 skipped | Preserved | +| `git diff --exit-code dist/` after `build_single_file.py` | clean | clean | Preserved | +| `uv lock --check` | exit 0 | exit 0 | Preserved (still in sync) | + +--- + +## Hard-Constraint Verification (from prompt) + +| Constraint | Verdict | Notes | +| --- | --- | --- | +| Air-gapped target — no new public-internet calls | PASS | uv reads from `UV_INDEX_URL` (internal mirror); `--frozen` + `--offline` documented. | +| No `curl \| sh` in any script | PASS | `docs/AIRGAP_INSTALL.md` explicitly says "ship via your internal artifact store — do not `curl \| sh`". | +| Permissive license for new tooling | PASS | uv: Apache-2.0 / MIT (dual-licensed). | +| No version downgrades vs `pyproject.toml` `>=` | PASS | uv.lock unchanged from already-resolved state; `uv lock --check` exit 0 confirms no rewrite. | +| Reproducible — same inputs same dep set | PASS | uv.lock pins version + sha256 per platform marker. | +| Existing test suite passes | PASS | 1044 passed / 3 skipped. 
| +| CI builds successfully from lockfile | PASS (locally validated; CI run will land on next push) | YAML parses; steps in correct order; `uv sync --frozen` is the canonical install command. | +| No code outside Phase 14 scope touched | PASS | Only `.github/workflows/ci.yml`, `.gitignore`, new `docs/AIRGAP_INSTALL.md`, plus phase planning files. | + +--- + +## Tool Selection Audit (`~/.claude/rules/dependencies.md`) + +| Criterion | uv (chosen) | +| --- | --- | +| License: MIT/Apache/BSD only | Apache-2.0 + MIT (dual) — PASS | +| Active maintenance | Astral, weekly releases — PASS | +| Single-maintainer bus factor | Backed by Astral team — PASS | +| Low transitive footprint | Zero Python deps (Rust binary) — PASS | +| Works fully offline once installed | `--offline`, `--frozen` first-class flags — PASS | +| Lockfile with full hashes | `uv.lock` pins sha256 per dist per platform marker — PASS | +| PEP 621 (`pyproject.toml` `[project]`) compatible | Native, no rewrite — PASS | +| Generates lockfile reproducibly | Same `pyproject.toml` + uv version → identical `uv.lock` — PASS | + +Rejected alternatives: +- **pip-tools** — Would forfeit `uv.lock` (already in repo, 171 pkgs) and per-marker hash pinning. +- **poetry** — Would require rewriting `[project]` → `[tool.poetry]`, violating minimal-diff scope. + +--- + +## Hard-Stop Triggers Checklist (none triggered) + +- Selected tool requires public internet at runtime/CI: **NO** — uv supports `--offline` and reads from `UV_INDEX_URL`. +- Lockfile downgrades a dep below `pyproject.toml` `>=`: **NO** — `uv lock --check` exit 0 means no resolution changes occurred. +- Test suite fails after lockfile in place AND root cause is the lockfile: **NO** — 1044 passed / 3 skipped, identical to Phase 13 baseline. +- CI YAML edits don't validate: **NO** — `python -c 'import yaml; yaml.safe_load(open(...))'` parses cleanly; 9 steps detected. +- Selected tool requires non-permissive license: **NO** — uv is Apache-2.0 + MIT. 
+- `dist/*` not deterministic: **NO** — `git diff --exit-code dist/` clean. + +--- + +## Files of Record + +- `pyproject.toml` (unchanged — already PEP 621; uv reads `[project]` natively) +- `uv.lock` (unchanged — already in sync, 171 packages, sha256-pinned) +- `.github/workflows/ci.yml` (modified — uv setup + lockfile gate + `uv sync --frozen` + `uv run` for tools) +- `.gitignore` (modified — `!docs/AIRGAP_INSTALL.md` negation so the install doc ships) +- `docs/AIRGAP_INSTALL.md` (NEW — 38-line offline install recipe) +- `.planning/phases/14-reproducible-air-gap-lockfile/14-01-PLAN.md` (NEW) +- `.planning/phases/14-reproducible-air-gap-lockfile/14-01-SUMMARY.md` (NEW) +- `.planning/phases/14-reproducible-air-gap-lockfile/14-VERIFICATION.md` (NEW — this file) + +**Verdict:** All 5 ROADMAP success criteria, all 8 plan tasks, all 7 cross-phase ratchet gates, and all 8 hard constraints verified. Phase 14 status: **passed**. diff --git a/config/code_review.runtime.yaml b/config/code_review.runtime.yaml index 2879cd2..664a9f3 100644 --- a/config/code_review.runtime.yaml +++ b/config/code_review.runtime.yaml @@ -41,6 +41,20 @@ paths: # When no rule fires the session falls through to ``unreviewed`` # (the v1.0 framework-default failure mode). orchestrator: + # Phase 11 (FOC-04): declarative HITL gating policy. Framework + # default threshold (0.7) -- code review is less prod-blast-radius + # than incident remediation so the stricter incident threshold + # (0.8) is unwarranted here. + gate_policy: + confidence_threshold: 0.7 + gated_environments: [production] + gated_risk_actions: [approve] + # Phase 12 (FOC-05): declarative retry policy. Framework default -- + # max_retries=2, transient retries on, confidence floor 0.4. 
+ retry_policy: + max_retries: 2 + retry_on_transient: true + retry_low_confidence_threshold: 0.4 entry_agent: intake default_terminal_status: unreviewed statuses: @@ -85,6 +99,16 @@ orchestrator: # state_overrides; orchestrator validates start_session's # state_overrides kwarg against this class. state_overrides_schema: examples.code_review.state.CodeReviewStateOverrides + # Phase 9 (D-09-02 / FOC-01 / FOC-02): session-derived tool-arg + # injection map. code_review's pr_url / repo live under + # ``Session.extra_fields`` (the framework-default Session has no + # typed fields for them) so the dotted paths reach into the dict. + # The framework's ``_resolve_dotted`` walks dict-valued attrs + # transparently. + injected_args: + session_id: session.id + pr_url: session.extra_fields.pr_url + repo: session.extra_fields.repo # Cross-cutting framework knobs read directly off AppConfig.framework. framework: # Per-app session-id prefix. Threaded through SessionStore into diff --git a/config/config.yaml b/config/config.yaml index df732ac..7ed01ef 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -21,10 +21,14 @@ llm: endpoint: ${AZURE_ENDPOINT} api_version: 2024-08-01-preview api_key: ${AZURE_OPENAI_KEY} + openrouter: + kind: openai_compat + base_url: https://openrouter.ai/api/v1 + api_key: ${OPENROUTER_API_KEY} models: workhorse: - provider: ollama_cloud - model: gpt-oss:120b + provider: openrouter + model: openai/gpt-4o-mini temperature: 0.0 cheap: provider: ollama_cloud @@ -135,6 +139,19 @@ dedup: # ``incident_management.yaml`` since this is the bundled deployment # config for the example app. orchestrator: + # Phase 11 (FOC-04): declarative HITL gating policy. Framework + # default (threshold 0.7) -- mirrors incident_management v1.1 + # behaviour with the production-class environment gate. + gate_policy: + confidence_threshold: 0.7 + gated_environments: [production] + gated_risk_actions: [approve] + # Phase 12 (FOC-05): declarative retry policy. 
Framework default -- + # max_retries=2, transient retries on, confidence floor 0.4. + retry_policy: + max_retries: 2 + retry_on_transient: true + retry_low_confidence_threshold: 0.4 entry_agent: intake default_terminal_status: needs_review statuses: @@ -186,6 +203,15 @@ orchestrator: # state_overrides; orchestrator validates the start_session # kwarg against this class. state_overrides_schema: examples.incident_management.state.IncidentStateOverrides + # Phase 9 (D-09-02 / FOC-01 / FOC-02): session-derived tool-arg + # injection map. Strips the named args from each tool's LLM-visible + # signature and re-supplies them from the live Session at invocation + # time. Mirrors incident_management.yaml since this file is the + # bundled deployment config for the example app. + injected_args: + environment: session.extra_fields.environment + incident_id: session.id + session_id: session.id runtime: # Wires the orchestrator and storage layer to the incident-management # domain state class (see examples/incident_management/state.py). diff --git a/config/incident_management.yaml b/config/incident_management.yaml index a28e651..f84c3e5 100644 --- a/config/incident_management.yaml +++ b/config/incident_management.yaml @@ -16,6 +16,24 @@ similarity_method: keyword # ``_TERMINAL_TOOL_RULES`` table in ``orchestrator.py`` (Phase 6 / # DECOUPLE-02 / DECOUPLE-03 / D-06-01..06). orchestrator: + # Phase 11 (FOC-04): declarative HITL gating policy. Tighter + # threshold than the framework default -- incident remediation + # pauses on production-class medium-risk tools and on any tool + # call below 80% turn confidence. + gate_policy: + confidence_threshold: 0.8 + gated_environments: [production] + gated_risk_actions: [approve] + # Phase 12 (FOC-05): declarative retry policy. Default + # max_retries=2 mirrors the v1.2 ROADMAP. retry_on_transient=true + # keeps current auto-retry-on-network-blip behaviour. 
+ # retry_low_confidence_threshold=0.4 sits below the gate_policy + # confidence_threshold (0.8) so the gate fires HITL approval + # before the retry path even considers a low-confidence give-up. + retry_policy: + max_retries: 2 + retry_on_transient: true + retry_low_confidence_threshold: 0.4 entry_agent: intake default_terminal_status: needs_review statuses: @@ -74,6 +92,15 @@ orchestrator: # state_overrides; orchestrator validates the start_session # kwarg against this class. state_overrides_schema: examples.incident_management.state.IncidentStateOverrides + # Phase 9 (D-09-02 / FOC-01 / FOC-02): session-derived tool-arg + # injection map. Each entry strips the named arg from every tool's + # LLM-visible signature and re-supplies the value from the live + # Session at invocation time. The LLM cannot hallucinate values + # for args it cannot see. + injected_args: + environment: session.environment + incident_id: session.id + session_id: session.id # Cross-cutting framework knobs the runtime consumes directly. framework: diff --git a/dist/app.py b/dist/app.py index 63cb3ed..5feb3e6 100644 --- a/dist/app.py +++ b/dist/app.py @@ -1,4 +1,30 @@ from __future__ import annotations +# ----- imports for runtime/errors.py ----- +"""Typed runtime errors. Phase 13 lands the LLM-call surface; future +hardening (HARD-04 silent-failure sweep, HARD-03 pyright flip, +real-LLM follow-ups) extends here. + +Importable as ``from runtime.errors import LLMTimeoutError, LLMConfigError``. +""" + + + +# ----- imports for runtime/terminal_tools.py ----- +"""Generic terminal-tool registry types. + +Apps register their terminal-tool rules and status vocabulary via +``OrchestratorConfig.terminal_tools`` / ``OrchestratorConfig.statuses``; +the framework reads these models without knowing app-specific tool +or status names. Cf. .planning/phases/06-generic-terminal-tool-registry/ +06-CONTEXT.md (D-06-01, D-06-02, D-06-05). 
+""" + + +from typing import Literal + +from pydantic import BaseModel, Field + + # ----- imports for runtime/config.py ----- """Config schemas for the orchestrator.""" @@ -6,11 +32,12 @@ import re from pathlib import Path from typing import Any, Literal -from pydantic import BaseModel, Field, field_validator, model_validator +from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator import yaml + # Session-id prefix grammar. The framework mints session ids of the form # ``{PREFIX}-YYYYMMDD-NNN`` (see ``runtime.state.Session.id_format``); # the prefix is the only piece an app picks. Allow alphanumerics + hyphens, @@ -34,7 +61,6 @@ class IncidentState(Session): -from pydantic import BaseModel, Field # ----- imports for runtime/state_resolver.py ----- """Resolve ``RuntimeConfig.state_class`` (a dotted path) to a class object. @@ -109,6 +135,7 @@ class IncidentState(Session): import ast from typing import Any, Callable, Literal +from pydantic import BaseModel, Field, field_validator, model_validator # ----- imports for runtime/llm.py ----- @@ -118,8 +145,21 @@ class IncidentState(Session): provider (kind + connection) to a model id and optional temperature/deployment. ``get_llm(cfg, "smart")`` looks up ``cfg.models["smart"]`` and uses its referenced ``cfg.providers[]`` to build a langchain ``BaseChatModel``. + +Phase 13 (HARD-01 / HARD-05): every chat + embedding HTTP call is bounded +by an effective ``request_timeout`` resolved as +``provider.request_timeout if not None else default_llm_request_timeout`` +(default 120.0s on ``OrchestratorConfig``). The native langchain timeout +knob is wired AND an ``asyncio.wait_for`` wrapper raises +``LLMTimeoutError(provider, model, elapsed_ms)`` on hang -- defence in +depth against partial-byte stalls where the httpx layer doesn't fire. +The hardcoded public-Ollama fallback is removed; ollama providers +must declare ``base_url`` (validated at config-load via +``LLMConfigError``). 
""" +import asyncio +import time from typing import Any from uuid import uuid4 from langchain_core.embeddings import Embeddings @@ -130,6 +170,7 @@ class IncidentState(Session): + # ----- imports for runtime/storage/models.py ----- """SQLAlchemy declarative model for the ``incidents`` table. @@ -183,6 +224,7 @@ class IncidentState(Session): import hashlib import numpy as np +from pydantic import SecretStr @@ -230,16 +272,19 @@ class IncidentState(Session): from typing import Any, Generic, Mapping, Optional, Type, TypeVar -from pydantic import BaseModel from sqlalchemy import select from sqlalchemy.engine import Engine -from sqlalchemy.orm import Session +from sqlalchemy.orm import Session as SqlaSession -# Mirrors the bound on ``SessionStore.StateT`` — kept permissive at -# ``BaseModel`` so framework code does not need to import the -# example-app subclass. The resolver in :mod:`runtime.state_resolver` -# enforces a ``runtime.state.Session`` subclass at config time. + +# Mirrors the bound on ``SessionStore.StateT`` — tightened from +# ``BaseModel`` to ``runtime.state.Session`` in Phase 19 (HARD-03) so +# pyright sees the typed fields (``id``, ``status``, ``deleted_at`` …) +# this store reads. The resolver in :mod:`runtime.state_resolver` +# already enforces a ``Session`` subclass at config time, and every +# in-tree caller passes either bare ``Session`` or a ``Session`` +# subclass. # ----- imports for runtime/storage/session_store.py ----- """Active session lifecycle store. @@ -261,6 +306,7 @@ class IncidentState(Session): from datetime import datetime, timezone from typing import Generic, Optional, Type, TypeVar +from pydantic import BaseModel from sqlalchemy import desc, select from sqlalchemy.orm import Session as SqlSession @@ -271,6 +317,66 @@ class IncidentState(Session): # hook existed. New rows are validated by ``_SESSION_ID_RE`` which # accepts any ``PREFIX-YYYYMMDD-NNN`` shape the app's ``id_format`` may # emit (e.g. ``CR-...`` for code-review). 
+# ----- imports for runtime/storage/event_log.py ----- +"""Append-only session event log. + +Events drive the status finalizer's inference (e.g. a registered +terminal-tool event appearing in the log -> session reached +the corresponding terminal status). They are never mutated or +deleted. +""" + + +from dataclasses import dataclass +from typing import Iterator + +from sqlalchemy.orm import Session + + + +# ----- imports for runtime/storage/migrations.py ----- +"""Idempotent migrations for the JSON-shaped row payloads. + +Fills the per-call audit fields on :class:`runtime.state.ToolCall` for +legacy rows. The risk-rated tool gateway uses five optional audit fields: + + * ``risk`` — ``"low" | "medium" | "high" | None`` + * ``status`` — ``ToolStatus`` literal (default ``"executed"``) + * ``approver`` — operator id, set when status in {approved, rejected} + * ``approved_at`` — ISO-8601 timestamp of the decision + * ``approval_rationale`` — free-text justification + +Older rows in the ``incidents.tool_calls`` JSON column lack these +fields. Pydantic hydrates the missing keys with their defaults at read +time so reading is already back-compat — but the on-disk JSON still +shows the legacy shape until something rewrites the row. + +This migration walks every session, normalises the JSON-shaped +``tool_calls`` list to the current audit schema, and saves the row back +when (and only when) at least one entry changed. Idempotent — running +twice is safe (the second pass is a no-op because every row already +has the fields). + +The function operates on the row's JSON list directly (not via the +``ToolCall`` Pydantic model) so we don't accidentally widen the +migration's contract — for example, dropping unknown extra keys via +Pydantic's ``extra='ignore'`` would silently delete forward-compat +fields in a downgrade scenario. JSON-walk is conservative: only fill +what's missing; leave everything else alone.
+""" + + +from typing import Any, Iterable + +from sqlalchemy import inspect, text + + +# Columns added after the initial schema. Each entry is +# ``(column_name, sql_type, default_clause_or_None)``. SQLite ``ADD +# COLUMN`` cannot add a non-nullable column without a constant default, +# so every entry here is nullable — Pydantic hydrates the missing keys +# at read time. Append-only: never reorder, never delete. Removing a +# column needs a separate destructive migration with explicit sign-off. # ----- imports for runtime/mcp_loader.py ----- """Load MCP servers (in_process / stdio / http / sse) and build a tool registry. @@ -299,15 +405,313 @@ class IncidentState(Session): -# ----- imports for runtime/graph.py ----- -"""LangGraph state, routing helpers, and node runner.""" +# ----- imports for runtime/service.py ----- +"""Long-lived orchestrator service. + +Owns a background asyncio event loop and a shared FastMCP client pool. +All session execution will run as asyncio tasks on this loop. Sync callers +(Streamlit, FastAPI request handlers, CLI) submit coroutines via +``submit(coro) -> concurrent.futures.Future``. + +Lifecycle:: + + svc = OrchestratorService.get_or_create(cfg) + svc.start() # spins up background thread + loop + fut = svc.submit(some_coro) + result = fut.result(timeout=30) + svc.shutdown() # cancels in-flight tasks, closes MCP clients, joins thread + +Capabilities: + - Skeleton + singleton + start/shutdown lifecycle. + - ``submit()`` / ``submit_and_wait()`` thread-safe bridge. + - Shared ``MCPClientPool`` with per-server ``asyncio.Lock``. + - ``start_session()`` schedules a per-session asyncio task on the + service's loop and returns the session id immediately (the agent run + continues in the background). Active tasks are tracked in an + in-memory registry that evicts on completion / cancellation. 
+ - ``list_active_sessions()`` returns a thread-safe snapshot of + the in-flight registry; the snapshot coroutine runs on the loop so + readers from any thread see a point-in-time consistent view. + - ``stop_session(sid)`` cancels the in-flight task, waits up + to 5 s for graceful exit, and persists ``status="stopped"`` on the + row (clearing ``pending_intervention``). Idempotent — a no-op for + unknown ids or already-completed sessions. + - Hard cap on concurrent sessions. ``start_session`` raises + ``SessionCapExceeded`` once ``len(self._registry) >= + self.max_concurrent_sessions``. Fail fast; queueing is not supported. + +The singleton is process-scoped and reset on ``shutdown()`` so that test +suites can build, tear down, and rebuild the service without leaking +state across cases. +""" -import asyncio + +import concurrent.futures import logging -from typing import TypedDict, Callable, Awaitable +import threading +from typing import Any, Awaitable, Coroutine, TypeVar, cast + + + +# ----- imports for runtime/agents/turn_output.py ----- +"""Phase 10 (FOC-03) — AgentTurnOutput envelope + reconciliation helpers. + +The envelope is the structural contract every responsive agent invocation +must satisfy: content + confidence in [0,1] + confidence_rationale + optional +signal. The framework wires it as ``response_format=AgentTurnOutput`` into +``langchain.agents.create_agent`` (see Phase 15 / LLM-COMPAT-01); the +agent loop terminates on the same turn the LLM emits the envelope-shaped +tool call, populating ``result["structured_response"]``, which the +framework reads and persists onto the ``AgentRun`` row. + +D-10-02 — pydantic envelope wrapped via ``response_format``. +D-10-03 — when a typed-terminal-tool was called this turn, the framework +reconciles its ``confidence`` arg against the envelope's. Tolerance 0.05 +inclusive; tool-arg wins on mismatch with an INFO log. + +This is a leaf module: no imports from ``runtime.graph`` or +``runtime.orchestrator``. 
Both of those depend on it; the dependency +graph is acyclic. +""" + + + +from pydantic import BaseModel, ConfigDict, Field + +# ----- imports for runtime/tools/gateway.py ----- +"""Risk-rated tool gateway: pure resolver + ``BaseTool`` HITL wrapper. + +The gateway sits between the ReAct agent and each tool the orchestrator +configures. It enforces the *hybrid* HITL policy resolved by +``effective_action``: + + ``auto`` -> call the underlying tool directly (no plumbing) + ``notify`` -> call the tool, then persist a soft-notify audit entry + ``approve`` -> raise ``langgraph.types.interrupt(...)`` BEFORE calling + the tool; on resume re-invoke + +The resolver is a plain function with no I/O so it can be unit-tested +exhaustively without spinning up Pydantic Sessions, MCP servers, or a +LangGraph runtime. The wrapper is a closure factory deliberately built +inside ``make_agent_node`` so the closure captures the live ``Session`` +per agent invocation (mitigation R2 in the Phase-4 plan). +""" + + +from fnmatch import fnmatchcase +from typing import TYPE_CHECKING, Any, Literal + + + + +# ``GateDecision`` is imported lazily inside ``_evaluate_gate`` (function +# body) to avoid a runtime cycle (policy.py imports gateway types). The +# type-only import below lets pyright resolve the string-literal return +# annotation on ``_evaluate_gate`` without forming a real cycle. +# ----- imports for runtime/tools/arg_injection.py ----- +"""Session-derived tool-arg injection (Phase 9 / FOC-01 / FOC-02). + +Two responsibilities, one module: + +1. :func:`strip_injected_params` — clones a ``BaseTool``'s args_schema with + one or more parameters removed. The LLM only sees the stripped sig and + therefore cannot hallucinate values for those params (D-09-01). The + original tool is left untouched so direct downstream callers (tests, + scripts, in-process MCP fixtures) keep working. + +2. 
:func:`inject_injected_args` — at tool-invocation time, re-adds the + real values resolved from the live :class:`runtime.state.Session` via + the configured dotted paths. When the LLM still supplied a value for + an injected arg, the framework's session-derived value wins and an + INFO log captures the override (D-09-03). + +The framework stays generic — apps declare which args to inject and from +where via :attr:`runtime.config.OrchestratorConfig.injected_args` (D-09-02). +""" + + + +from pydantic import BaseModel, create_model + + + +# Module-private logger. Tests assert against logger name +# ``"runtime.orchestrator"`` so the override-log line shows up alongside +# the rest of the orchestrator-side observability without requiring a +# separate caplog target. +# ----- imports for runtime/tools/approval_watchdog.py ----- +"""Pending-approval timeout watchdog. + +A high-risk tool call enters ``langgraph.types.interrupt()`` and the +session sits in ``awaiting_input`` indefinitely. Without a watchdog +the slot leaks against ``OrchestratorService.max_concurrent_sessions`` +forever — the cap eventually starves out new traffic. + +The :class:`ApprovalWatchdog` is an asyncio task that runs on the +service's background loop. Every ``poll_interval_seconds`` it: + + 1. Snapshots the in-flight session registry. + 2. For each session whose row has ``status="awaiting_input"``, + scans ``tool_calls`` for entries with ``status="pending_approval"`` + whose ``ts`` is older than ``approval_timeout_seconds``. + 3. Resumes each such session via ``Command(resume={"decision": + "timeout", "approver": "system", "rationale": "approval window + expired"})``. The wrapped tool's resume path updates the audit + row to ``status="timeout"``. + +Failures during polling (DB hiccup, malformed row) are logged and +swallowed so a single bad session cannot kill the watchdog. 
+""" + + +from typing import TYPE_CHECKING, Any + + +# ----- imports for runtime/policy.py ----- +"""Pure HITL gating policy (Phase 11 / FOC-04). + +The :func:`should_gate` function is the SOLE place the framework decides +whether a tool call requires human-in-the-loop approval. It composes +three orthogonal inputs: + + 1. ``effective_action(tool_call.tool, env=session.environment, + gateway_cfg=cfg.gateway)`` -- preserves the v1.0 PVC-08 + prefixed-form lookup invariant. + 2. ``session.environment`` -- gated when in + ``cfg.gate_policy.gated_environments``. + 3. ``confidence`` -- gated when below + ``cfg.gate_policy.confidence_threshold``. + +Pure: same inputs always yield identical :class:`GateDecision`; no I/O, +no skill-prompt input, no mutation. + +Precedence (descending): + + 1. ``effective_action`` returns a value in + ``cfg.gate_policy.gated_risk_actions`` + -> ``GateDecision(gate=True, reason="high_risk_tool")`` + 2. ``session.environment`` in ``cfg.gate_policy.gated_environments`` + AND ``effective_action != "auto"`` + -> ``GateDecision(gate=True, reason="gated_env")`` + 3. ``confidence`` is not None AND + ``confidence < cfg.gate_policy.confidence_threshold`` + AND ``effective_action != "auto"`` + -> ``GateDecision(gate=True, reason="low_confidence")`` + 4. otherwise -> ``GateDecision(gate=False, reason="auto")`` + +The literal ``"blocked"`` is reserved on :class:`GateDecision.reason` +for future hard-stop semantics; Phase 11 itself never returns it from a +production code path. +""" + + + +from pydantic import BaseModel, ConfigDict + + +# Phase 11 (FOC-04): forward-reference imports for the should_gate +# signature only; kept inside ``TYPE_CHECKING`` so the bundle's +# intra-import stripper does not remove a load-bearing import. The +# ``pass`` keeps the block syntactically valid after stripping. +# ----- imports for runtime/agents/responsive.py ----- +"""Responsive agent kind — the today-default LLM agent. 
+ +A responsive skill is a LangGraph node that: + +1. Builds a ReAct executor over the skill's ``tools`` and ``model``. +2. Invokes the executor with the live ``Session`` payload as a human + message preamble. +3. Records ``ToolCall`` and ``AgentRun`` rows on the session, harvests + the agent's confidence / signal / rationale, and decides the next + route from ``skill.routes``. + +This module owns only the node-factory entrypoint +(``make_agent_node``); the implementation reuses helpers in +:mod:`runtime.graph` so existing call sites and the gate node continue +to work unchanged. Supervisor and monitor factories live alongside it +under :mod:`runtime.agents` rather than piling more kinds into +``graph.py``. +""" + + +from typing import Callable from langchain_core.messages import HumanMessage -from langgraph.prebuilt import create_react_agent +from langchain.agents import create_agent + +from langgraph.errors import GraphInterrupt + + + + + + + +# ----- imports for runtime/agents/supervisor.py ----- +"""Supervisor agent kind — no-LLM router. + +A supervisor skill is a LangGraph node that: + +1. Reads the live ``Session`` plus the current dispatch depth. +2. Picks one or more subordinate agents per ``dispatch_strategy``: + ``rule`` (deterministic, evaluated via the same safe-eval AST that + gates monitor expressions) or ``llm`` (one short LLM call against + ``dispatch_prompt``). +3. Emits a structured ``supervisor_dispatch`` log entry (no + ``AgentRun`` row — supervisors are bookkeeping, not token-burning + agents). +4. Returns ``next_route`` set to the chosen subordinate (or to + ``__end__`` when the depth limit is hit). + +The recursion depth is tracked in :class:`runtime.graph.GraphState`'s +``dispatch_depth`` field; if a supervisor would exceed +``skill.max_dispatch_depth`` the node aborts with a clean error +instead of recursing forever. + +This is **not** a fan-out implementation; we always pick a single +target. 
Multi-target ``Send()`` is intentionally not supported. +""" + + +from typing import Any, Callable + +from langchain_core.messages import HumanMessage, SystemMessage + + + +# ----- imports for runtime/agents/monitor.py ----- +"""Monitor agent kind — out-of-band scheduled observer. + +A monitor skill runs **outside** any session graph. The orchestrator +owns one :class:`MonitorRunner` (a singleton) which schedules registered +monitor skills on a small bounded +:class:`concurrent.futures.ThreadPoolExecutor`. +Each tick: + +1. Calls every tool name in ``observe`` via the supplied callable + (``observe_fn``); aggregates results into one dict keyed by tool. +2. Evaluates ``emit_signal_when`` against the observation using the + stdlib safe-eval evaluator (R7). +3. If true, looks up ``trigger_target`` in the supplied trigger + registry / fire callback and fires it with the observation as the + payload. + +APScheduler is intentionally *not* a dependency: the air-gapped target +env doesn't ship it (see ``rules/build.md``). We get away with a tiny +single-threaded scheduler thread because monitor schedules are coarse +(minute-resolution cron) and tool calls are dispatched into the +executor; the scheduler thread itself never blocks on tool I/O. +""" + + +from concurrent.futures import ThreadPoolExecutor, TimeoutError as FuturesTimeout + + +# ----- imports for runtime/graph.py ----- +"""LangGraph state, routing helpers, and node runner.""" + +from typing import Any, TypedDict, Callable, Awaitable + from langgraph.graph import StateGraph, END @@ -316,6 +720,11 @@ class IncidentState(Session): +# Phase 11 (FOC-04 / D-11-04): GraphInterrupt is the LangGraph +# pending-approval pause signal. It is NOT an error and must NOT route +# through _handle_agent_failure -- the orchestrator's interrupt-aware +# bridge handles the resume protocol via the checkpointer. + # ----- imports for runtime/checkpointer_postgres.py ----- """Postgres checkpointer wrapper. 
@@ -383,7 +792,6 @@ class IncidentState(Session): from abc import ABC, abstractmethod -from dataclasses import dataclass from typing import TYPE_CHECKING # ----- imports for runtime/triggers/config.py ----- @@ -418,7 +826,7 @@ class IncidentState(Session): """ -from typing import Any, Callable, Type +from typing import Any, Callable, Type, cast @@ -448,7 +856,6 @@ class IncidentState(Session): """ -import threading from collections import OrderedDict from datetime import datetime, timezone, timedelta @@ -471,7 +878,6 @@ class IncidentState(Session): import hmac -from typing import Callable from fastapi import Header, HTTPException, status @@ -683,7 +1089,6 @@ async def _poll(self, registry): """ -from typing import Any, Callable # ----- imports for runtime/memory/session_state.py ----- @@ -700,7 +1105,6 @@ async def _poll(self, registry): """ -from pydantic import BaseModel, ConfigDict, Field # ----- imports for runtime/memory/knowledge_graph.py ----- @@ -878,6 +1282,37 @@ async def _poll(self, registry): from typing import AsyncIterator +# ----- imports for runtime/skill_validator.py ----- +"""Load-time validation of skill YAML against the live MCP registry. + +Catches: + * tools.local entries that reference a non-existent (server, tool) + pair (typically typos that would silently make the tool invisible). + * routes that omit ``when: default`` (would cause graph hangs at + __end__ when no signal matches). +""" + + + +# ----- imports for runtime/storage/checkpoint_gc.py ----- +"""Garbage-collect orphaned LangGraph checkpoints. + +When ``Orchestrator.retry_session`` rebinds a session to a new +``thread_id`` (e.g. ``INC-1:retry-1``), the original ``INC-1`` thread's +checkpoint becomes orphaned — no code path will ever resume it. Over +time these accumulate. ``gc_orphaned_checkpoints`` removes any +checkpoint whose ``thread_id`` does not reference an active session +(or a known retry suffix). 
+ +This is intentionally conservative: only checkpoints whose thread_id +prefix matches no live session row at all are removed. +""" + + +from sqlalchemy import text +from sqlalchemy.exc import OperationalError + + # ----- imports for runtime/orchestrator.py ----- """Public Orchestrator class — the API consumed by the UI and (future) FastAPI.""" @@ -918,7 +1353,6 @@ async def _poll(self, registry): from fastapi.responses import StreamingResponse - # ----- imports for runtime/api_dedup.py ----- """Dedup retraction HTTP routes. @@ -947,12 +1381,119 @@ async def _poll(self, registry): +# ====== module: runtime/errors.py ====== + +class LLMTimeoutError(TimeoutError): + """Raised when an LLM provider HTTP call exceeds request_timeout. + + Subclasses ``TimeoutError`` so ``runtime.policy._TRANSIENT_TYPES`` + auto-classifies it as transient via ``isinstance`` -- no policy.py + edit needed (D-13-04). + + The ``__str__`` includes the substring ``"timed out"`` so existing + string-matchers in ``runtime.graph._TRANSIENT_MARKERS`` and + ``runtime.orchestrator._reconstruct_last_error`` also catch it + without modification. + """ + + def __init__(self, provider: str, model: str, elapsed_ms: int) -> None: + self.provider = provider + self.model = model + self.elapsed_ms = elapsed_ms + super().__init__( + f"LLM request timed out after {elapsed_ms}ms " + f"(provider={provider}, model={model})" + ) + + +class LLMConfigError(ValueError): + """Raised at config-load when a provider is missing a required field. + + Subclasses ``ValueError`` so pydantic ``@model_validator(mode='after')`` + propagates it cleanly into ``ValidationError`` (D-13-05). 
+ """ + + def __init__(self, provider: str, missing_field: str) -> None: + self.provider = provider + self.missing_field = missing_field + super().__init__( + f"{provider} provider requires {missing_field!r}" + ) + + +__all__ = ["LLMTimeoutError", "LLMConfigError"] + +# ====== module: runtime/terminal_tools.py ====== + +class TerminalToolRule(BaseModel): + """Maps a terminal tool name to the session status it produces. + + ``tool_name`` matches both bare (``set_recommendation``) and prefixed + (``:set_recommendation``) MCP tool-call names — the framework + does the suffix check. + + ``status`` must reference a name declared in the same + ``OrchestratorConfig.statuses`` map; ``OrchestratorConfig``'s + cross-field validator enforces this at config-load. + + ``extract_fields`` declares per-rule extra-metadata pulls. Each + key is the destination field name on the session + (``Session.extra_fields[]``); each value is an ordered list + of ``args.X`` / ``result.X`` lookup hints. The framework picks + the first non-falsy match. Empty dict (default) means "no extra + metadata to capture". Generalises the v1.0 + ``_extract_team(tc, team_keys)`` path; the same lookup syntax is + preserved (D-06-02). + + ``match_args`` is an optional argument-value discriminator. When + non-empty, the rule matches a tool call only if EVERY ``(key, + value)`` pair in ``match_args`` matches ``tool_call.args[key]`` + exactly. Lets one tool name route to multiple statuses based on + a discriminator argument (e.g. ``set_recommendation`` with + ``recommendation=approve`` vs ``recommendation=request_changes``). + Empty default = no arg dispatch; preserves the v1.0 single-rule + shape (DECOUPLE-07 / D-08-03). + """ + + model_config = {"extra": "forbid"} + + tool_name: str = Field(min_length=1) + status: str = Field(min_length=1) + extract_fields: dict[str, list[str]] = Field(default_factory=dict) + match_args: dict[str, str] = Field(default_factory=dict) + + +StatusKind = Literal[ + "success", # e.g. 
set_recommendation(approve) -> approved + "failure", # e.g. set_recommendation(request_changes) -> changes_requested + "escalation", # app-defined escalation terminal + "needs_review", # finalize fired with no rule match + "pending", # session in flight +] + + +class StatusDef(BaseModel): + """Pydantic record of one app status. + + Framework reads ``terminal`` to decide finalize-vs-pending and + ``kind`` to dispatch the needs_review fallback path / let UIs + group statuses without owning their own taxonomy. ``color`` and + other presentation fields stay in ``UIConfig.badges`` (D-06-05 + rejected alternative — presentation leak). + """ + + model_config = {"extra": "forbid"} + + name: str = Field(min_length=1) + terminal: bool + kind: StatusKind + # ====== module: runtime/config.py ====== _SESSION_ID_PREFIX_RE = re.compile(r"^[A-Za-z0-9-]{1,16}$") -ProviderKind = Literal["ollama", "azure_openai", "stub"] +ProviderKind = Literal["ollama", "azure_openai", "openai_compat", "stub"] class ProviderConfig(BaseModel): @@ -960,12 +1501,35 @@ class ProviderConfig(BaseModel): Multiple named ``ModelConfig`` entries can reference the same provider so that, e.g., two Ollama models share a single base_url + api_key. + + Phase 13 (HARD-01 / D-13-01): per-provider ``request_timeout`` + override (None means "use OrchestratorConfig.default_llm_request_timeout"). + Phase 13 (HARD-05 / D-13-06): ollama providers MUST declare + ``base_url``; the @model_validator below catches the omission at + config-load and raises ``LLMConfigError``. The hardcoded public + Ollama fallback in ``runtime.llm`` is removed in the same phase.
""" kind: ProviderKind - base_url: str | None = None # ollama + base_url: str | None = None # ollama (REQUIRED via validator) api_key: str | None = None # ollama, azure_openai - endpoint: str | None = None # azure_openai + endpoint: str | None = None # azure_openai (validated lazily in builder) api_version: str | None = None # azure_openai + request_timeout: float | None = Field( + default=None, gt=0, le=600, + ) # NEW Phase 13 (D-13-01) — None -> OrchestratorConfig default + + @model_validator(mode="after") + def _validate_required_fields(self) -> "ProviderConfig": + # D-13-06: only ollama is promoted to config-load validation in + # Phase 13. azure_openai (`endpoint`) and openai_compat + # (`base_url` + `api_key`) keep their existing first-request + # ValueError raises in `_build_*_chat`. Promoting them is a + # potential follow-up; see CONTEXT.md "Deferred Ideas". + if self.kind == "ollama" and not self.base_url: + raise LLMConfigError( + provider="ollama", missing_field="base_url", + ) + return self class ModelConfig(BaseModel): @@ -1072,6 +1636,76 @@ class Paths(BaseModel): incidents_dir: str = "incidents" +class GatePolicy(BaseModel): + """Phase 11 (FOC-04): declarative HITL gating policy. + + Drives the framework's pure ``should_gate`` boundary. The LLM never + sees this config -- flow control is a framework decision, not a + skill-prompt incantation. + + ``confidence_threshold`` is the strict-less-than predicate the gate + applies to the active turn confidence; tool calls below the + threshold fire a low_confidence pause for any non-auto-rated tool. + + ``gated_environments`` enumerates Session.environment values that + automatically gate every non-auto-rated tool call regardless of + confidence -- lifecycle defence against blast radius in production. + + ``gated_risk_actions`` enumerates GatewayAction Literal values + (``auto``/``notify``/``approve``) that ALWAYS trigger a gate + regardless of env or confidence. 
Default ``{"approve"}`` mirrors + v1.0 HITL behaviour. + + Phase 11 chooses ``"approve"`` (the actual GatewayAction literal) + over CONTEXT.md's sketched ``"hitl"`` -- see + src/runtime/tools/gateway.py:32 for the canonical 3-valued + GatewayAction Literal. + """ + + model_config = ConfigDict(extra="forbid") + + confidence_threshold: float = Field(default=0.7, ge=0.0, le=1.0) + gated_environments: set[str] = Field( + default_factory=lambda: {"production"}, + ) + gated_risk_actions: set[str] = Field( + default_factory=lambda: {"approve"}, + ) + + +class RetryPolicy(BaseModel): + """Phase 12 (FOC-05): declarative retry policy. + + Drives the framework's pure ``should_retry`` boundary. The LLM never + sees this config -- flow control is a framework decision, not a + skill-prompt incantation. Mirrors GatePolicy's shape so the + OrchestratorConfig surface stays uniform. + + ``max_retries`` is the absolute cap on automatic retries (compared + with ``retry_count`` via ``>=``). 0 disables auto-retry entirely; + the recommended default 2 mirrors the v1.2 ROADMAP sketch and the + existing transient-5xx auto-retry budget in graph.py. + + ``retry_on_transient`` lets apps with strict SLOs disable framework + auto-retry of transient errors entirely (escalate immediately + instead). + + ``retry_low_confidence_threshold`` is the strict-less-than predicate + for "the LLM gave up; don't burn budget on a retry". Defaults to + 0.4 -- well below the typical gate_policy 0.7-0.8 threshold so a + low-confidence escalation triggers HITL intervention before the + retry path even considers it. 
+ """ + + model_config = ConfigDict(extra="forbid") + + max_retries: int = Field(default=2, ge=0, le=10) + retry_on_transient: bool = True + retry_low_confidence_threshold: float = Field( + default=0.4, ge=0.0, le=1.0, + ) + + class OrchestratorConfig(BaseModel): model_config = {"extra": "forbid"} @@ -1162,6 +1796,41 @@ class OrchestratorConfig(BaseModel): # bad path raises at boot with a useful message (DECOUPLE-05 / D-08-01). state_overrides_schema: str | None = None + # Phase 9 (D-09-02 / FOC-01): map of LLM-visible-arg -> dotted-path + # on the live Session. Tools whose param name matches a key in this + # dict get the param stripped from the LLM-visible signature, and + # the framework supplies the resolved value at _invoke_tool / + # _GatedTool._run / _arun time. Apps declare what to inject; the + # framework stays generic. Empty default = no injection (legacy + # behaviour). Validated at config-load: keys are non-empty + # identifiers, values are dotted paths starting with "session.". + injected_args: dict[str, str] = Field(default_factory=dict) + + # Phase 11 (FOC-04): declarative HITL gating policy. Apps tune + # thresholds in YAML; the framework's should_gate boundary reads + # this struct and the LLM never sees it. Default keeps v1.1 + # behaviour (production gates "approve"-risk tools, threshold 0.7). + gate_policy: "GatePolicy" = Field(default_factory=lambda: GatePolicy()) + + # Phase 12 (FOC-05): declarative retry policy. Apps tune + # max_retries / retry_on_transient / low-confidence threshold in + # YAML; the framework's should_retry boundary reads this struct + # and the LLM never sees it. Default keeps v1.2 behaviour + # (max_retries=2, transient retries enabled, confidence floor 0.4). + retry_policy: "RetryPolicy" = Field( + default_factory=lambda: RetryPolicy(), + ) + + # Phase 13 (HARD-01 / D-13-02): framework-default LLM HTTP request + # timeout in seconds. 
Per-provider ``ProviderConfig.request_timeout`` + # overrides this; ``None`` on the provider means "use this default". + # Bounded to catch indefinite hangs (CONCERNS C1) while leaving room + # for slow CPU Ollama runs (e.g., gpt-oss:120b). 600s upper bound + # prevents accidentally-disabling the protection. + default_llm_request_timeout: float = Field( + default=120.0, gt=0, le=600, + ) + @field_validator("state_overrides_schema") @classmethod def _validate_state_overrides_schema_format( @@ -1196,6 +1865,38 @@ def _validate_state_overrides_schema_format( ) return v + @field_validator("injected_args") + @classmethod + def _validate_injected_args( + cls, v: dict[str, str], + ) -> dict[str, str]: + """Phase 9 (D-09-02): config-load validation for injected_args. + + Each entry is ``arg_name -> dotted_path`` where ``arg_name`` must + be a valid Python identifier (it is the keyword name on a tool + signature) and ``dotted_path`` must be a non-empty string with at + least one dot (e.g. ``session.environment``). Real attribute + resolution happens at injection time in + :func:`runtime.tools.arg_injection.inject_injected_args` so + config-load doesn't drag the live ``Session`` into every consumer. + """ + for key, path in v.items(): + if not key or not key.isidentifier(): + raise ValueError( + f"injected_args key {key!r} must be a non-empty " + f"Python identifier" + ) + if not isinstance(path, str) or not path.strip(): + raise ValueError( + f"injected_args[{key!r}] must be a non-empty dotted path" + ) + if "." not in path: + raise ValueError( + f"injected_args[{key!r}]={path!r} must be a dotted path " + f"(e.g. 'session.environment')" + ) + return v + @model_validator(mode="after") def _validate_terminal_tool_registry(self) -> "OrchestratorConfig": """Cross-field invariants for the terminal-tool registry. 
@@ -1531,7 +2232,11 @@ def _coerce_dedup(self) -> "AppConfig": if isinstance(self.dedup, DedupConfig): return self if isinstance(self.dedup, dict): - self.__dict__["dedup"] = DedupConfig(**self.dedup) + # ``BaseModel.__dict__`` is typed as ``MappingProxyType`` in + # the pydantic stub; the documented post-validator mutation + # path is direct ``__dict__`` assignment, which works at + # runtime (pydantic stores fields in a plain dict). + self.__dict__["dedup"] = DedupConfig(**self.dedup) # pyright: ignore[reportIndexIssue] return self raise ValueError( f"app.dedup must be a DedupConfig or dict; got " @@ -1572,8 +2277,9 @@ def _coerce_triggers(self) -> "AppConfig": ) coerced.append(cls(**raw)) # Pydantic v2 stores fields in ``__dict__``; assigning here is - # the documented way to mutate after validation. - self.__dict__["triggers"] = coerced + # the documented way to mutate after validation. (Stub types + # ``__dict__`` as MappingProxyType; runtime is a plain dict.) + self.__dict__["triggers"] = coerced # pyright: ignore[reportIndexIssue] return self @@ -1690,6 +2396,17 @@ class Session(BaseModel): # with a stale version raise ``StaleVersionError`` so the caller can # reload + retry. version: int = 1 + # Phase 11 (FOC-04): transient per-turn confidence hint set by the + # agent runner (graph.py / responsive.py) AFTER each + # _harvest_tool_calls_and_patches call so the gateway's should_gate + # boundary can apply low_confidence gating using whatever + # confidence the agent has emitted so far. Reset to ``None`` at + # turn start; never persisted (``Field(exclude=True)``). The + # framework treats ``None`` as "no signal yet" and does NOT fire a + # low_confidence gate -- this avoids a false-positive gate on the + # very first tool call of a turn before any envelope/tool-arg + # carrying confidence has surfaced. 
+ turn_confidence_hint: float | None = Field(default=None, exclude=True) # ------------------------------------------------------------------ # App-overridable agent-input formatter hook. @@ -2305,11 +3022,40 @@ class StubChatModel(BaseChatModel): """Deterministic chat model for tests/CI. Returns canned text per role. Optionally emits one tool call on first invocation if `tool_call_plan` is set. + + Phase 10 (FOC-03): also honours + ``llm.with_structured_output(AgentTurnOutput)`` so stub-driven tests + survive the runner's envelope contract. The structured response is + derived from the same canned text + a default 0.85 confidence; tests + that need a specific envelope shape can override + ``stub_envelope_confidence`` / ``stub_envelope_rationale`` / + ``stub_envelope_signal``. + + Phase 15 (LLM-COMPAT-01): ``langchain.agents.create_agent`` with + ``response_format=AgentTurnOutput`` (via ``AutoStrategy`` -> + ``ToolStrategy`` for non-native-structured-output models, including + this stub) injects ``AgentTurnOutput`` as a CALLABLE TOOL. The + agent loop only terminates when the LLM emits a tool call NAMED + ``AgentTurnOutput``. ``bind_tools`` records that envelope-tool name + so ``_generate`` can auto-emit a closing tool call after any + user-configured ``tool_call_plan`` is exhausted -- preserving the + pre-Phase-15 stub semantics (canned text + optional pre-scripted + tool calls) while satisfying the new tool-loop termination + contract. """ role: str = "default" canned_responses: dict[str, str] = Field(default_factory=dict) tool_call_plan: list[dict] | None = None + stub_envelope_confidence: float = 0.85 + stub_envelope_rationale: str = "stub envelope rationale" + stub_envelope_signal: str | None = None _called_once: bool = False + # Phase 15 (LLM-COMPAT-01): set by ``bind_tools`` when + # ``langchain.agents.create_agent`` injects a structured-output tool + # for ``AgentTurnOutput``. Holds the bare tool name (e.g. 
def bind_tools(self, tools, *, tool_choice=None, **kwargs):
    """Record the AgentTurnOutput envelope-tool name when present.

    Phase 15 (LLM-COMPAT-01): ``langchain.agents.create_agent`` with
    ``response_format=AgentTurnOutput`` calls ``bind_tools(...)``
    with the user's tools PLUS the envelope-as-a-tool. The list is
    scanned for a tool named ``AgentTurnOutput`` and the name is
    remembered on the instance so ``_generate`` can close the agent
    loop with a synthetic envelope tool call after any pre-scripted
    ``tool_call_plan`` is exhausted.

    A tool's name is taken from the first truthy value among:

    * ``__name__`` (Pydantic schema classes, plain callables),
    * ``name`` (``BaseTool`` instances),
    * the ``"name"`` key of a dict-shaped tool spec,
    * the nested ``["function"]["name"]`` key of an OpenAI-style
      ``{"type": "function", "function": {...}}`` dict spec.

    Other tools flow through unchanged -- the stub still emits them
    only via ``tool_call_plan``. ``tool_choice`` and ``**kwargs`` are
    accepted for signature compatibility and ignored.

    Returns:
        ``self`` (no new runnable is created).
    """
    for tool in tools or []:
        candidates = [
            getattr(tool, "__name__", None),
            getattr(tool, "name", None),
        ]
        if isinstance(tool, dict):
            candidates.append(tool.get("name"))
            # OpenAI function-calling format nests the name one level down.
            function_spec = tool.get("function")
            if isinstance(function_spec, dict):
                candidates.append(function_spec.get("name"))
        name = next((c for c in candidates if c), None)
        if isinstance(name, str) and name == "AgentTurnOutput":
            self._envelope_tool_name = name
            break
    return self
+ """ + text = self.canned_responses.get(self.role, f"[stub:{self.role}] no canned response") + confidence = self.stub_envelope_confidence + rationale = self.stub_envelope_rationale + signal = self.stub_envelope_signal + + class _StructuredRunnable: + def __init__(self, schema_cls): + self._schema = schema_cls + + def _build(self): + # Construct an instance of whatever schema was passed. + # Common case: AgentTurnOutput; permissive fallback handles + # other pydantic schemas the test may pass. + try: + return self._schema( + content=text or ".", + confidence=confidence, + confidence_rationale=rationale, + signal=signal, + ) + except Exception: + # Permissive fallback for unfamiliar schemas: try + # model_validate on a minimal dict. + return self._schema.model_validate({ + "content": text or ".", + "confidence": confidence, + "confidence_rationale": rationale, + "signal": signal, + }) -def _build_ollama_chat(provider: ProviderConfig, model_id: str, - temperature: float) -> BaseChatModel: - from langchain_ollama import ChatOllama - kwargs: dict[str, Any] = { - "base_url": provider.base_url or "https://ollama.com", - "model": model_id, - "temperature": temperature, - } + def invoke(self, *_args, **_kwargs): + return self._build() + + async def ainvoke(self, *_args, **_kwargs): + return self._build() + + return _StructuredRunnable(schema) + + +def _resolve_timeout( + provider: ProviderConfig, default: float, +) -> float: + """Resolve effective request timeout for a provider. + + Per-provider override wins; falls back to the framework default + (typically ``OrchestratorConfig.default_llm_request_timeout``). 
+ """ + if provider.request_timeout is not None: + return provider.request_timeout + return default + + +def _wrap_chat_with_timeout( + base: BaseChatModel, + provider_name: str, + model_id: str, + request_timeout: float, +) -> BaseChatModel: + """Wrap ``base`` so every ``ainvoke`` is bounded by + ``asyncio.wait_for(..., timeout=request_timeout)`` and raises + ``LLMTimeoutError(provider, model, elapsed_ms)`` on hang. + + The native langchain timeout knob (``request_timeout=`` on + openai/azure or ``client_kwargs={'timeout': ...}`` on ollama) is + honoured at the httpx layer; this wrapper guarantees the + framework-typed exception AND a hard ceiling even if the + underlying client hangs in a way httpx misses (e.g., post-headers + TCP read stall on a slow Ollama). D-13-04: subclassing + ``TimeoutError`` means ``policy._TRANSIENT_TYPES`` auto-classifies + the error as transient (zero edits to ``policy.py``). + """ + base_cls = type(base) + + class _Bounded(base_cls): # type: ignore[misc, valid-type] + async def ainvoke(self, *args: Any, **kwargs: Any) -> Any: + t0 = time.monotonic() + try: + return await asyncio.wait_for( + super().ainvoke(*args, **kwargs), + timeout=request_timeout, + ) + except (asyncio.TimeoutError, TimeoutError) as e: + if isinstance(e, LLMTimeoutError): + # Already typed; don't double-wrap. + raise + elapsed_ms = int((time.monotonic() - t0) * 1000) + raise LLMTimeoutError( + provider=provider_name, + model=model_id, + elapsed_ms=elapsed_ms, + ) from e + + # Reuse the live pydantic instance's state without re-running + # __init__ (which would re-init the underlying httpx clients). + bounded = _Bounded.model_construct(**base.model_dump()) + # Some langchain client classes initialise non-pydantic attrs + # (httpx clients, run_manager, etc.) inside __init__. Copy them + # through so the wrapped instance shares the same network state. 
def _build_ollama_chat(
    provider: ProviderConfig, model_id: str, temperature: float,
    *, request_timeout: float,
) -> BaseChatModel:
    """Build a timeout-bounded ``ChatOllama`` for ``model_id``.

    Args:
        provider: provider config; ``base_url`` and ``api_key`` are read.
        model_id: Ollama model identifier.
        temperature: sampling temperature passed to the model.
        request_timeout: effective timeout in seconds; applied both to
            the HTTP client and to the ``_wrap_chat_with_timeout`` wrapper.

    Returns:
        The chat model returned by ``_wrap_chat_with_timeout``.
    """
    from langchain_ollama import ChatOllama

    # langchain-ollama defaults ``with_structured_output`` to native
    # function-calling, which many Ollama models (gemma*, gpt-oss,
    # ministral, ...) do not support. Default to ``json_schema``
    # (Ollama's structured-output API) unless the caller picks a method,
    # so the Phase 10 ``AgentTurnOutput`` envelope round-trips.
    class _ChatOllamaJsonSchema(ChatOllama):  # type: ignore[misc, valid-type]
        def with_structured_output(self, schema, *, method=None, **kw):
            chosen_method = method or "json_schema"
            return super().with_structured_output(
                schema, method=chosen_method, **kw,
            )

    # Phase 13 (HARD-01): ChatOllama has no native ``request_timeout``
    # field; the timeout travels via ``client_kwargs`` to the httpx client.
    http_options: dict[str, Any] = {"timeout": request_timeout}
    token = provider.api_key or os.environ.get("OLLAMA_API_KEY")
    if token:
        http_options["headers"] = {"Authorization": f"Bearer {token}"}

    # Phase 13 (HARD-05): ``base_url`` is validated at config load; there
    # is deliberately no fallback to a public Ollama URL (air-gap rule).
    chat = _ChatOllamaJsonSchema(
        base_url=provider.base_url,
        model=model_id,
        temperature=temperature,
        client_kwargs=http_options,
    )
    return _wrap_chat_with_timeout(chat, "ollama", model_id, request_timeout)
+ base = AzureChatOpenAI( azure_endpoint=provider.endpoint, api_version=provider.api_version or "2024-08-01-preview", azure_deployment=model.deployment, api_key=SecretStr(_ak) if _ak else None, temperature=model.temperature, + request_timeout=request_timeout, # pyright: ignore[reportCallIssue] -- Phase 13 (HARD-01) -- alias for ``timeout`` not in stub + ) + return _wrap_chat_with_timeout( + base, "azure_openai", model.model, request_timeout, ) def get_llm(cfg: LLMConfig, model_name: str | None = None, *, role: str = "default", stub_canned: dict[str, str] | None = None, - stub_tool_plan: list[dict] | None = None) -> BaseChatModel: + stub_tool_plan: list[dict] | None = None, + stub_envelope_confidence: float | None = None, + stub_envelope_rationale: str | None = None, + stub_envelope_signal: str | None = None, + default_llm_request_timeout: float = 120.0, + ) -> BaseChatModel: """Build a chat model by named entry from ``cfg.models``. ``model_name`` defaults to ``cfg.default``. Validation that the name exists is enforced by ``LLMConfig`` itself (model_validator), so a - missing name here means caller passed a typo — raise loudly. + missing name here means caller passed a typo -- raise loudly. + + Phase 10 (FOC-03): stub callers can now tune the canned envelope + (confidence / rationale / signal) so gate-trigger tests preserve their + pre-Phase-10 semantics by emitting a low-confidence envelope. + + Phase 13 (HARD-01): non-stub builds are bounded by an effective + ``request_timeout`` resolved as ``provider.request_timeout`` (per- + provider override) -> ``default_llm_request_timeout`` (framework + default; callers pass ``cfg.orchestrator.default_llm_request_timeout``). + The default keyword value (120.0) matches OrchestratorConfig's default + so test paths that build LLMs without an OrchestratorConfig in scope + still get a sane bound. 
def _build_openai_compat_chat(
    provider: ProviderConfig, model: ModelConfig,
    *, request_timeout: float,
) -> BaseChatModel:
    """Build a ``ChatOpenAI`` pointed at an OpenAI-compatible endpoint
    (OpenRouter, Together, vLLM, etc.).

    Reuses langchain-openai's ``ChatOpenAI`` with a ``base_url=``
    override and the provider's ``api_key``.

    Args:
        provider: provider config; ``base_url`` and ``api_key`` are
            both required.
        model: model config; ``model`` and ``temperature`` are read.
        request_timeout: effective timeout in seconds; passed to the
            client and to the ``_wrap_chat_with_timeout`` wrapper.

    Returns:
        The chat model returned by ``_wrap_chat_with_timeout``.

    Raises:
        ValueError: if ``provider.base_url`` or ``provider.api_key`` is
            ``None``.
    """
    from langchain_openai import ChatOpenAI
    if provider.base_url is None:
        raise ValueError(
            "openai_compat provider requires 'base_url' "
            "(e.g. https://openrouter.ai/api/v1)"
        )
    if provider.api_key is None:
        raise ValueError("openai_compat provider requires 'api_key'")
    # ``request_timeout`` is a runtime alias for ``timeout`` that the
    # langchain stubs do not expose (same stub gap as the Azure builder).
    base = ChatOpenAI(
        base_url=provider.base_url,
        # Wrap the key in ``SecretStr`` like the Azure chat and embedding
        # builders do, so the secret is handled the same way everywhere.
        api_key=SecretStr(provider.api_key),
        model=model.model,
        temperature=model.temperature,
        request_timeout=request_timeout,  # pyright: ignore[reportCallIssue] -- Phase 13 (HARD-01) -- alias for ``timeout`` not in stub
    )
    return _wrap_chat_with_timeout(
        base, "openai_compat", model.model, request_timeout,
    )
If embeddings need + stricter bounds than chat -- or if the httpx-layer timeout proves + insufficient against post-headers TCP read stalls on the embeddings + path the same way it can on chat -- a future phase can mirror + ``_wrap_chat_with_timeout`` for the embeddings public surface + (``aembed_query`` / ``aembed_documents``). + """ if cfg.embedding is None: raise ValueError("llm.embedding is not configured") provider = cfg.providers[cfg.embedding.provider] + effective = _resolve_timeout(provider, default_llm_request_timeout) if provider.kind == "ollama": from langchain_ollama import OllamaEmbeddings - kwargs: dict[str, Any] = { - "base_url": provider.base_url or "https://ollama.com", - "model": cfg.embedding.model, - } + # Phase 13 (HARD-01): OllamaEmbeddings has NO native + # ``request_timeout`` field; canonical incantation is + # ``client_kwargs={"timeout": ...}`` (same as ChatOllama). + client_kwargs: dict[str, Any] = {"timeout": effective} api_key = provider.api_key or os.environ.get("OLLAMA_API_KEY") if api_key: - kwargs["client_kwargs"] = {"headers": {"Authorization": f"Bearer {api_key}"}} - return OllamaEmbeddings(**kwargs) + client_kwargs["headers"] = { + "Authorization": f"Bearer {api_key}", + } + # Phase 13 (HARD-05): base_url config-load-validated; NO public fallback. + return OllamaEmbeddings( + base_url=provider.base_url, + model=cfg.embedding.model, + client_kwargs=client_kwargs, + ) if provider.kind == "azure_openai": from langchain_openai import AzureOpenAIEmbeddings if provider.endpoint is None: raise ValueError("azure_openai provider requires 'endpoint'") deployment = cfg.embedding.deployment or cfg.embedding.model _ak = provider.api_key or os.environ.get("AZURE_OPENAI_KEY") + # See chat builders above: ``request_timeout`` is a runtime + # alias for ``timeout`` not surfaced in the langchain-openai stub. 
return AzureOpenAIEmbeddings( azure_endpoint=provider.endpoint, api_version=provider.api_version or "2024-08-01-preview", azure_deployment=deployment, api_key=SecretStr(_ak) if _ak else None, + request_timeout=effective, # pyright: ignore[reportCallIssue] -- Phase 13 (HARD-01) -- alias for ``timeout`` not in stub ) raise ValueError( f"Embedding not supported for provider kind {provider.kind!r}" @@ -2641,12 +3707,14 @@ def build_embedder( ) if p.kind == "azure_openai": from langchain_openai import AzureOpenAIEmbeddings + # AzureOpenAIEmbeddings.api_key is typed as ``SecretStr | None`` + # (pydantic v2). Wrap the env-sourced str so the type matches. return AzureOpenAIEmbeddings( azure_deployment=cfg.deployment, model=cfg.model, azure_endpoint=p.endpoint, api_version=p.api_version, - api_key=p.api_key, + api_key=SecretStr(p.api_key) if p.api_key else None, ) if p.kind == "stub": return _StubEmbeddings(dim=cfg.dim) @@ -2668,10 +3736,13 @@ def _faiss_distance_strategy(name: str): def _pgvector_distance_strategy(name: str): from langchain_postgres.vectorstores import DistanceStrategy + # ``langchain_postgres.DistanceStrategy.INNER_PRODUCT`` exists at + # runtime (verified via the live module) but the langchain-postgres + # stubs only expose ``COSINE`` / ``EUCLIDEAN``. return { "cosine": DistanceStrategy.COSINE, "euclidean": DistanceStrategy.EUCLIDEAN, - "inner_product": DistanceStrategy.INNER_PRODUCT, + "inner_product": DistanceStrategy.INNER_PRODUCT, # pyright: ignore[reportAttributeAccessIssue] }[name] @@ -2747,7 +3818,7 @@ def distance_to_similarity(distance: float, strategy: str) -> float: # ====== module: runtime/storage/history_store.py ====== -StateT = TypeVar("StateT", bound=BaseModel) +StateT = TypeVar("StateT", bound=Session) # Allowed ``filter_kwargs`` keys = IncidentRow column names. # Computed at module load so we can produce a precise error for typos. 
@@ -2799,7 +3870,7 @@ def _row_to_incident(self, row: IncidentRow) -> StateT: return self._converter._row_to_incident(row) def _load(self, incident_id: str) -> StateT: - with Session(self.engine) as session: + with SqlaSession(self.engine) as session: row = session.get(IncidentRow, incident_id) if row is None: raise FileNotFoundError(incident_id) @@ -2810,7 +3881,7 @@ def _list_filtered(self, *, filter_kwargs: Mapping[str, Any]) -> list[StateT]: Pure SQL prefilter — used by both vector and keyword paths. """ - with Session(self.engine) as session: + with SqlaSession(self.engine) as session: stmt = select(IncidentRow).where(IncidentRow.deleted_at.is_(None)) for col, val in filter_kwargs.items(): stmt = stmt.where(getattr(IncidentRow, col) == val) @@ -2867,7 +3938,12 @@ def find_similar( threshold = self.similarity_threshold if threshold is None else threshold vec = self.embedder.embed_query(query) - raw = self.vector_store.similarity_search_with_score_by_vector(vec, k=limit * 4) + # ``similarity_search_with_score_by_vector`` is provided by the + # concrete FAISS / pgvector / langchain-postgres backends (and + # validated by ``runtime.storage.vector.build_vector_store``) + # but the abstract ``langchain_core.vectorstores.VectorStore`` + # base class does not declare it. 
+ raw = self.vector_store.similarity_search_with_score_by_vector(vec, k=limit * 4) # pyright: ignore[reportAttributeAccessIssue] out: list[tuple[StateT, float]] = [] for doc, distance in raw: score = distance_to_similarity(float(distance), self.distance_strategy) @@ -2904,7 +3980,7 @@ def _keyword_similar(self, *, query, filter_kwargs, status_filter, threshold, li if getattr(i, "status", None) == status_filter and getattr(i, "deleted_at", None) is None ] - def _ef(i, key, default=""): + def _ef(i, key, default: Any = ""): """Read a field from typed attribute first, then extra_fields.""" val = getattr(i, key, None) if val: @@ -2936,12 +4012,16 @@ def _ef(i, key, default=""): _INC_ID_RE = re.compile(r"^INC-\d{8}-\d{3}$") _SESSION_ID_RE = re.compile(r"^[A-Za-z][A-Za-z0-9_-]*-\d{8}-\d{3}$") -# StateT is bound to ``BaseModel`` so callers can pass either bare -# ``Session`` or any pydantic subclass. The resolver in -# :mod:`runtime.state_resolver` enforces a ``runtime.state.Session`` -# subclass at config time; the looser bound here keeps the storage -# layer usable by ad-hoc tests that build a ``BaseModel`` directly. -StateT = TypeVar("StateT", bound=BaseModel) +# StateT is bound to ``Session`` (not bare ``BaseModel``) because the +# store body reads typed fields (``id``, ``status``, ``version``, +# ``updated_at`` …) that are declared on ``runtime.state.Session`` and +# not on ``pydantic.BaseModel``. The resolver in +# :mod:`runtime.state_resolver` already enforces a ``Session`` subclass +# at config time, and every existing caller (production + tests) passes +# either bare ``Session`` or a ``Session`` subclass — see +# Phase 19 / HARD-03 for the rationale (was: ``bound=BaseModel`` which +# made pyright flag every typed-field access). 
+StateT = TypeVar("StateT", bound=Session) def _embed_source(inc: BaseModel) -> str: @@ -3139,7 +4219,12 @@ def save(self, incident: StateT) -> None: raise ValueError( f"Invalid incident id {incident.id!r}; expected PREFIX-YYYYMMDD-NNN" ) - incident.updated_at = _iso(_now()) + # ``_iso(_now())`` returns ``str`` here -- the input datetime is + # never None -- but the helper's signature is the broader + # ``Optional[str]``. ``or ""`` keeps pyright + the typed + # ``Session.updated_at: str`` field consistent without changing + # behaviour (real value is always present). + incident.updated_at = _iso(_now()) or "" sess = incident # local alias — avoids repeating the domain token in new code expected_version = getattr(sess, "version", 1) # Bump in-memory BEFORE building the row dict so the persisted @@ -3284,12 +4369,16 @@ def _persist_vector(self) -> None: from pathlib import Path folder = Path(self.vector_path) folder.mkdir(parents=True, exist_ok=True) - self.vector_store.save_local( + # ``save_local`` is FAISS-specific; the runtime ``hasattr`` guard + # at the top of this method already ensured this codepath only + # runs against FAISS (other VectorStores omit the method). + # ``langchain_core.vectorstores.VectorStore`` doesn't declare it. 
class EventLog:
    """Append-only log of session events.

    Events drive the status finalizer's inference (e.g. a registered
    event kind appearing in the log -> session reached the
    corresponding terminal status). This class only inserts and reads
    rows; it never mutates or deletes them.

    NOTE(review): the database session is opened through
    ``SqlaSession`` -- the SQLAlchemy session name the history store in
    this file already uses -- because the bare name ``Session`` in this
    file is the pydantic state model. Confirm against the bundler's
    import handling.
    """

    def __init__(self, *, engine: Engine) -> None:
        """Store the SQLAlchemy ``engine`` used for every read/write."""
        self.engine = engine

    def append(self, session_id: str, kind: str, payload: dict) -> None:
        """Append a new event row. Never mutates existing rows.

        Args:
            session_id: id of the session the event belongs to.
            kind: event kind label.
            payload: event data; shallow-copied before storage so later
                caller-side key changes do not affect the row object.
        """
        # ``s.begin()`` commits on clean exit and rolls back on error.
        with SqlaSession(self.engine) as s, s.begin():
            s.add(SessionEventRow(
                session_id=session_id,
                kind=kind,
                payload=dict(payload),
                ts=_now(),
            ))

    def iter_for(self, session_id: str) -> Iterator[SessionEvent]:
        """Yield events for ``session_id`` in monotonic insertion order.

        Rows are ordered by ``seq``. This is a generator: the database
        session stays open until iteration finishes or the generator is
        closed.
        """
        with SqlaSession(self.engine) as s:
            stmt = (
                select(SessionEventRow)
                .where(SessionEventRow.session_id == session_id)
                .order_by(SessionEventRow.seq)
            )
            for row in s.execute(stmt).scalars():
                yield SessionEvent(
                    seq=row.seq,
                    session_id=row.session_id,
                    kind=row.kind,
                    payload=row.payload,
                    ts=row.ts,
                )
def _normalise_tool_calls_list(
    tool_calls: Iterable[Any] | None,
) -> tuple[list[Any], bool]:
    """Return a copy of ``tool_calls`` with missing audit fields filled in.

    Every ``dict`` entry is shallow-copied and passed to
    :func:`_fill_audit_fields`. Entries that are not dicts are carried over
    untouched -- this walker fills defaults, it does not validate rows.

    Returns ``(new_list, changed)``. ``changed`` is ``True`` when at least
    one entry gained a field. A falsy input (``None`` or an empty
    collection) yields ``([], False)``.
    """
    if not tool_calls:
        return [], False
    normalised: list[Any] = []
    any_filled = False
    for entry in tool_calls:
        if not isinstance(entry, dict):
            normalised.append(entry)
            continue
        # Shallow copy first so the caller-owned dict is never mutated.
        patched = dict(entry)
        any_filled = _fill_audit_fields(patched) or any_filled
        normalised.append(patched)
    return normalised, any_filled
def migrate_add_session_columns(engine: Engine) -> dict[str, int]:
    """Backfill forward-migration columns and indexes on ``incidents``.

    Inspects the live schema, issues one ``ALTER TABLE ... ADD COLUMN`` for
    every ``_FORWARD_COLUMNS`` entry that is absent, then creates every
    ``_FORWARD_INDEXES`` entry that is absent and whose column exists once
    the column pass has run. All DDL runs inside a single
    ``engine.begin()`` block. Re-running against an up-to-date schema
    issues no statements.

    Returns ``{"columns_added": N, "indexes_added": M}``; both are ``0``
    when the ``incidents`` table does not exist at all.
    """
    db_inspector = inspect(engine)
    if "incidents" not in db_inspector.get_table_names():
        # No table to patch -- nothing to backfill.
        return {"columns_added": 0, "indexes_added": 0}

    present_columns = {meta["name"] for meta in db_inspector.get_columns("incidents")}
    present_indexes = {meta["name"] for meta in db_inspector.get_indexes("incidents")}
    missing_columns = [
        (name, sql_type)
        for name, sql_type in _FORWARD_COLUMNS
        if name not in present_columns
    ]

    index_count = 0
    with engine.begin() as conn:
        for name, sql_type in missing_columns:
            conn.execute(text(f"ALTER TABLE incidents ADD COLUMN {name} {sql_type}"))
        for index_name, table, column in _FORWARD_INDEXES:
            if index_name in present_indexes:
                continue
            # Re-inspect through the open connection so columns added a
            # moment ago in this same transaction are visible.
            live_columns = {meta["name"] for meta in inspect(conn).get_columns(table)}
            if column not in live_columns:
                continue
            conn.execute(text(f"CREATE INDEX {index_name} ON {table} ({column})"))
            index_count += 1
    return {"columns_added": len(missing_columns), "indexes_added": index_count}
async def _load_remote(server_cfg: MCPServerConfig,
                       stack: AsyncExitStack) -> list[BaseTool]:
    """Open a FastMCP client for a non-in-process server and load its tools.

    ``http`` / ``sse`` servers are addressed by ``server_cfg.url`` (with
    optional headers); ``stdio`` servers are launched from
    ``server_cfg.command`` (first element is the executable, the rest are
    its arguments). The client is entered into ``stack``, so it stays open
    until the caller closes that stack.

    Each returned tool is renamed to ``"<server name>:<tool name>"`` and
    keeps its pre-rename name on ``_original_mcp_name``.

    Raises ``ValueError`` for a missing ``url`` / ``command`` or an
    unrecognised transport.
    """
    from fastmcp import Client

    transport = server_cfg.transport
    if transport == "stdio":
        if not server_cfg.command:
            raise ValueError(f"stdio server '{server_cfg.name}' missing 'command'")
        launch_spec = {"command": server_cfg.command[0], "args": server_cfg.command[1:]}
        client = Client(launch_spec)
    elif transport in ("http", "sse"):
        if not server_cfg.url:
            raise ValueError(f"remote server '{server_cfg.name}' missing 'url'")
        client = Client(server_cfg.url, headers=server_cfg.headers or None)
    else:
        raise ValueError(f"Unknown transport: {transport}")

    await stack.enter_async_context(client)
    tools = await load_mcp_tools(client.session)
    # Prefix every tool with its server name so same-named tools from
    # different servers stay distinguishable.
    prefix = server_cfg.name
    for tool in tools:
        bare_name = tool.name
        tool.name = f"{prefix}:{bare_name}"
        tool._original_mcp_name = bare_name  # type: ignore[attr-defined]
    return tools
class SessionCapExceeded(RuntimeError):
    """Signal that a new session was refused because the service is full.

    Raised from ``start_session`` when the in-flight registry already
    holds ``max_concurrent_sessions`` entries. The request is rejected
    outright rather than queued. The limit that was hit is exposed on
    ``cap`` so callers can report it.
    """

    def __init__(self, cap: int) -> None:
        message = (
            f"OrchestratorService at capacity ({cap} concurrent); "
            "reject incoming start_session"
        )
        super().__init__(message)
        self.cap = cap
+ + Thread-safety (HARD-06): ``get_or_create()`` and + ``_reset_singleton()`` serialise singleton mutation through a + class-level ``threading.Lock``. Concurrent first-callers + (Streamlit warmup + FastAPI startup hook racing during process + boot) all observe the same instance — the loser of the race blocks + on the lock briefly, then short-circuits on the + ``_instance is None`` check inside the critical section. + """ + + # Class-level singleton state. Guarded by ``_lock`` so concurrent + # ``get_or_create()`` callers can't double-construct the service. + # Reset on ``shutdown()`` via :meth:`_reset_singleton`. + _lock: threading.Lock = threading.Lock() + _instance: "OrchestratorService | None" = None + + def __init__( + self, + cfg: AppConfig, + max_concurrent_sessions: int | None = None, + ) -> None: + self.cfg = cfg + # Resource cap. Prefer the explicit constructor arg; fall back + # to ``cfg.runtime.max_concurrent_sessions``. Tests mutate this + # attribute directly to drive cap behaviour deterministically. + self.max_concurrent_sessions: int = ( + max_concurrent_sessions + if max_concurrent_sessions is not None + else cfg.runtime.max_concurrent_sessions + ) + self._loop: asyncio.AbstractEventLoop | None = None + self._thread: threading.Thread | None = None + self._started = threading.Event() + # Shared MCP client pool — built lazily on first ``get_mcp_client`` + # so processes that never touch MCP pay zero startup cost. All + # mutations of ``_mcp_clients`` / ``_mcp_locks`` happen on the + # background loop, so the dicts themselves don't need a thread + # lock. + self._mcp_stack: AsyncExitStack | None = None + self._mcp_clients: dict[str, Any] = {} + self._mcp_locks: dict[str, asyncio.Lock] = {} + # Per-server-name asyncio.Lock guarding lazy build. Created on the + # loop the first time the server is requested. 
@classmethod
def get_or_create(cls, cfg: AppConfig) -> "OrchestratorService":
    """Return the process-wide service instance, constructing it if needed.

    The first call builds the instance from ``cfg`` and caches it on the
    class. Every later call returns that cached instance and ignores the
    ``cfg`` it was given. ``_reset_singleton`` clears the cache.

    The lookup and the construction both happen while holding the
    class-level ``_lock``, so concurrent first callers cannot each build
    their own instance.
    """
    with cls._lock:
        existing = cls._instance
        if existing is not None:
            return existing
        created = cls(cfg)
        cls._instance = created
        return created
def submit(
    self, coro: Awaitable[T]
) -> concurrent.futures.Future[T]:
    """Submit a coroutine to the background loop from any thread.

    Returns a ``concurrent.futures.Future`` whose ``.result()`` blocks the
    calling thread until the coroutine resolves on the loop. Safe to call
    concurrently from multiple threads.

    Raises ``RuntimeError`` when the service has not been started or its
    loop is no longer running. In both cases a coroutine object passed in
    is closed before raising: it can never be scheduled, and leaving it
    open makes the interpreter emit "coroutine ... was never awaited" when
    it is garbage-collected.
    """
    loop = self._loop
    if loop is None or not loop.is_running():
        # Only genuine coroutine objects are closed; other awaitables
        # (futures, tasks, custom ``__await__`` objects) are left alone.
        if asyncio.iscoroutine(coro):
            coro.close()
        if loop is None:
            raise RuntimeError(
                "OrchestratorService not started; call start() first"
            )
        raise RuntimeError("OrchestratorService loop is not running")
    # The public signature accepts ``Awaitable[T]`` for caller flexibility,
    # while ``run_coroutine_threadsafe`` requires a coroutine. The cast only
    # informs the type checker; a non-coroutine awaitable still fails at
    # runtime inside ``run_coroutine_threadsafe``.
    return asyncio.run_coroutine_threadsafe(
        cast(Coroutine[Any, Any, T], coro), loop,
    )
async def get_mcp_client(self, server_name: str) -> Any:
    """Return the shared FastMCP client for ``server_name``, building it
    on first request.

    An already-built client is returned straight from the pool. Otherwise
    the build is serialised through a per-server ``asyncio.Lock`` so two
    concurrent requests for the same server cannot both construct a
    client. Built clients are entered into the service-wide
    ``AsyncExitStack`` and are paired with a fresh per-server call lock
    (see ``lock_for``).

    The server name is validated *before* a build lock is allocated, so
    requests for undeclared names leave no entry behind in
    ``_mcp_build_locks``.

    Raises ``KeyError`` if ``server_name`` is not declared in
    ``cfg.mcp.servers`` and no client for it is pooled.
    """
    # Fast path: the pool entry is only written after the client has been
    # fully entered, so a hit here is always a usable client.
    if server_name in self._mcp_clients:
        return self._mcp_clients[server_name]
    server_cfg = next(
        (s for s in self.cfg.mcp.servers if s.name == server_name),
        None,
    )
    if server_cfg is None:
        raise KeyError(
            f"MCP server {server_name!r} not declared in cfg.mcp.servers"
        )
    if server_name not in self._mcp_build_locks:
        self._mcp_build_locks[server_name] = asyncio.Lock()
    async with self._mcp_build_locks[server_name]:
        # Re-check under the lock: another request may have finished the
        # build while this one was waiting.
        if server_name in self._mcp_clients:
            return self._mcp_clients[server_name]
        if self._mcp_stack is None:
            self._mcp_stack = AsyncExitStack()
            await self._mcp_stack.__aenter__()
        client = build_fastmcp_client(server_cfg)
        await self._mcp_stack.enter_async_context(client)
        self._mcp_clients[server_name] = client
        self._mcp_locks[server_name] = asyncio.Lock()
        return client
def start_session(
    self,
    *,
    query: str = "",
    state_overrides: dict | None = None,
    environment: str | None = None,
    submitter: dict | None = None,
    reporter_id: str | None = None,
    reporter_team: str | None = None,
    trigger: Any | None = None,
) -> str:
    """Start a new agent session. Returns the session id immediately.

    The session row is created (and the id minted) on the service loop so
    the caller has a stable handle before this method returns. The graph
    run itself is launched as an ``asyncio.Task`` on the same loop and
    runs in the background -- the caller does **not** block on it. Use
    :meth:`list_active_sessions` to observe in-flight sessions.

    ``state_overrides`` is a free-form dict; only its ``environment`` key
    is read here. ``submitter`` is a free-form dict; only ``id`` and
    ``team`` are read here (defaulting to ``"user-mock"`` and
    ``"platform"``).

    Deprecated kwargs (coerced on the caller's thread):
      * ``environment`` -> ``state_overrides={"environment": ...}``
      * ``reporter_id`` / ``reporter_team`` -> ``submitter``

    ``trigger``, when given, has its ``name`` / ``transport`` /
    ``target_app`` / ``received_at`` recorded under
    ``findings["trigger"]`` on the row before the graph runs.

    Raises :class:`SessionCapExceeded` when ``max_concurrent_sessions``
    sessions are already registered. The cap is checked twice: once
    before the orchestrator is (possibly) built, and again afterwards,
    because the build awaits and other schedulers may have registered
    sessions in the meantime.

    The registry entry is evicted by a ``Task.add_done_callback`` on
    completion, cancellation, or failure.
    """
    # Resolve the generic ``submitter`` and ``state_overrides`` once on
    # the caller's thread, not inside the loop's ``_scheduler``.
    resolved_overrides = _coerce_state_overrides(
        state_overrides, environment,
    )
    resolved_submitter = _coerce_submitter(
        submitter, reporter_id, reporter_team
    )
    sub_id = (resolved_submitter or {}).get("id", "user-mock")
    sub_team = (resolved_submitter or {}).get("team", "platform")
    env = (resolved_overrides or {}).get("environment", "")

    async def _scheduler() -> str:
        # Cheap pre-check: when already at capacity, fail before paying
        # for an orchestrator build. ``SessionCapExceeded`` propagates
        # through ``submit_and_wait`` to the caller.
        if len(self._registry) >= self.max_concurrent_sessions:
            raise SessionCapExceeded(self.max_concurrent_sessions)
        orch = await self._ensure_orchestrator()
        # Authoritative check. The await above can suspend (first-time
        # orchestrator build), letting other schedulers pass the
        # pre-check too. There is no further await between this check
        # and the registry insert below, so the cap cannot be exceeded.
        if len(self._registry) >= self.max_concurrent_sessions:
            raise SessionCapExceeded(self.max_concurrent_sessions)
        # Allocate the row (and its id) before the task exists so
        # ``list_active_sessions`` sees the entry immediately.
        inc = orch.store.create(
            query=query,
            environment=env,
            reporter_id=sub_id,
            reporter_team=sub_team,
        )
        session_id = inc.id
        # Stamp trigger provenance onto the row before the graph runs
        # so a crash mid-graph still leaves an audit trail.
        if trigger is not None:
            try:
                received_at = trigger.received_at.strftime(
                    "%Y-%m-%dT%H:%M:%SZ"
                )
            except Exception:  # noqa: BLE001
                received_at = _utc_iso_now()
            inc.findings["trigger"] = {
                "name": getattr(trigger, "name", None),
                "transport": getattr(trigger, "transport", None),
                "target_app": getattr(trigger, "target_app", None),
                "received_at": received_at,
            }
            orch.store.save(inc)
        entry = _ActiveSession(
            session_id=session_id,
            started_at=_utc_iso_now(),
        )
        self._registry[session_id] = entry

        async def _run() -> None:
            # Fail fast on contention: if another task already holds
            # the session lock, refuse the new turn immediately.
            if orch._locks.is_locked(session_id):
                raise SessionBusy(session_id)
            # Hold the per-session lock for the full graph turn.
            async with orch._locks.acquire(session_id):
                try:
                    await orch.graph.ainvoke(
                        GraphState(
                            session=inc,
                            next_route=None,
                            last_agent=None,
                            error=None,
                        ),
                        config=orch._thread_config(session_id),
                    )
                except asyncio.CancelledError:
                    raise
                except Exception as exc:  # noqa: BLE001
                    # ``GraphInterrupt`` is a pending-approval pause,
                    # not a failure: re-raise it so Task observers see
                    # it, but skip the status='error' write below.
                    try:
                        from langgraph.errors import GraphInterrupt
                        if isinstance(exc, GraphInterrupt):
                            raise
                    except ImportError:  # pragma: no cover
                        pass
                    # Mark the registry entry so a concurrent snapshot
                    # observes the failure before the done-callback
                    # evicts it. The exception stays on the task.
                    e = self._registry.get(session_id)
                    if e is not None:
                        e.status = "error"
                    raise

        task = asyncio.create_task(_run(), name=f"session:{session_id}")
        entry.task = task

        # ``add_done_callback`` fires on the loop thread, so the dict
        # mutation is single-threaded.
        def _evict(_t: asyncio.Task) -> None:
            self._registry.pop(session_id, None)

        task.add_done_callback(_evict)
        return session_id

    return self.submit_and_wait(_scheduler(), timeout=30.0)
def list_active_sessions(self) -> list[dict[str, Any]]:
    """Return a snapshot of the sessions currently in the registry.

    The snapshot is built by a coroutine handed to ``submit_and_wait``
    (5 second timeout), i.e. on the same loop that mutates the registry.
    Each element is a plain ``dict`` with the keys ``session_id``,
    ``status``, ``started_at`` and ``current_agent``, so callers hold no
    reference to the live registry entries.

    Returns an empty list when the registry is empty.
    """

    async def _snapshot() -> list[dict[str, Any]]:
        rows: list[dict[str, Any]] = []
        for entry in self._registry.values():
            rows.append({
                "session_id": entry.session_id,
                "status": entry.status,
                "started_at": entry.started_at,
                "current_agent": entry.current_agent,
            })
        return rows

    return self.submit_and_wait(_snapshot(), timeout=5.0)
async def _cancel_all_sessions(self, grace_seconds: float = 5.0) -> None:
    """Cancel every in-flight session task and wait, bounded, for exit.

    Every registered task is cancelled, then the tasks collectively get
    up to ``grace_seconds`` to finish. Tasks that ignore cancellation are
    simply left pending once the grace period elapses, so one stuck task
    cannot stall the caller. The registry is cleared in every case.

    Exceptions raised by tasks that did finish are retrieved and
    discarded here, which keeps asyncio from logging "Task exception was
    never retrieved" for them later.
    """
    tasks = [e.task for e in self._registry.values() if e.task is not None]
    for t in tasks:
        t.cancel()
    if tasks:
        # ``asyncio.wait`` returns on timeout instead of raising and does
        # not cancel the stragglers a second time.
        finished, _still_running = await asyncio.wait(
            tasks, timeout=grace_seconds,
        )
        for t in finished:
            if not t.cancelled():
                t.exception()
    self._registry.clear()
class EnvelopeMissingError(Exception):
    """Signal that no valid :class:`AgentTurnOutput` could be extracted.

    :func:`parse_envelope_from_result` raises this once both the
    ``result["structured_response"]`` path and the JSON-in-AIMessage
    path have failed.

    The cause is exposed as attributes (``agent``, ``field``) so callers
    can report a precise reason instead of parsing the message text.
    """

    def __init__(self, *, agent: str, field: str, message: str | None = None):
        # An empty or missing message falls back to the canonical text.
        text = message if message else f"envelope_missing: {field} (agent={agent})"
        super().__init__(text)
        self.agent = agent
        self.field = field
def reconcile_confidence(
    envelope_value: float,
    tool_arg_value: float | None,
    *,
    agent: str,
    session_id: str,
    tool_name: str | None,
    tolerance: float = _DEFAULT_TOLERANCE,
) -> float:
    """Reconcile envelope confidence against typed-terminal-tool-arg confidence.

    D-10-03 contract:
    - When ``tool_arg_value`` is None: return envelope value silently.
    - When both present and ``|envelope - tool_arg| <= tolerance``: return
      tool-arg silently (tool-arg wins on the return regardless).
    - When both present and ``|envelope - tool_arg| > tolerance``: log INFO
      with the format below and return tool-arg.

    The boundary is inclusive. Binary floating point cannot represent
    most two-decimal confidences exactly (``abs(0.9 - 0.85)`` evaluates
    to ``0.050000000000000044``), so a difference that is nominally
    *equal* to ``tolerance`` is treated as on-boundary and stays silent.

    Log shape (preserved verbatim for grep-based observability assertions):
    ``runtime.orchestrator: turn.confidence_mismatch agent={a} turn_value={e:.2f} tool_value={t:.2f} tool={tn} session_id={sid}``

    Args:
        envelope_value: Confidence reported in the turn envelope.
        tool_arg_value: Confidence passed to the terminal tool, if any.
        agent: Agent name, used only in the mismatch log record.
        session_id: Session id, used only in the mismatch log record.
        tool_name: Terminal tool name, used only in the log record.
        tolerance: Largest absolute difference that stays silent.

    Returns:
        ``tool_arg_value`` when it is not ``None``, else ``envelope_value``.
    """
    # Function-scope import: keeps this block self-contained without
    # touching the module's import header.
    import math

    if tool_arg_value is None:
        return envelope_value
    gap = abs(envelope_value - tool_arg_value)
    # Absolute-only comparison: rounding noise on values in [0, 1] is many
    # orders of magnitude below 1e-9, while real mismatches are >= 0.01.
    on_boundary = math.isclose(gap, tolerance, rel_tol=0.0, abs_tol=1e-9)
    if gap > tolerance and not on_boundary:
        _LOG.info(
            "turn.confidence_mismatch "
            "agent=%s turn_value=%.2f tool_value=%.2f tool=%s session_id=%s",
            agent,
            envelope_value,
            tool_arg_value,
            tool_name,
            session_id,
        )
    return tool_arg_value
def _now_iso() -> str:
    """Return the current UTC time rendered with ``_UTC_TS_FMT``."""
    current = datetime.now(tz=timezone.utc)
    return current.strftime(_UTC_TS_FMT)
+ + Searches from the end of the list because the pending row is + almost always the most recent ToolCall. + """ + for idx in range(len(tool_calls) - 1, -1, -1): + tc = tool_calls[idx] + if (getattr(tc, "tool", None) == tool_name + and getattr(tc, "ts", None) == ts + and getattr(tc, "status", None) == "pending_approval"): + return idx + return None + + +def _find_existing_pending_index( + tool_calls: list, + tool_name: str, +) -> int | None: + """Find the most recent ``pending_approval`` row for ``tool_name``. + + LangGraph's interrupt/resume model re-runs the gated node from the + top after ``Command(resume=...)``; we re-use the existing pending + row rather than appending a duplicate every time the closure + re-enters the approve branch. + """ + for idx in range(len(tool_calls) - 1, -1, -1): + tc = tool_calls[idx] + if (getattr(tc, "tool", None) == tool_name + and getattr(tc, "status", None) == "pending_approval"): + return idx + return None + + +def _evaluate_gate( + *, + session: Session, + tool_name: str, + gate_policy: GatePolicy | None, + gateway_cfg: GatewayConfig | None, +) -> "GateDecision": + """Phase 11 (FOC-04) bridge: invoke ``should_gate`` from the wrap. + + Constructs a minimal ``ToolCall`` shape for the pure-function + boundary, and a temporary ``OrchestratorConfig`` shim with the + in-flight ``gate_policy`` + ``gateway`` so the pure function sees + a single config object (its declared signature). + + When ``gate_policy`` is ``None`` -- the legacy callers that have + not yet been threaded -- a default ``GatePolicy()`` is used so + Phase-11 behaviour applies uniformly. The default mirrors v1.0 + HITL behaviour (``gated_risk_actions={"approve"}``), so existing + pre-Phase-11 tests keep passing. + """ + # Local imports (avoid cycle on policy.py importing gateway). + # ``GateDecision`` is type-only here -- the lazy import sits in the + # TYPE_CHECKING block at module top. 
class _GatedToolMarker(BaseTool):
    """Marker base class so ``isinstance(t, _GatedToolMarker)`` identifies
    a tool that has already been wrapped by :func:`wrap_tool`. Used to
    short-circuit ``wrap_tool(wrap_tool(t))`` and avoid wrapper recursion.

    Not instantiated directly — every ``_GatedTool`` defined inside
    :func:`wrap_tool` inherits from this and supplies its own ``name``,
    ``description`` and ``_run``.
    """

    # Placeholder field values; the ``_GatedTool`` subclass overrides both
    # with the wrapped tool's LLM-visible name and description.
    name: str = "_gated_marker"
    description: str = "internal — never invoked"

    def _run(self, *args: Any, **kwargs: Any) -> Any:  # pragma: no cover
        # Reaching this means a subclass failed to override ``_run``.
        raise NotImplementedError("marker base — _GatedTool overrides this")
+ + The factory closes over ``session`` and ``gateway_cfg`` so the live + audit log (``session.tool_calls``) is the same instance the rest of + the orchestrator reads — no detour through a separate audit table. + + Returned object is a ``BaseTool`` subclass instance whose ``name`` + and ``description`` mirror the underlying tool, so LangGraph's ReAct + prompt builder still sees the right tool surface. + + Idempotent: wrapping an already-gated tool returns it unchanged so a + second ``wrap_tool(wrap_tool(t))`` does not nest wrappers (which would + cause unbounded recursion when ``_run`` calls ``inner.invoke`` and + that dispatches back into another ``_GatedTool._run``). + + Phase 9 (D-09-01 / D-09-03): when ``injected_args`` is supplied, the + gateway expands ``kwargs`` with session-derived values BEFORE + ``effective_action`` is consulted — so the gateway's risk-rating + sees the canonical ``environment`` (avoiding T-09-05: gateway + misclassifies prod as auto because env was missing from the LLM + args). + """ + if isinstance(base_tool, _GatedToolMarker): + return base_tool + + env = getattr(session, "environment", None) + inner = base_tool + inject_cfg = injected_args or {} + + # Phase 9 (D-09-01): the LLM-visible args_schema on the wrapper must + # exclude every injected key — otherwise BaseTool's input validator + # rejects the call when the LLM omits a "required" arg the framework + # is about to supply. The inner tool keeps its full schema so the + # downstream invoke still sees every kwarg. + if inject_cfg: + + _llm_visible_schema = strip_injected_params( + inner, frozenset(inject_cfg.keys()), + ).args_schema + else: + _llm_visible_schema = inner.args_schema + + # Phase 9 follow-up: compute the set of param names the inner tool + # actually accepts so injection skips keys the target tool doesn't + # declare. 
Without this filter, a config-wide ``injected_args`` + # entry like ``session_id: session.id`` is unconditionally written + # to every tool's kwargs — tools that don't accept ``session_id`` + # then raise pydantic ``unexpected_keyword`` errors at the FastMCP + # validation boundary. ``accepted_params_for_tool`` handles both + # pydantic-model and JSON-Schema-dict ``args_schema`` shapes. + + _accepted_params: frozenset[str] | None = accepted_params_for_tool(inner) + + def _sync_invoke_inner(payload: Any) -> Any: + """Sync-invoke the inner tool, translating BaseTool's + default-``_run`` ``NotImplementedError`` into a clearer message + for native-async-only tools. Without this, callers see a vague + ``NotImplementedError`` from langchain core with no hint that + the right path is ``ainvoke``.""" + try: + return inner.invoke(payload) + except NotImplementedError as exc: + raise NotImplementedError( + f"Tool {inner.name!r} appears to be async-only " + f"(``_run`` not implemented). Use ``ainvoke`` / ``_arun`` " + f"for this tool instead of the sync invoke path." + ) from exc + + # Tool-naming regex differs across LLM providers — Ollama allows + # ``[a-zA-Z0-9_.\-]{1,256}``, OpenAI is stricter at + # ``^[a-zA-Z0-9_-]+$`` (no dots). The framework's internal naming + # uses ``:`` for PVC-08 prefixed-form policy lookups, + # but the LLM only sees the *wrapper*'s ``.name``. Use ``__`` + # (double underscore) as the LLM-visible separator: it satisfies + # both providers' regexes and is unambiguous (no real tool name + # contains a double underscore). ``inner.name`` keeps the colon + # form so ``effective_action`` / ``should_gate`` policy lookups + # stay PVC-08-compliant. + _llm_visible_name = inner.name.replace(":", "__") + + class _GatedTool(_GatedToolMarker): + name: str = _llm_visible_name + description: str = inner.description + # The wrapper does its own arg coercion via the inner tool's schema, + # so no need to copy it here. 
Keep ``args_schema`` aligned with the + # LLM-visible (post-strip) schema so BaseTool's input validator + # accepts the post-strip kwargs the LLM emits. Phase 9 strips + # injected keys here; pre-Phase-9 callers see the full schema. + args_schema: Any = _llm_visible_schema # type: ignore[assignment] + + def _run(self, *args: Any, **kwargs: Any) -> Any: # noqa: D401 + # Phase 9 (D-09-01 / T-09-05): inject session-derived args + # BEFORE the gateway risk lookup so risk-rating sees the + # post-injection environment value. Pure no-op when + # ``injected_args`` is empty. + if inject_cfg: + + kwargs = inject_injected_args( + kwargs, + session=session, + injected_args_cfg=inject_cfg, + tool_name=inner.name, + accepted_params=_accepted_params or None, + ) + # Phase 11 (FOC-04): pure-policy gating boundary. Call + # should_gate to decide whether to pause for HITL approval; + # also call effective_action so the notify-audit branch + # below still fires for medium-risk tools that should NOT + # gate but should record an audit row. + action = effective_action( + inner.name, env=env, gateway_cfg=gateway_cfg, + ) + decision = _evaluate_gate( + session=session, + tool_name=inner.name, + gate_policy=gate_policy, + gateway_cfg=gateway_cfg, + ) + if decision.gate: + from langgraph.types import interrupt + + # Persist a ``pending_approval`` ToolCall row BEFORE + # raising GraphInterrupt so the approval-timeout watchdog + # has a record to scan. ``ts`` is the moment the human + # approval window opened. Stored args mirror the post- + # decision rows so the audit history reads consistently. + # + # On resume, LangGraph re-enters this node and runs us + # again from the top — so we must re-use the existing + # pending row instead of appending a duplicate. The most + # recent ``pending_approval`` row for this tool wins. 
+ pending_args = dict(kwargs) if kwargs else {"args": list(args)} + existing_idx = _find_existing_pending_index( + session.tool_calls, inner.name, + ) + if existing_idx is not None: + pending_ts = session.tool_calls[existing_idx].ts + else: + pending_ts = _now_iso() + session.tool_calls.append( + ToolCall( + agent=agent_name, + tool=inner.name, + args=pending_args, + result=None, + ts=pending_ts, + risk="high", + status="pending_approval", + ) + ) + # CRITICAL: persist the pending_approval row BEFORE + # raising interrupt() so the approval-timeout + # watchdog (which reads from the DB) and the + # /approvals UI can see the pending state. Without + # this save the in-memory mutation is invisible to + # any out-of-process observer. + if store is not None: + store.save(session) + payload = { + "kind": "tool_approval", + "tool": inner.name, + "args": kwargs or args, + "tool_call_id": kwargs.get("tool_call_id"), + } + # First execution: raises GraphInterrupt, checkpointer pauses. + # Resume: returns whatever Command(resume=...) supplied. + decision = interrupt(payload) + # Decision payload may be a string ("approve" / "reject" / + # "timeout") or a dict {decision, approver, rationale}. + if isinstance(decision, dict): + verdict = decision.get("decision", "approve") + approver = decision.get("approver") + rationale = decision.get("rationale") + else: + verdict = decision or "approve" + approver = None + rationale = None + # Update the pending_approval row in place rather than + # appending a second audit entry. The watchdog and the + # /approvals UI both reason about a single audit row per + # high-risk call. 
+ pending_idx = _find_pending_index( + session.tool_calls, inner.name, pending_ts, + ) + verdict_str = str(verdict).lower() + if verdict_str == "reject": + if pending_idx is not None: + session.tool_calls[pending_idx] = ToolCall( + agent=agent_name, + tool=inner.name, + args=pending_args, + result={"rejected": True, "rationale": rationale}, + ts=pending_ts, + risk="high", + status="rejected", + approver=approver, + approved_at=_now_iso(), + approval_rationale=rationale, + ) + return {"rejected": True, "rationale": rationale} + if verdict_str == "timeout": + # The approval window expired. Do NOT run the tool; + # mark the audit row ``status="timeout"`` so + # downstream consumers (UI, retraining) can + # distinguish operator-initiated rejections from + # automatic timeouts. + if pending_idx is not None: + session.tool_calls[pending_idx] = ToolCall( + agent=agent_name, + tool=inner.name, + args=pending_args, + result={"timeout": True, "rationale": rationale}, + ts=pending_ts, + risk="high", + status="timeout", + approver=approver, + approved_at=_now_iso(), + approval_rationale=rationale, + ) + return {"timeout": True, "rationale": rationale} + # Approved -> run the tool, then update the audit row. + result = _sync_invoke_inner(kwargs if kwargs else args[0] if args else {}) + if pending_idx is not None: + session.tool_calls[pending_idx] = ToolCall( + agent=agent_name, + tool=inner.name, + args=pending_args, + result=result, + ts=pending_ts, + risk="high", + status="approved", + approver=approver, + approved_at=_now_iso(), + approval_rationale=rationale, + ) + return result + + # auto / notify both run the tool now. 
+ result = _sync_invoke_inner(kwargs if kwargs else args[0] if args else {}) + + if action == "notify": + session.tool_calls.append( + ToolCall( + agent=agent_name, + tool=inner.name, + args=dict(kwargs) if kwargs else {"args": list(args)}, + result=result, + ts=_now_iso(), + risk="medium", + status="executed_with_notify", + ) + ) + return result + + async def _arun(self, *args: Any, **kwargs: Any) -> Any: # noqa: D401 + # Phase 9 (D-09-01 / T-09-05): inject session-derived args + # BEFORE the gateway risk lookup. Mirror of the sync ``_run``. + if inject_cfg: + + kwargs = inject_injected_args( + kwargs, + session=session, + injected_args_cfg=inject_cfg, + tool_name=inner.name, + accepted_params=_accepted_params or None, + ) + # Phase 11 (FOC-04): pure-policy gating boundary. Mirror of + # the sync ``_run`` -- consult should_gate via + # ``_evaluate_gate``; still call ``effective_action`` to + # keep the notify-audit branch for medium-risk tools. + action = effective_action( + inner.name, env=env, gateway_cfg=gateway_cfg, + ) + decision = _evaluate_gate( + session=session, + tool_name=inner.name, + gate_policy=gate_policy, + gateway_cfg=gateway_cfg, + ) + if decision.gate: + from langgraph.types import interrupt + + # Persist a ``pending_approval`` audit row BEFORE the + # GraphInterrupt fires so the watchdog can spot stale + # approvals. See the sync ``_run`` mirror for details. 
+ pending_args = dict(kwargs) if kwargs else {"args": list(args)} + existing_idx = _find_existing_pending_index( + session.tool_calls, inner.name, + ) + if existing_idx is not None: + pending_ts = session.tool_calls[existing_idx].ts + else: + pending_ts = _now_iso() + session.tool_calls.append( + ToolCall( + agent=agent_name, + tool=inner.name, + args=pending_args, + result=None, + ts=pending_ts, + risk="high", + status="pending_approval", + ) + ) + # CRITICAL: persist the pending_approval row BEFORE + # raising interrupt() so the approval-timeout + # watchdog (which reads from the DB) and the + # /approvals UI can see the pending state. + if store is not None: + store.save(session) + payload = { + "kind": "tool_approval", + "tool": inner.name, + "args": kwargs or args, + "tool_call_id": kwargs.get("tool_call_id"), + } + decision = interrupt(payload) + if isinstance(decision, dict): + verdict = decision.get("decision", "approve") + approver = decision.get("approver") + rationale = decision.get("rationale") + else: + verdict = decision or "approve" + approver = None + rationale = None + pending_idx = _find_pending_index( + session.tool_calls, inner.name, pending_ts, + ) + verdict_str = str(verdict).lower() + if verdict_str == "reject": + if pending_idx is not None: + session.tool_calls[pending_idx] = ToolCall( + agent=agent_name, + tool=inner.name, + args=pending_args, + result={"rejected": True, "rationale": rationale}, + ts=pending_ts, + risk="high", + status="rejected", + approver=approver, + approved_at=_now_iso(), + approval_rationale=rationale, + ) + return {"rejected": True, "rationale": rationale} + if verdict_str == "timeout": + if pending_idx is not None: + session.tool_calls[pending_idx] = ToolCall( + agent=agent_name, + tool=inner.name, + args=pending_args, + result={"timeout": True, "rationale": rationale}, + ts=pending_ts, + risk="high", + status="timeout", + approver=approver, + approved_at=_now_iso(), + approval_rationale=rationale, + ) + return 
def strip_injected_params(
    tool: BaseTool,
    injected_keys: frozenset[str],
) -> BaseTool:
    """Return a tool whose ``args_schema`` omits every name in ``injected_keys``.

    The LLM is shown the reduced signature; the framework supplies the
    real values at call time via :func:`inject_injected_args` (D-09-01).

    Guarantees:

    * **Pure.** ``tool`` and its ``args_schema`` are never mutated; a
      clone carrying the reduced schema is returned instead.
    * **Identity short-circuit.** The very same ``tool`` object comes
      back when ``injected_keys`` is empty, when the tool has no
      ``args_schema``, when the schema is neither a JSON-Schema ``dict``
      nor an object exposing ``model_fields``, or when none of the
      injected keys appear in the schema.

    Two schema shapes are handled: a JSON-Schema ``dict`` (``properties``
    and ``required`` are filtered) and a pydantic model class (a new
    model is built from the surviving fields).
    """

    def _with_schema(replacement: Any):
        # Preferred path: ``model_copy`` clones the tool with the new
        # schema. Objects that cannot do that get a shallow copy instead.
        try:
            return tool.model_copy(update={"args_schema": replacement})
        except Exception:  # pragma: no cover — defensive fallback
            import copy
            clone = copy.copy(tool)
            clone.args_schema = replacement  # type: ignore[attr-defined]
            return clone

    if not injected_keys:
        return tool
    schema = getattr(tool, "args_schema", None)
    if schema is None:
        return tool

    # --- JSON-Schema dict (FastMCP-style tools) ---------------------------
    if isinstance(schema, dict):
        properties = schema.get("properties", {})
        if injected_keys.isdisjoint(properties):
            return tool
        visible_props = {
            key: spec
            for key, spec in properties.items()
            if key not in injected_keys
        }
        visible_required = [
            key for key in schema.get("required", [])
            if key not in injected_keys
        ]
        reduced: dict[str, Any] = {
            **schema,
            "properties": visible_props,
            "required": visible_required,
        }
        return _with_schema(reduced)

    # --- pydantic model class ----------------------------------------------
    if not hasattr(schema, "model_fields"):
        return tool
    if injected_keys.isdisjoint(schema.model_fields.keys()):
        # Nothing to strip — hand back the original object, no clone.
        return tool

    # ``create_model`` takes ``(annotation, FieldInfo)`` pairs; passing the
    # original FieldInfo carries defaults and descriptions over to the
    # reduced model.
    surviving: dict[str, tuple[Any, Any]] = {}
    for field_name, info in schema.model_fields.items():
        if field_name in injected_keys:
            continue
        surviving[field_name] = (info.annotation, info)
    reduced_model = create_model(
        f"{schema.__name__}__StrippedForLLM",
        __base__=BaseModel,
        **surviving,  # type: ignore[arg-type]
    )
    return _with_schema(reduced_model)
def inject_injected_args(
    tool_args: dict[str, Any],
    *,
    session: Session,
    injected_args_cfg: dict[str, str],
    tool_name: str,
    accepted_params: set[str] | frozenset[str] | None = None,
) -> dict[str, Any]:
    """Build a fresh kwargs dict with session-derived args filled in.

    For each ``arg_name -> dotted_path`` pair in ``injected_args_cfg``
    (D-09-03):

    * ``tool_args`` itself is never touched; work happens on a copy,
      which is returned.
    * If ``accepted_params`` is given and the arg is not in it, the arg
      is not injected. Any value already present under that name is
      removed and an INFO ``tool_call.injected_arg_dropped`` record is
      emitted.
    * Otherwise the path is resolved with ``_resolve_dotted``. A ``None``
      resolution leaves the arg exactly as it was (absent stays absent).
    * A non-``None`` resolution always wins. When it replaces a
      *different* existing value, an INFO
      ``tool_call.injected_arg_overridden`` record is emitted first.

    ``tool_name`` and ``session.id`` (``"?"`` when missing) appear only
    in the log records.
    """
    merged = dict(tool_args)
    filtering = accepted_params is not None
    for key, dotted_path in injected_args_cfg.items():
        if filtering and key not in accepted_params:
            # The target tool does not declare this param: never inject
            # it, and do not forward a stray value for it either.
            if key in merged:
                _LOG.info(
                    "tool_call.injected_arg_dropped tool=%s arg=%s "
                    "llm_value=%r reason=not_accepted_by_tool session_id=%s",
                    tool_name,
                    key,
                    merged[key],
                    getattr(session, "id", "?"),
                )
                del merged[key]
            continue

        resolved = _resolve_dotted(session, dotted_path)
        if resolved is None:
            continue

        conflicts = key in merged and merged[key] != resolved
        if conflicts:
            _LOG.info(
                "tool_call.injected_arg_overridden tool=%s arg=%s "
                "llm_value=%r framework_value=%r session_id=%s",
                tool_name,
                key,
                merged[key],
                resolved,
                getattr(session, "id", "?"),
            )
        merged[key] = resolved
    return merged
+ """ + schema = getattr(tool, "args_schema", None) + if schema is None: + return None + if hasattr(schema, "model_fields"): + return frozenset(schema.model_fields.keys()) + if isinstance(schema, dict): + props = schema.get("properties") + if isinstance(props, dict): + return frozenset(props.keys()) + return None + + +__all__ = [ + "strip_injected_params", + "inject_injected_args", + "accepted_params_for_tool", + "_LOG", +] + +# ====== module: runtime/tools/approval_watchdog.py ====== + +if TYPE_CHECKING: + pass +logger = logging.getLogger(__name__) + +_UTC_TS_FMT = "%Y-%m-%dT%H:%M:%SZ" + +# Sessions whose status is in this set are *not* candidates for the +# watchdog — either they never paused for approval, or they have already +# moved past it. ``awaiting_input`` is the only status produced by +# ``langgraph.types.interrupt()`` while a high-risk gate is open. +_TERMINAL_STATUSES = frozenset({ + "resolved", "stopped", "escalated", "duplicate", "deleted", "error", +}) + + +def _parse_iso(ts: str | None) -> datetime | None: + """Parse an ISO-8601 ``YYYY-MM-DDTHH:MM:SSZ`` ts back into UTC. + + Returns ``None`` for malformed values; callers treat that as + "skip this row" so the watchdog never crashes on a bad audit + record. + """ + if not ts: + return None + try: + # Replace trailing 'Z' so ``fromisoformat`` accepts it on + # Python <3.11. The format is fixed by ``_UTC_TS_FMT`` so this + # round-trips cleanly. + if ts.endswith("Z"): + ts = ts[:-1] + "+00:00" + dt = datetime.fromisoformat(ts) + if dt.tzinfo is None: + dt = dt.replace(tzinfo=timezone.utc) + return dt.astimezone(timezone.utc) + except (ValueError, TypeError): + return None + + +class ApprovalWatchdog: + """Background asyncio task that resumes stale pending-approval sessions. + + Owned by :class:`runtime.service.OrchestratorService`; started in + ``OrchestratorService.start()`` and stopped in ``shutdown()``. 
async def stop(self) -> None:
    """Signal the polling loop to exit and wait (briefly) for it to end.

    HARD-07: idempotent and abrupt-shutdown safe. Safe to call:

    * before ``start()`` -- there is no task, so the bookkeeping is
      cleared and the call returns;
    * multiple times, or concurrently from two callers -- the first
      caller flips ``_stopped`` and owns the drain; every later caller
      sees ``_stopped`` and returns immediately, so the task is never
      awaited twice.

    Shutdown sequence:

    1. Set ``_stop_event`` so the polling loop can leave its
       ``wait_for`` cleanly.
    2. Wait up to 1 second for the task to finish. The task is wrapped
       in ``asyncio.shield`` so a timeout of this wait does not itself
       cancel the task.
    3. On timeout (or if this wait is cancelled), call ``task.cancel()``
       and wait up to 1 more second for it to unwind.
    4. ``TimeoutError`` / ``CancelledError`` from either wait are
       suppressed -- there is no useful recovery from a watchdog that
       will not die.

    ``_task`` and ``_stop_event`` are always reset to ``None`` on the
    way out so ``is_running`` reports ``False`` and a later ``start()``
    arms cleanly.

    NOTE(review): ``_stopped`` is not protected by a lock; this relies
    on ``stop()`` only ever running on the event-loop thread -- confirm
    against the callers in ``OrchestratorService``.
    """
    # First call wins. Later callers (and the after-shutdown path) see
    # ``_stopped`` and return without re-running the drain, which
    # protects against a double-await on ``_task``.
    if self._stopped:
        return
    self._stopped = True
    # Snapshot into LOCAL variables. ``_task`` / ``_stop_event`` are
    # deliberately NOT cleared until after the drain: the event object
    # must stay alive and set while the polling loop is still winding
    # down, otherwise the loop could observe a ``None`` event and fail
    # with ``AttributeError`` -- the noisy teardown this is meant to
    # prevent.
    task = self._task
    stop_event = self._stop_event
    if stop_event is not None:
        stop_event.set()
    if task is None or task.done():
        # Never started, or already finished: nothing to drain.
        self._task = None
        self._stop_event = None
        return
    try:
        # Graceful path: give the loop one second to notice the event.
        await asyncio.wait_for(asyncio.shield(task), timeout=1.0)
    except (asyncio.TimeoutError, asyncio.CancelledError):
        # The task ignored the event (or we are being torn down):
        # escalate to cancellation and give it one final bounded drain.
        task.cancel()
        try:
            await asyncio.wait_for(task, timeout=1.0)
        except (asyncio.TimeoutError, asyncio.CancelledError):
            # Task is wedged or the loop is shutting down under us.
            # The ``cancel()`` call above is enough to flip the task
            # state; ``run_loop`` 's final ``gather`` pass will sweep
            # it during loop teardown. Don't block shutdown further.
            pass
    finally:
        # Always clear the bookkeeping refs so a subsequent
        # ``start()`` arms cleanly and ``is_running`` reports False.
        self._task = None
        self._stop_event = None
async def run_once(self) -> int:
    """Run a single scan pass and return the number of sessions resumed.

    Exposed publicly so tests can drive the watchdog deterministically
    without waiting on the polling cadence.

    For every session id currently in the service registry the pass:

    1. loads the session from ``orch.store`` (a load failure is logged
       at DEBUG and the session is skipped -- one unreadable row must
       not stop the scan);
    2. skips it unless its status is ``"awaiting_input"``;
    3. skips it unless at least one ``pending_approval`` tool call is
       older than the configured approval timeout;
    4. resumes it with a synthetic timeout decision.

    Returns ``0`` when the service has no orchestrator yet or the
    registry is empty. A ``SessionBusy`` at resume time is expected
    contention and is skipped quietly; any other resume failure is
    logged with a traceback and the scan continues.
    """
    orch = getattr(self._service, "_orch", None)
    if orch is None:
        return 0
    # Snapshot the registry so sessions registered or removed while we
    # await below cannot change the mapping mid-iteration.
    registry = dict(self._service._registry)
    if not registry:
        return 0
    now = datetime.now(timezone.utc)
    resumed = 0
    for session_id in registry:
        try:
            inc = orch.store.load(session_id)
        except Exception:  # noqa: BLE001
            # Best-effort: keep scanning, but leave a trace so a
            # persistently unreadable session is diagnosable.
            logger.debug(
                "approval watchdog: could not load session %s, skipping",
                session_id,
                exc_info=True,
            )
            continue
        status = getattr(inc, "status", None)
        if status in _TERMINAL_STATUSES:
            continue
        if status != "awaiting_input":
            # Only sessions paused on a high-risk gate are watchdog
            # candidates. ``in_progress`` / ``new`` are still
            # actively running on the loop.
            continue
        if not self._find_stale_pending(inc, now):
            continue
        # No is_locked() peek here -- try_acquire (inside
        # _resume_with_timeout) is the single contention check, so
        # there is no TOCTOU window between check and acquire. The
        # SessionBusy handler below fires on real contention.
        try:
            await self._resume_with_timeout(orch, session_id)
        except SessionBusy:
            logger.debug(
                "approval watchdog: session %s SessionBusy at resume, skipping",
                session_id,
            )
        except Exception:  # noqa: BLE001
            logger.exception(
                "approval watchdog: resume failed for session %s",
                session_id,
            )
        else:
            resumed += 1
    return resumed
def should_gate(
    session: Any,
    tool_call: "ToolCall",
    confidence: float | None,
    cfg: "OrchestratorConfig",
) -> GateDecision:
    """Return whether ``tool_call`` must pause for human approval, and why.

    The tool's risk action is looked up through ``effective_action``
    using ``session.environment`` (``None`` when the session has no such
    attribute -- ``session`` is typed ``Any`` because apps add that
    field in their own subclasses) and ``cfg.gateway`` (``None`` when
    the config has no ``gateway`` attribute).

    Checks run in this order; the first hit decides:

    1. risk action listed in ``cfg.gate_policy.gated_risk_actions``
       -> gate, ``"high_risk_tool"``;
    2. environment listed in ``cfg.gate_policy.gated_environments`` and
       the risk action is not ``"auto"`` -> gate, ``"gated_env"``;
    3. confidence below ``cfg.gate_policy.confidence_threshold`` and the
       risk action is not ``"auto"`` -> gate, ``"low_confidence"``;
    4. otherwise -> no gate, ``"auto"``.

    ``confidence=None`` means "no signal yet" and is evaluated as 1.0 so
    a turn that has not reported confidence is not gated for it.
    """
    env = getattr(session, "environment", None)
    risk_action = effective_action(
        tool_call.tool,
        env=env,
        gateway_cfg=getattr(cfg, "gateway", None),
    )

    policy = cfg.gate_policy
    actionable = risk_action != "auto"

    reason: GateReason
    if risk_action in policy.gated_risk_actions:
        # 1. High-risk tools gate unconditionally.
        reason = "high_risk_tool"
    elif env in policy.gated_environments and actionable:
        # 2. Any non-"auto" action inside a gated environment.
        reason = "gated_env"
    elif (
        (1.0 if confidence is None else confidence) < policy.confidence_threshold
        and actionable
    ):
        # 3. Low confidence only matters for an actionable tool.
        reason = "low_confidence"
    else:
        return GateDecision(gate=False, reason="auto")
    return GateDecision(gate=True, reason=reason)
def should_retry(
    retry_count: int,
    error: Exception | None,
    confidence: float | None,
    cfg: "OrchestratorConfig",
) -> RetryDecision:
    """Return whether a failed turn should be auto-retried, and why.

    Pure: the decision depends only on the arguments.

    Rules, evaluated top to bottom (first match wins):

    1. ``retry_count >= cfg.retry_policy.max_retries``
       -> no retry, ``"max_retries_exceeded"``.
    2. ``error`` is one of ``_PERMANENT_TYPES``
       -> no retry, ``"permanent_error"``.
    3. ``confidence`` is set, is below
       ``cfg.retry_policy.retry_low_confidence_threshold``, and
       ``error`` is not transient
       -> no retry, ``"low_confidence_no_retry"``.
    4. ``error`` is not transient (this includes ``error=None``)
       -> no retry, ``"permanent_error"`` -- the fail-closed default
       (D-12-02).
    5. ``error`` is transient and
       ``cfg.retry_policy.retry_on_transient`` is truthy
       -> retry, ``"auto_retry"``.
    6. ``error`` is transient but transient retries are switched off
       -> no retry, ``"transient_disabled"``.

    ``retry_count`` is the number of PRIOR retries (0 on the first
    attempt); the caller owns the increment. ``confidence=None`` means
    no confidence was recorded and disables rule 3.
    """
    policy = cfg.retry_policy

    def _deny(reason: RetryReason) -> RetryDecision:
        return RetryDecision(retry=False, reason=reason)

    # Absolute cap, regardless of what failed.
    if retry_count >= policy.max_retries:
        return _deny("max_retries_exceeded")

    # Schema/validation-style failures are never worth repeating.
    if _is_permanent_error(error):
        return _deny("permanent_error")

    transient = _is_transient_error(error)
    low_confidence = (
        confidence is not None
        and confidence < policy.retry_low_confidence_threshold
    )

    # Low confidence only blocks non-transient failures: a transient
    # error is mechanical and says nothing about the model's decision.
    if low_confidence and not transient:
        return _deny("low_confidence_no_retry")

    # Unclassified (or missing) errors fail closed.
    if not transient:
        return _deny("permanent_error")

    if policy.retry_on_transient:
        return RetryDecision(retry=True, reason="auto_retry")
    return _deny("transient_disabled")
+ + + async def node(state: GraphState) -> dict: + incident: Session = state["session"] # pyright: ignore[reportTypedDictNotRequiredAccess] + inc_id = incident.id + started_at = datetime.now(timezone.utc).strftime(_UTC_TS_FMT) + + # Wrap tools per-invocation so each wrap closes over the + # live ``Session`` for this run. + if gateway_cfg is not None: + run_tools = [ + wrap_tool(t, session=incident, gateway_cfg=gateway_cfg, + agent_name=skill.name, store=store, + gate_policy=gate_policy) + for t in tools + ] + else: + run_tools = tools + # Phase 10 (FOC-03 / D-10-02) + Phase 15 (LLM-COMPAT-01): every + # responsive agent invocation is wrapped in an AgentTurnOutput + # envelope. ``langchain.agents.create_agent`` (the non-deprecated + # successor to ``langgraph.prebuilt.create_react_agent``) accepts a + # bare schema as ``response_format`` and, by default, wraps it in + # ``AutoStrategy`` — ProviderStrategy for models with native + # structured-output (OpenAI-class), falling back to ToolStrategy + # otherwise (Ollama). ToolStrategy injects AgentTurnOutput as a + # callable tool: when the LLM ``calls`` it, the loop terminates on + # the same turn with ``result["structured_response"]`` populated. + # Eliminates the old two-call structure (loop + separate + # ``with_structured_output`` pass) that hit recursion_limit=25 on + # Ollama models without true function-calling. + agent_executor = create_agent( + model=llm, + tools=run_tools, + system_prompt=skill.system_prompt, + response_format=AgentTurnOutput, + ) + + # Phase 11 (FOC-04): reset per-turn confidence hint at the + # start of each agent step so the gateway treats the first + # tool call of the turn as "no signal yet". 
+ try: + incident.turn_confidence_hint = None + except (AttributeError, ValueError): + pass + + try: + result = await _ainvoke_with_retry( + agent_executor, + {"messages": [HumanMessage(content=_format_agent_input(incident))]}, + ) + except GraphInterrupt: + # Phase 11 (FOC-04 / D-11-04): HITL pause -- propagate up. + raise + except Exception as exc: # noqa: BLE001 + return _handle_agent_failure( + skill_name=skill.name, started_at=started_at, exc=exc, + inc_id=inc_id, store=store, fallback=incident, + ) + + # Tools (e.g. registered patch tools) write straight to disk. + # Reload so the node's own append of agent_run + tool_calls + # happens against the tool-mutated state. + incident = store.load(inc_id) + + messages = result.get("messages", []) + ts = datetime.now(timezone.utc).strftime(_UTC_TS_FMT) + + agent_confidence, agent_rationale, agent_signal = _harvest_tool_calls_and_patches( + messages, skill.name, incident, ts, valid_signals, + terminal_tool_names=terminal_tool_names, + patch_tool_names=patch_tool_names, + ) + # Phase 11 (FOC-04): update hint so any subsequent in-turn + # tool call sees the harvested confidence. + if agent_confidence is not None: + try: + incident.turn_confidence_hint = agent_confidence + except (AttributeError, ValueError): + pass + _pair_tool_responses(messages, incident) + + # Phase 10 (FOC-03 / D-10-03): parse envelope; reconcile against + # any typed-terminal-tool-arg confidence. Envelope failure is a + # structured agent_run error. 
def _safe_eval(expr: str, ctx: dict[str, Any]) -> Any:
    """Evaluate a dispatch-rule expression with ``ctx`` as its namespace.

    Expressions are expected to have been validated when the skill was
    loaded; the check is repeated here via ``_validate_safe_expr``
    before anything is compiled, so an expression that reaches this
    point by another route is still screened.

    Evaluation uses ``eval`` with ``__builtins__`` emptied out, as a
    second line of defence (no ``__import__`` and friends) should the
    AST check miss something.

    NOTE(review): how a rejected expression surfaces depends on
    ``_validate_safe_expr``, which is not visible here. The in-file
    caller ``_rule_pick_target`` catches any exception and skips the
    rule.
    """
    _validate_safe_expr(expr, source="supervisor.dispatch_rule")
    # No builtins in globals; names resolve only through ``ctx``.
    stripped_globals: dict[str, Any] = {"__builtins__": {}}
    compiled = compile(expr, "", "eval")
    return eval(compiled, stripped_globals, ctx)  # noqa: S307 — AST-whitelisted
def _reply_text(content: Any) -> str:
    """Flatten a chat-model reply ``content`` value into plain text.

    Handles a plain string, or a list/tuple whose items are strings or
    dicts carrying a string ``"text"`` entry; other items are ignored.
    Any other non-empty value is stringified.
    """
    if not content:
        return ""
    if isinstance(content, str):
        return content
    if isinstance(content, (list, tuple)):
        pieces: list[str] = []
        for block in content:
            if isinstance(block, str):
                pieces.append(block)
            elif isinstance(block, dict) and isinstance(block.get("text"), str):
                pieces.append(block["text"])
        return " ".join(pieces)
    return str(content)


def _llm_pick_target(
    *,
    skill: Skill,
    llm: BaseChatModel,
    incident: Session,
) -> str:
    """One-shot LLM dispatch: ask the model to choose a subordinate.

    The model is shown ``skill.dispatch_prompt`` plus the list of
    subordinates and the JSON-serialised session, and is asked to reply
    with only one agent name.

    Matching is case-insensitive:

    * a reply that is exactly one subordinate's name selects that
      subordinate (so ``"triage_deep"`` is not mistaken for a sibling
      named ``"triage"``);
    * otherwise the first subordinate, in declaration order, whose name
      appears anywhere in the reply is selected.

    The function never raises for a bad reply: if the LLM call fails,
    or the reply names no subordinate, a warning is logged and the
    first subordinate is returned so the graph keeps moving.
    """
    prompt = (
        f"{skill.dispatch_prompt}\n\n"
        f"Choose ONE of: {', '.join(skill.subordinates)}.\n"
        f"Reply with only the agent name."
    )
    payload = json.dumps(incident.model_dump(), default=str)
    msgs = [
        SystemMessage(content=prompt),
        HumanMessage(content=payload),
    ]
    try:
        result = llm.invoke(msgs)
    except Exception as exc:  # noqa: BLE001
        logger.warning(
            "supervisor %s: LLM dispatch failed (%s); falling back to %s",
            skill.name, exc, skill.subordinates[0],
        )
        return skill.subordinates[0]
    # ``content`` may be a list of content blocks rather than a string;
    # flatten it before matching instead of assuming ``str``.
    text = _reply_text(getattr(result, "content", "")).strip().lower()
    # Exact reply first: avoids picking a shorter name that happens to
    # be a substring of the name the model actually gave.
    for name in skill.subordinates:
        if name.lower() == text:
            return name
    for name in skill.subordinates:
        if name.lower() in text:
            return name
    logger.warning(
        "supervisor %s: LLM reply %r did not name a subordinate; "
        "falling back to %s", skill.name, text, skill.subordinates[0],
    )
    return skill.subordinates[0]
+ + Falls back to the first subordinate when no rule matches; the + fallback case carries ``matched_when=None`` so the audit log can + distinguish "default" from "rule X matched". + """ + ctx = _ctx_for_session(incident) + for rule in skill.dispatch_rules: + try: + if bool(_safe_eval(rule.when, ctx)): + return rule.target, rule.when + except Exception as exc: # noqa: BLE001 + logger.warning( + "supervisor %s: dispatch_rule %r raised %s; skipping", + skill.name, rule.when, exc, + ) + return skill.subordinates[0], None + + +def _normalize_runner_route(value: Any) -> str: + """Map runner-supplied route aliases to the canonical graph end token. + + Apps writing runners reach for ``"END"`` / ``"end"`` / ``"__end__"`` + interchangeably; LangGraph's conditional edges only recognise + ``"__end__"``. Normalising here keeps the runner contract permissive + without spreading the alias check across the graph layer. + """ + if isinstance(value, str) and value.strip().lower() in {"end", "__end__"}: + return "__end__" + return value + + +def make_supervisor_node( + *, + skill: Skill, + llm: BaseChatModel | None = None, + framework_cfg: Any | None = None, +): + """Build the supervisor LangGraph node. + + Pure routing: no ``AgentRun`` row, no tool execution, no token + accounting beyond what the optional LLM call itself reports. The + node sets ``state["next_route"]`` to a subordinate name and returns; + LangGraph's conditional edges fan out to that node from there. + + The optional ``llm`` is only used when ``skill.dispatch_strategy`` + is ``"llm"``. Callers using ``"rule"`` may pass ``None``. + + When ``skill.runner`` is set, the dotted-path callable is resolved + at build time and invoked at the start of each node call BEFORE the + routing dispatch. The runner gets the live ``GraphState`` and the + optional ``framework_cfg`` and may return ``None`` (continue with + the routing table) or a dict patch that gets merged into state. 
A + patch carrying ``"next_route"`` short-circuits the routing table + entirely (use ``"__end__"`` to terminate the graph). + """ + # Local import to avoid the circular runtime.graph -> runtime.agents + # cycle at module-load time. + + + if skill.kind != "supervisor": + raise ValueError( + f"make_supervisor_node called with non-supervisor skill " + f"{skill.name!r} (kind={skill.kind!r})" + ) + + runner: Callable[..., Any] | None = None + if skill.runner is not None: + if callable(skill.runner): + # Test stubs and composed runners may supply a live callable + # directly rather than a dotted-path string. Access via the + # class __dict__ to avoid Python binding it as an instance + # method when the skill is a plain object (not a Pydantic model). + raw = vars(type(skill)).get("runner", skill.runner) + runner = raw if callable(raw) else skill.runner + else: + # Resolved a second time here so a runner that fails to import + # at graph-build time still surfaces a clear error. The skill + # validator catches most issues at YAML load; this is belt-and- + # braces and also gives us the live callable to invoke. + runner = _resolve_dotted_callable( + skill.runner, source=f"supervisor {skill.name!r} runner" + ) + + async def node(state: GraphState) -> dict: + sess: Session = state["session"] # pyright: ignore[reportTypedDictNotRequiredAccess] + # ``dispatch_depth`` is an extension field on GraphState; start + # at 0 and increment per supervisor entry. 
+ depth = int(state.get("dispatch_depth") or 0) + 1 + if depth > skill.max_dispatch_depth: + logger.warning( + "supervisor %s: dispatch depth %d exceeds limit %d; aborting", + skill.name, depth, skill.max_dispatch_depth, + ) + return { + "session": sess, + "next_route": "__end__", + "last_agent": skill.name, + "dispatch_depth": depth, + "error": ( + f"supervisor {skill.name!r}: max_dispatch_depth " + f"{skill.max_dispatch_depth} exceeded" + ), + } + + # ----- App-supplied runner hook ------------------------------- + runner_patch: dict[str, Any] = {} + if runner is not None: + # Build a thin proxy so the runner can reach intake_context + # (and any other framework_cfg attributes) without needing + # framework_cfg to be mutable. The proxy exposes intake_context + # directly and falls back to framework_cfg for all other attrs. + _app_cfg_proxy = type("_RunnerAppCfg", (), { + "intake_context": getattr(framework_cfg, "intake_context", None), + "__getattr__": lambda self, name: getattr(framework_cfg, name), + })() + try: + result = runner(state, app_cfg=_app_cfg_proxy) + except Exception as exc: # noqa: BLE001 + logger.exception( + "supervisor %s: runner %s raised; aborting to __end__", + skill.name, skill.runner, + ) + return { + "session": sess, + "next_route": "__end__", + "last_agent": skill.name, + "dispatch_depth": depth, + "error": ( + f"supervisor {skill.name!r}: runner failed: {exc}" + ), + } + if isinstance(result, dict): + runner_patch = dict(result) + elif result is not None: + logger.warning( + "supervisor %s: runner returned %s (expected dict|None); " + "ignoring", skill.name, type(result).__name__, + ) + override = runner_patch.pop("next_route", None) + if override is not None: + # Short-circuit: skip the routing table entirely. Audit + # log still fires so operators can trace the decision. + target = _normalize_runner_route(override) + # Pick up any fresh reference the runner returned. 
+ sess = runner_patch.get("session", sess) + try: + payload_size = len( + json.dumps(sess.model_dump(), default=str) + ) + except Exception: # noqa: BLE001 — defensive + payload_size = 0 + log_supervisor_dispatch( + session=sess, + supervisor=skill.name, + strategy=f"runner:{skill.runner}", + depth=depth, + targets=[target], + rule_matched=None, + payload_size=payload_size, + ) + out: dict[str, Any] = { + "session": sess, + "next_route": target, + "last_agent": skill.name, + "dispatch_depth": depth, + "error": None, + } + # Merge any non-route keys the runner returned (e.g. + # extra GraphState fields apps want to carry forward). + for k, v in runner_patch.items(): + if k not in out: + out[k] = v + return out + # No override: fold any payload mutation back so the + # routing table sees the up-to-date object. + if "session" in runner_patch: + sess = runner_patch["session"] + + rule_matched: str | None = None + if skill.dispatch_strategy == "rule": + target, rule_matched = _rule_pick_target(skill=skill, incident=sess) + else: # "llm" + if llm is None: + logger.warning( + "supervisor %s: strategy=llm but no llm provided; " + "falling back to first subordinate", skill.name, + ) + target = skill.subordinates[0] + else: + target = _llm_pick_target(skill=skill, llm=llm, incident=sess) + + # Audit: one structured log entry per dispatch. + try: + payload_size = len(json.dumps(sess.model_dump(), default=str)) + except Exception: # noqa: BLE001 — defensive; size is a hint + payload_size = 0 + log_supervisor_dispatch( + session=sess, + supervisor=skill.name, + strategy=skill.dispatch_strategy, + depth=depth, + targets=[target], + rule_matched=rule_matched, + payload_size=payload_size, + ) + + out: dict[str, Any] = { + "session": sess, + "next_route": target, + "last_agent": skill.name, + "dispatch_depth": depth, + "error": None, + } + # Carry through any extra keys the runner emitted that the + # framework didn't consume itself (e.g. memory snapshots). 
+ for k, v in runner_patch.items(): + if k not in out: + out[k] = v + return out + + return node + + +__all__ = ["make_supervisor_node", "log_supervisor_dispatch"] + +# ====== module: runtime/agents/monitor.py ====== + +logger = logging.getLogger(__name__) + + +# --------------------------------------------------------------------------- +# Safe-eval evaluator +# --------------------------------------------------------------------------- + + +class SafeEvalError(Exception): + """Raised when a supposedly-validated expression fails to evaluate.""" + + +def safe_eval(expr: str, ctx: dict[str, Any]) -> Any: + """Evaluate ``expr`` against ``ctx`` after a fresh AST whitelist check. + + The skill loader validates ``emit_signal_when`` at parse time; we + re-validate here on every call to keep the threat model defensive + against any future code path that might construct a Skill bypassing + the loader's validators. + """ + _validate_safe_expr(expr, source="monitor.emit_signal_when") + code = compile(expr, "", "eval") + try: + return eval(code, {"__builtins__": {}}, ctx) # noqa: S307 — AST-whitelisted + except Exception as exc: # noqa: BLE001 + raise SafeEvalError(f"emit_signal_when {expr!r} raised: {exc}") from exc + + +# --------------------------------------------------------------------------- +# Cron parsing (minute-resolution; matches Skill._validate_cron grammar) +# --------------------------------------------------------------------------- + + +def _expand_cron_field(field: str, lo: int, hi: int) -> set[int]: + """Expand a single cron field into the set of int values it matches. + + Supports ``*``, ``*/n``, ``a``, ``a-b``, ``a-b/n``, and + comma-separated combinations of those — the grammar accepted by + :func:`runtime.skill._validate_cron`. 
+ """ + out: set[int] = set() + for part in field.split(","): + step = 1 + if "/" in part: + base, _, step_s = part.partition("/") + step = int(step_s) + else: + base = part + if base == "*": + start, end = lo, hi + elif "-" in base: + a, _, b = base.partition("-") + start, end = int(a), int(b) + else: + v = int(base) + start, end = v, v + out.update(range(start, end + 1, step)) + return {v for v in out if lo <= v <= hi} + + +def _cron_matches(expr: str, when: datetime) -> bool: + """Return True if the given datetime satisfies the 5-field cron expression. + + Fields: minute, hour, day-of-month, month, day-of-week (0=Mon..6=Sun + — Python's ``datetime.weekday()`` convention; cron itself uses + 0=Sun, but for our minute-resolution scheduler the convention only + needs to be internally consistent and documented). + """ + minute, hour, dom, month, dow = expr.split() + return ( + when.minute in _expand_cron_field(minute, 0, 59) + and when.hour in _expand_cron_field(hour, 0, 23) + and when.day in _expand_cron_field(dom, 1, 31) + and when.month in _expand_cron_field(month, 1, 12) + and when.weekday() in _expand_cron_field(dow, 0, 6) + ) + + +# --------------------------------------------------------------------------- +# Monitor callable factory +# --------------------------------------------------------------------------- + + +def make_monitor_callable( + *, + skill: Skill, + observe_fn: Callable[[str], Any], + fire_trigger: Callable[[str, dict[str, Any]], None], +) -> Callable[[], None]: + """Build the callable a :class:`MonitorRunner` runs per tick. + + ``observe_fn(tool_name)`` is the seam through which the runner + invokes a tool. Production wires this to the orchestrator's MCP + tool registry; tests wire it to deterministic stubs. + + ``fire_trigger(name, payload)`` is the seam through which the + runner fires a trigger. Production wires this to the trigger + registry; tests wire it to a recorder. 
+ + The returned callable is intentionally synchronous and exception- + safe: a failed ``observe_fn`` or ``fire_trigger`` is logged and + swallowed so one bad monitor cannot stall the runner. + """ + if skill.kind != "monitor": + raise ValueError( + f"make_monitor_callable called with non-monitor skill " + f"{skill.name!r} (kind={skill.kind!r})" + ) + + def tick() -> None: + observation: dict[str, Any] = {} + for tool_name in skill.observe: + try: + observation[tool_name] = observe_fn(tool_name) + except Exception as exc: # noqa: BLE001 + logger.warning( + "monitor %s: observe tool %r raised %s; skipping", + skill.name, tool_name, exc, + ) + observation[tool_name] = None + ctx = { + "observation": observation, + "obs": observation, + } + try: + should_emit = bool(safe_eval(skill.emit_signal_when or "False", ctx)) + except SafeEvalError as exc: + logger.warning("monitor %s: %s", skill.name, exc) + return + if not should_emit: + return + try: + fire_trigger(skill.trigger_target or "", { + "monitor": skill.name, + "observation": observation, + }) + except Exception as exc: # noqa: BLE001 + logger.warning( + "monitor %s: fire_trigger(%s) raised %s", + skill.name, skill.trigger_target, exc, + ) + + return tick + + +# --------------------------------------------------------------------------- +# MonitorRunner — orchestrator-level singleton +# --------------------------------------------------------------------------- + + +class _RegisteredMonitor: + __slots__ = ("skill", "callable_", "next_run_ts") + + def __init__(self, skill: Skill, callable_: Callable[[], None]) -> None: + self.skill = skill + self.callable_ = callable_ + # Track the last *scheduled* minute we fired so we never fire + # twice for the same wall-clock minute even if the scheduler + # thread oversleeps. + self.next_run_ts: datetime | None = None + + +class MonitorRunner: + """Owns a bounded thread pool and a scheduler thread that ticks + registered monitor skills on their cron schedules. 
+ + Exactly one ``MonitorRunner`` exists per ``OrchestratorService`` + instance; the runner is built at service startup and shut down at + service teardown. + + Concurrency: each tick is dispatched to the + :class:`~concurrent.futures.ThreadPoolExecutor` so the scheduler + thread itself never blocks on a slow ``observe`` tool. The pool + size defaults to ``4`` (R6); each tick has a per-monitor timeout + sourced from the skill's ``tick_timeout_seconds``. + """ + + def __init__( + self, + *, + observe_fn: Callable[[str], Any], + fire_trigger: Callable[[str, dict[str, Any]], None], + max_workers: int = 4, + clock: Callable[[], datetime] | None = None, + ) -> None: + self._observe_fn = observe_fn + self._fire_trigger = fire_trigger + self._executor = ThreadPoolExecutor( + max_workers=max_workers, + thread_name_prefix="monitor", + ) + self._monitors: dict[str, _RegisteredMonitor] = {} + self._stop = threading.Event() + self._thread: threading.Thread | None = None + self._lock = threading.Lock() + # Injection seam for tests; default uses real wall-clock UTC. 
+ self._clock = clock or (lambda: datetime.now(timezone.utc)) + + # ----- registration ----- + + def register(self, skill: Skill) -> None: + if skill.kind != "monitor": + raise ValueError( + f"MonitorRunner.register: skill {skill.name!r} kind=" + f"{skill.kind!r} (expected 'monitor')" ) - return Client( - {"command": server_cfg.command[0], "args": server_cfg.command[1:]} + callable_ = make_monitor_callable( + skill=skill, + observe_fn=self._observe_fn, + fire_trigger=self._fire_trigger, ) - raise ValueError(f"Unknown transport: {server_cfg.transport}") + with self._lock: + if skill.name in self._monitors: + raise ValueError(f"monitor {skill.name!r} already registered") + self._monitors[skill.name] = _RegisteredMonitor(skill, callable_) + def unregister(self, name: str) -> None: + with self._lock: + self._monitors.pop(name, None) -async def _load_in_process(server_cfg: MCPServerConfig, - stack: AsyncExitStack) -> list[BaseTool]: - if server_cfg.module is None: - raise ValueError(f"in_process server '{server_cfg.name}' missing 'module'") - mod = importlib.import_module(server_cfg.module) - fmcp = getattr(mod, "mcp", None) - if fmcp is None: - raise ValueError(f"Module {server_cfg.module} has no 'mcp' (FastMCP instance)") - # FastMCP exposes tools as functions; convert to langchain tools via adapter. - # We use the in-memory client transport. The client is registered into the - # caller's exit stack so its session/transport stays open while the loaded - # tools are in use. - from fastmcp import Client - client = Client(fmcp) - await stack.enter_async_context(client) - tools = await load_mcp_tools(client.session) - # Rewrite each tool's .name to ":" for LLM disambiguation. 
- for t in tools: - original_name = t.name - t.name = f"{server_cfg.name}:{original_name}" - t._original_mcp_name = original_name # type: ignore[attr-defined] - return tools + def registered(self) -> list[str]: + with self._lock: + return sorted(self._monitors.keys()) + # ----- lifecycle ----- -async def _load_remote(server_cfg: MCPServerConfig, - stack: AsyncExitStack) -> list[BaseTool]: - from fastmcp import Client - if server_cfg.transport in ("http", "sse"): - if not server_cfg.url: - raise ValueError(f"remote server '{server_cfg.name}' missing 'url'") - client = Client(server_cfg.url, headers=server_cfg.headers or None) - elif server_cfg.transport == "stdio": - if not server_cfg.command: - raise ValueError(f"stdio server '{server_cfg.name}' missing 'command'") - client = Client({"command": server_cfg.command[0], "args": server_cfg.command[1:]}) - else: - raise ValueError(f"Unknown transport: {server_cfg.transport}") - await stack.enter_async_context(client) - tools = await load_mcp_tools(client.session) - # Rewrite each tool's .name to ":" for LLM disambiguation. - for t in tools: - original_name = t.name - t.name = f"{server_cfg.name}:{original_name}" - t._original_mcp_name = original_name # type: ignore[attr-defined] - return tools + def start(self) -> None: + if self._thread is not None and self._thread.is_alive(): + return + self._stop.clear() + self._thread = threading.Thread( + target=self._run, + name="MonitorRunner", + daemon=True, + ) + self._thread.start() + def stop(self, *, wait: bool = True, timeout: float = 5.0) -> None: + """Halt the scheduler thread and shut down the executor. -async def load_tools(cfg: MCPConfig, stack: AsyncExitStack) -> ToolRegistry: - """Load all enabled MCP servers and return a :class:`ToolRegistry`. + ``wait=True`` (default) blocks up to ``timeout`` seconds for + in-flight ticks to drain. Daemon threads are still joined so + pytest fixture teardown is deterministic. 
+ """ + self._stop.set() + thread = self._thread + if thread is not None and thread.is_alive() and wait: + thread.join(timeout=timeout) + self._executor.shutdown(wait=wait) + self._thread = None - The caller MUST pass an already-entered :class:`AsyncExitStack`. Each - FastMCP ``Client`` is registered into it; the caller controls lifetime via - ``await stack.aclose()``. - """ - registry = ToolRegistry() - for server_cfg in cfg.servers: - if not server_cfg.enabled: - continue - if server_cfg.transport == "in_process": - tools = await _load_in_process(server_cfg, stack) - else: - tools = await _load_remote(server_cfg, stack) - for t in tools: - original = getattr(t, "_original_mcp_name", t.name) - registry.add(ToolEntry( - name=original, description=t.description or "", - server=server_cfg.name, category=server_cfg.category, tool=t, - )) - return registry + # ----- test hook ----- + + def tick_once(self, when: datetime | None = None) -> None: + """Fire any monitors whose cron expression matches ``when``. + + Useful in tests where freezing wall-clock time is awkward; the + production scheduler loop calls this internally too. + """ + when = when or self._clock() + # Truncate to the minute so identical seconds within a minute + # don't fire the same monitor twice. + minute = when.replace(second=0, microsecond=0) + with self._lock: + entries = list(self._monitors.values()) + for entry in entries: + try: + if not _cron_matches(entry.skill.schedule or "* * * * *", minute): + continue + except Exception as exc: # noqa: BLE001 + logger.warning( + "monitor %s: cron parse failed (%s); skipping tick", + entry.skill.name, exc, + ) + continue + if entry.next_run_ts == minute: + # Already fired this minute; idempotent on oversleep. 
+ continue + entry.next_run_ts = minute + self._dispatch(entry) + + def _dispatch(self, entry: _RegisteredMonitor) -> None: + timeout = float(entry.skill.tick_timeout_seconds or 30.0) + future = self._executor.submit(entry.callable_) + + def _wait_and_log() -> None: + try: + future.result(timeout=timeout) + except FuturesTimeout: + logger.warning( + "monitor %s: tick exceeded %.1fs timeout", + entry.skill.name, timeout, + ) + except Exception as exc: # noqa: BLE001 + logger.warning( + "monitor %s: tick raised %s", entry.skill.name, exc, + ) + + # Watcher runs on a side thread so the scheduler loop never + # blocks waiting for a slow tick — the executor handles + # parallelism, the watcher handles per-tick timeout reporting. + threading.Thread( + target=_wait_and_log, + name=f"monitor-watch:{entry.skill.name}", + daemon=True, + ).start() + + # ----- scheduler loop ----- + + def _run(self) -> None: + """Single-threaded scheduler. Wakes once per second, fires + any monitor whose cron expression matches the current minute, + marks each fired monitor for the minute so we never fire + twice if we oversleep. + """ + while not self._stop.is_set(): + try: + self.tick_once() + except Exception as exc: # noqa: BLE001 — never crash the loop + logger.warning("MonitorRunner loop error: %s", exc) + # Sleep with frequent wakeups so stop() returns promptly. 
+ self._stop.wait(timeout=1.0) + + +__all__ = [ + "MonitorRunner", + "SafeEvalError", + "make_monitor_callable", + "safe_eval", +] # ====== module: runtime/graph.py ====== @@ -3951,7 +8311,21 @@ async def _ainvoke_with_retry(executor, input_, *, max_attempts: int = 3, last_exc: Exception | None = None for attempt in range(max_attempts): try: + # Phase 15 (LLM-COMPAT-01): the recursion_limit=25 workaround + # introduced in 3ba099f as a safety net is gone — the + # ``langchain.agents.create_agent`` migration replaces the + # old two-call structure (loop + separate + # ``with_structured_output`` pass) with a single tool-loop + # whose terminal signal is the AgentTurnOutput tool call + # itself (AutoStrategy → ToolStrategy fallback for non- + # function-calling Ollama models). The default langgraph + # recursion bound is now a true upper bound, not a workaround. return await executor.ainvoke(input_) + except GraphInterrupt: + # Phase 11 (FOC-04 / D-11-04): never retry a HITL pause. + # GraphInterrupt is a checkpointed pending_approval signal, + # not a transient error. + raise except Exception as exc: # noqa: BLE001 msg = str(exc).lower() transient = any(m in msg for m in _TRANSIENT_MARKERS) @@ -4119,6 +8493,30 @@ def _extract_final_text(messages: list) -> str: return "" +def _first_terminal_tool_called_this_turn( + messages: list, + terminal_tool_names: frozenset[str], +) -> str | None: + """Return the bare name of the first typed-terminal tool called this turn. + + Phase 10 (FOC-03 / D-10-03): used to label the reconciliation log so + operators can correlate envelope-vs-tool-arg confidence divergences + against a specific tool. Tool names may be MCP-prefixed + (``:``); we rsplit on the rightmost colon to recover the + bare name and match against the configured ``terminal_tool_names``. + Returns None when no terminal tool fired this turn. 
+ """ + if not terminal_tool_names: + return None + for msg in messages: + for tc in (getattr(msg, "tool_calls", None) or []): + name = tc.get("name", "") + bare = name.rsplit(":", 1)[-1] + if bare in terminal_tool_names: + return bare + return None + + def _sum_token_usage(messages: list) -> TokenUsage: """Sum input/output token counts across all messages that report usage_metadata.""" agent_in = agent_out = 0 @@ -4133,6 +8531,50 @@ def _sum_token_usage(messages: list) -> TokenUsage: ) +def _try_recover_envelope_from_raw(raw: str) -> AgentTurnOutput | None: + """Attempt to extract an :class:`AgentTurnOutput` from a raw LLM + string when LangGraph's structured-output pass raised + ``OutputParserException``. + + Strategy: + 1. Parse the whole string as JSON. + 2. If that fails, scan for the first balanced ``{...}`` substring + and try parsing that (handles markdown-fenced JSON or trailing + chatter). + 3. Validate the parsed dict against :class:`AgentTurnOutput`. + + Returns the parsed envelope on success, ``None`` on any failure. 
+ """ + if not raw or not raw.strip(): + return None + candidates: list[str] = [raw] + # Markdown-fenced JSON: ```json\n{...}\n``` + if "```" in raw: + for chunk in raw.split("```"): + stripped = chunk.strip() + if stripped.startswith("json"): + stripped = stripped[4:].lstrip() + if stripped.startswith("{"): + candidates.append(stripped) + # Greedy: first '{' through last '}' + first = raw.find("{") + last = raw.rfind("}") + if 0 <= first < last: + candidates.append(raw[first:last + 1]) + for candidate in candidates: + try: + payload = json.loads(candidate) + except (json.JSONDecodeError, ValueError): + continue + if not isinstance(payload, dict): + continue + try: + return AgentTurnOutput.model_validate(payload) + except Exception: # noqa: BLE001 + continue + return None + + def _handle_agent_failure( *, skill_name: str, @@ -4207,6 +8649,8 @@ def make_agent_node( gateway_cfg: GatewayConfig | None = None, terminal_tool_names: frozenset[str] = frozenset(), patch_tool_names: frozenset[str] = frozenset(), + injected_args: dict[str, str] | None = None, + gate_policy: "GatePolicy | None" = None, ) -> Callable[[GraphState], Awaitable[dict]]: """Factory: build a LangGraph node that runs a ReAct agent and decides a route. @@ -4227,6 +8671,14 @@ def make_agent_node( union ``OrchestratorConfig.harvest_terminal_tools`` / ``OrchestratorConfig.patch_tools``). Empty defaults preserve the "no harvester recognition" behavior for legacy callers. + + ``injected_args`` (Phase 9 / D-09-01) is the orchestrator-wide + map of ``arg_name -> dotted_path`` declared in + :attr:`OrchestratorConfig.injected_args`. Every entry is stripped + from each tool's LLM-visible signature (so the LLM cannot emit a + value for it) and re-supplied at invocation time from session + state. When ``None`` or empty, tools pass through to the LLM + unchanged — preserves legacy callers and the framework default. 
""" async def node(state: GraphState) -> dict: @@ -4234,32 +8686,165 @@ async def node(state: GraphState) -> dict: inc_id = incident.id started_at = datetime.now(timezone.utc).strftime(_UTC_TS_FMT) + # Phase 9 (D-09-01): strip injected-arg keys from every tool's + # LLM-visible signature BEFORE create_react_agent serialises the + # tool surface — so the LLM literally cannot emit values for + # those params. The framework re-supplies them at invocation + # time inside the gateway (or an inject-only wrapper) below. + + injected_keys = frozenset((injected_args or {}).keys()) + if injected_keys: + visible_tools = [ + strip_injected_params(t, injected_keys) for t in tools + ] + else: + visible_tools = tools + # Wrap tools per-invocation so each wrap closes over the live # ``Session`` for this run. When the gateway is unconfigured, # the original tools pass through untouched and # ``create_react_agent`` sees the same surface as before. if gateway_cfg is not None: + # Pass ORIGINAL tools (pre-strip) to wrap_tool — the gateway + # wrapper strips internally for the LLM-visible schema while + # keeping ``inner.args_schema`` intact so + # ``accepted_params_for_tool`` correctly recognises injected + # keys (e.g. ``environment``) as accepted by the underlying + # tool. Stripping twice (here AND in wrap_tool) hides those + # keys from ``accepted_params``, the inject step skips them, + # and FastMCP rejects the call as missing required arg. run_tools = [ wrap_tool(t, session=incident, gateway_cfg=gateway_cfg, - agent_name=skill.name, store=store) + agent_name=skill.name, store=store, + injected_args=injected_args or {}, + gate_policy=gate_policy) for t in tools ] + elif injected_keys: + # No gateway, but injected_args is configured — wrap each + # tool in an inject-only ``StructuredTool`` so the LLM-visible + # sig matches ``visible_tools`` while the underlying call + # still receives the framework-supplied values. 
+ from langchain_core.tools import StructuredTool + + _inject_cfg = injected_args or {} + + def _make_inject_only_wrapper( + base: BaseTool, llm_visible: BaseTool, sess: Session, + ) -> BaseTool: + async def _arun(**kwargs: Any) -> Any: + new_kwargs = _inject_args( + kwargs, + session=sess, + injected_args_cfg=_inject_cfg, + tool_name=base.name, + ) + return await base.ainvoke(new_kwargs) + + def _run(**kwargs: Any) -> Any: + new_kwargs = _inject_args( + kwargs, + session=sess, + injected_args_cfg=_inject_cfg, + tool_name=base.name, + ) + return base.invoke(new_kwargs) + + return StructuredTool.from_function( + func=_run, + coroutine=_arun, + name=base.name, + description=base.description, + args_schema=llm_visible.args_schema, + ) + + run_tools = [ + _make_inject_only_wrapper(orig, vis, incident) + for orig, vis in zip(tools, visible_tools) + ] else: - run_tools = tools - agent_executor = create_react_agent( - llm, run_tools, prompt=skill.system_prompt, + run_tools = visible_tools + # Phase 10 (FOC-03 / D-10-02) + Phase 15 (LLM-COMPAT-01): every + # responsive agent invocation is wrapped in an AgentTurnOutput + # envelope. ``langchain.agents.create_agent`` (the non-deprecated + # successor to ``langgraph.prebuilt.create_react_agent``) accepts a + # bare schema as ``response_format`` and, by default, wraps it in + # ``AutoStrategy`` — ProviderStrategy for models with native + # structured-output (OpenAI-class), falling back to ToolStrategy + # otherwise (Ollama). ToolStrategy injects AgentTurnOutput as a + # callable tool: when the LLM ``calls`` it, the loop terminates on + # the same turn with ``result["structured_response"]`` populated. + # Eliminates the old two-call structure (loop + separate + # ``with_structured_output`` pass) that hit recursion_limit=25 on + # Ollama models without true function-calling. 
+ agent_executor = create_agent( + model=llm, + tools=run_tools, + system_prompt=skill.system_prompt, + response_format=AgentTurnOutput, ) + # Phase 11 (FOC-04): reset per-turn confidence hint. The hint + # is updated below after _harvest_tool_calls_and_patches; on + # re-entry from a HITL pause the hint resets cleanly so a new + # turn starts from "no signal yet" (None). + try: + incident.turn_confidence_hint = None + except (AttributeError, ValueError): + pass + try: result = await _ainvoke_with_retry( agent_executor, {"messages": [HumanMessage(content=_format_agent_input(incident))]}, ) + except GraphInterrupt: + # Phase 11 (FOC-04 / D-11-04): HITL pause is NOT an error. + # Re-raise so LangGraph's checkpointer captures the paused + # state. Session.status is left to the orchestrator's + # interrupt-aware bridge, NOT _handle_agent_failure. + raise except Exception as exc: # noqa: BLE001 - return _handle_agent_failure( - skill_name=skill.name, started_at=started_at, exc=exc, - inc_id=inc_id, store=store, fallback=incident, - ) + # Phase 10 follow-up: when LangGraph's structured-output pass + # raises ``OutputParserException`` (Ollama / non-OpenAI + # providers don't always honor ``response_format`` cleanly), + # try to recover by parsing the raw LLM output ourselves. + # The exception's ``llm_output`` carries the model's reply + # verbatim; if it contains JSON matching the envelope schema, + # build a synthetic ``result`` and continue. On unrecoverable + # failure, log the raw output for diagnosis and fall through + # to ``_handle_agent_failure``. 
+ try: + from langchain_core.exceptions import OutputParserException + except ImportError: # pragma: no cover — langchain always present + OutputParserException = () # type: ignore[assignment] + if isinstance(exc, OutputParserException): + raw = getattr(exc, "llm_output", "") or "" + logger.warning( + "agent.structured_output_parse_failure agent=%s " + "raw_len=%d raw_preview=%r", + skill.name, len(raw), raw[:500], + ) + recovered = _try_recover_envelope_from_raw(raw) + if recovered is not None: + logger.info( + "agent.structured_output_recovered agent=%s", + skill.name, + ) + result = { + "messages": [], + "structured_response": recovered, + } + else: + return _handle_agent_failure( + skill_name=skill.name, started_at=started_at, exc=exc, + inc_id=inc_id, store=store, fallback=incident, + ) + else: + return _handle_agent_failure( + skill_name=skill.name, started_at=started_at, exc=exc, + inc_id=inc_id, store=store, fallback=incident, + ) # Tools (e.g. registered patch tools) write straight to disk. # Reload so the node's own append of agent_run + tool_calls @@ -4277,18 +8862,51 @@ async def node(state: GraphState) -> dict: terminal_tool_names=terminal_tool_names, patch_tool_names=patch_tool_names, ) + # Phase 11 (FOC-04): update hint so any subsequent in-turn + # tool call sees the harvested confidence at the gateway. + if agent_confidence is not None: + try: + incident.turn_confidence_hint = agent_confidence + except (AttributeError, ValueError): + pass # Pair tool responses with their tool calls. _pair_tool_responses(messages, incident) + # Phase 10 (FOC-03 / D-10-03): parse the structural envelope and + # reconcile its confidence against any typed-terminal-tool arg + # confidence harvested above. Envelope failure is a hard error — + # mark the agent_run failed with structured cause. 
+ try: + envelope = parse_envelope_from_result(result, agent=skill.name) + except EnvelopeMissingError as exc: + return _handle_agent_failure( + skill_name=skill.name, started_at=started_at, exc=exc, + inc_id=inc_id, store=store, fallback=incident, + ) + + terminal_tool_for_log = _first_terminal_tool_called_this_turn( + messages, terminal_tool_names, + ) + final_confidence = reconcile_confidence( + envelope.confidence, + agent_confidence, + agent=skill.name, + session_id=inc_id, + tool_name=terminal_tool_for_log, + ) + final_rationale = agent_rationale or envelope.confidence_rationale + final_signal = agent_signal if agent_signal is not None else envelope.signal + # Final summary text and token usage. - final_text = _extract_final_text(messages) + # Envelope content takes precedence over last AIMessage scrape. + final_text = envelope.content or _extract_final_text(messages) usage = _sum_token_usage(messages) _record_success_run( incident=incident, skill_name=skill.name, started_at=started_at, final_text=final_text, usage=usage, - confidence=agent_confidence, rationale=agent_rationale, signal=agent_signal, + confidence=final_confidence, rationale=final_rationale, signal=final_signal, store=store, ) next_route_signal = decide_route(incident) @@ -4324,6 +8942,16 @@ def _decide_from_signal(inc: Session) -> str: "resolution": "Proposed fix: restart api service. Auto-applied. INC resolved.", } +# Phase 10 (FOC-03): per-agent default envelope confidence for the stub +# LLM. Pre-Phase-10 the deep_investigator stub emitted no confidence at +# all, so the gate (threshold 0.75) always interrupted on the first +# call. Post-Phase-10 every agent must emit a confidence value — drive +# DI's stub envelope below threshold to preserve gate-pause behavior in +# existing tests. Other agents default to 0.85 (above threshold). 
+_DEFAULT_STUB_ENVELOPE_CONFIDENCE: dict[str, float] = { + "deep_investigator": 0.30, +} + def _latest_run_for(incident: Session, agent_name: str | None): """Return the most recent ``AgentRun`` for ``agent_name``, or None. @@ -4492,6 +9120,10 @@ def _build_agent_nodes(*, cfg: AppConfig, skills: dict, store: SessionStore, valid_signals = frozenset(cfg.orchestrator.signals) gateway_cfg = getattr(cfg.runtime, "gateway", None) + # Phase 11 (FOC-04): thread the orchestrator's gate_policy down to + # wrap_tool so should_gate can apply the configured per-app + # confidence threshold + gated environments / risk actions. + gate_policy = getattr(cfg.orchestrator, "gate_policy", None) # Build the harvester's tool-name sets once per graph-build. The # union of ``terminal_tools`` (status-transitioning) and # ``harvest_terminal_tools`` (harvest-only) gives the full @@ -4510,7 +9142,10 @@ def _build_agent_nodes(*, cfg: AppConfig, skills: dict, store: SessionStore, if kind == "supervisor": llm = None if skill.dispatch_strategy == "llm": - llm = get_llm(cfg.llm, skill.model, role=agent_name) + llm = get_llm( + cfg.llm, skill.model, role=agent_name, + default_llm_request_timeout=cfg.orchestrator.default_llm_request_timeout, + ) nodes[agent_name] = make_supervisor_node(skill=skill, llm=llm) continue # Default / "responsive" path. @@ -4520,11 +9155,16 @@ def _build_agent_nodes(*, cfg: AppConfig, skills: dict, store: SessionStore, stub_canned = {agent_name: _DEFAULT_STUB_CANNED[agent_name]} else: stub_canned = None + # Phase 10 (FOC-03): wire a per-agent default envelope confidence + # into the stub so pre-Phase-10 gate-pause-on-DI tests still pass. 
+ stub_env_conf = _DEFAULT_STUB_ENVELOPE_CONFIDENCE.get(agent_name) llm = get_llm( cfg.llm, skill.model, role=agent_name, stub_canned=stub_canned, + stub_envelope_confidence=stub_env_conf, + default_llm_request_timeout=cfg.orchestrator.default_llm_request_timeout, ) tools = registry.resolve(skill.tools, cfg.mcp) decide = _decide_from_signal @@ -4535,6 +9175,8 @@ def _build_agent_nodes(*, cfg: AppConfig, skills: dict, store: SessionStore, gateway_cfg=gateway_cfg, terminal_tool_names=terminal_tool_names, patch_tool_names=patch_tool_names, + injected_args=cfg.orchestrator.injected_args, + gate_policy=gate_policy, ) return nodes @@ -4631,6 +9273,10 @@ async def build_graph(*, cfg: AppConfig, skills: dict, store: SessionStore, ) else: framework_cfg = getattr(cfg, "framework", None) or resolve_framework_app_config(None) + # ``resolve_framework_app_config(None)`` always returns a bare + # ``FrameworkAppConfig`` (never None), so the chain above is + # exhaustive — assert for pyright's flow narrowing. + assert framework_cfg is not None gated_edges = _collect_gated_edges(skills) sg = StateGraph(GraphState) @@ -4697,7 +9343,11 @@ async def make_postgres_checkpointer( enclosing transaction would otherwise hold the row lock until explicit commit. """ - from langgraph.checkpoint.postgres.aio import AsyncPostgresSaver + # ``langgraph-checkpoint-postgres`` is an optional extra (declared + # under [project.optional-dependencies].postgres in pyproject) so + # the wheel is not present in CI's SQLite-only install. The module + # is only imported on the Postgres URL branch in production. + from langgraph.checkpoint.postgres.aio import AsyncPostgresSaver # pyright: ignore[reportMissingImports] from psycopg_pool import AsyncConnectionPool # Translate SQLAlchemy URL -> libpq connection string. 
SQLAlchemy @@ -5065,7 +9715,10 @@ def resolve_transform(path: str) -> Callable[..., dict]: raise TypeError( f"transform {path!r} did not resolve to a callable; got {obj!r}" ) - return obj + # Apps own the strict signature -- the framework only enforces + # ``callable``. The cast satisfies the declared return type without + # adding a runtime wrapper. + return cast(Callable[..., dict], obj) # ====== module: runtime/triggers/idempotency.py ====== @@ -5105,7 +9758,9 @@ def __init__(self, engine: Engine) -> None: self._engine = engine # Ensure the table exists even if the orchestrator hasn't run # ``Base.metadata.create_all`` yet (early lifespan path). - Base.metadata.create_all(engine, tables=[IdempotencyRow.__table__]) + # ``IdempotencyRow.__table__`` is a ``Table`` at runtime; the + # SQLAlchemy stub types it as the wider ``FromClause``. + Base.metadata.create_all(engine, tables=[IdempotencyRow.__table__]) # pyright: ignore[reportArgumentType] self._lru: dict[str, OrderedDict[str, str]] = {} self._lock = threading.Lock() @@ -5225,7 +9880,10 @@ def purge_expired(self) -> int: ) ) s.commit() - return result.rowcount or 0 + # ``rowcount`` is exposed on ``CursorResult`` (the concrete + # return of DML execute); the abstract ``Result`` stub does + # not declare it. + return result.rowcount or 0 # pyright: ignore[reportAttributeAccessIssue] # ------------------------------------------------------------------ # Internals @@ -5599,7 +10257,12 @@ def create( f"but no transport with that kind is registered " f"(known: {sorted(plugin_kinds)})" ) - transports.append(kind_cls(pcfg)) + # Plugin transports inherit from the abstract + # ``TriggerTransport`` (no positional args declared on the + # ABC) but every concrete subclass loaded via the entry- + # point registry must accept the plugin's config object. + # The ABC mismatch is a stub limitation, not a runtime bug. 
+ transports.append(kind_cls(pcfg)) # pyright: ignore[reportCallIssue] return cls(specs, transports, start_session_fn, idempotency) @@ -7180,6 +11843,112 @@ async def try_acquire(self, session_id: str) -> AsyncIterator[None]: slot.owner = None slot.lock.release() +# ====== module: runtime/skill_validator.py ====== + +class SkillValidationError(RuntimeError): + """Raised when skill YAML references a tool or route that does not + exist or is malformed. Refuses to start the orchestrator.""" + + +def _build_bare_to_full_map(registered_tools: set[str]) -> dict[str, list[str]]: + """Map bare tool name → list of fully-qualified ``:``.""" + bare_to_full: dict[str, list[str]] = {} + for full in registered_tools: + bare = full.split(":", 1)[1] if ":" in full else full + bare_to_full.setdefault(bare, []).append(full) + return bare_to_full + + +def _check_tool_ref( + skill_name: str, + tool_ref: str, + registered_tools: set[str], + bare_to_full: dict[str, list[str]], +) -> None: + """Raise SkillValidationError if ``tool_ref`` doesn't resolve to a + registered tool, or resolves ambiguously across multiple servers.""" + if tool_ref in registered_tools: + return + resolutions = bare_to_full.get(tool_ref) + if resolutions is None: + raise SkillValidationError( + f"skill {skill_name!r} references tool {tool_ref!r} which " + f"is not registered. Known tools: {sorted(registered_tools)[:10]}..." + ) + if len(resolutions) > 1: + raise SkillValidationError( + f"skill {skill_name!r} uses bare tool ref {tool_ref!r} but " + f"it is exposed by multiple servers: {sorted(resolutions)}. " + f"Use the prefixed form to disambiguate." + ) + + +def validate_skill_tool_references( + skills: dict, registered_tools: set[str], +) -> None: + """Assert every ``tools.local`` entry in every skill resolves to a + registered MCP tool. + + ``registered_tools`` is the set of fully-qualified ``:`` + names from the MCP loader. 
We accept either bare or prefixed forms + in skill YAML (the LLM-facing call uses prefixed; YAML can use + either for ergonomics). + """ + bare_to_full = _build_bare_to_full_map(registered_tools) + for skill_name, skill in skills.items(): + local = (skill.get("tools") or {}).get("local") or [] + for tool_ref in local: + _check_tool_ref(skill_name, tool_ref, registered_tools, bare_to_full) + + +def validate_skill_routes(skills: dict) -> None: + """Assert every skill has a ``when: default`` route entry. + + Skipped for ``kind: supervisor`` skills — supervisors dispatch via + ``dispatch_rules`` to subordinates and do not use the ``routes`` + table at all. + """ + for skill_name, skill in skills.items(): + if skill.get("kind") == "supervisor": + continue + routes = skill.get("routes") or [] + if not any((r.get("when") == "default") for r in routes): + raise SkillValidationError( + f"skill {skill_name!r} has no ``when: default`` route — " + f"agents whose signal doesn't match a rule will hang." + ) + +# ====== module: runtime/storage/checkpoint_gc.py ====== + +def gc_orphaned_checkpoints(engine: Engine) -> int: + """Remove orphaned checkpoint rows; return count removed. + + Returns 0 if the ``checkpoints`` table doesn't exist (fresh DB, + LangGraph checkpointer has not yet bootstrapped its schema). + """ + with engine.begin() as conn: + live_ids = {row[0] for row in conn.execute( + text("SELECT id FROM incidents") + )} + try: + rows = conn.execute(text( + "SELECT DISTINCT thread_id FROM checkpoints" + )).all() + except OperationalError: + return 0 + # thread_id may be ``INC-1`` or ``INC-1:retry-N`` — strip suffix. 
+ orphans = [] + for (tid,) in rows: + base = tid.split(":")[0] if tid else tid + if base not in live_ids: + orphans.append(tid) + for tid in orphans: + conn.execute( + text("DELETE FROM checkpoints WHERE thread_id = :tid"), + {"tid": tid}, + ) + return len(orphans) + # ====== module: runtime/orchestrator.py ====== if TYPE_CHECKING: @@ -7192,6 +11961,7 @@ async def try_acquire(self, session_id: str) -> AsyncIterator[None]: +from langgraph.errors import GraphInterrupt from langgraph.types import Command @@ -7204,9 +11974,29 @@ async def try_acquire(self, session_id: str) -> AsyncIterator[None]: + _log = logging.getLogger("runtime.orchestrator") +def _assert_envelope_invariant_on_finalize(session: "Session") -> None: + """Phase 10 (FOC-03) defence-in-depth log sweep. + + Hard rejection of envelope-less turns happens at the agent runner + (``parse_envelope_from_result`` raises ``EnvelopeMissingError``, + which the runner converts into an agent_run marked ``error``). + This finalize hook only logs WARNING for forensics on legacy on-disk + sessions whose agent_runs predate the envelope contract. Never + raises. + """ + for ar in session.agents_run: + if ar.confidence is None: + _log.warning( + "agent_run.envelope_missing agent=%s session_id=%s", + ar.agent, + session.id, + ) + + def _default_text_extractor(session) -> str: """Default text extraction for the incident-management example. @@ -7641,10 +12431,14 @@ async def create(cls, cfg: AppConfig) -> "Orchestrator": if dedup_cfg.stage2_model in cfg.llm.models: _llm_cfg_capture = cfg.llm _model_name = dedup_cfg.stage2_model + _default_timeout_capture = ( + cfg.orchestrator.default_llm_request_timeout + ) def _factory(): return get_llm( _llm_cfg_capture, _model_name, role="dedup", + default_llm_request_timeout=_default_timeout_capture, ) dedup_pipeline = DedupPipeline( @@ -7656,14 +12450,22 @@ def _factory(): # Backfill dedup_pipeline into the IntakeContext now that it is built. 
# The IntakeContext was constructed with dedup_pipeline=None above # because the pipeline is built after graph construction. + # ``intake_context`` was attached via ``object.__setattr__`` ~140 + # lines up; pyright doesn't see dynamic Pydantic attrs, so go + # via getattr for the type-checker. if dedup_pipeline is not None: - framework_cfg.intake_context.dedup_pipeline = dedup_pipeline + getattr(framework_cfg, "intake_context").dedup_pipeline = dedup_pipeline # No bespoke resume graph — resume runs through the main # graph via ``Command(resume=...)`` against the same # thread_id, with the checkpointer rehydrating paused state. + # ``repo_state_cls: Type[BaseModel]`` matches the loose + # bound on ``Orchestrator.StateT`` (also ``BaseModel``) at + # the call site, but pyright sees the un-narrowed + # ``StateT`` placeholder. Concrete narrowing happens via + # the runtime resolver enforced earlier in this method. instance = cls(cfg, store, skills, registry, graph, stack, framework_cfg=framework_cfg, - state_cls=repo_state_cls, + state_cls=repo_state_cls, # pyright: ignore[reportArgumentType] history=history, checkpointer=checkpointer, checkpointer_close=checkpointer_close, @@ -7679,7 +12481,13 @@ def _factory(): try: await checkpointer_close() # pyright: ignore[reportPossiblyUnboundVariable] except Exception: # noqa: BLE001 - pass + # The original BaseException is what the caller cares + # about; this cleanup failure must not mask it. Log so + # the FD-leak path stays observable. + _log.warning( + "build: checkpointer_close failed during error rollback", + exc_info=True, + ) await stack.aclose() raise @@ -7691,7 +12499,13 @@ async def aclose(self) -> None: try: await self._checkpointer_close() except Exception: # noqa: BLE001 - pass + # Best-effort: the rest of aclose() (exit_stack drain) + # must still run so MCP transports don't leak. Log so + # checkpointer-close failures stay observable. 
+ _log.warning( + "aclose: checkpointer close failed", + exc_info=True, + ) self._checkpointer_close = None await self._exit_stack.aclose() @@ -7770,6 +12584,12 @@ def _finalize_session_status(self, session_id: str) -> str | None: if inc.status not in ("new", "in_progress"): return None + # Phase 10 (FOC-03) defence-in-depth: hard rejection of envelope-less + # turns happens at the agent runner; this hook only logs WARNING for + # forensics on legacy on-disk sessions whose agent_runs predate the + # envelope contract. Never raises. + _assert_envelope_invariant_on_finalize(inc) + decision = self._infer_terminal_decision(inc.tool_calls) if decision is None: default = self.cfg.orchestrator.default_terminal_status @@ -7879,6 +12699,116 @@ def _save_or_yield(self, inc, new_status: str) -> str | None: except StaleVersionError: return None + @staticmethod + def _is_graph_interrupt(exc: BaseException) -> bool: + """Phase 11 (FOC-04 / D-11-04): identify a LangGraph HITL pause. + + ``GraphInterrupt`` is NOT an error -- it signals a checkpointed + ``pending_approval`` state. Real exceptions still flow through + the normal failure path. Helper kept on the orchestrator so + callers don't each re-import langgraph internals. + """ + return isinstance(exc, GraphInterrupt) + + @staticmethod + def _extract_last_error(inc: "Session") -> Exception | None: + """Reconstruct the last error from a Session in status='error'. + + The graph runner stores failures as an AgentRun with + ``summary='agent failed: '`` (graph.py:_handle_agent_failure). + We can't recover the original Exception type, so we return a + synthetic representative whose CLASS matches a _PERMANENT_TYPES + / _TRANSIENT_TYPES whitelist entry where possible -- that's all + :func:`runtime.policy.should_retry` needs (it does isinstance + checks). 
+ + Mapping (first match wins per AgentRun.summary scan, newest + first): + + - "EnvelopeMissingError" in body -> EnvelopeMissingError + - "ValidationError" in body -> pydantic.ValidationError + - "TimeoutError" / "timed out" -> TimeoutError + - "OSError" / "ConnectionError" -> OSError + - everything else -> RuntimeError (falls + through to permanent_error per fail-closed default in + should_retry) + """ + + import pydantic as _pydantic + for run in reversed(inc.agents_run): + summary = (run.summary or "") + if not summary.startswith("agent failed:"): + continue + body = summary.removeprefix("agent failed:").strip() + if "EnvelopeMissingError" in body: + return _EnvelopeMissingError( + agent=run.agent or "unknown", + field="confidence", + message=body, + ) + if "ValidationError" in body or "validation error" in body: + # Build a synthetic ValidationError; pydantic v2 supports + # ValidationError.from_exception_data. + try: + return _pydantic.ValidationError.from_exception_data( + title="reconstructed", line_errors=[], + ) + except Exception: # pragma: no cover -- pydantic API drift + return RuntimeError(body) + if ("TimeoutError" in body or "timed out" in body + or "asyncio.TimeoutError" in body): + return TimeoutError(body) + if "OSError" in body or "ConnectionError" in body: + return OSError(body) + return RuntimeError(body) + return None + + @staticmethod + def _extract_last_confidence(inc: "Session") -> float | None: + """Return the last recorded turn-level confidence on the session, + or None if no AgentRun carries one. should_retry treats None as + 'no signal yet' and skips the low-confidence gate. + """ + for run in reversed(inc.agents_run): + if run.confidence is not None: + return run.confidence + return None + + def preview_retry_decision( + self, session_id: str, + ) -> "RetryDecision": + """Phase 12 (FOC-05 / D-12-04): return the framework's retry + decision WITHOUT executing anything. 
The UI calls this to render + the retry button label + disabled state. + + Pure: same inputs always yield identical RetryDecision. Loads + the session from store; reads (retry_count, last_error, + last_confidence) and consults the same policy + ``runtime.policy.should_retry`` that ``_retry_session_locked`` + uses. No mutation, no thread-id bump, no lock acquired. + + For sessions whose status is not "error" (i.e. nothing to + retry), returns ``RetryDecision(retry=False, + reason="permanent_error")`` -- a defensive caller-friendly + outcome that lets the UI render a "cannot auto-retry" state + without inventing a new reason value. + """ + try: + inc = self.store.load(session_id) + except FileNotFoundError: + return RetryDecision(retry=False, reason="permanent_error") + if inc.status != "error": + return RetryDecision(retry=False, reason="permanent_error") + retry_count = int(inc.extra_fields.get("retry_count", 0)) + last_error = self._extract_last_error(inc) + last_confidence = self._extract_last_confidence(inc) + return should_retry( + retry_count=retry_count, + error=last_error, + confidence=last_confidence, + cfg=self.cfg.orchestrator, + ) + async def _finalize_session_status_async( self, session_id: str, ) -> str | None: @@ -8201,7 +13131,15 @@ async def resume_session(self, incident_id: str, tool_args: dict = {"incident_id": incident_id, "message": message} if team is not None: tool_args["team"] = team - tool_result = await self._invoke_tool(tool_name, tool_args) + # Phase 9 (D-09-01): expose the live session to + # _invoke_tool's injection branch via the implicit slot. + # try/finally so a failed tool call doesn't leak the + # reference into the next orchestrator-driven call. 
+ self._current_session_for_invoke = inc_loaded + try: + tool_result = await self._invoke_tool(tool_name, tool_args) + finally: + self._current_session_for_invoke = None inc_loaded.tool_calls.append(ToolCall( agent="orchestrator", tool=tool_name, @@ -8320,6 +13258,30 @@ async def _retry_session_locked(self, session_id: str) -> AsyncIterator[dict]: "reason": f"not in error state (status={inc.status})", "ts": _event_ts()} return + # Phase 12 (FOC-05 / D-12-04): consult the framework's pure + # retry policy BEFORE mutating session state. The decision is + # derived from (retry_count, last_error, last_turn_confidence, + # cfg) -- LLM intent is not consulted. On retry=False, emit + # retry_rejected with the policy's reason and DO NOT bump the + # retry_count or thread id (preserves the "not retryable" + # state on disk for UI re-rendering and retry-budget audits). + prior_retry_count = int(inc.extra_fields.get("retry_count", 0)) + last_error = self._extract_last_error(inc) + last_confidence = self._extract_last_confidence(inc) + decision = should_retry( + retry_count=prior_retry_count, + error=last_error, + confidence=last_confidence, + cfg=self.cfg.orchestrator, + ) + if not decision.retry: + _log.info( + "retry_session policy-rejected: id=%s reason=%s", + session_id, decision.reason, + ) + yield {"event": "retry_rejected", "incident_id": session_id, + "reason": decision.reason, "ts": _event_ts()} + return # Drop the failed AgentRun(s) so the timeline only retains # successful runs. Retry attempts then append fresh runs. inc.agents_run = [ @@ -8378,6 +13340,14 @@ async def _resume_with_input(self, incident_id: str, inc, decision: dict): config=self._thread_config(incident_id), ): yield self._to_ui_event(ev, incident_id) + except GraphInterrupt: + # Phase 11 (FOC-04 / D-11-04): a resume that re-paused via + # a fresh HITL gate. Don't restore the prior pending_intervention + # block (the new pending_approval ToolCall row is the + # canonical pause record now). 
Propagate so LangGraph's + # checkpointer captures the new pause; the UI's + # _render_pending_approvals_block surfaces the resume target. + raise except Exception as exc: # noqa: BLE001 — restore on any failure # Reload from disk to absorb any partial writes from tools # that ran before the failure, then restore intervention @@ -8403,6 +13373,14 @@ async def _invoke_tool(self, name: str, args: dict): Used for orchestrator-driven tool calls (e.g. an app-registered escalation tool invoked from the awaiting_input gate) that aren't initiated by an LLM. + + Phase 9 (D-09-01): orchestrator-driven calls also flow through + injection so the tool gets the canonical session-derived arg set + even when the orchestrator only passed intent-args. The current + session is read off ``self._current_session_for_invoke`` (set + by callers via try/finally) so the public signature stays + unchanged. When no session is reachable the injection step is + a no-op — the existing escalation path keeps working unchanged. """ entry = next( (e for e in self.registry.entries.values() if e.name == name), @@ -8410,6 +13388,17 @@ async def _invoke_tool(self, name: str, args: dict): ) if entry is None: raise KeyError(f"tool '{name}' not registered") + session = getattr(self, "_current_session_for_invoke", None) + cfg_inject = self.cfg.orchestrator.injected_args + if session is not None and cfg_inject: + + args = inject_injected_args( + args, + session=session, + injected_args_cfg=cfg_inject, + tool_name=name, + accepted_params=accepted_params_for_tool(entry.tool), + ) return await entry.tool.ainvoke(args) @staticmethod @@ -8430,6 +13419,9 @@ def _event_ts() -> str: # ====== module: runtime/api.py ====== +_log = logging.getLogger("runtime.api") + + def _resolve_environments(dotted: str | None) -> list[str]: """Resolve ``RuntimeConfig.environments_provider_path`` to a list. 
@@ -8623,7 +13615,12 @@ async def _trigger_dispatch(service, kwargs): try: await registry.stop_all() except Exception: # noqa: BLE001 - pass + # Best-effort: a misbehaving trigger transport must not + # block ``svc.shutdown()`` below. Surface for observability. + _log.warning( + "trigger registry stop_all failed during lifespan teardown", + exc_info=True, + ) # ``shutdown()`` cancels in-flight session tasks, closes the # underlying Orchestrator + MCP pool, joins the loop thread, # and resets the process-singleton. diff --git a/dist/apps/code-review.py b/dist/apps/code-review.py index ce0327e..2c0e7cd 100644 --- a/dist/apps/code-review.py +++ b/dist/apps/code-review.py @@ -1,4 +1,30 @@ from __future__ import annotations +# ----- imports for runtime/errors.py ----- +"""Typed runtime errors. Phase 13 lands the LLM-call surface; future +hardening (HARD-04 silent-failure sweep, HARD-03 pyright flip, +real-LLM follow-ups) extends here. + +Importable as ``from runtime.errors import LLMTimeoutError, LLMConfigError``. +""" + + + +# ----- imports for runtime/terminal_tools.py ----- +"""Generic terminal-tool registry types. + +Apps register their terminal-tool rules and status vocabulary via +``OrchestratorConfig.terminal_tools`` / ``OrchestratorConfig.statuses``; +the framework reads these models without knowing app-specific tool +or status names. Cf. .planning/phases/06-generic-terminal-tool-registry/ +06-CONTEXT.md (D-06-01, D-06-02, D-06-05). +""" + + +from typing import Literal + +from pydantic import BaseModel, Field + + # ----- imports for runtime/config.py ----- """Config schemas for the orchestrator.""" @@ -6,11 +32,12 @@ import re from pathlib import Path from typing import Any, Literal -from pydantic import BaseModel, Field, field_validator, model_validator +from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator import yaml + # Session-id prefix grammar. 
The framework mints session ids of the form # ``{PREFIX}-YYYYMMDD-NNN`` (see ``runtime.state.Session.id_format``); # the prefix is the only piece an app picks. Allow alphanumerics + hyphens, @@ -34,7 +61,6 @@ class IncidentState(Session): -from pydantic import BaseModel, Field # ----- imports for runtime/state_resolver.py ----- """Resolve ``RuntimeConfig.state_class`` (a dotted path) to a class object. @@ -109,6 +135,7 @@ class IncidentState(Session): import ast from typing import Any, Callable, Literal +from pydantic import BaseModel, Field, field_validator, model_validator # ----- imports for runtime/llm.py ----- @@ -118,8 +145,21 @@ class IncidentState(Session): provider (kind + connection) to a model id and optional temperature/deployment. ``get_llm(cfg, "smart")`` looks up ``cfg.models["smart"]`` and uses its referenced ``cfg.providers[]`` to build a langchain ``BaseChatModel``. + +Phase 13 (HARD-01 / HARD-05): every chat + embedding HTTP call is bounded +by an effective ``request_timeout`` resolved as +``provider.request_timeout if not None else default_llm_request_timeout`` +(default 120.0s on ``OrchestratorConfig``). The native langchain timeout +knob is wired AND an ``asyncio.wait_for`` wrapper raises +``LLMTimeoutError(provider, model, elapsed_ms)`` on hang -- defence in +depth against partial-byte stalls where the httpx layer doesn't fire. +The hardcoded public-Ollama fallback is removed; ollama providers +must declare ``base_url`` (validated at config-load via +``LLMConfigError``). """ +import asyncio +import time from typing import Any from uuid import uuid4 from langchain_core.embeddings import Embeddings @@ -130,6 +170,7 @@ class IncidentState(Session): + # ----- imports for runtime/storage/models.py ----- """SQLAlchemy declarative model for the ``incidents`` table. 
@@ -183,6 +224,7 @@ class IncidentState(Session): import hashlib import numpy as np +from pydantic import SecretStr @@ -230,16 +272,19 @@ class IncidentState(Session): from typing import Any, Generic, Mapping, Optional, Type, TypeVar -from pydantic import BaseModel from sqlalchemy import select from sqlalchemy.engine import Engine -from sqlalchemy.orm import Session +from sqlalchemy.orm import Session as SqlaSession -# Mirrors the bound on ``SessionStore.StateT`` — kept permissive at -# ``BaseModel`` so framework code does not need to import the -# example-app subclass. The resolver in :mod:`runtime.state_resolver` -# enforces a ``runtime.state.Session`` subclass at config time. + +# Mirrors the bound on ``SessionStore.StateT`` — tightened from +# ``BaseModel`` to ``runtime.state.Session`` in Phase 19 (HARD-03) so +# pyright sees the typed fields (``id``, ``status``, ``deleted_at`` …) +# this store reads. The resolver in :mod:`runtime.state_resolver` +# already enforces a ``Session`` subclass at config time, and every +# in-tree caller passes either bare ``Session`` or a ``Session`` +# subclass. # ----- imports for runtime/storage/session_store.py ----- """Active session lifecycle store. @@ -261,6 +306,7 @@ class IncidentState(Session): from datetime import datetime, timezone from typing import Generic, Optional, Type, TypeVar +from pydantic import BaseModel from sqlalchemy import desc, select from sqlalchemy.orm import Session as SqlSession @@ -271,6 +317,66 @@ class IncidentState(Session): # hook existed. New rows are validated by ``_SESSION_ID_RE`` which # accepts any ``PREFIX-YYYYMMDD-NNN`` shape the app's ``id_format`` may # emit (e.g. ``CR-...`` for code-review). +# ----- imports for runtime/storage/event_log.py ----- +"""Append-only session event log. + +Events drive the status finalizer's inference (e.g. a registered +```` event appearing in the log -> session reached +the corresponding terminal status). They are never mutated or +deleted. 
+""" + + +from dataclasses import dataclass +from typing import Iterator + +from sqlalchemy.orm import Session + + + +# ----- imports for runtime/storage/migrations.py ----- +"""Idempotent migrations for the JSON-shaped row payloads. + +Fills the per-call audit fields on :class:`runtime.state.ToolCall` for +legacy rows. The risk-rated tool gateway uses five optional audit fields: + + * ``risk`` — ``"low" | "medium" | "high" | None`` + * ``status`` — ``ToolStatus`` literal (default ``"executed"``) + * ``approver`` — operator id, set when status in {approved, rejected} + * ``approved_at`` — ISO-8601 timestamp of the decision + * ``approval_rationale`` — free-text justification + +Older rows in the ``incidents.tool_calls`` JSON column lack these +fields. Pydantic hydrates the missing keys with their defaults at read +time so reading is already back-compat — but the on-disk JSON still +shows the legacy shape until something rewrites the row. + +This migration walks every session, normalises the JSON-shaped +``tool_calls`` list to the current audit schema, and saves the row back +when (and only when) at least one entry changed. Idempotent — running +twice is safe (the second pass is a no-op because every row already +has the fields). + +The function operates on the row's JSON list directly (not via the +``ToolCall`` Pydantic model) so we don't accidentally widen the +migration's contract — for example, dropping unknown extra keys via +Pydantic's ``extra='ignore'`` would silently delete forward-compat +fields in a downgrade scenario. JSON-walk is conservative: only fill +what's missing; leave everything else alone. +""" + + +from typing import Any, Iterable + +from sqlalchemy import inspect, text + + +# Columns added after the initial schema. Each entry is +# ``(column_name, sql_type, default_clause_or_None)``. 
SQLite ``ADD +# COLUMN`` cannot add a non-nullable column without a constant default, +# so every entry here is nullable — Pydantic hydrates the missing keys +# at read time. Append-only: never reorder, never delete. Removing a +# column needs a separate destructive migration with explicit sign-off. # ----- imports for runtime/mcp_loader.py ----- """Load MCP servers (in_process / stdio / http / sse) and build a tool registry. @@ -299,15 +405,313 @@ class IncidentState(Session): -# ----- imports for runtime/graph.py ----- -"""LangGraph state, routing helpers, and node runner.""" +# ----- imports for runtime/service.py ----- +"""Long-lived orchestrator service. + +Owns a background asyncio event loop and a shared FastMCP client pool. +All session execution will run as asyncio tasks on this loop. Sync callers +(Streamlit, FastAPI request handlers, CLI) submit coroutines via +``submit(coro) -> concurrent.futures.Future``. + +Lifecycle:: + + svc = OrchestratorService.get_or_create(cfg) + svc.start() # spins up background thread + loop + fut = svc.submit(some_coro) + result = fut.result(timeout=30) + svc.shutdown() # cancels in-flight tasks, closes MCP clients, joins thread + +Capabilities: + - Skeleton + singleton + start/shutdown lifecycle. + - ``submit()`` / ``submit_and_wait()`` thread-safe bridge. + - Shared ``MCPClientPool`` with per-server ``asyncio.Lock``. + - ``start_session()`` schedules a per-session asyncio task on the + service's loop and returns the session id immediately (the agent run + continues in the background). Active tasks are tracked in an + in-memory registry that evicts on completion / cancellation. + - ``list_active_sessions()`` returns a thread-safe snapshot of + the in-flight registry; the snapshot coroutine runs on the loop so + readers from any thread see a point-in-time consistent view. 
+ - ``stop_session(sid)`` cancels the in-flight task, waits up + to 5 s for graceful exit, and persists ``status="stopped"`` on the + row (clearing ``pending_intervention``). Idempotent — a no-op for + unknown ids or already-completed sessions. + - Hard cap on concurrent sessions. ``start_session`` raises + ``SessionCapExceeded`` once ``len(self._registry) >= + self.max_concurrent_sessions``. Fail fast; queueing is not supported. + +The singleton is process-scoped and reset on ``shutdown()`` so that test +suites can build, tear down, and rebuild the service without leaking +state across cases. +""" -import asyncio + +import concurrent.futures import logging -from typing import TypedDict, Callable, Awaitable +import threading +from typing import Any, Awaitable, Coroutine, TypeVar, cast + + + +# ----- imports for runtime/agents/turn_output.py ----- +"""Phase 10 (FOC-03) — AgentTurnOutput envelope + reconciliation helpers. + +The envelope is the structural contract every responsive agent invocation +must satisfy: content + confidence in [0,1] + confidence_rationale + optional +signal. The framework wires it as ``response_format=AgentTurnOutput`` into +``langchain.agents.create_agent`` (see Phase 15 / LLM-COMPAT-01); the +agent loop terminates on the same turn the LLM emits the envelope-shaped +tool call, populating ``result["structured_response"]``, which the +framework reads and persists onto the ``AgentRun`` row. + +D-10-02 — pydantic envelope wrapped via ``response_format``. +D-10-03 — when a typed-terminal-tool was called this turn, the framework +reconciles its ``confidence`` arg against the envelope's. Tolerance 0.05 +inclusive; tool-arg wins on mismatch with an INFO log. + +This is a leaf module: no imports from ``runtime.graph`` or +``runtime.orchestrator``. Both of those depend on it; the dependency +graph is acyclic. 
+""" + + + +from pydantic import BaseModel, ConfigDict, Field + +# ----- imports for runtime/tools/gateway.py ----- +"""Risk-rated tool gateway: pure resolver + ``BaseTool`` HITL wrapper. + +The gateway sits between the ReAct agent and each tool the orchestrator +configures. It enforces the *hybrid* HITL policy resolved by +``effective_action``: + + ``auto`` -> call the underlying tool directly (no plumbing) + ``notify`` -> call the tool, then persist a soft-notify audit entry + ``approve`` -> raise ``langgraph.types.interrupt(...)`` BEFORE calling + the tool; on resume re-invoke + +The resolver is a plain function with no I/O so it can be unit-tested +exhaustively without spinning up Pydantic Sessions, MCP servers, or a +LangGraph runtime. The wrapper is a closure factory deliberately built +inside ``make_agent_node`` so the closure captures the live ``Session`` +per agent invocation (mitigation R2 in the Phase-4 plan). +""" + + +from fnmatch import fnmatchcase +from typing import TYPE_CHECKING, Any, Literal + + + + +# ``GateDecision`` is imported lazily inside ``_evaluate_gate`` (function +# body) to avoid a runtime cycle (policy.py imports gateway types). The +# type-only import below lets pyright resolve the string-literal return +# annotation on ``_evaluate_gate`` without forming a real cycle. +# ----- imports for runtime/tools/arg_injection.py ----- +"""Session-derived tool-arg injection (Phase 9 / FOC-01 / FOC-02). + +Two responsibilities, one module: + +1. :func:`strip_injected_params` — clones a ``BaseTool``'s args_schema with + one or more parameters removed. The LLM only sees the stripped sig and + therefore cannot hallucinate values for those params (D-09-01). The + original tool is left untouched so direct downstream callers (tests, + scripts, in-process MCP fixtures) keep working. + +2. 
:func:`inject_injected_args` — at tool-invocation time, re-adds the + real values resolved from the live :class:`runtime.state.Session` via + the configured dotted paths. When the LLM still supplied a value for + an injected arg, the framework's session-derived value wins and an + INFO log captures the override (D-09-03). + +The framework stays generic — apps declare which args to inject and from +where via :attr:`runtime.config.OrchestratorConfig.injected_args` (D-09-02). +""" + + + +from pydantic import BaseModel, create_model + + + +# Module-private logger. Tests assert against logger name +# ``"runtime.orchestrator"`` so the override-log line shows up alongside +# the rest of the orchestrator-side observability without requiring a +# separate caplog target. +# ----- imports for runtime/tools/approval_watchdog.py ----- +"""Pending-approval timeout watchdog. + +A high-risk tool call enters ``langgraph.types.interrupt()`` and the +session sits in ``awaiting_input`` indefinitely. Without a watchdog +the slot leaks against ``OrchestratorService.max_concurrent_sessions`` +forever — the cap eventually starves out new traffic. + +The :class:`ApprovalWatchdog` is an asyncio task that runs on the +service's background loop. Every ``poll_interval_seconds`` it: + + 1. Snapshots the in-flight session registry. + 2. For each session whose row has ``status="awaiting_input"``, + scans ``tool_calls`` for entries with ``status="pending_approval"`` + whose ``ts`` is older than ``approval_timeout_seconds``. + 3. Resumes each such session via ``Command(resume={"decision": + "timeout", "approver": "system", "rationale": "approval window + expired"})``. The wrapped tool's resume path updates the audit + row to ``status="timeout"``. + +Failures during polling (DB hiccup, malformed row) are logged and +swallowed so a single bad session cannot kill the watchdog. 
+""" + + +from typing import TYPE_CHECKING, Any + + +# ----- imports for runtime/policy.py ----- +"""Pure HITL gating policy (Phase 11 / FOC-04). + +The :func:`should_gate` function is the SOLE place the framework decides +whether a tool call requires human-in-the-loop approval. It composes +three orthogonal inputs: + + 1. ``effective_action(tool_call.tool, env=session.environment, + gateway_cfg=cfg.gateway)`` -- preserves the v1.0 PVC-08 + prefixed-form lookup invariant. + 2. ``session.environment`` -- gated when in + ``cfg.gate_policy.gated_environments``. + 3. ``confidence`` -- gated when below + ``cfg.gate_policy.confidence_threshold``. + +Pure: same inputs always yield identical :class:`GateDecision`; no I/O, +no skill-prompt input, no mutation. + +Precedence (descending): + + 1. ``effective_action`` returns a value in + ``cfg.gate_policy.gated_risk_actions`` + -> ``GateDecision(gate=True, reason="high_risk_tool")`` + 2. ``session.environment`` in ``cfg.gate_policy.gated_environments`` + AND ``effective_action != "auto"`` + -> ``GateDecision(gate=True, reason="gated_env")`` + 3. ``confidence`` is not None AND + ``confidence < cfg.gate_policy.confidence_threshold`` + AND ``effective_action != "auto"`` + -> ``GateDecision(gate=True, reason="low_confidence")`` + 4. otherwise -> ``GateDecision(gate=False, reason="auto")`` + +The literal ``"blocked"`` is reserved on :class:`GateDecision.reason` +for future hard-stop semantics; Phase 11 itself never returns it from a +production code path. +""" + + + +from pydantic import BaseModel, ConfigDict + + +# Phase 11 (FOC-04): forward-reference imports for the should_gate +# signature only; kept inside ``TYPE_CHECKING`` so the bundle's +# intra-import stripper does not remove a load-bearing import. The +# ``pass`` keeps the block syntactically valid after stripping. +# ----- imports for runtime/agents/responsive.py ----- +"""Responsive agent kind — the today-default LLM agent. 
+ +A responsive skill is a LangGraph node that: + +1. Builds a ReAct executor over the skill's ``tools`` and ``model``. +2. Invokes the executor with the live ``Session`` payload as a human + message preamble. +3. Records ``ToolCall`` and ``AgentRun`` rows on the session, harvests + the agent's confidence / signal / rationale, and decides the next + route from ``skill.routes``. + +This module owns only the node-factory entrypoint +(``make_agent_node``); the implementation reuses helpers in +:mod:`runtime.graph` so existing call sites and the gate node continue +to work unchanged. Supervisor and monitor factories live alongside it +under :mod:`runtime.agents` rather than piling more kinds into +``graph.py``. +""" + + +from typing import Callable from langchain_core.messages import HumanMessage -from langgraph.prebuilt import create_react_agent +from langchain.agents import create_agent + +from langgraph.errors import GraphInterrupt + + + + + + + +# ----- imports for runtime/agents/supervisor.py ----- +"""Supervisor agent kind — no-LLM router. + +A supervisor skill is a LangGraph node that: + +1. Reads the live ``Session`` plus the current dispatch depth. +2. Picks one or more subordinate agents per ``dispatch_strategy``: + ``rule`` (deterministic, evaluated via the same safe-eval AST that + gates monitor expressions) or ``llm`` (one short LLM call against + ``dispatch_prompt``). +3. Emits a structured ``supervisor_dispatch`` log entry (no + ``AgentRun`` row — supervisors are bookkeeping, not token-burning + agents). +4. Returns ``next_route`` set to the chosen subordinate (or to + ``__end__`` when the depth limit is hit). + +The recursion depth is tracked in :class:`runtime.graph.GraphState`'s +``dispatch_depth`` field; if a supervisor would exceed +``skill.max_dispatch_depth`` the node aborts with a clean error +instead of recursing forever. + +This is **not** a fan-out implementation; we always pick a single +target. 
Multi-target ``Send()`` is intentionally not supported. +""" + + +from typing import Any, Callable + +from langchain_core.messages import HumanMessage, SystemMessage + + + +# ----- imports for runtime/agents/monitor.py ----- +"""Monitor agent kind — out-of-band scheduled observer. + +A monitor skill runs **outside** any session graph. The orchestrator +owns one :class:`MonitorRunner` (a singleton) which schedules registered +monitor skills on a small bounded +:class:`concurrent.futures.ThreadPoolExecutor`. +Each tick: + +1. Calls every tool name in ``observe`` via the supplied callable + (``observe_fn``); aggregates results into one dict keyed by tool. +2. Evaluates ``emit_signal_when`` against the observation using the + stdlib safe-eval evaluator (R7). +3. If true, looks up ``trigger_target`` in the supplied trigger + registry / fire callback and fires it with the observation as the + payload. + +APScheduler is intentionally *not* a dependency: the air-gapped target +env doesn't ship it (see ``rules/build.md``). We get away with a tiny +single-threaded scheduler thread because monitor schedules are coarse +(minute-resolution cron) and tool calls are dispatched into the +executor; the scheduler thread itself never blocks on tool I/O. +""" + + +from concurrent.futures import ThreadPoolExecutor, TimeoutError as FuturesTimeout + + +# ----- imports for runtime/graph.py ----- +"""LangGraph state, routing helpers, and node runner.""" + +from typing import Any, TypedDict, Callable, Awaitable + from langgraph.graph import StateGraph, END @@ -316,6 +720,11 @@ class IncidentState(Session): +# Phase 11 (FOC-04 / D-11-04): GraphInterrupt is the LangGraph +# pending-approval pause signal. It is NOT an error and must NOT route +# through _handle_agent_failure -- the orchestrator's interrupt-aware +# bridge handles the resume protocol via the checkpointer. + # ----- imports for runtime/checkpointer_postgres.py ----- """Postgres checkpointer wrapper. 
@@ -383,7 +792,6 @@ class IncidentState(Session): from abc import ABC, abstractmethod -from dataclasses import dataclass from typing import TYPE_CHECKING # ----- imports for runtime/triggers/config.py ----- @@ -418,7 +826,7 @@ class IncidentState(Session): """ -from typing import Any, Callable, Type +from typing import Any, Callable, Type, cast @@ -448,7 +856,6 @@ class IncidentState(Session): """ -import threading from collections import OrderedDict from datetime import datetime, timezone, timedelta @@ -471,7 +878,6 @@ class IncidentState(Session): import hmac -from typing import Callable from fastapi import Header, HTTPException, status @@ -683,7 +1089,6 @@ async def _poll(self, registry): """ -from typing import Any, Callable # ----- imports for runtime/memory/session_state.py ----- @@ -700,7 +1105,6 @@ async def _poll(self, registry): """ -from pydantic import BaseModel, ConfigDict, Field # ----- imports for runtime/memory/knowledge_graph.py ----- @@ -878,6 +1282,37 @@ async def _poll(self, registry): from typing import AsyncIterator +# ----- imports for runtime/skill_validator.py ----- +"""Load-time validation of skill YAML against the live MCP registry. + +Catches: + * tools.local entries that reference a non-existent (server, tool) + pair (typically typos that would silently make the tool invisible). + * routes that omit ``when: default`` (would cause graph hangs at + __end__ when no signal matches). +""" + + + +# ----- imports for runtime/storage/checkpoint_gc.py ----- +"""Garbage-collect orphaned LangGraph checkpoints. + +When ``Orchestrator.retry_session`` rebinds a session to a new +``thread_id`` (e.g. ``INC-1:retry-1``), the original ``INC-1`` thread's +checkpoint becomes orphaned — no code path will ever resume it. Over +time these accumulate. ``gc_orphaned_checkpoints`` removes any +checkpoint whose ``thread_id`` does not reference an active session +(or a known retry suffix). 
+ +This is intentionally conservative: only checkpoints whose thread_id +prefix matches no live session row at all are removed. +""" + + +from sqlalchemy import text +from sqlalchemy.exc import OperationalError + + # ----- imports for runtime/orchestrator.py ----- """Public Orchestrator class — the API consumed by the UI and (future) FastAPI.""" @@ -918,7 +1353,6 @@ async def _poll(self, registry): from fastapi.responses import StreamingResponse - # ----- imports for runtime/api_dedup.py ----- """Dedup retraction HTTP routes. @@ -1000,12 +1434,119 @@ async def _poll(self, registry): # Repo root: examples/code_review/mcp_server.py -> repo root is two parents up. +# ====== module: runtime/errors.py ====== + +class LLMTimeoutError(TimeoutError): + """Raised when an LLM provider HTTP call exceeds request_timeout. + + Subclasses ``TimeoutError`` so ``runtime.policy._TRANSIENT_TYPES`` + auto-classifies it as transient via ``isinstance`` -- no policy.py + edit needed (D-13-04). + + The ``__str__`` includes the substring ``"timed out"`` so existing + string-matchers in ``runtime.graph._TRANSIENT_MARKERS`` and + ``runtime.orchestrator._reconstruct_last_error`` also catch it + without modification. + """ + + def __init__(self, provider: str, model: str, elapsed_ms: int) -> None: + self.provider = provider + self.model = model + self.elapsed_ms = elapsed_ms + super().__init__( + f"LLM request timed out after {elapsed_ms}ms " + f"(provider={provider}, model={model})" + ) + + +class LLMConfigError(ValueError): + """Raised at config-load when a provider is missing a required field. + + Subclasses ``ValueError`` so pydantic ``@model_validator(mode='after')`` + propagates it cleanly into ``ValidationError`` (D-13-05). 
+ """ + + def __init__(self, provider: str, missing_field: str) -> None: + self.provider = provider + self.missing_field = missing_field + super().__init__( + f"{provider} provider requires {missing_field!r}" + ) + + +__all__ = ["LLMTimeoutError", "LLMConfigError"] + +# ====== module: runtime/terminal_tools.py ====== + +class TerminalToolRule(BaseModel): + """Maps a terminal tool name to the session status it produces. + + ``tool_name`` matches both bare (``set_recommendation``) and prefixed + (``:set_recommendation``) MCP tool-call names — the framework + does the suffix check. + + ``status`` must reference a name declared in the same + ``OrchestratorConfig.statuses`` map; ``OrchestratorConfig``'s + cross-field validator enforces this at config-load. + + ``extract_fields`` declares per-rule extra-metadata pulls. Each + key is the destination field name on the session + (``Session.extra_fields[]``); each value is an ordered list + of ``args.X`` / ``result.X`` lookup hints. The framework picks + the first non-falsy match. Empty dict (default) means "no extra + metadata to capture". Generalises the v1.0 + ``_extract_team(tc, team_keys)`` path; the same lookup syntax is + preserved (D-06-02). + + ``match_args`` is an optional argument-value discriminator. When + non-empty, the rule matches a tool call only if EVERY ``(key, + value)`` pair in ``match_args`` matches ``tool_call.args[key]`` + exactly. Lets one tool name route to multiple statuses based on + a discriminator argument (e.g. ``set_recommendation`` with + ``recommendation=approve`` vs ``recommendation=request_changes``). + Empty default = no arg dispatch; preserves the v1.0 single-rule + shape (DECOUPLE-07 / D-08-03). + """ + + model_config = {"extra": "forbid"} + + tool_name: str = Field(min_length=1) + status: str = Field(min_length=1) + extract_fields: dict[str, list[str]] = Field(default_factory=dict) + match_args: dict[str, str] = Field(default_factory=dict) + + +StatusKind = Literal[ + "success", # e.g. 
set_recommendation(approve) -> approved + "failure", # e.g. set_recommendation(request_changes) -> changes_requested + "escalation", # app-defined escalation terminal (e.g. ) + "needs_review", # finalize fired with no rule match + "pending", # session in flight +] + + +class StatusDef(BaseModel): + """Pydantic record of one app status. + + Framework reads ``terminal`` to decide finalize-vs-pending and + ``kind`` to dispatch the needs_review fallback path / let UIs + group statuses without owning their own taxonomy. ``color`` and + other presentation fields stay in ``UIConfig.badges`` (D-06-05 + rejected alternative — presentation leak). + """ + + model_config = {"extra": "forbid"} + + name: str = Field(min_length=1) + terminal: bool + kind: StatusKind + # ====== module: runtime/config.py ====== _SESSION_ID_PREFIX_RE = re.compile(r"^[A-Za-z0-9-]{1,16}$") -ProviderKind = Literal["ollama", "azure_openai", "stub"] +ProviderKind = Literal["ollama", "azure_openai", "openai_compat", "stub"] class ProviderConfig(BaseModel): @@ -1013,12 +1554,35 @@ class ProviderConfig(BaseModel): Multiple named ``ModelConfig`` entries can reference the same provider so that, e.g., two Ollama models share a single base_url + api_key. + + Phase 13 (HARD-01 / D-13-01): per-provider ``request_timeout`` + override (None means "use OrchestratorConfig.default_llm_request_timeout"). + Phase 13 (HARD-05 / D-13-06): ollama providers MUST declare + ``base_url``; the @model_validator below catches the omission at + config-load and raises ``LLMConfigError``. The hardcoded public + Ollama fallback in ``runtime.llm`` is removed in the same phase. 
""" kind: ProviderKind - base_url: str | None = None # ollama + base_url: str | None = None # ollama (REQUIRED via validator) api_key: str | None = None # ollama, azure_openai - endpoint: str | None = None # azure_openai + endpoint: str | None = None # azure_openai (validated lazily in builder) api_version: str | None = None # azure_openai + request_timeout: float | None = Field( + default=None, gt=0, le=600, + ) # NEW Phase 13 (D-13-01) — None -> OrchestratorConfig default + + @model_validator(mode="after") + def _validate_required_fields(self) -> "ProviderConfig": + # D-13-06: only ollama is promoted to config-load validation in + # Phase 13. azure_openai (`endpoint`) and openai_compat + # (`base_url` + `api_key`) keep their existing first-request + # ValueError raises in `_build_*_chat`. Promoting them is a + # potential follow-up; see CONTEXT.md "Deferred Ideas". + if self.kind == "ollama" and not self.base_url: + raise LLMConfigError( + provider="ollama", missing_field="base_url", + ) + return self class ModelConfig(BaseModel): @@ -1125,6 +1689,76 @@ class Paths(BaseModel): incidents_dir: str = "incidents" +class GatePolicy(BaseModel): + """Phase 11 (FOC-04): declarative HITL gating policy. + + Drives the framework's pure ``should_gate`` boundary. The LLM never + sees this config -- flow control is a framework decision, not a + skill-prompt incantation. + + ``confidence_threshold`` is the strict-less-than predicate the gate + applies to the active turn confidence; tool calls below the + threshold fire a low_confidence pause for any non-auto-rated tool. + + ``gated_environments`` enumerates Session.environment values that + automatically gate every non-auto-rated tool call regardless of + confidence -- lifecycle defence against blast radius in production. + + ``gated_risk_actions`` enumerates GatewayAction Literal values + (``auto``/``notify``/``approve``) that ALWAYS trigger a gate + regardless of env or confidence. 
Default ``{"approve"}`` mirrors + v1.0 HITL behaviour. + + Phase 11 chooses ``"approve"`` (the actual GatewayAction literal) + over CONTEXT.md's sketched ``"hitl"`` -- see + src/runtime/tools/gateway.py:32 for the canonical 3-valued + GatewayAction Literal. + """ + + model_config = ConfigDict(extra="forbid") + + confidence_threshold: float = Field(default=0.7, ge=0.0, le=1.0) + gated_environments: set[str] = Field( + default_factory=lambda: {"production"}, + ) + gated_risk_actions: set[str] = Field( + default_factory=lambda: {"approve"}, + ) + + +class RetryPolicy(BaseModel): + """Phase 12 (FOC-05): declarative retry policy. + + Drives the framework's pure ``should_retry`` boundary. The LLM never + sees this config -- flow control is a framework decision, not a + skill-prompt incantation. Mirrors GatePolicy's shape so the + OrchestratorConfig surface stays uniform. + + ``max_retries`` is the absolute cap on automatic retries (compared + with ``retry_count`` via ``>=``). 0 disables auto-retry entirely; + the recommended default 2 mirrors the v1.2 ROADMAP sketch and the + existing transient-5xx auto-retry budget in graph.py. + + ``retry_on_transient`` lets apps with strict SLOs disable framework + auto-retry of transient errors entirely (escalate immediately + instead). + + ``retry_low_confidence_threshold`` is the strict-less-than predicate + for "the LLM gave up; don't burn budget on a retry". Defaults to + 0.4 -- well below the typical gate_policy 0.7-0.8 threshold so a + low-confidence escalation triggers HITL intervention before the + retry path even considers it. 
+ """ + + model_config = ConfigDict(extra="forbid") + + max_retries: int = Field(default=2, ge=0, le=10) + retry_on_transient: bool = True + retry_low_confidence_threshold: float = Field( + default=0.4, ge=0.0, le=1.0, + ) + + class OrchestratorConfig(BaseModel): model_config = {"extra": "forbid"} @@ -1215,6 +1849,41 @@ class OrchestratorConfig(BaseModel): # bad path raises at boot with a useful message (DECOUPLE-05 / D-08-01). state_overrides_schema: str | None = None + # Phase 9 (D-09-02 / FOC-01): map of LLM-visible-arg -> dotted-path + # on the live Session. Tools whose param name matches a key in this + # dict get the param stripped from the LLM-visible signature, and + # the framework supplies the resolved value at _invoke_tool / + # _GatedTool._run / _arun time. Apps declare what to inject; the + # framework stays generic. Empty default = no injection (legacy + # behaviour). Validated at config-load: keys are non-empty + # identifiers, values are dotted paths starting with "session.". + injected_args: dict[str, str] = Field(default_factory=dict) + + # Phase 11 (FOC-04): declarative HITL gating policy. Apps tune + # thresholds in YAML; the framework's should_gate boundary reads + # this struct and the LLM never sees it. Default keeps v1.1 + # behaviour (production gates "approve"-risk tools, threshold 0.7). + gate_policy: "GatePolicy" = Field(default_factory=lambda: GatePolicy()) + + # Phase 12 (FOC-05): declarative retry policy. Apps tune + # max_retries / retry_on_transient / low-confidence threshold in + # YAML; the framework's should_retry boundary reads this struct + # and the LLM never sees it. Default keeps v1.2 behaviour + # (max_retries=2, transient retries enabled, confidence floor 0.4). + retry_policy: "RetryPolicy" = Field( + default_factory=lambda: RetryPolicy(), + ) + + # Phase 13 (HARD-01 / D-13-02): framework-default LLM HTTP request + # timeout in seconds. 
Per-provider ``ProviderConfig.request_timeout`` + # overrides this; ``None`` on the provider means "use this default". + # Bounded to catch indefinite hangs (CONCERNS C1) while leaving room + # for slow CPU Ollama runs (e.g., gpt-oss:120b). 600s upper bound + # prevents accidentally-disabling the protection. + default_llm_request_timeout: float = Field( + default=120.0, gt=0, le=600, + ) + @field_validator("state_overrides_schema") @classmethod def _validate_state_overrides_schema_format( @@ -1249,6 +1918,38 @@ def _validate_state_overrides_schema_format( ) return v + @field_validator("injected_args") + @classmethod + def _validate_injected_args( + cls, v: dict[str, str], + ) -> dict[str, str]: + """Phase 9 (D-09-02): config-load validation for injected_args. + + Each entry is ``arg_name -> dotted_path`` where ``arg_name`` must + be a valid Python identifier (it is the keyword name on a tool + signature) and ``dotted_path`` must be a non-empty string with at + least one dot (e.g. ``session.environment``). Real attribute + resolution happens at injection time in + :func:`runtime.tools.arg_injection.inject_injected_args` so + config-load doesn't drag the live ``Session`` into every consumer. + """ + for key, path in v.items(): + if not key or not key.isidentifier(): + raise ValueError( + f"injected_args key {key!r} must be a non-empty " + f"Python identifier" + ) + if not isinstance(path, str) or not path.strip(): + raise ValueError( + f"injected_args[{key!r}] must be a non-empty dotted path" + ) + if "." not in path: + raise ValueError( + f"injected_args[{key!r}]={path!r} must be a dotted path " + f"(e.g. 'session.environment')" + ) + return v + @model_validator(mode="after") def _validate_terminal_tool_registry(self) -> "OrchestratorConfig": """Cross-field invariants for the terminal-tool registry. 
@@ -1584,7 +2285,11 @@ def _coerce_dedup(self) -> "AppConfig": if isinstance(self.dedup, DedupConfig): return self if isinstance(self.dedup, dict): - self.__dict__["dedup"] = DedupConfig(**self.dedup) + # ``BaseModel.__dict__`` is typed as ``MappingProxyType`` in + # the pydantic stub; the documented post-validator mutation + # path is direct ``__dict__`` assignment, which works at + # runtime (pydantic stores fields in a plain dict). + self.__dict__["dedup"] = DedupConfig(**self.dedup) # pyright: ignore[reportIndexIssue] return self raise ValueError( f"app.dedup must be a DedupConfig or dict; got " @@ -1625,8 +2330,9 @@ def _coerce_triggers(self) -> "AppConfig": ) coerced.append(cls(**raw)) # Pydantic v2 stores fields in ``__dict__``; assigning here is - # the documented way to mutate after validation. - self.__dict__["triggers"] = coerced + # the documented way to mutate after validation. (Stub types + # ``__dict__`` as MappingProxyType; runtime is a plain dict.) + self.__dict__["triggers"] = coerced # pyright: ignore[reportIndexIssue] return self @@ -1743,6 +2449,17 @@ class Session(BaseModel): # with a stale version raise ``StaleVersionError`` so the caller can # reload + retry. version: int = 1 + # Phase 11 (FOC-04): transient per-turn confidence hint set by the + # agent runner (graph.py / responsive.py) AFTER each + # _harvest_tool_calls_and_patches call so the gateway's should_gate + # boundary can apply low_confidence gating using whatever + # confidence the agent has emitted so far. Reset to ``None`` at + # turn start; never persisted (``Field(exclude=True)``). The + # framework treats ``None`` as "no signal yet" and does NOT fire a + # low_confidence gate -- this avoids a false-positive gate on the + # very first tool call of a turn before any envelope/tool-arg + # carrying confidence has surfaced. 
+ turn_confidence_hint: float | None = Field(default=None, exclude=True) # ------------------------------------------------------------------ # App-overridable agent-input formatter hook. @@ -2358,11 +3075,40 @@ class StubChatModel(BaseChatModel): """Deterministic chat model for tests/CI. Returns canned text per role. Optionally emits one tool call on first invocation if `tool_call_plan` is set. + + Phase 10 (FOC-03): also honours + ``llm.with_structured_output(AgentTurnOutput)`` so stub-driven tests + survive the runner's envelope contract. The structured response is + derived from the same canned text + a default 0.85 confidence; tests + that need a specific envelope shape can override + ``stub_envelope_confidence`` / ``stub_envelope_rationale`` / + ``stub_envelope_signal``. + + Phase 15 (LLM-COMPAT-01): ``langchain.agents.create_agent`` with + ``response_format=AgentTurnOutput`` (via ``AutoStrategy`` -> + ``ToolStrategy`` for non-native-structured-output models, including + this stub) injects ``AgentTurnOutput`` as a CALLABLE TOOL. The + agent loop only terminates when the LLM emits a tool call NAMED + ``AgentTurnOutput``. ``bind_tools`` records that envelope-tool name + so ``_generate`` can auto-emit a closing tool call after any + user-configured ``tool_call_plan`` is exhausted -- preserving the + pre-Phase-15 stub semantics (canned text + optional pre-scripted + tool calls) while satisfying the new tool-loop termination + contract. """ role: str = "default" canned_responses: dict[str, str] = Field(default_factory=dict) tool_call_plan: list[dict] | None = None + stub_envelope_confidence: float = 0.85 + stub_envelope_rationale: str = "stub envelope rationale" + stub_envelope_signal: str | None = None _called_once: bool = False + # Phase 15 (LLM-COMPAT-01): set by ``bind_tools`` when + # ``langchain.agents.create_agent`` injects a structured-output tool + # for ``AgentTurnOutput``. Holds the bare tool name (e.g. 
+ # ``"AgentTurnOutput"``) so ``_generate`` can emit a final + # envelope-shaped tool call to close the agent loop. + _envelope_tool_name: str | None = None @property def _llm_type(self) -> str: @@ -2376,6 +3122,26 @@ def _generate(self, messages: list[BaseMessage], stop: list[str] | None = None, for tc in self.tool_call_plan: tool_calls.append({"name": tc["name"], "args": tc.get("args", {}), "id": str(uuid4())}) self._called_once = True + elif self._envelope_tool_name is not None: + # Phase 15 (LLM-COMPAT-01): the tool_call_plan is exhausted + # (or wasn't configured) AND ``langchain.agents.create_agent`` + # has bound the AgentTurnOutput envelope as a tool. Emit a + # closing tool call so the loop terminates with a populated + # ``structured_response``. The args mirror the + # ``with_structured_output`` path's envelope construction so + # tests see the same confidence / rationale / signal regardless + # of whether the new tool-strategy or the legacy structured- + # output path is in play. + tool_calls.append({ + "name": self._envelope_tool_name, + "args": { + "content": text or ".", + "confidence": self.stub_envelope_confidence, + "confidence_rationale": self.stub_envelope_rationale, + "signal": self.stub_envelope_signal, + }, + "id": str(uuid4()), + }) msg = AIMessage(content=text, tool_calls=tool_calls) return ChatResult(generations=[ChatGeneration(message=msg)]) @@ -2384,25 +3150,217 @@ async def _agenerate(self, messages: list[BaseMessage], stop: list[str] | None = return self._generate(messages, stop, run_manager, **kwargs) def bind_tools(self, tools, *, tool_choice=None, **kwargs): - """No-op binder: stub emits tool calls only via `tool_call_plan`, not via real binding.""" + """Record the AgentTurnOutput envelope-tool name when present. + + Phase 15 (LLM-COMPAT-01): ``langchain.agents.create_agent`` with + ``response_format=AgentTurnOutput`` calls ``bind_tools(...)`` + with the user's tools PLUS the envelope-as-a-tool. 
We scan the + list for the AgentTurnOutput-shaped tool (matched by ``__name__`` + on Pydantic schemas, ``name`` on ``BaseTool`` instances, or the + ``"name"`` key on dict-shaped tool specs) and remember it on the + instance so ``_generate`` can close the agent loop with a + synthetic envelope tool call after any pre-scripted + ``tool_call_plan`` is exhausted. Tools bound by the framework + itself (real BaseTools the agent should call) flow through + unchanged -- the stub still emits them only via + ``tool_call_plan``. + """ + for t in tools or []: + name = ( + getattr(t, "__name__", None) + or getattr(t, "name", None) + or (isinstance(t, dict) and t.get("name")) + ) + if isinstance(name, str) and name == "AgentTurnOutput": + self._envelope_tool_name = name + break return self + # ``BaseChatModel.with_structured_output`` returns ``Runnable[..., dict | BaseModel]`` + # in the langchain stub; this stub override returns a deterministic + # ``_StructuredRunnable`` so tests can drive structured outputs + # without a live provider. Functionally a Runnable (it implements + # ``invoke`` + ``ainvoke``); the stub mismatch is cosmetic. + def with_structured_output(self, schema, *, include_raw: bool = False, **kwargs): # pyright: ignore[reportIncompatibleMethodOverride] + """Phase 10 (FOC-03): honour the structured-output pass. + + Historically (pre-Phase-15) the deprecated + ``langgraph.prebuilt.create_react_agent`` factory called this + after its tool loop completed. The current + ``langchain.agents.create_agent`` path uses a tool-strategy + binding instead (see ``bind_tools`` above), but providers and + test code that call ``with_structured_output`` directly still + get a deterministic schema instance. + + We return a Runnable-like that yields a valid ``schema`` + instance derived from the stub's canned text and the + per-instance envelope configuration. Tests can tune + ``stub_envelope_confidence`` etc. to drive gate / reconcile + paths. 
+ """ + text = self.canned_responses.get(self.role, f"[stub:{self.role}] no canned response") + confidence = self.stub_envelope_confidence + rationale = self.stub_envelope_rationale + signal = self.stub_envelope_signal + + class _StructuredRunnable: + def __init__(self, schema_cls): + self._schema = schema_cls + + def _build(self): + # Construct an instance of whatever schema was passed. + # Common case: AgentTurnOutput; permissive fallback handles + # other pydantic schemas the test may pass. + try: + return self._schema( + content=text or ".", + confidence=confidence, + confidence_rationale=rationale, + signal=signal, + ) + except Exception: + # Permissive fallback for unfamiliar schemas: try + # model_validate on a minimal dict. + return self._schema.model_validate({ + "content": text or ".", + "confidence": confidence, + "confidence_rationale": rationale, + "signal": signal, + }) -def _build_ollama_chat(provider: ProviderConfig, model_id: str, - temperature: float) -> BaseChatModel: - from langchain_ollama import ChatOllama - kwargs: dict[str, Any] = { - "base_url": provider.base_url or "https://ollama.com", - "model": model_id, - "temperature": temperature, - } + def invoke(self, *_args, **_kwargs): + return self._build() + + async def ainvoke(self, *_args, **_kwargs): + return self._build() + + return _StructuredRunnable(schema) + + +def _resolve_timeout( + provider: ProviderConfig, default: float, +) -> float: + """Resolve effective request timeout for a provider. + + Per-provider override wins; falls back to the framework default + (typically ``OrchestratorConfig.default_llm_request_timeout``). 
+ """ + if provider.request_timeout is not None: + return provider.request_timeout + return default + + +def _wrap_chat_with_timeout( + base: BaseChatModel, + provider_name: str, + model_id: str, + request_timeout: float, +) -> BaseChatModel: + """Wrap ``base`` so every ``ainvoke`` is bounded by + ``asyncio.wait_for(..., timeout=request_timeout)`` and raises + ``LLMTimeoutError(provider, model, elapsed_ms)`` on hang. + + The native langchain timeout knob (``request_timeout=`` on + openai/azure or ``client_kwargs={'timeout': ...}`` on ollama) is + honoured at the httpx layer; this wrapper guarantees the + framework-typed exception AND a hard ceiling even if the + underlying client hangs in a way httpx misses (e.g., post-headers + TCP read stall on a slow Ollama). D-13-04: subclassing + ``TimeoutError`` means ``policy._TRANSIENT_TYPES`` auto-classifies + the error as transient (zero edits to ``policy.py``). + """ + base_cls = type(base) + + class _Bounded(base_cls): # type: ignore[misc, valid-type] + async def ainvoke(self, *args: Any, **kwargs: Any) -> Any: + t0 = time.monotonic() + try: + return await asyncio.wait_for( + super().ainvoke(*args, **kwargs), + timeout=request_timeout, + ) + except (asyncio.TimeoutError, TimeoutError) as e: + if isinstance(e, LLMTimeoutError): + # Already typed; don't double-wrap. + raise + elapsed_ms = int((time.monotonic() - t0) * 1000) + raise LLMTimeoutError( + provider=provider_name, + model=model_id, + elapsed_ms=elapsed_ms, + ) from e + + # Reuse the live pydantic instance's state without re-running + # __init__ (which would re-init the underlying httpx clients). + bounded = _Bounded.model_construct(**base.model_dump()) + # Some langchain client classes initialise non-pydantic attrs + # (httpx clients, run_manager, etc.) inside __init__. Copy them + # through so the wrapped instance shares the same network state. 
+ for attr_name in ( + "_client", "_async_client", + "_async_httpx_client", "_sync_httpx_client", + "client", "async_client", + ): + if hasattr(base, attr_name): + try: + object.__setattr__( + bounded, attr_name, getattr(base, attr_name), + ) + except (AttributeError, TypeError): + # Slot-only or read-only attrs on some langchain + # versions -- the bounded instance will re-init on + # first use; not a correctness issue. + pass + return bounded + + +def _build_ollama_chat( + provider: ProviderConfig, model_id: str, temperature: float, + *, request_timeout: float, +) -> BaseChatModel: + from langchain_ollama import ChatOllama + + # Many Ollama models (gemma*, gpt-oss, ministral, etc.) don't support + # native function-calling, which is langchain-ollama's default method + # for ``with_structured_output``. Subclass to force + # ``method='json_schema'`` (uses Ollama's structured-output API) so + # Phase 10's ``response_format=AgentTurnOutput`` envelope actually + # round-trips instead of failing with ``OutputParserException`` + # when the LLM emits prose. + class _ChatOllamaJsonSchema(ChatOllama): # type: ignore[misc, valid-type] + def with_structured_output(self, schema, *, method=None, **kw): + return super().with_structured_output( + schema, method=method or "json_schema", **kw, + ) + + # Phase 13 (HARD-01): ChatOllama has NO native ``request_timeout`` + # field; the canonical incantation is ``client_kwargs={"timeout": ...}``, + # which propagates to the underlying httpx.AsyncClient. + client_kwargs: dict[str, Any] = {"timeout": request_timeout} api_key = provider.api_key or os.environ.get("OLLAMA_API_KEY") if api_key: - kwargs["client_kwargs"] = {"headers": {"Authorization": f"Bearer {api_key}"}} - return ChatOllama(**kwargs) + client_kwargs["headers"] = { + "Authorization": f"Bearer {api_key}", + } + # Phase 13 (HARD-05): base_url is now config-load-validated by + # ProviderConfig._validate_required_fields. 
NO fallback to a + # public Ollama URL (air-gap rule violation). + kwargs: dict[str, Any] = { + "base_url": provider.base_url, + "model": model_id, + "temperature": temperature, + "client_kwargs": client_kwargs, + } + base = _ChatOllamaJsonSchema(**kwargs) + return _wrap_chat_with_timeout( + base, "ollama", model_id, request_timeout, + ) -def _build_azure_chat(provider: ProviderConfig, model: ModelConfig) -> BaseChatModel: +def _build_azure_chat( + provider: ProviderConfig, model: ModelConfig, + *, request_timeout: float, +) -> BaseChatModel: from langchain_openai import AzureChatOpenAI if provider.endpoint is None: raise ValueError("azure_openai provider requires 'endpoint'") @@ -2411,24 +3369,49 @@ def _build_azure_chat(provider: ProviderConfig, model: ModelConfig) -> BaseChatM f"azure_openai model {model.model!r} requires 'deployment'" ) _ak = provider.api_key or os.environ.get("AZURE_OPENAI_KEY") - return AzureChatOpenAI( + # ``request_timeout`` is a runtime alias for ``timeout`` on + # AzureChatOpenAI (langchain-openai > 0.3 declares it via Pydantic + # ``Field(alias="timeout")``); the langchain stubs only expose + # ``timeout``, hence the stub gap. 
+ base = AzureChatOpenAI( azure_endpoint=provider.endpoint, api_version=provider.api_version or "2024-08-01-preview", azure_deployment=model.deployment, api_key=SecretStr(_ak) if _ak else None, temperature=model.temperature, + request_timeout=request_timeout, # pyright: ignore[reportCallIssue] -- Phase 13 (HARD-01) -- alias for ``timeout`` not in stub + ) + return _wrap_chat_with_timeout( + base, "azure_openai", model.model, request_timeout, ) def get_llm(cfg: LLMConfig, model_name: str | None = None, *, role: str = "default", stub_canned: dict[str, str] | None = None, - stub_tool_plan: list[dict] | None = None) -> BaseChatModel: + stub_tool_plan: list[dict] | None = None, + stub_envelope_confidence: float | None = None, + stub_envelope_rationale: str | None = None, + stub_envelope_signal: str | None = None, + default_llm_request_timeout: float = 120.0, + ) -> BaseChatModel: """Build a chat model by named entry from ``cfg.models``. ``model_name`` defaults to ``cfg.default``. Validation that the name exists is enforced by ``LLMConfig`` itself (model_validator), so a - missing name here means caller passed a typo — raise loudly. + missing name here means caller passed a typo -- raise loudly. + + Phase 10 (FOC-03): stub callers can now tune the canned envelope + (confidence / rationale / signal) so gate-trigger tests preserve their + pre-Phase-10 semantics by emitting a low-confidence envelope. + + Phase 13 (HARD-01): non-stub builds are bounded by an effective + ``request_timeout`` resolved as ``provider.request_timeout`` (per- + provider override) -> ``default_llm_request_timeout`` (framework + default; callers pass ``cfg.orchestrator.default_llm_request_timeout``). + The default keyword value (120.0) matches OrchestratorConfig's default + so test paths that build LLMs without an OrchestratorConfig in scope + still get a sane bound. 
""" name = model_name or cfg.default model = cfg.models.get(name) @@ -2440,44 +3423,127 @@ def get_llm(cfg: LLMConfig, model_name: str | None = None, *, provider = cfg.providers[model.provider] # validated at config load if provider.kind == "stub": - return StubChatModel( - role=role, - canned_responses=stub_canned or {}, - tool_call_plan=stub_tool_plan, - ) + kwargs: dict[str, Any] = { + "role": role, + "canned_responses": stub_canned or {}, + "tool_call_plan": stub_tool_plan, + } + if stub_envelope_confidence is not None: + kwargs["stub_envelope_confidence"] = stub_envelope_confidence + if stub_envelope_rationale is not None: + kwargs["stub_envelope_rationale"] = stub_envelope_rationale + if stub_envelope_signal is not None: + kwargs["stub_envelope_signal"] = stub_envelope_signal + return StubChatModel(**kwargs) + + effective = _resolve_timeout(provider, default_llm_request_timeout) + if provider.kind == "ollama": - return _build_ollama_chat(provider, model.model, model.temperature) + return _build_ollama_chat( + provider, model.model, model.temperature, + request_timeout=effective, + ) if provider.kind == "azure_openai": - return _build_azure_chat(provider, model) + return _build_azure_chat( + provider, model, request_timeout=effective, + ) + if provider.kind == "openai_compat": + return _build_openai_compat_chat( + provider, model, request_timeout=effective, + ) raise ValueError(f"Unknown provider kind: {provider.kind!r}") -def get_embedding(cfg: LLMConfig) -> Embeddings: - """Build the configured embedding model. Raises if ``cfg.embedding`` is None.""" +def _build_openai_compat_chat( + provider: ProviderConfig, model: ModelConfig, + *, request_timeout: float, +) -> BaseChatModel: + """Build a ``ChatOpenAI`` pointed at an OpenAI-compatible endpoint + (OpenRouter, Together, vLLM, etc.). Reuses langchain-openai's + ``ChatOpenAI`` with ``base_url=`` override and the provider's + ``api_key`` (resolved from env via the YAML loader). 
+ """ + from langchain_openai import ChatOpenAI + if provider.base_url is None: + raise ValueError( + "openai_compat provider requires 'base_url' " + "(e.g. https://openrouter.ai/api/v1)" + ) + if provider.api_key is None: + raise ValueError("openai_compat provider requires 'api_key'") + # See AzureChatOpenAI block above: ``request_timeout`` is a runtime + # alias for ``timeout`` not in the langchain stubs. + base = ChatOpenAI( + base_url=provider.base_url, + api_key=provider.api_key, + model=model.model, + temperature=model.temperature, + request_timeout=request_timeout, # pyright: ignore[reportCallIssue] -- Phase 13 (HARD-01) -- alias for ``timeout`` not in stub + ) + return _wrap_chat_with_timeout( + base, "openai_compat", model.model, request_timeout, + ) + + +def get_embedding( + cfg: LLMConfig, *, default_llm_request_timeout: float = 120.0, +) -> Embeddings: + """Build the configured embedding model. Raises if ``cfg.embedding`` is None. + + Phase 13 (HARD-01): same per-provider override -> framework default + timeout resolution as ``get_llm``. Embeddings traffic shares the + request_timeout knob with chat (see CONTEXT.md "Deferred Ideas" -- + splitting embedding timeout from chat is a future refinement). + + Note (Phase 13 review WR-01): unlike the chat builders -- which apply a + defence-in-depth ``asyncio.wait_for`` wrapper (``_wrap_chat_with_timeout``) + that guarantees a structured ``LLMTimeoutError`` with ``elapsed_ms`` even + on partial-byte stalls -- embeddings rely SOLELY on the underlying + httpx-layer timeout configured above (``client_kwargs={"timeout": ...}`` + for Ollama, ``request_timeout=`` for Azure). This asymmetry is a + deliberate scope choice tied to Phase 13 CONTEXT.md "Deferred Ideas" #4 + (splitting embeddings timeout from chat timeout). 
If embeddings need + stricter bounds than chat -- or if the httpx-layer timeout proves + insufficient against post-headers TCP read stalls on the embeddings + path the same way it can on chat -- a future phase can mirror + ``_wrap_chat_with_timeout`` for the embeddings public surface + (``aembed_query`` / ``aembed_documents``). + """ if cfg.embedding is None: raise ValueError("llm.embedding is not configured") provider = cfg.providers[cfg.embedding.provider] + effective = _resolve_timeout(provider, default_llm_request_timeout) if provider.kind == "ollama": from langchain_ollama import OllamaEmbeddings - kwargs: dict[str, Any] = { - "base_url": provider.base_url or "https://ollama.com", - "model": cfg.embedding.model, - } + # Phase 13 (HARD-01): OllamaEmbeddings has NO native + # ``request_timeout`` field; canonical incantation is + # ``client_kwargs={"timeout": ...}`` (same as ChatOllama). + client_kwargs: dict[str, Any] = {"timeout": effective} api_key = provider.api_key or os.environ.get("OLLAMA_API_KEY") if api_key: - kwargs["client_kwargs"] = {"headers": {"Authorization": f"Bearer {api_key}"}} - return OllamaEmbeddings(**kwargs) + client_kwargs["headers"] = { + "Authorization": f"Bearer {api_key}", + } + # Phase 13 (HARD-05): base_url config-load-validated; NO public fallback. + return OllamaEmbeddings( + base_url=provider.base_url, + model=cfg.embedding.model, + client_kwargs=client_kwargs, + ) if provider.kind == "azure_openai": from langchain_openai import AzureOpenAIEmbeddings if provider.endpoint is None: raise ValueError("azure_openai provider requires 'endpoint'") deployment = cfg.embedding.deployment or cfg.embedding.model _ak = provider.api_key or os.environ.get("AZURE_OPENAI_KEY") + # See chat builders above: ``request_timeout`` is a runtime + # alias for ``timeout`` not surfaced in the langchain-openai stub. 
return AzureOpenAIEmbeddings( azure_endpoint=provider.endpoint, api_version=provider.api_version or "2024-08-01-preview", azure_deployment=deployment, api_key=SecretStr(_ak) if _ak else None, + request_timeout=effective, # pyright: ignore[reportCallIssue] -- Phase 13 (HARD-01) -- alias for ``timeout`` not in stub ) raise ValueError( f"Embedding not supported for provider kind {provider.kind!r}" @@ -2694,12 +3760,14 @@ def build_embedder( ) if p.kind == "azure_openai": from langchain_openai import AzureOpenAIEmbeddings + # AzureOpenAIEmbeddings.api_key is typed as ``SecretStr | None`` + # (pydantic v2). Wrap the env-sourced str so the type matches. return AzureOpenAIEmbeddings( azure_deployment=cfg.deployment, model=cfg.model, azure_endpoint=p.endpoint, api_version=p.api_version, - api_key=p.api_key, + api_key=SecretStr(p.api_key) if p.api_key else None, ) if p.kind == "stub": return _StubEmbeddings(dim=cfg.dim) @@ -2721,10 +3789,13 @@ def _faiss_distance_strategy(name: str): def _pgvector_distance_strategy(name: str): from langchain_postgres.vectorstores import DistanceStrategy + # ``langchain_postgres.DistanceStrategy.INNER_PRODUCT`` exists at + # runtime (verified via the live module) but the langchain-postgres + # stubs only expose ``COSINE`` / ``EUCLIDEAN``. return { "cosine": DistanceStrategy.COSINE, "euclidean": DistanceStrategy.EUCLIDEAN, - "inner_product": DistanceStrategy.INNER_PRODUCT, + "inner_product": DistanceStrategy.INNER_PRODUCT, # pyright: ignore[reportAttributeAccessIssue] }[name] @@ -2800,7 +3871,7 @@ def distance_to_similarity(distance: float, strategy: str) -> float: # ====== module: runtime/storage/history_store.py ====== -StateT = TypeVar("StateT", bound=BaseModel) +StateT = TypeVar("StateT", bound=Session) # Allowed ``filter_kwargs`` keys = IncidentRow column names. # Computed at module load so we can produce a precise error for typos. 
@@ -2852,7 +3923,7 @@ def _row_to_incident(self, row: IncidentRow) -> StateT: return self._converter._row_to_incident(row) def _load(self, incident_id: str) -> StateT: - with Session(self.engine) as session: + with SqlaSession(self.engine) as session: row = session.get(IncidentRow, incident_id) if row is None: raise FileNotFoundError(incident_id) @@ -2863,7 +3934,7 @@ def _list_filtered(self, *, filter_kwargs: Mapping[str, Any]) -> list[StateT]: Pure SQL prefilter — used by both vector and keyword paths. """ - with Session(self.engine) as session: + with SqlaSession(self.engine) as session: stmt = select(IncidentRow).where(IncidentRow.deleted_at.is_(None)) for col, val in filter_kwargs.items(): stmt = stmt.where(getattr(IncidentRow, col) == val) @@ -2920,7 +3991,12 @@ def find_similar( threshold = self.similarity_threshold if threshold is None else threshold vec = self.embedder.embed_query(query) - raw = self.vector_store.similarity_search_with_score_by_vector(vec, k=limit * 4) + # ``similarity_search_with_score_by_vector`` is provided by the + # concrete FAISS / pgvector / langchain-postgres backends (and + # validated by ``runtime.storage.vector.build_vector_store``) + # but the abstract ``langchain_core.vectorstores.VectorStore`` + # base class does not declare it. 
+ raw = self.vector_store.similarity_search_with_score_by_vector(vec, k=limit * 4) # pyright: ignore[reportAttributeAccessIssue] out: list[tuple[StateT, float]] = [] for doc, distance in raw: score = distance_to_similarity(float(distance), self.distance_strategy) @@ -2957,7 +4033,7 @@ def _keyword_similar(self, *, query, filter_kwargs, status_filter, threshold, li if getattr(i, "status", None) == status_filter and getattr(i, "deleted_at", None) is None ] - def _ef(i, key, default=""): + def _ef(i, key, default: Any = ""): """Read a field from typed attribute first, then extra_fields.""" val = getattr(i, key, None) if val: @@ -2989,12 +4065,16 @@ def _ef(i, key, default=""): _INC_ID_RE = re.compile(r"^INC-\d{8}-\d{3}$") _SESSION_ID_RE = re.compile(r"^[A-Za-z][A-Za-z0-9_-]*-\d{8}-\d{3}$") -# StateT is bound to ``BaseModel`` so callers can pass either bare -# ``Session`` or any pydantic subclass. The resolver in -# :mod:`runtime.state_resolver` enforces a ``runtime.state.Session`` -# subclass at config time; the looser bound here keeps the storage -# layer usable by ad-hoc tests that build a ``BaseModel`` directly. -StateT = TypeVar("StateT", bound=BaseModel) +# StateT is bound to ``Session`` (not bare ``BaseModel``) because the +# store body reads typed fields (``id``, ``status``, ``version``, +# ``updated_at`` …) that are declared on ``runtime.state.Session`` and +# not on ``pydantic.BaseModel``. The resolver in +# :mod:`runtime.state_resolver` already enforces a ``Session`` subclass +# at config time, and every existing caller (production + tests) passes +# either bare ``Session`` or a ``Session`` subclass — see +# Phase 19 / HARD-03 for the rationale (was: ``bound=BaseModel`` which +# made pyright flag every typed-field access). 
+StateT = TypeVar("StateT", bound=Session) def _embed_source(inc: BaseModel) -> str: @@ -3192,7 +4272,12 @@ def save(self, incident: StateT) -> None: raise ValueError( f"Invalid incident id {incident.id!r}; expected PREFIX-YYYYMMDD-NNN" ) - incident.updated_at = _iso(_now()) + # ``_iso(_now())`` returns ``str`` here -- the input datetime is + # never None -- but the helper's signature is the broader + # ``Optional[str]``. ``or ""`` keeps pyright + the typed + # ``Session.updated_at: str`` field consistent without changing + # behaviour (real value is always present). + incident.updated_at = _iso(_now()) or "" sess = incident # local alias — avoids repeating the domain token in new code expected_version = getattr(sess, "version", 1) # Bump in-memory BEFORE building the row dict so the persisted @@ -3337,12 +4422,16 @@ def _persist_vector(self) -> None: from pathlib import Path folder = Path(self.vector_path) folder.mkdir(parents=True, exist_ok=True) - self.vector_store.save_local( + # ``save_local`` is FAISS-specific; the runtime ``hasattr`` guard + # at the top of this method already ensured this codepath only + # runs against FAISS (other VectorStores omit the method). + # ``langchain_core.vectorstores.VectorStore`` doesn't declare it. 
+ self.vector_store.save_local( # pyright: ignore[reportAttributeAccessIssue] folder_path=str(folder), index_name=self.vector_index_name, ) - def _add_vector(self, inc: BaseModel) -> None: + def _add_vector(self, inc: Session) -> None: if self.vector_store is None or self.embedder is None: return text = _embed_source(inc) @@ -3355,7 +4444,7 @@ def _add_vector(self, inc: BaseModel) -> None: ) self._persist_vector() - def _refresh_vector(self, inc: BaseModel, *, prior_text: str) -> None: + def _refresh_vector(self, inc: Session, *, prior_text: str) -> None: if self.vector_store is None or self.embedder is None: return text = _embed_source(inc) @@ -3530,7 +4619,13 @@ def _row_to_incident(self, row: IncidentRow) -> StateT: merged_extras[k] = v kwargs["extra_fields"] = merged_extras - return self._state_cls(**kwargs) + # ``kwargs`` is built up from heterogeneous sources (typed row + # columns + ``extra_fields`` blob) so pyright infers each value + # as ``object``. At runtime each entry matches the concrete + # ``state_cls`` field type by construction (the row schema is + # the source of truth); pydantic's own validation rejects bad + # shapes at the constructor. + return self._state_cls(**kwargs) # pyright: ignore[reportArgumentType] def _incident_to_row_dict(self, inc: StateT) -> dict: """Serialize a state instance into a row-shaped dict. @@ -3633,6 +4728,204 @@ def _field(name: str, default=None): "version": getattr(inc, "version", 1), } +# ====== module: runtime/storage/event_log.py ====== + +@dataclass(frozen=True) +class SessionEvent: + """Immutable view of one row in the event log.""" + seq: int + session_id: str + kind: str + payload: dict + ts: str + + +def _now() -> str: + return datetime.now(timezone.utc).isoformat() + + +class EventLog: + """Append-only log of session events. + + Events drive the status finalizer's inference (e.g. a registered + ```` event appearing in the log -> session reached + the corresponding terminal status). 
They are never mutated or + deleted. + """ + + def __init__(self, *, engine: Engine) -> None: + self.engine = engine + + def append(self, session_id: str, kind: str, payload: dict) -> None: + """Append a new event row. Never mutates existing rows.""" + with Session(self.engine) as s: + with s.begin(): + s.add(SessionEventRow( + session_id=session_id, + kind=kind, + payload=dict(payload), + ts=_now(), + )) + + def iter_for(self, session_id: str) -> Iterator[SessionEvent]: + """Yield events for ``session_id`` in monotonic insertion order.""" + with Session(self.engine) as s: + stmt = ( + select(SessionEventRow) + .where(SessionEventRow.session_id == session_id) + .order_by(SessionEventRow.seq) + ) + for row in s.execute(stmt).scalars(): + yield SessionEvent( + seq=row.seq, + session_id=row.session_id, + kind=row.kind, + payload=row.payload, + ts=row.ts, + ) + +# ====== module: runtime/storage/migrations.py ====== + +_FORWARD_COLUMNS: list[tuple[str, str]] = [ + ("parent_session_id", "VARCHAR"), # dedup linkage + ("dedup_rationale", "TEXT"), # LLM rationale + ("extra_fields", "JSON"), # generic round-trip tunnel +] +_FORWARD_INDEXES: list[tuple[str, str, str]] = [ + # (index_name, table, column) — mirrors models.IncidentRow.__table_args__. + ("ix_incidents_parent_session_id", "incidents", "parent_session_id"), +] + +# Default audit fields. Mirrors the Pydantic defaults on +# :class:`runtime.state.ToolCall`. Keep these in sync — a divergence +# means rows hydrated post-migration would carry different defaults +# than rows hydrated via the Pydantic constructor, which would surface +# as subtle test flakes long after the migration ran. +_AUDIT_DEFAULTS: dict[str, Any] = { + "status": "executed", + "risk": None, + "approver": None, + "approved_at": None, + "approval_rationale": None, +} + + +def _fill_audit_fields(tc: dict[str, Any]) -> bool: + """Mutate ``tc`` in place, filling any missing audit field with its + default. Returns ``True`` when at least one key was added. 
+ + Existing values (including explicit ``None`` already on the row) + are left untouched — this is the idempotency guarantee. + """ + changed = False + for key, default in _AUDIT_DEFAULTS.items(): + if key not in tc: + tc[key] = default + changed = True + return changed + + +def _normalise_tool_calls_list( + tool_calls: Iterable[Any] | None, +) -> tuple[list[Any], bool]: + """Walk a session's tool_calls JSON list, fill missing audit fields. + + Returns ``(new_list, changed)``. Non-dict entries (corrupt rows) + are passed through unchanged — the migration is not a validator. + """ + if not tool_calls: + return [], False + new: list[Any] = [] + changed = False + for tc in tool_calls: + if isinstance(tc, dict): + # Copy so we don't mutate caller-owned data accidentally. + tc_copy = dict(tc) + if _fill_audit_fields(tc_copy): + changed = True + new.append(tc_copy) + else: + new.append(tc) + return new, changed + + +def migrate_tool_calls_audit(engine: Engine) -> dict[str, int]: + """Walk every session's ``tool_calls`` and fill missing audit fields. + + Idempotent — running on a freshly-migrated DB is a no-op. + + Returns a small stats dict:: + + {"sessions_scanned": N, "sessions_updated": M, "rows_filled": K} + + where ``rows_filled`` is the count of individual ToolCall entries + that received at least one default. Useful for ops dashboards and + post-migration verification. + """ + scanned = 0 + updated = 0 + filled = 0 + with SqlSession(engine) as session: + rows = session.query(IncidentRow).all() + for row in rows: + scanned += 1 + new_list, changed = _normalise_tool_calls_list(row.tool_calls) + if changed: + # Count individual entries that gained at least one + # field. Cheap re-walk — rows.tool_calls is already in + # memory. 
+ for old, new in zip(row.tool_calls or [], new_list): + if isinstance(old, dict) and isinstance(new, dict): + if any(k not in old for k in _AUDIT_DEFAULTS): + filled += 1 + row.tool_calls = new_list + updated += 1 + if updated: + session.commit() + return { + "sessions_scanned": scanned, + "sessions_updated": updated, + "rows_filled": filled, + } + + +def migrate_add_session_columns(engine: Engine) -> dict[str, int]: + """Add post-initial columns to ``incidents`` if missing. Idempotent. + + Older on-disk databases may lack ``extra_fields``, + ``parent_session_id``, or ``dedup_rationale``; SQLAlchemy's read-side + query then errors with ``no such column``. This walker uses + ``PRAGMA table_info`` (via SQLAlchemy's ``inspect``) to detect + missing columns and adds each one nullable. Running on a freshly- + migrated DB is a no-op. + + Returns ``{"columns_added": N, "indexes_added": M}``. + """ + inspector = inspect(engine) + if "incidents" not in inspector.get_table_names(): + # Fresh DB; ``Base.metadata.create_all`` already produced the + # full schema. Nothing to backfill. + return {"columns_added": 0, "indexes_added": 0} + existing_cols = {c["name"] for c in inspector.get_columns("incidents")} + existing_idx = {i["name"] for i in inspector.get_indexes("incidents")} + added_cols = 0 + added_idx = 0 + with engine.begin() as conn: + for col, sql_type in _FORWARD_COLUMNS: + if col not in existing_cols: + conn.execute(text(f"ALTER TABLE incidents ADD COLUMN {col} {sql_type}")) + added_cols += 1 + for idx_name, table, col in _FORWARD_INDEXES: + if idx_name in existing_idx: + continue + # If the column itself was just added (or already present) + # the index is safe to create now. 
+ cols_after = {c["name"] for c in inspect(conn).get_columns(table)} + if col in cols_after: + conn.execute(text(f"CREATE INDEX {idx_name} ON {table} ({col})")) + added_idx += 1 + return {"columns_added": added_cols, "indexes_added": added_idx} + # ====== module: runtime/mcp_loader.py ====== @dataclass @@ -3755,83 +5048,3150 @@ def build_fastmcp_client(server_cfg: MCPServerConfig): if server_cfg.transport == "stdio": if not server_cfg.command: raise ValueError( - f"stdio server '{server_cfg.name}' missing 'command'" + f"stdio server '{server_cfg.name}' missing 'command'" + ) + return Client( + {"command": server_cfg.command[0], "args": server_cfg.command[1:]} + ) + raise ValueError(f"Unknown transport: {server_cfg.transport}") + + +async def _load_in_process(server_cfg: MCPServerConfig, + stack: AsyncExitStack) -> list[BaseTool]: + if server_cfg.module is None: + raise ValueError(f"in_process server '{server_cfg.name}' missing 'module'") + mod = importlib.import_module(server_cfg.module) + fmcp = getattr(mod, "mcp", None) + if fmcp is None: + raise ValueError(f"Module {server_cfg.module} has no 'mcp' (FastMCP instance)") + # FastMCP exposes tools as functions; convert to langchain tools via adapter. + # We use the in-memory client transport. The client is registered into the + # caller's exit stack so its session/transport stays open while the loaded + # tools are in use. + from fastmcp import Client + client = Client(fmcp) + await stack.enter_async_context(client) + tools = await load_mcp_tools(client.session) + # Rewrite each tool's .name to ":" for LLM disambiguation. 
+ for t in tools: + original_name = t.name + t.name = f"{server_cfg.name}:{original_name}" + t._original_mcp_name = original_name # type: ignore[attr-defined] + return tools + + +async def _load_remote(server_cfg: MCPServerConfig, + stack: AsyncExitStack) -> list[BaseTool]: + from fastmcp import Client + if server_cfg.transport in ("http", "sse"): + if not server_cfg.url: + raise ValueError(f"remote server '{server_cfg.name}' missing 'url'") + client = Client(server_cfg.url, headers=server_cfg.headers or None) + elif server_cfg.transport == "stdio": + if not server_cfg.command: + raise ValueError(f"stdio server '{server_cfg.name}' missing 'command'") + client = Client({"command": server_cfg.command[0], "args": server_cfg.command[1:]}) + else: + raise ValueError(f"Unknown transport: {server_cfg.transport}") + await stack.enter_async_context(client) + tools = await load_mcp_tools(client.session) + # Rewrite each tool's .name to ":" for LLM disambiguation. + for t in tools: + original_name = t.name + t.name = f"{server_cfg.name}:{original_name}" + t._original_mcp_name = original_name # type: ignore[attr-defined] + return tools + + +async def load_tools(cfg: MCPConfig, stack: AsyncExitStack) -> ToolRegistry: + """Load all enabled MCP servers and return a :class:`ToolRegistry`. + + The caller MUST pass an already-entered :class:`AsyncExitStack`. Each + FastMCP ``Client`` is registered into it; the caller controls lifetime via + ``await stack.aclose()``. 
+ """ + registry = ToolRegistry() + for server_cfg in cfg.servers: + if not server_cfg.enabled: + continue + if server_cfg.transport == "in_process": + tools = await _load_in_process(server_cfg, stack) + else: + tools = await _load_remote(server_cfg, stack) + for t in tools: + original = getattr(t, "_original_mcp_name", t.name) + registry.add(ToolEntry( + name=original, description=t.description or "", + server=server_cfg.name, category=server_cfg.category, tool=t, + )) + return registry + +# ====== module: runtime/service.py ====== + +_log = logging.getLogger("runtime.service") + +T = TypeVar("T") + + +@dataclass +class _ActiveSession: + """In-memory metadata for an in-flight session. + + Lives in ``OrchestratorService._registry``; mutated only on the + loop thread so the dict itself needs no thread lock. Snapshots are + produced via :meth:`OrchestratorService.list_active_sessions`, + which submits a coroutine to the loop and returns a list of plain + dicts to the calling thread. + """ + + session_id: str + started_at: str + status: str = "running" + current_agent: str | None = None + task: asyncio.Task | None = None + + +def _utc_iso_now() -> str: + return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") + + +class SessionCapExceeded(RuntimeError): + """Raised by ``start_session`` when the service is already running + ``max_concurrent_sessions`` sessions. + + Fail fast, do not queue. Callers (Streamlit, FastAPI handlers) + catch this and surface a clear error — Streamlit shows a toast; + the HTTP layer translates it to a 429 with ``Retry-After``. + """ + + def __init__(self, cap: int) -> None: + super().__init__( + f"OrchestratorService at capacity ({cap} concurrent); " + f"reject incoming start_session" + ) + self.cap = cap + + +class OrchestratorService: + """Process-singleton orchestrator service. + + Surface: construction, singleton accessor, ``start()`` / + ``shutdown()``, coroutine submission bridge, and the shared MCP + client pool. 
+ + Thread-safety (HARD-06): ``get_or_create()`` and + ``_reset_singleton()`` serialise singleton mutation through a + class-level ``threading.Lock``. Concurrent first-callers + (Streamlit warmup + FastAPI startup hook racing during process + boot) all observe the same instance — the loser of the race blocks + on the lock briefly, then short-circuits on the + ``_instance is None`` check inside the critical section. + """ + + # Class-level singleton state. Guarded by ``_lock`` so concurrent + # ``get_or_create()`` callers can't double-construct the service. + # Reset on ``shutdown()`` via :meth:`_reset_singleton`. + _lock: threading.Lock = threading.Lock() + _instance: "OrchestratorService | None" = None + + def __init__( + self, + cfg: AppConfig, + max_concurrent_sessions: int | None = None, + ) -> None: + self.cfg = cfg + # Resource cap. Prefer the explicit constructor arg; fall back + # to ``cfg.runtime.max_concurrent_sessions``. Tests mutate this + # attribute directly to drive cap behaviour deterministically. + self.max_concurrent_sessions: int = ( + max_concurrent_sessions + if max_concurrent_sessions is not None + else cfg.runtime.max_concurrent_sessions + ) + self._loop: asyncio.AbstractEventLoop | None = None + self._thread: threading.Thread | None = None + self._started = threading.Event() + # Shared MCP client pool — built lazily on first ``get_mcp_client`` + # so processes that never touch MCP pay zero startup cost. All + # mutations of ``_mcp_clients`` / ``_mcp_locks`` happen on the + # background loop, so the dicts themselves don't need a thread + # lock. + self._mcp_stack: AsyncExitStack | None = None + self._mcp_clients: dict[str, Any] = {} + self._mcp_locks: dict[str, asyncio.Lock] = {} + # Per-server-name asyncio.Lock guarding lazy build. Created on the + # loop the first time the server is requested. 
+ self._mcp_build_locks: dict[str, asyncio.Lock] = {} + # Shared Orchestrator (lazy-built on first session start) and + # the in-flight session registry. The registry dict itself is + # only mutated from the loop thread (writers go through + # ``submit_and_wait``); readers also hop through the loop so the + # snapshot is point-in-time consistent with concurrent mutators. + self._orch: Any | None = None + self._registry: dict[str, _ActiveSession] = {} + # Lazily-built lock for serialising orchestrator construction + # under concurrent ``start_session`` calls. Created on the loop. + self._orch_build_lock: asyncio.Lock | None = None + # Pending-approval timeout watchdog. Started in ``start()`` iff + # ``cfg.runtime.gateway`` is configured; otherwise None and the + # lifecycle hooks are no-ops. + self._approval_watchdog: Any | None = None + + @classmethod + def get_or_create(cls, cfg: AppConfig) -> "OrchestratorService": + """Return the process-singleton service, building it on first call. + + Subsequent calls ignore the supplied ``cfg`` and return the + existing instance — there is exactly one orchestrator service per + Python process. To rebuild with a new config, call + ``shutdown()`` first. + + Thread-safe (HARD-06): the check-and-construct pair runs inside + a class-level ``threading.Lock``. A concurrent second caller + either blocks until the first caller's ``__init__`` returns and + then short-circuits on the ``_instance is not None`` check, or + wins the race and constructs alone — no double construction. + """ + with cls._lock: + if cls._instance is None: + cls._instance = cls(cfg) + return cls._instance + + def start(self) -> None: + """Spin up the background thread + asyncio loop. + + Idempotent: a no-op if the loop is already running. Blocks until + the background thread reports the loop is ready (5s timeout) so + callers can ``submit()`` immediately after ``start()`` returns. 
+ """ + if self._thread is not None and self._thread.is_alive(): + return + self._started.clear() + self._loop = asyncio.new_event_loop() + self._thread = threading.Thread( + target=self._run_loop, + name="OrchestratorService", + daemon=True, + ) + self._thread.start() + if not self._started.wait(timeout=5.0): + raise RuntimeError("OrchestratorService loop failed to start within 5s") + # Arm the pending-approval watchdog iff a gateway is configured. + # The watchdog is harmless when no high-risk tool calls ever + # fire (it scans the empty registry), but skipping the start + # when the gateway is off keeps process startup quiet for apps + # that have not opted into HITL. + gateway_cfg = getattr(self.cfg.runtime, "gateway", None) + if gateway_cfg is not None: + + + timeout_s = getattr( + gateway_cfg, "approval_timeout_seconds", 3600, + ) + self._approval_watchdog = ApprovalWatchdog( + self, + approval_timeout_seconds=timeout_s, + ) + self._approval_watchdog.start(self._loop) + + def _run_loop(self) -> None: + assert self._loop is not None + asyncio.set_event_loop(self._loop) + self._started.set() + try: + self._loop.run_forever() + finally: + # Drain any remaining tasks before closing so no coroutine is + # left dangling without a chance to clean up. + try: + pending = asyncio.all_tasks(loop=self._loop) + for task in pending: + task.cancel() + if pending: + self._loop.run_until_complete( + asyncio.gather(*pending, return_exceptions=True) + ) + finally: + self._loop.close() + + def submit( + self, coro: Awaitable[T] + ) -> concurrent.futures.Future[T]: + """Submit a coroutine to the background loop from any thread. + + Returns a ``concurrent.futures.Future`` whose ``.result()`` blocks + the calling thread until the coroutine resolves on the loop. Safe + to call concurrently from multiple threads. 
def submit_and_wait(
    self, coro: Awaitable[T], timeout: float | None = None
) -> T:
    """Run ``coro`` on the service loop and block until it has a result.

    Intended for synchronous callers. The coroutine is handed to
    :meth:`submit`, and the calling thread then waits on the returned
    ``concurrent.futures.Future``.

    Args:
        coro: Coroutine to execute on the background loop.
        timeout: Maximum seconds to wait; ``None`` waits indefinitely.

    Returns:
        Whatever the coroutine returns.

    Raises:
        RuntimeError: from :meth:`submit` when the loop is missing or not
            running.
        concurrent.futures.TimeoutError: if no result arrives within
            ``timeout`` seconds.

    WARNING: never call this from code that is itself running on the
    service loop. The caller would block the very loop that has to run
    ``coro`` — a deadlock. Async callers should use :meth:`submit_async`.
    """
    pending = self.submit(coro)
    return pending.result(timeout=timeout)
async def get_mcp_client(self, server_name: str) -> Any:
    """Return the shared FastMCP client for ``server_name``, building it on
    first request.

    Construction is serialised through a per-server ``asyncio.Lock`` so two
    coroutines racing for the same server do not both build a client. A
    built client is cached in ``self._mcp_clients`` and entered into the
    shared ``AsyncExitStack`` (``self._mcp_stack``), which is unwound by
    ``_close_mcp_pool`` during :meth:`shutdown`. A companion call lock is
    stored in ``self._mcp_locks`` for :meth:`lock_for`.

    Args:
        server_name: Name of a server declared in ``cfg.mcp.servers``.

    Returns:
        The cached or newly built client.

    Raises:
        KeyError: if no client is cached for ``server_name`` and it is not
            declared in ``cfg.mcp.servers``.
    """
    # Fast path: a client is only cached after it has been fully entered
    # into the exit stack, so it is safe to hand out without the lock.
    if server_name in self._mcp_clients:
        return self._mcp_clients[server_name]
    # Validate BEFORE allocating a build lock. Otherwise every lookup of
    # an undeclared name would leave a permanent entry in
    # ``_mcp_build_locks`` even though the call ends in ``KeyError``.
    server_cfg = next(
        (s for s in self.cfg.mcp.servers if s.name == server_name),
        None,
    )
    if server_cfg is None:
        raise KeyError(
            f"MCP server {server_name!r} not declared in cfg.mcp.servers"
        )
    # Build-lock dict mutation happens on whichever loop runs this
    # coroutine; no await separates the lookup from the insert.
    build_lock = self._mcp_build_locks.get(server_name)
    if build_lock is None:
        build_lock = asyncio.Lock()
        self._mcp_build_locks[server_name] = build_lock
    async with build_lock:
        # Re-check: another coroutine may have finished the build while
        # this one was waiting for the lock.
        if server_name in self._mcp_clients:
            return self._mcp_clients[server_name]
        if self._mcp_stack is None:
            self._mcp_stack = AsyncExitStack()
            await self._mcp_stack.__aenter__()
        client = build_fastmcp_client(server_cfg)
        await self._mcp_stack.enter_async_context(client)
        self._mcp_clients[server_name] = client
        self._mcp_locks[server_name] = asyncio.Lock()
        return client
def start_session(
    self,
    *,
    query: str = "",
    state_overrides: dict | None = None,
    environment: str | None = None,
    submitter: dict | None = None,
    reporter_id: str | None = None,
    reporter_team: str | None = None,
    trigger: Any | None = None,
) -> str:
    """Start a new agent session. Returns the session id immediately.

    The session row is created (and the id minted) on the service loop
    before this method returns, so the caller gets a stable handle. The
    graph run itself is launched as an ``asyncio.Task`` on the same loop
    and keeps running in the background — the caller does **not** block
    on it. Use :meth:`list_active_sessions` to observe in-flight runs.

    ``state_overrides`` and ``submitter`` are free-form dicts. This method
    reads only ``environment`` from the resolved overrides (default
    ``""``) and ``id`` / ``team`` from the resolved submitter (defaults
    ``"user-mock"`` / ``"platform"``) and passes them to
    ``orch.store.create``.

    Legacy kwargs are folded in by the coercion helpers:
      * ``environment`` -> ``state_overrides``
      * ``reporter_id`` / ``reporter_team`` -> ``submitter``

    When ``trigger`` is given, its ``name``, ``transport``, ``target_app``
    and a ``received_at`` timestamp are stored under
    ``inc.findings["trigger"]`` and saved before the graph starts.

    The registry entry is removed by a ``Task.add_done_callback`` when the
    task finishes for any reason, so a crashed session does not leave a
    stale entry behind.

    Returns:
        The id of the newly created session row.

    Raises:
        SessionCapExceeded: if the in-flight registry is already at
            ``self.max_concurrent_sessions``.
        RuntimeError: if the service loop is missing or not running.
        concurrent.futures.TimeoutError: if scheduling takes longer than
            30 seconds.
    """
    # Resolve the generic ``submitter`` and ``state_overrides`` once
    # on the caller's thread — any warning emitted by the coercion
    # helpers fires here, not deep inside the loop's ``_scheduler``.
    resolved_overrides = _coerce_state_overrides(
        state_overrides, environment,
    )
    resolved_submitter = _coerce_submitter(
        submitter, reporter_id, reporter_team
    )
    sub_id = (resolved_submitter or {}).get("id", "user-mock")
    sub_team = (resolved_submitter or {}).get("team", "platform")
    env = (resolved_overrides or {}).get("environment", "")

    def _enforce_cap() -> None:
        # Fail-fast; the exception propagates through
        # ``submit_and_wait`` -> ``Future.result()`` to the caller.
        if len(self._registry) >= self.max_concurrent_sessions:
            raise SessionCapExceeded(self.max_concurrent_sessions)

    async def _scheduler() -> str:
        # Cheap early rejection before (possibly) building the
        # orchestrator.
        _enforce_cap()
        orch = await self._ensure_orchestrator()
        # Re-check after the await. While the very first orchestrator
        # build is in flight, other ``_scheduler`` coroutines run on the
        # loop, pass the early check against a still-empty registry and
        # queue on the build lock. Only a check with no await between it
        # and the registry insert below actually bounds the registry.
        _enforce_cap()
        # Allocate the row (and its id) on the loop so the caller gets a
        # stable id back. Registration happens here, before the task is
        # created, so ``list_active_sessions`` sees the entry
        # immediately.
        inc = orch.store.create(
            query=query,
            environment=env,
            reporter_id=sub_id,
            reporter_team=sub_team,
        )
        session_id = inc.id
        # Stamp trigger provenance onto the row before the graph runs so
        # a crash mid-graph still leaves an audit trail.
        if trigger is not None:
            try:
                received_at = trigger.received_at.strftime(
                    "%Y-%m-%dT%H:%M:%SZ"
                )
            except Exception:  # noqa: BLE001
                # Missing / non-datetime ``received_at``: fall back to now.
                received_at = _utc_iso_now()
            inc.findings["trigger"] = {
                "name": getattr(trigger, "name", None),
                "transport": getattr(trigger, "transport", None),
                "target_app": getattr(trigger, "target_app", None),
                "received_at": received_at,
            }
            orch.store.save(inc)
        entry = _ActiveSession(
            session_id=session_id,
            started_at=_utc_iso_now(),
        )
        self._registry[session_id] = entry

        async def _run() -> None:
            # Fail-fast on contention: if another task already holds the
            # session lock, refuse the new turn immediately.
            if orch._locks.is_locked(session_id):
                raise SessionBusy(session_id)
            # Hold the per-session lock for the full graph turn.
            async with orch._locks.acquire(session_id):
                try:
                    await orch.graph.ainvoke(
                        GraphState(
                            session=inc,
                            next_route=None,
                            last_agent=None,
                            error=None,
                        ),
                        config=orch._thread_config(session_id),
                    )
                except asyncio.CancelledError:
                    raise
                except Exception as exc:  # noqa: BLE001
                    # ``GraphInterrupt`` is a pending-approval pause, not
                    # a failure: re-raise it so Task observers still see
                    # it, but skip the status='error' write below.
                    try:
                        from langgraph.errors import GraphInterrupt
                        if isinstance(exc, GraphInterrupt):
                            raise
                    except ImportError:  # pragma: no cover
                        pass
                    # Mark the registry entry so a concurrent snapshot
                    # observes the failure before the done-callback
                    # evicts it. The exception itself stays on the task.
                    e = self._registry.get(session_id)
                    if e is not None:
                        e.status = "error"
                    raise

        task = asyncio.create_task(_run(), name=f"session:{session_id}")
        entry.task = task

        # Eviction is loop-local: ``add_done_callback`` fires on the
        # loop thread, so the dict mutation is single-threaded.
        def _evict(_t: asyncio.Task) -> None:
            self._registry.pop(session_id, None)

        task.add_done_callback(_evict)
        return session_id

    return self.submit_and_wait(_scheduler(), timeout=30.0)
def list_active_sessions(self) -> list[dict[str, Any]]:
    """Return a snapshot of the sessions currently in the registry.

    The registry is read by a coroutine submitted to the service loop
    through ``submit_and_wait`` (5 second timeout), so the read runs on
    the same loop as the code that mutates the registry. The result is
    made of plain dicts, so callers hold no asyncio objects afterwards.

    Returns:
        One dict per registry entry with the keys ``session_id``,
        ``status``, ``started_at`` and ``current_agent``; an empty list
        when the registry is empty.
    """

    async def _collect() -> list[dict[str, Any]]:
        rows: list[dict[str, Any]] = []
        for entry in self._registry.values():
            rows.append(
                {
                    "session_id": entry.session_id,
                    "status": entry.status,
                    "started_at": entry.started_at,
                    "current_agent": entry.current_agent,
                }
            )
        return rows

    return self.submit_and_wait(_collect(), timeout=5.0)
+ if loop.is_running() and self._registry: + try: + fut = asyncio.run_coroutine_threadsafe( + self._cancel_all_sessions(), loop + ) + fut.result(timeout=timeout) + except Exception: # noqa: BLE001 + # Best-effort: a stuck task that ignores cancellation must + # not block the loop teardown below. Surface for diagnosis. + _log.warning( + "shutdown: cancel_all_sessions failed", + exc_info=True, + ) + # Close the shared orchestrator on the loop, releasing its + # checkpointer connection / MCP exit-stack. + if loop.is_running() and self._orch is not None: + try: + fut = asyncio.run_coroutine_threadsafe( + self._close_orchestrator(), loop + ) + fut.result(timeout=timeout) + except Exception: # noqa: BLE001 + # Best-effort: a misbehaving aclose() must not block + # the loop / thread join below. Surface for diagnosis. + _log.warning( + "shutdown: orchestrator close failed", + exc_info=True, + ) + # Close MCP clients on the loop *before* stopping it. + if loop.is_running() and self._mcp_stack is not None: + try: + fut = asyncio.run_coroutine_threadsafe( + self._close_mcp_pool(), loop + ) + fut.result(timeout=timeout) + except Exception: # noqa: BLE001 + # Best-effort: don't block shutdown on a misbehaving + # client. Log so diagnostics survive the silent cleanup. + _log.warning( + "shutdown: MCP pool close failed", + exc_info=True, + ) + if loop.is_running(): + loop.call_soon_threadsafe(loop.stop) + if thread is not None: + thread.join(timeout=timeout) + self._loop = None + self._thread = None + self._started.clear() + self._mcp_stack = None + self._mcp_clients.clear() + self._mcp_locks.clear() + self._mcp_build_locks.clear() + self._orch = None + self._orch_build_lock = None + self._registry.clear() + self._approval_watchdog = None + self._reset_singleton() + + async def _cancel_all_sessions(self) -> None: + """Cancel every in-flight session task and wait for them to exit. + + Runs on the loop thread. 
Each task gets up to 5s to honour the + ``CancelledError``; misbehaving tasks that ignore cancellation + do not block shutdown beyond that — ``run_loop`` will sweep + them in its final ``gather`` pass. + """ + tasks = [e.task for e in self._registry.values() if e.task is not None] + for t in tasks: + t.cancel() + if tasks: + await asyncio.gather(*tasks, return_exceptions=True) + self._registry.clear() + + async def _close_orchestrator(self) -> None: + if self._orch is None: + return + orch = self._orch + self._orch = None + try: + await orch.aclose() + except Exception: # noqa: BLE001 + # Best-effort cleanup: a checkpointer / MCP exit-stack that + # blew up on close still leaves the process to exit cleanly. + # Surface so the failure is observable post-mortem. + _log.warning( + "_close_orchestrator: orch.aclose() failed", + exc_info=True, + ) + + async def _close_mcp_pool(self) -> None: + if self._mcp_stack is None: + return + stack = self._mcp_stack + self._mcp_stack = None + await stack.__aexit__(None, None, None) + self._mcp_clients.clear() + self._mcp_locks.clear() + self._mcp_build_locks.clear() + + @classmethod + def _reset_singleton(cls) -> None: + """Clear the class-level singleton under the same lock that + ``get_or_create`` uses — so a reset racing with a fresh + ``get_or_create`` call cannot leak the stale instance. + """ + with cls._lock: + cls._instance = None + +# ====== module: runtime/agents/turn_output.py ====== + +_LOG = logging.getLogger("runtime.orchestrator") + +# D-10-03 — heuristic tolerance for envelope-vs-tool-arg confidence mismatch. +# Inclusive boundary (|env - tool| <= 0.05 is silent). Documented for future +# tuning; widening is cheap, narrowing requires care because the LLM's +# self-reported turn confidence is naturally ~5pp noisier than its +# tool-call-time confidence. +_DEFAULT_TOLERANCE: float = 0.05 + + +class AgentTurnOutput(BaseModel): + """Structural envelope every agent invocation MUST emit. 
+ + The framework wires this as ``response_format=AgentTurnOutput`` on both + ``create_agent`` call sites (``runtime.graph`` and + ``runtime.agents.responsive``). Pydantic's ``extra="forbid"`` keeps the + contract narrow — adding fields is a deliberate schema migration, not a + free-for-all. + """ + + model_config = ConfigDict(extra="forbid") + + content: str = Field( + min_length=1, + description="Final user-facing message text.", + ) + confidence: float = Field( + ge=0.0, + le=1.0, + description=( + "Calibrated confidence in this turn's output: " + "0.85+ strong, 0.5 hedged, <0.4 weak." + ), + ) + confidence_rationale: str = Field( + min_length=1, + description="One-sentence explanation of the confidence value.", + ) + signal: str | None = Field( + default=None, + description=( + "Optional next-state signal " + "(e.g. success | failed | needs_input | default). " + "Routing layer validates the vocabulary." + ), + ) + + +class EnvelopeMissingError(Exception): + """Raised by :func:`parse_envelope_from_result` when neither + ``result["structured_response"]`` nor a JSON-shaped final AIMessage + yields a valid :class:`AgentTurnOutput`. + + Carries structured cause attributes (``agent``, ``field``) so the + runner can mark the agent_run as ``error`` with a precise reason. + """ + + def __init__(self, *, agent: str, field: str, message: str | None = None): + self.agent = agent + self.field = field + super().__init__(message or f"envelope_missing: {field} (agent={agent})") + + +def parse_envelope_from_result( + result: dict, + *, + agent: str, +) -> AgentTurnOutput: + """Extract an :class:`AgentTurnOutput` from a ``create_react_agent`` result. + + Three-step defensive fallback (Risk #1 — Ollama may not honor + ``response_format`` cleanly across all providers): + + 1. ``result["structured_response"]`` — preferred path; LangGraph 1.1.x + populates it when ``response_format`` is set and the LLM honors + structured output. + 2. 
def reconcile_confidence(
    envelope_value: float,
    tool_arg_value: float | None,
    *,
    agent: str,
    session_id: str,
    tool_name: str | None,
    tolerance: float = _DEFAULT_TOLERANCE,
) -> float:
    """Reconcile envelope confidence against typed-terminal-tool-arg confidence.

    D-10-03 contract:
      - When ``tool_arg_value`` is None: return the envelope value silently.
      - When both are present and ``|envelope - tool_arg| <= tolerance``:
        return the tool-arg value silently.
      - When both are present and ``|envelope - tool_arg| > tolerance``:
        log at INFO and return the tool-arg value.

    The tool-arg value wins whenever it is present; the comparison only
    decides whether the mismatch is logged.

    Log shape (preserved verbatim for grep-based observability assertions):
    ``runtime.orchestrator: turn.confidence_mismatch agent={a} turn_value={e:.2f} tool_value={t:.2f} tool={tn} session_id={sid}``

    Args:
        envelope_value: Confidence reported in the turn envelope.
        tool_arg_value: Confidence passed to the terminal tool, if any.
        agent: Agent name, used only in the log record.
        session_id: Session id, used only in the log record.
        tool_name: Tool name, used only in the log record.
        tolerance: Largest absolute difference that stays silent
            (inclusive).

    Returns:
        ``tool_arg_value`` when it is not None, otherwise ``envelope_value``.
    """
    if tool_arg_value is None:
        return envelope_value
    # The boundary is documented as inclusive, but binary floats put pairs
    # that are "exactly" ``tolerance`` apart slightly above it
    # (``0.90 - 0.85 == 0.050000000000000044``). A tiny absolute slack
    # keeps those pairs silent without hiding any real mismatch.
    float_slack = 1e-9
    diff = abs(envelope_value - tool_arg_value)
    if diff > tolerance + float_slack:
        _LOG.info(
            "turn.confidence_mismatch "
            "agent=%s turn_value=%.2f tool_value=%.2f tool=%s session_id=%s",
            agent,
            envelope_value,
            tool_arg_value,
            tool_name,
            session_id,
        )
    return tool_arg_value
Risk-tier lookup: ``cfg.policy.get(tool_name)`` mapped via + ``low->auto``, ``medium->notify``, ``high->approve``. + 4. No policy entry -> ``"auto"`` (safe default). + + Tool-name lookups try the fully-qualified name (``:``, + as registered by ``runtime.mcp_loader``) FIRST, then the bare + suffix as a fallback. This lets app config use bare names without + knowing the server prefix while keeping prefixed-form policy keys + deterministically more specific. Globs in + ``resolution_trigger_tools`` are matched against both forms for + the same reason, prefixed first. + + The function is pure: same inputs always yield the same output and + no argument is mutated. + """ + if gateway_cfg is None: + return "auto" + + bare = tool_name.split(":", 1)[1] if ":" in tool_name else None + + overrides = gateway_cfg.prod_overrides + if overrides is not None and env and env in overrides.prod_environments: + for pattern in overrides.resolution_trigger_tools: + if fnmatchcase(tool_name, pattern): + return "approve" + if bare is not None and fnmatchcase(bare, pattern): + return "approve" + + risk = gateway_cfg.policy.get(tool_name) + if risk is not None: + return _RISK_TO_ACTION[risk] + if bare is not None: + risk = gateway_cfg.policy.get(bare) + if risk is not None: + return _RISK_TO_ACTION[risk] + return "auto" + + +def _now_iso() -> str: + return datetime.now(timezone.utc).strftime(_UTC_TS_FMT) + + +def _find_pending_index( + tool_calls: list, + tool_name: str, + ts: str, +) -> int | None: + """Locate the index of the ``pending_approval`` ToolCall row that + matches ``tool_name`` and ``ts``. + + Used by the wrap_tool resume path to update the in-place audit row + rather than appending a duplicate. The watchdog may have replaced + the row with a ``timeout`` entry while the graph was paused — in + that case we return ``None`` and the resume path leaves the audit + list unchanged (the watchdog already wrote the canonical record). 
+ + Searches from the end of the list because the pending row is + almost always the most recent ToolCall. + """ + for idx in range(len(tool_calls) - 1, -1, -1): + tc = tool_calls[idx] + if (getattr(tc, "tool", None) == tool_name + and getattr(tc, "ts", None) == ts + and getattr(tc, "status", None) == "pending_approval"): + return idx + return None + + +def _find_existing_pending_index( + tool_calls: list, + tool_name: str, +) -> int | None: + """Find the most recent ``pending_approval`` row for ``tool_name``. + + LangGraph's interrupt/resume model re-runs the gated node from the + top after ``Command(resume=...)``; we re-use the existing pending + row rather than appending a duplicate every time the closure + re-enters the approve branch. + """ + for idx in range(len(tool_calls) - 1, -1, -1): + tc = tool_calls[idx] + if (getattr(tc, "tool", None) == tool_name + and getattr(tc, "status", None) == "pending_approval"): + return idx + return None + + +def _evaluate_gate( + *, + session: Session, + tool_name: str, + gate_policy: GatePolicy | None, + gateway_cfg: GatewayConfig | None, +) -> "GateDecision": + """Phase 11 (FOC-04) bridge: invoke ``should_gate`` from the wrap. + + Constructs a minimal ``ToolCall`` shape for the pure-function + boundary, and a temporary ``OrchestratorConfig`` shim with the + in-flight ``gate_policy`` + ``gateway`` so the pure function sees + a single config object (its declared signature). + + When ``gate_policy`` is ``None`` -- the legacy callers that have + not yet been threaded -- a default ``GatePolicy()`` is used so + Phase-11 behaviour applies uniformly. The default mirrors v1.0 + HITL behaviour (``gated_risk_actions={"approve"}``), so existing + pre-Phase-11 tests keep passing. + """ + # Local imports (avoid cycle on policy.py importing gateway). + # ``GateDecision`` is type-only here -- the lazy import sits in the + # TYPE_CHECKING block at module top. 
class _GatedToolMarker(BaseTool):
    """Marker base class so ``isinstance(t, _GatedToolMarker)`` identifies
    a tool that has already been wrapped by :func:`wrap_tool`. Used to
    short-circuit ``wrap_tool(wrap_tool(t))`` and avoid wrapper recursion.

    Not meant to be instantiated directly — the ``_GatedTool`` class that
    :func:`wrap_tool` defines inherits from this and supplies the real
    ``name``, ``description`` and ``_run``.
    """

    # Placeholder field values; the wrapper subclass overrides both with
    # the wrapped tool's own name and description.
    name: str = "_gated_marker"
    description: str = "internal — never invoked"

    def _run(self, *args: Any, **kwargs: Any) -> Any:  # pragma: no cover
        # Reaching this means the marker itself was invoked rather than a
        # wrapper subclass; fail loudly instead of returning a bogus value.
        raise NotImplementedError("marker base — _GatedTool overrides this")
+ + The factory closes over ``session`` and ``gateway_cfg`` so the live + audit log (``session.tool_calls``) is the same instance the rest of + the orchestrator reads — no detour through a separate audit table. + + Returned object is a ``BaseTool`` subclass instance whose ``name`` + and ``description`` mirror the underlying tool, so LangGraph's ReAct + prompt builder still sees the right tool surface. + + Idempotent: wrapping an already-gated tool returns it unchanged so a + second ``wrap_tool(wrap_tool(t))`` does not nest wrappers (which would + cause unbounded recursion when ``_run`` calls ``inner.invoke`` and + that dispatches back into another ``_GatedTool._run``). + + Phase 9 (D-09-01 / D-09-03): when ``injected_args`` is supplied, the + gateway expands ``kwargs`` with session-derived values BEFORE + ``effective_action`` is consulted — so the gateway's risk-rating + sees the canonical ``environment`` (avoiding T-09-05: gateway + misclassifies prod as auto because env was missing from the LLM + args). + """ + if isinstance(base_tool, _GatedToolMarker): + return base_tool + + env = getattr(session, "environment", None) + inner = base_tool + inject_cfg = injected_args or {} + + # Phase 9 (D-09-01): the LLM-visible args_schema on the wrapper must + # exclude every injected key — otherwise BaseTool's input validator + # rejects the call when the LLM omits a "required" arg the framework + # is about to supply. The inner tool keeps its full schema so the + # downstream invoke still sees every kwarg. + if inject_cfg: + + _llm_visible_schema = strip_injected_params( + inner, frozenset(inject_cfg.keys()), + ).args_schema + else: + _llm_visible_schema = inner.args_schema + + # Phase 9 follow-up: compute the set of param names the inner tool + # actually accepts so injection skips keys the target tool doesn't + # declare. 
Without this filter, a config-wide ``injected_args`` + # entry like ``session_id: session.id`` is unconditionally written + # to every tool's kwargs — tools that don't accept ``session_id`` + # then raise pydantic ``unexpected_keyword`` errors at the FastMCP + # validation boundary. ``accepted_params_for_tool`` handles both + # pydantic-model and JSON-Schema-dict ``args_schema`` shapes. + + _accepted_params: frozenset[str] | None = accepted_params_for_tool(inner) + + def _sync_invoke_inner(payload: Any) -> Any: + """Sync-invoke the inner tool, translating BaseTool's + default-``_run`` ``NotImplementedError`` into a clearer message + for native-async-only tools. Without this, callers see a vague + ``NotImplementedError`` from langchain core with no hint that + the right path is ``ainvoke``.""" + try: + return inner.invoke(payload) + except NotImplementedError as exc: + raise NotImplementedError( + f"Tool {inner.name!r} appears to be async-only " + f"(``_run`` not implemented). Use ``ainvoke`` / ``_arun`` " + f"for this tool instead of the sync invoke path." + ) from exc + + # Tool-naming regex differs across LLM providers — Ollama allows + # ``[a-zA-Z0-9_.\-]{1,256}``, OpenAI is stricter at + # ``^[a-zA-Z0-9_-]+$`` (no dots). The framework's internal naming + # uses ``:`` for PVC-08 prefixed-form policy lookups, + # but the LLM only sees the *wrapper*'s ``.name``. Use ``__`` + # (double underscore) as the LLM-visible separator: it satisfies + # both providers' regexes and is unambiguous (no real tool name + # contains a double underscore). ``inner.name`` keeps the colon + # form so ``effective_action`` / ``should_gate`` policy lookups + # stay PVC-08-compliant. + _llm_visible_name = inner.name.replace(":", "__") + + class _GatedTool(_GatedToolMarker): + name: str = _llm_visible_name + description: str = inner.description + # The wrapper does its own arg coercion via the inner tool's schema, + # so no need to copy it here. 
Keep ``args_schema`` aligned with the + # LLM-visible (post-strip) schema so BaseTool's input validator + # accepts the post-strip kwargs the LLM emits. Phase 9 strips + # injected keys here; pre-Phase-9 callers see the full schema. + args_schema: Any = _llm_visible_schema # type: ignore[assignment] + + def _run(self, *args: Any, **kwargs: Any) -> Any: # noqa: D401 + # Phase 9 (D-09-01 / T-09-05): inject session-derived args + # BEFORE the gateway risk lookup so risk-rating sees the + # post-injection environment value. Pure no-op when + # ``injected_args`` is empty. + if inject_cfg: + + kwargs = inject_injected_args( + kwargs, + session=session, + injected_args_cfg=inject_cfg, + tool_name=inner.name, + accepted_params=_accepted_params or None, + ) + # Phase 11 (FOC-04): pure-policy gating boundary. Call + # should_gate to decide whether to pause for HITL approval; + # also call effective_action so the notify-audit branch + # below still fires for medium-risk tools that should NOT + # gate but should record an audit row. + action = effective_action( + inner.name, env=env, gateway_cfg=gateway_cfg, + ) + decision = _evaluate_gate( + session=session, + tool_name=inner.name, + gate_policy=gate_policy, + gateway_cfg=gateway_cfg, + ) + if decision.gate: + from langgraph.types import interrupt + + # Persist a ``pending_approval`` ToolCall row BEFORE + # raising GraphInterrupt so the approval-timeout watchdog + # has a record to scan. ``ts`` is the moment the human + # approval window opened. Stored args mirror the post- + # decision rows so the audit history reads consistently. + # + # On resume, LangGraph re-enters this node and runs us + # again from the top — so we must re-use the existing + # pending row instead of appending a duplicate. The most + # recent ``pending_approval`` row for this tool wins. 
+ pending_args = dict(kwargs) if kwargs else {"args": list(args)} + existing_idx = _find_existing_pending_index( + session.tool_calls, inner.name, + ) + if existing_idx is not None: + pending_ts = session.tool_calls[existing_idx].ts + else: + pending_ts = _now_iso() + session.tool_calls.append( + ToolCall( + agent=agent_name, + tool=inner.name, + args=pending_args, + result=None, + ts=pending_ts, + risk="high", + status="pending_approval", + ) + ) + # CRITICAL: persist the pending_approval row BEFORE + # raising interrupt() so the approval-timeout + # watchdog (which reads from the DB) and the + # /approvals UI can see the pending state. Without + # this save the in-memory mutation is invisible to + # any out-of-process observer. + if store is not None: + store.save(session) + payload = { + "kind": "tool_approval", + "tool": inner.name, + "args": kwargs or args, + "tool_call_id": kwargs.get("tool_call_id"), + } + # First execution: raises GraphInterrupt, checkpointer pauses. + # Resume: returns whatever Command(resume=...) supplied. + decision = interrupt(payload) + # Decision payload may be a string ("approve" / "reject" / + # "timeout") or a dict {decision, approver, rationale}. + if isinstance(decision, dict): + verdict = decision.get("decision", "approve") + approver = decision.get("approver") + rationale = decision.get("rationale") + else: + verdict = decision or "approve" + approver = None + rationale = None + # Update the pending_approval row in place rather than + # appending a second audit entry. The watchdog and the + # /approvals UI both reason about a single audit row per + # high-risk call. 
+ pending_idx = _find_pending_index( + session.tool_calls, inner.name, pending_ts, + ) + verdict_str = str(verdict).lower() + if verdict_str == "reject": + if pending_idx is not None: + session.tool_calls[pending_idx] = ToolCall( + agent=agent_name, + tool=inner.name, + args=pending_args, + result={"rejected": True, "rationale": rationale}, + ts=pending_ts, + risk="high", + status="rejected", + approver=approver, + approved_at=_now_iso(), + approval_rationale=rationale, + ) + return {"rejected": True, "rationale": rationale} + if verdict_str == "timeout": + # The approval window expired. Do NOT run the tool; + # mark the audit row ``status="timeout"`` so + # downstream consumers (UI, retraining) can + # distinguish operator-initiated rejections from + # automatic timeouts. + if pending_idx is not None: + session.tool_calls[pending_idx] = ToolCall( + agent=agent_name, + tool=inner.name, + args=pending_args, + result={"timeout": True, "rationale": rationale}, + ts=pending_ts, + risk="high", + status="timeout", + approver=approver, + approved_at=_now_iso(), + approval_rationale=rationale, + ) + return {"timeout": True, "rationale": rationale} + # Approved -> run the tool, then update the audit row. + result = _sync_invoke_inner(kwargs if kwargs else args[0] if args else {}) + if pending_idx is not None: + session.tool_calls[pending_idx] = ToolCall( + agent=agent_name, + tool=inner.name, + args=pending_args, + result=result, + ts=pending_ts, + risk="high", + status="approved", + approver=approver, + approved_at=_now_iso(), + approval_rationale=rationale, + ) + return result + + # auto / notify both run the tool now. 
+ result = _sync_invoke_inner(kwargs if kwargs else args[0] if args else {}) + + if action == "notify": + session.tool_calls.append( + ToolCall( + agent=agent_name, + tool=inner.name, + args=dict(kwargs) if kwargs else {"args": list(args)}, + result=result, + ts=_now_iso(), + risk="medium", + status="executed_with_notify", + ) + ) + return result + + async def _arun(self, *args: Any, **kwargs: Any) -> Any: # noqa: D401 + # Phase 9 (D-09-01 / T-09-05): inject session-derived args + # BEFORE the gateway risk lookup. Mirror of the sync ``_run``. + if inject_cfg: + + kwargs = inject_injected_args( + kwargs, + session=session, + injected_args_cfg=inject_cfg, + tool_name=inner.name, + accepted_params=_accepted_params or None, + ) + # Phase 11 (FOC-04): pure-policy gating boundary. Mirror of + # the sync ``_run`` -- consult should_gate via + # ``_evaluate_gate``; still call ``effective_action`` to + # keep the notify-audit branch for medium-risk tools. + action = effective_action( + inner.name, env=env, gateway_cfg=gateway_cfg, + ) + decision = _evaluate_gate( + session=session, + tool_name=inner.name, + gate_policy=gate_policy, + gateway_cfg=gateway_cfg, + ) + if decision.gate: + from langgraph.types import interrupt + + # Persist a ``pending_approval`` audit row BEFORE the + # GraphInterrupt fires so the watchdog can spot stale + # approvals. See the sync ``_run`` mirror for details. 
+ pending_args = dict(kwargs) if kwargs else {"args": list(args)} + existing_idx = _find_existing_pending_index( + session.tool_calls, inner.name, + ) + if existing_idx is not None: + pending_ts = session.tool_calls[existing_idx].ts + else: + pending_ts = _now_iso() + session.tool_calls.append( + ToolCall( + agent=agent_name, + tool=inner.name, + args=pending_args, + result=None, + ts=pending_ts, + risk="high", + status="pending_approval", + ) + ) + # CRITICAL: persist the pending_approval row BEFORE + # raising interrupt() so the approval-timeout + # watchdog (which reads from the DB) and the + # /approvals UI can see the pending state. + if store is not None: + store.save(session) + payload = { + "kind": "tool_approval", + "tool": inner.name, + "args": kwargs or args, + "tool_call_id": kwargs.get("tool_call_id"), + } + decision = interrupt(payload) + if isinstance(decision, dict): + verdict = decision.get("decision", "approve") + approver = decision.get("approver") + rationale = decision.get("rationale") + else: + verdict = decision or "approve" + approver = None + rationale = None + pending_idx = _find_pending_index( + session.tool_calls, inner.name, pending_ts, + ) + verdict_str = str(verdict).lower() + if verdict_str == "reject": + if pending_idx is not None: + session.tool_calls[pending_idx] = ToolCall( + agent=agent_name, + tool=inner.name, + args=pending_args, + result={"rejected": True, "rationale": rationale}, + ts=pending_ts, + risk="high", + status="rejected", + approver=approver, + approved_at=_now_iso(), + approval_rationale=rationale, + ) + return {"rejected": True, "rationale": rationale} + if verdict_str == "timeout": + if pending_idx is not None: + session.tool_calls[pending_idx] = ToolCall( + agent=agent_name, + tool=inner.name, + args=pending_args, + result={"timeout": True, "rationale": rationale}, + ts=pending_ts, + risk="high", + status="timeout", + approver=approver, + approved_at=_now_iso(), + approval_rationale=rationale, + ) + return 
{"timeout": True, "rationale": rationale} + result = await inner.ainvoke(kwargs if kwargs else args[0] if args else {}) + if pending_idx is not None: + session.tool_calls[pending_idx] = ToolCall( + agent=agent_name, + tool=inner.name, + args=pending_args, + result=result, + ts=pending_ts, + risk="high", + status="approved", + approver=approver, + approved_at=_now_iso(), + approval_rationale=rationale, + ) + return result + + result = await inner.ainvoke(kwargs if kwargs else args[0] if args else {}) + + if action == "notify": + session.tool_calls.append( + ToolCall( + agent=agent_name, + tool=inner.name, + args=dict(kwargs) if kwargs else {"args": list(args)}, + result=result, + ts=_now_iso(), + risk="medium", + status="executed_with_notify", + ) + ) + return result + + return _GatedTool() + +# ====== module: runtime/tools/arg_injection.py ====== + +_LOG = logging.getLogger("runtime.orchestrator") + + +def strip_injected_params( + tool: BaseTool, + injected_keys: frozenset[str], +) -> BaseTool: + """Return a ``BaseTool`` whose ``args_schema`` hides every param named + in ``injected_keys``. + + The LLM only sees the stripped sig; the framework re-adds the real + values at invocation time via :func:`inject_injected_args` (D-09-01). + + Properties: + + * **Pure.** The original tool is left unchanged — its ``args_schema`` + is not mutated, so tests and in-process callers that hold a direct + reference keep their full schema. + * **Idempotent.** Calling twice with the same keys is equivalent to + calling once. The cloned schema is structurally identical. + * **Identity short-circuit.** Empty ``injected_keys`` (or no overlap + between ``injected_keys`` and the tool's params) returns the tool + unchanged so unconfigured apps and tools without any injectable + params pay nothing. 
+ """ + if not injected_keys: + return tool + schema = getattr(tool, "args_schema", None) + if schema is None: + return tool + + # --- dict path: FastMCP / JSON-Schema tools --------------------------- + # FastMCP exposes ``args_schema`` as a plain JSON-Schema dict rather + # than a Pydantic model. Strip injected keys directly from the dict. + if isinstance(schema, dict): + props = schema.get("properties", {}) + overlap = injected_keys & set(props) + if not overlap: + return tool + new_props = {k: v for k, v in props.items() if k not in injected_keys} + required = [r for r in schema.get("required", []) if r not in injected_keys] + new_dict_schema: dict[str, Any] = {**schema, "properties": new_props, "required": required} + try: + return tool.model_copy(update={"args_schema": new_dict_schema}) + except Exception: # pragma: no cover — defensive fallback + import copy + stripped = copy.copy(tool) + stripped.args_schema = new_dict_schema # type: ignore[attr-defined] + return stripped + + # --- Pydantic path: BaseModel subclass tools -------------------------- + if not hasattr(schema, "model_fields"): + return tool + overlap = injected_keys & set(schema.model_fields.keys()) + if not overlap: + # No params to strip — preserve identity (no clone). + return tool + + # Build the kwargs for ``create_model`` from the surviving fields. + # Pydantic v2's ``create_model`` accepts ``(annotation, FieldInfo)`` + # tuples; FieldInfo carries default + description + alias so the + # cloned schema is functionally equivalent to the original minus + # the stripped fields. 
+ keep: dict[str, tuple[Any, Any]] = { + name: (f.annotation, f) + for name, f in schema.model_fields.items() + if name not in injected_keys + } + new_schema = create_model( + f"{schema.__name__}__StrippedForLLM", + __base__=BaseModel, + **keep, # type: ignore[arg-type] + ) + + # ``BaseTool`` is itself a pydantic BaseModel — ``model_copy`` clones + # it cheaply and lets us swap ``args_schema`` without touching the + # original. Tools that are not pydantic models (extremely rare; only + # custom subclasses) fall back to a regular shallow copy. + try: + stripped = tool.model_copy(update={"args_schema": new_schema}) + except Exception: # pragma: no cover — defensive fallback + import copy + stripped = copy.copy(tool) + stripped.args_schema = new_schema # type: ignore[attr-defined] + return stripped + + +def _resolve_dotted(root: Session, path: str) -> Any | None: + """Walk ``path`` ('session.foo.bar') against ``root`` and return the + terminal value or ``None`` if any segment is missing / None. + + ``path`` must start with ``session.``. The leading ``session`` token + pins the resolution root to the live Session — config-declared paths + cannot reach into arbitrary modules. Subsequent segments walk + attributes (``getattr``) — for fields stored under ``extra_fields`` + apps use ``session.extra_fields.foo`` which goes through the dict + branch below. + """ + parts = path.split(".") + if not parts or parts[0] != "session": + raise ValueError( + f"injected_args path {path!r} must start with 'session.'" + ) + cur: Any = root + for seg in parts[1:]: + if cur is None: + return None + # Support dict-valued attrs (notably ``Session.extra_fields``) + # transparently — ``session.extra_fields.pr_url`` resolves + # whether ``extra_fields`` is a real attribute or a dict on + # the model. Plain attribute walks work for typed Session + # subclasses (``IncidentState.environment``). 
+ if isinstance(cur, dict): + cur = cur.get(seg) + else: + cur = getattr(cur, seg, None) + return cur + + +def inject_injected_args( + tool_args: dict[str, Any], + *, + session: Session, + injected_args_cfg: dict[str, str], + tool_name: str, + accepted_params: set[str] | frozenset[str] | None = None, +) -> dict[str, Any]: + """Return a NEW dict with each injected arg resolved from ``session``. + + Behaviour (D-09-03): + + * Mutation-free: ``tool_args`` is never modified. Callers that need + to keep the LLM's original call shape can compare ``tool_args`` to + the return value. + * Framework wins on conflict. When the LLM already supplied a value + and the resolved framework value differs, the framework value is + written and a single INFO record is emitted on the + ``runtime.orchestrator`` logger with the documented payload tokens + (``tool``, ``arg``, ``llm_value``, ``framework_value``, + ``session_id``). + * Missing/None resolutions are skipped. The arg is left absent so + the tool's own default-handling (or the MCP server's required-arg + validator) decides what to do — never silently ``None``. + * When ``accepted_params`` is provided, injected keys not present in + that set are skipped. Prevents writing kwargs the target tool + doesn't accept (which would raise pydantic ``unexpected_keyword`` + validation errors at the FastMCP boundary). + """ + out = dict(tool_args) + for arg_name, path in injected_args_cfg.items(): + if accepted_params is not None and arg_name not in accepted_params: + # The tool doesn't declare this injectable param. Strip any + # LLM-supplied value too — the LLM shouldn't be emitting it + # (Phase 9 strips injectable keys from the LLM-visible sig) + # and forwarding it to the tool would raise pydantic + # ``unexpected_keyword`` at the FastMCP boundary. 
+ if arg_name in out: + _LOG.info( + "tool_call.injected_arg_dropped tool=%s arg=%s " + "llm_value=%r reason=not_accepted_by_tool session_id=%s", + tool_name, + arg_name, + out[arg_name], + getattr(session, "id", "?"), + ) + del out[arg_name] + continue + framework_value = _resolve_dotted(session, path) + if framework_value is None: + continue + if arg_name in out and out[arg_name] != framework_value: + _LOG.info( + "tool_call.injected_arg_overridden tool=%s arg=%s " + "llm_value=%r framework_value=%r session_id=%s", + tool_name, + arg_name, + out[arg_name], + framework_value, + getattr(session, "id", "?"), + ) + out[arg_name] = framework_value + return out + + +def accepted_params_for_tool(tool: Any) -> frozenset[str] | None: + """Return the set of parameter names a wrapped tool accepts. + + Handles both shapes ``args_schema`` can take in this codebase: + + * pydantic ``BaseModel`` subclass — read ``model_fields.keys()`` + (used by mock tools and by tests). + * JSON-Schema ``dict`` — read ``schema["properties"].keys()`` + (used by real FastMCP-derived tools, which expose the underlying + function's input schema as a JSON Schema rather than a pydantic + class). + + Returns ``None`` when the tool has no introspectable schema (caller + should treat this as "skip filtering" — preserves prior behaviour). 
+ """ + schema = getattr(tool, "args_schema", None) + if schema is None: + return None + if hasattr(schema, "model_fields"): + return frozenset(schema.model_fields.keys()) + if isinstance(schema, dict): + props = schema.get("properties") + if isinstance(props, dict): + return frozenset(props.keys()) + return None + + +__all__ = [ + "strip_injected_params", + "inject_injected_args", + "accepted_params_for_tool", + "_LOG", +] + +# ====== module: runtime/tools/approval_watchdog.py ====== + +if TYPE_CHECKING: + pass +logger = logging.getLogger(__name__) + +_UTC_TS_FMT = "%Y-%m-%dT%H:%M:%SZ" + +# Sessions whose status is in this set are *not* candidates for the +# watchdog — either they never paused for approval, or they have already +# moved past it. ``awaiting_input`` is the only status produced by +# ``langgraph.types.interrupt()`` while a high-risk gate is open. +_TERMINAL_STATUSES = frozenset({ + "resolved", "stopped", "escalated", "duplicate", "deleted", "error", +}) + + +def _parse_iso(ts: str | None) -> datetime | None: + """Parse an ISO-8601 ``YYYY-MM-DDTHH:MM:SSZ`` ts back into UTC. + + Returns ``None`` for malformed values; callers treat that as + "skip this row" so the watchdog never crashes on a bad audit + record. + """ + if not ts: + return None + try: + # Replace trailing 'Z' so ``fromisoformat`` accepts it on + # Python <3.11. The format is fixed by ``_UTC_TS_FMT`` so this + # round-trips cleanly. + if ts.endswith("Z"): + ts = ts[:-1] + "+00:00" + dt = datetime.fromisoformat(ts) + if dt.tzinfo is None: + dt = dt.replace(tzinfo=timezone.utc) + return dt.astimezone(timezone.utc) + except (ValueError, TypeError): + return None + + +class ApprovalWatchdog: + """Background asyncio task that resumes stale pending-approval sessions. + + Owned by :class:`runtime.service.OrchestratorService`; started in + ``OrchestratorService.start()`` and stopped in ``shutdown()``. 
The + task runs on the service's background loop so it shares the same + checkpointer / SQLite engine / FastMCP transports the live + sessions are using. + """ + + def __init__( + self, + service: "OrchestratorService", + *, + approval_timeout_seconds: int, + poll_interval_seconds: float = 60.0, + ) -> None: + self._service = service + self._approval_timeout_seconds = approval_timeout_seconds + self._poll_interval_seconds = poll_interval_seconds + self._task: asyncio.Task | None = None + self._stop_event: asyncio.Event | None = None + # HARD-07: ``stop()`` is idempotent. Once a stop has been + # initiated (or completed), subsequent calls return immediately + # rather than racing on ``_task`` / ``_stop_event`` which the + # first caller is already clearing. Mutated only on the loop + # thread (where ``stop()`` runs), so no extra lock needed. + self._stopped: bool = False + + @property + def is_running(self) -> bool: + return self._task is not None and not self._task.done() + + def start(self, loop: asyncio.AbstractEventLoop) -> None: + """Schedule the watchdog onto ``loop``. Idempotent. + + Must be called from a thread that is not the loop's own thread — + the typical caller is :meth:`OrchestratorService.start`. Returns + immediately; the polling coroutine runs in the background. + """ + if self._task is not None and not self._task.done(): + return + + async def _arm() -> None: + # Re-arm: a previous ``stop()`` may have flipped this; a + # fresh ``start()`` re-enables ``stop()``. + self._stopped = False + self._stop_event = asyncio.Event() + self._task = asyncio.create_task( + self._run(), name="approval_watchdog", + ) + + fut = asyncio.run_coroutine_threadsafe(_arm(), loop) + fut.result(timeout=5.0) + + async def stop(self) -> None: + """Signal the polling loop to exit and await termination. + + HARD-07: Idempotent and abrupt-shutdown safe. 
Safe to call: + * before ``start()`` (no-op), + * multiple times (subsequent calls short-circuit on + ``_stopped`` after the first caller flips it), + * concurrently from two callers — the first claims ownership + of ``_task`` and drains it; the second sees the task is + already gone and returns. + + Cancellation strategy: signal via ``_stop_event`` first so the + polling loop exits its ``wait_for`` cleanly; then bound the + drain by ``asyncio.wait_for(task, timeout=1.0)``. If the task + ignores the event (or the event loop is being torn down under + us), fall back to ``task.cancel()`` and one final drain. + ``CancelledError`` and ``TimeoutError`` are suppressed — there + is no useful recovery from a watchdog that won't die. + + Runs on the loop thread (called from ``OrchestratorService._close_*`` + helpers, or as a graceful no-op cleanup hook). + """ + # First-call wins. Subsequent callers (and the after-shutdown + # path) see ``_stopped`` and return without re-running the + # drain — protects against double-await on ``_task``. + if self._stopped: + return + self._stopped = True + # Snapshot to LOCAL variables so concurrent ``stop()`` calls + # never re-await the same task. We do NOT null out ``_task`` / + # ``_stop_event`` until after the drain because ``_run()`` + # reads ``self._stop_event`` on every loop iteration; clearing + # it before signalling would crash the polling loop with + # ``AttributeError: 'NoneType' object has no attribute + # 'is_set'`` and produce exactly the noisy teardown this fix + # is meant to prevent. 
+ task = self._task + stop_event = self._stop_event + if stop_event is not None: + stop_event.set() + if task is None or task.done(): + self._task = None + self._stop_event = None + return + try: + await asyncio.wait_for(asyncio.shield(task), timeout=1.0) + except (asyncio.TimeoutError, asyncio.CancelledError): + task.cancel() + try: + await asyncio.wait_for(task, timeout=1.0) + except (asyncio.TimeoutError, asyncio.CancelledError): + # Task is wedged or the loop is shutting down under us. + # The ``cancel()`` call above is enough to flip the task + # state; ``run_loop`` 's final ``gather`` pass will sweep + # it during loop teardown. Don't block shutdown further. + pass + finally: + # Always clear the bookkeeping refs so a subsequent + # ``start()`` arms cleanly and ``is_running`` reports False. + self._task = None + self._stop_event = None + + async def close(self) -> None: + """Alias for :meth:`stop` — symmetric with aiohttp/httpx. + + Idempotent. Provided so callers using a "close-on-cleanup" + pattern (``async with`` on parent owners) read naturally. + """ + await self.stop() + + async def _run(self) -> None: + """Polling loop. Runs until ``_stop_event`` is set. + + We bind ``stop_event`` to a LOCAL variable on entry so a + concurrent ``stop()`` cannot null out ``self._stop_event`` + from underneath us mid-iteration (HARD-07: that nulling-while- + running was the original source of ``AttributeError`` at + teardown). + """ + stop_event = self._stop_event + assert stop_event is not None + while not stop_event.is_set(): + try: + await self._tick() + except asyncio.CancelledError: + raise + except Exception: # noqa: BLE001 + logger.exception("approval watchdog tick failed") + try: + await asyncio.wait_for( + stop_event.wait(), + timeout=self._poll_interval_seconds, + ) + except asyncio.TimeoutError: + # Expected — wakes the loop every ``poll_interval_seconds``. + continue + + async def _tick(self) -> None: + """One scan + resume pass. 
Visible for tests via ``run_once``.""" + await self.run_once() + + async def run_once(self) -> int: + """Single scan pass. Returns the number of sessions resumed. + + Exposed publicly so tests can drive the watchdog + deterministically without waiting on the polling cadence. + """ + orch = getattr(self._service, "_orch", None) + if orch is None: + return 0 + registry = dict(self._service._registry) + if not registry: + return 0 + now = datetime.now(timezone.utc) + resumed = 0 + for session_id in list(registry.keys()): + try: + inc = orch.store.load(session_id) + except Exception: # noqa: BLE001 + continue + status = getattr(inc, "status", None) + if status in _TERMINAL_STATUSES: + continue + if status != "awaiting_input": + # Only sessions paused on a high-risk gate are watchdog + # candidates. ``in_progress`` / ``new`` are still + # actively running on the loop. + continue + stale = self._find_stale_pending(inc, now) + if not stale: + continue + # No is_locked() peek here — try_acquire (inside + # _resume_with_timeout) is the single contention check, so + # there is no TOCTOU window between check and acquire. The + # SessionBusy handler below fires on real contention. 
+ try: + await self._resume_with_timeout(orch, session_id) + resumed += 1 + except SessionBusy: + logger.debug( + "approval watchdog: session %s SessionBusy at resume, skipping", + session_id, + ) + continue + except Exception: # noqa: BLE001 + logger.exception( + "approval watchdog: resume failed for session %s", + session_id, + ) + return resumed + + def _find_stale_pending(self, inc: Any, now: datetime) -> list[int]: + """Return indices of ``pending_approval`` ToolCalls older than the + configured timeout.""" + out: list[int] = [] + tool_calls = getattr(inc, "tool_calls", []) or [] + threshold = self._approval_timeout_seconds + for idx, tc in enumerate(tool_calls): + if getattr(tc, "status", None) != "pending_approval": + continue + ts = _parse_iso(getattr(tc, "ts", None)) + if ts is None: + continue + age = (now - ts).total_seconds() + if age >= threshold: + out.append(idx) + return out + + async def _resume_with_timeout( + self, orch: Any, session_id: str, + ) -> None: + """Resume the paused graph with a synthetic timeout decision. + + Uses ``Command(resume=...)`` against the same ``thread_id`` the + approval API would use — the wrap_tool resume path updates the + audit row to ``status="timeout"`` automatically. + + Per D-18: the ``ainvoke`` call is wrapped in + ``orch._locks.try_acquire(session_id)`` so a concurrent user- + driven turn cannot interleave checkpoint writes for the same + ``thread_id``. If the lock is already held, ``try_acquire`` + raises ``SessionBusy`` immediately (no waiting); the caller + (``run_once``) catches that and skips the tick — this is how + the watchdog tolerates a busy session without piling up. 
+ """ + from langgraph.types import Command # local: heavy import + + decision_payload = { + "decision": "timeout", + "approver": "system", + "rationale": "approval window expired", + } + async with orch._locks.try_acquire(session_id): + await orch.graph.ainvoke( + Command(resume=decision_payload), + config=orch._thread_config(session_id), + ) + +# ====== module: runtime/policy.py ====== + +if TYPE_CHECKING: # pragma: no cover -- type checking only + + + pass # noqa: PIE790 -- bundle survives even if imports are stripped + + +GateReason = Literal[ + "auto", + "high_risk_tool", + "gated_env", + "low_confidence", + "blocked", +] + + +class GateDecision(BaseModel): + """Outcome of a single gating evaluation.""" + + model_config = ConfigDict(extra="forbid") + gate: bool + reason: GateReason + + +def should_gate( + session: Any, + tool_call: "ToolCall", + confidence: float | None, + cfg: "OrchestratorConfig", +) -> GateDecision: + """Decide whether ``tool_call`` should pause for HITL approval. + + Pure -- delegates the per-tool risk lookup to + :func:`runtime.tools.gateway.effective_action` (so the v1.0 PVC-08 + prefixed-form lookup invariant is preserved) and combines the + result with ``session.environment`` and ``confidence`` per the + precedence rules in the module docstring. + + ``session`` is typed as ``Any`` because the framework's base + :class:`runtime.state.Session` does not own the ``environment`` + field (apps subclass and add it). The function reads + ``session.environment`` and tolerates a missing attribute by + treating it as ``None``. + + ``confidence=None`` means "no signal yet" -- treated internally as + 1.0 to avoid a false-positive low_confidence gate before any + envelope/tool-arg has surfaced for the active turn. + """ + # Read gateway config off the OrchestratorConfig. 
The runtime threads + # it via cfg.gateway today (sibling of cfg.gate_policy in the + # OrchestratorConfig namespace) -- gracefully tolerate the legacy + # path where gateway is configured on RuntimeConfig instead. + gateway_cfg = getattr(cfg, "gateway", None) + env = getattr(session, "environment", None) + + risk_action = effective_action( + tool_call.tool, + env=env, + gateway_cfg=gateway_cfg, + ) + + # 1. high-risk tool gates first. + if risk_action in cfg.gate_policy.gated_risk_actions: + return GateDecision(gate=True, reason="high_risk_tool") + + # 2. gated env: any non-"auto" risk in a gated environment. + if (env in cfg.gate_policy.gated_environments + and risk_action != "auto"): + return GateDecision(gate=True, reason="gated_env") + + # 3. low confidence: only an actionable tool. None == "no signal yet". + effective_conf = 1.0 if confidence is None else confidence + if (effective_conf < cfg.gate_policy.confidence_threshold + and risk_action != "auto"): + return GateDecision(gate=True, reason="low_confidence") + + return GateDecision(gate=False, reason="auto") + + +# --------------------------------------------------------------- +# Phase 12 (FOC-05): pure should_retry policy. +# --------------------------------------------------------------- + +import asyncio as _asyncio + +import pydantic as _pydantic + + +RetryReason = Literal[ + "auto_retry", + "max_retries_exceeded", + "permanent_error", + "low_confidence_no_retry", + "transient_disabled", +] + + +class RetryDecision(BaseModel): + """Outcome of a single retry-policy evaluation. + + Pure surface: produced by :func:`should_retry` from + ``(retry_count, error, confidence, cfg)``. The orchestrator's + ``_retry_session_locked`` consults this BEFORE running the retry; + the UI consults the same value via + ``Orchestrator.preview_retry_decision`` to render the button label / + disabled state. 
def should_retry(
    retry_count: int,
    error: Exception | None,
    confidence: float | None,
    cfg: "OrchestratorConfig",
) -> RetryDecision:
    """Decide whether a failed turn should be retried automatically.

    Precedence (first match wins):

    1. ``retry_count >= cfg.retry_policy.max_retries``
       -> no retry, ``max_retries_exceeded``;
    2. ``error`` is one of ``_PERMANENT_TYPES``
       -> no retry, ``permanent_error``;
    3. ``error`` is one of ``_TRANSIENT_TYPES``
       -> retry with ``auto_retry`` when
       ``cfg.retry_policy.retry_on_transient`` is truthy, otherwise no
       retry with ``transient_disabled``. Transient errors never reach
       the low-confidence rule;
    4. ``confidence`` is not ``None`` and is below
       ``cfg.retry_policy.retry_low_confidence_threshold``
       -> no retry, ``low_confidence_no_retry``;
    5. anything else -> no retry, ``permanent_error`` (fail-closed).

    Args:
        retry_count: Number of retries already performed.
        error: The failure, or ``None`` when the caller has no exception
            object. ``None`` is neither permanent nor transient, so it
            ends at rule 4 or rule 5 and is never retried.
        confidence: Confidence recorded for the failed turn; ``None``
            skips rule 4.
        cfg: Supplies ``retry_policy``.

    Returns:
        A ``RetryDecision`` with the verdict and its reason.
    """
    policy = cfg.retry_policy

    def _deny(reason: RetryReason) -> RetryDecision:
        return RetryDecision(retry=False, reason=reason)

    # Hard cap applies whatever the error class is.
    if retry_count >= policy.max_retries:
        return _deny("max_retries_exceeded")

    if _is_permanent_error(error):
        return _deny("permanent_error")

    if _is_transient_error(error):
        # Mechanical failure: confidence is not consulted on this path.
        if policy.retry_on_transient:
            return RetryDecision(retry=True, reason="auto_retry")
        return _deny("transient_disabled")

    confidence_too_low = (
        confidence is not None
        and confidence < policy.retry_low_confidence_threshold
    )
    if confidence_too_low:
        return _deny("low_confidence_no_retry")

    # Unclassified failure: conservative default is not to retry.
    return _deny("permanent_error")
+ + + async def node(state: GraphState) -> dict: + incident: Session = state["session"] # pyright: ignore[reportTypedDictNotRequiredAccess] + inc_id = incident.id + started_at = datetime.now(timezone.utc).strftime(_UTC_TS_FMT) + + # Wrap tools per-invocation so each wrap closes over the + # live ``Session`` for this run. + if gateway_cfg is not None: + run_tools = [ + wrap_tool(t, session=incident, gateway_cfg=gateway_cfg, + agent_name=skill.name, store=store, + gate_policy=gate_policy) + for t in tools + ] + else: + run_tools = tools + # Phase 10 (FOC-03 / D-10-02) + Phase 15 (LLM-COMPAT-01): every + # responsive agent invocation is wrapped in an AgentTurnOutput + # envelope. ``langchain.agents.create_agent`` (the non-deprecated + # successor to ``langgraph.prebuilt.create_react_agent``) accepts a + # bare schema as ``response_format`` and, by default, wraps it in + # ``AutoStrategy`` — ProviderStrategy for models with native + # structured-output (OpenAI-class), falling back to ToolStrategy + # otherwise (Ollama). ToolStrategy injects AgentTurnOutput as a + # callable tool: when the LLM ``calls`` it, the loop terminates on + # the same turn with ``result["structured_response"]`` populated. + # Eliminates the old two-call structure (loop + separate + # ``with_structured_output`` pass) that hit recursion_limit=25 on + # Ollama models without true function-calling. + agent_executor = create_agent( + model=llm, + tools=run_tools, + system_prompt=skill.system_prompt, + response_format=AgentTurnOutput, + ) + + # Phase 11 (FOC-04): reset per-turn confidence hint at the + # start of each agent step so the gateway treats the first + # tool call of the turn as "no signal yet". 
+ try: + incident.turn_confidence_hint = None + except (AttributeError, ValueError): + pass + + try: + result = await _ainvoke_with_retry( + agent_executor, + {"messages": [HumanMessage(content=_format_agent_input(incident))]}, + ) + except GraphInterrupt: + # Phase 11 (FOC-04 / D-11-04): HITL pause -- propagate up. + raise + except Exception as exc: # noqa: BLE001 + return _handle_agent_failure( + skill_name=skill.name, started_at=started_at, exc=exc, + inc_id=inc_id, store=store, fallback=incident, + ) + + # Tools (e.g. registered patch tools) write straight to disk. + # Reload so the node's own append of agent_run + tool_calls + # happens against the tool-mutated state. + incident = store.load(inc_id) + + messages = result.get("messages", []) + ts = datetime.now(timezone.utc).strftime(_UTC_TS_FMT) + + agent_confidence, agent_rationale, agent_signal = _harvest_tool_calls_and_patches( + messages, skill.name, incident, ts, valid_signals, + terminal_tool_names=terminal_tool_names, + patch_tool_names=patch_tool_names, + ) + # Phase 11 (FOC-04): update hint so any subsequent in-turn + # tool call sees the harvested confidence. + if agent_confidence is not None: + try: + incident.turn_confidence_hint = agent_confidence + except (AttributeError, ValueError): + pass + _pair_tool_responses(messages, incident) + + # Phase 10 (FOC-03 / D-10-03): parse envelope; reconcile against + # any typed-terminal-tool-arg confidence. Envelope failure is a + # structured agent_run error. 
def _safe_eval(expr: str, ctx: dict[str, Any]) -> Any:
    """Validate ``expr`` and evaluate it with ``ctx`` as its local names.

    ``_validate_safe_expr`` runs on every call before anything is
    compiled. This function adds no fallback of its own: whatever the
    validator does on a rejected expression (presumably raising --
    confirm against ``runtime.skill``) and any exception raised while
    evaluating propagate to the caller.

    Args:
        expr: Source text of a single Python expression.
        ctx: Mapping used as the local namespace for the evaluation.

    Returns:
        The value the expression evaluates to.
    """
    _validate_safe_expr(expr, source="supervisor.dispatch_rule")
    # Second line of defence behind the validator: with an empty
    # ``__builtins__`` mapping, names such as ``__import__`` do not resolve.
    stripped_globals = {"__builtins__": {}}
    return eval(compile(expr, "", "eval"), stripped_globals, ctx)  # noqa: S307 — AST-whitelisted
def _reply_text(content: Any) -> str:
    """Flatten a chat-message ``content`` value into one plain string."""
    if isinstance(content, str):
        return content
    if isinstance(content, list):
        # Content may arrive as a list of blocks: bare strings and/or
        # dicts carrying a "text" entry. Anything else is ignored.
        pieces: list[str] = []
        for block in content:
            if isinstance(block, str):
                pieces.append(block)
            elif isinstance(block, dict) and isinstance(block.get("text"), str):
                pieces.append(block["text"])
        return " ".join(pieces)
    return str(content) if content else ""


def _llm_pick_target(
    *,
    skill: Skill,
    llm: BaseChatModel,
    incident: Session,
) -> str:
    """One-shot LLM dispatch: ask the model to choose a subordinate.

    The model is asked to reply with only the name of one subordinate.
    The reply is lower-cased and searched for every subordinate name
    (case-insensitive substring match). The name that occurs EARLIEST in
    the reply wins, and the longest name wins a tie at the same position,
    so ``deep_investigator`` is not mistaken for ``investigator``.

    Falls back to the first subordinate, with a warning, when the LLM
    call raises or the reply names no subordinate -- keeping the graph
    moving rather than failing outright.

    Args:
        skill: Supervisor skill; ``dispatch_prompt``, ``subordinates``
            and ``name`` are read.
        llm: Chat model; invoked once, synchronously.
        incident: Session whose ``model_dump()`` is sent as the human
            message (JSON-encoded with ``default=str``).

    Returns:
        The chosen subordinate name, spelled as in ``skill.subordinates``.

    Raises:
        IndexError: If ``skill.subordinates`` is empty (no fallback
            target exists); raised before the LLM is called.
    """
    fallback = skill.subordinates[0]
    prompt = (
        f"{skill.dispatch_prompt}\n\n"
        f"Choose ONE of: {', '.join(skill.subordinates)}.\n"
        f"Reply with only the agent name."
    )
    payload = json.dumps(incident.model_dump(), default=str)
    msgs = [
        SystemMessage(content=prompt),
        HumanMessage(content=payload),
    ]
    try:
        result = llm.invoke(msgs)
    except Exception as exc:  # noqa: BLE001
        logger.warning(
            "supervisor %s: LLM dispatch failed (%s); falling back to %s",
            skill.name, exc, fallback,
        )
        return fallback

    # ``content`` is not guaranteed to be a plain string, so normalise it
    # before calling string methods on it.
    text = _reply_text(getattr(result, "content", "")).strip().lower()

    best_key: tuple[int, int] | None = None
    best_name: str | None = None
    for name in skill.subordinates:
        position = text.find(name.lower())
        if position < 0:
            continue
        # Earlier position first; on a tie prefer the longer name.
        key = (position, -len(name))
        if best_key is None or key < best_key:
            best_key, best_name = key, name
    if best_name is not None:
        return best_name

    logger.warning(
        "supervisor %s: LLM reply %r did not name a subordinate; "
        "falling back to %s", skill.name, text, fallback,
    )
    return fallback
+ + Falls back to the first subordinate when no rule matches; the + fallback case carries ``matched_when=None`` so the audit log can + distinguish "default" from "rule X matched". + """ + ctx = _ctx_for_session(incident) + for rule in skill.dispatch_rules: + try: + if bool(_safe_eval(rule.when, ctx)): + return rule.target, rule.when + except Exception as exc: # noqa: BLE001 + logger.warning( + "supervisor %s: dispatch_rule %r raised %s; skipping", + skill.name, rule.when, exc, + ) + return skill.subordinates[0], None + + +def _normalize_runner_route(value: Any) -> str: + """Map runner-supplied route aliases to the canonical graph end token. + + Apps writing runners reach for ``"END"`` / ``"end"`` / ``"__end__"`` + interchangeably; LangGraph's conditional edges only recognise + ``"__end__"``. Normalising here keeps the runner contract permissive + without spreading the alias check across the graph layer. + """ + if isinstance(value, str) and value.strip().lower() in {"end", "__end__"}: + return "__end__" + return value + + +def make_supervisor_node( + *, + skill: Skill, + llm: BaseChatModel | None = None, + framework_cfg: Any | None = None, +): + """Build the supervisor LangGraph node. + + Pure routing: no ``AgentRun`` row, no tool execution, no token + accounting beyond what the optional LLM call itself reports. The + node sets ``state["next_route"]`` to a subordinate name and returns; + LangGraph's conditional edges fan out to that node from there. + + The optional ``llm`` is only used when ``skill.dispatch_strategy`` + is ``"llm"``. Callers using ``"rule"`` may pass ``None``. + + When ``skill.runner`` is set, the dotted-path callable is resolved + at build time and invoked at the start of each node call BEFORE the + routing dispatch. The runner gets the live ``GraphState`` and the + optional ``framework_cfg`` and may return ``None`` (continue with + the routing table) or a dict patch that gets merged into state. 
A + patch carrying ``"next_route"`` short-circuits the routing table + entirely (use ``"__end__"`` to terminate the graph). + """ + # Local import to avoid the circular runtime.graph -> runtime.agents + # cycle at module-load time. + + + if skill.kind != "supervisor": + raise ValueError( + f"make_supervisor_node called with non-supervisor skill " + f"{skill.name!r} (kind={skill.kind!r})" + ) + + runner: Callable[..., Any] | None = None + if skill.runner is not None: + if callable(skill.runner): + # Test stubs and composed runners may supply a live callable + # directly rather than a dotted-path string. Access via the + # class __dict__ to avoid Python binding it as an instance + # method when the skill is a plain object (not a Pydantic model). + raw = vars(type(skill)).get("runner", skill.runner) + runner = raw if callable(raw) else skill.runner + else: + # Resolved a second time here so a runner that fails to import + # at graph-build time still surfaces a clear error. The skill + # validator catches most issues at YAML load; this is belt-and- + # braces and also gives us the live callable to invoke. + runner = _resolve_dotted_callable( + skill.runner, source=f"supervisor {skill.name!r} runner" + ) + + async def node(state: GraphState) -> dict: + sess: Session = state["session"] # pyright: ignore[reportTypedDictNotRequiredAccess] + # ``dispatch_depth`` is an extension field on GraphState; start + # at 0 and increment per supervisor entry. 
+ depth = int(state.get("dispatch_depth") or 0) + 1 + if depth > skill.max_dispatch_depth: + logger.warning( + "supervisor %s: dispatch depth %d exceeds limit %d; aborting", + skill.name, depth, skill.max_dispatch_depth, + ) + return { + "session": sess, + "next_route": "__end__", + "last_agent": skill.name, + "dispatch_depth": depth, + "error": ( + f"supervisor {skill.name!r}: max_dispatch_depth " + f"{skill.max_dispatch_depth} exceeded" + ), + } + + # ----- App-supplied runner hook ------------------------------- + runner_patch: dict[str, Any] = {} + if runner is not None: + # Build a thin proxy so the runner can reach intake_context + # (and any other framework_cfg attributes) without needing + # framework_cfg to be mutable. The proxy exposes intake_context + # directly and falls back to framework_cfg for all other attrs. + _app_cfg_proxy = type("_RunnerAppCfg", (), { + "intake_context": getattr(framework_cfg, "intake_context", None), + "__getattr__": lambda self, name: getattr(framework_cfg, name), + })() + try: + result = runner(state, app_cfg=_app_cfg_proxy) + except Exception as exc: # noqa: BLE001 + logger.exception( + "supervisor %s: runner %s raised; aborting to __end__", + skill.name, skill.runner, + ) + return { + "session": sess, + "next_route": "__end__", + "last_agent": skill.name, + "dispatch_depth": depth, + "error": ( + f"supervisor {skill.name!r}: runner failed: {exc}" + ), + } + if isinstance(result, dict): + runner_patch = dict(result) + elif result is not None: + logger.warning( + "supervisor %s: runner returned %s (expected dict|None); " + "ignoring", skill.name, type(result).__name__, + ) + override = runner_patch.pop("next_route", None) + if override is not None: + # Short-circuit: skip the routing table entirely. Audit + # log still fires so operators can trace the decision. + target = _normalize_runner_route(override) + # Pick up any fresh reference the runner returned. 
+ sess = runner_patch.get("session", sess) + try: + payload_size = len( + json.dumps(sess.model_dump(), default=str) + ) + except Exception: # noqa: BLE001 — defensive + payload_size = 0 + log_supervisor_dispatch( + session=sess, + supervisor=skill.name, + strategy=f"runner:{skill.runner}", + depth=depth, + targets=[target], + rule_matched=None, + payload_size=payload_size, + ) + out: dict[str, Any] = { + "session": sess, + "next_route": target, + "last_agent": skill.name, + "dispatch_depth": depth, + "error": None, + } + # Merge any non-route keys the runner returned (e.g. + # extra GraphState fields apps want to carry forward). + for k, v in runner_patch.items(): + if k not in out: + out[k] = v + return out + # No override: fold any payload mutation back so the + # routing table sees the up-to-date object. + if "session" in runner_patch: + sess = runner_patch["session"] + + rule_matched: str | None = None + if skill.dispatch_strategy == "rule": + target, rule_matched = _rule_pick_target(skill=skill, incident=sess) + else: # "llm" + if llm is None: + logger.warning( + "supervisor %s: strategy=llm but no llm provided; " + "falling back to first subordinate", skill.name, + ) + target = skill.subordinates[0] + else: + target = _llm_pick_target(skill=skill, llm=llm, incident=sess) + + # Audit: one structured log entry per dispatch. + try: + payload_size = len(json.dumps(sess.model_dump(), default=str)) + except Exception: # noqa: BLE001 — defensive; size is a hint + payload_size = 0 + log_supervisor_dispatch( + session=sess, + supervisor=skill.name, + strategy=skill.dispatch_strategy, + depth=depth, + targets=[target], + rule_matched=rule_matched, + payload_size=payload_size, + ) + + out: dict[str, Any] = { + "session": sess, + "next_route": target, + "last_agent": skill.name, + "dispatch_depth": depth, + "error": None, + } + # Carry through any extra keys the runner emitted that the + # framework didn't consume itself (e.g. memory snapshots). 
+ for k, v in runner_patch.items(): + if k not in out: + out[k] = v + return out + + return node + + +__all__ = ["make_supervisor_node", "log_supervisor_dispatch"] + +# ====== module: runtime/agents/monitor.py ====== + +logger = logging.getLogger(__name__) + + +# --------------------------------------------------------------------------- +# Safe-eval evaluator +# --------------------------------------------------------------------------- + + +class SafeEvalError(Exception): + """Raised when a supposedly-validated expression fails to evaluate.""" + + +def safe_eval(expr: str, ctx: dict[str, Any]) -> Any: + """Evaluate ``expr`` against ``ctx`` after a fresh AST whitelist check. + + The skill loader validates ``emit_signal_when`` at parse time; we + re-validate here on every call to keep the threat model defensive + against any future code path that might construct a Skill bypassing + the loader's validators. + """ + _validate_safe_expr(expr, source="monitor.emit_signal_when") + code = compile(expr, "", "eval") + try: + return eval(code, {"__builtins__": {}}, ctx) # noqa: S307 — AST-whitelisted + except Exception as exc: # noqa: BLE001 + raise SafeEvalError(f"emit_signal_when {expr!r} raised: {exc}") from exc + + +# --------------------------------------------------------------------------- +# Cron parsing (minute-resolution; matches Skill._validate_cron grammar) +# --------------------------------------------------------------------------- + + +def _expand_cron_field(field: str, lo: int, hi: int) -> set[int]: + """Expand a single cron field into the set of int values it matches. + + Supports ``*``, ``*/n``, ``a``, ``a-b``, ``a-b/n``, and + comma-separated combinations of those — the grammar accepted by + :func:`runtime.skill._validate_cron`. 
+ """ + out: set[int] = set() + for part in field.split(","): + step = 1 + if "/" in part: + base, _, step_s = part.partition("/") + step = int(step_s) + else: + base = part + if base == "*": + start, end = lo, hi + elif "-" in base: + a, _, b = base.partition("-") + start, end = int(a), int(b) + else: + v = int(base) + start, end = v, v + out.update(range(start, end + 1, step)) + return {v for v in out if lo <= v <= hi} + + +def _cron_matches(expr: str, when: datetime) -> bool: + """Return True if the given datetime satisfies the 5-field cron expression. + + Fields: minute, hour, day-of-month, month, day-of-week (0=Mon..6=Sun + — Python's ``datetime.weekday()`` convention; cron itself uses + 0=Sun, but for our minute-resolution scheduler the convention only + needs to be internally consistent and documented). + """ + minute, hour, dom, month, dow = expr.split() + return ( + when.minute in _expand_cron_field(minute, 0, 59) + and when.hour in _expand_cron_field(hour, 0, 23) + and when.day in _expand_cron_field(dom, 1, 31) + and when.month in _expand_cron_field(month, 1, 12) + and when.weekday() in _expand_cron_field(dow, 0, 6) + ) + + +# --------------------------------------------------------------------------- +# Monitor callable factory +# --------------------------------------------------------------------------- + + +def make_monitor_callable( + *, + skill: Skill, + observe_fn: Callable[[str], Any], + fire_trigger: Callable[[str, dict[str, Any]], None], +) -> Callable[[], None]: + """Build the callable a :class:`MonitorRunner` runs per tick. + + ``observe_fn(tool_name)`` is the seam through which the runner + invokes a tool. Production wires this to the orchestrator's MCP + tool registry; tests wire it to deterministic stubs. + + ``fire_trigger(name, payload)`` is the seam through which the + runner fires a trigger. Production wires this to the trigger + registry; tests wire it to a recorder. 
+ + The returned callable is intentionally synchronous and exception- + safe: a failed ``observe_fn`` or ``fire_trigger`` is logged and + swallowed so one bad monitor cannot stall the runner. + """ + if skill.kind != "monitor": + raise ValueError( + f"make_monitor_callable called with non-monitor skill " + f"{skill.name!r} (kind={skill.kind!r})" + ) + + def tick() -> None: + observation: dict[str, Any] = {} + for tool_name in skill.observe: + try: + observation[tool_name] = observe_fn(tool_name) + except Exception as exc: # noqa: BLE001 + logger.warning( + "monitor %s: observe tool %r raised %s; skipping", + skill.name, tool_name, exc, + ) + observation[tool_name] = None + ctx = { + "observation": observation, + "obs": observation, + } + try: + should_emit = bool(safe_eval(skill.emit_signal_when or "False", ctx)) + except SafeEvalError as exc: + logger.warning("monitor %s: %s", skill.name, exc) + return + if not should_emit: + return + try: + fire_trigger(skill.trigger_target or "", { + "monitor": skill.name, + "observation": observation, + }) + except Exception as exc: # noqa: BLE001 + logger.warning( + "monitor %s: fire_trigger(%s) raised %s", + skill.name, skill.trigger_target, exc, + ) + + return tick + + +# --------------------------------------------------------------------------- +# MonitorRunner — orchestrator-level singleton +# --------------------------------------------------------------------------- + + +class _RegisteredMonitor: + __slots__ = ("skill", "callable_", "next_run_ts") + + def __init__(self, skill: Skill, callable_: Callable[[], None]) -> None: + self.skill = skill + self.callable_ = callable_ + # Track the last *scheduled* minute we fired so we never fire + # twice for the same wall-clock minute even if the scheduler + # thread oversleeps. + self.next_run_ts: datetime | None = None + + +class MonitorRunner: + """Owns a bounded thread pool and a scheduler thread that ticks + registered monitor skills on their cron schedules. 
+ + Exactly one ``MonitorRunner`` exists per ``OrchestratorService`` + instance; the runner is built at service startup and shut down at + service teardown. + + Concurrency: each tick is dispatched to the + :class:`~concurrent.futures.ThreadPoolExecutor` so the scheduler + thread itself never blocks on a slow ``observe`` tool. The pool + size defaults to ``4`` (R6); each tick has a per-monitor timeout + sourced from the skill's ``tick_timeout_seconds``. + """ + + def __init__( + self, + *, + observe_fn: Callable[[str], Any], + fire_trigger: Callable[[str, dict[str, Any]], None], + max_workers: int = 4, + clock: Callable[[], datetime] | None = None, + ) -> None: + self._observe_fn = observe_fn + self._fire_trigger = fire_trigger + self._executor = ThreadPoolExecutor( + max_workers=max_workers, + thread_name_prefix="monitor", + ) + self._monitors: dict[str, _RegisteredMonitor] = {} + self._stop = threading.Event() + self._thread: threading.Thread | None = None + self._lock = threading.Lock() + # Injection seam for tests; default uses real wall-clock UTC. 
+ self._clock = clock or (lambda: datetime.now(timezone.utc)) + + # ----- registration ----- + + def register(self, skill: Skill) -> None: + if skill.kind != "monitor": + raise ValueError( + f"MonitorRunner.register: skill {skill.name!r} kind=" + f"{skill.kind!r} (expected 'monitor')" ) - return Client( - {"command": server_cfg.command[0], "args": server_cfg.command[1:]} + callable_ = make_monitor_callable( + skill=skill, + observe_fn=self._observe_fn, + fire_trigger=self._fire_trigger, ) - raise ValueError(f"Unknown transport: {server_cfg.transport}") + with self._lock: + if skill.name in self._monitors: + raise ValueError(f"monitor {skill.name!r} already registered") + self._monitors[skill.name] = _RegisteredMonitor(skill, callable_) + def unregister(self, name: str) -> None: + with self._lock: + self._monitors.pop(name, None) -async def _load_in_process(server_cfg: MCPServerConfig, - stack: AsyncExitStack) -> list[BaseTool]: - if server_cfg.module is None: - raise ValueError(f"in_process server '{server_cfg.name}' missing 'module'") - mod = importlib.import_module(server_cfg.module) - fmcp = getattr(mod, "mcp", None) - if fmcp is None: - raise ValueError(f"Module {server_cfg.module} has no 'mcp' (FastMCP instance)") - # FastMCP exposes tools as functions; convert to langchain tools via adapter. - # We use the in-memory client transport. The client is registered into the - # caller's exit stack so its session/transport stays open while the loaded - # tools are in use. - from fastmcp import Client - client = Client(fmcp) - await stack.enter_async_context(client) - tools = await load_mcp_tools(client.session) - # Rewrite each tool's .name to ":" for LLM disambiguation. 
- for t in tools: - original_name = t.name - t.name = f"{server_cfg.name}:{original_name}" - t._original_mcp_name = original_name # type: ignore[attr-defined] - return tools + def registered(self) -> list[str]: + with self._lock: + return sorted(self._monitors.keys()) + # ----- lifecycle ----- -async def _load_remote(server_cfg: MCPServerConfig, - stack: AsyncExitStack) -> list[BaseTool]: - from fastmcp import Client - if server_cfg.transport in ("http", "sse"): - if not server_cfg.url: - raise ValueError(f"remote server '{server_cfg.name}' missing 'url'") - client = Client(server_cfg.url, headers=server_cfg.headers or None) - elif server_cfg.transport == "stdio": - if not server_cfg.command: - raise ValueError(f"stdio server '{server_cfg.name}' missing 'command'") - client = Client({"command": server_cfg.command[0], "args": server_cfg.command[1:]}) - else: - raise ValueError(f"Unknown transport: {server_cfg.transport}") - await stack.enter_async_context(client) - tools = await load_mcp_tools(client.session) - # Rewrite each tool's .name to ":" for LLM disambiguation. - for t in tools: - original_name = t.name - t.name = f"{server_cfg.name}:{original_name}" - t._original_mcp_name = original_name # type: ignore[attr-defined] - return tools + def start(self) -> None: + if self._thread is not None and self._thread.is_alive(): + return + self._stop.clear() + self._thread = threading.Thread( + target=self._run, + name="MonitorRunner", + daemon=True, + ) + self._thread.start() + def stop(self, *, wait: bool = True, timeout: float = 5.0) -> None: + """Halt the scheduler thread and shut down the executor. -async def load_tools(cfg: MCPConfig, stack: AsyncExitStack) -> ToolRegistry: - """Load all enabled MCP servers and return a :class:`ToolRegistry`. + ``wait=True`` (default) blocks up to ``timeout`` seconds for + in-flight ticks to drain. Daemon threads are still joined so + pytest fixture teardown is deterministic. 
+ """ + self._stop.set() + thread = self._thread + if thread is not None and thread.is_alive() and wait: + thread.join(timeout=timeout) + self._executor.shutdown(wait=wait) + self._thread = None - The caller MUST pass an already-entered :class:`AsyncExitStack`. Each - FastMCP ``Client`` is registered into it; the caller controls lifetime via - ``await stack.aclose()``. - """ - registry = ToolRegistry() - for server_cfg in cfg.servers: - if not server_cfg.enabled: - continue - if server_cfg.transport == "in_process": - tools = await _load_in_process(server_cfg, stack) - else: - tools = await _load_remote(server_cfg, stack) - for t in tools: - original = getattr(t, "_original_mcp_name", t.name) - registry.add(ToolEntry( - name=original, description=t.description or "", - server=server_cfg.name, category=server_cfg.category, tool=t, - )) - return registry + # ----- test hook ----- + + def tick_once(self, when: datetime | None = None) -> None: + """Fire any monitors whose cron expression matches ``when``. + + Useful in tests where freezing wall-clock time is awkward; the + production scheduler loop calls this internally too. + """ + when = when or self._clock() + # Truncate to the minute so identical seconds within a minute + # don't fire the same monitor twice. + minute = when.replace(second=0, microsecond=0) + with self._lock: + entries = list(self._monitors.values()) + for entry in entries: + try: + if not _cron_matches(entry.skill.schedule or "* * * * *", minute): + continue + except Exception as exc: # noqa: BLE001 + logger.warning( + "monitor %s: cron parse failed (%s); skipping tick", + entry.skill.name, exc, + ) + continue + if entry.next_run_ts == minute: + # Already fired this minute; idempotent on oversleep. 
+ continue + entry.next_run_ts = minute + self._dispatch(entry) + + def _dispatch(self, entry: _RegisteredMonitor) -> None: + timeout = float(entry.skill.tick_timeout_seconds or 30.0) + future = self._executor.submit(entry.callable_) + + def _wait_and_log() -> None: + try: + future.result(timeout=timeout) + except FuturesTimeout: + logger.warning( + "monitor %s: tick exceeded %.1fs timeout", + entry.skill.name, timeout, + ) + except Exception as exc: # noqa: BLE001 + logger.warning( + "monitor %s: tick raised %s", entry.skill.name, exc, + ) + + # Watcher runs on a side thread so the scheduler loop never + # blocks waiting for a slow tick — the executor handles + # parallelism, the watcher handles per-tick timeout reporting. + threading.Thread( + target=_wait_and_log, + name=f"monitor-watch:{entry.skill.name}", + daemon=True, + ).start() + + # ----- scheduler loop ----- + + def _run(self) -> None: + """Single-threaded scheduler. Wakes once per second, fires + any monitor whose cron expression matches the current minute, + marks each fired monitor for the minute so we never fire + twice if we oversleep. + """ + while not self._stop.is_set(): + try: + self.tick_once() + except Exception as exc: # noqa: BLE001 — never crash the loop + logger.warning("MonitorRunner loop error: %s", exc) + # Sleep with frequent wakeups so stop() returns promptly. 
+ self._stop.wait(timeout=1.0) + + +__all__ = [ + "MonitorRunner", + "SafeEvalError", + "make_monitor_callable", + "safe_eval", +] # ====== module: runtime/graph.py ====== @@ -4004,7 +8364,21 @@ async def _ainvoke_with_retry(executor, input_, *, max_attempts: int = 3, last_exc: Exception | None = None for attempt in range(max_attempts): try: + # Phase 15 (LLM-COMPAT-01): the recursion_limit=25 workaround + # introduced in 3ba099f as a safety net is gone — the + # ``langchain.agents.create_agent`` migration replaces the + # old two-call structure (loop + separate + # ``with_structured_output`` pass) with a single tool-loop + # whose terminal signal is the AgentTurnOutput tool call + # itself (AutoStrategy → ToolStrategy fallback for non- + # function-calling Ollama models). The default langgraph + # recursion bound is now a true upper bound, not a workaround. return await executor.ainvoke(input_) + except GraphInterrupt: + # Phase 11 (FOC-04 / D-11-04): never retry a HITL pause. + # GraphInterrupt is a checkpointed pending_approval signal, + # not a transient error. + raise except Exception as exc: # noqa: BLE001 msg = str(exc).lower() transient = any(m in msg for m in _TRANSIENT_MARKERS) @@ -4172,6 +8546,30 @@ def _extract_final_text(messages: list) -> str: return "" + +def _first_terminal_tool_called_this_turn( + messages: list, + terminal_tool_names: frozenset[str], +) -> str | None: + """Return the bare name of the first typed-terminal tool called this turn. + + Phase 10 (FOC-03 / D-10-03): used to label the reconciliation log so + operators can correlate envelope-vs-tool-arg confidence divergences + against a specific tool. Tool names may be MCP-prefixed + (``<server>:<tool>``); we rsplit on the rightmost colon to recover the + bare name and match against the configured ``terminal_tool_names``. + Returns None when no terminal tool fired this turn. 
+ """ + if not terminal_tool_names: + return None + for msg in messages: + for tc in (getattr(msg, "tool_calls", None) or []): + name = tc.get("name", "") + bare = name.rsplit(":", 1)[-1] + if bare in terminal_tool_names: + return bare + return None + + def _sum_token_usage(messages: list) -> TokenUsage: """Sum input/output token counts across all messages that report usage_metadata.""" agent_in = agent_out = 0 @@ -4186,6 +8584,50 @@ def _sum_token_usage(messages: list) -> TokenUsage: ) +def _try_recover_envelope_from_raw(raw: str) -> AgentTurnOutput | None: + """Attempt to extract an :class:`AgentTurnOutput` from a raw LLM + string when LangGraph's structured-output pass raised + ``OutputParserException``. + + Strategy: + 1. Parse the whole string as JSON. + 2. If that fails, scan for the first balanced ``{...}`` substring + and try parsing that (handles markdown-fenced JSON or trailing + chatter). + 3. Validate the parsed dict against :class:`AgentTurnOutput`. + + Returns the parsed envelope on success, ``None`` on any failure. 
+ """ + if not raw or not raw.strip(): + return None + candidates: list[str] = [raw] + # Markdown-fenced JSON: ```json\n{...}\n``` + if "```" in raw: + for chunk in raw.split("```"): + stripped = chunk.strip() + if stripped.startswith("json"): + stripped = stripped[4:].lstrip() + if stripped.startswith("{"): + candidates.append(stripped) + # Greedy: first '{' through last '}' + first = raw.find("{") + last = raw.rfind("}") + if 0 <= first < last: + candidates.append(raw[first:last + 1]) + for candidate in candidates: + try: + payload = json.loads(candidate) + except (json.JSONDecodeError, ValueError): + continue + if not isinstance(payload, dict): + continue + try: + return AgentTurnOutput.model_validate(payload) + except Exception: # noqa: BLE001 + continue + return None + + def _handle_agent_failure( *, skill_name: str, @@ -4260,6 +8702,8 @@ def make_agent_node( gateway_cfg: GatewayConfig | None = None, terminal_tool_names: frozenset[str] = frozenset(), patch_tool_names: frozenset[str] = frozenset(), + injected_args: dict[str, str] | None = None, + gate_policy: "GatePolicy | None" = None, ) -> Callable[[GraphState], Awaitable[dict]]: """Factory: build a LangGraph node that runs a ReAct agent and decides a route. @@ -4280,6 +8724,14 @@ def make_agent_node( union ``OrchestratorConfig.harvest_terminal_tools`` / ``OrchestratorConfig.patch_tools``). Empty defaults preserve the "no harvester recognition" behavior for legacy callers. + + ``injected_args`` (Phase 9 / D-09-01) is the orchestrator-wide + map of ``arg_name -> dotted_path`` declared in + :attr:`OrchestratorConfig.injected_args`. Every entry is stripped + from each tool's LLM-visible signature (so the LLM cannot emit a + value for it) and re-supplied at invocation time from session + state. When ``None`` or empty, tools pass through to the LLM + unchanged — preserves legacy callers and the framework default. 
""" async def node(state: GraphState) -> dict: @@ -4287,32 +8739,165 @@ async def node(state: GraphState) -> dict: inc_id = incident.id started_at = datetime.now(timezone.utc).strftime(_UTC_TS_FMT) + # Phase 9 (D-09-01): strip injected-arg keys from every tool's + # LLM-visible signature BEFORE create_react_agent serialises the + # tool surface — so the LLM literally cannot emit values for + # those params. The framework re-supplies them at invocation + # time inside the gateway (or an inject-only wrapper) below. + + injected_keys = frozenset((injected_args or {}).keys()) + if injected_keys: + visible_tools = [ + strip_injected_params(t, injected_keys) for t in tools + ] + else: + visible_tools = tools + # Wrap tools per-invocation so each wrap closes over the live # ``Session`` for this run. When the gateway is unconfigured, # the original tools pass through untouched and # ``create_react_agent`` sees the same surface as before. if gateway_cfg is not None: + # Pass ORIGINAL tools (pre-strip) to wrap_tool — the gateway + # wrapper strips internally for the LLM-visible schema while + # keeping ``inner.args_schema`` intact so + # ``accepted_params_for_tool`` correctly recognises injected + # keys (e.g. ``environment``) as accepted by the underlying + # tool. Stripping twice (here AND in wrap_tool) hides those + # keys from ``accepted_params``, the inject step skips them, + # and FastMCP rejects the call as missing required arg. run_tools = [ wrap_tool(t, session=incident, gateway_cfg=gateway_cfg, - agent_name=skill.name, store=store) + agent_name=skill.name, store=store, + injected_args=injected_args or {}, + gate_policy=gate_policy) for t in tools ] + elif injected_keys: + # No gateway, but injected_args is configured — wrap each + # tool in an inject-only ``StructuredTool`` so the LLM-visible + # sig matches ``visible_tools`` while the underlying call + # still receives the framework-supplied values. 
+ from langchain_core.tools import StructuredTool + + _inject_cfg = injected_args or {} + + def _make_inject_only_wrapper( + base: BaseTool, llm_visible: BaseTool, sess: Session, + ) -> BaseTool: + async def _arun(**kwargs: Any) -> Any: + new_kwargs = _inject_args( + kwargs, + session=sess, + injected_args_cfg=_inject_cfg, + tool_name=base.name, + ) + return await base.ainvoke(new_kwargs) + + def _run(**kwargs: Any) -> Any: + new_kwargs = _inject_args( + kwargs, + session=sess, + injected_args_cfg=_inject_cfg, + tool_name=base.name, + ) + return base.invoke(new_kwargs) + + return StructuredTool.from_function( + func=_run, + coroutine=_arun, + name=base.name, + description=base.description, + args_schema=llm_visible.args_schema, + ) + + run_tools = [ + _make_inject_only_wrapper(orig, vis, incident) + for orig, vis in zip(tools, visible_tools) + ] else: - run_tools = tools - agent_executor = create_react_agent( - llm, run_tools, prompt=skill.system_prompt, + run_tools = visible_tools + # Phase 10 (FOC-03 / D-10-02) + Phase 15 (LLM-COMPAT-01): every + # responsive agent invocation is wrapped in an AgentTurnOutput + # envelope. ``langchain.agents.create_agent`` (the non-deprecated + # successor to ``langgraph.prebuilt.create_react_agent``) accepts a + # bare schema as ``response_format`` and, by default, wraps it in + # ``AutoStrategy`` — ProviderStrategy for models with native + # structured-output (OpenAI-class), falling back to ToolStrategy + # otherwise (Ollama). ToolStrategy injects AgentTurnOutput as a + # callable tool: when the LLM ``calls`` it, the loop terminates on + # the same turn with ``result["structured_response"]`` populated. + # Eliminates the old two-call structure (loop + separate + # ``with_structured_output`` pass) that hit recursion_limit=25 on + # Ollama models without true function-calling. 
+ agent_executor = create_agent( + model=llm, + tools=run_tools, + system_prompt=skill.system_prompt, + response_format=AgentTurnOutput, ) + # Phase 11 (FOC-04): reset per-turn confidence hint. The hint + # is updated below after _harvest_tool_calls_and_patches; on + # re-entry from a HITL pause the hint resets cleanly so a new + # turn starts from "no signal yet" (None). + try: + incident.turn_confidence_hint = None + except (AttributeError, ValueError): + pass + try: result = await _ainvoke_with_retry( agent_executor, {"messages": [HumanMessage(content=_format_agent_input(incident))]}, ) + except GraphInterrupt: + # Phase 11 (FOC-04 / D-11-04): HITL pause is NOT an error. + # Re-raise so LangGraph's checkpointer captures the paused + # state. Session.status is left to the orchestrator's + # interrupt-aware bridge, NOT _handle_agent_failure. + raise except Exception as exc: # noqa: BLE001 - return _handle_agent_failure( - skill_name=skill.name, started_at=started_at, exc=exc, - inc_id=inc_id, store=store, fallback=incident, - ) + # Phase 10 follow-up: when LangGraph's structured-output pass + # raises ``OutputParserException`` (Ollama / non-OpenAI + # providers don't always honor ``response_format`` cleanly), + # try to recover by parsing the raw LLM output ourselves. + # The exception's ``llm_output`` carries the model's reply + # verbatim; if it contains JSON matching the envelope schema, + # build a synthetic ``result`` and continue. On unrecoverable + # failure, log the raw output for diagnosis and fall through + # to ``_handle_agent_failure``. 
+ try: + from langchain_core.exceptions import OutputParserException + except ImportError: # pragma: no cover — langchain always present + OutputParserException = () # type: ignore[assignment] + if isinstance(exc, OutputParserException): + raw = getattr(exc, "llm_output", "") or "" + logger.warning( + "agent.structured_output_parse_failure agent=%s " + "raw_len=%d raw_preview=%r", + skill.name, len(raw), raw[:500], + ) + recovered = _try_recover_envelope_from_raw(raw) + if recovered is not None: + logger.info( + "agent.structured_output_recovered agent=%s", + skill.name, + ) + result = { + "messages": [], + "structured_response": recovered, + } + else: + return _handle_agent_failure( + skill_name=skill.name, started_at=started_at, exc=exc, + inc_id=inc_id, store=store, fallback=incident, + ) + else: + return _handle_agent_failure( + skill_name=skill.name, started_at=started_at, exc=exc, + inc_id=inc_id, store=store, fallback=incident, + ) # Tools (e.g. registered patch tools) write straight to disk. # Reload so the node's own append of agent_run + tool_calls @@ -4330,18 +8915,51 @@ async def node(state: GraphState) -> dict: terminal_tool_names=terminal_tool_names, patch_tool_names=patch_tool_names, ) + # Phase 11 (FOC-04): update hint so any subsequent in-turn + # tool call sees the harvested confidence at the gateway. + if agent_confidence is not None: + try: + incident.turn_confidence_hint = agent_confidence + except (AttributeError, ValueError): + pass # Pair tool responses with their tool calls. _pair_tool_responses(messages, incident) + # Phase 10 (FOC-03 / D-10-03): parse the structural envelope and + # reconcile its confidence against any typed-terminal-tool arg + # confidence harvested above. Envelope failure is a hard error — + # mark the agent_run failed with structured cause. 
+ try: + envelope = parse_envelope_from_result(result, agent=skill.name) + except EnvelopeMissingError as exc: + return _handle_agent_failure( + skill_name=skill.name, started_at=started_at, exc=exc, + inc_id=inc_id, store=store, fallback=incident, + ) + + terminal_tool_for_log = _first_terminal_tool_called_this_turn( + messages, terminal_tool_names, + ) + final_confidence = reconcile_confidence( + envelope.confidence, + agent_confidence, + agent=skill.name, + session_id=inc_id, + tool_name=terminal_tool_for_log, + ) + final_rationale = agent_rationale or envelope.confidence_rationale + final_signal = agent_signal if agent_signal is not None else envelope.signal + # Final summary text and token usage. - final_text = _extract_final_text(messages) + # Envelope content takes precedence over last AIMessage scrape. + final_text = envelope.content or _extract_final_text(messages) usage = _sum_token_usage(messages) _record_success_run( incident=incident, skill_name=skill.name, started_at=started_at, final_text=final_text, usage=usage, - confidence=agent_confidence, rationale=agent_rationale, signal=agent_signal, + confidence=final_confidence, rationale=final_rationale, signal=final_signal, store=store, ) next_route_signal = decide_route(incident) @@ -4377,6 +8995,16 @@ def _decide_from_signal(inc: Session) -> str: "resolution": "Proposed fix: restart api service. Auto-applied. INC resolved.", } +# Phase 10 (FOC-03): per-agent default envelope confidence for the stub +# LLM. Pre-Phase-10 the deep_investigator stub emitted no confidence at +# all, so the gate (threshold 0.75) always interrupted on the first +# call. Post-Phase-10 every agent must emit a confidence value — drive +# DI's stub envelope below threshold to preserve gate-pause behavior in +# existing tests. Other agents default to 0.85 (above threshold). 
+_DEFAULT_STUB_ENVELOPE_CONFIDENCE: dict[str, float] = { + "deep_investigator": 0.30, +} + def _latest_run_for(incident: Session, agent_name: str | None): """Return the most recent ``AgentRun`` for ``agent_name``, or None. @@ -4545,6 +9173,10 @@ def _build_agent_nodes(*, cfg: AppConfig, skills: dict, store: SessionStore, valid_signals = frozenset(cfg.orchestrator.signals) gateway_cfg = getattr(cfg.runtime, "gateway", None) + # Phase 11 (FOC-04): thread the orchestrator's gate_policy down to + # wrap_tool so should_gate can apply the configured per-app + # confidence threshold + gated environments / risk actions. + gate_policy = getattr(cfg.orchestrator, "gate_policy", None) # Build the harvester's tool-name sets once per graph-build. The # union of ``terminal_tools`` (status-transitioning) and # ``harvest_terminal_tools`` (harvest-only) gives the full @@ -4563,7 +9195,10 @@ def _build_agent_nodes(*, cfg: AppConfig, skills: dict, store: SessionStore, if kind == "supervisor": llm = None if skill.dispatch_strategy == "llm": - llm = get_llm(cfg.llm, skill.model, role=agent_name) + llm = get_llm( + cfg.llm, skill.model, role=agent_name, + default_llm_request_timeout=cfg.orchestrator.default_llm_request_timeout, + ) nodes[agent_name] = make_supervisor_node(skill=skill, llm=llm) continue # Default / "responsive" path. @@ -4573,11 +9208,16 @@ def _build_agent_nodes(*, cfg: AppConfig, skills: dict, store: SessionStore, stub_canned = {agent_name: _DEFAULT_STUB_CANNED[agent_name]} else: stub_canned = None + # Phase 10 (FOC-03): wire a per-agent default envelope confidence + # into the stub so pre-Phase-10 gate-pause-on-DI tests still pass. 
+ stub_env_conf = _DEFAULT_STUB_ENVELOPE_CONFIDENCE.get(agent_name) llm = get_llm( cfg.llm, skill.model, role=agent_name, stub_canned=stub_canned, + stub_envelope_confidence=stub_env_conf, + default_llm_request_timeout=cfg.orchestrator.default_llm_request_timeout, ) tools = registry.resolve(skill.tools, cfg.mcp) decide = _decide_from_signal @@ -4588,6 +9228,8 @@ def _build_agent_nodes(*, cfg: AppConfig, skills: dict, store: SessionStore, gateway_cfg=gateway_cfg, terminal_tool_names=terminal_tool_names, patch_tool_names=patch_tool_names, + injected_args=cfg.orchestrator.injected_args, + gate_policy=gate_policy, ) return nodes @@ -4684,6 +9326,10 @@ async def build_graph(*, cfg: AppConfig, skills: dict, store: SessionStore, ) else: framework_cfg = getattr(cfg, "framework", None) or resolve_framework_app_config(None) + # ``resolve_framework_app_config(None)`` always returns a bare + # ``FrameworkAppConfig`` (never None), so the chain above is + # exhaustive — assert for pyright's flow narrowing. + assert framework_cfg is not None gated_edges = _collect_gated_edges(skills) sg = StateGraph(GraphState) @@ -4750,7 +9396,11 @@ async def make_postgres_checkpointer( enclosing transaction would otherwise hold the row lock until explicit commit. """ - from langgraph.checkpoint.postgres.aio import AsyncPostgresSaver + # ``langgraph-checkpoint-postgres`` is an optional extra (declared + # under [project.optional-dependencies].postgres in pyproject) so + # the wheel is not present in CI's SQLite-only install. The module + # is only imported on the Postgres URL branch in production. + from langgraph.checkpoint.postgres.aio import AsyncPostgresSaver # pyright: ignore[reportMissingImports] from psycopg_pool import AsyncConnectionPool # Translate SQLAlchemy URL -> libpq connection string. 
SQLAlchemy @@ -5118,7 +9768,10 @@ def resolve_transform(path: str) -> Callable[..., dict]: raise TypeError( f"transform {path!r} did not resolve to a callable; got {obj!r}" ) - return obj + # Apps own the strict signature -- the framework only enforces + # ``callable``. The cast satisfies the declared return type without + # adding a runtime wrapper. + return cast(Callable[..., dict], obj) # ====== module: runtime/triggers/idempotency.py ====== @@ -5158,7 +9811,9 @@ def __init__(self, engine: Engine) -> None: self._engine = engine # Ensure the table exists even if the orchestrator hasn't run # ``Base.metadata.create_all`` yet (early lifespan path). - Base.metadata.create_all(engine, tables=[IdempotencyRow.__table__]) + # ``IdempotencyRow.__table__`` is a ``Table`` at runtime; the + # SQLAlchemy stub types it as the wider ``FromClause``. + Base.metadata.create_all(engine, tables=[IdempotencyRow.__table__]) # pyright: ignore[reportArgumentType] self._lru: dict[str, OrderedDict[str, str]] = {} self._lock = threading.Lock() @@ -5278,7 +9933,10 @@ def purge_expired(self) -> int: ) ) s.commit() - return result.rowcount or 0 + # ``rowcount`` is exposed on ``CursorResult`` (the concrete + # return of DML execute); the abstract ``Result`` stub does + # not declare it. + return result.rowcount or 0 # pyright: ignore[reportAttributeAccessIssue] # ------------------------------------------------------------------ # Internals @@ -5652,7 +10310,12 @@ def create( f"but no transport with that kind is registered " f"(known: {sorted(plugin_kinds)})" ) - transports.append(kind_cls(pcfg)) + # Plugin transports inherit from the abstract + # ``TriggerTransport`` (no positional args declared on the + # ABC) but every concrete subclass loaded via the entry- + # point registry must accept the plugin's config object. + # The ABC mismatch is a stub limitation, not a runtime bug. 
+ transports.append(kind_cls(pcfg)) # pyright: ignore[reportCallIssue] return cls(specs, transports, start_session_fn, idempotency) @@ -7233,6 +11896,112 @@ async def try_acquire(self, session_id: str) -> AsyncIterator[None]: slot.owner = None slot.lock.release() +# ====== module: runtime/skill_validator.py ====== + +class SkillValidationError(RuntimeError): + """Raised when skill YAML references a tool or route that does not + exist or is malformed. Refuses to start the orchestrator.""" + + +def _build_bare_to_full_map(registered_tools: set[str]) -> dict[str, list[str]]: + """Map bare tool name → list of fully-qualified ``<server>:<tool>``.""" + bare_to_full: dict[str, list[str]] = {} + for full in registered_tools: + bare = full.split(":", 1)[1] if ":" in full else full + bare_to_full.setdefault(bare, []).append(full) + return bare_to_full + + +def _check_tool_ref( + skill_name: str, + tool_ref: str, + registered_tools: set[str], + bare_to_full: dict[str, list[str]], +) -> None: + """Raise SkillValidationError if ``tool_ref`` doesn't resolve to a + registered tool, or resolves ambiguously across multiple servers.""" + if tool_ref in registered_tools: + return + resolutions = bare_to_full.get(tool_ref) + if resolutions is None: + raise SkillValidationError( + f"skill {skill_name!r} references tool {tool_ref!r} which " + f"is not registered. Known tools: {sorted(registered_tools)[:10]}..." + ) + if len(resolutions) > 1: + raise SkillValidationError( + f"skill {skill_name!r} uses bare tool ref {tool_ref!r} but " + f"it is exposed by multiple servers: {sorted(resolutions)}. " + f"Use the prefixed form to disambiguate." + ) + + +def validate_skill_tool_references( + skills: dict, registered_tools: set[str], +) -> None: + """Assert every ``tools.local`` entry in every skill resolves to a + registered MCP tool. + + ``registered_tools`` is the set of fully-qualified ``<server>:<tool>`` + names from the MCP loader. 
We accept either bare or prefixed forms + in skill YAML (the LLM-facing call uses prefixed; YAML can use + either for ergonomics). + """ + bare_to_full = _build_bare_to_full_map(registered_tools) + for skill_name, skill in skills.items(): + local = (skill.get("tools") or {}).get("local") or [] + for tool_ref in local: + _check_tool_ref(skill_name, tool_ref, registered_tools, bare_to_full) + + +def validate_skill_routes(skills: dict) -> None: + """Assert every skill has a ``when: default`` route entry. + + Skipped for ``kind: supervisor`` skills — supervisors dispatch via + ``dispatch_rules`` to subordinates and do not use the ``routes`` + table at all. + """ + for skill_name, skill in skills.items(): + if skill.get("kind") == "supervisor": + continue + routes = skill.get("routes") or [] + if not any((r.get("when") == "default") for r in routes): + raise SkillValidationError( + f"skill {skill_name!r} has no ``when: default`` route — " + f"agents whose signal doesn't match a rule will hang." + ) + +# ====== module: runtime/storage/checkpoint_gc.py ====== + +def gc_orphaned_checkpoints(engine: Engine) -> int: + """Remove orphaned checkpoint rows; return count removed. + + Returns 0 if the ``checkpoints`` table doesn't exist (fresh DB, + LangGraph checkpointer has not yet bootstrapped its schema). + """ + with engine.begin() as conn: + live_ids = {row[0] for row in conn.execute( + text("SELECT id FROM incidents") + )} + try: + rows = conn.execute(text( + "SELECT DISTINCT thread_id FROM checkpoints" + )).all() + except OperationalError: + return 0 + # thread_id may be ``INC-1`` or ``INC-1:retry-N`` — strip suffix. 
+ orphans = [] + for (tid,) in rows: + base = tid.split(":")[0] if tid else tid + if base not in live_ids: + orphans.append(tid) + for tid in orphans: + conn.execute( + text("DELETE FROM checkpoints WHERE thread_id = :tid"), + {"tid": tid}, + ) + return len(orphans) + # ====== module: runtime/orchestrator.py ====== if TYPE_CHECKING: @@ -7245,6 +12014,7 @@ async def try_acquire(self, session_id: str) -> AsyncIterator[None]: +from langgraph.errors import GraphInterrupt from langgraph.types import Command @@ -7257,9 +12027,29 @@ async def try_acquire(self, session_id: str) -> AsyncIterator[None]: + _log = logging.getLogger("runtime.orchestrator") +def _assert_envelope_invariant_on_finalize(session: "Session") -> None: + """Phase 10 (FOC-03) defence-in-depth log sweep. + + Hard rejection of envelope-less turns happens at the agent runner + (``parse_envelope_from_result`` raises ``EnvelopeMissingError``, + which the runner converts into an agent_run marked ``error``). + This finalize hook only logs WARNING for forensics on legacy on-disk + sessions whose agent_runs predate the envelope contract. Never + raises. + """ + for ar in session.agents_run: + if ar.confidence is None: + _log.warning( + "agent_run.envelope_missing agent=%s session_id=%s", + ar.agent, + session.id, + ) + + def _default_text_extractor(session) -> str: """Default text extraction for the incident-management example. @@ -7694,10 +12484,14 @@ async def create(cls, cfg: AppConfig) -> "Orchestrator": if dedup_cfg.stage2_model in cfg.llm.models: _llm_cfg_capture = cfg.llm _model_name = dedup_cfg.stage2_model + _default_timeout_capture = ( + cfg.orchestrator.default_llm_request_timeout + ) def _factory(): return get_llm( _llm_cfg_capture, _model_name, role="dedup", + default_llm_request_timeout=_default_timeout_capture, ) dedup_pipeline = DedupPipeline( @@ -7709,14 +12503,22 @@ def _factory(): # Backfill dedup_pipeline into the IntakeContext now that it is built. 
# The IntakeContext was constructed with dedup_pipeline=None above # because the pipeline is built after graph construction. + # ``intake_context`` was attached via ``object.__setattr__`` ~140 + # lines up; pyright doesn't see dynamic Pydantic attrs, so go + # via getattr for the type-checker. if dedup_pipeline is not None: - framework_cfg.intake_context.dedup_pipeline = dedup_pipeline + getattr(framework_cfg, "intake_context").dedup_pipeline = dedup_pipeline # No bespoke resume graph — resume runs through the main # graph via ``Command(resume=...)`` against the same # thread_id, with the checkpointer rehydrating paused state. + # ``repo_state_cls: Type[BaseModel]`` matches the loose + # bound on ``Orchestrator.StateT`` (also ``BaseModel``) at + # the call site, but pyright sees the un-narrowed + # ``StateT`` placeholder. Concrete narrowing happens via + # the runtime resolver enforced earlier in this method. instance = cls(cfg, store, skills, registry, graph, stack, framework_cfg=framework_cfg, - state_cls=repo_state_cls, + state_cls=repo_state_cls, # pyright: ignore[reportArgumentType] history=history, checkpointer=checkpointer, checkpointer_close=checkpointer_close, @@ -7732,7 +12534,13 @@ def _factory(): try: await checkpointer_close() # pyright: ignore[reportPossiblyUnboundVariable] except Exception: # noqa: BLE001 - pass + # The original BaseException is what the caller cares + # about; this cleanup failure must not mask it. Log so + # the FD-leak path stays observable. + _log.warning( + "build: checkpointer_close failed during error rollback", + exc_info=True, + ) await stack.aclose() raise @@ -7744,7 +12552,13 @@ async def aclose(self) -> None: try: await self._checkpointer_close() except Exception: # noqa: BLE001 - pass + # Best-effort: the rest of aclose() (exit_stack drain) + # must still run so MCP transports don't leak. Log so + # checkpointer-close failures stay observable. 
+ _log.warning( + "aclose: checkpointer close failed", + exc_info=True, + ) self._checkpointer_close = None await self._exit_stack.aclose() @@ -7823,6 +12637,12 @@ def _finalize_session_status(self, session_id: str) -> str | None: if inc.status not in ("new", "in_progress"): return None + # Phase 10 (FOC-03) defence-in-depth: hard rejection of envelope-less + # turns happens at the agent runner; this hook only logs WARNING for + # forensics on legacy on-disk sessions whose agent_runs predate the + # envelope contract. Never raises. + _assert_envelope_invariant_on_finalize(inc) + decision = self._infer_terminal_decision(inc.tool_calls) if decision is None: default = self.cfg.orchestrator.default_terminal_status @@ -7932,6 +12752,116 @@ def _save_or_yield(self, inc, new_status: str) -> str | None: except StaleVersionError: return None + @staticmethod + def _is_graph_interrupt(exc: BaseException) -> bool: + """Phase 11 (FOC-04 / D-11-04): identify a LangGraph HITL pause. + + ``GraphInterrupt`` is NOT an error -- it signals a checkpointed + ``pending_approval`` state. Real exceptions still flow through + the normal failure path. Helper kept on the orchestrator so + callers don't each re-import langgraph internals. + """ + return isinstance(exc, GraphInterrupt) + + @staticmethod + def _extract_last_error(inc: "Session") -> Exception | None: + """Reconstruct the last error from a Session in status='error'. + + The graph runner stores failures as an AgentRun with + ``summary='agent failed: '`` (graph.py:_handle_agent_failure). + We can't recover the original Exception type, so we return a + synthetic representative whose CLASS matches a _PERMANENT_TYPES + / _TRANSIENT_TYPES whitelist entry where possible -- that's all + :func:`runtime.policy.should_retry` needs (it does isinstance + checks). 
+ + Mapping (first match wins per AgentRun.summary scan, newest + first): + + - "EnvelopeMissingError" in body -> EnvelopeMissingError + - "ValidationError" in body -> pydantic.ValidationError + - "TimeoutError" / "timed out" -> TimeoutError + - "OSError" / "ConnectionError" -> OSError + - everything else -> RuntimeError (falls + through to permanent_error per fail-closed default in + should_retry) + """ + + import pydantic as _pydantic + for run in reversed(inc.agents_run): + summary = (run.summary or "") + if not summary.startswith("agent failed:"): + continue + body = summary.removeprefix("agent failed:").strip() + if "EnvelopeMissingError" in body: + return _EnvelopeMissingError( + agent=run.agent or "unknown", + field="confidence", + message=body, + ) + if "ValidationError" in body or "validation error" in body: + # Build a synthetic ValidationError; pydantic v2 supports + # ValidationError.from_exception_data. + try: + return _pydantic.ValidationError.from_exception_data( + title="reconstructed", line_errors=[], + ) + except Exception: # pragma: no cover -- pydantic API drift + return RuntimeError(body) + if ("TimeoutError" in body or "timed out" in body + or "asyncio.TimeoutError" in body): + return TimeoutError(body) + if "OSError" in body or "ConnectionError" in body: + return OSError(body) + return RuntimeError(body) + return None + + @staticmethod + def _extract_last_confidence(inc: "Session") -> float | None: + """Return the last recorded turn-level confidence on the session, + or None if no AgentRun carries one. should_retry treats None as + 'no signal yet' and skips the low-confidence gate. + """ + for run in reversed(inc.agents_run): + if run.confidence is not None: + return run.confidence + return None + + def preview_retry_decision( + self, session_id: str, + ) -> "RetryDecision": + """Phase 12 (FOC-05 / D-12-04): return the framework's retry + decision WITHOUT executing anything. 
The UI calls this to render + the retry button label + disabled state. + + Pure: same inputs always yield identical RetryDecision. Loads + the session from store; reads (retry_count, last_error, + last_confidence) and consults the same policy + ``runtime.policy.should_retry`` that ``_retry_session_locked`` + uses. No mutation, no thread-id bump, no lock acquired. + + For sessions whose status is not "error" (i.e. nothing to + retry), returns ``RetryDecision(retry=False, + reason="permanent_error")`` -- a defensive caller-friendly + outcome that lets the UI render a "cannot auto-retry" state + without inventing a new reason value. + """ + try: + inc = self.store.load(session_id) + except FileNotFoundError: + return RetryDecision(retry=False, reason="permanent_error") + if inc.status != "error": + return RetryDecision(retry=False, reason="permanent_error") + retry_count = int(inc.extra_fields.get("retry_count", 0)) + last_error = self._extract_last_error(inc) + last_confidence = self._extract_last_confidence(inc) + return should_retry( + retry_count=retry_count, + error=last_error, + confidence=last_confidence, + cfg=self.cfg.orchestrator, + ) + async def _finalize_session_status_async( self, session_id: str, ) -> str | None: @@ -8254,7 +13184,15 @@ async def resume_session(self, incident_id: str, tool_args: dict = {"incident_id": incident_id, "message": message} if team is not None: tool_args["team"] = team - tool_result = await self._invoke_tool(tool_name, tool_args) + # Phase 9 (D-09-01): expose the live session to + # _invoke_tool's injection branch via the implicit slot. + # try/finally so a failed tool call doesn't leak the + # reference into the next orchestrator-driven call. 
+ self._current_session_for_invoke = inc_loaded + try: + tool_result = await self._invoke_tool(tool_name, tool_args) + finally: + self._current_session_for_invoke = None inc_loaded.tool_calls.append(ToolCall( agent="orchestrator", tool=tool_name, @@ -8373,6 +13311,30 @@ async def _retry_session_locked(self, session_id: str) -> AsyncIterator[dict]: "reason": f"not in error state (status={inc.status})", "ts": _event_ts()} return + # Phase 12 (FOC-05 / D-12-04): consult the framework's pure + # retry policy BEFORE mutating session state. The decision is + # derived from (retry_count, last_error, last_turn_confidence, + # cfg) -- LLM intent is not consulted. On retry=False, emit + # retry_rejected with the policy's reason and DO NOT bump the + # retry_count or thread id (preserves the "not retryable" + # state on disk for UI re-rendering and retry-budget audits). + prior_retry_count = int(inc.extra_fields.get("retry_count", 0)) + last_error = self._extract_last_error(inc) + last_confidence = self._extract_last_confidence(inc) + decision = should_retry( + retry_count=prior_retry_count, + error=last_error, + confidence=last_confidence, + cfg=self.cfg.orchestrator, + ) + if not decision.retry: + _log.info( + "retry_session policy-rejected: id=%s reason=%s", + session_id, decision.reason, + ) + yield {"event": "retry_rejected", "incident_id": session_id, + "reason": decision.reason, "ts": _event_ts()} + return # Drop the failed AgentRun(s) so the timeline only retains # successful runs. Retry attempts then append fresh runs. inc.agents_run = [ @@ -8431,6 +13393,14 @@ async def _resume_with_input(self, incident_id: str, inc, decision: dict): config=self._thread_config(incident_id), ): yield self._to_ui_event(ev, incident_id) + except GraphInterrupt: + # Phase 11 (FOC-04 / D-11-04): a resume that re-paused via + # a fresh HITL gate. Don't restore the prior pending_intervention + # block (the new pending_approval ToolCall row is the + # canonical pause record now). 
Propagate so LangGraph's + # checkpointer captures the new pause; the UI's + # _render_pending_approvals_block surfaces the resume target. + raise except Exception as exc: # noqa: BLE001 — restore on any failure # Reload from disk to absorb any partial writes from tools # that ran before the failure, then restore intervention @@ -8456,6 +13426,14 @@ async def _invoke_tool(self, name: str, args: dict): Used for orchestrator-driven tool calls (e.g. an app-registered escalation tool invoked from the awaiting_input gate) that aren't initiated by an LLM. + + Phase 9 (D-09-01): orchestrator-driven calls also flow through + injection so the tool gets the canonical session-derived arg set + even when the orchestrator only passed intent-args. The current + session is read off ``self._current_session_for_invoke`` (set + by callers via try/finally) so the public signature stays + unchanged. When no session is reachable the injection step is + a no-op — the existing escalation path keeps working unchanged. """ entry = next( (e for e in self.registry.entries.values() if e.name == name), @@ -8463,6 +13441,17 @@ async def _invoke_tool(self, name: str, args: dict): ) if entry is None: raise KeyError(f"tool '{name}' not registered") + session = getattr(self, "_current_session_for_invoke", None) + cfg_inject = self.cfg.orchestrator.injected_args + if session is not None and cfg_inject: + + args = inject_injected_args( + args, + session=session, + injected_args_cfg=cfg_inject, + tool_name=name, + accepted_params=accepted_params_for_tool(entry.tool), + ) return await entry.tool.ainvoke(args) @staticmethod @@ -8483,6 +13472,9 @@ def _event_ts() -> str: # ====== module: runtime/api.py ====== +_log = logging.getLogger("runtime.api") + + def _resolve_environments(dotted: str | None) -> list[str]: """Resolve ``RuntimeConfig.environments_provider_path`` to a list. 
@@ -8676,7 +13668,12 @@ async def _trigger_dispatch(service, kwargs): try: await registry.stop_all() except Exception: # noqa: BLE001 - pass + # Best-effort: a misbehaving trigger transport must not + # block ``svc.shutdown()`` below. Surface for observability. + _log.warning( + "trigger registry stop_all failed during lifespan teardown", + exc_info=True, + ) # ``shutdown()`` cancels in-flight session tasks, closes the # underlying Orchestrator + MCP pool, joins the loop thread, # and resets the process-singleton. diff --git a/dist/apps/incident-management.py b/dist/apps/incident-management.py index 5edafde..8031b11 100644 --- a/dist/apps/incident-management.py +++ b/dist/apps/incident-management.py @@ -1,4 +1,30 @@ from __future__ import annotations +# ----- imports for runtime/errors.py ----- +"""Typed runtime errors. Phase 13 lands the LLM-call surface; future +hardening (HARD-04 silent-failure sweep, HARD-03 pyright flip, +real-LLM follow-ups) extends here. + +Importable as ``from runtime.errors import LLMTimeoutError, LLMConfigError``. +""" + + + +# ----- imports for runtime/terminal_tools.py ----- +"""Generic terminal-tool registry types. + +Apps register their terminal-tool rules and status vocabulary via +``OrchestratorConfig.terminal_tools`` / ``OrchestratorConfig.statuses``; +the framework reads these models without knowing app-specific tool +or status names. Cf. .planning/phases/06-generic-terminal-tool-registry/ +06-CONTEXT.md (D-06-01, D-06-02, D-06-05). +""" + + +from typing import Literal + +from pydantic import BaseModel, Field + + # ----- imports for runtime/config.py ----- """Config schemas for the orchestrator.""" @@ -6,11 +32,12 @@ import re from pathlib import Path from typing import Any, Literal -from pydantic import BaseModel, Field, field_validator, model_validator +from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator import yaml + # Session-id prefix grammar. 
The framework mints session ids of the form # ``{PREFIX}-YYYYMMDD-NNN`` (see ``runtime.state.Session.id_format``); # the prefix is the only piece an app picks. Allow alphanumerics + hyphens, @@ -34,7 +61,6 @@ class IncidentState(Session): -from pydantic import BaseModel, Field # ----- imports for runtime/state_resolver.py ----- """Resolve ``RuntimeConfig.state_class`` (a dotted path) to a class object. @@ -109,6 +135,7 @@ class IncidentState(Session): import ast from typing import Any, Callable, Literal +from pydantic import BaseModel, Field, field_validator, model_validator # ----- imports for runtime/llm.py ----- @@ -118,8 +145,21 @@ class IncidentState(Session): provider (kind + connection) to a model id and optional temperature/deployment. ``get_llm(cfg, "smart")`` looks up ``cfg.models["smart"]`` and uses its referenced ``cfg.providers[]`` to build a langchain ``BaseChatModel``. + +Phase 13 (HARD-01 / HARD-05): every chat + embedding HTTP call is bounded +by an effective ``request_timeout`` resolved as +``provider.request_timeout if not None else default_llm_request_timeout`` +(default 120.0s on ``OrchestratorConfig``). The native langchain timeout +knob is wired AND an ``asyncio.wait_for`` wrapper raises +``LLMTimeoutError(provider, model, elapsed_ms)`` on hang -- defence in +depth against partial-byte stalls where the httpx layer doesn't fire. +The hardcoded public-Ollama fallback is removed; ollama providers +must declare ``base_url`` (validated at config-load via +``LLMConfigError``). """ +import asyncio +import time from typing import Any from uuid import uuid4 from langchain_core.embeddings import Embeddings @@ -130,6 +170,7 @@ class IncidentState(Session): + # ----- imports for runtime/storage/models.py ----- """SQLAlchemy declarative model for the ``incidents`` table. 
@@ -183,6 +224,7 @@ class IncidentState(Session): import hashlib import numpy as np +from pydantic import SecretStr @@ -230,16 +272,19 @@ class IncidentState(Session): from typing import Any, Generic, Mapping, Optional, Type, TypeVar -from pydantic import BaseModel from sqlalchemy import select from sqlalchemy.engine import Engine -from sqlalchemy.orm import Session +from sqlalchemy.orm import Session as SqlaSession -# Mirrors the bound on ``SessionStore.StateT`` — kept permissive at -# ``BaseModel`` so framework code does not need to import the -# example-app subclass. The resolver in :mod:`runtime.state_resolver` -# enforces a ``runtime.state.Session`` subclass at config time. + +# Mirrors the bound on ``SessionStore.StateT`` — tightened from +# ``BaseModel`` to ``runtime.state.Session`` in Phase 19 (HARD-03) so +# pyright sees the typed fields (``id``, ``status``, ``deleted_at`` …) +# this store reads. The resolver in :mod:`runtime.state_resolver` +# already enforces a ``Session`` subclass at config time, and every +# in-tree caller passes either bare ``Session`` or a ``Session`` +# subclass. # ----- imports for runtime/storage/session_store.py ----- """Active session lifecycle store. @@ -261,6 +306,7 @@ class IncidentState(Session): from datetime import datetime, timezone from typing import Generic, Optional, Type, TypeVar +from pydantic import BaseModel from sqlalchemy import desc, select from sqlalchemy.orm import Session as SqlSession @@ -271,6 +317,66 @@ class IncidentState(Session): # hook existed. New rows are validated by ``_SESSION_ID_RE`` which # accepts any ``PREFIX-YYYYMMDD-NNN`` shape the app's ``id_format`` may # emit (e.g. ``CR-...`` for code-review). +# ----- imports for runtime/storage/event_log.py ----- +"""Append-only session event log. + +Events drive the status finalizer's inference (e.g. a registered +```` event appearing in the log -> session reached +the corresponding terminal status). They are never mutated or +deleted. 
+""" + + +from dataclasses import dataclass +from typing import Iterator + +from sqlalchemy.orm import Session + + + +# ----- imports for runtime/storage/migrations.py ----- +"""Idempotent migrations for the JSON-shaped row payloads. + +Fills the per-call audit fields on :class:`runtime.state.ToolCall` for +legacy rows. The risk-rated tool gateway uses five optional audit fields: + + * ``risk`` — ``"low" | "medium" | "high" | None`` + * ``status`` — ``ToolStatus`` literal (default ``"executed"``) + * ``approver`` — operator id, set when status in {approved, rejected} + * ``approved_at`` — ISO-8601 timestamp of the decision + * ``approval_rationale`` — free-text justification + +Older rows in the ``incidents.tool_calls`` JSON column lack these +fields. Pydantic hydrates the missing keys with their defaults at read +time so reading is already back-compat — but the on-disk JSON still +shows the legacy shape until something rewrites the row. + +This migration walks every session, normalises the JSON-shaped +``tool_calls`` list to the current audit schema, and saves the row back +when (and only when) at least one entry changed. Idempotent — running +twice is safe (the second pass is a no-op because every row already +has the fields). + +The function operates on the row's JSON list directly (not via the +``ToolCall`` Pydantic model) so we don't accidentally widen the +migration's contract — for example, dropping unknown extra keys via +Pydantic's ``extra='ignore'`` would silently delete forward-compat +fields in a downgrade scenario. JSON-walk is conservative: only fill +what's missing; leave everything else alone. +""" + + +from typing import Any, Iterable + +from sqlalchemy import inspect, text + + +# Columns added after the initial schema. Each entry is +# ``(column_name, sql_type, default_clause_or_None)``. 
SQLite ``ADD +# COLUMN`` cannot add a non-nullable column without a constant default, +# so every entry here is nullable — Pydantic hydrates the missing keys +# at read time. Append-only: never reorder, never delete. Removing a +# column needs a separate destructive migration with explicit sign-off. # ----- imports for runtime/mcp_loader.py ----- """Load MCP servers (in_process / stdio / http / sse) and build a tool registry. @@ -299,15 +405,313 @@ class IncidentState(Session): -# ----- imports for runtime/graph.py ----- -"""LangGraph state, routing helpers, and node runner.""" +# ----- imports for runtime/service.py ----- +"""Long-lived orchestrator service. + +Owns a background asyncio event loop and a shared FastMCP client pool. +All session execution will run as asyncio tasks on this loop. Sync callers +(Streamlit, FastAPI request handlers, CLI) submit coroutines via +``submit(coro) -> concurrent.futures.Future``. + +Lifecycle:: + + svc = OrchestratorService.get_or_create(cfg) + svc.start() # spins up background thread + loop + fut = svc.submit(some_coro) + result = fut.result(timeout=30) + svc.shutdown() # cancels in-flight tasks, closes MCP clients, joins thread + +Capabilities: + - Skeleton + singleton + start/shutdown lifecycle. + - ``submit()`` / ``submit_and_wait()`` thread-safe bridge. + - Shared ``MCPClientPool`` with per-server ``asyncio.Lock``. + - ``start_session()`` schedules a per-session asyncio task on the + service's loop and returns the session id immediately (the agent run + continues in the background). Active tasks are tracked in an + in-memory registry that evicts on completion / cancellation. + - ``list_active_sessions()`` returns a thread-safe snapshot of + the in-flight registry; the snapshot coroutine runs on the loop so + readers from any thread see a point-in-time consistent view. 
+ - ``stop_session(sid)`` cancels the in-flight task, waits up + to 5 s for graceful exit, and persists ``status="stopped"`` on the + row (clearing ``pending_intervention``). Idempotent — a no-op for + unknown ids or already-completed sessions. + - Hard cap on concurrent sessions. ``start_session`` raises + ``SessionCapExceeded`` once ``len(self._registry) >= + self.max_concurrent_sessions``. Fail fast; queueing is not supported. + +The singleton is process-scoped and reset on ``shutdown()`` so that test +suites can build, tear down, and rebuild the service without leaking +state across cases. +""" -import asyncio + +import concurrent.futures import logging -from typing import TypedDict, Callable, Awaitable +import threading +from typing import Any, Awaitable, Coroutine, TypeVar, cast + + + +# ----- imports for runtime/agents/turn_output.py ----- +"""Phase 10 (FOC-03) — AgentTurnOutput envelope + reconciliation helpers. + +The envelope is the structural contract every responsive agent invocation +must satisfy: content + confidence in [0,1] + confidence_rationale + optional +signal. The framework wires it as ``response_format=AgentTurnOutput`` into +``langchain.agents.create_agent`` (see Phase 15 / LLM-COMPAT-01); the +agent loop terminates on the same turn the LLM emits the envelope-shaped +tool call, populating ``result["structured_response"]``, which the +framework reads and persists onto the ``AgentRun`` row. + +D-10-02 — pydantic envelope wrapped via ``response_format``. +D-10-03 — when a typed-terminal-tool was called this turn, the framework +reconciles its ``confidence`` arg against the envelope's. Tolerance 0.05 +inclusive; tool-arg wins on mismatch with an INFO log. + +This is a leaf module: no imports from ``runtime.graph`` or +``runtime.orchestrator``. Both of those depend on it; the dependency +graph is acyclic. 
+""" + + + +from pydantic import BaseModel, ConfigDict, Field + +# ----- imports for runtime/tools/gateway.py ----- +"""Risk-rated tool gateway: pure resolver + ``BaseTool`` HITL wrapper. + +The gateway sits between the ReAct agent and each tool the orchestrator +configures. It enforces the *hybrid* HITL policy resolved by +``effective_action``: + + ``auto`` -> call the underlying tool directly (no plumbing) + ``notify`` -> call the tool, then persist a soft-notify audit entry + ``approve`` -> raise ``langgraph.types.interrupt(...)`` BEFORE calling + the tool; on resume re-invoke + +The resolver is a plain function with no I/O so it can be unit-tested +exhaustively without spinning up Pydantic Sessions, MCP servers, or a +LangGraph runtime. The wrapper is a closure factory deliberately built +inside ``make_agent_node`` so the closure captures the live ``Session`` +per agent invocation (mitigation R2 in the Phase-4 plan). +""" + + +from fnmatch import fnmatchcase +from typing import TYPE_CHECKING, Any, Literal + + + + +# ``GateDecision`` is imported lazily inside ``_evaluate_gate`` (function +# body) to avoid a runtime cycle (policy.py imports gateway types). The +# type-only import below lets pyright resolve the string-literal return +# annotation on ``_evaluate_gate`` without forming a real cycle. +# ----- imports for runtime/tools/arg_injection.py ----- +"""Session-derived tool-arg injection (Phase 9 / FOC-01 / FOC-02). + +Two responsibilities, one module: + +1. :func:`strip_injected_params` — clones a ``BaseTool``'s args_schema with + one or more parameters removed. The LLM only sees the stripped sig and + therefore cannot hallucinate values for those params (D-09-01). The + original tool is left untouched so direct downstream callers (tests, + scripts, in-process MCP fixtures) keep working. + +2. 
:func:`inject_injected_args` — at tool-invocation time, re-adds the + real values resolved from the live :class:`runtime.state.Session` via + the configured dotted paths. When the LLM still supplied a value for + an injected arg, the framework's session-derived value wins and an + INFO log captures the override (D-09-03). + +The framework stays generic — apps declare which args to inject and from +where via :attr:`runtime.config.OrchestratorConfig.injected_args` (D-09-02). +""" + + + +from pydantic import BaseModel, create_model + + + +# Module-private logger. Tests assert against logger name +# ``"runtime.orchestrator"`` so the override-log line shows up alongside +# the rest of the orchestrator-side observability without requiring a +# separate caplog target. +# ----- imports for runtime/tools/approval_watchdog.py ----- +"""Pending-approval timeout watchdog. + +A high-risk tool call enters ``langgraph.types.interrupt()`` and the +session sits in ``awaiting_input`` indefinitely. Without a watchdog +the slot leaks against ``OrchestratorService.max_concurrent_sessions`` +forever — the cap eventually starves out new traffic. + +The :class:`ApprovalWatchdog` is an asyncio task that runs on the +service's background loop. Every ``poll_interval_seconds`` it: + + 1. Snapshots the in-flight session registry. + 2. For each session whose row has ``status="awaiting_input"``, + scans ``tool_calls`` for entries with ``status="pending_approval"`` + whose ``ts`` is older than ``approval_timeout_seconds``. + 3. Resumes each such session via ``Command(resume={"decision": + "timeout", "approver": "system", "rationale": "approval window + expired"})``. The wrapped tool's resume path updates the audit + row to ``status="timeout"``. + +Failures during polling (DB hiccup, malformed row) are logged and +swallowed so a single bad session cannot kill the watchdog. 
+""" + + +from typing import TYPE_CHECKING, Any + + +# ----- imports for runtime/policy.py ----- +"""Pure HITL gating policy (Phase 11 / FOC-04). + +The :func:`should_gate` function is the SOLE place the framework decides +whether a tool call requires human-in-the-loop approval. It composes +three orthogonal inputs: + + 1. ``effective_action(tool_call.tool, env=session.environment, + gateway_cfg=cfg.gateway)`` -- preserves the v1.0 PVC-08 + prefixed-form lookup invariant. + 2. ``session.environment`` -- gated when in + ``cfg.gate_policy.gated_environments``. + 3. ``confidence`` -- gated when below + ``cfg.gate_policy.confidence_threshold``. + +Pure: same inputs always yield identical :class:`GateDecision`; no I/O, +no skill-prompt input, no mutation. + +Precedence (descending): + + 1. ``effective_action`` returns a value in + ``cfg.gate_policy.gated_risk_actions`` + -> ``GateDecision(gate=True, reason="high_risk_tool")`` + 2. ``session.environment`` in ``cfg.gate_policy.gated_environments`` + AND ``effective_action != "auto"`` + -> ``GateDecision(gate=True, reason="gated_env")`` + 3. ``confidence`` is not None AND + ``confidence < cfg.gate_policy.confidence_threshold`` + AND ``effective_action != "auto"`` + -> ``GateDecision(gate=True, reason="low_confidence")`` + 4. otherwise -> ``GateDecision(gate=False, reason="auto")`` + +The literal ``"blocked"`` is reserved on :class:`GateDecision.reason` +for future hard-stop semantics; Phase 11 itself never returns it from a +production code path. +""" + + + +from pydantic import BaseModel, ConfigDict + + +# Phase 11 (FOC-04): forward-reference imports for the should_gate +# signature only; kept inside ``TYPE_CHECKING`` so the bundle's +# intra-import stripper does not remove a load-bearing import. The +# ``pass`` keeps the block syntactically valid after stripping. +# ----- imports for runtime/agents/responsive.py ----- +"""Responsive agent kind — the today-default LLM agent. 
+ +A responsive skill is a LangGraph node that: + +1. Builds a ReAct executor over the skill's ``tools`` and ``model``. +2. Invokes the executor with the live ``Session`` payload as a human + message preamble. +3. Records ``ToolCall`` and ``AgentRun`` rows on the session, harvests + the agent's confidence / signal / rationale, and decides the next + route from ``skill.routes``. + +This module owns only the node-factory entrypoint +(``make_agent_node``); the implementation reuses helpers in +:mod:`runtime.graph` so existing call sites and the gate node continue +to work unchanged. Supervisor and monitor factories live alongside it +under :mod:`runtime.agents` rather than piling more kinds into +``graph.py``. +""" + + +from typing import Callable from langchain_core.messages import HumanMessage -from langgraph.prebuilt import create_react_agent +from langchain.agents import create_agent + +from langgraph.errors import GraphInterrupt + + + + + + + +# ----- imports for runtime/agents/supervisor.py ----- +"""Supervisor agent kind — no-LLM router. + +A supervisor skill is a LangGraph node that: + +1. Reads the live ``Session`` plus the current dispatch depth. +2. Picks one or more subordinate agents per ``dispatch_strategy``: + ``rule`` (deterministic, evaluated via the same safe-eval AST that + gates monitor expressions) or ``llm`` (one short LLM call against + ``dispatch_prompt``). +3. Emits a structured ``supervisor_dispatch`` log entry (no + ``AgentRun`` row — supervisors are bookkeeping, not token-burning + agents). +4. Returns ``next_route`` set to the chosen subordinate (or to + ``__end__`` when the depth limit is hit). + +The recursion depth is tracked in :class:`runtime.graph.GraphState`'s +``dispatch_depth`` field; if a supervisor would exceed +``skill.max_dispatch_depth`` the node aborts with a clean error +instead of recursing forever. + +This is **not** a fan-out implementation; we always pick a single +target. 
Multi-target ``Send()`` is intentionally not supported. +""" + + +from typing import Any, Callable + +from langchain_core.messages import HumanMessage, SystemMessage + + + +# ----- imports for runtime/agents/monitor.py ----- +"""Monitor agent kind — out-of-band scheduled observer. + +A monitor skill runs **outside** any session graph. The orchestrator +owns one :class:`MonitorRunner` (a singleton) which schedules registered +monitor skills on a small bounded +:class:`concurrent.futures.ThreadPoolExecutor`. +Each tick: + +1. Calls every tool name in ``observe`` via the supplied callable + (``observe_fn``); aggregates results into one dict keyed by tool. +2. Evaluates ``emit_signal_when`` against the observation using the + stdlib safe-eval evaluator (R7). +3. If true, looks up ``trigger_target`` in the supplied trigger + registry / fire callback and fires it with the observation as the + payload. + +APScheduler is intentionally *not* a dependency: the air-gapped target +env doesn't ship it (see ``rules/build.md``). We get away with a tiny +single-threaded scheduler thread because monitor schedules are coarse +(minute-resolution cron) and tool calls are dispatched into the +executor; the scheduler thread itself never blocks on tool I/O. +""" + + +from concurrent.futures import ThreadPoolExecutor, TimeoutError as FuturesTimeout + + +# ----- imports for runtime/graph.py ----- +"""LangGraph state, routing helpers, and node runner.""" + +from typing import Any, TypedDict, Callable, Awaitable + from langgraph.graph import StateGraph, END @@ -316,6 +720,11 @@ class IncidentState(Session): +# Phase 11 (FOC-04 / D-11-04): GraphInterrupt is the LangGraph +# pending-approval pause signal. It is NOT an error and must NOT route +# through _handle_agent_failure -- the orchestrator's interrupt-aware +# bridge handles the resume protocol via the checkpointer. + # ----- imports for runtime/checkpointer_postgres.py ----- """Postgres checkpointer wrapper. 
@@ -383,7 +792,6 @@ class IncidentState(Session): from abc import ABC, abstractmethod -from dataclasses import dataclass from typing import TYPE_CHECKING # ----- imports for runtime/triggers/config.py ----- @@ -418,7 +826,7 @@ class IncidentState(Session): """ -from typing import Any, Callable, Type +from typing import Any, Callable, Type, cast @@ -448,7 +856,6 @@ class IncidentState(Session): """ -import threading from collections import OrderedDict from datetime import datetime, timezone, timedelta @@ -471,7 +878,6 @@ class IncidentState(Session): import hmac -from typing import Callable from fastapi import Header, HTTPException, status @@ -683,7 +1089,6 @@ async def _poll(self, registry): """ -from typing import Any, Callable # ----- imports for runtime/memory/session_state.py ----- @@ -700,7 +1105,6 @@ async def _poll(self, registry): """ -from pydantic import BaseModel, ConfigDict, Field # ----- imports for runtime/memory/knowledge_graph.py ----- @@ -878,6 +1282,37 @@ async def _poll(self, registry): from typing import AsyncIterator +# ----- imports for runtime/skill_validator.py ----- +"""Load-time validation of skill YAML against the live MCP registry. + +Catches: + * tools.local entries that reference a non-existent (server, tool) + pair (typically typos that would silently make the tool invisible). + * routes that omit ``when: default`` (would cause graph hangs at + __end__ when no signal matches). +""" + + + +# ----- imports for runtime/storage/checkpoint_gc.py ----- +"""Garbage-collect orphaned LangGraph checkpoints. + +When ``Orchestrator.retry_session`` rebinds a session to a new +``thread_id`` (e.g. ``INC-1:retry-1``), the original ``INC-1`` thread's +checkpoint becomes orphaned — no code path will ever resume it. Over +time these accumulate. ``gc_orphaned_checkpoints`` removes any +checkpoint whose ``thread_id`` does not reference an active session +(or a known retry suffix). 
+ +This is intentionally conservative: only checkpoints whose thread_id +prefix matches no live session row at all are removed. +""" + + +from sqlalchemy import text +from sqlalchemy.exc import OperationalError + + # ----- imports for runtime/orchestrator.py ----- """Public Orchestrator class — the API consumed by the UI and (future) FastAPI.""" @@ -918,7 +1353,6 @@ async def _poll(self, registry): from fastapi.responses import StreamingResponse - # ----- imports for runtime/api_dedup.py ----- """Dedup retraction HTTP routes. @@ -996,22 +1430,135 @@ async def _poll(self, registry): from typing import Any, Callable, TypedDict +# Phase 16 (BUNDLER-01): use the sibling-defined ``_SEED_ROOT`` constant +# instead of an aliased module reference. The bundler's intra-import +# stripper removes ``from runtime.memory import knowledge_graph as +# _knowledge_graph_mod`` from the bundled source, leaving +# ``_knowledge_graph_mod.__file__`` as a NameError at module load. The +# import below is also stripped, but ``_SEED_ROOT`` survives module +# flattening because it's defined at module scope in knowledge_graph.py. + + + + + + + + + +# ====== module: runtime/errors.py ====== + +class LLMTimeoutError(TimeoutError): + """Raised when an LLM provider HTTP call exceeds request_timeout. + + Subclasses ``TimeoutError`` so ``runtime.policy._TRANSIENT_TYPES`` + auto-classifies it as transient via ``isinstance`` -- no policy.py + edit needed (D-13-04). + + The ``__str__`` includes the substring ``"timed out"`` so existing + string-matchers in ``runtime.graph._TRANSIENT_MARKERS`` and + ``runtime.orchestrator._reconstruct_last_error`` also catch it + without modification. 
+ """ + + def __init__(self, provider: str, model: str, elapsed_ms: int) -> None: + self.provider = provider + self.model = model + self.elapsed_ms = elapsed_ms + super().__init__( + f"LLM request timed out after {elapsed_ms}ms " + f"(provider={provider}, model={model})" + ) + + +class LLMConfigError(ValueError): + """Raised at config-load when a provider is missing a required field. + + Subclasses ``ValueError`` so pydantic ``@model_validator(mode='after')`` + propagates it cleanly into ``ValidationError`` (D-13-05). + """ + + def __init__(self, provider: str, missing_field: str) -> None: + self.provider = provider + self.missing_field = missing_field + super().__init__( + f"{provider} provider requires {missing_field!r}" + ) + + +__all__ = ["LLMTimeoutError", "LLMConfigError"] + +# ====== module: runtime/terminal_tools.py ====== + +class TerminalToolRule(BaseModel): + """Maps a terminal tool name to the session status it produces. + + ``tool_name`` matches both bare (``set_recommendation``) and prefixed + (``:set_recommendation``) MCP tool-call names — the framework + does the suffix check. + + ``status`` must reference a name declared in the same + ``OrchestratorConfig.statuses`` map; ``OrchestratorConfig``'s + cross-field validator enforces this at config-load. + + ``extract_fields`` declares per-rule extra-metadata pulls. Each + key is the destination field name on the session + (``Session.extra_fields[]``); each value is an ordered list + of ``args.X`` / ``result.X`` lookup hints. The framework picks + the first non-falsy match. Empty dict (default) means "no extra + metadata to capture". Generalises the v1.0 + ``_extract_team(tc, team_keys)`` path; the same lookup syntax is + preserved (D-06-02). + + ``match_args`` is an optional argument-value discriminator. When + non-empty, the rule matches a tool call only if EVERY ``(key, + value)`` pair in ``match_args`` matches ``tool_call.args[key]`` + exactly. 
Lets one tool name route to multiple statuses based on + a discriminator argument (e.g. ``set_recommendation`` with + ``recommendation=approve`` vs ``recommendation=request_changes``). + Empty default = no arg dispatch; preserves the v1.0 single-rule + shape (DECOUPLE-07 / D-08-03). + """ + model_config = {"extra": "forbid"} + tool_name: str = Field(min_length=1) + status: str = Field(min_length=1) + extract_fields: dict[str, list[str]] = Field(default_factory=dict) + match_args: dict[str, str] = Field(default_factory=dict) +StatusKind = Literal[ + "success", # e.g. set_recommendation(approve) -> approved + "failure", # e.g. set_recommendation(request_changes) -> changes_requested + "escalation", # app-defined escalation terminal (e.g. ) + "needs_review", # finalize fired with no rule match + "pending", # session in flight +] +class StatusDef(BaseModel): + """Pydantic record of one app status. + Framework reads ``terminal`` to decide finalize-vs-pending and + ``kind`` to dispatch the needs_review fallback path / let UIs + group statuses without owning their own taxonomy. ``color`` and + other presentation fields stay in ``UIConfig.badges`` (D-06-05 + rejected alternative — presentation leak). + """ + model_config = {"extra": "forbid"} + name: str = Field(min_length=1) + terminal: bool + kind: StatusKind # ====== module: runtime/config.py ====== _SESSION_ID_PREFIX_RE = re.compile(r"^[A-Za-z0-9-]{1,16}$") -ProviderKind = Literal["ollama", "azure_openai", "stub"] +ProviderKind = Literal["ollama", "azure_openai", "openai_compat", "stub"] class ProviderConfig(BaseModel): @@ -1019,12 +1566,35 @@ class ProviderConfig(BaseModel): Multiple named ``ModelConfig`` entries can reference the same provider so that, e.g., two Ollama models share a single base_url + api_key. + + Phase 13 (HARD-01 / D-13-01): per-provider ``request_timeout`` + override (None means "use OrchestratorConfig.default_llm_request_timeout"). 
+ Phase 13 (HARD-05 / D-13-06): ollama providers MUST declare + ``base_url``; the @model_validator below catches the omission at + config-load and raises ``LLMConfigError``. The hardcoded public + Ollama fallback in ``runtime.llm`` is removed in the same phase. """ kind: ProviderKind - base_url: str | None = None # ollama + base_url: str | None = None # ollama (REQUIRED via validator) api_key: str | None = None # ollama, azure_openai - endpoint: str | None = None # azure_openai + endpoint: str | None = None # azure_openai (validated lazily in builder) api_version: str | None = None # azure_openai + request_timeout: float | None = Field( + default=None, gt=0, le=600, + ) # NEW Phase 13 (D-13-01) — None -> OrchestratorConfig default + + @model_validator(mode="after") + def _validate_required_fields(self) -> "ProviderConfig": + # D-13-06: only ollama is promoted to config-load validation in + # Phase 13. azure_openai (`endpoint`) and openai_compat + # (`base_url` + `api_key`) keep their existing first-request + # ValueError raises in `_build_*_chat`. Promoting them is a + # potential follow-up; see CONTEXT.md "Deferred Ideas". + if self.kind == "ollama" and not self.base_url: + raise LLMConfigError( + provider="ollama", missing_field="base_url", + ) + return self class ModelConfig(BaseModel): @@ -1131,6 +1701,76 @@ class Paths(BaseModel): incidents_dir: str = "incidents" +class GatePolicy(BaseModel): + """Phase 11 (FOC-04): declarative HITL gating policy. + + Drives the framework's pure ``should_gate`` boundary. The LLM never + sees this config -- flow control is a framework decision, not a + skill-prompt incantation. + + ``confidence_threshold`` is the strict-less-than predicate the gate + applies to the active turn confidence; tool calls below the + threshold fire a low_confidence pause for any non-auto-rated tool. 
+ + ``gated_environments`` enumerates Session.environment values that + automatically gate every non-auto-rated tool call regardless of + confidence -- lifecycle defence against blast radius in production. + + ``gated_risk_actions`` enumerates GatewayAction Literal values + (``auto``/``notify``/``approve``) that ALWAYS trigger a gate + regardless of env or confidence. Default ``{"approve"}`` mirrors + v1.0 HITL behaviour. + + Phase 11 chooses ``"approve"`` (the actual GatewayAction literal) + over CONTEXT.md's sketched ``"hitl"`` -- see + src/runtime/tools/gateway.py:32 for the canonical 3-valued + GatewayAction Literal. + """ + + model_config = ConfigDict(extra="forbid") + + confidence_threshold: float = Field(default=0.7, ge=0.0, le=1.0) + gated_environments: set[str] = Field( + default_factory=lambda: {"production"}, + ) + gated_risk_actions: set[str] = Field( + default_factory=lambda: {"approve"}, + ) + + +class RetryPolicy(BaseModel): + """Phase 12 (FOC-05): declarative retry policy. + + Drives the framework's pure ``should_retry`` boundary. The LLM never + sees this config -- flow control is a framework decision, not a + skill-prompt incantation. Mirrors GatePolicy's shape so the + OrchestratorConfig surface stays uniform. + + ``max_retries`` is the absolute cap on automatic retries (compared + with ``retry_count`` via ``>=``). 0 disables auto-retry entirely; + the recommended default 2 mirrors the v1.2 ROADMAP sketch and the + existing transient-5xx auto-retry budget in graph.py. + + ``retry_on_transient`` lets apps with strict SLOs disable framework + auto-retry of transient errors entirely (escalate immediately + instead). + + ``retry_low_confidence_threshold`` is the strict-less-than predicate + for "the LLM gave up; don't burn budget on a retry". Defaults to + 0.4 -- well below the typical gate_policy 0.7-0.8 threshold so a + low-confidence escalation triggers HITL intervention before the + retry path even considers it. 
+ """ + + model_config = ConfigDict(extra="forbid") + + max_retries: int = Field(default=2, ge=0, le=10) + retry_on_transient: bool = True + retry_low_confidence_threshold: float = Field( + default=0.4, ge=0.0, le=1.0, + ) + + class OrchestratorConfig(BaseModel): model_config = {"extra": "forbid"} @@ -1221,6 +1861,41 @@ class OrchestratorConfig(BaseModel): # bad path raises at boot with a useful message (DECOUPLE-05 / D-08-01). state_overrides_schema: str | None = None + # Phase 9 (D-09-02 / FOC-01): map of LLM-visible-arg -> dotted-path + # on the live Session. Tools whose param name matches a key in this + # dict get the param stripped from the LLM-visible signature, and + # the framework supplies the resolved value at _invoke_tool / + # _GatedTool._run / _arun time. Apps declare what to inject; the + # framework stays generic. Empty default = no injection (legacy + # behaviour). Validated at config-load: keys are non-empty + # identifiers, values are dotted paths starting with "session.". + injected_args: dict[str, str] = Field(default_factory=dict) + + # Phase 11 (FOC-04): declarative HITL gating policy. Apps tune + # thresholds in YAML; the framework's should_gate boundary reads + # this struct and the LLM never sees it. Default keeps v1.1 + # behaviour (production gates "approve"-risk tools, threshold 0.7). + gate_policy: "GatePolicy" = Field(default_factory=lambda: GatePolicy()) + + # Phase 12 (FOC-05): declarative retry policy. Apps tune + # max_retries / retry_on_transient / low-confidence threshold in + # YAML; the framework's should_retry boundary reads this struct + # and the LLM never sees it. Default keeps v1.2 behaviour + # (max_retries=2, transient retries enabled, confidence floor 0.4). + retry_policy: "RetryPolicy" = Field( + default_factory=lambda: RetryPolicy(), + ) + + # Phase 13 (HARD-01 / D-13-02): framework-default LLM HTTP request + # timeout in seconds. 
Per-provider ``ProviderConfig.request_timeout`` + # overrides this; ``None`` on the provider means "use this default". + # Bounded to catch indefinite hangs (CONCERNS C1) while leaving room + # for slow CPU Ollama runs (e.g., gpt-oss:120b). 600s upper bound + # prevents accidentally-disabling the protection. + default_llm_request_timeout: float = Field( + default=120.0, gt=0, le=600, + ) + @field_validator("state_overrides_schema") @classmethod def _validate_state_overrides_schema_format( @@ -1255,6 +1930,38 @@ def _validate_state_overrides_schema_format( ) return v + @field_validator("injected_args") + @classmethod + def _validate_injected_args( + cls, v: dict[str, str], + ) -> dict[str, str]: + """Phase 9 (D-09-02): config-load validation for injected_args. + + Each entry is ``arg_name -> dotted_path`` where ``arg_name`` must + be a valid Python identifier (it is the keyword name on a tool + signature) and ``dotted_path`` must be a non-empty string with at + least one dot (e.g. ``session.environment``). Real attribute + resolution happens at injection time in + :func:`runtime.tools.arg_injection.inject_injected_args` so + config-load doesn't drag the live ``Session`` into every consumer. + """ + for key, path in v.items(): + if not key or not key.isidentifier(): + raise ValueError( + f"injected_args key {key!r} must be a non-empty " + f"Python identifier" + ) + if not isinstance(path, str) or not path.strip(): + raise ValueError( + f"injected_args[{key!r}] must be a non-empty dotted path" + ) + if "." not in path: + raise ValueError( + f"injected_args[{key!r}]={path!r} must be a dotted path " + f"(e.g. 'session.environment')" + ) + return v + @model_validator(mode="after") def _validate_terminal_tool_registry(self) -> "OrchestratorConfig": """Cross-field invariants for the terminal-tool registry. 
@@ -1590,7 +2297,11 @@ def _coerce_dedup(self) -> "AppConfig": if isinstance(self.dedup, DedupConfig): return self if isinstance(self.dedup, dict): - self.__dict__["dedup"] = DedupConfig(**self.dedup) + # ``BaseModel.__dict__`` is typed as ``MappingProxyType`` in + # the pydantic stub; the documented post-validator mutation + # path is direct ``__dict__`` assignment, which works at + # runtime (pydantic stores fields in a plain dict). + self.__dict__["dedup"] = DedupConfig(**self.dedup) # pyright: ignore[reportIndexIssue] return self raise ValueError( f"app.dedup must be a DedupConfig or dict; got " @@ -1631,8 +2342,9 @@ def _coerce_triggers(self) -> "AppConfig": ) coerced.append(cls(**raw)) # Pydantic v2 stores fields in ``__dict__``; assigning here is - # the documented way to mutate after validation. - self.__dict__["triggers"] = coerced + # the documented way to mutate after validation. (Stub types + # ``__dict__`` as MappingProxyType; runtime is a plain dict.) + self.__dict__["triggers"] = coerced # pyright: ignore[reportIndexIssue] return self @@ -1749,6 +2461,17 @@ class Session(BaseModel): # with a stale version raise ``StaleVersionError`` so the caller can # reload + retry. version: int = 1 + # Phase 11 (FOC-04): transient per-turn confidence hint set by the + # agent runner (graph.py / responsive.py) AFTER each + # _harvest_tool_calls_and_patches call so the gateway's should_gate + # boundary can apply low_confidence gating using whatever + # confidence the agent has emitted so far. Reset to ``None`` at + # turn start; never persisted (``Field(exclude=True)``). The + # framework treats ``None`` as "no signal yet" and does NOT fire a + # low_confidence gate -- this avoids a false-positive gate on the + # very first tool call of a turn before any envelope/tool-arg + # carrying confidence has surfaced. 
+ turn_confidence_hint: float | None = Field(default=None, exclude=True) # ------------------------------------------------------------------ # App-overridable agent-input formatter hook. @@ -2364,11 +3087,40 @@ class StubChatModel(BaseChatModel): """Deterministic chat model for tests/CI. Returns canned text per role. Optionally emits one tool call on first invocation if `tool_call_plan` is set. + + Phase 10 (FOC-03): also honours + ``llm.with_structured_output(AgentTurnOutput)`` so stub-driven tests + survive the runner's envelope contract. The structured response is + derived from the same canned text + a default 0.85 confidence; tests + that need a specific envelope shape can override + ``stub_envelope_confidence`` / ``stub_envelope_rationale`` / + ``stub_envelope_signal``. + + Phase 15 (LLM-COMPAT-01): ``langchain.agents.create_agent`` with + ``response_format=AgentTurnOutput`` (via ``AutoStrategy`` -> + ``ToolStrategy`` for non-native-structured-output models, including + this stub) injects ``AgentTurnOutput`` as a CALLABLE TOOL. The + agent loop only terminates when the LLM emits a tool call NAMED + ``AgentTurnOutput``. ``bind_tools`` records that envelope-tool name + so ``_generate`` can auto-emit a closing tool call after any + user-configured ``tool_call_plan`` is exhausted -- preserving the + pre-Phase-15 stub semantics (canned text + optional pre-scripted + tool calls) while satisfying the new tool-loop termination + contract. """ role: str = "default" canned_responses: dict[str, str] = Field(default_factory=dict) tool_call_plan: list[dict] | None = None + stub_envelope_confidence: float = 0.85 + stub_envelope_rationale: str = "stub envelope rationale" + stub_envelope_signal: str | None = None _called_once: bool = False + # Phase 15 (LLM-COMPAT-01): set by ``bind_tools`` when + # ``langchain.agents.create_agent`` injects a structured-output tool + # for ``AgentTurnOutput``. Holds the bare tool name (e.g. 
+ # ``"AgentTurnOutput"``) so ``_generate`` can emit a final + # envelope-shaped tool call to close the agent loop. + _envelope_tool_name: str | None = None @property def _llm_type(self) -> str: @@ -2382,6 +3134,26 @@ def _generate(self, messages: list[BaseMessage], stop: list[str] | None = None, for tc in self.tool_call_plan: tool_calls.append({"name": tc["name"], "args": tc.get("args", {}), "id": str(uuid4())}) self._called_once = True + elif self._envelope_tool_name is not None: + # Phase 15 (LLM-COMPAT-01): the tool_call_plan is exhausted + # (or wasn't configured) AND ``langchain.agents.create_agent`` + # has bound the AgentTurnOutput envelope as a tool. Emit a + # closing tool call so the loop terminates with a populated + # ``structured_response``. The args mirror the + # ``with_structured_output`` path's envelope construction so + # tests see the same confidence / rationale / signal regardless + # of whether the new tool-strategy or the legacy structured- + # output path is in play. + tool_calls.append({ + "name": self._envelope_tool_name, + "args": { + "content": text or ".", + "confidence": self.stub_envelope_confidence, + "confidence_rationale": self.stub_envelope_rationale, + "signal": self.stub_envelope_signal, + }, + "id": str(uuid4()), + }) msg = AIMessage(content=text, tool_calls=tool_calls) return ChatResult(generations=[ChatGeneration(message=msg)]) @@ -2390,25 +3162,217 @@ async def _agenerate(self, messages: list[BaseMessage], stop: list[str] | None = return self._generate(messages, stop, run_manager, **kwargs) def bind_tools(self, tools, *, tool_choice=None, **kwargs): - """No-op binder: stub emits tool calls only via `tool_call_plan`, not via real binding.""" + """Record the AgentTurnOutput envelope-tool name when present. + + Phase 15 (LLM-COMPAT-01): ``langchain.agents.create_agent`` with + ``response_format=AgentTurnOutput`` calls ``bind_tools(...)`` + with the user's tools PLUS the envelope-as-a-tool. 
We scan the + list for the AgentTurnOutput-shaped tool (matched by ``__name__`` + on Pydantic schemas, ``name`` on ``BaseTool`` instances, or the + ``"name"`` key on dict-shaped tool specs) and remember it on the + instance so ``_generate`` can close the agent loop with a + synthetic envelope tool call after any pre-scripted + ``tool_call_plan`` is exhausted. Tools bound by the framework + itself (real BaseTools the agent should call) flow through + unchanged -- the stub still emits them only via + ``tool_call_plan``. + """ + for t in tools or []: + name = ( + getattr(t, "__name__", None) + or getattr(t, "name", None) + or (isinstance(t, dict) and t.get("name")) + ) + if isinstance(name, str) and name == "AgentTurnOutput": + self._envelope_tool_name = name + break return self + # ``BaseChatModel.with_structured_output`` returns ``Runnable[..., dict | BaseModel]`` + # in the langchain stub; this stub override returns a deterministic + # ``_StructuredRunnable`` so tests can drive structured outputs + # without a live provider. Functionally a Runnable (it implements + # ``invoke`` + ``ainvoke``); the stub mismatch is cosmetic. + def with_structured_output(self, schema, *, include_raw: bool = False, **kwargs): # pyright: ignore[reportIncompatibleMethodOverride] + """Phase 10 (FOC-03): honour the structured-output pass. + + Historically (pre-Phase-15) the deprecated + ``langgraph.prebuilt.create_react_agent`` factory called this + after its tool loop completed. The current + ``langchain.agents.create_agent`` path uses a tool-strategy + binding instead (see ``bind_tools`` above), but providers and + test code that call ``with_structured_output`` directly still + get a deterministic schema instance. + + We return a Runnable-like that yields a valid ``schema`` + instance derived from the stub's canned text and the + per-instance envelope configuration. Tests can tune + ``stub_envelope_confidence`` etc. to drive gate / reconcile + paths. 
+ """ + text = self.canned_responses.get(self.role, f"[stub:{self.role}] no canned response") + confidence = self.stub_envelope_confidence + rationale = self.stub_envelope_rationale + signal = self.stub_envelope_signal + + class _StructuredRunnable: + def __init__(self, schema_cls): + self._schema = schema_cls + + def _build(self): + # Construct an instance of whatever schema was passed. + # Common case: AgentTurnOutput; permissive fallback handles + # other pydantic schemas the test may pass. + try: + return self._schema( + content=text or ".", + confidence=confidence, + confidence_rationale=rationale, + signal=signal, + ) + except Exception: + # Permissive fallback for unfamiliar schemas: try + # model_validate on a minimal dict. + return self._schema.model_validate({ + "content": text or ".", + "confidence": confidence, + "confidence_rationale": rationale, + "signal": signal, + }) -def _build_ollama_chat(provider: ProviderConfig, model_id: str, - temperature: float) -> BaseChatModel: - from langchain_ollama import ChatOllama - kwargs: dict[str, Any] = { - "base_url": provider.base_url or "https://ollama.com", - "model": model_id, - "temperature": temperature, - } - api_key = provider.api_key or os.environ.get("OLLAMA_API_KEY") + def invoke(self, *_args, **_kwargs): + return self._build() + + async def ainvoke(self, *_args, **_kwargs): + return self._build() + + return _StructuredRunnable(schema) + + +def _resolve_timeout( + provider: ProviderConfig, default: float, +) -> float: + """Resolve effective request timeout for a provider. + + Per-provider override wins; falls back to the framework default + (typically ``OrchestratorConfig.default_llm_request_timeout``). 
+ """ + if provider.request_timeout is not None: + return provider.request_timeout + return default + + +def _wrap_chat_with_timeout( + base: BaseChatModel, + provider_name: str, + model_id: str, + request_timeout: float, +) -> BaseChatModel: + """Wrap ``base`` so every ``ainvoke`` is bounded by + ``asyncio.wait_for(..., timeout=request_timeout)`` and raises + ``LLMTimeoutError(provider, model, elapsed_ms)`` on hang. + + The native langchain timeout knob (``request_timeout=`` on + openai/azure or ``client_kwargs={'timeout': ...}`` on ollama) is + honoured at the httpx layer; this wrapper guarantees the + framework-typed exception AND a hard ceiling even if the + underlying client hangs in a way httpx misses (e.g., post-headers + TCP read stall on a slow Ollama). D-13-04: subclassing + ``TimeoutError`` means ``policy._TRANSIENT_TYPES`` auto-classifies + the error as transient (zero edits to ``policy.py``). + """ + base_cls = type(base) + + class _Bounded(base_cls): # type: ignore[misc, valid-type] + async def ainvoke(self, *args: Any, **kwargs: Any) -> Any: + t0 = time.monotonic() + try: + return await asyncio.wait_for( + super().ainvoke(*args, **kwargs), + timeout=request_timeout, + ) + except (asyncio.TimeoutError, TimeoutError) as e: + if isinstance(e, LLMTimeoutError): + # Already typed; don't double-wrap. + raise + elapsed_ms = int((time.monotonic() - t0) * 1000) + raise LLMTimeoutError( + provider=provider_name, + model=model_id, + elapsed_ms=elapsed_ms, + ) from e + + # Reuse the live pydantic instance's state without re-running + # __init__ (which would re-init the underlying httpx clients). + bounded = _Bounded.model_construct(**base.model_dump()) + # Some langchain client classes initialise non-pydantic attrs + # (httpx clients, run_manager, etc.) inside __init__. Copy them + # through so the wrapped instance shares the same network state. 
+ for attr_name in ( + "_client", "_async_client", + "_async_httpx_client", "_sync_httpx_client", + "client", "async_client", + ): + if hasattr(base, attr_name): + try: + object.__setattr__( + bounded, attr_name, getattr(base, attr_name), + ) + except (AttributeError, TypeError): + # Slot-only or read-only attrs on some langchain + # versions -- the bounded instance will re-init on + # first use; not a correctness issue. + pass + return bounded + + +def _build_ollama_chat( + provider: ProviderConfig, model_id: str, temperature: float, + *, request_timeout: float, +) -> BaseChatModel: + from langchain_ollama import ChatOllama + + # Many Ollama models (gemma*, gpt-oss, ministral, etc.) don't support + # native function-calling, which is langchain-ollama's default method + # for ``with_structured_output``. Subclass to force + # ``method='json_schema'`` (uses Ollama's structured-output API) so + # Phase 10's ``response_format=AgentTurnOutput`` envelope actually + # round-trips instead of failing with ``OutputParserException`` + # when the LLM emits prose. + class _ChatOllamaJsonSchema(ChatOllama): # type: ignore[misc, valid-type] + def with_structured_output(self, schema, *, method=None, **kw): + return super().with_structured_output( + schema, method=method or "json_schema", **kw, + ) + + # Phase 13 (HARD-01): ChatOllama has NO native ``request_timeout`` + # field; the canonical incantation is ``client_kwargs={"timeout": ...}``, + # which propagates to the underlying httpx.AsyncClient. + client_kwargs: dict[str, Any] = {"timeout": request_timeout} + api_key = provider.api_key or os.environ.get("OLLAMA_API_KEY") if api_key: - kwargs["client_kwargs"] = {"headers": {"Authorization": f"Bearer {api_key}"}} - return ChatOllama(**kwargs) + client_kwargs["headers"] = { + "Authorization": f"Bearer {api_key}", + } + # Phase 13 (HARD-05): base_url is now config-load-validated by + # ProviderConfig._validate_required_fields. 
NO fallback to a + # public Ollama URL (air-gap rule violation). + kwargs: dict[str, Any] = { + "base_url": provider.base_url, + "model": model_id, + "temperature": temperature, + "client_kwargs": client_kwargs, + } + base = _ChatOllamaJsonSchema(**kwargs) + return _wrap_chat_with_timeout( + base, "ollama", model_id, request_timeout, + ) -def _build_azure_chat(provider: ProviderConfig, model: ModelConfig) -> BaseChatModel: +def _build_azure_chat( + provider: ProviderConfig, model: ModelConfig, + *, request_timeout: float, +) -> BaseChatModel: from langchain_openai import AzureChatOpenAI if provider.endpoint is None: raise ValueError("azure_openai provider requires 'endpoint'") @@ -2417,24 +3381,49 @@ def _build_azure_chat(provider: ProviderConfig, model: ModelConfig) -> BaseChatM f"azure_openai model {model.model!r} requires 'deployment'" ) _ak = provider.api_key or os.environ.get("AZURE_OPENAI_KEY") - return AzureChatOpenAI( + # ``request_timeout`` is a runtime alias for ``timeout`` on + # AzureChatOpenAI (langchain-openai > 0.3 declares it via Pydantic + # ``Field(alias="timeout")``); the langchain stubs only expose + # ``timeout``, hence the stub gap. 
+ base = AzureChatOpenAI( azure_endpoint=provider.endpoint, api_version=provider.api_version or "2024-08-01-preview", azure_deployment=model.deployment, api_key=SecretStr(_ak) if _ak else None, temperature=model.temperature, + request_timeout=request_timeout, # pyright: ignore[reportCallIssue] -- Phase 13 (HARD-01) -- alias for ``timeout`` not in stub + ) + return _wrap_chat_with_timeout( + base, "azure_openai", model.model, request_timeout, ) def get_llm(cfg: LLMConfig, model_name: str | None = None, *, role: str = "default", stub_canned: dict[str, str] | None = None, - stub_tool_plan: list[dict] | None = None) -> BaseChatModel: + stub_tool_plan: list[dict] | None = None, + stub_envelope_confidence: float | None = None, + stub_envelope_rationale: str | None = None, + stub_envelope_signal: str | None = None, + default_llm_request_timeout: float = 120.0, + ) -> BaseChatModel: """Build a chat model by named entry from ``cfg.models``. ``model_name`` defaults to ``cfg.default``. Validation that the name exists is enforced by ``LLMConfig`` itself (model_validator), so a - missing name here means caller passed a typo — raise loudly. + missing name here means caller passed a typo -- raise loudly. + + Phase 10 (FOC-03): stub callers can now tune the canned envelope + (confidence / rationale / signal) so gate-trigger tests preserve their + pre-Phase-10 semantics by emitting a low-confidence envelope. + + Phase 13 (HARD-01): non-stub builds are bounded by an effective + ``request_timeout`` resolved as ``provider.request_timeout`` (per- + provider override) -> ``default_llm_request_timeout`` (framework + default; callers pass ``cfg.orchestrator.default_llm_request_timeout``). + The default keyword value (120.0) matches OrchestratorConfig's default + so test paths that build LLMs without an OrchestratorConfig in scope + still get a sane bound. 
""" name = model_name or cfg.default model = cfg.models.get(name) @@ -2446,44 +3435,127 @@ def get_llm(cfg: LLMConfig, model_name: str | None = None, *, provider = cfg.providers[model.provider] # validated at config load if provider.kind == "stub": - return StubChatModel( - role=role, - canned_responses=stub_canned or {}, - tool_call_plan=stub_tool_plan, - ) + kwargs: dict[str, Any] = { + "role": role, + "canned_responses": stub_canned or {}, + "tool_call_plan": stub_tool_plan, + } + if stub_envelope_confidence is not None: + kwargs["stub_envelope_confidence"] = stub_envelope_confidence + if stub_envelope_rationale is not None: + kwargs["stub_envelope_rationale"] = stub_envelope_rationale + if stub_envelope_signal is not None: + kwargs["stub_envelope_signal"] = stub_envelope_signal + return StubChatModel(**kwargs) + + effective = _resolve_timeout(provider, default_llm_request_timeout) + if provider.kind == "ollama": - return _build_ollama_chat(provider, model.model, model.temperature) + return _build_ollama_chat( + provider, model.model, model.temperature, + request_timeout=effective, + ) if provider.kind == "azure_openai": - return _build_azure_chat(provider, model) + return _build_azure_chat( + provider, model, request_timeout=effective, + ) + if provider.kind == "openai_compat": + return _build_openai_compat_chat( + provider, model, request_timeout=effective, + ) raise ValueError(f"Unknown provider kind: {provider.kind!r}") -def get_embedding(cfg: LLMConfig) -> Embeddings: - """Build the configured embedding model. Raises if ``cfg.embedding`` is None.""" +def _build_openai_compat_chat( + provider: ProviderConfig, model: ModelConfig, + *, request_timeout: float, +) -> BaseChatModel: + """Build a ``ChatOpenAI`` pointed at an OpenAI-compatible endpoint + (OpenRouter, Together, vLLM, etc.). Reuses langchain-openai's + ``ChatOpenAI`` with ``base_url=`` override and the provider's + ``api_key`` (resolved from env via the YAML loader). 
+ """ + from langchain_openai import ChatOpenAI + if provider.base_url is None: + raise ValueError( + "openai_compat provider requires 'base_url' " + "(e.g. https://openrouter.ai/api/v1)" + ) + if provider.api_key is None: + raise ValueError("openai_compat provider requires 'api_key'") + # See AzureChatOpenAI block above: ``request_timeout`` is a runtime + # alias for ``timeout`` not in the langchain stubs. + base = ChatOpenAI( + base_url=provider.base_url, + api_key=provider.api_key, + model=model.model, + temperature=model.temperature, + request_timeout=request_timeout, # pyright: ignore[reportCallIssue] -- Phase 13 (HARD-01) -- alias for ``timeout`` not in stub + ) + return _wrap_chat_with_timeout( + base, "openai_compat", model.model, request_timeout, + ) + + +def get_embedding( + cfg: LLMConfig, *, default_llm_request_timeout: float = 120.0, +) -> Embeddings: + """Build the configured embedding model. Raises if ``cfg.embedding`` is None. + + Phase 13 (HARD-01): same per-provider override -> framework default + timeout resolution as ``get_llm``. Embeddings traffic shares the + request_timeout knob with chat (see CONTEXT.md "Deferred Ideas" -- + splitting embedding timeout from chat is a future refinement). + + Note (Phase 13 review WR-01): unlike the chat builders -- which apply a + defence-in-depth ``asyncio.wait_for`` wrapper (``_wrap_chat_with_timeout``) + that guarantees a structured ``LLMTimeoutError`` with ``elapsed_ms`` even + on partial-byte stalls -- embeddings rely SOLELY on the underlying + httpx-layer timeout configured above (``client_kwargs={"timeout": ...}`` + for Ollama, ``request_timeout=`` for Azure). This asymmetry is a + deliberate scope choice tied to Phase 13 CONTEXT.md "Deferred Ideas" #4 + (splitting embeddings timeout from chat timeout). 
If embeddings need + stricter bounds than chat -- or if the httpx-layer timeout proves + insufficient against post-headers TCP read stalls on the embeddings + path the same way it can on chat -- a future phase can mirror + ``_wrap_chat_with_timeout`` for the embeddings public surface + (``aembed_query`` / ``aembed_documents``). + """ if cfg.embedding is None: raise ValueError("llm.embedding is not configured") provider = cfg.providers[cfg.embedding.provider] + effective = _resolve_timeout(provider, default_llm_request_timeout) if provider.kind == "ollama": from langchain_ollama import OllamaEmbeddings - kwargs: dict[str, Any] = { - "base_url": provider.base_url or "https://ollama.com", - "model": cfg.embedding.model, - } + # Phase 13 (HARD-01): OllamaEmbeddings has NO native + # ``request_timeout`` field; canonical incantation is + # ``client_kwargs={"timeout": ...}`` (same as ChatOllama). + client_kwargs: dict[str, Any] = {"timeout": effective} api_key = provider.api_key or os.environ.get("OLLAMA_API_KEY") if api_key: - kwargs["client_kwargs"] = {"headers": {"Authorization": f"Bearer {api_key}"}} - return OllamaEmbeddings(**kwargs) + client_kwargs["headers"] = { + "Authorization": f"Bearer {api_key}", + } + # Phase 13 (HARD-05): base_url config-load-validated; NO public fallback. + return OllamaEmbeddings( + base_url=provider.base_url, + model=cfg.embedding.model, + client_kwargs=client_kwargs, + ) if provider.kind == "azure_openai": from langchain_openai import AzureOpenAIEmbeddings if provider.endpoint is None: raise ValueError("azure_openai provider requires 'endpoint'") deployment = cfg.embedding.deployment or cfg.embedding.model _ak = provider.api_key or os.environ.get("AZURE_OPENAI_KEY") + # See chat builders above: ``request_timeout`` is a runtime + # alias for ``timeout`` not surfaced in the langchain-openai stub. 
return AzureOpenAIEmbeddings( azure_endpoint=provider.endpoint, api_version=provider.api_version or "2024-08-01-preview", azure_deployment=deployment, api_key=SecretStr(_ak) if _ak else None, + request_timeout=effective, # pyright: ignore[reportCallIssue] -- Phase 13 (HARD-01) -- alias for ``timeout`` not in stub ) raise ValueError( f"Embedding not supported for provider kind {provider.kind!r}" @@ -2700,12 +3772,14 @@ def build_embedder( ) if p.kind == "azure_openai": from langchain_openai import AzureOpenAIEmbeddings + # AzureOpenAIEmbeddings.api_key is typed as ``SecretStr | None`` + # (pydantic v2). Wrap the env-sourced str so the type matches. return AzureOpenAIEmbeddings( azure_deployment=cfg.deployment, model=cfg.model, azure_endpoint=p.endpoint, api_version=p.api_version, - api_key=p.api_key, + api_key=SecretStr(p.api_key) if p.api_key else None, ) if p.kind == "stub": return _StubEmbeddings(dim=cfg.dim) @@ -2727,10 +3801,13 @@ def _faiss_distance_strategy(name: str): def _pgvector_distance_strategy(name: str): from langchain_postgres.vectorstores import DistanceStrategy + # ``langchain_postgres.DistanceStrategy.INNER_PRODUCT`` exists at + # runtime (verified via the live module) but the langchain-postgres + # stubs only expose ``COSINE`` / ``EUCLIDEAN``. return { "cosine": DistanceStrategy.COSINE, "euclidean": DistanceStrategy.EUCLIDEAN, - "inner_product": DistanceStrategy.INNER_PRODUCT, + "inner_product": DistanceStrategy.INNER_PRODUCT, # pyright: ignore[reportAttributeAccessIssue] }[name] @@ -2806,7 +3883,7 @@ def distance_to_similarity(distance: float, strategy: str) -> float: # ====== module: runtime/storage/history_store.py ====== -StateT = TypeVar("StateT", bound=BaseModel) +StateT = TypeVar("StateT", bound=Session) # Allowed ``filter_kwargs`` keys = IncidentRow column names. # Computed at module load so we can produce a precise error for typos. 
@@ -2858,7 +3935,7 @@ def _row_to_incident(self, row: IncidentRow) -> StateT: return self._converter._row_to_incident(row) def _load(self, incident_id: str) -> StateT: - with Session(self.engine) as session: + with SqlaSession(self.engine) as session: row = session.get(IncidentRow, incident_id) if row is None: raise FileNotFoundError(incident_id) @@ -2869,7 +3946,7 @@ def _list_filtered(self, *, filter_kwargs: Mapping[str, Any]) -> list[StateT]: Pure SQL prefilter — used by both vector and keyword paths. """ - with Session(self.engine) as session: + with SqlaSession(self.engine) as session: stmt = select(IncidentRow).where(IncidentRow.deleted_at.is_(None)) for col, val in filter_kwargs.items(): stmt = stmt.where(getattr(IncidentRow, col) == val) @@ -2926,7 +4003,12 @@ def find_similar( threshold = self.similarity_threshold if threshold is None else threshold vec = self.embedder.embed_query(query) - raw = self.vector_store.similarity_search_with_score_by_vector(vec, k=limit * 4) + # ``similarity_search_with_score_by_vector`` is provided by the + # concrete FAISS / pgvector / langchain-postgres backends (and + # validated by ``runtime.storage.vector.build_vector_store``) + # but the abstract ``langchain_core.vectorstores.VectorStore`` + # base class does not declare it. 
+ raw = self.vector_store.similarity_search_with_score_by_vector(vec, k=limit * 4) # pyright: ignore[reportAttributeAccessIssue] out: list[tuple[StateT, float]] = [] for doc, distance in raw: score = distance_to_similarity(float(distance), self.distance_strategy) @@ -2963,7 +4045,7 @@ def _keyword_similar(self, *, query, filter_kwargs, status_filter, threshold, li if getattr(i, "status", None) == status_filter and getattr(i, "deleted_at", None) is None ] - def _ef(i, key, default=""): + def _ef(i, key, default: Any = ""): """Read a field from typed attribute first, then extra_fields.""" val = getattr(i, key, None) if val: @@ -2995,12 +4077,16 @@ def _ef(i, key, default=""): _INC_ID_RE = re.compile(r"^INC-\d{8}-\d{3}$") _SESSION_ID_RE = re.compile(r"^[A-Za-z][A-Za-z0-9_-]*-\d{8}-\d{3}$") -# StateT is bound to ``BaseModel`` so callers can pass either bare -# ``Session`` or any pydantic subclass. The resolver in -# :mod:`runtime.state_resolver` enforces a ``runtime.state.Session`` -# subclass at config time; the looser bound here keeps the storage -# layer usable by ad-hoc tests that build a ``BaseModel`` directly. -StateT = TypeVar("StateT", bound=BaseModel) +# StateT is bound to ``Session`` (not bare ``BaseModel``) because the +# store body reads typed fields (``id``, ``status``, ``version``, +# ``updated_at`` …) that are declared on ``runtime.state.Session`` and +# not on ``pydantic.BaseModel``. The resolver in +# :mod:`runtime.state_resolver` already enforces a ``Session`` subclass +# at config time, and every existing caller (production + tests) passes +# either bare ``Session`` or a ``Session`` subclass — see +# Phase 19 / HARD-03 for the rationale (was: ``bound=BaseModel`` which +# made pyright flag every typed-field access). 
+StateT = TypeVar("StateT", bound=Session) def _embed_source(inc: BaseModel) -> str: @@ -3198,7 +4284,12 @@ def save(self, incident: StateT) -> None: raise ValueError( f"Invalid incident id {incident.id!r}; expected PREFIX-YYYYMMDD-NNN" ) - incident.updated_at = _iso(_now()) + # ``_iso(_now())`` returns ``str`` here -- the input datetime is + # never None -- but the helper's signature is the broader + # ``Optional[str]``. ``or ""`` keeps pyright + the typed + # ``Session.updated_at: str`` field consistent without changing + # behaviour (real value is always present). + incident.updated_at = _iso(_now()) or "" sess = incident # local alias — avoids repeating the domain token in new code expected_version = getattr(sess, "version", 1) # Bump in-memory BEFORE building the row dict so the persisted @@ -3343,12 +4434,16 @@ def _persist_vector(self) -> None: from pathlib import Path folder = Path(self.vector_path) folder.mkdir(parents=True, exist_ok=True) - self.vector_store.save_local( + # ``save_local`` is FAISS-specific; the runtime ``hasattr`` guard + # at the top of this method already ensured this codepath only + # runs against FAISS (other VectorStores omit the method). + # ``langchain_core.vectorstores.VectorStore`` doesn't declare it. 
+ self.vector_store.save_local( # pyright: ignore[reportAttributeAccessIssue] folder_path=str(folder), index_name=self.vector_index_name, ) - def _add_vector(self, inc: BaseModel) -> None: + def _add_vector(self, inc: Session) -> None: if self.vector_store is None or self.embedder is None: return text = _embed_source(inc) @@ -3361,7 +4456,7 @@ def _add_vector(self, inc: BaseModel) -> None: ) self._persist_vector() - def _refresh_vector(self, inc: BaseModel, *, prior_text: str) -> None: + def _refresh_vector(self, inc: Session, *, prior_text: str) -> None: if self.vector_store is None or self.embedder is None: return text = _embed_source(inc) @@ -3536,7 +4631,13 @@ def _row_to_incident(self, row: IncidentRow) -> StateT: merged_extras[k] = v kwargs["extra_fields"] = merged_extras - return self._state_cls(**kwargs) + # ``kwargs`` is built up from heterogeneous sources (typed row + # columns + ``extra_fields`` blob) so pyright infers each value + # as ``object``. At runtime each entry matches the concrete + # ``state_cls`` field type by construction (the row schema is + # the source of truth); pydantic's own validation rejects bad + # shapes at the constructor. + return self._state_cls(**kwargs) # pyright: ignore[reportArgumentType] def _incident_to_row_dict(self, inc: StateT) -> dict: """Serialize a state instance into a row-shaped dict. @@ -3639,6 +4740,204 @@ def _field(name: str, default=None): "version": getattr(inc, "version", 1), } +# ====== module: runtime/storage/event_log.py ====== + +@dataclass(frozen=True) +class SessionEvent: + """Immutable view of one row in the event log.""" + seq: int + session_id: str + kind: str + payload: dict + ts: str + + +def _now() -> str: + return datetime.now(timezone.utc).isoformat() + + +class EventLog: + """Append-only log of session events. + + Events drive the status finalizer's inference (e.g. a registered + ```` event appearing in the log -> session reached + the corresponding terminal status). 
They are never mutated or + deleted. + """ + + def __init__(self, *, engine: Engine) -> None: + self.engine = engine + + def append(self, session_id: str, kind: str, payload: dict) -> None: + """Append a new event row. Never mutates existing rows.""" + with Session(self.engine) as s: + with s.begin(): + s.add(SessionEventRow( + session_id=session_id, + kind=kind, + payload=dict(payload), + ts=_now(), + )) + + def iter_for(self, session_id: str) -> Iterator[SessionEvent]: + """Yield events for ``session_id`` in monotonic insertion order.""" + with Session(self.engine) as s: + stmt = ( + select(SessionEventRow) + .where(SessionEventRow.session_id == session_id) + .order_by(SessionEventRow.seq) + ) + for row in s.execute(stmt).scalars(): + yield SessionEvent( + seq=row.seq, + session_id=row.session_id, + kind=row.kind, + payload=row.payload, + ts=row.ts, + ) + +# ====== module: runtime/storage/migrations.py ====== + +_FORWARD_COLUMNS: list[tuple[str, str]] = [ + ("parent_session_id", "VARCHAR"), # dedup linkage + ("dedup_rationale", "TEXT"), # LLM rationale + ("extra_fields", "JSON"), # generic round-trip tunnel +] +_FORWARD_INDEXES: list[tuple[str, str, str]] = [ + # (index_name, table, column) — mirrors models.IncidentRow.__table_args__. + ("ix_incidents_parent_session_id", "incidents", "parent_session_id"), +] + +# Default audit fields. Mirrors the Pydantic defaults on +# :class:`runtime.state.ToolCall`. Keep these in sync — a divergence +# means rows hydrated post-migration would carry different defaults +# than rows hydrated via the Pydantic constructor, which would surface +# as subtle test flakes long after the migration ran. +_AUDIT_DEFAULTS: dict[str, Any] = { + "status": "executed", + "risk": None, + "approver": None, + "approved_at": None, + "approval_rationale": None, +} + + +def _fill_audit_fields(tc: dict[str, Any]) -> bool: + """Mutate ``tc`` in place, filling any missing audit field with its + default. Returns ``True`` when at least one key was added. 
+ + Existing values (including explicit ``None`` already on the row) + are left untouched — this is the idempotency guarantee. + """ + changed = False + for key, default in _AUDIT_DEFAULTS.items(): + if key not in tc: + tc[key] = default + changed = True + return changed + + +def _normalise_tool_calls_list( + tool_calls: Iterable[Any] | None, +) -> tuple[list[Any], bool]: + """Walk a session's tool_calls JSON list, fill missing audit fields. + + Returns ``(new_list, changed)``. Non-dict entries (corrupt rows) + are passed through unchanged — the migration is not a validator. + """ + if not tool_calls: + return [], False + new: list[Any] = [] + changed = False + for tc in tool_calls: + if isinstance(tc, dict): + # Copy so we don't mutate caller-owned data accidentally. + tc_copy = dict(tc) + if _fill_audit_fields(tc_copy): + changed = True + new.append(tc_copy) + else: + new.append(tc) + return new, changed + + +def migrate_tool_calls_audit(engine: Engine) -> dict[str, int]: + """Walk every session's ``tool_calls`` and fill missing audit fields. + + Idempotent — running on a freshly-migrated DB is a no-op. + + Returns a small stats dict:: + + {"sessions_scanned": N, "sessions_updated": M, "rows_filled": K} + + where ``rows_filled`` is the count of individual ToolCall entries + that received at least one default. Useful for ops dashboards and + post-migration verification. + """ + scanned = 0 + updated = 0 + filled = 0 + with SqlSession(engine) as session: + rows = session.query(IncidentRow).all() + for row in rows: + scanned += 1 + new_list, changed = _normalise_tool_calls_list(row.tool_calls) + if changed: + # Count individual entries that gained at least one + # field. Cheap re-walk — rows.tool_calls is already in + # memory. 
+ for old, new in zip(row.tool_calls or [], new_list): + if isinstance(old, dict) and isinstance(new, dict): + if any(k not in old for k in _AUDIT_DEFAULTS): + filled += 1 + row.tool_calls = new_list + updated += 1 + if updated: + session.commit() + return { + "sessions_scanned": scanned, + "sessions_updated": updated, + "rows_filled": filled, + } + + +def migrate_add_session_columns(engine: Engine) -> dict[str, int]: + """Add post-initial columns to ``incidents`` if missing. Idempotent. + + Older on-disk databases may lack ``extra_fields``, + ``parent_session_id``, or ``dedup_rationale``; SQLAlchemy's read-side + query then errors with ``no such column``. This walker uses + ``PRAGMA table_info`` (via SQLAlchemy's ``inspect``) to detect + missing columns and adds each one nullable. Running on a freshly- + migrated DB is a no-op. + + Returns ``{"columns_added": N, "indexes_added": M}``. + """ + inspector = inspect(engine) + if "incidents" not in inspector.get_table_names(): + # Fresh DB; ``Base.metadata.create_all`` already produced the + # full schema. Nothing to backfill. + return {"columns_added": 0, "indexes_added": 0} + existing_cols = {c["name"] for c in inspector.get_columns("incidents")} + existing_idx = {i["name"] for i in inspector.get_indexes("incidents")} + added_cols = 0 + added_idx = 0 + with engine.begin() as conn: + for col, sql_type in _FORWARD_COLUMNS: + if col not in existing_cols: + conn.execute(text(f"ALTER TABLE incidents ADD COLUMN {col} {sql_type}")) + added_cols += 1 + for idx_name, table, col in _FORWARD_INDEXES: + if idx_name in existing_idx: + continue + # If the column itself was just added (or already present) + # the index is safe to create now. 
+ cols_after = {c["name"] for c in inspect(conn).get_columns(table)} + if col in cols_after: + conn.execute(text(f"CREATE INDEX {idx_name} ON {table} ({col})")) + added_idx += 1 + return {"columns_added": added_cols, "indexes_added": added_idx} + # ====== module: runtime/mcp_loader.py ====== @dataclass @@ -3761,83 +5060,3150 @@ def build_fastmcp_client(server_cfg: MCPServerConfig): if server_cfg.transport == "stdio": if not server_cfg.command: raise ValueError( - f"stdio server '{server_cfg.name}' missing 'command'" + f"stdio server '{server_cfg.name}' missing 'command'" + ) + return Client( + {"command": server_cfg.command[0], "args": server_cfg.command[1:]} + ) + raise ValueError(f"Unknown transport: {server_cfg.transport}") + + +async def _load_in_process(server_cfg: MCPServerConfig, + stack: AsyncExitStack) -> list[BaseTool]: + if server_cfg.module is None: + raise ValueError(f"in_process server '{server_cfg.name}' missing 'module'") + mod = importlib.import_module(server_cfg.module) + fmcp = getattr(mod, "mcp", None) + if fmcp is None: + raise ValueError(f"Module {server_cfg.module} has no 'mcp' (FastMCP instance)") + # FastMCP exposes tools as functions; convert to langchain tools via adapter. + # We use the in-memory client transport. The client is registered into the + # caller's exit stack so its session/transport stays open while the loaded + # tools are in use. + from fastmcp import Client + client = Client(fmcp) + await stack.enter_async_context(client) + tools = await load_mcp_tools(client.session) + # Rewrite each tool's .name to ":" for LLM disambiguation. 
+ for t in tools: + original_name = t.name + t.name = f"{server_cfg.name}:{original_name}" + t._original_mcp_name = original_name # type: ignore[attr-defined] + return tools + + +async def _load_remote(server_cfg: MCPServerConfig, + stack: AsyncExitStack) -> list[BaseTool]: + from fastmcp import Client + if server_cfg.transport in ("http", "sse"): + if not server_cfg.url: + raise ValueError(f"remote server '{server_cfg.name}' missing 'url'") + client = Client(server_cfg.url, headers=server_cfg.headers or None) + elif server_cfg.transport == "stdio": + if not server_cfg.command: + raise ValueError(f"stdio server '{server_cfg.name}' missing 'command'") + client = Client({"command": server_cfg.command[0], "args": server_cfg.command[1:]}) + else: + raise ValueError(f"Unknown transport: {server_cfg.transport}") + await stack.enter_async_context(client) + tools = await load_mcp_tools(client.session) + # Rewrite each tool's .name to ":" for LLM disambiguation. + for t in tools: + original_name = t.name + t.name = f"{server_cfg.name}:{original_name}" + t._original_mcp_name = original_name # type: ignore[attr-defined] + return tools + + +async def load_tools(cfg: MCPConfig, stack: AsyncExitStack) -> ToolRegistry: + """Load all enabled MCP servers and return a :class:`ToolRegistry`. + + The caller MUST pass an already-entered :class:`AsyncExitStack`. Each + FastMCP ``Client`` is registered into it; the caller controls lifetime via + ``await stack.aclose()``. 
+ """ + registry = ToolRegistry() + for server_cfg in cfg.servers: + if not server_cfg.enabled: + continue + if server_cfg.transport == "in_process": + tools = await _load_in_process(server_cfg, stack) + else: + tools = await _load_remote(server_cfg, stack) + for t in tools: + original = getattr(t, "_original_mcp_name", t.name) + registry.add(ToolEntry( + name=original, description=t.description or "", + server=server_cfg.name, category=server_cfg.category, tool=t, + )) + return registry + +# ====== module: runtime/service.py ====== + +_log = logging.getLogger("runtime.service") + +T = TypeVar("T") + + +@dataclass +class _ActiveSession: + """In-memory metadata for an in-flight session. + + Lives in ``OrchestratorService._registry``; mutated only on the + loop thread so the dict itself needs no thread lock. Snapshots are + produced via :meth:`OrchestratorService.list_active_sessions`, + which submits a coroutine to the loop and returns a list of plain + dicts to the calling thread. + """ + + session_id: str + started_at: str + status: str = "running" + current_agent: str | None = None + task: asyncio.Task | None = None + + +def _utc_iso_now() -> str: + return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") + + +class SessionCapExceeded(RuntimeError): + """Raised by ``start_session`` when the service is already running + ``max_concurrent_sessions`` sessions. + + Fail fast, do not queue. Callers (Streamlit, FastAPI handlers) + catch this and surface a clear error — Streamlit shows a toast; + the HTTP layer translates it to a 429 with ``Retry-After``. + """ + + def __init__(self, cap: int) -> None: + super().__init__( + f"OrchestratorService at capacity ({cap} concurrent); " + f"reject incoming start_session" + ) + self.cap = cap + + +class OrchestratorService: + """Process-singleton orchestrator service. + + Surface: construction, singleton accessor, ``start()`` / + ``shutdown()``, coroutine submission bridge, and the shared MCP + client pool. 
+ + Thread-safety (HARD-06): ``get_or_create()`` and + ``_reset_singleton()`` serialise singleton mutation through a + class-level ``threading.Lock``. Concurrent first-callers + (Streamlit warmup + FastAPI startup hook racing during process + boot) all observe the same instance — the loser of the race blocks + on the lock briefly, then short-circuits on the + ``_instance is None`` check inside the critical section. + """ + + # Class-level singleton state. Guarded by ``_lock`` so concurrent + # ``get_or_create()`` callers can't double-construct the service. + # Reset on ``shutdown()`` via :meth:`_reset_singleton`. + _lock: threading.Lock = threading.Lock() + _instance: "OrchestratorService | None" = None + + def __init__( + self, + cfg: AppConfig, + max_concurrent_sessions: int | None = None, + ) -> None: + self.cfg = cfg + # Resource cap. Prefer the explicit constructor arg; fall back + # to ``cfg.runtime.max_concurrent_sessions``. Tests mutate this + # attribute directly to drive cap behaviour deterministically. + self.max_concurrent_sessions: int = ( + max_concurrent_sessions + if max_concurrent_sessions is not None + else cfg.runtime.max_concurrent_sessions + ) + self._loop: asyncio.AbstractEventLoop | None = None + self._thread: threading.Thread | None = None + self._started = threading.Event() + # Shared MCP client pool — built lazily on first ``get_mcp_client`` + # so processes that never touch MCP pay zero startup cost. All + # mutations of ``_mcp_clients`` / ``_mcp_locks`` happen on the + # background loop, so the dicts themselves don't need a thread + # lock. + self._mcp_stack: AsyncExitStack | None = None + self._mcp_clients: dict[str, Any] = {} + self._mcp_locks: dict[str, asyncio.Lock] = {} + # Per-server-name asyncio.Lock guarding lazy build. Created on the + # loop the first time the server is requested. 
+ self._mcp_build_locks: dict[str, asyncio.Lock] = {} + # Shared Orchestrator (lazy-built on first session start) and + # the in-flight session registry. The registry dict itself is + # only mutated from the loop thread (writers go through + # ``submit_and_wait``); readers also hop through the loop so the + # snapshot is point-in-time consistent with concurrent mutators. + self._orch: Any | None = None + self._registry: dict[str, _ActiveSession] = {} + # Lazily-built lock for serialising orchestrator construction + # under concurrent ``start_session`` calls. Created on the loop. + self._orch_build_lock: asyncio.Lock | None = None + # Pending-approval timeout watchdog. Started in ``start()`` iff + # ``cfg.runtime.gateway`` is configured; otherwise None and the + # lifecycle hooks are no-ops. + self._approval_watchdog: Any | None = None + + @classmethod + def get_or_create(cls, cfg: AppConfig) -> "OrchestratorService": + """Return the process-singleton service, building it on first call. + + Subsequent calls ignore the supplied ``cfg`` and return the + existing instance — there is exactly one orchestrator service per + Python process. To rebuild with a new config, call + ``shutdown()`` first. + + Thread-safe (HARD-06): the check-and-construct pair runs inside + a class-level ``threading.Lock``. A concurrent second caller + either blocks until the first caller's ``__init__`` returns and + then short-circuits on the ``_instance is not None`` check, or + wins the race and constructs alone — no double construction. + """ + with cls._lock: + if cls._instance is None: + cls._instance = cls(cfg) + return cls._instance + + def start(self) -> None: + """Spin up the background thread + asyncio loop. + + Idempotent: a no-op if the loop is already running. Blocks until + the background thread reports the loop is ready (5s timeout) so + callers can ``submit()`` immediately after ``start()`` returns. 
+ """ + if self._thread is not None and self._thread.is_alive(): + return + self._started.clear() + self._loop = asyncio.new_event_loop() + self._thread = threading.Thread( + target=self._run_loop, + name="OrchestratorService", + daemon=True, + ) + self._thread.start() + if not self._started.wait(timeout=5.0): + raise RuntimeError("OrchestratorService loop failed to start within 5s") + # Arm the pending-approval watchdog iff a gateway is configured. + # The watchdog is harmless when no high-risk tool calls ever + # fire (it scans the empty registry), but skipping the start + # when the gateway is off keeps process startup quiet for apps + # that have not opted into HITL. + gateway_cfg = getattr(self.cfg.runtime, "gateway", None) + if gateway_cfg is not None: + + + timeout_s = getattr( + gateway_cfg, "approval_timeout_seconds", 3600, + ) + self._approval_watchdog = ApprovalWatchdog( + self, + approval_timeout_seconds=timeout_s, + ) + self._approval_watchdog.start(self._loop) + + def _run_loop(self) -> None: + assert self._loop is not None + asyncio.set_event_loop(self._loop) + self._started.set() + try: + self._loop.run_forever() + finally: + # Drain any remaining tasks before closing so no coroutine is + # left dangling without a chance to clean up. + try: + pending = asyncio.all_tasks(loop=self._loop) + for task in pending: + task.cancel() + if pending: + self._loop.run_until_complete( + asyncio.gather(*pending, return_exceptions=True) + ) + finally: + self._loop.close() + + def submit( + self, coro: Awaitable[T] + ) -> concurrent.futures.Future[T]: + """Submit a coroutine to the background loop from any thread. + + Returns a ``concurrent.futures.Future`` whose ``.result()`` blocks + the calling thread until the coroutine resolves on the loop. Safe + to call concurrently from multiple threads. 
+ """ + if self._loop is None: + raise RuntimeError( + "OrchestratorService not started; call start() first" + ) + if not self._loop.is_running(): + raise RuntimeError("OrchestratorService loop is not running") + # Public signature accepts ``Awaitable[T]`` for caller flexibility; + # ``run_coroutine_threadsafe`` requires a ``Coroutine``. Every + # in-tree caller passes ``async def fn()`` — a Coroutine — so the + # cast is sound. Outside callers passing a non-coroutine + # Awaitable would already fail at runtime. + return asyncio.run_coroutine_threadsafe( + cast(Coroutine[Any, Any, T], coro), self._loop, + ) + + def submit_and_wait( + self, coro: Awaitable[T], timeout: float | None = None + ) -> T: + """Submit a coroutine and block the caller until it resolves. + + Convenience wrapper for sync callers (Streamlit, FastAPI request + handlers, CLI). Raises ``concurrent.futures.TimeoutError`` if the + coroutine doesn't complete within ``timeout`` seconds. + + WARNING: do not call from an async function whose event loop is + the same loop ``OrchestratorService`` is hosting (e.g. tests using + ``httpx.AsyncClient + ASGITransport`` against the FastAPI app + share the same loop the service runs on). The caller would block + the loop while waiting for work scheduled onto that same loop — + a deadlock. Use :meth:`submit_async` from async code. + """ + return self.submit(coro).result(timeout=timeout) + + async def submit_async(self, coro: Awaitable[T]) -> T: + """Bridge a coroutine onto the service's background loop, awaitable + from any caller's loop. + + Async equivalent of :meth:`submit_and_wait`. ``asyncio.wrap_future`` + exposes the cross-thread ``concurrent.futures.Future`` returned by + ``run_coroutine_threadsafe`` as awaitable on the calling loop, so + the caller yields control while the work runs on the service's + loop. Safe to call from a request handler whose event loop is the + same one the service is hosting (no deadlock). 
+ """ + if self._loop is None: + raise RuntimeError( + "OrchestratorService not started; call start() first" + ) + if not self._loop.is_running(): + raise RuntimeError("OrchestratorService loop is not running") + # See ``submit`` above for the Awaitable-vs-Coroutine cast. + fut = asyncio.run_coroutine_threadsafe( + cast(Coroutine[Any, Any, T], coro), self._loop, + ) + return await asyncio.wrap_future(fut) + + async def get_mcp_client(self, server_name: str) -> Any: + """Return the shared FastMCP client for ``server_name``, building + on first request. + + Lookup is serialised via a per-server ``asyncio.Lock`` so two + concurrent sessions racing for the same server don't double-build + the client. The clients themselves are reused across all sessions + for the lifetime of the service; teardown happens in + :meth:`shutdown`. + + Raises ``KeyError`` if ``server_name`` is not declared in + ``cfg.mcp.servers``. + """ + # Build-lock dict mutation must happen on the loop; we *are* on + # the loop here (this is an async method). + if server_name not in self._mcp_build_locks: + self._mcp_build_locks[server_name] = asyncio.Lock() + async with self._mcp_build_locks[server_name]: + if server_name in self._mcp_clients: + return self._mcp_clients[server_name] + server_cfg = next( + (s for s in self.cfg.mcp.servers if s.name == server_name), + None, + ) + if server_cfg is None: + raise KeyError( + f"MCP server {server_name!r} not declared in cfg.mcp.servers" + ) + if self._mcp_stack is None: + self._mcp_stack = AsyncExitStack() + await self._mcp_stack.__aenter__() + client = build_fastmcp_client(server_cfg) + await self._mcp_stack.enter_async_context(client) + self._mcp_clients[server_name] = client + self._mcp_locks[server_name] = asyncio.Lock() + return client + + def lock_for(self, server_name: str) -> asyncio.Lock: + """Return the per-server ``asyncio.Lock`` that serialises tool + calls against a single FastMCP client. 
+ + Must be called after ``get_mcp_client(server_name)`` has built + the client, otherwise ``KeyError``. + """ + return self._mcp_locks[server_name] + + # ------------------------------------------------------------------ + # Per-session task scheduling + in-flight registry + # ------------------------------------------------------------------ + + async def _ensure_orchestrator(self) -> Any: + """Lazily build the shared ``Orchestrator`` on the loop thread. + + Concurrent ``start_session`` calls coordinate through + ``_orch_build_lock`` so we never build the orchestrator twice. + Returns the cached instance on subsequent calls. + """ + # Build-lock construction must happen on the loop. We *are* on + # the loop here (this is an async method invoked via the bridge). + if self._orch_build_lock is None: + self._orch_build_lock = asyncio.Lock() + async with self._orch_build_lock: + if self._orch is None: + # Lazy import to avoid a circular dependency at module + # load time (orchestrator transitively imports a lot). + + self._orch = await Orchestrator.create(self.cfg) + return self._orch + + def start_session( + self, + *, + query: str = "", + state_overrides: dict | None = None, + environment: str | None = None, + submitter: dict | None = None, + reporter_id: str | None = None, + reporter_team: str | None = None, + trigger: Any | None = None, + ) -> str: + """Start a new agent session. Returns the session id immediately. + + The session row is created (and the id minted) synchronously on + the loop so the caller has a stable handle before this method + returns. The actual graph run is launched as an ``asyncio.Task`` + on the same loop and runs in the background — the caller does + **not** block on it. Listen via :meth:`list_active_sessions` and + per-session state lookups for progress. + + ``state_overrides`` is a free-form dict of domain fields the app + stamps onto the new session row. 
The framework only projects + ``environment`` onto the storage column today; other keys ride + through to app-specific MCP tools. + + ``submitter`` is a free-form dict the calling app interprets. + For incident-management it is ``{"id": "...", "team": "..."}``; + other apps can carry app-specific keys (e.g. code-review's + ``{"id": "", "pr_url": "..."}``). The framework + only projects ``id``/``team`` onto the row's reporter columns. + + Deprecated kwargs (coerced and warned): + * ``environment`` -> ``state_overrides={"environment": ...}`` + * ``reporter_id`` / ``reporter_team`` -> ``submitter`` + + The registry entry is evicted by a ``Task.add_done_callback`` on + completion, cancellation, or failure — so a session that crashes + does not leak a stale entry. + """ + + + + # Resolve the generic ``submitter`` and ``state_overrides`` once + # on the caller's thread — the deprecation warnings fire here + # (in the user's frame), not deep inside the loop's ``_scheduler``. + resolved_overrides = _coerce_state_overrides( + state_overrides, environment, + ) + resolved_submitter = _coerce_submitter( + submitter, reporter_id, reporter_team + ) + sub_id = (resolved_submitter or {}).get("id", "user-mock") + sub_team = (resolved_submitter or {}).get("team", "platform") + env = (resolved_overrides or {}).get("environment", "") + + async def _scheduler() -> str: + # Enforce the concurrency cap on the loop thread so the + # registry size check is race-free. Fail-fast with + # ``SessionCapExceeded``; the exception propagates through + # ``submit_and_wait`` -> ``Future.result()`` to the caller. + if len(self._registry) >= self.max_concurrent_sessions: + raise SessionCapExceeded(self.max_concurrent_sessions) + orch = await self._ensure_orchestrator() + # Allocate the row (and its id) synchronously on the loop + # so the caller gets a stable id back. 
The graph then runs + # in a separate task — registration happens here, before + # the task is created, so ``list_active_sessions`` sees the + # entry immediately. + inc = orch.store.create( + query=query, + environment=env, + reporter_id=sub_id, + reporter_team=sub_team, + ) + session_id = inc.id + # Stamp trigger provenance onto the row before the graph + # runs so any crash mid-graph still leaves an audit trail. + # ``inc.findings`` is a JSON dict on the row. + if trigger is not None: + try: + received_at = trigger.received_at.strftime( + "%Y-%m-%dT%H:%M:%SZ" + ) + except Exception: # noqa: BLE001 + received_at = _utc_iso_now() + inc.findings["trigger"] = { + "name": getattr(trigger, "name", None), + "transport": getattr(trigger, "transport", None), + "target_app": getattr(trigger, "target_app", None), + "received_at": received_at, + } + orch.store.save(inc) + entry = _ActiveSession( + session_id=session_id, + started_at=_utc_iso_now(), + ) + self._registry[session_id] = entry + + async def _run() -> None: + # Fail-fast on contention (D-03): if another task already + # holds the session lock, refuse the new turn immediately. + if orch._locks.is_locked(session_id): + + raise SessionBusy(session_id) + # Hold the per-session lock for the full graph turn, + # including any HITL interrupt() pause (D-01). + async with orch._locks.acquire(session_id): + try: + await orch.graph.ainvoke( + GraphState( + session=inc, + next_route=None, + last_agent=None, + error=None, + ), + config=orch._thread_config(session_id), + ) + except asyncio.CancelledError: + raise + except Exception as exc: # noqa: BLE001 + # Phase 11 (FOC-04 / D-11-04): GraphInterrupt is a + # pending-approval pause, not a failure. Don't stamp + # status='error' on the registry entry -- let + # LangGraph's checkpointer hold the paused state + # and let the UI's Approve/Reject action drive + # resume. 
def stop_session(self, session_id: str) -> None:
    """Cancel the task for ``session_id`` and stamp its row ``status="stopped"``.

    Never raises for "nothing to do" situations: an id with no registry
    entry, a task that already finished, a store that cannot load the
    row, or a loop that is not running all degrade to a quiet return.

    Whenever the row *does* load it is saved with ``status="stopped"``
    and ``pending_intervention=None``; no other field on the row is
    modified here, so previously recorded work is left as it was.

    Args:
        session_id: Identifier of the session to stop.
    """
    # A stopped service has no loop to submit to; keep the call a
    # silent no-op rather than raising.
    loop = self._loop
    if loop is None or not loop.is_running():
        return

    async def _cancel_inflight() -> None:
        # Cancel the registered task (if any) and give it 5s to unwind.
        entry = self._registry.get(session_id)
        if entry is None:
            return
        running = entry.task
        if running is None or running.done():
            return
        running.cancel()
        try:
            await asyncio.wait_for(running, timeout=5.0)
        except (asyncio.CancelledError, asyncio.TimeoutError):
            # Expected outcomes of a cancel-and-wait.
            return
        except Exception:  # noqa: BLE001
            # The graph raised something else while unwinding. Log it
            # for post-mortem; the row is still marked stopped below.
            _log.warning(
                "stop_session: graph raised during cancel-await for %s",
                session_id,
                exc_info=True,
            )

    def _mark_row_stopped() -> None:
        # No orchestrator means no session ever ran: nothing to persist.
        orch = self._orch
        if orch is None:
            return
        try:
            row = orch.store.load(session_id)
        except Exception:  # noqa: BLE001
            # Treated as "unknown id"; a real store failure remains
            # visible through the debug log.
            _log.debug(
                "stop_session: store.load(%s) failed; treating as unknown id",
                session_id,
                exc_info=True,
            )
            return
        if row is None:
            return
        row.status = "stopped"
        row.pending_intervention = None
        orch.store.save(row)

    async def _stop() -> None:
        await _cancel_inflight()
        _mark_row_stopped()
        # Defensive: the task's done-callback normally evicts the entry.
        self._registry.pop(session_id, None)

    self.submit_and_wait(_stop(), timeout=10.0)
def shutdown(self, timeout: float = 10.0) -> None:
    """Tear the service down and reset the singleton.

    Order of operations (each step is attempted only while the loop is
    still running, and a failing step is logged and skipped so the
    remaining steps still happen):

    1. stop the approval watchdog,
    2. cancel in-flight session tasks,
    3. close the shared orchestrator,
    4. close the MCP client pool,
    5. stop the loop and join the loop thread,
    6. clear every cached handle and reset the class-level singleton.

    Calling it again after the loop is gone only resets the singleton.

    Args:
        timeout: Seconds to wait for each on-loop step and for the
            thread join.
    """
    loop = self._loop
    if loop is None:
        self._reset_singleton()
        return
    worker = self._thread

    def _drain(make_coro, failure_msg: str) -> None:
        # Schedule one teardown coroutine on the loop and wait for it.
        # Shutdown is best-effort: any failure (including a timeout) is
        # logged with its traceback and then ignored.
        try:
            pending = asyncio.run_coroutine_threadsafe(make_coro(), loop)
            pending.result(timeout=timeout)
        except Exception:  # noqa: BLE001
            _log.warning(failure_msg, exc_info=True)

    # Watchdog first, so its scan cannot race the registry teardown.
    if loop.is_running() and self._approval_watchdog is not None:
        _drain(
            lambda: self._approval_watchdog.stop(),
            "shutdown: approval watchdog stop failed",
        )
        self._approval_watchdog = None

    # Sessions next: they should observe CancelledError before the
    # resources they depend on are closed underneath them.
    if loop.is_running() and self._registry:
        _drain(
            lambda: self._cancel_all_sessions(),
            "shutdown: cancel_all_sessions failed",
        )

    # Then the orchestrator.
    if loop.is_running() and self._orch is not None:
        _drain(
            lambda: self._close_orchestrator(),
            "shutdown: orchestrator close failed",
        )

    # MCP clients must be closed on the loop *before* it is stopped.
    if loop.is_running() and self._mcp_stack is not None:
        _drain(
            lambda: self._close_mcp_pool(),
            "shutdown: MCP pool close failed",
        )

    if loop.is_running():
        loop.call_soon_threadsafe(loop.stop)
    if worker is not None:
        worker.join(timeout=timeout)

    # Drop every handle so a later ``get_or_create()`` rebuilds cleanly.
    self._loop = None
    self._thread = None
    self._started.clear()
    self._mcp_stack = None
    for cache in (self._mcp_clients, self._mcp_locks, self._mcp_build_locks):
        cache.clear()
    self._orch = None
    self._orch_build_lock = None
    self._registry.clear()
    self._approval_watchdog = None
    self._reset_singleton()
async def _close_mcp_pool(self) -> None:
    """Exit the MCP exit-stack and forget every pooled client and lock.

    No-op when no stack has been opened. The stack reference is cleared
    *before* it is exited so a re-entrant call cannot exit it twice.

    The client / lock caches are cleared in a ``finally`` block: if
    ``__aexit__`` raises, the stack is already detached from ``self``,
    so leaving the caches populated would keep handing out clients that
    belong to a torn-down stack.

    Raises:
        Exception: whatever ``stack.__aexit__`` raises is propagated
            unchanged after the caches have been cleared.
    """
    stack = self._mcp_stack
    if stack is None:
        return
    self._mcp_stack = None
    try:
        await stack.__aexit__(None, None, None)
    finally:
        self._mcp_clients.clear()
        self._mcp_locks.clear()
        self._mcp_build_locks.clear()
def parse_envelope_from_result(
    result: dict,
    *,
    agent: str,
) -> AgentTurnOutput:
    """Extract an :class:`AgentTurnOutput` from an agent invocation result.

    Resolution order:

    1. ``result["structured_response"]`` — returned as-is when it is
       already an :class:`AgentTurnOutput`; validated when it is a
       ``dict``. A dict that fails validation is logged at DEBUG and
       resolution falls through to step 2.
    2. ``result["messages"]`` scanned newest-first. Every message whose
       class is named ``AIMessage`` and whose ``content`` is a non-blank
       string is tried: the content is JSON-decoded and, if it decodes
       to an object, validated against :class:`AgentTurnOutput`. The
       first message that validates wins; messages that fail any of
       these checks are skipped and the scan continues with older ones.
    3. Nothing matched → :class:`EnvelopeMissingError`.

    Args:
        result: The mapping returned by the agent invocation.
        agent: Agent name, used only for the error raised in step 3.

    Returns:
        The first envelope found by the steps above.

    Raises:
        EnvelopeMissingError: when neither step 1 nor step 2 yields a
            valid envelope.
    """
    # Step 1: structured_response (preferred).
    structured = result.get("structured_response")
    if isinstance(structured, AgentTurnOutput):
        return structured
    if isinstance(structured, dict):
        try:
            return AgentTurnOutput.model_validate(structured)
        except Exception:  # noqa: BLE001
            # Malformed structured_response: keep going, but leave a
            # trace so a misbehaving provider is observable.
            _LOG.debug(
                "envelope path 1 (structured_response dict) failed validation; "
                "falling through to AIMessage JSON parse",
                exc_info=True,
            )

    # Step 2: JSON-decode AIMessage bodies, newest first.
    for msg in reversed(result.get("messages") or []):
        # Matched by class name so this module needs no message import.
        if msg.__class__.__name__ != "AIMessage":
            continue
        content = getattr(msg, "content", None)
        if not isinstance(content, str) or not content.strip():
            continue
        try:
            payload = json.loads(content)
        except ValueError:
            # ``json.JSONDecodeError`` is a ``ValueError`` subclass.
            continue
        if not isinstance(payload, dict):
            continue
        try:
            return AgentTurnOutput.model_validate(payload)
        except Exception:  # noqa: BLE001
            # Valid JSON but not an envelope; try the next-older message.
            continue

    # Step 3: fail loudly.
    raise EnvelopeMissingError(
        agent=agent,
        field="structured_response",
        message=(
            f"envelope_missing: no structured_response or JSON-decodable "
            f"AIMessage envelope found (agent={agent})"
        ),
    )
def effective_action(
    tool_name: str,
    *,
    env: str | None,
    gateway_cfg: GatewayConfig | None,
) -> GatewayAction:
    """Decide which gateway action applies to one tool invocation.

    ``tool_name`` may be fully qualified (``server:tool``). Every lookup
    tries the name exactly as given first and, when it contains a
    colon, the part after the first colon second — so config may use
    bare names while a qualified key stays the more specific match.

    Evaluation order:

    1. No gateway config -> ``"auto"``.
    2. Prod override: ``prod_overrides`` is set, ``env`` is non-empty
       and listed in ``prod_environments``, and any
       ``resolution_trigger_tools`` glob matches (case-sensitively)
       either form of the name -> ``"approve"``. Checked first so it
       can only tighten the outcome.
    3. ``policy`` risk tier for the first name form that has an entry,
       mapped through ``_RISK_TO_ACTION``.
    4. Otherwise -> ``"auto"``.

    Args:
        tool_name: Tool identifier, bare or ``server:tool``.
        env: Environment of the current session, if known.
        gateway_cfg: Gateway configuration, or ``None`` when disabled.

    Returns:
        One of ``"auto"``, ``"notify"``, ``"approve"``.
    """
    if gateway_cfg is None:
        return "auto"

    # Name forms in priority order: as given, then the bare suffix.
    _, colon, suffix = tool_name.partition(":")
    candidates = (tool_name, suffix) if colon else (tool_name,)

    overrides = gateway_cfg.prod_overrides
    in_prod = (
        overrides is not None
        and bool(env)
        and env in overrides.prod_environments
    )
    if in_prod and any(
        fnmatchcase(name, pattern)
        for pattern in overrides.resolution_trigger_tools
        for name in candidates
    ):
        return "approve"

    for name in candidates:
        tier = gateway_cfg.policy.get(name)
        if tier is not None:
            return _RISK_TO_ACTION[tier]
    return "auto"
+ + Searches from the end of the list because the pending row is + almost always the most recent ToolCall. + """ + for idx in range(len(tool_calls) - 1, -1, -1): + tc = tool_calls[idx] + if (getattr(tc, "tool", None) == tool_name + and getattr(tc, "ts", None) == ts + and getattr(tc, "status", None) == "pending_approval"): + return idx + return None + + +def _find_existing_pending_index( + tool_calls: list, + tool_name: str, +) -> int | None: + """Find the most recent ``pending_approval`` row for ``tool_name``. + + LangGraph's interrupt/resume model re-runs the gated node from the + top after ``Command(resume=...)``; we re-use the existing pending + row rather than appending a duplicate every time the closure + re-enters the approve branch. + """ + for idx in range(len(tool_calls) - 1, -1, -1): + tc = tool_calls[idx] + if (getattr(tc, "tool", None) == tool_name + and getattr(tc, "status", None) == "pending_approval"): + return idx + return None + + +def _evaluate_gate( + *, + session: Session, + tool_name: str, + gate_policy: GatePolicy | None, + gateway_cfg: GatewayConfig | None, +) -> "GateDecision": + """Phase 11 (FOC-04) bridge: invoke ``should_gate`` from the wrap. + + Constructs a minimal ``ToolCall`` shape for the pure-function + boundary, and a temporary ``OrchestratorConfig`` shim with the + in-flight ``gate_policy`` + ``gateway`` so the pure function sees + a single config object (its declared signature). + + When ``gate_policy`` is ``None`` -- the legacy callers that have + not yet been threaded -- a default ``GatePolicy()`` is used so + Phase-11 behaviour applies uniformly. The default mirrors v1.0 + HITL behaviour (``gated_risk_actions={"approve"}``), so existing + pre-Phase-11 tests keep passing. + """ + # Local imports (avoid cycle on policy.py importing gateway). + # ``GateDecision`` is type-only here -- the lazy import sits in the + # TYPE_CHECKING block at module top. 
def wrap_tool(
    base_tool: BaseTool,
    *,
    session: Session,
    gateway_cfg: GatewayConfig | None,
    agent_name: str = "",
    store: "SessionStore | None" = None,
    injected_args: dict[str, str] | None = None,
    gate_policy: GatePolicy | None = None,
) -> BaseTool:
    """Wrap ``base_tool`` so every invocation passes through the gateway.

    The wrapper closes over ``session`` and ``gateway_cfg``; audit rows
    are written straight into ``session.tool_calls``.

    Per invocation (sync ``_run`` and async ``_arun`` behave the same):

    1. When ``injected_args`` is configured, session-derived values are
       merged into ``kwargs`` *before* any policy lookup.
    2. ``effective_action`` and ``_evaluate_gate`` are both consulted.
    3. If the gate decision says to gate: a ``pending_approval`` row is
       ensured (an existing one for this tool is reused, because the
       node is re-run from the top on resume), the session is saved via
       ``store`` when a new row was appended, and ``interrupt()`` is
       called. On resume the verdict decides the outcome:
       ``"reject"`` / ``"timeout"`` rewrite the pending row and return
       a refusal dict without running the tool; anything else runs the
       tool and rewrites the row as ``approved``.
    4. Otherwise the tool runs immediately; a ``notify`` action
       additionally appends an ``executed_with_notify`` row.

    Args:
        base_tool: Tool to wrap. Already-wrapped tools are returned
            unchanged so wrappers never nest.
        session: Live session whose ``tool_calls`` receives audit rows.
        gateway_cfg: Gateway configuration, or ``None``.
        agent_name: Recorded on every audit row.
        store: When given, used to persist a newly appended
            ``pending_approval`` row before interrupting.
        injected_args: Mapping of arg name -> session path. Injected
            keys are hidden from the LLM-visible ``args_schema``.
        gate_policy: Policy forwarded to ``_evaluate_gate``.

    Returns:
        A ``BaseTool`` whose ``description`` mirrors the inner tool and
        whose ``name`` is the inner name with ``:`` replaced by ``__``.
    """
    if isinstance(base_tool, _GatedToolMarker):
        return base_tool

    env = getattr(session, "environment", None)
    inner = base_tool
    inject_cfg = injected_args or {}

    # The LLM-visible schema must not list injected keys, otherwise the
    # input validator rejects calls that (correctly) omit them. The
    # inner tool keeps its full schema.
    if inject_cfg:
        _llm_visible_schema = strip_injected_params(
            inner, frozenset(inject_cfg.keys()),
        ).args_schema
    else:
        _llm_visible_schema = inner.args_schema

    # Params the inner tool actually declares; injection skips the rest
    # so a config-wide injected key is not forced onto every tool.
    _accepted_params: frozenset[str] | None = accepted_params_for_tool(inner)

    # ``:`` is not valid in tool names for every LLM provider; expose
    # ``__`` to the LLM while policy lookups keep using ``inner.name``.
    _llm_visible_name = inner.name.replace(":", "__")

    def _sync_invoke_inner(payload: Any) -> Any:
        """Sync-invoke the inner tool, replacing a bare
        ``NotImplementedError`` with one that points at ``ainvoke``."""
        try:
            return inner.invoke(payload)
        except NotImplementedError as exc:
            raise NotImplementedError(
                f"Tool {inner.name!r} appears to be async-only "
                f"(``_run`` not implemented). Use ``ainvoke`` / ``_arun`` "
                f"for this tool instead of the sync invoke path."
            ) from exc

    # ---- helpers shared by ``_run`` and ``_arun`` -------------------

    def _with_injected(kwargs: dict[str, Any]) -> dict[str, Any]:
        """Merge session-derived args into ``kwargs`` (no-op when unconfigured)."""
        if not inject_cfg:
            return kwargs
        return inject_injected_args(
            kwargs,
            session=session,
            injected_args_cfg=inject_cfg,
            tool_name=inner.name,
            accepted_params=_accepted_params or None,
        )

    def _inner_payload(args: tuple[Any, ...], kwargs: dict[str, Any]) -> Any:
        """Input forwarded to the inner tool: kwargs, else first positional, else ``{}``."""
        return kwargs if kwargs else args[0] if args else {}

    def _audit_args(args: tuple[Any, ...], kwargs: dict[str, Any]) -> dict[str, Any]:
        """Args snapshot stored on audit rows."""
        return dict(kwargs) if kwargs else {"args": list(args)}

    def _open_approval(
        args: tuple[Any, ...], kwargs: dict[str, Any],
    ) -> tuple[dict[str, Any], Any, dict[str, Any]]:
        """Ensure a ``pending_approval`` row exists and build the interrupt payload.

        Returns ``(pending_args, pending_ts, payload)``.
        """
        pending_args = _audit_args(args, kwargs)
        existing_idx = _find_existing_pending_index(
            session.tool_calls, inner.name,
        )
        if existing_idx is not None:
            # Resume pass: reuse the row written on the first pass.
            pending_ts = session.tool_calls[existing_idx].ts
        else:
            pending_ts = _now_iso()
            session.tool_calls.append(
                ToolCall(
                    agent=agent_name,
                    tool=inner.name,
                    args=pending_args,
                    result=None,
                    ts=pending_ts,
                    risk="high",
                    status="pending_approval",
                )
            )
            # Persist BEFORE interrupting so observers that read the
            # store (not this in-memory session) can see the row.
            if store is not None:
                store.save(session)
        payload = {
            "kind": "tool_approval",
            "tool": inner.name,
            "args": kwargs or args,
            "tool_call_id": kwargs.get("tool_call_id"),
        }
        return pending_args, pending_ts, payload

    def _read_verdict(resume_value: Any) -> tuple[str, Any, Any]:
        """Normalise the resume value to ``(verdict, approver, rationale)``.

        Accepts a bare value or a ``{decision, approver, rationale}``
        dict.

        NOTE(review): this is fail-open — a missing/falsy value, and any
        verdict other than ``reject`` / ``timeout``, is treated as an
        approval. Confirm that is intended for high-risk tools.
        """
        if isinstance(resume_value, dict):
            verdict = resume_value.get("decision", "approve")
            approver = resume_value.get("approver")
            rationale = resume_value.get("rationale")
        else:
            verdict = resume_value or "approve"
            approver = None
            rationale = None
        return str(verdict).lower(), approver, rationale

    def _close_pending(
        pending_idx: int | None,
        *,
        pending_args: dict[str, Any],
        pending_ts: Any,
        status: str,
        result: Any,
        approver: Any,
        rationale: Any,
    ) -> None:
        """Rewrite the pending row in place with the final outcome.

        Does nothing when the row is no longer pending (``pending_idx``
        is ``None``), so no second audit entry is ever appended.
        """
        if pending_idx is None:
            return
        session.tool_calls[pending_idx] = ToolCall(
            agent=agent_name,
            tool=inner.name,
            args=pending_args,
            result=result,
            ts=pending_ts,
            risk="high",
            status=status,
            approver=approver,
            approved_at=_now_iso(),
            approval_rationale=rationale,
        )

    # verdict -> key used both as the row status and the refusal flag.
    _refusals = {"reject": "rejected", "timeout": "timeout"}

    def _refuse_if_declined(
        verdict: str,
        pending_idx: int | None,
        *,
        pending_args: dict[str, Any],
        pending_ts: Any,
        approver: Any,
        rationale: Any,
    ) -> dict[str, Any] | None:
        """Handle ``reject`` / ``timeout``: update the row, return the refusal.

        Returns ``None`` for every other verdict (the tool should run).
        """
        flag = _refusals.get(verdict)
        if flag is None:
            return None
        _close_pending(
            pending_idx,
            pending_args=pending_args,
            pending_ts=pending_ts,
            status=flag,
            result={flag: True, "rationale": rationale},
            approver=approver,
            rationale=rationale,
        )
        return {flag: True, "rationale": rationale}

    def _record_notify(
        args: tuple[Any, ...], kwargs: dict[str, Any], result: Any,
    ) -> None:
        """Append the audit row for a ``notify`` execution."""
        session.tool_calls.append(
            ToolCall(
                agent=agent_name,
                tool=inner.name,
                args=_audit_args(args, kwargs),
                result=result,
                ts=_now_iso(),
                risk="medium",
                status="executed_with_notify",
            )
        )

    class _GatedTool(_GatedToolMarker):
        name: str = _llm_visible_name
        description: str = inner.description
        # Aligned with the LLM-visible (post-strip) schema so the input
        # validator accepts exactly what the LLM is asked to emit.
        args_schema: Any = _llm_visible_schema  # type: ignore[assignment]

        def _run(self, *args: Any, **kwargs: Any) -> Any:  # noqa: D401
            # Inject before any policy lookup.
            kwargs = _with_injected(kwargs)
            action = effective_action(
                inner.name, env=env, gateway_cfg=gateway_cfg,
            )
            gate = _evaluate_gate(
                session=session,
                tool_name=inner.name,
                gate_policy=gate_policy,
                gateway_cfg=gateway_cfg,
            )
            if gate.gate:
                from langgraph.types import interrupt

                pending_args, pending_ts, payload = _open_approval(args, kwargs)
                # First pass: raises to pause the graph. Resume pass:
                # returns the value supplied on resume.
                verdict, approver, rationale = _read_verdict(interrupt(payload))
                pending_idx = _find_pending_index(
                    session.tool_calls, inner.name, pending_ts,
                )
                refusal = _refuse_if_declined(
                    verdict,
                    pending_idx,
                    pending_args=pending_args,
                    pending_ts=pending_ts,
                    approver=approver,
                    rationale=rationale,
                )
                if refusal is not None:
                    return refusal
                # Approved: run the tool, then record the outcome.
                result = _sync_invoke_inner(_inner_payload(args, kwargs))
                _close_pending(
                    pending_idx,
                    pending_args=pending_args,
                    pending_ts=pending_ts,
                    status="approved",
                    result=result,
                    approver=approver,
                    rationale=rationale,
                )
                return result

            # Not gated: run now; ``notify`` also leaves an audit row.
            result = _sync_invoke_inner(_inner_payload(args, kwargs))
            if action == "notify":
                _record_notify(args, kwargs, result)
            return result

        async def _arun(self, *args: Any, **kwargs: Any) -> Any:  # noqa: D401
            # Async twin of ``_run``; only the inner invocation differs.
            kwargs = _with_injected(kwargs)
            action = effective_action(
                inner.name, env=env, gateway_cfg=gateway_cfg,
            )
            gate = _evaluate_gate(
                session=session,
                tool_name=inner.name,
                gate_policy=gate_policy,
                gateway_cfg=gateway_cfg,
            )
            if gate.gate:
                from langgraph.types import interrupt

                pending_args, pending_ts, payload = _open_approval(args, kwargs)
                verdict, approver, rationale = _read_verdict(interrupt(payload))
                pending_idx = _find_pending_index(
                    session.tool_calls, inner.name, pending_ts,
                )
                refusal = _refuse_if_declined(
                    verdict,
                    pending_idx,
                    pending_args=pending_args,
                    pending_ts=pending_ts,
                    approver=approver,
                    rationale=rationale,
                )
                if refusal is not None:
                    return refusal
                result = await inner.ainvoke(_inner_payload(args, kwargs))
                _close_pending(
                    pending_idx,
                    pending_args=pending_args,
                    pending_ts=pending_ts,
                    status="approved",
                    result=result,
                    approver=approver,
                    rationale=rationale,
                )
                return result

            result = await inner.ainvoke(_inner_payload(args, kwargs))
            if action == "notify":
                _record_notify(args, kwargs, result)
            return result

    return _GatedTool()
+ """ + if not injected_keys: + return tool + schema = getattr(tool, "args_schema", None) + if schema is None: + return tool + + # --- dict path: FastMCP / JSON-Schema tools --------------------------- + # FastMCP exposes ``args_schema`` as a plain JSON-Schema dict rather + # than a Pydantic model. Strip injected keys directly from the dict. + if isinstance(schema, dict): + props = schema.get("properties", {}) + overlap = injected_keys & set(props) + if not overlap: + return tool + new_props = {k: v for k, v in props.items() if k not in injected_keys} + required = [r for r in schema.get("required", []) if r not in injected_keys] + new_dict_schema: dict[str, Any] = {**schema, "properties": new_props, "required": required} + try: + return tool.model_copy(update={"args_schema": new_dict_schema}) + except Exception: # pragma: no cover — defensive fallback + import copy + stripped = copy.copy(tool) + stripped.args_schema = new_dict_schema # type: ignore[attr-defined] + return stripped + + # --- Pydantic path: BaseModel subclass tools -------------------------- + if not hasattr(schema, "model_fields"): + return tool + overlap = injected_keys & set(schema.model_fields.keys()) + if not overlap: + # No params to strip — preserve identity (no clone). + return tool + + # Build the kwargs for ``create_model`` from the surviving fields. + # Pydantic v2's ``create_model`` accepts ``(annotation, FieldInfo)`` + # tuples; FieldInfo carries default + description + alias so the + # cloned schema is functionally equivalent to the original minus + # the stripped fields. 
+ keep: dict[str, tuple[Any, Any]] = { + name: (f.annotation, f) + for name, f in schema.model_fields.items() + if name not in injected_keys + } + new_schema = create_model( + f"{schema.__name__}__StrippedForLLM", + __base__=BaseModel, + **keep, # type: ignore[arg-type] + ) + + # ``BaseTool`` is itself a pydantic BaseModel — ``model_copy`` clones + # it cheaply and lets us swap ``args_schema`` without touching the + # original. Tools that are not pydantic models (extremely rare; only + # custom subclasses) fall back to a regular shallow copy. + try: + stripped = tool.model_copy(update={"args_schema": new_schema}) + except Exception: # pragma: no cover — defensive fallback + import copy + stripped = copy.copy(tool) + stripped.args_schema = new_schema # type: ignore[attr-defined] + return stripped + + +def _resolve_dotted(root: Session, path: str) -> Any | None: + """Walk ``path`` ('session.foo.bar') against ``root`` and return the + terminal value or ``None`` if any segment is missing / None. + + ``path`` must start with ``session.``. The leading ``session`` token + pins the resolution root to the live Session — config-declared paths + cannot reach into arbitrary modules. Subsequent segments walk + attributes (``getattr``) — for fields stored under ``extra_fields`` + apps use ``session.extra_fields.foo`` which goes through the dict + branch below. + """ + parts = path.split(".") + if not parts or parts[0] != "session": + raise ValueError( + f"injected_args path {path!r} must start with 'session.'" + ) + cur: Any = root + for seg in parts[1:]: + if cur is None: + return None + # Support dict-valued attrs (notably ``Session.extra_fields``) + # transparently — ``session.extra_fields.pr_url`` resolves + # whether ``extra_fields`` is a real attribute or a dict on + # the model. Plain attribute walks work for typed Session + # subclasses (``IncidentState.environment``). 
def inject_injected_args(
    tool_args: dict[str, Any],
    *,
    session: Session,
    injected_args_cfg: dict[str, str],
    tool_name: str,
    accepted_params: set[str] | frozenset[str] | None = None,
) -> dict[str, Any]:
    """Build a fresh argument dict with framework-owned values filled in.

    For every ``arg -> dotted path`` pair in ``injected_args_cfg`` the path
    is resolved against ``session`` via ``_resolve_dotted`` and handled as
    follows:

    * ``tool_args`` itself is never touched; a shallow copy is returned.
    * If ``accepted_params`` is given and does not contain the arg, the arg
      is not injected, and any value the LLM supplied for it is removed
      (an INFO ``tool_call.injected_arg_dropped`` record is logged).
    * If the path resolves to ``None`` the arg is left exactly as the LLM
      sent it (present or absent).
    * Otherwise the resolved value is written. When it replaces a different
      LLM-supplied value, an INFO ``tool_call.injected_arg_overridden``
      record is logged first.

    Args:
        tool_args: Arguments as emitted by the LLM.
        session: Object the dotted paths are resolved against.
        injected_args_cfg: Mapping of argument name to ``session.``-rooted
            dotted path.
        tool_name: Tool name, used only in log records.
        accepted_params: Optional set of parameter names the target tool
            declares; ``None`` disables the filtering.

    Returns:
        A new dict containing the merged arguments.
    """
    merged = dict(tool_args)
    absent = object()  # distinguishes "key missing" from a stored None
    for key, dotted_path in injected_args_cfg.items():
        llm_value = merged.get(key, absent)

        if accepted_params is not None and key not in accepted_params:
            # The tool does not declare this parameter: never inject it,
            # and drop whatever the LLM may have sent under that name.
            if llm_value is not absent:
                _LOG.info(
                    "tool_call.injected_arg_dropped tool=%s arg=%s "
                    "llm_value=%r reason=not_accepted_by_tool session_id=%s",
                    tool_name,
                    key,
                    llm_value,
                    getattr(session, "id", "?"),
                )
                del merged[key]
            continue

        resolved = _resolve_dotted(session, dotted_path)
        if resolved is None:
            # Nothing to inject -- leave the argument untouched.
            continue

        if llm_value is not absent and llm_value != resolved:
            _LOG.info(
                "tool_call.injected_arg_overridden tool=%s arg=%s "
                "llm_value=%r framework_value=%r session_id=%s",
                tool_name,
                key,
                llm_value,
                resolved,
                getattr(session, "id", "?"),
            )
        merged[key] = resolved
    return merged
+ """ + schema = getattr(tool, "args_schema", None) + if schema is None: + return None + if hasattr(schema, "model_fields"): + return frozenset(schema.model_fields.keys()) + if isinstance(schema, dict): + props = schema.get("properties") + if isinstance(props, dict): + return frozenset(props.keys()) + return None + + +__all__ = [ + "strip_injected_params", + "inject_injected_args", + "accepted_params_for_tool", + "_LOG", +] + +# ====== module: runtime/tools/approval_watchdog.py ====== + +if TYPE_CHECKING: + pass +logger = logging.getLogger(__name__) + +_UTC_TS_FMT = "%Y-%m-%dT%H:%M:%SZ" + +# Sessions whose status is in this set are *not* candidates for the +# watchdog — either they never paused for approval, or they have already +# moved past it. ``awaiting_input`` is the only status produced by +# ``langgraph.types.interrupt()`` while a high-risk gate is open. +_TERMINAL_STATUSES = frozenset({ + "resolved", "stopped", "escalated", "duplicate", "deleted", "error", +}) + + +def _parse_iso(ts: str | None) -> datetime | None: + """Parse an ISO-8601 ``YYYY-MM-DDTHH:MM:SSZ`` ts back into UTC. + + Returns ``None`` for malformed values; callers treat that as + "skip this row" so the watchdog never crashes on a bad audit + record. + """ + if not ts: + return None + try: + # Replace trailing 'Z' so ``fromisoformat`` accepts it on + # Python <3.11. The format is fixed by ``_UTC_TS_FMT`` so this + # round-trips cleanly. + if ts.endswith("Z"): + ts = ts[:-1] + "+00:00" + dt = datetime.fromisoformat(ts) + if dt.tzinfo is None: + dt = dt.replace(tzinfo=timezone.utc) + return dt.astimezone(timezone.utc) + except (ValueError, TypeError): + return None + + +class ApprovalWatchdog: + """Background asyncio task that resumes stale pending-approval sessions. + + Owned by :class:`runtime.service.OrchestratorService`; started in + ``OrchestratorService.start()`` and stopped in ``shutdown()``. 
The + task runs on the service's background loop so it shares the same + checkpointer / SQLite engine / FastMCP transports the live + sessions are using. + """ + + def __init__( + self, + service: "OrchestratorService", + *, + approval_timeout_seconds: int, + poll_interval_seconds: float = 60.0, + ) -> None: + self._service = service + self._approval_timeout_seconds = approval_timeout_seconds + self._poll_interval_seconds = poll_interval_seconds + self._task: asyncio.Task | None = None + self._stop_event: asyncio.Event | None = None + # HARD-07: ``stop()`` is idempotent. Once a stop has been + # initiated (or completed), subsequent calls return immediately + # rather than racing on ``_task`` / ``_stop_event`` which the + # first caller is already clearing. Mutated only on the loop + # thread (where ``stop()`` runs), so no extra lock needed. + self._stopped: bool = False + + @property + def is_running(self) -> bool: + return self._task is not None and not self._task.done() + + def start(self, loop: asyncio.AbstractEventLoop) -> None: + """Schedule the watchdog onto ``loop``. Idempotent. + + Must be called from a thread that is not the loop's own thread — + the typical caller is :meth:`OrchestratorService.start`. Returns + immediately; the polling coroutine runs in the background. + """ + if self._task is not None and not self._task.done(): + return + + async def _arm() -> None: + # Re-arm: a previous ``stop()`` may have flipped this; a + # fresh ``start()`` re-enables ``stop()``. + self._stopped = False + self._stop_event = asyncio.Event() + self._task = asyncio.create_task( + self._run(), name="approval_watchdog", + ) + + fut = asyncio.run_coroutine_threadsafe(_arm(), loop) + fut.result(timeout=5.0) + + async def stop(self) -> None: + """Signal the polling loop to exit and await termination. + + HARD-07: Idempotent and abrupt-shutdown safe. 
Safe to call: + * before ``start()`` (no-op), + * multiple times (subsequent calls short-circuit on + ``_stopped`` after the first caller flips it), + * concurrently from two callers — the first claims ownership + of ``_task`` and drains it; the second sees the task is + already gone and returns. + + Cancellation strategy: signal via ``_stop_event`` first so the + polling loop exits its ``wait_for`` cleanly; then bound the + drain by ``asyncio.wait_for(task, timeout=1.0)``. If the task + ignores the event (or the event loop is being torn down under + us), fall back to ``task.cancel()`` and one final drain. + ``CancelledError`` and ``TimeoutError`` are suppressed — there + is no useful recovery from a watchdog that won't die. + + Runs on the loop thread (called from ``OrchestratorService._close_*`` + helpers, or as a graceful no-op cleanup hook). + """ + # First-call wins. Subsequent callers (and the after-shutdown + # path) see ``_stopped`` and return without re-running the + # drain — protects against double-await on ``_task``. + if self._stopped: + return + self._stopped = True + # Snapshot to LOCAL variables so concurrent ``stop()`` calls + # never re-await the same task. We do NOT null out ``_task`` / + # ``_stop_event`` until after the drain because ``_run()`` + # reads ``self._stop_event`` on every loop iteration; clearing + # it before signalling would crash the polling loop with + # ``AttributeError: 'NoneType' object has no attribute + # 'is_set'`` and produce exactly the noisy teardown this fix + # is meant to prevent. 
+ task = self._task + stop_event = self._stop_event + if stop_event is not None: + stop_event.set() + if task is None or task.done(): + self._task = None + self._stop_event = None + return + try: + await asyncio.wait_for(asyncio.shield(task), timeout=1.0) + except (asyncio.TimeoutError, asyncio.CancelledError): + task.cancel() + try: + await asyncio.wait_for(task, timeout=1.0) + except (asyncio.TimeoutError, asyncio.CancelledError): + # Task is wedged or the loop is shutting down under us. + # The ``cancel()`` call above is enough to flip the task + # state; ``run_loop`` 's final ``gather`` pass will sweep + # it during loop teardown. Don't block shutdown further. + pass + finally: + # Always clear the bookkeeping refs so a subsequent + # ``start()`` arms cleanly and ``is_running`` reports False. + self._task = None + self._stop_event = None + + async def close(self) -> None: + """Alias for :meth:`stop` — symmetric with aiohttp/httpx. + + Idempotent. Provided so callers using a "close-on-cleanup" + pattern (``async with`` on parent owners) read naturally. + """ + await self.stop() + + async def _run(self) -> None: + """Polling loop. Runs until ``_stop_event`` is set. + + We bind ``stop_event`` to a LOCAL variable on entry so a + concurrent ``stop()`` cannot null out ``self._stop_event`` + from underneath us mid-iteration (HARD-07: that nulling-while- + running was the original source of ``AttributeError`` at + teardown). + """ + stop_event = self._stop_event + assert stop_event is not None + while not stop_event.is_set(): + try: + await self._tick() + except asyncio.CancelledError: + raise + except Exception: # noqa: BLE001 + logger.exception("approval watchdog tick failed") + try: + await asyncio.wait_for( + stop_event.wait(), + timeout=self._poll_interval_seconds, + ) + except asyncio.TimeoutError: + # Expected — wakes the loop every ``poll_interval_seconds``. + continue + + async def _tick(self) -> None: + """One scan + resume pass. 
async def run_once(self) -> int:
    """Run one scan over the registered sessions and resume the stale ones.

    A session is resumed (via ``_resume_with_timeout``) only when it can
    be loaded from ``orch.store``, its status is ``"awaiting_input"``, and
    ``_find_stale_pending`` reports at least one stale pending approval.
    Sessions that fail to load are skipped. ``SessionBusy`` during a
    resume is logged at debug level and skipped; any other resume failure
    is logged with a traceback. Neither is counted.

    Public so tests can drive a scan directly instead of waiting for the
    polling interval.

    Returns:
        How many sessions were resumed without error. ``0`` when the
        service has no orchestrator yet or the registry is empty.
    """
    orch = getattr(self._service, "_orch", None)
    if orch is None:
        return 0

    # Snapshot the ids up front so the scan iterates a private copy.
    session_ids = tuple(dict(self._service._registry))
    if not session_ids:
        return 0

    scan_time = datetime.now(timezone.utc)
    resumed_count = 0
    for session_id in session_ids:
        try:
            loaded = orch.store.load(session_id)
        except Exception:  # noqa: BLE001
            continue

        # Only sessions parked at ``awaiting_input`` are candidates;
        # terminal and still-running sessions are left alone.
        status = getattr(loaded, "status", None)
        if status in _TERMINAL_STATUSES or status != "awaiting_input":
            continue
        if not self._find_stale_pending(loaded, scan_time):
            continue

        # No lock peek beforehand: the acquire inside
        # ``_resume_with_timeout`` is the one and only contention check.
        try:
            await self._resume_with_timeout(orch, session_id)
        except SessionBusy:
            logger.debug(
                "approval watchdog: session %s SessionBusy at resume, skipping",
                session_id,
            )
        except Exception:  # noqa: BLE001
            logger.exception(
                "approval watchdog: resume failed for session %s",
                session_id,
            )
        else:
            resumed_count += 1
    return resumed_count
def should_gate(
    session: Any,
    tool_call: "ToolCall",
    confidence: float | None,
    cfg: "OrchestratorConfig",
) -> GateDecision:
    """Decide whether ``tool_call`` must pause for human approval.

    The tool's risk action is obtained from ``effective_action`` using
    ``session.environment`` (``None`` when the attribute is missing) and
    ``cfg.gateway`` (``None`` when missing). The first matching rule wins:

    1. risk action is listed in ``cfg.gate_policy.gated_risk_actions``
       -> gate, reason ``"high_risk_tool"``;
    2. environment is listed in ``cfg.gate_policy.gated_environments``
       and the risk action is not ``"auto"`` -> gate, ``"gated_env"``;
    3. confidence is below ``cfg.gate_policy.confidence_threshold`` and
       the risk action is not ``"auto"`` -> gate, ``"low_confidence"``;
    4. otherwise -> no gate, reason ``"auto"``.

    Args:
        session: Any object; only its optional ``environment`` attribute
            is read.
        tool_call: The call under evaluation; only ``tool`` is read.
        confidence: Confidence for the current turn. ``None`` is treated
            as ``1.0`` so a missing signal never trips rule 3.
        cfg: Configuration providing ``gate_policy`` and, optionally,
            ``gateway``.

    Returns:
        The gating decision with its reason.
    """
    environment = getattr(session, "environment", None)
    action = effective_action(
        tool_call.tool,
        env=environment,
        gateway_cfg=getattr(cfg, "gateway", None),
    )
    policy = cfg.gate_policy
    # ``None`` means "no signal yet" and must not count as low confidence.
    confidence_value = 1.0 if confidence is None else confidence

    reason: GateReason
    if action in policy.gated_risk_actions:
        reason = "high_risk_tool"
    elif environment in policy.gated_environments and action != "auto":
        reason = "gated_env"
    elif confidence_value < policy.confidence_threshold and action != "auto":
        reason = "low_confidence"
    else:
        reason = "auto"
    return GateDecision(gate=reason != "auto", reason=reason)
def should_retry(
    retry_count: int,
    error: Exception | None,
    confidence: float | None,
    cfg: "OrchestratorConfig",
) -> RetryDecision:
    """Decide whether a failed turn should be retried automatically.

    Rules are checked top to bottom; the first one that applies wins:

    1. ``retry_count >= cfg.retry_policy.max_retries``
       -> no retry, ``"max_retries_exceeded"``.
    2. ``error`` is one of the permanent types (``_is_permanent_error``)
       -> no retry, ``"permanent_error"``.
    3. ``confidence`` is set, is below
       ``cfg.retry_policy.retry_low_confidence_threshold``, and ``error``
       is not transient -> no retry, ``"low_confidence_no_retry"``.
    4. ``error`` is transient (``_is_transient_error``): retry with
       ``"auto_retry"`` when ``cfg.retry_policy.retry_on_transient`` is
       truthy, otherwise no retry with ``"transient_disabled"``.
    5. Anything else (including ``error=None``) -> no retry,
       ``"permanent_error"``.

    Args:
        retry_count: Number of retries already performed.
        error: The exception behind the failure, if known.
        confidence: Confidence recorded for the failed turn; ``None``
            disables rule 3.
        cfg: Configuration providing ``retry_policy``.

    Returns:
        The retry decision with its reason.
    """
    policy = cfg.retry_policy

    # Hard cap applies no matter what kind of error occurred.
    if retry_count >= policy.max_retries:
        return RetryDecision(retry=False, reason="max_retries_exceeded")

    if _is_permanent_error(error):
        return RetryDecision(retry=False, reason="permanent_error")

    transient = _is_transient_error(error)
    low_confidence = (
        confidence is not None
        and confidence < policy.retry_low_confidence_threshold
    )

    # Low confidence only blocks a retry for non-transient failures.
    if low_confidence and not transient:
        return RetryDecision(retry=False, reason="low_confidence_no_retry")

    if transient:
        if policy.retry_on_transient:
            return RetryDecision(retry=True, reason="auto_retry")
        return RetryDecision(retry=False, reason="transient_disabled")

    # Unclassified errors are not retried.
    return RetryDecision(retry=False, reason="permanent_error")
+ + + async def node(state: GraphState) -> dict: + incident: Session = state["session"] # pyright: ignore[reportTypedDictNotRequiredAccess] + inc_id = incident.id + started_at = datetime.now(timezone.utc).strftime(_UTC_TS_FMT) + + # Wrap tools per-invocation so each wrap closes over the + # live ``Session`` for this run. + if gateway_cfg is not None: + run_tools = [ + wrap_tool(t, session=incident, gateway_cfg=gateway_cfg, + agent_name=skill.name, store=store, + gate_policy=gate_policy) + for t in tools + ] + else: + run_tools = tools + # Phase 10 (FOC-03 / D-10-02) + Phase 15 (LLM-COMPAT-01): every + # responsive agent invocation is wrapped in an AgentTurnOutput + # envelope. ``langchain.agents.create_agent`` (the non-deprecated + # successor to ``langgraph.prebuilt.create_react_agent``) accepts a + # bare schema as ``response_format`` and, by default, wraps it in + # ``AutoStrategy`` — ProviderStrategy for models with native + # structured-output (OpenAI-class), falling back to ToolStrategy + # otherwise (Ollama). ToolStrategy injects AgentTurnOutput as a + # callable tool: when the LLM ``calls`` it, the loop terminates on + # the same turn with ``result["structured_response"]`` populated. + # Eliminates the old two-call structure (loop + separate + # ``with_structured_output`` pass) that hit recursion_limit=25 on + # Ollama models without true function-calling. + agent_executor = create_agent( + model=llm, + tools=run_tools, + system_prompt=skill.system_prompt, + response_format=AgentTurnOutput, + ) + + # Phase 11 (FOC-04): reset per-turn confidence hint at the + # start of each agent step so the gateway treats the first + # tool call of the turn as "no signal yet". 
+ try: + incident.turn_confidence_hint = None + except (AttributeError, ValueError): + pass + + try: + result = await _ainvoke_with_retry( + agent_executor, + {"messages": [HumanMessage(content=_format_agent_input(incident))]}, + ) + except GraphInterrupt: + # Phase 11 (FOC-04 / D-11-04): HITL pause -- propagate up. + raise + except Exception as exc: # noqa: BLE001 + return _handle_agent_failure( + skill_name=skill.name, started_at=started_at, exc=exc, + inc_id=inc_id, store=store, fallback=incident, + ) + + # Tools (e.g. registered patch tools) write straight to disk. + # Reload so the node's own append of agent_run + tool_calls + # happens against the tool-mutated state. + incident = store.load(inc_id) + + messages = result.get("messages", []) + ts = datetime.now(timezone.utc).strftime(_UTC_TS_FMT) + + agent_confidence, agent_rationale, agent_signal = _harvest_tool_calls_and_patches( + messages, skill.name, incident, ts, valid_signals, + terminal_tool_names=terminal_tool_names, + patch_tool_names=patch_tool_names, + ) + # Phase 11 (FOC-04): update hint so any subsequent in-turn + # tool call sees the harvested confidence. + if agent_confidence is not None: + try: + incident.turn_confidence_hint = agent_confidence + except (AttributeError, ValueError): + pass + _pair_tool_responses(messages, incident) + + # Phase 10 (FOC-03 / D-10-03): parse envelope; reconcile against + # any typed-terminal-tool-arg confidence. Envelope failure is a + # structured agent_run error. 
def _safe_eval(expr: str, ctx: dict[str, Any]) -> Any:
    """Evaluate a dispatch-rule expression with ``ctx`` as its local names.

    The expression is first passed to ``_validate_safe_expr`` and only
    then compiled and evaluated. Evaluation runs with an empty
    ``__builtins__`` mapping as globals and ``ctx`` as locals, so the
    expression can only see the names supplied in ``ctx``.

    NOTE(review): this function never returns ``False`` on its own for a
    rejected expression. Presumably ``_validate_safe_expr`` raises when
    the expression contains a non-allowlisted node -- confirm against its
    definition. The caller visible in this module, ``_rule_pick_target``,
    catches any exception and skips the rule.

    Args:
        expr: Source text of a single Python expression.
        ctx: Names made available to the expression.

    Returns:
        Whatever the expression evaluates to.

    Raises:
        Any exception raised by validation, ``compile`` or evaluation is
        propagated unchanged.
    """

    _validate_safe_expr(expr, source="supervisor.dispatch_rule")
    # SECURITY: ``eval`` is only as safe as the validation above. The
    # empty ``__builtins__`` mapping is a second line of defence (it
    # removes ``__import__`` and friends), not a sandbox in itself.
    code = compile(expr, "", "eval")
    return eval(code, {"__builtins__": {}}, ctx)  # noqa: S307 — AST-whitelisted
def _llm_pick_target(
    *,
    skill: Skill,
    llm: BaseChatModel,
    incident: Session,
) -> str:
    """One-shot LLM dispatch: ask the model to choose a subordinate.

    The model receives ``skill.dispatch_prompt`` plus the list of
    subordinate names as the system message and the JSON-dumped session
    as the human message, and is asked to reply with **only** one
    subordinate name.

    Matching is a case-insensitive substring search over the reply.
    When several subordinates match, a name that is merely a fragment of
    another matching name is ignored (so a reply of ``triage_deep`` is
    not mistaken for ``triage``); among the remaining matches the one
    declared first in ``skill.subordinates`` wins.

    Args:
        skill: Supervisor skill; ``name``, ``dispatch_prompt`` and
            ``subordinates`` are read.
        llm: Chat model; only ``invoke`` is called.
        incident: Live session; serialised with ``model_dump()``.

    Returns:
        The chosen subordinate name. Falls back to the first subordinate
        (with a warning log) when the LLM call raises or the reply names
        no subordinate — keeping the graph moving rather than failing
        outright.

    Raises:
        IndexError: if ``skill.subordinates`` is empty (there is nothing
            to fall back to).
    """
    fallback = skill.subordinates[0]
    prompt = (
        f"{skill.dispatch_prompt}\n\n"
        f"Choose ONE of: {', '.join(skill.subordinates)}.\n"
        f"Reply with only the agent name."
    )
    payload = json.dumps(incident.model_dump(), default=str)
    msgs = [
        SystemMessage(content=prompt),
        HumanMessage(content=payload),
    ]
    try:
        result = llm.invoke(msgs)
    except Exception as exc:  # noqa: BLE001
        logger.warning(
            "supervisor %s: LLM dispatch failed (%s); falling back to %s",
            skill.name, exc, fallback,
        )
        return fallback

    content = getattr(result, "content", "") or ""
    if isinstance(content, (list, tuple)):
        # Some chat models return a list of content blocks (plain
        # strings and/or dicts carrying a "text" entry) instead of one
        # string. Flatten it so the match below cannot raise.
        content = " ".join(
            part if isinstance(part, str)
            else str(part.get("text", "")) if isinstance(part, dict)
            else str(part)
            for part in content
        )
    elif not isinstance(content, str):
        content = str(content)
    text = content.strip().lower()

    hits = [name for name in skill.subordinates if name.lower() in text]
    for name in hits:
        lowered = name.lower()
        # Skip a hit that is only a fragment of a longer hit; the longer
        # name is what the model actually wrote.
        shadowed = any(
            lowered != other.lower() and lowered in other.lower()
            for other in hits
        )
        if not shadowed:
            return name

    logger.warning(
        "supervisor %s: LLM reply %r did not name a subordinate; "
        "falling back to %s", skill.name, text, fallback,
    )
    return fallback
def _normalize_runner_route(value: Any) -> str:
    """Collapse runner-supplied end-of-graph aliases onto ``"__end__"``.

    A string that equals ``"end"`` or ``"__end__"`` after trimming
    surrounding whitespace and lower-casing is replaced by the canonical
    ``"__end__"`` token. Every other value — including non-strings — is
    handed back unchanged, so ordinary node names pass straight through.
    Doing the alias check here keeps the runner contract permissive
    without repeating it in the graph layer.
    """
    if not isinstance(value, str):
        return value
    folded = value.strip().lower()
    if folded == "end" or folded == "__end__":
        return "__end__"
    return value
A + patch carrying ``"next_route"`` short-circuits the routing table + entirely (use ``"__end__"`` to terminate the graph). + """ + # Local import to avoid the circular runtime.graph -> runtime.agents + # cycle at module-load time. + + + if skill.kind != "supervisor": + raise ValueError( + f"make_supervisor_node called with non-supervisor skill " + f"{skill.name!r} (kind={skill.kind!r})" + ) + + runner: Callable[..., Any] | None = None + if skill.runner is not None: + if callable(skill.runner): + # Test stubs and composed runners may supply a live callable + # directly rather than a dotted-path string. Access via the + # class __dict__ to avoid Python binding it as an instance + # method when the skill is a plain object (not a Pydantic model). + raw = vars(type(skill)).get("runner", skill.runner) + runner = raw if callable(raw) else skill.runner + else: + # Resolved a second time here so a runner that fails to import + # at graph-build time still surfaces a clear error. The skill + # validator catches most issues at YAML load; this is belt-and- + # braces and also gives us the live callable to invoke. + runner = _resolve_dotted_callable( + skill.runner, source=f"supervisor {skill.name!r} runner" + ) + + async def node(state: GraphState) -> dict: + sess: Session = state["session"] # pyright: ignore[reportTypedDictNotRequiredAccess] + # ``dispatch_depth`` is an extension field on GraphState; start + # at 0 and increment per supervisor entry. 
+ depth = int(state.get("dispatch_depth") or 0) + 1 + if depth > skill.max_dispatch_depth: + logger.warning( + "supervisor %s: dispatch depth %d exceeds limit %d; aborting", + skill.name, depth, skill.max_dispatch_depth, + ) + return { + "session": sess, + "next_route": "__end__", + "last_agent": skill.name, + "dispatch_depth": depth, + "error": ( + f"supervisor {skill.name!r}: max_dispatch_depth " + f"{skill.max_dispatch_depth} exceeded" + ), + } + + # ----- App-supplied runner hook ------------------------------- + runner_patch: dict[str, Any] = {} + if runner is not None: + # Build a thin proxy so the runner can reach intake_context + # (and any other framework_cfg attributes) without needing + # framework_cfg to be mutable. The proxy exposes intake_context + # directly and falls back to framework_cfg for all other attrs. + _app_cfg_proxy = type("_RunnerAppCfg", (), { + "intake_context": getattr(framework_cfg, "intake_context", None), + "__getattr__": lambda self, name: getattr(framework_cfg, name), + })() + try: + result = runner(state, app_cfg=_app_cfg_proxy) + except Exception as exc: # noqa: BLE001 + logger.exception( + "supervisor %s: runner %s raised; aborting to __end__", + skill.name, skill.runner, + ) + return { + "session": sess, + "next_route": "__end__", + "last_agent": skill.name, + "dispatch_depth": depth, + "error": ( + f"supervisor {skill.name!r}: runner failed: {exc}" + ), + } + if isinstance(result, dict): + runner_patch = dict(result) + elif result is not None: + logger.warning( + "supervisor %s: runner returned %s (expected dict|None); " + "ignoring", skill.name, type(result).__name__, + ) + override = runner_patch.pop("next_route", None) + if override is not None: + # Short-circuit: skip the routing table entirely. Audit + # log still fires so operators can trace the decision. + target = _normalize_runner_route(override) + # Pick up any fresh reference the runner returned. 
+ sess = runner_patch.get("session", sess) + try: + payload_size = len( + json.dumps(sess.model_dump(), default=str) + ) + except Exception: # noqa: BLE001 — defensive + payload_size = 0 + log_supervisor_dispatch( + session=sess, + supervisor=skill.name, + strategy=f"runner:{skill.runner}", + depth=depth, + targets=[target], + rule_matched=None, + payload_size=payload_size, + ) + out: dict[str, Any] = { + "session": sess, + "next_route": target, + "last_agent": skill.name, + "dispatch_depth": depth, + "error": None, + } + # Merge any non-route keys the runner returned (e.g. + # extra GraphState fields apps want to carry forward). + for k, v in runner_patch.items(): + if k not in out: + out[k] = v + return out + # No override: fold any payload mutation back so the + # routing table sees the up-to-date object. + if "session" in runner_patch: + sess = runner_patch["session"] + + rule_matched: str | None = None + if skill.dispatch_strategy == "rule": + target, rule_matched = _rule_pick_target(skill=skill, incident=sess) + else: # "llm" + if llm is None: + logger.warning( + "supervisor %s: strategy=llm but no llm provided; " + "falling back to first subordinate", skill.name, + ) + target = skill.subordinates[0] + else: + target = _llm_pick_target(skill=skill, llm=llm, incident=sess) + + # Audit: one structured log entry per dispatch. + try: + payload_size = len(json.dumps(sess.model_dump(), default=str)) + except Exception: # noqa: BLE001 — defensive; size is a hint + payload_size = 0 + log_supervisor_dispatch( + session=sess, + supervisor=skill.name, + strategy=skill.dispatch_strategy, + depth=depth, + targets=[target], + rule_matched=rule_matched, + payload_size=payload_size, + ) + + out: dict[str, Any] = { + "session": sess, + "next_route": target, + "last_agent": skill.name, + "dispatch_depth": depth, + "error": None, + } + # Carry through any extra keys the runner emitted that the + # framework didn't consume itself (e.g. memory snapshots). 
def _expand_cron_field(field: str, lo: int, hi: int) -> set[int]:
    """Return the integers in ``[lo, hi]`` selected by one cron field.

    ``field`` is a comma-separated list of terms. Each term is ``*``
    (the whole ``lo..hi`` range), a single number ``a``, or an inclusive
    range ``a-b``; any term may carry a ``/n`` suffix giving the stride
    used to walk its range. Values a term produces outside ``[lo, hi]``
    are discarded rather than rejected.

    Raises:
        ValueError: if a number or stride is not an integer literal, or
            the stride is zero.
    """
    matched: set[int] = set()
    for token in field.split(","):
        # partition() leaves ``slash`` empty when the term has no stride.
        span, slash, stride_text = token.partition("/")
        stride = int(stride_text) if slash else 1
        if span == "*":
            first, last = lo, hi
        elif "-" in span:
            left, _, right = span.partition("-")
            first, last = int(left), int(right)
        else:
            first = last = int(span)
        matched.update(range(first, last + 1, stride))
    # Keep only what falls inside the field's legal bounds.
    return matched.intersection(range(lo, hi + 1))
class MonitorRunner:
    """Owns a bounded thread pool and a scheduler thread that ticks
    registered monitor skills on their cron schedules.

    Exactly one ``MonitorRunner`` exists per ``OrchestratorService``
    instance; the runner is built at service startup and shut down at
    service teardown.

    Concurrency: each tick is dispatched to the
    :class:`~concurrent.futures.ThreadPoolExecutor` so the scheduler
    thread itself never blocks on a slow ``observe`` tool. The pool
    size defaults to ``4`` (R6); each tick has a per-monitor timeout
    sourced from the skill's ``tick_timeout_seconds``.
    """

    def __init__(
        self,
        *,
        observe_fn: Callable[[str], Any],
        fire_trigger: Callable[[str, dict[str, Any]], None],
        max_workers: int = 4,
        clock: Callable[[], datetime] | None = None,
    ) -> None:
        """Create the executor and empty registry; no thread is started.

        Args:
            observe_fn: Passed to ``make_monitor_callable`` for every
                skill registered later.
            fire_trigger: Passed to ``make_monitor_callable`` for every
                skill registered later.
            max_workers: Size of the tick thread pool.
            clock: Zero-argument callable returning the current
                ``datetime``; used by :meth:`tick_once` when no explicit
                time is given. Defaults to timezone-aware UTC "now".
        """
        self._observe_fn = observe_fn
        self._fire_trigger = fire_trigger
        self._executor = ThreadPoolExecutor(
            max_workers=max_workers,
            thread_name_prefix="monitor",
        )
        # Registry keyed by skill name; guarded by ``self._lock``.
        self._monitors: dict[str, _RegisteredMonitor] = {}
        # Set by stop() to make the scheduler loop exit.
        self._stop = threading.Event()
        self._thread: threading.Thread | None = None
        self._lock = threading.Lock()
        # Injection seam for tests; default uses real wall-clock UTC.
        self._clock = clock or (lambda: datetime.now(timezone.utc))

    # ----- registration -----

    def register(self, skill: Skill) -> None:
        """Add a monitor skill to the schedule.

        Raises:
            ValueError: if ``skill.kind`` is not ``"monitor"`` or a
                monitor with the same name is already registered.
        """
        if skill.kind != "monitor":
            raise ValueError(
                f"MonitorRunner.register: skill {skill.name!r} kind="
                f"{skill.kind!r} (expected 'monitor')"
            )
        # Built before taking the lock so the lock is only held for the
        # registry update itself.
        callable_ = make_monitor_callable(
            skill=skill,
            observe_fn=self._observe_fn,
            fire_trigger=self._fire_trigger,
        )
        with self._lock:
            if skill.name in self._monitors:
                raise ValueError(f"monitor {skill.name!r} already registered")
            self._monitors[skill.name] = _RegisteredMonitor(skill, callable_)

    def unregister(self, name: str) -> None:
        """Remove the monitor called ``name``; unknown names are ignored."""
        with self._lock:
            self._monitors.pop(name, None)

    def registered(self) -> list[str]:
        """Return the names of all registered monitors, sorted."""
        with self._lock:
            return sorted(self._monitors.keys())

    # ----- lifecycle -----

    def start(self) -> None:
        """Start the scheduler thread; a no-op if it is already alive.

        Clears the stop event and launches a daemon thread running
        :meth:`_run`.
        """
        if self._thread is not None and self._thread.is_alive():
            return
        self._stop.clear()
        self._thread = threading.Thread(
            target=self._run,
            name="MonitorRunner",
            daemon=True,
        )
        self._thread.start()

    def stop(self, *, wait: bool = True, timeout: float = 5.0) -> None:
        """Halt the scheduler thread and shut down the executor.

        With ``wait=True`` (default) the scheduler thread is joined for
        at most ``timeout`` seconds, then the executor is shut down with
        ``wait=True``. ``timeout`` bounds only the thread join: it is
        not passed to ``shutdown``, so waiting for in-flight ticks to
        finish is not limited by it. With ``wait=False`` neither step
        blocks.

        NOTE(review): the executor is shut down and never recreated, so
        calling :meth:`start` again after ``stop`` presumably cannot
        dispatch ticks — confirm restart is not an intended use.
        """
        self._stop.set()
        thread = self._thread
        if thread is not None and thread.is_alive() and wait:
            thread.join(timeout=timeout)
        self._executor.shutdown(wait=wait)
        self._thread = None

    # ----- test hook -----

    def tick_once(self, when: datetime | None = None) -> None:
        """Fire any monitors whose cron expression matches ``when``.

        Useful in tests where freezing wall-clock time is awkward; the
        production scheduler loop calls this internally too.

        ``when`` defaults to the injected clock. A monitor without a
        schedule is treated as ``"* * * * *"``. A monitor whose schedule
        fails to evaluate is logged and skipped for this call. Each
        monitor is dispatched at most once per truncated minute.
        """
        when = when or self._clock()
        # Truncate to the minute so identical seconds within a minute
        # don't fire the same monitor twice.
        minute = when.replace(second=0, microsecond=0)
        # Snapshot under the lock, then iterate without holding it so
        # register/unregister are not blocked while ticks are dispatched.
        with self._lock:
            entries = list(self._monitors.values())
        for entry in entries:
            try:
                if not _cron_matches(entry.skill.schedule or "* * * * *", minute):
                    continue
            except Exception as exc:  # noqa: BLE001
                logger.warning(
                    "monitor %s: cron parse failed (%s); skipping tick",
                    entry.skill.name, exc,
                )
                continue
            if entry.next_run_ts == minute:
                # Already fired this minute; idempotent on oversleep.
                continue
            # NOTE(review): the marker is written outside ``self._lock``;
            # this looks safe only if tick_once has a single caller at a
            # time — confirm.
            entry.next_run_ts = minute
            self._dispatch(entry)

    def _dispatch(self, entry: _RegisteredMonitor) -> None:
        """Submit one tick to the pool and start a watcher thread for it.

        The watcher waits up to the skill's ``tick_timeout_seconds``
        (30 s when unset or falsy) and logs a warning on timeout or on
        any exception raised by the tick. Nothing here cancels the tick
        when the timeout is reported.
        """
        timeout = float(entry.skill.tick_timeout_seconds or 30.0)
        future = self._executor.submit(entry.callable_)

        def _wait_and_log() -> None:
            try:
                future.result(timeout=timeout)
            except FuturesTimeout:
                logger.warning(
                    "monitor %s: tick exceeded %.1fs timeout",
                    entry.skill.name, timeout,
                )
            except Exception as exc:  # noqa: BLE001
                logger.warning(
                    "monitor %s: tick raised %s", entry.skill.name, exc,
                )

        # Watcher runs on a side thread so the scheduler loop never
        # blocks waiting for a slow tick — the executor handles
        # parallelism, the watcher handles per-tick timeout reporting.
        threading.Thread(
            target=_wait_and_log,
            name=f"monitor-watch:{entry.skill.name}",
            daemon=True,
        ).start()

    # ----- scheduler loop -----

    def _run(self) -> None:
        """Single-threaded scheduler. Wakes once per second, fires
        any monitor whose cron expression matches the current minute,
        marks each fired monitor for the minute so we never fire
        twice if we oversleep.
        """
        while not self._stop.is_set():
            try:
                self.tick_once()
            except Exception as exc:  # noqa: BLE001 — never crash the loop
                logger.warning("MonitorRunner loop error: %s", exc)
            # Sleep with frequent wakeups so stop() returns promptly.
            self._stop.wait(timeout=1.0)
def _try_recover_envelope_from_raw(raw: str) -> AgentTurnOutput | None:
    """Attempt to extract an :class:`AgentTurnOutput` from a raw LLM
    string when LangGraph's structured-output pass raised
    ``OutputParserException``.

    Candidates are tried in this order; the first one that both parses
    as a JSON object and validates against :class:`AgentTurnOutput`
    wins:

    1. The whole string.
    2. Each markdown-fenced chunk (```` ``` ```` / ```` ```json ````)
       whose content starts with ``{``.
    3. The greedy slice from the first ``{`` to the last ``}``.
    4. A left-to-right scan: at every ``{`` decode exactly one JSON
       value starting there and ignore whatever follows it. This covers
       replies whose trailing chatter contains braces, and replies where
       an earlier JSON object is not an envelope.

    Args:
        raw: The model's reply verbatim.

    Returns:
        The validated envelope, or ``None`` when ``raw`` is empty or
        blank or no candidate yields a valid envelope.
    """
    if not raw or not raw.strip():
        return None

    def _validated(payload: object) -> AgentTurnOutput | None:
        # Only JSON objects can be an envelope; schema failures are
        # swallowed because another candidate may still succeed.
        if not isinstance(payload, dict):
            return None
        try:
            return AgentTurnOutput.model_validate(payload)
        except Exception:  # noqa: BLE001
            return None

    candidates: list[str] = [raw]
    # Markdown-fenced JSON: ```json\n{...}\n```
    if "```" in raw:
        for chunk in raw.split("```"):
            stripped = chunk.strip()
            if stripped.startswith("json"):
                stripped = stripped[4:].lstrip()
            if stripped.startswith("{"):
                candidates.append(stripped)
    # Greedy: first '{' through last '}'
    first = raw.find("{")
    last = raw.rfind("}")
    if 0 <= first < last:
        candidates.append(raw[first:last + 1])

    for candidate in candidates:
        try:
            payload = json.loads(candidate)
        except (ValueError, RecursionError):  # JSONDecodeError is a ValueError
            continue
        envelope = _validated(payload)
        if envelope is not None:
            return envelope

    # Balanced scan: raw_decode stops at the end of the first complete
    # JSON value, so text after the object cannot spoil the parse.
    decoder = json.JSONDecoder()
    start = first
    while start != -1:
        try:
            payload, _end = decoder.raw_decode(raw, start)
        except (ValueError, RecursionError):
            payload = None
        envelope = _validated(payload)
        if envelope is not None:
            return envelope
        start = raw.find("{", start + 1)
    return None
""" async def node(state: GraphState) -> dict: @@ -4293,32 +8751,165 @@ async def node(state: GraphState) -> dict: inc_id = incident.id started_at = datetime.now(timezone.utc).strftime(_UTC_TS_FMT) + # Phase 9 (D-09-01): strip injected-arg keys from every tool's + # LLM-visible signature BEFORE create_react_agent serialises the + # tool surface — so the LLM literally cannot emit values for + # those params. The framework re-supplies them at invocation + # time inside the gateway (or an inject-only wrapper) below. + + injected_keys = frozenset((injected_args or {}).keys()) + if injected_keys: + visible_tools = [ + strip_injected_params(t, injected_keys) for t in tools + ] + else: + visible_tools = tools + # Wrap tools per-invocation so each wrap closes over the live # ``Session`` for this run. When the gateway is unconfigured, # the original tools pass through untouched and # ``create_react_agent`` sees the same surface as before. if gateway_cfg is not None: + # Pass ORIGINAL tools (pre-strip) to wrap_tool — the gateway + # wrapper strips internally for the LLM-visible schema while + # keeping ``inner.args_schema`` intact so + # ``accepted_params_for_tool`` correctly recognises injected + # keys (e.g. ``environment``) as accepted by the underlying + # tool. Stripping twice (here AND in wrap_tool) hides those + # keys from ``accepted_params``, the inject step skips them, + # and FastMCP rejects the call as missing required arg. run_tools = [ wrap_tool(t, session=incident, gateway_cfg=gateway_cfg, - agent_name=skill.name, store=store) + agent_name=skill.name, store=store, + injected_args=injected_args or {}, + gate_policy=gate_policy) for t in tools ] + elif injected_keys: + # No gateway, but injected_args is configured — wrap each + # tool in an inject-only ``StructuredTool`` so the LLM-visible + # sig matches ``visible_tools`` while the underlying call + # still receives the framework-supplied values. 
+ from langchain_core.tools import StructuredTool + + _inject_cfg = injected_args or {} + + def _make_inject_only_wrapper( + base: BaseTool, llm_visible: BaseTool, sess: Session, + ) -> BaseTool: + async def _arun(**kwargs: Any) -> Any: + new_kwargs = _inject_args( + kwargs, + session=sess, + injected_args_cfg=_inject_cfg, + tool_name=base.name, + ) + return await base.ainvoke(new_kwargs) + + def _run(**kwargs: Any) -> Any: + new_kwargs = _inject_args( + kwargs, + session=sess, + injected_args_cfg=_inject_cfg, + tool_name=base.name, + ) + return base.invoke(new_kwargs) + + return StructuredTool.from_function( + func=_run, + coroutine=_arun, + name=base.name, + description=base.description, + args_schema=llm_visible.args_schema, + ) + + run_tools = [ + _make_inject_only_wrapper(orig, vis, incident) + for orig, vis in zip(tools, visible_tools) + ] else: - run_tools = tools - agent_executor = create_react_agent( - llm, run_tools, prompt=skill.system_prompt, + run_tools = visible_tools + # Phase 10 (FOC-03 / D-10-02) + Phase 15 (LLM-COMPAT-01): every + # responsive agent invocation is wrapped in an AgentTurnOutput + # envelope. ``langchain.agents.create_agent`` (the non-deprecated + # successor to ``langgraph.prebuilt.create_react_agent``) accepts a + # bare schema as ``response_format`` and, by default, wraps it in + # ``AutoStrategy`` — ProviderStrategy for models with native + # structured-output (OpenAI-class), falling back to ToolStrategy + # otherwise (Ollama). ToolStrategy injects AgentTurnOutput as a + # callable tool: when the LLM ``calls`` it, the loop terminates on + # the same turn with ``result["structured_response"]`` populated. + # Eliminates the old two-call structure (loop + separate + # ``with_structured_output`` pass) that hit recursion_limit=25 on + # Ollama models without true function-calling. 
+ agent_executor = create_agent( + model=llm, + tools=run_tools, + system_prompt=skill.system_prompt, + response_format=AgentTurnOutput, ) + # Phase 11 (FOC-04): reset per-turn confidence hint. The hint + # is updated below after _harvest_tool_calls_and_patches; on + # re-entry from a HITL pause the hint resets cleanly so a new + # turn starts from "no signal yet" (None). + try: + incident.turn_confidence_hint = None + except (AttributeError, ValueError): + pass + try: result = await _ainvoke_with_retry( agent_executor, {"messages": [HumanMessage(content=_format_agent_input(incident))]}, ) + except GraphInterrupt: + # Phase 11 (FOC-04 / D-11-04): HITL pause is NOT an error. + # Re-raise so LangGraph's checkpointer captures the paused + # state. Session.status is left to the orchestrator's + # interrupt-aware bridge, NOT _handle_agent_failure. + raise except Exception as exc: # noqa: BLE001 - return _handle_agent_failure( - skill_name=skill.name, started_at=started_at, exc=exc, - inc_id=inc_id, store=store, fallback=incident, - ) + # Phase 10 follow-up: when LangGraph's structured-output pass + # raises ``OutputParserException`` (Ollama / non-OpenAI + # providers don't always honor ``response_format`` cleanly), + # try to recover by parsing the raw LLM output ourselves. + # The exception's ``llm_output`` carries the model's reply + # verbatim; if it contains JSON matching the envelope schema, + # build a synthetic ``result`` and continue. On unrecoverable + # failure, log the raw output for diagnosis and fall through + # to ``_handle_agent_failure``. 
+ try: + from langchain_core.exceptions import OutputParserException + except ImportError: # pragma: no cover — langchain always present + OutputParserException = () # type: ignore[assignment] + if isinstance(exc, OutputParserException): + raw = getattr(exc, "llm_output", "") or "" + logger.warning( + "agent.structured_output_parse_failure agent=%s " + "raw_len=%d raw_preview=%r", + skill.name, len(raw), raw[:500], + ) + recovered = _try_recover_envelope_from_raw(raw) + if recovered is not None: + logger.info( + "agent.structured_output_recovered agent=%s", + skill.name, + ) + result = { + "messages": [], + "structured_response": recovered, + } + else: + return _handle_agent_failure( + skill_name=skill.name, started_at=started_at, exc=exc, + inc_id=inc_id, store=store, fallback=incident, + ) + else: + return _handle_agent_failure( + skill_name=skill.name, started_at=started_at, exc=exc, + inc_id=inc_id, store=store, fallback=incident, + ) # Tools (e.g. registered patch tools) write straight to disk. # Reload so the node's own append of agent_run + tool_calls @@ -4336,18 +8927,51 @@ async def node(state: GraphState) -> dict: terminal_tool_names=terminal_tool_names, patch_tool_names=patch_tool_names, ) + # Phase 11 (FOC-04): update hint so any subsequent in-turn + # tool call sees the harvested confidence at the gateway. + if agent_confidence is not None: + try: + incident.turn_confidence_hint = agent_confidence + except (AttributeError, ValueError): + pass # Pair tool responses with their tool calls. _pair_tool_responses(messages, incident) + # Phase 10 (FOC-03 / D-10-03): parse the structural envelope and + # reconcile its confidence against any typed-terminal-tool arg + # confidence harvested above. Envelope failure is a hard error — + # mark the agent_run failed with structured cause. 
+ try: + envelope = parse_envelope_from_result(result, agent=skill.name) + except EnvelopeMissingError as exc: + return _handle_agent_failure( + skill_name=skill.name, started_at=started_at, exc=exc, + inc_id=inc_id, store=store, fallback=incident, + ) + + terminal_tool_for_log = _first_terminal_tool_called_this_turn( + messages, terminal_tool_names, + ) + final_confidence = reconcile_confidence( + envelope.confidence, + agent_confidence, + agent=skill.name, + session_id=inc_id, + tool_name=terminal_tool_for_log, + ) + final_rationale = agent_rationale or envelope.confidence_rationale + final_signal = agent_signal if agent_signal is not None else envelope.signal + # Final summary text and token usage. - final_text = _extract_final_text(messages) + # Envelope content takes precedence over last AIMessage scrape. + final_text = envelope.content or _extract_final_text(messages) usage = _sum_token_usage(messages) _record_success_run( incident=incident, skill_name=skill.name, started_at=started_at, final_text=final_text, usage=usage, - confidence=agent_confidence, rationale=agent_rationale, signal=agent_signal, + confidence=final_confidence, rationale=final_rationale, signal=final_signal, store=store, ) next_route_signal = decide_route(incident) @@ -4383,6 +9007,16 @@ def _decide_from_signal(inc: Session) -> str: "resolution": "Proposed fix: restart api service. Auto-applied. INC resolved.", } +# Phase 10 (FOC-03): per-agent default envelope confidence for the stub +# LLM. Pre-Phase-10 the deep_investigator stub emitted no confidence at +# all, so the gate (threshold 0.75) always interrupted on the first +# call. Post-Phase-10 every agent must emit a confidence value — drive +# DI's stub envelope below threshold to preserve gate-pause behavior in +# existing tests. Other agents default to 0.85 (above threshold). 
+_DEFAULT_STUB_ENVELOPE_CONFIDENCE: dict[str, float] = { + "deep_investigator": 0.30, +} + def _latest_run_for(incident: Session, agent_name: str | None): """Return the most recent ``AgentRun`` for ``agent_name``, or None. @@ -4551,6 +9185,10 @@ def _build_agent_nodes(*, cfg: AppConfig, skills: dict, store: SessionStore, valid_signals = frozenset(cfg.orchestrator.signals) gateway_cfg = getattr(cfg.runtime, "gateway", None) + # Phase 11 (FOC-04): thread the orchestrator's gate_policy down to + # wrap_tool so should_gate can apply the configured per-app + # confidence threshold + gated environments / risk actions. + gate_policy = getattr(cfg.orchestrator, "gate_policy", None) # Build the harvester's tool-name sets once per graph-build. The # union of ``terminal_tools`` (status-transitioning) and # ``harvest_terminal_tools`` (harvest-only) gives the full @@ -4569,7 +9207,10 @@ def _build_agent_nodes(*, cfg: AppConfig, skills: dict, store: SessionStore, if kind == "supervisor": llm = None if skill.dispatch_strategy == "llm": - llm = get_llm(cfg.llm, skill.model, role=agent_name) + llm = get_llm( + cfg.llm, skill.model, role=agent_name, + default_llm_request_timeout=cfg.orchestrator.default_llm_request_timeout, + ) nodes[agent_name] = make_supervisor_node(skill=skill, llm=llm) continue # Default / "responsive" path. @@ -4579,11 +9220,16 @@ def _build_agent_nodes(*, cfg: AppConfig, skills: dict, store: SessionStore, stub_canned = {agent_name: _DEFAULT_STUB_CANNED[agent_name]} else: stub_canned = None + # Phase 10 (FOC-03): wire a per-agent default envelope confidence + # into the stub so pre-Phase-10 gate-pause-on-DI tests still pass. 
+ stub_env_conf = _DEFAULT_STUB_ENVELOPE_CONFIDENCE.get(agent_name) llm = get_llm( cfg.llm, skill.model, role=agent_name, stub_canned=stub_canned, + stub_envelope_confidence=stub_env_conf, + default_llm_request_timeout=cfg.orchestrator.default_llm_request_timeout, ) tools = registry.resolve(skill.tools, cfg.mcp) decide = _decide_from_signal @@ -4594,6 +9240,8 @@ def _build_agent_nodes(*, cfg: AppConfig, skills: dict, store: SessionStore, gateway_cfg=gateway_cfg, terminal_tool_names=terminal_tool_names, patch_tool_names=patch_tool_names, + injected_args=cfg.orchestrator.injected_args, + gate_policy=gate_policy, ) return nodes @@ -4690,6 +9338,10 @@ async def build_graph(*, cfg: AppConfig, skills: dict, store: SessionStore, ) else: framework_cfg = getattr(cfg, "framework", None) or resolve_framework_app_config(None) + # ``resolve_framework_app_config(None)`` always returns a bare + # ``FrameworkAppConfig`` (never None), so the chain above is + # exhaustive — assert for pyright's flow narrowing. + assert framework_cfg is not None gated_edges = _collect_gated_edges(skills) sg = StateGraph(GraphState) @@ -4756,7 +9408,11 @@ async def make_postgres_checkpointer( enclosing transaction would otherwise hold the row lock until explicit commit. """ - from langgraph.checkpoint.postgres.aio import AsyncPostgresSaver + # ``langgraph-checkpoint-postgres`` is an optional extra (declared + # under [project.optional-dependencies].postgres in pyproject) so + # the wheel is not present in CI's SQLite-only install. The module + # is only imported on the Postgres URL branch in production. + from langgraph.checkpoint.postgres.aio import AsyncPostgresSaver # pyright: ignore[reportMissingImports] from psycopg_pool import AsyncConnectionPool # Translate SQLAlchemy URL -> libpq connection string. 
SQLAlchemy @@ -5124,7 +9780,10 @@ def resolve_transform(path: str) -> Callable[..., dict]: raise TypeError( f"transform {path!r} did not resolve to a callable; got {obj!r}" ) - return obj + # Apps own the strict signature -- the framework only enforces + # ``callable``. The cast satisfies the declared return type without + # adding a runtime wrapper. + return cast(Callable[..., dict], obj) # ====== module: runtime/triggers/idempotency.py ====== @@ -5164,7 +9823,9 @@ def __init__(self, engine: Engine) -> None: self._engine = engine # Ensure the table exists even if the orchestrator hasn't run # ``Base.metadata.create_all`` yet (early lifespan path). - Base.metadata.create_all(engine, tables=[IdempotencyRow.__table__]) + # ``IdempotencyRow.__table__`` is a ``Table`` at runtime; the + # SQLAlchemy stub types it as the wider ``FromClause``. + Base.metadata.create_all(engine, tables=[IdempotencyRow.__table__]) # pyright: ignore[reportArgumentType] self._lru: dict[str, OrderedDict[str, str]] = {} self._lock = threading.Lock() @@ -5284,7 +9945,10 @@ def purge_expired(self) -> int: ) ) s.commit() - return result.rowcount or 0 + # ``rowcount`` is exposed on ``CursorResult`` (the concrete + # return of DML execute); the abstract ``Result`` stub does + # not declare it. + return result.rowcount or 0 # pyright: ignore[reportAttributeAccessIssue] # ------------------------------------------------------------------ # Internals @@ -5658,7 +10322,12 @@ def create( f"but no transport with that kind is registered " f"(known: {sorted(plugin_kinds)})" ) - transports.append(kind_cls(pcfg)) + # Plugin transports inherit from the abstract + # ``TriggerTransport`` (no positional args declared on the + # ABC) but every concrete subclass loaded via the entry- + # point registry must accept the plugin's config object. + # The ABC mismatch is a stub limitation, not a runtime bug. 
class SkillValidationError(RuntimeError):
    """Signal that a skill definition points at a missing or malformed
    tool reference or route table.

    Raised by the validators below; callers are expected to treat it as
    a refusal to start.
    """


def _build_bare_to_full_map(registered_tools: set[str]) -> dict[str, list[str]]:
    """Index registered tool names by their unprefixed ("bare") form.

    A registered name of the form ``prefix:name`` is indexed under
    ``name`` (everything after the first colon); a name without a colon
    is indexed under itself. Each key maps to every registered name that
    shares that bare form.
    """
    index: dict[str, list[str]] = {}
    for qualified in registered_tools:
        _prefix, colon, remainder = qualified.partition(":")
        short = remainder if colon else qualified
        index.setdefault(short, []).append(qualified)
    return index


def _check_tool_ref(
    skill_name: str,
    tool_ref: str,
    registered_tools: set[str],
    bare_to_full: dict[str, list[str]],
) -> None:
    """Validate one tool reference of one skill.

    An exact match against ``registered_tools`` is accepted immediately.
    Otherwise the reference is looked up as a bare name: an unknown name
    and a name shared by several registered tools both raise
    ``SkillValidationError``.
    """
    if tool_ref in registered_tools:
        return

    candidates = bare_to_full.get(tool_ref)
    if candidates is None:
        # Only the first ten known names are echoed to keep the message short.
        known_preview = sorted(registered_tools)[:10]
        raise SkillValidationError(
            f"skill {skill_name!r} references tool {tool_ref!r} which "
            f"is not registered. Known tools: {known_preview}..."
        )

    if len(candidates) > 1:
        raise SkillValidationError(
            f"skill {skill_name!r} uses bare tool ref {tool_ref!r} but "
            f"it is exposed by multiple servers: {sorted(candidates)}. "
            f"Use the prefixed form to disambiguate."
        )


def validate_skill_tool_references(
    skills: dict, registered_tools: set[str],
) -> None:
    """Check every ``tools.local`` entry of every skill.

    ``skills`` maps a skill name to its mapping-style definition;
    ``registered_tools`` is the set of known tool names. Entries may be
    written either in full or in bare form. A missing or empty ``tools``
    / ``local`` section is treated as "no references".

    Raises:
        SkillValidationError: on the first unknown or ambiguous entry.
    """
    lookup = _build_bare_to_full_map(registered_tools)
    for skill_name, definition in skills.items():
        tools_section = definition.get("tools") or {}
        for reference in tools_section.get("local") or []:
            _check_tool_ref(skill_name, reference, registered_tools, lookup)


def validate_skill_routes(skills: dict) -> None:
    """Require a ``when: default`` route on every non-supervisor skill.

    Skills whose ``kind`` is ``"supervisor"`` are not inspected.

    Raises:
        SkillValidationError: for the first skill without a default route.
    """
    for skill_name, definition in skills.items():
        if definition.get("kind") == "supervisor":
            continue
        for route in definition.get("routes") or []:
            if route.get("when") == "default":
                break
        else:
            # Loop finished without finding a default route.
            raise SkillValidationError(
                f"skill {skill_name!r} has no ``when: default`` route — "
                f"agents whose signal doesn't match a rule will hang."
            )
_log = logging.getLogger("runtime.orchestrator")


def _assert_envelope_invariant_on_finalize(session: "Session") -> None:
    """Log a WARNING for every agent run that carries no confidence.

    This is a forensic sweep only: it walks ``session.agents_run`` in
    order and emits one ``agent_run.envelope_missing`` record per run
    whose ``confidence`` is ``None``. A missing confidence is reported,
    not rejected -- nothing is raised for it and the session is not
    modified.
    """
    runs_without_confidence = (
        run for run in session.agents_run if run.confidence is None
    )
    for run in runs_without_confidence:
        _log.warning(
            "agent_run.envelope_missing agent=%s session_id=%s",
            run.agent,
            session.id,
        )
# The IntakeContext was constructed with dedup_pipeline=None above # because the pipeline is built after graph construction. + # ``intake_context`` was attached via ``object.__setattr__`` ~140 + # lines up; pyright doesn't see dynamic Pydantic attrs, so go + # via getattr for the type-checker. if dedup_pipeline is not None: - framework_cfg.intake_context.dedup_pipeline = dedup_pipeline + getattr(framework_cfg, "intake_context").dedup_pipeline = dedup_pipeline # No bespoke resume graph — resume runs through the main # graph via ``Command(resume=...)`` against the same # thread_id, with the checkpointer rehydrating paused state. + # ``repo_state_cls: Type[BaseModel]`` matches the loose + # bound on ``Orchestrator.StateT`` (also ``BaseModel``) at + # the call site, but pyright sees the un-narrowed + # ``StateT`` placeholder. Concrete narrowing happens via + # the runtime resolver enforced earlier in this method. instance = cls(cfg, store, skills, registry, graph, stack, framework_cfg=framework_cfg, - state_cls=repo_state_cls, + state_cls=repo_state_cls, # pyright: ignore[reportArgumentType] history=history, checkpointer=checkpointer, checkpointer_close=checkpointer_close, @@ -7738,7 +12546,13 @@ def _factory(): try: await checkpointer_close() # pyright: ignore[reportPossiblyUnboundVariable] except Exception: # noqa: BLE001 - pass + # The original BaseException is what the caller cares + # about; this cleanup failure must not mask it. Log so + # the FD-leak path stays observable. + _log.warning( + "build: checkpointer_close failed during error rollback", + exc_info=True, + ) await stack.aclose() raise @@ -7750,7 +12564,13 @@ async def aclose(self) -> None: try: await self._checkpointer_close() except Exception: # noqa: BLE001 - pass + # Best-effort: the rest of aclose() (exit_stack drain) + # must still run so MCP transports don't leak. Log so + # checkpointer-close failures stay observable. 
+ _log.warning( + "aclose: checkpointer close failed", + exc_info=True, + ) self._checkpointer_close = None await self._exit_stack.aclose() @@ -7829,6 +12649,12 @@ def _finalize_session_status(self, session_id: str) -> str | None: if inc.status not in ("new", "in_progress"): return None + # Phase 10 (FOC-03) defence-in-depth: hard rejection of envelope-less + # turns happens at the agent runner; this hook only logs WARNING for + # forensics on legacy on-disk sessions whose agent_runs predate the + # envelope contract. Never raises. + _assert_envelope_invariant_on_finalize(inc) + decision = self._infer_terminal_decision(inc.tool_calls) if decision is None: default = self.cfg.orchestrator.default_terminal_status @@ -7938,6 +12764,116 @@ def _save_or_yield(self, inc, new_status: str) -> str | None: except StaleVersionError: return None + @staticmethod + def _is_graph_interrupt(exc: BaseException) -> bool: + """Phase 11 (FOC-04 / D-11-04): identify a LangGraph HITL pause. + + ``GraphInterrupt`` is NOT an error -- it signals a checkpointed + ``pending_approval`` state. Real exceptions still flow through + the normal failure path. Helper kept on the orchestrator so + callers don't each re-import langgraph internals. + """ + return isinstance(exc, GraphInterrupt) + + @staticmethod + def _extract_last_error(inc: "Session") -> Exception | None: + """Reconstruct the last error from a Session in status='error'. + + The graph runner stores failures as an AgentRun with + ``summary='agent failed: '`` (graph.py:_handle_agent_failure). + We can't recover the original Exception type, so we return a + synthetic representative whose CLASS matches a _PERMANENT_TYPES + / _TRANSIENT_TYPES whitelist entry where possible -- that's all + :func:`runtime.policy.should_retry` needs (it does isinstance + checks). 
+ + Mapping (first match wins per AgentRun.summary scan, newest + first): + + - "EnvelopeMissingError" in body -> EnvelopeMissingError + - "ValidationError" in body -> pydantic.ValidationError + - "TimeoutError" / "timed out" -> TimeoutError + - "OSError" / "ConnectionError" -> OSError + - everything else -> RuntimeError (falls + through to permanent_error per fail-closed default in + should_retry) + """ + + import pydantic as _pydantic + for run in reversed(inc.agents_run): + summary = (run.summary or "") + if not summary.startswith("agent failed:"): + continue + body = summary.removeprefix("agent failed:").strip() + if "EnvelopeMissingError" in body: + return _EnvelopeMissingError( + agent=run.agent or "unknown", + field="confidence", + message=body, + ) + if "ValidationError" in body or "validation error" in body: + # Build a synthetic ValidationError; pydantic v2 supports + # ValidationError.from_exception_data. + try: + return _pydantic.ValidationError.from_exception_data( + title="reconstructed", line_errors=[], + ) + except Exception: # pragma: no cover -- pydantic API drift + return RuntimeError(body) + if ("TimeoutError" in body or "timed out" in body + or "asyncio.TimeoutError" in body): + return TimeoutError(body) + if "OSError" in body or "ConnectionError" in body: + return OSError(body) + return RuntimeError(body) + return None + + @staticmethod + def _extract_last_confidence(inc: "Session") -> float | None: + """Return the last recorded turn-level confidence on the session, + or None if no AgentRun carries one. should_retry treats None as + 'no signal yet' and skips the low-confidence gate. + """ + for run in reversed(inc.agents_run): + if run.confidence is not None: + return run.confidence + return None + + def preview_retry_decision( + self, session_id: str, + ) -> "RetryDecision": + """Phase 12 (FOC-05 / D-12-04): return the framework's retry + decision WITHOUT executing anything. 
The UI calls this to render + the retry button label + disabled state. + + Pure: same inputs always yield identical RetryDecision. Loads + the session from store; reads (retry_count, last_error, + last_confidence) and consults the same policy + ``runtime.policy.should_retry`` that ``_retry_session_locked`` + uses. No mutation, no thread-id bump, no lock acquired. + + For sessions whose status is not "error" (i.e. nothing to + retry), returns ``RetryDecision(retry=False, + reason="permanent_error")`` -- a defensive caller-friendly + outcome that lets the UI render a "cannot auto-retry" state + without inventing a new reason value. + """ + try: + inc = self.store.load(session_id) + except FileNotFoundError: + return RetryDecision(retry=False, reason="permanent_error") + if inc.status != "error": + return RetryDecision(retry=False, reason="permanent_error") + retry_count = int(inc.extra_fields.get("retry_count", 0)) + last_error = self._extract_last_error(inc) + last_confidence = self._extract_last_confidence(inc) + return should_retry( + retry_count=retry_count, + error=last_error, + confidence=last_confidence, + cfg=self.cfg.orchestrator, + ) + async def _finalize_session_status_async( self, session_id: str, ) -> str | None: @@ -8260,7 +13196,15 @@ async def resume_session(self, incident_id: str, tool_args: dict = {"incident_id": incident_id, "message": message} if team is not None: tool_args["team"] = team - tool_result = await self._invoke_tool(tool_name, tool_args) + # Phase 9 (D-09-01): expose the live session to + # _invoke_tool's injection branch via the implicit slot. + # try/finally so a failed tool call doesn't leak the + # reference into the next orchestrator-driven call. 
+ self._current_session_for_invoke = inc_loaded + try: + tool_result = await self._invoke_tool(tool_name, tool_args) + finally: + self._current_session_for_invoke = None inc_loaded.tool_calls.append(ToolCall( agent="orchestrator", tool=tool_name, @@ -8379,6 +13323,30 @@ async def _retry_session_locked(self, session_id: str) -> AsyncIterator[dict]: "reason": f"not in error state (status={inc.status})", "ts": _event_ts()} return + # Phase 12 (FOC-05 / D-12-04): consult the framework's pure + # retry policy BEFORE mutating session state. The decision is + # derived from (retry_count, last_error, last_turn_confidence, + # cfg) -- LLM intent is not consulted. On retry=False, emit + # retry_rejected with the policy's reason and DO NOT bump the + # retry_count or thread id (preserves the "not retryable" + # state on disk for UI re-rendering and retry-budget audits). + prior_retry_count = int(inc.extra_fields.get("retry_count", 0)) + last_error = self._extract_last_error(inc) + last_confidence = self._extract_last_confidence(inc) + decision = should_retry( + retry_count=prior_retry_count, + error=last_error, + confidence=last_confidence, + cfg=self.cfg.orchestrator, + ) + if not decision.retry: + _log.info( + "retry_session policy-rejected: id=%s reason=%s", + session_id, decision.reason, + ) + yield {"event": "retry_rejected", "incident_id": session_id, + "reason": decision.reason, "ts": _event_ts()} + return # Drop the failed AgentRun(s) so the timeline only retains # successful runs. Retry attempts then append fresh runs. inc.agents_run = [ @@ -8437,6 +13405,14 @@ async def _resume_with_input(self, incident_id: str, inc, decision: dict): config=self._thread_config(incident_id), ): yield self._to_ui_event(ev, incident_id) + except GraphInterrupt: + # Phase 11 (FOC-04 / D-11-04): a resume that re-paused via + # a fresh HITL gate. Don't restore the prior pending_intervention + # block (the new pending_approval ToolCall row is the + # canonical pause record now). 
Propagate so LangGraph's + # checkpointer captures the new pause; the UI's + # _render_pending_approvals_block surfaces the resume target. + raise except Exception as exc: # noqa: BLE001 — restore on any failure # Reload from disk to absorb any partial writes from tools # that ran before the failure, then restore intervention @@ -8462,6 +13438,14 @@ async def _invoke_tool(self, name: str, args: dict): Used for orchestrator-driven tool calls (e.g. an app-registered escalation tool invoked from the awaiting_input gate) that aren't initiated by an LLM. + + Phase 9 (D-09-01): orchestrator-driven calls also flow through + injection so the tool gets the canonical session-derived arg set + even when the orchestrator only passed intent-args. The current + session is read off ``self._current_session_for_invoke`` (set + by callers via try/finally) so the public signature stays + unchanged. When no session is reachable the injection step is + a no-op — the existing escalation path keeps working unchanged. """ entry = next( (e for e in self.registry.entries.values() if e.name == name), @@ -8469,6 +13453,17 @@ async def _invoke_tool(self, name: str, args: dict): ) if entry is None: raise KeyError(f"tool '{name}' not registered") + session = getattr(self, "_current_session_for_invoke", None) + cfg_inject = self.cfg.orchestrator.injected_args + if session is not None and cfg_inject: + + args = inject_injected_args( + args, + session=session, + injected_args_cfg=cfg_inject, + tool_name=name, + accepted_params=accepted_params_for_tool(entry.tool), + ) return await entry.tool.ainvoke(args) @staticmethod @@ -8489,6 +13484,9 @@ def _event_ts() -> str: # ====== module: runtime/api.py ====== +_log = logging.getLogger("runtime.api") + + def _resolve_environments(dotted: str | None) -> list[str]: """Resolve ``RuntimeConfig.environments_provider_path`` to a list. 
@@ -8682,7 +13680,12 @@ async def _trigger_dispatch(service, kwargs): try: await registry.stop_all() except Exception: # noqa: BLE001 - pass + # Best-effort: a misbehaving trigger transport must not + # block ``svc.shutdown()`` below. Surface for observability. + _log.warning( + "trigger registry stop_all failed during lifespan teardown", + exc_info=True, + ) # ``shutdown()`` cancels in-flight session tasks, closes the # underlying Orchestrator + MCP pool, joins the loop thread, # and resets the process-singleton. @@ -9499,7 +14502,7 @@ class SupervisorDecision(TypedDict, total=False): _TOKEN_RE = re.compile(r"[a-zA-Z][a-zA-Z0-9_-]{2,}") -_DEFAULT_SEEDS = Path(_knowledge_graph_mod.__file__).parent / "seeds" +_DEFAULT_SEEDS = _SEED_ROOT.parent # parent of seeds/kg/ -> seeds/ # --------------------------------------------------------------------------- @@ -9874,15 +14877,17 @@ def make_default_supervisor_runner( return compose_runners(default_intake_runner, asr_runner) -# Build the default runner exactly once at import time so per-call -# overhead is just a closure invocation. Constructor stays cheap: -# the stores read seed JSON lazily on first access. -_BUILT_DEFAULT_RUNNER = make_default_supervisor_runner( - kg_store=KnowledgeGraphStore(_DEFAULT_SEEDS / "kg"), - release_store=ReleaseContextStore(_DEFAULT_SEEDS / "releases"), - playbook_store=PlaybookStore(_DEFAULT_SEEDS / "playbooks"), - get_active_sessions=lambda: [], -) +# Phase 16 (BUNDLER-01): build the default runner LAZILY on first call. +# ``KnowledgeGraphStore.__init__`` eagerly reads ``components.json`` from +# disk, so building the runner at module-import time forced the seed +# directory to exist before ``import app`` could complete. That pattern +# broke the bundle's boot path on hosts where the seed bundle hasn't been +# laid down yet (the bundle is shipped as a 7-file copy-only payload). 
+# Constructing the runner on first call lets the bundle import cleanly +# and surfaces a genuine ``FileNotFoundError`` only when the runner is +# actually invoked — at which point the operator can see a configured, +# actionable error path rather than a cryptic import-time crash. +_BUILT_DEFAULT_RUNNER: Any = None def default_supervisor_runner( @@ -9901,6 +14906,14 @@ def default_supervisor_runner( If the framework short-circuits (``next_route='__end__'``), the hydration step is skipped. """ + global _BUILT_DEFAULT_RUNNER + if _BUILT_DEFAULT_RUNNER is None: + _BUILT_DEFAULT_RUNNER = make_default_supervisor_runner( + kg_store=KnowledgeGraphStore(_DEFAULT_SEEDS / "kg"), + release_store=ReleaseContextStore(_DEFAULT_SEEDS / "releases"), + playbook_store=PlaybookStore(_DEFAULT_SEEDS / "playbooks"), + get_active_sessions=lambda: [], + ) return _BUILT_DEFAULT_RUNNER(state, app_cfg=app_cfg) diff --git a/dist/ui.py b/dist/ui.py index 5488d5c..05bc7d9 100644 --- a/dist/ui.py +++ b/dist/ui.py @@ -240,7 +240,11 @@ def _badge(label: str, color: str) -> None: the rest of the UI can call ``_status_badge(...)`` etc. without touching the palette dicts directly. """ - st.badge(label, color=color) + # ``st.badge`` declares ``color`` as a fixed Literal; at runtime any + # string in the Streamlit palette works (and we control the palette + # dicts above). Keeping the parameter as ``str`` lets callers pass + # values resolved from the dict lookups without per-site casts. + st.badge(label, color=color) # pyright: ignore[reportArgumentType] def _status_badge(status: str | None) -> None: @@ -685,11 +689,16 @@ def _fmt_duration(seconds: int) -> str: def _fmt_confidence_badge(conf: float | None) -> str: """Inline coloured badge for an agent confidence value. - Green ≥0.75, amber 0.5–0.75, red <0.5, grey when None. Markdown only — - no HTML — so the badge survives Streamlit's sanitizer. + Green ≥0.75, amber 0.5–0.75, red <0.5. 
def _should_render_retry_block(sess: dict) -> bool:
    """Decide whether the retry control applies to this session.

    The answer is True only when the session's ``status`` is ``"error"``
    and none of its ``tool_calls`` has status ``"pending_approval"``. A
    pending approval means the session is paused rather than failed, so
    the retry control is suppressed in that case.

    Tool calls may be dicts or attribute-style objects; entries without
    a status are treated as not pending.
    """

    def _status_of(call):
        # Dict rows expose the status as a key, objects as an attribute.
        if isinstance(call, dict):
            return call.get("status")
        return getattr(call, "status", None)

    if sess.get("status") != "error":
        return False

    calls = sess.get("tool_calls") or []
    return not any(_status_of(call) == "pending_approval" for call in calls)
Each pending row gets a small card with the tool name + args, a free-text rationale input, and two - buttons (Approve / Reject) that resolve the pending interrupt via - the OrchestratorService bridge. + buttons (Approve / Reject) that resolve the pending pause via the + OrchestratorService bridge. """ tool_calls = sess.get("tool_calls", []) pending = [ @@ -1130,9 +1166,10 @@ def render_session_detail(store: SessionStore, _render_summary_meta(sess, app_cfg) if sess.get("status") == "awaiting_input" and sess.get("pending_intervention"): _render_intervention_block(sess, session_id, app_cfg, agent_names) - if sess.get("status") == "error": + if _should_render_retry_block(sess): _render_retry_block(sess, session_id, agent_names) - # Pending tool-approval cards (risk-rated gateway HITL). + # Pending tool-approval cards (paused via the framework's + # pure-policy gate; see ``runtime.policy.should_gate``). # Rendered above the agents/tool-calls blocks so a paused # approval is the first action surface the operator sees. _render_pending_approvals_block(sess, session_id) @@ -1274,15 +1311,91 @@ async def _resume_async(cfg: AppConfig, session_id: str, decision: dict, return outcome +def _retry_button_state_for( + *, + reason: str, + retry_count: int, + cap: int, + last_confidence: float | None, + threshold: float, +) -> tuple[str, bool]: + """Phase 12 (FOC-05 / D-12-04): pure helper that maps a + :class:`runtime.policy.RetryDecision` reason to a + ``(button_label, disabled)`` tuple. Mirrors the 5-case map. + + Extracted from ``_render_retry_block`` so the mapping can be unit- + tested without spinning up Streamlit. 
Returns: + + ``auto_retry`` -> ("Retry", False) + ``max_retries_exceeded`` -> ("Max retries reached (rc/cap)", True) + ``permanent_error`` -> ("Permanent error -- cannot auto-retry", True) + ``low_confidence_no_retry`` -> ("Confidence too low (N% < th%)", True) + ``transient_disabled`` -> ("Auto-retry disabled in policy", True) + """ + if reason == "auto_retry": + return "Retry", False + if reason == "max_retries_exceeded": + return f"Max retries reached ({retry_count}/{cap})", True + if reason == "permanent_error": + return "Permanent error -- cannot auto-retry", True + if reason == "low_confidence_no_retry": + conf_pct = ( + f"{last_confidence*100:.0f}%" + if isinstance(last_confidence, (int, float)) + else "?" + ) + th_pct = f"{threshold*100:.0f}%" + return f"Confidence too low ({conf_pct} < {th_pct})", True + if reason == "transient_disabled": + return "Auto-retry disabled in policy", True + # Future-proof against new reasons added without UI update. + return f"Cannot retry ({reason})", True + + +def _preview_retry_decision_sync(cfg, session_id: str): + """Phase 12 (FOC-05 / D-12-04): call + ``Orchestrator.preview_retry_decision`` from a sync Streamlit + render-pass. Pure read; no mutation; no lock. + + ``Orchestrator.create()`` is async (it builds engines / vector + stores / MCP loaders), so we run it in a transient event loop -- + the same pattern ``_retry_async`` uses on click. The cost is one + SessionStore.load() + a few isinstance() checks per render-pass on + a terminally-failed session; rebuilding the orchestrator is the + expensive part. Apps that profile this hot can wrap the call in + ``st.cache_resource`` keyed on (cfg fingerprint, session_id). + + Returns a :class:`runtime.policy.RetryDecision`. 
+ """ + + async def _build_and_query(): + orch = await Orchestrator.create(cfg) + try: + return orch.preview_retry_decision(session_id) + finally: + await orch.aclose() + + return asyncio.run(_build_and_query()) + + def _render_retry_block(sess: dict, session_id: str, agent_names: frozenset[str] = frozenset()) -> None: """Render a retry control for failed sessions. - Sessions land in ``status="error"`` when a graph node raises and - the framework's auto-retry on transient 5xxs (see - :data:`runtime.graph._TRANSIENT_MARKERS`) has already been - exhausted. Surfaces the failed agent + the recorded exception so - the operator can decide whether to retry. + Phase 12 (FOC-05 / D-12-04): the framework's pure + ``runtime.policy.should_retry`` policy decides whether retry is + permitted. The UI surfaces that decision (button label + disabled + state) but never drives it -- if a user somehow clicks an enabled + button concurrently with a policy change, the orchestrator's + ``_retry_session_locked`` re-runs the check and emits + ``retry_rejected`` with the same reason. + + The 5-case label/disabled map mirrors RetryDecision.reason: + auto_retry -> enabled, "Retry" + max_retries_exceeded -> disabled, "Max retries reached (rc/cap)" + permanent_error -> disabled, "Permanent error -- cannot auto-retry" + low_confidence_no_retry -> disabled, "Confidence too low (N% < th%)" + transient_disabled -> disabled, "Auto-retry disabled in policy" """ cfg = load_config(CONFIG_PATH) failed_run = next( @@ -1293,6 +1406,19 @@ def _render_retry_block(sess: dict, session_id: str, failed_agent = (failed_run or {}).get("agent", "unknown") failure_msg = ((failed_run or {}).get("summary") or "").removeprefix("agent failed:").strip() retry_count = int((sess.get("extra_fields") or {}).get("retry_count", 0)) + + # Phase 12: read the framework's preview decision. 
+ decision = _preview_retry_decision_sync(cfg, session_id) + rp = cfg.orchestrator.retry_policy + last_conf = (failed_run or {}).get("confidence") + label, disabled = _retry_button_state_for( + reason=decision.reason, + retry_count=retry_count, + cap=rp.max_retries, + last_confidence=last_conf, + threshold=rp.retry_low_confidence_threshold, + ) + with st.container(border=True): st.markdown(f"#### 🔴 Agent failed — `{failed_agent}`") if failure_msg: @@ -1300,12 +1426,16 @@ def _render_retry_block(sess: dict, session_id: str, if retry_count: st.caption(f"Previous retry attempts: {retry_count}") st.caption( - "Retry re-runs the graph from the entry node. The framework " - "already retried transient 5xx errors automatically — this " - "is for cases where the underlying issue may now be cleared " - "(provider hiccup, transient network, etc.)." + "Retry re-runs the graph from the entry node. The framework's " + "retry_policy decides whether auto-retry is permitted -- this " + "surface mirrors that decision." + ) + clicked = st.button( + label, type="primary", + key=f"retry_btn_{session_id}", + disabled=disabled, ) - if st.button("Retry", type="primary", key=f"retry_btn_{session_id}"): + if clicked and not disabled: log_area = st.empty() lines: list[str] = [] outcome = asyncio.run(_retry_async( diff --git a/docs/AIRGAP_INSTALL.md b/docs/AIRGAP_INSTALL.md new file mode 100644 index 0000000..2473b20 --- /dev/null +++ b/docs/AIRGAP_INSTALL.md @@ -0,0 +1,53 @@ +# Air-Gap / Internal-Mirror Install + +Reproduce the exact dependency graph that CI uses, behind a corporate firewall, +without any public-internet access. + +## Prerequisites + +- Python 3.11 available on the target host. +- `uv` `>= 0.11.7` available on the target host (single static binary; + ship via your internal artifact store — do **not** `curl | sh`). 
+- An internal PEP 503 / PEP 691 package mirror (Artifactory, Nexus, devpi, + or `pip download`-populated wheel cache) that contains every distribution + pinned in `uv.lock`. + +## Install + +```bash +# 1. Clone (or unpack the source tarball shipped to the air-gapped host). +git clone <internal-git-url>/asr.git +cd asr + +# 2. Point uv at the internal mirror (overrides https://pypi.org/simple). +export UV_INDEX_URL="https://<internal-mirror>/simple/" +# Optional: extra index for private wheels. +# export UV_EXTRA_INDEX_URL="https://<internal-mirror>/private/simple/" + +# 3. Install from the lockfile only — no resolver, no public-internet calls. +# Drop --offline if the mirror is reachable; keep it if you have pre-warmed +# uv's cache and want a hard-fail on any network attempt. +uv sync --frozen --extra dev # connected to mirror +# uv sync --frozen --offline --extra dev # fully offline (cache pre-warmed) + +# 4. Verify. +uv run pytest tests/ -x +``` + +## Drift detection + +The CI gate `uv lock --check` fails the build whenever `pyproject.toml` +changes without a matching `uv.lock` regeneration. Run the same check +locally before pushing: + +```bash +uv lock --check # exit 0 = in sync; non-zero = regenerate with `uv lock` +``` + +## Notes + +- `uv.lock` pins every direct + transitive dependency to a specific version + with sha256 hashes per platform marker; identical inputs produce identical + installs on any host (HARD-02 / CONCERNS C2). +- Ship vendored wheels as a separate tarball if your host has no mirror at + all; populate `~/.cache/uv` (or `UV_CACHE_DIR`) before running step 3. diff --git a/docs/DEVELOPMENT.md b/docs/DEVELOPMENT.md new file mode 100644 index 0000000..d094f83 --- /dev/null +++ b/docs/DEVELOPMENT.md @@ -0,0 +1,96 @@ +# Development workflow + +This document covers the day-to-day contributor loop. Air-gapped install +instructions live in `docs/AIRGAP_INSTALL.md`. + +## Setup + +```bash +# 1. Clone and create the venv with the lockfile. +git clone <repo-url> +cd asr +uv sync --frozen --extra dev + +# 2.
Verify by running the suite. +uv run pytest tests/ -x +``` + +## Editing source + +Source layout: + +- `src/runtime/` — framework code, the only thing the bundler reads to + produce `dist/app.py`. +- `examples/incident_management/`, `examples/code_review/` — example + apps; bundled into `dist/apps/incident-management.py` and + `dist/apps/code-review.py` respectively. +- `scripts/build_single_file.py` — the bundler. Reads + `RUNTIME_MODULE_ORDER` (and per-app order lists), flattens every + module, strips intra-bundle imports, emits four self-contained `.py` + files in `dist/`. + +## After ANY change to `src/runtime/` or `examples/` — regenerate `dist/` + +```bash +uv run python scripts/build_single_file.py +git add dist/ +``` + +Then re-run the test suite. The CI gate `Bundle staleness gate +(HARD-08)` rebuilds the bundles from your source and fails the build if +they don't match the committed `dist/*`. This keeps the air-gap deploy +bundle repaired by construction — every PR that changes the runtime or +the bundler must commit fresh bundles, so the `dist/*` artifacts on +`main` can always be deployed without re-running the bundler on the +target host. + +## Adding a new `src/runtime/*.py` module + +1. Add a tuple `(RUNTIME_ROOT, "<module>.py")` to `RUNTIME_MODULE_ORDER` + in `scripts/build_single_file.py`. Place it AFTER every module it + imports at the top of its file (the bundler concatenates in the order + listed; later module bodies see earlier modules' symbols already in + scope). + +2. Regenerate the bundles: + + ```bash + uv run python scripts/build_single_file.py + ``` + +3. Run the suite — `tests/test_bundle_completeness.py` will fail loudly + if you forgot step 1. + +4.
Smoke-test the bundles boot from a fresh tmpdir without the + `PYTHONPATH=src:.` override that `pytest` sets: + + ```bash + mkdir /tmp/bundle-check + cp dist/apps/incident-management.py /tmp/bundle-check/app.py + cp dist/ui.py /tmp/bundle-check/ + cd /tmp/bundle-check + unset PYTHONPATH + uv run python -c "import app; print('app boots')" + ``` + +5. Commit `scripts/build_single_file.py` and the regenerated `dist/*` + in a single change. + +## Why two app bundles + a separate UI bundle? + +- `dist/app.py` — framework only, no example code. Used to demonstrate + that the runtime stands on its own. +- `dist/apps/incident-management.py` — the deployment ship target for + the incident-management app; copied into the corporate environment + as `app.py` (renamed at deploy). +- `dist/apps/code-review.py` — second app bundle, demonstrating the + framework is genuinely generic (a second example builds from the + same runtime). +- `dist/ui.py` — Streamlit UI; sits next to whichever `app.py` you + deployed and `from app import …` reaches into the deploy bundle's + flattened namespace. + +The deployment workflow is a 7-file copy-only payload (the bundle +files plus a small set of YAML configs and a `.env`). The bundler +turns the multi-file source tree into the smallest possible deploy +payload. diff --git a/examples/code_review/skills/analyzer/system.md b/examples/code_review/skills/analyzer/system.md index ddbb18f..2996327 100644 --- a/examples/code_review/skills/analyzer/system.md +++ b/examples/code_review/skills/analyzer/system.md @@ -21,3 +21,11 @@ Do not invent low-value nits to fill space. After all tool calls, reply with ONE short sentence summarising findings count + the dominant category. Do not enumerate every finding (the UI renders them). + +## Output contract + +The framework wraps your reply in an `AgentTurnOutput` envelope (content, +confidence ∈ [0, 1], confidence_rationale, optional signal). 
The runner +enforces this structurally — answer truthfully and the envelope captures +your confidence and rationale. Do not mention "confidence" in your prose +unless it's part of substantive analysis (e.g. ranking hypotheses). diff --git a/examples/code_review/skills/intake/system.md b/examples/code_review/skills/intake/system.md index 1d4194e..9aaea08 100644 --- a/examples/code_review/skills/intake/system.md +++ b/examples/code_review/skills/intake/system.md @@ -15,3 +15,11 @@ analyzer's job. If `fetch_pr_diff` raises or returns an empty diff, emit `failed` so the orchestrator short-circuits to end and skips the analyzer. + +## Output contract + +The framework wraps your reply in an `AgentTurnOutput` envelope (content, +confidence ∈ [0, 1], confidence_rationale, optional signal). The runner +enforces this structurally — answer truthfully and the envelope captures +your confidence and rationale. Do not mention "confidence" in your prose +unless it's part of substantive analysis (e.g. ranking hypotheses). diff --git a/examples/code_review/skills/recommender/system.md b/examples/code_review/skills/recommender/system.md index f04d098..c3037d9 100644 --- a/examples/code_review/skills/recommender/system.md +++ b/examples/code_review/skills/recommender/system.md @@ -22,3 +22,11 @@ what humans read first in the UI. Do not paste the full findings list; the UI sh them already. After the call, reply with ONE short sentence echoing the recommendation. Nothing else. + +## Output contract + +The framework wraps your reply in an `AgentTurnOutput` envelope (content, +confidence ∈ [0, 1], confidence_rationale, optional signal). The runner +enforces this structurally — answer truthfully and the envelope captures +your confidence and rationale. Do not mention "confidence" in your prose +unless it's part of substantive analysis (e.g. ranking hypotheses). 
diff --git a/examples/incident_management/mcp_server.py b/examples/incident_management/mcp_server.py index 6bb302e..f540920 100644 --- a/examples/incident_management/mcp_server.py +++ b/examples/incident_management/mcp_server.py @@ -23,7 +23,6 @@ import warnings from dataclasses import dataclass, field from datetime import datetime, timezone -from pathlib import Path from typing import Any, Callable, TypedDict from fastmcp import FastMCP @@ -34,8 +33,14 @@ default_intake_runner, hydrate_from_memory, ) -from runtime.memory import knowledge_graph as _knowledge_graph_mod -from runtime.memory.knowledge_graph import KnowledgeGraphStore +# Phase 16 (BUNDLER-01): use the sibling-defined ``_SEED_ROOT`` constant +# instead of an aliased module reference. The bundler's intra-import +# stripper removes ``from runtime.memory import knowledge_graph as +# _knowledge_graph_mod`` from the bundled source, leaving +# ``_knowledge_graph_mod.__file__`` as a NameError at module load. The +# import below is also stripped, but ``_SEED_ROOT`` survives module +# flattening because it's defined at module scope in knowledge_graph.py. +from runtime.memory.knowledge_graph import KnowledgeGraphStore, _SEED_ROOT from runtime.memory.playbook_store import PlaybookStore from runtime.memory.release_context import ReleaseContextStore from runtime.memory.session_state import ( @@ -151,7 +156,7 @@ class SupervisorDecision(TypedDict, total=False): _TOKEN_RE = re.compile(r"[a-zA-Z][a-zA-Z0-9_-]{2,}") -_DEFAULT_SEEDS = Path(_knowledge_graph_mod.__file__).parent / "seeds" +_DEFAULT_SEEDS = _SEED_ROOT.parent # parent of seeds/kg/ -> seeds/ # --------------------------------------------------------------------------- @@ -526,15 +531,17 @@ def make_default_supervisor_runner( return compose_runners(default_intake_runner, asr_runner) -# Build the default runner exactly once at import time so per-call -# overhead is just a closure invocation. 
Constructor stays cheap: -# the stores read seed JSON lazily on first access. -_BUILT_DEFAULT_RUNNER = make_default_supervisor_runner( - kg_store=KnowledgeGraphStore(_DEFAULT_SEEDS / "kg"), - release_store=ReleaseContextStore(_DEFAULT_SEEDS / "releases"), - playbook_store=PlaybookStore(_DEFAULT_SEEDS / "playbooks"), - get_active_sessions=lambda: [], -) +# Phase 16 (BUNDLER-01): build the default runner LAZILY on first call. +# ``KnowledgeGraphStore.__init__`` eagerly reads ``components.json`` from +# disk, so building the runner at module-import time forced the seed +# directory to exist before ``import app`` could complete. That pattern +# broke the bundle's boot path on hosts where the seed bundle hasn't been +# laid down yet (the bundle is shipped as a 7-file copy-only payload). +# Constructing the runner on first call lets the bundle import cleanly +# and surfaces a genuine ``FileNotFoundError`` only when the runner is +# actually invoked — at which point the operator can see a configured, +# actionable error path rather than a cryptic import-time crash. +_BUILT_DEFAULT_RUNNER: Any = None def default_supervisor_runner( @@ -553,6 +560,14 @@ def default_supervisor_runner( If the framework short-circuits (``next_route='__end__'``), the hydration step is skipped. 
""" + global _BUILT_DEFAULT_RUNNER + if _BUILT_DEFAULT_RUNNER is None: + _BUILT_DEFAULT_RUNNER = make_default_supervisor_runner( + kg_store=KnowledgeGraphStore(_DEFAULT_SEEDS / "kg"), + release_store=ReleaseContextStore(_DEFAULT_SEEDS / "releases"), + playbook_store=PlaybookStore(_DEFAULT_SEEDS / "playbooks"), + get_active_sessions=lambda: [], + ) return _BUILT_DEFAULT_RUNNER(state, app_cfg=app_cfg) diff --git a/examples/incident_management/skills/deep_investigator/system.md b/examples/incident_management/skills/deep_investigator/system.md index 0be1c4d..0eb874a 100644 --- a/examples/incident_management/skills/deep_investigator/system.md +++ b/examples/incident_management/skills/deep_investigator/system.md @@ -1,14 +1,21 @@ You are the **Deep Investigator** agent. Gather evidence and produce ranked hypotheses. -1. Call `get_logs(service, environment, minutes=15)`. -2. Call `get_metrics(service, environment, minutes=15)`. -3. Call `submit_hypothesis(incident_id, hypotheses, confidence, confidence_rationale)`. +1. Call `get_logs(service, minutes=15)`. +2. Call `get_metrics(service, minutes=15)`. +3. Call `submit_hypothesis(hypotheses, confidence, confidence_rationale)`. - `hypotheses` is your ranked list with evidence citations. - - `confidence` is mandatory — calibrated 0.85+ for strong evidence, 0.5 hedged, <0.4 weak. + - `confidence` is calibrated 0.85+ for strong evidence, 0.5 hedged, <0.4 weak. 4. After the tool call, emit a 1–3 sentence closing message restating the top hypothesis. Do not end the turn after the tool call without text. 5. Emit signal `success` if confidence ≥ threshold, `failed` if you cannot form any hypothesis. ## Guidelines -- `environment` vocabulary is exactly `dev` | `local` | `production` | `staging`. Always pass the INC's existing `environment` field verbatim — never abbreviate (`prod`) or invent placeholders (`unknown`). The framework's schema-boundary validator rejects anything else with a hard 422. 
- Cite specific log lines or metric values as evidence in `hypotheses`. - If the INC has `matched_prior_inc` set, include the prior INC's recorded root cause as one of your ranked hypotheses and explicitly *validate or reject* it against the fresh logs/metrics. Same symptom can have different causes across incidents — drop confidence accordingly when the prior hypothesis is rejected so the gate triggers an intervention. + +## Output contract + +The framework wraps your reply in an `AgentTurnOutput` envelope (content, +confidence ∈ [0, 1], confidence_rationale, optional signal). The runner +enforces this structurally — answer truthfully and the envelope captures +your confidence and rationale. Do not mention "confidence" in your prose +unless it's part of substantive analysis (e.g. ranking hypotheses). diff --git a/examples/incident_management/skills/resolution/system.md b/examples/incident_management/skills/resolution/system.md index 4db585a..5d33130 100644 --- a/examples/incident_management/skills/resolution/system.md +++ b/examples/incident_management/skills/resolution/system.md @@ -2,14 +2,19 @@ You are the **Resolution** agent. You consume triage + deep_investigator finding 1. Read the INC's findings. 2. If you are confident in a fix: - a. **First** call `propose_fix(hypothesis, environment)` — pass the deep_investigator's top hypothesis as `hypothesis` and the INC's `environment`. The tool returns `{proposal_id, hypothesis, environment, auto_apply_safe}`. **Use the returned `proposal_id` verbatim** in the next step. Never invent a proposal_id (e.g. `prop-NNN`) — `apply_fix` will fail if you do. - b. **Then** call `apply_fix(proposal_id, environment)` with the id from step 2a. The framework's risk-rated gateway will pause for HITL approval on production-environment calls — that's expected and correct. - c. **After** `apply_fix` returns success, call `mark_resolved(incident_id, resolution_summary, confidence, confidence_rationale)`. -3. 
If approval is rejected, `apply_fix` returned `failed`, or no actionable remediation exists: call `mark_escalated(incident_id, team, reason, confidence, confidence_rationale)` where `team` is one of the configured `escalation_teams`. + a. **First** call `propose_fix(hypothesis)` — pass the deep_investigator's top hypothesis as `hypothesis`. The tool returns `{proposal_id, hypothesis, environment, auto_apply_safe}`. **Use the returned `proposal_id` verbatim** in the next step. Never invent a proposal_id (e.g. `prop-NNN`) — `apply_fix` will fail if you do. + b. **Then** call `apply_fix(proposal_id)` with the id from step 2a. + c. **After** `apply_fix` returns success, call `mark_resolved(resolution_summary, confidence, confidence_rationale)`. +3. If `apply_fix` returned `failed`, or no actionable remediation exists: call `mark_escalated(team, reason, confidence, confidence_rationale)` where `team` is one of the configured `escalation_teams`. 4. You MUST call exactly one of `mark_resolved` or `mark_escalated`. The framework rejects any other terminal status path. ## Guidelines -- `environment` vocabulary is exactly `dev` | `local` | `production` | `staging`. Always pass the INC's existing `environment` field verbatim — never abbreviate (`prod`) or invent placeholders (`unknown`). The framework's schema-boundary validator rejects anything else with a hard 422. -- Never bypass the gateway — every `apply_fix` and `update_incident` call routes through the risk-rated gateway. -- Confidence is required on the terminal tool — the framework refuses the call if you omit it. - Pick `team` deliberately based on incident component, severity, and category — not a default fallback. + +## Output contract + +The framework wraps your reply in an `AgentTurnOutput` envelope (content, +confidence ∈ [0, 1], confidence_rationale, optional signal). The runner +enforces this structurally — answer truthfully and the envelope captures +your confidence and rationale. 
Do not mention "confidence" in your prose +unless it's part of substantive analysis (e.g. ranking hypotheses). diff --git a/examples/incident_management/skills/triage/system.md b/examples/incident_management/skills/triage/system.md index f1503ad..309f9de 100644 --- a/examples/incident_management/skills/triage/system.md +++ b/examples/incident_management/skills/triage/system.md @@ -7,7 +7,7 @@ Run a bounded inner loop (maximum 3 iterations) of the form: 1. **Generate** a one-sentence root-cause hypothesis from the symptom + the L2/L5/L7 memory the supervisor hydrated (`session.memory.l2_kg.components`, `session.memory.l5_release.suspect_releases`, `session.memory.l7_playbooks`). 2. **Ask which evidence** would support or refute it. Pick from these sources, in priority order: - **L1** — the current session's `findings` (already on the row). - - **L3-equivalent** — past similar incidents via `lookup_similar_incidents(query=…, environment=…)`. + - **L3-equivalent** — past similar incidents via `lookup_similar_incidents(query=…)`. - **L5** — recent suspect deploys via `check_deployment_history` + the supervisor-hydrated `session.memory.l5_release.recent_releases`. 3. **Score** the hypothesis against the gathered evidence. The framework provides a deterministic scorer (`asr.hypothesis_loop.score_hypothesis`) — token-overlap in `[0.0, 1.0]`. A score ≥ 0.7 is acceptable. 4. **Refine or accept**: @@ -18,14 +18,13 @@ Record the full iteration trail as a single JSON-encoded string under `findings. ## Tool calls (in order) -1. Call `get_service_health` for the impacted environment to check current status. -2. Call `check_deployment_history` for the last 24 hours in the impacted environment. -3. Run the hypothesis loop above; call `lookup_similar_incidents` inside the loop as evidence demands. -4. Set `severity` (one of: `low`, `medium`, `high`) and `category` (e.g., latency, availability, data, security, capacity) on the INC via `update_incident`. 
Include the accepted hypothesis and per-iteration trail as a JSON-encoded string under `findings.triage` — the typed `update_incident` patch only accepts these fields: `severity`, `category`, `summary`, `tags`, `matched_prior_inc`, `findings` (dict[str, str]), `signal`. Do NOT add `findings_triage` or any other field — `extra="forbid"`. +1. Call `get_service_health()` to check current status. The framework injects `environment` from the session. +2. Call `check_deployment_history(hours=24)` for the last 24 hours. The framework injects `environment`; `hours` defaults to 24 when omitted. +3. Run the hypothesis loop above; call `lookup_similar_incidents(query)` inside the loop as evidence demands. +4. Set `severity` (one of: `low`, `medium`, `high`) and `category` (e.g., latency, availability, data, security, capacity) on the INC via `update_incident`. Include the accepted hypothesis and per-iteration trail as a JSON-encoded string under `findings.triage` — the typed `update_incident` patch only accepts these fields: `severity`, `category`, `summary`, `tags`, `matched_prior_inc`, `findings` (dict[str, str]), `signal`. Do NOT add `findings_triage` or any other field — `extra="forbid"`. 5. Emit `default` to hand off to the deep investigator. ## Guidelines -- `environment` vocabulary is exactly `dev` | `local` | `production` | `staging`. **Never** abbreviate (`prod`, `dev` → fine, but `staging` not `stg`), and **never** invent placeholders like `unknown`. Always pass the INC's existing `environment` field verbatim to every tool that takes an environment arg — the schema-boundary validator rejects anything else with a hard 422. - `severity` vocabulary is exactly `low` | `medium` | `high`. Do NOT emit `sev1`/`sev2`/`p1`/`critical` etc. — the system normalizes those, but emitting the canonical value upfront is preferred. - `high` = customer-impacting outage, data loss, security breach, or full availability hit. 
- `medium` = degraded service — elevated errors, slow but functioning, partial impact. @@ -33,3 +32,11 @@ Record the full iteration trail as a single JSON-encoded string under `findings. - Do not propose fixes — that's the resolution agent's job. - If the INC has `matched_prior_inc` set, treat the prior INC's `findings` and `resolution` as a **prior hypothesis**, not a fact. Same symptom (e.g., Redis OOM) can have different root causes across incidents — code bug vs. network partition vs. resource overload. Use the prior cause as a candidate to confirm or reject against current evidence; flag in your tags whether the parallel looks supported (`hypothesis:prior_match_supported`) or not (`hypothesis:prior_match_rejected`). - The hypothesis loop has a hard cap of 3 iterations. Do NOT exceed it; an unconverged hypothesis at the cap is acceptable — record it and let the deep investigator take over. + +## Output contract + +The framework wraps your reply in an `AgentTurnOutput` envelope (content, +confidence ∈ [0, 1], confidence_rationale, optional signal). The runner +enforces this structurally — answer truthfully and the envelope captures +your confidence and rationale. Do not mention "confidence" in your prose +unless it's part of substantive analysis (e.g. ranking hypotheses). diff --git a/pyproject.toml b/pyproject.toml index 6c47dfc..121d805 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -63,3 +63,16 @@ pythonpath = ["src", "."] [tool.ruff] line-length = 100 target-version = "py311" + +[tool.pyright] +# Phase 19 (HARD-03): the CI gate runs ``pyright src/runtime`` and now +# fails on any error. ``extraPaths = ["src"]`` lets pyright resolve the +# bare ``runtime.X`` imports the code uses (mirrors pytest's ``pythonpath`` +# in [tool.pytest.ini_options]). 
Mode is ``basic`` because the project's +# typing surface is BaseModel-heavy with langchain/langgraph stubs that +# are partial; we treat genuine bugs as errors and tag stub gaps with +# per-line ``# pyright: ignore[] -- `` comments. +include = ["src"] +extraPaths = ["src"] +pythonVersion = "3.11" +typeCheckingMode = "basic" diff --git a/scripts/build_single_file.py b/scripts/build_single_file.py index a4b7293..00fe68c 100644 --- a/scripts/build_single_file.py +++ b/scripts/build_single_file.py @@ -51,6 +51,16 @@ # are included only in the incident-management app bundle (not in the # runtime-only bundle). RUNTIME_MODULE_ORDER: list[tuple[Path, str]] = [ + # Phase 13 (HARD-01/HARD-05): typed runtime errors. Leaf module + # (no runtime.* imports). MUST precede config.py because + # config.py imports LLMConfigError for the ProviderConfig + # @model_validator (D-13-05/06). + (RUNTIME_ROOT, "errors.py"), + # Phase 16 (BUNDLER-01): generic terminal-tool registry types + # (StatusDef, TerminalToolRule). Imported at the top of config.py + # (line 10), so MUST precede config.py — otherwise the bundled + # config.py raises NameError at module-execution time. + (RUNTIME_ROOT, "terminal_tools.py"), (RUNTIME_ROOT, "config.py"), (RUNTIME_ROOT, "state.py"), (RUNTIME_ROOT, "state_resolver.py"), @@ -63,6 +73,14 @@ (RUNTIME_ROOT, "storage/vector.py"), (RUNTIME_ROOT, "storage/history_store.py"), (RUNTIME_ROOT, "storage/session_store.py"), + # Phase 16 (BUNDLER-01): event-log + idempotent migrations. Both + # depend only on storage/models.py (already above). event_log is + # required by orchestrator.py's status finalizer; migrations is + # invoked at startup (storage/__init__.py wires it but __init__ + # files aren't bundled, so the orchestrator path is the surviving + # caller). 
+ (RUNTIME_ROOT, "storage/event_log.py"), + (RUNTIME_ROOT, "storage/migrations.py"), # NOTE: the per-tool mcp_server modules # (observability/remediation/user_context) were relocated under # ``examples/incident_management/mcp_servers/`` in Phase 7 @@ -73,6 +91,43 @@ # consequently boots without any incident-vocabulary MCP servers # (its ``orchestrator.mcp_servers`` list is empty). (RUNTIME_ROOT, "mcp_loader.py"), + # Phase 16 (BUNDLER-01): long-lived OrchestratorService — the + # Streamlit UI's `from app import OrchestratorService` import is + # the headline ImportError this phase fixes. Depends only on + # config.py and mcp_loader.py (both above). Lazy-imports + # tools.approval_watchdog at start-up (added below). + (RUNTIME_ROOT, "service.py"), + # Phase 10 (FOC-03): AgentTurnOutput envelope + EnvelopeMissingError. + # Phase 12 (FOC-05) bundles policy.py with a module-level reference + # to EnvelopeMissingError in _PERMANENT_TYPES, so turn_output MUST + # precede policy.py in the bundle. (Pre-Phase-12 dists referenced + # EnvelopeMissingError only inside function bodies, where the strip- + # plus-rebuild order didn't surface a NameError at import time.) + (RUNTIME_ROOT, "agents/turn_output.py"), + # Phase 16 (BUNDLER-01): risk-rated tool gateway. Imported at + # module level by policy.py, graph.py, agents/responsive.py — so + # gateway.py MUST precede policy.py. Depends only on config.py + + # state.py (both already above). arg_injection is its sibling and + # is lazy-imported from gateway / orchestrator / graph. + (RUNTIME_ROOT, "tools/gateway.py"), + (RUNTIME_ROOT, "tools/arg_injection.py"), + # Phase 16 (BUNDLER-01): pending-approval timeout watchdog, + # lazy-imported by service.py:189. Bundled here (after gateway, so + # gateway-related approval state is in scope) but before any module + # that might trigger the lazy import path. + (RUNTIME_ROOT, "tools/approval_watchdog.py"), + # Phase 11 (FOC-04): pure-policy HITL gating boundary. 
Imported by + # tools.gateway, which graph.py uses -- so policy.py must precede + # graph.py in the bundle. + (RUNTIME_ROOT, "policy.py"), + # Phase 16 (BUNDLER-01): agent-kind node builders, used by graph.py + # at construction time. Each depends on skill.py + state.py (both + # already above) and on gateway.py / turn_output.py / session_store.py + # for responsive. Bundled BEFORE graph.py so the symbols are in + # module scope when graph.py's body executes. + (RUNTIME_ROOT, "agents/responsive.py"), + (RUNTIME_ROOT, "agents/supervisor.py"), + (RUNTIME_ROOT, "agents/monitor.py"), (RUNTIME_ROOT, "graph.py"), (RUNTIME_ROOT, "checkpointer_postgres.py"), (RUNTIME_ROOT, "checkpointer.py"), @@ -110,6 +165,13 @@ # Per-session task-reentrant asyncio locks + SessionBusy exception. # Must precede orchestrator.py which instantiates SessionLockRegistry. (RUNTIME_ROOT, "locks.py"), + # Phase 16 (BUNDLER-01): load-time skill validator + checkpoint GC. + # Both lazy-imported from orchestrator.py (lines 447, 472). Bundled + # before orchestrator.py so the lazy import resolves to in-bundle + # symbols rather than failing with ModuleNotFoundError after the + # intra-import stripper removes the original `from runtime.X` line. + (RUNTIME_ROOT, "skill_validator.py"), + (RUNTIME_ROOT, "storage/checkpoint_gc.py"), (RUNTIME_ROOT, "orchestrator.py"), (RUNTIME_ROOT, "api.py"), # Retraction routes are a side-car router so they don't bloat @@ -195,9 +257,24 @@ def _read(path: Path) -> str: return path.read_text() +# Phase 16 (BUNDLER-01): after stripping intra-imports, ``if TYPE_CHECKING:`` +# blocks whose only body line was a ``from runtime.X import Y`` end up as a +# naked ``if`` with no suite — IndentationError at module load. Neutralize +# any orphaned ``if TYPE_CHECKING:`` (followed by blank lines and then a +# dedented top-level statement) by giving it a ``pass`` body. 
We only target +# top-level ``if TYPE_CHECKING:`` (no leading whitespace) because nested +# guards are rare in this codebase and a wider rewrite risks corrupting +# function-body conditionals. +_ORPHANED_TYPE_CHECKING_RE = re.compile( + r"^if\s+TYPE_CHECKING\s*:\s*\n(\s*\n)*(?=\S)", + re.MULTILINE, +) + + def _strip_intra_imports(src: str) -> str: src = INTRA_IMPORT_RE.sub("", src) src = INTRA_IMPORT_NAME_RE.sub("", src) + src = _ORPHANED_TYPE_CHECKING_RE.sub("if TYPE_CHECKING:\n pass\n", src) return src diff --git a/scripts/lint_skill_prompts.py b/scripts/lint_skill_prompts.py new file mode 100644 index 0000000..66f8a3c --- /dev/null +++ b/scripts/lint_skill_prompts.py @@ -0,0 +1,396 @@ +#!/usr/bin/env python3 +"""Skill-Prompt-vs-Schema linter (Phase 21 / SKILL-LINTER-01). + +Walks every skill prompt under ``examples/*/skills/*/system.md``, extracts +references to MCP tools (and the field names mentioned for each tool), and +asserts that every referenced tool exists in the canonical inventory and +every field name is on the tool's signature (or — for ``update_incident``- +style nested-patch tools — on the typed pydantic patch model that gates the +patch keys). + +Catches LLM-emit-vs-schema drift that has bitten this codebase before: + +* **typos**: ``findings_triage`` vs ``findings.triage`` (a ``dict[str, str]`` + with key = agent name). +* **hallucinated session-injected fields**: ``incident_id`` flagged when + Phase 9's strip should have made it invisible to the LLM. +* **unknown tool names**: drift between prompt instructions and the tools + actually wired into ``config.yaml``. + +Discovery model +--------------- + +Tools are discovered statically via ``ast`` walks (no FastMCP boot needed, +no I/O). The script enumerates: + +* Every ``async def`` / ``def`` at module top-level under + ``examples/*/mcp_server.py`` and ``examples/*/mcp_servers/*.py``. 
+* Every method on the FastMCP server class registered through + ``self.mcp.tool(name="")(self._tool_)`` — bare method args + (``self``, ``cls``) are excluded; the real arg list is harvested from the + ``async def _tool_`` signature. + +For nested-patch tools — currently just ``update_incident(incident_id, +patch)`` — the script also collects the field set declared by the typed +pydantic ``UpdateIncidentPatch`` model (``model_fields`` keys) and uses that +as the valid ``patch.X`` and ``findings.X`` field set. + +Prompt reference extraction +--------------------------- + +Three regex passes per prompt file: + +1. **Backtick tool calls**: ``` `tool_name(arg1, arg2, ...)` ``` — captures + tool name + arg-name list. +2. **Bare backtick references**: ``` `tool_name` ``` — captures tool name + only (no arg validation needed). +3. **Patch field references**: ``` `findings_` ``` and ``` `patch.` ``` + — captures field references against the ``UpdateIncidentPatch`` model. + +Lines containing ``# lint-ignore: `` (or markdown-style +````) at end-of-line are skipped. Use sparingly, +with a one-sentence rationale. + +Exit codes +---------- + +* ``0`` — every reference resolved. +* ``1`` — at least one violation. Each printed as a GitHub-actions ``::error`` + line so the CI summary surfaces it. + +Phase: 21-01. Requirement: SKILL-LINTER-01. +""" +from __future__ import annotations + +import ast +import re +import sys +from collections.abc import Iterable +from pathlib import Path + +# --------------------------------------------------------------------------- +# Constants +# --------------------------------------------------------------------------- + +# Args that the framework injects from session state at the gateway boundary +# (Phase 9 / D-09-01). They appear in tool function signatures but are +# stripped from the LLM-visible ``args_schema``. 
Prompt references to them +# are ALLOWED — prose may name the field even if the LLM cannot pass it — +# but they must not be "hallucinated" (i.e., listed as something the LLM +# itself supplies). The linter accepts them either way; the harder +# Phase-9-strip enforcement lives in the runtime tests, not here. +SESSION_INJECTED = frozenset({"session_id", "incident_id", "environment"}) + +# Tools whose ``patch`` argument is a typed pydantic model. Entries map a +# tool name to (module path, model class name) for AST-based field discovery. +PATCH_MODELS: dict[str, tuple[str, str]] = { + "update_incident": ( + "examples/incident_management/mcp_server.py", + "UpdateIncidentPatch", + ), +} + +# Default scan roots, relative to repo root. Override with --root for tests. +EXAMPLES_ROOT = Path("examples") + +# Tool-call backtick patterns. We accept both ``inline tool_name(args)`` and +# bare-name forms. The regex tolerates whitespace and trailing kwargs/equals. +TOOL_CALL_RE = re.compile( + r"`([A-Za-z_][A-Za-z0-9_]*)\s*\(([^`)]*)\)`" +) +BARE_TOOL_RE = re.compile(r"`([A-Za-z_][A-Za-z0-9_]*)`") +# Patch-field references. Two shapes seen in this codebase: +# `findings.` — typed dict[str,str], any string key OK (skip) +# `findings_` — DEPRECATED underscore form; UpdateIncidentPatch +# forbids it (extra="forbid"). Catch as a violation. +LEGACY_FINDINGS_RE = re.compile(r"`(findings_[A-Za-z][A-Za-z0-9_]*)`") +# Lint-ignore directives. +LINT_IGNORE_RE = re.compile(r"#\s*lint-ignore\b|`` must not flag.""" + tools = textwrap.dedent(""" + class DemoServer: + def __init__(self): + self.mcp.tool(name="update_incident")(self._tool_update_incident) + + async def _tool_update_incident(self, incident_id, patch): + ... + """) + patch_model = textwrap.dedent(""" + class UpdateIncidentPatch: + findings: dict | None = None + """) + prompt = "Do NOT pass `findings_triage` to update_incident. 
" + _build_example_tree( + tmp_path, tools_module=tools, prompt=prompt, patch_model=patch_model, + ) + original = linter.PATCH_MODELS.copy() + try: + linter.PATCH_MODELS["update_incident"] = ( + "examples/demo_app/mcp_server.py", "UpdateIncidentPatch", + ) + schemas = linter.discover_tools(tmp_path / "examples") + patch_fields = linter.discover_patch_fields(tmp_path) + prompt_path = tmp_path / "examples" / "demo_app" / "skills" / "x" / "system.md" + violations = linter.lint_prompt(prompt_path, schemas, patch_fields) + finally: + linter.PATCH_MODELS.clear() + linter.PATCH_MODELS.update(original) + assert violations == [], f"lint-ignore should suppress the violation: {violations}" + + +def test_linter_skips_session_injected_args(linter, tmp_path: Path): + """Phase 9 session-injected args (``incident_id``, ``environment``, + ``session_id``) must not be flagged when prose names them — the LLM + can't pass them but the prompt may legitimately reference them by name.""" + tools = textwrap.dedent(""" + class DemoServer: + def __init__(self): + self.mcp.tool(name="get_logs")(self._tool_get_logs) + + async def _tool_get_logs(self, service, environment, minutes): + ... + """) + prompt = "Call `get_logs(service, environment, minutes=15)`. The framework injects environment." + _build_example_tree(tmp_path, tools_module=tools, prompt=prompt) + schemas = linter.discover_tools(tmp_path / "examples") + patch_fields = linter.discover_patch_fields(tmp_path) + prompt_path = tmp_path / "examples" / "demo_app" / "skills" / "x" / "system.md" + violations = linter.lint_prompt(prompt_path, schemas, patch_fields) + # All three args (service, environment, minutes) are on the signature + # OR in the SESSION_INJECTED set — none should produce a violation. 
+ assert violations == [], ( + f"session-injected + on-signature args should pass: {violations}" + ) + + +def test_linter_handles_malformed_call_blocks(linter, tmp_path: Path): + """Malformed inline calls must be tolerated — no crash, no false hits.""" + tools = textwrap.dedent(""" + class DemoServer: + def __init__(self): + self.mcp.tool(name="get_logs")(self._tool_get_logs) + + async def _tool_get_logs(self, service, environment, minutes): + ... + """) + prompt = textwrap.dedent(""" + These should NOT crash the linter: + + - Empty call: `get_logs()` + - Trailing comma: `get_logs(service,)` + - Stray text: `get_logs(some prose with spaces and ,, double commas)` + - Not a tool call: `range(10)` is fine. + """) + _build_example_tree(tmp_path, tools_module=tools, prompt=prompt) + schemas = linter.discover_tools(tmp_path / "examples") + patch_fields = linter.discover_patch_fields(tmp_path) + prompt_path = tmp_path / "examples" / "demo_app" / "skills" / "x" / "system.md" + # Should not raise. + violations = linter.lint_prompt(prompt_path, schemas, patch_fields) + # ``range`` isn't a discovered tool so it's silently skipped. 
+ assert not any("range" in v for v in violations), violations + + +def test_linter_main_entrypoint_exits_zero_on_clean_tree(linter): + """Exercises ``main()`` end-to-end — what CI invokes.""" + rc = linter.main( + [ + "--examples-root", str(REPO_ROOT / "examples"), + "--repo-root", str(REPO_ROOT), + "--quiet", + ] + ) + assert rc == 0, "linter must exit 0 on the live tree (CI gate guarantee)" diff --git a/tests/test_storage_embeddings.py b/tests/test_storage_embeddings.py index da74328..544771c 100644 --- a/tests/test_storage_embeddings.py +++ b/tests/test_storage_embeddings.py @@ -43,7 +43,10 @@ def test_build_embedder_unknown_kind_raises(): from runtime.config import EmbeddingConfig, ProviderConfig from runtime.storage.embeddings import build_embedder cfg = EmbeddingConfig(provider="x", model="m") - bad = ProviderConfig(kind="ollama") + # Phase 13 (HARD-05): ollama now requires base_url at config-load, + # so seed from a no-required-field kind (stub) and mutate to "nonsense" + # to exercise the unknown-kind dispatch path. + bad = ProviderConfig(kind="stub") bad.kind = "nonsense" # bypass pydantic for the test with pytest.raises(ValueError, match="unknown provider kind"): build_embedder(cfg, {"x": bad}) diff --git a/tests/test_turn_output_envelope.py b/tests/test_turn_output_envelope.py new file mode 100644 index 0000000..71737bf --- /dev/null +++ b/tests/test_turn_output_envelope.py @@ -0,0 +1,286 @@ +"""Phase 10 (FOC-03) — AgentTurnOutput envelope tests. + +Coverage matrix: +- Schema validation (10 tests): missing/out-of-range/extra-field/empty rejections. +- Reconciliation (4 tests): match/mismatch/no-tool-arg/at-tolerance-boundary. +- Parser fallback (3 tests): structured_response → AIMessage JSON → EnvelopeMissingError. +- All-six-agent-kinds emit envelope (1 parametrized = 6 cases) covering + intake, triage, deep_investigator, resolution, supervisor, monitor. 
+ +Reconciliation log shape (D-10-03 verbatim): + INFO runtime.orchestrator: turn.confidence_mismatch agent={a} turn_value={e:.2f} tool_value={t:.2f} tool={tn} session_id={sid} +""" +from __future__ import annotations + +import json +import logging + +import pytest +from langchain_core.messages import AIMessage +from pydantic import ValidationError + +from runtime.agents.turn_output import ( + AgentTurnOutput, + EnvelopeMissingError, + parse_envelope_from_result, + reconcile_confidence, +) + + +# --------------------------------------------------------------------------- +# 1) Schema validation +# --------------------------------------------------------------------------- + + +class TestAgentTurnOutputSchema: + def test_envelope_valid_minimum(self): + env = AgentTurnOutput( + content=".", + confidence=0.0, + confidence_rationale="x", + ) + assert env.confidence == 0.0 + assert env.signal is None + + def test_envelope_valid_maximum(self): + env = AgentTurnOutput( + content="x", + confidence=1.0, + confidence_rationale="x", + ) + assert env.confidence == 1.0 + + def test_envelope_missing_confidence_raises(self): + with pytest.raises(ValidationError) as exc: + AgentTurnOutput( + content="x", + confidence_rationale="x", + ) # type: ignore[call-arg] + assert "confidence" in str(exc.value) + + def test_envelope_missing_rationale_raises(self): + with pytest.raises(ValidationError) as exc: + AgentTurnOutput( + content="x", + confidence=0.5, + ) # type: ignore[call-arg] + assert "confidence_rationale" in str(exc.value) + + def test_envelope_missing_content_raises(self): + with pytest.raises(ValidationError) as exc: + AgentTurnOutput( + confidence=0.5, + confidence_rationale="x", + ) # type: ignore[call-arg] + assert "content" in str(exc.value) + + def test_envelope_extra_field_forbidden(self): + with pytest.raises(ValidationError) as exc: + AgentTurnOutput( + content="x", + confidence=0.5, + confidence_rationale="x", + foo="bar", + ) # type: ignore[call-arg] + assert "foo" 
in str(exc.value).lower() or "extra" in str(exc.value).lower() + + def test_envelope_negative_confidence_raises(self): + with pytest.raises(ValidationError): + AgentTurnOutput( + content="x", + confidence=-0.1, + confidence_rationale="x", + ) + + def test_envelope_above_one_confidence_raises(self): + with pytest.raises(ValidationError): + AgentTurnOutput( + content="x", + confidence=1.01, + confidence_rationale="x", + ) + + def test_envelope_empty_rationale_raises(self): + with pytest.raises(ValidationError): + AgentTurnOutput( + content="x", + confidence=0.5, + confidence_rationale="", + ) + + def test_envelope_signal_optional(self): + # None accepted + env = AgentTurnOutput( + content="x", confidence=0.5, confidence_rationale="x", signal=None + ) + assert env.signal is None + # "success" accepted (string-typed; routing layer validates downstream) + env2 = AgentTurnOutput( + content="x", + confidence=0.5, + confidence_rationale="x", + signal="success", + ) + assert env2.signal == "success" + # "bogus" accepted at the schema layer (routing validates separately) + env3 = AgentTurnOutput( + content="x", + confidence=0.5, + confidence_rationale="x", + signal="bogus", + ) + assert env3.signal == "bogus" + + +# --------------------------------------------------------------------------- +# 2) Reconciliation +# --------------------------------------------------------------------------- + + +class TestReconcileConfidence: + def test_reconcile_match_silent(self, caplog): + caplog.set_level(logging.INFO, logger="runtime.orchestrator") + out = reconcile_confidence( + envelope_value=0.83, + tool_arg_value=0.85, + agent="deep_investigator", + session_id="INC-001", + tool_name="submit_hypothesis", + ) + assert out == 0.85 # tool-arg wins on the return value (D-10-03) + # within tolerance → silent + mismatch_logs = [ + r + for r in caplog.records + if "turn.confidence_mismatch" in r.getMessage() + ] + assert mismatch_logs == [], ( + f"expected silent on match within tolerance; 
got {[r.getMessage() for r in mismatch_logs]}" + ) + + def test_reconcile_mismatch_logs_and_tool_wins(self, caplog): + caplog.set_level(logging.INFO, logger="runtime.orchestrator") + out = reconcile_confidence( + envelope_value=0.50, + tool_arg_value=0.90, + agent="deep_investigator", + session_id="INC-002", + tool_name="submit_hypothesis", + ) + assert out == 0.90 # tool-arg wins + # Find the mismatch log + mismatch = [ + r.getMessage() + for r in caplog.records + if "turn.confidence_mismatch" in r.getMessage() + ] + assert len(mismatch) == 1 + msg = mismatch[0] + assert "agent=deep_investigator" in msg + assert "turn_value=0.50" in msg + assert "tool_value=0.90" in msg + assert "tool=submit_hypothesis" in msg + assert "session_id=INC-002" in msg + + def test_reconcile_no_tool_arg_returns_envelope(self, caplog): + caplog.set_level(logging.INFO, logger="runtime.orchestrator") + out = reconcile_confidence( + envelope_value=0.66, + tool_arg_value=None, + agent="triage", + session_id="INC-003", + tool_name=None, + ) + assert out == 0.66 + mismatch = [ + r for r in caplog.records if "turn.confidence_mismatch" in r.getMessage() + ] + assert mismatch == [] + + def test_reconcile_at_tolerance_boundary_silent(self, caplog): + # |0.85 - 0.80| == 0.05 exactly → boundary inclusive → silent + caplog.set_level(logging.INFO, logger="runtime.orchestrator") + out = reconcile_confidence( + envelope_value=0.80, + tool_arg_value=0.85, + agent="deep_investigator", + session_id="INC-004", + tool_name="submit_hypothesis", + ) + assert out == 0.85 + mismatch = [ + r for r in caplog.records if "turn.confidence_mismatch" in r.getMessage() + ] + assert mismatch == [], "boundary 0.05 must be inclusive (no log)" + + +# --------------------------------------------------------------------------- +# 3) Parser fallback (3-step) +# --------------------------------------------------------------------------- + + +class TestParseEnvelopeFromResult: + def 
test_parse_envelope_from_structured_response(self): + env = AgentTurnOutput( + content="hello", + confidence=0.9, + confidence_rationale="r", + signal=None, + ) + result = {"messages": [AIMessage(content="ignored")], "structured_response": env} + parsed = parse_envelope_from_result(result, agent="triage") + assert parsed is env + + def test_parse_envelope_from_last_aimessage_json(self): + # No structured_response key — fall back to JSON-parse last AIMessage + payload = { + "content": "from-json", + "confidence": 0.7, + "confidence_rationale": "json fallback", + "signal": "success", + } + result = {"messages": [AIMessage(content=json.dumps(payload))]} + parsed = parse_envelope_from_result(result, agent="intake") + assert parsed.content == "from-json" + assert parsed.confidence == 0.7 + assert parsed.signal == "success" + + def test_parse_envelope_missing_raises_envelope_missing_error(self): + # No structured_response, AIMessage content is not JSON + result = {"messages": [AIMessage(content="just plain text, no JSON here")]} + with pytest.raises(EnvelopeMissingError) as excinfo: + parse_envelope_from_result(result, agent="supervisor") + assert excinfo.value.agent == "supervisor" + assert excinfo.value.field # non-empty + + +# --------------------------------------------------------------------------- +# 4) All six agent kinds emit envelope +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize( + "agent_kind", + [ + "intake", + "triage", + "deep_investigator", + "resolution", + "supervisor", + "monitor", + ], +) +def test_all_six_agent_kinds_emit_envelope(agent_kind): + """Each agent kind, when handed a structured_response, parses it back.""" + from tests._envelope_helpers import envelope_stub + + result = envelope_stub( + content=f"{agent_kind} ran", + confidence=0.82, + rationale=f"{agent_kind} stub rationale", + signal=None, + ) + env = parse_envelope_from_result(result, agent=agent_kind) + assert 
env.confidence == 0.82 + assert env.confidence_rationale == f"{agent_kind} stub rationale" + assert env.content == f"{agent_kind} ran" diff --git a/tests/test_ui_approval_paths.py b/tests/test_ui_approval_paths.py new file mode 100644 index 0000000..99fed11 --- /dev/null +++ b/tests/test_ui_approval_paths.py @@ -0,0 +1,187 @@ +"""Phase 20 (HARD-09): UI tests for the P4 approval submission paths. + +These are the load-bearing HITL surfaces in ``runtime.ui`` — when the +framework's pure-policy gate paused a tool call, the operator's only +way to unstick the session is via the Approve / Reject buttons rendered +by ``_render_pending_approvals_block`` (which delegates to +``_submit_approval_via_service``). + +Approach: pure-helper tests + ``streamlit.testing.v1.AppTest`` driver +for end-to-end render flows. Mock-fixture for ``_get_service`` / +``load_config`` so we never bring up the real OrchestratorService. +""" +from __future__ import annotations + +from unittest.mock import MagicMock, patch + + +# --------------------------------------------------------------------------- +# Pure helpers +# --------------------------------------------------------------------------- + + +def test_should_render_retry_block_skips_when_pending_approval_present() -> None: + """If a tool call is paused for HITL approval, the retry block must + NOT render — the pending-approvals block owns the action surface + instead. Mutual-exclusion invariant from D-11-04. 
+ """ + from runtime.ui import _should_render_retry_block + + sess = { + "status": "error", + "tool_calls": [ + {"agent": "investigator", "tool": "remediate", + "status": "pending_approval"}, + ], + } + assert _should_render_retry_block(sess) is False + + +def test_should_render_retry_block_fires_for_terminal_error_without_approval() -> None: + """Plain terminal error (no pending_approval row) → retry block renders.""" + from runtime.ui import _should_render_retry_block + + sess = { + "status": "error", + "tool_calls": [ + {"agent": "investigator", "tool": "search_logs", + "status": "completed"}, + ], + } + assert _should_render_retry_block(sess) is True + + +def test_should_render_retry_block_skips_non_error_status() -> None: + from runtime.ui import _should_render_retry_block + + for status in ("in_progress", "resolved", "awaiting_input", "matched"): + assert _should_render_retry_block({"status": status}) is False + + +def test_should_render_retry_block_tolerates_pydantic_objects() -> None: + """Defensive: live ``Session.tool_calls`` returns pydantic objects, not + dicts. The predicate must read ``.status`` via getattr in that case + (D-11-04 callout).""" + from runtime.ui import _should_render_retry_block + + class _FakeToolCall: + status = "pending_approval" + + sess = {"status": "error", "tool_calls": [_FakeToolCall()]} + assert _should_render_retry_block(sess) is False + + +# --------------------------------------------------------------------------- +# _submit_approval_via_service — error path + happy path with stubs +# --------------------------------------------------------------------------- + + +def test_submit_approval_emits_st_error_when_service_unavailable() -> None: + """When the service singleton is None (e.g. headless rerun), + the helper must surface ``st.error`` and return — never crash. 
+ """ + from runtime import ui as ui_mod + + fake_st = MagicMock() + fake_cfg = MagicMock() + + with patch.object(ui_mod, "_get_service", return_value=None), \ + patch.object(ui_mod, "st", fake_st): + ui_mod._submit_approval_via_service( + fake_cfg, "INC-1", "0", + decision="approve", approver="ui-user", rationale=None, + ) + + fake_st.error.assert_called_once() + msg = fake_st.error.call_args.args[0] + assert "service" in msg.lower() or "refresh" in msg.lower() + + +def test_submit_approval_drives_service_with_correct_payload() -> None: + """Happy path: build the expected ``Command(resume=...)`` payload and + drive ``svc.submit_and_wait`` with it. The test patches the service + so we never touch a real orchestrator. + """ + from runtime import ui as ui_mod + + captured_awaitables: list = [] + + def _capture(awaitable, timeout=None): + # Close the coroutine so we don't get the "never awaited" warning; + # we're verifying the call shape, not the actual resume flow. + captured_awaitables.append((awaitable, timeout)) + if hasattr(awaitable, "close"): + awaitable.close() + + fake_svc = MagicMock() + fake_svc.submit_and_wait = MagicMock(side_effect=_capture) + fake_cfg = MagicMock() + fake_st = MagicMock() + + with patch.object(ui_mod, "_get_service", return_value=fake_svc), \ + patch.object(ui_mod, "st", fake_st): + ui_mod._submit_approval_via_service( + fake_cfg, "INC-42", "3", + decision="reject", + approver="ui-user", + rationale="risk too high", + ) + + # submit_and_wait called exactly once with the contract's 60-second + # timeout (matches HITL bridge in OrchestratorService). 
+ assert fake_svc.submit_and_wait.call_count == 1 + assert len(captured_awaitables) == 1 + assert captured_awaitables[0][1] == 60.0 + + +# --------------------------------------------------------------------------- +# _render_pending_approvals_block — empty / present cases via AppTest +# --------------------------------------------------------------------------- + + +def test_render_pending_approvals_block_renders_nothing_when_no_pending() -> None: + """No pending_approval rows → block is a no-op (returns before + ``st.markdown('### Pending Approvals')``). This protects the detail + pane from rendering a phantom header on resolved sessions. + """ + from streamlit.testing.v1 import AppTest + + at = AppTest.from_string(""" +from unittest.mock import patch, MagicMock +from runtime.ui import _render_pending_approvals_block +sess = {"tool_calls": [{"agent": "x", "tool": "y", "status": "completed"}]} +with patch("runtime.ui.load_config", return_value=MagicMock()): + _render_pending_approvals_block(sess, "INC-test") +""") + at.run(timeout=10) + assert not at.exception + # No '### Pending Approvals' header should be in the rendered markdown. 
+ md_blobs = [m.value for m in at.markdown] + assert not any("Pending Approvals" in m for m in md_blobs) + + +def test_render_pending_approvals_block_renders_card_for_pending_row() -> None: + """One pending_approval row → header + card with tool name and Approve/Reject buttons.""" + from streamlit.testing.v1 import AppTest + + at = AppTest.from_string(""" +from unittest.mock import patch, MagicMock +from runtime.ui import _render_pending_approvals_block +sess = {"tool_calls": [ + {"agent": "investigator", "tool": "remediate", + "status": "pending_approval", "args": {"target": "host-1"}}, +]} +with patch("runtime.ui.load_config", return_value=MagicMock()): + _render_pending_approvals_block(sess, "INC-test") +""") + at.run(timeout=10) + assert not at.exception + md_blobs = [m.value for m in at.markdown] + # Header rendered + assert any("Pending Approvals" in m for m in md_blobs) + # Tool reference visible (header markdown carries agent/tool names) + assert any("investigator" in m and "remediate" in m for m in md_blobs) + # Buttons present with the unique session-scoped keys + button_keys = {b.key for b in at.button if b.key} + assert "approval_approve_INC-test_0" in button_keys + assert "approval_reject_INC-test_0" in button_keys diff --git a/tests/test_ui_error_rendering.py b/tests/test_ui_error_rendering.py new file mode 100644 index 0000000..5b35d44 --- /dev/null +++ b/tests/test_ui_error_rendering.py @@ -0,0 +1,160 @@ +"""Phase 20 (HARD-09): UI tests for error / display formatting. + +Targets: + * ``_parse_iso`` — defensive ISO parser + * ``_duration_seconds`` — duration math with bad inputs + * ``_fmt_tokens`` / ``_fmt_tokens_short`` + * ``_fmt_duration`` — human-readable durations + * ``_fmt_confidence_badge``— confidence-tier glyph + label + +These are the value-formatting rails the entire detail pane runs +through. Pure functions; small but load-bearing. 
+""" +from __future__ import annotations + +import pytest + + +# --------------------------------------------------------------------------- +# _parse_iso +# --------------------------------------------------------------------------- + + +def test_parse_iso_returns_datetime_for_valid_z_suffix() -> None: + from runtime.ui import _parse_iso + out = _parse_iso("2026-05-07T10:30:45Z") + assert out is not None + assert (out.year, out.month, out.day, out.hour, out.minute) == ( + 2026, 5, 7, 10, 30, + ) + + +@pytest.mark.parametrize("bad", [ + "", None, "not-a-date", "2026-13-99", "2026-05-07 10:30:45", +]) +def test_parse_iso_returns_none_for_garbage(bad) -> None: + from runtime.ui import _parse_iso + assert _parse_iso(bad) is None + + +# --------------------------------------------------------------------------- +# _duration_seconds +# --------------------------------------------------------------------------- + + +def test_duration_seconds_simple_minute() -> None: + from runtime.ui import _duration_seconds + out = _duration_seconds("2026-05-07T10:00:00Z", "2026-05-07T10:01:00Z") + assert out == 60 + + +def test_duration_seconds_returns_zero_when_either_side_unparseable() -> None: + from runtime.ui import _duration_seconds + assert _duration_seconds("", "2026-05-07T10:00:00Z") == 0 + assert _duration_seconds("2026-05-07T10:00:00Z", "garbage") == 0 + assert _duration_seconds("garbage", "garbage") == 0 + + +def test_duration_seconds_clamps_negative_to_zero() -> None: + """End before start (clock skew) → 0, never a negative duration.""" + from runtime.ui import _duration_seconds + out = _duration_seconds("2026-05-07T10:01:00Z", "2026-05-07T10:00:00Z") + assert out == 0 + + +# --------------------------------------------------------------------------- +# _fmt_tokens / _fmt_tokens_short +# --------------------------------------------------------------------------- + + +def test_fmt_tokens_uses_thousands_separators() -> None: + from runtime.ui import _fmt_tokens + assert 
_fmt_tokens(0) == "0" + assert _fmt_tokens(999) == "999" + assert _fmt_tokens(12_345) == "12,345" + assert _fmt_tokens(1_234_567) == "1,234,567" + + +def test_fmt_tokens_short_compact_form() -> None: + from runtime.ui import _fmt_tokens_short + assert _fmt_tokens_short(0) == "0" + assert _fmt_tokens_short(999) == "999" + assert _fmt_tokens_short(1000) == "1.0k" + assert _fmt_tokens_short(12_345) == "12.3k" + + +# --------------------------------------------------------------------------- +# _fmt_duration +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize("seconds,expected", [ + (0, "0s"), + (42, "42s"), + (60, "1m 0s"), + (185, "3m 5s"), + (3600, "1h 0m"), + (3720, "1h 2m"), + (86_400, "1d 0h"), + (90_000, "1d 1h"), +]) +def test_fmt_duration_compacts_to_two_units(seconds: int, expected: str) -> None: + from runtime.ui import _fmt_duration + assert _fmt_duration(seconds) == expected + + +# --------------------------------------------------------------------------- +# _fmt_confidence_badge +# --------------------------------------------------------------------------- + + +def test_fmt_confidence_badge_none_renders_hard_error_marker() -> None: + """Phase 10 (FOC-03): a missing envelope ⇒ structural failure ⇒ + distinct red badge — never the silent ⚪ fallback. 
+ """ + from runtime.ui import _fmt_confidence_badge + out = _fmt_confidence_badge(None) + assert "missing" in out.lower() + # Sanity: not a green/amber glyph + assert "🟢" not in out + assert "🟡" not in out + + +def test_fmt_confidence_badge_high_is_green() -> None: + from runtime.ui import _fmt_confidence_badge + out = _fmt_confidence_badge(0.95) + assert "🟢" in out + assert "0.95" in out + + +def test_fmt_confidence_badge_amber_band() -> None: + """0.5 ≤ conf < 0.75 → amber/yellow.""" + from runtime.ui import _fmt_confidence_badge + assert "🟡" in _fmt_confidence_badge(0.5) + assert "🟡" in _fmt_confidence_badge(0.74) + + +def test_fmt_confidence_badge_low_is_red() -> None: + from runtime.ui import _fmt_confidence_badge + out = _fmt_confidence_badge(0.10) + assert "🔴" in out + assert "0.10" in out + + +# --------------------------------------------------------------------------- +# _is_hypothesis_list — defensive type guard +# --------------------------------------------------------------------------- + + +def test_is_hypothesis_list_recognises_cause_keyed_dicts() -> None: + from runtime.ui import _is_hypothesis_list + assert _is_hypothesis_list([{"cause": "deploy", "evidence": []}]) is True + + +def test_is_hypothesis_list_rejects_non_lists_and_wrong_shapes() -> None: + from runtime.ui import _is_hypothesis_list + assert _is_hypothesis_list(None) is False + assert _is_hypothesis_list([]) is False + assert _is_hypothesis_list("not a list") is False + assert _is_hypothesis_list([{"hypothesis": "no cause key"}]) is False + assert _is_hypothesis_list([1, 2, 3]) is False diff --git a/tests/test_ui_session_lifecycle.py b/tests/test_ui_session_lifecycle.py new file mode 100644 index 0000000..7636e0c --- /dev/null +++ b/tests/test_ui_session_lifecycle.py @@ -0,0 +1,152 @@ +"""Phase 20 (HARD-09): UI tests for session-lifecycle helpers. 
+ +Targets: + * ``_should_poll`` (auto-refresh predicate) + * ``_load_app_cfg`` (FrameworkAppConfig resolution: dotted-path vs YAML) + * ``_resolve_environments`` (YAML-driven vs legacy provider fallback) + * ``_get_service`` defensive return when no script-run context. + +These are the "lifecycle wiring" helpers — they decide what the +sidebar shows, whether the detail pane keeps polling, and which +config block the rest of the UI reads. Pure functions; no Streamlit +rendering required. +""" +from __future__ import annotations + +from unittest.mock import MagicMock, patch + +import pytest + + +# --------------------------------------------------------------------------- +# _should_poll +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize("status", ["running", "in_progress", "awaiting_input"]) +def test_should_poll_true_for_inflight_statuses(status: str) -> None: + from runtime.ui import _should_poll + assert _should_poll(status) is True + + +@pytest.mark.parametrize("status", [ + "resolved", "escalated", "matched", "stopped", "deleted", "error", + "needs_review", "new", +]) +def test_should_poll_false_for_terminal_statuses(status: str) -> None: + from runtime.ui import _should_poll + assert _should_poll(status) is False + + +@pytest.mark.parametrize("status", [None, "", " ", "totally_unknown"]) +def test_should_poll_false_for_unknown_or_missing(status) -> None: + """Unknown / falsy status → don't poll forever on bad data.""" + from runtime.ui import _should_poll + # Strip-empty is not falsy in Python (" " is truthy), but it's not + # in the poll set either, so the second branch returns False. 
+ assert _should_poll(status) is False + + +# --------------------------------------------------------------------------- +# _load_app_cfg — dotted-path provider vs framework block +# --------------------------------------------------------------------------- + + +def test_load_app_cfg_returns_framework_block_when_no_provider() -> None: + """Default path: read ``cfg.framework`` directly when no + ``framework_app_config_path`` provider is configured. + """ + from runtime.config import FrameworkAppConfig + from runtime.ui import _load_app_cfg + + fake_cfg = MagicMock() + fake_cfg.runtime.framework_app_config_path = None + expected = FrameworkAppConfig(confidence_threshold=0.91) + fake_cfg.framework = expected + + out = _load_app_cfg(fake_cfg) + assert out is expected + assert out.confidence_threshold == 0.91 + + +def test_load_app_cfg_uses_dotted_path_provider_when_configured() -> None: + """Legacy back-compat: when ``framework_app_config_path`` is set, + delegate to ``resolve_framework_app_config`` (no fall-through to + ``cfg.framework``). + """ + from runtime.config import FrameworkAppConfig + from runtime import ui as ui_mod + + fake_cfg = MagicMock() + fake_cfg.runtime.framework_app_config_path = "fake.module:provider" + + expected = FrameworkAppConfig(confidence_threshold=0.42) + with patch.object(ui_mod, "resolve_framework_app_config", + return_value=expected) as mock_resolve: + out = ui_mod._load_app_cfg(fake_cfg) + + assert out is expected + mock_resolve.assert_called_once_with("fake.module:provider") + + +# --------------------------------------------------------------------------- +# _resolve_environments — YAML-first, dotted-path fallback +# --------------------------------------------------------------------------- + + +def test_resolve_environments_prefers_yaml_block() -> None: + """When ``cfg.environments`` is non-empty, return a copy and ignore + the legacy provider path entirely. 
+ """ + from runtime.ui import _resolve_environments + + fake_cfg = MagicMock() + fake_cfg.environments = ["dev", "staging", "production"] + fake_cfg.runtime.environments_provider_path = "should.be.ignored:foo" + + out = _resolve_environments(fake_cfg) + assert out == ["dev", "staging", "production"] + # Caller can mutate without poisoning config — list is a fresh copy. + out.append("new") + assert fake_cfg.environments == ["dev", "staging", "production"] + + +def test_resolve_environments_returns_empty_when_no_provider_and_no_yaml() -> None: + from runtime.ui import _resolve_environments + + fake_cfg = MagicMock() + fake_cfg.environments = [] + fake_cfg.runtime.environments_provider_path = None + + assert _resolve_environments(fake_cfg) == [] + + +def test_resolve_environments_returns_empty_for_malformed_dotted_path() -> None: + """A provider string without ':' is a config bug — return empty + rather than blowing up the sidebar. + """ + from runtime.ui import _resolve_environments + + fake_cfg = MagicMock() + fake_cfg.environments = [] + fake_cfg.runtime.environments_provider_path = "no_colon_here" + + assert _resolve_environments(fake_cfg) == [] + + +# --------------------------------------------------------------------------- +# _get_service — headless return-None path +# --------------------------------------------------------------------------- + + +def test_get_service_returns_none_outside_script_context() -> None: + """When ``_cached_service`` raises (e.g. cache decorator complains + about missing script-run context), the wrapper must return ``None`` + so headless imports never crash. 
+ """ + from runtime import ui as ui_mod + + fake_cfg = MagicMock() + with patch.object(ui_mod, "_cached_service", + side_effect=RuntimeError("no script context")): + assert ui_mod._get_service(fake_cfg) is None diff --git a/tests/test_ui_step_display.py b/tests/test_ui_step_display.py new file mode 100644 index 0000000..5782805 --- /dev/null +++ b/tests/test_ui_step_display.py @@ -0,0 +1,269 @@ +"""Phase 20 (HARD-09): UI tests for the agent step / event display path. + +Targets: + * ``_format_event`` — streaming event → display line + * ``_summary_attribution`` — attribution line composition + * ``_field`` / ``_resolve_field`` — top-level vs extra_fields routing + * ``_badge_field_slots`` — UIConfig → badge slot pair + * ``_retry_button_state_for`` — RetryDecision.reason → button label/disabled + +Pure functions; no Streamlit runtime needed. +""" +from __future__ import annotations + +from runtime.config import ( + FrameworkAppConfig, + UIBadge, + UIConfig, + UIDetailField, +) + + +# --------------------------------------------------------------------------- +# _format_event — streaming events to one-liners +# --------------------------------------------------------------------------- + + +def test_format_event_investigation_started() -> None: + from runtime.ui import _format_event + line = _format_event({ + "event": "investigation_started", + "ts": "2026-05-07T10:00:00Z", + "incident_id": "INC-1", + }) + assert line is not None + assert "INC-1" in line + assert "start" in line + + +def test_format_event_investigation_completed() -> None: + from runtime.ui import _format_event + line = _format_event({ + "event": "investigation_completed", + "ts": "2026-05-07T10:01:00Z", + "incident_id": "INC-9", + }) + assert line is not None + assert "done" in line + assert "INC-9" in line + + +def test_format_event_chain_start_filtered_by_agent_names() -> None: + """``on_chain_start`` events for nodes NOT in the configured agent + set are suppressed (returns None) to keep the 
timeline focused. + """ + from runtime.ui import _format_event + + agents = frozenset({"triage", "investigator"}) + ev_visible = {"event": "on_chain_start", "node": "triage", "ts": "T"} + ev_hidden = {"event": "on_chain_start", "node": "internal_helper", "ts": "T"} + + assert _format_event(ev_visible, agents) is not None + assert "triage" in _format_event(ev_visible, agents) + assert _format_event(ev_hidden, agents) is None + + +def test_format_event_empty_agent_set_shows_all() -> None: + """Safe fallback — when agent_names is empty (caller didn't have + the list handy), every chain event is shown.""" + from runtime.ui import _format_event + line = _format_event( + {"event": "on_chain_end", "node": "anything", "ts": "T"}, + frozenset(), + ) + assert line is not None + assert "anything" in line + + +def test_format_event_tool_end_truncates_long_output() -> None: + """Tool-end snippets are clipped to 120 chars to keep the live + timeline readable when an MCP tool returns a giant payload.""" + from runtime.ui import _format_event + + huge = "x" * 500 + line = _format_event({ + "event": "on_tool_end", + "node": "search_logs", + "ts": "T", + "data": {"output": huge}, + }) + assert line is not None + # The clipped snippet must be at most 120 chars; raw 500-char output + # would inflate the line beyond that snippet length. 
+ snippet_part = line.split("search_logs", 1)[1] + assert len(snippet_part.strip()) <= 121 # 120 chars + leading space + + +def test_format_event_unknown_event_returns_none() -> None: + from runtime.ui import _format_event + assert _format_event({"event": "totally_made_up", "ts": "T"}) is None + + +# --------------------------------------------------------------------------- +# _summary_attribution — UIConfig-driven detail fields +# --------------------------------------------------------------------------- + + +def test_summary_attribution_returns_empty_when_no_summary_fields() -> None: + from runtime.ui import _summary_attribution + app_cfg = FrameworkAppConfig(ui=UIConfig(detail_fields=[])) + assert _summary_attribution({"id": "INC-1"}, app_cfg) == "" + + +def test_summary_attribution_builds_by_clause() -> None: + """First non-empty summary-section field becomes ``by <first>``; + subsequent ones render as ``(extra1, extra2)``. + """ + from runtime.ui import _summary_attribution + + app_cfg = FrameworkAppConfig(ui=UIConfig( + detail_fields=[ + UIDetailField(key="reporter.id", label="Reporter", section="summary"), + UIDetailField(key="reporter.team", label="Team", section="summary"), + UIDetailField(key="component", label="Component", section="meta"), + ], + )) + sess = { + "extra_fields": { + "reporter": {"id": "alice", "team": "platform"}, + "component": "billing", + }, + } + result = _summary_attribution(sess, app_cfg) + assert result.startswith("by alice") + assert "platform" in result + # 'meta'-section field must NOT appear + assert "billing" not in result + + +def test_summary_attribution_skips_empty_fields() -> None: + """Missing fields (resolved to "") drop out — no stray commas.""" + from runtime.ui import _summary_attribution + + app_cfg = FrameworkAppConfig(ui=UIConfig( + detail_fields=[ + UIDetailField(key="reporter.id", label="Reporter", section="summary"), + UIDetailField(key="missing.key", label="Missing", section="summary"), + ], + )) + sess = 
{"extra_fields": {"reporter": {"id": "bob"}}} + assert _summary_attribution(sess, app_cfg) == "by bob" + + +# --------------------------------------------------------------------------- +# _field / _resolve_field — top-level + extra_fields routing +# --------------------------------------------------------------------------- + + +def test_field_reads_top_level_first() -> None: + from runtime.ui import _field + assert _field({"summary": "top-level"}, "summary") == "top-level" + + +def test_field_falls_back_to_extra_fields() -> None: + from runtime.ui import _field + assert ( + _field({"extra_fields": {"summary": "from-extra"}}, "summary") + == "from-extra" + ) + + +def test_field_returns_default_when_missing() -> None: + from runtime.ui import _field + assert _field({}, "missing", default="—") == "—" + + +def test_field_coerces_non_string_to_str() -> None: + """Numeric / bool fields end up rendered into markdown — the helper + coerces so callers don't have to.""" + from runtime.ui import _field + assert _field({"count": 42}, "count") == "42" + + +def test_resolve_field_walks_dotted_path_into_extra_fields() -> None: + from runtime.ui import _resolve_field + sess = {"extra_fields": {"reporter": {"id": "alice"}}} + assert _resolve_field(sess, "reporter.id") == "alice" + + +def test_resolve_field_returns_empty_string_for_missing_path() -> None: + from runtime.ui import _resolve_field + sess = {"extra_fields": {"reporter": {"id": "alice"}}} + assert _resolve_field(sess, "reporter.team") == "" + assert _resolve_field(sess, "totally.absent.key") == "" + + +# --------------------------------------------------------------------------- +# _badge_field_slots +# --------------------------------------------------------------------------- + + +def test_badge_field_slots_picks_first_two_non_status_keys() -> None: + from runtime.ui import _badge_field_slots + app_cfg = FrameworkAppConfig(ui=UIConfig(badges={ + "status": {"open": UIBadge(label="OPEN", color="red")}, + "severity": 
{"sev1": UIBadge(label="SEV1", color="red")}, + "category": {"network": UIBadge(label="NETWORK", color="blue")}, + "third": {"x": UIBadge(label="X", color="gray")}, + })) + primary, secondary = _badge_field_slots(app_cfg) + assert primary == "severity" + assert secondary == "category" + + +def test_badge_field_slots_returns_blanks_when_only_status_configured() -> None: + from runtime.ui import _badge_field_slots + app_cfg = FrameworkAppConfig(ui=UIConfig(badges={ + "status": {"open": UIBadge(label="OPEN", color="red")}, + })) + primary, secondary = _badge_field_slots(app_cfg) + assert primary == "" + assert secondary == "" + + +# --------------------------------------------------------------------------- +# _retry_button_state_for — RetryDecision.reason → (label, disabled) +# --------------------------------------------------------------------------- + + +def test_retry_button_state_auto_retry_is_enabled() -> None: + from runtime.ui import _retry_button_state_for + label, disabled = _retry_button_state_for( + reason="auto_retry", retry_count=1, cap=3, + last_confidence=0.9, threshold=0.5, + ) + assert label == "Retry" + assert disabled is False + + +def test_retry_button_state_max_retries_disabled_with_count() -> None: + from runtime.ui import _retry_button_state_for + label, disabled = _retry_button_state_for( + reason="max_retries_exceeded", retry_count=3, cap=3, + last_confidence=0.9, threshold=0.5, + ) + assert disabled is True + assert "3/3" in label + + +def test_retry_button_state_low_confidence_renders_percentages() -> None: + from runtime.ui import _retry_button_state_for + label, disabled = _retry_button_state_for( + reason="low_confidence_no_retry", retry_count=0, cap=3, + last_confidence=0.32, threshold=0.75, + ) + assert disabled is True + assert "32%" in label + assert "75%" in label + + +def test_retry_button_state_unknown_reason_disabled_with_label() -> None: + """Future-proofing: a reason the UI doesn't recognise still renders + a disabled button 
rather than crashing.""" + from runtime.ui import _retry_button_state_for + label, disabled = _retry_button_state_for( + reason="some_future_reason", retry_count=0, cap=3, + last_confidence=None, threshold=0.5, + ) + assert disabled is True + assert "some_future_reason" in label