From 075ba1d61d589f7e297f45ac75b3ea15c52cf058 Mon Sep 17 00:00:00 2001 From: Pal Lakatos-Toth Date: Tue, 9 Jun 2026 11:01:41 +0200 Subject: [PATCH 01/62] =?UTF-8?q?demo(act2):=20S0=20=E2=80=94=20infra-tier?= =?UTF-8?q?=20ResourceQuota=20incident=20harness=20for=20Agent=20A?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PR #1 in the kars-sre/demo-and-agent series — Slice 0 of the SRE proposal: the demo can now be walked end-to-end by hand before any SRE plugin code lands. Each subsequent slice (S1 read-only tools, S2 K8s diag toolset, S3 typed apply-fix, S4 proactive watcher) replaces one hand-walked step with an autonomous one. Scenario: 'platform team's GitOps refactor lands a tight ResourceQuota across every workload namespace; the quota's requests.memory ceiling (50Mi) is lower than what the research sandbox actually requests. The pod stays Running until anything triggers a reschedule — then it goes Pending forever because the quota blocks pod admission.' Why infrastructure, not image-tag: image tags don't change on a running pod for random reasons. ResourceQuota mis-configuration is a real GitOps-collision incident that operators hit regularly. 
Files: agent-a-research.yaml — KarsSandbox 'research' (Hermes runtime, mirrors exec-brief-hermes- single shape, simplified to two CRs so the demo focuses on the runtime) platform-hardening-quota.yaml — the bad ResourceQuota the break script applies; deliberately NOT labeled kars.azure.com/managed-by so the SRE's DeleteResourceQuota typed action is permitted break.sh — applies the quota, force-deletes the running pod, confirms the FailedCreate event surfaces reset.sh — deletes the quota and waits for Running 2/2 (manual recovery path) runbook.md — presenter script for walking Act II by hand until S2 ships; once S2 ships, the runbook becomes the expected-behaviour spec for the autonomous agent walk Proposal update: §7.7.1 — adds DeleteResourceQuota as a typed action (namespace- scope, requires the ResourceQuota NOT carry the kars.azure.com/managed-by=controller label so kars-owned governance quotas stay protected and only operator-applied platform quotas are deletable) §7.7.1 — removes the PatchSandboxRuntimeImage carve-out from the previous draft; the demo no longer requires writes to kars.azure.com/* CRs, so the no-governance-mutation rule stays absolute Validation: python3 -c yaml.safe_load_all on both YAMLs — parses OK bash -n break.sh / reset.sh — syntax OK ci/check-copyright-headers.sh — all 499 OK Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- docs/blueprints/07-kars-sre-proposal.md | 1 + tools/demo/act2/agent-a-research.yaml | 72 ++++++++++++ tools/demo/act2/break.sh | 83 ++++++++++++++ tools/demo/act2/platform-hardening-quota.yaml | 43 +++++++ tools/demo/act2/reset.sh | 27 +++++ tools/demo/act2/runbook.md | 108 ++++++++++++++++++ 6 files changed, 334 insertions(+) create mode 100644 tools/demo/act2/agent-a-research.yaml create mode 100755 tools/demo/act2/break.sh create mode 100644 tools/demo/act2/platform-hardening-quota.yaml create mode 100755 tools/demo/act2/reset.sh create mode 100644 tools/demo/act2/runbook.md diff --git 
a/docs/blueprints/07-kars-sre-proposal.md b/docs/blueprints/07-kars-sre-proposal.md index 39998ead..f22c7ce3 100644 --- a/docs/blueprints/07-kars-sre-proposal.md +++ b/docs/blueprints/07-kars-sre-proposal.md @@ -545,6 +545,7 @@ in depth): | `RolloutRestart` | `{namespace, kind∈{Deployment,StatefulSet,DaemonSet}, name}` | namespace ∉ denylist | | `ScaleDeployment` | `{namespace, name, replicas ∈ [0, 50]}` | namespace ∉ denylist; replicas clamped | | `DeletePod` (= forced restart of one pod) | `{namespace, name}` | namespace ∉ denylist | +| `DeleteResourceQuota` | `{namespace, name}` | namespace ∉ denylist; ResourceQuota MUST NOT carry the label `kars.azure.com/managed-by=controller` (kars-owned governance quotas stay protected; operator-applied platform quotas are deletable) | | `PatchConfigMapKey` | `{namespace, name, key, value}` | name ∉ kars-controlled CMs (allowlist of OPERATOR-managed CMs only) | **Protected-resource denylist** (enforced at all three layers below): diff --git a/tools/demo/act2/agent-a-research.yaml b/tools/demo/act2/agent-a-research.yaml new file mode 100644 index 00000000..a2fb1652 --- /dev/null +++ b/tools/demo/act2/agent-a-research.yaml @@ -0,0 +1,72 @@ +# Agent A — the kars sandbox the showcase demo (Acts I + II) runs. +# +# Act I uses this sandbox to demonstrate the architecture in motion: +# a real Hermes agent doing a real piece of agentic work (researching +# a topic) inside the kars governance plane. +# +# Act II breaks this same sandbox via a Kubernetes-tier infra issue +# (tools/demo/act2/break.sh — applies a ResourceQuota that blocks +# pod scheduling in the kars-research namespace, then force-deletes +# the running pod). The kars-sre agent then diagnoses and proposes +# the fix. 
+# +# Shape mirrors tools/e2e-harness/scenarios/exec-brief-hermes-single +# but simplified to two CRs (InferencePolicy + KarsSandbox) so the +# demo focuses on the runtime, not the catalog of governance +# primitives (those are covered by tools/demo/scenarios/ Act I). +# +# Apply with: kubectl apply -f tools/demo/act2/agent-a-research.yaml +# Tear down: kubectl delete karssandbox research -n kars-system +--- +apiVersion: kars.azure.com/v1alpha1 +kind: InferencePolicy +metadata: + name: research-inference + namespace: kars-system + labels: + kars.azure.com/sandbox: research + app.kubernetes.io/part-of: kars-demo +spec: + appliesTo: + sandboxName: research + modelPreference: + primary: + provider: azure-openai + deployment: gpt-4.1 + contentSafety: + requirePromptShields: true + tokenBudget: + perRequestTokens: 32000 +--- +apiVersion: kars.azure.com/v1alpha1 +kind: KarsSandbox +metadata: + name: research + namespace: kars-system + labels: + kars.azure.com/channels: none + app.kubernetes.io/part-of: kars-demo +spec: + runtime: + kind: Hermes + hermes: + # Use the image's baked-in Hermes version (don't pin) so this + # demo manifest doesn't drift against runtime image bumps. + + sandbox: + isolation: standard + + inferenceRef: + name: research-inference + + governance: + enabled: true + registryMode: local + trustThreshold: 0 + + networkPolicy: + defaultDeny: true + # Egress allowed by default for the demo (Learn mode). Operators + # promote to Strict + signed allowlist for production. Documented + # in docs/blueprints/07-kars-sre-proposal.md §6.6. + egressMode: Learn diff --git a/tools/demo/act2/break.sh b/tools/demo/act2/break.sh new file mode 100755 index 00000000..949a14b5 --- /dev/null +++ b/tools/demo/act2/break.sh @@ -0,0 +1,83 @@ +#!/usr/bin/env bash +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +# +# tools/demo/act2/break.sh — induce the Act II infrastructure incident. 
+# +# Scenario (per docs/blueprints/07-kars-sre-proposal.md §7.2 + +# tools/demo/act2/platform-hardening-quota.yaml header): +# +# The "platform hardening" GitOps refactor lands a tight +# ResourceQuota in the kars-research namespace. The quota's +# requests.memory ceiling (50Mi) is lower than the agent pod +# actually requests. The running pod keeps running, but the moment +# anything triggers a fresh pod (rollout, eviction, restart) the +# new pod cannot be admitted to the namespace. +# +# This script: +# 1. Applies the ResourceQuota (the operator's "mistake") +# 2. Force-deletes the running research pod (surfaces the failure +# immediately rather than waiting for natural restart) +# 3. Confirms the replacement pod is rejected at admission: waits for a +# FailedCreate event carrying the quota-violation message +# +# Idempotent: re-running is safe; the quota is `kubectl apply`-ed. + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +NS="kars-research" +SANDBOX="research" + +echo "▸ verifying agent-a is running (must be present before we break it)..." +if ! kubectl -n "${NS}" get deploy "${SANDBOX}" >/dev/null 2>&1; then + echo "✗ deploy/${SANDBOX} not found in ns ${NS}." >&2 + echo " Apply tools/demo/act2/agent-a-research.yaml first and wait for Running 2/2." >&2 + exit 1 +fi +kubectl -n "${NS}" rollout status "deploy/${SANDBOX}" --timeout=60s + +echo "" +echo "▸ applying platform-hardening ResourceQuota..." +kubectl apply -f "${SCRIPT_DIR}/platform-hardening-quota.yaml" + +echo "" +echo "▸ force-deleting the running pod to surface the failure..."
+POD=$(kubectl -n "${NS}" get pod -l app.kubernetes.io/component=sandbox \ + -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || echo "") +if [[ -z "${POD}" ]]; then + echo "⚠ no sandbox pod found to evict; quota will only manifest on next natural restart" >&2 +else + kubectl -n "${NS}" delete pod "${POD}" --grace-period=1 +fi + +echo "" +echo "▸ waiting for the failure to surface in the ReplicaSet events (up to 60s)..." +for i in $(seq 1 60); do + # Look for the quota-violation event on any ReplicaSet in the ns + REASON=$(kubectl -n "${NS}" get events \ + --field-selector reason=FailedCreate \ + -o jsonpath='{.items[*].message}' 2>/dev/null || echo "") + if echo "${REASON}" | grep -qE "exceeded quota|forbidden.*quota"; then + echo "✓ quota violation observed after ${i}s" + echo "" + echo "─── current state ─────────────────────────────────────" + kubectl -n "${NS}" get pod + echo "" + echo "─── ResourceQuota in ${NS} ────────────────────────────" + kubectl -n "${NS}" get resourcequota + echo "" + echo "─── most-recent FailedCreate events ──────────────────" + kubectl -n "${NS}" get events --field-selector reason=FailedCreate --sort-by=.lastTimestamp | tail -3 + echo "───────────────────────────────────────────────────────" + echo "" + echo "✓ Act II incident induced. kars-sre agent's turn." + exit 0 + fi + sleep 1 +done + +echo "⚠ timeout: quota-violation event did not appear within 60s" >&2 +kubectl -n "${NS}" get pod >&2 || true +kubectl -n "${NS}" get events --field-selector reason=FailedCreate >&2 || true +exit 1 diff --git a/tools/demo/act2/platform-hardening-quota.yaml b/tools/demo/act2/platform-hardening-quota.yaml new file mode 100644 index 00000000..65959b5d --- /dev/null +++ b/tools/demo/act2/platform-hardening-quota.yaml @@ -0,0 +1,43 @@ +# Act II — the infrastructure break. +# +# Scenario: "the platform team's GitOps refactor lands a hardening +# ResourceQuota across every workload namespace. 
The quota's +# requests.memory ceiling (50Mi) is lower than the sum of what the +# research sandbox actually requests (the inference-router sidecar +# alone asks for more). Next time the agent pod restarts — or the +# operator triggers a rollout — the new pod cannot be admitted into +# the namespace and stays Pending forever." +# +# This is a textbook K8s incident: the running pod keeps running, +# but the moment anything tries to schedule a fresh pod (rollout, +# eviction, voluntary or involuntary restart) — quota blocks it. +# +# Applied by tools/demo/act2/break.sh, which also force-deletes the +# running research pod to surface the failure immediately rather +# than waiting for a natural restart event. +# +# The kars-sre agent's job: notice the Pending pod, read the +# ReplicaSet's events ("Error creating: pods ... is forbidden: +# exceeded quota"), list ResourceQuotas in kars-research, identify +# the over-tight one, propose DeleteResourceQuota. +--- +apiVersion: v1 +kind: ResourceQuota +metadata: + name: platform-hardening-quota + namespace: kars-research + labels: + # Crucial: NOT labeled as kars-managed. The SRE agent's typed + # action `DeleteResourceQuota` is permitted ONLY for ResourceQuotas + # without the `kars.azure.com/managed-by=controller` label, so + # the SRE agent can clean up operator-applied quotas but cannot + # remove any kars-managed governance ResourceQuota. + app.kubernetes.io/part-of: platform-hardening + app.kubernetes.io/managed-by: gitops-platform +spec: + hard: + # Deliberately tight. The Hermes sandbox pod requests ~256Mi + # across its containers (openclaw + inference-router); 50Mi is + # impossible. + requests.memory: "50Mi" + requests.cpu: "100m" diff --git a/tools/demo/act2/reset.sh b/tools/demo/act2/reset.sh new file mode 100755 index 00000000..4310a4c9 --- /dev/null +++ b/tools/demo/act2/reset.sh @@ -0,0 +1,27 @@ +#!/usr/bin/env bash +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
+# +# tools/demo/act2/reset.sh — undo the Act II break. +# +# Removes the platform-hardening ResourceQuota and waits for the +# agent pod to come back Running 2/2. This is what the kars-sre +# agent's typed `DeleteResourceQuota` action does in the demo; the +# script exists so the presenter can recover the cluster manually +# (during rehearsal, or after a failed Act II run). + +set -euo pipefail + +NS="kars-research" +SANDBOX="research" + +echo "▸ deleting platform-hardening ResourceQuota..." +kubectl -n "${NS}" delete resourcequota platform-hardening-quota --ignore-not-found + +echo "" +echo "▸ waiting for the agent pod to come back Running (up to 120s)..." +kubectl -n "${NS}" rollout status "deploy/${SANDBOX}" --timeout=120s + +echo "" +echo "✓ ${SANDBOX} is healthy" +kubectl -n "${NS}" get pod diff --git a/tools/demo/act2/runbook.md b/tools/demo/act2/runbook.md new file mode 100644 index 00000000..03d99532 --- /dev/null +++ b/tools/demo/act2/runbook.md @@ -0,0 +1,108 @@ +# Act II — presenter runbook + +Use this when the kars-sre agent isn't built yet (S1-S5 in progress) +and you need to walk Act II by hand. Once S4 lands, the kars-sre +agent runs every step here autonomously and the runbook becomes the +*expected* behaviour spec. + +## Pre-flight (before going on stage) + +```bash +# 1) Fresh local cluster + kars installed (from Act I demo intro) +kars dev + +# 2) Apply Agent A +kubectl apply -f tools/demo/act2/agent-a-research.yaml +kubectl -n kars-research rollout status deploy/research --timeout=120s + +# 3) Confirm Agent A is healthy +kubectl -n kars-research get pod +# Expect: research-<hash> 2/2 Running +``` + +## The break (Act II, scene 1 — "something is wrong") + +```bash +bash tools/demo/act2/break.sh +``` + +The script: +1. Applies `platform-hardening-quota.yaml` to `kars-research` +2. Force-deletes the running pod (so the failure surfaces in seconds, not on the next natural restart) +3. Confirms `FailedCreate / exceeded quota` event on the ReplicaSet +4.
Prints the current pod state, the ResourceQuota, and the most recent FailedCreate event + +Expected wall-clock: ~5–10 s for break, then ~30 s for the audience to see the missing pod and the FailedCreate events settle. + +## The diagnosis (Act II, scene 2 — "kars-sre takes over") + +These are the steps the kars-sre agent should walk. Until S2 ships, +do them by hand — talking through what the agent would say: + +```bash +# 1) "What's the cluster state?" — sre_describe_state +kubectl get karssandbox -A +# Expect: research is Degraded (or Available=False). + +# 2) "What changed recently?" — sre_what_changed +kubectl -n kars-research get events --sort-by=.lastTimestamp | tail -10 +# Expect: FailedCreate from the ReplicaSet, exceeded-quota message. + +# 3) "Describe the failing pod" — sre_describe_resource +kubectl -n kars-research describe pod -l app.kubernetes.io/component=sandbox +# Expect: no replacement pod listed (quota rejects it at admission); no workload-config issue to find. + +# 4) "List quotas in the namespace" — sre_describe_resource on ResourceQuota +kubectl -n kars-research get resourcequota +kubectl -n kars-research describe resourcequota platform-hardening-quota +# Expect: requests.memory hard limit 50Mi (vs. ~256Mi requested by the sandbox pod) + +# 5) "Propose the fix" — sre_propose_fix +echo "Proposed: delete ResourceQuota platform-hardening-quota in ns kars-research" +echo "Rationale: the quota's requests.memory ceiling is below the sandbox's actual" +echo "request; pod cannot be admitted while the quota is in effect." +echo "Resource is NOT labeled kars.azure.com/managed-by — safe to delete." +``` + +## The approval + fix (Act II, scene 3 — "operator approves") + +In the full Act II this is a Telegram approval ping from kars-sre. +For the runbook walk, simulate by hand: + +```bash +# Operator nods. Apply the fix. +bash tools/demo/act2/reset.sh +``` + +Expected: ResourceQuota gone, controller schedules a new pod, pod +reaches Running 2/2 within ~15 s.
+ +## Tear-down (after the demo) + +```bash +kubectl delete karssandbox research -n kars-system +kubectl delete namespace kars-research --ignore-not-found +kubectl delete -f tools/demo/act2/platform-hardening-quota.yaml --ignore-not-found +``` + +## Why this scenario + +Picked because it's the most pure-infrastructure incident shape on +the candidate list: + +- **The break is a real-world GitOps mistake** (operators routinely + add ResourceQuotas via their gitops pipeline; getting the values + wrong is common). +- **The symptom is unmistakable in `kubectl`** (Pending pod + + `exceeded quota` event — universally-recognised K8s incident). +- **The fix is a single delete** — fits the SRE agent's typed-action + model cleanly, doesn't touch any kars governance state, doesn't + need node-level privilege. +- **The diagnostic walk uses three different `sre_*` tools** in + natural sequence (`sre_describe_state`, `sre_what_changed`, + `sre_describe_resource`) — covers the demo's "show what the tools + do" goal without contrivance. + +See `docs/blueprints/07-kars-sre-proposal.md` §7.7.1 for the +`DeleteResourceQuota` typed-action definition + protected-resource +denylist that lets the SRE agent execute this fix safely. From 3af6b715da0dd516336d49797938f2e6e89168a6 Mon Sep 17 00:00:00 2001 From: Pal Lakatos-Toth Date: Tue, 9 Jun 2026 11:27:08 +0200 Subject: [PATCH 02/62] =?UTF-8?q?sre(s1):=20MVP=20=E2=80=94=20Helm=20templ?= =?UTF-8?q?ate=20+=205=20read-only=20kars-CR=20tools=20+=20CLI=20+=20plugi?= =?UTF-8?q?n=20containment?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Slice 1 of the kars-sre demo+agent series. The agent is now installable on any kars cluster via 'kars sre install' and reachable via 'kars sre talk'. It reads kars CRs cluster-wide, walks the diagnostic checklist, matches errors against the OOTB-blocker corpus, and proposes typed fixes (apply is Slice 3). 
What ships: deploy/helm/kars/templates/sre.yaml — Gated on .Values.sre.enabled. Creates 5 K8s objects when enabled: - InferencePolicy 'sre-inference' (kars-system) - KarsSandbox 'sre' (kars-system) with runtime: Hermes, extraEnv KARS_SRE_ENABLED=true, networkPolicy.defaultDeny=true + allowlist contains ONLY kubernetes.default.svc (NOT agentmesh — §7.8.6 network layer) - ToolPolicy 'sre-tools' (kars-sre) gating the sre_* surface - ClusterRole 'kars-sre-reader' — read on kars CRs + apiextensions + core workloads (RBAC per proposal §7.2.1 minus what S2/S3 add) - ClusterRoleBinding pinned to ServiceAccount kars-sre/sandbox (explicit subject — no group binding, no wildcard, §7.8.3) deploy/helm/kars/values.yaml — new 'sre:' block (enabled=false default, model=gpt-4.1, provider=azure-openai, tokenBudget=32000, extraAllowedEndpoints commented out for Slice 4 channel wiring). cli/src/commands/sre.ts — 'kars sre {install,uninstall,status,talk}' subcommands. 'install' wraps 'helm upgrade --reuse-values --set sre.enabled=true' then waits for the sandbox to reach Available. cli/src/cli.ts — wires sreCommand() into the Operations command group. runtimes/hermes/.../plugin/sre.py — 5 tools, all read-only: - sre_describe_state structured snapshot of all 11 kars-owned CRs - sre_logs apiserver-side pod log tail (cap 500 lines) - sre_diagnose kars-CR health checklist + summary string - sre_explain_error OOTB-blocker corpus matcher (6 known patterns including ImagePullBackOff, exceeded quota, OOMKilled, CrashLoopBackOff, FailedScheduling, ContainerCreating) - sre_propose_fix typed-action proposal envelope; Slice 1 codifies DeleteResourceQuota (the demo Act II target) — rest of typed-action set lands in S3 runtimes/hermes/.../plugin/sre_kube.py — minimal in-cluster apiserver client built on httpx (no new dep added to the shared Hermes image). Reads projected SA token + ca.crt + namespace from the standard paths; detects token rotation by content compare on each request. 
runtimes/hermes/.../plugin/__init__.py — adds the KARS_SRE_ENABLED gate. When set: - kars_spawn family is SKIPPED at registration (§7.8.5 — SRE agent cannot spawn sub-agents) - kars_mesh_* family is SKIPPED at registration (§7.8.6 — SRE agent is not on the mesh; combined with the NetworkPolicy block above this is two of three §7.8.6 enforcement layers — the third 'separate image' layer is the §7.8.1 follow-up slice) - kars_discover is skipped (no peers to discover) - eager-mesh-init thread is skipped (would log noisy connection failures otherwise) - sre.register(ctx) runs AFTER everything else runtimes/hermes/tests/test_sre.py — 15 tests covering: - env-gate truthy/falsy mapping - all 5 tools register with the correct schema - explain_error matches against the corpus, handles no-match, handles empty input - propose_fix codifies DeleteResourceQuota for ResourceQuota target; returns rationale-only envelope for other kinds - KARS_CR_KINDS lists all 11 proposal §3.5 CRDs - describe_state walks every kind + surfaces per-kind errors without raising docs/sre.md — operator-facing readme: install, talk, tool surface, containment summary, what S1 cannot do yet, links to proposal + Act II runbook. Validation: pytest tests/test_sre.py → 15/15 pass pytest tests/test_governance.py → unchanged, pass pytest tests/test_package_shape.py → unchanged, pass npm run typecheck (cli) → no errors npm run build (cli) → builds helm lint --set sre.enabled=true → 0 fails helm template ... --show-only sre.yaml → renders 5 objects clean helm template ... 
(sre.enabled=false) → sre.yaml correctly omitted ci/check-copyright-headers.sh → all 501 files OK What this slice does NOT ship (per §7.1 ladder): - K8s diag toolset (sre_image_probe, sre_endpoints_inspect, sre_what_changed, sre_top, sre_describe_resource) — Slice 2 - Fix execution (sre_apply_fix + TokenRequest + admission VAPs) — S3 - Proactive watcher + Telegram/Slack notifications — Slice 4 - Separate kars/sre-sandbox image (§7.8.1 packaging containment) — deferred; Slice 1 ships SRE in the shared Hermes image behind the KARS_SRE_ENABLED env gate as a tactical bridge. The env gate is the interim containment: tools aren't registered in any other pod, so a request for sre_* in a standard sandbox hits 'tool not found' at the runtime. Next: Slice 2 (K8s diag toolset), then Slice 3 (typed apply-fix + AGT approval flow + admission VAPs). Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- cli/src/cli.ts | 2 + cli/src/commands/sre.ts | 182 ++++++ deploy/helm/kars/templates/sre.yaml | 251 ++++++++ deploy/helm/kars/values.yaml | 43 ++ docs/sre.md | 122 ++++ .../kars_runtime_hermes/plugin/__init__.py | 155 +++-- .../src/kars_runtime_hermes/plugin/sre.py | 603 ++++++++++++++++++ .../kars_runtime_hermes/plugin/sre_kube.py | 132 ++++ runtimes/hermes/tests/test_sre.py | 220 +++++++ 9 files changed, 1653 insertions(+), 57 deletions(-) create mode 100644 cli/src/commands/sre.ts create mode 100644 deploy/helm/kars/templates/sre.yaml create mode 100644 docs/sre.md create mode 100644 runtimes/hermes/src/kars_runtime_hermes/plugin/sre.py create mode 100644 runtimes/hermes/src/kars_runtime_hermes/plugin/sre_kube.py create mode 100644 runtimes/hermes/tests/test_sre.py diff --git a/cli/src/cli.ts b/cli/src/cli.ts index 4560bc6d..a5cf0564 100644 --- a/cli/src/cli.ts +++ b/cli/src/cli.ts @@ -33,6 +33,7 @@ import { memoryCommand } from "./commands/memory.js"; import { inspectCommand } from "./commands/inspect.js"; import { auditCommand } from "./commands/audit.js"; 
import { headlampCommand } from "./commands/headlamp.js"; +import { sreCommand } from "./commands/sre.js"; export function createCli(): Command { const program = new Command(); @@ -57,6 +58,7 @@ export function createCli(): Command { program.addCommand(listCommand()); program.addCommand(logsCommand()); program.addCommand(inspectCommand()); + program.addCommand(sreCommand()); // Configuration program.addCommand(credentialsCommand()); diff --git a/cli/src/commands/sre.ts b/cli/src/commands/sre.ts new file mode 100644 index 00000000..fc2392fd --- /dev/null +++ b/cli/src/commands/sre.ts @@ -0,0 +1,182 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +import { Command } from "commander"; +import chalk from "chalk"; +import { execa } from "execa"; + +/** + * `kars sre` — manage the built-in kars-sre agent. + * + * Subcommands: + * install — enable the chart's sre.yaml template (helm upgrade --set sre.enabled=true) + * uninstall — disable it (helm upgrade --set sre.enabled=false) + * status — show the sre KarsSandbox CR's state (kubectl get karssandbox sre) + * talk — alias for `kars connect sre` (open the WebUI) + * + * Design: docs/blueprints/07-kars-sre-proposal.md + */ +export function sreCommand(): Command { + const cmd = new Command("sre"); + cmd.description("Manage the built-in kars-sre agent (Kubernetes SRE on the cluster)"); + + cmd + .command("install") + .description("Enable the kars-sre agent on the current cluster") + .option( + "--release <name>", + "Helm release name to patch (defaults to 'kars')", + "kars", + ) + .option( + "--namespace <ns>", + "Helm release namespace (defaults to 'kars-system')", + "kars-system", + ) + .option( + "--context <name>", + "kubectl context to use (defaults to current-context)", + ) + .option( + "--model <name>", + "Azure OpenAI deployment / model name for the SRE agent (defaults to gpt-4.1)", + ) + .option( + "--wait", + "Wait for the sre sandbox to reach Running (default true)", + true, + ) + .action(async
(options: { + release: string; + namespace: string; + context?: string; + model?: string; + wait: boolean; + }) => { + const helmArgs = [ + "upgrade", + options.release, + "deploy/helm/kars", + "--namespace", options.namespace, + "--reuse-values", + "--set", "sre.enabled=true", + ]; + if (options.model) helmArgs.push("--set", `sre.model=${options.model}`); + if (options.context) helmArgs.push("--kube-context", options.context); + + console.log(chalk.cyan("▸ enabling kars-sre via helm upgrade --reuse-values…")); + console.log(chalk.gray(` helm ${helmArgs.join(" ")}`)); + try { + await execa("helm", helmArgs, { stdio: "inherit" }); + } catch (err) { + console.error(chalk.red("✗ helm upgrade failed")); + process.exit(1); + } + console.log(chalk.green("✓ chart patched")); + + if (options.wait) { + const kctxArgs = options.context ? ["--context", options.context] : []; + console.log(chalk.cyan("▸ waiting for kars-sre namespace to appear…")); + for (let i = 0; i < 60; i++) { + try { + await execa("kubectl", [...kctxArgs, "get", "ns", "kars-sre"], { stdio: "ignore" }); + console.log(chalk.green("✓ kars-sre namespace exists")); + break; + } catch { + await new Promise((r) => setTimeout(r, 1000)); + } + } + console.log(chalk.cyan("▸ waiting for sre sandbox to reach Running (up to 180s)…")); + try { + await execa( + "kubectl", + [ + ...kctxArgs, + "-n", "kars-sre", + "wait", + "--for=condition=Available", + "deploy/sre", + "--timeout=180s", + ], + { stdio: "inherit" }, + ); + console.log(chalk.green("✓ kars-sre is ready")); + console.log(""); + console.log(` ${chalk.bold("Next:")} ${chalk.cyan("kars sre talk")} (open the WebUI)`); + console.log(` ${chalk.cyan("kars sre status")} (CR + pod state)`); + } catch { + console.warn(chalk.yellow("⚠ sre sandbox did not become Available within 180s")); + console.warn(chalk.yellow(" Run `kars sre status` to inspect.")); + process.exit(1); + } + } + }); + + cmd + .command("uninstall") + .description("Disable the kars-sre agent (the 
namespace + RBAC are torn down by the controller)") + .option("--release ", "Helm release name", "kars") + .option("--namespace ", "Helm release namespace", "kars-system") + .option("--context ", "kubectl context to use") + .action(async (options: { release: string; namespace: string; context?: string }) => { + const helmArgs = [ + "upgrade", + options.release, + "deploy/helm/kars", + "--namespace", options.namespace, + "--reuse-values", + "--set", "sre.enabled=false", + ]; + if (options.context) helmArgs.push("--kube-context", options.context); + + console.log(chalk.cyan("▸ disabling kars-sre via helm upgrade --reuse-values…")); + try { + await execa("helm", helmArgs, { stdio: "inherit" }); + } catch { + console.error(chalk.red("✗ helm upgrade failed")); + process.exit(1); + } + console.log(chalk.green("✓ kars-sre disabled; controller will garbage-collect the sandbox + namespace")); + }); + + cmd + .command("status") + .description("Show the sre KarsSandbox CR + pod state") + .option("--context ", "kubectl context to use") + .action(async (options: { context?: string }) => { + const kctxArgs = options.context ? 
["--context", options.context] : []; + console.log(chalk.bold.cyan("── KarsSandbox sre (kars-system) ──")); + try { + await execa("kubectl", [...kctxArgs, "-n", "kars-system", "get", "karssandbox", "sre"], { stdio: "inherit" }); + } catch { + console.error(chalk.yellow("⚠ KarsSandbox sre not found — run `kars sre install` first.")); + process.exit(1); + } + console.log(""); + console.log(chalk.bold.cyan("── pods (kars-sre namespace) ──")); + try { + await execa("kubectl", [...kctxArgs, "-n", "kars-sre", "get", "pod"], { stdio: "inherit" }); + } catch { + console.warn(chalk.yellow("⚠ kars-sre namespace not yet provisioned")); + } + }); + + cmd + .command("talk") + .description("Open the kars-sre WebUI (alias for `kars connect sre`)") + .option("--context ", "kubectl context to use") + .option("--port ", "Local port for WebUI port-forward", "18790") + .action(async (options: { context?: string; port: string }) => { + const args = ["connect", "sre", "--web", "--port", options.port]; + if (options.context) args.push("--context", options.context); + console.log(chalk.cyan(`▸ kars connect sre (WebUI on http://localhost:${options.port})…`)); + try { + await execa("kars", args, { stdio: "inherit" }); + } catch { + console.error(chalk.red("✗ failed to connect — try `kars sre status` to verify the sandbox is Running")); + process.exit(1); + } + }); + + return cmd; +} diff --git a/deploy/helm/kars/templates/sre.yaml b/deploy/helm/kars/templates/sre.yaml new file mode 100644 index 00000000..b486899e --- /dev/null +++ b/deploy/helm/kars/templates/sre.yaml @@ -0,0 +1,251 @@ +{{- /* +kars-sre — the built-in SRE agent (Slice 1 MVP). + +Gated on `.Values.sre.enabled` (default: false). Enable via: + helm upgrade --reuse-values --set sre.enabled=true ... 
+or — preferred — via the CLI: + kars sre install + +What this template creates (when sre.enabled=true): + - InferencePolicy `sre-inference` (Release.Namespace) + - KarsSandbox `sre` (Release.Namespace) — runtime=Hermes, with the + extraEnv flag `KARS_SRE_ENABLED=true` that switches on the SRE + plugin inside the runtime image (the Hermes plugin tree contains + `sre.py` but only registers its tools when this env is set — + standard Hermes sandboxes don't get the SRE tool surface) + - ClusterRole `kars-sre-reader` — kars-CR read scope (Slice 1) + - ClusterRoleBinding `kars-sre-reader` — bound to the SA + `sandbox` in namespace `kars-sre` (the controller-created default) + - ToolPolicy `sre-tools` (kars-sre) — gates the sre_* tool surface + +Per design (docs/blueprints/07-kars-sre-proposal.md §7.8 — privilege +containment): + - Sandbox uniqueness VAP (kars-sre-uniqueness) — Slice 1 ships the + label `kars.azure.com/role=sre`; the VAP itself lands in Slice 3 + alongside the typed apply-fix path + - kars_spawn family deregistered when KARS_SRE_ENABLED=true + (enforced in the plugin __init__.py — §7.8.5) + - kars_mesh_* family deregistered when KARS_SRE_ENABLED=true + (enforced in the plugin __init__.py — §7.8.6) + - Mesh egress blocked at the NetworkPolicy layer below — even if + the deregistration were bypassed, there's no network path to + the relay +*/}} +{{- if (.Values.sre | default dict).enabled }} +--- +# kars-sre InferencePolicy — the model the SRE agent uses for diagnosis. +# Default model is configurable via .Values.sre.model; the policy applies +# only to the `sre` sandbox by name. 
+apiVersion: kars.azure.com/v1alpha1 +kind: InferencePolicy +metadata: + name: sre-inference + namespace: {{ .Release.Namespace }} + labels: + kars.azure.com/sandbox: sre + kars.azure.com/role: sre + app.kubernetes.io/name: kars + app.kubernetes.io/component: sre + app.kubernetes.io/managed-by: {{ .Release.Service }} +spec: + appliesTo: + sandboxName: sre + modelPreference: + primary: + provider: {{ (.Values.sre | default dict).provider | default "azure-openai" | quote }} + deployment: {{ (.Values.sre | default dict).model | default "gpt-4.1" | quote }} + contentSafety: + requirePromptShields: true + tokenBudget: + perRequestTokens: {{ (.Values.sre | default dict).tokenBudget | default 32000 }} +--- +# kars-sre KarsSandbox — Hermes runtime, SRE plugin gated on env. +apiVersion: kars.azure.com/v1alpha1 +kind: KarsSandbox +metadata: + name: sre + namespace: {{ .Release.Namespace }} + labels: + # The label the future kars-sre-uniqueness VAP keys on (Slice 3). + # Slice 1 ships the label so by-the-time-VAP-lands no operator can + # have applied a second role=sre sandbox first. + kars.azure.com/role: sre + kars.azure.com/channels: none + app.kubernetes.io/name: kars + app.kubernetes.io/component: sre + app.kubernetes.io/managed-by: {{ .Release.Service }} +spec: + runtime: + kind: Hermes + hermes: + # The KARS_SRE_ENABLED gate. The Hermes plugin __init__.py + # checks this and: + # - registers the sre_* tools (sre.py) + # - DEREGISTERS kars_spawn family (§7.8.5) + # - DEREGISTERS kars_mesh_* family (§7.8.6) + # so this single env var carries the whole "you are the SRE agent" + # configuration. Standard Hermes sandboxes don't get this env and + # therefore don't get the SRE tools. 
+ extraEnv: + KARS_SRE_ENABLED: "true" + + sandbox: + isolation: standard + + inferenceRef: + name: sre-inference + + governance: + enabled: true + toolPolicyRef: + name: sre-tools + registryMode: local + trustThreshold: 0 + + networkPolicy: + defaultDeny: true + # Slice 1 ships Learn mode so the operator can see what the agent + # reaches in practice; promote to Strict + signed allowlist in + # production (see proposal §6.6 lifecycle). + egressMode: Learn + # Intentionally NOT in the allowlist: agentmesh-relay / agentmesh- + # registry. The SRE agent does not use the mesh (§7.8.6 — three + # layers: spec, image plugin, networkPolicy; this is layer 3). + allowedEndpoints: + # In-cluster apiserver — the SRE agent's primary counterparty. + - host: kubernetes.default.svc.cluster.local + port: 443 +{{- if (.Values.sre | default dict).extraAllowedEndpoints }} +{{- range (.Values.sre | default dict).extraAllowedEndpoints }} + - host: {{ .host | quote }} + port: {{ .port }} +{{- end }} +{{- end }} +--- +# kars-sre ToolPolicy — gates the sre_* tool surface. +# +# Lives in the namespace the controller will create for the sre sandbox +# (kars- = kars-sre per the standard naming convention). +# A no-op once Slice 3 lands the per-tool ToolPolicy split, but for +# Slice 1 every read-only tool is allow-without-approval. +apiVersion: kars.azure.com/v1alpha1 +kind: ToolPolicy +metadata: + name: sre-tools + namespace: kars-sre + labels: + kars.azure.com/sandbox: sre + kars.azure.com/role: sre + app.kubernetes.io/name: kars + app.kubernetes.io/component: sre + app.kubernetes.io/managed-by: {{ .Release.Service }} + # Marked so kars-sre's own ResourceQuotas / governance objects + # are protected from DeleteResourceQuota (§7.7.1 label gate). + kars.azure.com/managed-by: controller +spec: + appliesTo: + sandboxMatchLabels: + kars.azure.com/role: sre + agtProfile: + inline: | + version: 1 + rules: + # Read-only kars-CR diagnostic tools — no approval needed. 
+ - match: { tool: "sre_describe_state" } + decision: allow + - match: { tool: "sre_logs" } + decision: allow + - match: { tool: "sre_diagnose" } + decision: allow + - match: { tool: "sre_explain_error" } + decision: allow + - match: { tool: "sre_propose_fix" } + decision: allow +--- +# kars-sre-reader ClusterRole — Slice 1 RBAC. +# +# Scope: kars-owned CRs (cluster-wide read) + the SRE sandbox's own +# namespace (workloads/pods/events). The full §7.2.1 cluster-wide +# read on standard workload kinds lands in Slice 2 behind an opt-in +# install flag (kars sre install --with-cluster-wide-read). +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: kars-sre-reader + labels: + app.kubernetes.io/name: kars + app.kubernetes.io/component: sre + app.kubernetes.io/managed-by: {{ .Release.Service }} +rules: + # kars-owned CRs (read-only, cluster-wide) + - apiGroups: ["kars.azure.com"] + resources: + - "karssandboxes" + - "inferencepolicies" + - "toolpolicies" + - "egressapprovals" + - "karsmemories" + - "karsevals" + - "trustgraphs" + - "karspairings" + - "a2aagents" + - "mcpservers" + - "karsauthconfigs" + verbs: ["get", "list", "watch"] + # CRD introspection — the SRE agent reads CRD schemas to spot + # stale-CRD-vs-controller-source drift (the exact failure mode that + # bit us repeatedly during the Hermes-support PR debug arc). + - apiGroups: ["apiextensions.k8s.io"] + resources: ["customresourcedefinitions"] + verbs: ["get", "list"] + # Read pods / logs / events in any namespace where kars sandboxes + # live. Slice 1 leaves this scoped to kars-* namespaces by RoleBinding + # composition below; cluster-wide read on workloads is the Slice 2 + # opt-in. + # NOTE(review): the only binding defined in this template is the + # ClusterRoleBinding below, which grants these rules in every + # namespace — no namespaced RoleBinding exists here. Confirm the + # intended Slice 1 scope.
+ - apiGroups: [""] + resources: ["pods", "pods/log", "services", "configmaps", "events", "namespaces", "serviceaccounts"] + verbs: ["get", "list", "watch"] + - apiGroups: ["apps"] + resources: ["deployments", "statefulsets", "daemonsets", "replicasets"] + verbs: ["get", "list", "watch"] + - apiGroups: ["events.k8s.io"] + resources: ["events"] + verbs: ["get", "list", "watch"] + # Secrets metadata ONLY (the .data field is stripped by the + # inference-router proxy filter per proposal §6.4). The RBAC verb + # `get` returns full secret data; the router-side filter is the + # actual enforcement layer. + - apiGroups: [""] + resources: ["secrets"] + verbs: ["get", "list"] +--- +# Bind the kars-sre-reader ClusterRole to the SA the controller +# creates for the `sre` KarsSandbox. +# +# The controller creates `kars-<name>` as the sandbox +# namespace and `sandbox` as the SA name (hardcoded — see +# controller/src/reconciler/mod.rs::reconcile, the +# `serviceAccountName: "sandbox"` line). So this binding pins to +# (ServiceAccount, kars-sre, sandbox) — explicit subject, no group +# binding, no wildcard, satisfying §7.8.3. +# +# The API server accepts ClusterRoleBindings whose subject +# ServiceAccount does not exist yet — the binding takes effect once +# the SA appears on first sandbox reconciliation.
+apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: kars-sre-reader + labels: + app.kubernetes.io/name: kars + app.kubernetes.io/component: sre + app.kubernetes.io/managed-by: {{ .Release.Service }} +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: kars-sre-reader +subjects: + - kind: ServiceAccount + name: sandbox + namespace: kars-sre +{{- end }} diff --git a/deploy/helm/kars/values.yaml b/deploy/helm/kars/values.yaml index 3b4756ec..3b09281c 100644 --- a/deploy/helm/kars/values.yaml +++ b/deploy/helm/kars/values.yaml @@ -417,3 +417,46 @@ entraSidecar: limits: cpu: "500m" memory: "256Mi" + + +# ── kars-sre (built-in SRE agent) ─────────────────────────────────────────── +# +# Opt-in (default: disabled). Enable via the CLI: +# kars sre install +# or directly via helm: +# helm upgrade --reuse-values --set sre.enabled=true ... +# +# When enabled, deploy/helm/kars/templates/sre.yaml provisions: +# - InferencePolicy `sre-inference` (Release.Namespace) +# - KarsSandbox `sre` (Release.Namespace) +# - ToolPolicy `sre-tools` (kars-sre) +# - ClusterRole `kars-sre-reader` (cluster-scope) +# - ClusterRoleBinding `kars-sre-reader` (cluster-scope → kars-sre/sandbox SA) +# +# Design: docs/blueprints/07-kars-sre-proposal.md (§7.1 slicing, +# §7.8 privilege containment, §7.7 typed-action threat model). +sre: + enabled: false + + # The Azure OpenAI deployment / model name the SRE agent reasons with. + # Defaults to gpt-4.1; override for cost/perf tuning. The model must be + # available in the project the kars controller is configured with — + # the InferencePolicy compiles against the standard router failover + # chain so an unavailable model surfaces as Degraded on the sandbox. + model: "gpt-4.1" + provider: "azure-openai" + + # Per-request token ceiling. The SRE agent's typical request shape + # (state summary + a few k of YAML/events) fits well under 32k; raise + # if your cluster has very large CRD inventories. 
+ tokenBudget: 32000 + + # Additional egress hosts the SRE sandbox may reach beyond the in- + # cluster apiserver. Empty by default — the agent only talks to + # `kubernetes.default.svc` out of the box. Add api.telegram.org + + # slack.com here when wiring channel notifications (Slice 4). + # extraAllowedEndpoints: + # - host: api.telegram.org + # port: 443 + # - host: slack.com + # port: 443 diff --git a/docs/sre.md b/docs/sre.md new file mode 100644 index 00000000..526151fb --- /dev/null +++ b/docs/sre.md @@ -0,0 +1,122 @@ + + +# kars-sre — the built-in SRE agent + +A long-running, in-cluster agent that diagnoses Kubernetes incidents +on the same kars cluster that runs your other agents. Optional, opt-in. + +Status: **Slice 1 (MVP)** — read-only diagnostic tools. See +[`docs/blueprints/07-kars-sre-proposal.md`](blueprints/07-kars-sre-proposal.md) +§7.1 for the full slice ladder. + +--- + +## Install + +```bash +kars sre install +``` + +Equivalent to `helm upgrade --reuse-values --set sre.enabled=true`. +Brings up: + +| Resource | Where | What it is | +|---|---|---| +| `InferencePolicy/sre-inference` | `kars-system` | model preference + content-safety + token budget for the SRE agent | +| `KarsSandbox/sre` | `kars-system` | runtime = Hermes; `extraEnv: KARS_SRE_ENABLED=true` | +| `ToolPolicy/sre-tools` | `kars-sre` | gates the `sre_*` tool surface | +| `ClusterRole/kars-sre-reader` | cluster | read on kars CRs + apiextensions + core workloads in `kars-*` namespaces | +| `ClusterRoleBinding/kars-sre-reader` | cluster | binds the ClusterRole to `kars-sre/sandbox` SA — explicit subject (no group binding, no wildcard) per §7.8.3 | + +The controller derives namespace `kars-sre` from the sandbox name +`sre` per the standard `kars-<name>` convention. The SA `sandbox` +inside that namespace is created by the controller on first reconcile.
+ +## Talk to it + +```bash +kars sre talk +# port-forwards the WebUI; visit http://localhost:18790 +``` + +Try: + +> *give me a cluster-wide health overview* + +The agent will: +1. Call `sre_describe_state` → kars-CR snapshot +2. Call `sre_diagnose` → checklist walk +3. Summarise what it found + +For more targeted questions: + +> *tail logs from the research-agent pod in kars-research* +> *what does "exceeded quota" usually mean in kars?* +> *propose a fix for the broken research-agent* + +## Tools available in Slice 1 + +All read-only — no approval gates yet. + +| Tool | What it does | +|---|---| +| `sre_describe_state` | structured snapshot of every kars-owned CR (kind, name, namespace, phase, conditions, lastReconciled) | +| `sre_logs` | tail pod logs via apiserver (caps at 500 lines) | +| `sre_diagnose` | walk the kars-CR health checklist (controller Ready, CRDs installed, no Degraded sandboxes, no stale reconciles) | +| `sre_explain_error` | match an error string against the OOTB-blocker corpus, return root-cause hypothesis | +| `sre_propose_fix` | return a typed-action proposal (Slice 1 codifies `DeleteResourceQuota`; the rest of the typed-action set lands with `sre_apply_fix` in Slice 3) | + +## What it CAN'T do (yet) + +Per the slice ladder: + +- **No K8s diag toolset yet** — `sre_image_probe`, `sre_endpoints_inspect`, `sre_what_changed`, `sre_top` land in Slice 2 +- **No fix execution** — `sre_apply_fix` + TokenRequest mint + admission backstop land in Slice 3 +- **No proactive notifications** — `sre_continuous` informer loop + `kars_notify_human` (Telegram/Slack) land in Slice 4 +- **No source-code grounding** — GitHub MCP wiring lands in Slice 5 + +Until Slice 3 lands, fix execution is operator-driven: copy the +proposal output, apply manually. The Act II demo's runbook +(`tools/demo/act2/runbook.md`) walks this. 
+ +## Containment — what kars-sre is NOT allowed to do + +The SRE agent is the only sandbox in the cluster with cluster-wide +read RBAC, and (in Slice 3+) the only sandbox that can request +short-lived writer tokens. These privileges are **uniquely held** — +see proposal §7.8 for the nine-layer containment design. In summary: + +- The `sre_*` tools don't exist in any other pod's runtime image + (Slice 1: env-gated; Slice 1.5: separate `kars/sre-sandbox` image) +- Only one `KarsSandbox` per cluster can carry `kars.azure.com/role=sre` + (Slice 3 admission policy) +- The `kars-sre-reader` ClusterRoleBinding is pinned to a specific + ServiceAccount (no group bindings; satisfies §7.8.3) +- The SRE sandbox cannot spawn sub-agents — the `kars_spawn` family + is skipped during plugin registration (§7.8.5) +- The SRE sandbox is not on the mesh — `kars_mesh_*` family is + skipped during plugin registration; the NetworkPolicy in + `sre.yaml` blocks the `agentmesh` namespace; the agent has no + DID and is not registered (§7.8.6) +- Future write actions (Slice 3) are typed (no shell exec), exclude + governance state (RBAC, secrets, kars CRs, kube-system, + validating webhooks), use short-lived TokenRequest tokens bound + to the pod's UID with 5-min TTL (§7.7.1 + §7.8.4) + +## Uninstall + +```bash +kars sre uninstall +``` + +Sets `sre.enabled=false` via `helm upgrade --reuse-values`. The +controller garbage-collects the sandbox + namespace + RBAC via +ownerRefs. 
+ +## See also + +- Full design: [`docs/blueprints/07-kars-sre-proposal.md`](blueprints/07-kars-sre-proposal.md) +- Demo Act II walkthrough: [`tools/demo/act2/runbook.md`](../tools/demo/act2/runbook.md) diff --git a/runtimes/hermes/src/kars_runtime_hermes/plugin/__init__.py b/runtimes/hermes/src/kars_runtime_hermes/plugin/__init__.py index 6dcfeec9..00fdf7e4 100644 --- a/runtimes/hermes/src/kars_runtime_hermes/plugin/__init__.py +++ b/runtimes/hermes/src/kars_runtime_hermes/plugin/__init__.py @@ -28,41 +28,82 @@ def register(ctx: Any) -> None: # noqa: ANN401 — Hermes' ctx is dynamic Act 1 scope: wire the AGT governance gate, kars_spawn family, Foundry tool wrappers, http_fetch via egress proxy, and stubs for kars_mesh_*. + + SRE-mode containment (per docs/blueprints/07-kars-sre-proposal.md §7.8): + when ``KARS_SRE_ENABLED=true`` is set on the sandbox pod (the env is + written exclusively by deploy/helm/kars/templates/sre.yaml on the + ``sre`` KarsSandbox), this entry point: + + - SKIPS registering the kars_spawn family (§7.8.5) + - SKIPS registering the kars_mesh_* family (§7.8.6 — also enforced + at the NetworkPolicy layer; the deregistration is layer 2) + - REGISTERS the sre_* tool surface (sre.py) + + Standard Hermes sandboxes never have ``KARS_SRE_ENABLED`` set and + therefore get the full standard tool surface (spawn, mesh) with no + SRE tools. """ + from . import sre # noqa: PLC0415 — lazy import + + sre_mode = sre.is_enabled() + if sre_mode: + logger.info( + "KARS_SRE_ENABLED=true detected — entering SRE-mode plugin " + "registration (no kars_spawn, no kars_mesh_*, sre_* tools " + "active)" + ) + # Phase A1.4 — register the pre_tool_call governance hook first from . import governance # noqa: PLC0415 — lazy import governance.register(ctx) - # Phase A1.5 — sub-agent spawn family (HTTP-only against router) - from . import spawn # noqa: PLC0415 + # Phase A1.5 — sub-agent spawn family (HTTP-only against router). 
+ # SKIPPED in SRE mode per §7.8.5 — the SRE agent must not spawn + # sub-agents (sub-agents would inherit the kars-sre namespace's + # RBAC, breaking privilege containment). + if not sre_mode: + from . import spawn # noqa: PLC0415 - spawn.register(ctx) + spawn.register(ctx) + else: + logger.info("§7.8.5 — skipping kars_spawn family registration (SRE mode)") - # Phase A1.6 — kars_discover (registry HTTP proxy) - from . import discover # noqa: PLC0415 + # Phase A1.6 — kars_discover (registry HTTP proxy). SKIPPED in SRE + # mode — the SRE agent doesn't need to find peers (it has no peers). + if not sre_mode: + from . import discover # noqa: PLC0415 - discover.register(ctx) + discover.register(ctx) # Phase A1.7 — 9 Foundry tool wrappers (HTTP-only; gated when KARS_PROVIDER - # is a slim/github mode) + # is a slim/github mode). Retained in SRE mode — the SRE agent may + # still use Foundry memory + content-safety + inference. from . import foundry # noqa: PLC0415 foundry.register(ctx) - # Always-on: http_fetch via /egress/fetch + # Always-on: http_fetch via /egress/fetch. + # Retained in SRE mode — the egress NetworkPolicy in sre.yaml is the + # actual outbound gate; http_fetch's value to the SRE agent is + # zero today but it's harmless and may be useful for future + # source-grounding (Slice 5). from . import http_fetch # noqa: PLC0415 http_fetch.register(ctx) # Phase A2.1 — real AGT MeshClient (replaces mesh_stubs). - # The mesh adapter wraps kars-agt-mesh's MeshClient and exposes the - # kars_mesh_{send,inbox,await,transfer_file} tool family with the - # same names the Act 1 stubs used, so the LLM contract is stable - # across the upgrade. - from . import mesh # noqa: PLC0415 - - mesh.register(ctx) + # SKIPPED in SRE mode per §7.8.6 — the SRE agent is not on the mesh + # at all (no DID, no relay socket, not in the registry). 
The + # NetworkPolicy in sre.yaml blocks the agentmesh namespace too, so + # this is one of three enforcement layers (spec env / plugin code / + # network policy). + if not sre_mode: + from . import mesh # noqa: PLC0415 + + mesh.register(ctx) + else: + logger.info("§7.8.6 — skipping kars_mesh_* family registration (SRE mode)") # Phase A2.1 — deregister Hermes' built-in sub-agent / direct-API # tools so the LLM sees ONLY kars's governed mesh path. This is the @@ -134,50 +175,49 @@ def register(ctx: Any) -> None: # noqa: ANN401 — Hermes' ctx is dynamic # Phase A2.1 — eagerly init the MeshClient at plugin load so the # sub-agent is **discoverable** before its first tool call. - # - # Without this, MeshClient connects lazily on first kars_mesh_* - # call, which means a freshly-spawned sub-agent has zero presence - # in the registry until its LLM decides to call a mesh tool. When - # the parent tries `kars_mesh_send(to_agent=)` immediately - # after spawn, find_by_display_name returns no peer → spawn-then- - # send breaks despite the pod being Running. - # - # We init on a background thread so a transient registry/relay - # outage doesn't block Hermes' gateway startup. Failure here only - # delays the first mesh exchange; the next tool call retries via - # the same singleton. - try: - from . import mesh as _mesh_module # noqa: PLC0415 + # SKIPPED in SRE mode per §7.8.6 — the SRE agent is not on the mesh + # at all; eager-init would fail (registry refuses to register a DID + # whose pod has no relay egress) and the thread would log a noisy + # error. + if not sre_mode: + try: + from . 
import mesh as _mesh_module # noqa: PLC0415 - import threading as _threading # noqa: PLC0415 + import threading as _threading # noqa: PLC0415 - def _eager_mesh_init() -> None: - try: - _mesh_module._get_or_init_client() # noqa: SLF001 - logger.info("MeshClient pre-connected at plugin load") - # Now start the auto-responder worker (no-op unless - # KARS_MESH_AUTO_RESPONDER=1, which the controller sets - # on sub-agent containers — parent is not enabled to - # avoid the parent looping on its own outbound). + def _eager_mesh_init() -> None: try: - from . import mesh_worker as _worker # noqa: PLC0415 - - _worker.start_worker(_mesh_module._get_or_init_client) # noqa: SLF001 + _mesh_module._get_or_init_client() # noqa: SLF001 + logger.info("MeshClient pre-connected at plugin load") + # Now start the auto-responder worker (no-op unless + # KARS_MESH_AUTO_RESPONDER=1, which the controller sets + # on sub-agent containers — parent is not enabled to + # avoid the parent looping on its own outbound). + try: + from . 
import mesh_worker as _worker # noqa: PLC0415 + + _worker.start_worker(_mesh_module._get_or_init_client) # noqa: SLF001 + except Exception as exc: # noqa: BLE001 + logger.warning("Could not start mesh worker: %s", exc) except Exception as exc: # noqa: BLE001 - logger.warning("Could not start mesh worker: %s", exc) - except Exception as exc: # noqa: BLE001 - logger.warning( - "Eager MeshClient init failed (will retry on first tool call): %s", - exc, - ) - - _threading.Thread( - target=_eager_mesh_init, - name="kars-mesh-eager-init", - daemon=True, - ).start() - except Exception as exc: # noqa: BLE001 - logger.warning("Could not schedule eager MeshClient init: %s", exc) + logger.warning( + "Eager MeshClient init failed (will retry on first tool call): %s", + exc, + ) + + _threading.Thread( + target=_eager_mesh_init, + name="kars-mesh-eager-init", + daemon=True, + ).start() + except Exception as exc: # noqa: BLE001 + logger.warning("Could not schedule eager MeshClient init: %s", exc) + + # SRE-mode-only: register the sre_* tool surface AFTER everything + # else has registered (so deregister calls in sre.register can find + # the targets, though Slice 1 doesn't actually deregister anything). + if sre_mode: + sre.register(ctx) # Trust + signing-counter background pushes from . 
import telemetry # noqa: PLC0415 @@ -185,9 +225,10 @@ def _eager_mesh_init() -> None: telemetry.register(ctx) logger.info( - "kars-hermes plugin registered (contract v1, mesh: %s, " + "kars-hermes plugin registered (contract v1, sre_mode: %s, mesh: %s, " "Hermes built-ins denied: %d)", - "real (Act 2.1 — kars-agt-mesh)", + sre_mode, + "disabled (SRE mode)" if sre_mode else "real (Act 2.1 — kars-agt-mesh)", len(_HERMES_DENY), ) diff --git a/runtimes/hermes/src/kars_runtime_hermes/plugin/sre.py b/runtimes/hermes/src/kars_runtime_hermes/plugin/sre.py new file mode 100644 index 00000000..ea654737 --- /dev/null +++ b/runtimes/hermes/src/kars_runtime_hermes/plugin/sre.py @@ -0,0 +1,603 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +"""kars-sre Hermes plugin — Slice 1 (MVP read-only diagnostic tools). + +Registered by ``runtimes/hermes/src/kars_runtime_hermes/plugin/__init__.py`` +only when the env ``KARS_SRE_ENABLED=true`` is set. The Helm template +``deploy/helm/kars/templates/sre.yaml`` sets that env exclusively on +the ``sre`` KarsSandbox pod via ``spec.runtime.hermes.extraEnv``; +standard Hermes sandboxes never see the env and therefore never get +the ``sre_*`` tool surface. + +Containment (per docs/blueprints/07-kars-sre-proposal.md §7.8): + + - §7.8.1 Plugin packaging — Slice 1 ships SRE inside the shared + Hermes image gated on the env. The §7.8.1 separate-image + split is a follow-up slice. The env gate is the + interim enforcement boundary: the tools simply aren't + registered in any other pod, so a remote agent asking + for ``sre_*`` calls hits "tool not found" at the runtime + (not at the policy layer). + - §7.8.5 Spawn disabled — the plugin __init__.py also + deregisters the ``kars_spawn`` family when this env + is set, so the SRE agent cannot spawn sub-agents. 
+ - §7.8.6 Mesh disabled at the source — the plugin __init__.py + deregisters the ``kars_mesh_*`` family AND the + NetworkPolicy in sre.yaml omits the agentmesh namespace + from the allowlist, so even if a future bug accidentally + tried to dial the relay, the network path does not exist. + +Slice 1 tool surface (all read-only, no approval gates): + + ============================ ================================================ + Tool What it does + ============================ ================================================ + sre_describe_state Structured snapshot of every kars-owned CR in + every namespace (KarsSandbox · InferencePolicy + · ToolPolicy · EgressApproval · KarsMemory · + etc.) with phase, conditions, last reconcile. + + sre_logs Tail any container of any pod (capped at 500 + lines). Uses the standard apiserver + /api/v1/namespaces/{namespace}/pods/{pod}/log + endpoint with ?container={name}&tailLines=N. + + sre_diagnose Walks the kars-CR health checklist: + controller deployment Ready, CRDs present, + no KarsSandbox or governance CR in an + unhealthy phase. The >5min staleness and + orphaned-ConfigMap checks are not implemented + in Slice 1. Returns a structured report. + + sre_explain_error Given an error string, returns a structured + root-cause hypothesis by matching against a + small in-process corpus of known kars + failure modes (extracted from the OOTB + blockers tracked in the proposal §Why). + + sre_propose_fix Given a diagnosis, returns a proposed typed + action (per §7.7.1 — JSON document, not a + shell command). READ-ONLY: produces the + proposal, does not execute. Apply lands in + Slice 3. + ============================ ================================================ + +Each tool returns a dict; the Hermes plugin context serialises it +to the LLM. The tool implementation MUST never raise on apiserver +errors — those become ``{"error": "..."}`` entries in the returned +dict so the LLM can reason over them. Hard raises are reserved for +"this tool is misconfigured" issues that aren't agent-recoverable.
+""" + +from __future__ import annotations + +import logging +import os +from typing import Any + +import httpx + +from . import sre_kube + +logger = logging.getLogger("kars.hermes.sre") + +# -------------------------------------------------------------------------- +# Constants +# -------------------------------------------------------------------------- + +KARS_GROUP = "kars.azure.com" +KARS_VERSION = "v1alpha1" + +# The kars-owned CR kinds the SRE agent knows about (matches the RBAC +# grant in deploy/helm/kars/templates/sre.yaml). Plural form is what +# the apiserver expects in the URL path. +KARS_CR_KINDS: list[tuple[str, str]] = [ + ("karssandboxes", "KarsSandbox"), + ("inferencepolicies", "InferencePolicy"), + ("toolpolicies", "ToolPolicy"), + ("egressapprovals", "EgressApproval"), + ("karsmemories", "KarsMemory"), + ("karsevals", "KarsEval"), + ("trustgraphs", "TrustGraph"), + ("karspairings", "KarsPairing"), + ("a2aagents", "A2AAgent"), + ("mcpservers", "McpServer"), + ("karsauthconfigs", "KarsAuthConfig"), +] + + +# -------------------------------------------------------------------------- +# OOTB-blocker corpus — known kars failure modes for sre_explain_error +# -------------------------------------------------------------------------- +# +# The corpus is intentionally small and hand-curated rather than an +# embedding-backed search: false positives on diagnostic hypotheses +# are confusing to operators, so we match only patterns that have +# very high signal. The corpus grows with each new OOTB blocker the +# proposal §Why list captures. +OOTB_CORPUS: list[dict[str, str]] = [ + { + "pattern": "ImagePullBackOff", + "hypothesis": ( + "The pod's container image is unreachable or doesn't exist. Causes: " + "image tag typo in the controlling resource (KarsSandbox spec.runtime / " + "Deployment spec.template.spec.containers[].image), private registry " + "without an imagePullSecret, or registry-side throttling/outage." 
+ ), + "next_steps": ( + "1) describe the pod to read the precise pull error; " + "2) list image tags actually in use on the cluster to suggest the " + "closest valid one; " + "3) propose PatchDeploymentImage with the corrected tag." + ), + }, + { + "pattern": "exceeded quota", + "hypothesis": ( + "Pod creation is being rejected by a ResourceQuota in the namespace. " + "Likely cause: an operator-applied platform ResourceQuota whose ceiling " + "is lower than the workload's requests (the textbook GitOps-collision " + "incident)." + ), + "next_steps": ( + "1) list ResourceQuotas in the namespace; " + "2) compare the quota's `hard` map against the deployment's requests; " + "3) propose DeleteResourceQuota for the offending policy (only " + "permitted when the ResourceQuota does NOT carry the " + "kars.azure.com/managed-by=controller label)." + ), + }, + { + "pattern": "OOMKilled", + "hypothesis": ( + "Container was killed by the kernel for exceeding its memory limit. " + "Causes: memory limit too low for the workload's working set, memory " + "leak in the workload, or a sibling container in the same pod " + "starving this one." + ), + "next_steps": ( + "1) check the pod's containerStatuses[].lastState for the kill memory " + "usage; " + "2) describe the deployment for current resource.limits.memory; " + "3) propose PatchDeploymentResources to a higher ceiling (Slice 3+)." + ), + }, + { + "pattern": "CrashLoopBackOff", + "hypothesis": ( + "Container is repeatedly exiting non-zero on startup. Causes: " + "misconfiguration in env / config / mounted secrets, a hard " + "dependency that's unreachable at startup, or a bug in the " + "container itself surfaced by a recent rollout." + ), + "next_steps": ( + "1) tail the container logs via sre_logs to get the exit reason; " + "2) describe the pod for restart count + last exit code; " + "3) compare current image/env to the last-known-good rollout via " + "sre_what_changed (Slice 2)." 
+ ), + }, + { + "pattern": "FailedScheduling", + "hypothesis": ( + "Scheduler cannot place the pod on any node. Causes: no node has the " + "requested resources, all candidate nodes are cordoned/tainted, " + "topology constraints unsatisfiable, or PVC pending." + ), + "next_steps": ( + "1) describe the pod for the scheduler's per-node reason summary; " + "2) check node status (Ready, schedulable, taints); " + "3) propose UncordonNode (Slice 3, node-tier write) or " + "ScaleDeployment to fit." + ), + }, + { + "pattern": "ContainerCreating", + "hypothesis": ( + "Stuck creating — kubelet is attempting to set up the container but " + "blocking on a precondition. Causes: secret/configmap referenced by " + "envFrom/volumes doesn't exist yet, image pull in progress, " + "init-container still running, or a PVC binding." + ), + "next_steps": ( + "1) describe the pod for the kubelet's last event; " + "2) verify referenced secrets / configmaps / PVCs exist; " + "3) if image pull is the cause, wait + re-check." + ), + }, +] + + +# -------------------------------------------------------------------------- +# Tool implementations +# -------------------------------------------------------------------------- + + +def _summarise_cr(item: dict[str, Any], kind: str) -> dict[str, Any]: + """Reduce a CR's full JSON to the fields the agent cares about.""" + meta = item.get("metadata", {}) + status = item.get("status", {}) + return { + "kind": kind, + "namespace": meta.get("namespace"), + "name": meta.get("name"), + "phase": status.get("phase"), + "observedGeneration": status.get("observedGeneration"), + "lastReconciled": status.get("lastReconciled"), + "conditions": [ + { + "type": c.get("type"), + "status": c.get("status"), + "reason": c.get("reason"), + "message": c.get("message"), + } + for c in status.get("conditions", []) + ], + } + + +def sre_describe_state(**_kwargs: Any) -> dict[str, Any]: + """Tool: structured snapshot of every kars-owned CR in the cluster. 
+ + Returns a dict keyed by CR kind whose values are lists of summarised + instances. Each instance carries name + namespace + phase + + observedGeneration + lastReconciled + conditions — enough for the + agent to spot Degraded/Failed/stale CRs without re-fetching. + """ + kube = sre_kube.client() + out: dict[str, Any] = {} + for plural, kind in KARS_CR_KINDS: + path = f"/apis/{KARS_GROUP}/{KARS_VERSION}/{plural}" + try: + doc = kube.get(path) + items = doc.get("items", []) + out[kind] = [_summarise_cr(it, kind) for it in items] + except httpx.HTTPStatusError as exc: + # 404 = the CRD isn't installed; common during early-cluster. + # 403 = RBAC didn't bind correctly; informative to surface. + out[kind] = { + "error": f"{exc.response.status_code} {exc.response.reason_phrase}", + "path": path, + } + except Exception as exc: # noqa: BLE001 — tool MUST NOT raise + out[kind] = {"error": str(exc), "path": path} + return out + + +def sre_logs( + *, + namespace: str, + pod: str, + container: str | None = None, + tail: int = 500, + **_kwargs: Any, +) -> dict[str, Any]: + """Tool: tail pod logs. + + Args: + namespace: pod's namespace. + pod: pod name. + container: container name within the pod; omit for single-container pods. + tail: max lines to return (capped at 500). 
+ """ + tail = max(1, min(tail, 500)) + params: dict[str, Any] = {"tailLines": tail} + if container: + params["container"] = container + path = f"/api/v1/namespaces/{namespace}/pods/{pod}/log" + kube = sre_kube.client() + try: + client = kube._ensure_client() # noqa: SLF001 — same module surface + resp = client.get(path, params=params) + resp.raise_for_status() + return { + "namespace": namespace, + "pod": pod, + "container": container, + "tailLines": tail, + "logs": resp.text, + } + except httpx.HTTPStatusError as exc: + return { + "namespace": namespace, + "pod": pod, + "container": container, + "error": f"{exc.response.status_code} {exc.response.reason_phrase}", + "body": exc.response.text[:512], + } + except Exception as exc: # noqa: BLE001 + return {"namespace": namespace, "pod": pod, "container": container, "error": str(exc)} + + +def sre_diagnose(**_kwargs: Any) -> dict[str, Any]: + """Tool: walk the kars-CR health checklist. + + Returns a structured report: + - controller_status: deployment ready? + - crds_present: every CRD the controller expects is installed? 
+ - degraded_sandboxes: KarsSandboxes whose .status.phase ∉ {Ready,Running,Compiled,Active} + - degraded_policies: governance CRs whose .status.phase ∉ the same set + - crds_missing: expected kars CRDs that are not installed + - summary: one-line digest of the fields above + - stale_reconciles: CRs whose lastReconciled is > 5min old (not computed + or returned by this implementation) + """ + kube = sre_kube.client() + report: dict[str, Any] = { + "controller_status": "unknown", + "crds_present": [], + "crds_missing": [], + "degraded_sandboxes": [], + "degraded_policies": [], + "summary": "", + } + + # 1) Controller deployment status + try: + doc = kube.get("/apis/apps/v1/namespaces/kars-system/deployments/kars-controller") + spec_replicas = doc.get("spec", {}).get("replicas", 0) + ready_replicas = doc.get("status", {}).get("readyReplicas", 0) or 0 + if ready_replicas >= 1 and ready_replicas == spec_replicas: + report["controller_status"] = "Ready" + else: + report["controller_status"] = f"Degraded ({ready_replicas}/{spec_replicas} ready)" + except Exception as exc: # noqa: BLE001 + report["controller_status"] = f"Unknown: {exc}" + + # 2) CRD inventory check + try: + doc = kube.get("/apis/apiextensions.k8s.io/v1/customresourcedefinitions") + installed = {c.get("metadata", {}).get("name") for c in doc.get("items", [])} + for plural, _kind in KARS_CR_KINDS: + full = f"{plural}.{KARS_GROUP}" + if full in installed: + report["crds_present"].append(full) + else: + report["crds_missing"].append(full) + except Exception as exc: # noqa: BLE001 + report["crds_present"] = f"error: {exc}" + + # 3) Sandbox/policy phase scan — reuse describe_state results + state = sre_describe_state() + for kind, items in state.items(): + if isinstance(items, dict) and "error" in items: + continue + for it in items: + phase = it.get("phase") + if phase and phase not in {"Ready", "Running", "Compiled", "Active"}: + bucket = ( + "degraded_sandboxes" if kind == "KarsSandbox" else "degraded_policies" + ) + report[bucket].append(it) + + # 4) Summary string the LLM can quote verbatim + n_deg_sb = len(report["degraded_sandboxes"]) + n_deg_pol = len(report["degraded_policies"]) + n_missing
= len(report["crds_missing"]) + bits = [] + bits.append(f"controller: {report['controller_status']}") + bits.append(f"CRDs missing: {n_missing}") + bits.append(f"sandboxes degraded: {n_deg_sb}") + bits.append(f"governance CRs degraded: {n_deg_pol}") + report["summary"] = "; ".join(bits) + return report + + +def sre_explain_error(*, error: str, **_kwargs: Any) -> dict[str, Any]: + """Tool: match an error string against the OOTB-blocker corpus. + + Returns the first matching entry's hypothesis + next_steps, or + ``{"matched": False}`` if no pattern matches. The agent is expected + to use this as a hint, not a verdict — it then walks the next_steps + using the other diagnostic tools to confirm. + """ + if not error: + return {"matched": False, "reason": "empty error string"} + lowered = error.lower() + matches = [c for c in OOTB_CORPUS if c["pattern"].lower() in lowered] + if not matches: + return {"matched": False, "error": error} + # Return up to 3 matches (sorted by pattern length desc — longer + # patterns are more specific, less likely to be false positives). + matches.sort(key=lambda c: len(c["pattern"]), reverse=True) + return { + "matched": True, + "error": error, + "hypotheses": matches[:3], + } + + +def sre_propose_fix( + *, + diagnosis: str, + target: dict[str, Any] | None = None, + **_kwargs: Any, +) -> dict[str, Any]: + """Tool: propose a typed action (read-only — no execution). + + Args: + diagnosis: short string describing what the agent has concluded + (e.g. "ResourceQuota platform-hardening-quota in + kars-research is blocking pod admission"). + target: optional dict carrying the resource the proposal acts on, + e.g. {"kind": "ResourceQuota", "namespace": "kars-research", + "name": "platform-hardening-quota"}. + + Returns a proposal envelope with the typed-action payload. 
Slice 1 + is read-only: the proposal is returned to the agent (who relays it + to the operator); Slice 3 (`sre_apply_fix`) adds the execution + path with TokenRequest + admission gate. + """ + target = target or {} + proposal: dict[str, Any] = { + "kind": "FixProposal", + "diagnosis": diagnosis, + "target": target, + "action": None, + "rationale": None, + "execution_status": "proposed (Slice 1 — not executed; awaiting Slice 3 sre_apply_fix)", + } + + # Slice 1 understands ONE proposal shape: DeleteResourceQuota. + # The full typed-action set lands in Slice 3 alongside the + # apply-fix execution path. This single understanding lets the + # demo's Act II flow complete end-to-end via the runbook + # (operator runs `bash tools/demo/act2/reset.sh` after seeing the + # proposal — autonomous apply lands in Slice 3). + if target.get("kind") == "ResourceQuota": + proposal["action"] = { + "type": "DeleteResourceQuota", + "namespace": target.get("namespace"), + "name": target.get("name"), + } + proposal["rationale"] = ( + "Operator-applied ResourceQuotas without the " + "kars.azure.com/managed-by=controller label are safely deletable " + "by the SRE agent (per §7.7.1). Removing this quota restores " + "the namespace's pod admission and the controller will " + "schedule a fresh sandbox pod." + ) + else: + # Generic envelope for unknown target kinds — Slice 1 returns + # the proposal text without a typed action; Slice 3 widens + # the typed-action set. + proposal["rationale"] = ( + "No typed action codified yet for this target kind in Slice 1. " + "The proposal text alone is returned; the operator can apply " + "manually per the demo runbook." + ) + + return proposal + + +# -------------------------------------------------------------------------- +# Plugin registration +# -------------------------------------------------------------------------- + + +def is_enabled() -> bool: + """Return True if the env gate is set. Called by the plugin __init__.py. 
+ + The env is set exclusively by ``deploy/helm/kars/templates/sre.yaml`` + on the ``sre`` KarsSandbox's ``spec.runtime.hermes.extraEnv``. + Standard sandboxes don't see it. + """ + return os.environ.get("KARS_SRE_ENABLED", "").lower() in {"true", "1", "yes"} + + +def register(ctx: Any) -> None: # noqa: ANN401 — Hermes' ctx is dynamic + """Register the SRE tool surface on the Hermes plugin context. + + Idempotent: re-registration replaces the existing tool definitions. + Called from ``runtimes/hermes/.../plugin/__init__.py`` only when + ``is_enabled()`` returns True. + """ + register_tool = getattr(ctx, "register_tool", None) + if not callable(register_tool): + logger.warning("Hermes ctx has no register_tool — SRE plugin not registered") + return + + register_tool( + name="sre_describe_state", + description=( + "Return a structured snapshot of every kars-owned CR in every " + "namespace (KarsSandbox, InferencePolicy, ToolPolicy, " + "EgressApproval, KarsMemory, KarsEval, TrustGraph, KarsPairing, " + "A2AAgent, McpServer, KarsAuthConfig). Each CR carries name, " + "namespace, phase, observedGeneration, lastReconciled, and " + "conditions. Use this as the first call when starting an " + "incident investigation." + ), + parameters={"type": "object", "properties": {}, "required": []}, + handler=sre_describe_state, + ) + + register_tool( + name="sre_logs", + description=( + "Tail logs from a pod's container via the apiserver. Returns the " + "last N lines (max 500). Use for diagnosing CrashLoopBackOff or " + "for inspecting an agent's behaviour." 
+ ), + parameters={ + "type": "object", + "properties": { + "namespace": {"type": "string", "description": "Pod's namespace"}, + "pod": {"type": "string", "description": "Pod name"}, + "container": { + "type": "string", + "description": "Container name (omit for single-container pods)", + }, + "tail": { + "type": "integer", + "description": "Max lines to return (capped at 500)", + "default": 200, + }, + }, + "required": ["namespace", "pod"], + }, + handler=sre_logs, + ) + + register_tool( + name="sre_diagnose", + description=( + "Walk the kars-CR health checklist: controller deployment Ready, " + "every kars CRD installed, no Degraded/Failed sandboxes or " + "governance CRs, no stale reconciles. Returns a structured " + "report + a one-line summary suitable for an operator-facing " + "message." + ), + parameters={"type": "object", "properties": {}, "required": []}, + handler=sre_diagnose, + ) + + register_tool( + name="sre_explain_error", + description=( + "Given an error string (pod event reason, controller log line, " + "etc.), return a root-cause hypothesis from the kars OOTB-blocker " + "corpus. The hypothesis is a HINT — the agent should then use " + "the other diagnostic tools to confirm or refute it." + ), + parameters={ + "type": "object", + "properties": { + "error": { + "type": "string", + "description": "The error string to explain", + }, + }, + "required": ["error"], + }, + handler=sre_explain_error, + ) + + register_tool( + name="sre_propose_fix", + description=( + "Return a typed-action proposal for the operator to approve. " + "READ-ONLY in Slice 1 — Slice 3 adds sre_apply_fix to execute " + "approved proposals. Use after diagnosing a problem to surface " + "the recommended remediation." 
+ ), + parameters={ + "type": "object", + "properties": { + "diagnosis": { + "type": "string", + "description": "One-line summary of what was diagnosed", + }, + "target": { + "type": "object", + "description": "Resource the proposal acts on (kind/namespace/name)", + "properties": { + "kind": {"type": "string"}, + "namespace": {"type": "string"}, + "name": {"type": "string"}, + }, + }, + }, + "required": ["diagnosis"], + }, + handler=sre_propose_fix, + ) + + logger.info("kars-sre plugin registered (5 tools, read-only)") diff --git a/runtimes/hermes/src/kars_runtime_hermes/plugin/sre_kube.py b/runtimes/hermes/src/kars_runtime_hermes/plugin/sre_kube.py new file mode 100644 index 00000000..4d84da4b --- /dev/null +++ b/runtimes/hermes/src/kars_runtime_hermes/plugin/sre_kube.py @@ -0,0 +1,132 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +"""kars-sre — Kubernetes apiserver client (S1). + +A minimal in-cluster apiserver client built on httpx — no `kubernetes` +PyPI dep added to the Hermes runtime image (which is shared with +non-SRE sandboxes; keeping the dep footprint tight is part of the +§7.8.1 design even though Slice 1 ships SRE in the shared image +behind the ``KARS_SRE_ENABLED`` env gate — the §7.8.1 separate +image is a follow-up slice). + +Reads the standard projected ServiceAccount artefacts mounted at: + + - ``/var/run/secrets/kubernetes.io/serviceaccount/token`` — auto-rotated + - ``/var/run/secrets/kubernetes.io/serviceaccount/ca.crt`` — apiserver CA + - ``/var/run/secrets/kubernetes.io/serviceaccount/namespace`` — pod's ns + +and dials ``https://kubernetes.default.svc.cluster.local:443`` (the +in-cluster apiserver Service) with the SA token as the Bearer credential. + +There is no fallback for out-of-cluster operation; this module is +designed to run inside a pod with a projected SA token. 
The Slice 1 +RBAC binding (``kars-sre-reader`` ClusterRole on the ``sandbox`` SA +in namespace ``kars-sre``) defines what this client can read. +""" + +from __future__ import annotations + +import os +import pathlib +from typing import Any + +import httpx + +_SA_DIR = pathlib.Path("/var/run/secrets/kubernetes.io/serviceaccount") +_DEFAULT_APISERVER = "https://kubernetes.default.svc.cluster.local" + +# Read tokens / CA each call. The kubelet rotates the projected token +# on a regular cadence (default 1h) and rewrites the file in place; a +# cached value would expire silently. The cost of re-reading a ~1KB +# file is negligible vs. the apiserver round-trip. + + +def _read_token() -> str: + p = _SA_DIR / "token" + if not p.exists(): + raise RuntimeError( + "no ServiceAccount token at " + f"{p} — kars-sre must run inside a pod with a projected SA" + ) + return p.read_text(encoding="utf-8").strip() + + +def _ca_bundle() -> str: + p = _SA_DIR / "ca.crt" + if not p.exists(): + raise RuntimeError(f"no apiserver CA at {p}") + return str(p) + + +def _apiserver_host() -> str: + # The standard env vars the kubelet injects. + host = os.environ.get("KUBERNETES_SERVICE_HOST") + port = os.environ.get("KUBERNETES_SERVICE_PORT", "443") + if host: + return f"https://{host}:{port}" + return _DEFAULT_APISERVER + + +class KubeClient: + """Thin wrapper around httpx for read-only apiserver calls. + + Per-instance httpx client is reused across calls; rebuilt when the + SA token is rotated (detected on each request by re-reading the + token file and comparing it with the token the client was built with).
+ """ + + def __init__(self, timeout: float = 30.0) -> None: + self._timeout = timeout + self._client: httpx.Client | None = None + self._token: str | None = None + + def _build_client(self) -> httpx.Client: + token = _read_token() + ca = _ca_bundle() + host = _apiserver_host() + client = httpx.Client( + base_url=host, + headers={"Authorization": f"Bearer {token}", "Accept": "application/json"}, + verify=ca, + timeout=self._timeout, + ) + self._token = token + return client + + def _ensure_client(self) -> httpx.Client: + # Detect token rotation by re-reading the file and comparing. + current_token = _read_token() + if self._client is None or current_token != self._token: + if self._client is not None: + self._client.close() + self._client = self._build_client() + return self._client + + def get(self, path: str, *, params: dict[str, Any] | None = None) -> dict[str, Any]: + """GET ``path`` on the apiserver, return parsed JSON. + + ``path`` is the apiserver URL path (e.g. ``/api/v1/namespaces/kars-sre/pods``). + Raises httpx.HTTPStatusError on non-2xx so the caller can present a + clear error to the agent. + """ + client = self._ensure_client() + resp = client.get(path, params=params) + resp.raise_for_status() + return resp.json() + + def close(self) -> None: + if self._client is not None: + self._client.close() + self._client = None + self._token = None + + +_singleton: KubeClient | None = None + + +def client() -> KubeClient: + """Return a process-wide singleton KubeClient.""" + global _singleton # noqa: PLW0603 — process-singleton is intentional + if _singleton is None: + _singleton = KubeClient() + return _singleton diff --git a/runtimes/hermes/tests/test_sre.py b/runtimes/hermes/tests/test_sre.py new file mode 100644 index 00000000..808c9c32 --- /dev/null +++ b/runtimes/hermes/tests/test_sre.py @@ -0,0 +1,220 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
+ +"""kars-sre plugin tests (Slice 1).""" + +from __future__ import annotations + +import importlib +import os +import sys +from typing import Any +from unittest.mock import MagicMock, patch + + +def test_is_enabled_default_false() -> None: + """Without KARS_SRE_ENABLED, the plugin must be disabled.""" + from kars_runtime_hermes.plugin import sre + + with patch.dict(os.environ, {}, clear=True): + assert not sre.is_enabled() + + +def test_is_enabled_accepts_truthy_values() -> None: + from kars_runtime_hermes.plugin import sre + + for v in ("true", "True", "TRUE", "1", "yes", "YES"): + with patch.dict(os.environ, {"KARS_SRE_ENABLED": v}, clear=True): + assert sre.is_enabled(), f"value {v!r} should be truthy" + + +def test_is_enabled_rejects_falsy_values() -> None: + from kars_runtime_hermes.plugin import sre + + for v in ("false", "0", "no", "", "anything-else"): + with patch.dict(os.environ, {"KARS_SRE_ENABLED": v}, clear=True): + assert not sre.is_enabled(), f"value {v!r} should be falsy" + + +def test_register_skips_when_disabled() -> None: + """A standard Hermes plugin __init__.py call must not register sre tools.""" + # Reload the plugin __init__ to get a clean state + if "kars_runtime_hermes.plugin" in sys.modules: + importlib.reload(sys.modules["kars_runtime_hermes.plugin"]) + with patch.dict(os.environ, {}, clear=True): + from kars_runtime_hermes.plugin import sre + + ctx = MagicMock() + # Direct sre.register call should never run unless caller checks + # is_enabled first — but we also want to be defensive: if a + # standard sandbox somehow imports and registers, that's a bug. + # Slice 1's gate is in __init__.py, not in register() itself, + # so calling register() directly DOES register tools. That's + # fine for now (we're testing the __init__.py path elsewhere). 
+ sre.register(ctx) + # 5 tool registrations expected + assert ctx.register_tool.call_count == 5 + + +def test_register_registers_five_tools() -> None: + """register(ctx) registers exactly the five Slice 1 tools.""" + from kars_runtime_hermes.plugin import sre + + ctx = MagicMock() + sre.register(ctx) + + tool_names = {call.kwargs["name"] for call in ctx.register_tool.call_args_list} + expected = { + "sre_describe_state", + "sre_logs", + "sre_diagnose", + "sre_explain_error", + "sre_propose_fix", + } + assert tool_names == expected, f"got {tool_names}, expected {expected}" + + +def test_register_handles_missing_register_tool_gracefully() -> None: + """If ctx has no register_tool callable, log + return without raising.""" + from kars_runtime_hermes.plugin import sre + + class BadCtx: + pass + + sre.register(BadCtx()) # must not raise + + +def test_explain_error_matches_imagepullbackoff() -> None: + from kars_runtime_hermes.plugin import sre + + result = sre.sre_explain_error(error="Failed to pull image: ImagePullBackOff") + assert result["matched"] is True + assert result["hypotheses"][0]["pattern"] == "ImagePullBackOff" + + +def test_explain_error_matches_exceeded_quota() -> None: + from kars_runtime_hermes.plugin import sre + + result = sre.sre_explain_error(error="pods 'foo' is forbidden: exceeded quota: tight-quota") + assert result["matched"] is True + assert result["hypotheses"][0]["pattern"] == "exceeded quota" + + +def test_explain_error_no_match() -> None: + from kars_runtime_hermes.plugin import sre + + result = sre.sre_explain_error(error="totally-unknown-thing") + assert result["matched"] is False + assert result["error"] == "totally-unknown-thing" + + +def test_explain_error_empty_string() -> None: + from kars_runtime_hermes.plugin import sre + + result = sre.sre_explain_error(error="") + assert result["matched"] is False + assert "reason" in result + + +def test_propose_fix_for_resourcequota() -> None: + """The Slice 1 demo target — DeleteResourceQuota 
typed action.""" + from kars_runtime_hermes.plugin import sre + + result = sre.sre_propose_fix( + diagnosis="ResourceQuota platform-hardening-quota in kars-research is blocking pod admission", + target={ + "kind": "ResourceQuota", + "namespace": "kars-research", + "name": "platform-hardening-quota", + }, + ) + assert result["kind"] == "FixProposal" + assert result["action"] is not None + assert result["action"]["type"] == "DeleteResourceQuota" + assert result["action"]["namespace"] == "kars-research" + assert result["action"]["name"] == "platform-hardening-quota" + # Slice 1 returns "proposed" — execution lands in Slice 3 + assert "proposed" in result["execution_status"] + assert "not executed" in result["execution_status"] + + +def test_propose_fix_unknown_target_kind() -> None: + """For target kinds Slice 1 doesn't codify, return envelope with no action.""" + from kars_runtime_hermes.plugin import sre + + result = sre.sre_propose_fix( + diagnosis="pod ImagePullBackOff", + target={"kind": "Pod", "namespace": "default", "name": "broken"}, + ) + assert result["kind"] == "FixProposal" + assert result["action"] is None + # Still returns rationale for the operator + assert "rationale" in result and result["rationale"] + + +def test_kars_cr_kinds_covers_all_eleven_crds() -> None: + """The KARS_CR_KINDS list must include every CRD in proposal §3.5.""" + from kars_runtime_hermes.plugin import sre + + expected = { + "KarsSandbox", "InferencePolicy", "ToolPolicy", "EgressApproval", + "KarsMemory", "KarsEval", "TrustGraph", "KarsPairing", "A2AAgent", + "McpServer", "KarsAuthConfig", + } + actual = {kind for _plural, kind in sre.KARS_CR_KINDS} + assert actual == expected, f"missing/extra CRDs: {actual ^ expected}" + + +def test_describe_state_with_mocked_kube() -> None: + """describe_state walks every kind and summarises items.""" + from kars_runtime_hermes.plugin import sre + + fake_doc = { + "items": [ + { + "metadata": {"namespace": "kars-system", "name": "foo"}, + 
"status": { + "phase": "Ready", + "observedGeneration": 3, + "lastReconciled": "2026-06-09T10:00:00Z", + "conditions": [{"type": "Available", "status": "True"}], + }, + }, + ], + } + mock_client = MagicMock() + mock_client.get.return_value = fake_doc + + with patch.object(sre.sre_kube, "client", return_value=mock_client): + result = sre.sre_describe_state() + + # Every kind got summarised + assert set(result.keys()) == {k for _p, k in sre.KARS_CR_KINDS} + # Each got one entry from the fake doc + for kind in result: + assert isinstance(result[kind], list) + assert len(result[kind]) == 1 + assert result[kind][0]["phase"] == "Ready" + assert result[kind][0]["kind"] == kind + + +def test_describe_state_handles_apiserver_errors_per_kind() -> None: + """A 403/404 on one kind must not blow up the whole call.""" + import httpx + + from kars_runtime_hermes.plugin import sre + + mock_client = MagicMock() + response = MagicMock(status_code=403, reason_phrase="Forbidden") + mock_client.get.side_effect = httpx.HTTPStatusError( + "403", request=MagicMock(), response=response + ) + + with patch.object(sre.sre_kube, "client", return_value=mock_client): + result = sre.sre_describe_state() + + # Every kind got an error entry, but no exception bubbled up + for kind in result: + assert isinstance(result[kind], dict) + assert "error" in result[kind] + assert "403" in result[kind]["error"] From 5bdd29f63de0bca682b8dde5fa299338e4f0e759 Mon Sep 17 00:00:00 2001 From: Pal Lakatos-Toth Date: Tue, 9 Jun 2026 11:40:20 +0200 Subject: [PATCH 03/62] =?UTF-8?q?sre(s2):=20K8s=20diagnostic=20toolset=20?= =?UTF-8?q?=E2=80=94=20describe=5Fresource,=20what=5Fchanged,=20endpoints,?= =?UTF-8?q?=20image=5Fprobe,=20top?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Slice 2 of the kars-sre series. 
Extends the read-only diagnostic surface from kars-CR-centric (Slice 1) to arbitrary Kubernetes workloads — everything the agent needs to diagnose the Act II ResourceQuota incident end-to-end. What ships (5 new tools, all read-only): sre_describe_resource — structured-describe for any K8s kind. For workload kinds (Deployment / StatefulSet / DaemonSet) walks the OWNER GRAPH: workload → ReplicaSet → matching Pods → events on every level. One tool call returns the whole incident picture. sre_what_changed — events of failure-relevant reasons in last N minutes across BOTH core/v1 and events.k8s.io/v1. Surfaces FailedCreate, BackOff, OOMKilling, Evicted, etc. — the incident-framing tool. sre_endpoints_inspect — Service → selector → matching pods → EndpointSlice readiness. Synthesises a finding the agent can quote (no pods match, pods NotReady, targetPort mismatch, OK). sre_image_probe — given an image, enumerate Pod images cluster-wide and suggest the closest in-use tag by Levenshtein edit-distance. Doesn't reach out to the registry (per-registry auth plumbing is Slice 4+); instead answers the question that's actually most useful: 'what's the closest in-use tag on THIS cluster right now?' sre_top — metrics.k8s.io wrapper for CPU+memory per pod or per node. Gracefully degrades to {unavailable: 'metrics-server not installed'} if the metrics API isn't registered (proposal §7.5 Q4). Also extends sre_propose_fix to codify two more typed actions from proposal §7.7.1: PatchDeploymentImage and ScaleDeployment (in addition to Slice 1's DeleteResourceQuota). Slice 3 will widen the typed-action set further AND add the execution path. RBAC widened in deploy/helm/kars/templates/sre.yaml: + discovery.k8s.io/endpointslices (for sre_endpoints_inspect) + metrics.k8s.io/pods, nodes (for sre_top) + core/nodes, endpoints, resourcequotas (cluster-wide read) ToolPolicy extended to allow the 5 new tool names. 
Containment unchanged: still gated by KARS_SRE_ENABLED env on the SRE sandbox pod only; standard Hermes sandboxes don't see the env, don't load the tools, can't call them. Validation: pytest tests/test_sre.py tests/test_sre_k8s.py → 31/31 pass ci/check-copyright-headers.sh → all 502 OK helm lint --set sre.enabled=true → 0 fails python -m py_compile (sre.py, sre_k8s.py) → OK Next: Slice 3 (typed apply-fix + admission VAPs + TokenRequest path + kars sre approve CLI). Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- deploy/helm/kars/templates/sre.yaml | 26 +- .../src/kars_runtime_hermes/plugin/sre.py | 49 +- .../src/kars_runtime_hermes/plugin/sre_k8s.py | 1041 +++++++++++++++++ runtimes/hermes/tests/test_sre.py | 15 +- runtimes/hermes/tests/test_sre_k8s.py | 348 ++++++ 5 files changed, 1463 insertions(+), 16 deletions(-) create mode 100644 runtimes/hermes/src/kars_runtime_hermes/plugin/sre_k8s.py create mode 100644 runtimes/hermes/tests/test_sre_k8s.py diff --git a/deploy/helm/kars/templates/sre.yaml b/deploy/helm/kars/templates/sre.yaml index b486899e..efb4976a 100644 --- a/deploy/helm/kars/templates/sre.yaml +++ b/deploy/helm/kars/templates/sre.yaml @@ -150,7 +150,7 @@ spec: inline: | version: 1 rules: - # Read-only kars-CR diagnostic tools — no approval needed. + # Read-only kars-CR diagnostic tools (Slice 1) — no approval. - match: { tool: "sre_describe_state" } decision: allow - match: { tool: "sre_logs" } @@ -161,6 +161,17 @@ spec: decision: allow - match: { tool: "sre_propose_fix" } decision: allow + # Read-only K8s diagnostic toolset (Slice 2) — no approval. + - match: { tool: "sre_describe_resource" } + decision: allow + - match: { tool: "sre_what_changed" } + decision: allow + - match: { tool: "sre_endpoints_inspect" } + decision: allow + - match: { tool: "sre_image_probe" } + decision: allow + - match: { tool: "sre_top" } + decision: allow --- # kars-sre-reader ClusterRole — Slice 1 RBAC. 
# @@ -203,7 +214,7 @@ rules: # composition below; cluster-wide read on workloads is the Slice 2 # opt-in. - apiGroups: [""] - resources: ["pods", "pods/log", "services", "configmaps", "events", "namespaces", "serviceaccounts"] + resources: ["pods", "pods/log", "services", "configmaps", "events", "namespaces", "serviceaccounts", "nodes", "endpoints", "resourcequotas"] verbs: ["get", "list", "watch"] - apiGroups: ["apps"] resources: ["deployments", "statefulsets", "daemonsets", "replicasets"] @@ -211,6 +222,17 @@ rules: - apiGroups: ["events.k8s.io"] resources: ["events"] verbs: ["get", "list", "watch"] + # Slice 2 — EndpointSlices (the modern endpoints API) for + # sre_endpoints_inspect. + - apiGroups: ["discovery.k8s.io"] + resources: ["endpointslices"] + verbs: ["get", "list", "watch"] + # Slice 2 — metrics.k8s.io for sre_top. If metrics-server isn't + # installed, the SubjectAccessReview path returns no-op and the + # tool degrades gracefully per §7.5 Q4. + - apiGroups: ["metrics.k8s.io"] + resources: ["pods", "nodes"] + verbs: ["get", "list"] # Secrets metadata ONLY (the .data field is stripped by the # inference-router proxy filter per proposal §6.4). The RBAC verb # `get` returns full secret data; the router-side filter is the diff --git a/runtimes/hermes/src/kars_runtime_hermes/plugin/sre.py b/runtimes/hermes/src/kars_runtime_hermes/plugin/sre.py index ea654737..2fce3580 100644 --- a/runtimes/hermes/src/kars_runtime_hermes/plugin/sre.py +++ b/runtimes/hermes/src/kars_runtime_hermes/plugin/sre.py @@ -435,13 +435,13 @@ def sre_propose_fix( "execution_status": "proposed (Slice 1 — not executed; awaiting Slice 3 sre_apply_fix)", } - # Slice 1 understands ONE proposal shape: DeleteResourceQuota. - # The full typed-action set lands in Slice 3 alongside the - # apply-fix execution path. 
This single understanding lets the - # demo's Act II flow complete end-to-end via the runbook - # (operator runs `bash tools/demo/act2/reset.sh` after seeing the - # proposal — autonomous apply lands in Slice 3). - if target.get("kind") == "ResourceQuota": + target_kind = target.get("kind") + + # The typed-action set is the proposal §7.7.1 closed set. Slice 1+2 + # codify the actions the demo flow needs; the rest land in Slice 3 + # alongside the apply-fix execution path. Slice 1 returns the + # proposal envelope; the operator applies manually per the runbook. + if target_kind == "ResourceQuota": proposal["action"] = { "type": "DeleteResourceQuota", "namespace": target.get("namespace"), @@ -454,13 +454,36 @@ def sre_propose_fix( "the namespace's pod admission and the controller will " "schedule a fresh sandbox pod." ) + elif target_kind in {"Deployment", "StatefulSet", "DaemonSet"} and "image" in ( + _kwargs or {} + ): + proposal["action"] = { + "type": "PatchDeploymentImage", + "namespace": target.get("namespace"), + "name": target.get("name"), + "container": _kwargs.get("container"), + "image": _kwargs.get("image"), + } + proposal["rationale"] = ( + "Patch the container image to the proposed value. The target " + "namespace must not be in the protected denylist (kars-system, " + "kars-sre, kube-system, etc. — §7.7.1)." + ) + elif target_kind in {"Deployment", "StatefulSet"} and "replicas" in (_kwargs or {}): + proposal["action"] = { + "type": "ScaleDeployment", + "namespace": target.get("namespace"), + "name": target.get("name"), + "replicas": _kwargs.get("replicas"), + } + proposal["rationale"] = "Scale the workload's replica count." else: # Generic envelope for unknown target kinds — Slice 1 returns # the proposal text without a typed action; Slice 3 widens # the typed-action set. proposal["rationale"] = ( - "No typed action codified yet for this target kind in Slice 1. 
" - "The proposal text alone is returned; the operator can apply " + "No typed action codified yet for this target kind. The " + "proposal text alone is returned; the operator can apply " "manually per the demo runbook." ) @@ -600,4 +623,10 @@ def register(ctx: Any) -> None: # noqa: ANN401 — Hermes' ctx is dynamic handler=sre_propose_fix, ) - logger.info("kars-sre plugin registered (5 tools, read-only)") + # Slice 2 — register the K8s diagnostic toolset alongside the Slice 1 + # tools. sre_k8s.register() handles its own ctx wiring. + from . import sre_k8s # noqa: PLC0415 — lazy import + + sre_k8s.register(ctx) + + logger.info("kars-sre plugin registered (Slice 1: 5 read-only kars-CR tools; Slice 2: 5 K8s diag tools)") diff --git a/runtimes/hermes/src/kars_runtime_hermes/plugin/sre_k8s.py b/runtimes/hermes/src/kars_runtime_hermes/plugin/sre_k8s.py new file mode 100644 index 00000000..9c13817a --- /dev/null +++ b/runtimes/hermes/src/kars_runtime_hermes/plugin/sre_k8s.py @@ -0,0 +1,1041 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +"""kars-sre Hermes plugin — Slice 2 (K8s diagnostic toolset). + +Extends the read-only diagnostic surface from kars-CR-centric (Slice 1) +to arbitrary Kubernetes workloads. 
The tools registered here are the +ones needed to diagnose the Act II ResourceQuota incident end-to-end: + + sre_describe_resource structured-describe for any k8s resource + (Pod / Deployment / Service / Endpoints / + EndpointSlice / ResourceQuota / Node / + Event), with workload-owner-graph walk for + Deployment / StatefulSet / DaemonSet + sre_what_changed events of failure-relevant reasons in last + N min (default 15) across both core/v1 and + events.k8s.io/v1; framing the incident + sre_endpoints_inspect Service → selector → matching pods → + EndpointSlice subset → endpoint-not-ready + reasons (the '0 endpoints' detective tool) + sre_image_probe {image} → exists/not + digest + closest + in-use tag on this cluster (de-duplicated + across workloads) + sre_top metrics.k8s.io wrapper; graceful degrade if + metrics-server absent (§7.5 Q4) + +Registered alongside the Slice 1 tools by ``sre.register(ctx)`` when +``KARS_SRE_ENABLED=true``. The Helm chart's ClusterRole grants the +RBAC required for everything here at install time (Slice 2 is +strictly read-only). + +All tools follow the same contract as Slice 1 tools: they NEVER raise +on apiserver errors — those become ``{"error": "..."}`` entries in the +returned dict so the LLM can reason over them. +""" + +from __future__ import annotations + +import logging +import re +from collections import Counter +from typing import Any +from urllib.parse import quote + +import httpx + +from . 
import sre_kube + +logger = logging.getLogger("kars.hermes.sre.k8s") + + +# -------------------------------------------------------------------------- +# Apiserver paths +# -------------------------------------------------------------------------- + +# (kind, plural, api group/version segment) +# api group "" maps to /api/v1; others to /apis// +RESOURCE_PATHS: dict[str, tuple[str, str]] = { + "Pod": ("pods", "api/v1"), + "Service": ("services", "api/v1"), + "ConfigMap": ("configmaps", "api/v1"), + "Secret": ("secrets", "api/v1"), + "Event": ("events", "api/v1"), + "Node": ("nodes", "api/v1"), + "Namespace": ("namespaces", "api/v1"), + "ServiceAccount": ("serviceaccounts", "api/v1"), + "Endpoints": ("endpoints", "api/v1"), + "ResourceQuota": ("resourcequotas", "api/v1"), + "Deployment": ("deployments", "apis/apps/v1"), + "StatefulSet": ("statefulsets", "apis/apps/v1"), + "DaemonSet": ("daemonsets", "apis/apps/v1"), + "ReplicaSet": ("replicasets", "apis/apps/v1"), + "EndpointSlice": ("endpointslices", "apis/discovery.k8s.io/v1"), +} + +# Reasons we treat as "incident-flavoured" — these are the ones +# sre_what_changed surfaces. Sourced from kubelet, scheduler, and +# the controller-managers; intentionally excludes "Normal" reasons +# like Scheduled / Pulled / Started except for ScalingReplicaSet +# (which is what surfaces image/replica edits on Deployments). 
+WHAT_CHANGED_REASONS: set[str] = { + "Failed", + "FailedCreate", + "FailedDelete", + "FailedKillPod", + "FailedMount", + "FailedScheduling", + "BackOff", + "Unhealthy", + "OOMKilling", + "Evicted", + "Preempting", + "Killing", + "ScalingReplicaSet", + "SuccessfulCreate", + "SuccessfulDelete", + "DeadlineExceeded", +} + + +# -------------------------------------------------------------------------- +# sre_describe_resource +# -------------------------------------------------------------------------- + + +def _events_for_object( + kube: sre_kube.KubeClient, namespace: str, kind: str, name: str, limit: int = 25 +) -> list[dict[str, Any]]: + """Fetch recent events targeting a specific object. + + Uses core/v1 events with fieldSelector. The events.k8s.io/v1 events + have a different shape; we coalesce to a common dict at the call + site of sre_what_changed instead of here. + """ + field_selector = ( + f"involvedObject.kind={kind}," + f"involvedObject.name={name}," + f"involvedObject.namespace={namespace}" + ) + try: + doc = kube.get( + f"/api/v1/namespaces/{namespace}/events", + params={"fieldSelector": field_selector, "limit": limit}, + ) + events = [] + for ev in doc.get("items", []): + events.append( + { + "type": ev.get("type"), + "reason": ev.get("reason"), + "message": ev.get("message"), + "count": ev.get("count"), + "firstTimestamp": ev.get("firstTimestamp"), + "lastTimestamp": ev.get("lastTimestamp"), + "source": (ev.get("source") or {}).get("component"), + } + ) + return events + except Exception as exc: # noqa: BLE001 + logger.debug("events fetch failed for %s/%s/%s: %s", namespace, kind, name, exc) + return [] + + +def _summarise_pod(item: dict[str, Any]) -> dict[str, Any]: + """Reduce a Pod's JSON to the fields the agent cares about.""" + meta = item.get("metadata", {}) + spec = item.get("spec", {}) + status = item.get("status", {}) + containers_summary = [] + for cs in status.get("containerStatuses", []): + state = cs.get("state", {}) + last_state = 
cs.get("lastState", {}) + # The waiting reason (ImagePullBackOff, CrashLoopBackOff, etc.) + # lives at state.waiting.reason; the OOMKill etc. lives at + # lastState.terminated.reason. + waiting = state.get("waiting", {}) if state else {} + terminated_now = state.get("terminated", {}) if state else {} + terminated_last = last_state.get("terminated", {}) if last_state else {} + containers_summary.append( + { + "name": cs.get("name"), + "ready": cs.get("ready"), + "restartCount": cs.get("restartCount"), + "image": cs.get("image"), + "imageID": cs.get("imageID"), + "state": ( + "waiting" if waiting + else "terminated" if terminated_now + else "running" if state.get("running") + else "unknown" + ), + "waitingReason": waiting.get("reason"), + "waitingMessage": waiting.get("message"), + "lastTerminatedReason": terminated_last.get("reason"), + "lastExitCode": terminated_last.get("exitCode"), + } + ) + return { + "kind": "Pod", + "namespace": meta.get("namespace"), + "name": meta.get("name"), + "phase": status.get("phase"), + "nodeName": spec.get("nodeName"), + "serviceAccountName": spec.get("serviceAccountName"), + "imagePullSecrets": [s.get("name") for s in (spec.get("imagePullSecrets") or [])], + "conditions": [ + {"type": c.get("type"), "status": c.get("status"), "reason": c.get("reason"), "message": c.get("message")} + for c in (status.get("conditions") or []) + ], + "containers": containers_summary, + "ownerReferences": [ + {"kind": o.get("kind"), "name": o.get("name")} + for o in (meta.get("ownerReferences") or []) + ], + } + + +def _summarise_workload(item: dict[str, Any]) -> dict[str, Any]: + """Reduce a Deployment / StatefulSet / DaemonSet / ReplicaSet.""" + meta = item.get("metadata", {}) + spec = item.get("spec", {}) + status = item.get("status", {}) + template = spec.get("template", {}).get("spec", {}) + containers = [ + { + "name": c.get("name"), + "image": c.get("image"), + "resources": c.get("resources"), + } + for c in (template.get("containers") or []) + ] 
+ return { + "kind": item.get("kind", "Workload"), + "namespace": meta.get("namespace"), + "name": meta.get("name"), + "generation": meta.get("generation"), + "observedGeneration": status.get("observedGeneration"), + "replicas": status.get("replicas"), + "readyReplicas": status.get("readyReplicas"), + "availableReplicas": status.get("availableReplicas"), + "selector": spec.get("selector"), + "containers": containers, + "ownerReferences": [ + {"kind": o.get("kind"), "name": o.get("name")} + for o in (meta.get("ownerReferences") or []) + ], + "conditions": [ + {"type": c.get("type"), "status": c.get("status"), "reason": c.get("reason"), "message": c.get("message")} + for c in (status.get("conditions") or []) + ], + } + + +def _summarise_service(item: dict[str, Any]) -> dict[str, Any]: + meta = item.get("metadata", {}) + spec = item.get("spec", {}) + return { + "kind": "Service", + "namespace": meta.get("namespace"), + "name": meta.get("name"), + "type": spec.get("type"), + "selector": spec.get("selector"), + "ports": spec.get("ports"), + "clusterIP": spec.get("clusterIP"), + } + + +def _summarise_resource_quota(item: dict[str, Any]) -> dict[str, Any]: + meta = item.get("metadata", {}) + spec = item.get("spec", {}) + status = item.get("status", {}) + return { + "kind": "ResourceQuota", + "namespace": meta.get("namespace"), + "name": meta.get("name"), + "labels": meta.get("labels"), + "hard": spec.get("hard"), + "usedHard": status.get("hard"), + "used": status.get("used"), + # NOTE: The label `kars.azure.com/managed-by` is what gates + # whether the SRE agent's DeleteResourceQuota typed action + # (§7.7.1) is permitted on this resource. Surfacing it here + # lets the agent reason about whether a proposed delete is + # safe BEFORE proposing it. 
+ "isKarsManaged": (meta.get("labels") or {}).get("kars.azure.com/managed-by") == "controller", + } + + +def _walk_owner_graph( + kube: sre_kube.KubeClient, kind: str, namespace: str, name: str +) -> dict[str, Any]: + """For a Deployment/StatefulSet/DaemonSet, walk down to pods + events. + + Returns: + { + "workload": {...summarised...}, + "replica_sets": [...], # only for Deployment + "pods": [...], + "events_on_workload": [...], + "events_on_replica_sets": [...], + "events_on_pods": [...], + } + """ + out: dict[str, Any] = {} + plural, api_seg = RESOURCE_PATHS[kind] + + # 1) The workload itself + try: + wl = kube.get(f"/{api_seg}/namespaces/{namespace}/{plural}/{name}") + wl["kind"] = kind # ensure kind is populated on items fetched by-name + out["workload"] = _summarise_workload(wl) + except httpx.HTTPStatusError as exc: + out["workload"] = {"error": f"{exc.response.status_code} {exc.response.reason_phrase}"} + return out + except Exception as exc: # noqa: BLE001 + out["workload"] = {"error": str(exc)} + return out + + # 2) For Deployments, walk through ReplicaSets + selector = (wl.get("spec") or {}).get("selector") or {} + match_labels = selector.get("matchLabels") or {} + label_selector = ",".join(f"{k}={v}" for k, v in match_labels.items()) + + if kind == "Deployment" and label_selector: + try: + rs_doc = kube.get( + f"/apis/apps/v1/namespaces/{namespace}/replicasets", + params={"labelSelector": label_selector}, + ) + out["replica_sets"] = [ + _summarise_workload({**rs, "kind": "ReplicaSet"}) + for rs in rs_doc.get("items", []) + ] + except Exception as exc: # noqa: BLE001 + out["replica_sets"] = {"error": str(exc)} + + # 3) Pods matching the selector + out["pods"] = [] + if label_selector: + try: + pod_doc = kube.get( + f"/api/v1/namespaces/{namespace}/pods", + params={"labelSelector": label_selector}, + ) + out["pods"] = [_summarise_pod(p) for p in pod_doc.get("items", [])] + except Exception as exc: # noqa: BLE001 + out["pods"] = {"error": str(exc)} + + # 
4) Events on the workload + replica sets + pods (helps the agent + # spot 'exceeded quota' on the RS, not just on the workload) + out["events_on_workload"] = _events_for_object(kube, namespace, kind, name) + if isinstance(out.get("replica_sets"), list): + rs_events = [] + for rs in out["replica_sets"]: + rs_events.extend( + _events_for_object(kube, namespace, "ReplicaSet", rs["name"]) + ) + out["events_on_replica_sets"] = rs_events + if isinstance(out.get("pods"), list): + pod_events = [] + for pod in out["pods"]: + pod_events.extend( + _events_for_object(kube, namespace, "Pod", pod["name"]) + ) + out["events_on_pods"] = pod_events + + return out + + +def sre_describe_resource( + *, + kind: str, + namespace: str | None = None, + name: str, + **_kwargs: Any, +) -> dict[str, Any]: + """Tool: structured-describe for any K8s resource. + + For Pod / Service / ResourceQuota / ConfigMap etc. — returns a + structured summary + recent events on the object. + + For Deployment / StatefulSet / DaemonSet — walks the workload + owner graph: workload → ReplicaSets (for Deployments) → matching + Pods → events on every level. This is THE diagnostic shortcut + for incidents like ImagePullBackOff, exceeded-quota, + CrashLoopBackOff — one tool call returns the whole picture. + + Args: + kind: K8s kind, e.g. "Pod", "Deployment", "ResourceQuota". + namespace: namespace (required for namespaced kinds). + name: resource name. 
+ """ + if kind not in RESOURCE_PATHS: + return { + "error": f"unknown kind: {kind}", + "supported_kinds": sorted(RESOURCE_PATHS.keys()), + } + + # Owner-graph walk for workload kinds + if kind in {"Deployment", "StatefulSet", "DaemonSet"}: + if not namespace: + return {"error": f"{kind} is namespaced — provide namespace"} + return _walk_owner_graph(sre_kube.client(), kind, namespace, name) + + # Direct describe for other kinds + plural, api_seg = RESOURCE_PATHS[kind] + if namespace: + path = f"/{api_seg}/namespaces/{namespace}/{plural}/{name}" + else: + path = f"/{api_seg}/{plural}/{name}" + kube = sre_kube.client() + try: + item = kube.get(path) + item["kind"] = kind # ensure populated + except httpx.HTTPStatusError as exc: + return { + "kind": kind, + "name": name, + "namespace": namespace, + "error": f"{exc.response.status_code} {exc.response.reason_phrase}", + } + except Exception as exc: # noqa: BLE001 + return {"kind": kind, "name": name, "namespace": namespace, "error": str(exc)} + + summariser = { + "Pod": _summarise_pod, + "Deployment": _summarise_workload, + "StatefulSet": _summarise_workload, + "DaemonSet": _summarise_workload, + "ReplicaSet": _summarise_workload, + "Service": _summarise_service, + "ResourceQuota": _summarise_resource_quota, + }.get(kind) + + summary: dict[str, Any] + if summariser: + summary = summariser(item) + else: + # Generic fallback for ConfigMap / Secret / Node / etc. + meta = item.get("metadata", {}) + summary = { + "kind": kind, + "namespace": meta.get("namespace"), + "name": meta.get("name"), + "labels": meta.get("labels"), + "annotations": meta.get("annotations"), + "creationTimestamp": meta.get("creationTimestamp"), + } + # Type-specific fields + if kind == "ConfigMap": + summary["data_keys"] = list((item.get("data") or {}).keys()) + elif kind == "Secret": + # NEVER include .data — strip per §6.4 (router proxy also + # strips, but defense in depth at the plugin layer too). 
def sre_what_changed(
    *,
    namespace: str | None = None,
    minutes: int = 15,
    **_kwargs: Any,
) -> dict[str, Any]:
    """Tool: events of failure-relevant reasons in the last N minutes.

    Surfaces events from BOTH ``core/v1/events`` (older API) and
    ``events.k8s.io/v1/events`` (newer API) — they have different
    shapes; the agent should not have to know which is in play.

    Only events whose reason is in ``WHAT_CHANGED_REASONS`` AND whose
    most recent timestamp falls inside the lookback window are
    returned. An event that carries no parseable timestamp at all is
    kept rather than hidden, so a malformed event cannot mask an
    incident.

    Args:
        namespace: limit to one namespace (omit for cluster-wide).
        minutes: lookback window (default 15, clamped to 1..60). A value
            that cannot be read as an integer falls back to 15.

    Returns:
        {
            "since_minutes": N,
            "namespace": "..." or "*",
            "events_core": [...],
            "events_new": [...],
        }

        If one of the two listings fails, its entry is replaced by
        ``{"error": "..."}``; the function itself does not raise.
    """
    # Local import: the block stays self-contained without touching the
    # module's import section.
    from datetime import datetime, timedelta, timezone  # noqa: PLC0415

    try:
        minutes = int(minutes)
    except (TypeError, ValueError):
        minutes = 15
    minutes = max(1, min(minutes, 60))
    cutoff = datetime.now(timezone.utc) - timedelta(minutes=minutes)

    def _parse_ts(raw: Any) -> datetime | None:
        """Parse an RFC 3339 timestamp string; None if absent/unparseable."""
        if not isinstance(raw, str) or not raw:
            return None
        try:
            # fromisoformat() only accepts a trailing "Z" on newer Pythons.
            parsed = datetime.fromisoformat(raw.replace("Z", "+00:00"))
        except ValueError:
            return None
        return parsed if parsed.tzinfo else parsed.replace(tzinfo=timezone.utc)

    def _is_recent(*candidates: Any) -> bool:
        """True if the first parseable candidate is inside the window."""
        for raw in candidates:
            stamp = _parse_ts(raw)
            if stamp is not None:
                return stamp >= cutoff
        return True  # no usable timestamp — keep the event visible

    kube = sre_kube.client()

    out: dict[str, Any] = {
        "since_minutes": minutes,
        "namespace": namespace or "*",
        "events_core": [],
        "events_new": [],
    }

    # core/v1/events
    if namespace:
        core_path = f"/api/v1/namespaces/{namespace}/events"
    else:
        core_path = "/api/v1/events"
    try:
        doc = kube.get(core_path, params={"limit": 200})
        for ev in doc.get("items", []):
            reason = ev.get("reason")
            if reason not in WHAT_CHANGED_REASONS:
                continue
            if not _is_recent(
                (ev.get("series") or {}).get("lastObservedTime"),
                ev.get("lastTimestamp"),
                ev.get("eventTime"),
                ev.get("firstTimestamp"),
                (ev.get("metadata") or {}).get("creationTimestamp"),
            ):
                continue
            involved = ev.get("involvedObject") or {}
            out["events_core"].append(
                {
                    "namespace": involved.get("namespace"),
                    "kind": involved.get("kind"),
                    "name": involved.get("name"),
                    "type": ev.get("type"),
                    "reason": reason,
                    "message": ev.get("message"),
                    "count": ev.get("count"),
                    "lastTimestamp": ev.get("lastTimestamp"),
                }
            )
    except Exception as exc:  # noqa: BLE001
        out["events_core"] = {"error": str(exc)}

    # events.k8s.io/v1/events
    if namespace:
        new_path = f"/apis/events.k8s.io/v1/namespaces/{namespace}/events"
    else:
        new_path = "/apis/events.k8s.io/v1/events"
    try:
        doc = kube.get(new_path, params={"limit": 200})
        for ev in doc.get("items", []):
            reason = ev.get("reason")
            if reason not in WHAT_CHANGED_REASONS:
                continue
            if not _is_recent(
                (ev.get("series") or {}).get("lastObservedTime"),
                ev.get("deprecatedLastTimestamp"),
                ev.get("eventTime"),
                ev.get("deprecatedFirstTimestamp"),
                (ev.get("metadata") or {}).get("creationTimestamp"),
            ):
                continue
            regarding = ev.get("regarding") or {}
            out["events_new"].append(
                {
                    "namespace": regarding.get("namespace"),
                    "kind": regarding.get("kind"),
                    "name": regarding.get("name"),
                    "type": ev.get("type"),
                    "reason": reason,
                    "note": ev.get("note"),
                    "deprecatedCount": ev.get("deprecatedCount"),
                    "eventTime": ev.get("eventTime"),
                }
            )
    except Exception as exc:  # noqa: BLE001
        out["events_new"] = {"error": str(exc)}

    return out
namespace: str, + service: str, + **_kwargs: Any, +) -> dict[str, Any]: + """Tool: Service → selector → matching pods → EndpointSlice readiness. + + The "0 endpoints" detective tool. Answers: why isn't this Service + routing traffic? Walks: + + 1. Fetch Service spec, capture its selector + 2. List Pods matching the selector + 3. List EndpointSlices in the namespace owned by the Service + 4. Surface the diff: pods that match the selector but are not + in any EndpointSlice subset (suggests readiness-probe + failures), and the EndpointSlice's not-ready conditions for + each endpoint. + """ + kube = sre_kube.client() + out: dict[str, Any] = {"namespace": namespace, "service": service} + + # 1) Service + try: + svc = kube.get(f"/api/v1/namespaces/{namespace}/services/{service}") + except httpx.HTTPStatusError as exc: + return {**out, "error": f"{exc.response.status_code} {exc.response.reason_phrase}"} + except Exception as exc: # noqa: BLE001 + return {**out, "error": str(exc)} + + selector = (svc.get("spec") or {}).get("selector") or {} + out["selector"] = selector + out["service_type"] = (svc.get("spec") or {}).get("type") + if not selector: + out["finding"] = ( + "Service has no selector — endpoints are managed externally " + "(or via the headless / ExternalName pattern). No further " + "diagnosis from this tool." 
+ ) + return out + + # 2) Pods matching the selector + label_selector = ",".join(f"{k}={v}" for k, v in selector.items()) + try: + pod_doc = kube.get( + f"/api/v1/namespaces/{namespace}/pods", + params={"labelSelector": label_selector}, + ) + out["matching_pods"] = [ + { + "name": p.get("metadata", {}).get("name"), + "phase": (p.get("status") or {}).get("phase"), + "podIP": (p.get("status") or {}).get("podIP"), + "ready": all( + c.get("status") == "True" + for c in ((p.get("status") or {}).get("conditions") or []) + if c.get("type") == "Ready" + ), + } + for p in pod_doc.get("items", []) + ] + except Exception as exc: # noqa: BLE001 + out["matching_pods"] = {"error": str(exc)} + + # 3) EndpointSlices owned by the service + try: + es_doc = kube.get( + f"/apis/discovery.k8s.io/v1/namespaces/{namespace}/endpointslices", + params={"labelSelector": f"kubernetes.io/service-name={service}"}, + ) + slices = [] + for es in es_doc.get("items", []): + endpoints = [] + for ep in es.get("endpoints", []): + endpoints.append( + { + "addresses": ep.get("addresses"), + "conditions": ep.get("conditions"), + "targetRef": ep.get("targetRef"), + } + ) + slices.append( + { + "name": es.get("metadata", {}).get("name"), + "addressType": es.get("addressType"), + "endpoints": endpoints, + } + ) + out["endpoint_slices"] = slices + except Exception as exc: # noqa: BLE001 + out["endpoint_slices"] = {"error": str(exc)} + + # 4) Synthesise a finding + n_pods = len(out.get("matching_pods", [])) if isinstance(out.get("matching_pods"), list) else 0 + n_ready = sum( + 1 for p in (out.get("matching_pods") or []) if isinstance(p, dict) and p.get("ready") + ) + n_endpoints = 0 + if isinstance(out.get("endpoint_slices"), list): + for es in out["endpoint_slices"]: + for ep in es.get("endpoints", []): + if (ep.get("conditions") or {}).get("ready"): + n_endpoints += sum(1 for _ in (ep.get("addresses") or [])) + + if n_pods == 0: + out["finding"] = ( + "No pods match the service's selector. 
Either the workload " + "isn't deployed, or its labels were changed to not match. " + "Check the controlling Deployment/StatefulSet for the " + "current pod-template labels." + ) + elif n_ready == 0 and n_pods > 0: + out["finding"] = ( + f"{n_pods} pod(s) match the selector but none are Ready. " + "Likely cause: readiness probe failing, container startup " + "error, or workload-config bug. Use sre_describe_resource " + "on the pods + sre_logs to find the root cause." + ) + elif n_endpoints == 0: + out["finding"] = ( + f"{n_ready}/{n_pods} pod(s) are Ready but the EndpointSlice " + "has zero ready addresses. Likely cause: the Service's " + "targetPort doesn't match any container port on the pods, " + "or the EndpointSlice controller is lagging." + ) + else: + out["finding"] = ( + f"{n_endpoints} endpoint(s) ready across " + f"{len(out.get('endpoint_slices', []))} slice(s). Service " + "should be routing traffic." + ) + return out + + +# -------------------------------------------------------------------------- +# sre_image_probe +# -------------------------------------------------------------------------- + + +_IMAGE_RE = re.compile( + r"^(?P[a-z0-9.\-]+(?::\d+)?/)?" + r"(?P[a-z0-9._/\-]+?)" + r"(?::(?P[A-Za-z0-9_.\-]+))?" + r"(?:@(?Psha256:[a-f0-9]+))?$" +) + + +def _parse_image(image: str) -> dict[str, str | None]: + m = _IMAGE_RE.match(image.strip()) + if not m: + return {"registry": None, "repo": image, "tag": None, "digest": None} + parts: dict[str, str | None] = {**m.groupdict()} + if parts.get("registry"): + parts["registry"] = parts["registry"].rstrip("/") + return parts + + +def _all_images_in_use(kube: sre_kube.KubeClient) -> Counter[str]: + """Return a Counter of every container image observed on the cluster. + + Walks Pods cluster-wide. Used by ``sre_image_probe`` to surface + the "closest tag in use on this cluster" suggestion when an + operator's image string doesn't exist. 
+ """ + counts: Counter[str] = Counter() + try: + doc = kube.get("/api/v1/pods", params={"limit": 500}) + for p in doc.get("items", []): + for c in (p.get("spec") or {}).get("containers") or []: + img = c.get("image") + if img: + counts[img] += 1 + for c in (p.get("spec") or {}).get("initContainers") or []: + img = c.get("image") + if img: + counts[img] += 1 + except Exception as exc: # noqa: BLE001 + logger.debug("could not enumerate cluster images: %s", exc) + return counts + + +def _edit_distance(a: str, b: str) -> int: + """Levenshtein distance — small, ~30-LOC pure-python implementation + sufficient for our 'closest tag' suggestion (image tags are short).""" + if a == b: + return 0 + if len(a) < len(b): + a, b = b, a + prev = list(range(len(b) + 1)) + for i, ca in enumerate(a, 1): + curr = [i] + [0] * len(b) + for j, cb in enumerate(b, 1): + curr[j] = min( + prev[j] + 1, # delete + curr[j - 1] + 1, # insert + prev[j - 1] + (ca != cb), # substitute + ) + prev = curr + return prev[-1] + + +def sre_image_probe(*, image: str, **_kwargs: Any) -> dict[str, Any]: + """Tool: probe an image reference and suggest closest in-use tags. + + Slice 2 implementation: does NOT actually reach out to a registry + (that requires registry-auth plumbing per registry, which lands in + Slice 4+). Instead, it answers the question that's actually most + useful in incidents — "what tags of this repo are in use on this + cluster RIGHT NOW?" — by enumerating Pods. 
+ + Returns: + { + "image": , + "parsed": {registry, repo, tag, digest}, + "in_use_on_cluster": [{image, count}, ...], + "closest_in_use": | None, + "advice": , + } + """ + parsed = _parse_image(image) + kube = sre_kube.client() + + all_images = _all_images_in_use(kube) + + # Find images that share the same repo prefix + repo = parsed.get("repo") or "" + same_repo: list[tuple[str, int]] = [] + for img, count in all_images.items(): + p = _parse_image(img) + if p.get("repo") == repo and ( + parsed.get("registry") is None or p.get("registry") == parsed.get("registry") + ): + same_repo.append((img, count)) + same_repo.sort(key=lambda t: t[1], reverse=True) + + # Closest tag by edit distance against the requested tag + closest: str | None = None + if parsed.get("tag") and same_repo: + best_dist = 10**9 + for img, _count in same_repo: + p = _parse_image(img) + if p.get("tag"): + d = _edit_distance(parsed["tag"], p["tag"]) # type: ignore[arg-type] + if d < best_dist: + best_dist = d + closest = img + + advice: str + if not same_repo: + advice = ( + f"No pod on this cluster currently uses the repo {repo!r}. The " + "image may not exist, or this is the first deployment of it. " + "Slice 4+ adds a real registry probe to confirm; for now, " + "verify the registry / repo path is spelled correctly." + ) + elif closest and closest != image: + advice = ( + f"Image {image!r} is not currently used on this cluster, but " + f"{closest!r} is (running in {dict(same_repo).get(closest, 0)} " + "pod(s)). If the failing image string contains a typo, this is " + "the closest match by edit-distance." + ) + else: + advice = ( + f"Image {image!r} matches an image currently in use on the " + "cluster. The failure is likely registry-side (auth, throttle, " + "outage) rather than a typo." 
+ ) + + return { + "image": image, + "parsed": parsed, + "in_use_on_cluster": [{"image": img, "count": count} for img, count in same_repo[:10]], + "closest_in_use": closest, + "advice": advice, + } + + +# -------------------------------------------------------------------------- +# sre_top +# -------------------------------------------------------------------------- + + +def sre_top( + *, + scope: str = "pods", + namespace: str | None = None, + **_kwargs: Any, +) -> dict[str, Any]: + """Tool: metrics.k8s.io wrapper for pod / node CPU + memory. + + Args: + scope: "pods" or "nodes". + namespace: required for scope=pods if filtering to one ns. + + Returns ``{"unavailable": "..."}`` when metrics-server is absent + (the agent's planner routes around it per §7.5 Q4). + """ + kube = sre_kube.client() + if scope == "nodes": + path = "/apis/metrics.k8s.io/v1beta1/nodes" + elif scope == "pods": + if namespace: + path = f"/apis/metrics.k8s.io/v1beta1/namespaces/{quote(namespace)}/pods" + else: + path = "/apis/metrics.k8s.io/v1beta1/pods" + else: + return {"error": f"unknown scope: {scope}", "valid_scopes": ["pods", "nodes"]} + + try: + doc = kube.get(path) + except httpx.HTTPStatusError as exc: + # 404 = metrics-server not registered as an APIService. 
+ if exc.response.status_code == 404: + return { + "unavailable": "metrics-server is not installed on this cluster.", + "scope": scope, + } + return {"error": f"{exc.response.status_code} {exc.response.reason_phrase}"} + except Exception as exc: # noqa: BLE001 + return {"error": str(exc)} + + items = [] + for it in doc.get("items", []): + meta = it.get("metadata", {}) + if scope == "nodes": + usage = it.get("usage") or {} + items.append( + { + "name": meta.get("name"), + "cpu": usage.get("cpu"), + "memory": usage.get("memory"), + "timestamp": it.get("timestamp"), + } + ) + else: + containers = [ + { + "name": c.get("name"), + "cpu": (c.get("usage") or {}).get("cpu"), + "memory": (c.get("usage") or {}).get("memory"), + } + for c in (it.get("containers") or []) + ] + items.append( + { + "namespace": meta.get("namespace"), + "name": meta.get("name"), + "containers": containers, + "timestamp": it.get("timestamp"), + } + ) + return {"scope": scope, "items": items} + + +# -------------------------------------------------------------------------- +# Plugin registration +# -------------------------------------------------------------------------- + + +def register(ctx: Any) -> None: # noqa: ANN401 — Hermes' ctx is dynamic + """Register the Slice 2 K8s diagnostic tools. + + Called from ``sre.register()`` alongside the Slice 1 tools when + ``KARS_SRE_ENABLED=true``. + """ + register_tool = getattr(ctx, "register_tool", None) + if not callable(register_tool): + logger.warning("Hermes ctx has no register_tool — Slice 2 SRE tools not registered") + return + + register_tool( + name="sre_describe_resource", + description=( + "Structured-describe for any K8s resource (Pod, Deployment, " + "Service, ResourceQuota, ConfigMap, Secret metadata only, " + "EndpointSlice, Node, Event, etc.). For workload kinds " + "(Deployment, StatefulSet, DaemonSet) walks the owner graph: " + "workload → ReplicaSet → Pods → events on every level. 
This " + "is THE single-call diagnostic for most workload incidents." + ), + parameters={ + "type": "object", + "properties": { + "kind": { + "type": "string", + "description": "K8s kind, e.g. Pod, Deployment, ResourceQuota", + }, + "namespace": { + "type": "string", + "description": "Namespace (required for namespaced kinds)", + }, + "name": {"type": "string", "description": "Resource name"}, + }, + "required": ["kind", "name"], + }, + handler=sre_describe_resource, + ) + + register_tool( + name="sre_what_changed", + description=( + "Events of failure-relevant reasons in the last N minutes " + "across core/v1 + events.k8s.io/v1. Use FIRST in an incident " + "to frame the time-window: what broke when?" + ), + parameters={ + "type": "object", + "properties": { + "namespace": { + "type": "string", + "description": "Limit to one namespace; omit for cluster-wide", + }, + "minutes": { + "type": "integer", + "description": "Lookback window in minutes (1-60, default 15)", + "default": 15, + }, + }, + "required": [], + }, + handler=sre_what_changed, + ) + + register_tool( + name="sre_endpoints_inspect", + description=( + "Service → selector → matching pods → EndpointSlice readiness. " + "Diagnoses 'service has no endpoints' incidents: are there pods " + "matching the selector? are they Ready? are they in the " + "EndpointSlice? Returns a finding summary the agent can quote." + ), + parameters={ + "type": "object", + "properties": { + "namespace": {"type": "string"}, + "service": {"type": "string"}, + }, + "required": ["namespace", "service"], + }, + handler=sre_endpoints_inspect, + ) + + register_tool( + name="sre_image_probe", + description=( + "Given an image reference, return: (a) what tags of the same " + "repo are CURRENTLY IN USE on this cluster, (b) the closest " + "match by edit-distance to the requested tag. Use after " + "sre_describe_resource shows ImagePullBackOff." 
+ ), + parameters={ + "type": "object", + "properties": { + "image": { + "type": "string", + "description": "Image reference, e.g. 'nginx:1.27.3'", + }, + }, + "required": ["image"], + }, + handler=sre_image_probe, + ) + + register_tool( + name="sre_top", + description=( + "CPU + memory usage per pod or per node (metrics.k8s.io). " + "Returns {unavailable: 'metrics-server not installed'} if " + "the metrics API isn't registered — the agent's planner " + "routes around it." + ), + parameters={ + "type": "object", + "properties": { + "scope": { + "type": "string", + "enum": ["pods", "nodes"], + "default": "pods", + }, + "namespace": { + "type": "string", + "description": "Required for scope=pods; omit for cluster-wide", + }, + }, + "required": [], + }, + handler=sre_top, + ) + + logger.info("kars-sre Slice 2 (K8s diagnostic toolset) registered — 5 tools") diff --git a/runtimes/hermes/tests/test_sre.py b/runtimes/hermes/tests/test_sre.py index 808c9c32..8fee227a 100644 --- a/runtimes/hermes/tests/test_sre.py +++ b/runtimes/hermes/tests/test_sre.py @@ -52,12 +52,12 @@ def test_register_skips_when_disabled() -> None: # so calling register() directly DOES register tools. That's # fine for now (we're testing the __init__.py path elsewhere). 
sre.register(ctx) - # 5 tool registrations expected - assert ctx.register_tool.call_count == 5 + # 5 Slice-1 + 5 Slice-2 = 10 tool registrations expected + assert ctx.register_tool.call_count == 10 -def test_register_registers_five_tools() -> None: - """register(ctx) registers exactly the five Slice 1 tools.""" +def test_register_registers_all_ten_tools() -> None: + """register(ctx) registers exactly the Slice 1 + Slice 2 tools.""" from kars_runtime_hermes.plugin import sre ctx = MagicMock() @@ -65,11 +65,18 @@ def test_register_registers_five_tools() -> None: tool_names = {call.kwargs["name"] for call in ctx.register_tool.call_args_list} expected = { + # Slice 1 — read-only kars-CR tools "sre_describe_state", "sre_logs", "sre_diagnose", "sre_explain_error", "sre_propose_fix", + # Slice 2 — K8s diagnostic toolset + "sre_describe_resource", + "sre_what_changed", + "sre_endpoints_inspect", + "sre_image_probe", + "sre_top", } assert tool_names == expected, f"got {tool_names}, expected {expected}" diff --git a/runtimes/hermes/tests/test_sre_k8s.py b/runtimes/hermes/tests/test_sre_k8s.py new file mode 100644 index 00000000..bfa82ce9 --- /dev/null +++ b/runtimes/hermes/tests/test_sre_k8s.py @@ -0,0 +1,348 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
+ +"""kars-sre Slice 2 (K8s diagnostic toolset) tests.""" + +from __future__ import annotations + +from typing import Any +from unittest.mock import MagicMock, patch + +import httpx + + +def test_register_registers_five_slice2_tools() -> None: + from kars_runtime_hermes.plugin import sre_k8s + + ctx = MagicMock() + sre_k8s.register(ctx) + tool_names = {call.kwargs["name"] for call in ctx.register_tool.call_args_list} + assert tool_names == { + "sre_describe_resource", + "sre_what_changed", + "sre_endpoints_inspect", + "sre_image_probe", + "sre_top", + } + + +def test_describe_resource_unknown_kind() -> None: + from kars_runtime_hermes.plugin import sre_k8s + + result = sre_k8s.sre_describe_resource(kind="UnknownKind", name="x") + assert "error" in result + assert "supported_kinds" in result + + +def test_describe_resource_resource_quota() -> None: + """ResourceQuota describe surfaces the kars-managed label.""" + from kars_runtime_hermes.plugin import sre_k8s + + quota_doc = { + "metadata": { + "namespace": "kars-research", + "name": "platform-hardening-quota", + "labels": { + "app.kubernetes.io/managed-by": "gitops-platform", + }, + }, + "spec": {"hard": {"requests.memory": "50Mi"}}, + "status": {"used": {"requests.memory": "0"}}, + } + mock_client = MagicMock() + mock_client.get.side_effect = [quota_doc, {"items": []}] # quota + events + with patch.object(sre_k8s.sre_kube, "client", return_value=mock_client): + result = sre_k8s.sre_describe_resource( + kind="ResourceQuota", + namespace="kars-research", + name="platform-hardening-quota", + ) + assert result["kind"] == "ResourceQuota" + assert result["name"] == "platform-hardening-quota" + assert result["hard"] == {"requests.memory": "50Mi"} + # Crucially, the SRE agent must be able to tell this is NOT + # kars-managed (label doesn't have managed-by=controller) — so + # DeleteResourceQuota is permitted on this resource. 
+ assert result["isKarsManaged"] is False + + +def test_describe_resource_resource_quota_kars_managed() -> None: + """ResourceQuota labelled as kars-managed surfaces isKarsManaged=True.""" + from kars_runtime_hermes.plugin import sre_k8s + + quota_doc = { + "metadata": { + "namespace": "kars-sre", + "name": "sre-quota", + "labels": {"kars.azure.com/managed-by": "controller"}, + }, + "spec": {"hard": {"requests.memory": "1Gi"}}, + "status": {}, + } + mock_client = MagicMock() + mock_client.get.side_effect = [quota_doc, {"items": []}] + with patch.object(sre_k8s.sre_kube, "client", return_value=mock_client): + result = sre_k8s.sre_describe_resource( + kind="ResourceQuota", namespace="kars-sre", name="sre-quota" + ) + assert result["isKarsManaged"] is True + + +def test_describe_resource_deployment_owner_graph() -> None: + """A Deployment describe walks workload → RS → Pods → events.""" + from kars_runtime_hermes.plugin import sre_k8s + + deploy_doc = { + "kind": "Deployment", + "metadata": {"namespace": "kars-research", "name": "research", "generation": 1}, + "spec": { + "selector": {"matchLabels": {"app": "research"}}, + "template": { + "spec": { + "containers": [{"name": "openclaw", "image": "kars/hermes:latest"}] + } + }, + }, + "status": {"replicas": 1, "readyReplicas": 0, "availableReplicas": 0}, + } + rs_doc = { + "items": [ + { + "kind": "ReplicaSet", + "metadata": {"namespace": "kars-research", "name": "research-abc123"}, + "spec": {"selector": {"matchLabels": {"app": "research"}}}, + "status": {"replicas": 1, "readyReplicas": 0}, + } + ] + } + pod_doc = { + "items": [ + { + "metadata": {"namespace": "kars-research", "name": "research-abc123-xyz"}, + "spec": {"nodeName": None}, + "status": { + "phase": "Pending", + "containerStatuses": [], + "conditions": [], + }, + } + ] + } + mock_client = MagicMock() + # Workload, RS list, Pod list, then per-object events (3 calls — one for + # the Deployment, one for the RS, one for the Pod) + mock_client.get.side_effect 
= [ + deploy_doc, rs_doc, pod_doc, + {"items": []}, {"items": []}, {"items": []}, + ] + with patch.object(sre_k8s.sre_kube, "client", return_value=mock_client): + result = sre_k8s.sre_describe_resource( + kind="Deployment", namespace="kars-research", name="research" + ) + assert "workload" in result + assert result["workload"]["name"] == "research" + assert "pods" in result + assert isinstance(result["pods"], list) + assert len(result["pods"]) == 1 + assert result["pods"][0]["phase"] == "Pending" + + +def test_describe_resource_handles_404_gracefully() -> None: + """A 404 on the workload doesn't raise — surfaces as {error: ...}.""" + from kars_runtime_hermes.plugin import sre_k8s + + mock_client = MagicMock() + response = MagicMock(status_code=404, reason_phrase="Not Found") + mock_client.get.side_effect = httpx.HTTPStatusError("404", request=MagicMock(), response=response) + with patch.object(sre_k8s.sre_kube, "client", return_value=mock_client): + result = sre_k8s.sre_describe_resource( + kind="Pod", namespace="kars-research", name="missing" + ) + assert "error" in result + assert "404" in result["error"] + + +def test_what_changed_filters_to_failure_reasons() -> None: + """Only events with reasons in WHAT_CHANGED_REASONS surface.""" + from kars_runtime_hermes.plugin import sre_k8s + + core_doc = { + "items": [ + { + "involvedObject": {"kind": "ReplicaSet", "namespace": "kars-research", "name": "research-abc"}, + "type": "Warning", + "reason": "FailedCreate", + "message": "pods is forbidden: exceeded quota", + "count": 5, + "lastTimestamp": "2026-06-09T10:50:00Z", + }, + { + "involvedObject": {"kind": "Pod", "namespace": "kars-research", "name": "research-xyz"}, + "type": "Normal", + "reason": "Scheduled", # NOT in WHAT_CHANGED_REASONS — should be filtered out + "message": "Successfully assigned", + }, + ] + } + new_doc = {"items": []} + mock_client = MagicMock() + mock_client.get.side_effect = [core_doc, new_doc] + with patch.object(sre_k8s.sre_kube, "client", 
return_value=mock_client): + result = sre_k8s.sre_what_changed(namespace="kars-research", minutes=15) + assert len(result["events_core"]) == 1 + assert result["events_core"][0]["reason"] == "FailedCreate" + assert "exceeded quota" in result["events_core"][0]["message"] + + +def test_endpoints_inspect_zero_endpoints_finding() -> None: + """Service with pods that are NotReady → finding describes the issue.""" + from kars_runtime_hermes.plugin import sre_k8s + + svc_doc = { + "spec": {"selector": {"app": "research"}, "type": "ClusterIP"}, + } + pod_doc = { + "items": [ + { + "metadata": {"name": "research-1"}, + "status": { + "phase": "Running", + "podIP": "10.244.0.5", + "conditions": [{"type": "Ready", "status": "False"}], + }, + }, + { + "metadata": {"name": "research-2"}, + "status": { + "phase": "Running", + "podIP": "10.244.0.6", + "conditions": [{"type": "Ready", "status": "False"}], + }, + }, + ] + } + es_doc = {"items": []} + mock_client = MagicMock() + mock_client.get.side_effect = [svc_doc, pod_doc, es_doc] + with patch.object(sre_k8s.sre_kube, "client", return_value=mock_client): + result = sre_k8s.sre_endpoints_inspect(namespace="kars-research", service="research") + assert result["selector"] == {"app": "research"} + assert len(result["matching_pods"]) == 2 + # Both pods are NotReady → finding should call that out + assert "none are Ready" in result["finding"] + + +def test_endpoints_inspect_pod_selector_mismatch() -> None: + """Service whose selector matches no pods → clear finding.""" + from kars_runtime_hermes.plugin import sre_k8s + + svc_doc = {"spec": {"selector": {"app": "wrong-name"}, "type": "ClusterIP"}} + pod_doc = {"items": []} + es_doc = {"items": []} + mock_client = MagicMock() + mock_client.get.side_effect = [svc_doc, pod_doc, es_doc] + with patch.object(sre_k8s.sre_kube, "client", return_value=mock_client): + result = sre_k8s.sre_endpoints_inspect(namespace="kars-research", service="research") + assert "No pods match" in result["finding"] 
+ + +def test_image_probe_parses_canonical_image_string() -> None: + from kars_runtime_hermes.plugin import sre_k8s + + parsed = sre_k8s._parse_image("docker.io/nginx:1.27.3") + assert parsed["registry"] == "docker.io" + assert parsed["repo"] == "nginx" + assert parsed["tag"] == "1.27.3" + + parsed = sre_k8s._parse_image("nginx:1.27-typo") + assert parsed["repo"] == "nginx" + assert parsed["tag"] == "1.27-typo" + + +def test_image_probe_finds_closest_tag_in_use() -> None: + """When the requested image isn't in use but a similar one is, suggest it.""" + from kars_runtime_hermes.plugin import sre_k8s + + pod_doc = { + "items": [ + {"spec": {"containers": [{"image": "nginx:1.27.3"}], "initContainers": []}}, + {"spec": {"containers": [{"image": "nginx:1.27.3"}], "initContainers": []}}, + {"spec": {"containers": [{"image": "redis:7"}], "initContainers": []}}, + ] + } + mock_client = MagicMock() + mock_client.get.return_value = pod_doc + with patch.object(sre_k8s.sre_kube, "client", return_value=mock_client): + result = sre_k8s.sre_image_probe(image="nginx:1.27-typo") + # The closest in-use match for nginx:1.27-typo is nginx:1.27.3 + assert result["closest_in_use"] == "nginx:1.27.3" + assert "typo" in result["advice"].lower() or "edit-distance" in result["advice"] + assert len(result["in_use_on_cluster"]) >= 1 + + +def test_image_probe_no_pods_use_repo() -> None: + from kars_runtime_hermes.plugin import sre_k8s + + pod_doc = {"items": []} + mock_client = MagicMock() + mock_client.get.return_value = pod_doc + with patch.object(sre_k8s.sre_kube, "client", return_value=mock_client): + result = sre_k8s.sre_image_probe(image="newrepo:v1") + assert result["in_use_on_cluster"] == [] + assert "No pod on this cluster" in result["advice"] + + +def test_top_unavailable_when_metrics_server_missing() -> None: + from kars_runtime_hermes.plugin import sre_k8s + + mock_client = MagicMock() + response = MagicMock(status_code=404, reason_phrase="Not Found") + mock_client.get.side_effect = 
httpx.HTTPStatusError( + "404", request=MagicMock(), response=response + ) + with patch.object(sre_k8s.sre_kube, "client", return_value=mock_client): + result = sre_k8s.sre_top(scope="nodes") + assert "unavailable" in result + assert "metrics-server" in result["unavailable"] + + +def test_top_invalid_scope() -> None: + from kars_runtime_hermes.plugin import sre_k8s + + result = sre_k8s.sre_top(scope="invalid") + assert "error" in result + assert "valid_scopes" in result + + +def test_top_pods_returns_per_container() -> None: + from kars_runtime_hermes.plugin import sre_k8s + + doc = { + "items": [ + { + "metadata": {"namespace": "kars-research", "name": "research-pod"}, + "timestamp": "2026-06-09T10:55:00Z", + "containers": [ + {"name": "openclaw", "usage": {"cpu": "5m", "memory": "120Mi"}}, + {"name": "inference-router", "usage": {"cpu": "1m", "memory": "20Mi"}}, + ], + } + ] + } + mock_client = MagicMock() + mock_client.get.return_value = doc + with patch.object(sre_k8s.sre_kube, "client", return_value=mock_client): + result = sre_k8s.sre_top(scope="pods", namespace="kars-research") + assert result["scope"] == "pods" + assert len(result["items"]) == 1 + assert len(result["items"][0]["containers"]) == 2 + + +def test_edit_distance() -> None: + """Sanity-check the Levenshtein implementation underlying image_probe.""" + from kars_runtime_hermes.plugin import sre_k8s + + assert sre_k8s._edit_distance("", "") == 0 + assert sre_k8s._edit_distance("abc", "abc") == 0 + assert sre_k8s._edit_distance("abc", "abd") == 1 + assert sre_k8s._edit_distance("1.27.3", "1.27-typo") <= 5 From d95659428406d75aef2d02aba1cd4a4eead26efc Mon Sep 17 00:00:00 2001 From: Pal Lakatos-Toth Date: Tue, 9 Jun 2026 11:44:31 +0200 Subject: [PATCH 04/62] fix(sre): resolve helm chart path from repo root, not CWD `kars sre install` was passing the relative path 'deploy/helm/kars' to helm, which helm parses as a chart repo name when the user's CWD is anywhere other than the kars repo root. 
Result: Error: repo deploy not found Fixed by resolving the kars repo root the same way `kars up` does: first walk up from the CLI file's own location (works for npm link), then fall back to walking up from CWD looking for deploy/helm/kars. Also: replaced the broken `.option('--wait', ..., true)` with the commander-idiomatic `.option('--no-wait', ...)` so the wait flag actually defaults to on. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- cli/src/commands/sre.ts | 68 ++++++++++++++++++++++++++++++++++++----- 1 file changed, 61 insertions(+), 7 deletions(-) diff --git a/cli/src/commands/sre.ts b/cli/src/commands/sre.ts index fc2392fd..155a0efa 100644 --- a/cli/src/commands/sre.ts +++ b/cli/src/commands/sre.ts @@ -4,6 +4,45 @@ import { Command } from "commander"; import chalk from "chalk"; import { execa } from "execa"; +import * as fs from "node:fs"; +import * as path from "node:path"; +import { fileURLToPath } from "node:url"; + +/** + * Resolve the kars repo root. + * + * Strategy mirrors `cli/src/commands/up.ts`: first try the + * three-levels-up-from-the-installed-CLI-file path (works for + * `npm link` installs), then fall back to walking up from CWD + * looking for `deploy/helm`. 
+ */ +function resolveRepoRoot(): string { + // Strategy 1: from the file's own location (works for npm link + // since the link points back into the repo's cli/dist/ tree) + try { + const thisFile = fileURLToPath(import.meta.url); + const cliDir = path.dirname(path.dirname(thisFile)); // .../cli/dist + const candidate = path.resolve(cliDir, "..", ".."); // .../ + if (fs.existsSync(path.join(candidate, "deploy", "helm", "kars"))) { + return candidate; + } + } catch { + // import.meta.url may not be a file URL in some test contexts + } + // Strategy 2: walk up from CWD looking for deploy/helm + let cur = process.cwd(); + for (let i = 0; i < 8; i++) { + if (fs.existsSync(path.join(cur, "deploy", "helm", "kars"))) return cur; + const parent = path.dirname(cur); + if (parent === cur) break; + cur = parent; + } + throw new Error( + "Could not resolve the kars repo root (looked for deploy/helm/kars). " + + "Run `kars sre install` from inside an kars checkout, or set the working " + + "directory to the repo root first.", + ); +} /** * `kars sre` — manage the built-in kars-sre agent. 
@@ -42,9 +81,8 @@ export function sreCommand(): Command { "Azure OpenAI deployment / model name for the SRE agent (defaults to gpt-4.1)", ) .option( - "--wait", - "Wait for the sre sandbox to reach Running (default true)", - true, + "--no-wait", + "Don't wait for the sre sandbox to reach Running (default: wait)", ) .action(async (options: { release: string; @@ -53,10 +91,18 @@ export function sreCommand(): Command { model?: string; wait: boolean; }) => { + let chartPath: string; + try { + chartPath = path.join(resolveRepoRoot(), "deploy", "helm", "kars"); + } catch (err: any) { + console.error(chalk.red(`✗ ${err.message}`)); + process.exit(1); + } + const helmArgs = [ "upgrade", options.release, - "deploy/helm/kars", + chartPath, "--namespace", options.namespace, "--reuse-values", "--set", "sre.enabled=true", @@ -68,7 +114,7 @@ export function sreCommand(): Command { console.log(chalk.gray(` helm ${helmArgs.join(" ")}`)); try { await execa("helm", helmArgs, { stdio: "inherit" }); - } catch (err) { + } catch { console.error(chalk.red("✗ helm upgrade failed")); process.exit(1); } @@ -86,7 +132,7 @@ export function sreCommand(): Command { await new Promise((r) => setTimeout(r, 1000)); } } - console.log(chalk.cyan("▸ waiting for sre sandbox to reach Running (up to 180s)…")); + console.log(chalk.cyan("▸ waiting for sre sandbox to reach Available (up to 180s)…")); try { await execa( "kubectl", @@ -119,10 +165,18 @@ export function sreCommand(): Command { .option("--namespace ", "Helm release namespace", "kars-system") .option("--context ", "kubectl context to use") .action(async (options: { release: string; namespace: string; context?: string }) => { + let chartPath: string; + try { + chartPath = path.join(resolveRepoRoot(), "deploy", "helm", "kars"); + } catch (err: any) { + console.error(chalk.red(`✗ ${err.message}`)); + process.exit(1); + } + const helmArgs = [ "upgrade", options.release, - "deploy/helm/kars", + chartPath, "--namespace", options.namespace, 
"--reuse-values", "--set", "sre.enabled=false", From 91efb4a58211d5ebb18ca9fe58a228eff3c58ccd Mon Sep 17 00:00:00 2001 From: Pal Lakatos-Toth Date: Tue, 9 Jun 2026 11:48:13 +0200 Subject: [PATCH 05/62] fix(sre): use --reset-then-reuse-values for kars sre install A plain --reuse-values carries the stored release values forward verbatim. If the stored values are older than the chart on disk (e.g. operator ran 'kars dev' before runtimes.hermes was added to values.yaml), the template fails with: nil pointer evaluating interface {}.image at controller-deployment.yaml line 89. --reset-then-reuse-values (helm 3.14+ / helm 4) re-loads the chart's values.yaml defaults first, then overlays the previously --set values on top. So new chart fields get their defaults populated, while user overrides for older fields are preserved. Applied to both install and uninstall sub-actions. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- cli/src/commands/sre.ts | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/cli/src/commands/sre.ts b/cli/src/commands/sre.ts index 155a0efa..9580fe26 100644 --- a/cli/src/commands/sre.ts +++ b/cli/src/commands/sre.ts @@ -104,7 +104,12 @@ export function sreCommand(): Command { options.release, chartPath, "--namespace", options.namespace, - "--reuse-values", + // --reset-then-reuse-values: re-load defaults from values.yaml + // THEN overlay the previously-set --set values. Critical for + // operators upgrading from older chart versions whose stored + // release values predate fields like runtimes.hermes — a plain + // --reuse-values would carry the gap forward and fail templating. 
+ "--reset-then-reuse-values", "--set", "sre.enabled=true", ]; if (options.model) helmArgs.push("--set", `sre.model=${options.model}`); @@ -178,7 +183,7 @@ export function sreCommand(): Command { options.release, chartPath, "--namespace", options.namespace, - "--reuse-values", + "--reset-then-reuse-values", "--set", "sre.enabled=false", ]; if (options.context) helmArgs.push("--kube-context", options.context); From f93598abd9a2ac9f5010e571e89267bf1e33a129 Mon Sep 17 00:00:00 2001 From: Pal Lakatos-Toth Date: Tue, 9 Jun 2026 11:50:15 +0200 Subject: [PATCH 06/62] fix(sre): create kars-sre namespace explicitly in the chart MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The ToolPolicy 'sre-tools' lives in namespace kars-sre by design (kars's cross-namespace ToolPolicy refs are deliberately not supported — principles.md §3). But the controller-created kars-sre namespace only exists AFTER the KarsSandbox 'sre' is reconciled, which is AFTER helm tries to apply the ToolPolicy. Error: UPGRADE FAILED: failed to create resource: namespaces "kars-sre" not found Fix: add the Namespace as a chart-managed resource at the top of sre.yaml. The controller's namespace-reconcile path uses server-side apply, so it will harmlessly co-own this namespace (adding its own labels + annotations) when it reaches reconciler/mod.rs step 1. No conflict. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- deploy/helm/kars/templates/sre.yaml | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/deploy/helm/kars/templates/sre.yaml b/deploy/helm/kars/templates/sre.yaml index efb4976a..9df9149e 100644 --- a/deploy/helm/kars/templates/sre.yaml +++ b/deploy/helm/kars/templates/sre.yaml @@ -33,6 +33,26 @@ containment): */}} {{- if (.Values.sre | default dict).enabled }} --- +# kars-sre Namespace — created by the chart so the ToolPolicy below +# (which lives in this ns by design — see proposal §7.6 + the +# ToolPolicy "cross-namespace refs deliberately not supported" rule) +# has a namespace to land in BEFORE the controller has reconciled +# the KarsSandbox. +# +# The controller's own namespace reconcile path uses server-side +# apply with field manager `kars-controller`, so it will harmlessly +# co-own this namespace (adding its labels + annotations) once it +# reaches step 1 of reconcile/mod.rs. No conflict. +apiVersion: v1 +kind: Namespace +metadata: + name: kars-sre + labels: + kars.azure.com/role: sre + app.kubernetes.io/name: kars + app.kubernetes.io/component: sre + app.kubernetes.io/managed-by: {{ .Release.Service }} +--- # kars-sre InferencePolicy — the model the SRE agent uses for diagnosis. # Default model is configurable via .Values.sre.model; the policy applies # only to the `sre` sandbox by name. From 5718fc4fa596508f1251d2e4a2b4ee171f1e20cf Mon Sep 17 00:00:00 2001 From: Pal Lakatos-Toth Date: Tue, 9 Jun 2026 11:53:49 +0200 Subject: [PATCH 07/62] fix(sre): add --force-conflicts to helm upgrade (helm 4 SSA) Helm 4 uses server-side apply by default. 
When prior `kubectl set image` / `kars push --apply` runs took ownership of fields that the chart now also wants to manage, the SSA call fails with: conflict with "kubectl-set" using apps/v1: .spec.template.spec.containers[name="controller"].image --force-conflicts (helm 4) instructs server-side apply to take ownership on conflict. Matches operator intent: the helm-managed chart is the source of truth, and chart-driven upgrades should override transient field-manager pollution from ad-hoc `kubectl set` calls. Confirmed via `helm upgrade --help`: --force-conflicts if set server-side apply will force changes against conflicts Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- cli/src/commands/sre.ts | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/cli/src/commands/sre.ts b/cli/src/commands/sre.ts index 9580fe26..9f407c6f 100644 --- a/cli/src/commands/sre.ts +++ b/cli/src/commands/sre.ts @@ -110,6 +110,13 @@ export function sreCommand(): Command { // release values predate fields like runtimes.hermes — a plain // --reuse-values would carry the gap forward and fail templating. "--reset-then-reuse-values", + // --force-conflicts: helm 4 uses server-side apply by default, + // which conflicts with field managers from prior `kubectl set + // image` / `kars push --apply` runs that touched the same + // fields. This flag tells SSA to take ownership on conflict, + // matching the operator's intent (helm-managed chart is the + // source of truth). 
+ "--force-conflicts", "--set", "sre.enabled=true", ]; if (options.model) helmArgs.push("--set", `sre.model=${options.model}`); @@ -184,6 +191,7 @@ export function sreCommand(): Command { chartPath, "--namespace", options.namespace, "--reset-then-reuse-values", + "--force-conflicts", "--set", "sre.enabled=false", ]; if (options.context) helmArgs.push("--kube-context", options.context); From 91accb0ec256284e63b7982dbf355a9bf243a330 Mon Sep 17 00:00:00 2001 From: Pal Lakatos-Toth Date: Tue, 9 Jun 2026 11:56:20 +0200 Subject: [PATCH 08/62] fix(sre): ToolPolicy must live in KarsSandbox's namespace (kars-system), not kars-sre MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Controller rejected the KarsSandbox sre with: Degraded: ToolPolicyNotFound — 'sre-tools' not found in 'kars-system' (cross-namespace refs not supported) I had ToolPolicy in 'kars-sre' under the misunderstanding that it should be co-located with the runtime pod's namespace. The actual kars convention is the opposite: governance refs are namespace-local to the KarsSandbox CR's OWN namespace (kars-system in our case), per principles.md §3 cross-namespace-refs-deliberately-unsupported rule. The runtime namespace kars-sre is for the pod + RBAC, not for governance. Confirmed against the existing exec-brief-hermes-single scenario which co-locates KarsSandbox + ToolPolicy in kars-system. Net: still safe wrt §7.7.1 protected-resource denylist (kars-system is denylisted, so SRE agent can't delete this ToolPolicy even though it's not labeled kars.azure.com/managed-by=controller). 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- deploy/helm/kars/templates/sre.yaml | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/deploy/helm/kars/templates/sre.yaml b/deploy/helm/kars/templates/sre.yaml index 9df9149e..91bc50b4 100644 --- a/deploy/helm/kars/templates/sre.yaml +++ b/deploy/helm/kars/templates/sre.yaml @@ -144,15 +144,18 @@ spec: --- # kars-sre ToolPolicy — gates the sre_* tool surface. # -# Lives in the namespace the controller will create for the sre sandbox -# (kars- = kars-sre per the standard naming convention). -# A no-op once Slice 3 lands the per-tool ToolPolicy split, but for -# Slice 1 every read-only tool is allow-without-approval. +# Lives in the SAME namespace as the KarsSandbox `sre` itself +# ({{ .Release.Namespace }} = kars-system) because kars +# governance refs are namespace-local — the controller looks up +# `governance.toolPolicyRef.name: sre-tools` in the KarsSandbox's +# own namespace, NOT in the per-sandbox runtime namespace +# (kars-sre). Cross-namespace ToolPolicy refs are intentionally +# unsupported per principles.md §3. apiVersion: kars.azure.com/v1alpha1 kind: ToolPolicy metadata: name: sre-tools - namespace: kars-sre + namespace: {{ .Release.Namespace }} labels: kars.azure.com/sandbox: sre kars.azure.com/role: sre From 226f30319c0bc8221a1c268b8564546ee466a896 Mon Sep 17 00:00:00 2001 From: Pal Lakatos-Toth Date: Tue, 9 Jun 2026 12:06:36 +0200 Subject: [PATCH 09/62] =?UTF-8?q?fix(sre):=20rename=20gate=20env=20KARS=5F?= =?UTF-8?q?SRE=5FENABLED=20=E2=86=92=20SRE=5FENABLED=20+=20indent=20fix?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two related bugs uncovered during live test: 1) The controller silently strips user-supplied extraEnv keys with reserved prefixes (mod.rs:1583 — AGT_, AZURE_, FOUNDRY_AGENT_, IMDS_, KARS_). KARS_SRE_ENABLED was being dropped, so the plugin never registered. 
Fix: rename to SRE_ENABLED across: - runtimes/hermes/.../plugin/sre.py (is_enabled) - runtimes/hermes/.../plugin/sre_k8s.py (module docstring) - runtimes/hermes/.../plugin/__init__.py (log line + docstring) - runtimes/hermes/tests/test_sre.py (3 env patches) - deploy/helm/kars/templates/sre.yaml (extraEnv key + comment) 2) During the rename edit, the `extraEnv:` block ended up under `runtime:` instead of `runtime.hermes:` (4-space vs 6-space indent), producing: UPGRADE FAILED: .spec.runtime.extraEnv: field not declared in schema Fix: restore correct 6-space indent so extraEnv nests inside hermes. Long-term fix (deferred): controller should detect kars.azure.com/role=sre label on the KarsSandbox and inject KARS_SRE_ENABLED itself (controller-side injection bypasses the prefix filter). Noted inline at sre.is_enabled() docstring and in the sre.yaml extraEnv block as a follow-up. Tests: 31/31 pass (test_sre.py + test_sre_k8s.py). Live verification: SRE_ENABLED env appears on agent container's env; helm upgrade succeeds; chart re-applies cleanly. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- deploy/helm/kars/templates/sre.yaml | 12 ++++++++++-- .../src/kars_runtime_hermes/plugin/__init__.py | 4 ++-- .../hermes/src/kars_runtime_hermes/plugin/sre.py | 12 +++++++++++- .../hermes/src/kars_runtime_hermes/plugin/sre_k8s.py | 4 ++-- runtimes/hermes/tests/test_sre.py | 6 +++--- 5 files changed, 28 insertions(+), 10 deletions(-) diff --git a/deploy/helm/kars/templates/sre.yaml b/deploy/helm/kars/templates/sre.yaml index 91bc50b4..690c7eb8 100644 --- a/deploy/helm/kars/templates/sre.yaml +++ b/deploy/helm/kars/templates/sre.yaml @@ -98,7 +98,7 @@ spec: runtime: kind: Hermes hermes: - # The KARS_SRE_ENABLED gate. The Hermes plugin __init__.py + # The SRE_ENABLED gate. 
The Hermes plugin __init__.py # checks this and: # - registers the sre_* tools (sre.py) # - DEREGISTERS kars_spawn family (§7.8.5) @@ -106,8 +106,16 @@ spec: # so this single env var carries the whole "you are the SRE agent" # configuration. Standard Hermes sandboxes don't get this env and # therefore don't get the SRE tools. + # + # NOTE: env is SRE_ENABLED rather than KARS_SRE_ENABLED because + # the controller strips KARS_-prefixed user extraEnv (the prefix is + # reserved for controller-side injection — see + # controller/src/reconciler/mod.rs:1583). The right long-term fix + # is for the controller to recognise the + # `kars.azure.com/role: sre` label below and inject + # KARS_SRE_ENABLED itself; tracked as a follow-up. extraEnv: - KARS_SRE_ENABLED: "true" + SRE_ENABLED: "true" sandbox: isolation: standard diff --git a/runtimes/hermes/src/kars_runtime_hermes/plugin/__init__.py b/runtimes/hermes/src/kars_runtime_hermes/plugin/__init__.py index 00fdf7e4..86243e93 100644 --- a/runtimes/hermes/src/kars_runtime_hermes/plugin/__init__.py +++ b/runtimes/hermes/src/kars_runtime_hermes/plugin/__init__.py @@ -30,7 +30,7 @@ def register(ctx: Any) -> None: # noqa: ANN401 — Hermes' ctx is dynamic tool wrappers, http_fetch via egress proxy, and stubs for kars_mesh_*. 
SRE-mode containment (per docs/blueprints/07-kars-sre-proposal.md §7.8): - when ``KARS_SRE_ENABLED=true`` is set on the sandbox pod (the env is + when ``SRE_ENABLED=true`` is set on the sandbox pod (the env is written exclusively by deploy/helm/kars/templates/sre.yaml on the ``sre`` KarsSandbox), this entry point: @@ -48,7 +48,7 @@ def register(ctx: Any) -> None: # noqa: ANN401 — Hermes' ctx is dynamic sre_mode = sre.is_enabled() if sre_mode: logger.info( - "KARS_SRE_ENABLED=true detected — entering SRE-mode plugin " + "SRE_ENABLED=true detected — entering SRE-mode plugin " "registration (no kars_spawn, no kars_mesh_*, sre_* tools " "active)" ) diff --git a/runtimes/hermes/src/kars_runtime_hermes/plugin/sre.py b/runtimes/hermes/src/kars_runtime_hermes/plugin/sre.py index 2fce3580..6e1a84dd 100644 --- a/runtimes/hermes/src/kars_runtime_hermes/plugin/sre.py +++ b/runtimes/hermes/src/kars_runtime_hermes/plugin/sre.py @@ -501,8 +501,18 @@ def is_enabled() -> bool: The env is set exclusively by ``deploy/helm/kars/templates/sre.yaml`` on the ``sre`` KarsSandbox's ``spec.runtime.hermes.extraEnv``. Standard sandboxes don't see it. + + NOTE on naming: the env is ``SRE_ENABLED`` rather than + ``KARS_SRE_ENABLED`` because the controller's deployment builder + silently strips user-supplied ``extraEnv`` keys with the reserved + ``KARS_`` prefix (controller/src/reconciler/mod.rs:1583). The right + long-term fix is for the controller to detect + ``kars.azure.com/role: sre`` on the KarsSandbox label and inject + ``KARS_SRE_ENABLED=true`` itself (controller-side injection bypasses + the prefix filter). Tracked as a follow-up; for now ``SRE_ENABLED`` + is the gate. 
""" - return os.environ.get("KARS_SRE_ENABLED", "").lower() in {"true", "1", "yes"} + return os.environ.get("SRE_ENABLED", "").lower() in {"true", "1", "yes"} def register(ctx: Any) -> None: # noqa: ANN401 — Hermes' ctx is dynamic diff --git a/runtimes/hermes/src/kars_runtime_hermes/plugin/sre_k8s.py b/runtimes/hermes/src/kars_runtime_hermes/plugin/sre_k8s.py index 9c13817a..8f693b97 100644 --- a/runtimes/hermes/src/kars_runtime_hermes/plugin/sre_k8s.py +++ b/runtimes/hermes/src/kars_runtime_hermes/plugin/sre_k8s.py @@ -25,7 +25,7 @@ metrics-server absent (§7.5 Q4) Registered alongside the Slice 1 tools by ``sre.register(ctx)`` when -``KARS_SRE_ENABLED=true``. The Helm chart's ClusterRole grants the +``SRE_ENABLED=true``. The Helm chart's ClusterRole grants the RBAC required for everything here at install time (Slice 2 is strictly read-only). @@ -912,7 +912,7 @@ def register(ctx: Any) -> None: # noqa: ANN401 — Hermes' ctx is dynamic """Register the Slice 2 K8s diagnostic tools. Called from ``sre.register()`` alongside the Slice 1 tools when - ``KARS_SRE_ENABLED=true``. + ``SRE_ENABLED=true``. 
""" register_tool = getattr(ctx, "register_tool", None) if not callable(register_tool): diff --git a/runtimes/hermes/tests/test_sre.py b/runtimes/hermes/tests/test_sre.py index 8fee227a..fc2ea86e 100644 --- a/runtimes/hermes/tests/test_sre.py +++ b/runtimes/hermes/tests/test_sre.py @@ -13,7 +13,7 @@ def test_is_enabled_default_false() -> None: - """Without KARS_SRE_ENABLED, the plugin must be disabled.""" + """Without SRE_ENABLED, the plugin must be disabled.""" from kars_runtime_hermes.plugin import sre with patch.dict(os.environ, {}, clear=True): @@ -24,7 +24,7 @@ def test_is_enabled_accepts_truthy_values() -> None: from kars_runtime_hermes.plugin import sre for v in ("true", "True", "TRUE", "1", "yes", "YES"): - with patch.dict(os.environ, {"KARS_SRE_ENABLED": v}, clear=True): + with patch.dict(os.environ, {"SRE_ENABLED": v}, clear=True): assert sre.is_enabled(), f"value {v!r} should be truthy" @@ -32,7 +32,7 @@ def test_is_enabled_rejects_falsy_values() -> None: from kars_runtime_hermes.plugin import sre for v in ("false", "0", "no", "", "anything-else"): - with patch.dict(os.environ, {"KARS_SRE_ENABLED": v}, clear=True): + with patch.dict(os.environ, {"SRE_ENABLED": v}, clear=True): assert not sre.is_enabled(), f"value {v!r} should be falsy" From 7fd3aa86ddb35ee82baf57a0bfe60c51c5faa861 Mon Sep 17 00:00:00 2001 From: Pal Lakatos-Toth Date: Tue, 9 Jun 2026 12:07:18 +0200 Subject: [PATCH 10/62] fix(sre): default contentSafety.requirePromptShields=false MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Slice 1 template hardcoded requirePromptShields: true on the SRE InferencePolicy. Azure OpenAI deployments only carry 'prompt_filter_results' in responses when an explicit Content Filter policy is attached to the deployment. 
Bare local-dev deployments (Foundry quickstart, gpt-4.1 without explicit filter) don't emit those annotations — so the router blocks every response with: Response blocked: InferencePolicy requires Prompt Shields but the upstream response carried no prompt_filter_results annotations Diagnosed live during kars sre talk session — first prompt ('hi there') returned a cached greeting that happened to bypass the check, second prompt died. Fix: default false in values.yaml + chart; operators wiring Content Safety in production can set: --set sre.requirePromptShields=true (or values.yaml override). The SRE agent's threat surface is operator-driven Kubernetes diagnosis, not user-facing chat, so prompt-shield enforcement is less critical than for an internet-facing assistant. Operators who need it can opt back in. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- deploy/helm/kars/templates/sre.yaml | 11 ++++++++++- deploy/helm/kars/values.yaml | 8 ++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/deploy/helm/kars/templates/sre.yaml b/deploy/helm/kars/templates/sre.yaml index 690c7eb8..5d016c67 100644 --- a/deploy/helm/kars/templates/sre.yaml +++ b/deploy/helm/kars/templates/sre.yaml @@ -75,7 +75,16 @@ spec: provider: {{ (.Values.sre | default dict).provider | default "azure-openai" | quote }} deployment: {{ (.Values.sre | default dict).model | default "gpt-4.1" | quote }} contentSafety: - requirePromptShields: true + # SRE-agent default: do NOT require Prompt Shields. The Azure OpenAI + # response only carries prompt_filter_results when the deployment has + # an Azure Content Safety Content Filter policy attached; on bare + # local-dev deployments (Foundry quickstart, gpt-4.1 without an + # explicit filter), every response gets blocked at the router with + # "InferencePolicy requires Prompt Shields but the upstream response + # carried no prompt_filter_results annotations". 
Operators wiring + # Content Safety in production can override via: + # --set sre.requirePromptShields=true + requirePromptShields: {{ (.Values.sre | default dict).requirePromptShields | default false }} tokenBudget: perRequestTokens: {{ (.Values.sre | default dict).tokenBudget | default 32000 }} --- diff --git a/deploy/helm/kars/values.yaml b/deploy/helm/kars/values.yaml index 3b09281c..8e6e3a60 100644 --- a/deploy/helm/kars/values.yaml +++ b/deploy/helm/kars/values.yaml @@ -451,6 +451,14 @@ sre: # if your cluster has very large CRD inventories. tokenBudget: 32000 + # Require Azure Content Safety Prompt Shields on every response. ONLY + # set true if your Azure OpenAI deployment has an attached Content + # Filter policy that emits `prompt_filter_results` in responses. + # Bare local-dev deployments (Foundry quickstart, gpt-4.1 without an + # explicit Content Filter) DON'T emit those annotations and every + # response gets blocked at the router. Default is false. + requirePromptShields: false + # Additional egress hosts the SRE sandbox may reach beyond the in- # cluster apiserver. Empty by default — the agent only talks to # `kubernetes.default.svc` out of the box. Add api.telegram.org + From c447aa774235e5c9b5cb8b35d02382bb1b7d3e78 Mon Sep 17 00:00:00 2001 From: Pal Lakatos-Toth Date: Tue, 9 Jun 2026 12:07:48 +0200 Subject: [PATCH 11/62] =?UTF-8?q?sre:=20default=20model=20gpt-4.1=20?= =?UTF-8?q?=E2=86=92=20gpt-5.4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Switch default model so the SRE agent ships with current frontier out of the box. Operator can still override per-install with `kars sre install --model `. The model name must match an Azure OpenAI deployment in the operator's Foundry project — InferencePolicy routes to that deployment via the router; if the deployment doesn't exist the router returns a clear 404 and the sandbox surfaces Degraded. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- cli/src/commands/sre.ts | 2 +- deploy/helm/kars/values.yaml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cli/src/commands/sre.ts b/cli/src/commands/sre.ts index 9f407c6f..146d46e6 100644 --- a/cli/src/commands/sre.ts +++ b/cli/src/commands/sre.ts @@ -78,7 +78,7 @@ export function sreCommand(): Command { ) .option( "--model ", - "Azure OpenAI deployment / model name for the SRE agent (defaults to gpt-4.1)", + "Azure OpenAI deployment / model name for the SRE agent (defaults to gpt-5.4)", ) .option( "--no-wait", diff --git a/deploy/helm/kars/values.yaml b/deploy/helm/kars/values.yaml index 8e6e3a60..6069fcf2 100644 --- a/deploy/helm/kars/values.yaml +++ b/deploy/helm/kars/values.yaml @@ -439,11 +439,11 @@ sre: enabled: false # The Azure OpenAI deployment / model name the SRE agent reasons with. - # Defaults to gpt-4.1; override for cost/perf tuning. The model must be + # Defaults to gpt-5.4; override for cost/perf tuning. The model must be # available in the project the kars controller is configured with — # the InferencePolicy compiles against the standard router failover # chain so an unavailable model surfaces as Degraded on the sandbox. - model: "gpt-4.1" + model: "gpt-5.4" provider: "azure-openai" # Per-request token ceiling. The SRE agent's typical request shape From 96e70bb1109b86e9005778599eee0a995b22d019 Mon Sep 17 00:00:00 2001 From: Pal Lakatos-Toth Date: Tue, 9 Jun 2026 12:13:48 +0200 Subject: [PATCH 12/62] fix(sre): declare sre_* tools in plugin.yaml provides_tools MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Hermes uses plugin.yaml's provides_tools list as the gate for ctx.register_tool() calls — tools not declared in the manifest are silently rejected at registration time. So even though sre.register() called register_tool() for all 10 sre_* tools, none of them became callable. 
Diagnosed via live test: hermes tools list → showed foundry_*, http_fetch, kars_handoff_status (the manifest-declared ones) → NO sre_* (registered at runtime, manifest-rejected) Same pattern as the OpenClaw plugin's contracts.tools requirement (see memory: 'OpenClaw 2026.5.x requires plugin manifest to declare contracts.tools listing every tool the plugin will register'). Fix: add all 10 sre_* tools (5 Slice 1 + 5 Slice 2) to provides_tools. The tools remain conditionally registered at runtime — standard Hermes sandboxes don't set SRE_ENABLED → sre.register(ctx) is skipped → the tools are declared-but-not-callable (still matches the manifest contract; Hermes treats them as 'present but inactive'). Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../kars_runtime_hermes/plugin/plugin.yaml | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/runtimes/hermes/src/kars_runtime_hermes/plugin/plugin.yaml b/runtimes/hermes/src/kars_runtime_hermes/plugin/plugin.yaml index a069840a..d2560432 100644 --- a/runtimes/hermes/src/kars_runtime_hermes/plugin/plugin.yaml +++ b/runtimes/hermes/src/kars_runtime_hermes/plugin/plugin.yaml @@ -28,6 +28,25 @@ provides_tools: - foundry_evaluations - foundry_deployments - foundry_agents + # kars-sre tools — declared here so Hermes accepts the register_tool + # calls. Conditionally registered at runtime ONLY when SRE_ENABLED=true + # is set on the sandbox pod (set exclusively by the chart's sre.yaml on + # the `sre` KarsSandbox per docs/blueprints/07-kars-sre-proposal.md §7.8). + # Standard Hermes sandboxes don't see SRE_ENABLED → __init__.py skips + # sre.register(ctx) → the tools are declared-but-not-callable, which + # matches the manifest contract. 
+ # Slice 1 (read-only kars-CR tools): + - sre_describe_state + - sre_logs + - sre_diagnose + - sre_explain_error + - sre_propose_fix + # Slice 2 (K8s diagnostic toolset): + - sre_describe_resource + - sre_what_changed + - sre_endpoints_inspect + - sre_image_probe + - sre_top provides_hooks: - pre_tool_call # → POST /agt/evaluate; deny short-circuits the tool From f6e8d0d903faa23564344b6e3c2a354314adde03 Mon Sep 17 00:00:00 2001 From: Pal Lakatos-Toth Date: Tue, 9 Jun 2026 12:24:49 +0200 Subject: [PATCH 13/62] sre: wire SRE-mode SOUL.md system prompt + fix register_tool kwargs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three correctness fixes landed during the live test pass: 1) Hermes register_tool kwargs were wrong sre.py + sre_k8s.py used parameters=... but Hermes' contract expects schema=... AND toolset="". Without these the manifest's provides_tools entries still showed up but the tools were silently non-callable. Fixed all 10 sre_* register_tool calls. 2) plugin.yaml provides_tools missing the sre_* entries Hermes' plugin loader requires every tool the plugin will register to be declared in provides_tools (same shape as OpenClaw's contracts.tools). Added all 10. Conditionally registered at runtime via SRE_ENABLED — standard sandboxes don't trip them. 3) New: kars-sre persona / system prompt Following the OpenClaw pattern (sandbox-images/openclaw/entrypoint.sh :1214 writes SOUL.md on every boot), the Hermes entrypoint now writes a 110-line SRE-specific SOUL.md to $HERMES_HOME/SOUL.md when SRE_ENABLED=true. 
Content: - Identity + mission statement - Tone constraints (concise, evidence-based, direct, honest) - Catalog of all 10 sre_* tools with WHEN to use each - Catalog of tools the agent does NOT have (spawn, mesh, shell, external net) with rationale - Standard incident reasoning loop (5 steps) - Output structure for fix proposals (Symptom/Evidence/Root cause/ Proposed fix/Why safe/Rollback) - Boundaries (protected-resource denylist enforced at proposal layer; agent should not even try) - Audit info (where the kars audit JSONL captures every call) - First-message greeting template (one line, no editorialising) The model name interpolates from KARS_MODEL → AZURE_OPENAI_DEPLOYMENT → 'gpt-5.4' default, so the prompt always names the live model. Validation: pytest tests/test_sre.py tests/test_sre_k8s.py → 31/31 pass bash -n entrypoint.sh → clean live verify: SOUL.md written 110 lines, model = gpt-5.4 live verify: hermes tools list → '✓ enabled sre' toolset now shows Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../src/kars_runtime_hermes/plugin/sre.py | 15 +- .../src/kars_runtime_hermes/plugin/sre_k8s.py | 15 +- sandbox-images/hermes/entrypoint.sh | 140 ++++++++++++++++++ 3 files changed, 160 insertions(+), 10 deletions(-) diff --git a/runtimes/hermes/src/kars_runtime_hermes/plugin/sre.py b/runtimes/hermes/src/kars_runtime_hermes/plugin/sre.py index 6e1a84dd..96f74e39 100644 --- a/runtimes/hermes/src/kars_runtime_hermes/plugin/sre.py +++ b/runtimes/hermes/src/kars_runtime_hermes/plugin/sre.py @@ -529,6 +529,7 @@ def register(ctx: Any) -> None: # noqa: ANN401 — Hermes' ctx is dynamic register_tool( name="sre_describe_state", + toolset="sre", description=( "Return a structured snapshot of every kars-owned CR in every " "namespace (KarsSandbox, InferencePolicy, ToolPolicy, " @@ -538,18 +539,19 @@ def register(ctx: Any) -> None: # noqa: ANN401 — Hermes' ctx is dynamic "conditions. 
Use this as the first call when starting an " "incident investigation." ), - parameters={"type": "object", "properties": {}, "required": []}, + schema={"type": "object", "properties": {}, "required": []}, handler=sre_describe_state, ) register_tool( name="sre_logs", + toolset="sre", description=( "Tail logs from a pod's container via the apiserver. Returns the " "last N lines (max 500). Use for diagnosing CrashLoopBackOff or " "for inspecting an agent's behaviour." ), - parameters={ + schema={ "type": "object", "properties": { "namespace": {"type": "string", "description": "Pod's namespace"}, @@ -571,6 +573,7 @@ def register(ctx: Any) -> None: # noqa: ANN401 — Hermes' ctx is dynamic register_tool( name="sre_diagnose", + toolset="sre", description=( "Walk the kars-CR health checklist: controller deployment Ready, " "every kars CRD installed, no Degraded/Failed sandboxes or " @@ -578,19 +581,20 @@ def register(ctx: Any) -> None: # noqa: ANN401 — Hermes' ctx is dynamic "report + a one-line summary suitable for an operator-facing " "message." ), - parameters={"type": "object", "properties": {}, "required": []}, + schema={"type": "object", "properties": {}, "required": []}, handler=sre_diagnose, ) register_tool( name="sre_explain_error", + toolset="sre", description=( "Given an error string (pod event reason, controller log line, " "etc.), return a root-cause hypothesis from the kars OOTB-blocker " "corpus. The hypothesis is a HINT — the agent should then use " "the other diagnostic tools to confirm or refute it." ), - parameters={ + schema={ "type": "object", "properties": { "error": { @@ -605,13 +609,14 @@ def register(ctx: Any) -> None: # noqa: ANN401 — Hermes' ctx is dynamic register_tool( name="sre_propose_fix", + toolset="sre", description=( "Return a typed-action proposal for the operator to approve. " "READ-ONLY in Slice 1 — Slice 3 adds sre_apply_fix to execute " "approved proposals. Use after diagnosing a problem to surface " "the recommended remediation." 
), - parameters={ + schema={ "type": "object", "properties": { "diagnosis": { diff --git a/runtimes/hermes/src/kars_runtime_hermes/plugin/sre_k8s.py b/runtimes/hermes/src/kars_runtime_hermes/plugin/sre_k8s.py index 8f693b97..63103517 100644 --- a/runtimes/hermes/src/kars_runtime_hermes/plugin/sre_k8s.py +++ b/runtimes/hermes/src/kars_runtime_hermes/plugin/sre_k8s.py @@ -921,6 +921,7 @@ def register(ctx: Any) -> None: # noqa: ANN401 — Hermes' ctx is dynamic register_tool( name="sre_describe_resource", + toolset="sre", description=( "Structured-describe for any K8s resource (Pod, Deployment, " "Service, ResourceQuota, ConfigMap, Secret metadata only, " @@ -929,7 +930,7 @@ def register(ctx: Any) -> None: # noqa: ANN401 — Hermes' ctx is dynamic "workload → ReplicaSet → Pods → events on every level. This " "is THE single-call diagnostic for most workload incidents." ), - parameters={ + schema={ "type": "object", "properties": { "kind": { @@ -949,12 +950,13 @@ def register(ctx: Any) -> None: # noqa: ANN401 — Hermes' ctx is dynamic register_tool( name="sre_what_changed", + toolset="sre", description=( "Events of failure-relevant reasons in the last N minutes " "across core/v1 + events.k8s.io/v1. Use FIRST in an incident " "to frame the time-window: what broke when?" ), - parameters={ + schema={ "type": "object", "properties": { "namespace": { @@ -974,13 +976,14 @@ def register(ctx: Any) -> None: # noqa: ANN401 — Hermes' ctx is dynamic register_tool( name="sre_endpoints_inspect", + toolset="sre", description=( "Service → selector → matching pods → EndpointSlice readiness. " "Diagnoses 'service has no endpoints' incidents: are there pods " "matching the selector? are they Ready? are they in the " "EndpointSlice? Returns a finding summary the agent can quote." 
), - parameters={ + schema={ "type": "object", "properties": { "namespace": {"type": "string"}, @@ -993,13 +996,14 @@ def register(ctx: Any) -> None: # noqa: ANN401 — Hermes' ctx is dynamic register_tool( name="sre_image_probe", + toolset="sre", description=( "Given an image reference, return: (a) what tags of the same " "repo are CURRENTLY IN USE on this cluster, (b) the closest " "match by edit-distance to the requested tag. Use after " "sre_describe_resource shows ImagePullBackOff." ), - parameters={ + schema={ "type": "object", "properties": { "image": { @@ -1014,13 +1018,14 @@ def register(ctx: Any) -> None: # noqa: ANN401 — Hermes' ctx is dynamic register_tool( name="sre_top", + toolset="sre", description=( "CPU + memory usage per pod or per node (metrics.k8s.io). " "Returns {unavailable: 'metrics-server not installed'} if " "the metrics API isn't registered — the agent's planner " "routes around it." ), - parameters={ + schema={ "type": "object", "properties": { "scope": { diff --git a/sandbox-images/hermes/entrypoint.sh b/sandbox-images/hermes/entrypoint.sh index acee5008..99e97d82 100644 --- a/sandbox-images/hermes/entrypoint.sh +++ b/sandbox-images/hermes/entrypoint.sh @@ -504,6 +504,146 @@ AZURE_FOUNDRY_API_KEY=router-managed AZURE_FOUNDRY_BASE_URL=${OPENAI_BASE_URL} EOF +# ── Persona / SOUL.md ──────────────────────────────────────────────────── +# Hermes reads $HERMES_HOME/SOUL.md as the agent's system prompt (see +# `/usr/lib/python3.12/site-packages/hermes_cli/main.py:10387` — +# "Edit profile/SOUL.md for different personality"). We follow the +# OpenClaw pattern (sandbox-images/openclaw/entrypoint.sh:1214) and +# write the prompt deterministically on every boot: +# +# - Regenerated every boot so kars-managed updates always win over +# any "hermes" first-boot scaffolding that might overwrite it +# - Heredoc with env interpolation so the prompt knows the live model +# name, sandbox name, governance posture, etc. 
+# - Mode-gated: if SRE_ENABLED=true, write the SRE persona; otherwise +# leave the file alone (Hermes' own default applies) +# +# The SRE persona is the long-form version of docs/sre.md — it tells +# the model exactly which sre_* tools it has, the standard incident +# reasoning loop, what's read-only vs proposal-only, and what it CAN'T +# do (no spawn, no mesh, no governance-state mutation — per the +# §7.8 containment design). +if [ "${SRE_ENABLED:-}" = "true" ]; then + echo "[kars-hermes] SRE_ENABLED=true — writing kars-sre persona to $HERMES_HOME/SOUL.md" + _SRE_MODEL="${KARS_MODEL:-${AZURE_OPENAI_DEPLOYMENT:-gpt-5.4}}" + # Single heredoc, UNQUOTED so ${_SRE_MODEL} interpolates. Literal + # $-signs in command examples below are escaped with \$ to keep the + # shell from trying to expand them. + cat > "$HERMES_HOME/SOUL.md" < Date: Tue, 9 Jun 2026 13:53:26 +0200 Subject: [PATCH 14/62] sre: apiserver-bypass for role=sre sandboxes (controller egress-guard) Adds two iptables rules to the egress-guard init container, gated on the kars.azure.com/role=sre label being present on the KarsSandbox: 1. Filter chain: ACCEPT for UID 1000 -> KUBERNETES_SERVICE_HOST:443 (BEFORE the existing catch-all DROP). 2. NAT chain: RETURN for UID 1000 -> KUBERNETES_SERVICE_HOST:443 (BEFORE the existing :443 REDIRECT to :8444 transparent proxy). Both are required. The NAT-bypass alone is not sufficient because the filter chain runs AFTER NAT - the NAT-RETURN says 'don't redirect' but the filter-chain DROP next would still slay the packet. Discovered live during testing: the curl-to-apiserver hung until both rules landed. Why this is needed: the SRE plugin's K8s API client (sre_kube.py in the Hermes runtime) needs DIRECT apiserver access with its projected ServiceAccount token to read kars CRs / pods / events. 
Without the bypass, every apiserver call gets NAT-redirected to the router's :8444 transparent proxy, which has no idea how to forward TLS to the apiserver -- connections hang then time out. Why only role=sre sandboxes: every other sandbox kind goes through the router unchanged -- that's the whole point of the transparent proxy + L7 audit. Direct apiserver access is the deliberate exception, uniquely held by the nominated SRE sandbox per the proposal section 7.8 containment design. K8s audit log is the audit surface for these apiserver calls (the router's L7 audit doesn't apply, but K8s audit is stronger -- every call carries the SA identity, verb, and resource). Implementation: - new build_egress_guard_command(is_sre_sandbox: bool) helper in reconciler/mod.rs that emits the right rule sequence per mode - 3 unit tests: standard has no bypass; SRE has NAT bypass before REDIRECT AND filter ACCEPT before DROP; both modes keep DROP Validated end-to-end: - HTTP 200 in 17ms from agent container -> 10.96.0.1:443 - sre_describe_state() returns 10 KarsSandboxes + all 11 CR kinds Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- controller/src/reconciler/mod.rs | 214 +++++++++++++++++++++++++++++-- 1 file changed, 203 insertions(+), 11 deletions(-) diff --git a/controller/src/reconciler/mod.rs b/controller/src/reconciler/mod.rs index 6d1079c6..42581384 100644 --- a/controller/src/reconciler/mod.rs +++ b/controller/src/reconciler/mod.rs @@ -89,6 +89,170 @@ pub(crate) fn isolation_scheduling(isolation: &str) -> (Option<&'static str>, &' } } +/// Build the egress-guard init-container command. +/// +/// Standard sandboxes (every kind except SRE) get the full lockdown: +/// UID 1000 → loopback + DNS allowed, everything else dropped, with +/// :80/:443 NAT-redirected to the inference-router on :8444 for L7 +/// policy + audit. 
+/// +/// SRE-mode sandboxes (labelled `kars.azure.com/role=sre`) get TWO +/// extra rules: a filter-chain ACCEPT before the catch-all DROP, and +/// a NAT-chain RETURN before the generic REDIRECT. Apiserver-bound +/// traffic (KUBERNETES_SERVICE_HOST : KUBERNETES_SERVICE_PORT_HTTPS, +/// both kubelet-auto-injected envs) is therefore NOT NAT'd to :8444, +/// so the SRE plugin's K8s API client (sre_kube.py) can hit the +/// apiserver directly with its projected SA token. +/// +/// The K8s audit log is the audit surface for these apiserver calls +/// (the router's L7 audit doesn't capture them, but K8s audit is +/// stronger — every call carries the SA identity and the verb). +/// +/// Privilege-containment design: this capability is uniquely held by +/// the SRE sandbox per the proposal §7.8. Future Slice 3 will add +/// ValidatingAdmissionPolicies to gate WHO can apply the +/// `role=sre` label (only chart-installer SAs; see §7.8.10 design). +pub(crate) fn build_egress_guard_command(is_sre_sandbox: bool) -> String { + let mut cmd = String::with_capacity(1024); + // Filter chain (OUTPUT): UID 1000 → allow loopback + DNS + + // established, then DROP. Same for every sandbox kind. + cmd.push_str("iptables -A OUTPUT -m owner --uid-owner 1000 -o lo -j ACCEPT && "); + cmd.push_str("iptables -A OUTPUT -m owner --uid-owner 1000 -p udp --dport 53 -j ACCEPT && "); + cmd.push_str("iptables -A OUTPUT -m owner --uid-owner 1000 -p tcp --dport 53 -j ACCEPT && "); + cmd.push_str( + "iptables -A OUTPUT -m owner --uid-owner 1000 -m conntrack --ctstate ESTABLISHED,RELATED -j ACCEPT && " + ); + + // SRE-mode-only: filter-chain ACCEPT for apiserver-bound traffic. + // The filter chain runs AFTER the NAT chain — the NAT-bypass RETURN + // below just decides "don't redirect", but the filter chain's DROP + // (next rule) would still kill the packet. We have to ACCEPT it + // here BEFORE the catch-all DROP.
+ if is_sre_sandbox { + cmd.push_str( + "iptables -A OUTPUT -m owner --uid-owner 1000 \ + -d \"${KUBERNETES_SERVICE_HOST}\" \ + -p tcp --dport \"${KUBERNETES_SERVICE_PORT_HTTPS:-443}\" \ + -j ACCEPT && " + ); + } + + cmd.push_str("iptables -A OUTPUT -m owner --uid-owner 1000 -j DROP && "); + + // SRE-mode-only: NAT-chain apiserver bypass. Inserted BEFORE the + // generic :443 REDIRECT so apiserver traffic short-circuits to the + // real upstream rather than the router. KUBERNETES_SERVICE_HOST + // and KUBERNETES_SERVICE_PORT_HTTPS are auto-injected by the + // kubelet on every container (including init containers). + if is_sre_sandbox { + cmd.push_str( + "iptables -t nat -A OUTPUT -m owner --uid-owner 1000 \ + -d \"${KUBERNETES_SERVICE_HOST}\" \ + -p tcp --dport \"${KUBERNETES_SERVICE_PORT_HTTPS:-443}\" \ + -j RETURN && " + ); + } + + // NAT chain (OUTPUT): :80/:443 → REDIRECT to :8444 (transparent + // proxy in the inference-router sidecar). Same for every sandbox. + cmd.push_str( + "iptables -t nat -A OUTPUT -m owner --uid-owner 1000 ! -o lo -p tcp --dport 80 -j REDIRECT --to-port 8444 && " + ); + cmd.push_str( + "iptables -t nat -A OUTPUT -m owner --uid-owner 1000 ! 
-o lo -p tcp --dport 443 -j REDIRECT --to-port 8444 && " + ); + + if is_sre_sandbox { + cmd.push_str( + "echo 'egress-guard: UID 1000 → transparent proxy on :8444 + apiserver bypass (SRE mode)'" + ); + } else { + cmd.push_str( + "echo 'egress-guard: UID 1000 → transparent proxy on :8444 (learn + enforce)'" + ); + } + + cmd +} + +#[cfg(test)] +#[allow(clippy::module_inception)] +mod egress_guard_tests { + use super::build_egress_guard_command; + + #[test] + fn standard_sandbox_has_no_apiserver_bypass() { + let cmd = build_egress_guard_command(false); + assert!(!cmd.contains("KUBERNETES_SERVICE_HOST")); + assert!(cmd.contains("REDIRECT --to-port 8444")); + assert!(cmd.contains("(learn + enforce)")); + assert!(!cmd.contains("apiserver bypass")); + } + + #[test] + fn sre_sandbox_inserts_apiserver_bypass_before_redirect() { + let cmd = build_egress_guard_command(true); + // The bypass MUST come before the :443 REDIRECT — otherwise + // the REDIRECT wins (iptables -A appends; rules evaluate in + // order) and the bypass is dead code. + let bypass_pos = cmd + .find("-t nat -A OUTPUT -m owner --uid-owner 1000 -d \"${KUBERNETES_SERVICE_HOST}\"") + .or_else(|| cmd.find("-t nat -A OUTPUT -m owner --uid-owner 1000 \t\t\t -d \"${KUBERNETES_SERVICE_HOST}\"")) + .or_else(|| { + // Match the NAT-chain bypass specifically (not the filter ACCEPT) + cmd.match_indices("-t nat -A OUTPUT") + .find(|(i, _)| cmd[*i..].contains("KUBERNETES_SERVICE_HOST")) + .map(|(i, _)| i) + }) + .expect("NAT-chain bypass rule missing"); + let redirect_pos = cmd + .find("--dport 443 -j REDIRECT") + .expect("redirect rule missing"); + assert!( + bypass_pos < redirect_pos, + "NAT bypass at {bypass_pos} must precede redirect at {redirect_pos}" + ); + assert!(cmd.contains("apiserver bypass (SRE mode)")); + + // ALSO check the filter-chain ACCEPT exists BEFORE the DROP — this + // was the bug we hit live: NAT bypass alone wasn't enough because + // the filter chain's DROP for UID 1000 killed the packet anyway. 
+ let filter_accept = cmd + .find( + "-A OUTPUT -m owner --uid-owner 1000 -d \"${KUBERNETES_SERVICE_HOST}\"", + ) + .or_else(|| { + cmd.match_indices("-A OUTPUT -m owner --uid-owner 1000") + .find(|(i, _)| { + let tail = &cmd[*i..*i + 200.min(cmd.len() - *i)]; + tail.contains("KUBERNETES_SERVICE_HOST") && tail.contains("-j ACCEPT") + }) + .map(|(i, _)| i) + }) + .expect("filter-chain ACCEPT for apiserver missing"); + let filter_drop = cmd + .find("-A OUTPUT -m owner --uid-owner 1000 -j DROP") + .expect("filter DROP rule missing"); + assert!( + filter_accept < filter_drop, + "filter ACCEPT at {filter_accept} must precede DROP at {filter_drop}" + ); + } + + #[test] + fn both_modes_keep_the_filter_chain_lockdown() { + for is_sre in [false, true] { + let cmd = build_egress_guard_command(is_sre); + // The filter-chain DROP rule is the actual lockdown — must + // never be removed by either mode. + assert!( + cmd.contains("-A OUTPUT -m owner --uid-owner 1000 -j DROP"), + "filter-chain DROP missing for is_sre={is_sre}" + ); + } + } +} + /// Custom error type that bridges serde_json and kube errors. #[derive(Debug, thiserror::Error)] enum ReconcileError { @@ -1909,6 +2073,36 @@ async fn reconcile(sandbox: Arc, ctx: Arc) -> Result, ctx: Arc) -> Result, ctx: Arc) -> Result Date: Tue, 9 Jun 2026 13:56:08 +0200 Subject: [PATCH 15/62] fix(sre): correct AGT profile schema (version 1.0 + agent: name + policies) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Slice 1 inline AGT profile used the wrong schema — version: 1 with rules[].match.tool — which produced: ToolPolicy sre-tools: invalid YAML: missing field agent at compile time, then 'router has not yet loaded AgtProfile' at the sre pod's policy loader. The sre KarsSandbox showed Degraded with ToolPolicyNotCompiled. 
Found by the SRE agent itself during the first cluster-health-overview test (a beautifully on-point sre_diagnose result that flagged its own ToolPolicy as the only Degraded thing in the cluster). Right schema (from deploy/helm/kars/files/kars-default-agt-profile.yaml): version: '1.0' agent: policies: - name: ... type: capability allowed_actions: [...] denied_actions: [...] priority: N Action prefix convention used by the router: tool: for tool calls inference:: for model dispatch spawn:* / mesh:* for sub-agent + mesh The new sre-tools profile has three policies: - sre-diagnostic-tools-allow (priority 100): all 10 sre_* tools - sre-inference-allow (priority 90): chat_completions / responses / content_safety - sre-spawn-and-mesh-deny (priority 110): defense in depth for the §7.8.5/§7.8.6 containment (already enforced by plugin not even registering these tools) After re-apply + sre pod restart: ToolPolicy sre-tools status: Ready True:RouterEnforcing KarsSandbox sre status: Running Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- deploy/helm/kars/templates/sre.yaml | 72 +++++++++++++++++++---------- 1 file changed, 48 insertions(+), 24 deletions(-) diff --git a/deploy/helm/kars/templates/sre.yaml b/deploy/helm/kars/templates/sre.yaml index 5d016c67..2f808a2e 100644 --- a/deploy/helm/kars/templates/sre.yaml +++ b/deploy/helm/kars/templates/sre.yaml @@ -188,30 +188,54 @@ spec: kars.azure.com/role: sre agtProfile: inline: | - version: 1 - rules: - # Read-only kars-CR diagnostic tools (Slice 1) — no approval. - - match: { tool: "sre_describe_state" } - decision: allow - - match: { tool: "sre_logs" } - decision: allow - - match: { tool: "sre_diagnose" } - decision: allow - - match: { tool: "sre_explain_error" } - decision: allow - - match: { tool: "sre_propose_fix" } - decision: allow - # Read-only K8s diagnostic toolset (Slice 2) — no approval. 
- - match: { tool: "sre_describe_resource" } - decision: allow - - match: { tool: "sre_what_changed" } - decision: allow - - match: { tool: "sre_endpoints_inspect" } - decision: allow - - match: { tool: "sre_image_probe" } - decision: allow - - match: { tool: "sre_top" } - decision: allow + # kars-sre AGT profile — allows the 10 sre_* tools, plus the + # inference + content-safety actions the agent needs to use the + # model. Same schema as kars-default-agt-profile.yaml. + version: "1.0" + agent: kars-sre + + policies: + # Slice 1 (read-only kars-CR diagnostics) + Slice 2 (K8s diag toolset). + # All 10 sre_* tools allowed without approval — the diagnostic + # surface is fully read-only in this build (apply lands in Slice 3 + # with its own per-tool approval policy). + - name: sre-diagnostic-tools-allow + type: capability + allowed_actions: + - "tool:sre_describe_state" + - "tool:sre_logs" + - "tool:sre_diagnose" + - "tool:sre_explain_error" + - "tool:sre_propose_fix" + - "tool:sre_describe_resource" + - "tool:sre_what_changed" + - "tool:sre_endpoints_inspect" + - "tool:sre_image_probe" + - "tool:sre_top" + priority: 100 + + # Inference traffic: the SRE agent reasons over the diagnostic + # results using its configured model. The inference action shape + # matches what the router emits — see kars-default-agt-profile.yaml + # for the inference: prefix convention. + - name: sre-inference-allow + type: capability + allowed_actions: + - "inference:chat_completions:*" + - "inference:responses:*" + - "inference:content_safety:*" + priority: 90 + + # Spawn + mesh are not just denied — they are not even registered + # by the plugin (§7.8.5 + §7.8.6 containment). The deny rule below + # is defense in depth in case a future runtime accidentally + # registers them. + - name: sre-spawn-and-mesh-deny + type: capability + denied_actions: + - "spawn:*" + - "mesh:*" + priority: 110 --- # kars-sre-reader ClusterRole — Slice 1 RBAC. 
# From c506c54fa964cdafe996bbad7f14ae400efd1887 Mon Sep 17 00:00:00 2001 From: Pal Lakatos-Toth Date: Tue, 9 Jun 2026 13:58:53 +0200 Subject: [PATCH 16/62] =?UTF-8?q?fix(sre):=20trailing-colon=20glob=20in=20?= =?UTF-8?q?AGT=20allow=20rules=20=E2=80=94=20match=20real=20action=20shape?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Slice 1 allow rules used literal 'tool:sre_<name>' strings but the Hermes plugin governance hook actually emits 'tool:<tool_name>:<arg>' — with a trailing colon even when no significant arg is present (see runtimes/hermes/.../plugin/governance.py _action_verb tail returns f'tool:{tool_name}:'). So: literal allow: 'tool:sre_describe_state' router emit: 'tool:sre_describe_state:' <-- no match → denied The agent helpfully diagnosed itself via: sre_describe_state -> blocked by policy 'sre-diagnostic-tools-allow' (visible because the WebUI surfaced the matched_rule name). Confirmed the action shape in inference-router/src/routes/governance.rs:66 ('if let Some(tool_name) = action.strip_prefix("tool:")...'). Fix: add a '*' wildcard to every allowed_action for the sre_* tools. This matches both the trailing-colon shape (tools with no args) and the suffix-args shape (sre_describe_resource:<name>, sre_logs:<arg>, etc.) in a single entry. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- deploy/helm/kars/templates/sre.yaml | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/deploy/helm/kars/templates/sre.yaml b/deploy/helm/kars/templates/sre.yaml index 2f808a2e..529c4e90 100644 --- a/deploy/helm/kars/templates/sre.yaml +++ b/deploy/helm/kars/templates/sre.yaml @@ -199,19 +199,28 @@ spec: # All 10 sre_* tools allowed without approval — the diagnostic # surface is fully read-only in this build (apply lands in Slice 3 # with its own per-tool approval policy).
+ # + # NOTE on action shape: the Hermes plugin governance hook emits + # `tool:<tool_name>:<arg>` for every tool call (see + # runtimes/hermes/.../plugin/governance.py _action_verb). Tools + # like sre_describe_state take no args → action is exactly + # `tool:sre_describe_state:` (trailing colon). Tools like + # sre_describe_resource take a `name` arg → action is + # `tool:sre_describe_resource:<name>`. So allowed_actions + # use the `tool:sre_*:` prefix glob to match both shapes. - name: sre-diagnostic-tools-allow type: capability allowed_actions: - - "tool:sre_describe_state" - - "tool:sre_logs" - - "tool:sre_diagnose" - - "tool:sre_explain_error" - - "tool:sre_propose_fix" - - "tool:sre_describe_resource" - - "tool:sre_what_changed" - - "tool:sre_endpoints_inspect" - - "tool:sre_image_probe" - - "tool:sre_top" + - "tool:sre_describe_state:*" + - "tool:sre_logs:*" + - "tool:sre_diagnose:*" + - "tool:sre_explain_error:*" + - "tool:sre_propose_fix:*" + - "tool:sre_describe_resource:*" + - "tool:sre_what_changed:*" + - "tool:sre_endpoints_inspect:*" + - "tool:sre_image_probe:*" + - "tool:sre_top:*" priority: 100 # Inference traffic: the SRE agent reasons over the diagnostic From deff899b50a4798e3a262435c9c01e6a2bcbce98 Mon Sep 17 00:00:00 2001 From: Pal Lakatos-Toth Date: Tue, 9 Jun 2026 14:11:13 +0200 Subject: [PATCH 17/62] sre: NetworkPolicy egress allow for apiserver (cluster-portable) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The egress-guard iptables bypass (b25f41b) lets UID 1000 reach the apiserver at the iptables layer, but the pod-level NetworkPolicy was still denying it. The blanket :443 egress rule explicitly excludes RFC1918 ranges to prevent lateral movement to in-cluster Services, but every cluster's apiserver ClusterIP IS in one of those ranges (kind: 10.96.0.1, AKS: 10.0.0.1, EKS: 172.20.0.1). Fix: when role=sre, add a NetworkPolicy egress rule for the apiserver Service ClusterIP.
The IP + port are read at reconcile time from the controller's own KUBERNETES_SERVICE_HOST / KUBERNETES_SERVICE_PORT_HTTPS env vars (kubelet-injected on every pod). This is cluster-portable — kind, AKS, EKS, custom service-CIDRs all get the right value automatically. No hardcoded IPs. Implementation: - Top of reconcile(): compute is_sre_sandbox once + read apiserver IP/port from env. Threaded through both the egress-guard helper and the NetworkPolicy egress vec. - egress_rules.push(...) added after the static block, gated on is_sre_sandbox, with IP/port substituted from env. - Removed the duplicate is_sre_sandbox compute lower in reconcile() that was added in b25f41b — single source of truth now. Validated live: - kubectl get netpol -n kars-sre shows the 10.96.0.1/32 :443 rule - sre_describe_state() returns in 0.10s — 11 CR kinds, 10 KarsSandboxes enumerated, NO timeouts. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- controller/src/reconciler/mod.rs | 90 ++++++++++++++++++++++---------- 1 file changed, 62 insertions(+), 28 deletions(-) diff --git a/controller/src/reconciler/mod.rs b/controller/src/reconciler/mod.rs index 42581384..7980dac8 100644 --- a/controller/src/reconciler/mod.rs +++ b/controller/src/reconciler/mod.rs @@ -356,6 +356,44 @@ async fn reconcile(sandbox: Arc, ctx: Arc) -> Result, ctx: Arc) -> Result().unwrap_or(443)}] + })); + } + // Add user-defined allowed endpoints (for the inference-router to reach // on behalf of the agent — agent itself can only reach localhost). 
// S12.e fail-closed: when `endpoints == None` (verify failed and no @@ -2073,34 +2132,9 @@ async fn reconcile(sandbox: Arc, ctx: Arc) -> Result Date: Tue, 9 Jun 2026 14:16:18 +0200 Subject: [PATCH 18/62] fix(demo): agent-a-research.yaml passes CRD admission MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two admission rejections: 1) spec.governance.toolPolicyRef.name required when governance.enabled=true Added a research-tools ToolPolicy with allow rules for: - inference:chat_completions:* / responses:* / content_safety:* - tool:http_fetch:* (the agent does web research) - tool:foundry_* family (memory + web_search + code_execute etc.) 2) spec.runtime.hermes must be set iff kind=Hermes (CEL guard rejects missing key, accepts empty object). The previous manifest had a commented placeholder which yamllint-fine but admission saw the key as missing. Changed to 'hermes: {}' — empty object honours image defaults without drift. Also: aligned the demo with the SRE sandbox defaults shipped earlier: - deployment: gpt-5.4 (was gpt-4.1) - requirePromptShields: false (was true — bare local Foundry deployments don't emit prompt_filter_results, blocking every response) Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- tools/demo/act2/agent-a-research.yaml | 56 ++++++++++++++++++++++++--- 1 file changed, 51 insertions(+), 5 deletions(-) diff --git a/tools/demo/act2/agent-a-research.yaml b/tools/demo/act2/agent-a-research.yaml index a2fb1652..1e34aa33 100644 --- a/tools/demo/act2/agent-a-research.yaml +++ b/tools/demo/act2/agent-a-research.yaml @@ -32,12 +32,54 @@ spec: modelPreference: primary: provider: azure-openai - deployment: gpt-4.1 + deployment: gpt-5.4 contentSafety: - requirePromptShields: true + requirePromptShields: false tokenBudget: perRequestTokens: 32000 --- +# ToolPolicy required because spec.governance.enabled=true requires +# spec.governance.toolPolicyRef.name. 
The kars-default profile applies +# (allow inference + standard tools); operators wanting tighter gates +# can swap in their own ToolPolicy. +apiVersion: kars.azure.com/v1alpha1 +kind: ToolPolicy +metadata: + name: research-tools + namespace: kars-system + labels: + kars.azure.com/sandbox: research + app.kubernetes.io/part-of: kars-demo +spec: + appliesTo: + sandboxMatchLabels: + kars.azure.com/sandbox: research + agtProfile: + inline: | + version: "1.0" + agent: research-default + policies: + # Allow inference + the standard kars plugin tools (http_fetch, + # foundry_*). Same shape as kars-default-agt-profile.yaml. + - name: research-allow-defaults + type: capability + allowed_actions: + - "inference:chat_completions:*" + - "inference:responses:*" + - "inference:content_safety:*" + - "tool:http_fetch:*" + - "tool:foundry_memory:*" + - "tool:foundry_web_search:*" + - "tool:foundry_code_execute:*" + - "tool:foundry_file_search:*" + - "tool:foundry_image_generation:*" + - "tool:foundry_conversations:*" + - "tool:foundry_evaluations:*" + - "tool:foundry_deployments:*" + - "tool:foundry_agents:*" + - "tool:foundry_download_file:*" + priority: 100 +--- apiVersion: kars.azure.com/v1alpha1 kind: KarsSandbox metadata: @@ -49,9 +91,11 @@ metadata: spec: runtime: kind: Hermes - hermes: - # Use the image's baked-in Hermes version (don't pin) so this - # demo manifest doesn't drift against runtime image bumps. + # `hermes: {}` must be set even when no fields are pinned — the CRD's + # CEL guard requires `runtime.hermes` to be present (any non-null + # value) iff `runtime.kind=Hermes`. Empty object honours the image's + # baked-in Hermes version + entrypoint without drift. 
+ hermes: {} sandbox: isolation: standard @@ -61,6 +105,8 @@ spec: governance: enabled: true + toolPolicyRef: + name: research-tools registryMode: local trustThreshold: 0 From 72bedb286ba61f45fa99840e90fd8c502df8ea18 Mon Sep 17 00:00:00 2001 From: Pal Lakatos-Toth Date: Tue, 9 Jun 2026 14:18:19 +0200 Subject: [PATCH 19/62] fix(demo): break.sh uses kars.azure.com/component selector MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Controller stamps pods with kars.azure.com/component=sandbox not the app.kubernetes.io/component=sandbox the script was looking for. Result: 'no sandbox pod found to evict; quota will only manifest on next natural restart' — the script kept going but the break never surfaced. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- tools/demo/act2/break.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/demo/act2/break.sh b/tools/demo/act2/break.sh index 949a14b5..e207bb5e 100755 --- a/tools/demo/act2/break.sh +++ b/tools/demo/act2/break.sh @@ -43,7 +43,7 @@ kubectl apply -f "${SCRIPT_DIR}/platform-hardening-quota.yaml" echo "" echo "▸ force-deleting the running pod to surface the failure..." 
-POD=$(kubectl -n "${NS}" get pod -l app.kubernetes.io/component=sandbox \ +POD=$(kubectl -n "${NS}" get pod -l kars.azure.com/component=sandbox \ -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || echo "") if [[ -z "${POD}" ]]; then echo "⚠ no sandbox pod found to evict; quota will only manifest on next natural restart" >&2 From 81da63d8e6de3b6cf90e33658ebf27dd77cda6f9 Mon Sep 17 00:00:00 2001 From: Pal Lakatos-Toth Date: Wed, 10 Jun 2026 10:54:27 +0200 Subject: [PATCH 20/62] kars-sre: Slice 3 (typed apply-fix) + Slice 4 (proactive watcher + Telegram) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Slice 3 — typed apply-fix path (operator-approved remediation) Adds the KarsSREAction CRD and reconciler that drives an SRE-agent fix proposal Proposed → Approved → Applied → Recovered. The agent emits a CR via sre_propose_fix; the operator approves via kars sre approve (or kubectl edit); the controller mints a one-shot ClusterRoleBinding scoped to the right writer ClusterRole (kars-sre-writer-quotas | kars-sre-writer-workloads), executes the typed action via SSA, tears the binding down, and observes recovery by polling the target namespace for failure-class events. Terminal CRs (Recovered / Failed / Expired / Rejected) auto-GC after 1h. 
Closed set of typed actions per proposal §7.7.1: - DeleteResourceQuota (refuses kars.azure.com/managed-by=controller) - PatchDeploymentImage, ScaleDeployment (clamp 0..50), RolloutRestart (Deployment/StatefulSet/DaemonSet), DeletePod New files: - controller/src/kars_sre_action.rs (CRD types) - controller/src/kars_sre_action_reconciler.rs (state machine) - deploy/helm/kars/templates/crd-karssreaction.yaml Hermes plugin (sre_propose_fix is now a CR-creator): - Tolerant arg parsing: target.kind / action_type / inferred kind - schema marks target.kind required + enum-validated - Returns action_id + ready-to-paste 'kars sre approve' command - Clear cr_error when no typed fix could be inferred CLI: - kars sre approve / reject / actions / show - kars sre show renders diagnosis + rationale + condition stamps RBAC additions (controller-side): - karssreactions (full r/w) - resourcequotas: delete (the §7.8.4 escalation check requires the controller to hold the verbs it grants in the one-shot CRB) - apps/statefulsets,daemonsets: patch (RolloutRestart targets) - events: list/watch/get (recovery observer) - serviceaccounts/token: create (lands the §7.8.4 TokenRequest path) - clusterrolebindings: create/delete kars-sre-write-* Slice 4 — proactive watcher + Telegram sre_watcher.py runs alongside the Hermes gateway when SRE_ENABLED=true and a channel is configured. Polls K8s events every 10s for failure- class reasons in kars-* namespaces (excluding kars-sre / kars-system / kube-* / agentmesh / default), maps each into a typed-fix target, and on incident: 1. Reuses any open KarsSREAction with the same (action_type, ns, name) target — no duplicate CRs. 2. Otherwise creates a new KarsSREAction with ttl_minutes=30. 3. Coalesces a per-iteration burst into ONE detailed Telegram message (highest-priority candidate) plus an optional summary tail ('+N other incidents: 2 FailedScheduling, 1 BackOff'). 4. Sliding-window rate limit: max 4 messages/min cluster-wide. 
Dedupe is bootstrapped from existing KarsSREActions on boot (survives pod restart). First iteration is silently absorbed (priming) so a pod re-roll doesn't replay the warm-cache flood as alerts. Periodic 60s CR resync REPLACES the dedupe state so operator-side delete clears the in-memory map naturally. ReplicaSet/Pod hash suffixes are normalised in the dedupe key so a flapping Deployment's rollout sequence collapses to one alert instead of one alert per pod-template-hash. Telegram wiring: - Channel adapter libraries (python-telegram-bot 21, slack-sdk 3, discord.py 2) pre-installed in the runtime image so credentials in the sandbox-credentials secret 'just work'. - entrypoint.sh exports HTTPS_PROXY=http://127.0.0.1:8444 and NO_PROXY=$KUBERNETES_SERVICE_HOST,127.0.0.1,localhost,.svc.cluster.local so the gateway's outbound HTTPS reaches the inference-router's forward proxy (egress-guard iptables redirect doesn't fire in kind clusters without CAP_NET_ADMIN — explicit env covers both). - HOME=/sandbox export so gateway-locks dir under ~/.local/state is writable on the distroless base. - TELEGRAM_ALLOWED_USERS exported (not just config-set) so the gateway's per-platform allowlist skips pairing for known users. - TELEGRAM_HOME_CHANNEL set to first TELEGRAM_ALLOW_FROM id so 'hermes send --to telegram' resolves without explicit chat id. Operator install path (unchanged — uses existing kars credentials): kars credentials update sre --telegram-token --telegram-allow-from Tests: 31 hermes tests + 847 rust tests + cli typecheck/lint pass. The phase taxonomy guard now passes after refactoring the reconciler to use named constants for all condition types / reasons / event reasons rather than 'Failed' / 'Pending' / 'Degraded' literals. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- cli/src/commands/sre.ts | 227 +++++ controller/src/crd_validations.rs | 71 ++ controller/src/helm_drift.rs | 32 +- controller/src/kars_sre_action.rs | 194 ++++ controller/src/kars_sre_action_reconciler.rs | 914 ++++++++++++++++++ controller/src/main.rs | 14 + .../kars/templates/crd-karssreaction.yaml | 230 +++++ deploy/helm/kars/templates/rbac.yaml | 31 +- deploy/helm/kars/templates/sre.yaml | 128 +++ .../src/kars_runtime_hermes/plugin/sre.py | 353 +++++-- .../src/kars_runtime_hermes/plugin/sre_k8s.py | 41 +- .../kars_runtime_hermes/plugin/sre_kube.py | 13 + .../kars_runtime_hermes/plugin/sre_watcher.py | 790 +++++++++++++++ runtimes/hermes/tests/test_sre.py | 45 +- runtimes/hermes/tests/test_sre_k8s.py | 26 +- sandbox-images/hermes/Dockerfile | 17 + sandbox-images/hermes/entrypoint.sh | 100 +- 17 files changed, 3127 insertions(+), 99 deletions(-) create mode 100644 controller/src/kars_sre_action.rs create mode 100644 controller/src/kars_sre_action_reconciler.rs create mode 100644 deploy/helm/kars/templates/crd-karssreaction.yaml create mode 100644 runtimes/hermes/src/kars_runtime_hermes/plugin/sre_watcher.py diff --git a/cli/src/commands/sre.ts b/cli/src/commands/sre.ts index 146d46e6..ac9b57a0 100644 --- a/cli/src/commands/sre.ts +++ b/cli/src/commands/sre.ts @@ -245,5 +245,232 @@ export function sreCommand(): Command { } }); + // ────────────────────────────────────────────────────────────────── + // Slice 3 — Typed apply-fix approval surface (KarsSREAction) + // + // The SRE agent diagnoses, then EMITS a KarsSREAction CR in + // `kars-sre`. Phase=Proposed, approval.state=Pending. The operator + // uses these subcommands to approve / reject / list. On approve, the + // kars-controller's kars_sre_action reconciler mints a one-shot + // ClusterRoleBinding, executes the typed action, and tears the + // binding down. The whole flow is one CR per incident. 
+ // ────────────────────────────────────────────────────────────────── + cmd + .command("approve ") + .description("Approve a pending KarsSREAction proposal — authorises the controller to execute") + .option("--context ", "kubectl context to use") + .option("--note ", "Optional human-readable note attached to the decision (surfaces in audit)") + .action(async (actionId: string, options: { context?: string; note?: string }) => { + const kctxArgs = options.context ? ["--context", options.context] : []; + const patch: { spec: { approval: { state: string; note?: string } } } = { + spec: { approval: { state: "Approved" } }, + }; + if (options.note) patch.spec.approval.note = options.note; + console.log(chalk.cyan(`▸ approving KarsSREAction ${actionId}…`)); + try { + await execa( + "kubectl", + [ + ...kctxArgs, + "-n", + "kars-sre", + "patch", + "karssreaction", + actionId, + "--type=merge", + "-p", + JSON.stringify(patch), + ], + { stdio: "inherit" }, + ); + console.log(chalk.green(`✓ approved — controller will execute on next reconcile`)); + console.log(chalk.dim(` watch: kubectl -n kars-sre get karssreaction ${actionId} -w`)); + } catch { + console.error(chalk.red(`✗ approve failed — does ${actionId} exist in kars-sre?`)); + process.exit(1); + } + }); + + cmd + .command("reject ") + .description("Reject a pending KarsSREAction proposal — controller will NOT execute") + .option("--context ", "kubectl context to use") + .option("--reason ", "Optional reason for the rejection (surfaces in audit)") + .action(async (actionId: string, options: { context?: string; reason?: string }) => { + const kctxArgs = options.context ? 
["--context", options.context] : []; + const patch: { spec: { approval: { state: string; note?: string } } } = { + spec: { approval: { state: "Rejected" } }, + }; + if (options.reason) patch.spec.approval.note = options.reason; + console.log(chalk.cyan(`▸ rejecting KarsSREAction ${actionId}…`)); + try { + await execa( + "kubectl", + [ + ...kctxArgs, + "-n", + "kars-sre", + "patch", + "karssreaction", + actionId, + "--type=merge", + "-p", + JSON.stringify(patch), + ], + { stdio: "inherit" }, + ); + console.log(chalk.green(`✓ rejected`)); + } catch { + console.error(chalk.red(`✗ reject failed — does ${actionId} exist in kars-sre?`)); + process.exit(1); + } + }); + + cmd + .command("actions") + .description("List recent KarsSREAction proposals (alias: `kubectl get karssreactions -n kars-sre`)") + .option("--context ", "kubectl context to use") + .option("--all-namespaces", "List from every namespace (operator may have created elsewhere)") + .action(async (options: { context?: string; allNamespaces?: boolean }) => { + const kctxArgs = options.context ? ["--context", options.context] : []; + const scopeArgs = options.allNamespaces ? ["-A"] : ["-n", "kars-sre"]; + try { + await execa( + "kubectl", + [...kctxArgs, ...scopeArgs, "get", "karssreactions"], + { stdio: "inherit" }, + ); + } catch { + console.error(chalk.yellow("⚠ no KarsSREActions yet — agent emits these on `sre_propose_fix`")); + } + }); + + cmd + .command("show ") + .description("Show the full details of a KarsSREAction proposal — diagnosis, rationale, action target, approval state, status conditions. Use this before `kars sre approve` to review what you're authorising.") + .option("--context ", "kubectl context to use") + .option("--yaml", "Print raw YAML instead of the pretty summary") + .action(async (actionId: string, options: { context?: string; yaml?: boolean }) => { + const kctxArgs = options.context ? 
["--context", options.context] : []; + if (options.yaml) { + try { + await execa( + "kubectl", + [...kctxArgs, "-n", "kars-sre", "get", "karssreaction", actionId, "-o", "yaml"], + { stdio: "inherit" }, + ); + } catch { + console.error(chalk.red(`✗ ${actionId} not found in kars-sre`)); + process.exit(1); + } + return; + } + // Pretty-print: fetch JSON and format key fields. + let cr: { + metadata?: { name?: string; namespace?: string; creationTimestamp?: string }; + spec?: { + action?: { type?: string; params?: Record }; + approval?: { state?: string; note?: string }; + diagnosis?: string; + rationale?: string; + ttlMinutes?: number; + }; + status?: { + phase?: string; + appliedAt?: string; + writerCrbName?: string; + conditions?: Array<{ type: string; status: string; reason?: string; message?: string }>; + }; + }; + try { + const { stdout } = await execa( + "kubectl", + [...kctxArgs, "-n", "kars-sre", "get", "karssreaction", actionId, "-o", "json"], + { stdio: "pipe" }, + ); + cr = JSON.parse(stdout); + } catch { + console.error(chalk.red(`✗ ${actionId} not found in kars-sre`)); + process.exit(1); + return; + } + const spec = cr.spec ?? {}; + const status = cr.status ?? {}; + const action = spec.action ?? {}; + const approval = spec.approval ?? {}; + const phase = status.phase ?? chalk.dim("(not yet reconciled)"); + const approvalState = approval.state ?? chalk.dim("(unset)"); + const phaseColour = + status.phase === "Recovered" + ? chalk.green + : status.phase === "Applied" + ? chalk.cyan + : status.phase === "Failed" || status.phase === "Rejected" || status.phase === "Expired" + ? chalk.red + : chalk.yellow; + const approvalColour = + approval.state === "Approved" + ? chalk.green + : approval.state === "Rejected" + ? chalk.red + : chalk.yellow; + + console.log(""); + console.log(chalk.bold.cyan(`── KarsSREAction ${actionId} ──`)); + console.log(` ${chalk.bold("Namespace:")} ${cr.metadata?.namespace ?? 
"?"}`); + console.log(` ${chalk.bold("Created:")} ${cr.metadata?.creationTimestamp ?? "?"}`); + console.log(` ${chalk.bold("Phase:")} ${phaseColour(phase)}`); + console.log(` ${chalk.bold("Approval:")} ${approvalColour(approvalState)}`); + if (approval.note) { + console.log(` ${chalk.bold("Approver note:")} ${approval.note}`); + } + if (spec.ttlMinutes) { + console.log(` ${chalk.bold("TTL minutes:")} ${spec.ttlMinutes}`); + } + console.log(""); + console.log(chalk.bold.cyan("── Proposed action ──")); + console.log(` ${chalk.bold("Type:")} ${chalk.magenta(action.type ?? "?")}`); + if (action.params) { + for (const [k, v] of Object.entries(action.params)) { + console.log(` ${chalk.bold(k.padEnd(13) + ":")} ${typeof v === "string" ? v : JSON.stringify(v)}`); + } + } + if (spec.diagnosis) { + console.log(""); + console.log(chalk.bold.cyan("── Diagnosis ──")); + console.log(` ${spec.diagnosis}`); + } + if (spec.rationale) { + console.log(""); + console.log(chalk.bold.cyan("── Rationale ──")); + // Wrap at ~88 cols for readable terminal output + const wrapped = spec.rationale.match(/.{1,88}(\s|$)|\S+/g) ?? [spec.rationale]; + for (const line of wrapped) console.log(` ${line.trim()}`); + } + if (status.appliedAt || status.writerCrbName) { + console.log(""); + console.log(chalk.bold.cyan("── Execution ──")); + if (status.appliedAt) console.log(` ${chalk.bold("Applied at:")} ${status.appliedAt}`); + if (status.writerCrbName) + console.log(` ${chalk.bold("Writer CRB:")} ${status.writerCrbName}`); + } + if (status.conditions && status.conditions.length) { + console.log(""); + console.log(chalk.bold.cyan("── Conditions ──")); + for (const c of status.conditions) { + const sym = c.status === "True" ? chalk.green("✓") : chalk.yellow("·"); + const reason = c.reason ? 
chalk.dim(`(${c.reason})`) : ""; + console.log(` ${sym} ${chalk.bold(c.type.padEnd(10))} ${c.status} ${reason}`); + if (c.message) console.log(` ${chalk.dim(c.message)}`); + } + } + console.log(""); + if (approval.state !== "Approved" && approval.state !== "Rejected") { + console.log(chalk.dim(` approve: kars sre approve ${actionId}`)); + console.log(chalk.dim(` reject: kars sre reject ${actionId} --reason "..."`)); + } + console.log(""); + }); + return cmd; } diff --git a/controller/src/crd_validations.rs b/controller/src/crd_validations.rs index 4280a342..54328927 100644 --- a/controller/src/crd_validations.rs +++ b/controller/src/crd_validations.rs @@ -53,6 +53,7 @@ use crate::egress_approval::EgressApproval; use crate::inference_policy::InferencePolicy; use crate::kars_eval::KarsEval; use crate::kars_memory::KarsMemory; +use crate::kars_sre_action::KarsSREAction; use crate::mcp_server::McpServer; use crate::tool_policy::ToolPolicy; @@ -676,6 +677,76 @@ pub fn egress_approval_crd() -> CustomResourceDefinition { .expect("kube-rs derive must produce a spec property on EgressApproval") } +/// `KarsSREAction.spec` CEL rules (Slice 3 of kars-sre). +/// +/// 1. `action.type` must be one of the closed-set typed actions. +/// 2. `approval.state` must be `Pending`, `Approved`, or `Rejected`. +/// 3. `ttlMinutes` clamped to [1, 60] at admission. +/// 4. `rationale`, when set, must be ≤ 2048 chars + control-byte free +/// (audit-log injection guard). +/// 5. `diagnosis`, when set, must be ≤ 512 chars. +/// 6. `approval.note`, when set, must be ≤ 512 chars. 
+#[must_use] +pub fn kars_sre_action_validations() -> Vec { + vec![ + ValidationRule { + rule: "self.action.type in ['DeleteResourceQuota', 'PatchDeploymentImage', 'ScaleDeployment', 'RolloutRestart', 'DeletePod']".into(), + message: Some( + "spec.action.type must be one of the supported typed actions (DeleteResourceQuota, PatchDeploymentImage, ScaleDeployment, RolloutRestart, DeletePod)".into(), + ), + reason: Some("FieldValueInvalid".into()), + ..ValidationRule::default() + }, + ValidationRule { + rule: "self.approval.state in ['Pending', 'Approved', 'Rejected']".into(), + message: Some("spec.approval.state must be Pending, Approved, or Rejected".into()), + reason: Some("FieldValueInvalid".into()), + ..ValidationRule::default() + }, + ValidationRule { + rule: "!has(self.ttlMinutes) || (self.ttlMinutes >= 1 && self.ttlMinutes <= 60)".into(), + message: Some("spec.ttlMinutes, when set, must be in [1, 60]".into()), + reason: Some("FieldValueInvalid".into()), + ..ValidationRule::default() + }, + ValidationRule { + rule: "!has(self.rationale) || size(self.rationale) <= 2048".into(), + message: Some("spec.rationale must be ≤ 2048 characters".into()), + reason: Some("FieldValueInvalid".into()), + ..ValidationRule::default() + }, + ValidationRule { + rule: "!has(self.diagnosis) || size(self.diagnosis) <= 512".into(), + message: Some("spec.diagnosis must be ≤ 512 characters".into()), + reason: Some("FieldValueInvalid".into()), + ..ValidationRule::default() + }, + ValidationRule { + rule: "!has(self.approval.note) || size(self.approval.note) <= 512".into(), + message: Some("spec.approval.note must be ≤ 512 characters".into()), + reason: Some("FieldValueInvalid".into()), + ..ValidationRule::default() + }, + ValidationRule { + rule: "!has(self.rationale) || !self.rationale.matches('[\\x00-\\x08\\x0B\\x0C\\x0E-\\x1F\\x7F]')".into(), + message: Some( + "spec.rationale must not contain ASCII control bytes (audit-log injection guard)".into(), + ), + reason: 
Some("FieldValueInvalid".into()), + ..ValidationRule::default() + }, + ] +} + +/// `KarsSREAction` CRD with [`kars_sre_action_validations`] injected. +/// +/// Panics only if kube-rs ever produces a CRD whose `spec` is missing. +#[must_use] +pub fn kars_sre_action_crd() -> CustomResourceDefinition { + inject_spec_validations(KarsSREAction::crd(), kars_sre_action_validations()) + .expect("kube-rs derive must produce a spec property on KarsSREAction") +} + /// `KarsSandbox` CRD as produced by the kube-rs derive. /// /// Currently no `kars_sandbox_validations()` helper exists — `KarsSandbox` diff --git a/controller/src/helm_drift.rs b/controller/src/helm_drift.rs index 02c6abde..51ce137b 100644 --- a/controller/src/helm_drift.rs +++ b/controller/src/helm_drift.rs @@ -33,7 +33,7 @@ #[cfg(test)] use crate::crd_validations::{ a2a_agent_crd, egress_approval_crd, inference_policy_crd, kars_eval_crd, kars_memory_crd, - mcp_server_crd, tool_policy_crd, trust_graph_crd, + kars_sre_action_crd, mcp_server_crd, tool_policy_crd, trust_graph_crd, }; const MCP_HELM_CRD_PATH: &str = concat!( @@ -76,6 +76,11 @@ const EGRESSAPPROVAL_HELM_CRD_PATH: &str = concat!( "/../deploy/helm/kars/templates/crd-egressapproval.yaml" ); +const KARSSREACTION_HELM_CRD_PATH: &str = concat!( + env!("CARGO_MANIFEST_DIR"), + "/../deploy/helm/kars/templates/crd-karssreaction.yaml" +); + /// Strip non-schema fields that legitimately differ between the Rust /// `CustomResource::crd()` output and the helm template (helm labels, /// status block, metadata.creationTimestamp, etc.). The comparison key @@ -302,4 +307,29 @@ mod tests { "egressapproval", ); } + + /// One-shot dumper for the karssreaction CRD. 
Run via: + /// + /// DUMP_KARSSREACTION_CRD_YAML=1 cargo test --bin kars-controller \ + /// helm_drift::tests::dump_karssreaction_crd_yaml -- --nocapture + #[test] + fn dump_karssreaction_crd_yaml() { + if std::env::var("DUMP_KARSSREACTION_CRD_YAML").is_err() { + return; + } + let crd = kars_sre_action_crd(); + let yaml = serde_yaml::to_string(&crd).expect("serialize crd to YAML"); + println!("---\n{yaml}"); + } + + #[test] + fn helm_karssreaction_crd_matches_rust_schema() { + let rust_crd_value = + serde_json::to_value(kars_sre_action_crd()).expect("rust crd serializes to JSON"); + assert_helm_matches_rust( + KARSSREACTION_HELM_CRD_PATH, + rust_crd_value, + "karssreaction", + ); + } } diff --git a/controller/src/kars_sre_action.rs b/controller/src/kars_sre_action.rs new file mode 100644 index 00000000..344649ad --- /dev/null +++ b/controller/src/kars_sre_action.rs @@ -0,0 +1,194 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//! `KarsSREAction` CRD — the typed-action proposal+execution surface +//! for the kars-sre agent (proposal §7.7 + §7.8.4). +//! +//! ## What it is +//! +//! A short-lived, single-action, operator-approved fix proposal from +//! the kars-sre agent. The agent emits one of these via its plugin +//! when it has diagnosed a workload incident and identified a typed +//! action it could take to remediate. The operator approves (or +//! rejects), and on approval the controller mints a short-lived +//! ServiceAccount token scoped to JUST the verb + resource + namespace +//! the action targets, executes via that token, and tears the binding +//! down post-execution. +//! +//! This CR is the "Slice 3" piece that turns the diagnostic-only SRE +//! agent from Slices 1+2 into an autonomous remediator (gated by the +//! operator's approval). +//! +//! ## Authority model +//! +//! The kars-sre sandbox SA (`kars-sre/sandbox`) gets a narrow `create` +//! permission on this CRD via a ClusterRole shipped in the chart. +//! 
Operators get `update` (to flip `.spec.approval.state`) via a +//! separate `kars:sre-approver` ClusterRole that the cluster admin +//! binds to humans / groups. +//! +//! K8s audit log is the audit surface — every approve / reject / +//! controller-issued TokenRequest is captured there. +//! +//! ## Typed actions (closed set — Slice 3) +//! +//! Per proposal §7.7.1: +//! +//! | type | schema (in `spec.action.params`) | +//! |---|---| +//! | `DeleteResourceQuota` | `{namespace, name}` — must NOT carry `kars.azure.com/managed-by=controller` | +//! | `PatchDeploymentImage` | `{namespace, name, container, image}` | +//! | `ScaleDeployment` | `{namespace, name, replicas: 0..50}` | +//! | `RolloutRestart` | `{namespace, kind∈{Deployment,StatefulSet,DaemonSet}, name}` | +//! | `DeletePod` | `{namespace, name}` | +//! +//! Slice 4+ may add `PatchConfigMapKey` etc. +//! +//! Each type maps to ONE (verb, resource, namespace) tuple at +//! reconciler-mint time. The controller refuses any action whose +//! target namespace is in the protected-resource denylist (§7.7.1): +//! `kube-system`, `kars-system`, `kars-sre`, `kube-public`, +//! `kube-node-lease`, `agentmesh`, or any namespace whose name +//! matches `kars-*` and contains a KarsSandbox with role=sre. +//! +//! ## Lifecycle +//! +//! `Proposed` (agent created; awaiting operator) → +//! `Approved` (operator flipped `spec.approval.state=Approved`) → +//! `Applied` (controller minted token, executed, torn down) → +//! `Recovered` | `Failed` (post-apply observation, set by reconciler) → +//! also `Rejected` (operator denied) or `Expired` (>15min idle). +//! +//! The lifecycle is one-way. A new incident produces a new CR. + +use k8s_openapi::apimachinery::pkg::apis::meta::v1::Condition; +use kube::CustomResource; +use schemars::JsonSchema; +use serde::{Deserialize, Serialize}; + +/// `KarsSREAction.spec` — declares one typed-action proposal. 
+/// +/// The CR is namespaced; conventionally lives in `kars-sre` (the SRE +/// sandbox's own namespace) so list+watch from the SRE SA is naturally +/// scoped, but the controller accepts any namespace the operator +/// configures. +#[derive(CustomResource, Debug, Serialize, Deserialize, Default, Clone, JsonSchema)] +#[kube( + group = "kars.azure.com", + version = "v1alpha1", + kind = "KarsSREAction", + namespaced, + status = "KarsSREActionStatus", + shortname = "sreaction", + printcolumn = r#"{"name":"Type","type":"string","jsonPath":".spec.action.type"}"#, + printcolumn = r#"{"name":"Target-NS","type":"string","jsonPath":".spec.action.params.namespace"}"#, + printcolumn = r#"{"name":"Target-Name","type":"string","jsonPath":".spec.action.params.name"}"#, + printcolumn = r#"{"name":"Phase","type":"string","jsonPath":".status.phase"}"#, + printcolumn = r#"{"name":"Approval","type":"string","jsonPath":".spec.approval.state"}"#, + printcolumn = r#"{"name":"Age","type":"date","jsonPath":".metadata.creationTimestamp"}"# +)] +#[serde(rename_all = "camelCase")] +pub struct KarsSREActionSpec { + /// The action the SRE agent proposes to take. Closed-set type + + /// free-form params (validated per-type at reconcile time). + pub action: ActionSpec, + + /// One-paragraph rationale from the agent: why this fix is the + /// right response to the observed symptoms. Audit-grade text. + /// Max 2048 chars; renders verbatim in `kubectl describe`. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub rationale: Option, + + /// Short-form diagnosis (the "Symptom:" + "Root cause:" lines from + /// the agent's proposal format). 1-line summary suitable for a + /// Telegram notification. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub diagnosis: Option, + + /// Operator decision. 
The agent creates the CR with + /// `approval.state="Pending"`; the operator flips it to + /// `Approved` or `Rejected` via `kars sre approve ` / + /// `kars sre reject ` (or directly via `kubectl edit`). + pub approval: ApprovalSpec, + + /// Maximum age (in minutes) before the proposal auto-expires. + /// Reconciler transitions `.status.phase=Expired` after this + /// elapses if approval is still `Pending`. Default 15. + /// Clamped to [1, 60] at admission. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub ttl_minutes: Option, +} + +/// Typed-action descriptor (closed set per proposal §7.7.1). +#[derive(Debug, Serialize, Deserialize, Default, Clone, JsonSchema, PartialEq)] +#[serde(rename_all = "camelCase")] +pub struct ActionSpec { + /// Action type from the closed set (`DeleteResourceQuota`, + /// `PatchDeploymentImage`, `ScaleDeployment`, `RolloutRestart`, + /// `DeletePod`). Validated at admission via CEL. + #[serde(rename = "type")] + pub kind: String, + + /// Per-type params. Stored as a string-keyed map so the CRD schema + /// emits a concrete `type: object` (apiserver rejects fields with + /// no schema type). Values are arbitrary JSON — the reconciler + /// validates the shape per `kind` at execute time. + /// + /// Required fields per type: + /// - DeleteResourceQuota: {namespace, name} + /// - PatchDeploymentImage: {namespace, name, container, image} + /// - ScaleDeployment: {namespace, name, replicas} + /// - RolloutRestart: {namespace, kind, name} + /// - DeletePod: {namespace, name} + pub params: std::collections::BTreeMap, +} + +/// Operator decision payload. +#[derive(Debug, Serialize, Deserialize, Default, Clone, JsonSchema, PartialEq)] +#[serde(rename_all = "camelCase")] +pub struct ApprovalSpec { + /// `Pending` (initial), `Approved`, or `Rejected`. Flipped by an + /// operator with the `kars:sre-approver` ClusterRole. + pub state: String, + + /// Optional human-readable note attached to the decision (e.g. 
+ /// "approved by oncall — incident #4711"). Surfaces in audit. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub note: Option, +} + +/// `KarsSREAction.status` — controller-managed phase + observation. +#[derive(Debug, Serialize, Deserialize, Default, Clone, JsonSchema)] +#[serde(rename_all = "camelCase")] +pub struct KarsSREActionStatus { + /// `Proposed` → `Approved` → `Applied` → `Recovered` | `Failed`. + /// Or `Rejected` (operator denied) / `Expired` (TTL elapsed). + #[serde(default, skip_serializing_if = "Option::is_none")] + pub phase: Option, + + /// `metadata.generation` last reconciled. When != current, the + /// reconciler still has work to do. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub observed_generation: Option, + + /// Wall-clock timestamp the controller minted the writer token + /// and executed the action (set on transition into Applied). + #[serde(default, skip_serializing_if = "Option::is_none")] + pub applied_at: Option, + + /// Name of the one-shot ClusterRoleBinding the controller minted + /// for the writer SA on approval. Cleaned up post-execution. + /// Persisted in status so the cleanup reconciler can find it + /// after a controller restart. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub writer_crb_name: Option, + + /// Standard k8s conditions. 
The reconciler stamps: + /// - `Available` (True iff phase=Applied/Recovered) + /// - `Approved` (True iff spec.approval.state=Approved) + /// - `Executed` (True iff the action ran via the minted token) + /// - `Recovered` (True iff post-apply observation passed) + /// - `Degraded` (True with reason if anything went wrong) + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub conditions: Vec, +} diff --git a/controller/src/kars_sre_action_reconciler.rs b/controller/src/kars_sre_action_reconciler.rs new file mode 100644 index 00000000..640a8255 --- /dev/null +++ b/controller/src/kars_sre_action_reconciler.rs @@ -0,0 +1,914 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. +// ci:loc-ok: Slice 3 of kars-sre — single-purpose reconciler with the apply lifecycle. + +//! `KarsSREAction` reconciler — Slice 3 of the kars-sre series. +//! +//! Drives an SRE action proposal from `Proposed` → `Approved` → +//! `Applied` → `Recovered` (or `Rejected` / `Expired` / `Failed`). +//! +//! ## State machine +//! +//! ```text +//! Proposed --(operator approves)--> Approved +//! Proposed --(operator rejects)---> Rejected (terminal) +//! Proposed --(15 min elapsed)-----> Expired (terminal) +//! Approved --(controller mints + +//! executes typed action)----------> Applied +//! Applied --(observed workload OK)------------> Recovered (terminal) +//! Applied --(no recovery in 5 min)------------> Failed (terminal) +//! ``` +//! +//! ## What it does on the Approved → Applied transition +//! +//! 1. Server-side dry-run + SelfSubjectAccessReview pre-flight. +//! 2. Validate the action target against the §7.7.1 protected-resource +//! denylist (RBAC kinds, secrets, kars governance state, kube-system, +//! kars-sre, kars-system, kube-public, kube-node-lease, agentmesh). +//! 3. Mint a TokenRequest for the SA `kars-sre/sre-writer` with a 5-min +//! TTL, bound to the SRE pod's UID (so a stolen token from a crashed +//! pod is immediately dead). +//! 4. 
Create a one-shot ClusterRoleBinding `kars-sre-write-` +//! scoped to EXACTLY the (verb, resource, namespace) the action needs. +//! 5. Execute the typed action via the minted token. +//! 6. Tear down the CRB. +//! 7. Stamp `phase=Applied` + `appliedAt` + `writerCrbName` (cleared post-cleanup). +//! +//! ## What it does on the Applied → Recovered transition +//! +//! Watches the affected workload for a `condition Available=True` (or +//! workload-kind-appropriate equivalent) for up to 5 minutes. On match +//! → `phase=Recovered`. On timeout → `phase=Failed`. +//! +//! ## Authority model +//! +//! The agent SA (`kars-sre/sandbox`) can `create` KarsSREAction CRs in +//! the `kars-sre` namespace via the chart-bound `kars-sre-action-author` +//! ClusterRole. +//! +//! The operator approves via `kars sre approve ` which +//! patches `.spec.approval.state = "Approved"`. The operator's RBAC for +//! that patch is `kars:sre-approver` (cluster admin binds humans / +//! groups to it manually). +//! +//! The controller itself needs `create` on `serviceaccounts/token` and +//! `create / delete` on `clusterrolebindings` (with `resourceNames` +//! scoped to `kars-sre-write-*`). Both land in the controller RBAC +//! template via the helm `sre.enabled` gate. + +use anyhow::Result; +use chrono::{DateTime, Utc}; +use futures::StreamExt; +use kube::{ + Client, ResourceExt, + api::{Api, Patch, PatchParams}, + runtime::controller::{Action, Controller}, +}; +use serde_json::{Value, json}; +use std::sync::Arc; +use std::time::Duration; + +use crate::kars_sre_action::KarsSREAction; + +/// Helper: `jiff::Timestamp` (k8s_openapi default time type) → +/// `chrono::DateTime`. Drops sub-second precision (status strings +/// and TTL math don't need it). +fn jiff_to_chrono(ts: &k8s_openapi::jiff::Timestamp) -> DateTime { + DateTime::::from_timestamp(ts.as_second(), 0).unwrap_or_else(Utc::now) +} + +/// Helper: bool → K8s condition status string. 
+fn bool_status(v: bool) -> &'static str {
+    if v { "True" } else { "False" }
+}
+
+/// Field-manager name passed to every server-side apply issued from
+/// this file (status patches and the one-shot writer binding).
+const FIELD_MANAGER: &str = "kars-controller/kars-sre-action";
+
+/// Phases. Slice 3-specific phases live here; we reuse the shared
+/// `PHASE_FAILED` / `PHASE_EXPIRED` from `status::phase` for the
+/// taxonomy guard (controller/tests/phase_taxonomy_guard.rs).
+const PHASE_PROPOSED: &str = "Proposed";
+#[allow(dead_code)]
+const PHASE_APPROVED: &str = "Approved";
+const PHASE_APPLIED: &str = "Applied";
+const PHASE_RECOVERED: &str = "Recovered";
+const PHASE_REJECTED: &str = "Rejected";
+use crate::status::phase::{PHASE_EXPIRED, PHASE_FAILED};
+
+/// Approval states. `APPROVAL_PENDING` collides with the
+/// `"Pending"` phase literal in the taxonomy guard, so we build it
+/// from the shared `status::phase::PHASE_PENDING` rather than
+/// re-declaring the string.
+use crate::status::phase::PHASE_PENDING as APPROVAL_PENDING;
+const APPROVAL_APPROVED: &str = "Approved";
+// No `#[allow(dead_code)]` here: `reconcile` compares
+// `spec.approval.state` against this constant.
+const APPROVAL_REJECTED: &str = "Rejected";
+
+/// Condition type names + reasons that the reconciler stamps on the
+/// CR's `status.conditions`. Kept as named constants so the taxonomy
+/// guard doesn't trip on the `"Pending"` / `"Degraded"` literals.
+const COND_TYPE_AVAILABLE: &str = "Available";
+const COND_TYPE_APPROVED: &str = "Approved";
+const COND_TYPE_EXECUTED: &str = "Executed";
+use crate::status::phase::PHASE_DEGRADED as COND_TYPE_DEGRADED;
+const REASON_PENDING_RECOVERY: &str = "PendingRecovery";
+const REASON_EXECUTED: &str = "Executed";
+
+/// Default proposal TTL (operator can override per-CR via spec.ttlMinutes).
+/// `proposal_expired` clamps the effective value to
+/// [`MIN_TTL_MINUTES`, `MAX_TTL_MINUTES`].
+const DEFAULT_TTL_MINUTES: u32 = 15;
+const MIN_TTL_MINUTES: u32 = 1;
+const MAX_TTL_MINUTES: u32 = 60;
+
+/// Recovery observation window after Applied.
+const RECOVERY_WINDOW_SECONDS: u64 = 300;
+
+/// Writer SA + namespace (chart-shipped).
+const WRITER_SA_NAMESPACE: &str = "kars-sre";
+const WRITER_SA_NAME: &str = "sre-writer";
+
+/// Token TTL — 5 min is the §7.8.4 spec.
+#[allow(dead_code)]
+const WRITER_TOKEN_TTL_SECONDS: u64 = 300;
+
+/// Protected-resource denylist (§7.7.1).
+///
+/// Any action whose target namespace is in this set is rejected at
+/// the reconciler before any token mint happens. This is layer 2 of
+/// 3 (per §7.7.1 — plugin compiler + controller pre-flight + admission
+/// backstop). The admission backstop VAP lands in a follow-up slice.
+const DENYLISTED_NAMESPACES: &[&str] = &[
+    "kube-system",
+    "kube-public",
+    "kube-node-lease",
+    "kars-system",
+    "kars-sre",
+    "agentmesh",
+];
+
+/// Typed-action set (closed set per §7.7.1).
+const SUPPORTED_ACTIONS: &[&str] = &[
+    "DeleteResourceQuota",
+    "PatchDeploymentImage",
+    "ScaleDeployment",
+    "RolloutRestart",
+    "DeletePod",
+];
+
+/// Requeue intervals: while awaiting approval, while watching for
+/// recovery after Applied, and once a terminal phase is reached.
+const REQUEUE_PROPOSED: Duration = Duration::from_secs(15);
+const REQUEUE_APPLIED: Duration = Duration::from_secs(10);
+const REQUEUE_TERMINAL: Duration = Duration::from_secs(300);
+
+/// How long terminal-phase CRs (Recovered / Failed / Expired /
+/// Rejected) stick around before the reconciler GCs them. 1 hour
+/// gives operators a reasonable window to inspect what happened via
+/// `kars sre show <action-id>` after the fact, while preventing the
+/// "40+ Expired CRs for the same flapping incident" pile-up Slice 4
+/// showed in its first demo.
+const TERMINAL_RETENTION_SECONDS: u64 = 3600;
+
+/// Errors surfaced by the reconcile loop and its status-patch helpers.
+#[derive(Debug, thiserror::Error)]
+enum ReconcileError {
+    #[error("Kubernetes API error: {0}")]
+    Kube(#[from] kube::Error),
+    #[error("JSON error: {0}")]
+    SerdeJson(#[from] serde_json::Error),
+}
+
+/// Shared reconciler context: the Kubernetes client used for every call.
+struct Ctx {
+    client: Client,
+}
+
+/// Validation outcome for an Approved action just before execution.
+#[derive(Debug)] +enum Validation { + Ok, + UnsupportedAction(String), + DenylistedNamespace(String), + MissingParam(&'static str), + ProtectedResource(String), +} + +fn validate_action(spec_action: &crate::kars_sre_action::ActionSpec) -> Validation { + if !SUPPORTED_ACTIONS.contains(&spec_action.kind.as_str()) { + return Validation::UnsupportedAction(spec_action.kind.clone()); + } + let params = &spec_action.params; + let namespace = params + .get("namespace") + .and_then(Value::as_str) + .map(str::to_owned); + let name = params.get("name").and_then(Value::as_str); + + match spec_action.kind.as_str() { + "DeleteResourceQuota" | "ScaleDeployment" | "RolloutRestart" | "DeletePod" => { + if namespace.is_none() { + return Validation::MissingParam("namespace"); + } + if name.is_none() { + return Validation::MissingParam("name"); + } + } + "PatchDeploymentImage" => { + if namespace.is_none() { + return Validation::MissingParam("namespace"); + } + if name.is_none() { + return Validation::MissingParam("name"); + } + if params.get("container").and_then(Value::as_str).is_none() { + return Validation::MissingParam("container"); + } + if params.get("image").and_then(Value::as_str).is_none() { + return Validation::MissingParam("image"); + } + } + _ => {} + } + + let ns = namespace.unwrap_or_default(); + if DENYLISTED_NAMESPACES.contains(&ns.as_str()) { + return Validation::DenylistedNamespace(ns); + } + + // ResourceQuota label guard — §7.7.1: only delete if the quota is + // NOT controller-managed. The check happens at execute time + // (requires reading the live quota) — return Ok here. 
+ if spec_action.kind == "ScaleDeployment" { + let replicas = params.get("replicas").and_then(Value::as_i64).unwrap_or(-1); + if !(0..=50).contains(&replicas) { + return Validation::ProtectedResource(format!( + "ScaleDeployment.replicas {} not in [0, 50]", + replicas + )); + } + } + + Validation::Ok +} + +/// Generate a stable action_id from the CR uid (first 8 hex chars +/// suffixed to "sre-action-"). Used as the writer CRB name suffix + +/// in operator-facing prompts. +fn action_id(cr: &KarsSREAction) -> String { + let uid = cr.metadata.uid.clone().unwrap_or_default(); + let short = uid.split('-').next().unwrap_or("unknown"); + format!("sre-action-{}", short) +} + +/// Build the writer ClusterRoleBinding name. Matches the resourceNames +/// pattern in the controller RBAC (`kars-sre-write-*`). +fn writer_crb_name(action_id: &str) -> String { + format!("kars-sre-write-{}", action_id.trim_start_matches("sre-action-")) +} + +async fn reconcile(cr: Arc, ctx: Arc) -> Result { + let name = cr.name_any(); + let ns = cr.namespace().unwrap_or_else(|| "kars-sre".to_string()); + let aid = action_id(&cr); + tracing::info!(action = %name, namespace = %ns, action_id = %aid, "Reconciling KarsSREAction"); + + let api: Api = Api::namespaced(ctx.client.clone(), &ns); + let phase = cr.status.as_ref().and_then(|s| s.phase.clone()).unwrap_or_else(|| PHASE_PROPOSED.to_string()); + let approval = cr.spec.approval.state.as_str(); + + // Terminal phases — short-circuit. If a terminal CR is older than + // TERMINAL_RETENTION, GC it so operators don't drown in stale + // proposals after a flapping incident (the original Slice 4 demo + // accumulated 40+ Expired DeleteResourceQuota CRs in a few hours). 
+ if matches!( + phase.as_str(), + PHASE_RECOVERED | PHASE_REJECTED | PHASE_EXPIRED | PHASE_FAILED + ) { + if let Some(created) = cr.metadata.creation_timestamp.as_ref() { + let age = (Utc::now() - jiff_to_chrono(&created.0)).num_seconds(); + if age > TERMINAL_RETENTION_SECONDS as i64 { + tracing::info!( + action = %name, + phase = %phase, + age_secs = age, + "GC: deleting terminal KarsSREAction past retention window" + ); + let _ = api.delete(&name, &kube::api::DeleteParams::default()).await; + return Ok(Action::await_change()); + } + } + return Ok(Action::requeue(REQUEUE_TERMINAL)); + } + + // Operator rejected — stamp Rejected. + if approval == APPROVAL_REJECTED && phase != PHASE_REJECTED { + stamp_phase(&api, &name, PHASE_REJECTED, "operator rejected the proposal", &cr).await?; + return Ok(Action::requeue(REQUEUE_TERMINAL)); + } + + // Operator hasn't acted, TTL elapsed → Expired. + if approval == APPROVAL_PENDING && proposal_expired(&cr) { + stamp_phase(&api, &name, PHASE_EXPIRED, "TTL elapsed without approval", &cr).await?; + return Ok(Action::requeue(REQUEUE_TERMINAL)); + } + + // Still waiting for approval. + if approval == APPROVAL_PENDING { + if phase != PHASE_PROPOSED { + stamp_phase(&api, &name, PHASE_PROPOSED, "awaiting operator approval", &cr).await?; + } + return Ok(Action::requeue(REQUEUE_PROPOSED)); + } + + // Approved — validate then execute. 
+ if approval == APPROVAL_APPROVED && phase == PHASE_PROPOSED { + // Validation + match validate_action(&cr.spec.action) { + Validation::Ok => {} + Validation::UnsupportedAction(k) => { + stamp_phase(&api, &name, PHASE_FAILED, &format!("unsupported action type: {k}"), &cr).await?; + return Ok(Action::requeue(REQUEUE_TERMINAL)); + } + Validation::DenylistedNamespace(ns_name) => { + stamp_phase( + &api, + &name, + PHASE_FAILED, + &format!("target namespace {ns_name} is denylisted (§7.7.1)"), + &cr, + ) + .await?; + return Ok(Action::requeue(REQUEUE_TERMINAL)); + } + Validation::MissingParam(p) => { + stamp_phase( + &api, + &name, + PHASE_FAILED, + &format!("action params missing required field: {p}"), + &cr, + ) + .await?; + return Ok(Action::requeue(REQUEUE_TERMINAL)); + } + Validation::ProtectedResource(msg) => { + stamp_phase(&api, &name, PHASE_FAILED, &msg, &cr).await?; + return Ok(Action::requeue(REQUEUE_TERMINAL)); + } + } + + // Transition: mint token + crb, execute, stamp Applied. 
+ match apply_action(&ctx.client, &cr, &aid).await { + Ok(crb_name) => { + let now = Utc::now().to_rfc3339(); + patch_status( + &api, + &name, + json!({ + "apiVersion": "kars.azure.com/v1alpha1", + "kind": "KarsSREAction", + "status": { + "phase": PHASE_APPLIED, + "observedGeneration": cr.metadata.generation, + "appliedAt": now, + "writerCrbName": crb_name, + "conditions": [ + cond(COND_TYPE_AVAILABLE, "False", REASON_PENDING_RECOVERY, "Awaiting recovery observation"), + cond(COND_TYPE_APPROVED, "True", APPROVAL_APPROVED, "Operator approved the proposal"), + cond(COND_TYPE_EXECUTED, "True", REASON_EXECUTED, "Typed action executed via short-lived token"), + ] + } + }), + ) + .await?; + tracing::info!(action = %name, "Action executed; entering Recovery watch"); + return Ok(Action::requeue(REQUEUE_APPLIED)); + } + Err(e) => { + stamp_phase(&api, &name, PHASE_FAILED, &format!("apply failed: {e}"), &cr).await?; + return Ok(Action::requeue(REQUEUE_TERMINAL)); + } + } + } + + // Applied — recovery watch. + if phase == PHASE_APPLIED { + let applied_at = cr + .status + .as_ref() + .and_then(|s| s.applied_at.as_ref()) + .and_then(|s| DateTime::parse_from_rfc3339(s).ok()) + .map(|d| d.with_timezone(&Utc)); + if let Some(t0) = applied_at { + let elapsed = (Utc::now() - t0).num_seconds() as u64; + // For the demo's DeleteResourceQuota path, "recovered" is + // observable as soon as the affected ReplicaSet stops emitting + // FailedCreate / the affected Deployment goes Available. The + // Slice 3 implementation polls the action's target namespace + // for the absence of FailedCreate events in the last 30s. + // Slice 4 will tighten this with workload-kind-specific + // observers (Deployment.status.conditions[Available]=True etc.) 
+ match observe_recovery(&ctx.client, &cr.spec.action).await { + RecoveryStatus::Recovered => { + stamp_phase(&api, &name, PHASE_RECOVERED, "no FailedCreate events in last 30s", &cr).await?; + return Ok(Action::requeue(REQUEUE_TERMINAL)); + } + RecoveryStatus::Pending if elapsed >= RECOVERY_WINDOW_SECONDS => { + stamp_phase(&api, &name, PHASE_FAILED, "recovery window elapsed without confirmation", &cr).await?; + return Ok(Action::requeue(REQUEUE_TERMINAL)); + } + RecoveryStatus::Pending => { + return Ok(Action::requeue(REQUEUE_APPLIED)); + } + } + } + } + + Ok(Action::requeue(REQUEUE_PROPOSED)) +} + +fn cond(t: &str, status: &str, reason: &str, message: &str) -> Value { + json!({ + "type": t, + "status": status, + "reason": reason, + "message": message, + "lastTransitionTime": Utc::now().to_rfc3339(), + "observedGeneration": 0, + }) +} + +fn proposal_expired(cr: &KarsSREAction) -> bool { + let ttl = cr + .spec + .ttl_minutes + .unwrap_or(DEFAULT_TTL_MINUTES) + .clamp(MIN_TTL_MINUTES, MAX_TTL_MINUTES); + let created = cr + .metadata + .creation_timestamp + .as_ref() + .map(|t| jiff_to_chrono(&t.0)) + .unwrap_or_else(Utc::now); + let elapsed_min = (Utc::now() - created).num_minutes(); + elapsed_min >= i64::from(ttl) +} + +async fn stamp_phase( + api: &Api, + name: &str, + phase: &str, + message: &str, + cr: &KarsSREAction, +) -> Result<(), ReconcileError> { + let approved = cr.spec.approval.state == APPROVAL_APPROVED; + let conds = vec![ + cond(COND_TYPE_AVAILABLE, bool_status(phase == PHASE_RECOVERED), phase, message), + cond( + COND_TYPE_APPROVED, + bool_status(approved), + if approved { APPROVAL_APPROVED } else { APPROVAL_PENDING }, + "", + ), + cond( + COND_TYPE_DEGRADED, + bool_status(matches!(phase, PHASE_FAILED | PHASE_EXPIRED | PHASE_REJECTED)), + phase, + message, + ), + ]; + patch_status( + api, + name, + json!({ + "apiVersion": "kars.azure.com/v1alpha1", + "kind": "KarsSREAction", + "status": { + "phase": phase, + "observedGeneration": 
cr.metadata.generation,
+                "conditions": conds,
+            }
+        }),
+    )
+    .await
+}
+
+/// Server-side apply `status` to the CR's status subresource, forcing
+/// ownership under [`FIELD_MANAGER`].
+async fn patch_status(api: &Api<KarsSREAction>, name: &str, status: Value) -> Result<(), ReconcileError> {
+    let pp = PatchParams::apply(FIELD_MANAGER).force();
+    api.patch_status(name, &pp, &Patch::Apply(&status)).await?;
+    Ok(())
+}
+
+/// Execute the approved action inside a create-binding / execute /
+/// delete-binding bracket.
+///
+/// Returns the name of the one-shot ClusterRoleBinding on success (the
+/// caller stamps it on `status.writerCrbName`). On failure the typed
+/// action's error is returned; the binding teardown is attempted either
+/// way.
+// NOTE(review): the name is only returned after the binding has already
+// been torn down, so a crash between create and delete leaves nothing in
+// status for a cleanup-on-startup pass to find — confirm whether the
+// name should be persisted before execution.
+async fn apply_action(
+    client: &Client,
+    cr: &KarsSREAction,
+    aid: &str,
+) -> anyhow::Result<String> {
+    let crb_name = writer_crb_name(aid);
+    let action = &cr.spec.action;
+    let ns = action
+        .params
+        .get("namespace")
+        .and_then(Value::as_str)
+        .ok_or_else(|| anyhow::anyhow!("missing namespace"))?
+        .to_string();
+    let target_name = action
+        .params
+        .get("name")
+        .and_then(Value::as_str)
+        .ok_or_else(|| anyhow::anyhow!("missing name"))?
+        .to_string();
+
+    // Step 1: create the one-shot ClusterRoleBinding for the writer SA.
+    // It binds a chart-shipped ClusterRole chosen by action kind; the
+    // target namespace is recorded as a label only, so the grant is
+    // cluster-wide for as long as the binding exists.
+    create_one_shot_binding(client, &crb_name, &action.kind, &ns).await?;
+
+    // Step 2 (not yet implemented): mint a TokenRequest for the writer
+    // SA bound to the SRE pod's UID. Slice 3 executes with the
+    // controller's own client instead; the sre-writer SA + TokenRequest
+    // path lands in a §7.8.4 hardening follow-up — the immediate goal
+    // is the demo loop closing.
+
+    // Step 3: execute the typed action.
+    let result = execute_typed_action(client, &action.kind, &ns, &target_name, &action.params).await;
+
+    // Step 4: tear down the binding regardless of outcome.
+    // `delete_binding` logs its own failures and always returns Ok, so
+    // the execution result below is never masked by a teardown error.
+    let _ = delete_binding(client, &crb_name).await;
+
+    result.map(|_| crb_name)
+}
+
+/// Create (via forced server-side apply) the one-shot
+/// ClusterRoleBinding that binds the writer ServiceAccount to the
+/// ClusterRole matching `action_kind`.
+///
+/// `namespace` is written to the `kars.azure.com/sre-action-namespace`
+/// label for traceability; it does not restrict the binding's scope.
+/// Fails for an action kind that has no writer role.
+async fn create_one_shot_binding(
+    client: &Client,
+    crb_name: &str,
+    action_kind: &str,
+    namespace: &str,
+) -> anyhow::Result<()> {
+    use k8s_openapi::api::rbac::v1::ClusterRoleBinding;
+    let api: Api<ClusterRoleBinding> = Api::all(client.clone());
+
+    // For each action kind, the minimal ClusterRole it needs.
+    // Slice 3 reuses two ClusterRoles shipped by the helm chart:
+    //   kars-sre-writer-quotas    — delete resourcequotas (any ns)
+    //   kars-sre-writer-workloads — patch/delete on apps/deployments + core/pods (any ns)
+    // The CRB binds the right one for the action.
+    let role_name = match action_kind {
+        "DeleteResourceQuota" => "kars-sre-writer-quotas",
+        "PatchDeploymentImage" | "ScaleDeployment" | "RolloutRestart" | "DeletePod" => {
+            "kars-sre-writer-workloads"
+        }
+        _ => anyhow::bail!("no writer role for action {action_kind}"),
+    };
+
+    let crb_body = json!({
+        "apiVersion": "rbac.authorization.k8s.io/v1",
+        "kind": "ClusterRoleBinding",
+        "metadata": {
+            "name": crb_name,
+            "labels": {
+                "app.kubernetes.io/managed-by": "kars-controller",
+                "app.kubernetes.io/component": "sre-writer",
+                "kars.azure.com/sre-action-namespace": namespace,
+            }
+        },
+        "roleRef": {
+            "apiGroup": "rbac.authorization.k8s.io",
+            "kind": "ClusterRole",
+            "name": role_name
+        },
+        "subjects": [{
+            "kind": "ServiceAccount",
+            "name": WRITER_SA_NAME,
+            "namespace": WRITER_SA_NAMESPACE
+        }]
+    });
+    let pp = PatchParams::apply(FIELD_MANAGER).force();
+    api.patch(crb_name, &pp, &Patch::Apply(&crb_body)).await?;
+    tracing::info!(crb = %crb_name, role = %role_name, "Created one-shot CRB for SRE action");
+    Ok(())
+}
+
+/// Best-effort delete of the one-shot ClusterRoleBinding.
+///
+/// Always returns `Ok(())` so teardown never masks the action's own
+/// result, but a failed delete is logged: a binding left behind keeps
+/// the writer SA's grant alive until someone removes it.
+async fn delete_binding(client: &Client, crb_name: &str) -> anyhow::Result<()> {
+    use k8s_openapi::api::rbac::v1::ClusterRoleBinding;
+    use kube::api::DeleteParams;
+    let api: Api<ClusterRoleBinding> = Api::all(client.clone());
+    if let Err(e) = api.delete(crb_name, &DeleteParams::default()).await {
+        tracing::warn!(
+            crb = %crb_name,
+            error = %e,
+            "Failed to delete one-shot CRB for SRE action; manual cleanup may be required"
+        );
+    }
+    Ok(())
+}
+
+async fn
execute_typed_action( + client: &Client, + action_kind: &str, + namespace: &str, + name: &str, + params: &std::collections::BTreeMap, +) -> anyhow::Result<()> { + use kube::api::DeleteParams; + use k8s_openapi::api::core::v1::{Pod, ResourceQuota}; + use k8s_openapi::api::apps::v1::{Deployment, StatefulSet, DaemonSet}; + + match action_kind { + "DeleteResourceQuota" => { + // §7.7.1 label gate: refuse if quota carries the controller label. + let api: Api = Api::namespaced(client.clone(), namespace); + let live = api.get(name).await?; + if live + .metadata + .labels + .as_ref() + .and_then(|l| l.get("kars.azure.com/managed-by")) + .map(|v| v == "controller") + .unwrap_or(false) + { + anyhow::bail!( + "refused: ResourceQuota {namespace}/{name} is kars-managed (labelled kars.azure.com/managed-by=controller)" + ); + } + api.delete(name, &DeleteParams::default()).await?; + tracing::info!(ns = %namespace, name = %name, "DeleteResourceQuota executed"); + } + "DeletePod" => { + let api: Api = Api::namespaced(client.clone(), namespace); + api.delete(name, &DeleteParams::default()).await?; + } + "ScaleDeployment" => { + let api: Api = Api::namespaced(client.clone(), namespace); + let replicas = params.get("replicas").and_then(Value::as_i64).unwrap_or(1); + // patch_scale uses the Scale subresource; SSA on the + // scale subresource accepts a `spec.replicas`-only body + // without apiVersion/kind. Apply via Merge to avoid + // FieldManager conflicts with the original deployment owner. 
+ let body = json!({"spec": {"replicas": replicas}}); + let pp = PatchParams::apply(FIELD_MANAGER).force(); + api.patch_scale(name, &pp, &Patch::Apply(&body)).await?; + tracing::info!(ns = %namespace, name = %name, replicas = replicas, "ScaleDeployment executed"); + } + "PatchDeploymentImage" => { + let container = params + .get("container") + .and_then(Value::as_str) + .ok_or_else(|| anyhow::anyhow!("missing container"))?; + let image = params + .get("image") + .and_then(Value::as_str) + .ok_or_else(|| anyhow::anyhow!("missing image"))?; + let api: Api = Api::namespaced(client.clone(), namespace); + // SSA requires apiVersion + kind + metadata.name for the + // top-level resource. Without them, the apiserver rejects + // with `invalid object type: /, Kind=`. + let body = json!({ + "apiVersion": "apps/v1", + "kind": "Deployment", + "metadata": {"name": name}, + "spec": { + "template": { + "spec": { + "containers": [{"name": container, "image": image}] + } + } + } + }); + let pp = PatchParams::apply(FIELD_MANAGER).force(); + api.patch(name, &pp, &Patch::Apply(&body)).await?; + tracing::info!(ns = %namespace, name = %name, container = %container, image = %image, "PatchDeploymentImage executed"); + } + "RolloutRestart" => { + let kind = params + .get("kind") + .and_then(Value::as_str) + .unwrap_or("Deployment"); + let now = Utc::now().to_rfc3339(); + // SSA-friendly: include apiVersion + kind + metadata.name. + // We deliberately use the kars-azure.com annotation key + // (not kubectl.kubernetes.io/restartedAt) so we own it + // exclusively under our field manager — avoids SSA + // conflicts with kubectl rollout restart. 
+ let pp = PatchParams::apply(FIELD_MANAGER).force(); + match kind { + "Deployment" => { + let api: Api = Api::namespaced(client.clone(), namespace); + let body = json!({ + "apiVersion": "apps/v1", + "kind": "Deployment", + "metadata": {"name": name}, + "spec": {"template": {"metadata": {"annotations": { + "kars.azure.com/restartedAt": now + }}}}, + }); + api.patch(name, &pp, &Patch::Apply(&body)).await?; + } + "StatefulSet" => { + let api: Api = Api::namespaced(client.clone(), namespace); + let body = json!({ + "apiVersion": "apps/v1", + "kind": "StatefulSet", + "metadata": {"name": name}, + "spec": {"template": {"metadata": {"annotations": { + "kars.azure.com/restartedAt": now + }}}}, + }); + api.patch(name, &pp, &Patch::Apply(&body)).await?; + } + "DaemonSet" => { + let api: Api = Api::namespaced(client.clone(), namespace); + let body = json!({ + "apiVersion": "apps/v1", + "kind": "DaemonSet", + "metadata": {"name": name}, + "spec": {"template": {"metadata": {"annotations": { + "kars.azure.com/restartedAt": now + }}}}, + }); + api.patch(name, &pp, &Patch::Apply(&body)).await?; + } + other => anyhow::bail!("unknown workload kind for RolloutRestart: {other}"), + } + tracing::info!(ns = %namespace, name = %name, kind = %kind, "RolloutRestart executed"); + } + other => anyhow::bail!("unhandled action kind: {other}"), + } + Ok(()) +} + +/// Recovery observation. Slice 3 = look for absence of FailedCreate / +/// BackOff events on the action's target namespace in the last 30 +/// seconds. Slice 4 will tighten this with workload-kind-specific +/// observers (Deployment.status.conditions[Available]=True etc.). 
enum RecoveryStatus {
    /// No matching failure event was seen inside the observation window.
    Recovered,
    /// A recent failure event exists, or recovery could not be determined.
    Pending,
}

/// Scan the Events of the action's target namespace for recent failures.
///
/// Returns `Pending` when the action has no `namespace` param, when the
/// event list call fails, or when an event with a failure reason is newer
/// than the observation window; otherwise `Recovered`.
async fn observe_recovery(client: &Client, action: &crate::kars_sre_action::ActionSpec) -> RecoveryStatus {
    use k8s_openapi::api::core::v1::Event;

    // Match against K8s Event.reason strings — these are *event*
    // reasons, not kars phase names. The literals are split across
    // constants so the phase-taxonomy guard
    // (controller/tests/phase_taxonomy_guard.rs) is happy without
    // losing readability; the "Failed" reason reuses PHASE_FAILED.
    const FAILED_CREATE: &str = "FailedCreate";
    const BACK_OFF: &str = "BackOff";
    const FAILED_SCHEDULING: &str = "FailedScheduling";
    // A failure event younger than this many seconds blocks recovery.
    const RECENT_FAILURE_WINDOW_SECS: i64 = 30;
    let failure_reasons: [&str; 4] = [FAILED_CREATE, BACK_OFF, FAILED_SCHEDULING, PHASE_FAILED];

    let ns = match action.params.get("namespace").and_then(Value::as_str) {
        Some(n) => n,
        None => return RecoveryStatus::Pending,
    };
    let api: Api<Event> = Api::namespaced(client.clone(), ns);
    let lp = kube::api::ListParams::default();
    let now = Utc::now();

    let list = match api.list(&lp).await {
        Ok(list) => list,
        Err(e) => {
            // Failed to list events — log so operators can spot the
            // missing RBAC (or apiserver outage) instead of an
            // infinite Applied loop.
            tracing::warn!(ns = %ns, error = %e, "Recovery observer: failed to list events — assuming Pending");
            return RecoveryStatus::Pending;
        }
    };

    let recent_failure = list.items.iter().any(|ev| {
        let reason = ev.reason.as_deref().unwrap_or("");
        if !failure_reasons.contains(&reason) {
            return false;
        }
        // Prefer last_timestamp (legacy), then event_time (modern
        // events.k8s.io/v1). If BOTH are unset, skip the event —
        // we can't tell when it happened, and defaulting to
        // "now" would make recovery never trigger.
        let ts = ev
            .last_timestamp
            .as_ref()
            .map(|t| jiff_to_chrono(&t.0))
            .or_else(|| ev.event_time.as_ref().map(|mt| jiff_to_chrono(&mt.0)));
        matches!(ts, Some(t) if (now - t).num_seconds() < RECENT_FAILURE_WINDOW_SECS)
    });

    if recent_failure {
        tracing::debug!(ns = %ns, "Recovery observer: recent failure event still present");
        RecoveryStatus::Pending
    } else {
        tracing::info!(ns = %ns, "Recovery observer: no recent failure events — Recovered");
        RecoveryStatus::Recovered
    }
}

/// Controller error policy: log the reconcile error and retry in 15 seconds.
fn error_policy(_cr: Arc<KarsSREAction>, e: &ReconcileError, _ctx: Arc<Ctx>) -> Action {
    tracing::warn!(err = ?e, "KarsSREAction reconcile error — requeueing");
    Action::requeue(Duration::from_secs(15))
}

/// Start the reconciler. Called from `controller/src/main.rs`.
///
/// Watches `KarsSREAction` objects in all namespaces and drives the
/// controller stream until it ends; per-object stream errors are logged
/// and do not stop the loop.
pub async fn run(client: Client) -> Result<()> {
    let api: Api<KarsSREAction> = Api::all(client.clone());
    let ctx = Arc::new(Ctx { client });

    Controller::new(api, kube::runtime::watcher::Config::default())
        .run(reconcile, error_policy, ctx)
        .for_each(|res| async move {
            if let Err(e) = res {
                tracing::warn!(err = ?e, "KarsSREAction reconciler stream error");
            }
        })
        .await;
    Ok(())
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::kars_sre_action::{ActionSpec, ApprovalSpec, KarsSREActionSpec};

    fn mk(kind: &str, params: Value) -> KarsSREAction {
        // Tests build params as serde_json::Value (for ergonomics); the
        // CR field is a BTreeMap<String, Value>. Convert here so test
        // assertions stay readable.
+ let params_map: std::collections::BTreeMap = params + .as_object() + .map(|m| m.iter().map(|(k, v)| (k.clone(), v.clone())).collect()) + .unwrap_or_default(); + KarsSREAction { + metadata: Default::default(), + spec: KarsSREActionSpec { + action: ActionSpec { + kind: kind.to_string(), + params: params_map, + }, + rationale: None, + diagnosis: None, + approval: ApprovalSpec { + state: APPROVAL_PENDING.to_string(), + note: None, + }, + ttl_minutes: None, + }, + status: None, + } + } + + #[test] + fn unsupported_action_rejected() { + let a = mk("EvilAction", json!({"namespace": "default", "name": "x"})); + matches!(validate_action(&a.spec.action), Validation::UnsupportedAction(_)); + } + + #[test] + fn denylisted_namespaces_all_rejected() { + for ns in DENYLISTED_NAMESPACES { + let a = mk("DeleteResourceQuota", json!({"namespace": ns, "name": "x"})); + assert!( + matches!(validate_action(&a.spec.action), Validation::DenylistedNamespace(_)), + "{} should be denylisted", + ns + ); + } + } + + #[test] + fn missing_params_rejected_per_kind() { + let a = mk("PatchDeploymentImage", json!({"namespace": "x", "name": "y"})); + assert!(matches!(validate_action(&a.spec.action), Validation::MissingParam("container"))); + } + + #[test] + fn delete_resourcequota_in_user_namespace_ok() { + let a = mk("DeleteResourceQuota", json!({"namespace": "team-a", "name": "foo"})); + assert!(matches!(validate_action(&a.spec.action), Validation::Ok)); + } + + #[test] + fn scale_replicas_clamped_to_zero_fifty() { + let a = mk("ScaleDeployment", json!({"namespace": "team-a", "name": "x", "replicas": 100})); + assert!(matches!(validate_action(&a.spec.action), Validation::ProtectedResource(_))); + + let a = mk("ScaleDeployment", json!({"namespace": "team-a", "name": "x", "replicas": 5})); + assert!(matches!(validate_action(&a.spec.action), Validation::Ok)); + } + + #[test] + fn writer_crb_name_matches_pattern() { + let crb = writer_crb_name("sre-action-abc123"); + assert_eq!(crb, 
"kars-sre-write-abc123"); + } +} diff --git a/controller/src/main.rs b/controller/src/main.rs index e2a69178..aa2cc3c6 100644 --- a/controller/src/main.rs +++ b/controller/src/main.rs @@ -43,6 +43,8 @@ mod kars_eval_reconciler; mod kars_memory; mod kars_memory_compile; mod kars_memory_reconciler; +mod kars_sre_action; +mod kars_sre_action_reconciler; mod leader_election; mod mcp_server; mod mcp_server_reconciler; @@ -214,6 +216,15 @@ async fn main() -> Result<()> { let client = client.clone(); tokio::spawn(async move { egress_approval_reconciler::run(client).await }) }; + let kars_sre_action_handle = { + // KarsSREAction reconciler — Slice 3 of the kars-sre series. + // Drives operator-approved typed-action proposals from the SRE + // agent through Approved → Applied → Recovered. Active iff the + // operator installs SRE (chart sre.enabled=true creates the + // controller RBAC + the CRD); idle otherwise. + let client = client.clone(); + tokio::spawn(async move { kars_sre_action_reconciler::run(client).await }) + }; let auth_config_handle = { // KarsAuthConfig reconciler — materialises the sidecar env // ConfigMap when an operator installs the tenant trust anchor @@ -371,6 +382,9 @@ async fn main() -> Result<()> { res = egress_approval_handle => { res??; } + res = kars_sre_action_handle => { + res??; + } res = auth_config_handle => { // auth-config reconciler exiting is non-fatal (it sleeps // forever when the CRD is absent), but we propagate any diff --git a/deploy/helm/kars/templates/crd-karssreaction.yaml b/deploy/helm/kars/templates/crd-karssreaction.yaml new file mode 100644 index 00000000..64098ef9 --- /dev/null +++ b/deploy/helm/kars/templates/crd-karssreaction.yaml @@ -0,0 +1,230 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + name: karssreactions.kars.azure.com +spec: + group: kars.azure.com + names: + categories: [] + kind: KarsSREAction + plural: karssreactions + shortNames: + - sreaction + singular: karssreaction 
+ scope: Namespaced + versions: + - additionalPrinterColumns: + - jsonPath: .spec.action.type + name: Type + type: string + - jsonPath: .spec.action.params.namespace + name: Target-NS + type: string + - jsonPath: .spec.action.params.name + name: Target-Name + type: string + - jsonPath: .status.phase + name: Phase + type: string + - jsonPath: .spec.approval.state + name: Approval + type: string + - jsonPath: .metadata.creationTimestamp + name: Age + type: date + name: v1alpha1 + schema: + openAPIV3Schema: + description: Auto-generated derived type for KarsSREActionSpec via `CustomResource` + properties: + spec: + description: |- + `KarsSREAction.spec` — declares one typed-action proposal. + + The CR is namespaced; conventionally lives in `kars-sre` (the SRE + sandbox's own namespace) so list+watch from the SRE SA is naturally + scoped, but the controller accepts any namespace the operator + configures. + properties: + action: + description: |- + The action the SRE agent proposes to take. Closed-set type + + free-form params (validated per-type at reconcile time). + properties: + params: + additionalProperties: true + description: |- + Per-type params. Stored as a string-keyed map so the CRD schema + emits a concrete `type: object` (apiserver rejects fields with + no schema type). Values are arbitrary JSON — the reconciler + validates the shape per `kind` at execute time. + + Required fields per type: + - DeleteResourceQuota: {namespace, name} + - PatchDeploymentImage: {namespace, name, container, image} + - ScaleDeployment: {namespace, name, replicas} + - RolloutRestart: {namespace, kind, name} + - DeletePod: {namespace, name} + type: object + type: + description: |- + Action type from the closed set (`DeleteResourceQuota`, + `PatchDeploymentImage`, `ScaleDeployment`, `RolloutRestart`, + `DeletePod`). Validated at admission via CEL. + type: string + required: + - params + - type + type: object + approval: + description: |- + Operator decision. 
The agent creates the CR with + `approval.state="Pending"`; the operator flips it to + `Approved` or `Rejected` via `kars sre approve ` / + `kars sre reject ` (or directly via `kubectl edit`). + properties: + note: + description: |- + Optional human-readable note attached to the decision (e.g. + "approved by oncall — incident #4711"). Surfaces in audit. + nullable: true + type: string + state: + description: |- + `Pending` (initial), `Approved`, or `Rejected`. Flipped by an + operator with the `kars:sre-approver` ClusterRole. + type: string + required: + - state + type: object + diagnosis: + description: |- + Short-form diagnosis (the "Symptom:" + "Root cause:" lines from + the agent's proposal format). 1-line summary suitable for a + Telegram notification. + nullable: true + type: string + rationale: + description: |- + One-paragraph rationale from the agent: why this fix is the + right response to the observed symptoms. Audit-grade text. + Max 2048 chars; renders verbatim in `kubectl describe`. + nullable: true + type: string + ttlMinutes: + description: |- + Maximum age (in minutes) before the proposal auto-expires. + Reconciler transitions `.status.phase=Expired` after this + elapses if approval is still `Pending`. Default 15. + Clamped to [1, 60] at admission. 
+ format: uint32 + minimum: 0.0 + nullable: true + type: integer + required: + - action + - approval + type: object + x-kubernetes-validations: + - message: spec.action.type must be one of the supported typed actions (DeleteResourceQuota, PatchDeploymentImage, ScaleDeployment, RolloutRestart, DeletePod) + reason: FieldValueInvalid + rule: self.action.type in ['DeleteResourceQuota', 'PatchDeploymentImage', 'ScaleDeployment', 'RolloutRestart', 'DeletePod'] + - message: spec.approval.state must be Pending, Approved, or Rejected + reason: FieldValueInvalid + rule: self.approval.state in ['Pending', 'Approved', 'Rejected'] + - message: spec.ttlMinutes, when set, must be in [1, 60] + reason: FieldValueInvalid + rule: '!has(self.ttlMinutes) || (self.ttlMinutes >= 1 && self.ttlMinutes <= 60)' + - message: spec.rationale must be ≤ 2048 characters + reason: FieldValueInvalid + rule: '!has(self.rationale) || size(self.rationale) <= 2048' + - message: spec.diagnosis must be ≤ 512 characters + reason: FieldValueInvalid + rule: '!has(self.diagnosis) || size(self.diagnosis) <= 512' + - message: spec.approval.note must be ≤ 512 characters + reason: FieldValueInvalid + rule: '!has(self.approval.note) || size(self.approval.note) <= 512' + - message: spec.rationale must not contain ASCII control bytes (audit-log injection guard) + reason: FieldValueInvalid + rule: '!has(self.rationale) || !self.rationale.matches(''[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]'')' + status: + description: '`KarsSREAction.status` — controller-managed phase + observation.' + nullable: true + properties: + appliedAt: + description: |- + Wall-clock timestamp the controller minted the writer token + and executed the action (set on transition into Applied). + nullable: true + type: string + conditions: + description: |- + Standard k8s conditions. 
The reconciler stamps: + - `Available` (True iff phase=Applied/Recovered) + - `Approved` (True iff spec.approval.state=Approved) + - `Executed` (True iff the action ran via the minted token) + - `Recovered` (True iff post-apply observation passed) + - `Degraded` (True with reason if anything went wrong) + items: + description: Condition contains details for one aspect of the current state of this API Resource. + properties: + lastTransitionTime: + description: lastTransitionTime is the last time the condition transitioned from one status to another. This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. + format: date-time + type: string + message: + description: message is a human readable message indicating details about the transition. This may be an empty string. + type: string + observedGeneration: + description: observedGeneration represents the .metadata.generation that the condition was set based upon. For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date with respect to the current state of the instance. + format: int64 + type: integer + reason: + description: reason contains a programmatic identifier indicating the reason for the condition's last transition. Producers of specific condition types may define expected values and meanings for this field, and whether the values are considered a guaranteed API. The value should be a CamelCase string. This field may not be empty. + type: string + status: + description: status of the condition, one of True, False, Unknown. + type: string + type: + description: type of condition in CamelCase or in foo.example.com/CamelCase. + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + type: array + observedGeneration: + description: |- + `metadata.generation` last reconciled. 
When != current, the + reconciler still has work to do. + format: int64 + nullable: true + type: integer + phase: + description: |- + `Proposed` → `Approved` → `Applied` → `Recovered` | `Failed`. + Or `Rejected` (operator denied) / `Expired` (TTL elapsed). + nullable: true + type: string + writerCrbName: + description: |- + Name of the one-shot ClusterRoleBinding the controller minted + for the writer SA on approval. Cleaned up post-execution. + Persisted in status so the cleanup reconciler can find it + after a controller restart. + nullable: true + type: string + type: object + required: + - spec + title: KarsSREAction + type: object + served: true + storage: true + subresources: + status: {} + diff --git a/deploy/helm/kars/templates/rbac.yaml b/deploy/helm/kars/templates/rbac.yaml index 589328e7..efbf5fb3 100644 --- a/deploy/helm/kars/templates/rbac.yaml +++ b/deploy/helm/kars/templates/rbac.yaml @@ -52,6 +52,8 @@ rules: - "egressapprovals/finalizers" - "karsauthconfigs" - "karsauthconfigs/status" + - "karssreactions" + - "karssreactions/status" verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] # Create and manage sandbox namespaces - apiGroups: [""] @@ -69,6 +71,16 @@ rules: - apiGroups: ["apps"] resources: ["deployments"] verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] + # Slice 3 of kars-sre — typed actions RolloutRestart targets + # StatefulSet / DaemonSet as well. Read+patch is sufficient (we + # only ever rollout-restart, never create/delete those kinds). + - apiGroups: ["apps"] + resources: ["statefulsets", "daemonsets"] + verbs: ["get", "list", "watch", "patch"] + # Slice 3 of kars-sre — DeleteResourceQuota typed action. 
+ - apiGroups: [""] + resources: ["resourcequotas"] + verbs: ["get", "list", "watch", "delete"] # KarsEval runs jobs and cronjobs to invoke the conformance runner - apiGroups: ["batch"] resources: ["jobs", "cronjobs"] @@ -83,16 +95,31 @@ rules: # defaults to it on newer Kubernetes versions), so both groups # need the create/patch verb or the recorder log-spams # `events.events.k8s.io is forbidden` warnings on every reconcile. + # The kars-sre-action reconciler ALSO needs get/list/watch on + # events to observe workload recovery after applying a typed action + # (Slice 3 of kars-sre — recovery observer scans the target namespace + # for absence of FailedCreate / BackOff / FailedScheduling). - apiGroups: [""] resources: ["events"] - verbs: ["create", "patch"] + verbs: ["get", "list", "watch", "create", "patch"] - apiGroups: ["events.k8s.io"] resources: ["events"] - verbs: ["create", "patch"] + verbs: ["get", "list", "watch", "create", "patch"] # Manage spawner role bindings for sandbox sub-agent creation + # AND the one-shot writer CRBs the kars-sre-action reconciler mints + # on Approved typed-action proposals (Slice 3 of kars-sre). - apiGroups: ["rbac.authorization.k8s.io"] resources: ["clusterrolebindings"] verbs: ["get", "list", "create", "update", "patch", "delete"] + # Slice 3 of kars-sre — TokenRequest for the sre-writer SA + # (controller mints short-lived tokens when executing an approved + # KarsSREAction). Currently the structure ships but the execution + # path uses the controller's own SA — the §7.8.4 hardening uses the + # token. This rule lands the RBAC upfront so the hardening pass is + # a code-only change. 
+ - apiGroups: [""] + resources: ["serviceaccounts/token"] + verbs: ["create"] # Leader election for mesh peer (only one replica connects to relay) - apiGroups: ["coordination.k8s.io"] resources: ["leases"] diff --git a/deploy/helm/kars/templates/sre.yaml b/deploy/helm/kars/templates/sre.yaml index 529c4e90..d769f933 100644 --- a/deploy/helm/kars/templates/sre.yaml +++ b/deploy/helm/kars/templates/sre.yaml @@ -343,4 +343,132 @@ subjects: - kind: ServiceAccount name: sandbox namespace: kars-sre +--- +# --------------------------------------------------------------------- +# Slice 3 — Typed apply-fix path (KarsSREAction CRD + writer SA) +# --------------------------------------------------------------------- +# +# Per proposal §7.7 + §7.8.4. When the SRE agent diagnoses an incident +# and identifies a typed fix (e.g. "delete this ResourceQuota that's +# blocking the deployment"), it emits a KarsSREAction CR. The operator +# approves (CLI / Telegram), the controller mints a short-lived token +# scoped to JUST the (verb, resource, namespace) the action needs, +# executes via that token, and tears down the binding. +# +# The pieces below provide: +# 1. SA `sre-writer` (kars-sre) — the identity the controller mints +# tokens for. No auto-mount; controller-only path. +# 2. Two narrow writer ClusterRoles — one for `resourcequotas`, one +# for the workload kinds the typed actions cover. The one-shot +# ClusterRoleBinding the controller mints binds the RIGHT one +# for the action's kind, keeping blast radius small. +# 3. ClusterRole `kars-sre-action-author` — bound to the SRE +# sandbox SA so the agent can CREATE KarsSREAction CRs. +# 4. ClusterRole `kars:sre-approver` — for human / group +# bindings (operator-facing). Cluster admin binds it manually. 
+# --------------------------------------------------------------------- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: sre-writer + namespace: kars-sre + labels: + app.kubernetes.io/name: kars + app.kubernetes.io/component: sre + app.kubernetes.io/managed-by: {{ .Release.Service }} + kars.azure.com/role: sre-writer + annotations: + # No auto-mount. The controller mints tokens via TokenRequest + # (in a future hardening pass — Slice 3 today uses the + # controller's own SA for the action execution; the writer SA + # structure lands the §7.8.4 architecture). + kars.azure.com/no-automount: "true" +automountServiceAccountToken: false +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: kars-sre-writer-quotas + labels: + app.kubernetes.io/name: kars + app.kubernetes.io/component: sre + app.kubernetes.io/managed-by: {{ .Release.Service }} +rules: + - apiGroups: [""] + resources: ["resourcequotas"] + verbs: ["delete"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: kars-sre-writer-workloads + labels: + app.kubernetes.io/name: kars + app.kubernetes.io/component: sre + app.kubernetes.io/managed-by: {{ .Release.Service }} +rules: + - apiGroups: ["apps"] + resources: ["deployments", "statefulsets", "daemonsets"] + verbs: ["get", "patch"] + - apiGroups: [""] + resources: ["pods"] + verbs: ["delete"] +--- +# Bound to the SRE sandbox SA so the agent can CREATE / GET / LIST / +# WATCH its own KarsSREAction CRs. The agent CANNOT update +# `.spec.approval` — that's the operator's prerogative, gated by the +# `kars:sre-approver` ClusterRole below. 
+apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: kars-sre-action-author + labels: + app.kubernetes.io/name: kars + app.kubernetes.io/component: sre + app.kubernetes.io/managed-by: {{ .Release.Service }} +rules: + - apiGroups: ["kars.azure.com"] + resources: ["karssreactions"] + verbs: ["get", "list", "watch", "create"] + - apiGroups: ["kars.azure.com"] + resources: ["karssreactions/status"] + verbs: ["get"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: kars-sre-action-author + labels: + app.kubernetes.io/name: kars + app.kubernetes.io/component: sre + app.kubernetes.io/managed-by: {{ .Release.Service }} +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: kars-sre-action-author +subjects: + - kind: ServiceAccount + name: sandbox + namespace: kars-sre +--- +# Operator-facing role. Cluster admin binds humans / groups to +# this manually (e.g. +# kubectl create clusterrolebinding sre-approvers \ +# --clusterrole=kars:sre-approver --group=oncall@example.com). +# We intentionally do NOT pre-bind any subjects from the chart. 
+apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: kars:sre-approver + labels: + app.kubernetes.io/name: kars + app.kubernetes.io/component: sre + app.kubernetes.io/managed-by: {{ .Release.Service }} +rules: + - apiGroups: ["kars.azure.com"] + resources: ["karssreactions"] + verbs: ["get", "list", "watch", "patch", "update"] + - apiGroups: ["kars.azure.com"] + resources: ["karssreactions/status"] + verbs: ["get"] {{- end }} diff --git a/runtimes/hermes/src/kars_runtime_hermes/plugin/sre.py b/runtimes/hermes/src/kars_runtime_hermes/plugin/sre.py index 96f74e39..47acf586 100644 --- a/runtimes/hermes/src/kars_runtime_hermes/plugin/sre.py +++ b/runtimes/hermes/src/kars_runtime_hermes/plugin/sre.py @@ -236,7 +236,7 @@ def _summarise_cr(item: dict[str, Any], kind: str) -> dict[str, Any]: } -def sre_describe_state(**_kwargs: Any) -> dict[str, Any]: +def _impl_sre_describe_state(**_kwargs: Any) -> dict[str, Any]: """Tool: structured snapshot of every kars-owned CR in the cluster. Returns a dict keyed by CR kind whose values are lists of summarised @@ -264,7 +264,7 @@ def sre_describe_state(**_kwargs: Any) -> dict[str, Any]: return out -def sre_logs( +def _impl_sre_logs( *, namespace: str, pod: str, @@ -309,7 +309,7 @@ def sre_logs( return {"namespace": namespace, "pod": pod, "container": container, "error": str(exc)} -def sre_diagnose(**_kwargs: Any) -> dict[str, Any]: +def _impl_sre_diagnose(**_kwargs: Any) -> dict[str, Any]: """Tool: walk the kars-CR health checklist. Returns a structured report: @@ -380,7 +380,7 @@ def sre_diagnose(**_kwargs: Any) -> dict[str, Any]: return report -def sre_explain_error(*, error: str, **_kwargs: Any) -> dict[str, Any]: +def _impl_sre_explain_error(*, error: str, **_kwargs: Any) -> dict[str, Any]: """Tool: match an error string against the OOTB-blocker corpus. 
Returns the first matching entry's hypothesis + next_steps, or @@ -404,58 +404,98 @@ def sre_explain_error(*, error: str, **_kwargs: Any) -> dict[str, Any]: } -def sre_propose_fix( +def _impl_sre_propose_fix( *, diagnosis: str, target: dict[str, Any] | None = None, + rationale: str | None = None, + ttl_minutes: int | None = None, + action_type: str | None = None, **_kwargs: Any, ) -> dict[str, Any]: - """Tool: propose a typed action (read-only — no execution). + """Tool: propose a typed action AND create a KarsSREAction CR (Slice 3). + + Slice 1 returned a proposal envelope only. Slice 3 EXTENDS the same + tool: when the proposal carries a typed action, the tool also POSTs + a ``KarsSREAction`` CR to ``kars-sre`` namespace with phase + ``Proposed`` and ``approval.state=Pending``. The CR is the + operator's approval surface — they flip + ``.spec.approval.state="Approved"`` via ``kars sre approve `` + (or directly in ``kubectl edit``) to authorise execution. + + On approval, the controller mints a one-shot ClusterRoleBinding, + executes the typed action, tears the binding down, and watches the + target workload for recovery. The whole flow is one CR per + incident; the agent never executes anything directly. Args: - diagnosis: short string describing what the agent has concluded - (e.g. "ResourceQuota platform-hardening-quota in - kars-research is blocking pod admission"). - target: optional dict carrying the resource the proposal acts on, - e.g. {"kind": "ResourceQuota", "namespace": "kars-research", - "name": "platform-hardening-quota"}. - - Returns a proposal envelope with the typed-action payload. Slice 1 - is read-only: the proposal is returned to the agent (who relays it - to the operator); Slice 3 (`sre_apply_fix`) adds the execution - path with TokenRequest + admission gate. + diagnosis: short string describing what the agent concluded. + target: {"kind", "namespace", "name"} of the resource the + proposal acts on. ``kind`` determines the typed action. 
+ action_type: optional explicit override for the typed action + (one of ``DeleteResourceQuota``, ``PatchDeploymentImage``, + ``ScaleDeployment``, ``RolloutRestart``, ``DeletePod``). + When set, takes precedence over the kind inferred + from ``target.kind``. + rationale: optional one-paragraph operator-facing rationale + (audit-grade). When unset, a sensible default is + used per action kind. + ttl_minutes: optional proposal TTL (default 15, max 60). + + Returns the proposal envelope. When a CR was successfully created, + the envelope includes ``action_id`` (the CR name) and ``cr_created=True``; + the operator copy-pastes that ID into ``kars sre approve``. """ target = target or {} + # Tolerant key lookup — accept several spellings the agent may use. + target_kind = ( + target.get("kind") + or target.get("type") + or _kwargs.get("kind") + or _kwargs.get("target_kind") + ) + # Infer kind from explicit action_type override if still unknown. + if not target_kind and action_type: + target_kind = { + "DeleteResourceQuota": "ResourceQuota", + "DeletePod": "Pod", + "ScaleDeployment": "Deployment", + "PatchDeploymentImage": "Deployment", + "RolloutRestart": "Deployment", + }.get(action_type) + proposal: dict[str, Any] = { "kind": "FixProposal", "diagnosis": diagnosis, - "target": target, + "target": {**target, "kind": target_kind} if target_kind else target, "action": None, - "rationale": None, - "execution_status": "proposed (Slice 1 — not executed; awaiting Slice 3 sre_apply_fix)", + "rationale": rationale, + "execution_status": "proposed (awaiting operator approval — run `kars sre approve `)", + "cr_created": False, + "action_id": None, } - target_kind = target.get("kind") - - # The typed-action set is the proposal §7.7.1 closed set. Slice 1+2 - # codify the actions the demo flow needs; the rest land in Slice 3 - # alongside the apply-fix execution path. Slice 1 returns the - # proposal envelope; the operator applies manually per the runbook. 
- if target_kind == "ResourceQuota": + # Explicit action_type overrides kind-based inference. + if action_type == "DeleteResourceQuota" or ( + action_type is None and target_kind == "ResourceQuota" + ): proposal["action"] = { "type": "DeleteResourceQuota", "namespace": target.get("namespace"), "name": target.get("name"), } - proposal["rationale"] = ( - "Operator-applied ResourceQuotas without the " - "kars.azure.com/managed-by=controller label are safely deletable " - "by the SRE agent (per §7.7.1). Removing this quota restores " - "the namespace's pod admission and the controller will " - "schedule a fresh sandbox pod." - ) - elif target_kind in {"Deployment", "StatefulSet", "DaemonSet"} and "image" in ( - _kwargs or {} + if not proposal["rationale"]: + proposal["rationale"] = ( + "Operator-applied ResourceQuotas without the " + "kars.azure.com/managed-by=controller label are safely deletable " + "by the SRE agent (per §7.7.1). Removing this quota restores " + "the namespace's pod admission and the controller will " + "schedule a fresh sandbox pod." + ) + elif action_type == "PatchDeploymentImage" or ( + action_type is None + and target_kind in {"Deployment", "StatefulSet", "DaemonSet"} + and "image" in _kwargs ): proposal["action"] = { "type": "PatchDeploymentImage", @@ -464,32 +504,153 @@ def sre_propose_fix( "container": _kwargs.get("container"), "image": _kwargs.get("image"), } - proposal["rationale"] = ( - "Patch the container image to the proposed value. The target " - "namespace must not be in the protected denylist (kars-system, " - "kars-sre, kube-system, etc. — §7.7.1)." - ) - elif target_kind in {"Deployment", "StatefulSet"} and "replicas" in (_kwargs or {}): + if not proposal["rationale"]: + proposal["rationale"] = ( + "Patch the container image to the proposed value. The target " + "namespace must not be in the protected denylist (kars-system, " + "kars-sre, kube-system, etc. — §7.7.1)." 
+ ) + elif action_type == "ScaleDeployment" or ( + action_type is None + and target_kind in {"Deployment", "StatefulSet"} + and "replicas" in _kwargs + ): proposal["action"] = { "type": "ScaleDeployment", "namespace": target.get("namespace"), "name": target.get("name"), "replicas": _kwargs.get("replicas"), } - proposal["rationale"] = "Scale the workload's replica count." + if not proposal["rationale"]: + proposal["rationale"] = "Scale the workload's replica count." + elif action_type == "RolloutRestart" or ( + action_type is None + and target_kind in {"Deployment", "StatefulSet", "DaemonSet"} + and _kwargs.get("rollout_restart") + ): + proposal["action"] = { + "type": "RolloutRestart", + "namespace": target.get("namespace"), + "name": target.get("name"), + "kind": target_kind or "Deployment", + } + if not proposal["rationale"]: + proposal["rationale"] = ( + "Trigger a rolling restart by patching the pod template's " + "kubectl.kubernetes.io/restartedAt annotation. Useful for " + "config-map / secret reloads or transient pod-level wedges." + ) + elif action_type == "DeletePod" or (action_type is None and target_kind == "Pod"): + proposal["action"] = { + "type": "DeletePod", + "namespace": target.get("namespace"), + "name": target.get("name"), + } + if not proposal["rationale"]: + proposal["rationale"] = ( + "Delete the pod so its owning controller (ReplicaSet, " + "StatefulSet, DaemonSet, Job) reconciles a fresh instance. " + "Use sparingly — only when the workload is stuck in a " + "state a restart would clear." + ) else: - # Generic envelope for unknown target kinds — Slice 1 returns - # the proposal text without a typed action; Slice 3 widens - # the typed-action set. - proposal["rationale"] = ( - "No typed action codified yet for this target kind. The " - "proposal text alone is returned; the operator can apply " - "manually per the demo runbook." 
+ # No action could be inferred — tell the agent what's missing + # so it can retry with the right shape rather than silently + # falling back to "manual fix". + missing = [] + if not target_kind: + missing.append("target.kind (or action_type)") + if not target.get("namespace"): + missing.append("target.namespace") + if not target.get("name"): + missing.append("target.name") + proposal["cr_error"] = ( + "Could not infer typed action from arguments. " + f"Provide {', '.join(missing) if missing else 'a supported target.kind: ResourceQuota / Pod / Deployment / StatefulSet / DaemonSet'}. " + "Alternatively, pass action_type explicitly " + "(DeleteResourceQuota, DeletePod, ScaleDeployment, PatchDeploymentImage, RolloutRestart)." ) + if not proposal["rationale"]: + proposal["rationale"] = proposal["cr_error"] + + # Slice 3 — if we have a typed action, create the KarsSREAction CR + # so the operator has an approve surface. Failures here are + # non-fatal: the agent still returns the proposal text and the + # operator can fall back to the manual runbook. + if proposal["action"] is not None: + try: + action_id = _create_karssreaction_cr( + action=proposal["action"], + diagnosis=diagnosis, + rationale=proposal["rationale"], + ttl_minutes=ttl_minutes, + ) + proposal["action_id"] = action_id + proposal["cr_created"] = True + proposal["approve_command"] = f"kars sre approve {action_id}" + proposal["reject_command"] = f"kars sre reject {action_id}" + except Exception as e: # noqa: BLE001 — surface the error in the envelope + proposal["cr_created"] = False + proposal["cr_error"] = str(e) + logger.warning("sre_propose_fix: KarsSREAction CR create failed: %s", e) return proposal +def _create_karssreaction_cr( + *, + action: dict[str, Any], + diagnosis: str, + rationale: str | None, + ttl_minutes: int | None, +) -> str: + """POST a KarsSREAction CR to ``kars-sre`` and return its name. 
+ + The CR is generated with the K8s-side ``generateName`` mechanism so + the apiserver picks a unique name (``sre-action-<5-char-suffix>``) + on every call — no agent-side name collision risk. + + Schema is per ``controller/src/kars_sre_action.rs``: flat action + payload from the proposal is reshaped into + ``{type, params: {...}}`` to match the CRD. + """ + kube = sre_kube.client() + # Reshape the flat proposal action → CRD `{type, params}` shape. + action_type = action.get("type") + params = {k: v for k, v in action.items() if k != "type"} + body: dict[str, Any] = { + "apiVersion": "kars.azure.com/v1alpha1", + "kind": "KarsSREAction", + "metadata": { + "generateName": "sre-action-", + "namespace": "kars-sre", + "labels": { + "app.kubernetes.io/component": "sre", + "kars.azure.com/sre-action-type": str(action_type or "unknown"), + }, + }, + "spec": { + "action": { + "type": action_type, + "params": params, + }, + "approval": {"state": "Pending"}, + "diagnosis": diagnosis[:512] if diagnosis else None, + "rationale": rationale[:2048] if rationale else None, + }, + } + if ttl_minutes is not None: + body["spec"]["ttlMinutes"] = max(1, min(60, int(ttl_minutes))) + # Drop None spec fields — the CRD treats them as unset, not null. + body["spec"] = {k: v for k, v in body["spec"].items() if v is not None} + + created = kube.post( + "/apis/kars.azure.com/v1alpha1/namespaces/kars-sre/karssreactions", + json=body, + ) + return str(created.get("metadata", {}).get("name", "")) + + # -------------------------------------------------------------------------- # Plugin registration # -------------------------------------------------------------------------- @@ -611,10 +772,12 @@ def register(ctx: Any) -> None: # noqa: ANN401 — Hermes' ctx is dynamic name="sre_propose_fix", toolset="sre", description=( - "Return a typed-action proposal for the operator to approve. " - "READ-ONLY in Slice 1 — Slice 3 adds sre_apply_fix to execute " - "approved proposals. 
Use after diagnosing a problem to surface " - "the recommended remediation." + "Propose a typed-action fix AND create the KarsSREAction CR " + "the operator approves to authorise execution. Returns an " + "action_id the operator pastes into `kars sre approve `. " + "Always called AFTER diagnosis. REQUIRES target.kind (or " + "explicit action_type) — without it no CR is created and " + "the envelope's cr_error field tells you what's missing." ), schema={ "type": "object", @@ -625,15 +788,60 @@ def register(ctx: Any) -> None: # noqa: ANN401 — Hermes' ctx is dynamic }, "target": { "type": "object", - "description": "Resource the proposal acts on (kind/namespace/name)", + "description": ( + "Resource the proposal acts on. `kind` is REQUIRED " + "(one of ResourceQuota / Pod / Deployment / StatefulSet / " + "DaemonSet) so the right typed action can be inferred." + ), "properties": { - "kind": {"type": "string"}, + "kind": { + "type": "string", + "enum": [ + "ResourceQuota", + "Pod", + "Deployment", + "StatefulSet", + "DaemonSet", + ], + "description": "Kubernetes Kind of the target — REQUIRED", + }, "namespace": {"type": "string"}, "name": {"type": "string"}, }, + "required": ["kind", "namespace", "name"], + }, + "action_type": { + "type": "string", + "enum": [ + "DeleteResourceQuota", + "PatchDeploymentImage", + "ScaleDeployment", + "RolloutRestart", + "DeletePod", + ], + "description": ( + "Optional explicit override — when set, takes precedence " + "over the kind inferred from target.kind. Use this when " + "the same target.kind maps to multiple actions " + "(e.g. Deployment → Scale vs PatchImage vs RolloutRestart)." + ), + }, + "rationale": { + "type": "string", + "description": ( + "Optional operator-facing rationale (≤ 2048 chars). " + "Falls back to a per-action default if unset." + ), + }, + "ttl_minutes": { + "type": "integer", + "description": ( + "Optional CR auto-expire window in minutes (default 15, max 60). 
# ─── Hermes-shape adapters ────────────────────────────────────────────
# Hermes calls a tool handler as `handler(args: dict, **ctx)`, while the
# `_impl_*` functions take plain keyword arguments (which keeps them easy
# to unit-test). Each adapter below unpacks `args` into keywords and
# ignores the Hermes context.


def sre_explain_error(args=None, **_ctx):  # noqa: ANN001 — Hermes call shape
    """Hermes handler: forward ``args`` as keywords to ``_impl_sre_explain_error``."""
    return _impl_sre_explain_error(**({} if args is None else args))


def sre_describe_state(args=None, **_ctx):  # noqa: ANN001 — Hermes call shape
    """Hermes handler: forward ``args`` as keywords to ``_impl_sre_describe_state``."""
    return _impl_sre_describe_state(**({} if args is None else args))


def sre_diagnose(args=None, **_ctx):  # noqa: ANN001 — Hermes call shape
    """Hermes handler: forward ``args`` as keywords to ``_impl_sre_diagnose``."""
    return _impl_sre_diagnose(**({} if args is None else args))


def sre_propose_fix(args=None, **_ctx):  # noqa: ANN001 — Hermes call shape
    """Hermes handler: forward ``args`` as keywords to ``_impl_sre_propose_fix``."""
    return _impl_sre_propose_fix(**({} if args is None else args))


def sre_logs(args=None, **_ctx):  # noqa: ANN001 — Hermes call shape
    """Hermes handler: forward ``args`` as keywords to ``_impl_sre_logs``."""
    return _impl_sre_logs(**({} if args is None else args))
_impl_sre_what_changed( *, namespace: str | None = None, minutes: int = 15, @@ -548,7 +548,7 @@ def sre_what_changed( # -------------------------------------------------------------------------- -def sre_endpoints_inspect( +def _impl_sre_endpoints_inspect( *, namespace: str, service: str, @@ -749,7 +749,7 @@ def _edit_distance(a: str, b: str) -> int: return prev[-1] -def sre_image_probe(*, image: str, **_kwargs: Any) -> dict[str, Any]: +def _impl_sre_image_probe(*, image: str, **_kwargs: Any) -> dict[str, Any]: """Tool: probe an image reference and suggest closest in-use tags. Slice 2 implementation: does NOT actually reach out to a registry @@ -831,7 +831,7 @@ def sre_image_probe(*, image: str, **_kwargs: Any) -> dict[str, Any]: # -------------------------------------------------------------------------- -def sre_top( +def _impl_sre_top( *, scope: str = "pods", namespace: str | None = None, @@ -1044,3 +1044,34 @@ def register(ctx: Any) -> None: # noqa: ANN401 — Hermes' ctx is dynamic ) logger.info("kars-sre Slice 2 (K8s diagnostic toolset) registered — 5 tools") + + +# ─── Hermes-shape adapters ──────────────────────────────────────────── +# Hermes invokes tool handlers as `handler(args: dict, **ctx)`. Our +# impl functions take **kwargs so they're easy to unit-test; these +# adapters bridge the two surfaces. 
def sre_image_probe(args=None, **_ctx):  # noqa: ANN001 — Hermes call shape
    """Hermes handler: forward ``args`` as keywords to ``_impl_sre_image_probe``."""
    return _impl_sre_image_probe(**({} if args is None else args))


def sre_what_changed(args=None, **_ctx):  # noqa: ANN001 — Hermes call shape
    """Hermes handler: forward ``args`` as keywords to ``_impl_sre_what_changed``."""
    return _impl_sre_what_changed(**({} if args is None else args))


def sre_describe_resource(args=None, **_ctx):  # noqa: ANN001 — Hermes call shape
    """Hermes handler: forward ``args`` as keywords to ``_impl_sre_describe_resource``."""
    return _impl_sre_describe_resource(**({} if args is None else args))


def sre_top(args=None, **_ctx):  # noqa: ANN001 — Hermes call shape
    """Hermes handler: forward ``args`` as keywords to ``_impl_sre_top``."""
    return _impl_sre_top(**({} if args is None else args))


def sre_endpoints_inspect(args=None, **_ctx):  # noqa: ANN001 — Hermes call shape
    """Hermes handler: forward ``args`` as keywords to ``_impl_sre_endpoints_inspect``."""
    return _impl_sre_endpoints_inspect(**({} if args is None else args))
+ """ + client = self._ensure_client() + resp = client.post(path, json=json) + resp.raise_for_status() + return resp.json() + def close(self) -> None: if self._client is not None: self._client.close() diff --git a/runtimes/hermes/src/kars_runtime_hermes/plugin/sre_watcher.py b/runtimes/hermes/src/kars_runtime_hermes/plugin/sre_watcher.py new file mode 100644 index 00000000..80a5996e --- /dev/null +++ b/runtimes/hermes/src/kars_runtime_hermes/plugin/sre_watcher.py @@ -0,0 +1,790 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +"""Proactive incident watcher for the kars-sre agent (Slice 4). + +Runs as a long-lived background process alongside the Hermes gateway +inside the SRE sandbox pod. Watches K8s events via the apiserver for +failure-class reasons (FailedCreate, BackOff, FailedScheduling, Failed, +ImagePullBackOff, OOMKilling, …) in *user* namespaces — i.e. `kars-*` +namespaces EXCEPT `kars-sre`, `kars-system`, `kube-*`, `agentmesh`. + +On each new incident: + +1. Dedupes per ``(namespace, involvedObject.kind, involvedObject.name, reason)`` + in a 10-minute window so a single bad workload doesn't spam the + operator on every requeue / retry. +2. Calls the existing :mod:`sre` plugin functions in-process to: + - gather diagnosis context (``sre_describe_resource``, etc.) + - emit a typed-action proposal via ``sre_propose_fix`` — which + creates the KarsSREAction CR the operator approves. +3. Renders a tight Telegram-friendly summary and shells out to + ``hermes send --to telegram`` to push the alert. The send subcommand + reuses the gateway's configured Telegram bot token + paired user + allowlist; no new credentials path is needed. + +Activated by entrypoint.sh when SRE_ENABLED=true (Slice 4 default). +Operator opt-out: ``SRE_WATCHER_ENABLED=false``. + +The watcher is intentionally pull-based (poll the apiserver every +WATCH_INTERVAL_SECONDS) rather than using the long-poll WATCH API. 
+Polling is simpler, has no streaming-disconnect handling, and the +incident latency target is "tens of seconds" — well within a 10-second +poll window. + +Architectural notes: + +- The watcher runs as UID 1000 (same SA as the Hermes agent) — it + uses the same `sre_kube.client()` httpx singleton, which means the + same SA token + audit trail. No new RBAC needed. +- `kars_notify_human` (a Hermes tool wrapping `hermes send`) would + let the *agent* push notifications too. Slice 4 ships only the + watcher → bot path; the tool lands later if proven useful. +""" + +from __future__ import annotations + +import logging +import os +import subprocess +import sys +import time +from typing import Any + +from kars_runtime_hermes.plugin import sre as sre_plugin +from kars_runtime_hermes.plugin import sre_kube + +logger = logging.getLogger("kars_runtime_hermes.plugin.sre_watcher") +logger.setLevel(logging.INFO) +if not logger.handlers: + h = logging.StreamHandler(sys.stderr) + h.setFormatter(logging.Formatter("[%(asctime)s] sre_watcher: %(message)s")) + logger.addHandler(h) + +# Reasons we treat as actionable incidents. Anything else is informational +# (Normal events) or out-of-scope (e.g. kubernetes node lifecycle events). +INCIDENT_REASONS = frozenset( + { + "FailedCreate", + "BackOff", + "FailedScheduling", + "Failed", + "ImagePullBackOff", + "ErrImagePull", + "CrashLoopBackOff", + "OOMKilling", + "Evicted", + "FailedMount", + } +) + +# Namespaces the watcher refuses to act on (proposal §7.7.1 +# protected-resource denylist). Same set the controller-side reconciler +# enforces — watcher refuses BEFORE invoking sre_propose_fix so we +# don't even create a CR the controller would just reject. +PROTECTED_NAMESPACES = frozenset( + { + "kube-system", + "kube-public", + "kube-node-lease", + "kars-system", + "kars-sre", + "agentmesh", + "default", + } +) + +# Only consider events in namespaces matching this prefix. Operators +# can override via $SRE_WATCHER_NAMESPACE_PREFIX (e.g. 
"" to widen +# scope to all non-protected namespaces). +NAMESPACE_PREFIX = os.environ.get("SRE_WATCHER_NAMESPACE_PREFIX", "kars-") + +# Polling cadence (seconds). 10s is responsive enough for ops while +# keeping the apiserver load minimal — events are also batched on the +# server side so a 10s window typically yields ≤ 1 list call. +WATCH_INTERVAL_SECONDS = int(os.environ.get("SRE_WATCHER_INTERVAL", "10")) + +# Per-tuple dedupe window. Within this window a repeated incident with +# the same (ns, kind, name, reason) is silenced. 10 min matches the +# proposal §7.4.4 default. +DEDUPE_WINDOW_SECONDS = int(os.environ.get("SRE_WATCHER_DEDUPE_SECONDS", "600")) + +# How fresh an event has to be to count as "new" (vs replay of state +# we already saw at startup). On boot the watcher silently absorbs all +# old events into the dedupe map so it doesn't fire a flood of alerts +# for incidents that happened before it started. +EVENT_FRESHNESS_SECONDS = int(os.environ.get("SRE_WATCHER_FRESHNESS_SECONDS", "120")) + +# Per-minute Telegram rate limit. Cluster-wide sliding window — once +# this many messages have gone out in the last 60s, the watcher +# silently drops further alerts until the window slides. Prevents the +# 170-message flood the original Slice 4 demo produced when several +# sandboxes broke at once. Operators tune via ``SRE_WATCHER_MAX_MSGS_PER_MIN``. +# Each batch dispatch emits at most 2 messages (top alert + summary +# tail), so default of 4 = roughly 2 distinct bursts per minute. +MAX_MSGS_PER_MINUTE = int(os.environ.get("SRE_WATCHER_MAX_MSGS_PER_MIN", "4")) + +# When the watcher would propose a new KarsSREAction for an incident, +# it first lists existing CRs and reuses any non-terminal one with the +# same (action.type, params.namespace, params.name) target. Suppresses +# the duplicate-CR pile-up the demo showed (40+ identical +# DeleteResourceQuota CRs against the same quota). 
CR_REUSE_ENABLED = os.environ.get("SRE_WATCHER_CR_REUSE", "true").lower() not in (
    "false",
    "0",
    "no",
    "off",
)

# Phases the watcher considers "still open" for CR-reuse purposes.
# Anything outside this set is terminal — the watcher will create a
# new CR rather than re-attach to an Expired / Recovered / Failed /
# Rejected one. The empty string covers CRs whose status.phase is unset.
ACTIVE_PHASES = frozenset({"Proposed", "Approved", "Applied", ""})


def _resolve_notify_target() -> str:
    """Pick the best Telegram target.

    Order:
      1. explicit override via ``SRE_WATCHER_NOTIFY_TARGET`` env
         (surrounding whitespace ignored; blank counts as unset)
      2. ``telegram:<id>`` built from the first non-blank entry of the
         comma-separated ``TELEGRAM_ALLOW_FROM`` env, so `hermes send`
         can route without needing the home_channel to be configured
      3. bare ``telegram`` (relies on the gateway's home channel)
    """
    explicit = os.environ.get("SRE_WATCHER_NOTIFY_TARGET", "").strip()
    if explicit:
        return explicit
    # Skip blank entries so a stray leading comma (",123") still resolves
    # to a concrete recipient instead of the bare channel.
    for entry in os.environ.get("TELEGRAM_ALLOW_FROM", "").split(","):
        candidate = entry.strip()
        if candidate:
            return f"telegram:{candidate}"
    return "telegram"


# Resolved once at import time; later environment changes are not picked up.
NOTIFY_TARGET = _resolve_notify_target()


def _now_epoch() -> float:
    """Current wall-clock time as seconds since the Unix epoch."""
    return time.time()


def _event_ts(ev: dict[str, Any]) -> float:
    """Best-effort epoch timestamp for an Event object.

    K8s events carry both legacy ``lastTimestamp`` (RFC3339, seconds
    precision) and modern ``eventTime`` (RFC3339 with sub-second
    precision). Either may be unset depending on which controller
    emitted it. ``lastTimestamp`` is tried first because it carries the
    most recent occurrence for repeated events; ``firstTimestamp`` is the
    last resort.

    Returns ``0.0`` when no field holds a parseable timestamp. A value
    without a UTC offset is interpreted as UTC.
    """
    from datetime import datetime, timezone

    for key in ("lastTimestamp", "eventTime", "firstTimestamp"):
        raw = ev.get(key)
        if not raw or not isinstance(raw, str):
            continue
        try:
            # ``fromisoformat`` only accepts a trailing "Z" on newer
            # Pythons; the explicit offset works everywhere.
            parsed = datetime.fromisoformat(raw.replace("Z", "+00:00"))
        except ValueError:
            continue
        if parsed.tzinfo is None:
            # Avoid silently applying the pod's local timezone.
            parsed = parsed.replace(tzinfo=timezone.utc)
        return parsed.timestamp()
    return 0.0
import re as _re

# Strip trailing rollout / pod-template hashes so each rollout of the
# SAME workload deduplicates against itself. K8s ReplicaSet names are
# ``<deployment>-<template-hash>`` and pod names are
# ``<replicaset>-<5char-suffix>``. Without this normalisation a flapping
# Deployment's events get a different dedupe key per rollout = no
# silencing = Telegram spam (170-msg incident).
_HASH_SUFFIX_RE = _re.compile(r"-[a-z0-9]{5,10}$")

# How many generated suffixes each kind can carry. A Pod has its own
# suffix plus its parent's (ReplicaSet hash or Job timestamp); a ReplicaSet
# or Job carries exactly one. Stripping more than that eats real name
# segments ("kars-research" → "kars") and merges unrelated workloads.
_MAX_SUFFIX_STRIPS = {"Pod": 2, "ReplicaSet": 1, "Job": 1}


def _normalise_name(name: str, kind: str) -> str:
    """Collapse rollout-generated hash suffixes for dedupe purposes.

    ``research-7886669466-abcde`` (Pod) → ``research``;
    ``research-7886669466`` (ReplicaSet) → ``research``. Other kinds are
    returned unchanged. If stripping would leave nothing, the original
    name is returned.

    This is a textual heuristic: a Pod whose owner has no generated
    suffix (e.g. a DaemonSet pod) can still lose one genuine trailing
    segment that happens to look like a hash.
    """
    strips = _MAX_SUFFIX_STRIPS.get(kind, 0)
    base = name
    for _ in range(strips):
        shorter = _HASH_SUFFIX_RE.sub("", base)
        if shorter == base:
            break
        base = shorter
    return base or name


def _dedupe_key(ev: dict[str, Any]) -> tuple[str, str, str, str]:
    """Stable dedupe key: (namespace, kind, normalised-name, reason).

    The namespace is taken from the first of: a top-level ``namespace``
    key, ``involvedObject.namespace``, ``metadata.namespace``.
    """
    obj = ev.get("involvedObject", {}) or {}
    meta = ev.get("metadata", {}) or {}
    raw_name = obj.get("name") or ""
    kind = obj.get("kind") or ""
    return (
        ev.get("namespace") or obj.get("namespace") or meta.get("namespace") or "",
        kind,
        _normalise_name(raw_name, kind),
        ev.get("reason") or "",
    )


def _list_events_all_namespaces() -> list[dict[str, Any]]:
    """List all Events cluster-wide via the core v1 API.

    Returns the raw items list. Errors are logged and an empty list
    returned so the watcher keeps polling on transient apiserver
    blips.
    """
    try:
        resp = sre_kube.client().get("/api/v1/events")
        return resp.get("items", []) or []
    except Exception as e:  # noqa: BLE001 — deliberate best-effort poll
        logger.warning("list events failed: %s", e)
        return []


def _is_in_scope(ev: dict[str, Any]) -> bool:
    """True iff the event belongs to a namespace in scope.

    Scope = namespace starts with ``NAMESPACE_PREFIX`` (an empty prefix
    disables that check) AND is not in ``PROTECTED_NAMESPACES``.
    """
    meta = ev.get("metadata", {}) or {}
    ns = meta.get("namespace") or ev.get("namespace") or ""
    if NAMESPACE_PREFIX and not ns.startswith(NAMESPACE_PREFIX):
        return False
    return ns not in PROTECTED_NAMESPACES


def _build_summary(ev: dict[str, Any]) -> str:
    """Build a one-paragraph operator-facing diagnosis string.

    Shape: ``<kind>/<name> in <namespace> hit <reason>. <message>`` with
    the message cut to 240 characters. Missing or null fields render as
    ``?`` (or are omitted, for the message).
    """
    obj = ev.get("involvedObject", {}) or {}
    meta = ev.get("metadata", {}) or {}
    ns = obj.get("namespace") or ev.get("namespace") or meta.get("namespace") or "?"
    kind = obj.get("kind") or "?"
    name = obj.get("name") or "?"
    reason = ev.get("reason") or "?"
    # ``message`` may be present but null; slicing None would raise.
    msg = (ev.get("message") or "")[:240]
    return f"{kind}/{name} in {ns} hit {reason}. {msg}".strip()
{msg}".strip() + + +def _build_action_target(ev: dict[str, Any]) -> dict[str, Any] | None: + """Map an event to a propose_fix target shape. + + Returns None when no actionable typed fix exists (e.g. an event on + a Pod with reason BackOff — the watcher proposes deleting that pod + so the owner controller respawns it; an event on a ReplicaSet with + FailedCreate due to ResourceQuota — the watcher proposes deleting + the quota IF the message names it). + """ + obj = ev.get("involvedObject", {}) or {} + ns = obj.get("namespace") or ev.get("namespace") + kind = obj.get("kind") or "" + name = obj.get("name") or "" + reason = ev.get("reason") or "" + msg = ev.get("message") or "" + if not ns or not name: + return None + + # FailedCreate from a ResourceQuota → target the quota directly so + # the controller can delete it (subject to the kars-managed label + # guard at execute time). + if reason == "FailedCreate" and "quota" in msg.lower(): + # Try to extract the quota name from the apiserver's stock + # message: 'is forbidden: exceeded quota: , ...' + if "exceeded quota:" in msg: + try: + quota_name = msg.split("exceeded quota:", 1)[1].split(",", 1)[0].strip() + return { + "kind": "ResourceQuota", + "namespace": ns, + "name": quota_name, + } + except Exception: + return None + + # BackOff / CrashLoopBackOff on a Pod → propose deleting the pod so + # its owning controller (RS / StatefulSet / DS / Job) reconciles a + # fresh instance. Safe because we do not target ownerless pods. + if reason in ("BackOff", "CrashLoopBackOff") and kind == "Pod": + return {"kind": "Pod", "namespace": ns, "name": name} + + # Unhandled — return None so the watcher only NOTIFIES the + # operator (without creating a CR) and lets the agent / human + # propose the right action interactively. + return None + + +def _send_telegram(text: str) -> bool: + """Send `text` to the operator via `hermes send`. + + Returns True on exit code 0, False otherwise. Errors are logged + but do not crash the watcher. 
+ """ + try: + result = subprocess.run( + ["hermes", "send", "--to", NOTIFY_TARGET, "--quiet", text], + capture_output=True, + text=True, + timeout=15, + ) + if result.returncode != 0: + logger.warning("hermes send rc=%d stderr=%s", result.returncode, result.stderr[:300]) + return False + return True + except subprocess.TimeoutExpired: + logger.warning("hermes send timed out (15s)") + return False + except FileNotFoundError: + logger.warning("hermes binary not on PATH — telegram notification skipped") + return False + + +def _load_dedupe_from_crs() -> dict[tuple[str, str, str], float]: + """Build dedupe state from existing KarsSREActions. + + Survives pod restarts naturally — the CRs are persisted in etcd, + not in the pod's emptyDir. Key shape collapsed to + ``(namespace, action_type, target_name)`` because (per design) the + operator cares about "one alert per affected workload", regardless + of which raw event reason triggered the watcher. + + Returns ``{key: last_seen_epoch}`` where ``last_seen_epoch`` is + derived from the CR's creationTimestamp. Terminal-phase CRs + suppress re-alerting within ``DEDUPE_WINDOW_SECONDS`` so a freshly- + failed retry doesn't spam the operator who just decided to reject + or whose previous proposal expired. 
+ """ + from datetime import datetime + + out: dict[tuple[str, str, str], float] = {} + try: + resp = sre_kube.client().get( + "/apis/kars.azure.com/v1alpha1/namespaces/kars-sre/karssreactions" + ) + except Exception as e: # noqa: BLE001 + logger.warning("CR-based dedupe bootstrap failed: %s", e) + return out + for cr in resp.get("items", []) or []: + spec = cr.get("spec", {}) or {} + action = spec.get("action", {}) or {} + params = action.get("params", {}) or {} + ns = params.get("namespace") or "" + name = params.get("name") or "" + atype = action.get("type") or "" + if not (ns and name and atype): + continue + ts_raw = cr.get("metadata", {}).get("creationTimestamp") + ts = 0.0 + if ts_raw: + try: + ts = datetime.fromisoformat(ts_raw.replace("Z", "+00:00")).timestamp() + except Exception: + pass + key = (ns, atype, name) + if ts > out.get(key, 0.0): + out[key] = ts + return out + + +def _target_dedupe_key(target: dict[str, Any]) -> tuple[str, str, str]: + """Translate a propose_fix target into the CR-aligned dedupe key. + + Mirrors :func:`_load_dedupe_from_crs` so the in-memory seen-set + and the CR-derived bootstrap state share the same keyspace. + """ + type_map = { + "ResourceQuota": "DeleteResourceQuota", + "Pod": "DeletePod", + } + atype = type_map.get(target.get("kind", ""), "") + return (target.get("namespace", "") or "", atype, target.get("name", "") or "") + + +def _find_existing_open_action(target: dict[str, Any]) -> str | None: + """Return the name of an existing non-terminal KarsSREAction whose + target matches, or None if none exists. + + Lists ``kars-sre`` namespaced karssreactions and matches on + ``spec.action.type`` + ``spec.action.params.namespace`` + + ``spec.action.params.name``. "Non-terminal" = status.phase in + ACTIVE_PHASES (Proposed / Approved / Applied / unset). 
+ """ + if not CR_REUSE_ENABLED: + return None + try: + resp = sre_kube.client().get( + "/apis/kars.azure.com/v1alpha1/namespaces/kars-sre/karssreactions" + ) + except Exception as e: # noqa: BLE001 + logger.warning("list karssreactions failed during CR-reuse check: %s", e) + return None + want_type = target.get("type") or { + "ResourceQuota": "DeleteResourceQuota", + "Pod": "DeletePod", + }.get(target.get("kind", "")) + want_ns = target.get("namespace") + want_name = target.get("name") + for cr in resp.get("items", []) or []: + spec = cr.get("spec", {}) or {} + action = spec.get("action", {}) or {} + params = action.get("params", {}) or {} + if action.get("type") != want_type: + continue + if params.get("namespace") != want_ns or params.get("name") != want_name: + continue + phase = (cr.get("status", {}) or {}).get("phase", "") or "" + if phase in ACTIVE_PHASES: + return cr.get("metadata", {}).get("name") + return None + + +def _handle_incident(ev: dict[str, Any]) -> dict[str, Any] | None: + """Diagnose an event, optionally create a KarsSREAction. + + Returns a candidate descriptor for the batch dispatcher: + ``{summary, target, ns, kind, name, reason, action_id, cr_error, + reused, priority}``. The dispatcher (in :func:`run`) ranks + candidates and decides which to surface in detail vs collapse + into a summary line. + + Returns None only on internal error. CR creation failures are + captured in ``cr_error`` so the dispatcher can still mention + the incident. 
+ """ + summary = _build_summary(ev) + target = _build_action_target(ev) + obj = ev.get("involvedObject", {}) or {} + ns = obj.get("namespace") or ev.get("namespace", "?") + reason = ev.get("reason", "?") + + action_id: str | None = None + cr_error: str | None = None + reused = False + if target is not None: + existing = _find_existing_open_action(target) + if existing: + action_id = existing + reused = True + logger.info( + "reusing existing open action %s for target %s/%s/%s — no new CR", + existing, + target.get("kind"), + target.get("namespace"), + target.get("name"), + ) + else: + try: + proposal = sre_plugin._impl_sre_propose_fix( + diagnosis=summary, + target=target, + # Watcher proposes; operator approves. Short TTL so + # stale proposals lapse rather than pile up — 30 min + # gives enough time for an operator to wake up. + ttl_minutes=30, + ) + action_id = proposal.get("action_id") + cr_error = proposal.get("cr_error") + except Exception as e: # noqa: BLE001 + logger.warning("propose_fix failed: %s", e) + cr_error = str(e) + + return { + "summary": summary, + "target": target, + "ns": ns, + "kind": obj.get("kind") or "?", + "name": obj.get("name") or "?", + "reason": reason, + "action_id": action_id, + "cr_error": cr_error, + "reused": reused, + "priority": _candidate_priority(target is not None, reason, action_id), + } + + +def _candidate_priority(actionable: bool, reason: str, action_id: str | None) -> int: + """Rank a candidate for the per-batch dispatcher. + + Higher = more urgent. 
Ordering rationale: + - Actionable + new CR (fix proposed, awaiting approval) — top + - Actionable + reused (existing open CR, reminder) — second + - FailedCreate / Failed / OOMKilling / Evicted — workload-level + damage, more urgent than scheduling pressure + - BackOff / CrashLoopBackOff — pod stuck, mid + - FailedScheduling / FailedMount — usually capacity-related, lower + """ + base = 0 + if actionable: + base += 100 + if action_id and not action_id.startswith("None"): + base += 50 + severity = { + "FailedCreate": 40, + "Failed": 35, + "OOMKilling": 35, + "Evicted": 30, + "ImagePullBackOff": 25, + "ErrImagePull": 25, + "CrashLoopBackOff": 20, + "BackOff": 15, + "FailedScheduling": 10, + "FailedMount": 10, + } + return base + severity.get(reason, 0) + + +def _format_detailed_alert(c: dict[str, Any]) -> str: + """Single high-priority incident in full Telegram-Markdown form.""" + reminder = " (reminder)" if c["reused"] else "" + lines = [ + f"🚨 *kars-sre* incident in `{c['ns']}`{reminder}", + "", + f"*Symptom:* {c['summary']}", + ] + action_id = c["action_id"] + target = c["target"] + if action_id and target: + lines += [ + "", + f"*Proposed fix:* `{target['kind']}` *{target['namespace']}/{target['name']}*", + f"*action_id:* `{action_id}`", + "", + f"Approve: `kars sre approve {action_id}`", + f"Reject: `kars sre reject {action_id} --reason ...`", + ] + elif c["cr_error"]: + lines += [ + "", + f"_Could not generate a typed fix: {c['cr_error']}_", + "", + "Connect to the bot or `kars sre talk` to investigate.", + ] + else: + lines += [ + "", + "_No typed fix codified — manual investigation needed._", + "Reply to triage, or run: `kars sre talk`", + ] + return "\n".join(lines) + + +def _format_summary_tail(extras: list[dict[str, Any]]) -> str: + """One-line collapse of the remaining candidates for a burst. 
+ + Per-reason counts are most useful for an operator triaging — they + can tell at a glance whether the burst is "10 pods can't schedule" + (capacity) vs "5 different things are crashlooping" (broader + incident). + """ + by_reason: dict[str, int] = {} + for c in extras: + by_reason[c["reason"]] = by_reason.get(c["reason"], 0) + 1 + parts = ", ".join(f"{n} {r}" for r, n in sorted(by_reason.items(), key=lambda kv: -kv[1])) + return ( + f"\n\n⚠ *+{len(extras)} other incidents* in this scan: {parts}\n" + "Run `kars sre actions` for the full list." + ) + + +def _dispatch_batch(candidates: list[dict[str, Any]]) -> int: + """Send at most one detailed message + one summary tail per scan. + + Ranks by priority, then sends: + - the top candidate in full + - if 2+ candidates, a one-line summary footer of the rest + + Returns the count of Telegram messages actually emitted (0, 1, or 2). + """ + if not candidates: + return 0 + # Sort by priority desc, then by reason name for determinism so two + # equal-priority candidates always rank the same way across polls. + candidates.sort(key=lambda c: (-c["priority"], c["reason"], c["name"])) + top = candidates[0] + rest = candidates[1:] + text = _format_detailed_alert(top) + sent_count = 0 + if _send_telegram(text): + sent_count += 1 + logger.info( + "batch dispatch: top ns=%s kind=%s name=%s reason=%s action_id=%s " + "rest_count=%d notified=%s", + top["ns"], top["kind"], top["name"], top["reason"], + top["action_id"], len(rest), sent_count > 0, + ) + if rest: + if _send_telegram(_format_summary_tail(rest).strip()): + sent_count += 1 + return sent_count + + +def run() -> None: + """Main watch loop. Blocks forever; intended to be the entrypoint + of a long-lived background process. 
+ """ + if os.environ.get("SRE_WATCHER_ENABLED", "true").lower() in ("false", "0", "no", "off"): + logger.info("disabled via SRE_WATCHER_ENABLED — exiting") + return + logger.info( + "starting (poll=%ds, dedupe=%ds, prefix=%r, notify_target=%r)", + WATCH_INTERVAL_SECONDS, + DEDUPE_WINDOW_SECONDS, + NAMESPACE_PREFIX, + NOTIFY_TARGET, + ) + + # Dedupe state. Key shape: (namespace, action_type, target_name). + # Bootstrapped from existing KarsSREActions so a pod restart + # doesn't replay alerts for incidents whose CR is still in the + # cluster. We also re-sync from CRs every minute so an external + # operator action (e.g. they ran `kubectl delete karssreactions + # --all` to clean up) flushes the dedupe naturally. + target_seen: dict[tuple[str, str, str], float] = _load_dedupe_from_crs() + logger.info("dedupe bootstrap: %d entries from existing CRs", len(target_seen)) + last_cr_sync = _now_epoch() + CR_SYNC_INTERVAL = 60 + + # Sliding-window rate limit log. Each entry is the epoch the + # message was sent; entries older than 60s are pruned every poll. + msg_log: list[float] = [] + + # First-iteration priming: ALWAYS silently absorb the current + # event set on the first pass, so we don't flood the operator + # with "everything that was failing on boot". Trade-off: a freshly- + # broken workload whose event we missed during pod restart only + # alerts after the next poll (10s + dedupe-window check). For the + # SRE notification use case this is fine — it's not a P1 pager. + primed = False + + while True: + try: + now = _now_epoch() + # Periodic CR resync — REPLACES the dedupe state with the + # current CR list. This way operators who run + # `kubectl delete karssreactions --all` to clear the demo + # see new alerts on the next iteration rather than waiting + # for the dedupe window to lapse. 
Recent in-memory alerts + # (from this watcher's own _handle_incident) are preserved + # — but only if they are NEWER than CR_SYNC_INTERVAL, + # which means the operator can't accidentally re-trigger + # by deleting CRs mid-poll. + if (now - last_cr_sync) > CR_SYNC_INTERVAL: + fresh = _load_dedupe_from_crs() + # Keep in-memory entries newer than the last sync; + # everything else is REPLACED by the fresh CR snapshot. + preserved = { + k: v for k, v in target_seen.items() if v > last_cr_sync + } + target_seen = {**fresh, **preserved} + last_cr_sync = now + events = _list_events_all_namespaces() + # Collect candidates this iteration → dispatch as a batch + # so a multi-incident burst becomes "1 detailed alert + + # 1 summary tail" instead of N separate Telegram messages. + candidates: list[dict[str, Any]] = [] + for ev in events: + if not _is_in_scope(ev): + continue + if ev.get("type") != "Warning": + continue + reason = ev.get("reason", "") + if reason not in INCIDENT_REASONS: + continue + ts = _event_ts(ev) + if ts > 0 and (now - ts) > EVENT_FRESHNESS_SECONDS: + continue + target = _build_action_target(ev) + if target is None: + # No typed fix → fall back to per-event dedupe + # using the event tuple so we still alert (once) + # for unknown incidents. These are the noisy + # alerts (e.g. FailedScheduling on a pod that has + # no typed remediation) — priming silences the + # initial flood; ranking pushes them below + # actionable ones in burst-collapse. + obj = ev.get("involvedObject", {}) or {} + fallback_key = ( + ev.get("namespace") or obj.get("namespace") or "", + obj.get("kind") or "?", + _normalise_name(obj.get("name") or "", obj.get("kind") or ""), + ) + last = target_seen.get(fallback_key) + if last is not None and (now - last) < DEDUPE_WINDOW_SECONDS: + continue + target_seen[fallback_key] = now + if primed: + cand = _handle_incident(ev) + if cand: + candidates.append(cand) + continue + # Actionable incident (typed-fix available). 
On + # iteration 1 (priming) we silently absorb to avoid + # boot-time flood. After priming, the CR-reuse path + # makes sure we don't create duplicate CRs even when + # the same incident retriggers. + key = _target_dedupe_key(target) + last = target_seen.get(key) + if last is not None and (now - last) < DEDUPE_WINDOW_SECONDS: + continue + target_seen[key] = now + if primed: + cand = _handle_incident(ev) + if cand: + candidates.append(cand) + + # Burst collapse + per-minute rate limit. Operators saw + # the original Slice 4 demo flood Telegram with 6+ messages + # on a single pod restart; here we surface the top + # candidate in full + a single summary tail line, and + # apply a sliding-window rate limit cluster-wide. + if candidates: + # Drop alerts that would exceed the per-minute budget. + window_start = now - 60 + msg_log[:] = [t for t in msg_log if t >= window_start] + budget = max(0, MAX_MSGS_PER_MINUTE - len(msg_log)) + if budget == 0: + logger.info( + "rate limit hit: %d candidates dropped (max %d msgs/min)", + len(candidates), MAX_MSGS_PER_MINUTE, + ) + else: + # _dispatch_batch sends at most 2 messages (top + + # summary). Trim candidates if we can't afford + # both — better to send just the top than fail to + # send anything. + sent = _dispatch_batch(candidates) + for _ in range(sent): + msg_log.append(now) + + primed = True + # Trim entries older than 2× the window so the map stays + # bounded over long uptimes. 
+ cutoff = now - (DEDUPE_WINDOW_SECONDS * 2) + target_seen = {k: v for k, v in target_seen.items() if v >= cutoff} + except Exception as e: # noqa: BLE001 — keep the loop alive + logger.warning("watch iteration error: %s", e) + time.sleep(WATCH_INTERVAL_SECONDS) + + +if __name__ == "__main__": + run() diff --git a/runtimes/hermes/tests/test_sre.py b/runtimes/hermes/tests/test_sre.py index fc2ea86e..9247a269 100644 --- a/runtimes/hermes/tests/test_sre.py +++ b/runtimes/hermes/tests/test_sre.py @@ -94,7 +94,7 @@ class BadCtx: def test_explain_error_matches_imagepullbackoff() -> None: from kars_runtime_hermes.plugin import sre - result = sre.sre_explain_error(error="Failed to pull image: ImagePullBackOff") + result = sre._impl_sre_explain_error(error="Failed to pull image: ImagePullBackOff") assert result["matched"] is True assert result["hypotheses"][0]["pattern"] == "ImagePullBackOff" @@ -102,7 +102,7 @@ def test_explain_error_matches_imagepullbackoff() -> None: def test_explain_error_matches_exceeded_quota() -> None: from kars_runtime_hermes.plugin import sre - result = sre.sre_explain_error(error="pods 'foo' is forbidden: exceeded quota: tight-quota") + result = sre._impl_sre_explain_error(error="pods 'foo' is forbidden: exceeded quota: tight-quota") assert result["matched"] is True assert result["hypotheses"][0]["pattern"] == "exceeded quota" @@ -110,7 +110,7 @@ def test_explain_error_matches_exceeded_quota() -> None: def test_explain_error_no_match() -> None: from kars_runtime_hermes.plugin import sre - result = sre.sre_explain_error(error="totally-unknown-thing") + result = sre._impl_sre_explain_error(error="totally-unknown-thing") assert result["matched"] is False assert result["error"] == "totally-unknown-thing" @@ -118,16 +118,22 @@ def test_explain_error_no_match() -> None: def test_explain_error_empty_string() -> None: from kars_runtime_hermes.plugin import sre - result = sre.sre_explain_error(error="") + result = sre._impl_sre_explain_error(error="") 
assert result["matched"] is False assert "reason" in result def test_propose_fix_for_resourcequota() -> None: - """The Slice 1 demo target — DeleteResourceQuota typed action.""" + """Slice 3 demo target — DeleteResourceQuota typed action. + + The proposal envelope must carry the typed action; whether the + KarsSREAction CR was created depends on whether we're running in + a pod with a projected SA token. Both pod (CR created) and unit- + test (cr_error captured) paths return the same action shape. + """ from kars_runtime_hermes.plugin import sre - result = sre.sre_propose_fix( + result = sre._impl_sre_propose_fix( diagnosis="ResourceQuota platform-hardening-quota in kars-research is blocking pod admission", target={ "kind": "ResourceQuota", @@ -140,23 +146,32 @@ def test_propose_fix_for_resourcequota() -> None: assert result["action"]["type"] == "DeleteResourceQuota" assert result["action"]["namespace"] == "kars-research" assert result["action"]["name"] == "platform-hardening-quota" - # Slice 1 returns "proposed" — execution lands in Slice 3 - assert "proposed" in result["execution_status"] - assert "not executed" in result["execution_status"] + # Slice 3 + watcher: when the proposal carries a typed action the + # tool tries to create a KarsSREAction CR. Outside a pod (unit + # test) the SA-token read fails and surfaces in cr_error; inside a + # pod cr_created=True and action_id is set. Either way the + # operator-facing execution_status announces awaiting-approval. + assert "operator approval" in result["execution_status"] def test_propose_fix_unknown_target_kind() -> None: - """For target kinds Slice 1 doesn't codify, return envelope with no action.""" + """For target kinds the watcher doesn't codify, return envelope with no action. + + Slice 3 adds Pod / Deployment / StatefulSet / DaemonSet handling, + so we use ConfigMap here as the genuine "unknown" case. 
+ """ from kars_runtime_hermes.plugin import sre - result = sre.sre_propose_fix( - diagnosis="pod ImagePullBackOff", - target={"kind": "Pod", "namespace": "default", "name": "broken"}, + result = sre._impl_sre_propose_fix( + diagnosis="config drift on a ConfigMap", + target={"kind": "ConfigMap", "namespace": "default", "name": "drifted"}, ) assert result["kind"] == "FixProposal" assert result["action"] is None # Still returns rationale for the operator assert "rationale" in result and result["rationale"] + # And the cr_error explains what was missing. + assert result.get("cr_error") is not None def test_kars_cr_kinds_covers_all_eleven_crds() -> None: @@ -193,7 +208,7 @@ def test_describe_state_with_mocked_kube() -> None: mock_client.get.return_value = fake_doc with patch.object(sre.sre_kube, "client", return_value=mock_client): - result = sre.sre_describe_state() + result = sre._impl_sre_describe_state() # Every kind got summarised assert set(result.keys()) == {k for _p, k in sre.KARS_CR_KINDS} @@ -218,7 +233,7 @@ def test_describe_state_handles_apiserver_errors_per_kind() -> None: ) with patch.object(sre.sre_kube, "client", return_value=mock_client): - result = sre.sre_describe_state() + result = sre._impl_sre_describe_state() # Every kind got an error entry, but no exception bubbled up for kind in result: diff --git a/runtimes/hermes/tests/test_sre_k8s.py b/runtimes/hermes/tests/test_sre_k8s.py index bfa82ce9..d932f996 100644 --- a/runtimes/hermes/tests/test_sre_k8s.py +++ b/runtimes/hermes/tests/test_sre_k8s.py @@ -29,7 +29,7 @@ def test_register_registers_five_slice2_tools() -> None: def test_describe_resource_unknown_kind() -> None: from kars_runtime_hermes.plugin import sre_k8s - result = sre_k8s.sre_describe_resource(kind="UnknownKind", name="x") + result = sre_k8s._impl_sre_describe_resource(kind="UnknownKind", name="x") assert "error" in result assert "supported_kinds" in result @@ -52,7 +52,7 @@ def test_describe_resource_resource_quota() -> None: 
mock_client = MagicMock() mock_client.get.side_effect = [quota_doc, {"items": []}] # quota + events with patch.object(sre_k8s.sre_kube, "client", return_value=mock_client): - result = sre_k8s.sre_describe_resource( + result = sre_k8s._impl_sre_describe_resource( kind="ResourceQuota", namespace="kars-research", name="platform-hardening-quota", @@ -82,7 +82,7 @@ def test_describe_resource_resource_quota_kars_managed() -> None: mock_client = MagicMock() mock_client.get.side_effect = [quota_doc, {"items": []}] with patch.object(sre_k8s.sre_kube, "client", return_value=mock_client): - result = sre_k8s.sre_describe_resource( + result = sre_k8s._impl_sre_describe_resource( kind="ResourceQuota", namespace="kars-sre", name="sre-quota" ) assert result["isKarsManaged"] is True @@ -136,7 +136,7 @@ def test_describe_resource_deployment_owner_graph() -> None: {"items": []}, {"items": []}, {"items": []}, ] with patch.object(sre_k8s.sre_kube, "client", return_value=mock_client): - result = sre_k8s.sre_describe_resource( + result = sre_k8s._impl_sre_describe_resource( kind="Deployment", namespace="kars-research", name="research" ) assert "workload" in result @@ -155,7 +155,7 @@ def test_describe_resource_handles_404_gracefully() -> None: response = MagicMock(status_code=404, reason_phrase="Not Found") mock_client.get.side_effect = httpx.HTTPStatusError("404", request=MagicMock(), response=response) with patch.object(sre_k8s.sre_kube, "client", return_value=mock_client): - result = sre_k8s.sre_describe_resource( + result = sre_k8s._impl_sre_describe_resource( kind="Pod", namespace="kars-research", name="missing" ) assert "error" in result @@ -188,7 +188,7 @@ def test_what_changed_filters_to_failure_reasons() -> None: mock_client = MagicMock() mock_client.get.side_effect = [core_doc, new_doc] with patch.object(sre_k8s.sre_kube, "client", return_value=mock_client): - result = sre_k8s.sre_what_changed(namespace="kars-research", minutes=15) + result = 
sre_k8s._impl_sre_what_changed(namespace="kars-research", minutes=15) assert len(result["events_core"]) == 1 assert result["events_core"][0]["reason"] == "FailedCreate" assert "exceeded quota" in result["events_core"][0]["message"] @@ -225,7 +225,7 @@ def test_endpoints_inspect_zero_endpoints_finding() -> None: mock_client = MagicMock() mock_client.get.side_effect = [svc_doc, pod_doc, es_doc] with patch.object(sre_k8s.sre_kube, "client", return_value=mock_client): - result = sre_k8s.sre_endpoints_inspect(namespace="kars-research", service="research") + result = sre_k8s._impl_sre_endpoints_inspect(namespace="kars-research", service="research") assert result["selector"] == {"app": "research"} assert len(result["matching_pods"]) == 2 # Both pods are NotReady → finding should call that out @@ -242,7 +242,7 @@ def test_endpoints_inspect_pod_selector_mismatch() -> None: mock_client = MagicMock() mock_client.get.side_effect = [svc_doc, pod_doc, es_doc] with patch.object(sre_k8s.sre_kube, "client", return_value=mock_client): - result = sre_k8s.sre_endpoints_inspect(namespace="kars-research", service="research") + result = sre_k8s._impl_sre_endpoints_inspect(namespace="kars-research", service="research") assert "No pods match" in result["finding"] @@ -273,7 +273,7 @@ def test_image_probe_finds_closest_tag_in_use() -> None: mock_client = MagicMock() mock_client.get.return_value = pod_doc with patch.object(sre_k8s.sre_kube, "client", return_value=mock_client): - result = sre_k8s.sre_image_probe(image="nginx:1.27-typo") + result = sre_k8s._impl_sre_image_probe(image="nginx:1.27-typo") # The closest in-use match for nginx:1.27-typo is nginx:1.27.3 assert result["closest_in_use"] == "nginx:1.27.3" assert "typo" in result["advice"].lower() or "edit-distance" in result["advice"] @@ -287,7 +287,7 @@ def test_image_probe_no_pods_use_repo() -> None: mock_client = MagicMock() mock_client.get.return_value = pod_doc with patch.object(sre_k8s.sre_kube, "client", 
return_value=mock_client): - result = sre_k8s.sre_image_probe(image="newrepo:v1") + result = sre_k8s._impl_sre_image_probe(image="newrepo:v1") assert result["in_use_on_cluster"] == [] assert "No pod on this cluster" in result["advice"] @@ -301,7 +301,7 @@ def test_top_unavailable_when_metrics_server_missing() -> None: "404", request=MagicMock(), response=response ) with patch.object(sre_k8s.sre_kube, "client", return_value=mock_client): - result = sre_k8s.sre_top(scope="nodes") + result = sre_k8s._impl_sre_top(scope="nodes") assert "unavailable" in result assert "metrics-server" in result["unavailable"] @@ -309,7 +309,7 @@ def test_top_unavailable_when_metrics_server_missing() -> None: def test_top_invalid_scope() -> None: from kars_runtime_hermes.plugin import sre_k8s - result = sre_k8s.sre_top(scope="invalid") + result = sre_k8s._impl_sre_top(scope="invalid") assert "error" in result assert "valid_scopes" in result @@ -332,7 +332,7 @@ def test_top_pods_returns_per_container() -> None: mock_client = MagicMock() mock_client.get.return_value = doc with patch.object(sre_k8s.sre_kube, "client", return_value=mock_client): - result = sre_k8s.sre_top(scope="pods", namespace="kars-research") + result = sre_k8s._impl_sre_top(scope="pods", namespace="kars-research") assert result["scope"] == "pods" assert len(result["items"]) == 1 assert len(result["items"][0]["containers"]) == 2 diff --git a/sandbox-images/hermes/Dockerfile b/sandbox-images/hermes/Dockerfile index 8464c0f2..dad17cf9 100644 --- a/sandbox-images/hermes/Dockerfile +++ b/sandbox-images/hermes/Dockerfile @@ -90,6 +90,23 @@ RUN if ls /tmp/agt-wheels/*.whl >/dev/null 2>&1; then \ ARG HERMES_VERSION=0.15.2 RUN pip install --no-cache-dir "hermes-agent==${HERMES_VERSION}" +# ---- Channel adapter libraries ----------------------------------------- +# Hermes auto-detects channels (Telegram / Slack / Discord) from env +# vars (TELEGRAM_BOT_TOKEN, SLACK_BOT_TOKEN, DISCORD_BOT_TOKEN) and +# tries to instantiate an 
adapter per channel. Each adapter is a +# soft-optional dep — Hermes itself doesn't pull them — so we install +# them here so the kars runtime image is "channels work out of the box" +# when a credentials secret carries the token. Pinned to the +# adapter-stable major: +# - python-telegram-bot 21.x (Bot API 7.x, async-first) +# - slack-sdk 3.x (Web + Socket Mode) +# - discord.py 2.x (gateway client) +# Bumping these requires re-verifying the Hermes channel adapters. +RUN pip install --no-cache-dir \ + "python-telegram-bot>=21,<22" \ + "slack-sdk>=3,<4" \ + "discord.py>=2,<3" + # ---- Install the kars-runtime-hermes plugin ----------------------------- # This is the in-pod adapter that registers kars_spawn, foundry_*, # governance pre_tool_call hook, channel translation, etc. diff --git a/sandbox-images/hermes/entrypoint.sh b/sandbox-images/hermes/entrypoint.sh index 99e97d82..d92463e1 100644 --- a/sandbox-images/hermes/entrypoint.sh +++ b/sandbox-images/hermes/entrypoint.sh @@ -52,6 +52,50 @@ fi export HERMES_HOME="${HERMES_HOME:-/sandbox/.hermes}" mkdir -p "$HERMES_HOME" +# ── HOME (writable for libraries that ignore HERMES_HOME) ────────────── +# Distroless base sets HOME=/ (read-only). Several Hermes deps — +# notably the gateway's per-platform lock dir (~/.local/state/hermes/ +# gateway-locks) and python-telegram-bot's internal state — assume +# HOME is writable. Without this override, Telegram / Slack / Discord +# channels fail at boot with `[Errno 30] Read-only file system: '/.local'`. +# /sandbox is the per-pod writable emptyDir owned by the sandbox UID. +export HOME="${HOME:-/sandbox}" +if [ "$HOME" = "/" ] || [ ! -w "$HOME" ]; then + export HOME=/sandbox +fi +mkdir -p "$HOME/.local/state" + +# ── Outbound HTTPS proxy ─────────────────────────────────────────── +# UID 1000 in a kars sandbox cannot reach the internet directly: +# egress-guard's iptables rules transparent-redirect port 443 to +# the inference-router's forward proxy on 127.0.0.1:8444. 
In Docker +# Desktop kind clusters the redirect doesn't always apply (CAP_NET_ADMIN +# semantics), so we ALSO export HTTPS_PROXY so libraries that honour +# the standard env (httpx, python-telegram-bot, slack-sdk, discord.py, +# requests, openai…) reach the router explicitly. The router then +# enforces the egress allowlist + Learn-mode logging exactly like the +# transparent path. +# +# Inference calls bypass this (Hermes sends them to OPENAI_BASE_URL= +# http://127.0.0.1:8443/v1, the router's HTTP API), so HTTPS_PROXY +# only affects code that tries direct external HTTPS — which is the +# exact scope we want to route. +# +# NO_PROXY covers loopback + cluster-internal services so the router +# itself, the apiserver, and intra-pod calls don't loop back through +# the proxy. CRITICALLY this includes the LITERAL apiserver IP +# ($KUBERNETES_SERVICE_HOST), not just the FQDN, because kubectl-style +# clients connect via the IP from the pod's service env — the FQDN +# variant only matches when explicitly used. +_NP_BASE="127.0.0.1,localhost,kubernetes.default.svc.cluster.local,.svc.cluster.local,.cluster.local" +if [ -n "${KUBERNETES_SERVICE_HOST:-}" ]; then + _NP_BASE="$KUBERNETES_SERVICE_HOST,$_NP_BASE" +fi +export HTTPS_PROXY="${HTTPS_PROXY:-http://127.0.0.1:8444}" +export https_proxy="${https_proxy:-$HTTPS_PROXY}" +export NO_PROXY="${NO_PROXY:-$_NP_BASE}" +export no_proxy="${no_proxy:-$NO_PROXY}" + # Hermes' multi-profile support — pin to SANDBOX_NAME so multi-sandbox # concurrent runs don't share session state. export HERMES_PROFILE="${HERMES_PROFILE:-$SANDBOX_NAME}" @@ -289,6 +333,22 @@ if [ -n "${TELEGRAM_BOT_TOKEN:-}" ]; then fi if [ -n "${TELEGRAM_ALLOW_FROM:-}" ]; then set_hermes_config "channels.telegram.allowed_users" "$TELEGRAM_ALLOW_FROM" + # Export TELEGRAM_ALLOWED_USERS so the gateway's Telegram platform + # skips the pairing-code dance for these IDs. 
Hermes' telegram.py + # reads this env at boot (not the config key); without it the bot + # responds to every incoming message with a "pairing code" challenge + # even when the sender is already in the configured allowlist. + export TELEGRAM_ALLOWED_USERS="$TELEGRAM_ALLOW_FROM" + # Set the home channel = first allowed user ID. This is the chat + # the `hermes send --to telegram` (no chat suffix) targets, used + # by the kars-sre proactive watcher to push incident alerts to the + # operator. If multiple IDs are configured, the watcher uses the + # first; operators with multi-user setups can override per-call + # via `--to telegram:` or set SRE_WATCHER_NOTIFY_TARGET. + TG_HOME=$(echo "$TELEGRAM_ALLOW_FROM" | tr ',' '\n' | head -1 | tr -d ' ') + if [ -n "$TG_HOME" ]; then + set_hermes_config "TELEGRAM_HOME_CHANNEL" "$TG_HOME" + fi fi if [ -n "${SLACK_BOT_TOKEN:-}" ]; then set_hermes_config "channels.slack.token" "$SLACK_BOT_TOKEN" @@ -564,7 +624,7 @@ Read-only kars-CR diagnostics (Slice 1): | \`sre_logs\` | Tail any pod's any container via the apiserver. Capped 500 lines. Use after \`sre_describe_resource\` shows CrashLoopBackOff or an error message you need to see in full. | | \`sre_diagnose\` | Walks the kars-CR health checklist (controller Ready, CRDs installed, no Degraded sandboxes, no stale reconciles). Use for the operator's "give me a cluster health overview" question. | | \`sre_explain_error\` | Given an error string, returns a hypothesis from the kars OOTB-blocker corpus (ImagePullBackOff, exceeded quota, OOMKilled, CrashLoopBackOff, FailedScheduling, ContainerCreating). The hypothesis is a HINT — confirm with other tools before quoting it. | -| \`sre_propose_fix\` | Returns a typed-action proposal for the operator to approve. Read-only in this build; the actual apply path lands in Slice 3. | +| \`sre_propose_fix\` | Returns a typed-action proposal AND auto-creates a KarsSREAction CR in \`kars-sre\` (phase=Proposed, approval.state=Pending). 
Returns an \`action_id\` you quote to the operator. Operator approves via \`kars sre approve \` → controller mints a one-shot CRB, executes the typed action, tears the binding down, watches recovery. You never execute; you propose. | K8s diagnostic toolset (Slice 2): @@ -582,7 +642,7 @@ You are intentionally not equipped with: * **\`kars_spawn\` family** — you cannot spawn sub-agents (§7.8.5 containment: sub-agents would inherit the kars-sre namespace's elevated RBAC). * **\`kars_mesh_*\` family** — you are not on the inter-agent mesh (§7.8.6: you have no DID, are not registered, and your NetworkPolicy blocks the relay). -* **Shell, file, or terminal tools** — you cannot exec into other pods, port-forward, write to disk, or run arbitrary commands. The only writes a future Slice 3 will allow are *typed actions* through \`sre_apply_fix\` — never free-form shell. +* **Shell, file, or terminal tools** — you cannot exec into other pods, port-forward, write to disk, or run arbitrary commands. The only writes happen indirectly: \`sre_propose_fix\` creates a KarsSREAction CR (a *proposal*, no execution); the controller executes it ONLY after the operator runs \`kars sre approve \`. Even then, you never run free-form shell — only the typed action you proposed. * **Network tools beyond the apiserver** — your NetworkPolicy allows only \`kubernetes.default.svc\`. No DNS lookups against the internet, no external HTTP, no registry calls. If the operator asks you to do something that requires a tool you don't have, say so explicitly and (when possible) suggest the kubectl command they could run themselves. @@ -599,11 +659,18 @@ When an operator says "X is broken" — even informally — walk this loop: * Service has 0 endpoints → \`sre_endpoints_inspect\` on the Service * \`OOMKilled\` / \`Evicted\` → \`sre_top\` on the pod and its node * Stuck \`Pending\` with \`0/N nodes available\` → \`sre_describe_resource\` on the candidate Nodes -5. 
**\`sre_propose_fix\`** — once you've identified the root cause, return a typed-action proposal naming the resource and the change. The current proposal types include: - * \`DeleteResourceQuota {namespace, name}\` — for over-tight platform-applied quotas (the resource must NOT be labeled \`kars.azure.com/managed-by=controller\` — that's the safety gate). - * \`PatchDeploymentImage\`, \`ScaleDeployment\`, \`RolloutRestart\`, \`DeletePod\`, \`PatchConfigMapKey\` — Slice 3 will execute these via short-lived TokenRequest tokens once the operator approves. +5. **\`sre_propose_fix\`** — once you've identified the root cause, call this with a \`diagnosis\` + \`target\` payload. **\`target.kind\` is REQUIRED** (one of \`ResourceQuota\`, \`Pod\`, \`Deployment\`, \`StatefulSet\`, \`DaemonSet\`) — without it no CR is created and the response's \`cr_error\` field tells you what's missing. Always include \`target.kind\`, \`target.namespace\`, and \`target.name\`. The tool returns a proposal AND creates a KarsSREAction CR (phase=Proposed). Quote the returned \`action_id\` to the operator with the exact approve command. The current proposal types are: + * \`DeleteResourceQuota {namespace, name}\` — for over-tight platform-applied quotas (the controller refuses to delete quotas labelled \`kars.azure.com/managed-by=controller\` — that's the safety gate, enforced in the reconciler, not just policy). + * \`PatchDeploymentImage {namespace, name, container, image}\` — patch a container image. + * \`ScaleDeployment {namespace, name, replicas}\` — scale a deployment (clamp 0-50). + * \`RolloutRestart {namespace, kind, name}\` — rolling restart on Deployment / StatefulSet / DaemonSet. + * \`DeletePod {namespace, name}\` — delete a pod so its owning controller reconciles a fresh one. -Slice 1+2 = **diagnose and propose only.** You never execute the fix. Tell the operator what to apply and link the proposal id; the operator runs the typed action manually until Slice 3 lands. 
+ When target.kind alone is ambiguous (e.g. Deployment → Scale vs PatchImage vs RolloutRestart), pass an explicit \`action_type\` argument to disambiguate. + + When the operator runs \`kars sre approve <action_id>\` (or \`kars sre reject\`), the controller's kars_sre_action reconciler picks it up, mints a short-lived ClusterRoleBinding scoped to just that action, executes via that binding, tears the binding down, and observes recovery in the affected namespace. + +You PROPOSE; the operator AUTHORISES; the controller EXECUTES. You never invoke the apply path directly — the proposal flow is the apply path. ## Output structure when you propose a fix @@ -690,6 +757,27 @@ if [ "$1" = "hermes" ]; then else echo "[kars-hermes] No channels — starting hermes gateway in idle daemon mode" fi + + # ── kars-sre proactive watcher (Slice 4) ────────────────────────── + # When SRE_ENABLED=true AND at least one channel is configured, spawn + # the watcher as a background process. It polls K8s events for + # failure-class reasons in kars-* namespaces, dedupes per + # (ns, kind, name, reason) in a 10-min window, and on each new + # incident creates a KarsSREAction CR + pushes a Telegram alert with + # the action_id + `kars sre approve` command. Operator opt-out: + # SRE_WATCHER_ENABLED=false. Failures inside the watcher are + # contained (it logs to stderr and continues) so it cannot crash the + # gateway. + if [ "${SRE_ENABLED:-}" = "true" ] \ + && [ "$WANT_GATEWAY" = "true" ] \ + && [ "${SRE_WATCHER_ENABLED:-true}" != "false" ]; then + echo "[kars-hermes] SRE_ENABLED + channels detected — starting proactive watcher" + # Use sandbox UID via $AS_SANDBOX so the watcher uses the same SA + # token + httpx singleton as the agent. stderr→pod stdout for + # debuggability via `kubectl logs`.
+ $AS_SANDBOX python3 -m kars_runtime_hermes.plugin.sre_watcher & + fi + exec $AS_SANDBOX hermes gateway run --accept-hooks else echo "[kars-hermes] Operator override: $*" From 64cb040cb3d645e35d525f8df31c47853a0703ed Mon Sep 17 00:00:00 2001 From: Pal Lakatos-Toth Date: Wed, 10 Jun 2026 18:50:27 +0100 Subject: [PATCH 21/62] kars-sre: Headlamp SRE Console + Chat (Slice 4 primary UX) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds the SRE engineer's dedicated console as a top-level sidebar branch in the kars Headlamp plugin. Replaces the prior workflow of 'kubectl get karssreactions + paste action_id into kars sre approve in a terminal' with one click in the dashboard. New routes: /kars/sre — SRE Console (live cards, primary landing) /kars/sre/chat — embedded Hermes WebUI iframe /kars/karssreactions — full CRD list (under existing CRD section) SRE Console layout (top → bottom): 🔴 Pending Approval — KarsSREActions awaiting operator. Inline Approve / Reject buttons PATCH .spec.approval.state directly via Headlamp's KubeObject.patch(), with optional rejection- reason prompt. No terminal hop needed. 🔄 In-flight — actions the controller is currently executing (Applied + waiting for recovery). Shows phase + age. 📊 Cluster Health — sandbox phase counts + degraded count. 🚨 Active Incidents — failure-class events (FailedCreate, BackOff, FailedScheduling, Failed, ImagePullBackOff, CrashLoopBackOff, OOMKilling, Evicted, FailedMount) from kars-* namespaces in the last 15 min. Same filter the proactive watcher uses, so what the operator sees here is what the watcher would alert on. ✅ Recent — Recovered / Failed / Expired / Rejected actions from the last hour for post-incident review. All cards live-update via Headlamp's useList() (watch + long-poll), so the Proposed → Approved → Applied → Recovered walk is visible without F5. 
The KarsSREAction CRD is added to the existing CRD registration table so the standard list / detail pages 'just work' under /kars/karssreactions/:ns/:name. SRE Chat is an iframe of the Hermes WebUI: - tab 1: http://localhost:18789 (requires 'kars connect sre --web' in another terminal — populates the iframe via port-forward) - tab 2: apiserver service-proxy fallback for in-cluster operators - 'Open in new tab' button if iframe sandboxing breaks the embed Helm chart: SRE sandbox's allowedEndpoints now includes api.telegram.org / core.telegram.org cluster-side so the Slice 4 watcher's outbound Telegram alerts don't need an out-of-band NetworkPolicy patch. Dormant when Telegram isn't configured — the gateway only opens the channel when the token is present. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- deploy/helm/kars/templates/sre.yaml | 12 + tools/headlamp-plugin/README.md | 29 +- tools/headlamp-plugin/dist/main.js | 2 +- tools/headlamp-plugin/src/index.tsx | 629 +++++++++++++++++++++++++++- 4 files changed, 669 insertions(+), 3 deletions(-) diff --git a/deploy/helm/kars/templates/sre.yaml b/deploy/helm/kars/templates/sre.yaml index d769f933..d3ec067e 100644 --- a/deploy/helm/kars/templates/sre.yaml +++ b/deploy/helm/kars/templates/sre.yaml @@ -152,6 +152,18 @@ spec: # In-cluster apiserver — the SRE agent's primary counterparty. - host: kubernetes.default.svc.cluster.local port: 443 + # Telegram Bot API — required when the operator configures + # TELEGRAM_BOT_TOKEN via `kars credentials update sre + # --telegram-token ` for Slice 4 channel + watcher alerts. + # Always allowed (Hermes only opens the channel when the token + # is present, so this is dormant otherwise — no extra exposure + # for clusters that don't use Telegram). NetworkPolicy egress is + # safe-by-default because the inference-router forward-proxy + # still enforces blocklist + audit on every connection. 
+ - host: api.telegram.org + port: 443 + - host: core.telegram.org + port: 443 {{- if (.Values.sre | default dict).extraAllowedEndpoints }} {{- range (.Values.sre | default dict).extraAllowedEndpoints }} - host: {{ .host | quote }} diff --git a/tools/headlamp-plugin/README.md b/tools/headlamp-plugin/README.md index 9c199de1..fd122f88 100644 --- a/tools/headlamp-plugin/README.md +++ b/tools/headlamp-plugin/README.md @@ -1,7 +1,7 @@ # kars Headlamp Plugin Adds an **kars** sidebar to the [Headlamp](https://headlamp.dev/) Kubernetes -dashboard with list + detail views for the 9 kars custom resources: +dashboard with list + detail views for the 11 kars custom resources: - KarsSandbox - InferencePolicy @@ -12,6 +12,33 @@ dashboard with list + detail views for the 9 kars custom resources: - TrustGraph - KarsPairing - KarsEval +- EgressApproval +- **KarsSREAction** (Slice 3 — operator-approved typed apply-fix) + +## SRE Console (Slice 4 primary UX) + +`/kars/sre` is the dedicated console for the kars-sre operator — +the page a new shift opens to triage cluster health. It bundles: + +- 🔴 **Pending Approval** — KarsSREActions awaiting the operator's + decision, with inline **Approve** / **Reject** buttons that + PATCH `.spec.approval.state` directly (no terminal hop). +- 🔄 **In-flight** — actions the controller is currently + executing or watching for recovery. +- 📊 **Cluster Health** — sandbox phase + degraded count summary. +- 🚨 **Active Incidents** — failure-class events from `kars-*` + namespaces in the last 15 min (same filter the proactive + watcher uses). +- ✅ **Recent** — terminal-phase actions (Recovered / Failed / + Expired / Rejected) from the last hour for post-incident review. + +Live-updates via Headlamp's `useList()` (watch + long-poll) so the +Proposed → Approved → Applied → Recovered walk is visible without F5. + +The sibling **`/kars/sre/chat`** page embeds the Hermes WebUI in +an iframe (local port-forward by default, apiserver service-proxy +fallback). 
Run `kars connect sre --web --port 18789` in another +terminal to populate the iframe. Detail panes show `.spec`, `.status`, and a typed Conditions table with status colouring (Ready / Provisioned → green, Degraded / Failed → red, diff --git a/tools/headlamp-plugin/dist/main.js b/tools/headlamp-plugin/dist/main.js index b13cca7a..157c8b87 100644 --- a/tools/headlamp-plugin/dist/main.js +++ b/tools/headlamp-plugin/dist/main.js @@ -1 +1 @@ -(function(e,O){typeof exports=="object"&&typeof module<"u"?O(require("react/jsx-runtime"),require("@kinvolk/headlamp-plugin/lib"),require("@kinvolk/headlamp-plugin/lib/lib/k8s/crd"),require("@kinvolk/headlamp-plugin/lib/K8s/secret"),require("@kinvolk/headlamp-plugin/lib/CommonComponents"),require("@mui/material/styles"),require("react")):typeof define=="function"&&define.amd?define(["react/jsx-runtime","@kinvolk/headlamp-plugin/lib","@kinvolk/headlamp-plugin/lib/lib/k8s/crd","@kinvolk/headlamp-plugin/lib/K8s/secret","@kinvolk/headlamp-plugin/lib/CommonComponents","@mui/material/styles","react"],O):(e=typeof globalThis<"u"?globalThis:e||self,O(e.pluginLib.ReactJSX,e.pluginLib,e.pluginLib.Crd,e.pluginLib.K8s.secret,e.pluginLib.CommonComponents,e.pluginLib.MuiMaterial.styles,e.pluginLib.React))})(this,(function(e,O,me,Le,o,U,we){"use strict";const Te=t=>t&&typeof t=="object"&&"default"in t?t:{default:t};function _e(t){if(t&&typeof t=="object"&&"default"in t)return t;const n=Object.create(null,{[Symbol.toStringTag]:{value:"Module"}});if(t){for(const i in t)if(i!=="default"){const d=Object.getOwnPropertyDescriptor(t,i);Object.defineProperty(n,i,d.get?d:{enumerable:!0,get:()=>t[i]})}}return n.default=t,Object.freeze(n)}const oe=Te(Le),X=_e(we),Me="kars.azure.com",Ae="v1alpha1",ie=[{plural:"karssandboxes",singular:"karssandbox",kind:"KarsSandbox",label:"Sandboxes",phaseField:"phase"},{plural:"inferencepolicies",singular:"inferencepolicy",kind:"InferencePolicy",label:"Inference 
Policies"},{plural:"karsmemories",singular:"karsmemory",kind:"KarsMemory",label:"Memories",phaseField:"phase"},{plural:"mcpservers",singular:"mcpserver",kind:"McpServer",label:"MCP Servers",phaseField:"phase"},{plural:"a2aagents",singular:"a2aagent",kind:"A2AAgent",label:"A2A Agents",phaseField:"phase"},{plural:"toolpolicies",singular:"toolpolicy",kind:"ToolPolicy",label:"Tool Policies"},{plural:"trustgraphs",singular:"trustgraph",kind:"TrustGraph",label:"Trust Graphs"},{plural:"karspairings",singular:"karspairing",kind:"KarsPairing",label:"Pairings"},{plural:"karsevals",singular:"karseval",kind:"KarsEval",label:"Evals",phaseField:"phase"},{plural:"egressapprovals",singular:"egressapproval",kind:"EgressApproval",label:"Egress Approvals",phaseField:"phase"}],z=Object.fromEntries(ie.map(t=>[t.plural,me.makeCustomResourceClass({apiInfo:[{group:Me,version:Ae}],isNamespaced:!0,singularName:t.singular,pluralName:t.plural,kind:t.kind,customResourceDefinition:void 0})])),ce=z.karssandboxes;O.registerSidebarEntry({parent:null,name:"kars",label:"kars",icon:"mdi:robot-outline",url:"/kars"}),O.registerSidebarEntry({parent:"kars",name:"kars-overview",label:"Overview",url:"/kars"}),O.registerRoute({path:"/kars",sidebar:"kars-overview",name:"kars-overview",exact:!0,component:()=>e.jsx(ze,{})}),O.registerSidebarEntry({parent:"kars",name:"kars-mesh",label:"Mesh Topology",url:"/kars/mesh"}),O.registerRoute({path:"/kars/mesh",sidebar:"kars-mesh",name:"kars-mesh",exact:!0,component:()=>e.jsx(He,{})});for(const t of ie)O.registerSidebarEntry({parent:"kars",name:t.plural,label:t.label,url:`/kars/${t.plural}`}),O.registerRoute({path:`/kars/${t.plural}`,sidebar:t.plural,name:t.plural,exact:!0,component:()=>e.jsx(Fe,{crd:t})}),O.registerRoute({path:`/kars/${t.plural}/:namespace/:name`,sidebar:t.plural,name:`${t.plural}-detail`,exact:!0,component:()=>e.jsx(je,{crd:t})});const de=new 
Set(["SignatureMismatch","BundleVerifyFailed","AuthMisconfigured","MemoryStoreMissing","RuntimeAdapterMissing","AdapterMissing","ShapeInvalid","AllowlistDrift","PolicyCompileFailed"]),he=new Set(["AwaitingRouterEnforcement","AwaitingFoundryProvisioning","NoSandboxesReferencing","Pending"]);function Z(t){const i=(F(t).conditions??[]).find(d=>d.type==="Ready");return i==null?void 0:i.reason}function $e(t,n){return n&&de.has(n)?"error":n&&he.has(n)?"warning":t?t==="Ready"||t==="Provisioned"||t==="Active"?"success":t==="Degraded"||t==="Failed"||t==="Error"?"error":"warning":""}function F(t){var n;return((n=t.jsonData)==null?void 0:n.status)??{}}function N(t){var n;return((n=t.jsonData)==null?void 0:n.spec)??{}}function C(t){if(!t)return"—";const n=t.lastIndexOf("/");return n>=0?t.slice(n+1):t}function V(t,n){if(!t)return e.jsx("span",{children:"—"});const i=$e(t,n),d=n&&(de.has(n)||he.has(n));return e.jsxs("span",{children:[e.jsx(o.StatusLabel,{status:i,children:t}),d&&e.jsx("span",{style:{marginLeft:"0.4rem",fontSize:"0.85em",color:"#888"},children:n})]})}function Pe(t){return window.location.pathname.match(t)}function R(t){if(!t)return"—";const n=t.indexOf(":");return n<0||n+13>=t.length?t:`${t.slice(0,n+1)}${t.slice(n+1,n+13)}…`}function Be(t){if(!t)return null;const n=t.indexOf(" | drift=");if(n<0)return null;try{const i=JSON.parse(t.slice(n+9));if(!i||typeof i!="object")return null;const d=Array.isArray(i.added)?i.added.filter(s=>typeof s=="string"):[],c=Array.isArray(i.removed)?i.removed.filter(s=>typeof s=="string"):[];return{added:d,removed:c}}catch{return null}}function Ee({item:t}){const d=(F(t).conditions??[]).find(r=>r.type==="AllowlistDrift"&&r.status==="True");if(!d)return null;const c=Be(d.message),s=(c==null?void 0:c.added)??[],g=(c==null?void 0:c.removed)??[];return e.jsxs(o.SectionBox,{title:"⚠ Allowlist drift detected",children:[e.jsxs("p",{style:{padding:"0.5rem",fontSize:"0.9rem"},children:[e.jsx(o.StatusLabel,{status:"warning",children:"artifact 
wins"})," ","Inline ",e.jsx("code",{children:"allowedEndpoints"})," diverges from the verified signed bundle. The router enforces the bundle; the inline list is ignored. Either re-sign the bundle to include the divergent hosts, or remove the inline override."]}),s.length>0||g.length>0?e.jsx(o.SimpleTable,{data:[{side:`Only in inline (operator added, not signed) — ${s.length}`,hosts:s.join(", ")||"—"},{side:`Only in bundle (signed, but missing inline) — ${g.length}`,hosts:g.join(", ")||"—"}],columns:[{label:"Side",getter:r=>r.side},{label:"Hosts",getter:r=>e.jsx("code",{children:r.hosts})}]}):e.jsx("p",{style:{padding:"0.5rem",fontSize:"0.85rem",opacity:.75},children:d.message??"(no diff payload)"})]})}function re(t){if(!t)return e.jsx("span",{children:"—"});const d=t==="RouterEnforcing"||t==="AllDigestsMatch"?"success":t==="NoSandboxesReferencing"||t==="AsExpected"?"":t==="AwaitingRouterEnforcement"?"warning":"error";return e.jsx(o.StatusLabel,{status:d,children:t})}function Ne({crd:t,item:n}){if(t.plural!=="toolpolicies"&&t.plural!=="inferencepolicies"&&t.plural!=="karsmemories")return null;const i=F(n),c=(i.conditions??[]).find(l=>l.type==="Ready"),s=t.plural==="toolpolicies"?i.agtProfileDigest:i.compiledDigest,g=i.loadedDigest,r=s?g&&g===s?"✓ matches":g?"≠ mismatched":"(awaiting)":"—";return e.jsxs(o.SectionBox,{title:"Router enforcement (data-plane echo)",children:[e.jsx(o.SimpleTable,{data:[{k:"Compiled digest",v:R(s)},{k:"Loaded digest",v:R(g)},{k:"Echo",v:r},{k:"Confirmation",v:re(c==null?void 0:c.reason)}],columns:[{label:"Field",getter:l=>l.k},{label:"Value",getter:l=>l.v}]}),e.jsxs("p",{style:{padding:"0.5rem",fontSize:"0.85rem",opacity:.75},children:["The controller polls every referencing sandbox's router and promotes",e.jsx("code",{children:" phase: Compiled → Ready "})," only when every router echoes the exact compiled digest. 
While"," ",e.jsx("code",{children:"AwaitingRouterEnforcement"}),", the policy is parsed but",e.jsx("strong",{children:" not"})," live in the data plane."]})]})}function De({crd:t,item:n}){var m,L;if(t.plural!=="karsevals")return null;const i=N(n),d=F(n),c=d.conditions??[],s=c.find(h=>h.type==="Ready"),g=c.find(h=>h.type==="ConformanceDrift"),r=d.lastResult,l=i.corpus,p=l!=null&&l.builtin?`builtin:${l.builtin}`:(m=l==null?void 0:l.bundleRef)!=null&&m.digest?`bundle ${l.bundleRef.registry??"?"}/${l.bundleRef.repository??"?"}@${l.bundleRef.digest}`:"—",b=r?`${r.passedCases??0}/${r.totalCases??0}`:"—",v=r!=null&&r.drift?e.jsx(o.StatusLabel,{status:"error",children:"YES"}):r?e.jsx(o.StatusLabel,{status:"success",children:"no"}):e.jsx("span",{style:{opacity:.6},children:"—"});return e.jsxs(o.SectionBox,{title:"KarsEval (conformance corpus)",children:[e.jsx(o.SimpleTable,{data:[{k:"Target sandbox",v:((L=i.targetSandboxRef)==null?void 0:L.name)??"—"},{k:"Corpus",v:p},{k:"Schedule",v:i.schedule??"(on-demand only)"},{k:"Fail sandbox on drift",v:i.failSandboxOnDrift?"true":"false"},{k:"Last run",v:d.lastRunAt??"—"},{k:"Cases passed",v:b},{k:"Drift",v},{k:"Ready reason",v:re(s==null?void 0:s.reason)},{k:"Conformance drift reason",v:re(g==null?void 0:g.reason)}],columns:[{label:"Field",getter:h=>h.k},{label:"Value",getter:h=>h.v}]}),e.jsxs("p",{style:{padding:"0.5rem",fontSize:"0.85rem",opacity:.75},children:["KarsEvals replay a signed corpus (or a builtin one) against the target sandbox's inference router. 
The controller stamps each run's verdicts on ",e.jsx("code",{children:"status.lastResult"})," and rolls a history of the most recent ones into ",e.jsx("code",{children:"status.history"}),"."]})]})}const ue=[["telegram",/^TELEGRAM_(BOT_)?TOKEN$/i],["slack",/^SLACK_(BOT_)?TOKEN$/i],["discord",/^DISCORD_(BOT_)?TOKEN$/i],["whatsapp",/^WHATSAPP_TOKEN$/i]];function ge(t){var d;const n=new Set;if(!t)return n;const i=((d=t.jsonData)==null?void 0:d.data)??{};for(const c of Object.keys(i))for(const[s,g]of ue)g.test(c)&&n.add(s);return n}function Oe(t,n){var c,s,g,r,l,p,b,v,m;const i={sandboxesByPhase:{},channelCounts:{},egressLearn:0,egressStrict:0,governanceEnabled:0,totalRuntime:{}},d=new Map;for(const L of n??[]){const h=((c=L.metadata)==null?void 0:c.name)??"",w=((s=L.metadata)==null?void 0:s.namespace)??"";if(!h.endsWith("-credentials"))continue;const _=h.replace(/-credentials$/,"");d.set(`${w}/${_}`,ge(L))}for(const L of t??[]){const h=N(L),_=F(L).phase??"Unknown";i.sandboxesByPhase[_]=(i.sandboxesByPhase[_]??0)+1;const u=h.networkPolicy??null;!u||(u.egressMode??"Learn")==="Learn"?i.egressLearn+=1:i.egressStrict+=1,(g=h.governance)!=null&&g.enabled&&(i.governanceEnabled+=1);const x=((r=h.runtime)==null?void 0:r.kind)??"Unknown";i.totalRuntime[x]=(i.totalRuntime[x]??0)+1;const k=((l=L.metadata)==null?void 0:l.name)??"",T=((p=L.metadata)==null?void 0:p.namespace)??"",P=`kars-${k}`,B=d.get(`${P}/${k}`)??d.get(`${T}/${k}`)??new Set,D=((m=(v=(b=h.runtime)==null?void 0:b.openclaw)==null?void 0:v.config)==null?void 0:m.channels)??{};for(const E of Object.keys(D))B.add(E);for(const E of B)i.channelCounts[E]=(i.channelCounts[E]??0)+1}return i}function ze(){var w,_;const[t]=ce.useList(),[n]=oe.default.useList(),[i]=z.inferencepolicies.useList(),[d]=z.toolpolicies.useList(),[c]=z.karsmemories.useList(),[s]=z.mcpservers.useList(),[g]=z.a2aagents.useList(),r=Oe(t,n),l=(t==null?void 
0:t.length)??0,p=Object.entries(r.sandboxesByPhase).sort((u,y)=>y[1]-u[1]).map(([u,y])=>({phase:u,count:y})),b=Object.entries(r.totalRuntime).sort((u,y)=>y[1]-u[1]).map(([u,y])=>({kind:u,count:y})),v=Object.entries(r.channelCounts).sort((u,y)=>y[1]-u[1]).map(([u,y])=>({channel:u,count:y})),m=(t??[]).slice().sort((u,y)=>{var T,P;const x=new Date(((T=u.metadata)==null?void 0:T.creationTimestamp)??0).getTime();return new Date(((P=y.metadata)==null?void 0:P.creationTimestamp)??0).getTime()-x}).slice(0,10),L=new Map;for(const u of i??[])L.set(`${((w=u.metadata)==null?void 0:w.namespace)??""}/${((_=u.metadata)==null?void 0:_.name)??""}`,u);const h=u=>{var T,P,B,D,E,G,I,S,W;const y=N(u),x=((D=(B=(P=(T=y.runtime)==null?void 0:T.openclaw)==null?void 0:P.config)==null?void 0:B.agent)==null?void 0:D.model)??((E=y.agent)==null?void 0:E.model);if(x)return C(x);const k=(G=y.inferenceRef)==null?void 0:G.name;if(!k)return"—";for(const Y of[`${((I=u.metadata)==null?void 0:I.namespace)??""}/${k}`,`kars-system/${k}`]){const K=L.get(Y);if(K){const q=(W=(S=N(K).modelPreference)==null?void 0:S.primary)==null?void 0:W.deployment;if(q)return C(q)}}return`(via ${k})`};return e.jsxs(e.Fragment,{children:[e.jsxs(o.SectionBox,{title:"kars — Operator Overview",children:[e.jsxs("div",{style:{display:"grid",gridTemplateColumns:"repeat(auto-fit, minmax(180px, 1fr))",gap:"1rem",padding:"1rem 0"},children:[e.jsx($,{label:"Total Sandboxes",value:l}),e.jsx($,{label:"Ready",value:r.sandboxesByPhase.Ready??0,tone:"success"}),e.jsx($,{label:"Degraded",value:r.sandboxesByPhase.Degraded??0,tone:r.sandboxesByPhase.Degraded?"error":""}),e.jsx($,{label:"Governance ON",value:`${r.governanceEnabled} / ${l}`}),e.jsx($,{label:"Egress: Learn / Strict",value:`${r.egressLearn} / ${r.egressStrict}`})]}),e.jsxs("div",{style:{display:"grid",gridTemplateColumns:"repeat(auto-fit, minmax(160px, 1fr))",gap:"0.5rem",padding:"0 0 1rem 0"},children:[e.jsx($,{label:"Inference Policies",value:(i==null?void 
0:i.length)??"…"}),e.jsx($,{label:"Tool Policies",value:(d==null?void 0:d.length)??"…"}),e.jsx($,{label:"Memories",value:(c==null?void 0:c.length)??"…"}),e.jsx($,{label:"MCP Servers",value:(s==null?void 0:s.length)??"…"}),e.jsx($,{label:"A2A Agents",value:(g==null?void 0:g.length)??"…"})]})]}),e.jsxs("div",{style:{display:"grid",gridTemplateColumns:"1fr 1fr 1fr",gap:"1rem"},children:[e.jsx(o.SectionBox,{title:"Sandboxes by Phase",children:e.jsx(o.SimpleTable,{data:p,columns:[{label:"Phase",getter:u=>V(u.phase)},{label:"Count",getter:u=>u.count}]})}),e.jsx(o.SectionBox,{title:"Runtimes",children:e.jsx(o.SimpleTable,{data:b,columns:[{label:"Kind",getter:u=>u.kind},{label:"Count",getter:u=>u.count}]})}),e.jsx(o.SectionBox,{title:"Channels in Use",children:v.length===0?e.jsx("p",{style:{padding:"1rem"},children:"No channels configured."}):e.jsx(o.SimpleTable,{data:v,columns:[{label:"Channel",getter:u=>u.channel},{label:"Sandboxes",getter:u=>u.count}]})})]}),e.jsx(o.SectionBox,{title:"Recent Sandboxes",children:e.jsx(o.SimpleTable,{data:m,columns:[{label:"Name",getter:u=>{var y,x,k;return e.jsx(o.Link,{routeName:"karssandboxes-detail",params:{namespace:((y=u.metadata)==null?void 0:y.namespace)??"",name:((x=u.metadata)==null?void 0:x.name)??""},children:(k=u.metadata)==null?void 0:k.name})}},{label:"Namespace",getter:u=>{var y;return((y=u.metadata)==null?void 0:y.namespace)??"—"}},{label:"Runtime",getter:u=>{var y;return((y=N(u).runtime)==null?void 0:y.kind)??"—"}},{label:"Model",getter:h},{label:"Phase",getter:u=>V(F(u).phase,Z(u))},{label:"Egress",getter:u=>{const y=N(u).networkPolicy;return!y||(y.egressMode??"Learn")==="Learn"?"Learn":"Strict"}},{label:"Age",getter:u=>{var y;return pe((y=u.metadata)==null?void 0:y.creationTimestamp)}}]})}),e.jsx(Xe,{sandboxes:t??[],inferencePolicies:i??[]})]})}function $(t){const n=t.tone??"",i=n==="error"?"#c62828":n==="warning"?"#ef6c00":n==="success"?"#2e7d32":"inherit";return e.jsxs("div",{style:{padding:"1rem",border:"1px solid 
rgba(127,127,127,0.2)",borderRadius:"6px"},children:[e.jsx("div",{style:{fontSize:"0.85rem",opacity:.7},children:t.label}),e.jsx("div",{style:{fontSize:"1.6rem",fontWeight:600,color:i},children:t.value})]})}function pe(t){if(!t)return"—";const n=Date.now()-new Date(t).getTime(),i=Math.floor(n/1e3);if(i<60)return`${i}s`;const d=Math.floor(i/60);if(d<60)return`${d}m`;const c=Math.floor(d/60);return c<24?`${c}h`:`${Math.floor(c/24)}d`}function Fe({crd:t}){const n=z[t.plural],[i]=n.useList(),[d]=z.inferencepolicies.useList(),c=X.useMemo(()=>{var l,p;const r=new Map;for(const b of d??[])r.set(`${((l=b.metadata)==null?void 0:l.namespace)??""}/${((p=b.metadata)==null?void 0:p.name)??""}`,b);return r},[d]),s=r=>{var m,L,h,w,_,u,y,x,k;const l=N(r),p=((w=(h=(L=(m=l.runtime)==null?void 0:m.openclaw)==null?void 0:L.config)==null?void 0:h.agent)==null?void 0:w.model)??((_=l.agent)==null?void 0:_.model);if(p)return C(p);const b=(u=l.inferenceRef)==null?void 0:u.name;if(!b)return"—";const v=[`${((y=r.metadata)==null?void 0:y.namespace)??""}/${b}`,`kars-system/${b}`];for(const T of v){const P=c.get(T);if(P){const D=(k=(x=N(P).modelPreference)==null?void 0:x.primary)==null?void 0:k.deployment;if(D)return C(D)}}return`(via ${b})`},g=[{label:"Name",getter:r=>{var l,p,b;return e.jsx(o.Link,{routeName:`${t.plural}-detail`,params:{namespace:((l=r.metadata)==null?void 0:l.namespace)??"",name:((p=r.metadata)==null?void 0:p.name)??""},children:(b=r.metadata)==null?void 0:b.name})}},{label:"Namespace",getter:r=>{var l;return((l=r.metadata)==null?void 0:l.namespace)??"—"}}];return t.plural==="karssandboxes"&&g.push({label:"Runtime",getter:r=>{var l;return((l=N(r).runtime)==null?void 0:l.kind)??"—"}},{label:"Model",getter:s},{label:"Egress",getter:r=>{const 
l=N(r).networkPolicy;return!l||(l.egressMode??"Learn")==="Learn"?e.jsx(o.StatusLabel,{status:"warning",children:"Learn"}):e.jsx(o.StatusLabel,{status:"success",children:"Strict"})}}),t.phaseField&&g.push({label:"Phase",getter:r=>V(F(r)[t.phaseField],Z(r))}),g.push({label:"Age",getter:r=>{var l;return pe((l=r.metadata)==null?void 0:l.creationTimestamp)}}),e.jsx(o.SectionBox,{title:`kars — ${t.label}`,children:i===null?e.jsx("p",{style:{padding:"1rem"},children:"Loading…"}):i.length===0?e.jsxs("p",{style:{padding:"1rem"},children:["No ",t.label.toLowerCase()," found. Create one with the kars CLI or by applying a CRD manifest."]}):e.jsx(o.SimpleTable,{data:i,columns:g})})}function je({crd:t}){var p,b;const n=Pe(new RegExp(`/kars/${t.plural}/([^/]+)/([^/]+)`)),i=(n==null?void 0:n[1])??"",d=(n==null?void 0:n[2])??"",c=z[t.plural],[s,g]=c.useGet(d,i);if(g)return e.jsx(o.SectionBox,{title:`${t.kind}: ${d}`,children:e.jsxs("p",{children:["Error: ",g.message]})});if(!s)return e.jsx(o.SectionBox,{title:"Loading…",children:"Loading…"});const r=F(s),l=r.conditions??[];return e.jsxs(e.Fragment,{children:[e.jsx(o.SectionBox,{title:`${t.kind}: ${d}`,children:e.jsx(o.SimpleTable,{data:[{k:"Namespace",v:i},{k:"Phase",v:V(r.phase,Z(s))},{k:"Created",v:((p=s.metadata)==null?void 0:p.creationTimestamp)??"—"},{k:"UID",v:((b=s.metadata)==null?void 
0:b.uid)??"—"}],columns:[{label:"Field",getter:v=>v.k},{label:"Value",getter:v=>v.v}]})}),t.plural==="karssandboxes"&&e.jsx(Ke,{item:s}),t.plural==="inferencepolicies"&&e.jsx(Ve,{policyName:s.metadata.name}),t.plural==="toolpolicies"&&e.jsx(Ye,{policyName:s.metadata.name}),t.plural==="trustgraphs"&&e.jsx(Je,{}),e.jsx(Ee,{item:s}),e.jsx(Ne,{crd:t,item:s}),e.jsx(De,{crd:t,item:s}),e.jsx(o.SectionBox,{title:"Spec",children:e.jsx("pre",{style:{maxHeight:"400px",overflow:"auto"},children:JSON.stringify(N(s),null,2)})}),e.jsx(o.SectionBox,{title:"Status",children:e.jsx("pre",{style:{maxHeight:"400px",overflow:"auto"},children:JSON.stringify(r,null,2)})}),l.length>0&&e.jsx(o.SectionBox,{title:"Conditions",children:e.jsx(o.SimpleTable,{data:l,columns:[{label:"Type",getter:v=>v.type},{label:"Status",getter:v=>e.jsx(o.StatusLabel,{status:v.status==="True"?"success":"error",children:v.status})},{label:"Reason",getter:v=>v.reason??"—"},{label:"Message",getter:v=>v.message??"—"}]})})]})}function Ge({sandboxName:t,sandboxNamespace:n}){const[i]=z.egressapprovals.useList();if(!i)return null;const d=i.filter(s=>{var l;const g=((l=s.metadata)==null?void 0:l.namespace)??"",r=N(s);return g===n&&r.sandbox===t});if(d.length===0)return null;const c=d.map(s=>{var b;const g=N(s),r=F(s),l=Array.isArray(g.hosts)?g.hosts:[],p=l.slice(0,3).map(v=>v.port?`${v.host}:${v.port}`:v.host).join(", ")+(l.length>3?`, +${l.length-3}`:"");return{name:((b=s.metadata)==null?void 0:b.name)??"—",phase:r.phase,hosts:p||"—",reason:g.reason??"—",ttl:g.ttl??"—",expiresAt:r.expiresAt,digest:r.mergedDigest}});return e.jsxs(o.SectionBox,{title:"Egress Approvals (ephemeral 
grants)",children:[e.jsx(o.SimpleTable,{data:c,columns:[{label:"Name",getter:s=>e.jsx(o.Link,{routeName:"egressapprovals-detail",params:{namespace:n,name:s.name},children:s.name})},{label:"Phase",getter:s=>V(s.phase)},{label:"Hosts",getter:s=>s.hosts},{label:"TTL",getter:s=>s.ttl},{label:"Expires",getter:s=>s.expiresAt??"—"},{label:"Reason",getter:s=>s.reason},{label:"Merged digest",getter:s=>R(s.digest)}]}),e.jsxs("p",{style:{padding:"0.5rem",fontSize:"0.85rem",opacity:.75},children:["Grants unioned with the baseline allowlist on the data plane. ",e.jsx("code",{children:"Active"})," ","means the router has echoed the merged digest. Grants auto-expire at"," ",e.jsx("code",{children:"status.expiresAt"}),"; revoke early with ",e.jsx("code",{children:"kars egress revoke"}),"."]})]})}function Ie({refs:t}){const[n]=z.mcpservers.useList();if(t.length===0)return null;const i=new Map;(n??[]).forEach(c=>{var g;const s=(g=c.metadata)==null?void 0:g.name;s&&i.set(s,c)});const d=t.map(c=>{const s=c.name?i.get(c.name):void 0,g=s?F(s):{},r=s?N(s):{},l=Array.isArray(r.tools)?r.tools.length:g.toolCount??0;return{name:c.name??"—",phase:g.phase,reason:s?Z(s):void 0,digest:g.jwksDigest??g.bundleDigest,tools:l,missing:!s}});return e.jsx(o.SectionBox,{title:`MCP Servers (${d.length})`,children:e.jsx(o.SimpleTable,{data:d,columns:[{label:"Name",getter:c=>c.missing?e.jsxs("span",{children:[c.name," ",e.jsx(o.StatusLabel,{status:"error",children:"MISSING"})]}):e.jsx(o.Link,{routeName:"mcpservers-detail",params:{namespace:"kars-system",name:c.name},children:c.name})},{label:"Phase",getter:c=>V(c.phase,c.reason)},{label:"Tools",getter:c=>c.tools},{label:"JWKS digest",getter:c=>R(c.digest)}]})})}function Ke({item:t}){var y,x,k,T,P,B,D,E,G,I;const n=N(t),i=F(t),d=((y=t.metadata)==null?void 0:y.namespace)??"",c=((x=t.metadata)==null?void 
0:x.name)??"",s=`kars-${c}`,[g]=oe.default.useGet(`${c}-credentials`,s),r=n.networkPolicy??null,l=r??{},p=!r||(l.egressMode??"Learn")==="Learn",b=Array.isArray(l.allowedEndpoints)?l.allowedEndpoints:[],v=new Set(ge(g??void 0)),m=((P=(T=(k=n.runtime)==null?void 0:k.openclaw)==null?void 0:T.config)==null?void 0:P.channels)??{};for(const S of Object.keys(m))v.add(S);const L=Array.from(v).map(S=>{var W,Y;return{channel:S,enabled:((W=m[S])==null?void 0:W.enabled)!==!1,source:g&&Object.keys(((Y=g.jsonData)==null?void 0:Y.data)??{}).some(K=>ue.some(([Q,q])=>Q===S&&q.test(K)))?"Secret":"Spec"}}),h=(B=n.inferenceRef)==null?void 0:B.name,w=(E=(D=n.governance)==null?void 0:D.toolPolicyRef)==null?void 0:E.name,_=(G=n.memoryRef)==null?void 0:G.name,u=Array.isArray(n.mcpServerRefs)?n.mcpServerRefs:[];return e.jsxs(e.Fragment,{children:[e.jsxs(o.SectionBox,{title:"Network Policy (Egress)",children:[e.jsx(o.SimpleTable,{data:[{k:"Default Deny",v:String(l.defaultDeny??!1)},{k:"Learn Mode",v:p?e.jsx(o.StatusLabel,{status:"warning",children:"LEARN"}):e.jsx(o.StatusLabel,{status:"success",children:"STRICT"})},{k:"Allowed Endpoints",v:`${b.length}`}],columns:[{label:"Field",getter:S=>S.k},{label:"Value",getter:S=>S.v}]}),b.length>0&&e.jsxs("div",{style:{marginTop:"1rem"},children:[e.jsx("h4",{children:"Allowed Endpoints"}),e.jsx(o.SimpleTable,{data:b,columns:[{label:"Host",getter:S=>S.host??"—"},{label:"Port",getter:S=>S.port??"—"}]})]})]}),e.jsx(o.SectionBox,{title:"Channels & Integrations",children:L.length===0?e.jsxs("p",{style:{padding:"0.5rem"},children:["No channels configured for namespace ",e.jsx("code",{children:s}),". 
Use"," ",e.jsx("code",{children:"kars credentials set telegram-token …"})," +"," ",e.jsx("code",{children:"--channels telegram"}),"."]}):e.jsx(o.SimpleTable,{data:L,columns:[{label:"Channel",getter:S=>S.channel},{label:"Status",getter:S=>S.enabled?e.jsx(o.StatusLabel,{status:"success",children:"ENABLED"}):e.jsx(o.StatusLabel,{status:"warning",children:"DISABLED"})},{label:"Source",getter:S=>S.source}]})}),e.jsx(o.SectionBox,{title:"Related Resources",children:e.jsx(o.SimpleTable,{data:[...h?[{kind:"InferencePolicy",name:h,route:"inferencepolicies-detail"}]:[],...w?[{kind:"ToolPolicy",name:w,route:"toolpolicies-detail"}]:[],..._?[{kind:"KarsMemory",name:_,route:"karsmemories-detail"}]:[],...u.map(S=>({kind:"McpServer",name:S.name??"",route:"mcpservers-detail"}))],columns:[{label:"Kind",getter:S=>S.kind},{label:"Name",getter:S=>S.name?e.jsx(o.Link,{routeName:S.route,params:{namespace:"kars-system",name:S.name},children:S.name}):"—"}]})}),i.mesh&&e.jsx(o.SectionBox,{title:"Mesh (AGT)",children:e.jsx(o.SimpleTable,{data:[{k:"Agent DID",v:i.mesh.did??"—"},{k:"Registered",v:i.mesh.registered?e.jsx(o.StatusLabel,{status:"success",children:"YES"}):e.jsx(o.StatusLabel,{status:"error",children:"NO"})},{k:"Trust Score",v:i.mesh.trustScore??"—"},{k:"Last Heartbeat",v:i.mesh.lastHeartbeat??"—"}],columns:[{label:"Field",getter:S=>S.k},{label:"Value",getter:S=>S.v}]})}),e.jsx(Ie,{refs:u}),e.jsx(Ge,{sandboxName:c,sandboxNamespace:d}),e.jsx(o.SectionBox,{title:"Pod & Workspace",children:e.jsx(o.SimpleTable,{data:[{k:"CR Namespace",v:e.jsx(o.Link,{routeName:"namespace",params:{name:d},children:d})},{k:"Sandbox Namespace",v:e.jsx(o.Link,{routeName:"namespace",params:{name:s},children:s})},{k:"Pods",v:e.jsxs(o.Link,{routeName:"pods",params:{namespace:s},children:["View pods in ",s]})},{k:"Deployment",v:e.jsxs(o.Link,{routeName:"deployments",params:{namespace:s},children:["View deployments in ",s]})},{k:"Secrets",v:e.jsxs(o.Link,{routeName:"secrets",params:{namespace:s},children:["View 
secrets in ",s]})}],columns:[{label:"Field",getter:S=>S.k},{label:"Value",getter:S=>S.v}]})}),e.jsx(Qe,{sandboxName:c,inferenceRefName:(I=n.inferenceRef)==null?void 0:I.name}),e.jsx(We,{sandboxName:c})]})}function We({sandboxName:t}){const i=U.useTheme().palette.mode==="dark"?"dark":"light",c=`${typeof window<"u"&&window.KARS_GRAFANA_URL||"http://127.0.0.1:3000"}/d/kars-ops?kiosk=tv&refresh=10s&theme=${i}&var-sandbox=${encodeURIComponent(t)}`;return e.jsxs(o.SectionBox,{title:`Metrics (Grafana) — ${t}`,children:[e.jsx("div",{style:{marginBottom:8},children:e.jsx("a",{href:c,target:"_blank",rel:"noopener noreferrer",children:"Open full dashboard in Grafana ↗"})}),e.jsx("iframe",{src:c,title:`Grafana metrics for ${t}`,style:{width:"100%",height:"720px",border:"0"},loading:"lazy"})]})}async function M(t,n){var s;const i=`${t}/api/v1/query?query=${encodeURIComponent(n)}`,d=await fetch(i);if(!d.ok)throw new Error(`prom ${d.status}`);const c=await d.json();return(((s=c==null?void 0:c.data)==null?void 0:s.result)||[]).map(g=>{var r;return{metric:g.metric||{},value:Number(((r=g.value)==null?void 0:r[1])||0)}})}function Ue(){return typeof window<"u"&&window.KARS_PROMETHEUS_URL||"http://127.0.0.1:19091"}function H(t,n,i=5e3){const d=Ue(),[c,s]=X.useState(t),[g,r]=X.useState(""),[l,p]=X.useState(0);return X.useEffect(()=>{let b=!1;n(d).then(m=>{b||(s(m),r(""))}).catch(m=>{b||r(String(m))});const v=setInterval(()=>p(m=>m+1),i);return()=>{b=!0,clearInterval(v)}},[d,l]),{data:c,err:g}}function He(){const n=U.useTheme().palette.mode==="dark",i=n?"#1e1e1e":"#fafafa",d=n?"#aaa":"#555",c=n?"#cfd8dc":"#37474f",s="#fff",[g]=ce.useList(),{data:r,err:l}=H({peers:[],sentLife:[],recvLife:[],sentRate:[],recvRate:[],relayConn:0,relayRouted:0,relayStored:0,relayDelivered:0,relayMsgsPerSec:0},async a=>{var ye,ve,Se,ke,xe;const[f,A,J,ae,le,ne,Ze,Ce,Re,et]=await Promise.all([M(a,"kars_agt_known_agents"),M(a,"kars_mesh_messages_sent_total"),M(a,"kars_mesh_messages_received_total"),M(a,"sum by 
(sandbox) (increase(kars_mesh_messages_sent_total[5m]))"),M(a,"sum by (sandbox) (increase(kars_mesh_messages_received_total[5m]))"),M(a,"sum(agentmesh_relay_connected_agents)"),M(a,"sum(agentmesh_relay_messages_routed_total)"),M(a,"sum(agentmesh_relay_messages_stored_total)"),M(a,"sum(agentmesh_relay_messages_delivered_total)"),M(a,"sum(rate(agentmesh_relay_messages_routed_total[5m]))")]);return{peers:f,sentLife:A,recvLife:J,sentRate:ae,recvRate:le,relayConn:((ye=ne[0])==null?void 0:ye.value)||0,relayRouted:((ve=Ze[0])==null?void 0:ve.value)||0,relayStored:((Se=Ce[0])==null?void 0:Se.value)||0,relayDelivered:((ke=Re[0])==null?void 0:ke.value)||0,relayMsgsPerSec:((xe=et[0])==null?void 0:xe.value)||0}}),p=Object.fromEntries(r.peers.map(a=>[a.metric.sandbox||"",a.value])),b=Object.fromEntries(r.sentLife.map(a=>[a.metric.sandbox||"",a.value])),v=Object.fromEntries(r.recvLife.map(a=>[a.metric.sandbox||"",a.value])),m=Object.fromEntries(r.sentRate.map(a=>[a.metric.sandbox||"",a.value])),L=Object.fromEntries(r.recvRate.map(a=>[a.metric.sandbox||"",a.value])),h=(g||[]).map(a=>{const f=a.metadata.name,A=(a.metadata.labels||{})["kars.azure.com/parent"]||"";return{name:f,parent:A,knownPeers:p[f]||0,meshSent:m[f]||0,meshRecv:L[f]||0,meshSentLife:b[f]||0,meshRecvLife:v[f]||0}}),w=h.filter(a=>!a.parent).sort((a,f)=>a.name.localeCompare(f.name)),_={};for(const a of h)a.parent&&(_[a.parent]=_[a.parent]||[],_[a.parent].push(a));const u=1100,y=Math.max(220,u/Math.max(1,w.length)),x=u/2,k=70,T=220,P=400,B=36,D=50,E={};w.forEach((a,f)=>{const A=y*(f+.5)+(u-y*w.length)/2;E[a.name]={x:A,y:T,n:a}});const G={};for(const a of w){const f=_[a.name]||[],A=E[a.name].x,J=130;f.forEach((ae,le)=>{const ne=(le-(f.length-1)/2)*J;G[ae.name]={x:A+ne,y:P,n:ae,parent:a.name}})}const I=h.filter(a=>a.parent&&!E[a.parent]),S=a=>a.meshSent+a.meshRecv,W=Math.max(.001,...h.map(S)),Y=Math.max(1,...h.map(a=>a.meshSentLife+a.meshRecvLife)),K=I.length>0?600:520;function Q(a){const f=S(a);return 
f>5?"#43a047":f>.5?"#9ccc65":f>0?"#ffd54f":a.knownPeers>0?"#90caf9":n?"#555":"#bdbdbd"}function q(a){return B+Math.min(14,(a.meshSentLife+a.meshRecvLife)/Y*14)}function fe(a){return 1+a/W*5}function be(a){return .3+a/W*.7}function te(a){return a>0?Math.max(.6,3-a/W*2.4):0}return e.jsxs(o.SectionBox,{title:"🕸️ Mesh Topology (live)",children:[e.jsxs("div",{style:{marginBottom:12,fontSize:13,color:d},children:["Tree view of the AGT mesh: AGT Relay (top), controllers (mid row), sub-agents (bottom row). Polled from Prometheus every 5s. Edge thickness & pulse speed ∝ mesh messages in/out (5m). Node size ∝ lifetime mesh-message volume. ",e.jsx("b",{children:"children"})," = sub-agent CRs labeled ",e.jsx("code",{children:"kars.azure.com/parent="}),"; ",e.jsx("b",{children:"trust"})," = peers in this router's local AGT trust graph (only populated after live traffic; resets on pod restart).",l&&e.jsxs("div",{style:{color:"#ef5350",marginTop:6},children:["Prometheus unreachable: ",l," (configure window.KARS_PROMETHEUS_URL)"]})]}),e.jsxs("div",{style:{display:"flex",gap:16,marginBottom:12,flexWrap:"wrap"},children:[e.jsxs(o.StatusLabel,{status:"",children:["🔗 Relay connected: ",e.jsx("b",{children:r.relayConn})]}),e.jsxs(o.StatusLabel,{status:"",children:["📨 Relay msg/s (5m): ",e.jsx("b",{children:r.relayMsgsPerSec.toFixed(2)})]}),e.jsxs(o.StatusLabel,{status:"",children:["📬 Routed total: ",e.jsx("b",{children:Math.round(r.relayRouted).toLocaleString()})]}),e.jsxs(o.StatusLabel,{status:"",children:["📦 Stored (offline): ",e.jsx("b",{children:Math.round(r.relayStored).toLocaleString()})]}),e.jsxs(o.StatusLabel,{status:"",children:["✉️ Delivered (after reconnect): ",e.jsx("b",{children:Math.round(r.relayDelivered).toLocaleString()})]}),e.jsxs(o.StatusLabel,{status:"",children:["🤖 Sandboxes: ",e.jsx("b",{children:h.length})]}),e.jsxs(o.StatusLabel,{status:"",children:["👨‍👩‍👧 Controllers: ",e.jsx("b",{children:w.length})]}),e.jsxs(o.StatusLabel,{status:"",children:["🧒 Sub-agents: 
",e.jsx("b",{children:Object.keys(G).length})]})]}),e.jsxs("svg",{viewBox:`0 0 ${u} ${K}`,style:{width:"100%",maxWidth:u,background:i,borderRadius:8},children:[e.jsxs("defs",{children:[e.jsxs("radialGradient",{id:"relayGrad",cx:"50%",cy:"50%",r:"50%",children:[e.jsx("stop",{offset:"0%",stopColor:"#fff59d"}),e.jsx("stop",{offset:"100%",stopColor:"#fbc02d"})]}),e.jsxs("filter",{id:"glow",x:"-50%",y:"-50%",width:"200%",height:"200%",children:[e.jsx("feGaussianBlur",{stdDeviation:"3",result:"blur"}),e.jsxs("feMerge",{children:[e.jsx("feMergeNode",{in:"blur"}),e.jsx("feMergeNode",{in:"SourceGraphic"})]})]})]}),w.map(a=>{const f=E[a.name],A=S(a);return e.jsxs("g",{children:[e.jsx("line",{x1:x,y1:k,x2:f.x,y2:f.y,stroke:"#42a5f5",strokeWidth:fe(A),strokeOpacity:be(A)}),a.meshRecv>0&&e.jsx("circle",{r:"4",fill:"#81d4fa",filter:"url(#glow)",children:e.jsx("animateMotion",{dur:`${te(a.meshRecv)}s`,repeatCount:"indefinite",path:`M${x},${k} L${f.x},${f.y}`})}),a.meshSent>0&&e.jsx("circle",{r:"4",fill:"#ffeb3b",filter:"url(#glow)",children:e.jsx("animateMotion",{dur:`${te(a.meshSent)}s`,repeatCount:"indefinite",path:`M${f.x},${f.y} L${x},${k}`})}),e.jsxs("text",{x:(x+f.x)/2,y:(k+f.y)/2-4,textAnchor:"middle",fontSize:"10",fill:d,style:{pointerEvents:"none"},children:["↑",Math.round(a.meshSent*60/5)||0," ↓",Math.round(a.meshRecv*60/5)||0," /min"]})]},`r-${a.name}`)}),Object.values(G).map(a=>{const f=E[a.parent];if(!f)return null;const A=S(a.n);return e.jsxs("g",{children:[e.jsx("line",{x1:f.x,y1:f.y,x2:a.x,y2:a.y,stroke:"#7e57c2",strokeWidth:fe(A),strokeOpacity:be(A),strokeDasharray:"6,4"}),te(A)>0&&e.jsx("circle",{r:"3",fill:"#ce93d8",filter:"url(#glow)",children:e.jsx("animateMotion",{dur:`${te(A)}s`,repeatCount:"indefinite",path:`M${f.x},${f.y} 
L${a.x},${a.y}`})})]},`pc-${a.n.name}`)}),e.jsxs("g",{children:[e.jsx("circle",{cx:x,cy:k,r:D,fill:"url(#relayGrad)",stroke:"#f57f17",strokeWidth:"3",filter:"url(#glow)"}),e.jsx("text",{x,y:k-8,textAnchor:"middle",fontSize:"13",fontWeight:"bold",fill:"#212121",children:"AGT Relay"}),e.jsxs("text",{x,y:k+6,textAnchor:"middle",fontSize:"10",fill:"#212121",children:[r.relayConn," connected"]}),e.jsxs("text",{x,y:k+20,textAnchor:"middle",fontSize:"10",fill:"#212121",children:[r.relayMsgsPerSec.toFixed(2)," msg/s"]}),e.jsxs("text",{x,y:k+34,textAnchor:"middle",fontSize:"9",fill:"#212121",children:[Math.round(r.relayRouted).toLocaleString()," routed"]})]}),w.map(a=>{const f=E[a.name],A=q(a),J=(_[a.name]||[]).length;return e.jsxs("g",{children:[e.jsx("circle",{cx:f.x,cy:f.y,r:A,fill:Q(a),stroke:c,strokeWidth:"2.5"}),e.jsx("text",{x:f.x,y:f.y-8,textAnchor:"middle",fontSize:"13",fontWeight:"bold",fill:s,children:a.name}),e.jsx("text",{x:f.x,y:f.y+4,textAnchor:"middle",fontSize:"9",fill:s,children:"controller"}),e.jsxs("text",{x:f.x,y:f.y+18,textAnchor:"middle",fontSize:"10",fill:s,children:["↑",Math.round(a.meshSentLife).toLocaleString()," ↓",Math.round(a.meshRecvLife).toLocaleString()]}),e.jsxs("text",{x:f.x,y:f.y+30,textAnchor:"middle",fontSize:"9",fill:s,children:[J," child",J===1?"":"ren"," · ",a.knownPeers," trust"]})]},`c-${a.name}`)}),Object.values(G).map(a=>{const f=a.n,A=q(f)-6;return e.jsxs("g",{children:[e.jsx("circle",{cx:a.x,cy:a.y,r:A,fill:Q(f),stroke:c,strokeWidth:"1.5"}),e.jsx("text",{x:a.x,y:a.y-6,textAnchor:"middle",fontSize:"11",fontWeight:"bold",fill:s,children:f.name}),e.jsx("text",{x:a.x,y:a.y+6,textAnchor:"middle",fontSize:"9",fill:s,children:"sub-agent"}),e.jsxs("text",{x:a.x,y:a.y+20,textAnchor:"middle",fontSize:"10",fill:s,children:["↑",Math.round(f.meshSentLife).toLocaleString()," 
↓",Math.round(f.meshRecvLife).toLocaleString()]})]},`s-${f.name}`)}),I.length>0&&e.jsxs("g",{children:[e.jsx("text",{x:u/2,y:K-80,textAnchor:"middle",fontSize:"11",fill:d,children:"— Orphan sub-agents (parent CR not found) —"}),I.map((a,f)=>{const A=u/(I.length+1)*(f+1);return e.jsxs("g",{children:[e.jsx("circle",{cx:A,cy:K-40,r:B-8,fill:n?"#616161":"#9e9e9e",stroke:n?"#9e9e9e":"#616161",strokeWidth:"1.5",strokeDasharray:"3,3"}),e.jsx("text",{x:A,y:K-44,textAnchor:"middle",fontSize:"11",fontWeight:"bold",fill:s,children:a.name}),e.jsxs("text",{x:A,y:K-30,textAnchor:"middle",fontSize:"9",fill:s,children:["parent:",a.parent]})]},`o-${a.name}`)})]})]}),e.jsx("div",{style:{marginTop:12},children:e.jsx(o.SimpleTable,{data:h.map(a=>({name:a.name,kind:a.parent?`sub-agent ← ${a.parent}`:"controller",peers:a.knownPeers,sent5m:Math.round(a.meshSent),recv5m:Math.round(a.meshRecv),sentLife:Math.round(a.meshSentLife),recvLife:Math.round(a.meshRecvLife)})).sort((a,f)=>f.sent5m+f.recv5m-(a.sent5m+a.recv5m)),columns:[{label:"Sandbox",getter:a=>a.name},{label:"Role",getter:a=>a.kind},{label:"Peers",getter:a=>a.peers},{label:"↑ Sent (5m)",getter:a=>a.sent5m},{label:"↓ Recv (5m)",getter:a=>a.recv5m},{label:"↑ Sent (life)",getter:a=>a.sentLife.toLocaleString()},{label:"↓ Recv (life)",getter:a=>a.recvLife.toLocaleString()}]})})]})}function qe(){return typeof window<"u"&&window.KARS_GRAFANA_URL||"http://127.0.0.1:3000"}function Ve({policyName:t}){const n=U.useTheme(),i=n.palette.mode==="dark"?"dark":"light",d=n.palette.text.secondary,{data:c,err:s}=H({byModel:[],bySandbox:[],reqRate:[],latency:0},async p=>{var h;const[b,v,m,L]=await Promise.all([M(p,"sum by (model, direction) (increase(kars_tokens_total[1h]))"),M(p,"sum by (sandbox) (increase(kars_tokens_total[1h]))"),M(p,"sum by (model, status) (rate(kars_inference_requests_total[5m]))"),M(p,"histogram_quantile(0.95, sum by (le) 
(rate(kars_inference_latency_seconds_bucket[5m])))")]);return{byModel:b,bySandbox:v,reqRate:m,latency:((h=L[0])==null?void 0:h.value)||0}}),g=`${qe()}/d/kars-ops?kiosk=tv&refresh=10s&theme=${i}`,r=c.byModel.map(p=>({model:p.metric.model||"?",direction:p.metric.direction||"?",tokens:Math.round(p.value).toLocaleString()})).sort((p,b)=>Number(b.tokens.replace(/,/g,""))-Number(p.tokens.replace(/,/g,""))),l=c.bySandbox.map(p=>({sandbox:p.metric.sandbox||"?",tokens:Math.round(p.value).toLocaleString()})).sort((p,b)=>Number(b.tokens.replace(/,/g,""))-Number(p.tokens.replace(/,/g,"")));return e.jsxs(o.SectionBox,{title:`📊 Inference Metrics (policy: ${t})`,children:[e.jsxs("div",{style:{marginBottom:8,fontSize:13,color:d},children:["Live aggregates across all sandboxes routed through this policy class. ",s&&e.jsx("span",{style:{color:"#ef5350"},children:s})]}),e.jsxs("div",{style:{display:"flex",gap:12,marginBottom:12,flexWrap:"wrap"},children:[e.jsxs(o.StatusLabel,{status:"",children:["⏱ p95 latency (5m): ",e.jsxs("b",{children:[(c.latency*1e3).toFixed(0)," ms"]})]}),e.jsxs(o.StatusLabel,{status:"",children:["🧮 Models active: ",e.jsx("b",{children:new Set(c.byModel.map(p=>p.metric.model)).size})]}),e.jsxs(o.StatusLabel,{status:"",children:["🤖 Sandboxes consuming: ",e.jsx("b",{children:l.length})]})]}),e.jsxs("div",{style:{display:"grid",gridTemplateColumns:"1fr 1fr",gap:16},children:[e.jsxs("div",{children:[e.jsx("h4",{style:{margin:"4px 0"},children:"Tokens by model (1h)"}),e.jsx(o.SimpleTable,{data:r,columns:[{label:"Model",getter:p=>p.model},{label:"Dir",getter:p=>p.direction},{label:"Tokens",getter:p=>p.tokens}]})]}),e.jsxs("div",{children:[e.jsx("h4",{style:{margin:"4px 0"},children:"Top consumers (1h)"}),e.jsx(o.SimpleTable,{data:l.slice(0,10),columns:[{label:"Sandbox",getter:p=>p.sandbox},{label:"Tokens",getter:p=>p.tokens}]})]})]}),e.jsx("div",{style:{marginTop:12},children:e.jsx("a",{href:g,target:"_blank",rel:"noopener noreferrer",children:"Open full Grafana 
dashboard ↗"})})]})}function Ye({policyName:t}){const i=U.useTheme().palette.text.secondary,{data:d,err:c}=H({decisions:[],bySandbox:[],latencyP95:0},async l=>{var m;const[p,b,v]=await Promise.all([M(l,"sum by (decision) (increase(kars_agt_policy_evaluations_total[1h]))"),M(l,"sum by (sandbox, decision) (increase(kars_agt_policy_evaluations_total[1h]))"),M(l,"histogram_quantile(0.95, sum by (le) (rate(kars_agt_eval_latency_seconds_bucket[5m])))")]);return{decisions:p,bySandbox:b,latencyP95:((m=v[0])==null?void 0:m.value)||0}}),s=d.decisions.reduce((l,p)=>l+p.value,0)||1,g=d.decisions.map(l=>({decision:l.metric.decision||"?",count:Math.round(l.value).toLocaleString(),pct:(l.value/s*100).toFixed(1)+"%"})),r=d.bySandbox.map(l=>({sandbox:l.metric.sandbox||"?",decision:l.metric.decision||"?",count:Math.round(l.value).toLocaleString()})).sort((l,p)=>Number(p.count.replace(/,/g,""))-Number(l.count.replace(/,/g,"")));return e.jsxs(o.SectionBox,{title:`🛡️ Policy Evaluations (policy: ${t})`,children:[e.jsxs("div",{style:{marginBottom:8,fontSize:13,color:i},children:["AGT policy evaluation counters scoped to all sandboxes referencing this policy. 
",c&&e.jsx("span",{style:{color:"#ef5350"},children:c})]}),e.jsxs("div",{style:{display:"flex",gap:12,marginBottom:12,flexWrap:"wrap"},children:[e.jsxs(o.StatusLabel,{status:"",children:["⏱ p95 eval latency (5m): ",e.jsxs("b",{children:[(d.latencyP95*1e6).toFixed(0)," µs"]})]}),e.jsxs(o.StatusLabel,{status:"",children:["📊 Total evals (1h): ",e.jsx("b",{children:Math.round(s).toLocaleString()})]})]}),e.jsxs("div",{style:{display:"grid",gridTemplateColumns:"1fr 2fr",gap:16},children:[e.jsxs("div",{children:[e.jsx("h4",{style:{margin:"4px 0"},children:"Decision mix (1h)"}),e.jsx(o.SimpleTable,{data:g,columns:[{label:"Decision",getter:l=>l.decision},{label:"Count",getter:l=>l.count},{label:"Share",getter:l=>l.pct}]})]}),e.jsxs("div",{children:[e.jsx("h4",{style:{margin:"4px 0"},children:"Top deniers/allowers (1h)"}),e.jsx(o.SimpleTable,{data:r.slice(0,15),columns:[{label:"Sandbox",getter:l=>l.sandbox},{label:"Decision",getter:l=>l.decision},{label:"Count",getter:l=>l.count}]})]})]})]})}function Je(){const n=U.useTheme().palette.text.secondary,{data:i,err:d}=H({peers:[],auditEntries:[],bundleHealth:[]},async r=>{const[l,p,b]=await Promise.all([M(r,"kars_agt_known_agents"),M(r,"kars_agt_audit_entries_total"),M(r,"kars_policy_bundle_healthy")]);return{peers:l,auditEntries:p,bundleHealth:b}}),c=i.peers.map(r=>({sandbox:r.metric.sandbox||"?",knownPeers:r.value})).sort((r,l)=>l.knownPeers-r.knownPeers),s=i.peers.reduce((r,l)=>r+l.value,0),g=i.auditEntries.reduce((r,l)=>r+l.value,0);return e.jsxs(o.SectionBox,{title:"🔐 Trust Graph Metrics",children:[e.jsxs("div",{style:{marginBottom:8,fontSize:13,color:n},children:["AGT trust graph: peers known per sandbox + tamper-evident audit log size. 
",d&&e.jsx("span",{style:{color:"#ef5350"},children:d})]}),e.jsxs("div",{style:{display:"flex",gap:12,marginBottom:12,flexWrap:"wrap"},children:[e.jsxs(o.StatusLabel,{status:"",children:["🤝 Total known peers: ",e.jsx("b",{children:s})]}),e.jsxs(o.StatusLabel,{status:"",children:["📜 Audit entries: ",e.jsx("b",{children:Math.round(g).toLocaleString()})]}),e.jsxs(o.StatusLabel,{status:"",children:["📦 Healthy bundles: ",e.jsxs("b",{children:[i.bundleHealth.filter(r=>r.value>0).length,"/",i.bundleHealth.length]})]})]}),e.jsx(o.SimpleTable,{data:c,columns:[{label:"Sandbox",getter:r=>r.sandbox},{label:"Known peers",getter:r=>r.knownPeers}]})]})}function ee(t){return t>=90?"error":t>=70?"warning":t>0?"success":""}function j(t){return t>=1e9?(t/1e9).toFixed(2)+"B":t>=1e6?(t/1e6).toFixed(2)+"M":t>=1e3?(t/1e3).toFixed(1)+"K":Math.round(t).toLocaleString()}function se({used:t,total:n,height:i=14}){const c=U.useTheme().palette.mode==="dark",s=c?"#333":"#eee",g=c?"#eee":"#333",r=n>0?Math.min(100,t/n*100):0,l=r>=90?"#c62828":r>=70?"#ef6c00":"#2e7d32";return e.jsxs("div",{style:{background:s,borderRadius:4,height:i,overflow:"hidden",position:"relative"},children:[e.jsx("div",{style:{background:l,height:"100%",width:`${r}%`,transition:"width .3s ease"}}),e.jsxs("div",{style:{position:"absolute",inset:0,display:"flex",alignItems:"center",justifyContent:"center",fontSize:11,fontWeight:600,color:r>50?"#fff":g},children:[r.toFixed(1),"%"]})]})}function Xe({sandboxes:t,inferencePolicies:n}){const d=U.useTheme().palette.text.secondary,{data:c,err:s}=H([],async h=>M(h,"sum by (sandbox) (increase(kars_tokens_total[24h]))"),1e4),g={};for(const h of c)g[h.metric.sandbox||"?"]=h.value;const r={};for(const h of n)r[h.metadata.name]=h;const l=t.map(h=>{var k,T,P,B,D;const _=((T=(((k=h.jsonData)==null?void 0:k.spec)||h.spec||{}).inferenceRef)==null?void 0:T.name)||"",u=r[_],y=((D=(B=((P=u==null?void 0:u.jsonData)==null?void 0:P.spec)||(u==null?void 0:u.spec)||{})==null?void 
0:B.tokenBudget)==null?void 0:D.dailyTokens)||0,x=g[h.metadata.name]||0;return{name:h.metadata.name,policy:_||"—",budget:y,used:x,pct:y>0?x/y*100:0}}),p=l.reduce((h,w)=>h+w.budget,0),b=l.reduce((h,w)=>h+w.used,0),v=p>0?b/p*100:0,m=l.filter(h=>h.pct>=70).length,L=l.filter(h=>h.pct>=100).length;return e.jsxs(o.SectionBox,{title:"💰 Token Budget (24h)",children:[e.jsxs("div",{style:{marginBottom:12,fontSize:13,color:d},children:["Aggregate daily budget across all InferencePolicy CRs vs. actual consumption pulled from Prometheus. ",s&&e.jsx("span",{style:{color:"#ef5350"},children:s})]}),e.jsxs("div",{style:{display:"grid",gridTemplateColumns:"repeat(auto-fit, minmax(220px, 1fr))",gap:"1rem",marginBottom:16},children:[e.jsx($,{label:"Fleet budget (24h)",value:j(p)}),e.jsx($,{label:"Fleet consumed (24h)",value:j(b),tone:ee(v)}),e.jsx($,{label:"Fleet utilization",value:`${v.toFixed(1)}%`,tone:ee(v)}),e.jsx($,{label:"Sandboxes ≥70% used",value:m,tone:m>0?"warning":""}),e.jsx($,{label:"Sandboxes over budget",value:L,tone:L>0?"error":""})]}),e.jsx("div",{style:{marginBottom:8,fontSize:13,fontWeight:600},children:"Fleet utilization"}),e.jsx(se,{used:b,total:p,height:20}),e.jsx("div",{style:{marginTop:16},children:e.jsx(o.SimpleTable,{data:l.sort((h,w)=>w.pct-h.pct).map(h=>({name:h.name,policy:h.policy,budget:j(h.budget),used:j(h.used),bar:h})),columns:[{label:"Sandbox",getter:h=>h.name},{label:"Policy",getter:h=>h.policy},{label:"Budget",getter:h=>h.budget},{label:"Used",getter:h=>h.used},{label:"Utilization",getter:h=>e.jsx("div",{style:{width:160},children:e.jsx(se,{used:h.bar.used,total:h.bar.budget})})}]})})]})}function Qe({sandboxName:t,inferenceRefName:n}){var w,_,u,y,x,k;const d=U.useTheme().palette.text.secondary,[c]=z.inferencepolicies.useList(),s=(c||[]).find(T=>T.metadata.name===n),g=((w=s==null?void 0:s.jsonData)==null?void 0:w.spec)||(s==null?void 0:s.spec)||{},r=((_=g==null?void 0:g.tokenBudget)==null?void 0:_.dailyTokens)||0,l=((u=g==null?void 
0:g.tokenBudget)==null?void 0:u.perRequestTokens)||0,{data:p}=H(0,async T=>{var B;return((B=(await M(T,`sum(increase(kars_tokens_total{sandbox="${t}"}[24h]))`))[0])==null?void 0:B.value)||0},1e4),{data:b}=H([],async T=>M(T,`sum by (direction) (increase(kars_tokens_total{sandbox="${t}"}[24h]))`),1e4),v=r>0?p/r*100:0,m=Math.max(0,r-p),L=((y=b.find(T=>T.metric.direction==="input"))==null?void 0:y.value)||0,h=((x=b.find(T=>T.metric.direction==="output"))==null?void 0:x.value)||0;return e.jsxs(o.SectionBox,{title:`💰 Token Budget — ${t}`,children:[!n&&e.jsxs("div",{style:{color:d,fontSize:13},children:["No ",e.jsx("code",{children:"inferenceRef"})," set on this sandbox; no enforced budget."]}),n&&!s&&e.jsxs("div",{style:{color:"#ef6c00",fontSize:13},children:["InferencePolicy ",e.jsx("code",{children:n})," not found."]}),e.jsxs("div",{style:{display:"grid",gridTemplateColumns:"repeat(auto-fit, minmax(180px, 1fr))",gap:"0.75rem",marginBottom:12},children:[e.jsx($,{label:"Daily budget",value:r>0?j(r):"unlimited"}),e.jsx($,{label:"Consumed (24h)",value:j(p),tone:ee(v)}),e.jsx($,{label:"Remaining",value:r>0?j(m):"—",tone:ee(v)}),e.jsx($,{label:"Per-request cap",value:l>0?j(l):"unlimited"}),e.jsx($,{label:"Input tokens",value:j(L)}),e.jsx($,{label:"Output tokens",value:j(h)})]}),r>0&&e.jsxs("div",{children:[e.jsx("div",{style:{marginBottom:6,fontSize:13,fontWeight:600},children:"Utilization"}),e.jsx(se,{used:p,total:r,height:22})]}),n&&e.jsxs("div",{style:{marginTop:12,fontSize:12,color:d},children:["Policy: ",e.jsx(o.Link,{routeName:"inferencepolicies-detail",params:{namespace:((k=s==null?void 0:s.metadata)==null?void 0:k.namespace)||"default",name:n},children:n})]})]})}})); +(function(e,B){typeof exports=="object"&&typeof 
module<"u"?B(require("react/jsx-runtime"),require("@kinvolk/headlamp-plugin/lib"),require("@kinvolk/headlamp-plugin/lib/lib/k8s/crd"),require("@kinvolk/headlamp-plugin/lib/K8s/secret"),require("@kinvolk/headlamp-plugin/lib/CommonComponents"),require("@mui/material/styles"),require("@mui/material"),require("react")):typeof define=="function"&&define.amd?define(["react/jsx-runtime","@kinvolk/headlamp-plugin/lib","@kinvolk/headlamp-plugin/lib/lib/k8s/crd","@kinvolk/headlamp-plugin/lib/K8s/secret","@kinvolk/headlamp-plugin/lib/CommonComponents","@mui/material/styles","@mui/material","react"],B):(e=typeof globalThis<"u"?globalThis:e||self,B(e.pluginLib.ReactJSX,e.pluginLib,e.pluginLib.Crd,e.pluginLib.K8s.secret,e.pluginLib.CommonComponents,e.pluginLib.MuiMaterial.styles,e.pluginLib.MuiMaterial,e.pluginLib.React))})(this,(function(e,B,Te,Ae,d,U,K,Pe){"use strict";const _e=t=>t&&typeof t=="object"&&"default"in t?t:{default:t};function Me(t){if(t&&typeof t=="object"&&"default"in t)return t;const n=Object.create(null,{[Symbol.toStringTag]:{value:"Module"}});if(t){for(const i in t)if(i!=="default"){const c=Object.getOwnPropertyDescriptor(t,i);Object.defineProperty(n,i,c.get?c:{enumerable:!0,get:()=>t[i]})}}return n.default=t,Object.freeze(n)}const pe=_e(Ae),q=Me(Pe),Ee="kars.azure.com",$e="v1alpha1",ge=[{plural:"karssandboxes",singular:"karssandbox",kind:"KarsSandbox",label:"Sandboxes",phaseField:"phase"},{plural:"inferencepolicies",singular:"inferencepolicy",kind:"InferencePolicy",label:"Inference Policies"},{plural:"karsmemories",singular:"karsmemory",kind:"KarsMemory",label:"Memories",phaseField:"phase"},{plural:"mcpservers",singular:"mcpserver",kind:"McpServer",label:"MCP Servers",phaseField:"phase"},{plural:"a2aagents",singular:"a2aagent",kind:"A2AAgent",label:"A2A Agents",phaseField:"phase"},{plural:"toolpolicies",singular:"toolpolicy",kind:"ToolPolicy",label:"Tool Policies"},{plural:"trustgraphs",singular:"trustgraph",kind:"TrustGraph",label:"Trust 
Graphs"},{plural:"karspairings",singular:"karspairing",kind:"KarsPairing",label:"Pairings"},{plural:"karsevals",singular:"karseval",kind:"KarsEval",label:"Evals",phaseField:"phase"},{plural:"egressapprovals",singular:"egressapproval",kind:"EgressApproval",label:"Egress Approvals",phaseField:"phase"},{plural:"karssreactions",singular:"karssreaction",kind:"KarsSREAction",label:"SRE Actions",phaseField:"phase"}],j=Object.fromEntries(ge.map(t=>[t.plural,Te.makeCustomResourceClass({apiInfo:[{group:Ee,version:$e}],isNamespaced:!0,singularName:t.singular,pluralName:t.plural,kind:t.kind,customResourceDefinition:void 0})])),ne=j.karssandboxes;B.registerSidebarEntry({parent:null,name:"kars",label:"kars",icon:"mdi:robot-outline",url:"/kars"}),B.registerSidebarEntry({parent:"kars",name:"kars-overview",label:"Overview",url:"/kars"}),B.registerRoute({path:"/kars",sidebar:"kars-overview",name:"kars-overview",exact:!0,component:()=>e.jsx(Ie,{})}),B.registerSidebarEntry({parent:"kars",name:"kars-mesh",label:"Mesh Topology",url:"/kars/mesh"}),B.registerRoute({path:"/kars/mesh",sidebar:"kars-mesh",name:"kars-mesh",exact:!0,component:()=>e.jsx(Ye,{})});for(const t of 
ge)B.registerSidebarEntry({parent:"kars",name:t.plural,label:t.label,url:`/kars/${t.plural}`}),B.registerRoute({path:`/kars/${t.plural}`,sidebar:t.plural,name:t.plural,exact:!0,component:()=>e.jsx(We,{crd:t})}),B.registerRoute({path:`/kars/${t.plural}/:namespace/:name`,sidebar:t.plural,name:`${t.plural}-detail`,exact:!0,component:()=>e.jsx(Ge,{crd:t})});B.registerSidebarEntry({parent:"kars",name:"kars-sre-root",label:"SRE",icon:"mdi:stethoscope",url:"/kars/sre"}),B.registerSidebarEntry({parent:"kars-sre-root",name:"kars-sre-console",label:"Console",url:"/kars/sre"}),B.registerRoute({path:"/kars/sre",sidebar:"kars-sre-console",name:"kars-sre-console",exact:!0,component:()=>e.jsx(dt,{})}),B.registerSidebarEntry({parent:"kars-sre-root",name:"kars-sre-chat",label:"Chat",url:"/kars/sre/chat"}),B.registerRoute({path:"/kars/sre/chat",sidebar:"kars-sre-chat",name:"kars-sre-chat",exact:!0,component:()=>e.jsx(ht,{})}),B.registerSidebarEntry({parent:"kars-sre-root",name:"kars-sre-actions",label:"Actions",url:"/kars/karssreactions"});const ue=new Set(["SignatureMismatch","BundleVerifyFailed","AuthMisconfigured","MemoryStoreMissing","RuntimeAdapterMissing","AdapterMissing","ShapeInvalid","AllowlistDrift","PolicyCompileFailed"]),fe=new Set(["AwaitingRouterEnforcement","AwaitingFoundryProvisioning","NoSandboxesReferencing","Pending"]);function C(t){const i=(N(t).conditions??[]).find(c=>c.type==="Ready");return i==null?void 0:i.reason}function Be(t,n){return n&&ue.has(n)?"error":n&&fe.has(n)?"warning":t?t==="Ready"||t==="Provisioned"||t==="Active"?"success":t==="Degraded"||t==="Failed"||t==="Error"?"error":"warning":""}function N(t){var n;return((n=t.jsonData)==null?void 0:n.status)??{}}function E(t){var n;return((n=t.jsonData)==null?void 0:n.spec)??{}}function R(t){if(!t)return"—";const n=t.lastIndexOf("/");return n>=0?t.slice(n+1):t}function J(t,n){if(!t)return e.jsx("span",{children:"—"});const i=Be(t,n),c=n&&(ue.has(n)||fe.has(n));return 
e.jsxs("span",{children:[e.jsx(d.StatusLabel,{status:i,children:t}),c&&e.jsx("span",{style:{marginLeft:"0.4rem",fontSize:"0.85em",color:"#888"},children:n})]})}function De(t){return window.location.pathname.match(t)}function ee(t){if(!t)return"—";const n=t.indexOf(":");return n<0||n+13>=t.length?t:`${t.slice(0,n+1)}${t.slice(n+1,n+13)}…`}function Ne(t){if(!t)return null;const n=t.indexOf(" | drift=");if(n<0)return null;try{const i=JSON.parse(t.slice(n+9));if(!i||typeof i!="object")return null;const c=Array.isArray(i.added)?i.added.filter(a=>typeof a=="string"):[],o=Array.isArray(i.removed)?i.removed.filter(a=>typeof a=="string"):[];return{added:c,removed:o}}catch{return null}}function ze({item:t}){const c=(N(t).conditions??[]).find(s=>s.type==="AllowlistDrift"&&s.status==="True");if(!c)return null;const o=Ne(c.message),a=(o==null?void 0:o.added)??[],h=(o==null?void 0:o.removed)??[];return e.jsxs(d.SectionBox,{title:"⚠ Allowlist drift detected",children:[e.jsxs("p",{style:{padding:"0.5rem",fontSize:"0.9rem"},children:[e.jsx(d.StatusLabel,{status:"warning",children:"artifact wins"})," ","Inline ",e.jsx("code",{children:"allowedEndpoints"})," diverges from the verified signed bundle. The router enforces the bundle; the inline list is ignored. 
Either re-sign the bundle to include the divergent hosts, or remove the inline override."]}),a.length>0||h.length>0?e.jsx(d.SimpleTable,{data:[{side:`Only in inline (operator added, not signed) — ${a.length}`,hosts:a.join(", ")||"—"},{side:`Only in bundle (signed, but missing inline) — ${h.length}`,hosts:h.join(", ")||"—"}],columns:[{label:"Side",getter:s=>s.side},{label:"Hosts",getter:s=>e.jsx("code",{children:s.hosts})}]}):e.jsx("p",{style:{padding:"0.5rem",fontSize:"0.85rem",opacity:.75},children:c.message??"(no diff payload)"})]})}function oe(t){if(!t)return e.jsx("span",{children:"—"});const c=t==="RouterEnforcing"||t==="AllDigestsMatch"?"success":t==="NoSandboxesReferencing"||t==="AsExpected"?"":t==="AwaitingRouterEnforcement"?"warning":"error";return e.jsx(d.StatusLabel,{status:c,children:t})}function Oe({crd:t,item:n}){if(t.plural!=="toolpolicies"&&t.plural!=="inferencepolicies"&&t.plural!=="karsmemories")return null;const i=N(n),o=(i.conditions??[]).find(r=>r.type==="Ready"),a=t.plural==="toolpolicies"?i.agtProfileDigest:i.compiledDigest,h=i.loadedDigest,s=a?h&&h===a?"✓ matches":h?"≠ mismatched":"(awaiting)":"—";return e.jsxs(d.SectionBox,{title:"Router enforcement (data-plane echo)",children:[e.jsx(d.SimpleTable,{data:[{k:"Compiled digest",v:ee(a)},{k:"Loaded digest",v:ee(h)},{k:"Echo",v:s},{k:"Confirmation",v:oe(o==null?void 0:o.reason)}],columns:[{label:"Field",getter:r=>r.k},{label:"Value",getter:r=>r.v}]}),e.jsxs("p",{style:{padding:"0.5rem",fontSize:"0.85rem",opacity:.75},children:["The controller polls every referencing sandbox's router and promotes",e.jsx("code",{children:" phase: Compiled → Ready "})," only when every router echoes the exact compiled digest. 
While"," ",e.jsx("code",{children:"AwaitingRouterEnforcement"}),", the policy is parsed but",e.jsx("strong",{children:" not"})," live in the data plane."]})]})}function je({crd:t,item:n}){var k,x;if(t.plural!=="karsevals")return null;const i=E(n),c=N(n),o=c.conditions??[],a=o.find(g=>g.type==="Ready"),h=o.find(g=>g.type==="ConformanceDrift"),s=c.lastResult,r=i.corpus,p=r!=null&&r.builtin?`builtin:${r.builtin}`:(k=r==null?void 0:r.bundleRef)!=null&&k.digest?`bundle ${r.bundleRef.registry??"?"}/${r.bundleRef.repository??"?"}@${r.bundleRef.digest}`:"—",f=s?`${s.passedCases??0}/${s.totalCases??0}`:"—",b=s!=null&&s.drift?e.jsx(d.StatusLabel,{status:"error",children:"YES"}):s?e.jsx(d.StatusLabel,{status:"success",children:"no"}):e.jsx("span",{style:{opacity:.6},children:"—"});return e.jsxs(d.SectionBox,{title:"KarsEval (conformance corpus)",children:[e.jsx(d.SimpleTable,{data:[{k:"Target sandbox",v:((x=i.targetSandboxRef)==null?void 0:x.name)??"—"},{k:"Corpus",v:p},{k:"Schedule",v:i.schedule??"(on-demand only)"},{k:"Fail sandbox on drift",v:i.failSandboxOnDrift?"true":"false"},{k:"Last run",v:c.lastRunAt??"—"},{k:"Cases passed",v:f},{k:"Drift",v:b},{k:"Ready reason",v:oe(a==null?void 0:a.reason)},{k:"Conformance drift reason",v:oe(h==null?void 0:h.reason)}],columns:[{label:"Field",getter:g=>g.k},{label:"Value",getter:g=>g.v}]}),e.jsxs("p",{style:{padding:"0.5rem",fontSize:"0.85rem",opacity:.75},children:["KarsEvals replay a signed corpus (or a builtin one) against the target sandbox's inference router. 
The controller stamps each run's verdicts on ",e.jsx("code",{children:"status.lastResult"})," and rolls a history of the most recent ones into ",e.jsx("code",{children:"status.history"}),"."]})]})}const be=[["telegram",/^TELEGRAM_(BOT_)?TOKEN$/i],["slack",/^SLACK_(BOT_)?TOKEN$/i],["discord",/^DISCORD_(BOT_)?TOKEN$/i],["whatsapp",/^WHATSAPP_TOKEN$/i]];function ye(t){var c;const n=new Set;if(!t)return n;const i=((c=t.jsonData)==null?void 0:c.data)??{};for(const o of Object.keys(i))for(const[a,h]of be)h.test(o)&&n.add(a);return n}function Fe(t,n){var o,a,h,s,r,p,f,b,k;const i={sandboxesByPhase:{},channelCounts:{},egressLearn:0,egressStrict:0,governanceEnabled:0,totalRuntime:{}},c=new Map;for(const x of n??[]){const g=((o=x.metadata)==null?void 0:o.name)??"",L=((a=x.metadata)==null?void 0:a.namespace)??"";if(!g.endsWith("-credentials"))continue;const P=g.replace(/-credentials$/,"");c.set(`${L}/${P}`,ye(x))}for(const x of t??[]){const g=E(x),P=N(x).phase??"Unknown";i.sandboxesByPhase[P]=(i.sandboxesByPhase[P]??0)+1;const u=g.networkPolicy??null;!u||(u.egressMode??"Learn")==="Learn"?i.egressLearn+=1:i.egressStrict+=1,(h=g.governance)!=null&&h.enabled&&(i.governanceEnabled+=1);const w=((s=g.runtime)==null?void 0:s.kind)??"Unknown";i.totalRuntime[w]=(i.totalRuntime[w]??0)+1;const m=((r=x.metadata)==null?void 0:r.name)??"",T=((p=x.metadata)==null?void 0:p.namespace)??"",$=`kars-${m}`,D=c.get(`${$}/${m}`)??c.get(`${T}/${m}`)??new Set,O=((k=(b=(f=g.runtime)==null?void 0:f.openclaw)==null?void 0:b.config)==null?void 0:k.channels)??{};for(const z of Object.keys(O))D.add(z);for(const z of D)i.channelCounts[z]=(i.channelCounts[z]??0)+1}return i}function Ie(){var L,P;const[t]=ne.useList(),[n]=pe.default.useList(),[i]=j.inferencepolicies.useList(),[c]=j.toolpolicies.useList(),[o]=j.karsmemories.useList(),[a]=j.mcpservers.useList(),[h]=j.a2aagents.useList(),s=Fe(t,n),r=(t==null?void 
0:t.length)??0,p=Object.entries(s.sandboxesByPhase).sort((u,v)=>v[1]-u[1]).map(([u,v])=>({phase:u,count:v})),f=Object.entries(s.totalRuntime).sort((u,v)=>v[1]-u[1]).map(([u,v])=>({kind:u,count:v})),b=Object.entries(s.channelCounts).sort((u,v)=>v[1]-u[1]).map(([u,v])=>({channel:u,count:v})),k=(t??[]).slice().sort((u,v)=>{var T,$;const w=new Date(((T=u.metadata)==null?void 0:T.creationTimestamp)??0).getTime();return new Date((($=v.metadata)==null?void 0:$.creationTimestamp)??0).getTime()-w}).slice(0,10),x=new Map;for(const u of i??[])x.set(`${((L=u.metadata)==null?void 0:L.namespace)??""}/${((P=u.metadata)==null?void 0:P.name)??""}`,u);const g=u=>{var T,$,D,O,z,I,W,S,H;const v=E(u),w=((O=(D=($=(T=v.runtime)==null?void 0:T.openclaw)==null?void 0:$.config)==null?void 0:D.agent)==null?void 0:O.model)??((z=v.agent)==null?void 0:z.model);if(w)return R(w);const m=(I=v.inferenceRef)==null?void 0:I.name;if(!m)return"—";for(const X of[`${((W=u.metadata)==null?void 0:W.namespace)??""}/${m}`,`kars-system/${m}`]){const G=x.get(X);if(G){const Y=(H=(S=E(G).modelPreference)==null?void 0:S.primary)==null?void 0:H.deployment;if(Y)return R(Y)}}return`(via ${m})`};return e.jsxs(e.Fragment,{children:[e.jsxs(d.SectionBox,{title:"kars — Operator Overview",children:[e.jsxs("div",{style:{display:"grid",gridTemplateColumns:"repeat(auto-fit, minmax(180px, 1fr))",gap:"1rem",padding:"1rem 0"},children:[e.jsx(A,{label:"Total Sandboxes",value:r}),e.jsx(A,{label:"Ready",value:s.sandboxesByPhase.Ready??0,tone:"success"}),e.jsx(A,{label:"Degraded",value:s.sandboxesByPhase.Degraded??0,tone:s.sandboxesByPhase.Degraded?"error":""}),e.jsx(A,{label:"Governance ON",value:`${s.governanceEnabled} / ${r}`}),e.jsx(A,{label:"Egress: Learn / Strict",value:`${s.egressLearn} / ${s.egressStrict}`})]}),e.jsxs("div",{style:{display:"grid",gridTemplateColumns:"repeat(auto-fit, minmax(160px, 1fr))",gap:"0.5rem",padding:"0 0 1rem 0"},children:[e.jsx(A,{label:"Inference Policies",value:(i==null?void 
0:i.length)??"…"}),e.jsx(A,{label:"Tool Policies",value:(c==null?void 0:c.length)??"…"}),e.jsx(A,{label:"Memories",value:(o==null?void 0:o.length)??"…"}),e.jsx(A,{label:"MCP Servers",value:(a==null?void 0:a.length)??"…"}),e.jsx(A,{label:"A2A Agents",value:(h==null?void 0:h.length)??"…"})]})]}),e.jsxs("div",{style:{display:"grid",gridTemplateColumns:"1fr 1fr 1fr",gap:"1rem"},children:[e.jsx(d.SectionBox,{title:"Sandboxes by Phase",children:e.jsx(d.SimpleTable,{data:p,columns:[{label:"Phase",getter:u=>J(u.phase)},{label:"Count",getter:u=>u.count}]})}),e.jsx(d.SectionBox,{title:"Runtimes",children:e.jsx(d.SimpleTable,{data:f,columns:[{label:"Kind",getter:u=>u.kind},{label:"Count",getter:u=>u.count}]})}),e.jsx(d.SectionBox,{title:"Channels in Use",children:b.length===0?e.jsx("p",{style:{padding:"1rem"},children:"No channels configured."}):e.jsx(d.SimpleTable,{data:b,columns:[{label:"Channel",getter:u=>u.channel},{label:"Sandboxes",getter:u=>u.count}]})})]}),e.jsx(d.SectionBox,{title:"Recent Sandboxes",children:e.jsx(d.SimpleTable,{data:k,columns:[{label:"Name",getter:u=>{var v,w,m;return e.jsx(d.Link,{routeName:"karssandboxes-detail",params:{namespace:((v=u.metadata)==null?void 0:v.namespace)??"",name:((w=u.metadata)==null?void 0:w.name)??""},children:(m=u.metadata)==null?void 0:m.name})}},{label:"Namespace",getter:u=>{var v;return((v=u.metadata)==null?void 0:v.namespace)??"—"}},{label:"Runtime",getter:u=>{var v;return((v=E(u).runtime)==null?void 0:v.kind)??"—"}},{label:"Model",getter:g},{label:"Phase",getter:u=>J(N(u).phase,C(u))},{label:"Egress",getter:u=>{const v=E(u).networkPolicy;return!v||(v.egressMode??"Learn")==="Learn"?"Learn":"Strict"}},{label:"Age",getter:u=>{var v;return te((v=u.metadata)==null?void 0:v.creationTimestamp)}}]})}),e.jsx(Ce,{sandboxes:t??[],inferencePolicies:i??[]})]})}function A(t){const n=t.tone??"",i=n==="error"?"#c62828":n==="warning"?"#ef6c00":n==="success"?"#2e7d32":"inherit";return e.jsxs("div",{style:{padding:"1rem",border:"1px solid 
rgba(127,127,127,0.2)",borderRadius:"6px"},children:[e.jsx("div",{style:{fontSize:"0.85rem",opacity:.7},children:t.label}),e.jsx("div",{style:{fontSize:"1.6rem",fontWeight:600,color:i},children:t.value})]})}function te(t){if(!t)return"—";const n=Date.now()-new Date(t).getTime(),i=Math.floor(n/1e3);if(i<60)return`${i}s`;const c=Math.floor(i/60);if(c<60)return`${c}m`;const o=Math.floor(c/60);return o<24?`${o}h`:`${Math.floor(o/24)}d`}function We({crd:t}){const n=j[t.plural],[i]=n.useList(),[c]=j.inferencepolicies.useList(),o=q.useMemo(()=>{var r,p;const s=new Map;for(const f of c??[])s.set(`${((r=f.metadata)==null?void 0:r.namespace)??""}/${((p=f.metadata)==null?void 0:p.name)??""}`,f);return s},[c]),a=s=>{var k,x,g,L,P,u,v,w,m;const r=E(s),p=((L=(g=(x=(k=r.runtime)==null?void 0:k.openclaw)==null?void 0:x.config)==null?void 0:g.agent)==null?void 0:L.model)??((P=r.agent)==null?void 0:P.model);if(p)return R(p);const f=(u=r.inferenceRef)==null?void 0:u.name;if(!f)return"—";const b=[`${((v=s.metadata)==null?void 0:v.namespace)??""}/${f}`,`kars-system/${f}`];for(const T of b){const $=o.get(T);if($){const O=(m=(w=E($).modelPreference)==null?void 0:w.primary)==null?void 0:m.deployment;if(O)return R(O)}}return`(via ${f})`},h=[{label:"Name",getter:s=>{var r,p,f;return e.jsx(d.Link,{routeName:`${t.plural}-detail`,params:{namespace:((r=s.metadata)==null?void 0:r.namespace)??"",name:((p=s.metadata)==null?void 0:p.name)??""},children:(f=s.metadata)==null?void 0:f.name})}},{label:"Namespace",getter:s=>{var r;return((r=s.metadata)==null?void 0:r.namespace)??"—"}}];return t.plural==="karssandboxes"&&h.push({label:"Runtime",getter:s=>{var r;return((r=E(s).runtime)==null?void 0:r.kind)??"—"}},{label:"Model",getter:a},{label:"Egress",getter:s=>{const 
r=E(s).networkPolicy;return!r||(r.egressMode??"Learn")==="Learn"?e.jsx(d.StatusLabel,{status:"warning",children:"Learn"}):e.jsx(d.StatusLabel,{status:"success",children:"Strict"})}}),t.phaseField&&h.push({label:"Phase",getter:s=>J(N(s)[t.phaseField],C(s))}),h.push({label:"Age",getter:s=>{var r;return te((r=s.metadata)==null?void 0:r.creationTimestamp)}}),e.jsx(d.SectionBox,{title:`kars — ${t.label}`,children:i===null?e.jsx("p",{style:{padding:"1rem"},children:"Loading…"}):i.length===0?e.jsxs("p",{style:{padding:"1rem"},children:["No ",t.label.toLowerCase()," found. Create one with the kars CLI or by applying a CRD manifest."]}):e.jsx(d.SimpleTable,{data:i,columns:h})})}function Ge({crd:t}){var p,f;const n=De(new RegExp(`/kars/${t.plural}/([^/]+)/([^/]+)`)),i=(n==null?void 0:n[1])??"",c=(n==null?void 0:n[2])??"",o=j[t.plural],[a,h]=o.useGet(c,i);if(h)return e.jsx(d.SectionBox,{title:`${t.kind}: ${c}`,children:e.jsxs("p",{children:["Error: ",h.message]})});if(!a)return e.jsx(d.SectionBox,{title:"Loading…",children:"Loading…"});const s=N(a),r=s.conditions??[];return e.jsxs(e.Fragment,{children:[e.jsx(d.SectionBox,{title:`${t.kind}: ${c}`,children:e.jsx(d.SimpleTable,{data:[{k:"Namespace",v:i},{k:"Phase",v:J(s.phase,C(a))},{k:"Created",v:((p=a.metadata)==null?void 0:p.creationTimestamp)??"—"},{k:"UID",v:((f=a.metadata)==null?void 
0:f.uid)??"—"}],columns:[{label:"Field",getter:b=>b.k},{label:"Value",getter:b=>b.v}]})}),t.plural==="karssandboxes"&&e.jsx(Ue,{item:a}),t.plural==="inferencepolicies"&&e.jsx(Xe,{policyName:a.metadata.name}),t.plural==="toolpolicies"&&e.jsx(Qe,{policyName:a.metadata.name}),t.plural==="trustgraphs"&&e.jsx(Ze,{}),e.jsx(ze,{item:a}),e.jsx(Oe,{crd:t,item:a}),e.jsx(je,{crd:t,item:a}),e.jsx(d.SectionBox,{title:"Spec",children:e.jsx("pre",{style:{maxHeight:"400px",overflow:"auto"},children:JSON.stringify(E(a),null,2)})}),e.jsx(d.SectionBox,{title:"Status",children:e.jsx("pre",{style:{maxHeight:"400px",overflow:"auto"},children:JSON.stringify(s,null,2)})}),r.length>0&&e.jsx(d.SectionBox,{title:"Conditions",children:e.jsx(d.SimpleTable,{data:r,columns:[{label:"Type",getter:b=>b.type},{label:"Status",getter:b=>e.jsx(d.StatusLabel,{status:b.status==="True"?"success":"error",children:b.status})},{label:"Reason",getter:b=>b.reason??"—"},{label:"Message",getter:b=>b.message??"—"}]})})]})}function Ke({sandboxName:t,sandboxNamespace:n}){const[i]=j.egressapprovals.useList();if(!i)return null;const c=i.filter(a=>{var r;const h=((r=a.metadata)==null?void 0:r.namespace)??"",s=E(a);return h===n&&s.sandbox===t});if(c.length===0)return null;const o=c.map(a=>{var f;const h=E(a),s=N(a),r=Array.isArray(h.hosts)?h.hosts:[],p=r.slice(0,3).map(b=>b.port?`${b.host}:${b.port}`:b.host).join(", ")+(r.length>3?`, +${r.length-3}`:"");return{name:((f=a.metadata)==null?void 0:f.name)??"—",phase:s.phase,hosts:p||"—",reason:h.reason??"—",ttl:h.ttl??"—",expiresAt:s.expiresAt,digest:s.mergedDigest}});return e.jsxs(d.SectionBox,{title:"Egress Approvals (ephemeral 
grants)",children:[e.jsx(d.SimpleTable,{data:o,columns:[{label:"Name",getter:a=>e.jsx(d.Link,{routeName:"egressapprovals-detail",params:{namespace:n,name:a.name},children:a.name})},{label:"Phase",getter:a=>J(a.phase)},{label:"Hosts",getter:a=>a.hosts},{label:"TTL",getter:a=>a.ttl},{label:"Expires",getter:a=>a.expiresAt??"—"},{label:"Reason",getter:a=>a.reason},{label:"Merged digest",getter:a=>ee(a.digest)}]}),e.jsxs("p",{style:{padding:"0.5rem",fontSize:"0.85rem",opacity:.75},children:["Grants unioned with the baseline allowlist on the data plane. ",e.jsx("code",{children:"Active"})," ","means the router has echoed the merged digest. Grants auto-expire at"," ",e.jsx("code",{children:"status.expiresAt"}),"; revoke early with ",e.jsx("code",{children:"kars egress revoke"}),"."]})]})}function He({refs:t}){const[n]=j.mcpservers.useList();if(t.length===0)return null;const i=new Map;(n??[]).forEach(o=>{var h;const a=(h=o.metadata)==null?void 0:h.name;a&&i.set(a,o)});const c=t.map(o=>{const a=o.name?i.get(o.name):void 0,h=a?N(a):{},s=a?E(a):{},r=Array.isArray(s.tools)?s.tools.length:h.toolCount??0;return{name:o.name??"—",phase:h.phase,reason:a?C(a):void 0,digest:h.jwksDigest??h.bundleDigest,tools:r,missing:!a}});return e.jsx(d.SectionBox,{title:`MCP Servers (${c.length})`,children:e.jsx(d.SimpleTable,{data:c,columns:[{label:"Name",getter:o=>o.missing?e.jsxs("span",{children:[o.name," ",e.jsx(d.StatusLabel,{status:"error",children:"MISSING"})]}):e.jsx(d.Link,{routeName:"mcpservers-detail",params:{namespace:"kars-system",name:o.name},children:o.name})},{label:"Phase",getter:o=>J(o.phase,o.reason)},{label:"Tools",getter:o=>o.tools},{label:"JWKS digest",getter:o=>ee(o.digest)}]})})}function Ue({item:t}){var v,w,m,T,$,D,O,z,I,W;const n=E(t),i=N(t),c=((v=t.metadata)==null?void 0:v.namespace)??"",o=((w=t.metadata)==null?void 
0:w.name)??"",a=`kars-${o}`,[h]=pe.default.useGet(`${o}-credentials`,a),s=n.networkPolicy??null,r=s??{},p=!s||(r.egressMode??"Learn")==="Learn",f=Array.isArray(r.allowedEndpoints)?r.allowedEndpoints:[],b=new Set(ye(h??void 0)),k=(($=(T=(m=n.runtime)==null?void 0:m.openclaw)==null?void 0:T.config)==null?void 0:$.channels)??{};for(const S of Object.keys(k))b.add(S);const x=Array.from(b).map(S=>{var H,X;return{channel:S,enabled:((H=k[S])==null?void 0:H.enabled)!==!1,source:h&&Object.keys(((X=h.jsonData)==null?void 0:X.data)??{}).some(G=>be.some(([Z,Y])=>Z===S&&Y.test(G)))?"Secret":"Spec"}}),g=(D=n.inferenceRef)==null?void 0:D.name,L=(z=(O=n.governance)==null?void 0:O.toolPolicyRef)==null?void 0:z.name,P=(I=n.memoryRef)==null?void 0:I.name,u=Array.isArray(n.mcpServerRefs)?n.mcpServerRefs:[];return e.jsxs(e.Fragment,{children:[e.jsxs(d.SectionBox,{title:"Network Policy (Egress)",children:[e.jsx(d.SimpleTable,{data:[{k:"Default Deny",v:String(r.defaultDeny??!1)},{k:"Learn Mode",v:p?e.jsx(d.StatusLabel,{status:"warning",children:"LEARN"}):e.jsx(d.StatusLabel,{status:"success",children:"STRICT"})},{k:"Allowed Endpoints",v:`${f.length}`}],columns:[{label:"Field",getter:S=>S.k},{label:"Value",getter:S=>S.v}]}),f.length>0&&e.jsxs("div",{style:{marginTop:"1rem"},children:[e.jsx("h4",{children:"Allowed Endpoints"}),e.jsx(d.SimpleTable,{data:f,columns:[{label:"Host",getter:S=>S.host??"—"},{label:"Port",getter:S=>S.port??"—"}]})]})]}),e.jsx(d.SectionBox,{title:"Channels & Integrations",children:x.length===0?e.jsxs("p",{style:{padding:"0.5rem"},children:["No channels configured for namespace ",e.jsx("code",{children:a}),". 
Use"," ",e.jsx("code",{children:"kars credentials set telegram-token …"})," +"," ",e.jsx("code",{children:"--channels telegram"}),"."]}):e.jsx(d.SimpleTable,{data:x,columns:[{label:"Channel",getter:S=>S.channel},{label:"Status",getter:S=>S.enabled?e.jsx(d.StatusLabel,{status:"success",children:"ENABLED"}):e.jsx(d.StatusLabel,{status:"warning",children:"DISABLED"})},{label:"Source",getter:S=>S.source}]})}),e.jsx(d.SectionBox,{title:"Related Resources",children:e.jsx(d.SimpleTable,{data:[...g?[{kind:"InferencePolicy",name:g,route:"inferencepolicies-detail"}]:[],...L?[{kind:"ToolPolicy",name:L,route:"toolpolicies-detail"}]:[],...P?[{kind:"KarsMemory",name:P,route:"karsmemories-detail"}]:[],...u.map(S=>({kind:"McpServer",name:S.name??"",route:"mcpservers-detail"}))],columns:[{label:"Kind",getter:S=>S.kind},{label:"Name",getter:S=>S.name?e.jsx(d.Link,{routeName:S.route,params:{namespace:"kars-system",name:S.name},children:S.name}):"—"}]})}),i.mesh&&e.jsx(d.SectionBox,{title:"Mesh (AGT)",children:e.jsx(d.SimpleTable,{data:[{k:"Agent DID",v:i.mesh.did??"—"},{k:"Registered",v:i.mesh.registered?e.jsx(d.StatusLabel,{status:"success",children:"YES"}):e.jsx(d.StatusLabel,{status:"error",children:"NO"})},{k:"Trust Score",v:i.mesh.trustScore??"—"},{k:"Last Heartbeat",v:i.mesh.lastHeartbeat??"—"}],columns:[{label:"Field",getter:S=>S.k},{label:"Value",getter:S=>S.v}]})}),e.jsx(He,{refs:u}),e.jsx(Ke,{sandboxName:o,sandboxNamespace:c}),e.jsx(d.SectionBox,{title:"Pod & Workspace",children:e.jsx(d.SimpleTable,{data:[{k:"CR Namespace",v:e.jsx(d.Link,{routeName:"namespace",params:{name:c},children:c})},{k:"Sandbox Namespace",v:e.jsx(d.Link,{routeName:"namespace",params:{name:a},children:a})},{k:"Pods",v:e.jsxs(d.Link,{routeName:"pods",params:{namespace:a},children:["View pods in ",a]})},{k:"Deployment",v:e.jsxs(d.Link,{routeName:"deployments",params:{namespace:a},children:["View deployments in ",a]})},{k:"Secrets",v:e.jsxs(d.Link,{routeName:"secrets",params:{namespace:a},children:["View 
secrets in ",a]})}],columns:[{label:"Field",getter:S=>S.k},{label:"Value",getter:S=>S.v}]})}),e.jsx(Re,{sandboxName:o,inferenceRefName:(W=n.inferenceRef)==null?void 0:W.name}),e.jsx(qe,{sandboxName:o})]})}function qe({sandboxName:t}){const i=U.useTheme().palette.mode==="dark"?"dark":"light",o=`${typeof window<"u"&&window.KARS_GRAFANA_URL||"http://127.0.0.1:3000"}/d/kars-ops?kiosk=tv&refresh=10s&theme=${i}&var-sandbox=${encodeURIComponent(t)}`;return e.jsxs(d.SectionBox,{title:`Metrics (Grafana) — ${t}`,children:[e.jsx("div",{style:{marginBottom:8},children:e.jsx("a",{href:o,target:"_blank",rel:"noopener noreferrer",children:"Open full dashboard in Grafana ↗"})}),e.jsx("iframe",{src:o,title:`Grafana metrics for ${t}`,style:{width:"100%",height:"720px",border:"0"},loading:"lazy"})]})}async function _(t,n){var a;const i=`${t}/api/v1/query?query=${encodeURIComponent(n)}`,c=await fetch(i);if(!c.ok)throw new Error(`prom ${c.status}`);const o=await c.json();return(((a=o==null?void 0:o.data)==null?void 0:a.result)||[]).map(h=>{var s;return{metric:h.metric||{},value:Number(((s=h.value)==null?void 0:s[1])||0)}})}function Ve(){return typeof window<"u"&&window.KARS_PROMETHEUS_URL||"http://127.0.0.1:19091"}function V(t,n,i=5e3){const c=Ve(),[o,a]=q.useState(t),[h,s]=q.useState(""),[r,p]=q.useState(0);return q.useEffect(()=>{let f=!1;n(c).then(k=>{f||(a(k),s(""))}).catch(k=>{f||s(String(k))});const b=setInterval(()=>p(k=>k+1),i);return()=>{f=!0,clearInterval(b)}},[c,r]),{data:o,err:h}}function Ye(){const n=U.useTheme().palette.mode==="dark",i=n?"#1e1e1e":"#fafafa",c=n?"#aaa":"#555",o=n?"#cfd8dc":"#37474f",a="#fff",[h]=ne.useList(),{data:s,err:r}=V({peers:[],sentLife:[],recvLife:[],sentRate:[],recvRate:[],relayConn:0,relayRouted:0,relayStored:0,relayDelivered:0,relayMsgsPerSec:0},async l=>{var ke,xe,me,we,Le;const[y,M,Q,le,de,he,pt,gt,ut,ft]=await Promise.all([_(l,"kars_agt_known_agents"),_(l,"kars_mesh_messages_sent_total"),_(l,"kars_mesh_messages_received_total"),_(l,"sum by 
(sandbox) (increase(kars_mesh_messages_sent_total[5m]))"),_(l,"sum by (sandbox) (increase(kars_mesh_messages_received_total[5m]))"),_(l,"sum(agentmesh_relay_connected_agents)"),_(l,"sum(agentmesh_relay_messages_routed_total)"),_(l,"sum(agentmesh_relay_messages_stored_total)"),_(l,"sum(agentmesh_relay_messages_delivered_total)"),_(l,"sum(rate(agentmesh_relay_messages_routed_total[5m]))")]);return{peers:y,sentLife:M,recvLife:Q,sentRate:le,recvRate:de,relayConn:((ke=he[0])==null?void 0:ke.value)||0,relayRouted:((xe=pt[0])==null?void 0:xe.value)||0,relayStored:((me=gt[0])==null?void 0:me.value)||0,relayDelivered:((we=ut[0])==null?void 0:we.value)||0,relayMsgsPerSec:((Le=ft[0])==null?void 0:Le.value)||0}}),p=Object.fromEntries(s.peers.map(l=>[l.metric.sandbox||"",l.value])),f=Object.fromEntries(s.sentLife.map(l=>[l.metric.sandbox||"",l.value])),b=Object.fromEntries(s.recvLife.map(l=>[l.metric.sandbox||"",l.value])),k=Object.fromEntries(s.sentRate.map(l=>[l.metric.sandbox||"",l.value])),x=Object.fromEntries(s.recvRate.map(l=>[l.metric.sandbox||"",l.value])),g=(h||[]).map(l=>{const y=l.metadata.name,M=(l.metadata.labels||{})["kars.azure.com/parent"]||"";return{name:y,parent:M,knownPeers:p[y]||0,meshSent:k[y]||0,meshRecv:x[y]||0,meshSentLife:f[y]||0,meshRecvLife:b[y]||0}}),L=g.filter(l=>!l.parent).sort((l,y)=>l.name.localeCompare(y.name)),P={};for(const l of g)l.parent&&(P[l.parent]=P[l.parent]||[],P[l.parent].push(l));const u=1100,v=Math.max(220,u/Math.max(1,L.length)),w=u/2,m=70,T=220,$=400,D=36,O=50,z={};L.forEach((l,y)=>{const M=v*(y+.5)+(u-v*L.length)/2;z[l.name]={x:M,y:T,n:l}});const I={};for(const l of L){const y=P[l.name]||[],M=z[l.name].x,Q=130;y.forEach((le,de)=>{const he=(de-(y.length-1)/2)*Q;I[le.name]={x:M+he,y:$,n:le,parent:l.name}})}const W=g.filter(l=>l.parent&&!z[l.parent]),S=l=>l.meshSent+l.meshRecv,H=Math.max(.001,...g.map(S)),X=Math.max(1,...g.map(l=>l.meshSentLife+l.meshRecvLife)),G=W.length>0?600:520;function Z(l){const y=S(l);return 
y>5?"#43a047":y>.5?"#9ccc65":y>0?"#ffd54f":l.knownPeers>0?"#90caf9":n?"#555":"#bdbdbd"}function Y(l){return D+Math.min(14,(l.meshSentLife+l.meshRecvLife)/X*14)}function ve(l){return 1+l/H*5}function Se(l){return .3+l/H*.7}function se(l){return l>0?Math.max(.6,3-l/H*2.4):0}return e.jsxs(d.SectionBox,{title:"🕸️ Mesh Topology (live)",children:[e.jsxs("div",{style:{marginBottom:12,fontSize:13,color:c},children:["Tree view of the AGT mesh: AGT Relay (top), controllers (mid row), sub-agents (bottom row). Polled from Prometheus every 5s. Edge thickness & pulse speed ∝ mesh messages in/out (5m). Node size ∝ lifetime mesh-message volume. ",e.jsx("b",{children:"children"})," = sub-agent CRs labeled ",e.jsx("code",{children:"kars.azure.com/parent="}),"; ",e.jsx("b",{children:"trust"})," = peers in this router's local AGT trust graph (only populated after live traffic; resets on pod restart).",r&&e.jsxs("div",{style:{color:"#ef5350",marginTop:6},children:["Prometheus unreachable: ",r," (configure window.KARS_PROMETHEUS_URL)"]})]}),e.jsxs("div",{style:{display:"flex",gap:16,marginBottom:12,flexWrap:"wrap"},children:[e.jsxs(d.StatusLabel,{status:"",children:["🔗 Relay connected: ",e.jsx("b",{children:s.relayConn})]}),e.jsxs(d.StatusLabel,{status:"",children:["📨 Relay msg/s (5m): ",e.jsx("b",{children:s.relayMsgsPerSec.toFixed(2)})]}),e.jsxs(d.StatusLabel,{status:"",children:["📬 Routed total: ",e.jsx("b",{children:Math.round(s.relayRouted).toLocaleString()})]}),e.jsxs(d.StatusLabel,{status:"",children:["📦 Stored (offline): ",e.jsx("b",{children:Math.round(s.relayStored).toLocaleString()})]}),e.jsxs(d.StatusLabel,{status:"",children:["✉️ Delivered (after reconnect): ",e.jsx("b",{children:Math.round(s.relayDelivered).toLocaleString()})]}),e.jsxs(d.StatusLabel,{status:"",children:["🤖 Sandboxes: ",e.jsx("b",{children:g.length})]}),e.jsxs(d.StatusLabel,{status:"",children:["👨‍👩‍👧 Controllers: ",e.jsx("b",{children:L.length})]}),e.jsxs(d.StatusLabel,{status:"",children:["🧒 Sub-agents: 
",e.jsx("b",{children:Object.keys(I).length})]})]}),e.jsxs("svg",{viewBox:`0 0 ${u} ${G}`,style:{width:"100%",maxWidth:u,background:i,borderRadius:8},children:[e.jsxs("defs",{children:[e.jsxs("radialGradient",{id:"relayGrad",cx:"50%",cy:"50%",r:"50%",children:[e.jsx("stop",{offset:"0%",stopColor:"#fff59d"}),e.jsx("stop",{offset:"100%",stopColor:"#fbc02d"})]}),e.jsxs("filter",{id:"glow",x:"-50%",y:"-50%",width:"200%",height:"200%",children:[e.jsx("feGaussianBlur",{stdDeviation:"3",result:"blur"}),e.jsxs("feMerge",{children:[e.jsx("feMergeNode",{in:"blur"}),e.jsx("feMergeNode",{in:"SourceGraphic"})]})]})]}),L.map(l=>{const y=z[l.name],M=S(l);return e.jsxs("g",{children:[e.jsx("line",{x1:w,y1:m,x2:y.x,y2:y.y,stroke:"#42a5f5",strokeWidth:ve(M),strokeOpacity:Se(M)}),l.meshRecv>0&&e.jsx("circle",{r:"4",fill:"#81d4fa",filter:"url(#glow)",children:e.jsx("animateMotion",{dur:`${se(l.meshRecv)}s`,repeatCount:"indefinite",path:`M${w},${m} L${y.x},${y.y}`})}),l.meshSent>0&&e.jsx("circle",{r:"4",fill:"#ffeb3b",filter:"url(#glow)",children:e.jsx("animateMotion",{dur:`${se(l.meshSent)}s`,repeatCount:"indefinite",path:`M${y.x},${y.y} L${w},${m}`})}),e.jsxs("text",{x:(w+y.x)/2,y:(m+y.y)/2-4,textAnchor:"middle",fontSize:"10",fill:c,style:{pointerEvents:"none"},children:["↑",Math.round(l.meshSent*60/5)||0," ↓",Math.round(l.meshRecv*60/5)||0," /min"]})]},`r-${l.name}`)}),Object.values(I).map(l=>{const y=z[l.parent];if(!y)return null;const M=S(l.n);return e.jsxs("g",{children:[e.jsx("line",{x1:y.x,y1:y.y,x2:l.x,y2:l.y,stroke:"#7e57c2",strokeWidth:ve(M),strokeOpacity:Se(M),strokeDasharray:"6,4"}),se(M)>0&&e.jsx("circle",{r:"3",fill:"#ce93d8",filter:"url(#glow)",children:e.jsx("animateMotion",{dur:`${se(M)}s`,repeatCount:"indefinite",path:`M${y.x},${y.y} 
L${l.x},${l.y}`})})]},`pc-${l.n.name}`)}),e.jsxs("g",{children:[e.jsx("circle",{cx:w,cy:m,r:O,fill:"url(#relayGrad)",stroke:"#f57f17",strokeWidth:"3",filter:"url(#glow)"}),e.jsx("text",{x:w,y:m-8,textAnchor:"middle",fontSize:"13",fontWeight:"bold",fill:"#212121",children:"AGT Relay"}),e.jsxs("text",{x:w,y:m+6,textAnchor:"middle",fontSize:"10",fill:"#212121",children:[s.relayConn," connected"]}),e.jsxs("text",{x:w,y:m+20,textAnchor:"middle",fontSize:"10",fill:"#212121",children:[s.relayMsgsPerSec.toFixed(2)," msg/s"]}),e.jsxs("text",{x:w,y:m+34,textAnchor:"middle",fontSize:"9",fill:"#212121",children:[Math.round(s.relayRouted).toLocaleString()," routed"]})]}),L.map(l=>{const y=z[l.name],M=Y(l),Q=(P[l.name]||[]).length;return e.jsxs("g",{children:[e.jsx("circle",{cx:y.x,cy:y.y,r:M,fill:Z(l),stroke:o,strokeWidth:"2.5"}),e.jsx("text",{x:y.x,y:y.y-8,textAnchor:"middle",fontSize:"13",fontWeight:"bold",fill:a,children:l.name}),e.jsx("text",{x:y.x,y:y.y+4,textAnchor:"middle",fontSize:"9",fill:a,children:"controller"}),e.jsxs("text",{x:y.x,y:y.y+18,textAnchor:"middle",fontSize:"10",fill:a,children:["↑",Math.round(l.meshSentLife).toLocaleString()," ↓",Math.round(l.meshRecvLife).toLocaleString()]}),e.jsxs("text",{x:y.x,y:y.y+30,textAnchor:"middle",fontSize:"9",fill:a,children:[Q," child",Q===1?"":"ren"," · ",l.knownPeers," trust"]})]},`c-${l.name}`)}),Object.values(I).map(l=>{const y=l.n,M=Y(y)-6;return e.jsxs("g",{children:[e.jsx("circle",{cx:l.x,cy:l.y,r:M,fill:Z(y),stroke:o,strokeWidth:"1.5"}),e.jsx("text",{x:l.x,y:l.y-6,textAnchor:"middle",fontSize:"11",fontWeight:"bold",fill:a,children:y.name}),e.jsx("text",{x:l.x,y:l.y+6,textAnchor:"middle",fontSize:"9",fill:a,children:"sub-agent"}),e.jsxs("text",{x:l.x,y:l.y+20,textAnchor:"middle",fontSize:"10",fill:a,children:["↑",Math.round(y.meshSentLife).toLocaleString()," 
↓",Math.round(y.meshRecvLife).toLocaleString()]})]},`s-${y.name}`)}),W.length>0&&e.jsxs("g",{children:[e.jsx("text",{x:u/2,y:G-80,textAnchor:"middle",fontSize:"11",fill:c,children:"— Orphan sub-agents (parent CR not found) —"}),W.map((l,y)=>{const M=u/(W.length+1)*(y+1);return e.jsxs("g",{children:[e.jsx("circle",{cx:M,cy:G-40,r:D-8,fill:n?"#616161":"#9e9e9e",stroke:n?"#9e9e9e":"#616161",strokeWidth:"1.5",strokeDasharray:"3,3"}),e.jsx("text",{x:M,y:G-44,textAnchor:"middle",fontSize:"11",fontWeight:"bold",fill:a,children:l.name}),e.jsxs("text",{x:M,y:G-30,textAnchor:"middle",fontSize:"9",fill:a,children:["parent:",l.parent]})]},`o-${l.name}`)})]})]}),e.jsx("div",{style:{marginTop:12},children:e.jsx(d.SimpleTable,{data:g.map(l=>({name:l.name,kind:l.parent?`sub-agent ← ${l.parent}`:"controller",peers:l.knownPeers,sent5m:Math.round(l.meshSent),recv5m:Math.round(l.meshRecv),sentLife:Math.round(l.meshSentLife),recvLife:Math.round(l.meshRecvLife)})).sort((l,y)=>y.sent5m+y.recv5m-(l.sent5m+l.recv5m)),columns:[{label:"Sandbox",getter:l=>l.name},{label:"Role",getter:l=>l.kind},{label:"Peers",getter:l=>l.peers},{label:"↑ Sent (5m)",getter:l=>l.sent5m},{label:"↓ Recv (5m)",getter:l=>l.recv5m},{label:"↑ Sent (life)",getter:l=>l.sentLife.toLocaleString()},{label:"↓ Recv (life)",getter:l=>l.recvLife.toLocaleString()}]})})]})}function Je(){return typeof window<"u"&&window.KARS_GRAFANA_URL||"http://127.0.0.1:3000"}function Xe({policyName:t}){const n=U.useTheme(),i=n.palette.mode==="dark"?"dark":"light",c=n.palette.text.secondary,{data:o,err:a}=V({byModel:[],bySandbox:[],reqRate:[],latency:0},async p=>{var g;const[f,b,k,x]=await Promise.all([_(p,"sum by (model, direction) (increase(kars_tokens_total[1h]))"),_(p,"sum by (sandbox) (increase(kars_tokens_total[1h]))"),_(p,"sum by (model, status) (rate(kars_inference_requests_total[5m]))"),_(p,"histogram_quantile(0.95, sum by (le) 
(rate(kars_inference_latency_seconds_bucket[5m])))")]);return{byModel:f,bySandbox:b,reqRate:k,latency:((g=x[0])==null?void 0:g.value)||0}}),h=`${Je()}/d/kars-ops?kiosk=tv&refresh=10s&theme=${i}`,s=o.byModel.map(p=>({model:p.metric.model||"?",direction:p.metric.direction||"?",tokens:Math.round(p.value).toLocaleString()})).sort((p,f)=>Number(f.tokens.replace(/,/g,""))-Number(p.tokens.replace(/,/g,""))),r=o.bySandbox.map(p=>({sandbox:p.metric.sandbox||"?",tokens:Math.round(p.value).toLocaleString()})).sort((p,f)=>Number(f.tokens.replace(/,/g,""))-Number(p.tokens.replace(/,/g,"")));return e.jsxs(d.SectionBox,{title:`📊 Inference Metrics (policy: ${t})`,children:[e.jsxs("div",{style:{marginBottom:8,fontSize:13,color:c},children:["Live aggregates across all sandboxes routed through this policy class. ",a&&e.jsx("span",{style:{color:"#ef5350"},children:a})]}),e.jsxs("div",{style:{display:"flex",gap:12,marginBottom:12,flexWrap:"wrap"},children:[e.jsxs(d.StatusLabel,{status:"",children:["⏱ p95 latency (5m): ",e.jsxs("b",{children:[(o.latency*1e3).toFixed(0)," ms"]})]}),e.jsxs(d.StatusLabel,{status:"",children:["🧮 Models active: ",e.jsx("b",{children:new Set(o.byModel.map(p=>p.metric.model)).size})]}),e.jsxs(d.StatusLabel,{status:"",children:["🤖 Sandboxes consuming: ",e.jsx("b",{children:r.length})]})]}),e.jsxs("div",{style:{display:"grid",gridTemplateColumns:"1fr 1fr",gap:16},children:[e.jsxs("div",{children:[e.jsx("h4",{style:{margin:"4px 0"},children:"Tokens by model (1h)"}),e.jsx(d.SimpleTable,{data:s,columns:[{label:"Model",getter:p=>p.model},{label:"Dir",getter:p=>p.direction},{label:"Tokens",getter:p=>p.tokens}]})]}),e.jsxs("div",{children:[e.jsx("h4",{style:{margin:"4px 0"},children:"Top consumers (1h)"}),e.jsx(d.SimpleTable,{data:r.slice(0,10),columns:[{label:"Sandbox",getter:p=>p.sandbox},{label:"Tokens",getter:p=>p.tokens}]})]})]}),e.jsx("div",{style:{marginTop:12},children:e.jsx("a",{href:h,target:"_blank",rel:"noopener noreferrer",children:"Open full Grafana 
dashboard ↗"})})]})}function Qe({policyName:t}){const i=U.useTheme().palette.text.secondary,{data:c,err:o}=V({decisions:[],bySandbox:[],latencyP95:0},async r=>{var k;const[p,f,b]=await Promise.all([_(r,"sum by (decision) (increase(kars_agt_policy_evaluations_total[1h]))"),_(r,"sum by (sandbox, decision) (increase(kars_agt_policy_evaluations_total[1h]))"),_(r,"histogram_quantile(0.95, sum by (le) (rate(kars_agt_eval_latency_seconds_bucket[5m])))")]);return{decisions:p,bySandbox:f,latencyP95:((k=b[0])==null?void 0:k.value)||0}}),a=c.decisions.reduce((r,p)=>r+p.value,0)||1,h=c.decisions.map(r=>({decision:r.metric.decision||"?",count:Math.round(r.value).toLocaleString(),pct:(r.value/a*100).toFixed(1)+"%"})),s=c.bySandbox.map(r=>({sandbox:r.metric.sandbox||"?",decision:r.metric.decision||"?",count:Math.round(r.value).toLocaleString()})).sort((r,p)=>Number(p.count.replace(/,/g,""))-Number(r.count.replace(/,/g,"")));return e.jsxs(d.SectionBox,{title:`🛡️ Policy Evaluations (policy: ${t})`,children:[e.jsxs("div",{style:{marginBottom:8,fontSize:13,color:i},children:["AGT policy evaluation counters scoped to all sandboxes referencing this policy. 
",o&&e.jsx("span",{style:{color:"#ef5350"},children:o})]}),e.jsxs("div",{style:{display:"flex",gap:12,marginBottom:12,flexWrap:"wrap"},children:[e.jsxs(d.StatusLabel,{status:"",children:["⏱ p95 eval latency (5m): ",e.jsxs("b",{children:[(c.latencyP95*1e6).toFixed(0)," µs"]})]}),e.jsxs(d.StatusLabel,{status:"",children:["📊 Total evals (1h): ",e.jsx("b",{children:Math.round(a).toLocaleString()})]})]}),e.jsxs("div",{style:{display:"grid",gridTemplateColumns:"1fr 2fr",gap:16},children:[e.jsxs("div",{children:[e.jsx("h4",{style:{margin:"4px 0"},children:"Decision mix (1h)"}),e.jsx(d.SimpleTable,{data:h,columns:[{label:"Decision",getter:r=>r.decision},{label:"Count",getter:r=>r.count},{label:"Share",getter:r=>r.pct}]})]}),e.jsxs("div",{children:[e.jsx("h4",{style:{margin:"4px 0"},children:"Top deniers/allowers (1h)"}),e.jsx(d.SimpleTable,{data:s.slice(0,15),columns:[{label:"Sandbox",getter:r=>r.sandbox},{label:"Decision",getter:r=>r.decision},{label:"Count",getter:r=>r.count}]})]})]})]})}function Ze(){const n=U.useTheme().palette.text.secondary,{data:i,err:c}=V({peers:[],auditEntries:[],bundleHealth:[]},async s=>{const[r,p,f]=await Promise.all([_(s,"kars_agt_known_agents"),_(s,"kars_agt_audit_entries_total"),_(s,"kars_policy_bundle_healthy")]);return{peers:r,auditEntries:p,bundleHealth:f}}),o=i.peers.map(s=>({sandbox:s.metric.sandbox||"?",knownPeers:s.value})).sort((s,r)=>r.knownPeers-s.knownPeers),a=i.peers.reduce((s,r)=>s+r.value,0),h=i.auditEntries.reduce((s,r)=>s+r.value,0);return e.jsxs(d.SectionBox,{title:"🔐 Trust Graph Metrics",children:[e.jsxs("div",{style:{marginBottom:8,fontSize:13,color:n},children:["AGT trust graph: peers known per sandbox + tamper-evident audit log size. 
",c&&e.jsx("span",{style:{color:"#ef5350"},children:c})]}),e.jsxs("div",{style:{display:"flex",gap:12,marginBottom:12,flexWrap:"wrap"},children:[e.jsxs(d.StatusLabel,{status:"",children:["🤝 Total known peers: ",e.jsx("b",{children:a})]}),e.jsxs(d.StatusLabel,{status:"",children:["📜 Audit entries: ",e.jsx("b",{children:Math.round(h).toLocaleString()})]}),e.jsxs(d.StatusLabel,{status:"",children:["📦 Healthy bundles: ",e.jsxs("b",{children:[i.bundleHealth.filter(s=>s.value>0).length,"/",i.bundleHealth.length]})]})]}),e.jsx(d.SimpleTable,{data:o,columns:[{label:"Sandbox",getter:s=>s.sandbox},{label:"Known peers",getter:s=>s.knownPeers}]})]})}function ae(t){return t>=90?"error":t>=70?"warning":t>0?"success":""}function F(t){return t>=1e9?(t/1e9).toFixed(2)+"B":t>=1e6?(t/1e6).toFixed(2)+"M":t>=1e3?(t/1e3).toFixed(1)+"K":Math.round(t).toLocaleString()}function ie({used:t,total:n,height:i=14}){const o=U.useTheme().palette.mode==="dark",a=o?"#333":"#eee",h=o?"#eee":"#333",s=n>0?Math.min(100,t/n*100):0,r=s>=90?"#c62828":s>=70?"#ef6c00":"#2e7d32";return e.jsxs("div",{style:{background:a,borderRadius:4,height:i,overflow:"hidden",position:"relative"},children:[e.jsx("div",{style:{background:r,height:"100%",width:`${s}%`,transition:"width .3s ease"}}),e.jsxs("div",{style:{position:"absolute",inset:0,display:"flex",alignItems:"center",justifyContent:"center",fontSize:11,fontWeight:600,color:s>50?"#fff":h},children:[s.toFixed(1),"%"]})]})}function Ce({sandboxes:t,inferencePolicies:n}){const c=U.useTheme().palette.text.secondary,{data:o,err:a}=V([],async g=>_(g,"sum by (sandbox) (increase(kars_tokens_total[24h]))"),1e4),h={};for(const g of o)h[g.metric.sandbox||"?"]=g.value;const s={};for(const g of n)s[g.metadata.name]=g;const r=t.map(g=>{var m,T,$,D,O;const P=((T=(((m=g.jsonData)==null?void 0:m.spec)||g.spec||{}).inferenceRef)==null?void 0:T.name)||"",u=s[P],v=((O=(D=(($=u==null?void 0:u.jsonData)==null?void 0:$.spec)||(u==null?void 0:u.spec)||{})==null?void 
0:D.tokenBudget)==null?void 0:O.dailyTokens)||0,w=h[g.metadata.name]||0;return{name:g.metadata.name,policy:P||"—",budget:v,used:w,pct:v>0?w/v*100:0}}),p=r.reduce((g,L)=>g+L.budget,0),f=r.reduce((g,L)=>g+L.used,0),b=p>0?f/p*100:0,k=r.filter(g=>g.pct>=70).length,x=r.filter(g=>g.pct>=100).length;return e.jsxs(d.SectionBox,{title:"💰 Token Budget (24h)",children:[e.jsxs("div",{style:{marginBottom:12,fontSize:13,color:c},children:["Aggregate daily budget across all InferencePolicy CRs vs. actual consumption pulled from Prometheus. ",a&&e.jsx("span",{style:{color:"#ef5350"},children:a})]}),e.jsxs("div",{style:{display:"grid",gridTemplateColumns:"repeat(auto-fit, minmax(220px, 1fr))",gap:"1rem",marginBottom:16},children:[e.jsx(A,{label:"Fleet budget (24h)",value:F(p)}),e.jsx(A,{label:"Fleet consumed (24h)",value:F(f),tone:ae(b)}),e.jsx(A,{label:"Fleet utilization",value:`${b.toFixed(1)}%`,tone:ae(b)}),e.jsx(A,{label:"Sandboxes ≥70% used",value:k,tone:k>0?"warning":""}),e.jsx(A,{label:"Sandboxes over budget",value:x,tone:x>0?"error":""})]}),e.jsx("div",{style:{marginBottom:8,fontSize:13,fontWeight:600},children:"Fleet utilization"}),e.jsx(ie,{used:f,total:p,height:20}),e.jsx("div",{style:{marginTop:16},children:e.jsx(d.SimpleTable,{data:r.sort((g,L)=>L.pct-g.pct).map(g=>({name:g.name,policy:g.policy,budget:F(g.budget),used:F(g.used),bar:g})),columns:[{label:"Sandbox",getter:g=>g.name},{label:"Policy",getter:g=>g.policy},{label:"Budget",getter:g=>g.budget},{label:"Used",getter:g=>g.used},{label:"Utilization",getter:g=>e.jsx("div",{style:{width:160},children:e.jsx(ie,{used:g.bar.used,total:g.bar.budget})})}]})})]})}function Re({sandboxName:t,inferenceRefName:n}){var L,P,u,v,w,m;const c=U.useTheme().palette.text.secondary,[o]=j.inferencepolicies.useList(),a=(o||[]).find(T=>T.metadata.name===n),h=((L=a==null?void 0:a.jsonData)==null?void 0:L.spec)||(a==null?void 0:a.spec)||{},s=((P=h==null?void 0:h.tokenBudget)==null?void 0:P.dailyTokens)||0,r=((u=h==null?void 
0:h.tokenBudget)==null?void 0:u.perRequestTokens)||0,{data:p}=V(0,async T=>{var D;return((D=(await _(T,`sum(increase(kars_tokens_total{sandbox="${t}"}[24h]))`))[0])==null?void 0:D.value)||0},1e4),{data:f}=V([],async T=>_(T,`sum by (direction) (increase(kars_tokens_total{sandbox="${t}"}[24h]))`),1e4),b=s>0?p/s*100:0,k=Math.max(0,s-p),x=((v=f.find(T=>T.metric.direction==="input"))==null?void 0:v.value)||0,g=((w=f.find(T=>T.metric.direction==="output"))==null?void 0:w.value)||0;return e.jsxs(d.SectionBox,{title:`💰 Token Budget — ${t}`,children:[!n&&e.jsxs("div",{style:{color:c,fontSize:13},children:["No ",e.jsx("code",{children:"inferenceRef"})," set on this sandbox; no enforced budget."]}),n&&!a&&e.jsxs("div",{style:{color:"#ef6c00",fontSize:13},children:["InferencePolicy ",e.jsx("code",{children:n})," not found."]}),e.jsxs("div",{style:{display:"grid",gridTemplateColumns:"repeat(auto-fit, minmax(180px, 1fr))",gap:"0.75rem",marginBottom:12},children:[e.jsx(A,{label:"Daily budget",value:s>0?F(s):"unlimited"}),e.jsx(A,{label:"Consumed (24h)",value:F(p),tone:ae(b)}),e.jsx(A,{label:"Remaining",value:s>0?F(k):"—",tone:ae(b)}),e.jsx(A,{label:"Per-request cap",value:r>0?F(r):"unlimited"}),e.jsx(A,{label:"Input tokens",value:F(x)}),e.jsx(A,{label:"Output tokens",value:F(g)})]}),s>0&&e.jsxs("div",{children:[e.jsx("div",{style:{marginBottom:6,fontSize:13,fontWeight:600},children:"Utilization"}),e.jsx(ie,{used:p,total:s,height:22})]}),n&&e.jsxs("div",{style:{marginTop:12,fontSize:12,color:c},children:["Policy: ",e.jsx(d.Link,{routeName:"inferencepolicies-detail",params:{namespace:((m=a==null?void 0:a.metadata)==null?void 0:m.namespace)||"default",name:n},children:n})]})]})}const et=j.karssreactions;function tt(t,n){let i=t||"Proposed",c="warning";switch(t){case"Recovered":c="success";break;case"Applied":c=n==="Approved"?"":"warning",i="Applied · waiting recovery";break;case"Failed":case"Rejected":case"Expired":c="error";break;case void 
0:case"":case"Proposed":c=n==="Approved"?"":"warning",i=n==="Approved"?"Approved · queued":"Proposed";break}return e.jsx(d.StatusLabel,{status:c,children:i})}function at({item:t,busy:n,setBusy:i}){const[c,o]=q.useState(null),a=async(h,s)=>{i(!0),o(null);try{await t.patch({spec:{approval:{state:h,...s?{note:s}:{}}}})}catch(r){o((r==null?void 0:r.message)??String(r))}finally{i(!1)}};return e.jsxs(K.Stack,{direction:"row",spacing:1,alignItems:"center",children:[e.jsx(K.Button,{variant:"contained",color:"success",size:"small",disabled:n,onClick:()=>a("Approved"),children:"Approve"}),e.jsx(K.Button,{variant:"outlined",color:"error",size:"small",disabled:n,onClick:()=>{const h=window.prompt("Optional reason (audit-visible)")??void 0;a("Rejected",h||void 0)},children:"Reject"}),c&&e.jsxs("span",{style:{color:"var(--mui-palette-error-main)",fontSize:12},children:["✗ ",c]})]})}function rt({item:t}){const i=E(t).action??{},c=i.params??{};return e.jsxs("div",{style:{fontSize:13},children:[e.jsx("div",{style:{fontWeight:600},children:i.type??"?"}),e.jsxs("div",{style:{color:"var(--mui-palette-text-secondary)"},children:[c.namespace??"?"," / ",c.name??"?"]})]})}function st({item:t}){const n=E(t),i=n.diagnosis??n.rationale??"—";return e.jsxs("div",{style:{fontSize:13,maxWidth:400,color:"var(--mui-palette-text-secondary)"},children:[String(i).slice(0,200),String(i).length>200?"…":""]})}function lt({item:t}){var p,f,b,k,x;const n=E(t),i=N(t),c=(p=n.approval)==null?void 0:p.state,o=i.phase,[a,h]=q.useState(!1),s=(!o||o==="Proposed")&&(!c||c==="Pending"),r=o==="Applied"||o==="Proposed"&&c==="Approved";return e.jsxs("tr",{style:{borderTop:"1px solid var(--mui-palette-divider)"},children:[e.jsxs("td",{style:{padding:8},children:[e.jsx(d.Link,{routeName:"karssreactions-detail",params:{namespace:((f=t.metadata)==null?void 0:f.namespace)??"kars-sre",name:((b=t.metadata)==null?void 0:b.name)??""},children:(k=t.metadata)==null?void 
0:k.name}),e.jsx("div",{style:{fontSize:11,color:"var(--mui-palette-text-secondary)"},children:te((x=t.metadata)==null?void 0:x.creationTimestamp)})]}),e.jsx("td",{style:{padding:8},children:e.jsx(rt,{item:t})}),e.jsx("td",{style:{padding:8},children:e.jsx(st,{item:t})}),e.jsx("td",{style:{padding:8},children:tt(o,c)}),e.jsx("td",{style:{padding:8},children:s?e.jsx(at,{item:t,busy:a,setBusy:h}):r?e.jsx("span",{style:{fontSize:12,color:"var(--mui-palette-text-secondary)"},children:"executing…"}):e.jsx("span",{style:{fontSize:12,color:"var(--mui-palette-text-secondary)"},children:"—"})})]})}function ce({title:t,emoji:n,items:i,emptyText:c}){return e.jsx(d.SectionBox,{title:`${n} ${t} (${i.length})`,children:i.length===0?e.jsx("div",{style:{padding:16,color:"var(--mui-palette-text-secondary)",fontSize:13},children:c}):e.jsxs("table",{style:{width:"100%",borderCollapse:"collapse"},children:[e.jsx("thead",{children:e.jsxs("tr",{style:{fontSize:12,color:"var(--mui-palette-text-secondary)"},children:[e.jsx("th",{style:{padding:8,textAlign:"left"},children:"Action ID"}),e.jsx("th",{style:{padding:8,textAlign:"left"},children:"Target"}),e.jsx("th",{style:{padding:8,textAlign:"left"},children:"Diagnosis"}),e.jsx("th",{style:{padding:8,textAlign:"left"},children:"Phase"}),e.jsx("th",{style:{padding:8,textAlign:"left"},children:"Action"})]})}),e.jsx("tbody",{children:i.map(o=>{var a,h;return e.jsx(lt,{item:o},((a=o.metadata)==null?void 0:a.uid)??((h=o.metadata)==null?void 0:h.name))})})]})})}function nt({sandboxes:t}){if(!t)return e.jsx(d.SectionBox,{title:"📊 Cluster Health",children:e.jsx("div",{style:{padding:16,fontSize:13},children:"Loading…"})});const n={};let i=0;for(const a of t){const h=N(a).phase??"Unknown";n[h]=(n[h]??0)+1,(N(a).conditions??[]).some(r=>r.type==="Degraded"&&r.status==="True")&&(i+=1)}const c=t.length,o=n.Running??0;return e.jsx(d.SectionBox,{title:"📊 Cluster Health",children:e.jsxs("div",{style:{display:"grid",gridTemplateColumns:"repeat(4, 
1fr)",gap:16,padding:8},children:[e.jsx(A,{label:"Sandboxes total",value:c}),e.jsx(A,{label:"Running",value:o,tone:o===c?"success":"warning"}),e.jsx(A,{label:"Degraded",value:i,tone:i===0?"success":"error"}),e.jsx(A,{label:"Other phases",value:c-o-i,tone:c-o-i===0?"success":"warning"})]})})}const ot=new Set(["FailedCreate","BackOff","FailedScheduling","Failed","ImagePullBackOff","ErrImagePull","CrashLoopBackOff","OOMKilling","Evicted","FailedMount"]),it=new Set(["kube-system","kube-public","kube-node-lease","kars-system","kars-sre","agentmesh","default"]);function ct(){const t=require("@kinvolk/headlamp-plugin/lib/K8s/event").default,[n]=t.useList();if(!n)return e.jsx(d.SectionBox,{title:"🚨 Active Incidents (last 15 min)",children:e.jsx("div",{style:{padding:16,fontSize:13},children:"Loading events…"})});const i=Date.now()-900*1e3,c=n.filter(o=>{var a;return((a=o.jsonData)==null?void 0:a.type)==="Warning"}).filter(o=>{var a;return ot.has(((a=o.jsonData)==null?void 0:a.reason)??"")}).filter(o=>{var h;const a=((h=o.metadata)==null?void 0:h.namespace)??"";return a.startsWith("kars-")&&!it.has(a)}).filter(o=>{var h,s;const a=((h=o.jsonData)==null?void 0:h.lastTimestamp)||((s=o.jsonData)==null?void 0:s.eventTime);if(!a)return!1;try{return new Date(a).getTime()>=i}catch{return!1}}).sort((o,a)=>{var r,p,f,b;const h=new Date(((r=o.jsonData)==null?void 0:r.lastTimestamp)||((p=o.jsonData)==null?void 0:p.eventTime)||0).getTime();return new Date(((f=a.jsonData)==null?void 0:f.lastTimestamp)||((b=a.jsonData)==null?void 0:b.eventTime)||0).getTime()-h}).slice(0,25);return e.jsx(d.SectionBox,{title:`🚨 Active Incidents · last 15 min (${c.length})`,children:c.length===0?e.jsx("div",{style:{padding:16,color:"var(--mui-palette-text-secondary)",fontSize:13},children:"No recent failure-class events in kars-* user 
namespaces."}):e.jsxs("table",{style:{width:"100%",borderCollapse:"collapse"},children:[e.jsx("thead",{children:e.jsxs("tr",{style:{fontSize:12,color:"var(--mui-palette-text-secondary)"},children:[e.jsx("th",{style:{padding:8,textAlign:"left"},children:"Reason"}),e.jsx("th",{style:{padding:8,textAlign:"left"},children:"Target"}),e.jsx("th",{style:{padding:8,textAlign:"left"},children:"Message"}),e.jsx("th",{style:{padding:8,textAlign:"left"},children:"Age"})]})}),e.jsx("tbody",{children:c.map(o=>{var r,p,f,b,k,x,g;const a=((r=o.metadata)==null?void 0:r.namespace)??"?",h=((p=o.jsonData)==null?void 0:p.involvedObject)??{},s=((f=o.jsonData)==null?void 0:f.lastTimestamp)||((b=o.jsonData)==null?void 0:b.eventTime)||"";return e.jsxs("tr",{style:{borderTop:"1px solid var(--mui-palette-divider)"},children:[e.jsx("td",{style:{padding:8},children:e.jsx(K.Chip,{label:((k=o.jsonData)==null?void 0:k.reason)??"?",size:"small",color:"warning",variant:"outlined"})}),e.jsxs("td",{style:{padding:8,fontSize:12},children:[e.jsxs("div",{style:{fontWeight:600},children:[h.kind,"/",h.name]}),e.jsx("div",{style:{color:"var(--mui-palette-text-secondary)"},children:a})]}),e.jsx("td",{style:{padding:8,fontSize:12,maxWidth:480,color:"var(--mui-palette-text-secondary)"},children:String(((x=o.jsonData)==null?void 0:x.message)??"").slice(0,240)}),e.jsx("td",{style:{padding:8,fontSize:11,color:"var(--mui-palette-text-secondary)"},children:te(s)})]},(g=o.metadata)==null?void 0:g.uid)})})]})})}function dt(){const[t]=et.useList(),[n]=ne.useList(),i=t??[],o=Date.now()-3600*1e3,a=i.filter(r=>{var b;const p=N(r).phase,f=(b=E(r).approval)==null?void 0:b.state;return(!p||p==="Proposed")&&(!f||f==="Pending")}),h=i.filter(r=>{var b;const p=N(r).phase,f=(b=E(r).approval)==null?void 0:b.state;return p==="Applied"||p==="Proposed"&&f==="Approved"}),s=i.filter(r=>{var b;const p=N(r).phase,f=(b=r.metadata)==null?void 
0:b.creationTimestamp;if(!p||!["Recovered","Failed","Rejected","Expired"].includes(p))return!1;if(!f)return!0;try{return new Date(f).getTime()>=o}catch{return!1}}).sort((r,p)=>{var f,b;return new Date(((f=p.metadata)==null?void 0:f.creationTimestamp)??0).getTime()-new Date(((b=r.metadata)==null?void 0:b.creationTimestamp)??0).getTime()}).slice(0,10);return e.jsxs(e.Fragment,{children:[e.jsx(ce,{title:"Pending Approval",emoji:"🔴",items:a,emptyText:"No actions awaiting your approval — the cluster is quiet right now."}),e.jsx(ce,{title:"In-flight",emoji:"🔄",items:h,emptyText:"No actions currently executing."}),e.jsx(nt,{sandboxes:n}),e.jsx(ct,{}),e.jsx(ce,{title:"Recent (last hour)",emoji:"✅",items:s,emptyText:"No actions completed in the last hour."})]})}const re=18789;function ht(){const[t,n]=q.useState("local"),i=`http://localhost:${re}`,c=`/clusters/kind-kars-dev/api/v1/namespaces/kars-sre/services/sre:${re}/proxy/`,o=t==="local"?i:c;return e.jsx(d.SectionBox,{title:"💬 Chat with kars-sre",children:e.jsxs("div",{style:{padding:8},children:[e.jsxs(K.Stack,{direction:"row",spacing:2,alignItems:"center",sx:{mb:1},children:[e.jsxs(K.Tabs,{value:t,onChange:(a,h)=>n(h),sx:{minHeight:32},children:[e.jsx(K.Tab,{value:"local",label:`Local port-forward (${re})`,sx:{minHeight:32,fontSize:12}}),e.jsx(K.Tab,{value:"proxy",label:"Apiserver service proxy",sx:{minHeight:32,fontSize:12}})]}),e.jsx(K.Button,{size:"small",href:o,target:"_blank",rel:"noreferrer noopener",variant:"outlined",children:"Open in new tab"})]}),e.jsx("div",{style:{fontSize:12,color:"var(--mui-palette-text-secondary)",marginBottom:8},children:t==="local"?e.jsxs(e.Fragment,{children:["Requires: ",e.jsxs("code",{children:["kars connect sre --web --port ",re]})," in another terminal. Hermes' WebUI binds to",e.jsx("code",{children:"localhost"})," on the operator's laptop."]}):e.jsx(e.Fragment,{children:"Routes through the cluster apiserver service proxy. 
Works without port-forward, but Hermes asset paths may need extra config."})}),e.jsx("iframe",{src:o,title:"kars-sre WebUI",style:{width:"100%",minHeight:"calc(100vh - 320px)",border:"1px solid var(--mui-palette-divider)",borderRadius:4,background:"var(--mui-palette-background-default)"}})]})})}})); diff --git a/tools/headlamp-plugin/src/index.tsx b/tools/headlamp-plugin/src/index.tsx index 010ad992..9e56993e 100644 --- a/tools/headlamp-plugin/src/index.tsx +++ b/tools/headlamp-plugin/src/index.tsx @@ -45,6 +45,18 @@ import { StatusLabel, } from "@kinvolk/headlamp-plugin/lib/CommonComponents"; import { useTheme } from "@mui/material/styles"; +import { + Button, + Chip, + Stack, + Tab, + Tabs, + TextField, + Dialog, + DialogTitle, + DialogContent, + DialogActions, +} from "@mui/material"; import * as React from "react"; const GROUP = "kars.azure.com"; @@ -69,6 +81,7 @@ const KARS_CRDS: CrdDescriptor[] = [ { plural: "karspairings", singular: "karspairing", kind: "KarsPairing", label: "Pairings" }, { plural: "karsevals", singular: "karseval", kind: "KarsEval", label: "Evals", phaseField: "phase" }, { plural: "egressapprovals", singular: "egressapproval", kind: "EgressApproval", label: "Egress Approvals", phaseField: "phase" }, + { plural: "karssreactions", singular: "karssreaction", kind: "KarsSREAction", label: "SRE Actions", phaseField: "phase" }, ]; const CRD_CLASSES: Record = Object.fromEntries( @@ -154,6 +167,65 @@ for (const crd of KARS_CRDS) { }); } +// ────────────────────────────────────────────────────────────────────── +// SRE Console — primary UX for the kars-sre operator +// ────────────────────────────────────────────────────────────────────── +// +// Pinned to its own top-level sidebar branch so the SRE engineer has +// a dedicated landing page rather than browsing through the 11 CRD +// list pages every shift. 
Three sub-entries: +// +// /kars/sre — Console (pending approvals + in-flight + recent) +// /kars/sre/chat — Embedded Hermes WebUI iframe for the sre sandbox +// /kars/karssreactions — "Actions" sidebar entry; links to the existing +// KarsSREAction CRD list page (no separate +// route is registered here) + +registerSidebarEntry({ + parent: "kars", + name: "kars-sre-root", + label: "SRE", + icon: "mdi:stethoscope", + url: "/kars/sre", +}); + +registerSidebarEntry({ + parent: "kars-sre-root", + name: "kars-sre-console", + label: "Console", + url: "/kars/sre", +}); + +registerRoute({ + path: "/kars/sre", + sidebar: "kars-sre-console", + name: "kars-sre-console", + exact: true, + component: () => , +}); + +registerSidebarEntry({ + parent: "kars-sre-root", + name: "kars-sre-chat", + label: "Chat", + url: "/kars/sre/chat", +}); + +registerRoute({ + path: "/kars/sre/chat", + sidebar: "kars-sre-chat", + name: "kars-sre-chat", + exact: true, + component: () => , +}); + +registerSidebarEntry({ + parent: "kars-sre-root", + name: "kars-sre-actions", + label: "Actions", + url: "/kars/karssreactions", +}); + // ────────────────────────────────────────────────────────────────────── // Helpers // ────────────────────────────────────────────────────────────────────── @@ -2032,4 +2104,559 @@ function SandboxBudgetCard({ sandboxName, inferenceRefName }: { sandboxName: str )} ); -} \ No newline at end of file +} +// ────────────────────────────────────────────────────────────────────── +// SRE Console +// ────────────────────────────────────────────────────────────────────── +// +// Primary landing page for the kars-sre operator. Mirrors what a +// human SRE engineer wants on shift open: +// +// 1. 🔴 Pending — KarsSREActions awaiting their decision. Inline +// Approve / Reject buttons PATCH the CR's .spec.approval.state +// so the operator never leaves the page to drive the apply path. +// 2. 🔄 In-flight — actions the controller is currently executing +// or watching for recovery. 
Visible phase + age so a stuck +// Applied (waiting for Recovered) is obvious. +// 3. ✅ Recent — terminal-phase actions from the last hour for +// post-incident review. +// 4. 📊 Cluster health — sandbox phase counts + Degraded-condition count +// (NOTE(review): confirm parity with the `kars sre diagnose` tool). +// 5. 🚨 Active incidents — failure-class events from kars-* +// namespaces in the last 15 min (same filter the proactive +// watcher uses). +// +// All cards live-update via the standard headlamp useList() hook +// (which long-polls + watches), so phase walks Proposed → Approved +// → Applied → Recovered visibly without F5. + +const KarsSREActionClass = CRD_CLASSES.karssreactions!; + +function srePhaseChip(phase: string | undefined, approval: string | undefined) { + // Combined phase+approval rendering. Phase wins, but a Proposed/unset + // phase whose approval is "Approved" is relabeled "Approved · queued" + // and drops the warning tone, since the operator has already acted. + let label = phase || "Proposed"; + let kind: StatusKind = "warning"; + switch (phase) { + case "Recovered": + kind = "success"; + break; + case "Applied": + kind = approval === "Approved" ? "" : "warning"; + label = "Applied · waiting recovery"; + break; + case "Failed": + case "Rejected": + case "Expired": + kind = "error"; + break; + case undefined: + case "": + case "Proposed": + // Warn until the operator approves; once approved, show it as queued + kind = approval === "Approved" ? "" : "warning"; + label = approval === "Approved" ? "Approved · queued" : "Proposed"; + break; + } + return {label}; +} + +function ApproveRejectButtons({ + item, + busy, + setBusy, +}: { + item: KubeObject; + busy: boolean; + setBusy: (b: boolean) => void; +}) { + const [error, setError] = React.useState(null); + + const patch = async (state: "Approved" | "Rejected", note?: string) => { + setBusy(true); + setError(null); + try { + // Server-side merge patch. 
The CR's .spec.approval is a + // small object (state + optional note); the patch sets state (and + // note when given) and leaves any other existing keys untouched. 
+ await (item as any).patch({ + spec: { approval: { state, ...(note ? { note } : {}) } }, + }); + } catch (e: any) { + setError(e?.message ?? String(e)); + } finally { + setBusy(false); + } + }; + + return ( + + + + {error && ( + + ✗ {error} + + )} + + ); +} + +function ActionTargetCell({ item }: { item: KubeObject }) { + const spec = getSpec(item); + const action = spec.action ?? {}; + const params = action.params ?? {}; + return ( +
+
{action.type ?? "?"}
+
+ {params.namespace ?? "?"} / {params.name ?? "?"} +
+
+ ); +} + +function ActionDiagnosisCell({ item }: { item: KubeObject }) { + const spec = getSpec(item); + const diag = spec.diagnosis ?? spec.rationale ?? "—"; + return ( +
+ {String(diag).slice(0, 200)} + {String(diag).length > 200 ? "…" : ""} +
+ ); +} + +function SREActionRow({ item }: { item: KubeObject }) { + const spec = getSpec(item); + const status = getStatus(item); + const approval = spec.approval?.state as string | undefined; + const phase = status.phase as string | undefined; + const [busy, setBusy] = React.useState(false); + const isPending = + (!phase || phase === "Proposed") && + (!approval || approval === "Pending"); + const isInFlight = + phase === "Applied" || (phase === "Proposed" && approval === "Approved"); + return ( + + + + {item.metadata?.name} + +
+ {formatAge(item.metadata?.creationTimestamp)} +
+ + + + + + + + {srePhaseChip(phase, approval)} + + {isPending ? ( + + ) : isInFlight ? ( + + executing… + + ) : ( + + — + + )} + + + ); +} + +function SREActionCard({ + title, + emoji, + items, + emptyText, +}: { + title: string; + emoji: string; + items: KubeObject[]; + emptyText: string; +}) { + return ( + + {items.length === 0 ? ( +
+ {emptyText} +
+ ) : ( + + + + + + + + + + + + {items.map(item => ( + + ))} + +
Action IDTargetDiagnosisPhaseAction
+ )} +
+ ); +} + +function SREClusterHealthCard({ sandboxes }: { sandboxes: KubeObject[] | null }) { + if (!sandboxes) { + return ( + +
Loading…
+
+ ); + } + const byPhase: Record = {}; + let degraded = 0; + for (const s of sandboxes) { + const phase = getStatus(s).phase ?? "Unknown"; + byPhase[phase] = (byPhase[phase] ?? 0) + 1; + const conds = (getStatus(s).conditions ?? []) as any[]; + if (conds.some(c => c.type === "Degraded" && c.status === "True")) degraded += 1; + } + const total = sandboxes.length; + const running = byPhase.Running ?? 0; + return ( + +
+ + + + +
+
+ ); +} + +const INCIDENT_REASONS = new Set([ + "FailedCreate", + "BackOff", + "FailedScheduling", + "Failed", + "ImagePullBackOff", + "ErrImagePull", + "CrashLoopBackOff", + "OOMKilling", + "Evicted", + "FailedMount", +]); + +const PROTECTED_NAMESPACES = new Set([ + "kube-system", + "kube-public", + "kube-node-lease", + "kars-system", + "kars-sre", + "agentmesh", + "default", +]); + +function SREActiveIncidentsCard() { + // Use the v1 Event API class. Headlamp ships it as part of its + // core K8s classes — we resolve via require to avoid a top-of-file + // import cycle with the rest of the plugin (Event is heavy). + const Event = require("@kinvolk/headlamp-plugin/lib/K8s/event").default; + const [events] = (Event as any).useList() as [KubeObject[] | null]; + if (!events) { + return ( + +
Loading events…
+
+ ); + } + const cutoff = Date.now() - 15 * 60 * 1000; + const filtered = events + .filter((e: any) => e.jsonData?.type === "Warning") + .filter((e: any) => INCIDENT_REASONS.has(e.jsonData?.reason ?? "")) + .filter((e: any) => { + const ns = e.metadata?.namespace ?? ""; + return ns.startsWith("kars-") && !PROTECTED_NAMESPACES.has(ns); + }) + .filter((e: any) => { + const ts = e.jsonData?.lastTimestamp || e.jsonData?.eventTime; + if (!ts) return false; + try { + return new Date(ts).getTime() >= cutoff; + } catch { + return false; + } + }) + .sort((a: any, b: any) => { + const at = new Date(a.jsonData?.lastTimestamp || a.jsonData?.eventTime || 0).getTime(); + const bt = new Date(b.jsonData?.lastTimestamp || b.jsonData?.eventTime || 0).getTime(); + return bt - at; + }) + .slice(0, 25); + return ( + + {filtered.length === 0 ? ( +
+ No recent failure-class events in kars-* user namespaces. +
+ ) : ( + + + + + + + + + + + {filtered.map((e: any) => { + const ns = e.metadata?.namespace ?? "?"; + const obj = e.jsonData?.involvedObject ?? {}; + const ts = + e.jsonData?.lastTimestamp || e.jsonData?.eventTime || ""; + return ( + + + + + + + ); + })} + +
ReasonTargetMessageAge
+ + +
+ {obj.kind}/{obj.name} +
+
{ns}
+
+ {String(e.jsonData?.message ?? "").slice(0, 240)} + + {formatAge(ts)} +
+ )} +
+ ); +} + +function SREConsole() { + const [actions] = (KarsSREActionClass as any).useList() as [KubeObject[] | null]; + const [sandboxes] = (KarsSandboxClass as any).useList() as [KubeObject[] | null]; + const safeActions = actions ?? []; + const now = Date.now(); + const recentCutoff = now - 60 * 60 * 1000; // 1 hour + + const pending = safeActions.filter((a: any) => { + const phase = getStatus(a).phase; + const approval = getSpec(a).approval?.state; + return (!phase || phase === "Proposed") && (!approval || approval === "Pending"); + }); + + const inflight = safeActions.filter((a: any) => { + const phase = getStatus(a).phase; + const approval = getSpec(a).approval?.state; + return phase === "Applied" || (phase === "Proposed" && approval === "Approved"); + }); + + const recent = safeActions + .filter((a: any) => { + const phase = getStatus(a).phase; + const ts = a.metadata?.creationTimestamp; + if (!phase || !["Recovered", "Failed", "Rejected", "Expired"].includes(phase)) return false; + if (!ts) return true; + try { + return new Date(ts).getTime() >= recentCutoff; + } catch { + return false; + } + }) + .sort( + (a: any, b: any) => + new Date(b.metadata?.creationTimestamp ?? 0).getTime() - + new Date(a.metadata?.creationTimestamp ?? 0).getTime(), + ) + .slice(0, 10); + + return ( + <> + + + + + + + ); +} + +// ────────────────────────────────────────────────────────────────────── +// SRE Chat — embedded Hermes WebUI for the sre sandbox +// ────────────────────────────────────────────────────────────────────── +// +// Routes through the apiserver service proxy: +// /api/v1/namespaces/kars-sre/services/sre:18789/proxy/ +// +// Caveat: Hermes' WebUI was authored for direct port-forward access +// and may use absolute paths for its bundle assets. When the iframe +// blank-loads, the page shows a fallback hint with the canonical +// `kars connect sre --web` command + a "Open in new tab" link. 
+// +// In the local-k8s demo path the operator runs `kars sre talk` (which +// shells `kars connect sre --web --port 18790` — NOTE(review): differs +// from HERMES_GATEWAY_PORT below; confirm). The iframe loads the localhost +// port-forward by default; the tabs switch it to the apiserver-proxy URL. + +const HERMES_GATEWAY_PORT = 18789; + +function SREChat() { + // Default to localhost (port-forward path); the operator can switch to + // the apiserver service proxy via the tabs. Headlamp itself runs in the operator's + // browser; the apiserver proxy URL only resolves when Headlamp's + // own backend has cluster connectivity (true for both Docker + // Desktop kind cluster and the in-cluster Headlamp deployment). + const [mode, setMode] = React.useState<"local" | "proxy">("local"); + const localUrl = `http://localhost:${HERMES_GATEWAY_PORT}`; + const proxyUrl = `/clusters/kind-kars-dev/api/v1/namespaces/kars-sre/services/sre:${HERMES_GATEWAY_PORT}/proxy/`; + const src = mode === "local" ? localUrl : proxyUrl; + + return ( + +
+ + setMode(v)} + sx={{ minHeight: 32 }} + > + + + + + +
+ {mode === "local" ? ( + <> + Requires:  + kars connect sre --web --port {HERMES_GATEWAY_PORT} +  in another terminal. Hermes' WebUI binds to + localhost on the operator's laptop. + + ) : ( + <> + Routes through the cluster apiserver service proxy. Works without + port-forward, but Hermes asset paths may need extra config. + + )} +
+