From 0d79a2fabf33621e98127bd3a579121dde0985aa Mon Sep 17 00:00:00 2001
From: Patryk Matuszak <pmatusza@redhat.com>
Date: Fri, 3 Jul 2026 11:58:38 +0200
Subject: [PATCH 1/7] ci-doctor: extract per-job evidence packs from prow
 artifacts
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

extract-evidence.py condenses a failed job's downloaded artifacts into
one structured evidence file per job: failed steps, test and phase
failures, journal alerts, and container restart counts — every entry
stamped with its timestamp and merged into a single time-sorted
failure timeline. doctor.sh gains an `evidence` phase that runs it for
every downloaded job, and the lvms-ci/microshift-ci plugins symlink
the shared script.

The evidence pack becomes the single starting point for analysis
agents instead of each agent re-scanning raw artifacts.
---
 plugins/lvms-ci/scripts/extract-evidence.py   |    1 +
 .../microshift-ci/scripts/extract-evidence.py |    1 +
 plugins/shared/scripts/doctor.sh              |   29 +
 plugins/shared/scripts/extract-evidence.py    | 1048 +++++++++++++++++
 4 files changed, 1079 insertions(+)
 create mode 120000 plugins/lvms-ci/scripts/extract-evidence.py
 create mode 120000 plugins/microshift-ci/scripts/extract-evidence.py
 create mode 100644 plugins/shared/scripts/extract-evidence.py
diff --git a/plugins/lvms-ci/scripts/extract-evidence.py b/plugins/lvms-ci/scripts/extract-evidence.py
new file mode 120000
index 00000000..8d7b7223
--- /dev/null
+++ b/plugins/lvms-ci/scripts/extract-evidence.py
@@ -0,0 +1 @@
+../../shared/scripts/extract-evidence.py
\ No newline at end of file
diff --git a/plugins/microshift-ci/scripts/extract-evidence.py b/plugins/microshift-ci/scripts/extract-evidence.py
new file mode 120000
index 00000000..8d7b7223
--- /dev/null
+++ b/plugins/microshift-ci/scripts/extract-evidence.py
@@ -0,0 +1 @@
+../../shared/scripts/extract-evidence.py
\ No newline at end of file
diff --git a/plugins/shared/scripts/doctor.sh b/plugins/shared/scripts/doctor.sh
index 569c7302..a31f58fa 100755
--- a/plugins/shared/scripts/doctor.sh
+++ b/plugins/shared/scripts/doctor.sh
@@ -488,6 +488,33 @@ cmd_refresh() {
     python3 "${SCRIPT_DIR}/create-report.py" "${report_args[@]}" "${releases_arg}"
 }
 
+# ---------------------------------------------------------------------------
+# evidence
+# ---------------------------------------------------------------------------
+
+cmd_evidence() {  # evidence phase: parse options, then run extract-evidence.py in batch mode
+    while [[ ${#} -gt 0 ]]; do
+        case "${1}" in
+            --workdir) WORKDIR="${2}"; shift 2 ;;
+            --component) COMPONENT="${2}"; shift 2 ;;
+            -*) echo "Unknown option: ${1}" >&2; return 1 ;;
+            *) echo "Unknown argument: ${1}" >&2; return 1 ;;  # no positional arguments are accepted
+        esac
+    done
+
+    [[ -z "${COMPONENT}" ]] && { echo "Error: --component is required" >&2; return 1; }
+
+    WORKDIR="${WORKDIR:-/tmp/${COMPONENT}-ci-claude-workdir.$(date +%y%m%d)}"  # default: per component, per day
+
+    if [[ ! -d "${WORKDIR}/jobs" ]]; then  # nothing to process: report on stderr and return success
+        echo "No jobs directory found in ${WORKDIR}, skipping evidence extraction." >&2
+        return 0
+    fi
+
+    echo "=== Extracting evidence ===" >&2
+    python3 "${SCRIPT_DIR}/extract-evidence.py" --batch --workdir "${WORKDIR}"
+}
+
 # ---------------------------------------------------------------------------
 # main
 # ---------------------------------------------------------------------------
@@ -498,6 +525,7 @@ usage() {
     echo "Commands:" >&2
     echo "  prepare  --component C [--workdir DIR] <releases> [--rebase] [--repo ORG/NAME]  Collect jobs, download artifacts, optional source checkout" >&2
     echo "  graphs   --component C [--workdir DIR] [--timezone TZ]       Generate PCP performance graphs" >&2
+    echo "  evidence --component C [--workdir DIR]                        Extract structured evidence from artifacts" >&2
     echo "  finalize --component C [--workdir DIR] <releases>             Aggregate results and generate HTML" >&2
     echo "  refresh  --component C [--workdir DIR] [--ignore KEY1,KEY2,...] <releases>  Regenerate HTML from existing workdir data" >&2
     echo "" >&2
@@ -518,6 +546,7 @@ main() {
     case "${cmd}" in
         prepare)  cmd_prepare "${@}" ;;
         graphs)   cmd_graphs "${@}" ;;
+        evidence) cmd_evidence "${@}" ;;
         finalize) cmd_finalize "${@}" ;;
         refresh)  cmd_refresh "${@}" ;;
         *) echo "Unknown command: ${cmd}" >&2; usage ;;
diff --git a/plugins/shared/scripts/extract-evidence.py b/plugins/shared/scripts/extract-evidence.py
new file mode 100644
index 00000000..ae6e47d9
--- /dev/null
+++ b/plugins/shared/scripts/extract-evidence.py
@@ -0,0 +1,1048 @@
+#!/usr/bin/env python3
+"""
+Deterministic evidence extraction for CI job artifacts.
+
+Extracts structured evidence from Prow job artifacts into a JSON "evidence
+pack" that gives the LLM agent a head-start on analysis.  The agent still
+has access to raw artifacts for deeper investigation.
+
+Shared across components (MicroShift, LVMS, etc.) via symlinks in each
+plugin's scripts/ directory.
+
+Usage:
+    extract-evidence.py --artifacts-dir <DIR> --workdir <WORKDIR>
+    extract-evidence.py --batch --workdir <WORKDIR>
+"""
+
+import glob as glob_mod
+import json
+import os
+import re
+import subprocess
+import sys
+import xml.etree.ElementTree as ET
+from datetime import datetime, timedelta, timezone
+from pathlib import Path
+
+
+SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
+EVIDENCE_VERSION = 1
+
+
+# ---------------------------------------------------------------------------
+# CONFIGURABLE PATTERN LISTS
+# Add new patterns here.  Each is (label, regex).
+# ---------------------------------------------------------------------------
+
+JOURNAL_PATTERNS = [
+    ("oom_kills",           r"oom-kill|oom_kill|Out of memory|invoked oom-killer"),
+    ("panics",              r"panic|BUG:|RIP:|kernel BUG"),
+    ("segfaults",           r"segfault|SIGSEGV"),
+    ("container_restarts",  r"Created container"),
+    ("greenboot_verdicts",  r"greenboot.*Script.*SUCCESS|greenboot.*Script.*FAILURE"),
+    ("etcd_pressure",       r"apply request took too long"),
+    ("probe_failures",      r"Probe failed|probe failed"),
+    ("leader_election",     r"leader election lost|became leader|leaderelection"),
+    ("ovn_binding",         r"timed out waiting for OVS port binding"),
+    ("service_failures",    r"microshift\.service.*Failed|microshift\.service.*canceled"),
+    ("x509_errors",         r"x509: certificate"),
+    ("disk_pressure",       r"eviction manager|DiskPressure|nodefs.available"),
+    ("access_denied",       r"denied|forbidden|DENY"),
+    ("fatal_errors",        r"FATAL|fatal error"),
+    ("apiserver_issues",    r"etcdserver: request timed out|watch chan error|TLS handshake error"),
+]
+
+BOOT_AND_RUN_PATTERNS = [
+    ("failures",        r"\bFAIL\b|FAILED"),
+    ("errors",          r"\berror\b|\bError\b"),
+    ("cancels",         r"canceled|cancelled"),
+    ("timeouts",        r"timeout|timed out|Timing out"),
+    ("signals",         r"Execution terminated by signal|Test execution stopped"),
+    ("service_issues",  r"Job for .* canceled|Job for .* failed"),
+]
+
+INFRA_INDICATORS = [
+    ("scheduling_failure",  r"pod pending for more than|pod has not been scheduled"),
+    ("aws_credential",      r"InvalidClientTokenId|security token.*invalid"),
+    ("aws_quota",           r"RequestLimitExceeded|InstanceLimitExceeded"),
+    ("aws_provision",       r"ProvisioningFailed|CREATE_FAILED"),
+    ("ci_cluster_capacity", r"nodes are available:.*didn't match Pod's node affinity"),
+    ("prow_timeout",        r"Process did not finish before.*timeout"),
+    ("pod_pending_reason",  r"pod_pending|importing_release:pod_pending"),
+]
+
+BUILD_LOG_ERROR_PATTERNS = [
+    ("fatal_errors",    r"^ERRO|^ERROR|^Fatal|^FATAL|panic:"),
+    ("step_failures",   r"Container.*exited with code [^0]|step.*failed"),
+    ("exceptions",      r"Exception:|Traceback \(most recent"),
+]
+
+# Steps whose presence in the artifact tree signals a specific job type.
+JOB_TYPE_STEP_MARKERS = {
+    "openshift-microshift-infra-iso-build": "build",
+    "openshift-microshift-rebase": "rebase",
+    "openshift-microshift-manage-versions-releases": "config",
+}
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+def _read_json(path):
+    """Return the parsed JSON document at *path*, or None if it is unreadable or invalid."""
+    try:
+        return json.loads(Path(path).read_text(encoding="utf-8"))
+    except (OSError, ValueError):  # ValueError covers JSONDecodeError and UnicodeDecodeError
+        return None
+
+
+def _read_lines(path, limit=None):
+    """Return the lines of *path* (at most *limit* when given); [] if it cannot be read."""
+    try:
+        with open(path, errors="replace") as f:
+            # zip() stops at EOF or after *limit* lines, so short files are not padded with "".
+            return f.readlines() if not limit else [ln for _, ln in zip(range(limit), f)]
+    except OSError:
+        return []
+
+
+def _grep_file(path, pattern, max_matches=200):
+    """Return up to *max_matches* 'linenum:text' hits of grep -E *pattern* in *path*; [] on error or timeout."""
+    try:
+        result = subprocess.run(  # grep stays fast on large files (100MB+ journals)
+            ["grep", "-a", "-n", "-E", "-e", pattern, "--", path],  # -a: never collapse to "Binary file matches"
+            capture_output=True, text=True, errors="replace", timeout=30,  # undecodable bytes must not raise
+        )
+        stdout = result.stdout.strip()
+        return stdout.split("\n")[:max_matches] if stdout else []
+    except (subprocess.TimeoutExpired, OSError):
+        return []
+
+
+def _parse_grep_line(line):
+    """Split one 'linenum:text' record from grep -n into (line number, stripped text)."""
+    parsed = re.match(r"^(\d+):(.*)", line)
+    if parsed is None:
+        return 0, line.strip()  # no numeric prefix: report line 0
+    return int(parsed.group(1)), parsed.group(2).strip()
+
+
+def _find_glob(base, pattern):  # sorted list of paths under *base* matching the glob *pattern*
+    return sorted(glob_mod.iglob(os.path.join(base, pattern)))
+
+
+_JOURNAL_TS_RE = re.compile(r"^([A-Z][a-z]{2}\s+\d{1,2}\s+\d{2}:\d{2}:\d{2})")
+
+
+def _parse_journal_timestamp(text):
+    """Return the 'Mon DD HH:MM:SS'-shaped stamp that starts *text*, or "" when there is none."""
+    return next(iter(_JOURNAL_TS_RE.findall(text)), "")
+
+
+def _timestamp_from_epoch(epoch):  # UTC 'YYYY-MM-DDTHH:MM:SSZ' string for a Unix epoch
+    try:  # a missing or zero epoch means "unknown": return "" rather than 1970-01-01T00:00:00Z
+        return datetime.fromtimestamp(int(epoch), tz=timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") if int(epoch) else ""
+    except (ValueError, TypeError, OverflowError, OSError):
+        return ""
+
+
+def _date_from_epoch(epoch):  # UTC 'YYYY-MM-DD' date for a Unix epoch
+    try:  # a missing or zero epoch means "unknown": return "" rather than 1970-01-01
+        return datetime.fromtimestamp(int(epoch), tz=timezone.utc).strftime("%Y-%m-%d") if int(epoch) else ""
+    except (ValueError, TypeError, OverflowError, OSError):
+        return ""
+
+
+# ---------------------------------------------------------------------------
+# Phase A: Metadata
+# ---------------------------------------------------------------------------
+
+def extract_metadata(artifacts_dir):
+    """Collect job-level metadata from finished.json, started.json and prowjob.json.
+
+    Files that are missing or unparsable contribute empty/zero values.
+    """
+    build_id = os.path.basename(artifacts_dir.rstrip("/"))
+
+    finished = _read_json(os.path.join(artifacts_dir, "finished.json")) or {}
+    started = _read_json(os.path.join(artifacts_dir, "started.json")) or {}
+    # A prowjob.json without a "spec" (or with a null one) is treated as an empty spec.
+    spec = (_read_json(os.path.join(artifacts_dir, "prowjob.json")) or {}).get("spec") or {}
+
+    ts_finished = finished.get("timestamp", 0)
+    ts_started = started.get("timestamp", 0)
+    duration_minutes = round((ts_finished - ts_started) / 60) if ts_finished and ts_started else 0
+
+    release = finished.get("revision", "")
+    if not release:
+        repos = (finished.get("metadata") or {}).get("repos") or {}
+        release = next((ref for ref in repos.values() if isinstance(ref, str)), release)
+
+    job_name = spec.get("job", "")
+    job_url = ""
+    # Fall back to the first extra_refs entry; an empty or null list must not raise.
+    refs = spec.get("refs") or next(iter(spec.get("extra_refs") or []), {})
+    if isinstance(refs, dict) and job_name and build_id:
+        org = refs.get("org", "")
+        repo = refs.get("repo", "")
+        pulls = refs.get("pulls") or []
+        if pulls:
+            pr_num = pulls[0].get("number", "")
+            job_url = f"https://prow.ci.openshift.org/view/gs/test-platform-results/pr-logs/pull/{org}_{repo}/{pr_num}/{job_name}/{build_id}"
+        else:
+            job_url = f"https://prow.ci.openshift.org/view/gs/test-platform-results/logs/{job_name}/{build_id}"
+
+    return {
+        "build_id": build_id,
+        "job_name": job_name,
+        "job_url": job_url,
+        "release": release,
+        "started": _timestamp_from_epoch(ts_started),
+        "finished": _timestamp_from_epoch(ts_finished),
+        "finished_date": _date_from_epoch(ts_finished),
+        "finished_epoch": ts_finished,
+        "duration_minutes": duration_minutes,
+        "result": finished.get("result", "UNKNOWN"),
+        "build_cluster": spec.get("cluster", ""),
+    }
+
+
+# ---------------------------------------------------------------------------
+# Phase B: Job type classification
+# ---------------------------------------------------------------------------
+
+def classify_job_type(artifacts_dir):
+    """Classify a job from its artifact tree; return (job_type, test_name, step_name).
+
+    Test variants and steps are visited in sorted order, so the result does not
+    depend on os.listdir() ordering.  The first step holding
+    artifacts/scenario-info ("scenario-e2e") or artifacts/junit/junit_e2e__*.xml
+    ("conformance") decides immediately; otherwise the last step whose name
+    contains a JOB_TYPE_STEP_MARKERS key is used, falling back to "other".
+    """
+    inner = os.path.join(artifacts_dir, "artifacts")
+    if not os.path.isdir(inner):
+        return "other", None, None
+
+    test_dirs = sorted(
+        d for d in os.listdir(inner)
+        if d not in ("build-resources", "release") and os.path.isdir(os.path.join(inner, d))
+    )
+    if not test_dirs:
+        return "other", None, None
+
+    best_type, best_test, best_step = "other", test_dirs[0], None
+    for test_name in test_dirs:
+        test_dir = os.path.join(inner, test_name)
+        for step_dir_name in sorted(os.listdir(test_dir)):
+            step_path = os.path.join(test_dir, step_dir_name)
+            if not os.path.isdir(step_path):
+                continue
+
+            step_artifacts = os.path.join(step_path, "artifacts")
+            if os.path.isdir(os.path.join(step_artifacts, "scenario-info")):
+                return "scenario-e2e", test_name, step_dir_name
+            if _find_glob(step_artifacts, "junit/junit_e2e__*.xml"):
+                return "conformance", test_name, step_dir_name
+
+            for marker_step, jtype in JOB_TYPE_STEP_MARKERS.items():
+                if marker_step in step_dir_name:
+                    best_type, best_test, best_step = jtype, test_name, step_dir_name
+
+    return best_type, best_test, best_step
+
+
+# ---------------------------------------------------------------------------
+# Phase C: Failed step identification
+# ---------------------------------------------------------------------------
+
+def _find_failed_step_from_finished_json(artifacts_dir):
+    """Find the step that failed by checking each step's finished.json.
+
+    Each step directory under artifacts/<test-name>/<step>/ has a
+    finished.json with {"passed": true/false}.  Failed steps whose name holds
+    a teardown keyword are only reported when no other step failed.  Entries
+    are visited in sorted order so the pick does not depend on listdir order.
+    """
+    inner = os.path.join(artifacts_dir, "artifacts")
+    if not os.path.isdir(inner):
+        return None
+
+    # Teardown/infra steps to deprioritize — they often show passed=false
+    # because the test step failed first and the teardown is just cleanup.
+    teardown_keywords = ("deprovision", "pmlogs", "sos-aws", "includes")
+
+    for test_dir_name in sorted(os.listdir(inner)):
+        test_dir = os.path.join(inner, test_dir_name)
+        if not os.path.isdir(test_dir) or test_dir_name in ("build-resources", "release"):
+            continue
+        candidates = []
+        for step_name in sorted(os.listdir(test_dir)):
+            step_path = os.path.join(test_dir, step_name)
+            if not os.path.isdir(step_path):
+                continue
+            fj = _read_json(os.path.join(step_path, "finished.json"))
+            if isinstance(fj, dict) and fj.get("passed") is False:
+                is_teardown = any(kw in step_name for kw in teardown_keywords)
+                candidates.append((step_name, is_teardown))
+
+        if candidates:
+            non_teardown = [c for c in candidates if not c[1]]
+            if non_teardown:
+                return non_teardown[0][0]
+            return candidates[0][0]
+
+    return None
+
+
+def extract_failed_step(artifacts_dir):
+    """Locate the failing step of a job and the first anchor error in build-log.txt.
+
+    Returns a dict with the step name, exit code (-1 when none was found), step
+    diagram URL, ci-operator reason, anchor error and its surrounding lines.
+    """
+    lines = _read_lines(os.path.join(artifacts_dir, "build-log.txt"))
+    anchor_patterns = (
+        r"Container.*exited with code [^0]",
+        r"Process did not finish before.*timeout",
+        r"pod pending for more than",
+    )
+
+    step_diagram_url = ""
+    anchor_error = None
+    anchor_line_idx = -1
+    exit_code = -1
+
+    for i, line in enumerate(lines):
+        stripped = line.strip()
+
+        url_m = re.search(r"(https://steps\.ci\.openshift\.org\S+)", stripped)
+        if url_m:
+            step_diagram_url = url_m.group(1)
+
+        if anchor_error is None and any(re.search(p, stripped) for p in anchor_patterns):
+            anchor_error = {"text": stripped[:300], "file": "build-log.txt", "line": i + 1}
+            anchor_line_idx = i
+            # Parse the exit code from the whole line: the stored text is cut at 300 characters.
+            code_m = re.search(r"exited with code (\d+)", stripped)
+            if code_m:
+                exit_code = int(code_m.group(1))
+
+    context_lines = []
+    if anchor_line_idx >= 0:
+        start = max(0, anchor_line_idx - 5)
+        end = min(len(lines), anchor_line_idx + 6)
+        context_lines = [l.rstrip() for l in lines[start:end]]
+
+    # Primary: find the failed step from per-step finished.json (most reliable)
+    failed_step_name = _find_failed_step_from_finished_json(artifacts_dir) or ""
+
+    ci_operator_reason = ""
+    ci_op_log = os.path.join(artifacts_dir, "artifacts", "ci-operator.log")
+    for line in _read_lines(ci_op_log):
+        m = re.search(r"Reporting job state '(\w+)' with reason '([^']+)'", line)
+        if m:
+            ci_operator_reason = m.group(2)
+            break
+
+    # Fallback: extract step name from ci-operator reason when no step
+    # directory had finished.json (e.g. infra failure before any step ran)
+    if not failed_step_name and ci_operator_reason:
+        m = re.search(r"step_failed:(\w[\w-]*)", ci_operator_reason)
+        if m:
+            failed_step_name = m.group(1)
+        elif "importing_release" in ci_operator_reason:
+            failed_step_name = "[release:latest]"
+
+    return {
+        "name": failed_step_name,
+        "exit_code": exit_code,
+        "step_diagram_url": step_diagram_url,
+        "ci_operator_reason": ci_operator_reason,
+        "anchor_error": anchor_error,
+        "context_lines": context_lines,
+    }
+
+
+# ---------------------------------------------------------------------------
+# Phase D: Infrastructure indicator scan
+# ---------------------------------------------------------------------------
+
+def scan_infra_indicators(artifacts_dir):
+    """Grep build-log.txt and ci-operator.log for every INFRA_INDICATORS pattern.
+
+    Returns {"is_infra_failure": bool, "matched_patterns": first hit per label and file}.
+    """
+    candidates = (
+        os.path.join(artifacts_dir, "build-log.txt"),
+        os.path.join(artifacts_dir, "artifacts", "ci-operator.log"),
+    )
+
+    found = []
+    for log_path in filter(os.path.isfile, candidates):
+        log_name = os.path.basename(log_path)
+        for label, pattern in INFRA_INDICATORS:
+            first_hit = next(iter(_grep_file(log_path, pattern, max_matches=3)), None)
+            if first_hit is not None:
+                _, text = _parse_grep_line(first_hit)
+                found.append({"label": label, "file": log_name, "text": text[:200]})
+
+    return {"is_infra_failure": bool(found), "matched_patterns": found}
+
+
+# ---------------------------------------------------------------------------
+# Phase E: Scenario extraction
+# ---------------------------------------------------------------------------
+
+def _parse_junit(path):
+    """Parse a JUnit XML file into (results, tests, failures, suite_timestamp).
+
+    *results* holds one dict per <testcase>; a <failure> or <error> child marks
+    it FAILED.  Unreadable files and non-numeric counters yield empty/zero values.
+    """
+    try:
+        root = ET.parse(path).getroot()
+    except (ET.ParseError, OSError):
+        return [], 0, 0, ""
+
+    def _count(attr):  # root counter as int; 0 when absent or not a plain number
+        value = root.get(attr, "")
+        return int(value) if value.strip().isdecimal() else 0
+
+    results = []
+    for tc in root.iter("testcase"):
+        fail = tc.find("failure")
+        if fail is None:
+            fail = tc.find("error")  # an errored test case did not pass either
+        results.append({
+            "name": tc.get("name", ""),
+            "classname": tc.get("classname", ""),
+            "time_s": tc.get("time", ""),
+            "status": "PASSED" if fail is None else "FAILED",
+            "message": "" if fail is None else (fail.get("message") or fail.text or "")[:300],
+        })
+
+    return results, _count("tests"), _count("failures") + _count("errors"), root.get("timestamp", "")
+
+
+def _extract_rf_failures(path):
+    """Collect the '| FAIL |' lines grepped from *path*.
+
+    Returns at most 50 {"line", "text"} records, text capped at 300 characters.
+    """
+    parsed_hits = map(_parse_grep_line, _grep_file(path, r"\| FAIL \|", max_matches=50))
+    return [{"line": number, "text": content[:300]} for number, content in parsed_hits]
+
+
+def _extract_boot_and_run_alerts(path):
+    """Grep *path* for each BOOT_AND_RUN_PATTERNS entry (20 hits max per pattern), skipping comment lines."""
+    alerts = []
+    for label, pattern in BOOT_AND_RUN_PATTERNS:
+        for hit in _grep_file(path, pattern, max_matches=20):
+            line_num, text = _parse_grep_line(hit)
+            # _parse_grep_line already removed the "N:" prefix, so test the text itself for '#'.
+            if text and not text.startswith("#"):
+                alerts.append({"pattern": label, "line": line_num, "text": text[:300]})
+    return alerts
+
+
+def _detect_timeout_cascade(path):
+    """True when *path* contains both the 'terminated by signal' and 'execution stopped' markers."""
+    if not _grep_file(path, r"Execution terminated by signal", max_matches=1):
+        return False
+    # The second marker is only searched once the first one is present.
+    return bool(_grep_file(path, r"Test execution stopped", max_matches=1))
+
+
+def _extract_journal_alerts(journal_path):
+    """Map every JOURNAL_PATTERNS label to its first 50 grep hits in *journal_path*.
+
+    Each hit becomes a {"line", "text", "timestamp"} dict with text capped at 300
+    characters; labels without hits map to an empty list.
+    """
+    def _as_entry(raw_hit):
+        number, content = _parse_grep_line(raw_hit)
+        return {"line": number, "text": content[:300], "timestamp": _parse_journal_timestamp(content)}
+
+    return {
+        label: [_as_entry(hit) for hit in _grep_file(journal_path, pattern, max_matches=50)]
+        for label, pattern in JOURNAL_PATTERNS
+    }
+
+
+_CREATED_CONTAINER_RE = re.compile(r"Created container [0-9a-f]+:\s*(\S+/\S+)/\S+")
+
+
+def _group_container_restarts(entries):
+    """Return [{"pod", "count"}] for pods where a single container was created more than once, busiest first."""
+    per_container = {}
+    for entry in entries:
+        m = _CREATED_CONTAINER_RE.search(entry.get("text", ""))
+        if m:  # key on (pod, container name): starting a multi-container pod is not a restart
+            key = (m.group(1), m.group(0).rsplit("/", 1)[-1])
+            per_container[key] = per_container.get(key, 0) + 1
+    pod_counts = {}
+    for (pod, _), count in per_container.items():
+        pod_counts[pod] = max(count, pod_counts.get(pod, 0))
+    return sorted(({"pod": p, "count": c} for p, c in pod_counts.items() if c > 1), key=lambda x: -x["count"])
+
+
+def _find_sosreports(scenario_dir):
+    """List sosreport tarballs under vms/*/sos with the YYYY-MM-DD date found in each file name."""
+    date_re = re.compile(r"(\d{4}-\d{2}-\d{2})")
+    reports = []
+    for tarball in _find_glob(os.path.join(scenario_dir, "vms", "*", "sos"), "sosreport-*.tar.xz"):
+        dated = date_re.search(os.path.basename(tarball))
+        reports.append({"path": tarball, "capture_date": dated.group(1) if dated else ""})
+    return reports
+
+
+def extract_scenarios(artifacts_dir, test_name, step_name, workdir):
+    """Build one evidence record per scenario directory under the step's scenario-info.
+
+    *workdir* is part of the signature but is currently unused by this function.
+    """
+    if not test_name or not step_name:
+        return []
+
+    scenario_info_dir = os.path.join(
+        artifacts_dir, "artifacts", test_name, step_name, "artifacts", "scenario-info"
+    )
+    if not os.path.isdir(scenario_info_dir):
+        return []
+
+    scenarios = []
+    for scenario_name in sorted(os.listdir(scenario_info_dir)):
+        sdir = os.path.join(scenario_info_dir, scenario_name)
+        if not os.path.isdir(sdir):
+            continue
+
+        junit_path = os.path.join(sdir, "junit.xml")
+        phase_junit_path = os.path.join(sdir, "phase_create-and-run", "junit.xml")
+        rf_debug_path = os.path.join(sdir, "rf-debug.log")
+        boot_run_path = os.path.join(sdir, "boot_and_run.log")
+        journal_paths = _find_glob(os.path.join(sdir, "vms", "*", "sos"), "journal_*.log")
+
+        has_junit = os.path.isfile(junit_path)
+        has_rf_debug = os.path.isfile(rf_debug_path)
+        has_journal = len(journal_paths) > 0
+        has_boot_run = os.path.isfile(boot_run_path)
+        sosreports = _find_sosreports(sdir)
+
+        test_failures, test_count, failure_count, suite_timestamp = _parse_junit(junit_path) if has_junit else ([], 0, 0, "")
+        test_failures = [t for t in test_failures if t["status"] == "FAILED"]
+
+        infra_results, _, _, _ = _parse_junit(phase_junit_path) if os.path.isfile(phase_junit_path) else ([], 0, 0, "")
+        infra_failures = [t["name"] for t in infra_results if t["status"] == "FAILED"]
+
+        rf_failures = _extract_rf_failures(rf_debug_path) if has_rf_debug else []
+        boot_alerts = _extract_boot_and_run_alerts(boot_run_path) if has_boot_run else []
+        timeout_cascade = _detect_timeout_cascade(boot_run_path) if has_boot_run else False
+
+        journal_alerts = {}
+        for jp in journal_paths:
+            alerts = _extract_journal_alerts(jp)
+            for label, entries in alerts.items():
+                journal_alerts.setdefault(label, []).extend(entries)
+
+        container_restarts = _group_container_restarts(
+            journal_alerts.get("container_restarts", [])
+        )
+
+        greenboot_status = None
+        for entry in journal_alerts.get("greenboot_verdicts", []):
+            if "SUCCESS" in entry["text"]:
+                greenboot_status = "SUCCESS"
+            elif "FAILURE" in entry["text"]:
+                greenboot_status = "FAILURE"
+
+        # Pre-extract sosreports on grouped restarts, a panic or an OOM kill (raw "Created container" lines are routine).
+        should_extract = bool(container_restarts or journal_alerts.get("panics") or journal_alerts.get("oom_kills"))
+        extracted_dirs = []
+        if should_extract and sosreports:
+            extract_script = os.path.join(SCRIPT_DIR, "extract-sosreport.sh")
+            if os.path.isfile(extract_script):
+                for sos in sosreports:
+                    try:
+                        result = subprocess.run(
+                            ["bash", extract_script, sos["path"]],
+                            capture_output=True, text=True, errors="replace", timeout=120,
+                        )
+                        extracted_path = result.stdout.strip().split("\n")[-1]
+                        if extracted_path and os.path.isdir(extracted_path):
+                            extracted_dirs.append(extracted_path)
+                    except (subprocess.TimeoutExpired, OSError):
+                        pass
+
+        gaps = []
+        if not has_journal:
+            gaps.append("no journal")
+        if not sosreports:
+            gaps.append("no sosreport")
+        if not has_rf_debug and has_junit and failure_count > 0:
+            gaps.append("no rf-debug.log")
+
+        scenarios.append({
+            "name": scenario_name,
+            "evidence_available": {
+                "junit": has_junit,
+                "rf_debug": has_rf_debug,
+                "journal": has_journal,
+                "boot_and_run": has_boot_run,
+                "sosreport": len(sosreports) > 0,
+            },
+            "infra_phase": {
+                "result": "FAILED" if infra_failures else "PASSED",
+                "failures": infra_failures,
+            },
+            "junit_suite_timestamp": suite_timestamp,
+            "test_count": test_count,
+            "failure_count": failure_count,
+            "test_failures": test_failures,
+            "rf_failures": rf_failures,
+            "boot_and_run_alerts": boot_alerts,
+            "journal_alerts": journal_alerts,
+            "container_restarts": container_restarts,
+            "timeout_cascade": timeout_cascade,
+            "greenboot_status": greenboot_status,
+            "sosreport_paths": sosreports,
+            "extracted_sosreport_dirs": extracted_dirs,
+            "analysis_gaps": gaps,
+        })
+
+    return scenarios
+
+
+# ---------------------------------------------------------------------------
+# Phase F: Conformance extraction
+# ---------------------------------------------------------------------------
+
+def extract_conformance_failures(artifacts_dir, test_name, step_name):
+    """Gather failed conformance test cases plus monitor errors for one step.
+
+    Failed cases come from artifacts/junit/junit_e2e__*.xml; up to 10 step
+    build-log lines matching MonitorTest / "Suite run returned error" follow.
+    """
+    if not (test_name and step_name):
+        return []
+
+    step_dir = os.path.join(artifacts_dir, "artifacts", test_name, step_name)
+    collected = []
+
+    for junit_file in _find_glob(os.path.join(step_dir, "artifacts"), "junit/junit_e2e__*.xml"):
+        rel_junit = os.path.relpath(junit_file, artifacts_dir)
+        cases = _parse_junit(junit_file)[0]
+        collected.extend(
+            {"test_name": c["name"], "classname": c["classname"], "message": c["message"], "file": rel_junit}
+            for c in cases if c["status"] == "FAILED"
+        )
+
+    step_log = os.path.join(step_dir, "build-log.txt")
+    if not os.path.isfile(step_log):
+        return collected
+
+    rel_log = os.path.relpath(step_log, artifacts_dir)
+    for hit in _grep_file(step_log, r"MonitorTest|Suite run returned error", max_matches=10):
+        line_num, text = _parse_grep_line(hit)
+        collected.append({
+            "test_name": "", "classname": "build-log", "message": text[:300],
+            "file": rel_log, "line": line_num,
+        })
+
+    return collected
+
+
+# ---------------------------------------------------------------------------
+# Phase G: Build/config/rebase extraction
+# ---------------------------------------------------------------------------
+
+def extract_build_errors(artifacts_dir, test_name, step_name):
+    """Scan the step's build-log.txt for BUILD_LOG_ERROR_PATTERNS hits (first 30).
+
+    Each hit carries three lines of context on either side.  Falls back to the
+    job-level build-log.txt when the step or its log is unknown.
+    """
+    step_log = None
+    if test_name and step_name:
+        step_log = os.path.join(artifacts_dir, "artifacts", test_name, step_name, "build-log.txt")
+    if step_log is None or not os.path.isfile(step_log):
+        return _extract_errors_from_main_log(artifacts_dir)
+
+    rel_path = os.path.relpath(step_log, artifacts_dir)
+    lines = _read_lines(step_log)
+    found = []
+    for idx, raw in enumerate(lines):
+        stripped = raw.strip()
+        label = next((lbl for lbl, pat in BUILD_LOG_ERROR_PATTERNS if re.search(pat, stripped)), None)
+        if label is None:
+            continue
+        window = lines[max(0, idx - 3):min(len(lines), idx + 4)]
+        found.append({
+            "pattern": label, "line": idx + 1, "text": stripped[:300],
+            "file": rel_path, "context": [l.rstrip() for l in window],
+        })
+    return found[:30]
+
+
+def _extract_errors_from_main_log(artifacts_dir):
+    """Grep the job-level build-log.txt for BUILD_LOG_ERROR_PATTERNS.
+
+    Up to 10 hits per pattern are gathered in pattern order; only the first 30
+    records overall are returned.
+    """
+    main_log = os.path.join(artifacts_dir, "build-log.txt")
+    if not os.path.isfile(main_log):
+        return []
+
+    def _records():
+        for label, pattern in BUILD_LOG_ERROR_PATTERNS:
+            for hit in _grep_file(main_log, pattern, max_matches=10):
+                line_num, text = _parse_grep_line(hit)
+                yield {"pattern": label, "line": line_num, "text": text[:300], "file": "build-log.txt"}
+
+    return list(_records())[:30]
+
+
+# ---------------------------------------------------------------------------
+# Phase H: Source context
+# ---------------------------------------------------------------------------
+
+def extract_source_context(workdir, release, finished_epoch):
+    """Return {available, path, recent_commits[, gap]} for the *release* checkout; commits span the 30 days before the job finished."""
+    if not workdir or not finished_epoch:
+        return {"available": False, "path": "", "recent_commits": [], "gap": "no workdir"}
+
+    # Release branches have their own checkout; fall back to the main one when it is absent.
+    src_dir = os.path.join(workdir, "src", "microshift")
+    if release and release != "main":
+        release_dir = os.path.join(workdir, "src", f"microshift-release-{release}")
+        if os.path.isdir(release_dir):
+            src_dir = release_dir
+    if not os.path.isdir(src_dir):
+        return {"available": False, "path": "", "recent_commits": [], "gap": "source checkout not available"}
+
+    finished_dt = datetime.fromtimestamp(int(finished_epoch), tz=timezone.utc)
+    since_str = (finished_dt - timedelta(days=30)).strftime("%Y-%m-%d")
+    until_str = finished_dt.strftime("%Y-%m-%d")
+
+    repo_log_script = os.path.join(SCRIPT_DIR, "repo-log.sh")
+    # Shared result for every way repo-log.sh can fail; the checkout itself is still usable.
+    failed = {"available": True, "path": src_dir, "recent_commits": [], "gap": "repo-log.sh failed"}
+    if not os.path.isfile(repo_log_script):
+        return failed
+    try:
+        result = subprocess.run(
+            ["bash", repo_log_script, src_dir,
+             "--since", since_str, "--until", until_str, "--limit", "30"],
+            capture_output=True, text=True, timeout=30,
+        )
+    except (subprocess.TimeoutExpired, OSError):
+        return failed
+    if result.returncode != 0:
+        # A non-zero exit used to be reported as "no recent commits" with no gap recorded.
+        return failed
+
+    commits = []
+    for line in result.stdout.split("\n"):
+        parts = line.split(None, 2)
+        if len(parts) >= 2:
+            commits.append({"sha": parts[0], "date": parts[1], "subject": parts[2] if len(parts) > 2 else ""})
+    return {"available": True, "path": src_dir, "recent_commits": commits}
+
+
+# ---------------------------------------------------------------------------
+# Phase I: PCP graphs
+# ---------------------------------------------------------------------------
+
+def find_pcp_graphs(workdir, build_id):
+    """Return the sorted PNG file names under <workdir>/graphs/<build_id>, or [] when there are none."""
+    # An empty build_id would make join() yield the shared graphs/ root instead of one job's directory.
+    graphs_dir = os.path.join(workdir, "graphs", str(build_id)) if build_id else ""
+    return sorted(os.path.basename(f) for f in _find_glob(graphs_dir, "*.png")) if os.path.isdir(graphs_dir) else []
+
+
+# ---------------------------------------------------------------------------
+# Phase J: Failure timeline
+# ---------------------------------------------------------------------------
+
+_FAILURE_JOURNAL_CATEGORIES = {  # journal_alerts keys that mark a scenario as failing in the timeline
+    "oom_kills", "panics", "segfaults", "service_failures", "fatal_errors",
+    "etcd_pressure", "ovn_binding", "x509_errors", "disk_pressure",
+    "apiserver_issues",
+}
+
+
+def _resolve_journal_ts(ts_str, year):
+    """Return journal stamp 'Jun 30 03:58:39' in *year* as '2026-06-30T03:58:39Z' ('' if unparseable)."""
+    stamp = f"{year} {ts_str}"
+    try:
+        return datetime.strptime(stamp, "%Y %b %d %H:%M:%S").strftime("%Y-%m-%dT%H:%M:%SZ")
+    except (ValueError, TypeError):
+        return ""
+
+
+def _build_failure_timeline(scenarios, meta):
+    """Return one entry per failing scenario with its earliest failure timestamp, sorted by time.
+
+    A scenario is failing when it has test failures, a FAILED infra phase, or an alert in one of
+    _FAILURE_JOURNAL_CATEGORIES. Its earliest timestamp is searched among those journal alerts,
+    the junit suite timestamp and the boot_and_run alerts; with none resolvable it is left out.
+
+    Journal stamps carry no year, so it is read from the job's "started" (else "finished") metadata;
+    without a usable year the timeline is empty.
+    """
+    year = ""
+    for key in ("started", "finished"):
+        stamp = meta.get(key, "")
+        # Accept only strings that begin with four digits, so a bad "started" falls through to "finished".
+        if isinstance(stamp, str) and len(stamp) >= 4 and stamp[:4].isdigit():
+            year = stamp[:4]
+            break
+    if not year:
+        return []
+
+    # Sets of strings iterate in a per-process hash order; a fixed order keeps the winner of a
+    # timestamp tie between categories (and so the emitted "detail") reproducible across runs.
+    categories = sorted(_FAILURE_JOURNAL_CATEGORIES)
+
+    timeline = []
+    for scenario in scenarios:
+        journal_alerts = scenario.get("journal_alerts") or {}
+        failing = (
+            scenario.get("failure_count", 0) > 0
+            or (scenario.get("infra_phase") or {}).get("result") == "FAILED"
+            or any(journal_alerts.get(cat) for cat in categories)
+        )
+        if not failing:
+            continue
+
+        earliest_ts = earliest_source = earliest_detail = ""
+
+        for cat in categories:
+            for entry in journal_alerts.get(cat) or []:
+                raw_ts = entry.get("timestamp", "")
+                resolved = _resolve_journal_ts(raw_ts, year) if raw_ts else ""
+                if resolved and (not earliest_ts or resolved < earliest_ts):
+                    earliest_ts = resolved
+                    earliest_source = "journal"
+                    earliest_detail = f"{cat}: {entry.get('text', '')[:80]}"
+
+        suite_ts = scenario.get("junit_suite_timestamp", "")
+        if suite_ts:
+            # Bring "YYYY-MM-DD hh:mm:ss" and zone-less ISO stamps to the "...Z" form used above.
+            normalized = suite_ts
+            if len(normalized) >= 19 and "T" not in normalized:
+                normalized = normalized[:10] + "T" + normalized[11:19] + "Z"
+            elif not normalized.endswith("Z") and "+" not in normalized:
+                normalized = normalized.rstrip() + "Z"
+            if not earliest_ts or normalized < earliest_ts:
+                earliest_ts = normalized
+                earliest_source = "junit"
+                detail_parts = [t.get("name", "") for t in scenario.get("test_failures", [])[:2]]
+                earliest_detail = "; ".join(detail_parts)[:80] if detail_parts else "test suite"
+
+        for alert in scenario.get("boot_and_run_alerts", []):
+            raw_ts = _parse_journal_timestamp(alert.get("text", ""))
+            resolved = _resolve_journal_ts(raw_ts, year) if raw_ts else ""
+            if resolved and (not earliest_ts or resolved < earliest_ts):
+                earliest_ts = resolved
+                earliest_source = "boot_and_run"
+                earliest_detail = alert.get("text", "")[:80]
+
+        if earliest_ts:
+            timeline.append({
+                "scenario": scenario.get("name", ""),
+                "earliest_failure": earliest_ts,
+                "source": earliest_source,
+                "detail": earliest_detail,
+            })
+
+    timeline.sort(key=lambda e: e["earliest_failure"])
+    return timeline
+
+
+# ---------------------------------------------------------------------------
+# Main extraction
+# ---------------------------------------------------------------------------
+
+def extract_evidence(artifacts_dir, workdir):
+    """Assemble the evidence pack dict for one job from its downloaded artifacts.
+
+    Runs the extraction phases against *artifacts_dir* (the job-type specific ones only where
+    they apply) and returns all results in a single mapping.
+    """
+    artifacts_dir = os.path.abspath(artifacts_dir)
+    workdir = os.path.abspath(workdir)
+
+    meta = extract_metadata(artifacts_dir)
+    job_type, test_name, step_name = classify_job_type(artifacts_dir)
+    failed_step = extract_failed_step(artifacts_dir)
+    infra = scan_infra_indicators(artifacts_dir)
+
+    # At most one job-type specific extractor runs; the other result lists stay empty.
+    scenarios, conformance_failures, build_errors = [], [], []
+    if job_type == "scenario-e2e":
+        scenarios = extract_scenarios(artifacts_dir, test_name, step_name, workdir)
+    elif job_type == "conformance":
+        conformance_failures = extract_conformance_failures(artifacts_dir, test_name, step_name)
+    elif job_type in ("build", "config", "rebase", "other"):
+        build_errors = extract_build_errors(artifacts_dir, test_name, step_name)
+
+    # With an infra failure and no identified failed step, build_errors is taken from the
+    # job-level log, replacing any value assigned above.
+    if not failed_step["name"] and infra["is_infra_failure"]:
+        build_errors = _extract_errors_from_main_log(artifacts_dir)
+
+    failure_timeline = _build_failure_timeline(scenarios, meta)
+    source = extract_source_context(workdir, meta["release"], meta["finished_epoch"])
+    pcp_graphs = find_pcp_graphs(workdir, meta["build_id"])
+
+    top_level_gaps = [source["gap"]] if source.get("gap") else []
+    if not pcp_graphs:
+        top_level_gaps.append("no PCP graphs")
+
+    # Insertion order is the key order of the result: version, metadata fields, extracted sections.
+    evidence = {"version": EVIDENCE_VERSION}
+    evidence.update((k, v) for k, v in meta.items() if k != "finished_epoch")
+    evidence.update({
+        "artifacts_dir": artifacts_dir,
+        "job_type": job_type,
+        "failed_step": failed_step,
+        "infrastructure_indicators": infra,
+        "scenarios": scenarios,
+        "failure_timeline": failure_timeline,
+        "conformance_failures": conformance_failures,
+        "build_errors": build_errors,
+        "pcp_graphs": pcp_graphs,
+        "source_checkout": {k: v for k, v in source.items() if k != "gap"},
+        "analysis_gaps": top_level_gaps,
+    })
+    return evidence
+
+
+# ---------------------------------------------------------------------------
+# Batch mode
+# ---------------------------------------------------------------------------
+
+def _rebase_artifacts_dir(artifacts_dir, workdir):
+    """Map a recorded artifacts_dir onto the current workdir; return None when no directory exists.
+
+    Job JSON files keep the artifacts_dir of the machine that wrote them (e.g.
+    /tmp/microshift-ci-claude-workdir.260629/artifacts/BUILD_ID), which may since have been moved
+    or mounted elsewhere. <workdir>/artifacts/<last path component> is tried first and the
+    recorded path second.
+    """
+    build_id = os.path.basename(artifacts_dir.rstrip("/"))
+    # Order matters: the rebased location wins even when the recorded path also exists.
+    for candidate in (os.path.join(workdir, "artifacts", build_id), artifacts_dir):
+        if os.path.isdir(candidate):
+            return candidate
+    return None
+
+
+def run_batch(workdir):
+    """Write <workdir>/evidence/evidence-<build_id>.json for every job listed under <workdir>/jobs.
+
+    A job that raises is logged and counted, not fatal. Returns (jobs written, jobs that raised).
+    """
+    workdir = os.path.abspath(workdir)
+    jobs_dir = os.path.join(workdir, "jobs")
+    evidence_dir = os.path.join(workdir, "evidence")
+    os.makedirs(evidence_dir, exist_ok=True)
+
+    job_files = sorted(_find_glob(jobs_dir, "release-*-jobs.json") + _find_glob(jobs_dir, "prs-jobs.json"))
+
+    total = errors = skipped = 0
+    for jf in job_files:
+        jobs = _read_json(jf)
+        if not isinstance(jobs, list):
+            continue
+        for job in jobs:
+            # Entries that are not objects, or carry no artifacts_dir, have nothing to extract.
+            raw_dir = job.get("artifacts_dir", "") if isinstance(job, dict) else ""
+            if not raw_dir:
+                continue
+            artifacts_dir = _rebase_artifacts_dir(raw_dir, workdir)
+            if not artifacts_dir:
+                skipped += 1
+                continue
+            # dict.get's default only covers a missing key; "or" also covers a null or empty build_id.
+            build_id = job.get("build_id") or os.path.basename(artifacts_dir.rstrip("/"))
+            output_path = os.path.join(evidence_dir, f"evidence-{build_id}.json")
+            try:
+                evidence = extract_evidence(artifacts_dir, workdir)
+                with open(output_path, "w") as f:
+                    json.dump(evidence, f, indent=2)
+                total += 1
+                print(f"  {build_id}: {evidence['job_type']}, step={evidence['failed_step']['name']}", file=sys.stderr)
+            except Exception as e:
+                errors += 1
+                print(f"  ERROR {build_id}: {e}", file=sys.stderr)
+
+    if skipped:
+        print(f"  ({skipped} jobs skipped — artifacts_dir not found)", file=sys.stderr)
+    print(f"\nExtracted evidence for {total} jobs ({errors} errors) → {evidence_dir}", file=sys.stderr)
+    return total, errors
+
+
+# ---------------------------------------------------------------------------
+# CLI
+# ---------------------------------------------------------------------------
+
+def main():
+    """Command-line entry point: extract evidence for one job, or for every job with --batch."""
+    values = {"--artifacts-dir": None, "--workdir": None, "--output": None}
+    batch = False
+    args = sys.argv[1:]
+    i = 0
+    while i < len(args):
+        arg = args[i]
+        if arg in values:
+            # A trailing option with no value used to die with an IndexError traceback.
+            if i + 1 >= len(args):
+                print(f"Error: {arg} requires a value", file=sys.stderr)
+                sys.exit(1)
+            values[arg] = args[i + 1]
+            i += 2
+        elif arg == "--batch":
+            batch = True
+            i += 1
+        elif arg in ("-h", "--help"):
+            print(__doc__)
+            sys.exit(0)
+        else:
+            print(f"Unknown argument: {arg}", file=sys.stderr)
+            sys.exit(1)
+    artifacts_dir, workdir, output = values["--artifacts-dir"], values["--workdir"], values["--output"]
+
+    if not workdir:
+        print("Error: --workdir is required", file=sys.stderr)
+        sys.exit(1)
+
+    if batch:
+        print("=== Extracting evidence (batch) ===", file=sys.stderr)
+        total, errors = run_batch(workdir)
+        # Partial success still exits 0; only a batch that wrote nothing and hit errors fails.
+        sys.exit(1 if errors > 0 and total == 0 else 0)
+
+    if not artifacts_dir:
+        print("Error: --artifacts-dir required (or use --batch)", file=sys.stderr)
+        sys.exit(1)
+    if not os.path.isdir(artifacts_dir):
+        print(f"Error: artifacts directory not found: {artifacts_dir}", file=sys.stderr)
+        sys.exit(1)
+
+    evidence = extract_evidence(artifacts_dir, workdir)
+    if not output:
+        evidence_dir = os.path.join(workdir, "evidence")
+        os.makedirs(evidence_dir, exist_ok=True)
+        output = os.path.join(evidence_dir, f"evidence-{evidence['build_id']}.json")
+    with open(output, "w") as f:
+        json.dump(evidence, f, indent=2)
+    print(f"Written: {output}", file=sys.stderr)
+    print(json.dumps(evidence, indent=2))
+
+
+if __name__ == "__main__":  # run the CLI only when executed as a script, not when imported
+    main()

From 3c3c92f2253b117fe165fb8f92ef0b87ebdc04ba Mon Sep 17 00:00:00 2001
From: Patryk Matuszak <pmatusza@redhat.com>
Date: Fri, 3 Jul 2026 11:58:37 +0200
Subject: [PATCH 2/7] microshift-ci: rework prow-job analysis around an
 analyze-evidence agent

Replace the prow-job skill's inline RCA instructions with a dedicated
analyze-evidence agent that starts from the evidence pack and consults
the MicroShift CI artifact primer (moved under agents/references/) and
a structured-summary contract with tightened causal-chain rules. The
doctor skill launches the same agent for its per-job analyses;
prow-job becomes a thin wrapper that downloads artifacts, extracts
evidence, and spawns the agent.

validate-reports.py checks every agent report against the structured
summary contract, and the doctor skill re-launches fix agents for
reports that fail; parse.py sanitizes structured summaries before
parsing.
---
 .../microshift-ci/agents/analyze-evidence.md  |  63 +++
 .../agents/references/microshift-ci-primer.md |  92 +++++
 .../agents/references/structured-summary.md   |  98 +++++
 .../microshift-ci/scripts/validate-reports.py |   1 +
 plugins/microshift-ci/skills/doctor/SKILL.md  | 111 +++--
 .../microshift-ci/skills/prow-job/SKILL.md    | 383 ++----------------
 .../references/microshift-ci-primer.md        | 253 ------------
 plugins/shared/scripts/parse.py               |   8 +-
 plugins/shared/scripts/validate-reports.py    | 145 +++++++
 9 files changed, 505 insertions(+), 649 deletions(-)
 create mode 100644 plugins/microshift-ci/agents/analyze-evidence.md
 create mode 100644 plugins/microshift-ci/agents/references/microshift-ci-primer.md
 create mode 100644 plugins/microshift-ci/agents/references/structured-summary.md
 create mode 120000 plugins/microshift-ci/scripts/validate-reports.py
 delete mode 100644 plugins/microshift-ci/skills/prow-job/references/microshift-ci-primer.md
 create mode 100644 plugins/shared/scripts/validate-reports.py

diff --git a/plugins/microshift-ci/agents/analyze-evidence.md b/plugins/microshift-ci/agents/analyze-evidence.md
new file mode 100644
index 00000000..8344a630
--- /dev/null
+++ b/plugins/microshift-ci/agents/analyze-evidence.md
@@ -0,0 +1,63 @@
+# Analyze Evidence Agent
+
+Analyze a MicroShift Prow CI job from a pre-extracted evidence pack. Your goal is the UNDERLYING root cause, not the first error in the log. Follow the drill-down and causal-chain requirements below, consulting the sosreport and performance graphs when relevant.
+
+## Inputs
+
+- `{EVIDENCE_PACK}` — path to evidence pack JSON
+- `{JOB_NAME}` — full Prow job name
+- `{JOB_URL}` — full Prow job URL
+- `{OUTPUT_FILE}` — path to save the analysis report
+
+## Instructions
+
+### 1. Read the evidence pack and references
+
+Read `{EVIDENCE_PACK}` and `plugins/microshift-ci/agents/references/microshift-ci-primer.md`.
+
+### 2. Assess the failure
+
+- `infrastructure_indicators.is_infra_failure` true → confirm from matched patterns and anchor error, produce report.
+- `scenario-e2e` → examine each scenario's alerts, failures, and journal. Use `failure_timeline` to distinguish cascade from independent failures.
+- `conformance` → examine `conformance_failures`.
+- `build`/`config`/`rebase`/`other` → examine `build_errors`.
+- No `failed_step` and no error indicators → job passed. Severity 1, `infrastructure_failure: false`. Do NOT drill down.
+
+### 3. Drill down
+
+Iterate hypothesis → evidence until the cause is actionable.
+
+**Mandatory raw-log verification** — BEFORE concluding, even when the evidence pack looks sufficient:
+- Read ~200 lines of raw journal around the failure timestamp — look for patterns NOT in the evidence pack (authorization denials, scheduler errors, admission failures, kubelet sandbox errors).
+- When a sosreport exists, check **kube-apiserver** pod logs for authorization/admission/scheduling decisions.
+- "Timed out waiting for X" is a symptom — read raw logs to find WHY X was slow or absent.
+
+**Deeper investigation** via raw artifacts:
+- **Sosreport pod logs**: read from `extracted_sosreport_dirs` when available, or run `bash plugins/microshift-ci/scripts/extract-sosreport.sh <tarball>` on paths in `sosreport_paths`.
+- **PCP graphs**: read PNGs listed in `pcp_graphs` when the failure involves timeouts, slowness, or resource exhaustion.
+- **Source code**: use `source_checkout.path` to read `test/suites/` or product code. Check `recent_commits` for related changes.
+
+**Critical rules**:
+- A test-layer fix is never the bottom when a product component misbehaved — reconstruct the component's story from journal and pod logs before concluding.
+- Two `Created container` events for the same pod = the first instance died. Read `previous.log` for the exit reason.
+- Multiple scenario failures: decide cascade vs independent using the **timeline**, not error-text similarity.
+- **Every causal-chain link MUST cite an artifact file path** (e.g., `artifacts/.../boot_and_run.log:4629`). Do NOT cite the evidence JSON, general knowledge, or architectural statements. The evidence pack includes `file` and `line` for each match — trace back to those. Drop unsupported links or record as analysis gaps.
+
+### 4. Validate causal chain
+
+Before producing the report, validate every causal-chain link:
+- Every link MUST have an `evidence` field containing an artifact file path with `:line` (e.g., `artifacts/.../boot_and_run.log:4629`).
+- Every link MUST have a `quote` field with verbatim text from that file.
+- If any link cites the evidence JSON, general knowledge, or architectural statements instead of an artifact file — fix it now by finding the actual artifact file, or drop the link.
+
+### 5. Produce the report
+
+Write the report per `plugins/microshift-ci/agents/references/structured-summary.md`. Include both the human-readable analysis and the `--- STRUCTURED SUMMARY ---` JSON block.
+
+When you read a raw artifact and find evidence NOT in the evidence pack, include `missing_patterns` entries: `{"file_type": "journal|boot_and_run|build_log", "grep_pattern": "<regex>", "reason": "<why>"}`.
+
+### 6. Save and reply
+
+Save the FULL report output (including the `--- STRUCTURED SUMMARY ---` block) to `{OUTPUT_FILE}` using the Write tool. The file must contain the complete analysis report.
+
+After saving, reply with EXACTLY one line: `DONE {OUTPUT_FILE}`. Do NOT include the report text in your reply.
diff --git a/plugins/microshift-ci/agents/references/microshift-ci-primer.md b/plugins/microshift-ci/agents/references/microshift-ci-primer.md
new file mode 100644
index 00000000..a36cfee5
--- /dev/null
+++ b/plugins/microshift-ci/agents/references/microshift-ci-primer.md
@@ -0,0 +1,92 @@
+# MicroShift CI Artifact Primer
+
+Reference for analyzing MicroShift Prow job artifacts — which file answers which question.
+
+## Job types
+
+- **Scenario-based e2e** (`e2e-aws-tests-*`): the `openshift-microshift-e2e-metal-tests` step boots ~20 VM-based test scenarios on a shared hypervisor. Failures are per-scenario.
+- **Direct-test** (`*-ocp-conformance-*`, `e2e-aws-ai-model-serving-*`, `e2e-aws-footprint-*`): run their test suite directly, no scenario fan-out.
+
+## Test framework
+
+Tests use [Robot Framework](https://robotframework.org). Suites: `test/suites/*.robot`. Shared keywords: `test/resources/`. Scenario definitions: `test/scenarios*/`.
+
+`TEST_EXECUTION_TIMEOUT` (default `30m`) wraps Robot Framework in `timeout`. When exceeded, the current test dies with `Execution terminated by signal` and every subsequent test reports `Test execution stopped due to a fatal error` — a cascade with ONE root cause (the time budget).
+
+## Deployment types
+
+Three deployment pipelines: **ostree** (scenarios in `test/scenarios/`), **bootc** (`test/scenarios-bootc/`), **RPM** (`test/suites/rpm/`). Job name indicates which (e.g. `e2e-aws-tests-bootc-*`). All produce the same artifact layout.
+
+## Scenario naming
+
+Scenario names encode OS, MicroShift version source, and suite. The `@` separator chains stages left-to-right; the **last segment** is always the test suite.
+
+### Version-source markers
+
+| Marker | Meaning |
+|---|---|
+| `src` | Built from source (PR or branch) |
+| `base` | Built from PR's target branch |
+| `prel` | Previous minor release (Y-1) |
+| `crel` | Current minor release (EC/RC/z-stream) |
+| `lrel` | Latest available release from staging repos |
+| `zprel` | Latest z-stream from rhocp |
+| `y1`/`y2` | One/two minor versions back, i.e. Y-1/Y-2 (also written `yminus1`/`yminus2`) |
+
+### OS tokens
+
+`el94`/`el96`/`el98`/`el102` — RHEL 9.4/9.6/9.8/10.2
+
+### Reading multi-@ names
+
+| Name | Meaning |
+|---|---|
+| `el96-lrel@standard1` | RHEL 9.6 + latest release, standard suite 1 |
+| `el94-y2@el96-lrel@standard1` | Start Y-2 on RHEL 9.4, upgrade to RHEL 9.6 + latest release, run standard1 |
+| `el96-yminus2@prel@src@delta-upgrade-ok` | Y-2 → Y-1 (prel) → source, static delta upgrade |
+
+## Artifact layout
+
+Per scenario, under `artifacts/<TEST_NAME>/openshift-microshift-e2e-metal-tests/artifacts/scenario-info/<scenario>/`:
+
+| File | Answers |
+|---|---|
+| `junit.xml` | Which tests failed; `testsuite name` = scenario name |
+| `rf-debug.log` | Robot Framework trace — failures marked `\| FAIL \|` |
+| `boot_and_run.log` | VM boot + orchestration; scenario-killing timeouts appear here |
+| `phase_create/junit.xml` | Infra junit from VM creation (greenboot check) |
+| `phase_run/junit.xml` | Infra junit from test run phase |
+| `vms/host1/sos/journal_*.log` | Plain-text journal exports — check FIRST for service failures, OOM, x509 |
+| `vms/host1/sos/sosreport-*.tar.xz` | Full sosreports (see below) |
+
+## Sosreports
+
+Two types: **on-failure** (captured at each test failure, includes test-created namespaces — **prefer this one**) and **end-of-scenario** (teardown, may lack test workloads). Match to failure by comparing capture timestamp with `rf-debug.log` failure time.
+
+**Journals**: use plain-text `journal_*.log` next to tarballs — no extraction needed.
+
+**Pod logs**: extract with `bash plugins/shared/scripts/extract-sosreport.sh <tarball>`. Output lands in `<tarball-parent>/sos-extracted/<sosreport-name>/`:
+
+- Pod logs: `sos_commands/microshift/namespaces/<ns>/pods/<pod>/<container>/<container>/logs/{current,previous}.log`
+- `previous.log` tail states why a dead container exited (fatal error, leader election lost, panic)
+- Cluster resources: `sos_commands/microshift/cluster-scoped-resources/`
+
+## Greenboot
+
+Before tests, the scenario waits for `greenboot-healthcheck.service` to exit. Failure → `pre_test_greenboot_check FAILED` in `phase_create/junit.xml`, no tests run. In the journal, `40_microshift_running_check.sh` lines show which deployments were waited on.
+
+## Journal reading
+
+Reconstruct a timestamped timeline before attributing fault:
+
+- Pod lifecycle: `Created container`/`Started container` (crio), `SyncLoop (PLEG)`, probe readiness transitions
+- Two `Created container` events for the same pod = first instance died — read `previous.log`
+- `apply request took too long` = apiserver/etcd latency (can cause leader-election loss)
+
+## Common patterns
+
+**Timeout cascade**: `TEST_EXECUTION_TIMEOUT` expires → one test gets `Execution terminated by signal`, all subsequent get `Test execution stopped due to a fatal error`. ONE root cause — find what consumed the time budget.
+
+**Greenboot masking**: greenboot failure → no tests run → only `phase_create/junit.xml` has the failure. Root cause is in the journal.
+
+**Shared-hypervisor contention**: all scenarios share one host. CPU/memory/disk contention → greenboot timeouts, etcd pressure, image pull timeouts. Attribute to infrastructure, not product/test.
diff --git a/plugins/microshift-ci/agents/references/structured-summary.md b/plugins/microshift-ci/agents/references/structured-summary.md
new file mode 100644
index 00000000..645a6c8e
--- /dev/null
+++ b/plugins/microshift-ci/agents/references/structured-summary.md
@@ -0,0 +1,98 @@
+# Structured Summary Output Format
+
+Output contract for CI job analysis skills, consumed by `aggregate.py`, `search-bugs.py`, and `create-report.py`.
+
+## Output Template
+
+```text
+Error Severity: {1-5}
+Stack Layer: {AWS Infra | External Infrastructure | build phase | deploy phase | test setup phase | Test Configuration | test | teardown}
+Step Name: {CI step where the error occurred}
+Error: {Exact error with log context}
+Causal Chain: {numbered list, each link cites file:line}
+Confidence: {high | medium | low}
+Suggested Remediation: {fix direction; do NOT propose test tolerance (waits/retries/timeouts) unless the product behaved correctly}
+```
+
+| Severity | Meaning |
+|---|---|
+| 5 | Release-blocking product regression — no workaround |
+| 4 | Persistent product or test failure — no workaround |
+| 3 | Persistent failure with workaround, or scoped to single scenario/arch |
+| 2 | Intermittent failure / likely flake |
+| 1 | Infrastructure noise or self-healing condition |
+
+## STRUCTURED SUMMARY JSON
+
+Append after all prose. **Both markers are required** — the parser skips the report if either is missing.
+
+```text
+--- STRUCTURED SUMMARY ---
+[ { ... } ]
+--- END STRUCTURED SUMMARY ---
+```
+
+### Fields
+
+| Field | Description |
+|---|---|
+| `severity` | 1-5 per rubric above |
+| `stack_layer` | One of the values from the template |
+| `step_name` | CI step where the error occurred |
+| `error_signature` | Concise one-line description for dedup and bug titles |
+| `root_cause` | WHY it failed — mechanism, not symptom (~80 chars, see rules below) |
+| `raw_error` | Verbatim log text — deterministic anchor (see rules below) |
+| `infrastructure_failure` | `true` if AWS/CI infra caused it, `false` otherwise |
+| `job_url` | Full Prow job URL |
+| `job_name` | Full job name |
+| `release` | Release branch (e.g. `4.22`, `main`) |
+| `remediation` | Fix direction (~120 chars). Infra → infra action. Product → code fix direction |
+| `finished` | Job finish date, `YYYY-MM-DD` |
+| `causal_chain` | Array of `{"cause", "evidence", "quote"}`. `evidence` = artifact file path with `:line`. `quote` = verbatim excerpt, no labels/commentary. **Re-read every cited file:line before finalizing** — wrong citations destroy trust. The `cause` text must use terms from actual log messages, not vague categories |
+| `confidence` | `high` / `medium` / `low` (see rules below) |
+| `analysis_gaps` | Array of strings naming missing evidence. Empty `[]` when nothing skipped |
+| `scenarios` | Array of scenario names where this failure occurred. Empty `[]` for non-scenario jobs |
+| `missing_patterns` | (optional) Array of `{"file_type", "grep_pattern", "reason"}` for patterns to add to `extract-evidence.py` |
+
+### CONFIDENCE rules
+
+- **high**: every causal-chain link directly evidenced by a quoted artifact line or graph
+- **medium**: mechanism is inferred but consistent with all evidence; citations still required — `medium` means the *interpretation* is inferred, not that citations can be omitted
+- **low**: symptom-level only — chain stops before actionable cause; `analysis_gaps` MUST be populated
+
+Do NOT inflate confidence — downstream automation acts on it.
+
+### RAW_ERROR rules
+
+Used for deterministic grouping. Two runs on the same job MUST produce the same value.
+
+1. **Copy-paste exact error text** — do NOT paraphrase
+2. **Pick ONE error** — the first fatal one
+3. **Only strip timestamps** — keep everything else verbatim
+4. **Never concatenate** multiple errors
+5. **Truncate to ~150 chars** if very long — keep the distinctive part
+
+### ROOT_CAUSE rules
+
+Used alongside RAW_ERROR for cross-release deduplication. Same underlying problem across releases MUST produce the same ROOT_CAUSE.
+
+| Field | Purpose |
+|---|---|
+| `error_signature` | WHAT failed (bug titles) |
+| `root_cause` | WHY it failed (dedup) |
+| `raw_error` | Verbatim log text (deterministic anchor) |
+
+1. **~80 chars max** — short enough for token matching
+2. **Focus on mechanism**, not symptom
+3. **Consistent across releases** — same problem = same text
+4. **Stable terms** — no version numbers, timestamps, or job names
+
+Describe the specific mechanism, not architectural generalizations ("framework expects annotation X which MicroShift does not set", not "MicroShift is single-node").
+
+### Multiple independent failures
+
+1. One entry per independent failure (different scenarios, different root causes)
+2. Same root cause = one entry — do NOT split
+3. At most 5 entries per job
+4. Cascading failures are NOT independent — report only the root failure
+5. Single failures are still a JSON array
diff --git a/plugins/microshift-ci/scripts/validate-reports.py b/plugins/microshift-ci/scripts/validate-reports.py
new file mode 120000
index 00000000..47acda1b
--- /dev/null
+++ b/plugins/microshift-ci/scripts/validate-reports.py
@@ -0,0 +1 @@
+../../shared/scripts/validate-reports.py
\ No newline at end of file
diff --git a/plugins/microshift-ci/skills/doctor/SKILL.md b/plugins/microshift-ci/skills/doctor/SKILL.md
index fdb85be8..c3e07e69 100644
--- a/plugins/microshift-ci/skills/doctor/SKILL.md
+++ b/plugins/microshift-ci/skills/doctor/SKILL.md
@@ -3,7 +3,7 @@ name: microshift-ci:doctor
 argument-hint: <release1,release2,...>
 description: Analyze CI for multiple MicroShift releases and produce an HTML summary
 user-invocable: true
-allowed-tools: Skill, Bash, Read, Write, Glob, Grep, Agent
+allowed-tools: Bash, Read, Write, Glob, Grep, Agent
 ---
 
 # microshift-ci:doctor
@@ -85,58 +85,83 @@ Compute once at the start by running `date +%y%m%d` and substituting into the pa
    - `4_disk_usage.png` — Disk usage by partition (% fill)
 3. If prerequisites are missing (`pcp2json`, `matplotlib`), the script errors and stops.
 
-### Step 2: Analyze Each Job Using /microshift-ci:prow-job
+### Step 1c: Extract Structured Evidence
 
-**Goal**: Get detailed root cause analysis for each failed job using pre-downloaded artifacts.
+**Goal**: Deterministically extract structured evidence from all job artifacts before LLM analysis. This gives each analysis agent a pre-extracted overview so it can skip exploratory file scanning and focus on root cause reasoning.
+
+**Actions**:
+
+1. Run the evidence extraction script:
+
+   ```text
+   bash plugins/microshift-ci/scripts/doctor.sh evidence --component microshift --workdir <WORKDIR>
+   ```
+
+2. The script processes each job's artifacts and produces `<WORKDIR>/evidence/evidence-<BUILD_ID>.json` containing:
+   - Failed step identification (from per-step `finished.json`)
+   - Infrastructure failure indicators (scheduling, AWS errors, CI cluster capacity)
+   - Per-scenario evidence: junit failures, RF failures, boot_and_run alerts, journal alerts (OOM, panics, container restarts, etcd pressure, OVN binding, probe failures), sosreport paths
+   - Conformance test failures
+   - Build/config error lines with context
+   - PCP graph availability
+   - Recent source commits (no path filter — product and test changes)
+   - Pre-extracted sosreports (when journal shows container restarts or crashes)
+
+3. If the script fails for some jobs, note the errors but continue — agents can fall back to raw artifacts.
+
+### Step 2: Analyze Each Job
+
+**Goal**: Get detailed root cause analysis for each failed job using evidence packs and pre-downloaded artifacts.
 
 **Actions**:
 
 1. Use the JSON summary output from Step 1 to build agent prompts. Do NOT read the job JSON files into the main conversation — the prepare script already printed all job details (artifacts_dir, build_id, job name) and agents receive artifacts_dir directly in their prompt.
-2. For **every** failed job across all releases and PRs, launch a separate **Agent** (using the `Agent` tool, NOT the `Skill` tool). For PR jobs, only launch agents for jobs with FAILURE status.
+2. Read `plugins/microshift-ci/agents/analyze-evidence.md` once. For **every** failed job across all releases and PRs, substitute the `{VARIABLE}` placeholders and launch a separate **Agent** (using the `Agent` tool). For PR jobs, only launch agents for jobs with FAILURE status.
+
+   Substitute these placeholders from the prepare script's JSON output (`job`, `url`, `build_id` fields; `<JOB_ID>` and `<BUILD_ID>` both mean the `build_id` value):
 
-   **For release jobs:**
+   | Placeholder | Value |
+   |---|---|
+   | `{EVIDENCE_PACK}` | `<WORKDIR>/evidence/evidence-<BUILD_ID>.json` |
+   | `{JOB_NAME}` | `job` field (for PR jobs, append ` (PR #<PR>)`) |
+   | `{JOB_URL}` | `url` field |
+   | `{OUTPUT_FILE}` | Release: `<WORKDIR>/jobs/release-<RELEASE>-job-<N>-<JOB_ID>.txt`. PR: `<WORKDIR>/jobs/prs-job-<N>-pr<PR>-<JOB_NAME_SUFFIX>.txt` |
+
+3. Launch **ALL** agents (all releases + PRs) in a **single message** as **foreground** agents (do NOT use `run_in_background`). Foreground agents in the same message run concurrently — this is just as fast as background agents but keeps your turn active until all complete.
+4. Say "Analyzing N jobs in parallel..." in your message text alongside the Agent tool calls.
+5. When all agents return, **validate all output files**:
 
    ```text
-   Agent: subagent_type=general_purpose, prompt="Analyze this Prow job and save the report:
-   Job: <JOB_NAME>
-   URL: <JOB_URL>
-   Performance graphs (if generated): <WORKDIR>/graphs/<JOB_ID>/
-   MicroShift source (if present): <WORKDIR>/src/microshift/ (for main) or <WORKDIR>/src/microshift-release-<RELEASE>/ (for release branches)
-   1. Run /microshift-ci:prow-job <ARTIFACTS_DIR>
-   2. Your goal is the UNDERLYING root cause, not the first error in the log — follow the
-      skill's drill-down and causal-chain requirements, consulting the sosreport and the
-      performance graphs when relevant.
-   3. After the analysis completes, save the FULL report output (including the --- STRUCTURED SUMMARY --- block) to:
-      <WORKDIR>/jobs/release-<RELEASE>-job-<N>-<JOB_ID>.txt
-      Use the Write tool to save the file. The file must contain the complete analysis report.
-   4. After saving, reply with EXACTLY one line: DONE <output-file-path>. Do NOT include the
-      report text in your reply."
+   python3 plugins/microshift-ci/scripts/validate-reports.py <WORKDIR>/jobs/release-*-job-*.txt <WORKDIR>/jobs/prs-job-*.txt
    ```
 
-   **For PR jobs:**
+   If the script exits 0 (all pass), proceed to Step 3.
+
+   If it exits 1, it prints a `--- VALIDATION FAILURES ---` block listing each failed file and its errors. For each failed file, launch a **fix agent**:
 
    ```text
-   Agent: subagent_type=general_purpose, prompt="Analyze this Prow job and save the report:
-   Job: <JOB_NAME> (PR #<PR>)
-   URL: <JOB_URL>
-   Performance graphs (if generated): <WORKDIR>/graphs/<BUILD_ID>/
-   MicroShift source (if present): <WORKDIR>/src/microshift/
-   1. Run /microshift-ci:prow-job <ARTIFACTS_DIR>
-   2. Your goal is the UNDERLYING root cause, not the first error in the log — follow the
-      skill's drill-down and causal-chain requirements, consulting the sosreport and the
-      performance graphs when relevant.
-   3. After the analysis completes, save the FULL report output (including the --- STRUCTURED SUMMARY --- block) to:
-      <WORKDIR>/jobs/prs-job-<N>-pr<PR>-<JOB_NAME_SUFFIX>.txt
-      Use the Write tool to save the file. The file must contain the complete analysis report.
-   4. After saving, reply with EXACTLY one line: DONE <output-file-path>. Do NOT include the
-      report text in your reply."
+   Agent: subagent_type=general_purpose, prompt="Fix citation errors in a CI analysis report.
+
+   The report at <FAILED_FILE> has causal-chain links that cite the evidence JSON or
+   general knowledge, or that lack file paths. The specific errors are:
+   <PASTE ERRORS FOR THIS FILE FROM VALIDATION OUTPUT>
+
+   Fix the report:
+   1. Read the report at <FAILED_FILE>
+   2. For each flagged causal-chain link, find the actual artifact file and line number.
+      The artifacts are under <ARTIFACTS_DIR>. Use Grep to locate the quoted text in the
+      artifact files. The evidence pack at <WORKDIR>/evidence/evidence-<BUILD_ID>.json
+      has file and line fields for each extracted alert — use those to map back to real
+      artifact paths.
+   3. If no artifact supports a causal-chain link, remove that link entirely.
+   4. Rewrite the corrected report (BOTH the human-readable Causal Chain section AND the
+      STRUCTURED SUMMARY JSON causal_chain array) back to <FAILED_FILE>.
+   5. Reply with EXACTLY: FIXED <FAILED_FILE>"
    ```
 
-   Substitute `<JOB_NAME>`, `<JOB_URL>`, and `<JOB_ID>`/`<BUILD_ID>` from the prepare script's JSON output (`job`, `url`, `build_id` fields).
+   Launch all fix agents in a single message (parallel). Then proceed to Step 3.
 
-3. Launch **ALL** agents (all releases + PRs) in a **single message** as **foreground** agents (do NOT use `run_in_background`). Foreground agents in the same message run concurrently — this is just as fast as background agents but keeps your turn active until all complete.
-4. Say "Analyzing N jobs in parallel..." in your message text alongside the Agent tool calls.
-5. When all agents return, immediately proceed to Step 3 in the same turn. Do NOT stop or end your turn between Step 2 and Step 3.
+6. Proceed to Step 3. Do NOT stop or end your turn between Step 2 and Step 3.
 
 ### Step 3: Run Bug Correlation (Dry-Run)
 
@@ -232,20 +257,22 @@ HTML report generated: <WORKDIR>/report-microshift-ci-doctor.html
 - `pcp-export-pcp2json` — for PCP graph generation
 - `matplotlib` Python package — for PCP graph plotting
 
-## Related Skills
+## Related Skills and Agents
 
-- **microshift-ci:prow-job**: Single job analysis (used by Step 2 agents)
+- **agents/analyze-evidence.md**: Evidence-aware job analysis agent (used by Step 2 — read, substitute, spawn)
+- **microshift-ci:prow-job**: Standalone job analysis from URL or artifacts directory (for manual use)
 - **microshift-ci:create-bugs**: Bug correlation and creation (used in Step 3; can also be run with `--create` after this command)
 - **microshift-ci:doctor-refresh**: Regenerate the HTML report from existing data (e.g., after `/microshift-ci:create-bugs --create`)
 
 ## Notes
 
-- **Deterministic scripts** handle: data collection, artifact download, aggregation, HTML generation
+- **Deterministic scripts** handle: data collection, artifact download, evidence extraction, aggregation, HTML generation
 - **LLM agents** handle: per-job root cause analysis (Step 2), Jira bug search and open bugs query (Step 3)
+- Step 1c evidence extraction pre-processes all artifacts so Step 2 agents (from `plugins/microshift-ci/agents/analyze-evidence.md`) receive structured evidence packs and can skip exploratory log scanning
 - `/microshift-ci:doctor-refresh` regenerates the HTML report from existing data. Use it after `/microshift-ci:create-bugs --create` to include newly created bugs
 - Step 2 agents (per-job analysis) are launched in a single parallel wave
 - Step 3 uses a single create-bugs agent with all sources (releases + rebase) comma-separated
-- The `prepare` script downloads all artifacts upfront so prow-job agents use local paths (no redundant downloads)
+- The `prepare` script downloads all artifacts upfront so analysis agents use local paths (no redundant downloads)
 - The `prepare` script also clones the MicroShift source to `<WORKDIR>/src/microshift` with per-release worktrees (`--repo openshift/microshift`); clone failure is non-fatal — agents record the absence in `analysis_gaps` and proceed
 - The `finalize` script runs aggregation and HTML generation in one call
 - All intermediate files use prescribed filenames in `<WORKDIR>` subdirectories (`jobs/`, `bugs/`) — no improvised names
diff --git a/plugins/microshift-ci/skills/prow-job/SKILL.md b/plugins/microshift-ci/skills/prow-job/SKILL.md
index 497aaa72..e733f472 100644
--- a/plugins/microshift-ci/skills/prow-job/SKILL.md
+++ b/plugins/microshift-ci/skills/prow-job/SKILL.md
@@ -1,386 +1,63 @@
 ---
 name: microshift-ci:prow-job
 argument-hint: <prow-job-url-or-artifacts-dir>
-description: Download Prow job artifacts, identify root cause of failure, and produce a structured error report
+description: Download Prow job artifacts, extract evidence, and analyze the failure
 user-invocable: true
-allowed-tools: Skill, Bash, Read, Write, Glob, Grep, Agent
+allowed-tools: Bash, Read, Write, Glob, Grep, Agent
 ---
 
 # microshift-ci:prow-job
 
-## Synopsis
-
-```bash
-/microshift-ci:prow-job <prow-job-url>
-/microshift-ci:prow-job <artifacts-dir>
-```
-
-## Description
-
-Analyzes a single Prow CI test job by scanning artifacts for errors and producing a structured failure report. Accepts either a Prow job URL (downloads artifacts) or a local directory path (uses pre-downloaded artifacts).
+Analyzes a single Prow CI job. Accepts a Prow URL or local artifacts directory.
+Downloads artifacts if needed, extracts structured evidence, then spawns an
+analyze-evidence agent for root cause analysis.
 
 ## Arguments
 
-- `<ARGUMENTS>` (required): Either a job URL or a local artifacts directory path:
-  - **Prow URL**: `https://prow.ci.openshift.org/view/gs/test-platform-results/logs/periodic-ci-openshift-microshift-release-4.21-periodics-e2e-aws-ovn-ocp-conformance-serial/1984108354347208704`
-  - **GCS web URL**: `https://gcsweb-ci.apps.ci.l2s4.p1.openshiftapps.com/gcs/test-platform-results/logs/periodic-ci-openshift-microshift-release-4.21-periodics-e2e-aws-ovn-ocp-conformance-serial/1984108354347208704`
-  - **Local artifacts directory**: `/tmp/microshift-ci-claude-workdir.260404/artifacts/1984108354347208704` (must contain `build-log.txt` and `finished.json`)
-
-## Goal
-
-Reduce noise for developers by processing large logs from a CI test pipeline and producing a verified root cause analysis, not just the first error found. A report is acceptable when:
-
-- The failing step and (for test failures) the failing test/scenario are named
-- The causal chain bottoms out in an actionable cause (a specific code, configuration, test, or infrastructure problem someone can act on) — or in an explicitly recorded evidence gap
-- Every causal-chain link cites evidence from the artifacts (file path and line where applicable)
-- The analysis determines whether the **product** or the **test** is at fault. The purpose of this analysis is to surface product defects — NOT to make tests green. "Make the test wait/retry/tolerate" is not a root cause unless the product behavior has been shown to be correct.
-
-## Audience
-
-Software Engineer
-
-## Glossary
-
-- **ci-config**: Top level configuration file specifying build inputs, versions, and test workflows to execute. Periodic tests are suffixed with `__periodic.yaml`.
-- **test**: The set of configurations and commands that specify how to execute the test. Can be defined in-line in ci-config, or as individual "steps" (see below).
-- **step-registry**: Root directory where all openshift-ci test step configs and commands are stored.
-- **step**: Smallest component of the test infrastructure. A step yaml specifies the command or script to execute, environmental variables and default values, and step metadata. Also called "ref" or "step ref".
-- **chain**: A yaml configuration specifying 1 or more steps or chains in an array. Steps and chains are exploded and executed serially by index. May override step environment variable values.
-- **workflow**: A yaml configuration specifying 1 or more steps, chains, or workflows in an array. Steps, chains, and workflows are exploded and executed serially. May override chain or step environmental variable values. Typically referenced by a test in a ci-config.
-- **scenario**: MicroShift integration tests are built on the robotframework test framework. A "scenario" represents the RF suite, the test's environment, the microshift deployment, and the virtual machine on which the entire testing process takes place. Scenarios also include the manner of deployment: rpm-ostree, rpm installation, or bootc container.
-
-## Job Name and Job ID
-
-The Job Name and Job ID are encoded in the URL. There are two URL formats depending on the job type:
-
-**Periodic/postsubmit jobs:**
-
-```text
-https://prow.ci.openshift.org/view/gs/test-platform-results/logs/{JOB_NAME}/{JOB_ID}
-https://gcsweb-ci.apps.ci.l2s4.p1.openshiftapps.com/gcs/test-platform-results/logs/{JOB_NAME}/{JOB_ID}
-```
-
-GCS path: `gs://test-platform-results/logs/{JOB_NAME}/{JOB_ID}/`
-
-**Presubmit (PR) jobs:**
-
-```text
-https://prow.ci.openshift.org/view/gs/test-platform-results/pr-logs/pull/openshift_microshift/{PR_NUMBER}/{JOB_NAME}/{JOB_ID}
-https://gcsweb-ci.apps.ci.l2s4.p1.openshiftapps.com/gcs/test-platform-results/pr-logs/pull/openshift_microshift/{PR_NUMBER}/{JOB_NAME}/{JOB_ID}
-```
-
-GCS path: `gs://test-platform-results/pr-logs/pull/openshift_microshift/{PR_NUMBER}/{JOB_NAME}/{JOB_ID}/`
-
-To determine the GCS path from any job URL, strip the web prefix and replace with `gs://`:
-
-- Prow URL: strip `https://prow.ci.openshift.org/view/gs/`
-- GCS web URL: strip `https://gcsweb-ci.apps.ci.l2s4.p1.openshiftapps.com/gcs/`
-
-## Important Files
-
-> These files are available after artifacts are downloaded (via the download script or workflow step 0).
-> For a map of which artifact answers which question (scenario naming, journal patterns, sosreport layout, timeout cascades), read `references/microshift-ci-primer.md` next to this skill.
+`<ARGUMENTS>`: Prow URL, GCS web URL, or local artifacts directory.
 
-- `<TMP>/build-log.txt`: Log containing prow job output and most likely place to identify AWS infra related or hypervisor related errors.
-- `<STEP>/build-log.txt`: Each step in the CI job is individually logged in a build-log.txt file.
-- `<TMP>/artifacts/<TEST_NAME>/openshift-microshift-infra-sos-aws/artifacts/sosreport-*.tar.xz`: Compressed archive containing select portions of the test host's filesystem, relevant logs, and system configurations. `<TEST_NAME>` varies by job (e.g., `e2e-aws-tests`, `e2e-aws-ovn-ocp-conformance-arm64`).
-- `<TMP>/artifacts/<TEST_NAME>/openshift-microshift-e2e-origin-conformance/build-log.txt`: Step-specific build log for origin conformance tests.
-
-## Important Links
-
-**Step Diagram URL** (found at the end of the main build-log):
-
-```text
-https://steps.ci.openshift.org/job?org=openshift&repo=microshift&branch=release-4.19&test=e2e-aws-tests-bootc-nightly&variant=periodics
-```
-
-This link provides a diagram of the steps that make up the test. Think about reading this diagram when identifying step failures because not all fatal errors cause the current step to fail but may cause the next step to fail.
-
-**SOS Report** (contains pod/container logs and cluster-scoped resources)
-
-**Journals:** use the plain-text `journal_*.log` files next to the sosreport tarballs (e.g., `scenario-info/<scenario>/vms/host1/sos/journal_*.log`). These are readable directly with Read/Grep and contain the journal evidence you need (service failures, x509 errors, OOM kills, microshift unit logs).
-
-**Pod logs, cluster state, inspect outputs:** extract a specific sosreport tarball when you need pod logs (container crashes, restarts, probe failures). The extraction script pulls pod logs, inspect outputs, and cluster-scoped resources.
-
-**When to extract a sosreport:** when the journal shows `CrashLoopBackOff`, `Back-off restarting`, repeated `Created container` events, or probe failures after readiness. Pod and container logs — in particular `previous.log`, the only record of WHY a dead container exited — exist exclusively inside the sosreport tarball.
-
-**How to extract:** find the tarball for the scenario, then run the extraction script on that single tarball:
-
-```bash
-# Find sosreport tarballs for the scenario
-find <scenario-dir>/.. -name 'sosreport-*.tar.xz'
-
-# Extract only pod logs, inspect outputs, and cluster-scoped resources
-bash plugins/shared/scripts/extract-sosreport.sh <tarball-path>
-```
-
-The script prints the extraction directory to stdout. Extracted files land in `<tarball-parent>/sos-extracted/<sosreport-name>/`. The extraction is idempotent — running it again on the same tarball is a no-op. Inside the extracted tree:
-
-- `sos_commands/microshift/namespaces/<namespace>/pods/<pod>/<container>/<container>/logs/{current,previous}.log` — container logs
-- `sos_commands/microshift/namespaces/<namespace>/core/{pods.yaml,events.yaml}` — pod status and events
-- `sos_commands/microshift/cluster-scoped-resources/` — nodes, CRDs, webhooks
-- `sos_commands/*/inspect_*` — component command outputs
-
-**There may be several sosreports for a single scenario**: the test framework's sos-on-failure listener (`test/resources/sos-on-failure-listener.py` in openshift/microshift) captures a sosreport at the moment of each test failure, in addition to the one collected at the end of the scenario. **Prefer the on-failure sosreport when investigating a specific test failure**: it contains the pods and container logs of the namespaces created specifically for that test (suite), which are absent from the end-of-scenario sosreport because they have already been cleaned up by then. Match a sosreport to its test failure by capture time.
-
-Correlate journal entries with the failure timestamp recorded during the Characterize phase.
-
-## Performance Graphs
-
-When the input is a local artifacts directory of the form `<WORKDIR>/artifacts/<BUILD_ID>` (the doctor workflow), pre-generated PCP performance graphs may exist in the sibling directory:
-
-```text
-<WORKDIR>/graphs/<BUILD_ID>/
-  1_cpu_usage.png    — CPU usage (user, system, I/O wait)
-  2_mem_usage.png    — Memory usage (used, cached)
-  3_disk_io.png      — Disk I/O (read/write OPS, await)
-  4_disk_usage.png   — Disk usage by partition (% fill)
-```
-
-Use the Read tool to view these PNGs during the drill-down phase whenever the failure involves a timeout, slowness, readiness/health-check expiry, eviction, OOM, or any resource-related error. Look for CPU saturation, memory exhaustion, or disk I/O stalls overlapping the failure window. If the directory does not exist (e.g., standalone URL invocation), skip graph correlation — do not attempt to generate graphs.
+URL formats — periodic: `.../logs/{JOB_NAME}/{JOB_ID}`, presubmit: `.../pr-logs/pull/openshift_microshift/{PR}/{JOB_NAME}/{JOB_ID}`.
+Hosts: `prow.ci.openshift.org/view/gs/test-platform-results/...` or `gcsweb-ci.apps.ci.l2s4.p1.openshiftapps.com/gcs/test-platform-results/...`.
 
 ## Work Directory
 
-Compute once at the start by running `date +%y%m%d` and substituting into the path below. In all commands, replace `<WORKDIR>` with the computed path — do not store the work directory in a shell variable.
-
-```text
-/tmp/microshift-ci-claude-workdir.<YYMMDD>
-```
-
-## Common Commands
-
-Scan the build log for arbitrary text:
-
-```bash
-grep '${SOME_TEXT}' ${GREP_OPTS} ${TMP}/build-log.txt
-```
-
-Download all prow job artifacts (only needed when given a URL, not a local path):
-
-```bash
-GCS_PATH=$(echo "${PROW_URL}" | sed -e 's|https://prow.ci.openshift.org/view/gs/|gs://|' -e 's|https://gcsweb-ci.apps.ci.l2s4.p1.openshiftapps.com/gcs/|gs://|')
-gsutil -q -m cp -r "${GCS_PATH}/" ${TMP}/
-```
+`/tmp/microshift-ci-claude-workdir.<YYMMDD>` — compute `<YYMMDD>` once via `date +%y%m%d`.
 
 ## Workflow
 
 The user argument is: `<ARGUMENTS>`
 
-0. **Determine input type and set up artifacts directory**:
-   - If `<ARGUMENTS>` is a **local directory path** (starts with `/` and contains `build-log.txt`): set `TMP` to that directory. Skip step 1.
-   - If `<ARGUMENTS>` is a **URL** (starts with `http`): create a temporary working directory with `mktemp -d <WORKDIR>/openshift-ci-analysis-XXXX`, set `TMP` to that directory, and proceed to step 1.
+1. **Set up artifacts**:
+   - Local path (starts with `/`): use it as `<TMP>`. Skip step 2.
+   - URL: create `<TMP>` with `mktemp -d <WORKDIR>/openshift-ci-analysis-XXXX`.
 
-1. **Download all artifacts** (skip if using pre-downloaded artifacts from step 0):
-   Download all prow job artifacts using `gsutil -q -m cp -r` into the temporary working directory. Derive the GCS path by stripping the web prefix from the job URL (handles both Prow and GCS web URL formats):
+2. **Download** (URL only):
 
    ```bash
-   GCS_PATH=$(echo "${PROW_URL}" | sed -e 's|https://prow.ci.openshift.org/view/gs/|gs://|' -e 's|https://gcsweb-ci.apps.ci.l2s4.p1.openshiftapps.com/gcs/|gs://|')
-   gsutil -q -m cp -r "${GCS_PATH}/" ${TMP}/
+   GCS_PATH=$(echo "<URL>" | sed -e 's|https://prow.ci.openshift.org/view/gs/|gs://|' \
+                                  -e 's|https://gcsweb-ci.apps.ci.l2s4.p1.openshiftapps.com/gcs/|gs://|')
+   gsutil -q -m cp -r "${GCS_PATH}/" <TMP>/
    ```
 
-   This works for both periodic (`logs/...`) and presubmit PR (`pr-logs/pull/...`) job URLs, and for both Prow and GCS web URL formats.
-   This makes all build logs, step logs, and SOS reports available locally for analysis.
-
-2. **Localize — identify the failed step and the anchor error**:
-   - Scan the top level `build-log.txt` to determine the step where the failure occurred (the last `Running step ...` line before the container logs is a quick anchor — see Tips), then open that step's own `build-log.txt`.
-   - Record each candidate error with its filepath, line number, and timestamp. Read 50 lines before and 50 lines after each to separate the fatal error from setup/teardown noise.
-   - Select the **anchor error**: the first fatal error that caused the step to fail. This becomes `raw_error` in the report.
-   - **The anchor identifies the failure for deduplication — it is NOT the conclusion of the investigation. The first error found is rarely the root cause.**
-
-3. **Characterize — establish exactly WHAT failed before asking why**:
-   - For test steps with scenarios: enumerate the failing tests from `scenario-info/<scenario>/junit.xml` under the step's artifacts, then read the failing scenario's `rf-debug.log` and `phase_*/` logs (Robot Framework marks failures with `| FAIL |`). Record the failing scenario name(s) — the top-level `testsuite name` in each junit.xml — they populate the `scenarios` field in the report.
-   - For each failing scenario, check the plain-text `journal_*.log` files (next to the sosreport tarballs) for fatal patterns (panics, OOM kills, `leader election lost`, container exits). If the journal shows container crashes or restarts, extract the specific sosreport tarball with `bash plugins/shared/scripts/extract-sosreport.sh <tarball>` and read the pod logs (see SOS Report section).
-   - For conformance steps: extract the failing test names and their failure output from the step's `build-log.txt`.
-   - For build/infra steps: extract the failing command and its complete error output from the step log.
-   - Record the failure timestamp(s) — they drive the journal and graph correlation in the next phase.
-   - When the MicroShift source checkout is available — check with Glob for `<WORKDIR>/src/microshift-release-<RELEASE>/` (release jobs) or `<WORKDIR>/src/microshift/` (main) — read the failing test's source: Robot Framework suites under `test/suites/`, scenario definitions under `test/scenarios*/`. Its assertions, timeouts, and setup are how you distinguish a test bug from a product bug. If the checkout is absent, note `"source checkout not available"` in `analysis_gaps` and continue.
-   - Decide the stack layer: cloud infra, ci-config, hypervisor, or a legitimate test failure — and for test failures, the stage: setup, testing, teardown.
+3. **Extract evidence**:
 
-4. **Drill down — iterate hypothesis → evidence until the cause is actionable**:
-   Repeat this loop until you reach a cause that is **actionable** (a specific code, configuration, test, or infrastructure problem someone can act on) or until the available evidence is exhausted:
-   - State a hypothesis for WHY the error in hand occurred.
-   - Seek confirming or refuting evidence ONE LAYER DEEPER than the current log:
-     - **Journal** — ALWAYS check the plain-text `journal_*.log` files for the scenario (see SOS Report section). Correlate with the failure timestamp (entries within ±5 minutes) and scan for OOM kills, segfaults, service restarts, and disk pressure.
-     - **Sosreport** — when the journal shows container crashes or restarts, extract the specific sosreport tarball with `bash plugins/shared/scripts/extract-sosreport.sh <tarball>` (see SOS Report section for how to pick the right one when several exist). Read the pod/container logs of the failing workload.
-     - **Performance graphs** — when the failure involves a timeout, slowness, readiness/health-check expiry, eviction, or any resource error, Read the PNGs (see Performance Graphs section) and look for saturation overlapping the failure window.
-   - Treat restating errors as symptoms: an error like "timed out waiting for X" is NOT a root cause — explain why X was slow or absent, or explicitly record that the evidence ran out.
-   - **A test-layer fix is never the bottom when a product component misbehaved.** When the failure involves a product component that was unavailable, not ready, crashed, or slow ("no endpoints available", "connection refused", "not ready", "CrashLoopBackOff", probe failures), you MUST reconstruct that component's story from the journal and its pod logs before concluding. Build an exact timestamped timeline: when was the pod created, when did each container start, when did it become ready, did probes fail afterwards, did it restart, and why. Only then attribute the failure:
-     - **Product defect** — the component became ready and later flapped, crashed, or stopped serving (e.g., readiness flips back to not-ready, liveness probe connection refused after startup, container exits and restarts). Report the product mechanism as the root cause even if a test-side wait would also "fix" the symptom.
-     - **Test defect** — the component was still starting up normally and the test simply ran too early against a documented startup sequence.
-   - **Always check for container restarts.** Grep the journal for repeated `Created container`/`Started container` (crio) and `RemoveContainer`/PLEG events (kubelet) for the same pod. Two container instances for one pod means the first one DIED — a single startup story is the wrong narrative. Extract the sosreport (`bash plugins/shared/scripts/extract-sosreport.sh <tarball>`) and read the dead container's log at `sos_commands/microshift/namespaces/<namespace>/pods/<pod>/<container>/<container>/logs/previous.log` (`current.log` is the running instance). The last ~20 lines of `previous.log` usually state the exit reason (fatal error, leader election lost, panic, OOM).
-   - Record every accepted hop as a causal-chain link with its evidence file and line — these become `causal_chain` in the report. Discarded hypotheses do not go into the chain.
+   ```bash
+   python3 plugins/shared/scripts/extract-evidence.py --artifacts-dir <TMP> --workdir <WORKDIR>
+   ```
 
-5. **Corroborate — cross-check the explanation**:
-   - When the source checkout is available, list commits from the last month that could be related:
+   Produces `<WORKDIR>/evidence/evidence-<BUILD_ID>.json`. The `<BUILD_ID>` is the last path component of `<TMP>`.
 
-     ```text
-     bash plugins/microshift-ci/scripts/repo-log.sh <SRC_DIR> --since <1_MONTH_BEFORE_FINISHED> --until <FINISHED_DATE> --paths test/
-     ```
+4. **Analyze**: Read `plugins/microshift-ci/agents/analyze-evidence.md`. Substitute placeholders:
 
-     Derive `FINISHED_DATE` from the job's `finished.json` timestamp. Drop `--paths` to see all changes. Name candidate commits in the causal chain when their timing and touched paths match the failure.
-   - If multiple scenarios in this job failed, decide cascade vs independent using the **timeline** (which failed first; did the earlier failure poison shared state?), not just error-text similarity.
+   | Placeholder | Value |
+   |---|---|
+   | `{EVIDENCE_PACK}` | `<WORKDIR>/evidence/evidence-<BUILD_ID>.json` |
+   | `{JOB_NAME}` | job name extracted from URL or directory path |
+   | `{JOB_URL}` | the original URL (or, for a local directory, reconstructed from the artifacts path) |
+   | `{OUTPUT_FILE}` | `<WORKDIR>/report-<BUILD_ID>.txt` |
 
-6. **Produce a report**: Create a concise report of the failure. The report MUST specify:
-   - Where in the pipeline the error occurred
-   - The specific step the error occurred in
-   - Whether the test failure was legitimate (i.e., a test failed) or due to an infrastructure failure (i.e., build image was not found, AWS infra failed due to quota, hypervisor failed to create test host VM, etc.)
-   - The causal chain from the observed symptom to the root cause, each link backed by evidence (file and line)
-   - A confidence rating for the root cause (see the field rules below)
+   Spawn the agent with the substituted content. When it replies `DONE`, read the output file and present the report to the user.
 
 ## Prerequisites
 
-- `gsutil` CLI must be installed for GCS access (uses anonymous access on public buckets; only needed for URL input — pre-downloaded artifacts skip it)
-- Internet access to fetch job data from Prow/GCS
-- Bash shell
-
-## Tips
-
-1. There are many setup and teardown stages so fatal errors may be buried by log output from the teardown phase. It is not common to find the fatal error at the end of the log.
-2. You can quickly determine the failed step from the build-log.txt by reading the last `Running step e2e-aws-tests-bootc-nightly-openshift-microshift-e2e-metal-tests` line before the container logs appear.
-
-## Output Template
-
-Use this template for your error analysis reports:
-
-```text
-Error Severity: {1-5, per the rubric below}
-Stack Layer: {AWS Infra, External Infrastructure, build phase, deploy phase, test setup phase, Test Configuration, test, teardown}
-Step Name: {The specific step where the error occurred}
-Error: {The exact error, including additional log context if it relates to the failure}
-Causal Chain: {numbered list from observed symptom to root cause; each link cites its evidence as file:line}
-Confidence: {high | medium | low — see CONFIDENCE rules below}
-Suggested Remediation: {Based on where the error occurs, think hard about how to correct the error ONLY if it requires fixing. Infrastructure failures may not require code changes.}
-```
-
-### Severity rubric
-
-| Severity | Meaning |
-|---|---|
-| 5 | Release-blocking product regression — product broken, no workaround |
-| 4 | Persistent product or test failure with no workaround |
-| 3 | Persistent failure with a workaround, or scoped to a single scenario/architecture |
-| 2 | Intermittent failure / likely flake |
-| 1 | Infrastructure noise or self-healing condition |
-
-After the human-readable report above, append a machine-readable JSON block for downstream automation. This block MUST appear at the very end of the report, after all prose and analysis. The block is a JSON array with one object per failure.
-
-**CRITICAL:** You MUST include BOTH the opening `--- STRUCTURED SUMMARY ---` marker AND the closing `--- END STRUCTURED SUMMARY ---` marker.
-
-```text
---- STRUCTURED SUMMARY ---
-[
-  {
-    "severity": 3,
-    "stack_layer": "test",
-    "step_name": "openshift-microshift-e2e-metal-tests",
-    "error_signature": "cert-manager not ready within greenboot 10m timeout on ARM",
-    "root_cause": "greenboot health check timeout during slow ARM service deployment",
-    "raw_error": "cert-manager webhook not ready after 600s",
-    "infrastructure_failure": false,
-    "job_url": "https://prow.ci.openshift.org/view/gs/test-platform-results/logs/periodic-ci-openshift-microshift-release-4.22-periodics-e2e-aws-tests-arm-nightly/123456",
-    "job_name": "periodic-ci-openshift-microshift-release-4.22-periodics-e2e-aws-tests-arm-nightly",
-    "release": "4.22",
-    "remediation": "investigate greenboot timeout configuration for ARM deployments",
-    "finished": "2026-06-01",
-    "causal_chain": [
-      {"cause": "cert-manager webhook pod not Ready before greenboot deadline",
-       "evidence": "artifacts/e2e-aws-tests-arm-nightly/openshift-microshift-e2e-metal-tests/artifacts/scenario-info/el96-lrel@standard1/rf-debug.log:2241",
-       "quote": "cert-manager webhook not ready after 600s"},
-      {"cause": "image pulls saturated disk I/O, delaying all service startups",
-       "evidence": "graphs/123456/3_disk_io.png",
-       "quote": "write await >800ms during 06:18-06:24 startup window"}
-    ],
-    "confidence": "medium",
-    "analysis_gaps": [],
-    "scenarios": ["el96-lrel@standard1", "el94-y2@el96-lrel@standard1"]
-  }
-]
---- END STRUCTURED SUMMARY ---
-```
-
-**Field descriptions:**
-
-- `severity`: 1-5, same as Error Severity above
-- `stack_layer`: one of: AWS Infra, External Infrastructure, build phase, deploy phase, test setup phase, Test Configuration, test, teardown
-- `step_name`: the CI step where the error occurred
-- `error_signature`: a concise, unique one-line description of the root cause — not the full error, just enough to identify and deduplicate this failure
-- `root_cause`: one-line description of WHY the failure happened — the underlying mechanism, not the surface symptom (~80 chars max, see rules below)
-- `raw_error`: the primary error message copied VERBATIM from the log file (see rules below)
-- `infrastructure_failure`: true if stack_layer is AWS Infra or the failure is due to CI infrastructure rather than product code, false otherwise
-- `job_url`: the full prow job URL — when given a URL as input, use it directly; when given a local artifacts dir, reconstruct from the build-log.txt "Link to job on registry info site" line or from the directory path structure
-- `job_name`: the full job name — extract from the job_url path, or from the build-log.txt "Running step" lines, or from the artifacts directory structure
-- `release`: the release branch — extract from job_name (e.g. 4.22 from release-4.22), or from finished.json metadata repos field, or default to "main"
-- `remediation`: suggested fix or next step — what should be done to address this failure (~120 chars max). For infrastructure failures, state the infra action (e.g. "retry the job", "rotate AWS credentials"). For product bugs, state the code-level fix direction. Do NOT propose making the test more tolerant (waits, retries, longer timeouts) unless the causal chain shows the product behaved correctly — masking a product flake with a test change hides the defect
-- `finished`: the job finish date in YYYY-MM-DD format, extracted from finished.json timestamp field or build log timestamps
-- `causal_chain`: array of links from the observed symptom toward the root cause, in order, built during the drill-down phase. Each link is `{"cause": ..., "evidence": ..., "quote": ...}` where `evidence` is the artifact file path (relative to the artifacts dir, with `:line` where applicable) and `quote` is a short verbatim excerpt supporting the link — copied exactly, with NO prepended labels, summaries, or commentary. The evidence path MUST be a file that actually exists — cite the path you read, not a description of it. **Before finalizing the report, re-read every cited `file:line` and confirm the quote is actually there** — a wrong citation destroys trust in the whole analysis and is worse than an honest gap. A single-link chain is valid when the anchor error IS the actionable cause
-- `confidence`: one of `high`, `medium`, `low` (see CONFIDENCE rules below)
-- `analysis_gaps`: array of strings naming evidence that was missing or could not be checked (e.g. `"no sosreport in artifacts"`, `"source checkout not available"`). Empty array when nothing was skipped
-- `scenarios`: array of scenario names in which this failure occurred, taken from the `scenario-info/<scenario>/` directory names or the junit `testsuite name` (e.g. `["el96-lrel@standard2"]`). Empty array `[]` for non-scenario-based jobs and for build/infra failures that happen before scenarios run
-
-### CONFIDENCE rules
-
-- `high`: every causal-chain link, including the final (root) one, is directly evidenced by a quoted artifact line or graph
-- `medium`: the mechanism is inferred but consistent with all available evidence; no link is contradicted
-- `low`: the analysis is symptom-level only — the chain stops before an actionable cause because the evidence ran out (`analysis_gaps` MUST be populated in this case)
-
-Do NOT inflate confidence: downstream automation uses it to decide whether to act on the analysis. A `low` confidence report with honest gaps is more useful than a `high` confidence guess.
-
-### RAW_ERROR rules
-
-The `RAW_ERROR` field is used by downstream scripts for deterministic grouping. Two runs analyzing the same job MUST produce the same RAW_ERROR. Keep it simple — fewer rules mean less room for variation.
-
-RAW_ERROR is the **deduplication anchor**, not the investigation result: picking the first fatal error here does NOT mean the analysis stops there — the drill-down phase and `causal_chain` capture the actual root cause investigation.
-
-1. **Copy-paste the exact error text** from the log — do NOT paraphrase, summarize, or reword
-2. **Pick only ONE error** — the primary error that caused the step to fail. If multiple errors exist, pick the first fatal one.
-3. **Only strip timestamps** — remove leading timestamps like `2026-04-01T06:21:48Z`. Keep everything else verbatim, including prefixes like `An error occurred...` or `error:`.
-4. **Never concatenate multiple errors** — pick ONE error, not a semicolon-separated list
-5. **Truncate to ~150 characters** if the raw message is very long — keep the distinctive part
-
-Examples of good RAW_ERROR values (copied verbatim from logs):
-
-- `An error occurred (InvalidClientTokenId) when calling the CreateStack operation: The security token included in the request is invalid.`
-- `panic: runtime error: index out of range [6] with length 6`
-- `Process did not finish before 4h0m0s timeout`
-- `error: the server doesn't have a resource type "clusterversion"`
-- `package github.com/opencontainers/runc/libcontainer/cgroups: module github.com/opencontainers/runc@latest found, but does not contain package`
-
-The ERROR_SIGNATURE field remains as a human-readable description for reports and Jira bug titles.
-
-### ROOT_CAUSE rules
-
-The `ROOT_CAUSE` field captures the underlying mechanism behind the failure — used by downstream scripts alongside `RAW_ERROR` for cross-release deduplication. Two jobs that fail with different surface errors but the same root cause should produce the same `ROOT_CAUSE`.
-
-**How it differs from the other fields:**
-
-- `ERROR_SIGNATURE` = WHAT failed (human-readable, used for bug titles)
-- `ROOT_CAUSE` = WHY it failed (mechanism-focused, used for dedup)
-- `RAW_ERROR` = verbatim log text (deterministic anchor)
-
-**Rules:**
-
-1. **One line, ~80 characters max** — short enough for token-based matching
-2. **Focus on the mechanism**, not the symptom — ask "why did this happen?" not "what error appeared?"
-3. **Be consistent across releases** — the same underlying problem in 4.20 and 4.22 MUST produce the same ROOT_CAUSE even if the error messages differ
-4. **Use stable terms** — avoid version numbers, timestamps, job names, or other run-specific details
-
-**Examples:**
-
-| ERROR_SIGNATURE | ROOT_CAUSE |
-|---|---|
-| MonitorTest failures (SCC annotations, disruption pollers) on ARM64 | OCP MonitorTest framework incompatible with MicroShift single-node topology |
-| Pod-network-disruption monitor poller CrashLoopBackOff on ARM64 | OCP MonitorTest framework incompatible with MicroShift single-node topology |
-| cert-manager not ready within greenboot 10m timeout on ARM | greenboot health check timeout during slow ARM service deployment |
-| InvalidClientTokenId when calling CreateStack | expired or invalid AWS credentials in CI environment |
-
-### Multiple independent failures
-
-When a job has multiple independent test failures across different scenarios, produce **one entry per failure** in the JSON array. Each entry must be self-contained with all fields populated.
-
-**Rules:**
-
-1. **One entry per independent failure** — failures are independent when they occur in different test scenarios with different root causes (e.g., cert-manager timeout in one scenario and storage PV error in another)
-2. **Same root cause = one entry** — when multiple scenarios fail with the same root cause, produce ONE entry. Do NOT split them into separate entries.
-3. **At most 5 entries per job** — if more than 5 independent failures exist, report the 5 most severe
-4. **Cascading failures are NOT independent** — when one failure causes others (e.g., a setup failure causing all subsequent tests to fail), report only the root failure
-5. **Single failures are still an array** — even when there is only one failure, wrap it in a JSON array
+- `gsutil` CLI (for URL input), Python 3, Bash
diff --git a/plugins/microshift-ci/skills/prow-job/references/microshift-ci-primer.md b/plugins/microshift-ci/skills/prow-job/references/microshift-ci-primer.md
deleted file mode 100644
index 9b29056a..00000000
--- a/plugins/microshift-ci/skills/prow-job/references/microshift-ci-primer.md
+++ /dev/null
@@ -1,253 +0,0 @@
-# MicroShift CI Artifact Primer
-
-Reference for analyzing MicroShift Prow job artifacts. Read this when
-unfamiliar with the artifact layout — it answers "which file answers
-which question".
-
-## Job types
-
-- **Scenario-based e2e jobs** (`e2e-aws-tests-*`): the
-  `openshift-microshift-e2e-metal-tests` step boots ~20 VM-based test
-  scenarios on a hypervisor host. Failures are per-scenario; "the job
-  failed" usually means at least 1 scenario failed.
-- **Direct-test jobs** (`*-ocp-conformance-*`, `e2e-aws-ai-model-serving-nightly`,
-  `e2e-aws-footprint-and-performance-*`): run their test suite directly,
-  no scenario fan-out. Job history IS the test history for these.
-
-## Test framework
-
-Tests are written in [Robot Framework](https://robotframework.org).
-Suites live in `test/suites/` as `.robot` files. Shared keywords and
-Python helpers live in `test/resources/` (e.g. `common.resource`,
-`microshift-host.resource`, `ostree.resource`, `sos-on-failure-listener.py`).
-Each scenario defines which suites to run, the VM image to boot, and
-any Robot variables (e.g. `EXPECTED_OS_VERSION`, `TARGET_REF`).
-
-Key runtime settings (overridable per scenario):
-
-- `TEST_EXECUTION_TIMEOUT` — default `30m`; the scenario runner wraps
-  Robot Framework in `timeout -v --kill-after=5m <timeout>`. When the
-  suite total exceeds this, the current test dies with
-  `Execution terminated by signal` and every subsequent test reports
-  `Test execution stopped due to a fatal error` — a cascade with ONE
-  root cause (the time budget), not independent failures.
-- `TEST_RANDOMIZATION` — default `all`; tests run in random order, so
-  test ordering in `rf-debug.log` varies between runs.
-- `TEST_EXCLUDES` — tag-based exclusion (default `none`).
-
-## Deployment types: ostree vs bootc vs RPM
-
-There are three distinct deployment pipelines for MicroShift on VMs.
-Scenarios (`.sh` files) are the same structure for all three, but how
-MicroShift gets onto the VM differs:
-
-- **ostree (rpm-ostree)** — images defined as TOML blueprints in
-  `test/image-blueprints/`. Built by `osbuild-composer` into
-  edge-commit images, installed via kickstart + ISO. Scenarios live
-  under `test/scenarios/`.
-- **bootc** — images defined as Containerfiles (with Go template
-  support) in `test/image-blueprints-bootc/`. Built as OCI container
-  images, installed via bootc. Scenarios live under
-  `test/scenarios-bootc/`.
-- **RPM** — a non-ostree RHEL system installed from a live image
-  (`kickstart-liveimg.ks.template` with `main-liveimg.cfg`), similar
-  to isolated/offline scenarios. MicroShift may be pre-installed in the
-  image or installed at test time via `dnf` from source-built or Brew
-  RPM repos. RPM suites live in `test/suites/rpm/` (install,
-  upgrade, remove).
-
-The job name indicates which pipeline was used (e.g.
-`e2e-aws-tests-bootc-*` vs `e2e-aws-tests-*`). All three produce the
-same artifact layout under `scenario-info/`.
-
-## Scenario naming
-
-Scenario names encode OS image, MicroShift version source, and suite.
-The `@` separator chains stages left-to-right: starting image →
-intermediate upgrades → final image → test suite.
-
-### Version-source markers
-
-- `src` — built from source (the code in the PR or branch)
-- `base` — built from the PR's target branch
-- `prel` — previous minor release (Y-1 as a released build)
-- `crel` — current minor release (already-released RPMs: EC, RC, or
-  z-stream); skipped shortly after branch cut before the first EC
-- `lrel` — latest available release (EC, RC, or z-stream) from internal
-  Red Hat staging repositories
-- `zprel` — z-previous release: latest z-stream from the rhocp repository
-- `y1` / `y2` — Y-1 / Y-2 minor versions back (e.g. on release-4.22,
-  `y1` = 4.21, `y2` = 4.20); also spelled `yminus1` / `yminus2` in
-  some scenario filenames
-
-### OS version tokens
-
-- `el96` / `el98` / `el102` — RHEL 9.6 / 9.8 / 10.2
-
-### Reading multi-@ names
-
-| Name | Meaning |
-| ---- | ------- |
-| `el96-lrel@standard1` | RHEL 9.6 + latest release of MicroShift, standard suite 1 |
-| `el94-y2@el96-lrel@standard1` | Start on RHEL 9.4 + Y-2 MicroShift, upgrade to RHEL 9.6 + latest release of MicroShift, run standard suite 1 |
-| `el96-yminus2@prel@src@delta-upgrade-ok` | Start on RHEL 9.6 + Y-2, upgrade through Y-1 (prel) to source, using static deltas |
-
-The last `@`-segment is always the test suite or test type.
-
-### Suite tokens
-
-`standard1`/`standard2`, `lvm`, `dual-stack`, `ipv6`, `multi-nic`,
-`low-latency`, `ginkgo-tests`, `ai-model-serving-online`, `osconfig`,
-`storage`, `tlsv13-*`, `multi-config-*`, `c2cc`, `c2cc-ipv6`,
-`c2cc-ipsec`, `upgrade-ok`, `upgrade-fails-*`, `auto-recovery`,
-`greenboot`, `fips`, `offline`, `isolated-net`, `cncf-conformance`,
-`rpm-*`, `delta-upgrade-*`
-
-### Disabled scenarios
-
-Scenario files ending in `.sh.disabled` are skipped by the CI runner.
-They appear in the repo but produce no artifacts.
-
-Scenario definitions (what each one deploys and runs) live in
-openshift/microshift under `test/scenarios*/` (e.g.
-`test/scenarios-bootc/el9/`); Robot Framework suites under
-`test/suites/`.
-
-## How scenarios run in CI
-
-All scenarios run **in parallel** via GNU `parallel`:
-
-```text
-parallel --results <scenario-info>/{/.}/boot_and_run.log \
-    --delay 5 \
-    bash -x ./bin/scenario.sh create-and-run ::: <scenarios>/*.sh
-```
-
-`create-and-run` executes two phases per scenario:
-
-1. **create** (`action_create`) — load scenario script, create VMs,
-   wait for greenboot health check, collect SOS report + PCP archives
-   on failure. Infrastructure junit goes to `phase_create/junit.xml`.
-2. **run** (`action_run`) — execute `scenario_run_tests()` (which calls
-   Robot Framework), collect SOS + PCP on failure. Infrastructure junit
-   goes to `phase_run/junit.xml`.
-
-Because scenarios run in parallel on the same hypervisor, resource
-contention (CPU, disk I/O, memory) can cause timeouts that don't
-reproduce in isolation. These are infrastructure failures — still
-report them, but attribute them to shared-hypervisor contention rather
-than a product or test bug.
-
-## Where the evidence lives
-
-Per scenario, under
-`artifacts/<TEST_NAME>/openshift-microshift-e2e-metal-tests/artifacts/scenario-info/<scenario>/`:
-
-| File | Answers |
-| ---- | ------- |
-| `junit.xml` | Which tests failed; the top-level `testsuite name` IS the scenario name |
-| `rf-debug.log` | Robot Framework execution trace with timestamps — failures marked `\| FAIL \|`; the primary test-failure evidence |
-| `boot_and_run.log` | VM boot + scenario orchestration; timeouts killing the whole scenario show up here (`timeout: sending signal TERM`) |
-| `phase_create/junit.xml` | Infrastructure-level junit from VM creation (greenboot check, kickstart, SOS collection) — distinct from the test-level `junit.xml` |
-| `phase_run/junit.xml` | Infrastructure-level junit from the test run phase |
-| `vms/host1/sos/journal_*.log` | **Plain-text journal exports** — readable without extracting anything; check these FIRST for service failures, x509 errors, OOM kills |
-| `vms/host1/sos/sosreport-*.tar.xz` | Full sosreports (see below) |
-
-## Sosreports
-
-- Two types of sosreport are collected:
-  1. **On-failure** — the `sos-on-failure-listener.py` Robot Framework
-     listener captures a sosreport at each test-case-level keyword
-     failure. This report includes the namespaces that the test created
-     (detected by tracking Robot variables containing "namespace" or
-     "ns"). **Prefer this report**: by the end of the scenario, test
-     namespaces are cleaned up and their pod logs are gone.
-  2. **End-of-scenario** — collected during teardown regardless of
-     pass/fail. Contains system state but may lack test-created
-     workloads.
-  Match report to failure by comparing the sosreport's capture timestamp
-  with the failure timestamp from `rf-debug.log`.
-- **Journals:** use the plain-text `journal_*.log` files next to the
-  sosreport tarballs — no extraction needed.
-- **Pod logs:** extract a specific tarball with
-  `bash plugins/shared/scripts/extract-sosreport.sh <tarball>`.
-  This extracts pod logs, inspect outputs, and cluster-scoped
-  resources (not journals or the full filesystem) into
-  `<tarball-parent>/sos-extracted/<sosreport-name>/`.
-- The on-failure listener respects the `SKIP_SOS` environment variable —
-  when `true`, no on-failure reports are generated (development
-  environments only; CI always collects them).
-- Inside an extracted report:
-  - Per-namespace pod logs:
-    `sos_commands/microshift/namespaces/<ns>/pods/<pod>/<container>/<container>/logs/current.log`
-    — and `previous.log` when the container was restarted. **The tail of
-    `previous.log` states why the container died** (fatal error, leader
-    election lost, panic).
-  - Cluster-scoped resources:
-    `sos_commands/microshift/cluster-scoped-resources/` — nodes, CRDs,
-    webhooks.
-  - Component inspect outputs: `sos_commands/*/inspect_*`.
-
-## Greenboot health check
-
-Before running tests, the scenario runner waits for
-`greenboot-healthcheck.service` to reach `exited` state. This verifies
-MicroShift started successfully. If greenboot fails or times out
-(`VM_GREENBOOT_TIMEOUT`), the scenario aborts with
-`pre_test_greenboot_check FAILED` in `phase_create/junit.xml` and no
-tests run.
-
-In the journal, look for `40_microshift_running_check.sh` lines —
-they show which deployments greenboot waited for and when each became
-ready. The final verdict is
-`greenboot[...]: Script '40_microshift_running_check.sh' SUCCESS/FAILURE`.
-
-## Reading the journal for component failures
-
-Reconstruct a timestamped component timeline before attributing fault:
-
-- Pod lifecycle: kubelet `SyncLoop (PLEG)` events, `Created container` /
-  `Started container` (crio), `SyncLoop (probe)` readiness transitions,
-  `prober.go "Probe failed"` lines.
-- **Two `Created container` events for the same pod = the first instance
-  died and was restarted** — a single startup narrative is wrong; read
-  `previous.log` for the exit reason.
-- etcd pressure: `apply request took too long` warnings indicate
-  apiserver/etcd latency (can cost components their leader-election
-  leases).
-
-## Common failure patterns
-
-### Timeout cascade
-
-When `TEST_EXECUTION_TIMEOUT` (default 30m) expires, the `timeout`
-command sends TERM to Robot Framework. The current test dies with
-`Execution terminated by signal` and every subsequent test reports
-`Test execution stopped due to a fatal error`. This is a cascade
-with ONE root cause — identify what consumed the time budget.
-
-### Greenboot failure masking test failures
-
-If greenboot fails, no tests run — the only junit is the
-infrastructure-level `phase_create/junit.xml` recording the
-`pre_test_greenboot_check FAILED`. The root cause is in the journal
-(MicroShift didn't start, a deployment didn't become ready, etc.).
-
-### Resource contention from parallel scenarios
-
-All scenarios share a single hypervisor. When many scenarios boot
-simultaneously, CPU/memory/disk contention can cause:
-
-- Slow MicroShift startup → greenboot timeouts
-- etcd `apply request took too long` → leader election loss
-- Image pull timeouts
-
-Report these as infrastructure failures attributed to
-shared-hypervisor contention.
-
-## Search/index coverage of external tools
-
-- Sippy tracks these jobs at **job level only** — scenario junits and RF
-  suite names are not ingested.
-- Search.CI indexes build logs and junit, **not** scenario-internal logs
-  (`rf-debug.log` content is not searchable).
diff --git a/plugins/shared/scripts/parse.py b/plugins/shared/scripts/parse.py
index 06df9429..d4051729 100644
--- a/plugins/shared/scripts/parse.py
+++ b/plugins/shared/scripts/parse.py
@@ -45,8 +45,14 @@ def parse_structured_summary(filepath):
     if not m:
         return []
 
+    json_text = m.group(1)
+    # LLM agents sometimes copy raw control characters (tabs, CRs) from
+    # build logs into JSON string values.  Sanitize before parsing.
+    json_text = json_text.replace('\t', '\\t').replace('\r', '\\r')
+    json_text = re.sub(r'[\x00-\x09\x0b\x0c\x0e-\x1f\x7f]', '', json_text)
+
     try:
-        entries = json.loads(m.group(1))
+        entries = json.loads(json_text)
     except json.JSONDecodeError:
         return []
 
diff --git a/plugins/shared/scripts/validate-reports.py b/plugins/shared/scripts/validate-reports.py
new file mode 100644
index 00000000..e7147a48
--- /dev/null
+++ b/plugins/shared/scripts/validate-reports.py
@@ -0,0 +1,145 @@
+#!/usr/bin/env python3
+"""Validate causal-chain citations in per-job analysis reports.
+
+Checks that every causal-chain link in the STRUCTURED SUMMARY has:
+  - An 'evidence' field that looks like a file path (existence is not checked)
+  - A 'quote' field with non-empty text
+
+Prints validation errors per file.  Exits 0 if all files pass, 1 if any fail.
+
+Usage:
+    validate-reports.py <report1.txt> [<report2.txt> ...]
+    validate-reports.py <workdir>/jobs/release-*-job-*.txt
+"""
+
+import json
+import os
+import re
+import sys
+
+
+def _is_valid_evidence(evidence):
+    """Return True when *evidence* resembles a file citation rather than prose."""
+    candidate = evidence.strip() if evidence else ""
+    if not candidate:
+        return False
+    # Openers that signal an appeal to background knowledge, not an artifact.
+    non_file_openers = (
+        "architectural",
+        "design",
+        "general knowledge",
+        "by definition",
+        "well-known",
+        "documentation",
+    )
+    if candidate.lower().startswith(non_file_openers):
+        return False
+    # A citation needs at least one of the characters found in paths/filenames.
+    return any(marker in candidate for marker in "/:.")
+
+
+def validate_file(filepath):
+    """Validate the causal-chain citations of a single report file.
+
+    Returns a list of error strings, one per chain link whose 'evidence' or
+    'quote' is missing or whose evidence does not look like a file path.
+    Reports with no summary block or unparseable JSON yield an empty list.
+    """
+    # Explicit UTF-8 so decoding does not depend on the locale.
+    with open(filepath, "r", encoding="utf-8", errors="replace") as f:
+        content = f.read()
+
+    m = re.search(
+        r"--- STRUCTURED SUMMARY ---\n(.+?)(?:\n--- END STRUCTURED SUMMARY ---|\Z)",
+        content,
+        re.DOTALL,
+    )
+    if not m:
+        return []  # no summary block — not our problem (parse.py handles this)
+
+    # Drop stray control characters, but leave tab/CR/LF alone: they are legal
+    # JSON whitespace between tokens, and strict=False accepts them raw inside
+    # strings.  Escaping them textually would corrupt tab-indented JSON.
+    json_text = re.sub(r"[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]", "", m.group(1))
+
+    try:
+        entries = json.loads(json_text, strict=False)
+    except json.JSONDecodeError:
+        return []  # malformed JSON — parse.py handles this
+
+    if isinstance(entries, dict):
+        entries = [entries]
+    if not isinstance(entries, list):
+        return []
+
+    errors = []
+    for entry in entries:
+        if not isinstance(entry, dict):
+            continue
+        chain = entry.get("causal_chain") or []
+        sig = entry.get("error_signature", "<unknown>")
+        for li, link in enumerate(chain, start=1):
+            if not isinstance(link, dict):
+                continue
+            # Agents may emit null or non-string values; coerce so a bad
+            # field is reported as missing instead of crashing the validator.
+            cause = str(link.get("cause") or "")
+            evidence = str(link.get("evidence") or "").strip()
+            quote = str(link.get("quote") or "").strip()
+
+            if not evidence and not quote:
+                problem = "missing both evidence and quote"
+            elif not evidence:
+                problem = "missing evidence field"
+            elif not quote:
+                problem = "missing quote field"
+            elif not _is_valid_evidence(evidence):
+                problem = f"evidence is not a file path: '{evidence[:80]}'"
+            else:
+                continue
+            errors.append(f"  [{sig}] chain link {li}: {problem} — cause: {cause[:80]}")
+
+    return errors
+
+
+def main():
+    """Validate each report named on the command line; exit 1 on any failure."""
+    if len(sys.argv) < 2:
+        print("Usage: validate-reports.py <report.txt> [...]", file=sys.stderr)
+        sys.exit(2)
+
+    # Non-file arguments (e.g. an unmatched shell glob) are skipped and must
+    # not be counted as checked.
+    files = [p for p in sys.argv[1:] if os.path.isfile(p)]
+    total_errors = 0
+    failed_files = []
+
+    for filepath in files:
+        name = os.path.basename(filepath)
+        errors = validate_file(filepath)
+        if errors:
+            print(f"FAIL {name}:")
+            for e in errors:
+                print(e)
+            print()
+            total_errors += len(errors)
+            failed_files.append((filepath, errors))
+        else:
+            print(f"OK   {name}")
+
+    print(f"\n{len(files)} files checked, {len(failed_files)} failed, {total_errors} errors")
+
+    if failed_files:
+        # Machine-readable output for the doctor workflow
+        print("\n--- VALIDATION FAILURES ---")
+        for filepath, errors in failed_files:
+            print(f"FILE: {filepath}")
+            for e in errors:
+                print(e)
+            print()
+        print("--- END VALIDATION FAILURES ---")
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()

From 65a528c7d35a6708411c5e210b0b225394dee78a Mon Sep 17 00:00:00 2001
From: Patryk Matuszak <pmatusza@redhat.com>
Date: Fri, 3 Jul 2026 11:58:33 +0200
Subject: [PATCH 3/7] ci-doctor: verify causal-chain citations against real
 artifacts
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The validator previously only checked that 'evidence' looked like a
path — a hallucinated-but-plausible citation passed. It now resolves
each citation against the job's downloaded artifacts (build dir derived
from the entry's job_url), checks the file exists, the line is in
range, and the quote actually appears near the cited line (timestamps
stripped, whitespace normalized). Error messages include where the
quote really is so fix agents can re-ground citations instead of
guessing.

Fix agents are no longer told to delete unsupported links to pass
validation — they must re-ground each link or move the claim to
analysis_gaps and downgrade confidence, then re-run the validator on
their own output.

Evidence packs now record the source file for every rf/boot_and_run/
journal alert entry (journal alerts from multiple files are merged, so
line numbers alone were ambiguous).

Drop missing_patterns from the agent contract: nothing consumed it —
parse.py discarded it at aggregation — so it was pure token cost.
---
 .../microshift-ci/agents/analyze-evidence.md  |   6 +-
 .../agents/references/structured-summary.md   |   1 -
 plugins/microshift-ci/skills/doctor/SKILL.md  |  30 +-
 plugins/shared/scripts/extract-evidence.py    |   7 +-
 plugins/shared/scripts/validate-reports.py    | 273 ++++++++++++++----
 5 files changed, 252 insertions(+), 65 deletions(-)

diff --git a/plugins/microshift-ci/agents/analyze-evidence.md b/plugins/microshift-ci/agents/analyze-evidence.md
index 8344a630..090d5ed3 100644
--- a/plugins/microshift-ci/agents/analyze-evidence.md
+++ b/plugins/microshift-ci/agents/analyze-evidence.md
@@ -28,16 +28,19 @@ Read `{EVIDENCE_PACK}` and `plugins/microshift-ci/agents/references/microshift-c
 Iterate hypothesis → evidence until the cause is actionable.
 
 **Mandatory raw-log verification** — BEFORE concluding, even when the evidence pack looks sufficient:
+
 - Read ~200 lines of raw journal around the failure timestamp — look for patterns NOT in the evidence pack (authorization denials, scheduler errors, admission failures, kubelet sandbox errors).
 - When a sosreport exists, check **kube-apiserver** pod logs for authorization/admission/scheduling decisions.
 - "Timed out waiting for X" is a symptom — read raw logs to find WHY X was slow or absent.
 
 **Deeper investigation** via raw artifacts:
+
 - **Sosreport pod logs**: read from `extracted_sosreport_dirs` when available, or run `bash plugins/microshift-ci/scripts/extract-sosreport.sh <tarball>` on paths in `sosreport_paths`.
 - **PCP graphs**: read PNGs listed in `pcp_graphs` when the failure involves timeouts, slowness, or resource exhaustion.
 - **Source code**: use `source_checkout.path` to read `test/suites/` or product code. Check `recent_commits` for related changes.
 
 **Critical rules**:
+
 - A test-layer fix is never the bottom when a product component misbehaved — reconstruct the component's story from journal and pod logs before concluding.
 - Two `Created container` events for the same pod = the first instance died. Read `previous.log` for the exit reason.
 - Multiple scenario failures: decide cascade vs independent using the **timeline**, not error-text similarity.
@@ -46,6 +49,7 @@ Iterate hypothesis → evidence until the cause is actionable.
 ### 4. Validate causal chain
 
 Before producing the report, validate every causal-chain link:
+
 - Every link MUST have an `evidence` field containing an artifact file path with `:line` (e.g., `artifacts/.../boot_and_run.log:4629`).
 - Every link MUST have a `quote` field with verbatim text from that file.
 - If any link cites the evidence JSON, general knowledge, or architectural statements instead of an artifact file — fix it now by finding the actual artifact file, or drop the link.
@@ -54,8 +58,6 @@ Before producing the report, validate every causal-chain link:
 
 Write the report per `plugins/microshift-ci/agents/references/structured-summary.md`. Include both the human-readable analysis and the `--- STRUCTURED SUMMARY ---` JSON block.
 
-When you read a raw artifact and find evidence NOT in the evidence pack, include `missing_patterns` entries: `{"file_type": "journal|boot_and_run|build_log", "grep_pattern": "<regex>", "reason": "<why>"}`.
-
 ### 6. Save and reply
 
 Save the FULL report output (including the `--- STRUCTURED SUMMARY ---` block) to `{OUTPUT_FILE}` using the Write tool. The file must contain the complete analysis report.
diff --git a/plugins/microshift-ci/agents/references/structured-summary.md b/plugins/microshift-ci/agents/references/structured-summary.md
index 645a6c8e..856c9d8a 100644
--- a/plugins/microshift-ci/agents/references/structured-summary.md
+++ b/plugins/microshift-ci/agents/references/structured-summary.md
@@ -52,7 +52,6 @@ Append after all prose. **Both markers are required** — the parser skips the r
 | `confidence` | `high` / `medium` / `low` (see rules below) |
 | `analysis_gaps` | Array of strings naming missing evidence. Empty `[]` when nothing skipped |
 | `scenarios` | Array of scenario names where this failure occurred. Empty `[]` for non-scenario jobs |
-| `missing_patterns` | (optional) Array of `{"file_type", "grep_pattern", "reason"}` for patterns to add to `extract-evidence.py` |
 
 ### CONFIDENCE rules
 
diff --git a/plugins/microshift-ci/skills/doctor/SKILL.md b/plugins/microshift-ci/skills/doctor/SKILL.md
index c3e07e69..23022993 100644
--- a/plugins/microshift-ci/skills/doctor/SKILL.md
+++ b/plugins/microshift-ci/skills/doctor/SKILL.md
@@ -123,7 +123,7 @@ Compute once at the start by running `date +%y%m%d` and substituting into the pa
    | Placeholder | Value |
    |---|---|
    | `{EVIDENCE_PACK}` | `<WORKDIR>/evidence/evidence-<BUILD_ID>.json` |
-   | `{JOB_NAME}` | `job` field (for PR jobs, append ` (PR #<PR>)`) |
+   | `{JOB_NAME}` | `job` field (for PR jobs, append a space and `(PR #<PR>)`) |
    | `{JOB_URL}` | `url` field |
    | `{OUTPUT_FILE}` | Release: `<WORKDIR>/jobs/release-<RELEASE>-job-<N>-<JOB_ID>.txt`. PR: `<WORKDIR>/jobs/prs-job-<N>-pr<PR>-<JOB_NAME_SUFFIX>.txt` |
 
@@ -142,21 +142,29 @@ Compute once at the start by running `date +%y%m%d` and substituting into the pa
    ```text
    Agent: subagent_type=general_purpose, prompt="Fix citation errors in a CI analysis report.
 
-   The report at <FAILED_FILE> has causal-chain links that cite the evidence JSON,
-   general knowledge, or lack file paths. The specific errors are:
+   The report at <FAILED_FILE> has causal-chain links whose citations failed
+   verification against the actual artifact files. The specific errors are:
    <PASTE ERRORS FOR THIS FILE FROM VALIDATION OUTPUT>
 
-   Fix the report:
+   Fix the report by RE-GROUNDING each flagged link in the real artifacts:
    1. Read the report at <FAILED_FILE>
-   2. For each flagged causal-chain link, find the actual artifact file and line number.
-      The artifacts are under <ARTIFACTS_DIR>. Use Grep to locate the quoted text in the
-      artifact files. The evidence pack at <WORKDIR>/evidence/evidence-<BUILD_ID>.json
-      has file and line fields for each extracted alert — use those to map back to real
-      artifact paths.
-   3. If no artifact supports a causal-chain link, remove that link entirely.
+   2. For each flagged link:
+      - 'found at line N' → re-read that line in the cited file; if it supports
+        the cause, update the citation to that line.
+      - 'cited file not found' → Grep the quoted text under <ARTIFACTS_DIR> and
+        cite the file:line where it actually appears. The evidence pack at
+        <WORKDIR>/evidence/evidence-<BUILD_ID>.json has file and line fields for
+        each extracted alert.
+      - 'quote not found' → re-read the cited file around the cited line and
+        replace the quote with the verbatim text that supports the cause.
+   3. NEVER delete a link merely to pass validation. Only if a real search finds
+      no supporting artifact: remove the link, add the unverified claim to
+      analysis_gaps (e.g. "unverified: <cause>"), and downgrade confidence.
    4. Rewrite the corrected report (BOTH the human-readable Causal Chain section AND the
       STRUCTURED SUMMARY JSON causal_chain array) back to <FAILED_FILE>.
-   5. Reply with EXACTLY: FIXED <FAILED_FILE>"
+   5. Verify your fix: python3 plugins/microshift-ci/scripts/validate-reports.py <FAILED_FILE>
+      must print OK. Iterate until it does.
+   6. Reply with EXACTLY: FIXED <FAILED_FILE>"
    ```
 
    Launch all fix agents in a single message (parallel). Then proceed to Step 3.
diff --git a/plugins/shared/scripts/extract-evidence.py b/plugins/shared/scripts/extract-evidence.py
index ae6e47d9..4b8b0406 100644
--- a/plugins/shared/scripts/extract-evidence.py
+++ b/plugins/shared/scripts/extract-evidence.py
@@ -431,7 +431,7 @@ def _extract_rf_failures(path):
     results = []
     for h in hits:
         line_num, text = _parse_grep_line(h)
-        results.append({"line": line_num, "text": text[:300]})
+        results.append({"file": path, "line": line_num, "text": text[:300]})
     return results
 
 
@@ -443,7 +443,7 @@ def _extract_boot_and_run_alerts(path):
             line_num, text = _parse_grep_line(h)
             if not text or re.match(r"^\d+:\s*#", text):
                 continue
-            alerts.append({"pattern": label, "line": line_num, "text": text[:300]})
+            alerts.append({"pattern": label, "file": path, "line": line_num, "text": text[:300]})
     return alerts
 
 
@@ -463,6 +463,9 @@ def _extract_journal_alerts(journal_path):
         for h in hits:
             line_num, text = _parse_grep_line(h)
             entries.append({
+                # Alerts from all of a scenario's journals are merged, so
+                # each entry must carry its own file for unambiguous citation.
+                "file": journal_path,
                 "line": line_num,
                 "text": text[:300],
                 "timestamp": _parse_journal_timestamp(text),
diff --git a/plugins/shared/scripts/validate-reports.py b/plugins/shared/scripts/validate-reports.py
index e7147a48..08fadab8 100644
--- a/plugins/shared/scripts/validate-reports.py
+++ b/plugins/shared/scripts/validate-reports.py
@@ -1,45 +1,173 @@
 #!/usr/bin/env python3
 """Validate causal-chain citations in per-job analysis reports.
 
-Checks that every causal-chain link in the STRUCTURED SUMMARY has:
-  - An 'evidence' field pointing to a real file (artifact or source path)
-  - A 'quote' field with non-empty text
+Verifies that every causal-chain link in the STRUCTURED SUMMARY cites a
+real file, an in-range line number, and a quote that actually appears at
+(or near) the cited location.  Citation paths are resolved against the
+workdir, the job's downloaded artifacts (derived from the entry's
+job_url build id), and source checkouts under <workdir>/src/.
 
 Prints validation errors per file.  Exits 0 if all files pass, 1 if any fail.
 
 Usage:
-    validate-reports.py <report1.txt> [<report2.txt> ...]
+    validate-reports.py [--workdir DIR] <report1.txt> [<report2.txt> ...]
     validate-reports.py <workdir>/jobs/release-*-job-*.txt
+
+When --workdir is omitted it is derived from each report's path
+(reports live in <workdir>/jobs/).  When no artifact roots can be found
+at all (e.g. the workdir was moved), falls back to format-only checks.
 """
 
+import glob as glob_mod
 import json
 import os
 import re
 import sys
 
+QUOTE_SEARCH_WINDOW = 5      # lines on EACH side of the cited line searched for the quote
+MIN_NEEDLE_LEN = 8           # normalized quote fragments shorter than this are not checked
+BINARY_EXTENSIONS = (".png", ".jpg", ".jpeg", ".svg", ".gif")
 
-def _is_valid_evidence(evidence):
-    """Check if evidence looks like a file path, not general knowledge."""
-    if not evidence or not evidence.strip():
-        return False
+# Timestamp shapes commonly copied from logs; stripped before comparing.
+_TIMESTAMP_RES = [
+    re.compile(r"\d{4}-\d{2}-\d{2}[T ]\d{2}:\d{2}:\d{2}(\.\d+)?(Z|[+-]\d{2}:?\d{2})?"),
+    re.compile(r"[A-Z][a-z]{2}\s+\d{1,2}\s+\d{2}:\d{2}:\d{2}"),  # journal: Jun 30 03:58:39
+    re.compile(r"\d{2}:\d{2}:\d{2}(\.\d+)?"),
+    re.compile(r"\[\s*\d+\.\d+\]"),                              # kernel: [ 1234.567890]
+]
+
+
+def _normalize(text):
+    """Strip timestamps, collapse whitespace, casefold — for lenient matching."""
+    for ts_re in _TIMESTAMP_RES:
+        text = ts_re.sub(" ", text)
+    return " ".join(text.split()).casefold()
+
+
+def _quote_needle(quote):
+    """Pick the longest verifiable fragment of a quote.
+
+    Agents sometimes elide long lines with '...' — verify the longest
+    fragment instead of the full quote.
+    """
+    fragments = re.split(r"\.{3}|…", quote)
+    fragments = [_normalize(f) for f in fragments]
+    fragments = [f for f in fragments if f]
+    if not fragments:
+        return ""
+    return max(fragments, key=len)
+
+
+def _split_line_ref(evidence):
+    """Split 'path/to/file.log:4629' → ('path/to/file.log', 4629)."""
+    m = re.match(r"^(.*?):(\d+)$", evidence.strip())
+    if m:
+        return m.group(1), int(m.group(2))
+    return evidence.strip(), None
+
+
+def _build_id_from_url(job_url):
+    """The Prow job URL's last path segment is the build id."""
+    return job_url.rstrip("/").rsplit("/", 1)[-1] if job_url else ""
+
+
+def _candidate_roots(workdir, build_id):
+    """Directories a citation path may be relative to, most specific first.
+
+    Returns (roots, verifiable).  verifiable is False when the workdir has
+    no downloaded artifacts at all (e.g. pruned or moved) — real
+    verification is impossible and callers fall back to format-only checks.
+    """
+    roots = []
+    if not workdir:
+        return roots, False
+    artifacts_root = os.path.join(workdir, "artifacts")
+    verifiable = os.path.isdir(artifacts_root)
+    build_dir = os.path.join(artifacts_root, build_id) if build_id else ""
+    if build_dir and os.path.isdir(build_dir):
+        roots.append(build_dir)
+    elif verifiable:
+        # Unknown or missing build id — try every downloaded build.
+        roots.extend(
+            d for d in sorted(glob_mod.glob(os.path.join(artifacts_root, "*")))
+            if os.path.isdir(d)
+        )
+    for src_dir in sorted(glob_mod.glob(os.path.join(workdir, "src", "*"))):
+        if os.path.isdir(src_dir):
+            roots.append(src_dir)
+    if os.path.isdir(workdir):
+        roots.append(workdir)
+    return roots, verifiable
+
+
+def _resolve_path(path, roots):
+    """Resolve a cited path against candidate roots.  Returns None if not found."""
+    if os.path.isabs(path):
+        return path if os.path.isfile(path) else None
+    rel = re.sub(r"^(?:\./+)+", "", path)  # drop only a literal leading "./" (lstrip("./") also ate ".github/", "../")
+    for root in roots:
+        candidate = os.path.join(root, rel)
+        if os.path.isfile(candidate):
+            return candidate
+    return None
+
+
+def _find_quote(filepath, line_num, needle):
+    """Check whether needle appears near line_num in filepath.
+
+    Returns (ok, error_message).  On mismatch the message includes where
+    the quote actually is (if anywhere), so fix agents can re-ground it.
+    """
+    try:
+        with open(filepath, errors="replace") as f:
+            lines = f.readlines()
+    except OSError as e:
+        return False, f"cannot read file: {e}"
+
+    total = len(lines)
+    if line_num is not None and line_num > total:
+        return False, f"line {line_num} out of range (file has {total} lines)"
+
+    if line_num is not None:
+        lo = max(0, line_num - 1 - QUOTE_SEARCH_WINDOW)
+        hi = min(total, line_num + QUOTE_SEARCH_WINDOW)
+        window = _normalize("".join(lines[lo:hi]))
+        if needle in window:
+            return True, ""
+
+    for i, line in enumerate(lines, 1):
+        if needle in _normalize(line):
+            if line_num is None:
+                return True, ""
+            return False, (
+                f"quote not found at cited line {line_num}"
+                f" (found at line {i})"
+            )
+
+    # Multi-line quotes may straddle line boundaries; try the whole file.
+    if needle in _normalize("".join(lines)):
+        if line_num is None:
+            return True, ""
+        return False, f"quote not found near cited line {line_num}"
+
+    return False, "quote not found anywhere in cited file"
+
+
+def _is_plausible_path(evidence):
+    """Legacy format-only heuristic: the sole check when artifacts are unavailable; otherwise picks the error for an unresolved path."""
     e = evidence.strip()
-    # Reject obvious non-file citations
+    if not e:
+        return False
     bad_prefixes = (
-        "architectural",
-        "design",
-        "general knowledge",
-        "by definition",
-        "well-known",
-        "documentation",
+        "architectural", "design", "general knowledge",
+        "by definition", "well-known", "documentation",
     )
     if e.lower().startswith(bad_prefixes):
         return False
-    # Must contain a path separator or look like a filename
     return "/" in e or ":" in e or "." in e
 
 
-def validate_file(filepath):
-    """Validate a single report file.  Returns list of error strings."""
+def _load_entries(filepath):
     with open(filepath, "r") as f:
         content = f.read()
 
@@ -64,59 +192,107 @@ def validate_file(filepath):
         entries = [entries]
     if not isinstance(entries, list):
         return []
+    return [e for e in entries if isinstance(e, dict)]
+
+
+def validate_file(filepath, workdir=None):
+    """Validate a single report file.  Returns list of error strings."""
+    entries = _load_entries(filepath)
+    if not entries:
+        return []
+
+    if workdir is None:
+        # Reports live in <workdir>/jobs/ — derive the workdir.
+        workdir = os.path.dirname(os.path.dirname(os.path.abspath(filepath)))
 
     errors = []
-    for ei, entry in enumerate(entries):
-        if not isinstance(entry, dict):
-            continue
+    for entry in entries:
         chain = entry.get("causal_chain") or []
         sig = entry.get("error_signature", "<unknown>")
+        build_id = _build_id_from_url(entry.get("job_url", ""))
+        roots, verifiable = _candidate_roots(workdir, build_id)
+
         for li, link in enumerate(chain):
             if not isinstance(link, dict):
                 continue
             cause = link.get("cause", "")
-            evidence = link.get("evidence", "").strip()
-            quote = link.get("quote", "").strip()
+            evidence = (link.get("evidence") or "").strip()
+            quote = (link.get("quote") or "").strip()
+
+            def err(msg):
+                errors.append(f"  [{sig}] chain link {li+1}: {msg} — cause: {cause[:80]}")
 
             if not evidence and not quote:
-                errors.append(
-                    f"  [{sig}] chain link {li+1}: missing both evidence and quote"
-                    f" — cause: {cause[:80]}"
-                )
-            elif not evidence:
-                errors.append(
-                    f"  [{sig}] chain link {li+1}: missing evidence field"
-                    f" — cause: {cause[:80]}"
-                )
-            elif not quote:
-                errors.append(
-                    f"  [{sig}] chain link {li+1}: missing quote field"
-                    f" — cause: {cause[:80]}"
-                )
-            elif not _is_valid_evidence(evidence):
-                errors.append(
-                    f"  [{sig}] chain link {li+1}: evidence is not a file path:"
-                    f" '{evidence[:80]}' — cause: {cause[:80]}"
-                )
+                err("missing both evidence and quote")
+                continue
+            if not evidence:
+                err("missing evidence field")
+                continue
+            if not quote:
+                err("missing quote field")
+                continue
+
+            path, line_num = _split_line_ref(evidence)
+            resolved = _resolve_path(path, roots)
+
+            if not verifiable and resolved is None:
+                # Workdir moved or artifacts pruned — format check only.
+                if not _is_plausible_path(evidence):
+                    err(f"evidence is not a file path: '{evidence[:80]}'")
+                continue
+
+            if resolved is None:
+                if not _is_plausible_path(evidence):
+                    err(f"evidence is not a file path: '{evidence[:80]}'")
+                else:
+                    err(f"cited file not found: '{path[:120]}'")
+                continue
+
+            if resolved.lower().endswith(BINARY_EXTENSIONS):
+                continue  # graphs etc. — existence is all we can check
+
+            needle = _quote_needle(quote)
+            if len(needle) < MIN_NEEDLE_LEN:
+                continue  # too short to verify meaningfully
+
+            ok, msg = _find_quote(resolved, line_num, needle)
+            if not ok:
+                err(f"{msg} ('{os.path.basename(resolved)}')")
 
     return errors
 
 
 def main():
-    if len(sys.argv) < 2:
-        print("Usage: validate-reports.py <report.txt> [...]", file=sys.stderr)
+    args = sys.argv[1:]
+    workdir = None
+    files = []
+    i = 0
+    while i < len(args):
+        if args[i] == "--workdir":
+            if i + 1 >= len(args):
+                print("Error: --workdir requires an argument", file=sys.stderr)
+                sys.exit(2)
+            workdir = args[i + 1]
+            i += 2
+        else:
+            files.append(args[i])
+            i += 1
+
+    if not files:
+        print("Usage: validate-reports.py [--workdir DIR] <report.txt> [...]", file=sys.stderr)
         sys.exit(2)
 
-    files = sys.argv[1:]
     total_errors = 0
     failed_files = []
+    checked = 0
 
     for filepath in files:
         if not os.path.isfile(filepath):
             continue
-        errors = validate_file(filepath)
+        checked += 1
+        errors = validate_file(filepath, workdir)
+        name = os.path.basename(filepath)
         if errors:
-            name = os.path.basename(filepath)
             print(f"FAIL {name}:")
             for e in errors:
                 print(e)
@@ -124,10 +300,9 @@ def main():
             total_errors += len(errors)
             failed_files.append((filepath, errors))
         else:
-            name = os.path.basename(filepath)
             print(f"OK   {name}")
 
-    print(f"\n{len(files)} files checked, {len(failed_files)} failed, {total_errors} errors")
+    print(f"\n{checked} files checked, {len(failed_files)} failed, {total_errors} errors")
 
     if failed_files:
         # Machine-readable output for the doctor workflow

From b5ce340dca21414a3da892f14126acb8ceb87e01 Mon Sep 17 00:00:00 2001
From: Patryk Matuszak <pmatusza@redhat.com>
Date: Fri, 3 Jul 2026 12:05:26 +0200
Subject: [PATCH 4/7] ci-doctor: group jobs by deterministic failure
 fingerprint
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Grouping and cross-release dedup previously keyed on LLM-authored text
(raw_error + root_cause) with a 0.5 token-similarity threshold —
demanding cross-run determinism a sampled model cannot give, while the
truly deterministic key (which step/tests/scenarios failed) already sat
in the evidence pack.

extract-evidence.py now computes a failure fingerprint from artifact
facts only (job type, failed step, failing test names, phase failures,
timeout cascade, greenboot verdict, infra indicator labels, first build
error — all normalized, no job names/builds/timestamps).

New doctor.sh plan/fanout phases (plan-analysis.py):
- plan groups all failed jobs (releases + PRs) by fingerprint, writes
  template verdicts for pure-infrastructure and no-failure groups (no
  agent at all), and renders one fully substituted agent prompt file
  per remaining group
- ONE agent analyzes each distinct failure instead of one per job —
  cross-release verdicts consistent by construction and fewer agents
  against the CI session's 45-minute budget
- fanout explodes each validated group report into the per-job report
  files aggregate.py/search-bugs.py/create-report.py already consume,
  patching job fields and injecting 'fingerprint' (+ entry ordinal so
  independent failures stay separate issues)

parse.py groups by fingerprint when present; token similarity remains
as fallback for legacy reports. The validator resolves citations
against all group members' build dirs. The analyze-evidence agent
template is now group-native; prow-job renders it as a group of one.

lvms-ci symlinks the new shared plan-analysis.py so its doctor flow
resolves it too.

Verified on a synthetic workdir: 5 jobs → 3 groups (1 agent), two
consecutive runs produce byte-identical grouping.
---
 plugins/lvms-ci/scripts/plan-analysis.py      |   1 +
 .../microshift-ci/agents/analyze-evidence.md  |  29 +-
 .../agents/references/structured-summary.md   |  15 +-
 .../microshift-ci/scripts/plan-analysis.py    |   1 +
 plugins/microshift-ci/skills/doctor/SKILL.md  |  70 ++-
 .../microshift-ci/skills/prow-job/SKILL.md    |   6 +-
 plugins/shared/scripts/doctor.sh              |  55 ++
 plugins/shared/scripts/extract-evidence.py    |  98 +++-
 plugins/shared/scripts/parse.py               |  35 +-
 plugins/shared/scripts/plan-analysis.py       | 501 ++++++++++++++++++
 plugins/shared/scripts/validate-reports.py    |   7 +-
 11 files changed, 755 insertions(+), 63 deletions(-)
 create mode 120000 plugins/lvms-ci/scripts/plan-analysis.py
 create mode 120000 plugins/microshift-ci/scripts/plan-analysis.py
 create mode 100644 plugins/shared/scripts/plan-analysis.py

diff --git a/plugins/lvms-ci/scripts/plan-analysis.py b/plugins/lvms-ci/scripts/plan-analysis.py
new file mode 120000
index 00000000..1e070d4a
--- /dev/null
+++ b/plugins/lvms-ci/scripts/plan-analysis.py
@@ -0,0 +1 @@
+../../shared/scripts/plan-analysis.py
\ No newline at end of file
diff --git a/plugins/microshift-ci/agents/analyze-evidence.md b/plugins/microshift-ci/agents/analyze-evidence.md
index 090d5ed3..c607b239 100644
--- a/plugins/microshift-ci/agents/analyze-evidence.md
+++ b/plugins/microshift-ci/agents/analyze-evidence.md
@@ -1,31 +1,34 @@
 # Analyze Evidence Agent
 
-Analyze a MicroShift Prow CI job from a pre-extracted evidence pack. Your goal is the UNDERLYING root cause, not the first error in the log. Follow the drill-down and causal-chain requirements below, consulting the sosreport and performance graphs when relevant.
+Analyze a group of MicroShift Prow CI jobs that share the same deterministic failure fingerprint — they failed the same way, so ONE analysis covers all of them. Your goal is the UNDERLYING root cause, not the first error in the log. Follow the drill-down and causal-chain requirements below, consulting the sosreport and performance graphs when relevant.
 
 ## Inputs
 
-- `{EVIDENCE_PACK}` — path to evidence pack JSON
-- `{JOB_NAME}` — full Prow job name
-- `{JOB_URL}` — full Prow job URL
-- `{OUTPUT_FILE}` — path to save the analysis report
+Jobs in this group (same failure fingerprint):
+
+{GROUP_JOBS}
+
+Output file for the analysis report: `{OUTPUT_FILE}`
 
 ## Instructions
 
-### 1. Read the evidence pack and references
+### 1. Read the evidence and references
+
+Read `plugins/microshift-ci/agents/references/microshift-ci-primer.md` and every evidence pack listed above. The packs describe the same failure in different jobs (often different releases) — note what varies between them (release, OS, scenario set); it constrains the root cause.
 
-Read `{EVIDENCE_PACK}` and `plugins/microshift-ci/agents/references/microshift-ci-primer.md`.
+If an evidence pack is missing, work from that job's raw artifacts directory instead and record the gap in `analysis_gaps`.
 
 ### 2. Assess the failure
 
-- `infrastructure_indicators.is_infra_failure` true → confirm from matched patterns and anchor error, produce report.
 - `scenario-e2e` → examine each scenario's alerts, failures, and journal. Use `failure_timeline` to distinguish cascade from independent failures.
 - `conformance` → examine `conformance_failures`.
 - `build`/`config`/`rebase` → examine `build_errors`.
-- No `failed_step` and no error indicators → job passed. Severity 1, `infrastructure_failure: false`. Do NOT drill down.
+- `infrastructure_indicators.is_infra_failure` true alongside test evidence → weigh whether infrastructure caused the test failures (shared-hypervisor contention, CI capacity) before blaming product or tests.
+- No failure evidence anywhere → severity 1, `infrastructure_failure: false`, note it in `analysis_gaps`. Do NOT drill down.
 
 ### 3. Drill down
 
-Iterate hypothesis → evidence until the cause is actionable.
+Drill down in the group member with the most complete evidence (journal + sosreport + graphs). Iterate hypothesis → evidence until the cause is actionable. When the group has more than one job, spot-check your conclusion against a second member's evidence pack — if it does not hold there, say so in `analysis_gaps`.
 
 **Mandatory raw-log verification** — BEFORE concluding, even when the evidence pack looks sufficient:
 
@@ -58,6 +61,12 @@ Before producing the report, validate every causal-chain link:
 
 Write the report per `plugins/microshift-ci/agents/references/structured-summary.md`. Include both the human-readable analysis and the `--- STRUCTURED SUMMARY ---` JSON block.
 
+Group handling in the structured summary:
+
+- Emit one entry per INDEPENDENT failure (not per job — the group shares the failures).
+- Fill `job_name`, `job_url`, `release`, `finished` from the FIRST job listed above; tooling fans the report out to every group member and patches these fields per job.
+- Do NOT emit a `fingerprint` field; tooling injects it.
+
 ### 6. Save and reply
 
 Save the FULL report output (including the `--- STRUCTURED SUMMARY ---` block) to `{OUTPUT_FILE}` using the Write tool. The file must contain the complete analysis report.
diff --git a/plugins/microshift-ci/agents/references/structured-summary.md b/plugins/microshift-ci/agents/references/structured-summary.md
index 856c9d8a..0dee490b 100644
--- a/plugins/microshift-ci/agents/references/structured-summary.md
+++ b/plugins/microshift-ci/agents/references/structured-summary.md
@@ -63,7 +63,7 @@ Do NOT inflate confidence — downstream automation acts on it.
 
 ### RAW_ERROR rules
 
-Used for deterministic grouping. Two runs on the same job MUST produce the same value.
+The verbatim anchor readers use to match the report against logs.
 
 1. **Copy-paste exact error text** — do NOT paraphrase
 2. **Pick ONE error** — the first fatal one
@@ -73,21 +73,20 @@ Used for deterministic grouping. Two runs on the same job MUST produce the same
 
 ### ROOT_CAUSE rules
 
-Used alongside RAW_ERROR for cross-release deduplication. Same underlying problem across releases MUST produce the same ROOT_CAUSE.
-
 | Field | Purpose |
 |---|---|
 | `error_signature` | WHAT failed (bug titles) |
-| `root_cause` | WHY it failed (dedup) |
-| `raw_error` | Verbatim log text (deterministic anchor) |
+| `root_cause` | WHY it failed (mechanism) |
+| `raw_error` | Verbatim log text (anchor) |
 
-1. **~80 chars max** — short enough for token matching
+1. **~80 chars max**
 2. **Focus on mechanism**, not symptom
-3. **Consistent across releases** — same problem = same text
-4. **Stable terms** — no version numbers, timestamps, or job names
+3. **Stable terms** — no version numbers, timestamps, or job names
 
 Describe the specific mechanism, not architectural generalizations ("framework expects annotation X which MicroShift does not set", not "MicroShift is single-node").
 
+Grouping and cross-release deduplication key on the deterministic failure `fingerprint` injected by tooling — do NOT emit a `fingerprint` field yourself.
+
 ### Multiple independent failures
 
 1. One entry per independent failure (different scenarios, different root causes)
diff --git a/plugins/microshift-ci/scripts/plan-analysis.py b/plugins/microshift-ci/scripts/plan-analysis.py
new file mode 120000
index 00000000..1e070d4a
--- /dev/null
+++ b/plugins/microshift-ci/scripts/plan-analysis.py
@@ -0,0 +1 @@
+../../shared/scripts/plan-analysis.py
\ No newline at end of file
diff --git a/plugins/microshift-ci/skills/doctor/SKILL.md b/plugins/microshift-ci/skills/doctor/SKILL.md
index 23022993..4003fa37 100644
--- a/plugins/microshift-ci/skills/doctor/SKILL.md
+++ b/plugins/microshift-ci/skills/doctor/SKILL.md
@@ -109,33 +109,45 @@ Compute once at the start by running `date +%y%m%d` and substituting into the pa
 
 3. If the script fails for some jobs, note the errors but continue — agents can fall back to raw artifacts.
 
-### Step 2: Analyze Each Job
+### Step 1d: Plan Analysis Groups
 
-**Goal**: Get detailed root cause analysis for each failed job using evidence packs and pre-downloaded artifacts.
+**Goal**: Deterministically group failed jobs by failure fingerprint so each distinct failure is analyzed exactly once.
 
 **Actions**:
 
-1. Use the JSON summary output from Step 1 to build agent prompts. Do NOT read the job JSON files into the main conversation — the prepare script already printed all job details (artifacts_dir, build_id, job name) and agents receive artifacts_dir directly in their prompt.
-2. Read `plugins/microshift-ci/agents/analyze-evidence.md` once. For **every** failed job across all releases and PRs, substitute the `{VARIABLE}` placeholders and launch a separate **Agent** (using the `Agent` tool). For PR jobs, only launch agents for jobs with FAILURE status.
+1. Run the plan script:
 
-   Substitute these placeholders from the prepare script's JSON output (`job`, `url`, `build_id` fields):
+   ```text
+   bash plugins/microshift-ci/scripts/doctor.sh plan --component microshift --workdir <WORKDIR>
+   ```
+
+2. The script deterministically:
+   - Groups all failed jobs (releases + PRs) by the failure fingerprint from their evidence packs
+   - Writes template reports directly for pure-infrastructure and no-failure groups — those need NO agent
+   - Renders a fully substituted agent prompt file per remaining group under `<WORKDIR>/prompts/`
+   - Writes `<WORKDIR>/analysis-plan.json` and prints a JSON summary whose `agent_groups` array lists each group's `prompt_file` and `report_file`
 
-   | Placeholder | Value |
-   |---|---|
-   | `{EVIDENCE_PACK}` | `<WORKDIR>/evidence/evidence-<BUILD_ID>.json` |
-   | `{JOB_NAME}` | `job` field (for PR jobs, append a space and `(PR #<PR>)`) |
-   | `{JOB_URL}` | `url` field |
-   | `{OUTPUT_FILE}` | Release: `<WORKDIR>/jobs/release-<RELEASE>-job-<N>-<JOB_ID>.txt`. PR: `<WORKDIR>/jobs/prs-job-<N>-pr<PR>-<JOB_NAME_SUFFIX>.txt` |
+### Step 2: Analyze Each Group
 
-3. Launch **ALL** agents (all releases + PRs) in a **single message** as **foreground** agents (do NOT use `run_in_background`). Foreground agents in the same message run concurrently — this is just as fast as background agents but keeps your turn active until all complete.
-4. Say "Analyzing N jobs in parallel..." in your message text alongside the Agent tool calls.
-5. When all agents return, **validate all output files**:
+**Goal**: Get detailed root cause analysis for each failure group using evidence packs and pre-downloaded artifacts.
+
+**Actions**:
+
+1. For **every** entry in the plan summary's `agent_groups`, launch a separate **Agent** (using the `Agent` tool) with exactly this prompt — the prompt files are fully pre-rendered, do NOT read or modify them yourself:
 
    ```text
-   python3 plugins/microshift-ci/scripts/validate-reports.py <WORKDIR>/jobs/release-*-job-*.txt <WORKDIR>/jobs/prs-job-*.txt
+   Read <PROMPT_FILE> and follow its instructions exactly.
    ```
 
-   If the script exits 0 (all pass), proceed to Step 3.
+2. Launch **ALL** group agents in a **single message** as **foreground** agents (do NOT use `run_in_background`). Foreground agents in the same message run concurrently — this is just as fast as background agents but keeps your turn active until all complete.
+3. Say "Analyzing N failure groups (M jobs) in parallel..." in your message text alongside the Agent tool calls. If `agent_groups` is empty, skip directly to step 5 (fan-out).
+4. When all agents return, **validate the group reports**:
+
+   ```text
+   python3 plugins/microshift-ci/scripts/validate-reports.py <WORKDIR>/jobs/analysis-group-*.txt
+   ```
+
+   If the script exits 0 (all pass), continue to step 5.
 
    If it exits 1, it prints a `--- VALIDATION FAILURES ---` block listing each failed file and its errors. For each failed file, launch a **fix agent**:
 
@@ -146,15 +158,16 @@ Compute once at the start by running `date +%y%m%d` and substituting into the pa
    verification against the actual artifact files. The specific errors are:
    <PASTE ERRORS FOR THIS FILE FROM VALIDATION OUTPUT>
 
-   Fix the report by RE-GROUNDING each flagged link in the real artifacts:
-   1. Read the report at <FAILED_FILE>
+   Fix the report by RE-GROUNDING each flagged link in the real artifacts.
+   The group's jobs, evidence packs, and artifacts directories are listed in
+   <PROMPT_FILE>.
+   1. Read the report at <FAILED_FILE> and the job list in <PROMPT_FILE>
    2. For each flagged link:
       - 'found at line N' → re-read that line in the cited file; if it supports
         the cause, update the citation to that line.
-      - 'cited file not found' → Grep the quoted text under <ARTIFACTS_DIR> and
-        cite the file:line where it actually appears. The evidence pack at
-        <WORKDIR>/evidence/evidence-<BUILD_ID>.json has file and line fields for
-        each extracted alert.
+      - 'cited file not found' → Grep the quoted text under the group's
+        artifacts directories and cite the file:line where it actually appears.
+        The evidence packs have file and line fields for each extracted alert.
       - 'quote not found' → re-read the cited file around the cited line and
         replace the quote with the verbatim text that supports the cause.
    3. NEVER delete a link merely to pass validation. Only if a real search finds
@@ -167,7 +180,13 @@ Compute once at the start by running `date +%y%m%d` and substituting into the pa
    6. Reply with EXACTLY: FIXED <FAILED_FILE>"
    ```
 
-   Launch all fix agents in a single message (parallel). Then proceed to Step 3.
+   Launch all fix agents in a single message (parallel). Then continue to step 5.
+
+5. **Fan out** the group reports into the per-job report files consumed by aggregation and bug correlation:
+
+   ```text
+   bash plugins/microshift-ci/scripts/doctor.sh fanout --component microshift --workdir <WORKDIR>
+   ```
 
 6. Proceed to Step 3. Do NOT stop or end your turn between Step 2 and Step 3.
 
@@ -267,7 +286,7 @@ HTML report generated: <WORKDIR>/report-microshift-ci-doctor.html
 
 ## Related Skills and Agents
 
-- **agents/analyze-evidence.md**: Evidence-aware job analysis agent (used by Step 2 — read, substitute, spawn)
+- **agents/analyze-evidence.md**: Evidence-aware group analysis agent template (rendered per group by the Step 1d plan script; spawned in Step 2)
 - **microshift-ci:prow-job**: Standalone job analysis from URL or artifacts directory (for manual use)
 - **microshift-ci:create-bugs**: Bug correlation and creation (used in Step 3; can also be run with `--create` after this command)
 - **microshift-ci:doctor-refresh**: Regenerate the HTML report from existing data (e.g., after `/microshift-ci:create-bugs --create`)
@@ -277,8 +296,9 @@ HTML report generated: <WORKDIR>/report-microshift-ci-doctor.html
 - **Deterministic scripts** handle: data collection, artifact download, evidence extraction, aggregation, HTML generation
 - **LLM agents** handle: per-job root cause analysis (Step 2), Jira bug search and open bugs query (Step 3)
 - Step 1c evidence extraction pre-processes all artifacts so Step 2 agents (from `plugins/microshift-ci/agents/analyze-evidence.md`) receive structured evidence packs and can skip exploratory log scanning
+- Step 1d groups jobs by deterministic failure fingerprint: one agent runs per distinct failure (not per job), pure-infrastructure and no-failure groups are resolved without any agent, and the per-job report files are produced by the deterministic fan-out at the end of Step 2
 - `/microshift-ci:doctor-refresh` regenerates the HTML report from existing data. Use it after `/microshift-ci:create-bugs --create` to include newly created bugs
-- Step 2 agents (per-job analysis) are launched in a single parallel wave
+- Step 2 agents (per-group analysis) are launched in a single parallel wave
 - Step 3 uses a single create-bugs agent with all sources (releases + rebase) comma-separated
 - The `prepare` script downloads all artifacts upfront so analysis agents use local paths (no redundant downloads)
 - The `prepare` script also clones the MicroShift source to `<WORKDIR>/src/microshift` with per-release worktrees (`--repo openshift/microshift`); clone failure is non-fatal — agents record the absence in `analysis_gaps` and proceed
diff --git a/plugins/microshift-ci/skills/prow-job/SKILL.md b/plugins/microshift-ci/skills/prow-job/SKILL.md
index e733f472..6c009bdd 100644
--- a/plugins/microshift-ci/skills/prow-job/SKILL.md
+++ b/plugins/microshift-ci/skills/prow-job/SKILL.md
@@ -47,13 +47,11 @@ The user argument is: `<ARGUMENTS>`
 
    Produces `<WORKDIR>/evidence/evidence-<BUILD_ID>.json`. The `<BUILD_ID>` is the last path component of `<TMP>`.
 
-4. **Analyze**: Read `plugins/microshift-ci/agents/analyze-evidence.md`. Substitute placeholders:
+4. **Analyze**: Read `plugins/microshift-ci/agents/analyze-evidence.md`. The template is group-oriented; render it for a single-job group by substituting:
 
    | Placeholder | Value |
    |---|---|
-   | `{EVIDENCE_PACK}` | `<WORKDIR>/evidence/evidence-<BUILD_ID>.json` |
-   | `{JOB_NAME}` | job name extracted from URL or directory path |
-   | `{JOB_URL}` | the original URL (or reconstruct from artifacts path) |
+   | `{GROUP_JOBS}` | `- <JOB_NAME> — <JOB_URL>`, then indented lines `evidence pack: <WORKDIR>/evidence/evidence-<BUILD_ID>.json` and `artifacts: <TMP>` |
    | `{OUTPUT_FILE}` | `<WORKDIR>/report-<BUILD_ID>.txt` |
 
    Spawn the agent with the substituted content. When it replies `DONE`, read the output file and present the report to the user.
diff --git a/plugins/shared/scripts/doctor.sh b/plugins/shared/scripts/doctor.sh
index a31f58fa..7ce5615c 100755
--- a/plugins/shared/scripts/doctor.sh
+++ b/plugins/shared/scripts/doctor.sh
@@ -515,6 +515,57 @@ cmd_evidence() {
     python3 "${SCRIPT_DIR}/extract-evidence.py" --batch --workdir "${WORKDIR}"
 }
 
+# ---------------------------------------------------------------------------
+# plan
+# ---------------------------------------------------------------------------
+
+cmd_plan() {
+    while [[ ${#} -gt 0 ]]; do
+        case "${1}" in
+            --workdir) WORKDIR="${2}"; shift 2 ;;
+            --component) COMPONENT="${2}"; shift 2 ;;
+            -*) echo "Unknown option: ${1}" >&2; return 1 ;;
+            *) echo "Unknown argument: ${1}" >&2; return 1 ;;
+        esac
+    done
+
+    [[ -z "${COMPONENT}" ]] && { echo "Error: --component is required" >&2; return 1; }
+
+    WORKDIR="${WORKDIR:-/tmp/${COMPONENT}-ci-claude-workdir.$(date +%y%m%d)}"
+
+    local template="${SCRIPT_DIR}/../agents/analyze-evidence.md"
+    if [[ ! -f "${template}" ]]; then
+        echo "Error: agent template not found: ${template}" >&2
+        return 1
+    fi
+
+    echo "=== Planning analysis groups ===" >&2
+    python3 "${SCRIPT_DIR}/plan-analysis.py" plan \
+        --workdir "${WORKDIR}" --template "${template}"
+}
+
+# ---------------------------------------------------------------------------
+# fanout
+# ---------------------------------------------------------------------------
+
+cmd_fanout() {
+    while [[ ${#} -gt 0 ]]; do
+        case "${1}" in
+            --workdir) WORKDIR="${2}"; shift 2 ;;
+            --component) COMPONENT="${2}"; shift 2 ;;
+            -*) echo "Unknown option: ${1}" >&2; return 1 ;;
+            *) echo "Unknown argument: ${1}" >&2; return 1 ;;
+        esac
+    done
+
+    [[ -z "${COMPONENT}" ]] && { echo "Error: --component is required" >&2; return 1; }
+
+    WORKDIR="${WORKDIR:-/tmp/${COMPONENT}-ci-claude-workdir.$(date +%y%m%d)}"
+
+    echo "=== Fanning out group reports to per-job files ===" >&2
+    python3 "${SCRIPT_DIR}/plan-analysis.py" fanout --workdir "${WORKDIR}"
+}
+
 # ---------------------------------------------------------------------------
 # main
 # ---------------------------------------------------------------------------
@@ -526,6 +577,8 @@ usage() {
     echo "  prepare  --component C [--workdir DIR] <releases> [--rebase] [--repo ORG/NAME]  Collect jobs, download artifacts, optional source checkout" >&2
     echo "  graphs   --component C [--workdir DIR] [--timezone TZ]       Generate PCP performance graphs" >&2
     echo "  evidence --component C [--workdir DIR]                        Extract structured evidence from artifacts" >&2
+    echo "  plan     --component C [--workdir DIR]                        Group jobs by fingerprint, render agent prompts" >&2
+    echo "  fanout   --component C [--workdir DIR]                        Explode group reports into per-job report files" >&2
     echo "  finalize --component C [--workdir DIR] <releases>             Aggregate results and generate HTML" >&2
     echo "  refresh  --component C [--workdir DIR] [--ignore KEY1,KEY2,...] <releases>  Regenerate HTML from existing workdir data" >&2
     echo "" >&2
@@ -547,6 +600,8 @@ main() {
         prepare)  cmd_prepare "${@}" ;;
         graphs)   cmd_graphs "${@}" ;;
         evidence) cmd_evidence "${@}" ;;
+        plan)     cmd_plan "${@}" ;;
+        fanout)   cmd_fanout "${@}" ;;
         finalize) cmd_finalize "${@}" ;;
         refresh)  cmd_refresh "${@}" ;;
         *) echo "Unknown command: ${cmd}" >&2; usage ;;
diff --git a/plugins/shared/scripts/extract-evidence.py b/plugins/shared/scripts/extract-evidence.py
index 4b8b0406..9dd81d28 100644
--- a/plugins/shared/scripts/extract-evidence.py
+++ b/plugins/shared/scripts/extract-evidence.py
@@ -15,6 +15,7 @@
 """
 
 import glob as glob_mod
+import hashlib
 import json
 import os
 import re
@@ -24,9 +25,13 @@
 from datetime import datetime, timedelta, timezone
 from pathlib import Path
 
-
 SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
-EVIDENCE_VERSION = 1
+sys.path.insert(0, SCRIPT_DIR)
+
+from parse import normalize_step_name  # noqa: E402
+
+EVIDENCE_VERSION = 2
+FINGERPRINT_VERSION = 1
 
 
 # ---------------------------------------------------------------------------
@@ -382,8 +387,13 @@ def scan_infra_indicators(artifacts_dir):
         for label, pattern in INFRA_INDICATORS:
             hits = _grep_file(fpath, pattern, max_matches=3)
             if hits:
-                _, text = _parse_grep_line(hits[0])
-                matched.append({"label": label, "file": os.path.basename(fpath), "text": text[:200]})
+                line_num, text = _parse_grep_line(hits[0])
+                matched.append({
+                    "label": label,
+                    "file": os.path.relpath(fpath, artifacts_dir),
+                    "line": line_num,
+                    "text": text[:200],
+                })
 
     return {
         "is_infra_failure": len(matched) > 0,
@@ -865,6 +875,82 @@ def _build_failure_timeline(scenarios, meta):
     return timeline
 
 
+# ---------------------------------------------------------------------------
+# Phase K: Deterministic failure fingerprint
+# ---------------------------------------------------------------------------
+#
+# The fingerprint identifies WHAT failed from deterministic facts only, so
+# that identical failures — across releases, PRs, and repeated runs — group
+# together without relying on LLM-authored text.  It deliberately excludes
+# job names, build ids, releases, and timestamps.
+
+_FP_TIMESTAMP_RES = [
+    re.compile(r"\d{4}-\d{2}-\d{2}[T ]\d{2}:\d{2}:\d{2}(\.\d+)?(Z|[+-]\d{2}:?\d{2})?"),
+    re.compile(r"[A-Z][a-z]{2}\s+\d{1,2}\s+\d{2}:\d{2}:\d{2}"),
+    re.compile(r"\d{2}:\d{2}:\d{2}(\.\d+)?"),
+]
+
+
+def _normalize_fp_text(text):
+    """Normalize log/test text for fingerprinting: drop volatile tokens."""
+    for ts_re in _FP_TIMESTAMP_RES:
+        text = ts_re.sub("<TS>", text)
+    text = re.sub(r"\b[0-9a-f]{8,}\b", "<HEX>", text)   # hashes, pod suffixes
+    text = re.sub(r"\b\d+\.\d+\.\d+[.\d-]*\b", "<VER>", text)  # versions, IPs
+    text = re.sub(r"\b\d{3,}\b", "<N>", text)           # ids, durations, ports
+    return " ".join(text.split()).lower()
+
+
+def compute_fingerprint(job_type, failed_step, infra, scenarios,
+                        conformance_failures, build_errors):
+    failing_tests = sorted({
+        _normalize_fp_text(t["name"])
+        for s in scenarios for t in s.get("test_failures", [])
+    } | {
+        _normalize_fp_text(c["test_name"] or c["message"])
+        for c in conformance_failures
+    })
+    phase_failures = sorted({
+        _normalize_fp_text(name)
+        for s in scenarios for name in s.get("infra_phase", {}).get("failures", [])
+    })
+
+    has_test_evidence = bool(failing_tests or phase_failures)
+
+    # Journal problem categories are a weak signal — only used when nothing
+    # test-level failed (e.g. greenboot death before any test ran).
+    journal_categories = []
+    if not has_test_evidence:
+        journal_categories = sorted({
+            cat
+            for s in scenarios
+            for cat, entries in s.get("journal_alerts", {}).items()
+            if entries and cat in _FAILURE_JOURNAL_CATEGORIES
+        })
+
+    inputs = {
+        "job_type": job_type,
+        "failed_step": normalize_step_name(failed_step.get("name", "")),
+        "infra_labels": sorted({m["label"] for m in infra.get("matched_patterns", [])})
+        if infra.get("is_infra_failure") else [],
+        "failing_tests": failing_tests,
+        "phase_failures": phase_failures,
+        "timeout_cascade": any(s.get("timeout_cascade") for s in scenarios),
+        "greenboot_failure": any(
+            s.get("greenboot_status") == "FAILURE" for s in scenarios
+        ),
+        "journal_categories": journal_categories,
+        "build_error": _normalize_fp_text(build_errors[0]["text"])[:200]
+        if build_errors else "",
+    }
+
+    key = hashlib.sha256(
+        json.dumps(inputs, sort_keys=True).encode()
+    ).hexdigest()[:12]
+
+    return {"version": FINGERPRINT_VERSION, "key": key, "inputs": inputs}
+
+
 # ---------------------------------------------------------------------------
 # Main extraction
 # ---------------------------------------------------------------------------
@@ -895,6 +981,9 @@ def extract_evidence(artifacts_dir, workdir):
         build_errors = _extract_errors_from_main_log(artifacts_dir)
 
     failure_timeline = _build_failure_timeline(scenarios, meta)
+    fingerprint = compute_fingerprint(
+        job_type, failed_step, infra, scenarios, conformance_failures, build_errors
+    )
 
     source = extract_source_context(workdir, meta["release"], meta["finished_epoch"])
     pcp_graphs = find_pcp_graphs(workdir, meta["build_id"])
@@ -910,6 +999,7 @@ def extract_evidence(artifacts_dir, workdir):
         **{k: v for k, v in meta.items() if k != "finished_epoch"},
         "artifacts_dir": artifacts_dir,
         "job_type": job_type,
+        "fingerprint": fingerprint,
         "failed_step": failed_step,
         "infrastructure_indicators": infra,
         "scenarios": scenarios,
diff --git a/plugins/shared/scripts/parse.py b/plugins/shared/scripts/parse.py
index d4051729..7f1cdad1 100644
--- a/plugins/shared/scripts/parse.py
+++ b/plugins/shared/scripts/parse.py
@@ -84,6 +84,7 @@ def parse_structured_summary(filepath):
             "finished": data.get("finished", ""),
             "remediation": data.get("remediation", ""),
             "confidence": data.get("confidence", ""),
+            "fingerprint": data.get("fingerprint", ""),
             "causal_chain": [
                 link for link in (data.get("causal_chain") or [])
                 if isinstance(link, dict) and "cause" in link
@@ -174,19 +175,35 @@ def cluster_by_similarity(items, key_fn):
 
 
 def group_by_signature(jobs):
-    """Two-pass grouping: first by step_name, then by signature similarity.
-
-    Grouping by step_name first prevents jobs from different CI steps
-    (e.g. conformance vs metal-tests) from being merged together even
-    when their error signatures share enough tokens to exceed the
-    similarity threshold.
+    """Group failure entries, preferring the deterministic fingerprint.
+
+    Entries that carry a `fingerprint` (computed by extract-evidence.py
+    from deterministic artifact facts and injected at fan-out) group by
+    exact key — identical failures across releases, PRs, and runs land in
+    the same bucket by construction.
+
+    Entries without a fingerprint (legacy reports) fall back to two-pass
+    grouping: first by step_name, then by signature similarity.  Grouping
+    by step_name first prevents jobs from different CI steps (e.g.
+    conformance vs metal-tests) from being merged together even when
+    their error signatures share enough tokens to exceed the similarity
+    threshold.
     """
-    by_step = {}
+    by_fingerprint = {}
+    legacy = []
     for job in jobs:
+        fp = job.get("fingerprint", "")
+        if fp:
+            by_fingerprint.setdefault(fp, []).append(job)
+        else:
+            legacy.append(job)
+
+    all_groups = list(by_fingerprint.values())
+
+    by_step = {}
+    for job in legacy:
         step = normalize_step_name(job.get("step_name", ""))
         by_step.setdefault(step, []).append(job)
-
-    all_groups = []
     for step_jobs in by_step.values():
         all_groups.extend(cluster_by_similarity(step_jobs, grouping_text))
     return all_groups
diff --git a/plugins/shared/scripts/plan-analysis.py b/plugins/shared/scripts/plan-analysis.py
new file mode 100644
index 00000000..d9611aaa
--- /dev/null
+++ b/plugins/shared/scripts/plan-analysis.py
@@ -0,0 +1,501 @@
+#!/usr/bin/env python3
+"""
+Deterministic analysis planning for CI doctor workflows.
+
+Shared across components (MicroShift, LVMS, etc.) via symlinks in each
+plugin's scripts/ directory.
+
+plan mode
+    Groups failed jobs by the deterministic failure fingerprint from their
+    evidence packs.  Pure-infrastructure and no-failure groups get template
+    reports written directly (no LLM).  Every other group gets a fully
+    rendered agent prompt file.  Writes <workdir>/analysis-plan.json and
+    prints a compact JSON summary for the orchestrator.
+
+fanout mode
+    After agents have written (and validation has passed) the per-group
+    reports, explodes each group report into the per-job report files that
+    aggregate.py, search-bugs.py and create-report.py consume, patching
+    job-specific fields and injecting the fingerprint key.
+
+Usage:
+    plan-analysis.py plan   --workdir DIR --template FILE
+    plan-analysis.py fanout --workdir DIR
+"""
+
+import glob as glob_mod
+import json
+import os
+import re
+import sys
+
+
+# ---------------------------------------------------------------------------
+# Shared helpers
+# ---------------------------------------------------------------------------
+
+def _read_json(path):
+    try:
+        with open(path) as f:
+            return json.load(f)
+    except (OSError, json.JSONDecodeError):
+        return None
+
+
+def _normalize_release(value, job_name=""):
+    """Map 'release-4.22' / '4.22' / 'main' → '4.22' / 'main'; else ''."""
+    for candidate in (value or "", ""):
+        m = re.match(r"^(?:release-)?(\d+\.\d+|main)$", candidate)
+        if m:
+            return m.group(1)
+    m = re.search(r"release-(\d+\.\d+)", job_name)
+    if m:
+        return m.group(1)
+    if "main" in job_name:
+        return "main"
+    return ""
+
+
+def _load_jobs(workdir):
+    """Load all failed jobs from the prepare phase's JSON files.
+
+    Returns a list of job dicts with source bookkeeping attached:
+    release (source release or '' for PRs), pr_number (or None), and
+    index (1-based ordinal within its source, used for fan-out filenames).
+    """
+    jobs_dir = os.path.join(workdir, "jobs")
+    jobs = []
+
+    for jf in sorted(glob_mod.glob(os.path.join(jobs_dir, "release-*-jobs.json"))):
+        m = re.match(r"release-(.+)-jobs\.json$", os.path.basename(jf))
+        release = m.group(1) if m else ""
+        entries = _read_json(jf) or []
+        for i, entry in enumerate(entries, 1):
+            if not isinstance(entry, dict) or not entry.get("build_id"):
+                continue
+            jobs.append({
+                "job_name": entry.get("job", ""),
+                "job_url": entry.get("url", ""),
+                "build_id": entry["build_id"],
+                "artifacts_dir": entry.get("artifacts_dir", ""),
+                "release": release,
+                "pr_number": None,
+                "index": i,
+            })
+
+    pr_entries = _read_json(os.path.join(jobs_dir, "prs-jobs.json")) or []
+    pr_index = 0
+    for entry in pr_entries:
+        if not isinstance(entry, dict) or not entry.get("build_id"):
+            continue
+        if entry.get("status", "").upper() != "FAILURE":
+            continue
+        pr_index += 1
+        jobs.append({
+            "job_name": entry.get("job", ""),
+            "job_url": entry.get("url", ""),
+            "build_id": entry["build_id"],
+            "artifacts_dir": entry.get("artifacts_dir", ""),
+            "release": "",
+            "pr_number": entry.get("pr_number"),
+            "index": pr_index,
+        })
+
+    return jobs
+
+
+def _evidence_path(workdir, build_id):
+    return os.path.join(workdir, "evidence", f"evidence-{build_id}.json")
+
+
+# ---------------------------------------------------------------------------
+# plan: grouping
+# ---------------------------------------------------------------------------
+
+def _classify_group(evidence):
+    """Decide whether a group needs an agent.
+
+    Returns 'infra', 'no-failure', or 'analysis'.  Deterministic verdicts
+    are only issued when there is NO test-level evidence at all — any
+    scenario/test/journal signal sends the group to an agent.
+    """
+    if evidence is None:
+        return "analysis"
+
+    fp = evidence.get("fingerprint", {}).get("inputs", {})
+    has_test_evidence = (
+        fp.get("failing_tests")
+        or fp.get("phase_failures")
+        or fp.get("journal_categories")
+        or fp.get("timeout_cascade")
+        or fp.get("greenboot_failure")
+    )
+    if has_test_evidence:
+        return "analysis"
+
+    if evidence.get("infrastructure_indicators", {}).get("is_infra_failure"):
+        return "infra"
+
+    if not evidence.get("failed_step", {}).get("name") and not fp.get("build_error"):
+        return "no-failure"
+
+    return "analysis"
+
+
+def _group_jobs(workdir, jobs):
+    """Group jobs by fingerprint key.  Jobs without evidence get singleton
+    groups so the agent can fall back to raw artifacts."""
+    groups = {}
+    for job in jobs:
+        ep = _evidence_path(workdir, job["build_id"])
+        evidence = _read_json(ep)
+        job["evidence_pack"] = ep
+        job["finished"] = (evidence or {}).get("finished", "")[:10]
+        if evidence and evidence.get("fingerprint", {}).get("key"):
+            key = evidence["fingerprint"]["key"]
+        else:
+            key = f"noevidence-{job['build_id']}"
+        group = groups.setdefault(key, {"key": key, "jobs": [], "evidence": None})
+        group["jobs"].append(job)
+        if group["evidence"] is None and evidence is not None:
+            group["evidence"] = evidence
+    return sorted(groups.values(), key=lambda g: (-len(g["jobs"]), g["key"]))
+
+
+# ---------------------------------------------------------------------------
+# plan: deterministic reports
+# ---------------------------------------------------------------------------
+
+def _job_lines(jobs):
+    lines = []
+    for job in jobs:
+        label = job["job_name"]
+        if job.get("pr_number"):
+            label += f" (PR #{job['pr_number']})"
+        lines.append(f"- {label} — {job['job_url']}")
+        lines.append(f"  evidence pack: {job['evidence_pack']}")
+        lines.append(f"  artifacts: {job['artifacts_dir']}")
+    return "\n".join(lines)
+
+
+def _deterministic_entry(reason, group):
+    """Build the structured-summary entry for a no-LLM group verdict."""
+    rep = group["jobs"][0]
+    evidence = group["evidence"] or {}
+    failed_step = evidence.get("failed_step", {})
+    matched = evidence.get("infrastructure_indicators", {}).get("matched_patterns", [])
+
+    entry = {
+        "severity": 1,
+        "step_name": failed_step.get("name", ""),
+        "job_url": rep["job_url"],
+        "job_name": rep["job_name"],
+        "release": rep["release"] or _normalize_release(
+            evidence.get("release", ""), rep["job_name"]),
+        "finished": rep["finished"],
+        "fingerprint": f"{group['key']}#1",
+        "confidence": "high",
+        "analysis_gaps": [],
+        "scenarios": [],
+        "causal_chain": [],
+    }
+
+    if reason == "infra":
+        labels = sorted({m["label"] for m in matched})
+        first = matched[0] if matched else {}
+        entry.update({
+            "stack_layer": "AWS Infra" if any(l.startswith("aws_") for l in labels)
+            else "External Infrastructure",
+            "error_signature": "CI infrastructure failure: " + ", ".join(labels),
+            "root_cause": "CI/cloud infrastructure failure before tests ran: " + ", ".join(labels),
+            "raw_error": first.get("text", "")[:150],
+            "infrastructure_failure": True,
+            "remediation": "No product action; re-run the job and monitor CI infrastructure",
+            "causal_chain": [
+                {
+                    "cause": f"infrastructure indicator '{m['label']}' matched",
+                    "evidence": f"{m['file']}:{m.get('line', 1)}",
+                    "quote": m.get("text", ""),
+                }
+                for m in matched[:3]
+            ],
+        })
+    else:  # no-failure
+        entry.update({
+            "stack_layer": "test",
+            "error_signature": "no failure evidence found in artifacts",
+            "root_cause": "no failure indicators in artifacts; job may have self-healed",
+            "raw_error": "",
+            "infrastructure_failure": False,
+            "remediation": "No action; verify job status in Prow if it reported failure",
+            "confidence": "low",
+            "analysis_gaps": [
+                "no failed step, infra indicator, test failure, or build error was extracted"
+            ],
+        })
+
+    return entry
+
+
+def _write_deterministic_report(reason, group, report_file):
+    entry = _deterministic_entry(reason, group)
+    chain_text = "\n".join(
+        f"  {i}. {link['cause']} — {link['evidence']}: \"{link['quote']}\""
+        for i, link in enumerate(entry["causal_chain"], 1)
+    ) or "  (none)"
+
+    body = f"""Deterministic CI Doctor verdict (no LLM analysis needed)
+Fingerprint group: {group['key']} ({len(group['jobs'])} job(s))
+
+Jobs:
+{_job_lines(group['jobs'])}
+
+Error Severity: {entry['severity']}
+Stack Layer: {entry['stack_layer']}
+Step Name: {entry['step_name']}
+Error: {entry['raw_error'] or entry['error_signature']}
+Causal Chain:
+{chain_text}
+Confidence: {entry['confidence']}
+Suggested Remediation: {entry['remediation']}
+
+--- STRUCTURED SUMMARY ---
+{json.dumps([entry], indent=2)}
+--- END STRUCTURED SUMMARY ---
+"""
+    with open(report_file, "w") as f:
+        f.write(body)
+
+
+# ---------------------------------------------------------------------------
+# plan: prompt rendering
+# ---------------------------------------------------------------------------
+
+def _render_prompt(template_text, group, report_file):
+    text = template_text
+    text = text.replace("{GROUP_JOBS}", _job_lines(group["jobs"]))
+    text = text.replace("{OUTPUT_FILE}", report_file)
+    return text
+
+
+def cmd_plan(workdir, template):
+    template_text = None
+    if template:
+        try:
+            with open(template) as f:
+                template_text = f.read()
+        except OSError as e:
+            print(f"Error: cannot read template: {e}", file=sys.stderr)
+            return 1
+    if not template_text:
+        print("Error: --template is required for plan", file=sys.stderr)
+        return 1
+
+    jobs = _load_jobs(workdir)
+    if not jobs:
+        print("No failed jobs found — nothing to plan", file=sys.stderr)
+        print(json.dumps({"workdir": workdir, "total_jobs": 0, "groups": []}))
+        return 0
+
+    groups = _group_jobs(workdir, jobs)
+    prompts_dir = os.path.join(workdir, "prompts")
+    jobs_dir = os.path.join(workdir, "jobs")
+    os.makedirs(prompts_dir, exist_ok=True)
+    os.makedirs(jobs_dir, exist_ok=True)
+
+    plan = []
+    for group in groups:
+        report_file = os.path.join(jobs_dir, f"analysis-group-{group['key']}.txt")
+        reason = _classify_group(group["evidence"])
+        entry = {
+            "key": group["key"],
+            "reason": reason,
+            "deterministic": reason != "analysis",
+            "report_file": report_file,
+            "jobs": [
+                {k: v for k, v in job.items()}
+                for job in group["jobs"]
+            ],
+        }
+        if reason == "analysis":
+            prompt_file = os.path.join(prompts_dir, f"group-{group['key']}.md")
+            with open(prompt_file, "w") as f:
+                f.write(_render_prompt(template_text, group, report_file))
+            entry["prompt_file"] = prompt_file
+        else:
+            _write_deterministic_report(reason, group, report_file)
+        plan.append(entry)
+
+    plan_path = os.path.join(workdir, "analysis-plan.json")
+    with open(plan_path, "w") as f:
+        json.dump(plan, f, indent=2)
+
+    agent_groups = [p for p in plan if not p["deterministic"]]
+    summary = {
+        "workdir": workdir,
+        "plan_file": plan_path,
+        "total_jobs": len(jobs),
+        "total_groups": len(plan),
+        "deterministic_groups": len(plan) - len(agent_groups),
+        "agent_groups": [
+            {
+                "key": p["key"],
+                "jobs": len(p["jobs"]),
+                "prompt_file": p["prompt_file"],
+                "report_file": p["report_file"],
+            }
+            for p in agent_groups
+        ],
+    }
+    print(
+        f"Planned {len(plan)} groups for {len(jobs)} jobs: "
+        f"{len(agent_groups)} need agents, "
+        f"{len(plan) - len(agent_groups)} resolved deterministically",
+        file=sys.stderr,
+    )
+    print(json.dumps(summary, indent=2))
+    return 0
+
+
+# ---------------------------------------------------------------------------
+# fanout
+# ---------------------------------------------------------------------------
+
+_SUMMARY_RE = re.compile(
+    r"--- STRUCTURED SUMMARY ---\n(.+?)(?:\n--- END STRUCTURED SUMMARY ---|\Z)",
+    re.DOTALL,
+)
+
+
+def _split_report(content):
+    """Split a report into (prose, entries).  entries is None if unparseable."""
+    m = _SUMMARY_RE.search(content)
+    if not m:
+        return content, None
+    prose = content[:m.start()].rstrip("\n")
+    json_text = m.group(1).replace("\t", "\\t").replace("\r", "\\r")
+    json_text = re.sub(r"[\x00-\x09\x0b\x0c\x0e-\x1f\x7f]", "", json_text)
+    try:
+        entries = json.loads(json_text)
+    except json.JSONDecodeError:
+        return prose, None
+    if isinstance(entries, dict):
+        entries = [entries]
+    if not isinstance(entries, list):
+        return prose, None
+    return prose, entries
+
+
+def _member_filename(job):
+    if job.get("pr_number"):
+        return f"prs-job-{job['index']}-pr{job['pr_number']}-{job['build_id']}.txt"
+    return f"release-{job['release']}-job-{job['index']}-{job['build_id']}.txt"
+
+
+def _patch_entries(entries, job, key, workdir):
+    evidence = _read_json(_evidence_path(workdir, job["build_id"])) or {}
+    release = job["release"] or _normalize_release(
+        evidence.get("release", ""), job["job_name"])
+    patched = []
+    for i, entry in enumerate(entries, 1):
+        if not isinstance(entry, dict):
+            continue
+        e = dict(entry)
+        e["job_name"] = job["job_name"]
+        e["job_url"] = job["job_url"]
+        e["release"] = release
+        e["finished"] = job.get("finished", "") or e.get("finished", "")
+        # Independent failures within one group must stay separate issues
+        # downstream, so the grouping key is fingerprint + entry ordinal
+        # (the entries array is identical across the group's members).
+        e["fingerprint"] = f"{key}#{i}"
+        patched.append(e)
+    return patched
+
+
+def cmd_fanout(workdir):
+    plan = _read_json(os.path.join(workdir, "analysis-plan.json"))
+    if not isinstance(plan, list):
+        print(f"Error: no analysis-plan.json in {workdir} — run plan first", file=sys.stderr)
+        return 1
+
+    jobs_dir = os.path.join(workdir, "jobs")
+    written = 0
+    missing = []
+    unparseable = []
+
+    for group in plan:
+        report_file = group.get("report_file", "")
+        if not os.path.isfile(report_file):
+            missing.append(group["key"])
+            continue
+        with open(report_file, errors="replace") as f:
+            content = f.read()
+        prose, entries = _split_report(content)
+        if entries is None:
+            unparseable.append(group["key"])
+            continue
+
+        note = (
+            f"NOTE: analyzed as fingerprint group {group['key']} "
+            f"covering {len(group['jobs'])} job(s); shared analysis fanned out per job.\n\n"
+        )
+        for job in group["jobs"]:
+            patched = _patch_entries(entries, job, group["key"], workdir)
+            target = os.path.join(jobs_dir, _member_filename(job))
+            with open(target, "w") as f:
+                f.write(note + prose + "\n\n--- STRUCTURED SUMMARY ---\n")
+                json.dump(patched, f, indent=2)
+                f.write("\n--- END STRUCTURED SUMMARY ---\n")
+            written += 1
+
+    result = {
+        "written": written,
+        "missing_reports": missing,
+        "unparseable_reports": unparseable,
+    }
+    print(
+        f"Fanned out {written} per-job reports"
+        + (f"; MISSING group reports: {', '.join(missing)}" if missing else "")
+        + (f"; UNPARSEABLE: {', '.join(unparseable)}" if unparseable else ""),
+        file=sys.stderr,
+    )
+    print(json.dumps(result, indent=2))
+    return 0
+
+
+# ---------------------------------------------------------------------------
+# CLI
+# ---------------------------------------------------------------------------
+
+def main():
+    if len(sys.argv) < 2 or sys.argv[1] not in ("plan", "fanout"):
+        print(__doc__, file=sys.stderr)
+        sys.exit(2)
+
+    mode = sys.argv[1]
+    workdir = None
+    template = None
+    args = sys.argv[2:]
+    i = 0
+    while i < len(args):
+        if args[i] == "--workdir":
+            workdir = args[i + 1]; i += 2
+        elif args[i] == "--template":
+            template = args[i + 1]; i += 2
+        else:
+            print(f"Unknown argument: {args[i]}", file=sys.stderr)
+            sys.exit(2)
+
+    if not workdir or not os.path.isdir(workdir):
+        print(f"Error: --workdir missing or not a directory: {workdir}", file=sys.stderr)
+        sys.exit(2)
+    workdir = os.path.abspath(workdir)
+
+    if mode == "plan":
+        sys.exit(cmd_plan(workdir, template))
+    sys.exit(cmd_fanout(workdir))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/plugins/shared/scripts/validate-reports.py b/plugins/shared/scripts/validate-reports.py
index 08fadab8..db5dd18c 100644
--- a/plugins/shared/scripts/validate-reports.py
+++ b/plugins/shared/scripts/validate-reports.py
@@ -86,11 +86,12 @@ def _candidate_roots(workdir, build_id):
     build_dir = os.path.join(artifacts_root, build_id) if build_id else ""
     if build_dir and os.path.isdir(build_dir):
         roots.append(build_dir)
-    elif verifiable:
-        # Unknown or missing build id — try every downloaded build.
+    if verifiable:
+        # Group reports may cite artifacts of any member job, so all
+        # downloaded builds are candidate roots (own build dir first).
         roots.extend(
             d for d in sorted(glob_mod.glob(os.path.join(artifacts_root, "*")))
-            if os.path.isdir(d)
+            if os.path.isdir(d) and d != build_dir
         )
     for src_dir in sorted(glob_mod.glob(os.path.join(workdir, "src", "*"))):
         if os.path.isdir(src_dir):

From 9661d29458a1846dcaa0ed41ea6537cb72bad39c Mon Sep 17 00:00:00 2001
From: Patryk Matuszak <pmatusza@redhat.com>
Date: Fri, 3 Jul 2026 12:00:24 +0200
Subject: [PATCH 5/7] ci-doctor: make fanout fail loudly with retry_groups
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

fanout used to exit 0 even when group reports were missing or
unparseable, merely listing them in its JSON — easy for the
orchestrating session to ignore, silently dropping every job in those
groups. Now it exits 3 and emits a retry_groups array with each group's
prompt_file (null for deterministic groups) so the orchestrator can
re-launch the failed analysis agents directly.
---
 plugins/shared/scripts/plan-analysis.py | 24 +++++++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)

diff --git a/plugins/shared/scripts/plan-analysis.py b/plugins/shared/scripts/plan-analysis.py
index d9611aaa..1d0ccf37 100644
--- a/plugins/shared/scripts/plan-analysis.py
+++ b/plugins/shared/scripts/plan-analysis.py
@@ -21,6 +21,13 @@
 Usage:
     plan-analysis.py plan   --workdir DIR --template FILE
     plan-analysis.py fanout --workdir DIR
+
+Exit codes:
+    0  success (fanout: every planned group report was fanned out)
+    1  fanout: no analysis-plan.json (run plan first)
+    2  usage error
+    3  fanout incomplete: group reports missing or unparseable; the printed
+       JSON's retry_groups lists them with their prompt_file for re-launch
 """
 
 import glob as glob_mod
@@ -449,10 +456,25 @@ def cmd_fanout(workdir):
                 f.write("\n--- END STRUCTURED SUMMARY ---\n")
             written += 1
 
+    by_key = {g.get("key"): g for g in plan if isinstance(g, dict)}
+    retry_groups = [
+        {
+            "key": key,
+            "reason": reason,
+            # prompt_file is null for deterministic groups,
+            # which cannot be re-launched (their reports are script-written).
+            "prompt_file": by_key.get(key, {}).get("prompt_file"),
+            "report_file": by_key.get(key, {}).get("report_file", ""),
+        }
+        for key, reason in [(k, "missing") for k in missing]
+        + [(k, "unparseable") for k in unparseable]
+    ]
+
     result = {
         "written": written,
         "missing_reports": missing,
         "unparseable_reports": unparseable,
+        "retry_groups": retry_groups,
     }
     print(
         f"Fanned out {written} per-job reports"
@@ -461,7 +483,7 @@ def cmd_fanout(workdir):
         file=sys.stderr,
     )
     print(json.dumps(result, indent=2))
-    return 0
+    return 3 if retry_groups else 0
 
 
 # ---------------------------------------------------------------------------

From ac719197ecc61adabe91362f6df69031bf6e23f6 Mon Sep 17 00:00:00 2001
From: Patryk Matuszak <pmatusza@redhat.com>
Date: Fri, 3 Jul 2026 12:00:16 +0200
Subject: [PATCH 6/7] microshift-ci: slim the doctor skill prose
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The group-first flow made most of the orchestration prose obsolete:
the orchestrator no longer reads job JSON fields or builds agent
prompts, so the field-name warnings, evidence-content inventories,
duplicate examples, and step-restating notes are gone (318 → ~200
lines). The '-p mode' turn-keeping scaffolding stays until the CI step
takes over the deterministic phases.
---
 plugins/microshift-ci/skills/doctor/SKILL.md | 175 +++----------------
 1 file changed, 28 insertions(+), 147 deletions(-)

diff --git a/plugins/microshift-ci/skills/doctor/SKILL.md b/plugins/microshift-ci/skills/doctor/SKILL.md
index 4003fa37..c1b3a484 100644
--- a/plugins/microshift-ci/skills/doctor/SKILL.md
+++ b/plugins/microshift-ci/skills/doctor/SKILL.md
@@ -16,7 +16,7 @@ allowed-tools: Bash, Read, Write, Glob, Grep, Agent
 
 ## Description
 
-Accepts a comma-separated list of MicroShift release versions, runs analysis for each release and for open rebase PRs, and produces a single HTML summary file consolidating all results. Uses deterministic scripts for data collection, artifact download, aggregation, and HTML generation. LLM agents are used only for per-job root cause analysis and Jira bug correlation.
+Accepts a comma-separated list of MicroShift release versions, runs analysis for each release and for open rebase PRs, and produces a single HTML summary file consolidating all results. Deterministic scripts handle data collection, artifact download, evidence extraction, failure grouping, aggregation, and HTML generation. LLM agents handle exactly two things: root cause analysis of each distinct failure group (Step 2) and Jira bug correlation (Step 3).
 
 ## Arguments
 
@@ -34,106 +34,50 @@ Compute once at the start by running `date +%y%m%d` and substituting into the pa
 
 ### Step 1: Prepare — Collect and Download All Artifacts
 
-**Goal**: Deterministically collect all failed jobs and download their artifacts before any LLM analysis.
-
-**Actions**:
-
-1. Determine today's `<WORKDIR>` by running `date +%y%m%d` and substituting into `/tmp/microshift-ci-claude-workdir.<YYMMDD>`. Use this value in all subsequent commands.
+1. Determine today's `<WORKDIR>` (see above). Use this value in all subsequent commands.
 2. Run the prepare script:
 
    ```text
    bash plugins/microshift-ci/scripts/doctor.sh prepare --component microshift --workdir <WORKDIR> <ARGUMENTS> --rebase --repo openshift/microshift
    ```
 
-3. The script deterministically:
-   - For each release: fetches failed periodic jobs, downloads artifacts, writes `<WORKDIR>/jobs/release-<version>-jobs.json`
-   - For rebase PRs: fetches PRs with failures, downloads artifacts, writes `<WORKDIR>/jobs/prs-jobs.json` and `<WORKDIR>/jobs/prs-status.json`
-   - Outputs a JSON summary listing all releases, job counts, and file paths
-4. Read the JSON output to know which releases have jobs to analyze and how many
-
-**Job JSON field names** (use these exactly — do NOT guess alternatives like `job_name`):
-
-- `job` — full job name
-- `build_id` — unique build identifier
-- `artifacts_dir` — local path to downloaded artifacts
-- `url` — Prow job URL
-- `status` — job result (`failure`, `FAILURE`, `SUCCESS`, `PENDING`)
-- `pr_number` — PR number (PR jobs only)
+3. The script fetches failed periodic jobs per release and rebase PRs with failures, downloads all artifacts, clones the MicroShift source with per-release worktrees, and prints a JSON summary of releases, job counts, and file paths. Read that summary — do NOT read the job JSON files it references.
 
 **Error Handling**:
 
 - If `<ARGUMENTS>` is empty, show usage and stop
-- If a release has no failed jobs, its jobs JSON will be an empty array — skip analysis for that release
-- If a release has an `"error"` field in the JSON summary, data collection failed for that release — report the error to the user but continue with other releases
+- A release with no failed jobs simply has nothing to analyze
+- A release with an `"error"` field failed data collection — report it to the user but continue with other releases
 
 ### Step 1b: Generate PCP Performance Graphs
 
-**Goal**: Generate performance graphs from PCP archives for all jobs that have pmlogs.
-
-**Actions**:
-
-1. Run the graphs script (this is deterministic, no LLM needed):
-
-   ```text
-   bash plugins/microshift-ci/scripts/doctor.sh graphs --component microshift --workdir <WORKDIR>
-   ```
+```text
+bash plugins/microshift-ci/scripts/doctor.sh graphs --component microshift --workdir <WORKDIR>
+```
 
-2. The script finds PCP archives in downloaded artifacts and generates PNG graphs at `<WORKDIR>/graphs/<build_id>/`:
-   - `1_cpu_usage.png` — CPU usage (user, system, I/O wait)
-   - `2_mem_usage.png` — Memory usage (used, cached)
-   - `3_disk_io.png` — Disk I/O (read/write OPS, await)
-   - `4_disk_usage.png` — Disk usage by partition (% fill)
-3. If prerequisites are missing (`pcp2json`, `matplotlib`), the script errors and stops.
+Generates CPU/memory/disk graphs at `<WORKDIR>/graphs/<build_id>/` for jobs with PCP archives. If prerequisites are missing (`pcp2json`, `matplotlib`), the script errors and stops.
 
 ### Step 1c: Extract Structured Evidence
 
-**Goal**: Deterministically extract structured evidence from all job artifacts before LLM analysis. This gives each analysis agent a pre-extracted overview so it can skip exploratory file scanning and focus on root cause reasoning.
-
-**Actions**:
-
-1. Run the evidence extraction script:
-
-   ```text
-   bash plugins/microshift-ci/scripts/doctor.sh evidence --component microshift --workdir <WORKDIR>
-   ```
-
-2. The script processes each job's artifacts and produces `<WORKDIR>/evidence/evidence-<BUILD_ID>.json` containing:
-   - Failed step identification (from per-step `finished.json`)
-   - Infrastructure failure indicators (scheduling, AWS errors, CI cluster capacity)
-   - Per-scenario evidence: junit failures, RF failures, boot_and_run alerts, journal alerts (OOM, panics, container restarts, etcd pressure, OVN binding, probe failures), sosreport paths
-   - Conformance test failures
-   - Build/config error lines with context
-   - PCP graph availability
-   - Recent source commits (no path filter — product and test changes)
-   - Pre-extracted sosreports (when journal shows container restarts or crashes)
+```text
+bash plugins/microshift-ci/scripts/doctor.sh evidence --component microshift --workdir <WORKDIR>
+```
 
-3. If the script fails for some jobs, note the errors but continue — agents can fall back to raw artifacts.
+Produces `<WORKDIR>/evidence/evidence-<BUILD_ID>.json` per job — the structured evidence packs (failed step, failure fingerprint, per-scenario alerts, sosreport paths) that analysis agents start from. If it fails for some jobs, note the errors and continue — agents fall back to raw artifacts.
 
 ### Step 1d: Plan Analysis Groups
 
-**Goal**: Deterministically group failed jobs by failure fingerprint so each distinct failure is analyzed exactly once.
-
-**Actions**:
-
 1. Run the plan script:
 
    ```text
    bash plugins/microshift-ci/scripts/doctor.sh plan --component microshift --workdir <WORKDIR>
    ```
 
-2. The script deterministically:
-   - Groups all failed jobs (releases + PRs) by the failure fingerprint from their evidence packs
-   - Writes template reports directly for pure-infrastructure and no-failure groups — those need NO agent
-   - Renders a fully substituted agent prompt file per remaining group under `<WORKDIR>/prompts/`
-   - Writes `<WORKDIR>/analysis-plan.json` and prints a JSON summary whose `agent_groups` array lists each group's `prompt_file` and `report_file`
+   It groups all failed jobs (releases + PRs) by failure fingerprint, writes template verdicts for pure-infrastructure and no-failure groups (no agent needed), and renders one fully substituted agent prompt file per remaining group. Its JSON summary's `agent_groups` array lists each group's `prompt_file` and `report_file`.
 
 ### Step 2: Analyze Each Group
 
-**Goal**: Get detailed root cause analysis for each failure group using evidence packs and pre-downloaded artifacts.
-
-**Actions**:
-
-1. For **every** entry in the plan summary's `agent_groups`, launch a separate **Agent** (using the `Agent` tool) with exactly this prompt — the prompt files are fully pre-rendered, do NOT read or modify them yourself:
+1. For **every** entry in `agent_groups`, launch a separate **Agent** with exactly this prompt — the prompt files are fully pre-rendered, do NOT read or modify them yourself:
 
    ```text
    Read <PROMPT_FILE> and follow its instructions exactly.
@@ -192,86 +136,34 @@ Compute once at the start by running `date +%y%m%d` and substituting into the pa
 
 ### Step 3: Run Bug Correlation (Dry-Run)
 
-**Goal**: Search Jira for existing bugs matching each failure. Results are embedded in the HTML report.
-
-**Actions**:
-
-1. Collect all release versions from `<ARGUMENTS>` into a comma-separated list (e.g., `4.19,4.20,4.21,4.22`)
-2. Check for rebase PR source identifiers from the PR jobs JSON (e.g., `rebase-release-4.22`). Append them to the source list.
-3. Launch a **single** `microshift-ci:create-bugs` **foreground** agent in dry-run mode with all sources:
+1. Build the source list: all release versions from `<ARGUMENTS>` plus any rebase PR source identifiers from the PR jobs (e.g., `rebase-release-4.22`).
+2. Launch a **single** `microshift-ci:create-bugs` **foreground** agent in dry-run mode with all sources:
 
    ```text
    Agent: subagent_type=general_purpose, prompt="Run /microshift-ci:create-bugs <all-sources-comma-separated>"
    ```
 
-4. The agent produces:
-   - `<WORKDIR>/bugs/bug-matches-<source>.json` for each source (mapping files with open bugs data for the Bugs tab)
-   - `<WORKDIR>/report-create-bugs.txt` — merged report covering all releases and rebase sources
-5. When the agent returns, immediately proceed to Step 4 in the same turn. Do NOT stop or end your turn between Step 3 and Step 4.
-
-**Error Handling**:
-
-- If create-bugs fails, note the failure but do not block HTML generation
+   It produces `<WORKDIR>/bugs/bug-matches-<source>.json` per source and `<WORKDIR>/report-create-bugs.txt`.
+3. When the agent returns, immediately proceed to Step 4 in the same turn. Do NOT stop or end your turn between Step 3 and Step 4. If create-bugs fails, note the failure but do not block HTML generation.
 
 ### Step 4: Finalize — Aggregate and Generate HTML Report
 
 **IMPORTANT**: This step is MANDATORY. The task is incomplete without it. You MUST run this even if previous steps produced errors.
 
-**Goal**: Deterministically aggregate results and generate the HTML report.
-
-**Actions**:
-
-1. Run the finalize script:
-
-   ```text
-   bash plugins/microshift-ci/scripts/doctor.sh finalize --component microshift --workdir <WORKDIR> <ARGUMENTS>
-   ```
-
-2. The script deterministically:
-   - Runs `aggregate.py` for each release and for PRs → `summary.json` files
-   - Runs `create-report.py` → `report-microshift-ci-doctor.html`
-3. Report the script's output to the user
-
-### Step 5: Report Completion
-
-**Actions**:
-
-1. Display the path to the generated HTML file
-2. Summarize: failed job counts per release, rebase PR status, bug correlation results
-
-**Example Output**:
-
 ```text
-Summary:
-  Periodics:
-    Release 4.19: 3 failed periodic jobs
-    Release 4.20: ERROR - data collection failed
-    Release 4.21: 0 failed periodic jobs
-    Release 4.22: 12 failed periodic jobs
-  Pull Requests:
-    2 rebase PRs with 5 total failed jobs
-
-HTML report generated: <WORKDIR>/report-microshift-ci-doctor.html
+bash plugins/microshift-ci/scripts/doctor.sh finalize --component microshift --workdir <WORKDIR> <ARGUMENTS>
 ```
 
-## Examples
-
-### Example 1: Analyze Multiple Releases
-
-```bash
-/microshift-ci:doctor 4.19,4.20,4.21,4.22
-```
+Aggregates per-release and PR summaries and generates `report-microshift-ci-doctor.html`.
 
-### Example 2: Analyze Two Releases
+### Step 5: Report Completion
 
-```bash
-/microshift-ci:doctor 4.21,4.22
-```
+Display the path to the generated HTML file and summarize: failed job counts per release, analysis groups (agents vs deterministic), rebase PR status, and bug correlation results.
 
-### Example 3: Single Release (still produces HTML)
+## Example
 
 ```bash
-/microshift-ci:doctor 4.22
+/microshift-ci:doctor 4.19,4.20,4.21,4.22
 ```
 
 ## Prerequisites
@@ -281,8 +173,7 @@ HTML report generated: <WORKDIR>/report-microshift-ci-doctor.html
 - MCP Jira server must be configured (for bug correlation)
 - Internet access to fetch job data from Prow/GCS
 - Bash shell, Python 3
-- `pcp-export-pcp2json` — for PCP graph generation
-- `matplotlib` Python package — for PCP graph plotting
+- `pcp-export-pcp2json` and `matplotlib` — for PCP graph generation
 
 ## Related Skills and Agents
 
@@ -293,17 +184,7 @@ HTML report generated: <WORKDIR>/report-microshift-ci-doctor.html
 
 ## Notes
 
-- **Deterministic scripts** handle: data collection, artifact download, evidence extraction, aggregation, HTML generation
-- **LLM agents** handle: per-job root cause analysis (Step 2), Jira bug search and open bugs query (Step 3)
-- Step 1c evidence extraction pre-processes all artifacts so Step 2 agents (from `plugins/microshift-ci/agents/analyze-evidence.md`) receive structured evidence packs and can skip exploratory log scanning
-- Step 1d groups jobs by deterministic failure fingerprint: one agent per distinct failure (not per job), pure-infrastructure and no-failure groups resolved without any agent, and per-job report files produced by the deterministic fan-out in Step 2
-- `/microshift-ci:doctor-refresh` regenerates the HTML report from existing data. Use it after `/microshift-ci:create-bugs --create` to include newly created bugs
-- Step 2 agents (per-group analysis) are launched in a single parallel wave
-- Step 3 uses a single create-bugs agent with all sources (releases + rebase) comma-separated
-- The `prepare` script downloads all artifacts upfront so analysis agents use local paths (no redundant downloads)
-- The `prepare` script also clones the MicroShift source to `<WORKDIR>/src/microshift` with per-release worktrees (`--repo openshift/microshift`); clone failure is non-fatal — agents record the absence in `analysis_gaps` and proceed
-- The `finalize` script runs aggregation and HTML generation in one call
-- All intermediate files use prescribed filenames in `<WORKDIR>` subdirectories (`jobs/`, `bugs/`) — no improvised names
+- One agent analyzes each distinct failure fingerprint (not each job); pure-infrastructure and no-failure groups are resolved by script with no agent at all
+- All intermediate files use prescribed filenames in `<WORKDIR>` subdirectories (`jobs/`, `bugs/`, `evidence/`, `prompts/`) — no improvised names
 - The HTML report is self-contained (no external CSS/JS dependencies)
 - If a release analysis fails, it is noted in the report but does not block other releases
-- If no rebase PRs are open, the Pull Requests tab shows "No open rebase pull requests found"

From 6a149239a46bb54f1c19e77f2d8dd8becd256630 Mon Sep 17 00:00:00 2001
From: Patryk Matuszak <pmatusza@redhat.com>
Date: Fri, 3 Jul 2026 12:00:07 +0200
Subject: [PATCH 7/7] microshift-ci: add --prepared mode to the doctor skill

In CI the deterministic phases (prepare, graphs, evidence,
fetch-previous, finalize) burn the Claude session's 45-minute wall
clock while the model just waits on downloads. With --prepared the CI
step runs them in bash around the session, and the skill covers only
what needs a model: planning-driven agent launches, validation,
fan-out, and bug correlation. Interactive use without the flag is
unchanged.
---
 plugins/microshift-ci/skills/doctor/SKILL.md | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/plugins/microshift-ci/skills/doctor/SKILL.md b/plugins/microshift-ci/skills/doctor/SKILL.md
index c1b3a484..e380210d 100644
--- a/plugins/microshift-ci/skills/doctor/SKILL.md
+++ b/plugins/microshift-ci/skills/doctor/SKILL.md
@@ -21,6 +21,7 @@ Accepts a comma-separated list of MicroShift release versions, runs analysis for
 ## Arguments
 
 - `<ARGUMENTS>` (required): Comma-separated list of release versions (e.g., `4.19,4.20,4.21,4.22`)
+- `--prepared` (optional, used by CI): the caller has already run the deterministic preparation phases (`prepare`, `graphs`, `evidence`) and will run `finalize` itself. Skip Steps 1–1c and Step 4 — start at the Step 1d plan script, continue through Step 3, and finish with the Step 5 summary.
 
 ## Work Directory
 
@@ -32,6 +33,8 @@ Compute once at the start by running `date +%y%m%d` and substituting into the pa
 
 ## Implementation Steps
 
+With `--prepared`: run only Step 1d, Step 2, Step 3, and Step 5.
+
 ### Step 1: Prepare — Collect and Download All Artifacts
 
 1. Determine today's `<WORKDIR>` (see above). Use this value in all subsequent commands.
@@ -148,7 +151,9 @@ Produces `<WORKDIR>/evidence/evidence-<BUILD_ID>.json` per job — the structure
 
 ### Step 4: Finalize — Aggregate and Generate HTML Report
 
-**IMPORTANT**: This step is MANDATORY. The task is incomplete without it. You MUST run this even if previous steps produced errors.
+Skip this step when `--prepared` is given — the CI step runs finalize itself after this session ends.
+
+**IMPORTANT**: Otherwise this step is MANDATORY. The task is incomplete without it. You MUST run this even if previous steps produced errors.
 
 ```text
 bash plugins/microshift-ci/scripts/doctor.sh finalize --component microshift --workdir <WORKDIR> <ARGUMENTS>
@@ -158,7 +163,7 @@ Aggregates per-release and PR summaries and generates `report-microshift-ci-doct
 
 ### Step 5: Report Completion
 
-Display the path to the generated HTML file and summarize: failed job counts per release, analysis groups (agents vs deterministic), rebase PR status, and bug correlation results.
+Display the path to the generated HTML file and summarize: failed job counts per release, analysis groups (agents vs deterministic), rebase PR status, and bug correlation results. With `--prepared`, there is no HTML yet — summarize the analysis and note that the caller generates the report.
 
 ## Example