From 63e6fd5a8b758b521169aada8e5a4aada0dbc175 Mon Sep 17 00:00:00 2001
From: Evgeny Kiriyak <224408464+evkir@users.noreply.github.com>
Date: Tue, 16 Jun 2026 21:32:52 +0300
Subject: [PATCH 1/4] feat(report): LLM judge validates claims against evidence

---
 cyberai/agents/report/judge.py | 155 +++++++++++++++++++++++++++++++++
 1 file changed, 155 insertions(+)
 create mode 100644 cyberai/agents/report/judge.py

diff --git a/cyberai/agents/report/judge.py b/cyberai/agents/report/judge.py
new file mode 100644
index 0000000..f13924c
--- /dev/null
+++ b/cyberai/agents/report/judge.py
@@ -0,0 +1,155 @@
+"""LLM-as-Judge — validates report claims against knowledge-base evidence.
+
+A second (optionally more powerful) LLM cross-checks every claim in the
+generated report against the evidence actually present in the session KB.
+It returns a hallucination score in [0, 1] and a list of unsupported
+claims. Flag-gated in ReportAgent (use_judge, default False) — the
+deterministic report is never blocked by judge failures.
+"""
+
+from __future__ import annotations
+
+import json
+from contextlib import contextmanager
+from typing import Any, Dict, List, Optional
+
+from pydantic import BaseModel, Field, field_validator
+
+from cyberai.core.llm_client import LLMClient
+from cyberai.core.scan_session import ScanSession
+
+
+class JudgeVerdict(BaseModel):
+    """Structured verdict returned by the judge LLM."""
+
+    hallucination_score: float = Field(default=0.0, ge=0.0, le=1.0)
+    supported: bool = True
+    unsupported_claims: List[str] = Field(default_factory=list)
+    notes: str = ""
+
+    @field_validator("hallucination_score")
+    @classmethod
+    def _clamp(cls, v: float) -> float:
+        try:
+            return max(0.0, min(1.0, float(v)))
+        except (TypeError, ValueError):
+            return 0.0
+
+
+# Flat JSON Schema for structured_call (OpenAI strict-friendly: no nesting).
+VERDICT_SCHEMA: Dict[str, Any] = {
+    "type": "object",
+    "properties": {
+        "hallucination_score": {
+            "type": "number",
+            "description": "0.0 = every claim backed by evidence; 1.0 = all fabricated.",
+        },
+        "supported": {
+            "type": "boolean",
+            "description": "True if the report is sufficiently grounded in evidence.",
+        },
+        "unsupported_claims": {
+            "type": "array",
+            "items": {"type": "string"},
+            "description": "Claims in the report with no matching KB evidence.",
+        },
+        "notes": {"type": "string", "description": "Brief reviewer notes."},
+    },
+    "required": ["hallucination_score", "supported", "unsupported_claims", "notes"],
+}
+
+JUDGE_SYSTEM = (
+    "You are a strict security-report reviewer. You are given a penetration-"
+    "test report and the raw EVIDENCE collected during the scan (findings, "
+    "CVE IDs, tool artifacts). Your job: detect hallucinations. A claim is "
+    "UNSUPPORTED if it asserts a vulnerability, CVE, port, or impact that "
+    "does not appear in the evidence. Do NOT reward fluency. Score "
+    "hallucination_score in [0,1]: 0 means every claim is backed by "
+    "evidence, 1 means the report is fabricated. List each unsupported "
+    "claim verbatim. Respond ONLY via the structured schema."
+)
+
+
+def _collect_evidence(session: ScanSession) -> Dict[str, Any]:
+    """Serialize the session's findings into the judge's ground-truth payload.
+
+    Every evidence item is stringified and capped at 500 characters before
+    it is embedded in the judge prompt.
+    """
+    serialized = [
+        {
+            "id": finding.id,
+            "title": finding.title,
+            "severity": getattr(finding.severity, "value", str(finding.severity)),
+            "cve_ids": list(finding.cve_ids),
+            "target": finding.target,
+            "evidence": [str(item)[:500] for item in (finding.evidence or [])],
+        }
+        for finding in session.findings
+    ]
+    return {"target": session.target, "findings": serialized}
+
+
+@contextmanager
+def _judge_model(llm: LLMClient, model: Optional[str]):
+    """Temporarily swap the LLM model to the (more powerful) judge model."""
+    if not model:
+        yield
+        return
+    original = llm.config.model
+    llm.config.model = model
+    try:
+        yield
+    finally:
+        llm.config.model = original
+
+
+def judge_report(
+    report_text: str,
+    session: ScanSession,
+    llm: LLMClient,
+    *,
+    threshold: float = 0.7,
+    judge_model: Optional[str] = None,
+    agent_name: str = "report.judge",
+) -> JudgeVerdict:
+    """Cross-check `report_text` against session evidence via a second LLM.
+
+    `threshold` is authoritative: `supported` is recomputed as
+    `hallucination_score < threshold` regardless of what the model claimed.
+    `judge_model`, when set, replaces `llm.config.model` for this call only;
+    `agent_name` is forwarded to `llm.structured_call`.
+
+    Returns a JudgeVerdict. On ANY failure (including evidence collection)
+    returns a graceful pass-through verdict (score=0.0, supported=True) so
+    the report pipeline never breaks.
+    """
+    if llm is None:
+        return JudgeVerdict(notes="judge unavailable: no LLM client")
+
+    try:
+        # Build the prompt inside the guard so malformed findings degrade too.
+        evidence = _collect_evidence(session)
+        content = (
+            "REPORT:\n"
+            f"{report_text}\n\n"
+            "EVIDENCE (ground truth):\n"
+            f"{json.dumps(evidence, indent=2, default=str)}"
+        )
+        messages = [{"role": "user", "content": content}]
+        with _judge_model(llm, judge_model):
+            raw = llm.structured_call(
+                messages,
+                schema=VERDICT_SCHEMA,
+                schema_name="judge_verdict",
+                description="Hallucination verdict for a pentest report.",
+                system=JUDGE_SYSTEM,
+                agent_name=agent_name,
+            )
+        verdict = JudgeVerdict.model_validate(raw)
+    except Exception as exc:  # noqa: BLE001 — judge must never hard-fail
+        return JudgeVerdict(notes=f"judge unavailable: {exc}")
+
+    # Threshold is authoritative — don't trust the model's own `supported`.
+    verdict.supported = verdict.hallucination_score < threshold
+    return verdict

From 07204253b0b98c01ba273d9bf50fec87725878e7 Mon Sep 17 00:00:00 2001
From: Evgeny Kiriyak <224408464+evkir@users.noreply.github.com>
Date: Tue, 16 Jun 2026 21:34:43 +0300
Subject: [PATCH 2/4] feat(report): run LLM judge and append verdict to report

---
 cyberai/agents/report/agent.py | 50 +++++++++++++++++++++++++++++++++-
 1 file changed, 49 insertions(+), 1 deletion(-)

diff --git a/cyberai/agents/report/agent.py b/cyberai/agents/report/agent.py
index b654876..b3d5056 100644
--- a/cyberai/agents/report/agent.py
+++ b/cyberai/agents/report/agent.py
@@ -12,6 +12,7 @@
 from cyberai.core.types import ReportSection
 
 from .json_exporter import export_json
+from .judge import judge_report
 from .markdown_renderer import render_markdown
 
 
@@ -67,12 +68,59 @@ def run(self, target: str, context: Optional[Dict[str, Any]] = None) -> Dict[str
             if section is not None:
                 self.kb.set("report.section", section.model_dump(), agent=self.AGENT_NAME)
 
-        return {
+        # Flag-gated: LLM-as-Judge cross-checks the report against KB evidence.
+        verdict_dump = None
+        if getattr(self.config, "use_judge", False) and self.llm is not None:
+            verdict = judge_report(
+                md_content,
+                self.session,
+                self.llm,
+                threshold=getattr(self.config, "judge_threshold", 0.7),
+                judge_model=getattr(self.config, "judge_model", None),
+            )
+            verdict_dump = verdict.model_dump()
+            self.kb.set("report.judge_verdict", verdict_dump, agent=self.AGENT_NAME)
+            md_content = self._append_verdict(md_content, verdict)
+            with open(md_path, "w", encoding="utf-8") as f:  # verdict section has emoji
+                f.write(md_content)
+            self._log(
+                f"Judge: score={verdict.hallucination_score:.2f} supported={verdict.supported}"
+            )
+
+        result = {
             "status": "done",
             "markdown": md_path,
             "json": json_path,
             "total_findings": len(self.session.findings),
         }
+        if verdict_dump is not None:
+            result["judge_verdict"] = verdict_dump
+        return result
+
+    def _append_verdict(self, md: str, verdict) -> str:
+        """Append the judge verdict as a Markdown section to the report."""
+        # A fail-open verdict (judge never ran) must not be shown as a pass.
+        if str(verdict.notes or "").startswith("judge unavailable"):
+            status = "⚠️ NOT VALIDATED"
+        else:
+            status = "✅ SUPPORTED" if verdict.supported else "⚠️ UNSUPPORTED"
+        lines = [
+            md,
+            "",
+            "---",
+            "",
+            "## 🧑‍⚖️ Report Validation (LLM-as-Judge)",
+            "",
+            f"**Status:** {status}  ",
+            f"**Hallucination score:** {verdict.hallucination_score:.2f}  ",
+        ]
+        if verdict.unsupported_claims:
+            lines += ["", "**Unsupported claims:**", ""]
+            lines += [f"- {claim}" for claim in verdict.unsupported_claims]
+        if verdict.notes:
+            lines += ["", f"_Notes: {verdict.notes}_"]
+        lines.append("")
+        return "\n".join(lines)
 
     def _structured_summary(self, target: str):
         """Flag-gated: ask the LLM for a Pydantic-validated ReportSection.

From c9baad3e3571613a36efd6f0c4b641c1966b5da1 Mon Sep 17 00:00:00 2001
From: Evgeny Kiriyak <224408464+evkir@users.noreply.github.com>
Date: Tue, 16 Jun 2026 21:37:00 +0300
Subject: [PATCH 3/4] feat(report): confidence score per finding

---
 cyberai/agents/report/markdown_renderer.py | 3 +++
 cyberai/core/config.py                     | 6 ++++++
 cyberai/core/scan_session.py               | 3 +++
 3 files changed, 12 insertions(+)

diff --git a/cyberai/agents/report/markdown_renderer.py b/cyberai/agents/report/markdown_renderer.py
index 7e93fec..3f564f0 100644
--- a/cyberai/agents/report/markdown_renderer.py
+++ b/cyberai/agents/report/markdown_renderer.py
@@ -60,6 +60,9 @@ def render_markdown(session: PentestSession) -> str:
             f"{finding.description}",
             "",
         ]
+        if getattr(finding, "confidence", 1.0) < 1.0:
+            lines.append(f"**Confidence:** {finding.confidence:.0%} ⚠️")
+            lines.append("")
         if finding.cve:
             lines.append(f"**CVE:** `{finding.cve}`")
             lines.append("")
diff --git a/cyberai/core/config.py b/cyberai/core/config.py
index b11149c..50fbae4 100644
--- a/cyberai/core/config.py
+++ b/cyberai/core/config.py
@@ -43,6 +43,12 @@ class CyberAIConfig:
     max_cost_usd: float = 0.0
     # Flag-gated: run the nuclei template engine in ExploitAgent (day 23).
     use_nuclei: bool = False
+    # Flag-gated: LLM-as-Judge validates the report vs KB evidence (day 26).
+    use_judge: bool = False
+    # Hallucination score >= threshold marks the report unsupported.
+    judge_threshold: float = 0.7
+    # Optional more-powerful model for the judge; None = same as main LLM.
+    judge_model: Optional[str] = None
 
     @classmethod
     def from_file(cls, path: str) -> "CyberAIConfig":
diff --git a/cyberai/core/scan_session.py b/cyberai/core/scan_session.py
index f8b9c94..2793491 100644
--- a/cyberai/core/scan_session.py
+++ b/cyberai/core/scan_session.py
@@ -82,6 +82,9 @@ class Finding:
     evidence: List[Any] = field(default_factory=list)
     # Free-form structured data
     data: Any = None
+    # Confidence this finding is real, 0..1. 1.0 = fully evidenced (default).
+    # Lowered by the LLM-as-Judge / agents when evidence is weak (day 26).
+    confidence: float = 1.0
 
     def __post_init__(self) -> None:
         # Keep `cve` and `cve_ids` in sync for callers that use either

From 272b5f245476d37fcc2a2f11f9fe3d16f4c80db2 Mon Sep 17 00:00:00 2001
From: Evgeny Kiriyak <224408464+evkir@users.noreply.github.com>
Date: Tue, 16 Jun 2026 21:41:05 +0300
Subject: [PATCH 4/4] test(report): judge catches hallucinated CVE

---
 cyberai/agents/report/judge.py |  12 ++-
 tests/unit/test_judge.py       | 179 +++++++++++++++++++++++++++++++++
 2 files changed, 188 insertions(+), 3 deletions(-)
 create mode 100644 tests/unit/test_judge.py

diff --git a/cyberai/agents/report/judge.py b/cyberai/agents/report/judge.py
index f13924c..4f2b68b 100644
--- a/cyberai/agents/report/judge.py
+++ b/cyberai/agents/report/judge.py
@@ -22,14 +22,20 @@
 class JudgeVerdict(BaseModel):
     """Structured verdict returned by the judge LLM."""
 
-    hallucination_score: float = Field(default=0.0, ge=0.0, le=1.0)
+    hallucination_score: float = 0.0
     supported: bool = True
     unsupported_claims: List[str] = Field(default_factory=list)
     notes: str = ""
 
-    @field_validator("hallucination_score")
+    @field_validator("hallucination_score", mode="before")
     @classmethod
-    def _clamp(cls, v: float) -> float:
+    def _clamp(cls, v: Any) -> float:
+        """Clamp to [0,1] BEFORE type validation — the LLM may over/undershoot.
+
+        A misbehaving judge returning 1.2 must not crash the report; we squash
+        it into range rather than raising, matching the graceful-degradation
+        contract of the whole judge path.
+        """
         try:
             return max(0.0, min(1.0, float(v)))
         except (TypeError, ValueError):
diff --git a/tests/unit/test_judge.py b/tests/unit/test_judge.py
new file mode 100644
index 0000000..d16baa5
--- /dev/null
+++ b/tests/unit/test_judge.py
@@ -0,0 +1,179 @@
+"""Day 26 — LLM-as-Judge: catches hallucinated claims in reports.
+
+The judge is an LLM call, so we mock `structured_call` and verify the
+judge logic: hallucination detection, threshold authority, graceful
+degradation, evidence collection, model swap, and score clamping.
+"""
+
+from unittest.mock import MagicMock
+
+from cyberai.agents.report.judge import (
+    JudgeVerdict,
+    _collect_evidence,
+    _judge_model,
+    judge_report,
+)
+from cyberai.core.scan_session import ScanSession, Severity
+
+
+def _session_with_findings() -> ScanSession:
+    s = ScanSession(target="testhost.local")
+    s.add_finding(
+        severity=Severity.INFO,
+        title="Open SSH Port",
+        description="22/tcp open",
+        agent="recon",
+    )
+    s.add_finding(
+        severity=Severity.CRITICAL,
+        title="Log4Shell",
+        description="JNDI lookup confirmed",
+        agent="exploit",
+        cve_ids=["CVE-2021-44228"],
+    )
+    return s
+
+
+def _llm_returning(payload: dict) -> MagicMock:
+    llm = MagicMock()
+    llm.config = MagicMock()
+    llm.config.model = "gpt-4o"
+    llm.structured_call.return_value = payload
+    return llm
+
+
+def test_judge_catches_hallucinated_cve():
+    """Report cites CVE-9999-99999 (not in KB) → judge flags it unsupported."""
+    s = _session_with_findings()
+    llm = _llm_returning(
+        {
+            "hallucination_score": 0.9,
+            "supported": False,
+            "unsupported_claims": ["Report cites CVE-9999-99999, absent from evidence"],
+            "notes": "fabricated CVE",
+        }
+    )
+    verdict = judge_report("...CVE-9999-99999...", s, llm)
+    assert verdict.supported is False
+    assert verdict.hallucination_score == 0.9
+    assert any("9999" in c for c in verdict.unsupported_claims)
+
+
+def test_judge_passes_clean_report():
+    s = _session_with_findings()
+    llm = _llm_returning(
+        {
+            "hallucination_score": 0.0,
+            "supported": True,
+            "unsupported_claims": [],
+            "notes": "all claims backed",
+        }
+    )
+    verdict = judge_report("clean report", s, llm)
+    assert verdict.supported is True
+    assert verdict.hallucination_score == 0.0
+
+
+def test_threshold_is_authoritative():
+    """Model lies (supported=True at 0.85) → judge recomputes from threshold."""
+    s = _session_with_findings()
+    llm = _llm_returning(
+        {
+            "hallucination_score": 0.85,
+            "supported": True,  # model's claim — must be overridden
+            "unsupported_claims": ["x"],
+            "notes": "",
+        }
+    )
+    verdict = judge_report("report", s, llm, threshold=0.7)
+    assert verdict.supported is False  # 0.85 >= 0.7
+
+
+def test_graceful_on_exception():
+    """structured_call raises → graceful pass-through verdict, no crash."""
+    s = _session_with_findings()
+    llm = MagicMock()
+    llm.config = MagicMock()
+    llm.config.model = "gpt-4o"
+    llm.structured_call.side_effect = RuntimeError("API down")
+    verdict = judge_report("report", s, llm)
+    assert verdict.supported is True
+    assert verdict.hallucination_score == 0.0
+    assert "unavailable" in verdict.notes
+
+
+def test_graceful_on_no_llm():
+    s = _session_with_findings()
+    verdict = judge_report("report", s, None)
+    assert verdict.supported is True
+    assert "unavailable" in verdict.notes
+
+
+def test_collect_evidence_serializes_findings():
+    s = _session_with_findings()
+    ev = _collect_evidence(s)
+    assert ev["target"] == "testhost.local"
+    assert len(ev["findings"]) == 2
+    log4shell = [f for f in ev["findings"] if f["title"] == "Log4Shell"][0]
+    assert log4shell["severity"] == "CRITICAL"
+    assert "CVE-2021-44228" in log4shell["cve_ids"]
+
+
+def test_collect_evidence_truncates_long_evidence():
+    s = ScanSession(target="x")
+    s.add_finding(
+        severity=Severity.HIGH,
+        title="Big",
+        description="d",
+        agent="t",
+    )
+    s.findings[0].evidence = ["A" * 1000]
+    ev = _collect_evidence(s)
+    assert len(ev["findings"][0]["evidence"][0]) == 500
+
+
+def test_judge_model_swap_and_restore():
+    """_judge_model temporarily swaps config.model and restores it."""
+    llm = MagicMock()
+    llm.config = MagicMock()
+    llm.config.model = "gpt-4o"
+    with _judge_model(llm, "gpt-4o-judge"):
+        assert llm.config.model == "gpt-4o-judge"
+    assert llm.config.model == "gpt-4o"
+
+
+def test_judge_model_noop_when_none():
+    llm = MagicMock()
+    llm.config = MagicMock()
+    llm.config.model = "gpt-4o"
+    with _judge_model(llm, None):
+        assert llm.config.model == "gpt-4o"
+    assert llm.config.model == "gpt-4o"
+
+
+def test_judge_model_used_in_call():
+    """judge_model is applied during the structured_call."""
+    s = _session_with_findings()
+    captured = {}
+
+    def _capture(*args, **kwargs):
+        captured["model"] = llm.config.model
+        return {
+            "hallucination_score": 0.0,
+            "supported": True,
+            "unsupported_claims": [],
+            "notes": "",
+        }
+
+    llm = MagicMock()
+    llm.config = MagicMock()
+    llm.config.model = "gpt-4o"
+    llm.structured_call.side_effect = _capture
+    judge_report("r", s, llm, judge_model="big-model")
+    assert captured["model"] == "big-model"
+    assert llm.config.model == "gpt-4o"  # restored
+
+
+def test_verdict_score_clamped():
+    assert JudgeVerdict(hallucination_score=1.5).hallucination_score == 1.0
+    assert JudgeVerdict(hallucination_score=-0.3).hallucination_score == 0.0