From 63e6fd5a8b758b521169aada8e5a4aada0dbc175 Mon Sep 17 00:00:00 2001 From: Evgeny Kiriyak <224408464+evkir@users.noreply.github.com> Date: Tue, 16 Jun 2026 21:32:52 +0300 Subject: [PATCH 1/4] feat(report): LLM judge validates claims against evidence --- cyberai/agents/report/judge.py | 155 +++++++++++++++++++++++++++++++++ 1 file changed, 155 insertions(+) create mode 100644 cyberai/agents/report/judge.py diff --git a/cyberai/agents/report/judge.py b/cyberai/agents/report/judge.py new file mode 100644 index 0000000..f13924c --- /dev/null +++ b/cyberai/agents/report/judge.py @@ -0,0 +1,155 @@ +"""LLM-as-Judge — validates report claims against knowledge-base evidence. + +A second (optionally more powerful) LLM cross-checks every claim in the +generated report against the evidence actually present in the session KB. +It returns a hallucination score in [0, 1] and a list of unsupported +claims. Flag-gated in ReportAgent (use_judge, default False) — the +deterministic report is never blocked by judge failures. +""" + +from __future__ import annotations + +import json +from contextlib import contextmanager +from typing import Any, Dict, List, Optional + +from pydantic import BaseModel, Field, field_validator + +from cyberai.core.llm_client import LLMClient +from cyberai.core.scan_session import ScanSession + + +class JudgeVerdict(BaseModel): + """Structured verdict returned by the judge LLM.""" + + hallucination_score: float = Field(default=0.0, ge=0.0, le=1.0) + supported: bool = True + unsupported_claims: List[str] = Field(default_factory=list) + notes: str = "" + + @field_validator("hallucination_score") + @classmethod + def _clamp(cls, v: float) -> float: + try: + return max(0.0, min(1.0, float(v))) + except (TypeError, ValueError): + return 0.0 + + +# Flat JSON Schema for structured_call (OpenAI strict-friendly: no nesting). 
+VERDICT_SCHEMA: Dict[str, Any] = { + "type": "object", + "properties": { + "hallucination_score": { + "type": "number", + "description": "0.0 = every claim backed by evidence; 1.0 = all fabricated.", + }, + "supported": { + "type": "boolean", + "description": "True if the report is sufficiently grounded in evidence.", + }, + "unsupported_claims": { + "type": "array", + "items": {"type": "string"}, + "description": "Claims in the report with no matching KB evidence.", + }, + "notes": {"type": "string", "description": "Brief reviewer notes."}, + }, + "required": ["hallucination_score", "supported", "unsupported_claims", "notes"], +} + +JUDGE_SYSTEM = ( + "You are a strict security-report reviewer. You are given a penetration-" + "test report and the raw EVIDENCE collected during the scan (findings, " + "CVE IDs, tool artifacts). Your job: detect hallucinations. A claim is " + "UNSUPPORTED if it asserts a vulnerability, CVE, port, or impact that " + "does not appear in the evidence. Do NOT reward fluency. Score " + "hallucination_score in [0,1]: 0 means every claim is backed by " + "evidence, 1 means the report is fabricated. List each unsupported " + "claim verbatim. Respond ONLY via the structured schema." 
+) + + +def _collect_evidence(session: ScanSession) -> Dict[str, Any]: + """Pull the ground-truth evidence the report must be consistent with.""" + findings = [] + for f in session.findings: + findings.append( + { + "id": f.id, + "title": f.title, + "severity": getattr(f.severity, "value", str(f.severity)), + "cve_ids": list(f.cve_ids), + "target": f.target, + "evidence": [str(e)[:500] for e in (f.evidence or [])], + } + ) + return { + "target": session.target, + "findings": findings, + } + + +@contextmanager +def _judge_model(llm: LLMClient, model: Optional[str]): + """Temporarily swap the LLM model to the (more powerful) judge model.""" + if not model: + yield + return + original = llm.config.model + llm.config.model = model + try: + yield + finally: + llm.config.model = original + + +def judge_report( + report_text: str, + session: ScanSession, + llm: LLMClient, + *, + threshold: float = 0.7, + judge_model: Optional[str] = None, + agent_name: str = "report.judge", +) -> JudgeVerdict: + """Cross-check `report_text` against session evidence via a second LLM. + + Returns a JudgeVerdict. On ANY failure returns a graceful pass-through + verdict (score=0.0, supported=True) so the report pipeline never breaks. + `supported` is recomputed from the score against `threshold` regardless + of what the model claimed. 
+ """ + if llm is None: + return JudgeVerdict(notes="judge unavailable: no LLM client") + + evidence = _collect_evidence(session) + messages = [ + { + "role": "user", + "content": ( + "REPORT:\n" + f"{report_text}\n\n" + "EVIDENCE (ground truth):\n" + f"{json.dumps(evidence, indent=2, default=str)}" + ), + } + ] + + try: + with _judge_model(llm, judge_model): + raw = llm.structured_call( + messages, + schema=VERDICT_SCHEMA, + schema_name="judge_verdict", + description="Hallucination verdict for a pentest report.", + system=JUDGE_SYSTEM, + agent_name=agent_name, + ) + verdict = JudgeVerdict.model_validate(raw) + except Exception as exc: # noqa: BLE001 — judge must never hard-fail + return JudgeVerdict(notes=f"judge unavailable: {exc}") + + # Threshold is authoritative — don't trust the model's own `supported`. + verdict.supported = verdict.hallucination_score < threshold + return verdict From 07204253b0b98c01ba273d9bf50fec87725878e7 Mon Sep 17 00:00:00 2001 From: Evgeny Kiriyak <224408464+evkir@users.noreply.github.com> Date: Tue, 16 Jun 2026 21:34:43 +0300 Subject: [PATCH 2/4] feat(report): judge can request retry with feedback --- cyberai/agents/report/agent.py | 50 +++++++++++++++++++++++++++++++++- 1 file changed, 49 insertions(+), 1 deletion(-) diff --git a/cyberai/agents/report/agent.py b/cyberai/agents/report/agent.py index b654876..b3d5056 100644 --- a/cyberai/agents/report/agent.py +++ b/cyberai/agents/report/agent.py @@ -12,6 +12,7 @@ from cyberai.core.types import ReportSection from .json_exporter import export_json +from .judge import judge_report from .markdown_renderer import render_markdown @@ -67,12 +68,59 @@ def run(self, target: str, context: Optional[Dict[str, Any]] = None) -> Dict[str if section is not None: self.kb.set("report.section", section.model_dump(), agent=self.AGENT_NAME) - return { + # Flag-gated: LLM-as-Judge cross-checks the report against KB evidence. 
+ verdict_dump = None + if getattr(self.config, "use_judge", False) and self.llm is not None: + verdict = judge_report( + md_content, + self.session, + self.llm, + threshold=getattr(self.config, "judge_threshold", 0.7), + judge_model=getattr(self.config, "judge_model", None), + ) + verdict_dump = verdict.model_dump() + self.kb.set("report.judge_verdict", verdict_dump, agent=self.AGENT_NAME) + md_content = self._append_verdict(md_content, verdict) + with open(md_path, "w", encoding="utf-8") as f: # verdict section contains emoji; never rely on the locale encoding + f.write(md_content) + self._log( + f"Judge: score={verdict.hallucination_score:.2f} supported={verdict.supported}" + ) + + result = { "status": "done", "markdown": md_path, "json": json_path, "total_findings": len(self.session.findings), } + if verdict_dump is not None: + result["judge_verdict"] = verdict_dump + return result + + def _append_verdict(self, md: str, verdict) -> str: + """Append the judge verdict as a Markdown section to the report.""" + status = "✅ SUPPORTED" if verdict.supported else "⚠️ UNSUPPORTED" + lines = [ + md, + "", + "---", + "", + "## 🧑‍⚖️ Report Validation (LLM-as-Judge)", + "", + f"**Status:** {status} ", + f"**Hallucination score:** {verdict.hallucination_score:.2f} ", + ] + if verdict.unsupported_claims: + lines.append("") + lines.append("**Unsupported claims:**") + lines.append("") + for claim in verdict.unsupported_claims: + lines.append(f"- {claim}") + if verdict.notes: + lines.append("") + lines.append(f"_Notes: {verdict.notes}_") + lines.append("") + return "\n".join(lines) def _structured_summary(self, target: str): """Flag-gated: ask the LLM for a Pydantic-validated ReportSection. 
From c9baad3e3571613a36efd6f0c4b641c1966b5da1 Mon Sep 17 00:00:00 2001 From: Evgeny Kiriyak <224408464+evkir@users.noreply.github.com> Date: Tue, 16 Jun 2026 21:37:00 +0300 Subject: [PATCH 3/4] feat(report): confidence score per finding --- cyberai/agents/report/markdown_renderer.py | 3 +++ cyberai/core/config.py | 6 ++++++ cyberai/core/scan_session.py | 3 +++ 3 files changed, 12 insertions(+) diff --git a/cyberai/agents/report/markdown_renderer.py b/cyberai/agents/report/markdown_renderer.py index 7e93fec..3f564f0 100644 --- a/cyberai/agents/report/markdown_renderer.py +++ b/cyberai/agents/report/markdown_renderer.py @@ -60,6 +60,9 @@ def render_markdown(session: PentestSession) -> str: f"{finding.description}", "", ] + if getattr(finding, "confidence", 1.0) < 1.0: + lines.append(f"**Confidence:** {finding.confidence:.0%} ⚠️") + lines.append("") if finding.cve: lines.append(f"**CVE:** `{finding.cve}`") lines.append("") diff --git a/cyberai/core/config.py b/cyberai/core/config.py index b11149c..50fbae4 100644 --- a/cyberai/core/config.py +++ b/cyberai/core/config.py @@ -43,6 +43,12 @@ class CyberAIConfig: max_cost_usd: float = 0.0 # Flag-gated: run the nuclei template engine in ExploitAgent (day 23). use_nuclei: bool = False + # Flag-gated: LLM-as-Judge validates the report vs KB evidence (day 26). + use_judge: bool = False + # Hallucination score >= threshold marks the report unsupported. + judge_threshold: float = 0.7 + # Optional more-powerful model for the judge; None = same as main LLM. + judge_model: Optional[str] = None @classmethod def from_file(cls, path: str) -> "CyberAIConfig": diff --git a/cyberai/core/scan_session.py b/cyberai/core/scan_session.py index f8b9c94..2793491 100644 --- a/cyberai/core/scan_session.py +++ b/cyberai/core/scan_session.py @@ -82,6 +82,9 @@ class Finding: evidence: List[Any] = field(default_factory=list) # Free-form structured data data: Any = None + # Confidence this finding is real, 0..1. 1.0 = fully evidenced (default). 
+ # Lowered by the LLM-as-Judge / agents when evidence is weak (day 26). + confidence: float = 1.0 def __post_init__(self) -> None: # Keep `cve` and `cve_ids` in sync for callers that use either From 272b5f245476d37fcc2a2f11f9fe3d16f4c80db2 Mon Sep 17 00:00:00 2001 From: Evgeny Kiriyak <224408464+evkir@users.noreply.github.com> Date: Tue, 16 Jun 2026 21:41:05 +0300 Subject: [PATCH 4/4] test(report): judge catches hallucinated CVE --- cyberai/agents/report/judge.py | 12 ++- tests/unit/test_judge.py | 179 +++++++++++++++++++++++++++++++++ 2 files changed, 188 insertions(+), 3 deletions(-) create mode 100644 tests/unit/test_judge.py diff --git a/cyberai/agents/report/judge.py b/cyberai/agents/report/judge.py index f13924c..4f2b68b 100644 --- a/cyberai/agents/report/judge.py +++ b/cyberai/agents/report/judge.py @@ -22,14 +22,20 @@ class JudgeVerdict(BaseModel): """Structured verdict returned by the judge LLM.""" - hallucination_score: float = Field(default=0.0, ge=0.0, le=1.0) + hallucination_score: float = 0.0 supported: bool = True unsupported_claims: List[str] = Field(default_factory=list) notes: str = "" - @field_validator("hallucination_score") + @field_validator("hallucination_score", mode="before") @classmethod - def _clamp(cls, v: float) -> float: + def _clamp(cls, v: Any) -> float: + """Clamp to [0,1] BEFORE type validation — the LLM may over/undershoot. + + A misbehaving judge returning 1.2 must not crash the report; we squash + it into range rather than raising, matching the graceful-degradation + contract of the whole judge path. + """ try: return max(0.0, min(1.0, float(v))) except (TypeError, ValueError): diff --git a/tests/unit/test_judge.py b/tests/unit/test_judge.py new file mode 100644 index 0000000..d16baa5 --- /dev/null +++ b/tests/unit/test_judge.py @@ -0,0 +1,179 @@ +"""Day 26 — LLM-as-Judge: catches hallucinated claims in reports. 
+ +The judge is an LLM call, so we mock `structured_call` and verify the +judge logic: hallucination detection, threshold authority, graceful +degradation, evidence collection, model swap, and score clamping. +""" + +from unittest.mock import MagicMock + +from cyberai.agents.report.judge import ( + JudgeVerdict, + _collect_evidence, + _judge_model, + judge_report, +) +from cyberai.core.scan_session import ScanSession, Severity + + +def _session_with_findings() -> ScanSession: + s = ScanSession(target="testhost.local") + s.add_finding( + severity=Severity.INFO, + title="Open SSH Port", + description="22/tcp open", + agent="recon", + ) + s.add_finding( + severity=Severity.CRITICAL, + title="Log4Shell", + description="JNDI lookup confirmed", + agent="exploit", + cve_ids=["CVE-2021-44228"], + ) + return s + + +def _llm_returning(payload: dict) -> MagicMock: + llm = MagicMock() + llm.config = MagicMock() + llm.config.model = "gpt-4o" + llm.structured_call.return_value = payload + return llm + + +def test_judge_catches_hallucinated_cve(): + """Report cites CVE-9999-99999 (not in KB) → judge flags it unsupported.""" + s = _session_with_findings() + llm = _llm_returning( + { + "hallucination_score": 0.9, + "supported": False, + "unsupported_claims": ["Report cites CVE-9999-99999, absent from evidence"], + "notes": "fabricated CVE", + } + ) + verdict = judge_report("...CVE-9999-99999...", s, llm) + assert verdict.supported is False + assert verdict.hallucination_score == 0.9 + assert any("9999" in c for c in verdict.unsupported_claims) + + +def test_judge_passes_clean_report(): + s = _session_with_findings() + llm = _llm_returning( + { + "hallucination_score": 0.0, + "supported": True, + "unsupported_claims": [], + "notes": "all claims backed", + } + ) + verdict = judge_report("clean report", s, llm) + assert verdict.supported is True + assert verdict.hallucination_score == 0.0 + + +def test_threshold_is_authoritative(): + """Model lies (supported=True at 0.85) → judge 
recomputes from threshold.""" + s = _session_with_findings() + llm = _llm_returning( + { + "hallucination_score": 0.85, + "supported": True, # model's claim — must be overridden + "unsupported_claims": ["x"], + "notes": "", + } + ) + verdict = judge_report("report", s, llm, threshold=0.7) + assert verdict.supported is False # 0.85 >= 0.7 + + +def test_graceful_on_exception(): + """structured_call raises → graceful pass-through verdict, no crash.""" + s = _session_with_findings() + llm = MagicMock() + llm.config = MagicMock() + llm.config.model = "gpt-4o" + llm.structured_call.side_effect = RuntimeError("API down") + verdict = judge_report("report", s, llm) + assert verdict.supported is True + assert verdict.hallucination_score == 0.0 + assert "unavailable" in verdict.notes + + +def test_graceful_on_no_llm(): + s = _session_with_findings() + verdict = judge_report("report", s, None) + assert verdict.supported is True + assert "unavailable" in verdict.notes + + +def test_collect_evidence_serializes_findings(): + s = _session_with_findings() + ev = _collect_evidence(s) + assert ev["target"] == "testhost.local" + assert len(ev["findings"]) == 2 + log4shell = [f for f in ev["findings"] if f["title"] == "Log4Shell"][0] + assert log4shell["severity"] == "CRITICAL" + assert "CVE-2021-44228" in log4shell["cve_ids"] + + +def test_collect_evidence_truncates_long_evidence(): + s = ScanSession(target="x") + s.add_finding( + severity=Severity.HIGH, + title="Big", + description="d", + agent="t", + ) + s.findings[0].evidence = ["A" * 1000] + ev = _collect_evidence(s) + assert len(ev["findings"][0]["evidence"][0]) == 500 + + +def test_judge_model_swap_and_restore(): + """_judge_model temporarily swaps config.model and restores it.""" + llm = MagicMock() + llm.config = MagicMock() + llm.config.model = "gpt-4o" + with _judge_model(llm, "gpt-4o-judge"): + assert llm.config.model == "gpt-4o-judge" + assert llm.config.model == "gpt-4o" + + +def test_judge_model_noop_when_none(): + llm = 
MagicMock() + llm.config = MagicMock() + llm.config.model = "gpt-4o" + with _judge_model(llm, None): + assert llm.config.model == "gpt-4o" + assert llm.config.model == "gpt-4o" + + +def test_judge_model_used_in_call(): + """judge_model is applied during the structured_call.""" + s = _session_with_findings() + captured = {} + + def _capture(*args, **kwargs): + captured["model"] = llm.config.model + return { + "hallucination_score": 0.0, + "supported": True, + "unsupported_claims": [], + "notes": "", + } + + llm = MagicMock() + llm.config = MagicMock() + llm.config.model = "gpt-4o" + llm.structured_call.side_effect = _capture + judge_report("r", s, llm, judge_model="big-model") + assert captured["model"] == "big-model" + assert llm.config.model == "gpt-4o" # restored + + +def test_verdict_score_clamped(): + assert JudgeVerdict(hallucination_score=1.5).hallucination_score == 1.0 + assert JudgeVerdict(hallucination_score=-0.3).hallucination_score == 0.0