From 5c794d3bc6e89bd4a95e0597d2bd35fcc6ba6294 Mon Sep 17 00:00:00 2001
From: Jake Present <jakepresent@microsoft.com>
Date: Tue, 26 May 2026 13:56:12 -0400
Subject: [PATCH 1/2] viewer: split policy violation metrics by permissibility

---
 tests/test_viewer_server_artifacts.py         | 190 ++++++++++++++++++
 viewer/src/lib/server/artifacts.ts            |  41 ++++
 viewer/src/lib/server/data.ts                 |  44 +++-
 viewer/src/lib/server/metrics.ts              |  88 +++++++-
 viewer/src/lib/types.ts                       |   4 +
 .../suite/[suite_id]/[run_id]/+page.svelte    |  74 +++++++
 6 files changed, 430 insertions(+), 11 deletions(-)

diff --git a/tests/test_viewer_server_artifacts.py b/tests/test_viewer_server_artifacts.py
index 8cbdb500..fb3910d3 100644
--- a/tests/test_viewer_server_artifacts.py
+++ b/tests/test_viewer_server_artifacts.py
@@ -1723,6 +1723,194 @@ def test_list_suites_marks_scenario_only_scored_suite_as_has_results(self) -> No
             self.assertEqual(payload[0]["status"], "has_results")
 
 
+    def test_run_metrics_policy_violation_by_permissibility(self) -> None:
+        with TemporaryDirectory(dir=ROOT / "viewer") as tmp_dir:
+            harness_dir = Path(tmp_dir) / "harness"
+            harness_dir.mkdir()
+            self._copy_data_harness(harness_dir)
+            metrics_path = harness_dir / "metrics.ts"
+
+            script = textwrap.dedent(
+                f"""\
+                const {{ computeRunMetrics }} = await import({json.dumps(metrics_path.as_uri())});
+                const behaviors = [
+                  {{ name: 'perm_a', definition: '', examples: [], permissible: true }},
+                  {{ name: 'notperm_a', definition: '', examples: [], permissible: false }}
+                ];
+                const samples = [
+                  {{
+                    test_case_id: 'seed-1',
+                    prompt: 'prompt',
+                    response: 'response',
+                    behavior: 'perm_a',
+                    target: 'target-model',
+                    judge_model: 'judge-model',
+                    judge_status: 'ok',
+                    verdict: {{
+                      dimensions: {{ policy_violation: true, overrefusal: true }},
+                      justification: '',
+                      node_judgments: [
+                        {{ node_index: 0, node_name: 'perm_a', relevant: true, violated: true, confidence: 'high', evidence_turns: [], reasoning: '' }},
+                        {{ node_index: 1, node_name: 'notperm_a', relevant: false, violated: null, confidence: null, evidence_turns: [], reasoning: '' }}
+                      ]
+                    }}
+                  }},
+                  {{
+                    test_case_id: 'seed-2',
+                    prompt: 'prompt',
+                    response: 'response',
+                    behavior: 'notperm_a',
+                    target: 'target-model',
+                    judge_model: 'judge-model',
+                    judge_status: 'ok',
+                    verdict: {{
+                      dimensions: {{ policy_violation: false, overrefusal: false }},
+                      justification: '',
+                      node_judgments: [
+                        {{ node_index: 0, node_name: 'perm_a', relevant: true, violated: false, confidence: 'high', evidence_turns: [], reasoning: '' }},
+                        {{ node_index: 1, node_name: 'notperm_a', relevant: true, violated: true, confidence: 'high', evidence_turns: [], reasoning: '' }},
+                        {{ node_index: 2, node_name: 'unknown_node', relevant: true, violated: true, confidence: 'high', evidence_turns: [], reasoning: '' }}
+                      ]
+                    }}
+                  }}
+                ];
+                const withBehaviors = computeRunMetrics(samples, behaviors);
+                const withoutBehaviors = computeRunMetrics(samples, []);
+                console.log(JSON.stringify({{ withBehaviors, withoutBehaviors }}));
+                """
+            )
+            result = self._run_node(
+                harness_dir=harness_dir,
+                script=script,
+                env=os.environ.copy(),
+            )
+            self.assertEqual(result.returncode, 0, msg=f"{result.stdout}\n{result.stderr}")
+            payload = json.loads(result.stdout)
+
+            with_behaviors = payload["withBehaviors"]
+            self.assertEqual(with_behaviors["scored_total"], 2)
+            permissible = with_behaviors["policy_violation_on_permissible"]
+            self.assertIsNotNone(permissible)
+            self.assertEqual(permissible["count"], 2)
+            self.assertEqual(permissible["flagged_count"], 1)
+            self.assertEqual(permissible["clear_count"], 1)
+            self.assertAlmostEqual(permissible["rate"], 0.5)
+
+            not_permissible = with_behaviors["policy_violation_on_not_permissible"]
+            self.assertIsNotNone(not_permissible)
+            self.assertEqual(not_permissible["count"], 1)
+            self.assertEqual(not_permissible["flagged_count"], 1)
+            self.assertEqual(not_permissible["clear_count"], 0)
+            self.assertAlmostEqual(not_permissible["rate"], 1.0)
+
+            without_behaviors = payload["withoutBehaviors"]
+            self.assertIsNone(without_behaviors["policy_violation_on_permissible"])
+            self.assertIsNone(without_behaviors["policy_violation_on_not_permissible"])
+
+    def test_audit_run_metrics_policy_violation_by_permissibility(self) -> None:
+        with TemporaryDirectory(dir=ROOT / "viewer") as tmp_dir:
+            harness_dir = Path(tmp_dir) / "harness"
+            harness_dir.mkdir()
+            self._copy_data_harness(harness_dir)
+            metrics_path = harness_dir / "metrics.ts"
+
+            script = textwrap.dedent(
+                f"""\
+                const {{ computeAuditRunMetrics }} = await import({json.dumps(metrics_path.as_uri())});
+                const behaviors = [
+                  {{ name: 'perm_a', definition: '', examples: [], permissible: true }},
+                  {{ name: 'notperm_a', definition: '', examples: [], permissible: false }}
+                ];
+                const scores = [
+                  {{
+                    test_case_id: 'scenario-1',
+                    target: 'target-model',
+                    tester_model: 'tester-model',
+                    judge_model: 'judge-model',
+                    judge_status: 'ok',
+                    verdict: {{
+                      dimensions: {{ policy_violation: true, overrefusal: false }},
+                      justification: '',
+                      node_judgments: [
+                        {{ node_index: 0, node_name: 'notperm_a', relevant: true, violated: true, confidence: 'high', evidence_turns: [], reasoning: '' }},
+                        {{ node_index: 1, node_name: 'perm_a', relevant: true, violated: false, confidence: 'high', evidence_turns: [], reasoning: '' }}
+                      ]
+                    }}
+                  }}
+                ];
+                const metrics = computeAuditRunMetrics(scores, behaviors);
+                console.log(JSON.stringify(metrics));
+                """
+            )
+            result = self._run_node(
+                harness_dir=harness_dir,
+                script=script,
+                env=os.environ.copy(),
+            )
+            self.assertEqual(result.returncode, 0, msg=f"{result.stdout}\n{result.stderr}")
+            metrics = json.loads(result.stdout)
+
+            self.assertEqual(metrics["scored_total"], 1)
+            permissible = metrics["policy_violation_on_permissible"]
+            self.assertIsNotNone(permissible)
+            self.assertEqual(permissible["count"], 1)
+            self.assertEqual(permissible["flagged_count"], 0)
+            self.assertEqual(permissible["clear_count"], 1)
+            self.assertAlmostEqual(permissible["rate"], 0.0)
+
+            not_permissible = metrics["policy_violation_on_not_permissible"]
+            self.assertIsNotNone(not_permissible)
+            self.assertEqual(not_permissible["count"], 1)
+            self.assertEqual(not_permissible["flagged_count"], 1)
+            self.assertEqual(not_permissible["clear_count"], 0)
+            self.assertAlmostEqual(not_permissible["rate"], 1.0)
+
+    def test_load_run_judge_taxonomy_prefers_run_config(self) -> None:
+        with TemporaryDirectory(dir=ROOT / "viewer") as tmp_dir:
+            harness_dir = Path(tmp_dir) / "harness"
+            harness_dir.mkdir()
+            self._copy_data_harness(harness_dir)
+            artifacts_path = harness_dir / "artifacts.ts"
+
+            suite_dir = Path(tmp_dir) / "demo-suite"
+            run_dir = suite_dir / "demo-run"
+            run_dir.mkdir(parents=True)
+            judge_taxonomy = {
+                "behavior": {"name": "demo", "definition": "demo"},
+                "behavior_categories": [
+                    {"name": "judge_only", "definition": "", "examples": [], "permissible": False}
+                ],
+            }
+            taxonomy_path = suite_dir / "taxonomy.override.json"
+            taxonomy_path.write_text(json.dumps(judge_taxonomy), encoding="utf-8")
+            (run_dir / "config.yaml").write_text(
+                f"pipeline:\n  judge:\n    taxonomy_path: {taxonomy_path}\n",
+                encoding="utf-8",
+            )
+
+            script = textwrap.dedent(
+                f"""\
+                const {{ loadRunJudgeTaxonomyForRun, loadRunJudgeTaxonomy, loadRunJudgeTaxonomyFromArtifacts }} = await import({json.dumps(artifacts_path.as_uri())});
+                const fromRun = loadRunJudgeTaxonomyForRun('demo-suite', 'demo-run');
+                const fromConfig = loadRunJudgeTaxonomy({{ pipeline: {{ judge: {{ taxonomy_path: {json.dumps(str(taxonomy_path))} }} }} }});
+                const fromArtifact = loadRunJudgeTaxonomyFromArtifacts({{ suite: 'demo-suite' }}, {{ systematize: {{ path: 'taxonomy.override.json' }} }});
+                const fromMissing = loadRunJudgeTaxonomy({{ pipeline: {{ judge: {{}} }} }});
+                console.log(JSON.stringify({{ fromRun, fromConfig, fromArtifact, fromMissing }}));
+                """
+            )
+            env = os.environ.copy()
+            env["ARTIFACTS_ROOT"] = str(Path(tmp_dir))
+            result = self._run_node(harness_dir=harness_dir, script=script, env=env)
+            self.assertEqual(result.returncode, 0, msg=f"{result.stdout}\n{result.stderr}")
+            payload = json.loads(result.stdout)
+
+            self.assertEqual(payload["fromRun"], judge_taxonomy)
+            self.assertEqual(payload["fromConfig"], judge_taxonomy)
+            self.assertEqual(payload["fromArtifact"], judge_taxonomy)
+            self.assertIsNone(payload["fromMissing"])
+
+
+
 class ViewerReadModelHelpersTest(unittest.TestCase):
     """Tests for path-traversal defenses that don't depend on Node TS support."""
 
@@ -1822,5 +2010,7 @@ def test_test_set_artifact_path_rejects_paths_that_normalize_to_directory(self)
                 )
 
 
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/viewer/src/lib/server/artifacts.ts b/viewer/src/lib/server/artifacts.ts
index 65a023c5..83c55c66 100644
--- a/viewer/src/lib/server/artifacts.ts
+++ b/viewer/src/lib/server/artifacts.ts
@@ -556,6 +556,47 @@ function readObject(value: unknown): Record<string, unknown> | null {
 		: null;
 }
 
+export function loadRunJudgeTaxonomy(config: Record<string, unknown> | null): Taxonomy | null {
+	return loadRunJudgeTaxonomyFromArtifacts(config, null);
+}
+
+export function loadRunJudgeTaxonomyFromArtifacts(
+	config: Record<string, unknown> | null,
+	artifacts: Record<string, unknown> | null
+): Taxonomy | null {
+	const systematize = readObject(artifacts?.systematize);
+	const artifactTaxonomyPath = typeof systematize?.path === 'string' ? systematize.path : null;
+	if (artifactTaxonomyPath) {
+		const resolvedArtifactPath = manifestArtifactPath(suiteDirPathFromConfig(config), artifactTaxonomyPath);
+		const artifactTaxonomy = resolvedArtifactPath
+			? readJsonFile<Taxonomy>(resolvedArtifactPath, { missingOk: true })
+			: null;
+		if (artifactTaxonomy) return artifactTaxonomy;
+	}
+
+	const pipeline = readObject(config?.pipeline);
+	const judge = readObject(pipeline?.judge);
+	const rawTaxonomyPath = typeof judge?.taxonomy_path === 'string' ? judge.taxonomy_path : null;
+	if (!rawTaxonomyPath) return null;
+
+	const resolved = path.resolve(rawTaxonomyPath);
+	return readJsonFile<Taxonomy>(resolved, { missingOk: true });
+}
+
+function suiteDirPathFromConfig(config: Record<string, unknown> | null): string {
+	const suite = typeof config?.suite === 'string' ? config.suite : null;
+	return suite ? suiteDirPath(suite) : ARTIFACTS_ROOT;
+}
+
+export function loadRunJudgeTaxonomyForRun(suiteId: string, runId: string): Taxonomy | null {
+	const runDir = runDirPath(suiteId, runId);
+	const config = readYamlFile<Record<string, unknown>>(path.join(runDir, RUN_CONFIG_FILE), {
+		missingOk: true
+	});
+	const manifest = readJsonFile<Manifest>(path.join(runDir, RUN_MANIFEST_FILE), { missingOk: true });
+	return loadRunJudgeTaxonomyFromArtifacts(config, manifest?.artifact_versions ?? null);
+}
+
 export function loadRunRuntimeMode(config: Record<string, unknown> | null): string | null {
 	const pipeline = readObject(config?.pipeline);
 	const inference = readObject(pipeline?.inference);
diff --git a/viewer/src/lib/server/data.ts b/viewer/src/lib/server/data.ts
index 1a876dd2..9a02de2e 100644
--- a/viewer/src/lib/server/data.ts
+++ b/viewer/src/lib/server/data.ts
@@ -6,6 +6,8 @@ import {
 	ViewerReadModelError,
 	loadIndexedRunScoreRow,
 	loadIndexedRunTranscriptRow,
+	loadRunJudgeTaxonomyForRun,
+	loadRunJudgeTaxonomyFromArtifacts,
 	loadRunRuntimeMode,
 	loadRunScoreRow,
 	loadRunTranscriptRow,
@@ -64,6 +66,8 @@ interface PromptMetricView {
 	counts: BinaryCounts;
 	policyViolationRate: number;
 	overrefusalRate: number;
+	policyViolationOnPermissible: DimensionMetrics | null;
+	policyViolationOnNotPermissible: DimensionMetrics | null;
 	dimensions: Record<string, DimensionMetrics>;
 	target: string;
 	judge_model: string;
@@ -77,6 +81,8 @@ interface AuditMetricView {
 	counts: BinaryCounts;
 	policyViolationRate: number;
 	overrefusalRate: number;
+	policyViolationOnPermissible: DimensionMetrics | null;
+	policyViolationOnNotPermissible: DimensionMetrics | null;
 	dimensions: Record<string, DimensionMetrics>;
 	target: string;
 	tester_model: string;
@@ -179,6 +185,15 @@ function normalizeBehavior(b: Behavior): Behavior {
 	return { ...b, permissible: b.permissible ?? false };
 }
 
+function metricBehaviors(
+	snapshot: SuiteSnapshot | null,
+	runConfig?: Record<string, unknown> | null,
+	artifacts?: Record<string, unknown> | null
+): Behavior[] {
+	const judgeTaxonomy = loadRunJudgeTaxonomyFromArtifacts(runConfig ?? null, artifacts ?? null);
+	return (judgeTaxonomy?.behavior_categories ?? snapshot?.taxonomy?.behavior_categories ?? []).map(normalizeBehavior);
+}
+
 function normalizePolicy(taxonomy: Taxonomy | null | undefined): Taxonomy | null {
 	if (!taxonomy) return null;
 	const behavior = taxonomy.behavior ?? taxonomy.risk;
@@ -649,6 +664,7 @@ function buildRunListEntries(snapshot: SuiteSnapshot): {
 		const hasPromptScores = promptScores.length > 0;
 		const hasAuditScores = auditScores.length > 0;
 		const hasScoreStage = manifest?.stages?.judge != null;
+		const behaviors = metricBehaviors(snapshot, runSnapshot.config, manifest?.artifact_versions ?? null);
 
 		if ((hasPromptScores || hasScoreStage) && !(manifest?.status === 'failed' && !hasPromptScores)) {
 			runs.push({
@@ -656,7 +672,7 @@ function buildRunListEntries(snapshot: SuiteSnapshot): {
 				has_judged: hasPromptScores,
 				has_scenario_scores: hasAuditScores,
 				manifest,
-				metrics: hasPromptScores ? computeRunMetrics(promptScores) : null
+				metrics: hasPromptScores ? computeRunMetrics(promptScores, behaviors) : null
 			});
 		}
 
@@ -665,7 +681,7 @@ function buildRunListEntries(snapshot: SuiteSnapshot): {
 				run_id: runId,
 				has_scores: hasAuditScores,
 				manifest,
-				metrics: hasAuditScores ? computeAuditRunMetrics(auditScores) : null
+				metrics: hasAuditScores ? computeAuditRunMetrics(auditScores, behaviors) : null
 			});
 		}
 	}
@@ -682,6 +698,8 @@ function buildZeroPromptMetrics(): PromptMetricView {
 		counts: emptyScoreCounts(),
 		policyViolationRate: 0,
 		overrefusalRate: 0,
+		policyViolationOnPermissible: null,
+		policyViolationOnNotPermissible: null,
 		dimensions: {},
 		target: '',
 		judge_model: ''
@@ -697,6 +715,8 @@ function buildZeroAuditMetrics(): AuditMetricView {
 		counts: emptyScoreCounts(),
 		policyViolationRate: 0,
 		overrefusalRate: 0,
+		policyViolationOnPermissible: null,
+		policyViolationOnNotPermissible: null,
 		dimensions: {},
 		target: '',
 		tester_model: '',
@@ -714,6 +734,8 @@ function toPromptMetricView(metrics: RunMetrics | null): PromptMetricView {
 		counts: metrics.counts,
 		policyViolationRate: metrics.policy_violation_rate,
 		overrefusalRate: metrics.overrefusal_rate,
+		policyViolationOnPermissible: metrics.policy_violation_on_permissible,
+		policyViolationOnNotPermissible: metrics.policy_violation_on_not_permissible,
 		dimensions: metrics.dimensions,
 		target: metrics.target,
 		judge_model: metrics.judge_model
@@ -730,6 +752,8 @@ function toAuditMetricView(metrics: AuditRunMetrics | null): AuditMetricView {
 		counts: metrics.counts,
 		policyViolationRate: metrics.policy_violation_rate,
 		overrefusalRate: metrics.overrefusal_rate,
+		policyViolationOnPermissible: metrics.policy_violation_on_permissible,
+		policyViolationOnNotPermissible: metrics.policy_violation_on_not_permissible,
 		dimensions: metrics.dimensions,
 		target: metrics.target,
 		tester_model: metrics.tester_model,
@@ -1080,6 +1104,7 @@ async function loadSuiteHeavyData(
 		const hasPromptScores = promptScores.length > 0;
 		const hasAuditScores = auditScores.length > 0;
 		const hasScoreStage = manifest?.stages?.judge != null;
+		const behaviors = metricBehaviors(snapshot, runSnapshot.config, manifest?.artifact_versions ?? null);
 
 		const addedToRuns =
 			(hasPromptScores || hasScoreStage) &&
@@ -1094,7 +1119,7 @@ async function loadSuiteHeavyData(
 				has_judged: hasPromptScores,
 				has_scenario_scores: hasAuditScores,
 				manifest,
-				metrics: hasPromptScores ? computeRunMetrics(promptScores) : null
+				metrics: hasPromptScores ? computeRunMetrics(promptScores, behaviors) : null
 			});
 		}
 		if (addedToAuditRuns) {
@@ -1102,7 +1127,7 @@ async function loadSuiteHeavyData(
 				run_id: runId,
 				has_scores: hasAuditScores,
 				manifest,
-				metrics: hasAuditScores ? computeAuditRunMetrics(auditScores) : null
+				metrics: hasAuditScores ? computeAuditRunMetrics(auditScores, behaviors) : null
 			});
 		}
 
@@ -1204,8 +1229,10 @@ function loadCompletedRunPageData(
 	const samples = resolvedTab === 'prompts' ? promptRows : [];
 	const auditScores = resolvedTab === 'audit' ? auditRows : [];
 	const scenarioSeeds = buildScenarioSeeds(suiteSnapshot);
-	const promptMetrics = resolvedTab === 'prompts' ? computeRunMetrics(samples) : null;
-	const auditMetrics = resolvedTab === 'audit' ? computeAuditRunMetrics(auditScores) : null;
+	const judgeTaxonomy = loadRunJudgeTaxonomyForRun(suiteId, runId);
+	const behaviors = (judgeTaxonomy?.behavior_categories ?? suiteSnapshot?.taxonomy?.behavior_categories ?? []).map(normalizeBehavior);
+	const promptMetrics = resolvedTab === 'prompts' ? computeRunMetrics(samples, behaviors) : null;
+	const auditMetrics = resolvedTab === 'audit' ? computeAuditRunMetrics(auditScores, behaviors) : null;
 
 	return {
 		suite_id: suiteId,
@@ -1270,8 +1297,9 @@ export function loadRunPageData(suiteId: string, runId: string, activeTab: 'prom
 
 	const scenarioSeeds = buildScenarioSeeds(suiteSnapshot);
 	const promptSeedTitleMap = buildPromptSeedTitleMap(suiteSnapshot);
-	const promptMetrics = resolvedTab === 'prompts' ? computeRunMetrics(samples) : null;
-	const auditMetrics = resolvedTab === 'audit' ? computeAuditRunMetrics(auditScores) : null;
+	const behaviors = metricBehaviors(suiteSnapshot, runSnapshot.config, runSnapshot.manifest?.artifact_versions ?? null);
+	const promptMetrics = resolvedTab === 'prompts' ? computeRunMetrics(samples, behaviors) : null;
+	const auditMetrics = resolvedTab === 'audit' ? computeAuditRunMetrics(auditScores, behaviors) : null;
 	const scenarioSeedMap = resolvedTab === 'audit' ? buildScenarioSeedMap(scenarioSeeds, auditScores) : {};
 
 	return {
diff --git a/viewer/src/lib/server/metrics.ts b/viewer/src/lib/server/metrics.ts
index 0d140be0..b4a3ca94 100644
--- a/viewer/src/lib/server/metrics.ts
+++ b/viewer/src/lib/server/metrics.ts
@@ -4,7 +4,16 @@ import {
 	isBooleanFlag,
 	isSuccessfulJudgment
 } from '$lib/judgment.js';
-import type { AuditScore, AuditRunMetrics, BinaryCounts, DimensionMetrics, JudgedSample, RunMetrics } from '$lib/types.js';
+import type {
+	AuditScore,
+	AuditRunMetrics,
+	Behavior,
+	BinaryCounts,
+	DimensionMetrics,
+	JudgedSample,
+	NodeJudgment,
+	RunMetrics
+} from '$lib/types.js';
 import { loadDimensions } from './dimensions.js';
 
 type EventScoredRecord = {
@@ -22,6 +31,67 @@ export function emptyScoreCounts(): BinaryCounts {
 	return { 0: 0, 1: 0 };
 }
 
+function emptyDimensionAggregate(): EventDimensionAggregate {
+	return { count: 0, flagged_count: 0, clear_count: 0, counts: emptyScoreCounts() };
+}
+
+function finalizeDimensionAggregate(aggregate: EventDimensionAggregate): DimensionMetrics {
+	return {
+		rate: aggregate.count > 0 ? aggregate.flagged_count / aggregate.count : 0,
+		count: aggregate.count,
+		flagged_count: aggregate.flagged_count,
+		clear_count: aggregate.clear_count,
+		counts: aggregate.counts
+	};
+}
+
+function readNodeJudgments(verdict: Record<string, unknown> | null | undefined): NodeJudgment[] {
+	if (!verdict || typeof verdict !== 'object') return [];
+	const nodes = (verdict as Record<string, unknown>).node_judgments;
+	return Array.isArray(nodes) ? (nodes as NodeJudgment[]) : [];
+}
+
+function buildPermissibilityIndex(behaviors: Behavior[]): Map<string, boolean> {
+	const index = new Map<string, boolean>();
+	for (const behavior of behaviors) {
+		if (!behavior || typeof behavior.name !== 'string') continue;
+		index.set(behavior.name, behavior.permissible === true);
+	}
+	return index;
+}
+
+export function computePolicyViolationByPermissibility(
+	records: EventScoredRecord[],
+	behaviors: Behavior[]
+): { permissible: DimensionMetrics | null; not_permissible: DimensionMetrics | null } {
+	if (!behaviors || behaviors.length === 0) {
+		return { permissible: null, not_permissible: null };
+	}
+	const permissibilityIndex = buildPermissibilityIndex(behaviors);
+	if (permissibilityIndex.size === 0) {
+		return { permissible: null, not_permissible: null };
+	}
+
+	const permissible = emptyDimensionAggregate();
+	const notPermissible = emptyDimensionAggregate();
+
+	for (const record of records) {
+		for (const node of readNodeJudgments(record.verdict)) {
+			if (node.relevant !== true) continue;
+			if (!isBooleanFlag(node.violated)) continue;
+			const name = typeof node.node_name === 'string' ? node.node_name.trim() : '';
+			if (!name || !permissibilityIndex.has(name)) continue;
+			const bucket = permissibilityIndex.get(name) ? permissible : notPermissible;
+			addFlag(bucket, node.violated);
+		}
+	}
+
+	return {
+		permissible: finalizeDimensionAggregate(permissible),
+		not_permissible: finalizeDimensionAggregate(notPermissible)
+	};
+}
+
 function collectDimensionNames(records: EventScoredRecord[]): string[] {
 	const names = new Set<string>();
 	for (const record of records) {
@@ -77,7 +147,10 @@ function dimensionRate(dimensions: Record<string, DimensionMetrics>, name: strin
 	return dimensions[name]?.rate ?? 0;
 }
 
-export function computeAuditRunMetrics(scores: AuditScore[]): AuditRunMetrics | null {
+export function computeAuditRunMetrics(
+	scores: AuditScore[],
+	behaviors: Behavior[] = []
+): AuditRunMetrics | null {
 	if (scores.length === 0) return null;
 
 	const requiredBaseMetrics = getRequiredBaseMetricNames(loadDimensions());
@@ -100,6 +173,7 @@ export function computeAuditRunMetrics(scores: AuditScore[]): AuditRunMetrics |
 	const dimensions = finalizeDimensions(dimensionAggregates);
 	const total = scores.length;
 	const scoredTotal = scoredScores.length;
+	const permissibilitySplit = computePolicyViolationByPermissibility(scoredScores, behaviors);
 
 	return {
 		total,
@@ -109,6 +183,8 @@ export function computeAuditRunMetrics(scores: AuditScore[]): AuditRunMetrics |
 		counts,
 		policy_violation_rate: dimensionRate(dimensions, 'policy_violation'),
 		overrefusal_rate: dimensionRate(dimensions, 'overrefusal'),
+		policy_violation_on_permissible: permissibilitySplit.permissible,
+		policy_violation_on_not_permissible: permissibilitySplit.not_permissible,
 		dimensions,
 		target: scores[0]?.target ?? '',
 		tester_model: scores[0]?.tester_model ?? '',
@@ -116,7 +192,10 @@ export function computeAuditRunMetrics(scores: AuditScore[]): AuditRunMetrics |
 	};
 }
 
-export function computeRunMetrics(samples: JudgedSample[]): RunMetrics | null {
+export function computeRunMetrics(
+	samples: JudgedSample[],
+	behaviors: Behavior[] = []
+): RunMetrics | null {
 	if (samples.length === 0) return null;
 
 	const requiredBaseMetrics = getRequiredBaseMetricNames(loadDimensions());
@@ -137,6 +216,7 @@ export function computeRunMetrics(samples: JudgedSample[]): RunMetrics | null {
 	}
 
 	const dimensions = finalizeDimensions(dimensionAggregates);
+	const permissibilitySplit = computePolicyViolationByPermissibility(scoredSamples, behaviors);
 
 	return {
 		total: samples.length,
@@ -147,6 +227,8 @@ export function computeRunMetrics(samples: JudgedSample[]): RunMetrics | null {
 		counts,
 		policy_violation_rate: dimensionRate(dimensions, 'policy_violation'),
 		overrefusal_rate: dimensionRate(dimensions, 'overrefusal'),
+		policy_violation_on_permissible: permissibilitySplit.permissible,
+		policy_violation_on_not_permissible: permissibilitySplit.not_permissible,
 		target: samples[0]?.target ?? '—',
 		judge_model: samples[0]?.judge_model ?? '—',
 		dimensions
diff --git a/viewer/src/lib/types.ts b/viewer/src/lib/types.ts
index 05263138..40ee9b5c 100644
--- a/viewer/src/lib/types.ts
+++ b/viewer/src/lib/types.ts
@@ -273,6 +273,8 @@ export interface RunMetrics {
 	counts: BinaryCounts;
 	policy_violation_rate: number;
 	overrefusal_rate: number;
+	policy_violation_on_permissible: DimensionMetrics | null;
+	policy_violation_on_not_permissible: DimensionMetrics | null;
 	target: string;
 	judge_model: string;
 	dimensions: Record<string, DimensionMetrics>;
@@ -301,6 +303,8 @@ export interface AuditRunMetrics {
 	counts: BinaryCounts;
 	policy_violation_rate: number;
 	overrefusal_rate: number;
+	policy_violation_on_permissible: DimensionMetrics | null;
+	policy_violation_on_not_permissible: DimensionMetrics | null;
 	dimensions: Record<string, DimensionMetrics>;
 	target: string;
 	tester_model: string;
diff --git a/viewer/src/routes/suite/[suite_id]/[run_id]/+page.svelte b/viewer/src/routes/suite/[suite_id]/[run_id]/+page.svelte
index 76d16059..da8cf99e 100644
--- a/viewer/src/routes/suite/[suite_id]/[run_id]/+page.svelte
+++ b/viewer/src/routes/suite/[suite_id]/[run_id]/+page.svelte
@@ -919,6 +919,43 @@
 			</p>
 		{/if}
 
+
+		{#if data.metrics.policyViolationOnPermissible || data.metrics.policyViolationOnNotPermissible}
+			{@const promptPerm = data.metrics.policyViolationOnPermissible}
+			{@const promptNotPerm = data.metrics.policyViolationOnNotPermissible}
+			{#if (promptPerm?.count ?? 0) + (promptNotPerm?.count ?? 0) > 0}
+				<div class="mb-4 grid gap-3 sm:grid-cols-2" title="Per-behavior judgments aggregated across prompts. Denominator is judgments the judge marked relevant for that behavior.">
+					{#each [
+						{ key: 'permissible', label: 'Allowed requests failed', summary: promptPerm, hint: 'Requests the assistant should have helped with, but instead refused, deflected, or failed to complete.' },
+						{ key: 'not_permissible', label: 'Blocked requests failed', summary: promptNotPerm, hint: 'Requests the assistant should have blocked, but instead helped with or allowed.' }
+					] as card (card.key)}
+						{@const summary = card.summary}
+						<div class="rounded-lg border border-border bg-surface px-5 py-4">
+							<div class="text-[11px] font-semibold uppercase tracking-wider text-text-muted">{card.label}</div>
+							<p class="mt-0.5 text-[10px] text-text-muted/60 leading-snug line-clamp-2">{card.hint}</p>
+							<div class="mt-2 flex items-baseline gap-1.5">
+								<span class="text-3xl font-bold tabular-nums {metricRateClass(summary ? summary.rate : 0)}">{summary && summary.count > 0 ? metricRateText(summary.rate) : '—'}</span>
+								<span class="text-sm text-text-muted">{summary && summary.count > 0 ? 'violated' : 'no relevant judgments'}</span>
+							</div>
+							{#if summary && summary.count > 0}
+								{@const pct = binaryBar(summary.counts)}
+								<div class="mt-2.5 flex h-1.5 overflow-hidden rounded-full bg-border/50">
+									{#if pct.clear > 0}<div class="bg-score-pass" style="width: {pct.clear}%"></div>{/if}
+									{#if pct.flagged > 0}<div class="bg-score-fail" style="width: {pct.flagged}%"></div>{/if}
+								</div>
+								<div class="mt-1 flex flex-wrap justify-between gap-x-2 gap-y-0.5 text-[9px] tabular-nums text-text-muted">
+									<span>{summary.clear_count} clear</span>
+									<span>{summary.flagged_count} violated</span>
+									<span>{summary.count} relevant judgments</span>
+								</div>
+							{/if}
+							<div class="mt-2 text-[9px] text-text-muted">aggregated over relevant behavior judgments, not over prompts</div>
+						</div>
+					{/each}
+				</div>
+			{/if}
+		{/if}
+
 		<!-- Category Accordion -->
 		<section class="mb-8">
 			<div class="mb-4 border-b border-border pb-2">
@@ -1199,6 +1236,43 @@
 			</p>
 		{/if}
 
+
+		{#if data.auditMetrics.policyViolationOnPermissible || data.auditMetrics.policyViolationOnNotPermissible}
+			{@const auditPerm = data.auditMetrics.policyViolationOnPermissible}
+			{@const auditNotPerm = data.auditMetrics.policyViolationOnNotPermissible}
+			{#if (auditPerm?.count ?? 0) + (auditNotPerm?.count ?? 0) > 0}
+				<div class="mb-4 grid gap-3 sm:grid-cols-2" title="Per-behavior judgments aggregated across scenarios. Denominator is judgments the judge marked relevant for that behavior.">
+					{#each [
+						{ key: 'permissible', label: 'Allowed requests failed', summary: auditPerm, hint: 'Requests the assistant should have helped with, but instead refused, deflected, or failed to complete.' },
+						{ key: 'not_permissible', label: 'Blocked requests failed', summary: auditNotPerm, hint: 'Requests the assistant should have blocked, but instead helped with or allowed.' }
+					] as card (card.key)}
+						{@const summary = card.summary}
+						<div class="rounded-lg border border-border bg-surface px-5 py-4">
+							<div class="text-[11px] font-semibold uppercase tracking-wider text-text-muted">{card.label}</div>
+							<p class="mt-0.5 text-[10px] text-text-muted/60 leading-snug line-clamp-2">{card.hint}</p>
+							<div class="mt-2 flex items-baseline gap-1.5">
+								<span class="text-3xl font-bold tabular-nums {metricRateClass(summary ? summary.rate : 0)}">{summary && summary.count > 0 ? metricRateText(summary.rate) : '—'}</span>
+								<span class="text-sm text-text-muted">{summary && summary.count > 0 ? 'violated' : 'no relevant judgments'}</span>
+							</div>
+							{#if summary && summary.count > 0}
+								{@const pct = binaryBar(summary.counts)}
+								<div class="mt-2.5 flex h-1.5 overflow-hidden rounded-full bg-border/50">
+									{#if pct.clear > 0}<div class="bg-score-pass" style="width: {pct.clear}%"></div>{/if}
+									{#if pct.flagged > 0}<div class="bg-score-fail" style="width: {pct.flagged}%"></div>{/if}
+								</div>
+								<div class="mt-1 flex flex-wrap justify-between gap-x-2 gap-y-0.5 text-[9px] tabular-nums text-text-muted">
+									<span>{summary.clear_count} clear</span>
+									<span>{summary.flagged_count} violated</span>
+									<span>{summary.count} relevant judgments</span>
+								</div>
+							{/if}
+							<div class="mt-2 text-[9px] text-text-muted">aggregated over relevant behavior judgments, not over scenarios</div>
+						</div>
+					{/each}
+				</div>
+			{/if}
+		{/if}
+
 		<!-- Audit Category Accordion -->
 		<section class="mb-8">
 			<div class="mb-4 border-b border-border pb-2">

From 86ae430d348b2eb7dc1959c099143e61440ca6f7 Mon Sep 17 00:00:00 2001
From: Jake Present <jakepresent@microsoft.com>
Date: Tue, 26 May 2026 14:29:29 -0400
Subject: [PATCH 2/2] fix: omit unsupported GPT-5 temperature overrides

---
 p2m/core/model_client.py   | 40 ++++++++++++++++--
 tests/test_model_client.py | 84 ++++++++++++++++++++++++++++++++++++--
 2 files changed, 117 insertions(+), 7 deletions(-)

diff --git a/p2m/core/model_client.py b/p2m/core/model_client.py
index 5d9945ff..3cf11b8b 100644
--- a/p2m/core/model_client.py
+++ b/p2m/core/model_client.py
@@ -311,6 +311,36 @@ def _supports_web_search_preview(model: str) -> bool:
     return _model_family(model) in {"openai", "azure"}
 
 
+def _model_name(model: str) -> str:
+    normalized = (model or "").strip().lower()
+    if "/" in normalized:
+        return normalized.split("/", 1)[1]
+    return normalized
+
+
+def _supports_custom_temperature(model: str) -> bool:
+    """Whether non-default temperature values are supported for this model.
+
+    GPT-5.x deployments on OpenAI/Azure reject explicit non-default
+    temperatures. LiteLLM's Azure model string uses the deployment name after
+    ``azure/``, so match prefixes instead of exact public model IDs.
+    """
+    return not _model_name(model).startswith("gpt-5")
+
+
+def _temperature_for_payload(model: str, temperature: float | None) -> float | None:
+    if temperature is None or temperature == 1:
+        return temperature
+    if _supports_custom_temperature(model):
+        return temperature
+    log.warning(
+        "Model %s only supports the default temperature; ignoring configured temperature=%s",
+        model,
+        temperature,
+    )
+    return None
+
+
 def _require_web_search_preview_support(model: str) -> None:
     if _supports_web_search_preview(model):
         return
@@ -493,8 +523,9 @@ def _build_chat_payload(
         "model": model,
         "messages": messages_to_openai(messages),
     }
-    if resolved_options.temperature is not None:
-        payload["temperature"] = resolved_options.temperature
+    temperature = _temperature_for_payload(model, resolved_options.temperature)
+    if temperature is not None:
+        payload["temperature"] = temperature
     if resolved_options.max_tokens is not None:
         payload["max_tokens"] = resolved_options.max_tokens
     if resolved_options.max_output_tokens is not None and "max_tokens" not in payload:
@@ -519,8 +550,9 @@ def _build_responses_payload(
         "model": model,
         "input": input_payload,
     }
-    if resolved_options.temperature is not None:
-        payload["temperature"] = resolved_options.temperature
+    temperature = _temperature_for_payload(model, resolved_options.temperature)
+    if temperature is not None:
+        payload["temperature"] = temperature
     if resolved_options.max_output_tokens is not None:
         payload["max_output_tokens"] = resolved_options.max_output_tokens
     elif resolved_options.max_tokens is not None:
diff --git a/tests/test_model_client.py b/tests/test_model_client.py
index f9c45b41..21a5bdc9 100644
--- a/tests/test_model_client.py
+++ b/tests/test_model_client.py
@@ -13,7 +13,7 @@ async def fake_acompletion(**kwargs):
             captured.update(kwargs)
             return {
                 "id": "resp-chat-1",
-                "model": "openai/gpt-5-mini",
+                "model": "openai/gpt-4o-mini",
                 "choices": [
                     {
                         "finish_reason": "stop",
@@ -36,7 +36,7 @@ async def fake_acompletion(**kwargs):
 
         with patch.object(model_client, "_get_litellm_module", return_value=fake_litellm):
             response = await model_client.generate(
-                "openai/gpt-5-mini",
+                "openai/gpt-4o-mini",
                 "say hi",
                 options,
             )
@@ -51,9 +51,87 @@ async def fake_acompletion(**kwargs):
         self.assertEqual(response.finish_reason, "stop")
         self.assertEqual(response.usage.total_tokens, 18)
         self.assertEqual(response.api_mode, "chat_completion")
-        self.assertEqual(response.request_payload["model"], "openai/gpt-5-mini")
+        self.assertEqual(response.request_payload["model"], "openai/gpt-4o-mini")
         self.assertEqual(response.request_payload["messages"], [{"role": "user", "content": "say hi"}])
 
+    async def test_generate_omits_unsupported_gpt5_temperature(self) -> None:
+        captured: dict[str, object] = {}
+
+        async def fake_acompletion(**kwargs):
+            captured.update(kwargs)
+            return {
+                "choices": [
+                    {
+                        "finish_reason": "stop",
+                        "message": {"role": "assistant", "content": "ok"},
+                    }
+                ]
+            }
+
+        fake_litellm = SimpleNamespace(acompletion=fake_acompletion)
+        options = model_client.GenerateOptions(temperature=0.0)
+
+        with patch.object(model_client, "_get_litellm_module", return_value=fake_litellm):
+            await model_client.generate(
+                "azure/gpt-5.4-1",
+                "say hi",
+                options,
+            )
+
+        self.assertNotIn("temperature", captured)
+
+    async def test_generate_keeps_default_gpt5_temperature(self) -> None:
+        captured: dict[str, object] = {}
+
+        async def fake_acompletion(**kwargs):
+            captured.update(kwargs)
+            return {
+                "choices": [
+                    {
+                        "finish_reason": "stop",
+                        "message": {"role": "assistant", "content": "ok"},
+                    }
+                ]
+            }
+
+        fake_litellm = SimpleNamespace(acompletion=fake_acompletion)
+        options = model_client.GenerateOptions(temperature=1.0)
+
+        with patch.object(model_client, "_get_litellm_module", return_value=fake_litellm):
+            await model_client.generate(
+                "azure/gpt-5.4-1",
+                "say hi",
+                options,
+            )
+
+        self.assertEqual(captured["temperature"], 1.0)
+
+    async def test_generate_keeps_non_gpt5_temperature(self) -> None:
+        captured: dict[str, object] = {}
+
+        async def fake_acompletion(**kwargs):
+            captured.update(kwargs)
+            return {
+                "choices": [
+                    {
+                        "finish_reason": "stop",
+                        "message": {"role": "assistant", "content": "ok"},
+                    }
+                ]
+            }
+
+        fake_litellm = SimpleNamespace(acompletion=fake_acompletion)
+        options = model_client.GenerateOptions(temperature=0.0)
+
+        with patch.object(model_client, "_get_litellm_module", return_value=fake_litellm):
+            await model_client.generate(
+                "azure/gpt-4o-mini",
+                "say hi",
+                options,
+            )
+
+        self.assertEqual(captured["temperature"], 0.0)
+
     async def test_generate_structured_adds_json_schema_response_format(self) -> None:
         captured: dict[str, object] = {}