From 5c794d3bc6e89bd4a95e0597d2bd35fcc6ba6294 Mon Sep 17 00:00:00 2001
From: Jake Present
Date: Tue, 26 May 2026 13:56:12 -0400
Subject: [PATCH 1/2] viewer: split policy violation metrics by permissibility
---
tests/test_viewer_server_artifacts.py | 190 ++++++++++++++++++
viewer/src/lib/server/artifacts.ts | 41 ++++
viewer/src/lib/server/data.ts | 44 +++-
viewer/src/lib/server/metrics.ts | 88 +++++++-
viewer/src/lib/types.ts | 4 +
.../suite/[suite_id]/[run_id]/+page.svelte | 74 +++++++
6 files changed, 430 insertions(+), 11 deletions(-)
diff --git a/tests/test_viewer_server_artifacts.py b/tests/test_viewer_server_artifacts.py
index 8cbdb500..fb3910d3 100644
--- a/tests/test_viewer_server_artifacts.py
+++ b/tests/test_viewer_server_artifacts.py
@@ -1723,6 +1723,194 @@ def test_list_suites_marks_scenario_only_scored_suite_as_has_results(self) -> No
self.assertEqual(payload[0]["status"], "has_results")
+ def test_run_metrics_policy_violation_by_permissibility(self) -> None:
+ """computeRunMetrics splits node-level policy violations by behavior permissibility."""
+ with TemporaryDirectory(dir=ROOT / "viewer") as tmp_dir:
+ harness_dir = Path(tmp_dir) / "harness"
+ harness_dir.mkdir()
+ self._copy_data_harness(harness_dir)
+ metrics_path = harness_dir / "metrics.ts"
+
+ # Node script: two judged samples scored against one permissible and one
+ # non-permissible behavior, computed once with the behavior list and once
+ # with an empty behavior list.
+ script = textwrap.dedent(
+ f"""\
+ const {{ computeRunMetrics }} = await import({json.dumps(metrics_path.as_uri())});
+ const behaviors = [
+ {{ name: 'perm_a', definition: '', examples: [], permissible: true }},
+ {{ name: 'notperm_a', definition: '', examples: [], permissible: false }}
+ ];
+ const samples = [
+ {{
+ test_case_id: 'seed-1',
+ prompt: 'prompt',
+ response: 'response',
+ behavior: 'perm_a',
+ target: 'target-model',
+ judge_model: 'judge-model',
+ judge_status: 'ok',
+ verdict: {{
+ dimensions: {{ policy_violation: true, overrefusal: true }},
+ justification: '',
+ node_judgments: [
+ {{ node_index: 0, node_name: 'perm_a', relevant: true, violated: true, confidence: 'high', evidence_turns: [], reasoning: '' }},
+ {{ node_index: 1, node_name: 'notperm_a', relevant: false, violated: null, confidence: null, evidence_turns: [], reasoning: '' }}
+ ]
+ }}
+ }},
+ {{
+ test_case_id: 'seed-2',
+ prompt: 'prompt',
+ response: 'response',
+ behavior: 'notperm_a',
+ target: 'target-model',
+ judge_model: 'judge-model',
+ judge_status: 'ok',
+ verdict: {{
+ dimensions: {{ policy_violation: false, overrefusal: false }},
+ justification: '',
+ node_judgments: [
+ {{ node_index: 0, node_name: 'perm_a', relevant: true, violated: false, confidence: 'high', evidence_turns: [], reasoning: '' }},
+ {{ node_index: 1, node_name: 'notperm_a', relevant: true, violated: true, confidence: 'high', evidence_turns: [], reasoning: '' }},
+ {{ node_index: 2, node_name: 'unknown_node', relevant: true, violated: true, confidence: 'high', evidence_turns: [], reasoning: '' }}
+ ]
+ }}
+ }}
+ ];
+ const withBehaviors = computeRunMetrics(samples, behaviors);
+ const withoutBehaviors = computeRunMetrics(samples, []);
+ console.log(JSON.stringify({{ withBehaviors, withoutBehaviors }}));
+ """
+ )
+ result = self._run_node(
+ harness_dir=harness_dir,
+ script=script,
+ env=os.environ.copy(),
+ )
+ self.assertEqual(result.returncode, 0, msg=f"{result.stdout}\n{result.stderr}")
+ payload = json.loads(result.stdout)
+
+ with_behaviors = payload["withBehaviors"]
+ self.assertEqual(with_behaviors["scored_total"], 2)
+ # perm_a is marked relevant in both samples and violated only in seed-1.
+ permissible = with_behaviors["policy_violation_on_permissible"]
+ self.assertIsNotNone(permissible)
+ self.assertEqual(permissible["count"], 2)
+ self.assertEqual(permissible["flagged_count"], 1)
+ self.assertEqual(permissible["clear_count"], 1)
+ self.assertAlmostEqual(permissible["rate"], 0.5)
+
+ # notperm_a is relevant only in seed-2. A count of 1 also means the
+ # 'unknown_node' judgment, which names no listed behavior, is not counted.
+ not_permissible = with_behaviors["policy_violation_on_not_permissible"]
+ self.assertIsNotNone(not_permissible)
+ self.assertEqual(not_permissible["count"], 1)
+ self.assertEqual(not_permissible["flagged_count"], 1)
+ self.assertEqual(not_permissible["clear_count"], 0)
+ self.assertAlmostEqual(not_permissible["rate"], 1.0)
+
+ # With an empty behavior list both splits are expected to be null.
+ without_behaviors = payload["withoutBehaviors"]
+ self.assertIsNone(without_behaviors["policy_violation_on_permissible"])
+ self.assertIsNone(without_behaviors["policy_violation_on_not_permissible"])
+
+ def test_audit_run_metrics_policy_violation_by_permissibility(self) -> None:
+ """computeAuditRunMetrics splits node-level policy violations by behavior permissibility."""
+ with TemporaryDirectory(dir=ROOT / "viewer") as tmp_dir:
+ harness_dir = Path(tmp_dir) / "harness"
+ harness_dir.mkdir()
+ self._copy_data_harness(harness_dir)
+ metrics_path = harness_dir / "metrics.ts"
+
+ # Node script: a single audit score whose verdict judges one
+ # non-permissible behavior (violated) and one permissible behavior (clear).
+ script = textwrap.dedent(
+ f"""\
+ const {{ computeAuditRunMetrics }} = await import({json.dumps(metrics_path.as_uri())});
+ const behaviors = [
+ {{ name: 'perm_a', definition: '', examples: [], permissible: true }},
+ {{ name: 'notperm_a', definition: '', examples: [], permissible: false }}
+ ];
+ const scores = [
+ {{
+ test_case_id: 'scenario-1',
+ target: 'target-model',
+ tester_model: 'tester-model',
+ judge_model: 'judge-model',
+ judge_status: 'ok',
+ verdict: {{
+ dimensions: {{ policy_violation: true, overrefusal: false }},
+ justification: '',
+ node_judgments: [
+ {{ node_index: 0, node_name: 'notperm_a', relevant: true, violated: true, confidence: 'high', evidence_turns: [], reasoning: '' }},
+ {{ node_index: 1, node_name: 'perm_a', relevant: true, violated: false, confidence: 'high', evidence_turns: [], reasoning: '' }}
+ ]
+ }}
+ }}
+ ];
+ const metrics = computeAuditRunMetrics(scores, behaviors);
+ console.log(JSON.stringify(metrics));
+ """
+ )
+ result = self._run_node(
+ harness_dir=harness_dir,
+ script=script,
+ env=os.environ.copy(),
+ )
+ self.assertEqual(result.returncode, 0, msg=f"{result.stdout}\n{result.stderr}")
+ metrics = json.loads(result.stdout)
+
+ self.assertEqual(metrics["scored_total"], 1)
+ # The only perm_a judgment is relevant and not violated.
+ permissible = metrics["policy_violation_on_permissible"]
+ self.assertIsNotNone(permissible)
+ self.assertEqual(permissible["count"], 1)
+ self.assertEqual(permissible["flagged_count"], 0)
+ self.assertEqual(permissible["clear_count"], 1)
+ self.assertAlmostEqual(permissible["rate"], 0.0)
+
+ # The only notperm_a judgment is relevant and violated.
+ not_permissible = metrics["policy_violation_on_not_permissible"]
+ self.assertIsNotNone(not_permissible)
+ self.assertEqual(not_permissible["count"], 1)
+ self.assertEqual(not_permissible["flagged_count"], 1)
+ self.assertEqual(not_permissible["clear_count"], 0)
+ self.assertAlmostEqual(not_permissible["rate"], 1.0)
+
+ def test_load_run_judge_taxonomy_prefers_run_config(self) -> None:
+ """Each judge-taxonomy loader returns the override file; a config without taxonomy_path gives None."""
+ with TemporaryDirectory(dir=ROOT / "viewer") as tmp_dir:
+ harness_dir = Path(tmp_dir) / "harness"
+ harness_dir.mkdir()
+ self._copy_data_harness(harness_dir)
+ artifacts_path = harness_dir / "artifacts.ts"
+
+ suite_dir = Path(tmp_dir) / "demo-suite"
+ run_dir = suite_dir / "demo-run"
+ run_dir.mkdir(parents=True)
+ # Taxonomy written into the suite directory; the run config below points
+ # at it through pipeline.judge.taxonomy_path.
+ judge_taxonomy = {
+ "behavior": {"name": "demo", "definition": "demo"},
+ "behavior_categories": [
+ {"name": "judge_only", "definition": "", "examples": [], "permissible": False}
+ ],
+ }
+ taxonomy_path = suite_dir / "taxonomy.override.json"
+ taxonomy_path.write_text(json.dumps(judge_taxonomy), encoding="utf-8")
+ (run_dir / "config.yaml").write_text(
+ f"pipeline:\n judge:\n taxonomy_path: {taxonomy_path}\n",
+ encoding="utf-8",
+ )
+
+ # Exercise the three loader entry points (by suite/run id, by config
+ # object, by artifact path) plus a config that has no taxonomy_path.
+ script = textwrap.dedent(
+ f"""\
+ const {{ loadRunJudgeTaxonomyForRun, loadRunJudgeTaxonomy, loadRunJudgeTaxonomyFromArtifacts }} = await import({json.dumps(artifacts_path.as_uri())});
+ const fromRun = loadRunJudgeTaxonomyForRun('demo-suite', 'demo-run');
+ const fromConfig = loadRunJudgeTaxonomy({{ pipeline: {{ judge: {{ taxonomy_path: {json.dumps(str(taxonomy_path))} }} }} }});
+ const fromArtifact = loadRunJudgeTaxonomyFromArtifacts({{ suite: 'demo-suite' }}, {{ systematize: {{ path: 'taxonomy.override.json' }} }});
+ const fromMissing = loadRunJudgeTaxonomy({{ pipeline: {{ judge: {{}} }} }});
+ console.log(JSON.stringify({{ fromRun, fromConfig, fromArtifact, fromMissing }}));
+ """
+ )
+ env = os.environ.copy()
+ # The Node process is given the temp dir as ARTIFACTS_ROOT, so 'demo-suite'
+ # is expected to resolve inside it.
+ env["ARTIFACTS_ROOT"] = str(Path(tmp_dir))
+ result = self._run_node(harness_dir=harness_dir, script=script, env=env)
+ self.assertEqual(result.returncode, 0, msg=f"{result.stdout}\n{result.stderr}")
+ payload = json.loads(result.stdout)
+
+ self.assertEqual(payload["fromRun"], judge_taxonomy)
+ self.assertEqual(payload["fromConfig"], judge_taxonomy)
+ self.assertEqual(payload["fromArtifact"], judge_taxonomy)
+ self.assertIsNone(payload["fromMissing"])
+
+
+
class ViewerReadModelHelpersTest(unittest.TestCase):
"""Tests for path-traversal defenses that don't depend on Node TS support."""
@@ -1822,5 +2010,7 @@ def test_test_set_artifact_path_rejects_paths_that_normalize_to_directory(self)
)
+
+
if __name__ == "__main__":
unittest.main()
diff --git a/viewer/src/lib/server/artifacts.ts b/viewer/src/lib/server/artifacts.ts
index 65a023c5..83c55c66 100644
--- a/viewer/src/lib/server/artifacts.ts
+++ b/viewer/src/lib/server/artifacts.ts
@@ -556,6 +556,47 @@ function readObject(value: unknown): Record | null {
: null;
}
+/**
+ * Load the judge taxonomy referenced by a run config alone.
+ *
+ * Delegates to `loadRunJudgeTaxonomyFromArtifacts` with `null` artifacts, so only
+ * the config's `pipeline.judge.taxonomy_path` is consulted.
+ */
+export function loadRunJudgeTaxonomy(config: Record | null): Taxonomy | null {
+ return loadRunJudgeTaxonomyFromArtifacts(config, null);
+}
+
+/**
+ * Resolve the judge taxonomy for a run from its artifacts and/or config.
+ *
+ * Lookup order:
+ * 1. `artifacts.systematize.path`, resolved through `manifestArtifactPath` against
+ *    the suite directory derived from `config` (see `suiteDirPathFromConfig`).
+ * 2. `config.pipeline.judge.taxonomy_path`, passed through `path.resolve`.
+ *
+ * @param config Parsed run config, or null.
+ * @param artifacts Artifact-version map (callers pass the manifest's `artifact_versions`), or null.
+ * @returns The parsed taxonomy, or null when no taxonomy path is configured.
+ */
+export function loadRunJudgeTaxonomyFromArtifacts(
+ config: Record | null,
+ artifacts: Record | null
+): Taxonomy | null {
+ const systematize = readObject(artifacts?.systematize);
+ const artifactTaxonomyPath = typeof systematize?.path === 'string' ? systematize.path : null;
+ if (artifactTaxonomyPath) {
+ const resolvedArtifactPath = manifestArtifactPath(suiteDirPathFromConfig(config), artifactTaxonomyPath);
+ const artifactTaxonomy = resolvedArtifactPath
+ ? readJsonFile(resolvedArtifactPath, { missingOk: true })
+ : null;
+ // A falsy result here (unresolvable path or nothing read) falls through to the config path.
+ if (artifactTaxonomy) return artifactTaxonomy;
+ }
+
+ const pipeline = readObject(config?.pipeline);
+ const judge = readObject(pipeline?.judge);
+ const rawTaxonomyPath = typeof judge?.taxonomy_path === 'string' ? judge.taxonomy_path : null;
+ if (!rawTaxonomyPath) return null;
+
+ // NOTE(review): path.resolve turns a relative taxonomy_path into an absolute one
+ // based on the process working directory, and the config-supplied path is not
+ // checked against ARTIFACTS_ROOT here — confirm run configs are trusted input.
+ const resolved = path.resolve(rawTaxonomyPath);
+ // presumably yields null for a missing file because of missingOk — TODO confirm in readJsonFile
+ return readJsonFile(resolved, { missingOk: true });
+}
+
+/**
+ * Pick the directory that artifact-relative paths are resolved against.
+ *
+ * Returns `suiteDirPath(config.suite)` when `config.suite` is a non-empty string,
+ * otherwise `ARTIFACTS_ROOT`.
+ */
+function suiteDirPathFromConfig(config: Record | null): string {
+ const suite = typeof config?.suite === 'string' ? config.suite : null;
+ return suite ? suiteDirPath(suite) : ARTIFACTS_ROOT;
+}
+
+/**
+ * Load the judge taxonomy for a stored run identified by suite and run id.
+ *
+ * Reads the run's config (`RUN_CONFIG_FILE`) and manifest (`RUN_MANIFEST_FILE`)
+ * from the run directory, both with `missingOk`, then delegates to
+ * `loadRunJudgeTaxonomyFromArtifacts` with the manifest's `artifact_versions`
+ * (or null when the manifest or that field is absent).
+ */
+export function loadRunJudgeTaxonomyForRun(suiteId: string, runId: string): Taxonomy | null {
+ const runDir = runDirPath(suiteId, runId);
+ const config = readYamlFile>(path.join(runDir, RUN_CONFIG_FILE), {
+ missingOk: true
+ });
+ const manifest = readJsonFile(path.join(runDir, RUN_MANIFEST_FILE), { missingOk: true });
+ return loadRunJudgeTaxonomyFromArtifacts(config, manifest?.artifact_versions ?? null);
+}
+
export function loadRunRuntimeMode(config: Record | null): string | null {
const pipeline = readObject(config?.pipeline);
const inference = readObject(pipeline?.inference);
diff --git a/viewer/src/lib/server/data.ts b/viewer/src/lib/server/data.ts
index 1a876dd2..9a02de2e 100644
--- a/viewer/src/lib/server/data.ts
+++ b/viewer/src/lib/server/data.ts
@@ -6,6 +6,8 @@ import {
ViewerReadModelError,
loadIndexedRunScoreRow,
loadIndexedRunTranscriptRow,
+ loadRunJudgeTaxonomyForRun,
+ loadRunJudgeTaxonomyFromArtifacts,
loadRunRuntimeMode,
loadRunScoreRow,
loadRunTranscriptRow,
@@ -64,6 +66,8 @@ interface PromptMetricView {
counts: BinaryCounts;
policyViolationRate: number;
overrefusalRate: number;
+ policyViolationOnPermissible: DimensionMetrics | null;
+ policyViolationOnNotPermissible: DimensionMetrics | null;
dimensions: Record;
target: string;
judge_model: string;
@@ -77,6 +81,8 @@ interface AuditMetricView {
counts: BinaryCounts;
policyViolationRate: number;
overrefusalRate: number;
+ policyViolationOnPermissible: DimensionMetrics | null;
+ policyViolationOnNotPermissible: DimensionMetrics | null;
dimensions: Record;
target: string;
tester_model: string;
@@ -179,6 +185,15 @@ function normalizeBehavior(b: Behavior): Behavior {
return { ...b, permissible: b.permissible ?? false };
}
+/**
+ * Choose the behavior list used to split metrics by permissibility.
+ *
+ * Prefers `behavior_categories` from the run's judge taxonomy (loaded from the run
+ * config and artifacts), then the suite snapshot's taxonomy, then an empty list.
+ * Every entry is passed through `normalizeBehavior`.
+ */
+function metricBehaviors(
+ snapshot: SuiteSnapshot | null,
+ runConfig?: Record | null,
+ artifacts?: Record | null
+): Behavior[] {
+ const judgeTaxonomy = loadRunJudgeTaxonomyFromArtifacts(runConfig ?? null, artifacts ?? null);
+ return (judgeTaxonomy?.behavior_categories ?? snapshot?.taxonomy?.behavior_categories ?? []).map(normalizeBehavior);
+}
+
function normalizePolicy(taxonomy: Taxonomy | null | undefined): Taxonomy | null {
if (!taxonomy) return null;
const behavior = taxonomy.behavior ?? taxonomy.risk;
@@ -649,6 +664,7 @@ function buildRunListEntries(snapshot: SuiteSnapshot): {
const hasPromptScores = promptScores.length > 0;
const hasAuditScores = auditScores.length > 0;
const hasScoreStage = manifest?.stages?.judge != null;
+ const behaviors = metricBehaviors(snapshot, runSnapshot.config, manifest?.artifact_versions ?? null);
if ((hasPromptScores || hasScoreStage) && !(manifest?.status === 'failed' && !hasPromptScores)) {
runs.push({
@@ -656,7 +672,7 @@ function buildRunListEntries(snapshot: SuiteSnapshot): {
has_judged: hasPromptScores,
has_scenario_scores: hasAuditScores,
manifest,
- metrics: hasPromptScores ? computeRunMetrics(promptScores) : null
+ metrics: hasPromptScores ? computeRunMetrics(promptScores, behaviors) : null
});
}
@@ -665,7 +681,7 @@ function buildRunListEntries(snapshot: SuiteSnapshot): {
run_id: runId,
has_scores: hasAuditScores,
manifest,
- metrics: hasAuditScores ? computeAuditRunMetrics(auditScores) : null
+ metrics: hasAuditScores ? computeAuditRunMetrics(auditScores, behaviors) : null
});
}
}
@@ -682,6 +698,8 @@ function buildZeroPromptMetrics(): PromptMetricView {
counts: emptyScoreCounts(),
policyViolationRate: 0,
overrefusalRate: 0,
+ policyViolationOnPermissible: null,
+ policyViolationOnNotPermissible: null,
dimensions: {},
target: '',
judge_model: ''
@@ -697,6 +715,8 @@ function buildZeroAuditMetrics(): AuditMetricView {
counts: emptyScoreCounts(),
policyViolationRate: 0,
overrefusalRate: 0,
+ policyViolationOnPermissible: null,
+ policyViolationOnNotPermissible: null,
dimensions: {},
target: '',
tester_model: '',
@@ -714,6 +734,8 @@ function toPromptMetricView(metrics: RunMetrics | null): PromptMetricView {
counts: metrics.counts,
policyViolationRate: metrics.policy_violation_rate,
overrefusalRate: metrics.overrefusal_rate,
+ policyViolationOnPermissible: metrics.policy_violation_on_permissible,
+ policyViolationOnNotPermissible: metrics.policy_violation_on_not_permissible,
dimensions: metrics.dimensions,
target: metrics.target,
judge_model: metrics.judge_model
@@ -730,6 +752,8 @@ function toAuditMetricView(metrics: AuditRunMetrics | null): AuditMetricView {
counts: metrics.counts,
policyViolationRate: metrics.policy_violation_rate,
overrefusalRate: metrics.overrefusal_rate,
+ policyViolationOnPermissible: metrics.policy_violation_on_permissible,
+ policyViolationOnNotPermissible: metrics.policy_violation_on_not_permissible,
dimensions: metrics.dimensions,
target: metrics.target,
tester_model: metrics.tester_model,
@@ -1080,6 +1104,7 @@ async function loadSuiteHeavyData(
const hasPromptScores = promptScores.length > 0;
const hasAuditScores = auditScores.length > 0;
const hasScoreStage = manifest?.stages?.judge != null;
+ const behaviors = metricBehaviors(snapshot, runSnapshot.config, manifest?.artifact_versions ?? null);
const addedToRuns =
(hasPromptScores || hasScoreStage) &&
@@ -1094,7 +1119,7 @@ async function loadSuiteHeavyData(
has_judged: hasPromptScores,
has_scenario_scores: hasAuditScores,
manifest,
- metrics: hasPromptScores ? computeRunMetrics(promptScores) : null
+ metrics: hasPromptScores ? computeRunMetrics(promptScores, behaviors) : null
});
}
if (addedToAuditRuns) {
@@ -1102,7 +1127,7 @@ async function loadSuiteHeavyData(
run_id: runId,
has_scores: hasAuditScores,
manifest,
- metrics: hasAuditScores ? computeAuditRunMetrics(auditScores) : null
+ metrics: hasAuditScores ? computeAuditRunMetrics(auditScores, behaviors) : null
});
}
@@ -1204,8 +1229,10 @@ function loadCompletedRunPageData(
const samples = resolvedTab === 'prompts' ? promptRows : [];
const auditScores = resolvedTab === 'audit' ? auditRows : [];
const scenarioSeeds = buildScenarioSeeds(suiteSnapshot);
- const promptMetrics = resolvedTab === 'prompts' ? computeRunMetrics(samples) : null;
- const auditMetrics = resolvedTab === 'audit' ? computeAuditRunMetrics(auditScores) : null;
+ const judgeTaxonomy = loadRunJudgeTaxonomyForRun(suiteId, runId);
+ const behaviors = (judgeTaxonomy?.behavior_categories ?? suiteSnapshot?.taxonomy?.behavior_categories ?? []).map(normalizeBehavior);
+ const promptMetrics = resolvedTab === 'prompts' ? computeRunMetrics(samples, behaviors) : null;
+ const auditMetrics = resolvedTab === 'audit' ? computeAuditRunMetrics(auditScores, behaviors) : null;
return {
suite_id: suiteId,
@@ -1270,8 +1297,9 @@ export function loadRunPageData(suiteId: string, runId: string, activeTab: 'prom
const scenarioSeeds = buildScenarioSeeds(suiteSnapshot);
const promptSeedTitleMap = buildPromptSeedTitleMap(suiteSnapshot);
- const promptMetrics = resolvedTab === 'prompts' ? computeRunMetrics(samples) : null;
- const auditMetrics = resolvedTab === 'audit' ? computeAuditRunMetrics(auditScores) : null;
+ const behaviors = metricBehaviors(suiteSnapshot, runSnapshot.config, runSnapshot.manifest?.artifact_versions ?? null);
+ const promptMetrics = resolvedTab === 'prompts' ? computeRunMetrics(samples, behaviors) : null;
+ const auditMetrics = resolvedTab === 'audit' ? computeAuditRunMetrics(auditScores, behaviors) : null;
const scenarioSeedMap = resolvedTab === 'audit' ? buildScenarioSeedMap(scenarioSeeds, auditScores) : {};
return {
diff --git a/viewer/src/lib/server/metrics.ts b/viewer/src/lib/server/metrics.ts
index 0d140be0..b4a3ca94 100644
--- a/viewer/src/lib/server/metrics.ts
+++ b/viewer/src/lib/server/metrics.ts
@@ -4,7 +4,16 @@ import {
isBooleanFlag,
isSuccessfulJudgment
} from '$lib/judgment.js';
-import type { AuditScore, AuditRunMetrics, BinaryCounts, DimensionMetrics, JudgedSample, RunMetrics } from '$lib/types.js';
+import type {
+ AuditScore,
+ AuditRunMetrics,
+ Behavior,
+ BinaryCounts,
+ DimensionMetrics,
+ JudgedSample,
+ NodeJudgment,
+ RunMetrics
+} from '$lib/types.js';
import { loadDimensions } from './dimensions.js';
type EventScoredRecord = {
@@ -22,6 +31,67 @@ export function emptyScoreCounts(): BinaryCounts {
return { 0: 0, 1: 0 };
}
+/** Create an aggregate with all tallies at zero and a fresh score-count table. */
+function emptyDimensionAggregate(): EventDimensionAggregate {
+  const counts = emptyScoreCounts();
+  return { count: 0, flagged_count: 0, clear_count: 0, counts };
+}
+
+/**
+ * Turn a running aggregate into the reported metrics shape.
+ *
+ * The tallies are copied through unchanged; `rate` is flagged / count, or 0 when
+ * nothing was counted.
+ */
+function finalizeDimensionAggregate(aggregate: EventDimensionAggregate): DimensionMetrics {
+  const { count, flagged_count, clear_count, counts } = aggregate;
+  // Guard the division so an empty bucket reports 0 rather than NaN.
+  const rate = count > 0 ? flagged_count / count : 0;
+  return { rate, count, flagged_count, clear_count, counts };
+}
+
+/**
+ * Extract the `node_judgments` array from a verdict.
+ *
+ * Returns an empty array when the verdict is missing, is not an object, or has no
+ * array under `node_judgments`. Elements are cast to `NodeJudgment`, not validated.
+ */
+function readNodeJudgments(verdict: Record | null | undefined): NodeJudgment[] {
+ if (!verdict || typeof verdict !== 'object') return [];
+ const nodes = (verdict as Record).node_judgments;
+ return Array.isArray(nodes) ? (nodes as NodeJudgment[]) : [];
+}
+
+/**
+ * Build a lookup from behavior name to whether that behavior is permissible.
+ *
+ * Names are trimmed before being stored because
+ * `computePolicyViolationByPermissibility` trims `node_name` before looking it up;
+ * storing the raw name would make a behavior with stray whitespace unmatchable.
+ * Entries without a string name, or whose name is blank after trimming, are
+ * skipped so they do not count towards the index size.
+ *
+ * @param behaviors Behavior definitions from the taxonomy.
+ * @returns Map of trimmed behavior name to `true` only when `permissible === true`.
+ */
+function buildPermissibilityIndex(behaviors: Behavior[]): Map<string, boolean> {
+  const index = new Map<string, boolean>();
+  for (const behavior of behaviors) {
+    if (!behavior || typeof behavior.name !== 'string') continue;
+    const name = behavior.name.trim();
+    if (!name) continue;
+    // Anything other than an explicit `true` is treated as not permissible.
+    index.set(name, behavior.permissible === true);
+  }
+  return index;
+}
+
+/**
+ * Tally node-level violation flags separately for permissible and
+ * non-permissible behaviors.
+ *
+ * Only judgments that are marked relevant, carry a boolean `violated` flag and
+ * name a behavior present in `behaviors` are counted. Both results are null when
+ * no behaviors are supplied or none of them yields an index entry.
+ *
+ * @param records Scored records whose verdicts may contain `node_judgments`.
+ * @param behaviors Behavior definitions carrying the `permissible` flag.
+ */
+export function computePolicyViolationByPermissibility(
+  records: EventScoredRecord[],
+  behaviors: Behavior[]
+): { permissible: DimensionMetrics | null; not_permissible: DimensionMetrics | null } {
+  if (!behaviors || behaviors.length === 0) {
+    return { permissible: null, not_permissible: null };
+  }
+  const permissibleByName = buildPermissibilityIndex(behaviors);
+  if (permissibleByName.size === 0) {
+    return { permissible: null, not_permissible: null };
+  }
+
+  const allowedBucket = emptyDimensionAggregate();
+  const blockedBucket = emptyDimensionAggregate();
+
+  for (const record of records) {
+    for (const judgment of readNodeJudgments(record.verdict)) {
+      if (judgment.relevant !== true || !isBooleanFlag(judgment.violated)) continue;
+      const behaviorName =
+        typeof judgment.node_name === 'string' ? judgment.node_name.trim() : '';
+      if (!behaviorName) continue;
+      // The index only ever stores booleans, so undefined means "unknown behavior".
+      const isPermissible = permissibleByName.get(behaviorName);
+      if (isPermissible === undefined) continue;
+      addFlag(isPermissible ? allowedBucket : blockedBucket, judgment.violated);
+    }
+  }
+
+  const permissible = finalizeDimensionAggregate(allowedBucket);
+  const not_permissible = finalizeDimensionAggregate(blockedBucket);
+  return { permissible, not_permissible };
+}
+
function collectDimensionNames(records: EventScoredRecord[]): string[] {
const names = new Set();
for (const record of records) {
@@ -77,7 +147,10 @@ function dimensionRate(dimensions: Record, name: strin
return dimensions[name]?.rate ?? 0;
}
-export function computeAuditRunMetrics(scores: AuditScore[]): AuditRunMetrics | null {
+export function computeAuditRunMetrics(
+ scores: AuditScore[],
+ behaviors: Behavior[] = []
+): AuditRunMetrics | null {
if (scores.length === 0) return null;
const requiredBaseMetrics = getRequiredBaseMetricNames(loadDimensions());
@@ -100,6 +173,7 @@ export function computeAuditRunMetrics(scores: AuditScore[]): AuditRunMetrics |
const dimensions = finalizeDimensions(dimensionAggregates);
const total = scores.length;
const scoredTotal = scoredScores.length;
+ const permissibilitySplit = computePolicyViolationByPermissibility(scoredScores, behaviors);
return {
total,
@@ -109,6 +183,8 @@ export function computeAuditRunMetrics(scores: AuditScore[]): AuditRunMetrics |
counts,
policy_violation_rate: dimensionRate(dimensions, 'policy_violation'),
overrefusal_rate: dimensionRate(dimensions, 'overrefusal'),
+ policy_violation_on_permissible: permissibilitySplit.permissible,
+ policy_violation_on_not_permissible: permissibilitySplit.not_permissible,
dimensions,
target: scores[0]?.target ?? '',
tester_model: scores[0]?.tester_model ?? '',
@@ -116,7 +192,10 @@ export function computeAuditRunMetrics(scores: AuditScore[]): AuditRunMetrics |
};
}
-export function computeRunMetrics(samples: JudgedSample[]): RunMetrics | null {
+export function computeRunMetrics(
+ samples: JudgedSample[],
+ behaviors: Behavior[] = []
+): RunMetrics | null {
if (samples.length === 0) return null;
const requiredBaseMetrics = getRequiredBaseMetricNames(loadDimensions());
@@ -137,6 +216,7 @@ export function computeRunMetrics(samples: JudgedSample[]): RunMetrics | null {
}
const dimensions = finalizeDimensions(dimensionAggregates);
+ const permissibilitySplit = computePolicyViolationByPermissibility(scoredSamples, behaviors);
return {
total: samples.length,
@@ -147,6 +227,8 @@ export function computeRunMetrics(samples: JudgedSample[]): RunMetrics | null {
counts,
policy_violation_rate: dimensionRate(dimensions, 'policy_violation'),
overrefusal_rate: dimensionRate(dimensions, 'overrefusal'),
+ policy_violation_on_permissible: permissibilitySplit.permissible,
+ policy_violation_on_not_permissible: permissibilitySplit.not_permissible,
target: samples[0]?.target ?? '—',
judge_model: samples[0]?.judge_model ?? '—',
dimensions
diff --git a/viewer/src/lib/types.ts b/viewer/src/lib/types.ts
index 05263138..40ee9b5c 100644
--- a/viewer/src/lib/types.ts
+++ b/viewer/src/lib/types.ts
@@ -273,6 +273,8 @@ export interface RunMetrics {
counts: BinaryCounts;
policy_violation_rate: number;
overrefusal_rate: number;
+ policy_violation_on_permissible: DimensionMetrics | null;
+ policy_violation_on_not_permissible: DimensionMetrics | null;
target: string;
judge_model: string;
dimensions: Record;
@@ -301,6 +303,8 @@ export interface AuditRunMetrics {
counts: BinaryCounts;
policy_violation_rate: number;
overrefusal_rate: number;
+ policy_violation_on_permissible: DimensionMetrics | null;
+ policy_violation_on_not_permissible: DimensionMetrics | null;
dimensions: Record;
target: string;
tester_model: string;
diff --git a/viewer/src/routes/suite/[suite_id]/[run_id]/+page.svelte b/viewer/src/routes/suite/[suite_id]/[run_id]/+page.svelte
index 76d16059..da8cf99e 100644
--- a/viewer/src/routes/suite/[suite_id]/[run_id]/+page.svelte
+++ b/viewer/src/routes/suite/[suite_id]/[run_id]/+page.svelte
@@ -919,6 +919,43 @@
{/if}
+
+ {#if data.metrics.policyViolationOnPermissible || data.metrics.policyViolationOnNotPermissible}
+ {@const promptPerm = data.metrics.policyViolationOnPermissible}
+ {@const promptNotPerm = data.metrics.policyViolationOnNotPermissible}
+ {#if (promptPerm?.count ?? 0) + (promptNotPerm?.count ?? 0) > 0}
+
+ {#each [
+ { key: 'permissible', label: 'Allowed requests failed', summary: promptPerm, hint: 'Requests the assistant should have helped with, but instead refused, deflected, or failed to complete.' },
+ { key: 'not_permissible', label: 'Blocked requests failed', summary: promptNotPerm, hint: 'Requests the assistant should have blocked, but instead helped with or allowed.' }
+ ] as card (card.key)}
+ {@const summary = card.summary}
+
+
{card.label}
+
{card.hint}
+
+ {summary && summary.count > 0 ? metricRateText(summary.rate) : '—'}
+ {summary && summary.count > 0 ? 'violated' : 'no relevant judgments'}
+
+ {#if summary && summary.count > 0}
+ {@const pct = binaryBar(summary.counts)}
+
+ {#if pct.clear > 0}
{/if}
+ {#if pct.flagged > 0}
{/if}
+
+
+ {summary.clear_count} clear
+ {summary.flagged_count} violated
+ {summary.count} relevant judgments
+
+ {/if}
+
aggregated over relevant behavior judgments, not over prompts
+
+ {/each}
+
+ {/if}
+ {/if}
+
@@ -1199,6 +1236,43 @@
{/if}
+
+ {#if data.auditMetrics.policyViolationOnPermissible || data.auditMetrics.policyViolationOnNotPermissible}
+ {@const auditPerm = data.auditMetrics.policyViolationOnPermissible}
+ {@const auditNotPerm = data.auditMetrics.policyViolationOnNotPermissible}
+ {#if (auditPerm?.count ?? 0) + (auditNotPerm?.count ?? 0) > 0}
+
+ {#each [
+ { key: 'permissible', label: 'Allowed requests failed', summary: auditPerm, hint: 'Requests the assistant should have helped with, but instead refused, deflected, or failed to complete.' },
+ { key: 'not_permissible', label: 'Blocked requests failed', summary: auditNotPerm, hint: 'Requests the assistant should have blocked, but instead helped with or allowed.' }
+ ] as card (card.key)}
+ {@const summary = card.summary}
+
+
{card.label}
+
{card.hint}
+
+ {summary && summary.count > 0 ? metricRateText(summary.rate) : '—'}
+ {summary && summary.count > 0 ? 'violated' : 'no relevant judgments'}
+
+ {#if summary && summary.count > 0}
+ {@const pct = binaryBar(summary.counts)}
+
+ {#if pct.clear > 0}
{/if}
+ {#if pct.flagged > 0}
{/if}
+
+
+ {summary.clear_count} clear
+ {summary.flagged_count} violated
+ {summary.count} relevant judgments
+
+ {/if}
+
aggregated over relevant behavior judgments, not over scenarios
+
+ {/each}
+
+ {/if}
+ {/if}
+
From 86ae430d348b2eb7dc1959c099143e61440ca6f7 Mon Sep 17 00:00:00 2001
From: Jake Present
Date: Tue, 26 May 2026 14:29:29 -0400
Subject: [PATCH 2/2] fix: omit unsupported GPT-5 temperature overrides
---
p2m/core/model_client.py | 40 ++++++++++++++++--
tests/test_model_client.py | 84 ++++++++++++++++++++++++++++++++++++--
2 files changed, 117 insertions(+), 7 deletions(-)
diff --git a/p2m/core/model_client.py b/p2m/core/model_client.py
index 5d9945ff..3cf11b8b 100644
--- a/p2m/core/model_client.py
+++ b/p2m/core/model_client.py
@@ -311,6 +311,36 @@ def _supports_web_search_preview(model: str) -> bool:
return _model_family(model) in {"openai", "azure"}
+def _model_name(model: str) -> str:
+    """Return the lower-cased model id with its leading provider segment removed.
+
+    Only the text up to the first ``/`` is dropped (``azure/gpt-5.4-1`` becomes
+    ``gpt-5.4-1``); an id without a slash is returned stripped and lower-cased.
+    A falsy ``model`` yields ``""``.
+    """
+    cleaned = (model or "").strip().lower()
+    _provider, slash, remainder = cleaned.partition("/")
+    return remainder if slash else cleaned
+
+
+def _supports_custom_temperature(model: str) -> bool:
+    """Report whether ``model`` accepts a non-default temperature.
+
+    GPT-5.x deployments on OpenAI/Azure reject explicit non-default
+    temperatures. For Azure, LiteLLM's model string carries the deployment
+    name after ``azure/``, so the check is a ``gpt-5`` prefix match on the
+    provider-less name rather than a comparison against exact public model IDs.
+    """
+    bare_name = _model_name(model)
+    is_gpt5 = bare_name.startswith("gpt-5")
+    return not is_gpt5
+
+
+def _temperature_for_payload(model: str, temperature: float | None) -> float | None:
+    """Choose the temperature value to place in a request payload.
+
+    ``None`` and a value equal to ``1`` pass through untouched, as does any
+    value for a model that accepts custom temperatures. Otherwise the
+    configured value is dropped: a warning is logged and ``None`` is returned
+    so the caller omits the field.
+    """
+    is_override = temperature is not None and temperature != 1
+    if not is_override or _supports_custom_temperature(model):
+        return temperature
+    log.warning(
+        "Model %s only supports the default temperature; ignoring configured temperature=%s",
+        model,
+        temperature,
+    )
+    return None
+
+
def _require_web_search_preview_support(model: str) -> None:
if _supports_web_search_preview(model):
return
@@ -493,8 +523,9 @@ def _build_chat_payload(
"model": model,
"messages": messages_to_openai(messages),
}
- if resolved_options.temperature is not None:
- payload["temperature"] = resolved_options.temperature
+ temperature = _temperature_for_payload(model, resolved_options.temperature)
+ if temperature is not None:
+ payload["temperature"] = temperature
if resolved_options.max_tokens is not None:
payload["max_tokens"] = resolved_options.max_tokens
if resolved_options.max_output_tokens is not None and "max_tokens" not in payload:
@@ -519,8 +550,9 @@ def _build_responses_payload(
"model": model,
"input": input_payload,
}
- if resolved_options.temperature is not None:
- payload["temperature"] = resolved_options.temperature
+ temperature = _temperature_for_payload(model, resolved_options.temperature)
+ if temperature is not None:
+ payload["temperature"] = temperature
if resolved_options.max_output_tokens is not None:
payload["max_output_tokens"] = resolved_options.max_output_tokens
elif resolved_options.max_tokens is not None:
diff --git a/tests/test_model_client.py b/tests/test_model_client.py
index f9c45b41..21a5bdc9 100644
--- a/tests/test_model_client.py
+++ b/tests/test_model_client.py
@@ -13,7 +13,7 @@ async def fake_acompletion(**kwargs):
captured.update(kwargs)
return {
"id": "resp-chat-1",
- "model": "openai/gpt-5-mini",
+ "model": "openai/gpt-4o-mini",
"choices": [
{
"finish_reason": "stop",
@@ -36,7 +36,7 @@ async def fake_acompletion(**kwargs):
with patch.object(model_client, "_get_litellm_module", return_value=fake_litellm):
response = await model_client.generate(
- "openai/gpt-5-mini",
+ "openai/gpt-4o-mini",
"say hi",
options,
)
@@ -51,9 +51,87 @@ async def fake_acompletion(**kwargs):
self.assertEqual(response.finish_reason, "stop")
self.assertEqual(response.usage.total_tokens, 18)
self.assertEqual(response.api_mode, "chat_completion")
- self.assertEqual(response.request_payload["model"], "openai/gpt-5-mini")
+ self.assertEqual(response.request_payload["model"], "openai/gpt-4o-mini")
self.assertEqual(response.request_payload["messages"], [{"role": "user", "content": "say hi"}])
+ async def test_generate_omits_unsupported_gpt5_temperature(self) -> None:
+     """A non-default temperature is left out of the call for a GPT-5 deployment."""
+     sent_kwargs: dict[str, object] = {}
+
+     async def record_completion(**kwargs):
+         # Remember what generate() forwarded, then answer with a minimal reply.
+         sent_kwargs.update(kwargs)
+         reply = {"role": "assistant", "content": "ok"}
+         return {"choices": [{"finish_reason": "stop", "message": reply}]}
+
+     litellm_stub = SimpleNamespace(acompletion=record_completion)
+     options = model_client.GenerateOptions(temperature=0.0)
+
+     with patch.object(model_client, "_get_litellm_module", return_value=litellm_stub):
+         await model_client.generate("azure/gpt-5.4-1", "say hi", options)
+
+     self.assertNotIn("temperature", sent_kwargs)
+
+ async def test_generate_keeps_default_gpt5_temperature(self) -> None:
+     """A temperature of 1.0 is still forwarded for a GPT-5 deployment."""
+     sent_kwargs: dict[str, object] = {}
+
+     async def record_completion(**kwargs):
+         # Remember what generate() forwarded, then answer with a minimal reply.
+         sent_kwargs.update(kwargs)
+         reply = {"role": "assistant", "content": "ok"}
+         return {"choices": [{"finish_reason": "stop", "message": reply}]}
+
+     litellm_stub = SimpleNamespace(acompletion=record_completion)
+     options = model_client.GenerateOptions(temperature=1.0)
+
+     with patch.object(model_client, "_get_litellm_module", return_value=litellm_stub):
+         await model_client.generate("azure/gpt-5.4-1", "say hi", options)
+
+     self.assertEqual(sent_kwargs["temperature"], 1.0)
+
+ async def test_generate_keeps_non_gpt5_temperature(self) -> None:
+     """A non-default temperature is forwarded unchanged for a non-GPT-5 deployment."""
+     sent_kwargs: dict[str, object] = {}
+
+     async def record_completion(**kwargs):
+         # Remember what generate() forwarded, then answer with a minimal reply.
+         sent_kwargs.update(kwargs)
+         reply = {"role": "assistant", "content": "ok"}
+         return {"choices": [{"finish_reason": "stop", "message": reply}]}
+
+     litellm_stub = SimpleNamespace(acompletion=record_completion)
+     options = model_client.GenerateOptions(temperature=0.0)
+
+     with patch.object(model_client, "_get_litellm_module", return_value=litellm_stub):
+         await model_client.generate("azure/gpt-4o-mini", "say hi", options)
+
+     self.assertEqual(sent_kwargs["temperature"], 0.0)
+
async def test_generate_structured_adds_json_schema_response_format(self) -> None:
captured: dict[str, object] = {}