diff --git a/p2m/core/model_client.py b/p2m/core/model_client.py
index 5d9945ff..3cf11b8b 100644
--- a/p2m/core/model_client.py
+++ b/p2m/core/model_client.py
@@ -311,6 +311,36 @@ def _supports_web_search_preview(model: str) -> bool:
return _model_family(model) in {"openai", "azure"}
+def _model_name(model: str) -> str:
+ """Return the normalized model identifier without its leading ``prefix/`` part.
+
+ The input is stripped of surrounding whitespace and lower-cased first; a
+ falsy ``model`` (``None`` or ``""``) yields ``""``. When the normalized
+ string contains a ``/``, everything after the first ``/`` is returned
+ (presumably dropping a provider prefix such as ``azure/``); otherwise the
+ normalized string is returned unchanged.
+ """
+ normalized = (model or "").strip().lower()
+ if "/" in normalized:
+ # NOTE(review): only the first "/" is split on, so a multi-segment string
+ # such as "a/b/c" yields "b/c" -- confirm this is the intended behavior.
+ return normalized.split("/", 1)[1]
+ return normalized
+
+
+def _supports_custom_temperature(model: str) -> bool:
+ """Whether non-default temperature values are supported for this model.
+
+ GPT-5.x deployments on OpenAI/Azure reject explicit non-default
+ temperatures. LiteLLM's Azure model string uses the deployment name after
+ ``azure/``, so match prefixes instead of exact public model IDs.
+
+ Returns ``False`` when the name produced by ``_model_name(model)`` starts
+ with ``gpt-5`` and ``True`` for every other model string.
+ """
+ # NOTE(review): the part before the first "/" is never inspected here, so any
+ # prefix followed by a "gpt-5..." name is treated as unsupported, not only
+ # OpenAI/Azure ones -- confirm this is intended.
+ return not _model_name(model).startswith("gpt-5")
+
+
+def _temperature_for_payload(model: str, temperature: float | None) -> float | None:
+ """Return the temperature to put in the request payload, or ``None`` to omit it.
+
+ ``None`` and ``1`` are passed through unchanged for every model. Any other
+ value is returned as-is when ``_supports_custom_temperature(model)`` is
+ true; otherwise a warning is logged and ``None`` is returned.
+ """
+ # No filtering needed: nothing was configured, or the value equals 1, which
+ # this function treats as the default temperature.
+ if temperature is None or temperature == 1:
+ return temperature
+ if _supports_custom_temperature(model):
+ return temperature
+ # Unsupported non-default value: report it and drop it instead of sending it.
+ log.warning(
+ "Model %s only supports the default temperature; ignoring configured temperature=%s",
+ model,
+ temperature,
+ )
+ return None
+
+
def _require_web_search_preview_support(model: str) -> None:
if _supports_web_search_preview(model):
return
@@ -493,8 +523,9 @@ def _build_chat_payload(
"model": model,
"messages": messages_to_openai(messages),
}
- if resolved_options.temperature is not None:
- payload["temperature"] = resolved_options.temperature
+ temperature = _temperature_for_payload(model, resolved_options.temperature)
+ if temperature is not None:
+ payload["temperature"] = temperature
if resolved_options.max_tokens is not None:
payload["max_tokens"] = resolved_options.max_tokens
if resolved_options.max_output_tokens is not None and "max_tokens" not in payload:
@@ -519,8 +550,9 @@ def _build_responses_payload(
"model": model,
"input": input_payload,
}
- if resolved_options.temperature is not None:
- payload["temperature"] = resolved_options.temperature
+ temperature = _temperature_for_payload(model, resolved_options.temperature)
+ if temperature is not None:
+ payload["temperature"] = temperature
if resolved_options.max_output_tokens is not None:
payload["max_output_tokens"] = resolved_options.max_output_tokens
elif resolved_options.max_tokens is not None:
diff --git a/tests/test_model_client.py b/tests/test_model_client.py
index f9c45b41..21a5bdc9 100644
--- a/tests/test_model_client.py
+++ b/tests/test_model_client.py
@@ -13,7 +13,7 @@ async def fake_acompletion(**kwargs):
captured.update(kwargs)
return {
"id": "resp-chat-1",
- "model": "openai/gpt-5-mini",
+ "model": "openai/gpt-4o-mini",
"choices": [
{
"finish_reason": "stop",
@@ -36,7 +36,7 @@ async def fake_acompletion(**kwargs):
with patch.object(model_client, "_get_litellm_module", return_value=fake_litellm):
response = await model_client.generate(
- "openai/gpt-5-mini",
+ "openai/gpt-4o-mini",
"say hi",
options,
)
@@ -51,9 +51,87 @@ async def fake_acompletion(**kwargs):
self.assertEqual(response.finish_reason, "stop")
self.assertEqual(response.usage.total_tokens, 18)
self.assertEqual(response.api_mode, "chat_completion")
- self.assertEqual(response.request_payload["model"], "openai/gpt-5-mini")
+ self.assertEqual(response.request_payload["model"], "openai/gpt-4o-mini")
self.assertEqual(response.request_payload["messages"], [{"role": "user", "content": "say hi"}])
+ async def test_generate_omits_unsupported_gpt5_temperature(self) -> None:
+ """``temperature=0.0`` is not forwarded for the ``azure/gpt-5.4-1`` model."""
+ # Keyword arguments received by the fake LiteLLM ``acompletion`` call.
+ captured: dict[str, object] = {}
+
+ async def fake_acompletion(**kwargs):
+ captured.update(kwargs)
+ return {
+ "choices": [
+ {
+ "finish_reason": "stop",
+ "message": {"role": "assistant", "content": "ok"},
+ }
+ ]
+ }
+
+ fake_litellm = SimpleNamespace(acompletion=fake_acompletion)
+ options = model_client.GenerateOptions(temperature=0.0)
+
+ # Swap in the fake module so ``generate`` calls ``fake_acompletion``.
+ with patch.object(model_client, "_get_litellm_module", return_value=fake_litellm):
+ await model_client.generate(
+ "azure/gpt-5.4-1",
+ "say hi",
+ options,
+ )
+
+ self.assertNotIn("temperature", captured)
+
+ async def test_generate_keeps_default_gpt5_temperature(self) -> None:
+ """``temperature=1.0`` is still forwarded for the ``azure/gpt-5.4-1`` model."""
+ # Keyword arguments received by the fake LiteLLM ``acompletion`` call.
+ captured: dict[str, object] = {}
+
+ async def fake_acompletion(**kwargs):
+ captured.update(kwargs)
+ return {
+ "choices": [
+ {
+ "finish_reason": "stop",
+ "message": {"role": "assistant", "content": "ok"},
+ }
+ ]
+ }
+
+ fake_litellm = SimpleNamespace(acompletion=fake_acompletion)
+ options = model_client.GenerateOptions(temperature=1.0)
+
+ # Swap in the fake module so ``generate`` calls ``fake_acompletion``.
+ with patch.object(model_client, "_get_litellm_module", return_value=fake_litellm):
+ await model_client.generate(
+ "azure/gpt-5.4-1",
+ "say hi",
+ options,
+ )
+
+ self.assertEqual(captured["temperature"], 1.0)
+
+ async def test_generate_keeps_non_gpt5_temperature(self) -> None:
+ """``temperature=0.0`` is forwarded unchanged for the ``azure/gpt-4o-mini`` model."""
+ # Keyword arguments received by the fake LiteLLM ``acompletion`` call.
+ captured: dict[str, object] = {}
+
+ async def fake_acompletion(**kwargs):
+ captured.update(kwargs)
+ return {
+ "choices": [
+ {
+ "finish_reason": "stop",
+ "message": {"role": "assistant", "content": "ok"},
+ }
+ ]
+ }
+
+ fake_litellm = SimpleNamespace(acompletion=fake_acompletion)
+ options = model_client.GenerateOptions(temperature=0.0)
+
+ # Swap in the fake module so ``generate`` calls ``fake_acompletion``.
+ with patch.object(model_client, "_get_litellm_module", return_value=fake_litellm):
+ await model_client.generate(
+ "azure/gpt-4o-mini",
+ "say hi",
+ options,
+ )
+
+ self.assertEqual(captured["temperature"], 0.0)
+
async def test_generate_structured_adds_json_schema_response_format(self) -> None:
captured: dict[str, object] = {}
diff --git a/tests/test_viewer_server_artifacts.py b/tests/test_viewer_server_artifacts.py
index 8cbdb500..fb3910d3 100644
--- a/tests/test_viewer_server_artifacts.py
+++ b/tests/test_viewer_server_artifacts.py
@@ -1723,6 +1723,194 @@ def test_list_suites_marks_scenario_only_scored_suite_as_has_results(self) -> No
self.assertEqual(payload[0]["status"], "has_results")
+ def test_run_metrics_policy_violation_by_permissibility(self) -> None:
+ """Check the per-permissibility policy-violation buckets from ``computeRunMetrics``.
+
+ A Node script imports ``metrics.ts`` from a copied harness and calls
+ ``computeRunMetrics`` on two scored samples, once with a behavior list
+ (one permissible, one not) and once with an empty list. The assertions
+ pin the ``count`` / ``flagged_count`` / ``clear_count`` / ``rate`` values
+ of both buckets, and that both buckets are null without behaviors.
+ """
+ with TemporaryDirectory(dir=ROOT / "viewer") as tmp_dir:
+ harness_dir = Path(tmp_dir) / "harness"
+ harness_dir.mkdir()
+ self._copy_data_harness(harness_dir)
+ metrics_path = harness_dir / "metrics.ts"
+
+ # Doubled braces are literal JS braces inside the f-string; only the
+ # module URI is interpolated from Python.
+ script = textwrap.dedent(
+ f"""\
+ const {{ computeRunMetrics }} = await import({json.dumps(metrics_path.as_uri())});
+ const behaviors = [
+ {{ name: 'perm_a', definition: '', examples: [], permissible: true }},
+ {{ name: 'notperm_a', definition: '', examples: [], permissible: false }}
+ ];
+ const samples = [
+ {{
+ test_case_id: 'seed-1',
+ prompt: 'prompt',
+ response: 'response',
+ behavior: 'perm_a',
+ target: 'target-model',
+ judge_model: 'judge-model',
+ judge_status: 'ok',
+ verdict: {{
+ dimensions: {{ policy_violation: true, overrefusal: true }},
+ justification: '',
+ node_judgments: [
+ {{ node_index: 0, node_name: 'perm_a', relevant: true, violated: true, confidence: 'high', evidence_turns: [], reasoning: '' }},
+ {{ node_index: 1, node_name: 'notperm_a', relevant: false, violated: null, confidence: null, evidence_turns: [], reasoning: '' }}
+ ]
+ }}
+ }},
+ {{
+ test_case_id: 'seed-2',
+ prompt: 'prompt',
+ response: 'response',
+ behavior: 'notperm_a',
+ target: 'target-model',
+ judge_model: 'judge-model',
+ judge_status: 'ok',
+ verdict: {{
+ dimensions: {{ policy_violation: false, overrefusal: false }},
+ justification: '',
+ node_judgments: [
+ {{ node_index: 0, node_name: 'perm_a', relevant: true, violated: false, confidence: 'high', evidence_turns: [], reasoning: '' }},
+ {{ node_index: 1, node_name: 'notperm_a', relevant: true, violated: true, confidence: 'high', evidence_turns: [], reasoning: '' }},
+ {{ node_index: 2, node_name: 'unknown_node', relevant: true, violated: true, confidence: 'high', evidence_turns: [], reasoning: '' }}
+ ]
+ }}
+ }}
+ ];
+ const withBehaviors = computeRunMetrics(samples, behaviors);
+ const withoutBehaviors = computeRunMetrics(samples, []);
+ console.log(JSON.stringify({{ withBehaviors, withoutBehaviors }}));
+ """
+ )
+ result = self._run_node(
+ harness_dir=harness_dir,
+ script=script,
+ env=os.environ.copy(),
+ )
+ self.assertEqual(result.returncode, 0, msg=f"{result.stdout}\n{result.stderr}")
+ payload = json.loads(result.stdout)
+
+ with_behaviors = payload["withBehaviors"]
+ self.assertEqual(with_behaviors["scored_total"], 2)
+ # 'perm_a' is marked relevant in both samples and violated in one of them.
+ permissible = with_behaviors["policy_violation_on_permissible"]
+ self.assertIsNotNone(permissible)
+ self.assertEqual(permissible["count"], 2)
+ self.assertEqual(permissible["flagged_count"], 1)
+ self.assertEqual(permissible["clear_count"], 1)
+ self.assertAlmostEqual(permissible["rate"], 0.5)
+
+ # 'notperm_a' is relevant only in 'seed-2'. The expected count of 1 also
+ # implies the 'unknown_node' judgment (absent from `behaviors`) is not
+ # counted in this bucket.
+ not_permissible = with_behaviors["policy_violation_on_not_permissible"]
+ self.assertIsNotNone(not_permissible)
+ self.assertEqual(not_permissible["count"], 1)
+ self.assertEqual(not_permissible["flagged_count"], 1)
+ self.assertEqual(not_permissible["clear_count"], 0)
+ self.assertAlmostEqual(not_permissible["rate"], 1.0)
+
+ # With an empty behavior list both buckets are expected to be null.
+ without_behaviors = payload["withoutBehaviors"]
+ self.assertIsNone(without_behaviors["policy_violation_on_permissible"])
+ self.assertIsNone(without_behaviors["policy_violation_on_not_permissible"])
+
+ def test_audit_run_metrics_policy_violation_by_permissibility(self) -> None:
+ """Check the per-permissibility policy-violation buckets from ``computeAuditRunMetrics``.
+
+ A Node script imports ``metrics.ts`` from a copied harness and calls
+ ``computeAuditRunMetrics`` with one scored scenario whose verdict has a
+ violated 'notperm_a' judgment and a non-violated 'perm_a' judgment. The
+ assertions pin the resulting counts and rates of both buckets.
+ """
+ with TemporaryDirectory(dir=ROOT / "viewer") as tmp_dir:
+ harness_dir = Path(tmp_dir) / "harness"
+ harness_dir.mkdir()
+ self._copy_data_harness(harness_dir)
+ metrics_path = harness_dir / "metrics.ts"
+
+ # Doubled braces are literal JS braces inside the f-string; only the
+ # module URI is interpolated from Python.
+ script = textwrap.dedent(
+ f"""\
+ const {{ computeAuditRunMetrics }} = await import({json.dumps(metrics_path.as_uri())});
+ const behaviors = [
+ {{ name: 'perm_a', definition: '', examples: [], permissible: true }},
+ {{ name: 'notperm_a', definition: '', examples: [], permissible: false }}
+ ];
+ const scores = [
+ {{
+ test_case_id: 'scenario-1',
+ target: 'target-model',
+ tester_model: 'tester-model',
+ judge_model: 'judge-model',
+ judge_status: 'ok',
+ verdict: {{
+ dimensions: {{ policy_violation: true, overrefusal: false }},
+ justification: '',
+ node_judgments: [
+ {{ node_index: 0, node_name: 'notperm_a', relevant: true, violated: true, confidence: 'high', evidence_turns: [], reasoning: '' }},
+ {{ node_index: 1, node_name: 'perm_a', relevant: true, violated: false, confidence: 'high', evidence_turns: [], reasoning: '' }}
+ ]
+ }}
+ }}
+ ];
+ const metrics = computeAuditRunMetrics(scores, behaviors);
+ console.log(JSON.stringify(metrics));
+ """
+ )
+ result = self._run_node(
+ harness_dir=harness_dir,
+ script=script,
+ env=os.environ.copy(),
+ )
+ self.assertEqual(result.returncode, 0, msg=f"{result.stdout}\n{result.stderr}")
+ metrics = json.loads(result.stdout)
+
+ self.assertEqual(metrics["scored_total"], 1)
+ # 'perm_a' is relevant and not violated: one entry, none flagged.
+ permissible = metrics["policy_violation_on_permissible"]
+ self.assertIsNotNone(permissible)
+ self.assertEqual(permissible["count"], 1)
+ self.assertEqual(permissible["flagged_count"], 0)
+ self.assertEqual(permissible["clear_count"], 1)
+ self.assertAlmostEqual(permissible["rate"], 0.0)
+
+ # 'notperm_a' is relevant and violated: one entry, flagged.
+ not_permissible = metrics["policy_violation_on_not_permissible"]
+ self.assertIsNotNone(not_permissible)
+ self.assertEqual(not_permissible["count"], 1)
+ self.assertEqual(not_permissible["flagged_count"], 1)
+ self.assertEqual(not_permissible["clear_count"], 0)
+ self.assertAlmostEqual(not_permissible["rate"], 1.0)
+
+ def test_load_run_judge_taxonomy_prefers_run_config(self) -> None:
+ """Check that the judge-taxonomy loaders in ``artifacts.ts`` return the override file.
+
+ The test writes ``taxonomy.override.json`` under ``demo-suite`` and a
+ ``config.yaml`` under ``demo-suite/demo-run`` whose judge section points
+ at that file. A Node script then calls ``loadRunJudgeTaxonomyForRun``,
+ ``loadRunJudgeTaxonomy`` and ``loadRunJudgeTaxonomyFromArtifacts``; each
+ is expected to return the written taxonomy, and a config without a
+ taxonomy path is expected to yield null.
+ """
+ with TemporaryDirectory(dir=ROOT / "viewer") as tmp_dir:
+ harness_dir = Path(tmp_dir) / "harness"
+ harness_dir.mkdir()
+ self._copy_data_harness(harness_dir)
+ artifacts_path = harness_dir / "artifacts.ts"
+
+ suite_dir = Path(tmp_dir) / "demo-suite"
+ run_dir = suite_dir / "demo-run"
+ run_dir.mkdir(parents=True)
+ judge_taxonomy = {
+ "behavior": {"name": "demo", "definition": "demo"},
+ "behavior_categories": [
+ {"name": "judge_only", "definition": "", "examples": [], "permissible": False}
+ ],
+ }
+ taxonomy_path = suite_dir / "taxonomy.override.json"
+ taxonomy_path.write_text(json.dumps(judge_taxonomy), encoding="utf-8")
+ # The run config references the taxonomy by its absolute path.
+ (run_dir / "config.yaml").write_text(
+ f"pipeline:\n judge:\n taxonomy_path: {taxonomy_path}\n",
+ encoding="utf-8",
+ )
+
+ # Doubled braces are literal JS braces inside the f-string; the module
+ # URI and the taxonomy path are interpolated as JSON string literals.
+ script = textwrap.dedent(
+ f"""\
+ const {{ loadRunJudgeTaxonomyForRun, loadRunJudgeTaxonomy, loadRunJudgeTaxonomyFromArtifacts }} = await import({json.dumps(artifacts_path.as_uri())});
+ const fromRun = loadRunJudgeTaxonomyForRun('demo-suite', 'demo-run');
+ const fromConfig = loadRunJudgeTaxonomy({{ pipeline: {{ judge: {{ taxonomy_path: {json.dumps(str(taxonomy_path))} }} }} }});
+ const fromArtifact = loadRunJudgeTaxonomyFromArtifacts({{ suite: 'demo-suite' }}, {{ systematize: {{ path: 'taxonomy.override.json' }} }});
+ const fromMissing = loadRunJudgeTaxonomy({{ pipeline: {{ judge: {{}} }} }});
+ console.log(JSON.stringify({{ fromRun, fromConfig, fromArtifact, fromMissing }}));
+ """
+ )
+ env = os.environ.copy()
+ # Point the Node process at the temporary directory that holds 'demo-suite'.
+ env["ARTIFACTS_ROOT"] = str(Path(tmp_dir))
+ result = self._run_node(harness_dir=harness_dir, script=script, env=env)
+ self.assertEqual(result.returncode, 0, msg=f"{result.stdout}\n{result.stderr}")
+ payload = json.loads(result.stdout)
+
+ self.assertEqual(payload["fromRun"], judge_taxonomy)
+ self.assertEqual(payload["fromConfig"], judge_taxonomy)
+ self.assertEqual(payload["fromArtifact"], judge_taxonomy)
+ self.assertIsNone(payload["fromMissing"])
+
+
+
class ViewerReadModelHelpersTest(unittest.TestCase):
"""Tests for path-traversal defenses that don't depend on Node TS support."""
@@ -1822,5 +2010,7 @@ def test_test_set_artifact_path_rejects_paths_that_normalize_to_directory(self)
)
+
+
if __name__ == "__main__":
unittest.main()
diff --git a/viewer/src/lib/server/artifacts.ts b/viewer/src/lib/server/artifacts.ts
index 65a023c5..83c55c66 100644
--- a/viewer/src/lib/server/artifacts.ts
+++ b/viewer/src/lib/server/artifacts.ts
@@ -556,6 +556,47 @@ function readObject(value: unknown): Record
{card.hint}
+{card.hint}
+