Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 36 additions & 4 deletions p2m/core/model_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -311,6 +311,36 @@ def _supports_web_search_preview(model: str) -> bool:
return _model_family(model) in {"openai", "azure"}


def _model_name(model: str) -> str:
normalized = (model or "").strip().lower()
if "/" in normalized:
return normalized.split("/", 1)[1]
return normalized


def _supports_custom_temperature(model: str) -> bool:
    """Report whether ``model`` accepts a non-default temperature value.

    GPT-5.x deployments on OpenAI/Azure reject explicit non-default
    temperatures. LiteLLM's Azure model string carries the deployment name
    after ``azure/`` rather than an exact public model ID, so the check is a
    prefix match on the name that follows the provider segment.

    Returns:
        ``False`` when the provider-stripped name starts with ``gpt-5``,
        ``True`` otherwise.
    """
    bare_name = _model_name(model)
    return not bare_name.startswith("gpt-5")


def _temperature_for_payload(model: str, temperature: float | None) -> float | None:
if temperature is None or temperature == 1:
return temperature
if _supports_custom_temperature(model):
return temperature
log.warning(
"Model %s only supports the default temperature; ignoring configured temperature=%s",
model,
temperature,
)
return None


def _require_web_search_preview_support(model: str) -> None:
if _supports_web_search_preview(model):
return
Expand Down Expand Up @@ -493,8 +523,9 @@ def _build_chat_payload(
"model": model,
"messages": messages_to_openai(messages),
}
if resolved_options.temperature is not None:
payload["temperature"] = resolved_options.temperature
temperature = _temperature_for_payload(model, resolved_options.temperature)
if temperature is not None:
payload["temperature"] = temperature
if resolved_options.max_tokens is not None:
payload["max_tokens"] = resolved_options.max_tokens
if resolved_options.max_output_tokens is not None and "max_tokens" not in payload:
Expand All @@ -519,8 +550,9 @@ def _build_responses_payload(
"model": model,
"input": input_payload,
}
if resolved_options.temperature is not None:
payload["temperature"] = resolved_options.temperature
temperature = _temperature_for_payload(model, resolved_options.temperature)
if temperature is not None:
payload["temperature"] = temperature
Comment on lines +553 to +555
if resolved_options.max_output_tokens is not None:
payload["max_output_tokens"] = resolved_options.max_output_tokens
elif resolved_options.max_tokens is not None:
Expand Down
84 changes: 81 additions & 3 deletions tests/test_model_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ async def fake_acompletion(**kwargs):
captured.update(kwargs)
return {
"id": "resp-chat-1",
"model": "openai/gpt-5-mini",
"model": "openai/gpt-4o-mini",
"choices": [
{
"finish_reason": "stop",
Expand All @@ -36,7 +36,7 @@ async def fake_acompletion(**kwargs):

with patch.object(model_client, "_get_litellm_module", return_value=fake_litellm):
response = await model_client.generate(
"openai/gpt-5-mini",
"openai/gpt-4o-mini",
"say hi",
options,
)
Expand All @@ -51,9 +51,87 @@ async def fake_acompletion(**kwargs):
self.assertEqual(response.finish_reason, "stop")
self.assertEqual(response.usage.total_tokens, 18)
self.assertEqual(response.api_mode, "chat_completion")
self.assertEqual(response.request_payload["model"], "openai/gpt-5-mini")
self.assertEqual(response.request_payload["model"], "openai/gpt-4o-mini")
self.assertEqual(response.request_payload["messages"], [{"role": "user", "content": "say hi"}])

async def test_generate_omits_unsupported_gpt5_temperature(self) -> None:
    """A temperature of 0.0 must not reach the completion call for a GPT-5 deployment."""
    sent_kwargs: dict[str, object] = {}

    async def fake_acompletion(**kwargs):
        # Record the keyword arguments the stub receives, then answer with
        # a minimal single-choice completion.
        sent_kwargs.update(kwargs)
        reply = {"role": "assistant", "content": "ok"}
        return {"choices": [{"finish_reason": "stop", "message": reply}]}

    litellm_stub = SimpleNamespace(acompletion=fake_acompletion)
    options = model_client.GenerateOptions(temperature=0.0)

    with patch.object(model_client, "_get_litellm_module", return_value=litellm_stub):
        await model_client.generate("azure/gpt-5.4-1", "say hi", options)

    self.assertNotIn("temperature", sent_kwargs)

async def test_generate_keeps_default_gpt5_temperature(self) -> None:
    """A temperature of 1.0 is still passed through for a GPT-5 deployment."""
    sent_kwargs: dict[str, object] = {}

    async def fake_acompletion(**kwargs):
        # Record the keyword arguments the stub receives, then answer with
        # a minimal single-choice completion.
        sent_kwargs.update(kwargs)
        reply = {"role": "assistant", "content": "ok"}
        return {"choices": [{"finish_reason": "stop", "message": reply}]}

    litellm_stub = SimpleNamespace(acompletion=fake_acompletion)
    options = model_client.GenerateOptions(temperature=1.0)

    with patch.object(model_client, "_get_litellm_module", return_value=litellm_stub):
        await model_client.generate("azure/gpt-5.4-1", "say hi", options)

    self.assertEqual(sent_kwargs["temperature"], 1.0)

async def test_generate_keeps_non_gpt5_temperature(self) -> None:
    """A temperature of 0.0 is passed through for a non-GPT-5 Azure deployment."""
    sent_kwargs: dict[str, object] = {}

    async def fake_acompletion(**kwargs):
        # Record the keyword arguments the stub receives, then answer with
        # a minimal single-choice completion.
        sent_kwargs.update(kwargs)
        reply = {"role": "assistant", "content": "ok"}
        return {"choices": [{"finish_reason": "stop", "message": reply}]}

    litellm_stub = SimpleNamespace(acompletion=fake_acompletion)
    options = model_client.GenerateOptions(temperature=0.0)

    with patch.object(model_client, "_get_litellm_module", return_value=litellm_stub):
        await model_client.generate("azure/gpt-4o-mini", "say hi", options)

    self.assertEqual(sent_kwargs["temperature"], 0.0)

async def test_generate_structured_adds_json_schema_response_format(self) -> None:
captured: dict[str, object] = {}

Expand Down
190 changes: 190 additions & 0 deletions tests/test_viewer_server_artifacts.py
Original file line number Diff line number Diff line change
Expand Up @@ -1723,6 +1723,194 @@ def test_list_suites_marks_scenario_only_scored_suite_as_has_results(self) -> No
self.assertEqual(payload[0]["status"], "has_results")


def test_run_metrics_policy_violation_by_permissibility(self) -> None:
    """Check the per-permissibility policy-violation buckets from ``computeRunMetrics``.

    Two samples are scored in Node against one permissible and one
    non-permissible behavior; the same samples are scored again with an
    empty behavior list, for which both buckets are expected to be null.
    """
    with TemporaryDirectory(dir=ROOT / "viewer") as scratch:
        harness_dir = Path(scratch) / "harness"
        harness_dir.mkdir()
        self._copy_data_harness(harness_dir)
        metrics_path = harness_dir / "metrics.ts"

        # The script body is indented uniformly; textwrap.dedent removes
        # that common margin before the text is handed to Node.
        node_script = textwrap.dedent(
            f"""\
            const {{ computeRunMetrics }} = await import({json.dumps(metrics_path.as_uri())});
            const behaviors = [
            {{ name: 'perm_a', definition: '', examples: [], permissible: true }},
            {{ name: 'notperm_a', definition: '', examples: [], permissible: false }}
            ];
            const samples = [
            {{
            test_case_id: 'seed-1',
            prompt: 'prompt',
            response: 'response',
            behavior: 'perm_a',
            target: 'target-model',
            judge_model: 'judge-model',
            judge_status: 'ok',
            verdict: {{
            dimensions: {{ policy_violation: true, overrefusal: true }},
            justification: '',
            node_judgments: [
            {{ node_index: 0, node_name: 'perm_a', relevant: true, violated: true, confidence: 'high', evidence_turns: [], reasoning: '' }},
            {{ node_index: 1, node_name: 'notperm_a', relevant: false, violated: null, confidence: null, evidence_turns: [], reasoning: '' }}
            ]
            }}
            }},
            {{
            test_case_id: 'seed-2',
            prompt: 'prompt',
            response: 'response',
            behavior: 'notperm_a',
            target: 'target-model',
            judge_model: 'judge-model',
            judge_status: 'ok',
            verdict: {{
            dimensions: {{ policy_violation: false, overrefusal: false }},
            justification: '',
            node_judgments: [
            {{ node_index: 0, node_name: 'perm_a', relevant: true, violated: false, confidence: 'high', evidence_turns: [], reasoning: '' }},
            {{ node_index: 1, node_name: 'notperm_a', relevant: true, violated: true, confidence: 'high', evidence_turns: [], reasoning: '' }},
            {{ node_index: 2, node_name: 'unknown_node', relevant: true, violated: true, confidence: 'high', evidence_turns: [], reasoning: '' }}
            ]
            }}
            }}
            ];
            const withBehaviors = computeRunMetrics(samples, behaviors);
            const withoutBehaviors = computeRunMetrics(samples, []);
            console.log(JSON.stringify({{ withBehaviors, withoutBehaviors }}));
            """
        )
        completed = self._run_node(
            harness_dir=harness_dir,
            script=node_script,
            env=os.environ.copy(),
        )
        self.assertEqual(completed.returncode, 0, msg=f"{completed.stdout}\n{completed.stderr}")
        report = json.loads(completed.stdout)

    scored = report["withBehaviors"]
    self.assertEqual(scored["scored_total"], 2)

    # Bucket for judgments on the behavior declared permissible.
    on_permissible = scored["policy_violation_on_permissible"]
    self.assertIsNotNone(on_permissible)
    self.assertEqual(on_permissible["count"], 2)
    self.assertEqual(on_permissible["flagged_count"], 1)
    self.assertEqual(on_permissible["clear_count"], 1)
    self.assertAlmostEqual(on_permissible["rate"], 0.5)

    # Bucket for judgments on the behavior declared non-permissible.
    on_not_permissible = scored["policy_violation_on_not_permissible"]
    self.assertIsNotNone(on_not_permissible)
    self.assertEqual(on_not_permissible["count"], 1)
    self.assertEqual(on_not_permissible["flagged_count"], 1)
    self.assertEqual(on_not_permissible["clear_count"], 0)
    self.assertAlmostEqual(on_not_permissible["rate"], 1.0)

    # With no behavior list supplied, neither bucket is reported.
    unscored = report["withoutBehaviors"]
    self.assertIsNone(unscored["policy_violation_on_permissible"])
    self.assertIsNone(unscored["policy_violation_on_not_permissible"])

def test_audit_run_metrics_policy_violation_by_permissibility(self) -> None:
    """Check the per-permissibility policy-violation buckets from ``computeAuditRunMetrics``.

    A single scored scenario carries one violated judgment on the
    non-permissible behavior and one clear judgment on the permissible one.
    """
    with TemporaryDirectory(dir=ROOT / "viewer") as scratch:
        harness_dir = Path(scratch) / "harness"
        harness_dir.mkdir()
        self._copy_data_harness(harness_dir)
        metrics_path = harness_dir / "metrics.ts"

        # The script body is indented uniformly; textwrap.dedent removes
        # that common margin before the text is handed to Node.
        node_script = textwrap.dedent(
            f"""\
            const {{ computeAuditRunMetrics }} = await import({json.dumps(metrics_path.as_uri())});
            const behaviors = [
            {{ name: 'perm_a', definition: '', examples: [], permissible: true }},
            {{ name: 'notperm_a', definition: '', examples: [], permissible: false }}
            ];
            const scores = [
            {{
            test_case_id: 'scenario-1',
            target: 'target-model',
            tester_model: 'tester-model',
            judge_model: 'judge-model',
            judge_status: 'ok',
            verdict: {{
            dimensions: {{ policy_violation: true, overrefusal: false }},
            justification: '',
            node_judgments: [
            {{ node_index: 0, node_name: 'notperm_a', relevant: true, violated: true, confidence: 'high', evidence_turns: [], reasoning: '' }},
            {{ node_index: 1, node_name: 'perm_a', relevant: true, violated: false, confidence: 'high', evidence_turns: [], reasoning: '' }}
            ]
            }}
            }}
            ];
            const metrics = computeAuditRunMetrics(scores, behaviors);
            console.log(JSON.stringify(metrics));
            """
        )
        completed = self._run_node(
            harness_dir=harness_dir,
            script=node_script,
            env=os.environ.copy(),
        )
        self.assertEqual(completed.returncode, 0, msg=f"{completed.stdout}\n{completed.stderr}")
        audit_metrics = json.loads(completed.stdout)

    self.assertEqual(audit_metrics["scored_total"], 1)

    # Bucket for judgments on the behavior declared permissible.
    on_permissible = audit_metrics["policy_violation_on_permissible"]
    self.assertIsNotNone(on_permissible)
    self.assertEqual(on_permissible["count"], 1)
    self.assertEqual(on_permissible["flagged_count"], 0)
    self.assertEqual(on_permissible["clear_count"], 1)
    self.assertAlmostEqual(on_permissible["rate"], 0.0)

    # Bucket for judgments on the behavior declared non-permissible.
    on_not_permissible = audit_metrics["policy_violation_on_not_permissible"]
    self.assertIsNotNone(on_not_permissible)
    self.assertEqual(on_not_permissible["count"], 1)
    self.assertEqual(on_not_permissible["flagged_count"], 1)
    self.assertEqual(on_not_permissible["clear_count"], 0)
    self.assertAlmostEqual(on_not_permissible["rate"], 1.0)

def test_load_run_judge_taxonomy_prefers_run_config(self) -> None:
    """Check that the judge-taxonomy loaders resolve the override taxonomy.

    A ``taxonomy.override.json`` is written into a suite directory and a
    run-level ``config.yaml`` points ``pipeline.judge.taxonomy_path`` at it.
    With ``ARTIFACTS_ROOT`` set to the temporary directory, the Node script
    loads the taxonomy by suite/run name, from an inline config object, and
    from artifact metadata; each result must equal the written taxonomy. A
    config with an empty ``judge`` section must produce null.
    """
    with TemporaryDirectory(dir=ROOT / "viewer") as tmp_dir:
        harness_dir = Path(tmp_dir) / "harness"
        harness_dir.mkdir()
        self._copy_data_harness(harness_dir)
        artifacts_path = harness_dir / "artifacts.ts"

        suite_dir = Path(tmp_dir) / "demo-suite"
        run_dir = suite_dir / "demo-run"
        run_dir.mkdir(parents=True)
        judge_taxonomy = {
            "behavior": {"name": "demo", "definition": "demo"},
            "behavior_categories": [
                {"name": "judge_only", "definition": "", "examples": [], "permissible": False}
            ],
        }
        taxonomy_path = suite_dir / "taxonomy.override.json"
        taxonomy_path.write_text(json.dumps(judge_taxonomy), encoding="utf-8")
        # YAML nesting is indentation-based: ``judge`` must sit deeper than
        # ``pipeline`` and ``taxonomy_path`` deeper than ``judge``. Equal
        # one-space indents would make the two keys siblings and leave
        # ``pipeline.judge`` empty.
        (run_dir / "config.yaml").write_text(
            f"pipeline:\n  judge:\n    taxonomy_path: {taxonomy_path}\n",
            encoding="utf-8",
        )

        # The script body is indented uniformly; textwrap.dedent removes
        # that common margin before the text is handed to Node.
        script = textwrap.dedent(
            f"""\
            const {{ loadRunJudgeTaxonomyForRun, loadRunJudgeTaxonomy, loadRunJudgeTaxonomyFromArtifacts }} = await import({json.dumps(artifacts_path.as_uri())});
            const fromRun = loadRunJudgeTaxonomyForRun('demo-suite', 'demo-run');
            const fromConfig = loadRunJudgeTaxonomy({{ pipeline: {{ judge: {{ taxonomy_path: {json.dumps(str(taxonomy_path))} }} }} }});
            const fromArtifact = loadRunJudgeTaxonomyFromArtifacts({{ suite: 'demo-suite' }}, {{ systematize: {{ path: 'taxonomy.override.json' }} }});
            const fromMissing = loadRunJudgeTaxonomy({{ pipeline: {{ judge: {{}} }} }});
            console.log(JSON.stringify({{ fromRun, fromConfig, fromArtifact, fromMissing }}));
            """
        )
        env = os.environ.copy()
        # Point the harness at the temporary tree that holds demo-suite.
        env["ARTIFACTS_ROOT"] = str(Path(tmp_dir))
        result = self._run_node(harness_dir=harness_dir, script=script, env=env)
        self.assertEqual(result.returncode, 0, msg=f"{result.stdout}\n{result.stderr}")
        payload = json.loads(result.stdout)

    self.assertEqual(payload["fromRun"], judge_taxonomy)
    self.assertEqual(payload["fromConfig"], judge_taxonomy)
    self.assertEqual(payload["fromArtifact"], judge_taxonomy)
    self.assertIsNone(payload["fromMissing"])



class ViewerReadModelHelpersTest(unittest.TestCase):
"""Tests for path-traversal defenses that don't depend on Node TS support."""

Expand Down Expand Up @@ -1822,5 +2010,7 @@ def test_test_set_artifact_path_rejects_paths_that_normalize_to_directory(self)
)




# Run the unittest runner when this module is executed directly as a script.
if __name__ == "__main__":
    unittest.main()
Loading
Loading