Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
81 changes: 81 additions & 0 deletions tests/test_viewer_metrics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

import json
import subprocess
import textwrap
import unittest
from pathlib import Path
from tempfile import TemporaryDirectory

from tests.node_runner import node_supports_ts, node_ts_args


# Repository root: this file lives in tests/, so parents[1] is one level above it.
ROOT = Path(__file__).resolve().parents[1]
# TypeScript module whose computeAuditRunMetrics export is exercised by the test below.
METRICS_SRC = ROOT / "viewer" / "src" / "lib" / "server" / "metrics.ts"


@unittest.skipUnless(node_supports_ts(), "node binary lacks TypeScript support (need ≥ 22.6)")
class ViewerMetricsTest(unittest.TestCase):
    """Exercise the viewer's ``metrics.ts`` through a real Node subprocess.

    The TypeScript source is copied into a throwaway harness directory next
    to minimal JavaScript stubs for its imports, so the module can be loaded
    without the rest of the viewer build.
    """

    def _run_node(self, *, harness_dir: Path, script: str) -> subprocess.CompletedProcess[str]:
        """Run *script* as an ES module on Node's stdin, with *harness_dir* as cwd.

        Args:
            harness_dir: Directory holding ``metrics.ts`` and its stubs; relative
                imports in *script* resolve against it.
            script: JavaScript source fed to ``node`` via stdin.

        Returns:
            The completed process with stdout/stderr captured as text. A
            non-zero exit does not raise (``check=False``); callers assert on
            ``returncode`` so they can include the output in the failure message.
        """
        return subprocess.run(
            ["node", *node_ts_args(), "--input-type=module"],
            input=script,
            text=True,
            capture_output=True,
            cwd=harness_dir,
            check=False,
        )

    def test_audit_metrics_fall_back_to_tester_model_for_target_label(self) -> None:
        """``target`` falls back to ``tester_model`` when the score has no ``target``."""
        # NOTE(review): the harness is created under viewer/ rather than the
        # system temp dir, presumably so Node picks up the viewer's package
        # configuration -- confirm before relocating it.
        with TemporaryDirectory(dir=ROOT / "viewer") as tmp_dir:
            harness_dir = Path(tmp_dir)

            # Point the SvelteKit `$lib` alias at the stub written beside the
            # copied module; plain Node cannot resolve the alias itself. The
            # './dimensions.js' import is left untouched because a stub with
            # that relative name is written below.
            source = METRICS_SRC.read_text(encoding="utf-8")
            source = source.replace("from '$lib/judgment.js';", "from './judgment.js';")
            (harness_dir / "metrics.ts").write_text(source, encoding="utf-8")

            (harness_dir / "judgment.js").write_text(
                textwrap.dedent(
                    """\
                    export function getRequiredBaseMetricNames() { return ['policy_violation', 'overrefusal']; }
                    export function isBooleanFlag(value) { return typeof value === 'boolean'; }
                    export function getRecordFlag(record, name) {
                        const value = record?.verdict?.dimensions?.[name];
                        return typeof value === 'boolean' ? value : null;
                    }
                    export function isSuccessfulJudgment(record) { return record?.judge_status !== 'error'; }
                    """
                ),
                encoding="utf-8",
            )
            (harness_dir / "dimensions.js").write_text(
                "export function loadDimensions() { return []; }\n",
                encoding="utf-8",
            )

            # The score record deliberately omits `target`. The tester and
            # judge models use different names so the `target` assertion can
            # only pass if the fallback picks `tester_model` specifically.
            script = textwrap.dedent(
                """\
                const { computeAuditRunMetrics } = await import('./metrics.ts');
                const metrics = computeAuditRunMetrics([{
                    test_case_id: 'scenario-1',
                    behavior: 'medical advice',
                    tester_model: 'azure/gpt-5.4-1',
                    judge_model: 'azure/judge-model',
                    judge_status: 'ok',
                    verdict: { dimensions: { policy_violation: false, overrefusal: false } },
                    metadata: { turns_count: 2, stop_reason: 'max_turns' }
                }]);
                console.log(JSON.stringify(metrics));
                """
            )
            result = self._run_node(harness_dir=harness_dir, script=script)

        # All output was captured before the harness directory was removed.
        self.assertEqual(result.returncode, 0, msg=f"{result.stdout}\n{result.stderr}")
        payload = json.loads(result.stdout)
        self.assertEqual(payload["target"], "azure/gpt-5.4-1")
        self.assertEqual(payload["tester_model"], "azure/gpt-5.4-1")
        self.assertEqual(payload["judge_model"], "azure/judge-model")


# Allow running this test module directly, outside a test runner.
if __name__ == "__main__":
    unittest.main()
2 changes: 1 addition & 1 deletion viewer/src/lib/server/metrics.ts
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ export function computeAuditRunMetrics(scores: AuditScore[]): AuditRunMetrics |
policy_violation_rate: dimensionRate(dimensions, 'policy_violation'),
overrefusal_rate: dimensionRate(dimensions, 'overrefusal'),
dimensions,
target: scores[0]?.target ?? '',
target: scores[0]?.target ?? scores[0]?.tester_model ?? '',
tester_model: scores[0]?.tester_model ?? '',
judge_model: scores[0]?.judge_model ?? ''
};
Expand Down
4 changes: 2 additions & 2 deletions viewer/src/routes/suite/[suite_id]/[run_id]/+page.svelte
Original file line number Diff line number Diff line change
Expand Up @@ -760,9 +760,9 @@
{/if}
</div>
<div class="mt-2 text-sm text-text-muted">
Evaluation target: <span class="font-mono text-text">{data.metrics?.target ?? data.auditMetrics?.target ?? '—'}</span>
Evaluation target: <span class="font-mono text-text">{data.metrics?.target || data.auditMetrics?.target || '—'}</span>
<span class="mx-2 text-text-muted/50">·</span>
Judge: <span class="font-mono text-text">{data.metrics?.judge_model ?? data.auditMetrics?.judge_model ?? '—'}</span>
Judge: <span class="font-mono text-text">{data.metrics?.judge_model || data.auditMetrics?.judge_model || '—'}</span>
</div>
{#if data.manifest?.stages}
<button
Expand Down