diff --git a/python/tests/test_reference_solution_mode.py b/python/tests/test_reference_solution_mode.py index 9bfa62b..d179724 100644 --- a/python/tests/test_reference_solution_mode.py +++ b/python/tests/test_reference_solution_mode.py @@ -42,8 +42,8 @@ def _execution_test(test_id: str = "e-one") -> ExecutionTest: category="general", difficulty="basic", requirements=["Requirement"], - static_checks={"required_patterns": []}, - runtime_checks={"assertions": []}, + static_checks={"required_patterns": [{"pattern": "ref", "weight": 1}]}, + runtime_checks={"assertions": [{"type": "custom_assertion", "code": "return true;", "weight": 1}]}, reference_solution="function ref() { return true; }", metadata={}, ) diff --git a/python/tests/test_scoring.py b/python/tests/test_scoring.py new file mode 100644 index 0000000..24ec2ba --- /dev/null +++ b/python/tests/test_scoring.py @@ -0,0 +1,152 @@ +from __future__ import annotations + +from typing import Any, Dict + +from wp_bench.core import BenchmarkRunner +from wp_bench.datasets import ExecutionTest + + +def _make_test( + static_checks: Dict[str, Any] | None = None, + runtime_checks: Dict[str, Any] | None = None, +) -> ExecutionTest: + return ExecutionTest( + id="e-test-001", + suite="wp-core-v1", + prompt="Do something.", + expected_behavior="Reviewer contract: does something observable.", + test_type="execution", + category="hooks", + difficulty="intermediate", + requirements=[], + static_checks=static_checks or {}, + runtime_checks=runtime_checks or {}, + reference_solution=None, + metadata={}, + ) + + +def test_correctness_averages_static_and_runtime_dimensions() -> None: + test = _make_test( + static_checks={"required_patterns": [{"pattern": "add_filter", "weight": 1.0}]}, + runtime_checks={"assertions": [{"type": "hook_registered", "target": "x"}]}, + ) + raw = { + "static": {"score": 0.5}, + "runtime": {"score": 1.0, "details": {"total_weight": 1.0}}, + } + + assert BenchmarkRunner._score_correctness(raw, test) == 0.75 + + +def test_correctness_uses_only_runtime_when_no_static_checks() -> None: + test = _make_test( + runtime_checks={"assertions": [{"type": "hook_registered", "target": "x"}]}, + ) + # Runtime returns 1.0 for absent static checks, which must NOT inflate the score. + raw = { + "static": {"score": 1.0}, + "runtime": {"score": 0.4, "details": {"total_weight": 1.0}}, + } + + assert BenchmarkRunner._score_correctness(raw, test) == 0.4 + + +def test_correctness_uses_only_static_when_no_runtime_checks() -> None: + test = _make_test( + static_checks={"required_patterns": [{"pattern": "esc_html", "weight": 1.0}]}, + ) + # Runtime returns 0.0 when no assertions are defined; it must be ignored here. + raw = {"static": {"score": 0.8}, "runtime": {"score": 0.0}} + + assert BenchmarkRunner._score_correctness(raw, test) == 0.8 + + +def test_correctness_respects_forbidden_pattern_hard_fail() -> None: + test = _make_test( + static_checks={"forbidden_patterns": [{"pattern": "eval\\(", "severity": "error"}]}, + runtime_checks={"assertions": [{"type": "hook_registered", "target": "x"}]}, + ) + # Static hard-failed to 0.0 via a forbidden pattern; runtime passed fully. + raw = { + "static": {"score": 0.0}, + "runtime": {"score": 1.0, "details": {"total_weight": 1.0}}, + } + + assert BenchmarkRunner._score_correctness(raw, test) == 0.5 + + +def test_correctness_hard_zeroes_on_crash_despite_perfect_static() -> None: + test = _make_test( + static_checks={"required_patterns": [{"pattern": "add_filter", "weight": 1.0}]}, + runtime_checks={"assertions": [{"type": "hook_registered", "target": "x"}]}, + ) + # Static is perfect, but the code crashed before any assertion ran + # (runtime accumulated no weight). Static must not rescue unrunnable code. + raw = { + "static": {"score": 1.0}, + "runtime": { + "score": 0.0, + "details": { + "assertions": [{"type": "fatal_error", "passed": False}], + "total_weight": 0.0, + }, + }, + } + + assert BenchmarkRunner._score_correctness(raw, test) == 0.0 + + +def test_correctness_crash_detected_by_error_assertion_type() -> None: + test = _make_test( + runtime_checks={"assertions": [{"type": "hook_registered", "target": "x"}]}, + ) + # An execution_error entry signals a crash even if some weight accumulated. + raw = { + "runtime": { + "score": 0.5, + "details": { + "assertions": [ + {"type": "hook_registered", "passed": True, "weight": 1.0}, + {"type": "execution_error", "passed": False}, + ], + "total_weight": 1.0, + }, + }, + } + + assert BenchmarkRunner._score_correctness(raw, test) == 0.0 + + +def test_correctness_keeps_partial_credit_when_code_runs() -> None: + test = _make_test( + runtime_checks={ + "assertions": [ + {"type": "hook_registered", "target": "x"}, + {"type": "hook_registered", "target": "y"}, + ] + }, + ) + # Code ran fine but only half the assertions passed: partial credit stays. + raw = { + "runtime": { + "score": 0.5, + "details": { + "assertions": [ + {"type": "hook_registered", "passed": True, "weight": 1.0}, + {"type": "hook_registered", "passed": False, "weight": 1.0}, + ], + "total_weight": 2.0, + }, + }, + } + + assert BenchmarkRunner._score_correctness(raw, test) == 0.5 + + +def test_correctness_returns_zero_for_empty_raw() -> None: + test = _make_test( + runtime_checks={"assertions": [{"type": "hook_registered", "target": "x"}]}, + ) + + assert BenchmarkRunner._score_correctness({}, test) == 0.0 diff --git a/python/wp_bench/core.py b/python/wp_bench/core.py index 15187c4..5014081 100644 --- a/python/wp_bench/core.py +++ b/python/wp_bench/core.py @@ -6,6 +6,7 @@ from concurrent.futures import ThreadPoolExecutor, as_completed from datetime import datetime from pathlib import Path +from statistics import mean from typing import Any, Dict, List import orjson @@ -234,7 +235,7 @@ def process_test(test: ExecutionTest) -> Dict[str, Any]: "runtime_checks": test.runtime_checks, } env_result = self.environment.execute_code(code, verification_spec) - correctness = self._score_assertions(env_result.raw) + correctness = self._score_correctness(env_result.raw, test) return { "test_id": test.id, "type": "execution", @@ -278,7 +279,7 @@ def process_test(test: ExecutionTest) -> Dict[str, Any]: "runtime_checks": test.runtime_checks, } env_result = self.environment.execute_code(test.reference_solution, verification_spec) - correctness = self._score_assertions(env_result.raw) + correctness = self._score_correctness(env_result.raw, test) return { "test_id": test.id, "type": "execution", @@ -340,35 +341,87 @@ def _render_execution_prompt(test: ExecutionTest) -> str: return "\n".join(lines) @staticmethod - def _score_assertions(raw: Dict[str, Any]) -> float: - """Calculate correctness from static and runtime verifier scores. - - Only sections that actually defined checks contribute to the score. - An empty static (or runtime) section reports a perfect score of 1.0 - (or 0.0), so averaging it in unconditionally would inflate or deflate - correctness for tests that exercise only one section. + def _score_correctness(raw: Dict[str, Any], test: ExecutionTest) -> float: + """Combine static-analysis and runtime-assertion scores into correctness. + + Both sub-scores are already weighted by the WordPress runtime + (Static_Analysis::check and Sandbox::execute_and_verify), including the + per-pattern/per-assertion weights and the forbidden-pattern hard fail. + A dimension contributes only when the test actually defines checks for + it, so a test with only runtime assertions is scored purely on runtime, + and vice versa. Applicability is read from the test definition rather + than the runtime output, since a crash before assertions run leaves the + runtime weight at zero even though the dimension was meant to count. + + When the code is supposed to run but crashes (a fatal or execution + error before any assertion executes), correctness is forced to 0.0: + the runtime is ground truth, and static pattern matches must not rescue + code that does not run. This applies only to hard crashes, not to code + that runs but fails some assertions, which still earns partial credit. Args: - raw: Raw result dict from WordPress environment. + raw: Raw result dict from the WordPress runtime (static/runtime). + test: The execution test, used to know which dimensions apply. Returns: - Float between 0.0 and 1.0 representing combined verifier score. + Float between 0.0 and 1.0 averaging the applicable dimensions. """ + if not raw: + return 0.0 + + static_checks = test.static_checks or {} + runtime_checks = test.runtime_checks or {} + applicable = { + "static": bool( + static_checks.get("required_patterns") + or static_checks.get("forbidden_patterns") + ), + "runtime": bool(runtime_checks.get("assertions")), + } + + if applicable["runtime"] and BenchmarkRunner._runtime_crashed(raw): + return 0.0 + scores: List[float] = [] - for section in ("static", "runtime"): - result = raw.get(section) or {} - score = result.get("score") - total_weight = result.get("details", {}).get("total_weight", 0) - if isinstance(score, (int, float)) and total_weight: - scores.append(float(score)) - if scores: - return round(sum(scores) / len(scores), 4) - - assertions = raw.get("assertions") or [] - if not assertions: + for dimension, is_applicable in applicable.items(): + if not is_applicable: + continue + result = raw.get(dimension) + score = result.get("score") if isinstance(result, dict) else None + scores.append(float(score) if isinstance(score, (int, float)) else 0.0) + + if not scores: return 0.0 - passed = sum(1 for assertion in assertions if assertion.get("passed")) - return round(passed / len(assertions), 4) + return round(mean(scores), 4) + + @staticmethod + def _runtime_crashed(raw: Dict[str, Any]) -> bool: + """Detect a hard execution failure in the runtime result. + + Only call this when the test defines runtime assertions. A crash shows + up two ways: the assertion loop never accumulated weight (execution + threw before any assertion ran), or the runtime appended a synthetic + ``execution_error``/``fatal_error`` entry to the assertions. + + Args: + raw: Raw result dict from the WordPress runtime. + + Returns: + True if the code failed to run, as opposed to running but failing + some assertions. + """ + runtime = raw.get("runtime") + if not isinstance(runtime, dict): + return True + details = runtime.get("details") or {} + if not details.get("total_weight"): + return True + assertions = details.get("assertions") or [] + return any( + isinstance(assertion, dict) + and assertion.get("type") in {"execution_error", "fatal_error"} + for assertion in assertions + ) def _write_outputs(self, payload: Dict[str, Any]) -> None: """Write benchmark results to JSON and JSONL files. @@ -579,7 +632,7 @@ def process_test(test: ExecutionTest) -> Dict[str, Any]: "runtime_checks": test.runtime_checks, } env_result = self.environment.execute_code(code, verification_spec) - correctness = BenchmarkRunner._score_assertions(env_result.raw) + correctness = BenchmarkRunner._score_correctness(env_result.raw, test) return { "test_id": test.id, "type": "execution",