WordPress · lezama · Jun 16, 2026
diff --git a/python/tests/test_reference_solution_mode.py b/python/tests/test_reference_solution_mode.py
@@ -42,8 +42,8 @@ def _execution_test(test_id: str = "e-one") -> ExecutionTest:
         category="general",
         difficulty="basic",
         requirements=["Requirement"],
-        static_checks={"required_patterns": []},
-        runtime_checks={"assertions": []},
+        static_checks={"required_patterns": [{"pattern": "ref", "weight": 1}]},
+        runtime_checks={"assertions": [{"type": "custom_assertion", "code": "return true;", "weight": 1}]},
         reference_solution="function ref() { return true; }",
         metadata={},
     )

diff --git a/python/tests/test_scoring.py b/python/tests/test_scoring.py
@@ -0,0 +1,152 @@
+from __future__ import annotations
+
+from typing import Any, Dict
+
+from wp_bench.core import BenchmarkRunner
+from wp_bench.datasets import ExecutionTest
+
+
+def _make_test(
+    static_checks: Dict[str, Any] | None = None,
+    runtime_checks: Dict[str, Any] | None = None,
+) -> ExecutionTest:
+    """Build a minimal execution test whose only variable parts are its checks.
+
+    Args:
+        static_checks: Static-analysis spec; any falsy value becomes ``{}``.
+        runtime_checks: Runtime-assertion spec; any falsy value becomes ``{}``.
+
+    Returns:
+        An ``ExecutionTest`` with fixed placeholder fields and no reference
+        solution.
+    """
+    fixture_fields: Dict[str, Any] = {
+        "id": "e-test-001",
+        "suite": "wp-core-v1",
+        "prompt": "Do something.",
+        "expected_behavior": "Reviewer contract: does something observable.",
+        "test_type": "execution",
+        "category": "hooks",
+        "difficulty": "intermediate",
+        "requirements": [],
+        "static_checks": static_checks if static_checks else {},
+        "runtime_checks": runtime_checks if runtime_checks else {},
+        "reference_solution": None,
+        "metadata": {},
+    }
+    return ExecutionTest(**fixture_fields)
+
+
+def test_correctness_averages_static_and_runtime_dimensions() -> None:
+    """Both dimensions define checks, so correctness is the mean of the two scores."""
+    static_spec = {"required_patterns": [{"pattern": "add_filter", "weight": 1.0}]}
+    runtime_spec = {"assertions": [{"type": "hook_registered", "target": "x"}]}
+    case = _make_test(static_checks=static_spec, runtime_checks=runtime_spec)
+
+    verifier_output = {
+        "static": {"score": 0.5},
+        "runtime": {"score": 1.0, "details": {"total_weight": 1.0}},
+    }
+    combined = BenchmarkRunner._score_correctness(verifier_output, case)
+
+    assert combined == 0.75
+
+
+def test_correctness_uses_only_runtime_when_no_static_checks() -> None:
+    """Without static checks in the test, only the runtime score counts."""
+    runtime_spec = {"assertions": [{"type": "hook_registered", "target": "x"}]}
+    case = _make_test(runtime_checks=runtime_spec)
+
+    # The raw static score of 1.0 belongs to a dimension this test never
+    # defined, so it must NOT pull the result above the runtime score.
+    static_result = {"score": 1.0}
+    runtime_result = {"score": 0.4, "details": {"total_weight": 1.0}}
+    verifier_output = {"static": static_result, "runtime": runtime_result}
+
+    assert BenchmarkRunner._score_correctness(verifier_output, case) == 0.4
+
+
+def test_correctness_uses_only_static_when_no_runtime_checks() -> None:
+    """Without runtime assertions in the test, only the static score counts."""
+    static_spec = {"required_patterns": [{"pattern": "esc_html", "weight": 1.0}]}
+    case = _make_test(static_checks=static_spec)
+
+    # The raw runtime score of 0.0 belongs to a dimension this test never
+    # defined, so it must be left out of the result.
+    verifier_output = {
+        "static": {"score": 0.8},
+        "runtime": {"score": 0.0},
+    }
+    combined = BenchmarkRunner._score_correctness(verifier_output, case)
+
+    assert combined == 0.8
+
+
+def test_correctness_respects_forbidden_pattern_hard_fail() -> None:
+    """A static score of 0.0 is averaged with a full runtime pass, giving 0.5."""
+    forbidden = [{"pattern": "eval\\(", "severity": "error"}]
+    expected_hooks = [{"type": "hook_registered", "target": "x"}]
+    case = _make_test(
+        static_checks={"forbidden_patterns": forbidden},
+        runtime_checks={"assertions": expected_hooks},
+    )
+
+    # The static dimension arrives already hard-failed at 0.0 (forbidden
+    # pattern); the runtime dimension passed in full.
+    static_result = {"score": 0.0}
+    runtime_result = {"score": 1.0, "details": {"total_weight": 1.0}}
+    verifier_output = {"static": static_result, "runtime": runtime_result}
+
+    assert BenchmarkRunner._score_correctness(verifier_output, case) == 0.5
+
+
+def test_correctness_hard_zeroes_on_crash_despite_perfect_static() -> None:
+    """A crash with zero accumulated runtime weight forces correctness to 0.0."""
+    static_spec = {"required_patterns": [{"pattern": "add_filter", "weight": 1.0}]}
+    runtime_spec = {"assertions": [{"type": "hook_registered", "target": "x"}]}
+    case = _make_test(static_checks=static_spec, runtime_checks=runtime_spec)
+
+    # The runtime section reports a fatal error and no accumulated weight:
+    # the code crashed before any assertion ran. The perfect static score
+    # must not rescue unrunnable code.
+    crash_details = {
+        "assertions": [{"type": "fatal_error", "passed": False}],
+        "total_weight": 0.0,
+    }
+    verifier_output = {
+        "static": {"score": 1.0},
+        "runtime": {"score": 0.0, "details": crash_details},
+    }
+    combined = BenchmarkRunner._score_correctness(verifier_output, case)
+
+    assert combined == 0.0
+
+
+def test_correctness_crash_detected_by_error_assertion_type() -> None:
+    """An ``execution_error`` entry zeroes correctness despite accumulated weight."""
+    runtime_spec = {"assertions": [{"type": "hook_registered", "target": "x"}]}
+    case = _make_test(runtime_checks=runtime_spec)
+
+    # One assertion passed and contributed weight, but the synthetic
+    # execution_error entry still marks the run as crashed.
+    reported_assertions = [
+        {"type": "hook_registered", "passed": True, "weight": 1.0},
+        {"type": "execution_error", "passed": False},
+    ]
+    runtime_result = {
+        "score": 0.5,
+        "details": {"assertions": reported_assertions, "total_weight": 1.0},
+    }
+    verifier_output = {"runtime": runtime_result}
+
+    assert BenchmarkRunner._score_correctness(verifier_output, case) == 0.0
+
+
+def test_correctness_keeps_partial_credit_when_code_runs() -> None:
+    """Code that runs but fails half its assertions keeps its 0.5 runtime score."""
+    expected_hooks = [
+        {"type": "hook_registered", "target": "x"},
+        {"type": "hook_registered", "target": "y"},
+    ]
+    case = _make_test(runtime_checks={"assertions": expected_hooks})
+
+    # No crash marker and non-zero weight: an ordinary partial failure.
+    reported_assertions = [
+        {"type": "hook_registered", "passed": True, "weight": 1.0},
+        {"type": "hook_registered", "passed": False, "weight": 1.0},
+    ]
+    runtime_result = {
+        "score": 0.5,
+        "details": {"assertions": reported_assertions, "total_weight": 2.0},
+    }
+    combined = BenchmarkRunner._score_correctness({"runtime": runtime_result}, case)
+
+    assert combined == 0.5
+
+
+def test_correctness_returns_zero_for_empty_raw() -> None:
+    """An empty raw result scores 0.0 even when runtime assertions are defined."""
+    runtime_spec = {"assertions": [{"type": "hook_registered", "target": "x"}]}
+    case = _make_test(runtime_checks=runtime_spec)
+    empty_output: Dict[str, Any] = {}
+
+    assert BenchmarkRunner._score_correctness(empty_output, case) == 0.0
diff --git a/python/wp_bench/core.py b/python/wp_bench/core.py
@@ -6,6 +6,7 @@
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from datetime import datetime
 from pathlib import Path
+from statistics import mean
 from typing import Any, Dict, List
 
 import orjson
@@ -234,7 +235,7 @@ def process_test(test: ExecutionTest) -> Dict[str, Any]:
                     "runtime_checks": test.runtime_checks,
                 }
                 env_result = self.environment.execute_code(code, verification_spec)
-                correctness = self._score_assertions(env_result.raw)
+                correctness = self._score_correctness(env_result.raw, test)
                 return {
                     "test_id": test.id,
                     "type": "execution",
@@ -278,7 +279,7 @@ def process_test(test: ExecutionTest) -> Dict[str, Any]:
                     "runtime_checks": test.runtime_checks,
                 }
                 env_result = self.environment.execute_code(test.reference_solution, verification_spec)
-                correctness = self._score_assertions(env_result.raw)
+                correctness = self._score_correctness(env_result.raw, test)
                 return {
                     "test_id": test.id,
                     "type": "execution",
@@ -340,35 +341,87 @@ def _render_execution_prompt(test: ExecutionTest) -> str:
         return "\n".join(lines)
 
     @staticmethod
-    def _score_assertions(raw: Dict[str, Any]) -> float:
-        """Calculate correctness from static and runtime verifier scores.
-
-        Only sections that actually defined checks contribute to the score.
-        An empty static (or runtime) section reports a perfect score of 1.0
-        (or 0.0), so averaging it in unconditionally would inflate or deflate
-        correctness for tests that exercise only one section.
+    def _score_correctness(raw: Dict[str, Any], test: ExecutionTest) -> float:
+        """Combine static-analysis and runtime-assertion scores into correctness.
+
+        Both sub-scores are already weighted by the WordPress runtime
+        (Static_Analysis::check and Sandbox::execute_and_verify), including the
+        per-pattern/per-assertion weights and the forbidden-pattern hard fail.
+        A dimension contributes only when the test actually defines checks for
+        it, so a test with only runtime assertions is scored purely on runtime,
+        and vice versa. Applicability is read from the test definition rather
+        than the runtime output, since a crash before assertions run leaves the
+        runtime weight at zero even though the dimension was meant to count.
+
+        When the code is supposed to run but crashes, correctness is forced to
+        0.0: the runtime is ground truth, and static pattern matches must not
+        rescue code that does not run. A crash means the runtime section is
+        missing, no assertion weight accumulated, or the assertions contain an
+        ``execution_error``/``fatal_error`` entry. Code that runs but fails
+        some assertions is not a crash and still earns partial credit.
 
         Args:
-            raw: Raw result dict from WordPress environment.
+            raw: Raw result dict from the WordPress runtime (static/runtime).
+            test: The execution test, used to know which dimensions apply.
 
         Returns:
-            Float between 0.0 and 1.0 representing combined verifier score.
+            Mean of the applicable dimension scores, rounded to 4 decimals
+            (within 0.0-1.0 provided the runtime's sub-scores are). 0.0 when
+            ``raw`` is empty, on a crash, or when no dimension applies.
         """
+        # No verifier output at all: there is nothing to score.
+        if not raw:
+            return 0.0
+
+        static_checks = test.static_checks or {}
+        runtime_checks = test.runtime_checks or {}
+        # A dimension applies only if the test definition lists checks for it;
+        # empty lists and missing keys both count as "not defined".
+        applicable = {
+            "static": bool(
+                static_checks.get("required_patterns")
+                or static_checks.get("forbidden_patterns")
+            ),
+            "runtime": bool(runtime_checks.get("assertions")),
+        }
+
+        # The crash rule is checked before averaging so that a good static
+        # score cannot offset code that failed to execute.
+        if applicable["runtime"] and BenchmarkRunner._runtime_crashed(raw):
+            return 0.0
+
         scores: List[float] = []
-        for section in ("static", "runtime"):
-            result = raw.get(section) or {}
-            score = result.get("score")
-            total_weight = result.get("details", {}).get("total_weight", 0)
-            if isinstance(score, (int, float)) and total_weight:
-                scores.append(float(score))
-        if scores:
-            return round(sum(scores) / len(scores), 4)
-
-        assertions = raw.get("assertions") or []
-        if not assertions:
+        for dimension, is_applicable in applicable.items():
+            if not is_applicable:
+                continue
+            result = raw.get(dimension)
+            score = result.get("score") if isinstance(result, dict) else None
+            # An applicable dimension with a missing section or non-numeric
+            # score counts as 0.0 instead of being dropped from the average.
+            scores.append(float(score) if isinstance(score, (int, float)) else 0.0)
+
+        # The test defines neither static nor runtime checks.
+        if not scores:
             return 0.0
-        passed = sum(1 for assertion in assertions if assertion.get("passed"))
-        return round(passed / len(assertions), 4)
+        return round(mean(scores), 4)
+
+    @staticmethod
+    def _runtime_crashed(raw: Dict[str, Any]) -> bool:
+        """Detect a hard execution failure in the runtime result.
+
+        Only call this when the test defines runtime assertions. A crash shows
+        up two ways: the assertion loop never accumulated weight (execution
+        threw before any assertion ran), or the runtime appended a synthetic
+        ``execution_error``/``fatal_error`` entry to the assertions.
+
+        Malformed payloads are handled without raising: a ``runtime`` or
+        ``details`` value that is not a dict carries no usable weight and is
+        reported as a crash, and an ``assertions`` value that is not a
+        list/tuple is treated as having no entries.
+
+        Args:
+            raw: Raw result dict from the WordPress runtime.
+
+        Returns:
+            True if the code failed to run, as opposed to running but failing
+            some assertions.
+        """
+        runtime = raw.get("runtime")
+        if not isinstance(runtime, dict):
+            return True
+        details = runtime.get("details")
+        # A non-dict ``details`` (missing, None, or an unexpected list/string)
+        # cannot report accumulated weight, so it is scored like a crash
+        # instead of raising AttributeError on ``.get``.
+        if not isinstance(details, dict) or not details.get("total_weight"):
+            return True
+        assertions = details.get("assertions")
+        if not isinstance(assertions, (list, tuple)):
+            # Weight accumulated and there is no readable crash marker.
+            return False
+        # A tuple (not a set) keeps the membership test safe even if a
+        # malformed entry carries an unhashable ``type`` value.
+        crash_types = ("execution_error", "fatal_error")
+        return any(
+            isinstance(entry, dict) and entry.get("type") in crash_types
+            for entry in assertions
+        )
 
     def _write_outputs(self, payload: Dict[str, Any]) -> None:
         """Write benchmark results to JSON and JSONL files.
@@ -579,7 +632,7 @@ def process_test(test: ExecutionTest) -> Dict[str, Any]:
                     "runtime_checks": test.runtime_checks,
                 }
                 env_result = self.environment.execute_code(code, verification_spec)
-                correctness = BenchmarkRunner._score_assertions(env_result.raw)
+                correctness = BenchmarkRunner._score_correctness(env_result.raw, test)
                 return {
                     "test_id": test.id,
                     "type": "execution",