Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions python/tests/test_reference_solution_mode.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,8 @@ def _execution_test(test_id: str = "e-one") -> ExecutionTest:
category="general",
difficulty="basic",
requirements=["Requirement"],
static_checks={"required_patterns": []},
runtime_checks={"assertions": []},
static_checks={"required_patterns": [{"pattern": "ref", "weight": 1}]},
runtime_checks={"assertions": [{"type": "custom_assertion", "code": "return true;", "weight": 1}]},
reference_solution="function ref() { return true; }",
metadata={},
)
Expand Down
152 changes: 152 additions & 0 deletions python/tests/test_scoring.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
from __future__ import annotations

from typing import Any, Dict

from wp_bench.core import BenchmarkRunner
from wp_bench.datasets import ExecutionTest


def _make_test(
    static_checks: Dict[str, Any] | None = None,
    runtime_checks: Dict[str, Any] | None = None,
) -> ExecutionTest:
    """Build a minimal execution test whose only variable parts are its checks.

    Args:
        static_checks: Static-analysis check definitions; a falsy value
            becomes an empty dict.
        runtime_checks: Runtime assertion definitions; a falsy value
            becomes an empty dict.

    Returns:
        An ``ExecutionTest`` with fixed placeholder metadata and the given
        check definitions.
    """
    fields: Dict[str, Any] = {
        "id": "e-test-001",
        "suite": "wp-core-v1",
        "prompt": "Do something.",
        "expected_behavior": "Reviewer contract: does something observable.",
        "test_type": "execution",
        "category": "hooks",
        "difficulty": "intermediate",
        "requirements": [],
        "static_checks": static_checks if static_checks else {},
        "runtime_checks": runtime_checks if runtime_checks else {},
        "reference_solution": None,
        "metadata": {},
    }
    return ExecutionTest(**fields)


def test_correctness_averages_static_and_runtime_dimensions() -> None:
    """Static 0.5 and runtime 1.0, both defined by the test, score 0.75."""
    required = [{"pattern": "add_filter", "weight": 1.0}]
    hook_assertions = [{"type": "hook_registered", "target": "x"}]
    case = _make_test(
        static_checks={"required_patterns": required},
        runtime_checks={"assertions": hook_assertions},
    )
    static_result = {"score": 0.5}
    runtime_result = {"score": 1.0, "details": {"total_weight": 1.0}}
    payload = {"static": static_result, "runtime": runtime_result}

    correctness = BenchmarkRunner._score_correctness(payload, case)

    assert correctness == 0.75


def test_correctness_uses_only_runtime_when_no_static_checks() -> None:
    """A test without static checks is scored on its runtime score alone."""
    case = _make_test(
        runtime_checks={"assertions": [{"type": "hook_registered", "target": "x"}]},
    )
    # The runtime reports a static score of 1.0 when no static checks exist;
    # that placeholder must NOT be averaged in and inflate the result.
    static_result = {"score": 1.0}
    runtime_result = {"score": 0.4, "details": {"total_weight": 1.0}}
    payload = {"static": static_result, "runtime": runtime_result}

    correctness = BenchmarkRunner._score_correctness(payload, case)

    assert correctness == 0.4


def test_correctness_uses_only_static_when_no_runtime_checks() -> None:
    """A test without runtime assertions is scored on its static score alone."""
    required = [{"pattern": "esc_html", "weight": 1.0}]
    case = _make_test(static_checks={"required_patterns": required})
    # With no assertions defined the runtime reports 0.0; that value must be
    # ignored rather than dragging the average down.
    payload = {
        "static": {"score": 0.8},
        "runtime": {"score": 0.0},
    }

    correctness = BenchmarkRunner._score_correctness(payload, case)

    assert correctness == 0.8


def test_correctness_respects_forbidden_pattern_hard_fail() -> None:
    """A static score of 0.0 averaged with a runtime score of 1.0 gives 0.5."""
    forbidden = [{"pattern": "eval\\(", "severity": "error"}]
    hook_assertions = [{"type": "hook_registered", "target": "x"}]
    case = _make_test(
        static_checks={"forbidden_patterns": forbidden},
        runtime_checks={"assertions": hook_assertions},
    )
    # The forbidden pattern hard-failed the static section to 0.0 while the
    # runtime section passed fully.
    static_result = {"score": 0.0}
    runtime_result = {"score": 1.0, "details": {"total_weight": 1.0}}
    payload = {"static": static_result, "runtime": runtime_result}

    correctness = BenchmarkRunner._score_correctness(payload, case)

    assert correctness == 0.5


def test_correctness_hard_zeroes_on_crash_despite_perfect_static() -> None:
    """A crash before any assertion ran forces 0.0 even with static at 1.0."""
    required = [{"pattern": "add_filter", "weight": 1.0}]
    hook_assertions = [{"type": "hook_registered", "target": "x"}]
    case = _make_test(
        static_checks={"required_patterns": required},
        runtime_checks={"assertions": hook_assertions},
    )
    # The runtime accumulated no weight because the code crashed before any
    # assertion ran. A perfect static score must not rescue unrunnable code.
    crash_details = {
        "assertions": [{"type": "fatal_error", "passed": False}],
        "total_weight": 0.0,
    }
    payload = {
        "static": {"score": 1.0},
        "runtime": {"score": 0.0, "details": crash_details},
    }

    correctness = BenchmarkRunner._score_correctness(payload, case)

    assert correctness == 0.0


def test_correctness_crash_detected_by_error_assertion_type() -> None:
    """An ``execution_error`` entry forces 0.0 even when weight accumulated."""
    case = _make_test(
        runtime_checks={"assertions": [{"type": "hook_registered", "target": "x"}]},
    )
    # One assertion passed and contributed weight, but the execution_error
    # entry still marks the run as crashed.
    recorded = [
        {"type": "hook_registered", "passed": True, "weight": 1.0},
        {"type": "execution_error", "passed": False},
    ]
    runtime_result = {
        "score": 0.5,
        "details": {"assertions": recorded, "total_weight": 1.0},
    }
    payload = {"runtime": runtime_result}

    correctness = BenchmarkRunner._score_correctness(payload, case)

    assert correctness == 0.0


def test_correctness_keeps_partial_credit_when_code_runs() -> None:
    """Code that runs but fails half its assertions keeps its 0.5 score."""
    defined = [
        {"type": "hook_registered", "target": "x"},
        {"type": "hook_registered", "target": "y"},
    ]
    case = _make_test(runtime_checks={"assertions": defined})
    # No crash marker and non-zero weight: the run completed, so the partial
    # runtime score is reported unchanged.
    recorded = [
        {"type": "hook_registered", "passed": True, "weight": 1.0},
        {"type": "hook_registered", "passed": False, "weight": 1.0},
    ]
    runtime_result = {
        "score": 0.5,
        "details": {"assertions": recorded, "total_weight": 2.0},
    }
    payload = {"runtime": runtime_result}

    correctness = BenchmarkRunner._score_correctness(payload, case)

    assert correctness == 0.5


def test_correctness_returns_zero_for_empty_raw() -> None:
    """An empty raw result scores 0.0 even when runtime assertions are defined."""
    hook_assertions = [{"type": "hook_registered", "target": "x"}]
    case = _make_test(runtime_checks={"assertions": hook_assertions})
    empty_payload: Dict[str, Any] = {}

    correctness = BenchmarkRunner._score_correctness(empty_payload, case)

    assert correctness == 0.0
103 changes: 78 additions & 25 deletions python/wp_bench/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime
from pathlib import Path
from statistics import mean
from typing import Any, Dict, List

import orjson
Expand Down Expand Up @@ -234,7 +235,7 @@ def process_test(test: ExecutionTest) -> Dict[str, Any]:
"runtime_checks": test.runtime_checks,
}
env_result = self.environment.execute_code(code, verification_spec)
correctness = self._score_assertions(env_result.raw)
correctness = self._score_correctness(env_result.raw, test)
return {
"test_id": test.id,
"type": "execution",
Expand Down Expand Up @@ -278,7 +279,7 @@ def process_test(test: ExecutionTest) -> Dict[str, Any]:
"runtime_checks": test.runtime_checks,
}
env_result = self.environment.execute_code(test.reference_solution, verification_spec)
correctness = self._score_assertions(env_result.raw)
correctness = self._score_correctness(env_result.raw, test)
return {
"test_id": test.id,
"type": "execution",
Expand Down Expand Up @@ -340,35 +341,87 @@ def _render_execution_prompt(test: ExecutionTest) -> str:
return "\n".join(lines)

@staticmethod
def _score_correctness(raw: Dict[str, Any], test: ExecutionTest) -> float:
    """Combine static-analysis and runtime-assertion scores into correctness.

    Both sub-scores are already weighted by the WordPress runtime
    (Static_Analysis::check and Sandbox::execute_and_verify), including the
    per-pattern/per-assertion weights and the forbidden-pattern hard fail.
    A dimension contributes only when the test actually defines checks for
    it, so a test with only runtime assertions is scored purely on runtime,
    and vice versa. Applicability is read from the test definition rather
    than the runtime output, since a crash before assertions run leaves the
    runtime weight at zero even though the dimension was meant to count.

    When the code is supposed to run but crashes (a fatal or execution
    error before any assertion executes), correctness is forced to 0.0:
    the runtime is ground truth, and static pattern matches must not rescue
    code that does not run. This applies only to hard crashes, not to code
    that runs but fails some assertions, which still earns partial credit.

    Args:
        raw: Raw result dict from the WordPress runtime (static/runtime).
        test: The execution test, used to know which dimensions apply.

    Returns:
        The mean of the applicable dimension scores rounded to 4 decimals,
        or 0.0 when ``raw`` is empty, the runtime crashed, or no dimension
        applies.
    """
    if not raw:
        return 0.0

    static_checks = test.static_checks or {}
    runtime_checks = test.runtime_checks or {}
    applicable = {
        "static": bool(
            static_checks.get("required_patterns")
            or static_checks.get("forbidden_patterns")
        ),
        "runtime": bool(runtime_checks.get("assertions")),
    }

    # A hard crash overrides everything else, but only when the test expected
    # the code to run in the first place.
    if applicable["runtime"] and BenchmarkRunner._runtime_crashed(raw):
        return 0.0

    scores: List[float] = []
    for dimension, is_applicable in applicable.items():
        if not is_applicable:
            continue
        result = raw.get(dimension)
        score = result.get("score") if isinstance(result, dict) else None
        # An applicable dimension with a missing or non-numeric score counts
        # as 0.0 instead of being dropped from the average.
        scores.append(float(score) if isinstance(score, (int, float)) else 0.0)

    if not scores:
        return 0.0
    return round(mean(scores), 4)

@staticmethod
def _runtime_crashed(raw: Dict[str, Any]) -> bool:
    """Detect a hard execution failure in the runtime result.

    Only call this when the test defines runtime assertions. A crash shows
    up two ways: the assertion loop never accumulated weight (execution
    threw before any assertion ran), or the runtime appended a synthetic
    ``execution_error``/``fatal_error`` entry to the assertions.

    Args:
        raw: Raw result dict from the WordPress runtime.

    Returns:
        True if the code failed to run, as opposed to running but failing
        some assertions. A missing or malformed ``runtime`` or ``details``
        section is also reported as a crash.
    """
    runtime = raw.get("runtime")
    if not isinstance(runtime, dict):
        return True

    details = runtime.get("details")
    if not isinstance(details, dict):
        # Missing, empty-list or otherwise malformed details carry no weight
        # information, so treat them like a run that never reached an
        # assertion instead of raising AttributeError on ``.get``.
        return True
    if not details.get("total_weight"):
        return True

    crash_markers = {"execution_error", "fatal_error"}
    assertions = details.get("assertions") or []
    return any(
        isinstance(entry, dict) and entry.get("type") in crash_markers
        for entry in assertions
    )

def _write_outputs(self, payload: Dict[str, Any]) -> None:
"""Write benchmark results to JSON and JSONL files.
Expand Down Expand Up @@ -579,7 +632,7 @@ def process_test(test: ExecutionTest) -> Dict[str, Any]:
"runtime_checks": test.runtime_checks,
}
env_result = self.environment.execute_code(code, verification_spec)
correctness = BenchmarkRunner._score_assertions(env_result.raw)
correctness = BenchmarkRunner._score_correctness(env_result.raw, test)
return {
"test_id": test.id,
"type": "execution",
Expand Down