Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ The official WordPress AI benchmark. Evaluate how well language models understan

WP-Bench measures AI model capabilities across two dimensions:

- **Knowledge** — Multiple-choice questions testing WordPress concepts, APIs, and best practices
- **Knowledge** — Multiple-choice and short-answer questions testing WordPress concepts, APIs, and best practices
- **Execution** — Code generation tasks graded by a real WordPress runtime for correctness and quality

The benchmark uses WordPress itself as the grader, running generated code in a sandboxed environment with static analysis and runtime assertions.
Expand Down Expand Up @@ -114,7 +114,7 @@ wp-bench dry-run --config wp-bench.yaml # validate config without calling m
Test suites live in `datasets/suites/<suite-name>/` with two directories per suite:

- `execution/` — Code generation tasks with assertions (one JSON file per category)
- `knowledge/` — Multiple-choice knowledge questions (one JSON file per category)
- `knowledge/` — Multiple-choice and short-answer knowledge questions (one JSON file per category)

The default suite `wp-core-v1` covers WordPress core APIs, hooks, database operations, and security patterns.

Expand Down
6 changes: 4 additions & 2 deletions datasets/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -75,5 +75,7 @@ dataset:
|-------|------|-------------|
| `id` | string | Unique test ID |
| `prompt` | string | Question text |
| `choices` | array | Multiple choice options `[{key, text}]` |
| `correct_answer` | string | Correct choice key (e.g., "B") |
| `type` | string | Knowledge mode such as `multiple_choice` or `short_answer` |
| `choices` | array | Optional multiple choice options `[{key, text}]` |
| `correct_answer` | string | Correct choice key or canonical short answer |
| `answer_type` | string | Optional short-answer scoring mode such as `exact` or `contains` |
4 changes: 4 additions & 0 deletions datasets/export_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,11 +34,13 @@ def load_suite(suite_name: str) -> list[dict]:
"id": t["id"],
"suite": suite_name,
"test_kind": "execution",
"type": "execution",
"prompt": t["prompt"],
"category": t.get("category", "general"),
"difficulty": t.get("difficulty", "unknown"),
"choices": orjson.dumps(t.get("choices", [])).decode(),
"correct_answer": "",
"answer_type": "",
"requirements": orjson.dumps(t.get("requirements", [])).decode(),
"static_checks": orjson.dumps(t.get("static_checks", {})).decode(),
"runtime_checks": orjson.dumps(t.get("runtime_checks", {})).decode(),
Expand All @@ -56,11 +58,13 @@ def load_suite(suite_name: str) -> list[dict]:
"id": t["id"],
"suite": suite_name,
"test_kind": "knowledge",
"type": t.get("type", "knowledge"),
"prompt": t["prompt"],
"category": t.get("category", "general"),
"difficulty": t.get("difficulty", "unknown"),
"choices": orjson.dumps(t.get("choices", [])).decode(),
"correct_answer": t.get("correct_answer", ""),
"answer_type": t.get("answer_type", ""),
"requirements": "[]",
"static_checks": "{}",
"runtime_checks": "{}",
Expand Down
146 changes: 146 additions & 0 deletions python/tests/test_knowledge.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
from __future__ import annotations

from pathlib import Path

from wp_bench.config import DatasetConfig
from wp_bench.datasets import KnowledgeTest, _parse_knowledge_suite
from wp_bench.knowledge import render_knowledge_prompt, score_knowledge_answer
import wp_bench.datasets as datasets_module


def test_render_knowledge_prompt_uses_choice_instructions_for_multiple_choice() -> None:
    """A multiple-choice prompt lists its choices and asks for a letter answer."""
    options = [
        {"key": "A", "text": "add_rest_route()"},
        {"key": "B", "text": "register_rest_route()"},
    ]
    question = KnowledgeTest(
        id="k-rest-002",
        suite="wp-core-v1",
        prompt="Which function is used to register a custom REST API route in WordPress?",
        test_type="multiple_choice",
        category="rest-api",
        difficulty="intermediate",
        choices=options,
        correct_answer="B",
    )

    rendered = render_knowledge_prompt(question)

    # Header, a formatted choice line, and the letter-only instruction.
    expected_fragments = (
        "Choices:",
        "B. register_rest_route()",
        "Answer with only the letter of the correct choice.",
    )
    for fragment in expected_fragments:
        assert fragment in rendered


def test_render_knowledge_prompt_uses_short_answer_instructions_without_choices() -> None:
    """A short-answer prompt has no choice list and uses the brief-answer instruction."""
    question = KnowledgeTest(
        id="k-rest-001",
        suite="wp-core-v1",
        prompt="What is the default namespace prefix for WordPress core REST API endpoints?",
        test_type="short_answer",
        category="rest-api",
        difficulty="intermediate",
        correct_answer="wp/v2",
        answer_type="exact",
    )

    rendered = render_knowledge_prompt(question)

    # None of the multiple-choice scaffolding may leak into the prompt.
    for forbidden in ("Choices:", "Answer with only the letter"):
        assert forbidden not in rendered
    assert (
        "Answer briefly with the correct WordPress function, API, hook, or value."
        in rendered
    )


def test_score_multiple_choice_accepts_letter_or_choice_text() -> None:
    """Multiple-choice scoring accepts the key or the choice text, not a wrong key."""
    question = KnowledgeTest(
        id="k-rest-002",
        suite="wp-core-v1",
        prompt="Which function is used to register a custom REST API route in WordPress?",
        test_type="multiple_choice",
        category="rest-api",
        difficulty="intermediate",
        choices=[
            {"key": "A", "text": "add_rest_route()"},
            {"key": "B", "text": "register_rest_route()"},
        ],
        correct_answer="B",
    )

    # (model reply, expected score); the last reply pairs the wrong letter
    # with the right text and must score zero.
    expectations = [
        ("B", 1.0),
        ("The answer is B.", 1.0),
        ("register_rest_route()", 1.0),
        ("A. register_rest_route()", 0.0),
    ]
    for reply, expected_score in expectations:
        assert score_knowledge_answer(question, reply) == expected_score


def test_score_short_answer_exact_accepts_expected_value_in_prose() -> None:
    """An ``exact`` short answer scores 1.0 both bare and embedded in a sentence."""
    question = KnowledgeTest(
        id="k-rest-001",
        suite="wp-core-v1",
        prompt="What is the default namespace prefix for WordPress core REST API endpoints?",
        test_type="short_answer",
        category="rest-api",
        difficulty="intermediate",
        correct_answer="wp/v2",
        answer_type="exact",
    )

    for reply in ("wp/v2", "The answer is wp/v2."):
        assert score_knowledge_answer(question, reply) == 1.0


def test_score_short_answer_contains_accepts_function_name_with_parens() -> None:
    """A ``contains`` short answer matches when the reply adds call parentheses."""
    question = KnowledgeTest(
        id="k-security-001",
        suite="wp-core-v1",
        prompt="Which function should be used to escape HTML output in WordPress?",
        test_type="short_answer",
        category="security",
        difficulty="basic",
        correct_answer="esc_html",
        answer_type="contains",
    )

    score = score_knowledge_answer(question, "Use esc_html() for this.")

    assert score == 1.0


def test_local_parser_preserves_short_answer_metadata() -> None:
    """The first entry parsed from the rest-api suite file keeps its short-answer fields."""
    # Two directory levels above this test file's folder, then into the suite tree.
    base_dir = Path(__file__).resolve().parents[2]
    suite_path = base_dir.joinpath(
        "datasets", "suites", "wp-core-v1", "knowledge", "rest-api.json"
    )

    parsed = _parse_knowledge_suite(suite_path)
    first = parsed[0]

    assert first.test_type == "short_answer"
    assert first.answer_type == "exact"
    assert first.choices is None


def test_huggingface_loader_preserves_short_answer_metadata(monkeypatch) -> None:
    """A short-answer row from the Hugging Face source keeps its type and answer mode."""
    rows = [
        {
            "id": "k-rest-001",
            "suite": "wp-core-v1",
            "test_kind": "knowledge",
            "type": "short_answer",
            "prompt": "What is the default namespace prefix for WordPress core REST API endpoints?",
            "category": "rest-api",
            "difficulty": "intermediate",
            "choices": "[]",
            "correct_answer": "wp/v2",
            "answer_type": "exact",
        }
    ]

    def fake_hf_load_dataset(*args, **kwargs):
        # Ignore whatever the loader asks for and hand back the canned rows.
        return rows

    monkeypatch.setattr(datasets_module, "hf_load_dataset", fake_hf_load_dataset)

    config = DatasetConfig(source="huggingface", name="WordPress/wp-bench-v1")
    loaded = datasets_module.load_tests(config)
    first = loaded["knowledge"][0]

    assert first.test_type == "short_answer"
    assert first.answer_type == "exact"
    # The row's serialized empty choice list must surface as None.
    assert first.choices is None
53 changes: 53 additions & 0 deletions python/tests/test_models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
from __future__ import annotations

from types import SimpleNamespace

from litellm.exceptions import BadRequestError

from wp_bench.config import ModelConfig
from wp_bench.models import ModelInterface
import wp_bench.models as models_module


def test_generate_retries_without_temperature_on_deprecated_error(monkeypatch) -> None:
    """``generate`` retries once without ``temperature`` after a deprecation error."""
    recorded: list[dict] = []

    def fake_completion(**kwargs):
        recorded.append(kwargs)
        if len(recorded) > 1:
            # Every call after the first succeeds.
            return SimpleNamespace(choices=[SimpleNamespace(message={"content": "ok"})])
        raise BadRequestError(
            message="AnthropicException - `temperature` is deprecated for this model.",
            model="anthropic/claude-opus-4-7",
            llm_provider="anthropic",
        )

    monkeypatch.setattr(models_module, "completion", fake_completion)

    config = ModelConfig(name="anthropic/claude-opus-4-7")
    output = ModelInterface(config).generate("hello")

    assert output == "ok"
    assert len(recorded) == 2
    first_call, second_call = recorded
    assert first_call["temperature"] == 0.0
    assert "temperature" not in second_call


def test_generate_does_not_retry_other_bad_request_errors(monkeypatch) -> None:
    """An unrelated ``BadRequestError`` reaches the caller after a single attempt.

    The fake ``completion`` raises on every call, so checking only that the
    error surfaces would also pass if ``generate`` retried before giving up.
    Recording the calls and asserting exactly one was made checks the
    "does not retry" behaviour named in the test.
    """
    calls: list[dict] = []

    def fake_completion(**kwargs):
        calls.append(kwargs)
        raise BadRequestError(
            message="AnthropicException - some other invalid request.",
            model="anthropic/claude-opus-4-7",
            llm_provider="anthropic",
        )

    monkeypatch.setattr(models_module, "completion", fake_completion)

    model = ModelInterface(ModelConfig(name="anthropic/claude-opus-4-7"))

    try:
        model.generate("hello")
    except BadRequestError as error:
        assert "some other invalid request" in str(error)
    else:
        raise AssertionError("Expected BadRequestError to be raised")

    # No second attempt may be made for errors other than the temperature one.
    assert len(calls) == 1
2 changes: 1 addition & 1 deletion python/wp_bench/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from __future__ import annotations

from pathlib import Path
from typing import List, Literal, Optional, Union
from typing import List, Literal, Optional

from pydantic import BaseModel, Field, HttpUrl, validator

Expand Down
21 changes: 8 additions & 13 deletions python/wp_bench/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from .config import HarnessConfig, ModelConfig
from .datasets import ExecutionTest, KnowledgeTest, load_tests
from .environment import WordPressEnvironment
from .knowledge import render_knowledge_prompt, score_knowledge_answer
from .models import ModelInterface
from .output import (
create_progress,
Expand Down Expand Up @@ -106,10 +107,10 @@ def run(self) -> Dict[str, Any]:
return payload

def _run_knowledge_tests(self, tests: List[KnowledgeTest]) -> None:
"""Run multiple-choice knowledge tests in parallel.
"""Run knowledge tests in parallel.

Prompts the model with WordPress knowledge questions and scores responses
based on whether they match the expected answer letter.
against either multiple-choice or short-answer expectations.

Args:
tests: List of knowledge test definitions.
Expand All @@ -125,7 +126,7 @@ def process_test(test: KnowledgeTest) -> Dict[str, Any]:
try:
prompt = self._render_knowledge_prompt(test)
answer = strip_code_fences(self.model.generate(prompt)).strip()
correct = 1.0 if (test.correct_answer and answer.upper().startswith(test.correct_answer)) else 0.0
correct = score_knowledge_answer(test, answer)
return {
"test_id": test.id,
"type": "knowledge",
Expand Down Expand Up @@ -215,21 +216,15 @@ def process_test(test: ExecutionTest) -> Dict[str, Any]:

@staticmethod
def _render_knowledge_prompt(test: KnowledgeTest) -> str:
"""Format a knowledge test into a multiple-choice prompt string.
"""Format a knowledge test into a prompt string.

Args:
test: Knowledge test with question and choices.

Returns:
Formatted prompt asking for a single letter answer.
Formatted prompt with instructions that match the answer mode.
"""
prompt = [test.prompt]
if test.choices:
prompt.append("Choices:")
for choice in test.choices:
prompt.append(f"{choice['key']}. {choice['text']}")
prompt.append("Answer with only the letter of the correct choice.")
return "\n".join(prompt)
return render_knowledge_prompt(test)

@staticmethod
def _render_execution_prompt(test: ExecutionTest) -> str:
Expand Down Expand Up @@ -425,7 +420,7 @@ def process_test(test: KnowledgeTest) -> Dict[str, Any]:
try:
prompt = BenchmarkRunner._render_knowledge_prompt(test)
answer = strip_code_fences(self.model.generate(prompt)).strip()
correct = 1.0 if (test.correct_answer and answer.upper().startswith(test.correct_answer)) else 0.0
correct = score_knowledge_answer(test, answer)
return {
"test_id": test.id,
"type": "knowledge",
Expand Down
8 changes: 6 additions & 2 deletions python/wp_bench/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ class KnowledgeTest:
difficulty: str
choices: Optional[List[Dict[str, Any]]] = None
correct_answer: Optional[str] = None
answer_type: Optional[str] = None
answer: Optional[str] = None
metadata: Optional[Dict[str, Any]] = None

Expand Down Expand Up @@ -88,16 +89,18 @@ def _load_from_huggingface(config: DatasetConfig) -> Dict[str, List[Any]]:
)
)
else:
choice_list = choices if isinstance(choices, list) and choices else None
knowledge.append(
KnowledgeTest(
id=row["id"],
suite=row.get("suite", config.name),
prompt=row["prompt"],
test_type="knowledge",
test_type=row.get("type", "knowledge"),
category=row.get("category", "general"),
difficulty=row.get("difficulty", "unknown"),
choices=choices if isinstance(choices, list) else None,
choices=choice_list,
correct_answer=row.get("correct_answer"),
answer_type=row.get("answer_type"),
metadata={},
)
)
Expand Down Expand Up @@ -179,6 +182,7 @@ def _parse_knowledge_suite(path: Path) -> List[KnowledgeTest]:
difficulty=test.get("difficulty", "unknown"),
choices=test.get("choices"),
correct_answer=test.get("correct_answer"),
answer_type=test.get("answer_type"),
metadata={"suite_metadata": metadata},
)
)
Expand Down
Loading