diff --git a/README.md b/README.md index 382594a..cc856fa 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ The official WordPress AI benchmark. Evaluate how well language models understan WP-Bench measures AI model capabilities across two dimensions: -- **Knowledge** — Multiple-choice questions testing WordPress concepts, APIs, and best practices +- **Knowledge** — Multiple-choice and short-answer questions testing WordPress concepts, APIs, and best practices - **Execution** — Code generation tasks graded by a real WordPress runtime for correctness and quality The benchmark uses WordPress itself as the grader, running generated code in a sandboxed environment with static analysis and runtime assertions. @@ -114,7 +114,7 @@ wp-bench dry-run --config wp-bench.yaml # validate config without calling m Test suites live in `datasets/suites//` with two directories per suite: - `execution/` — Code generation tasks with assertions (one JSON file per category) -- `knowledge/` — Multiple-choice knowledge questions (one JSON file per category) +- `knowledge/` — Multiple-choice and short-answer knowledge questions (one JSON file per category) The default suite `wp-core-v1` covers WordPress core APIs, hooks, database operations, and security patterns. diff --git a/datasets/README.md b/datasets/README.md index 334041d..0758880 100644 --- a/datasets/README.md +++ b/datasets/README.md @@ -75,5 +75,7 @@ dataset: |-------|------|-------------| | `id` | string | Unique test ID | | `prompt` | string | Question text | -| `choices` | array | Multiple choice options `[{key, text}]` | -| `correct_answer` | string | Correct choice key (e.g., "B") | +| `type` | string | Knowledge mode such as `multiple_choice` or `short_answer` | +| `choices` | array | Optional multiple choice options `[{key, text}]` | +| `correct_answer` | string | Correct choice key or canonical short answer | +| `answer_type` | string | Optional short-answer scoring mode such as `exact` or `contains` | diff --git a/datasets/export_dataset.py b/datasets/export_dataset.py index 2cce090..3f53d8b 100644 --- a/datasets/export_dataset.py +++ b/datasets/export_dataset.py @@ -34,11 +34,13 @@ def load_suite(suite_name: str) -> list[dict]: "id": t["id"], "suite": suite_name, "test_kind": "execution", + "type": "execution", "prompt": t["prompt"], "category": t.get("category", "general"), "difficulty": t.get("difficulty", "unknown"), "choices": orjson.dumps(t.get("choices", [])).decode(), "correct_answer": "", + "answer_type": "", "requirements": orjson.dumps(t.get("requirements", [])).decode(), "static_checks": orjson.dumps(t.get("static_checks", {})).decode(), "runtime_checks": orjson.dumps(t.get("runtime_checks", {})).decode(), @@ -56,11 +58,13 @@ def load_suite(suite_name: str) -> list[dict]: "id": t["id"], "suite": suite_name, "test_kind": "knowledge", + "type": t.get("type", "knowledge"), "prompt": t["prompt"], "category": t.get("category", "general"), "difficulty": t.get("difficulty", "unknown"), "choices": orjson.dumps(t.get("choices", [])).decode(), "correct_answer": t.get("correct_answer", ""), + "answer_type": t.get("answer_type", ""), "requirements": "[]", "static_checks": "{}", "runtime_checks": "{}", diff --git a/python/tests/test_knowledge.py b/python/tests/test_knowledge.py new file mode 100644 index 0000000..382ee15 --- /dev/null +++ b/python/tests/test_knowledge.py @@ -0,0 +1,146 @@ +from __future__ import annotations + +from pathlib import Path + +from wp_bench.config import DatasetConfig +from wp_bench.datasets import KnowledgeTest, _parse_knowledge_suite +from wp_bench.knowledge import render_knowledge_prompt, score_knowledge_answer +import wp_bench.datasets as datasets_module + + +def test_render_knowledge_prompt_uses_choice_instructions_for_multiple_choice() -> None: + test = KnowledgeTest( + id="k-rest-002", + suite="wp-core-v1", + prompt="Which function is used to register a custom REST API route in WordPress?", + test_type="multiple_choice", + category="rest-api", + difficulty="intermediate", + choices=[ + {"key": "A", "text": "add_rest_route()"}, + {"key": "B", "text": "register_rest_route()"}, + ], + correct_answer="B", + ) + + prompt = render_knowledge_prompt(test) + + assert "Choices:" in prompt + assert "B. register_rest_route()" in prompt + assert "Answer with only the letter of the correct choice." in prompt + + +def test_render_knowledge_prompt_uses_short_answer_instructions_without_choices() -> None: + test = KnowledgeTest( + id="k-rest-001", + suite="wp-core-v1", + prompt="What is the default namespace prefix for WordPress core REST API endpoints?", + test_type="short_answer", + category="rest-api", + difficulty="intermediate", + correct_answer="wp/v2", + answer_type="exact", + ) + + prompt = render_knowledge_prompt(test) + + assert "Choices:" not in prompt + assert "Answer with only the letter" not in prompt + assert "Answer briefly with the correct WordPress function, API, hook, or value." in prompt + + +def test_score_multiple_choice_accepts_letter_or_choice_text() -> None: + test = KnowledgeTest( + id="k-rest-002", + suite="wp-core-v1", + prompt="Which function is used to register a custom REST API route in WordPress?", + test_type="multiple_choice", + category="rest-api", + difficulty="intermediate", + choices=[ + {"key": "A", "text": "add_rest_route()"}, + {"key": "B", "text": "register_rest_route()"}, + ], + correct_answer="B", + ) + + assert score_knowledge_answer(test, "B") == 1.0 + assert score_knowledge_answer(test, "The answer is B.") == 1.0 + assert score_knowledge_answer(test, "register_rest_route()") == 1.0 + assert score_knowledge_answer(test, "A. register_rest_route()") == 0.0 + + +def test_score_short_answer_exact_accepts_expected_value_in_prose() -> None: + test = KnowledgeTest( + id="k-rest-001", + suite="wp-core-v1", + prompt="What is the default namespace prefix for WordPress core REST API endpoints?", + test_type="short_answer", + category="rest-api", + difficulty="intermediate", + correct_answer="wp/v2", + answer_type="exact", + ) + + assert score_knowledge_answer(test, "wp/v2") == 1.0 + assert score_knowledge_answer(test, "The answer is wp/v2.") == 1.0 + + +def test_score_short_answer_contains_accepts_function_name_with_parens() -> None: + test = KnowledgeTest( + id="k-security-001", + suite="wp-core-v1", + prompt="Which function should be used to escape HTML output in WordPress?", + test_type="short_answer", + category="security", + difficulty="basic", + correct_answer="esc_html", + answer_type="contains", + ) + + assert score_knowledge_answer(test, "Use esc_html() for this.") == 1.0 + + +def test_local_parser_preserves_short_answer_metadata() -> None: + suite_path = ( + Path(__file__).resolve().parents[2] + / "datasets" + / "suites" + / "wp-core-v1" + / "knowledge" + / "rest-api.json" + ) + + test = _parse_knowledge_suite(suite_path)[0] + + assert test.test_type == "short_answer" + assert test.answer_type == "exact" + assert test.choices is None + + +def test_huggingface_loader_preserves_short_answer_metadata(monkeypatch) -> None: + rows = [ + { + "id": "k-rest-001", + "suite": "wp-core-v1", + "test_kind": "knowledge", + "type": "short_answer", + "prompt": "What is the default namespace prefix for WordPress core REST API endpoints?", + "category": "rest-api", + "difficulty": "intermediate", + "choices": "[]", + "correct_answer": "wp/v2", + "answer_type": "exact", + } + ] + + monkeypatch.setattr(datasets_module, "hf_load_dataset", lambda *args, **kwargs: rows) + + loaded = datasets_module.load_tests( + DatasetConfig(source="huggingface", name="WordPress/wp-bench-v1") + ) + test = loaded["knowledge"][0] + + assert test.test_type == "short_answer" + assert test.answer_type == "exact" + assert test.choices is None diff --git a/python/tests/test_models.py b/python/tests/test_models.py new file mode 100644 index 0000000..21c6706 --- /dev/null +++ b/python/tests/test_models.py @@ -0,0 +1,53 @@ +from __future__ import annotations + +from types import SimpleNamespace + +from litellm.exceptions import BadRequestError + +from wp_bench.config import ModelConfig +from wp_bench.models import ModelInterface +import wp_bench.models as models_module + + +def test_generate_retries_without_temperature_on_deprecated_error(monkeypatch) -> None: + calls: list[dict] = [] + + def fake_completion(**kwargs): + calls.append(kwargs) + if len(calls) == 1: + raise BadRequestError( + message="AnthropicException - `temperature` is deprecated for this model.", + model="anthropic/claude-opus-4-7", + llm_provider="anthropic", + ) + return SimpleNamespace(choices=[SimpleNamespace(message={"content": "ok"})]) + + monkeypatch.setattr(models_module, "completion", fake_completion) + + model = ModelInterface(ModelConfig(name="anthropic/claude-opus-4-7")) + result = model.generate("hello") + + assert result == "ok" + assert len(calls) == 2 + assert calls[0]["temperature"] == 0.0 + assert "temperature" not in calls[1] + + +def test_generate_does_not_retry_other_bad_request_errors(monkeypatch) -> None: + def fake_completion(**kwargs): + raise BadRequestError( + message="AnthropicException - some other invalid request.", + model="anthropic/claude-opus-4-7", + llm_provider="anthropic", + ) + + monkeypatch.setattr(models_module, "completion", fake_completion) + + model = ModelInterface(ModelConfig(name="anthropic/claude-opus-4-7")) + + try: + model.generate("hello") + except BadRequestError as error: + assert "some other invalid request" in str(error) + else: + raise AssertionError("Expected BadRequestError to be raised") diff --git a/python/wp_bench/config.py b/python/wp_bench/config.py index 8e63144..8ff6195 100644 --- a/python/wp_bench/config.py +++ b/python/wp_bench/config.py @@ -2,7 +2,7 @@ from __future__ import annotations from pathlib import Path -from typing import List, Literal, Optional, Union +from typing import List, Literal, Optional from pydantic import BaseModel, Field, HttpUrl, validator diff --git a/python/wp_bench/core.py b/python/wp_bench/core.py index ca7d95a..de7a778 100644 --- a/python/wp_bench/core.py +++ b/python/wp_bench/core.py @@ -13,6 +13,7 @@ from .config import HarnessConfig, ModelConfig from .datasets import ExecutionTest, KnowledgeTest, load_tests from .environment import WordPressEnvironment +from .knowledge import render_knowledge_prompt, score_knowledge_answer from .models import ModelInterface from .output import ( create_progress, @@ -106,10 +107,10 @@ def run(self) -> Dict[str, Any]: return payload def _run_knowledge_tests(self, tests: List[KnowledgeTest]) -> None: - """Run multiple-choice knowledge tests in parallel. + """Run knowledge tests in parallel. Prompts the model with WordPress knowledge questions and scores responses - based on whether they match the expected answer letter. + against either multiple-choice or short-answer expectations. Args: tests: List of knowledge test definitions. @@ -125,7 +126,7 @@ def process_test(test: KnowledgeTest) -> Dict[str, Any]: try: prompt = self._render_knowledge_prompt(test) answer = strip_code_fences(self.model.generate(prompt)).strip() - correct = 1.0 if (test.correct_answer and answer.upper().startswith(test.correct_answer)) else 0.0 + correct = score_knowledge_answer(test, answer) return { "test_id": test.id, "type": "knowledge", @@ -215,21 +216,15 @@ def process_test(test: ExecutionTest) -> Dict[str, Any]: @staticmethod def _render_knowledge_prompt(test: KnowledgeTest) -> str: - """Format a knowledge test into a multiple-choice prompt string. + """Format a knowledge test into a prompt string. Args: test: Knowledge test with question and choices. Returns: - Formatted prompt asking for a single letter answer. + Formatted prompt with instructions that match the answer mode. """ - prompt = [test.prompt] - if test.choices: - prompt.append("Choices:") - for choice in test.choices: - prompt.append(f"{choice['key']}. {choice['text']}") - prompt.append("Answer with only the letter of the correct choice.") - return "\n".join(prompt) + return render_knowledge_prompt(test) @staticmethod def _render_execution_prompt(test: ExecutionTest) -> str: @@ -425,7 +420,7 @@ def process_test(test: KnowledgeTest) -> Dict[str, Any]: try: prompt = BenchmarkRunner._render_knowledge_prompt(test) answer = strip_code_fences(self.model.generate(prompt)).strip() - correct = 1.0 if (test.correct_answer and answer.upper().startswith(test.correct_answer)) else 0.0 + correct = score_knowledge_answer(test, answer) return { "test_id": test.id, "type": "knowledge", diff --git a/python/wp_bench/datasets.py b/python/wp_bench/datasets.py index c945677..97ae55e 100644 --- a/python/wp_bench/datasets.py +++ b/python/wp_bench/datasets.py @@ -40,6 +40,7 @@ class KnowledgeTest: difficulty: str choices: Optional[List[Dict[str, Any]]] = None correct_answer: Optional[str] = None + answer_type: Optional[str] = None answer: Optional[str] = None metadata: Optional[Dict[str, Any]] = None @@ -88,16 +89,18 @@ def _load_from_huggingface(config: DatasetConfig) -> Dict[str, List[Any]]: ) ) else: + choice_list = choices if isinstance(choices, list) and choices else None knowledge.append( KnowledgeTest( id=row["id"], suite=row.get("suite", config.name), prompt=row["prompt"], - test_type="knowledge", + test_type=row.get("type", "knowledge"), category=row.get("category", "general"), difficulty=row.get("difficulty", "unknown"), - choices=choices if isinstance(choices, list) else None, + choices=choice_list, correct_answer=row.get("correct_answer"), + answer_type=row.get("answer_type"), metadata={}, ) ) @@ -179,6 +182,7 @@ def _parse_knowledge_suite(path: Path) -> List[KnowledgeTest]: difficulty=test.get("difficulty", "unknown"), choices=test.get("choices"), correct_answer=test.get("correct_answer"), + answer_type=test.get("answer_type"), metadata={"suite_metadata": metadata}, ) ) diff --git a/python/wp_bench/knowledge.py b/python/wp_bench/knowledge.py new file mode 100644 index 0000000..8d03e5a --- /dev/null +++ b/python/wp_bench/knowledge.py @@ -0,0 +1,86 @@ +"""Knowledge prompt rendering and scoring helpers.""" +from __future__ import annotations + +import re + +from .datasets import KnowledgeTest +from .utils import strip_code_fences + +_CHOICE_ANSWER_RE = re.compile( + r"^\s*(?:(?:the\s+)?(?:correct\s+)?(?:answer|option|choice)\s*(?:is|:)\s*)?" + r"[\(\[]?([A-Z])(?:[\)\].:\-,\s]|$)", + re.IGNORECASE, +) + + +def render_knowledge_prompt(test: KnowledgeTest) -> str: + """Format a knowledge test prompt based on its answer mode.""" + prompt = [test.prompt] + if test.choices: + prompt.append("Choices:") + for choice in test.choices: + prompt.append(f"{choice['key']}. {choice['text']}") + prompt.append("Answer with only the letter of the correct choice.") + else: + prompt.append("Answer briefly with the correct WordPress function, API, hook, or value.") + return "\n".join(prompt) + + +def score_knowledge_answer(test: KnowledgeTest, answer: str) -> float: + """Score a model response for a knowledge test.""" + if not test.correct_answer: + return 0.0 + + if test.choices: + correct_key = test.correct_answer.upper() + answer_key = _extract_choice_key(answer) + if answer_key is not None: + return 1.0 if answer_key == correct_key else 0.0 + + correct_choice_text = _lookup_choice_text(test, correct_key) + if not correct_choice_text: + return 0.0 + + answer_normalized = _normalize_knowledge_text(answer) + choice_normalized = _normalize_knowledge_text(correct_choice_text) + return 1.0 if choice_normalized and choice_normalized in answer_normalized else 0.0 + + expected = _normalize_knowledge_text(test.correct_answer) + actual = _normalize_knowledge_text(answer) + if not expected or not actual: + return 0.0 + + if actual == expected: + return 1.0 + + answer_type = (test.answer_type or "exact").lower() + if answer_type in {"contains", "exact"} and expected in actual: + return 1.0 + + return 0.0 + + +def _extract_choice_key(answer: str) -> str | None: + text = strip_code_fences(answer).strip() + if not text: + return None + match = _CHOICE_ANSWER_RE.match(text) + if not match: + return None + return match.group(1).upper() + + +def _lookup_choice_text(test: KnowledgeTest, key: str) -> str | None: + if not test.choices: + return None + for choice in test.choices: + choice_key = str(choice.get("key", "")).upper() + if choice_key == key: + text = choice.get("text") + return text if isinstance(text, str) else None + return None + + +def _normalize_knowledge_text(value: str) -> str: + text = strip_code_fences(value).strip().casefold() + return re.sub(r"\s+", " ", text) diff --git a/python/wp_bench/models.py b/python/wp_bench/models.py index 509086e..9b7a0e7 100644 --- a/python/wp_bench/models.py +++ b/python/wp_bench/models.py @@ -1,7 +1,10 @@ """Model interface leveraging LiteLLM providers.""" from __future__ import annotations +from typing import Any + from litellm import completion, completion_cost +from litellm.exceptions import BadRequestError from litellm.utils import ModelResponse from .config import ModelConfig @@ -22,17 +25,32 @@ def generate(self, prompt: str) -> str: Returns: The model's response text. """ - response: ModelResponse = completion( - model=self.config.name, - messages=[{"role": "user", "content": prompt}], - temperature=self.config.temperature, - max_tokens=self.config.max_tokens, - top_p=self.config.top_p, - timeout=self.config.request_timeout, - ) + kwargs = self._completion_kwargs(prompt) + try: + response: ModelResponse = completion(**kwargs) + except BadRequestError as error: + if not _is_deprecated_temperature_error(error) or "temperature" not in kwargs: + raise + kwargs.pop("temperature") + response = completion(**kwargs) choice = response.choices[0] return choice.message["content"] # type: ignore[index] @staticmethod def estimate_cost(response: ModelResponse) -> float: return completion_cost(response) + + def _completion_kwargs(self, prompt: str) -> dict[str, Any]: + kwargs: dict[str, Any] = { + "model": self.config.name, + "messages": [{"role": "user", "content": prompt}], + "max_tokens": self.config.max_tokens, + "top_p": self.config.top_p, + "timeout": self.config.request_timeout, + } + kwargs["temperature"] = self.config.temperature + return kwargs + + +def _is_deprecated_temperature_error(error: BadRequestError) -> bool: + return "`temperature` is deprecated" in str(error)