WordPress · Jameswlepage · Apr 16, 2026 · Apr 16, 2026 · Apr 16, 2026
diff --git a/README.md b/README.md
@@ -6,7 +6,7 @@ The official WordPress AI benchmark. Evaluate how well language models understan
 
 WP-Bench measures AI model capabilities across two dimensions:
 
-- **Knowledge** — Multiple-choice questions testing WordPress concepts, APIs, and best practices
+- **Knowledge** — Multiple-choice and short-answer questions testing WordPress concepts, APIs, and best practices
 - **Execution** — Code generation tasks graded by a real WordPress runtime for correctness and quality
 
 The benchmark uses WordPress itself as the grader, running generated code in a sandboxed environment with static analysis and runtime assertions.
@@ -114,7 +114,7 @@ wp-bench dry-run --config wp-bench.yaml      # validate config without calling m
 Test suites live in `datasets/suites/<suite-name>/` with two directories per suite:
 
 - `execution/` — Code generation tasks with assertions (one JSON file per category)
-- `knowledge/` — Multiple-choice knowledge questions (one JSON file per category)
+- `knowledge/` — Multiple-choice and short-answer knowledge questions (one JSON file per category)
 
 The default suite `wp-core-v1` covers WordPress core APIs, hooks, database operations, and security patterns.
 

diff --git a/datasets/README.md b/datasets/README.md
@@ -75,5 +75,7 @@ dataset:
 |-------|------|-------------|
 | `id` | string | Unique test ID |
 | `prompt` | string | Question text |
-| `choices` | array | Multiple choice options `[{key, text}]` |
-| `correct_answer` | string | Correct choice key (e.g., "B") |
+| `type` | string | Question format, such as `multiple_choice` or `short_answer` |
+| `choices` | array | Multiple-choice options `[{key, text}]` (optional; not used by short-answer questions) |
+| `correct_answer` | string | Correct choice key or canonical short answer |
+| `answer_type` | string | Optional short-answer scoring mode such as `exact` or `contains` |
diff --git a/datasets/export_dataset.py b/datasets/export_dataset.py
@@ -34,11 +34,13 @@ def load_suite(suite_name: str) -> list[dict]:
                     "id": t["id"],
                     "suite": suite_name,
                     "test_kind": "execution",
+                    "type": "execution",
                     "prompt": t["prompt"],
                     "category": t.get("category", "general"),
                     "difficulty": t.get("difficulty", "unknown"),
                     "choices": orjson.dumps(t.get("choices", [])).decode(),
                     "correct_answer": "",
+                    "answer_type": "",
                     "requirements": orjson.dumps(t.get("requirements", [])).decode(),
                     "static_checks": orjson.dumps(t.get("static_checks", {})).decode(),
                     "runtime_checks": orjson.dumps(t.get("runtime_checks", {})).decode(),
@@ -56,11 +58,13 @@ def load_suite(suite_name: str) -> list[dict]:
                     "id": t["id"],
                     "suite": suite_name,
                     "test_kind": "knowledge",
+                    "type": t.get("type", "knowledge"),
                     "prompt": t["prompt"],
                     "category": t.get("category", "general"),
                     "difficulty": t.get("difficulty", "unknown"),
                     "choices": orjson.dumps(t.get("choices", [])).decode(),
                     "correct_answer": t.get("correct_answer", ""),
+                    "answer_type": t.get("answer_type", ""),
                     "requirements": "[]",
                     "static_checks": "{}",
                     "runtime_checks": "{}",

diff --git a/python/tests/test_knowledge.py b/python/tests/test_knowledge.py
@@ -0,0 +1,128 @@
+from __future__ import annotations
+
+from pathlib import Path
+
+from wp_bench.config import DatasetConfig
+from wp_bench.datasets import KnowledgeTest, _parse_knowledge_suite
+from wp_bench.knowledge import render_knowledge_prompt, score_knowledge_answer
+import wp_bench.datasets as datasets_module
+
+
+def _multiple_choice_test() -> KnowledgeTest:
+    """Build the multiple-choice fixture used by the prompt and scoring tests."""
+    return KnowledgeTest(
+        id="k-rest-002",
+        suite="wp-core-v1",
+        prompt="Which function is used to register a custom REST API route in WordPress?",
+        test_type="multiple_choice",
+        category="rest-api",
+        difficulty="intermediate",
+        choices=[
+            {"key": "A", "text": "add_rest_route()"},
+            {"key": "B", "text": "register_rest_route()"},
+        ],
+        correct_answer="B",
+    )
+
+
+def _short_answer_exact_test() -> KnowledgeTest:
+    """Build the exact-match short-answer fixture used by the prompt and scoring tests."""
+    return KnowledgeTest(
+        id="k-rest-001",
+        suite="wp-core-v1",
+        prompt="What is the default namespace prefix for WordPress core REST API endpoints?",
+        test_type="short_answer",
+        category="rest-api",
+        difficulty="intermediate",
+        correct_answer="wp/v2",
+        answer_type="exact",
+    )
+
+
+def test_render_knowledge_prompt_uses_choice_instructions_for_multiple_choice() -> None:
+    """Multiple-choice prompts list the choices and ask for a single letter."""
+    prompt = render_knowledge_prompt(_multiple_choice_test())
+
+    assert "Choices:" in prompt
+    assert "B. register_rest_route()" in prompt
+    assert "Answer with only the letter of the correct choice." in prompt
+
+
+def test_render_knowledge_prompt_uses_short_answer_instructions_without_choices() -> None:
+    """Short-answer prompts omit the choice list and the letter-only instruction."""
+    prompt = render_knowledge_prompt(_short_answer_exact_test())
+
+    assert "Choices:" not in prompt
+    assert "Answer with only the letter" not in prompt
+    assert "Answer briefly with the correct WordPress function, API, hook, or value." in prompt
+
+
+def test_score_multiple_choice_accepts_letter_or_choice_text() -> None:
+    """The correct letter or the correct choice text scores 1.0; a wrong letter scores 0.0."""
+    question = _multiple_choice_test()
+
+    assert score_knowledge_answer(question, "B") == 1.0
+    assert score_knowledge_answer(question, "The answer is B.") == 1.0
+    assert score_knowledge_answer(question, "register_rest_route()") == 1.0
+    assert score_knowledge_answer(question, "A. register_rest_route()") == 0.0
+
+
+def test_score_short_answer_exact_accepts_expected_value_in_prose() -> None:
+    """An `exact` short answer scores 1.0 both bare and embedded in a sentence."""
+    question = _short_answer_exact_test()
+
+    assert score_knowledge_answer(question, "wp/v2") == 1.0
+    assert score_knowledge_answer(question, "The answer is wp/v2.") == 1.0
+
+
+def test_score_short_answer_contains_accepts_function_name_with_parens() -> None:
+    """A `contains` short answer matches the function name even when followed by `()`."""
+    question = KnowledgeTest(
+        id="k-security-001",
+        suite="wp-core-v1",
+        prompt="Which function should be used to escape HTML output in WordPress?",
+        test_type="short_answer",
+        category="security",
+        difficulty="basic",
+        correct_answer="esc_html",
+        answer_type="contains",
+    )
+
+    assert score_knowledge_answer(question, "Use esc_html() for this.") == 1.0
+
+
+def test_local_parser_preserves_short_answer_metadata() -> None:
+    """The first entry parsed from the bundled REST API suite keeps its short-answer fields."""
+    suite_dir = Path(__file__).resolve().parents[2] / "datasets" / "suites" / "wp-core-v1"
+    suite_path = suite_dir / "knowledge" / "rest-api.json"
+
+    first = _parse_knowledge_suite(suite_path)[0]
+
+    assert first.test_type == "short_answer"
+    assert first.answer_type == "exact"
+    assert first.choices is None
+
+
+def test_huggingface_loader_preserves_short_answer_metadata(monkeypatch) -> None:
+    """Hugging Face rows keep short-answer fields, and an empty `choices` value loads as None."""
+    hf_row = {
+        "id": "k-rest-001",
+        "suite": "wp-core-v1",
+        "test_kind": "knowledge",
+        "type": "short_answer",
+        "prompt": "What is the default namespace prefix for WordPress core REST API endpoints?",
+        "category": "rest-api",
+        "difficulty": "intermediate",
+        "choices": "[]",
+        "correct_answer": "wp/v2",
+        "answer_type": "exact",
+    }
+
+    monkeypatch.setattr(datasets_module, "hf_load_dataset", lambda *args, **kwargs: [hf_row])
+
+    config = DatasetConfig(source="huggingface", name="WordPress/wp-bench-v1")
+    loaded_test = datasets_module.load_tests(config)["knowledge"][0]
+
+    assert loaded_test.test_type == "short_answer"
+    assert loaded_test.answer_type == "exact"
+    assert loaded_test.choices is None
diff --git a/python/tests/test_models.py b/python/tests/test_models.py
@@ -0,0 +1,63 @@
+from __future__ import annotations
+
+from types import SimpleNamespace
+
+from litellm.exceptions import BadRequestError
+
+from wp_bench.config import ModelConfig
+from wp_bench.models import ModelInterface
+import wp_bench.models as models_module
+
+
+def test_generate_retries_without_temperature_on_deprecated_error(monkeypatch) -> None:
+    """A `temperature`-deprecated rejection triggers one retry without that parameter."""
+    calls: list[dict] = []
+
+    def fake_completion(**kwargs):
+        # Reject only the first call so the retry path can be observed.
+        calls.append(kwargs)
+        if len(calls) == 1:
+            raise BadRequestError(
+                message="AnthropicException - `temperature` is deprecated for this model.",
+                model="anthropic/claude-opus-4-7",
+                llm_provider="anthropic",
+            )
+        return SimpleNamespace(choices=[SimpleNamespace(message={"content": "ok"})])
+
+    monkeypatch.setattr(models_module, "completion", fake_completion)
+
+    model = ModelInterface(ModelConfig(name="anthropic/claude-opus-4-7"))
+    result = model.generate("hello")
+
+    assert result == "ok"
+    assert len(calls) == 2
+    assert calls[0]["temperature"] == 0.0
+    assert "temperature" not in calls[1]
+
+
+def test_generate_does_not_retry_other_bad_request_errors(monkeypatch) -> None:
+    """Any other bad-request error propagates after a single completion call."""
+    calls: list[dict] = []
+
+    def fake_completion(**kwargs):
+        calls.append(kwargs)
+        raise BadRequestError(
+            message="AnthropicException - some other invalid request.",
+            model="anthropic/claude-opus-4-7",
+            llm_provider="anthropic",
+        )
+
+    monkeypatch.setattr(models_module, "completion", fake_completion)
+
+    model = ModelInterface(ModelConfig(name="anthropic/claude-opus-4-7"))
+
+    try:
+        model.generate("hello")
+    except BadRequestError as error:
+        assert "some other invalid request" in str(error)
+    else:
+        raise AssertionError("Expected BadRequestError to be raised")
+
+    # The test name promises "no retry"; without this check a retry that
+    # re-raised the same error would go unnoticed.
+    assert len(calls) == 1
diff --git a/python/wp_bench/config.py b/python/wp_bench/config.py
@@ -2,7 +2,7 @@
 from __future__ import annotations
 
 from pathlib import Path
-from typing import List, Literal, Optional, Union
+from typing import List, Literal, Optional
 
 from pydantic import BaseModel, Field, HttpUrl, validator
 

diff --git a/python/wp_bench/core.py b/python/wp_bench/core.py
@@ -13,6 +13,7 @@
 from .config import HarnessConfig, ModelConfig
 from .datasets import ExecutionTest, KnowledgeTest, load_tests
 from .environment import WordPressEnvironment
+from .knowledge import render_knowledge_prompt, score_knowledge_answer
 from .models import ModelInterface
 from .output import (
     create_progress,
@@ -106,10 +107,10 @@ def run(self) -> Dict[str, Any]:
         return payload
 
     def _run_knowledge_tests(self, tests: List[KnowledgeTest]) -> None:
-        """Run multiple-choice knowledge tests in parallel.
+        """Run knowledge tests in parallel.
 
         Prompts the model with WordPress knowledge questions and scores responses
-        based on whether they match the expected answer letter.
+        against either multiple-choice or short-answer expectations.
 
         Args:
             tests: List of knowledge test definitions.
@@ -125,7 +126,7 @@ def process_test(test: KnowledgeTest) -> Dict[str, Any]:
             try:
                 prompt = self._render_knowledge_prompt(test)
                 answer = strip_code_fences(self.model.generate(prompt)).strip()
-                correct = 1.0 if (test.correct_answer and answer.upper().startswith(test.correct_answer)) else 0.0
+                correct = score_knowledge_answer(test, answer)
                 return {
                     "test_id": test.id,
                     "type": "knowledge",
@@ -215,21 +216,15 @@ def process_test(test: ExecutionTest) -> Dict[str, Any]:
 
     @staticmethod
     def _render_knowledge_prompt(test: KnowledgeTest) -> str:
-        """Format a knowledge test into a multiple-choice prompt string.
+        """Format a knowledge test into a prompt string.
 
         Args:
-            test: Knowledge test with question and choices.
+            test: Knowledge test with a question and, for multiple choice, its choices.
 
         Returns:
-            Formatted prompt asking for a single letter answer.
+            Formatted prompt with instructions that match the answer mode.
         """
-        prompt = [test.prompt]
-        if test.choices:
-            prompt.append("Choices:")
-            for choice in test.choices:
-                prompt.append(f"{choice['key']}. {choice['text']}")
-        prompt.append("Answer with only the letter of the correct choice.")
-        return "\n".join(prompt)
+        return render_knowledge_prompt(test)
 
     @staticmethod
     def _render_execution_prompt(test: ExecutionTest) -> str:
@@ -425,7 +420,7 @@ def process_test(test: KnowledgeTest) -> Dict[str, Any]:
             try:
                 prompt = BenchmarkRunner._render_knowledge_prompt(test)
                 answer = strip_code_fences(self.model.generate(prompt)).strip()
-                correct = 1.0 if (test.correct_answer and answer.upper().startswith(test.correct_answer)) else 0.0
+                correct = score_knowledge_answer(test, answer)
                 return {
                     "test_id": test.id,
                     "type": "knowledge",

diff --git a/python/wp_bench/datasets.py b/python/wp_bench/datasets.py
@@ -40,6 +40,7 @@ class KnowledgeTest:
     difficulty: str
     choices: Optional[List[Dict[str, Any]]] = None
     correct_answer: Optional[str] = None
+    answer_type: Optional[str] = None
     answer: Optional[str] = None
     metadata: Optional[Dict[str, Any]] = None
 
@@ -88,16 +89,18 @@ def _load_from_huggingface(config: DatasetConfig) -> Dict[str, List[Any]]:
                 )
             )
         else:
+            choice_list = choices if isinstance(choices, list) and choices else None
             knowledge.append(
                 KnowledgeTest(
                     id=row["id"],
                     suite=row.get("suite", config.name),
                     prompt=row["prompt"],
-                    test_type="knowledge",
+                    test_type=row.get("type", "knowledge"),
                     category=row.get("category", "general"),
                     difficulty=row.get("difficulty", "unknown"),
-                    choices=choices if isinstance(choices, list) else None,
+                    choices=choice_list,
                     correct_answer=row.get("correct_answer"),
+                    answer_type=row.get("answer_type"),
                     metadata={},
                 )
             )
@@ -179,6 +182,7 @@ def _parse_knowledge_suite(path: Path) -> List[KnowledgeTest]:
                 difficulty=test.get("difficulty", "unknown"),
                 choices=test.get("choices"),
                 correct_answer=test.get("correct_answer"),
+                answer_type=test.get("answer_type"),
                 metadata={"suite_metadata": metadata},
             )
         )