diff --git a/README.md b/README.md
index 382594a..cc856fa 100644
--- a/README.md
+++ b/README.md
@@ -6,7 +6,7 @@ The official WordPress AI benchmark. Evaluate how well language models understan
 
 WP-Bench measures AI model capabilities across two dimensions:
 
-- **Knowledge** — Multiple-choice questions testing WordPress concepts, APIs, and best practices
+- **Knowledge** — Multiple-choice and short-answer questions testing WordPress concepts, APIs, and best practices
 - **Execution** — Code generation tasks graded by a real WordPress runtime for correctness and quality
 
 The benchmark uses WordPress itself as the grader, running generated code in a sandboxed environment with static analysis and runtime assertions.
@@ -114,7 +114,7 @@ wp-bench dry-run --config wp-bench.yaml      # validate config without calling m
 Test suites live in `datasets/suites/<suite-name>/` with two directories per suite:
 
 - `execution/` — Code generation tasks with assertions (one JSON file per category)
-- `knowledge/` — Multiple-choice knowledge questions (one JSON file per category)
+- `knowledge/` — Multiple-choice and short-answer knowledge questions (one JSON file per category)
 
 The default suite `wp-core-v1` covers WordPress core APIs, hooks, database operations, and security patterns.
 
diff --git a/datasets/README.md b/datasets/README.md
index 334041d..0758880 100644
--- a/datasets/README.md
+++ b/datasets/README.md
@@ -75,5 +75,7 @@ dataset:
 |-------|------|-------------|
 | `id` | string | Unique test ID |
 | `prompt` | string | Question text |
-| `choices` | array | Multiple choice options `[{key, text}]` |
-| `correct_answer` | string | Correct choice key (e.g., "B") |
+| `type` | string | Knowledge mode such as `multiple_choice` or `short_answer` |
+| `choices` | array | Optional multiple choice options `[{key, text}]` |
+| `correct_answer` | string | Correct choice key or canonical short answer |
+| `answer_type` | string | Optional short-answer scoring mode such as `exact` or `contains` |
diff --git a/datasets/export_dataset.py b/datasets/export_dataset.py
index 2cce090..3f53d8b 100644
--- a/datasets/export_dataset.py
+++ b/datasets/export_dataset.py
@@ -34,11 +34,13 @@ def load_suite(suite_name: str) -> list[dict]:
                     "id": t["id"],
                     "suite": suite_name,
                     "test_kind": "execution",
+                    "type": "execution",
                     "prompt": t["prompt"],
                     "category": t.get("category", "general"),
                     "difficulty": t.get("difficulty", "unknown"),
                     "choices": orjson.dumps(t.get("choices", [])).decode(),
                     "correct_answer": "",
+                    "answer_type": "",
                     "requirements": orjson.dumps(t.get("requirements", [])).decode(),
                     "static_checks": orjson.dumps(t.get("static_checks", {})).decode(),
                     "runtime_checks": orjson.dumps(t.get("runtime_checks", {})).decode(),
@@ -56,11 +58,13 @@ def load_suite(suite_name: str) -> list[dict]:
                     "id": t["id"],
                     "suite": suite_name,
                     "test_kind": "knowledge",
+                    "type": t.get("type", "knowledge"),
                     "prompt": t["prompt"],
                     "category": t.get("category", "general"),
                     "difficulty": t.get("difficulty", "unknown"),
                     "choices": orjson.dumps(t.get("choices", [])).decode(),
                     "correct_answer": t.get("correct_answer", ""),
+                    "answer_type": t.get("answer_type", ""),
                     "requirements": "[]",
                     "static_checks": "{}",
                     "runtime_checks": "{}",
diff --git a/python/tests/test_knowledge.py b/python/tests/test_knowledge.py
new file mode 100644
index 0000000..382ee15
--- /dev/null
+++ b/python/tests/test_knowledge.py
@@ -0,0 +1,146 @@
+from __future__ import annotations
+
+from pathlib import Path
+
+from wp_bench.config import DatasetConfig
+from wp_bench.datasets import KnowledgeTest, _parse_knowledge_suite
+from wp_bench.knowledge import render_knowledge_prompt, score_knowledge_answer
+import wp_bench.datasets as datasets_module
+
+
+def test_render_knowledge_prompt_uses_choice_instructions_for_multiple_choice() -> None:
+    test = KnowledgeTest(
+        id="k-rest-002",
+        suite="wp-core-v1",
+        prompt="Which function is used to register a custom REST API route in WordPress?",
+        test_type="multiple_choice",
+        category="rest-api",
+        difficulty="intermediate",
+        choices=[
+            {"key": "A", "text": "add_rest_route()"},
+            {"key": "B", "text": "register_rest_route()"},
+        ],
+        correct_answer="B",
+    )
+
+    prompt = render_knowledge_prompt(test)
+    # Both the lettered choice list and the letter-only instruction must appear.
+    assert "Choices:" in prompt
+    assert "B. register_rest_route()" in prompt
+    assert "Answer with only the letter of the correct choice." in prompt
+
+
+def test_render_knowledge_prompt_uses_short_answer_instructions_without_choices() -> None:
+    test = KnowledgeTest(
+        id="k-rest-001",
+        suite="wp-core-v1",
+        prompt="What is the default namespace prefix for WordPress core REST API endpoints?",
+        test_type="short_answer",
+        category="rest-api",
+        difficulty="intermediate",
+        correct_answer="wp/v2",
+        answer_type="exact",
+    )
+
+    prompt = render_knowledge_prompt(test)
+    # With no choices, the prompt must switch to the free-form instruction.
+    assert "Choices:" not in prompt
+    assert "Answer with only the letter" not in prompt
+    assert "Answer briefly with the correct WordPress function, API, hook, or value." in prompt
+
+
+def test_score_multiple_choice_accepts_letter_or_choice_text() -> None:
+    test = KnowledgeTest(
+        id="k-rest-002",
+        suite="wp-core-v1",
+        prompt="Which function is used to register a custom REST API route in WordPress?",
+        test_type="multiple_choice",
+        category="rest-api",
+        difficulty="intermediate",
+        choices=[
+            {"key": "A", "text": "add_rest_route()"},
+            {"key": "B", "text": "register_rest_route()"},
+        ],
+        correct_answer="B",
+    )
+    # A leading letter decides the score even when the reply also quotes choice text.
+    assert score_knowledge_answer(test, "B") == 1.0
+    assert score_knowledge_answer(test, "The answer is B.") == 1.0
+    assert score_knowledge_answer(test, "register_rest_route()") == 1.0
+    assert score_knowledge_answer(test, "A. register_rest_route()") == 0.0
+
+
+def test_score_short_answer_exact_accepts_expected_value_in_prose() -> None:
+    test = KnowledgeTest(
+        id="k-rest-001",
+        suite="wp-core-v1",
+        prompt="What is the default namespace prefix for WordPress core REST API endpoints?",
+        test_type="short_answer",
+        category="rest-api",
+        difficulty="intermediate",
+        correct_answer="wp/v2",
+        answer_type="exact",
+    )
+    # "exact" still accepts the canonical value when it is embedded in a sentence.
+    assert score_knowledge_answer(test, "wp/v2") == 1.0
+    assert score_knowledge_answer(test, "The answer is wp/v2.") == 1.0
+
+
+def test_score_short_answer_contains_accepts_function_name_with_parens() -> None:
+    test = KnowledgeTest(
+        id="k-security-001",
+        suite="wp-core-v1",
+        prompt="Which function should be used to escape HTML output in WordPress?",
+        test_type="short_answer",
+        category="security",
+        difficulty="basic",
+        correct_answer="esc_html",
+        answer_type="contains",
+    )
+    # Substring matching lets the bare name match a call written with parentheses.
+    assert score_knowledge_answer(test, "Use esc_html() for this.") == 1.0
+
+
+def test_local_parser_preserves_short_answer_metadata() -> None:
+    suite_path = (
+        Path(__file__).resolve().parents[2]
+        / "datasets"
+        / "suites"
+        / "wp-core-v1"
+        / "knowledge"
+        / "rest-api.json"
+    )
+    # NOTE(review): relies on the first entry of rest-api.json being a short-answer test.
+    test = _parse_knowledge_suite(suite_path)[0]
+
+    assert test.test_type == "short_answer"
+    assert test.answer_type == "exact"
+    assert test.choices is None
+
+
+def test_huggingface_loader_preserves_short_answer_metadata(monkeypatch) -> None:
+    rows = [
+        {
+            "id": "k-rest-001",
+            "suite": "wp-core-v1",
+            "test_kind": "knowledge",
+            "type": "short_answer",
+            "prompt": "What is the default namespace prefix for WordPress core REST API endpoints?",
+            "category": "rest-api",
+            "difficulty": "intermediate",
+            "choices": "[]",
+            "correct_answer": "wp/v2",
+            "answer_type": "exact",
+        }
+    ]
+    # Stub hf_load_dataset so load_tests consumes the in-memory row defined above.
+    monkeypatch.setattr(datasets_module, "hf_load_dataset", lambda *args, **kwargs: rows)
+
+    loaded = datasets_module.load_tests(
+        DatasetConfig(source="huggingface", name="WordPress/wp-bench-v1")
+    )
+    test = loaded["knowledge"][0]
+
+    assert test.test_type == "short_answer"
+    assert test.answer_type == "exact"
+    assert test.choices is None
diff --git a/python/tests/test_models.py b/python/tests/test_models.py
new file mode 100644
index 0000000..21c6706
--- /dev/null
+++ b/python/tests/test_models.py
@@ -0,0 +1,61 @@
+from __future__ import annotations
+
+from types import SimpleNamespace
+
+from litellm.exceptions import BadRequestError
+
+from wp_bench.config import ModelConfig
+from wp_bench.models import ModelInterface
+import wp_bench.models as models_module
+
+
+def test_generate_retries_without_temperature_on_deprecated_error(monkeypatch) -> None:
+    """A deprecated-temperature rejection triggers one retry without ``temperature``."""
+    calls: list[dict] = []
+
+    def fake_completion(**kwargs):
+        calls.append(kwargs)
+        if len(calls) == 1:
+            raise BadRequestError(
+                message="AnthropicException - `temperature` is deprecated for this model.",
+                model="anthropic/claude-opus-4-7",
+                llm_provider="anthropic",
+            )
+        return SimpleNamespace(choices=[SimpleNamespace(message={"content": "ok"})])
+
+    monkeypatch.setattr(models_module, "completion", fake_completion)
+
+    model = ModelInterface(ModelConfig(name="anthropic/claude-opus-4-7"))
+    result = model.generate("hello")
+
+    assert result == "ok"
+    assert len(calls) == 2
+    assert calls[0]["temperature"] == 0.0
+    assert "temperature" not in calls[1]
+
+
+def test_generate_does_not_retry_other_bad_request_errors(monkeypatch) -> None:
+    """An unrelated BadRequestError propagates after exactly one completion call."""
+    calls: list[dict] = []
+
+    def fake_completion(**kwargs):
+        calls.append(kwargs)
+        raise BadRequestError(
+            message="AnthropicException - some other invalid request.",
+            model="anthropic/claude-opus-4-7",
+            llm_provider="anthropic",
+        )
+
+    monkeypatch.setattr(models_module, "completion", fake_completion)
+
+    model = ModelInterface(ModelConfig(name="anthropic/claude-opus-4-7"))
+
+    try:
+        model.generate("hello")
+    except BadRequestError as error:
+        assert "some other invalid request" in str(error)
+    else:
+        raise AssertionError("Expected BadRequestError to be raised")
+
+    # The stub always raises, so only the call count can reveal an unwanted retry.
+    assert len(calls) == 1
diff --git a/python/wp_bench/config.py b/python/wp_bench/config.py
index 8e63144..8ff6195 100644
--- a/python/wp_bench/config.py
+++ b/python/wp_bench/config.py
@@ -2,7 +2,7 @@
 from __future__ import annotations
 
 from pathlib import Path
-from typing import List, Literal, Optional, Union
+from typing import List, Literal, Optional
 
 from pydantic import BaseModel, Field, HttpUrl, validator
 
diff --git a/python/wp_bench/core.py b/python/wp_bench/core.py
index ca7d95a..de7a778 100644
--- a/python/wp_bench/core.py
+++ b/python/wp_bench/core.py
@@ -13,6 +13,7 @@
 from .config import HarnessConfig, ModelConfig
 from .datasets import ExecutionTest, KnowledgeTest, load_tests
 from .environment import WordPressEnvironment
+from .knowledge import render_knowledge_prompt, score_knowledge_answer
 from .models import ModelInterface
 from .output import (
     create_progress,
@@ -106,10 +107,10 @@ def run(self) -> Dict[str, Any]:
         return payload
 
     def _run_knowledge_tests(self, tests: List[KnowledgeTest]) -> None:
-        """Run multiple-choice knowledge tests in parallel.
+        """Run knowledge tests in parallel.
 
         Prompts the model with WordPress knowledge questions and scores responses
-        based on whether they match the expected answer letter.
+        against either multiple-choice or short-answer expectations.
 
         Args:
             tests: List of knowledge test definitions.
@@ -125,7 +126,7 @@ def process_test(test: KnowledgeTest) -> Dict[str, Any]:
             try:
                 prompt = self._render_knowledge_prompt(test)
                 answer = strip_code_fences(self.model.generate(prompt)).strip()
-                correct = 1.0 if (test.correct_answer and answer.upper().startswith(test.correct_answer)) else 0.0
+                correct = score_knowledge_answer(test, answer)
                 return {
                     "test_id": test.id,
                     "type": "knowledge",
@@ -215,21 +216,15 @@ def process_test(test: ExecutionTest) -> Dict[str, Any]:
 
     @staticmethod
     def _render_knowledge_prompt(test: KnowledgeTest) -> str:
-        """Format a knowledge test into a multiple-choice prompt string.
+        """Format a knowledge test into a prompt string.
 
         Args:
             test: Knowledge test with question and choices.
 
         Returns:
-            Formatted prompt asking for a single letter answer.
+            Formatted prompt with instructions that match the answer mode.
         """
-        prompt = [test.prompt]
-        if test.choices:
-            prompt.append("Choices:")
-            for choice in test.choices:
-                prompt.append(f"{choice['key']}. {choice['text']}")
-        prompt.append("Answer with only the letter of the correct choice.")
-        return "\n".join(prompt)
+        return render_knowledge_prompt(test)
 
     @staticmethod
     def _render_execution_prompt(test: ExecutionTest) -> str:
@@ -425,7 +420,7 @@ def process_test(test: KnowledgeTest) -> Dict[str, Any]:
             try:
                 prompt = BenchmarkRunner._render_knowledge_prompt(test)
                 answer = strip_code_fences(self.model.generate(prompt)).strip()
-                correct = 1.0 if (test.correct_answer and answer.upper().startswith(test.correct_answer)) else 0.0
+                correct = score_knowledge_answer(test, answer)
                 return {
                     "test_id": test.id,
                     "type": "knowledge",
diff --git a/python/wp_bench/datasets.py b/python/wp_bench/datasets.py
index c945677..97ae55e 100644
--- a/python/wp_bench/datasets.py
+++ b/python/wp_bench/datasets.py
@@ -40,6 +40,7 @@ class KnowledgeTest:
     difficulty: str
     choices: Optional[List[Dict[str, Any]]] = None
     correct_answer: Optional[str] = None
+    answer_type: Optional[str] = None
     answer: Optional[str] = None
     metadata: Optional[Dict[str, Any]] = None
 
@@ -88,16 +89,18 @@ def _load_from_huggingface(config: DatasetConfig) -> Dict[str, List[Any]]:
                 )
             )
         else:
+            choice_list = choices if isinstance(choices, list) and choices else None
             knowledge.append(
                 KnowledgeTest(
                     id=row["id"],
                     suite=row.get("suite", config.name),
                     prompt=row["prompt"],
-                    test_type="knowledge",
+                    test_type=row.get("type", "knowledge"),
                     category=row.get("category", "general"),
                     difficulty=row.get("difficulty", "unknown"),
-                    choices=choices if isinstance(choices, list) else None,
+                    choices=choice_list,
                     correct_answer=row.get("correct_answer"),
+                    answer_type=row.get("answer_type"),
                     metadata={},
                 )
             )
@@ -179,6 +182,7 @@ def _parse_knowledge_suite(path: Path) -> List[KnowledgeTest]:
                 difficulty=test.get("difficulty", "unknown"),
                 choices=test.get("choices"),
                 correct_answer=test.get("correct_answer"),
+                answer_type=test.get("answer_type"),
                 metadata={"suite_metadata": metadata},
             )
         )
diff --git a/python/wp_bench/knowledge.py b/python/wp_bench/knowledge.py
new file mode 100644
index 0000000..8d03e5a
--- /dev/null
+++ b/python/wp_bench/knowledge.py
@@ -0,0 +1,86 @@
+"""Knowledge prompt rendering and scoring helpers."""
+from __future__ import annotations
+
+import re
+
+from .datasets import KnowledgeTest
+from .utils import strip_code_fences
+
+_CHOICE_ANSWER_RE = re.compile(
+    r"^\s*(?:(?:the\s+)?(?:correct\s+)?(?:answer|option|choice)\s*(?:is|:)\s*)?"
+    r"[\(\[]?([A-Z])(?:[\)\].:\-,\s]|$)",
+    re.IGNORECASE,  # lowercase letters match too, so [A-Z] captures any single letter
+)
+
+
+def render_knowledge_prompt(test: KnowledgeTest) -> str:
+    """Build the newline-joined prompt for a knowledge test.
+
+    Tests with choices list them by key and request a letter; others request a short answer.
+    """
+    if not test.choices:
+        closing = "Answer briefly with the correct WordPress function, API, hook, or value."
+        return "\n".join([test.prompt, closing])
+    listing = [f"{option['key']}. {option['text']}" for option in test.choices]
+    closing = "Answer with only the letter of the correct choice."
+    return "\n".join([test.prompt, "Choices:", *listing, closing])
+
+
+def score_knowledge_answer(test: KnowledgeTest, answer: str) -> float:
+    """Score a model response for a knowledge test.
+
+    Returns 1.0 for a correct response and 0.0 otherwise. Multiple-choice tests
+    accept the choice letter or the correct choice's text; short-answer tests
+    accept an exact match or the expected value embedded in a longer reply.
+    """
+    if not test.correct_answer:
+        return 0.0
+    if test.choices:
+        correct_key = test.correct_answer.upper()
+        valid_keys = {str(choice.get("key", "")).upper() for choice in test.choices}
+        answer_key = _extract_choice_key(answer)
+        # A leading letter counts as a pick only if it names a real choice; prose
+        # such as "I think ..." must fall through to the choice-text comparison.
+        if answer_key is not None and answer_key in valid_keys:
+            return 1.0 if answer_key == correct_key else 0.0
+        correct_choice_text = _lookup_choice_text(test, correct_key)
+        if not correct_choice_text:
+            return 0.0
+        answer_normalized = _normalize_knowledge_text(answer)
+        choice_normalized = _normalize_knowledge_text(correct_choice_text)
+        return 1.0 if choice_normalized and choice_normalized in answer_normalized else 0.0
+    expected = _normalize_knowledge_text(test.correct_answer)
+    actual = _normalize_knowledge_text(answer)
+    if not expected or not actual:
+        return 0.0
+    if actual == expected:
+        return 1.0
+    # "exact" shares the containment check, so a prose-wrapped answer still passes.
+    answer_type = (test.answer_type or "exact").lower()
+    return 1.0 if answer_type in {"contains", "exact"} and expected in actual else 0.0
+
+
+def _extract_choice_key(answer: str) -> str | None:
+    """Return the uppercased letter matched at the start of the answer, or None."""
+    cleaned = strip_code_fences(answer).strip()
+    # An empty reply has nothing to match, so the regex is skipped entirely.
+    found = _CHOICE_ANSWER_RE.match(cleaned) if cleaned else None
+    if found is None:
+        return None
+    return found.group(1).upper()
+
+
+def _lookup_choice_text(test: KnowledgeTest, key: str) -> str | None:
+    """Return the text of the first choice whose uppercased key equals ``key``."""
+    candidates = test.choices or []
+    first_hit = next(
+        (entry for entry in candidates if str(entry.get("key", "")).upper() == key),
+        None,
+    )
+    label = first_hit.get("text") if first_hit is not None else None
+    return label if isinstance(label, str) else None
+
+
+def _normalize_knowledge_text(value: str) -> str:
+    """Strip code fences, trim, casefold, and collapse whitespace runs to one space."""
+    return re.sub(r"\s+", " ", strip_code_fences(value).strip().casefold())
diff --git a/python/wp_bench/models.py b/python/wp_bench/models.py
index 509086e..9b7a0e7 100644
--- a/python/wp_bench/models.py
+++ b/python/wp_bench/models.py
@@ -1,7 +1,10 @@
 """Model interface leveraging LiteLLM providers."""
 from __future__ import annotations
 
+from typing import Any
+
 from litellm import completion, completion_cost
+from litellm.exceptions import BadRequestError
 from litellm.utils import ModelResponse
 
 from .config import ModelConfig
@@ -22,17 +25,32 @@ def generate(self, prompt: str) -> str:
         Returns:
             The model's response text.
         """
-        response: ModelResponse = completion(
-            model=self.config.name,
-            messages=[{"role": "user", "content": prompt}],
-            temperature=self.config.temperature,
-            max_tokens=self.config.max_tokens,
-            top_p=self.config.top_p,
-            timeout=self.config.request_timeout,
-        )
+        kwargs = self._completion_kwargs(prompt)
+        try:
+            response: ModelResponse = completion(**kwargs)
+        except BadRequestError as error:
+            if not _is_deprecated_temperature_error(error) or "temperature" not in kwargs:
+                raise
+            kwargs.pop("temperature")
+            response = completion(**kwargs)
         choice = response.choices[0]
         return choice.message["content"]  # type: ignore[index]
 
     @staticmethod
     def estimate_cost(response: ModelResponse) -> float:
         return completion_cost(response)
+
+    def _completion_kwargs(self, prompt: str) -> dict[str, Any]:
+        """Return the keyword arguments for a single-user-message completion call."""
+        return {
+            "model": self.config.name,
+            "messages": [{"role": "user", "content": prompt}],
+            "max_tokens": self.config.max_tokens,
+            "top_p": self.config.top_p,
+            "timeout": self.config.request_timeout,
+            "temperature": self.config.temperature,
+        }
+
+
+def _is_deprecated_temperature_error(error: BadRequestError) -> bool:
+    return "`temperature` is deprecated" in str(error)  # matched against the error's text