From 996d9e5d1de9b164219ec88d743d33240fd1113e Mon Sep 17 00:00:00 2001
From: Virginia Wu <vadams@nvidia.com>
Date: Thu, 25 Jun 2026 17:06:58 -0700
Subject: [PATCH 1/8] Add task distribution calculation code for GDPVal and
 other datasets

Signed-off-by: Virginia Wu <vadams@nvidia.com>
---
 .../stirrup_agent/task_distribution.py        | 406 ++++++++++++++++++
 .../tests/test_task_distribution.py           | 269 ++++++++++++
 2 files changed, 675 insertions(+)
 create mode 100644 responses_api_agents/stirrup_agent/task_distribution.py
 create mode 100644 responses_api_agents/stirrup_agent/tests/test_task_distribution.py
diff --git a/responses_api_agents/stirrup_agent/task_distribution.py b/responses_api_agents/stirrup_agent/task_distribution.py
new file mode 100644
index 000000000..20f41fb45
--- /dev/null
+++ b/responses_api_agents/stirrup_agent/task_distribution.py
@@ -0,0 +1,406 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Build a task distribution over one or more dataset columns.
+
+A *distribution* groups every task in a dataset by the value(s) of one or
+more metadata columns (e.g. ``sector``, or ``sector`` + ``occupation``) and
+records, for each group, the fraction of the dataset it covers and the list
+of ``task_id``s that fall into it::
+
+    {
+      "Business, Finance & Operations": {"percentage": 0.05, "task_ids": ["a", "b"]},
+      "Legal": {"percentage": 0.50, "task_ids": [...]},
+      "Healthcare": {"percentage": 0.45, "task_ids": [...]}
+    }
+
+Datasets are the NeMo Gym Responses-API JSONL format: one task per line, with
+the groupable columns living under ``responses_create_params.metadata``.
+
+The grouping logic is intentionally separated from the CLI so the resulting
+distribution can later be reused to *sample* ``task_id``s (see
+``sample_task_ids``).
+
+Usage::
+
+    # --dataset defaults to the prepared GDPVal dataset
+    # (benchmarks/gdpval/data/gdpval_benchmark.jsonl) when omitted.
+    python -m responses_api_agents.stirrup_agent.task_distribution \
+        --column sector \
+        --output sector_distribution.json
+
+    # Composite key over multiple columns, explicit dataset:
+    python -m responses_api_agents.stirrup_agent.task_distribution \
+        --dataset data/gdpval.jsonl --column sector --column occupation \
+        --output sector_occupation_distribution.json
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import random
+import sys
+from pathlib import Path
+from typing import Any, Dict, Iterable, Iterator, List, Mapping, Optional, Sequence
+
+
+# Sentinel used when a row is missing one of the requested columns.
+MISSING_VALUE = "<missing>"
+
+# Separator joining multiple column values into a single composite key.
+DEFAULT_KEY_SEPARATOR = " | "
+
+# Repo root: this file is responses_api_agents/stirrup_agent/task_distribution.py.
+_REPO_ROOT = Path(__file__).resolve().parents[2]
+
+# Candidate GDPVal dataset locations, in priority order. The first that exists
+# is used when ``--dataset`` is not given. The prepared benchmark JSONL (written
+# by ``gym eval prepare --benchmark gdpval``) is preferred; the agent-local
+# ``data/gdpval.jsonl`` (written by setup_scripts/gdpval.sh) is a fallback.
+# The synthetic ``example.jsonl`` is intentionally *not* a default so the
+# command never silently computes a distribution over a single fake task.
+DEFAULT_DATASET_CANDIDATES = (
+    _REPO_ROOT / "benchmarks" / "gdpval" / "data" / "gdpval_benchmark.jsonl",
+    Path(__file__).resolve().parent / "data" / "gdpval.jsonl",
+)
+
+
+def resolve_default_dataset(
+    candidates: Optional[Sequence[Path]] = None,
+) -> Optional[Path]:
+    """Return the first existing default GDPVal dataset, or ``None``.
+
+    Used when the caller does not pass an explicit ``--dataset``; prefers the
+    prepared benchmark JSONL and falls back to agent-local datasets.
+    """
+    if candidates is None:
+        candidates = DEFAULT_DATASET_CANDIDATES
+    for candidate in candidates:
+        if candidate.is_file():
+            return candidate
+    return None
+
+
+def _no_dataset_message() -> str:
+    """Actionable error shown when no dataset is specified and no default exists."""
+    searched = "".join(f"  - {c}\n" for c in DEFAULT_DATASET_CANDIDATES)
+    return (
+        "No dataset specified and no default GDPVal dataset was found.\n"
+        f"\nSearched these default locations:\n{searched}"
+        "\nTo fix this, do one of the following:\n"
+        "\n  1. Prepare the GDPVal benchmark dataset (recommended). This downloads\n"
+        "     the openai/gdpval dataset from HuggingFace and writes\n"
+        "     benchmarks/gdpval/data/gdpval_benchmark.jsonl (220 tasks):\n"
+        "\n         export HF_TOKEN=<your-huggingface-token>\n"
+        "         gym eval prepare --benchmark gdpval\n"
+        "\n     (or run: bash responses_api_agents/stirrup_agent/setup_scripts/gdpval.sh)\n"
+        "\n  2. Pass an explicit dataset path with --dataset <path-to.jsonl>.\n"
+        "\nNote: the GDPVal dataset is gated on HuggingFace, so HF_TOKEN must be set\n"
+        "and your account must have access to https://huggingface.co/datasets/openai/gdpval.\n"
+    )
+
+
+def iter_dataset_rows(dataset_path: str | Path) -> Iterator[Dict[str, Any]]:
+    """Yield parsed JSON objects from a Responses-API JSONL dataset.
+
+    Blank lines are skipped; malformed lines raise ``ValueError`` with the
+    1-based line number so the offending row is easy to find.
+    """
+    path = Path(dataset_path)
+    with path.open("r", encoding="utf-8") as handle:
+        for line_no, line in enumerate(handle, start=1):
+            stripped = line.strip()
+            if not stripped:
+                continue
+            try:
+                yield json.loads(stripped)
+            except json.JSONDecodeError as exc:
+                raise ValueError(f"{path}:{line_no}: invalid JSON line: {exc}") from exc
+
+
+def extract_metadata(row: Mapping[str, Any]) -> Dict[str, Any]:
+    """Return the ``responses_create_params.metadata`` dict for a row.
+
+    Falls back to a top-level ``metadata`` key (and finally the row itself)
+    so the function also works on flatter dataset variants.
+    """
+    params = row.get("responses_create_params")
+    if isinstance(params, Mapping):
+        metadata = params.get("metadata")
+        if isinstance(metadata, Mapping):
+            return dict(metadata)
+    metadata = row.get("metadata")
+    if isinstance(metadata, Mapping):
+        return dict(metadata)
+    return dict(row)
+
+
+def compose_key(
+    metadata: Mapping[str, Any],
+    columns: Sequence[str],
+    *,
+    separator: str = DEFAULT_KEY_SEPARATOR,
+    missing_value: str = MISSING_VALUE,
+) -> str:
+    """Build the distribution key for a row from one or more columns.
+
+    Each column value is stringified; missing values become ``missing_value``.
+    Multiple columns are joined with ``separator`` into a composite key.
+    """
+    parts: List[str] = []
+    for column in columns:
+        value = metadata.get(column, None)
+        if value is None:
+            parts.append(missing_value)
+        else:
+            parts.append(str(value))
+    return separator.join(parts)
+
+
+def build_distribution(
+    rows: Iterable[Mapping[str, Any]],
+    columns: Sequence[str],
+    *,
+    task_id_column: str = "task_id",
+    separator: str = DEFAULT_KEY_SEPARATOR,
+    missing_value: str = MISSING_VALUE,
+    precision: Optional[int] = 6,
+) -> Dict[str, Dict[str, Any]]:
+    """Compute the task distribution across ``columns``.
+
+    Returns a mapping ``key -> {"percentage": float, "task_ids": [...]}`` where
+    ``percentage`` is the fraction (0..1) of all tasks that share that key and
+    ``task_ids`` lists every matching task in first-seen order. The mapping is
+    ordered by descending ``percentage`` (ties broken by key) for readability.
+
+    ``percentage`` values are rounded to ``precision`` decimal places when
+    ``precision`` is not ``None``. Note that rounding can make the percentages
+    sum to slightly more or less than 1.0; the unrounded fractions always sum
+    to 1.0.
+    """
+    if not columns:
+        raise ValueError("At least one column is required to build a distribution.")
+
+    grouped: Dict[str, List[str]] = {}
+    total = 0
+    for index, row in enumerate(rows):
+        metadata = extract_metadata(row)
+        key = compose_key(metadata, columns, separator=separator, missing_value=missing_value)
+        task_id = metadata.get(task_id_column)
+        # Fall back to a positional id so every task is still counted/listed
+        # even when the dataset lacks an explicit task-id column.
+        task_id_str = str(task_id) if task_id is not None else f"{task_id_column}_index_{index}"
+        grouped.setdefault(key, []).append(task_id_str)
+        total += 1
+
+    distribution: Dict[str, Dict[str, Any]] = {}
+    for key, task_ids in grouped.items():
+        fraction = (len(task_ids) / total) if total else 0.0
+        percentage = round(fraction, precision) if precision is not None else fraction
+        distribution[key] = {"percentage": percentage, "task_ids": task_ids}
+
+    # Sort by descending share, then by key for stable, readable output.
+    ordered = dict(
+        sorted(
+            distribution.items(),
+            key=lambda item: (-len(item[1]["task_ids"]), item[0]),
+        )
+    )
+    return ordered
+
+
+def build_distribution_from_dataset(
+    dataset_path: str | Path,
+    columns: Sequence[str],
+    *,
+    task_id_column: str = "task_id",
+    separator: str = DEFAULT_KEY_SEPARATOR,
+    missing_value: str = MISSING_VALUE,
+    precision: Optional[int] = 6,
+) -> Dict[str, Dict[str, Any]]:
+    """Convenience wrapper: read a JSONL dataset and build its distribution."""
+    return build_distribution(
+        iter_dataset_rows(dataset_path),
+        columns,
+        task_id_column=task_id_column,
+        separator=separator,
+        missing_value=missing_value,
+        precision=precision,
+    )
+
+
+def sample_task_ids(
+    distribution: Mapping[str, Mapping[str, Any]],
+    n: int,
+    *,
+    rng: Optional[random.Random] = None,
+    replace: bool = False,
+) -> List[str]:
+    """Sample ``n`` ``task_id``s in proportion to a distribution's percentages.
+
+    Each task id is drawn by first choosing a group weighted by its
+    ``percentage`` and then choosing a task id within that group. With
+    ``replace=False`` (default) the same task id is never returned twice and
+    ``n`` is capped at the total number of available task ids.
+
+    This is the consumption-side counterpart to ``build_distribution`` and is
+    provided so the saved distribution file can directly drive task sampling.
+    """
+    if n <= 0:
+        return []
+    rng = rng or random.Random()
+
+    keys = list(distribution.keys())
+    weights = [float(distribution[key].get("percentage", 0.0)) for key in keys]
+    if not keys or sum(weights) <= 0:
+        return []
+
+    if replace:
+        sampled: List[str] = []
+        for _ in range(n):
+            (chosen_key,) = rng.choices(keys, weights=weights, k=1)
+            task_ids = list(distribution[chosen_key].get("task_ids", []))
+            if not task_ids:
+                continue
+            sampled.append(rng.choice(task_ids))
+        return sampled
+
+    # Without replacement: track remaining ids per group and renormalise.
+    remaining: Dict[str, List[str]] = {key: list(distribution[key].get("task_ids", [])) for key in keys}
+    total_available = sum(len(ids) for ids in remaining.values())
+    target = min(n, total_available)
+
+    sampled = []
+    while len(sampled) < target:
+        live_keys = [key for key in keys if remaining[key]]
+        live_weights = [float(distribution[key].get("percentage", 0.0)) for key in live_keys]
+        if not live_keys or sum(live_weights) <= 0:
+            break
+        (chosen_key,) = rng.choices(live_keys, weights=live_weights, k=1)
+        bucket = remaining[chosen_key]
+        idx = rng.randrange(len(bucket))
+        sampled.append(bucket.pop(idx))
+    return sampled
+
+
+def _build_arg_parser() -> argparse.ArgumentParser:
+    parser = argparse.ArgumentParser(
+        prog="task_distribution",
+        description=(
+            "Build a JSON distribution of tasks across one or more dataset "
+            "columns (e.g. sector, occupation) from a Responses-API JSONL dataset."
+        ),
+    )
+    parser.add_argument(
+        "--dataset",
+        default=None,
+        help=(
+            "Path to the input JSONL dataset (one task per line). If omitted, "
+            "defaults to the prepared GDPVal dataset "
+            "(benchmarks/gdpval/data/gdpval_benchmark.jsonl), falling back to "
+            "the agent-local data/gdpval.jsonl."
+        ),
+    )
+    parser.add_argument(
+        "--column",
+        dest="columns",
+        action="append",
+        required=True,
+        metavar="COLUMN",
+        help=(
+            "Metadata column to group by. Repeat to group by a composite key "
+            "(e.g. --column sector --column occupation)."
+        ),
+    )
+    parser.add_argument(
+        "--output",
+        "-o",
+        default=None,
+        help="Path to write the distribution JSON. Defaults to stdout.",
+    )
+    parser.add_argument(
+        "--task-id-column",
+        default="task_id",
+        help="Metadata column holding the task id (default: task_id).",
+    )
+    parser.add_argument(
+        "--separator",
+        default=DEFAULT_KEY_SEPARATOR,
+        help=f"Separator joining multiple column values into one key (default: {DEFAULT_KEY_SEPARATOR!r}).",
+    )
+    parser.add_argument(
+        "--missing-value",
+        default=MISSING_VALUE,
+        help=f"Placeholder for rows missing a column (default: {MISSING_VALUE!r}).",
+    )
+    parser.add_argument(
+        "--precision",
+        type=int,
+        default=6,
+        help="Decimal places to round percentages to; use -1 for no rounding (default: 6).",
+    )
+    parser.add_argument(
+        "--indent",
+        type=int,
+        default=2,
+        help="Indentation for the output JSON; use -1 for compact output (default: 2).",
+    )
+    return parser
+
+
+def main(argv: Optional[Sequence[str]] = None) -> int:
+    parser = _build_arg_parser()
+    args = parser.parse_args(argv)
+
+    if args.dataset is not None:
+        dataset_path = Path(args.dataset)
+        if not dataset_path.is_file():
+            print(f"Dataset not found: {dataset_path}", file=sys.stderr)
+            return 2
+    else:
+        dataset_path = resolve_default_dataset()
+        if dataset_path is None:
+            print(_no_dataset_message(), file=sys.stderr)
+            return 2
+        print(f"Using default dataset: {dataset_path}", file=sys.stderr)
+
+    precision = None if args.precision is not None and args.precision < 0 else args.precision
+    indent = None if args.indent is not None and args.indent < 0 else args.indent
+
+    distribution = build_distribution_from_dataset(
+        dataset_path,
+        args.columns,
+        task_id_column=args.task_id_column,
+        separator=args.separator,
+        missing_value=args.missing_value,
+        precision=precision,
+    )
+
+    payload = json.dumps(distribution, indent=indent, ensure_ascii=False)
+    if args.output:
+        out_path = Path(args.output)
+        out_path.parent.mkdir(parents=True, exist_ok=True)
+        out_path.write_text(payload + "\n", encoding="utf-8")
+        total_tasks = sum(len(entry["task_ids"]) for entry in distribution.values())
+        print(
+            f"Wrote distribution over {args.columns} ({len(distribution)} groups, {total_tasks} tasks) to {out_path}",
+            file=sys.stderr,
+        )
+    else:
+        print(payload)
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/responses_api_agents/stirrup_agent/tests/test_task_distribution.py b/responses_api_agents/stirrup_agent/tests/test_task_distribution.py
new file mode 100644
index 000000000..6e56e4423
--- /dev/null
+++ b/responses_api_agents/stirrup_agent/tests/test_task_distribution.py
@@ -0,0 +1,269 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import json
+import random
+from pathlib import Path
+
+import pytest
+
+from responses_api_agents.stirrup_agent import task_distribution as td
+from responses_api_agents.stirrup_agent.task_distribution import (
+    MISSING_VALUE,
+    build_distribution,
+    build_distribution_from_dataset,
+    compose_key,
+    extract_metadata,
+    iter_dataset_rows,
+    main,
+    resolve_default_dataset,
+    sample_task_ids,
+)
+
+
+def _row(task_id: str, **metadata) -> dict:
+    return {"responses_create_params": {"input": "", "metadata": {"task_id": task_id, **metadata}}}
+
+
+def _write_jsonl(path: Path, rows) -> Path:
+    path.write_text("\n".join(json.dumps(r) for r in rows) + "\n", encoding="utf-8")
+    return path
+
+
+class TestExtractMetadata:
+    def test_responses_create_params_metadata(self) -> None:
+        row = _row("t1", sector="Legal")
+        assert extract_metadata(row) == {"task_id": "t1", "sector": "Legal"}
+
+    def test_top_level_metadata_fallback(self) -> None:
+        row = {"metadata": {"task_id": "t1", "sector": "Legal"}}
+        assert extract_metadata(row) == {"task_id": "t1", "sector": "Legal"}
+
+    def test_row_itself_fallback(self) -> None:
+        row = {"task_id": "t1", "sector": "Legal"}
+        assert extract_metadata(row) == {"task_id": "t1", "sector": "Legal"}
+
+    def test_non_mapping_params_falls_through(self) -> None:
+        row = {"responses_create_params": "oops", "metadata": {"task_id": "t1"}}
+        assert extract_metadata(row) == {"task_id": "t1"}
+
+
+class TestComposeKey:
+    def test_single_column(self) -> None:
+        assert compose_key({"sector": "Legal"}, ["sector"]) == "Legal"
+
+    def test_composite_key(self) -> None:
+        meta = {"sector": "Legal", "occupation": "Lawyer"}
+        assert compose_key(meta, ["sector", "occupation"]) == "Legal | Lawyer"
+
+    def test_missing_value_placeholder(self) -> None:
+        assert compose_key({}, ["sector"]) == MISSING_VALUE
+
+    def test_custom_separator(self) -> None:
+        meta = {"a": "x", "b": "y"}
+        assert compose_key(meta, ["a", "b"], separator="::") == "x::y"
+
+    def test_non_string_value_is_stringified(self) -> None:
+        assert compose_key({"n": 5}, ["n"]) == "5"
+
+
+class TestBuildDistribution:
+    def test_percentages_and_task_ids(self) -> None:
+        rows = [
+            _row("a", sector="Legal"),
+            _row("b", sector="Legal"),
+            _row("c", sector="Healthcare"),
+            _row("d", sector="Finance"),
+        ]
+        dist = build_distribution(rows, ["sector"])
+        assert dist["Legal"]["percentage"] == 0.5
+        assert dist["Legal"]["task_ids"] == ["a", "b"]
+        assert dist["Healthcare"]["percentage"] == 0.25
+        assert dist["Finance"]["task_ids"] == ["d"]
+
+    def test_ordering_is_descending_by_share(self) -> None:
+        rows = [
+            _row("a", sector="Legal"),
+            _row("b", sector="Legal"),
+            _row("c", sector="Healthcare"),
+        ]
+        assert list(build_distribution(rows, ["sector"]).keys()) == ["Legal", "Healthcare"]
+
+    def test_percentages_sum_to_one_unrounded(self) -> None:
+        rows = [_row(str(i), sector=s) for i, s in enumerate(["a", "a", "b", "c", "c", "c", "d"])]
+        dist = build_distribution(rows, ["sector"], precision=None)
+        assert pytest.approx(sum(e["percentage"] for e in dist.values())) == 1.0
+
+    def test_composite_columns(self) -> None:
+        rows = [
+            _row("a", sector="Legal", occupation="Lawyer"),
+            _row("b", sector="Legal", occupation="Paralegal"),
+        ]
+        dist = build_distribution(rows, ["sector", "occupation"])
+        assert set(dist.keys()) == {"Legal | Lawyer", "Legal | Paralegal"}
+
+    def test_empty_rows_yields_empty(self) -> None:
+        assert build_distribution([], ["sector"]) == {}
+
+    def test_missing_column_grouped_under_placeholder(self) -> None:
+        rows = [_row("a"), _row("b", sector="Legal")]
+        dist = build_distribution(rows, ["sector"])
+        assert MISSING_VALUE in dist
+        assert dist[MISSING_VALUE]["task_ids"] == ["a"]
+
+    def test_missing_task_id_uses_positional_fallback(self) -> None:
+        rows = [{"responses_create_params": {"metadata": {"sector": "Legal"}}}]
+        dist = build_distribution(rows, ["sector"])
+        assert dist["Legal"]["task_ids"] == ["task_id_index_0"]
+
+    def test_requires_columns(self) -> None:
+        with pytest.raises(ValueError):
+            build_distribution([_row("a", sector="Legal")], [])
+
+    def test_precision_rounding(self) -> None:
+        rows = [_row(str(i), sector="a" if i == 0 else "b") for i in range(3)]
+        dist = build_distribution(rows, ["sector"], precision=2)
+        assert dist["b"]["percentage"] == 0.67
+
+
+class TestIterAndDatasetWrapper:
+    def test_iter_skips_blank_lines(self, tmp_path: Path) -> None:
+        path = tmp_path / "d.jsonl"
+        path.write_text(json.dumps(_row("a", sector="Legal")) + "\n\n", encoding="utf-8")
+        assert len(list(iter_dataset_rows(path))) == 1
+
+    def test_iter_raises_on_bad_json(self, tmp_path: Path) -> None:
+        path = tmp_path / "d.jsonl"
+        path.write_text("{not json}\n", encoding="utf-8")
+        with pytest.raises(ValueError, match="invalid JSON"):
+            list(iter_dataset_rows(path))
+
+    def test_build_from_dataset(self, tmp_path: Path) -> None:
+        path = _write_jsonl(tmp_path / "d.jsonl", [_row("a", sector="Legal"), _row("b", sector="Legal")])
+        dist = build_distribution_from_dataset(path, ["sector"])
+        assert dist["Legal"]["percentage"] == 1.0
+
+
+class TestSampleTaskIds:
+    def _dist(self):
+        return {
+            "Legal": {"percentage": 0.5, "task_ids": ["a", "b"]},
+            "Healthcare": {"percentage": 0.5, "task_ids": ["c", "d"]},
+        }
+
+    def test_zero_or_negative_returns_empty(self) -> None:
+        assert sample_task_ids(self._dist(), 0) == []
+        assert sample_task_ids(self._dist(), -3) == []
+
+    def test_without_replacement_no_duplicates(self) -> None:
+        rng = random.Random(0)
+        sampled = sample_task_ids(self._dist(), 3, rng=rng)
+        assert len(sampled) == 3
+        assert len(set(sampled)) == 3
+
+    def test_without_replacement_capped_at_total(self) -> None:
+        sampled = sample_task_ids(self._dist(), 100, rng=random.Random(1))
+        assert sorted(sampled) == ["a", "b", "c", "d"]
+
+    def test_with_replacement_allows_more_than_total(self) -> None:
+        sampled = sample_task_ids(self._dist(), 10, rng=random.Random(2), replace=True)
+        assert len(sampled) == 10
+
+    def test_empty_distribution_returns_empty(self) -> None:
+        assert sample_task_ids({}, 5) == []
+
+    def test_zero_weight_distribution_returns_empty(self) -> None:
+        dist = {"x": {"percentage": 0.0, "task_ids": ["a"]}}
+        assert sample_task_ids(dist, 5) == []
+        assert sample_task_ids(dist, 5, replace=True) == []
+
+    def test_with_replacement_skips_empty_groups(self) -> None:
+        dist = {"x": {"percentage": 1.0, "task_ids": []}}
+        assert sample_task_ids(dist, 3, rng=random.Random(3), replace=True) == []
+
+
+class TestResolveDefaultDataset:
+    def test_returns_first_existing(self, tmp_path: Path) -> None:
+        missing = tmp_path / "missing.jsonl"
+        present = _write_jsonl(tmp_path / "present.jsonl", [_row("a", sector="Legal")])
+        assert resolve_default_dataset([missing, present]) == present
+
+    def test_priority_order(self, tmp_path: Path) -> None:
+        first = _write_jsonl(tmp_path / "first.jsonl", [_row("a", sector="Legal")])
+        second = _write_jsonl(tmp_path / "second.jsonl", [_row("b", sector="Legal")])
+        assert resolve_default_dataset([first, second]) == first
+
+    def test_returns_none_when_nothing_exists(self, tmp_path: Path) -> None:
+        assert resolve_default_dataset([tmp_path / "a.jsonl", tmp_path / "b.jsonl"]) is None
+
+
+class TestMain:
+    def test_uses_default_dataset_when_omitted(self, tmp_path: Path, capsys, monkeypatch) -> None:
+        default_ds = _write_jsonl(tmp_path / "gdpval.jsonl", [_row("a", sector="Legal")])
+        monkeypatch.setattr(td, "DEFAULT_DATASET_CANDIDATES", (tmp_path / "missing.jsonl", default_ds))
+        rc = main(["--column", "sector"])
+        assert rc == 0
+        captured = capsys.readouterr()
+        assert str(default_ds) in captured.err
+        assert json.loads(captured.out)["Legal"]["percentage"] == 1.0
+
+    def test_errors_when_no_default_and_none_specified(self, tmp_path: Path, capsys, monkeypatch) -> None:
+        monkeypatch.setattr(td, "DEFAULT_DATASET_CANDIDATES", (tmp_path / "missing.jsonl",))
+        rc = main(["--column", "sector"])
+        assert rc == 2
+        err = capsys.readouterr().err
+        assert "no default gdpval dataset was found" in err.lower()
+        assert "gym eval prepare --benchmark gdpval" in err
+        assert "--dataset" in err
+
+    def test_errors_when_specified_dataset_missing(self, tmp_path: Path, capsys) -> None:
+        rc = main(["--dataset", str(tmp_path / "nope.jsonl"), "--column", "sector"])
+        assert rc == 2
+        assert "Dataset not found" in capsys.readouterr().err
+
+    def test_writes_output_file(self, tmp_path: Path, capsys) -> None:
+        dataset = _write_jsonl(
+            tmp_path / "d.jsonl",
+            [_row("a", sector="Legal"), _row("b", sector="Legal"), _row("c", sector="Healthcare")],
+        )
+        out = tmp_path / "dist.json"
+        rc = main(["--dataset", str(dataset), "--column", "sector", "--output", str(out)])
+        assert rc == 0
+        data = json.loads(out.read_text())
+        assert data["Legal"]["task_ids"] == ["a", "b"]
+        assert "3 tasks" in capsys.readouterr().err
+
+    def test_stdout_when_no_output(self, tmp_path: Path, capsys) -> None:
+        dataset = _write_jsonl(tmp_path / "d.jsonl", [_row("a", sector="Legal")])
+        rc = main(["--dataset", str(dataset), "--column", "sector"])
+        assert rc == 0
+        assert json.loads(capsys.readouterr().out)["Legal"]["percentage"] == 1.0
+
+    def test_no_rounding_and_compact(self, tmp_path: Path, capsys) -> None:
+        dataset = _write_jsonl(
+            tmp_path / "d.jsonl", [_row("a", sector="x"), _row("b", sector="y"), _row("c", sector="y")]
+        )
+        rc = main(["--dataset", str(dataset), "--column", "sector", "--precision", "-1", "--indent", "-1"])
+        assert rc == 0
+        out = capsys.readouterr().out
+        assert "\n  " not in out  # compact (no indentation)
+        assert json.loads(out)["y"]["percentage"] == pytest.approx(2 / 3)
+
+    def test_composite_columns_cli(self, tmp_path: Path, capsys) -> None:
+        dataset = _write_jsonl(
+            tmp_path / "d.jsonl",
+            [_row("a", sector="Legal", occupation="Lawyer")],
+        )
+        rc = main(["--dataset", str(dataset), "--column", "sector", "--column", "occupation"])
+        assert rc == 0
+        assert "Legal | Lawyer" in json.loads(capsys.readouterr().out)

From 9057f17d6973755082a90e8c304e55250dca4859 Mon Sep 17 00:00:00 2001
From: Virginia Wu <vadams@nvidia.com>
Date: Thu, 25 Jun 2026 17:19:19 -0700
Subject: [PATCH 2/8] Make task_distribution default to the occupation column

Signed-off-by: Virginia Wu <vadams@nvidia.com>
---
 .../stirrup_agent/task_distribution.py        | 26 ++++++++++++++-----
 .../tests/test_task_distribution.py           | 13 ++++++++++
 2 files changed, 33 insertions(+), 6 deletions(-)

diff --git a/responses_api_agents/stirrup_agent/task_distribution.py b/responses_api_agents/stirrup_agent/task_distribution.py
index 20f41fb45..9b963e7ea 100644
--- a/responses_api_agents/stirrup_agent/task_distribution.py
+++ b/responses_api_agents/stirrup_agent/task_distribution.py
@@ -33,8 +33,13 @@
 
 Usage::
 
-    # --dataset defaults to the prepared GDPVal dataset
-    # (benchmarks/gdpval/data/gdpval_benchmark.jsonl) when omitted.
+    # Full config defaults: the prepared GDPVal dataset
+    # (benchmarks/gdpval/data/gdpval_benchmark.jsonl) grouped by ``occupation``.
+    # Without --output the distribution is printed to stdout.
+    python -m responses_api_agents.stirrup_agent.task_distribution \
+        --output occupation_distribution.json
+
+    # --dataset defaults to the prepared GDPVal dataset when omitted.
     python -m responses_api_agents.stirrup_agent.task_distribution \
         --column sector \
         --output sector_distribution.json
@@ -61,6 +66,9 @@
 # Separator joining multiple column values into a single composite key.
 DEFAULT_KEY_SEPARATOR = " | "
 
+# Column grouped on when ``--column`` is not specified.
+DEFAULT_COLUMN = "occupation"
+
 # Repo root: this file is responses_api_agents/stirrup_agent/task_distribution.py.
 _REPO_ROOT = Path(__file__).resolve().parents[2]
 
@@ -316,11 +324,12 @@ def _build_arg_parser() -> argparse.ArgumentParser:
         "--column",
         dest="columns",
         action="append",
-        required=True,
+        default=None,
         metavar="COLUMN",
         help=(
             "Metadata column to group by. Repeat to group by a composite key "
-            "(e.g. --column sector --column occupation)."
+            "(e.g. --column sector --column occupation). "
+            f"Defaults to {DEFAULT_COLUMN!r} if not specified."
         ),
     )
     parser.add_argument(
@@ -375,12 +384,17 @@ def main(argv: Optional[Sequence[str]] = None) -> int:
             return 2
         print(f"Using default dataset: {dataset_path}", file=sys.stderr)
 
+    columns = args.columns
+    if not columns:
+        columns = [DEFAULT_COLUMN]
+        print(f"No --column specified; defaulting to {DEFAULT_COLUMN!r}.", file=sys.stderr)
+
     precision = None if args.precision is not None and args.precision < 0 else args.precision
     indent = None if args.indent is not None and args.indent < 0 else args.indent
 
     distribution = build_distribution_from_dataset(
         dataset_path,
-        args.columns,
+        columns,
         task_id_column=args.task_id_column,
         separator=args.separator,
         missing_value=args.missing_value,
@@ -394,7 +408,7 @@ def main(argv: Optional[Sequence[str]] = None) -> int:
         out_path.write_text(payload + "\n", encoding="utf-8")
         total_tasks = sum(len(entry["task_ids"]) for entry in distribution.values())
         print(
-            f"Wrote distribution over {args.columns} ({len(distribution)} groups, {total_tasks} tasks) to {out_path}",
+            f"Wrote distribution over {columns} ({len(distribution)} groups, {total_tasks} tasks) to {out_path}",
             file=sys.stderr,
         )
     else:
diff --git a/responses_api_agents/stirrup_agent/tests/test_task_distribution.py b/responses_api_agents/stirrup_agent/tests/test_task_distribution.py
index 6e56e4423..6250e5194 100644
--- a/responses_api_agents/stirrup_agent/tests/test_task_distribution.py
+++ b/responses_api_agents/stirrup_agent/tests/test_task_distribution.py
@@ -226,6 +226,19 @@ def test_errors_when_no_default_and_none_specified(self, tmp_path: Path, capsys,
         assert "gym eval prepare --benchmark gdpval" in err
         assert "--dataset" in err
 
+    def test_defaults_to_occupation_column(self, tmp_path: Path, capsys) -> None:
+        dataset = _write_jsonl(
+            tmp_path / "d.jsonl",
+            [_row("a", occupation="Lawyer"), _row("b", occupation="Lawyer"), _row("c", occupation="Nurse")],
+        )
+        rc = main(["--dataset", str(dataset)])
+        assert rc == 0
+        captured = capsys.readouterr()
+        assert "defaulting to 'occupation'" in captured.err
+        data = json.loads(captured.out)
+        assert data["Lawyer"]["task_ids"] == ["a", "b"]
+        assert data["Nurse"]["percentage"] == pytest.approx(1 / 3)
+
     def test_errors_when_specified_dataset_missing(self, tmp_path: Path, capsys) -> None:
         rc = main(["--dataset", str(tmp_path / "nope.jsonl"), "--column", "sector"])
         assert rc == 2

From e05d8e0e5f0405a20246a9f883c62c5d5f771c13 Mon Sep 17 00:00:00 2001
From: Virginia Wu <vadams@nvidia.com>
Date: Thu, 25 Jun 2026 17:44:32 -0700
Subject: [PATCH 3/8] added occupation distribution

Signed-off-by: Virginia Wu <vadams@nvidia.com>
---
 .../data/occupation_distribution.json         | 442 ++++++++++++++++++
 .../stirrup_agent/task_distribution.py        |  16 +-
 2 files changed, 453 insertions(+), 5 deletions(-)
 create mode 100644 responses_api_agents/stirrup_agent/data/occupation_distribution.json

diff --git a/responses_api_agents/stirrup_agent/data/occupation_distribution.json b/responses_api_agents/stirrup_agent/data/occupation_distribution.json
new file mode 100644
index 000000000..a91a899a0
--- /dev/null
+++ b/responses_api_agents/stirrup_agent/data/occupation_distribution.json
@@ -0,0 +1,442 @@
+{
+  "Accountants and Auditors": {
+    "percentage": 0.022727,
+    "task_ids": [
+      "83d10b06-26d1-4636-a32c-23f92c57f30b",
+      "7b08cd4d-df60-41ae-9102-8aaa49306ba2",
+      "7d7fc9a7-21a7-4b83-906f-416dea5ad04f",
+      "43dc9778-450b-4b46-b77e-b6d82b202035",
+      "ee09d943-5a11-430a-b7a2-971b4e9b01b5"
+    ]
+  },
+  "Administrative Services Managers": {
+    "percentage": 0.022727,
+    "task_ids": [
+      "f84ea6ac-8f9f-428c-b96c-d0884e30f7c7",
+      "a328feea-47db-4856-b4be-2bdc63dd88fb",
+      "27e8912c-8bd5-44ba-ad87-64066ea05264",
+      "17111c03-aac7-45c2-857d-c06d8223d6ad",
+      "c44e9b62-7cd8-4f72-8ad9-f8fbddb94083"
+    ]
+  },
+  "Audio and Video Technicians": {
+    "percentage": 0.022727,
+    "task_ids": [
+      "99ac6944-4ec6-4848-959c-a460ac705c6f",
+      "f9a1c16c-53fd-4c8f-88cc-5c325ec2f0bb",
+      "38889c3b-e3d4-49c8-816a-3cc8e5313aba",
+      "ff85ee58-bc9f-4aa2-806d-87edeabb1b81",
+      "4b894ae3-1f23-4560-b13d-07ed1132074e"
+    ]
+  },
+  "Buyers and Purchasing Agents": {
+    "percentage": 0.022727,
+    "task_ids": [
+      "1b1ade2d-f9f6-4a04-baa5-aa15012b53be",
+      "93b336f3-61f3-4287-86d2-87445e1e0f90",
+      "15ddd28d-8445-4baa-ac7f-f41372e1344e",
+      "24d1e93f-9018-45d4-b522-ad89dfd78079",
+      "05389f78-589a-473c-a4ae-67c61050bfca"
+    ]
+  },
+  "Child, Family, and School Social Workers": {
+    "percentage": 0.022727,
+    "task_ids": [
+      "575f8679-b4c1-47a2-8e96-d570d4ed9269",
+      "a74ead3b-f67d-4b1c-9116-f6bb81b29d4f",
+      "bbe0a93b-ebf0-40b0-98dc-8d9243099034",
+      "85d95ce5-b20c-41e2-834e-e788ce9622b6",
+      "76d10872-9ffa-4ede-83ee-e0f1ec5e2b8d"
+    ]
+  },
+  "Compliance Officers": {
+    "percentage": 0.022727,
+    "task_ids": [
+      "36d567ba-e205-4313-9756-931c6e4691fe",
+      "7bbfcfe9-132d-4194-82bb-d6f29d001b01",
+      "2696757c-1f8a-4959-8f0d-f5597b9e70fc",
+      "dfb4e0cd-a0b7-454e-b943-0dd586c2764c",
+      "4c18ebae-dfaa-4b76-b10c-61fcdf26734c"
+    ]
+  },
+  "Computer and Information Systems Managers": {
+    "percentage": 0.022727,
+    "task_ids": [
+      "cebf301e-5ea7-41ae-b117-ad8f43e7ac22",
+      "c2e8f271-7858-412f-b460-472463ad81d9",
+      "2ea2e5b5-257f-42e6-a7dc-93763f28b19d",
+      "c357f0e2-963d-4eb7-a6fa-3078fe55b3ba",
+      "a45bc83b-22f9-4def-8d89-9c5661b2b86f"
+    ]
+  },
+  "Concierges": {
+    "percentage": 0.022727,
+    "task_ids": [
+      "a10ec48c-168e-476c-8fe3-23b2a5f616ac",
+      "fccaa4a1-1c39-49ac-b701-55361a19966b",
+      "f5d428fd-b38e-41f0-8783-35423dab80f6",
+      "2fa8e956-7b35-4c13-95dc-027f02be318b",
+      "0e4fe8cd-16d0-4f41-8247-6385b4762582"
+    ]
+  },
+  "Counter and Rental Clerks": {
+    "percentage": 0.022727,
+    "task_ids": [
+      "a0ef404e-82a6-4507-bff1-633d7c8e0004",
+      "b7a5912e-0e63-41f5-8c22-9cdb8f46ab01",
+      "aa071045-bcb0-4164-bb85-97245d56287e",
+      "476db143-163a-4537-9e21-fe46adad703b",
+      "61f546a8-c374-467f-95cc-d0d9b5656eb6"
+    ]
+  },
+  "Customer Service Representatives": {
+    "percentage": 0.022727,
+    "task_ids": [
+      "f3351922-dbdd-45da-85c5-e7110696bbe5",
+      "61717508-4df7-41be-bf97-318dfb2475c0",
+      "0ed38524-a4ad-405f-9dee-7b2252659aad",
+      "87da214f-fd92-4c58-9854-f4d0d10adce0",
+      "d025a41c-c439-4ee1-bc79-dd5c94b27a2d"
+    ]
+  },
+  "Editors": {
+    "percentage": 0.022727,
+    "task_ids": [
+      "401a07f1-d57e-4bb0-889b-22de8c900f0e",
+      "afe56d05-dac8-47d7-a233-ad1d035ca5bd",
+      "9a8c8e28-ce76-408b-83c3-488422892e58",
+      "3a4c347c-4aec-43c7-9a54-eb1f816ab1f9",
+      "ec2fccc9-b7f6-4c73-bf51-896fdb433cec"
+    ]
+  },
+  "Film and Video Editors": {
+    "percentage": 0.022727,
+    "task_ids": [
+      "8c8fc328-69fc-4559-a13f-82087baef0a1",
+      "e222075d-5d62-4757-ae3c-e34b0846583b",
+      "c94452e4-39cd-4846-b73a-ab75933d1ad7",
+      "75401f7c-396d-406d-b08e-938874ad1045",
+      "a941b6d8-4289-4500-b45a-f8e4fc94a724"
+    ]
+  },
+  "Financial Managers": {
+    "percentage": 0.022727,
+    "task_ids": [
+      "a1963a68-1bea-4bb1-b7e0-145c92a57449",
+      "5f6c57dd-feb6-4e70-b152-4969d92d1608",
+      "b39a5aa7-cd1b-47ad-b249-90afd22f8f21",
+      "b78fd844-db76-448e-a783-5e9877cb74c2",
+      "4520f882-715a-482d-8e87-1cb3cbdfe975"
+    ]
+  },
+  "Financial and Investment Analysts": {
+    "percentage": 0.022727,
+    "task_ids": [
+      "8079e27d-b6f3-4f75-a9b5-db27903c798d",
+      "e21cd746-404d-4602-b9d2-01d2812c5b87",
+      "9e8607e7-a38a-491f-ace1-e5ea7dc477cb",
+      "c7d83f01-2874-4876-b7fd-52582ec99e1a",
+      "46b34f78-6c06-4416-87e2-77b6d8b20ce9"
+    ]
+  },
+  "First-Line Supervisors of Non-Retail Sales Workers": {
+    "percentage": 0.022727,
+    "task_ids": [
+      "ec591973-04d5-48c0-981c-1ab2fcec2dc1",
+      "62f04c2f-e0f7-4710-876c-54ee9c2e8256",
+      "3f821c2d-ab97-46ec-a0fb-b8f73c2682bc",
+      "e996036e-8287-4e7f-8d0a-90a57cb53c45",
+      "327fbc21-7d26-4964-bf7c-f4f41e55c54d"
+    ]
+  },
+  "First-Line Supervisors of Office and Administrative Support Workers": {
+    "percentage": 0.022727,
+    "task_ids": [
+      "6dcae3f5-bf1c-48e0-8b4b-23e6486a934c",
+      "1aecc095-4d76-4b89-b752-1a0f870502cd",
+      "0353ee0c-18b5-4ad3-88e8-e001d223e1d7",
+      "40a8c4b1-b169-4f92-a38b-7f79685037ec",
+      "4d1a8410-e9c5-4be5-ab43-cc55563c594c"
+    ]
+  },
+  "First-Line Supervisors of Police and Detectives": {
+    "percentage": 0.022727,
+    "task_ids": [
+      "8c823e32-537c-42b2-84ba-635d63c2853a",
+      "eb54f575-93f9-408b-b9e0-f1208a0b6759",
+      "11e1b169-5fb6-4d79-8a83-82ddf4987a85",
+      "a95a5829-34bb-40f3-993b-558aed6dcdef",
+      "22c0809b-f8db-489e-93b3-b4da225e3e0e"
+    ]
+  },
+  "First-Line Supervisors of Production and Operating Workers": {
+    "percentage": 0.022727,
+    "task_ids": [
+      "bf68f2ad-eac5-490a-adec-d847eb45bd6f",
+      "efca245f-c24f-4f75-a9d5-59201330ab7a",
+      "9e39df84-ac57-4c9b-a2e3-12b8abf2c797",
+      "68d8d901-dd0b-4a7e-bf9a-1074fddf1a96",
+      "1752cb53-5983-46b6-92ee-58ac85a11283"
+    ]
+  },
+  "First-Line Supervisors of Retail Sales Workers": {
+    "percentage": 0.022727,
+    "task_ids": [
+      "bd72994f-5659-4084-9fab-fc547d1efe3b",
+      "211d0093-2c64-4bd0-828c-0201f18924e7",
+      "d4525420-a427-4ef2-b4e9-2dcc2d31b3b6",
+      "45c6237b-f9c9-4526-9a8d-6a5c404624ec",
+      "cecac8f9-8203-4ebd-ad49-54436a8c4171"
+    ]
+  },
+  "General and Operations Managers": {
+    "percentage": 0.022727,
+    "task_ids": [
+      "8f9e8bcd-6102-40da-ab76-23f51d8b21fa",
+      "0fad6023-767b-42c1-a1b3-027cd4f583cb",
+      "02314fc6-a24e-42f4-a8cd-362cae0f0ec1",
+      "4d61a19a-8438-4d4c-9fc2-cf167e36dcd6",
+      "6436ff9e-c5f2-47ba-9aaa-49d89b0594ab"
+    ]
+  },
+  "Industrial Engineers": {
+    "percentage": 0.022727,
+    "task_ids": [
+      "8a7b6fca-60cc-4ae3-b649-971753cbf8b9",
+      "40a99a31-42d6-4f23-b3ec-8f591afe25b6",
+      "b9665ca1-4da4-4ff9-86f2-40b9a8683048",
+      "c6269101-fdc8-4602-b345-eac7597c0c81",
+      "be830ca0-b352-4658-a5bd-57139d6780ba"
+    ]
+  },
+  "Lawyers": {
+    "percentage": 0.022727,
+    "task_ids": [
+      "cd9efc18-d14a-4f69-8531-5d178a08084d",
+      "a97369c7-e5cf-40ca-99e8-d06f81c57d53",
+      "3f625cb2-f40e-4ead-8a97-6924356d5989",
+      "aad21e4c-1d43-45fc-899a-97754a1b1b63",
+      "8314d1b1-5b0f-42a4-b5d5-91c0867b0913"
+    ]
+  },
+  "Mechanical Engineers": {
+    "percentage": 0.022727,
+    "task_ids": [
+      "5e2b6aab-f9fb-4dd6-a1a5-874ef1743909",
+      "46fc494e-a24f-45ce-b099-851d5c181fd4",
+      "3940b7e7-ec4f-4cea-8097-3ab4cfdcaaa6",
+      "8077e700-2b31-402d-bd09-df4d33c39653",
+      "5a2d70da-0a42-4a6b-a3ca-763e03f070a5"
+    ]
+  },
+  "Medical Secretaries and Administrative Assistants": {
+    "percentage": 0.022727,
+    "task_ids": [
+      "f1be6436-ffff-4fee-9e66-d550291a1735",
+      "41f6ef59-88c9-4b2c-bcc7-9ceb88422f48",
+      "a0552909-bc66-4a3a-8970-ee0d17b49718",
+      "6d2c8e55-fe20-45c6-bdaf-93e676868503",
+      "4b98ccce-9e42-44e9-9115-6fc3e79de288"
+    ]
+  },
+  "Medical and Health Services Managers": {
+    "percentage": 0.022727,
+    "task_ids": [
+      "74d6e8b0-f334-4e7e-af55-c095d5d4d1a6",
+      "81db15ff-ceea-4f63-a1cd-06dc88114709",
+      "61b0946a-5c1c-4bf6-8607-84d7c7e0dfe0",
+      "61e7b9c6-0051-429f-a341-fda9b6578a84",
+      "c9bf9801-9640-45fa-8166-1ab01f2d98e4"
+    ]
+  },
+  "News Analysts, Reporters, and Journalists": {
+    "percentage": 0.022727,
+    "task_ids": [
+      "60221cd0-686e-4a08-985e-d9bb2fa18501",
+      "ef8719da-18e5-4bfe-b986-399652d77376",
+      "3baa0009-5a60-4ae8-ae99-4955cb328ff3",
+      "5d0feb24-e8b6-4ace-b64f-d5cd1a8b563d",
+      "6974adea-8326-43fa-8187-2724b15d9546"
+    ]
+  },
+  "Nurse Practitioners": {
+    "percentage": 0.022727,
+    "task_ids": [
+      "1a78e076-445e-4c5d-b8ce-387d2fe5e715",
+      "1b9ec237-bf9c-41f9-8fa9-0e685fcd93c6",
+      "0112fc9b-c3b2-4084-8993-5a4abb1f54f1",
+      "772e7524-174e-4c88-957e-6e510b61ea69",
+      "e6429658-4de1-42dd-a9e0-2d2b9b02fb10"
+    ]
+  },
+  "Order Clerks": {
+    "percentage": 0.022727,
+    "task_ids": [
+      "b5d2e6f1-62a2-433a-bcdd-95b260cdd860",
+      "f841ddcf-2a28-4f6d-bac3-61b607219d3e",
+      "47ef842d-8eac-4b90-bda8-dd934c228c96",
+      "1137e2bb-bdf9-4876-b572-f29b7de5e595",
+      "c3525d4d-2012-45df-853e-2d2a0e902991"
+    ]
+  },
+  "Personal Financial Advisors": {
+    "percentage": 0.022727,
+    "task_ids": [
+      "9a0d8d36-6233-4c76-9107-0d1f783c7340",
+      "664a42e5-3240-413a-9a57-ea93c6303269",
+      "feb5eefc-39f1-4451-9ef9-bffe011b71dd",
+      "3600de06-3f71-4e48-9480-e4828c579924",
+      "c657103b-b348-4496-a848-b2b7165d28b2"
+    ]
+  },
+  "Pharmacists": {
+    "percentage": 0.022727,
+    "task_ids": [
+      "91060ff0-3eb5-4ddf-9edb-f6758b95499e",
+      "8384083a-c31b-4194-80ba-4d335a444918",
+      "045aba2e-4093-42aa-ab7f-159cc538278c",
+      "f2986c1f-2bbf-4b83-bc93-624a9d617f45",
+      "ffed32d8-d192-4e3f-8cd4-eda5a730aec3"
+    ]
+  },
+  "Private Detectives and Investigators": {
+    "percentage": 0.022727,
+    "task_ids": [
+      "ae0c1093-5ea8-4b84-a81e-53ebf7a4321d",
+      "f9f82549-fdde-4462-aff8-e70fba5b8c66",
+      "57b2cdf2-ad62-4591-aa91-aad489740320",
+      "84322284-5c2c-4873-b507-b147449d209d",
+      "a46d5cd2-55fe-48fa-a4c6-6aaf6b9991b5"
+    ]
+  },
+  "Producers and Directors": {
+    "percentage": 0.022727,
+    "task_ids": [
+      "6241e678-4ba3-4831-b3c7-78412697febc",
+      "e14e32ba-d310-4d45-9b8a-6d73d0ece1ae",
+      "b1a79ce1-86b0-41fb-97dc-9206dfd7b044",
+      "e4f664ea-0e5c-4e4e-a0d3-a87a33da947a",
+      "a079d38f-c529-436a-beca-3e291f9e62a3"
+    ]
+  },
+  "Project Management Specialists": {
+    "percentage": 0.022727,
+    "task_ids": [
+      "02aa1805-c658-4069-8a6a-02dec146063a",
+      "fd6129bd-f095-429b-873c-dcc3137be2c3",
+      "ce864f41-8584-49ba-b24f-9c9104b47bf0",
+      "58ac1cc5-5754-4580-8c9c-8c67e1a9d619",
+      "3c19c6d1-672c-467a-8437-6fe21afb8eae"
+    ]
+  },
+  "Property, Real Estate, and Community Association Managers": {
+    "percentage": 0.022727,
+    "task_ids": [
+      "a99d85fc-eff8-48d2-a7d4-42a75d62f18d",
+      "55ddb773-23a4-454c-8704-d432fe1b99d9",
+      "1e5a1d7f-12c1-48c6-afd9-82257b3f2409",
+      "0419f1c3-d669-45d0-81cd-f4d5923b06a5",
+      "ed2bc14c-99ac-4a2a-8467-482a1a5d67f3"
+    ]
+  },
+  "Real Estate Brokers": {
+    "percentage": 0.022727,
+    "task_ids": [
+      "46bc7238-3501-4839-b989-e2bd47853676",
+      "2d06bc0a-89c6-4e89-9417-5ffe725c1bc6",
+      "fd3ad420-6f7d-43b1-a990-c0c5c047d071",
+      "0818571f-5ff7-4d39-9d2c-ced5ae44299e",
+      "6074bba3-7e3a-4b1c-b8c6-a15bb6695c3b"
+    ]
+  },
+  "Real Estate Sales Agents": {
+    "percentage": 0.022727,
+    "task_ids": [
+      "5ad0c554-a7a2-48cd-b41a-ebc1bff4a9de",
+      "11593a50-734d-4449-b5b4-f8986a133fd8",
+      "94925f49-36bc-42da-b45b-61078d329300",
+      "90f37ff3-e4ed-4a0b-94bb-bed0f7def1ef",
+      "d3d255b2-f5f2-4841-9f62-2083ec9ef3da"
+    ]
+  },
+  "Recreation Workers": {
+    "percentage": 0.022727,
+    "task_ids": [
+      "403b9234-6299-4b5f-a106-70c1bc11ec4c",
+      "1bff4551-1d54-4e37-b2e0-d5c3f2ea4a45",
+      "650adcb1-ed19-4f88-8117-77640f7b94b6",
+      "01d7e53e-0513-4109-a242-8ccaf442cd21",
+      "a73fbc98-90d4-4134-a54f-2b1d0c838791"
+    ]
+  },
+  "Registered Nurses": {
+    "percentage": 0.022727,
+    "task_ids": [
+      "0ec25916-1b5c-4bfe-93d3-4e103d860f3a",
+      "116e791e-890c-42b1-ba90-1db02e8bfd45",
+      "dd724c67-8118-4b99-ab50-4761af705c3b",
+      "7151c60a-d4cb-4fc4-8169-3d4cb446e6b9",
+      "90edba97-74f0-425a-8ff6-8b93182eb7cb"
+    ]
+  },
+  "Sales Managers": {
+    "percentage": 0.022727,
+    "task_ids": [
+      "b3573f20-5d3e-4954-948f-9461fda693d2",
+      "a69be28f-9a84-47c9-992e-b90446cdca9d",
+      "788d2bc6-82df-4dc7-8467-a0f31405dc14",
+      "74ed1dc7-1468-48a8-9071-58775c0d667a",
+      "69a8ef86-4e69-4fe2-9168-080f1e978e67"
+    ]
+  },
+  "Sales Representatives, Wholesale and Manufacturing, Except Technical and Scientific Products": {
+    "percentage": 0.022727,
+    "task_ids": [
+      "ab81b076-e5d8-473a-9bdb-7ea7c38f6ebc",
+      "d7cfae6f-4a82-4289-955e-c799dfe1e0f4",
+      "19403010-3e5c-494e-a6d3-13594e99f6af",
+      "7ed932dd-244f-4d61-bf02-1bc3bab1af14",
+      "105f8ad0-8dd2-422f-9e88-2be5fbd2b215"
+    ]
+  },
+  "Sales Representatives, Wholesale and Manufacturing, Technical and Scientific Products": {
+    "percentage": 0.022727,
+    "task_ids": [
+      "b57efde3-26d6-4742-bbff-2b63c43b4baa",
+      "15d37511-75c5-4c7f-81f1-16e00c0d95f3",
+      "bb863dd9-31c2-4f64-911a-ce11f457143b",
+      "fe0d3941-e32c-4bf1-a643-b566d2b4cb3c",
+      "6a900a40-8d2b-4064-a5b1-13a60bc173d8"
+    ]
+  },
+  "Securities, Commodities, and Financial Services Sales Agents": {
+    "percentage": 0.022727,
+    "task_ids": [
+      "9efbcd35-186d-49b6-ac24-28ee2bc9a263",
+      "1d4672c8-b0a7-488f-905f-9ab4e25a19f7",
+      "4de6a529-4f61-41a1-b2dc-64951ba03457",
+      "4c4dc603-c21c-4284-8fb1-1b827c1fddf4",
+      "bb499d9c-0263-4684-9238-75e8e86077b1"
+    ]
+  },
+  "Shipping, Receiving, and Inventory Clerks": {
+    "percentage": 0.022727,
+    "task_ids": [
+      "5349dd7b-bf0a-4544-9a17-75b7013767e6",
+      "a4a9195c-5ebe-4b8d-a0c2-4a6b7a49da8b",
+      "552b7dd0-96f4-437c-a749-0691e0e4b381",
+      "11dcc268-cb07-4d3a-a184-c6d7a19349bc",
+      "76418a2c-a3c0-4894-b89d-2493369135d9"
+    ]
+  },
+  "Software Developers": {
+    "percentage": 0.022727,
+    "task_ids": [
+      "0e386e32-df20-4d1f-b536-7159bc409ad5",
+      "7de33b48-5163-4f50-b5f3-8deea8185e57",
+      "854f3814-681c-4950-91ac-55b0db0e3781",
+      "4122f866-01fa-400b-904d-fa171cdab7c7",
+      "2c249e0f-4a8c-4f8e-b4f4-6508ba29b34f"
+    ]
+  }
+}
diff --git a/responses_api_agents/stirrup_agent/task_distribution.py b/responses_api_agents/stirrup_agent/task_distribution.py
index 9b963e7ea..4f661cc96 100644
--- a/responses_api_agents/stirrup_agent/task_distribution.py
+++ b/responses_api_agents/stirrup_agent/task_distribution.py
@@ -33,7 +33,7 @@
 
 Usage::
 
-    # Full config defaults: the prepared GDPVal dataset
+    # Full defaults: the prepared GDPVal dataset (220 tasks)
     # (benchmarks/gdpval/data/gdpval_benchmark.jsonl) grouped by ``occupation``.
     # Without --output the distribution is printed to stdout.
     python -m responses_api_agents.stirrup_agent.task_distribution \
@@ -109,10 +109,16 @@ def _no_dataset_message() -> str:
         "\nTo fix this, do one of the following:\n"
         "\n  1. Prepare the GDPVal benchmark dataset (recommended). This downloads\n"
         "     the openai/gdpval dataset from HuggingFace and writes\n"
-        "     benchmarks/gdpval/data/gdpval_benchmark.jsonl (220 tasks):\n"
-        "\n         export HF_TOKEN=<your-huggingface-token>\n"
-        "         gym eval prepare --benchmark gdpval\n"
-        "\n     (or run: bash responses_api_agents/stirrup_agent/setup_scripts/gdpval.sh)\n"
+        "     benchmarks/gdpval/data/gdpval_benchmark.jsonl (220 tasks).\n"
+        "\n     First activate the project virtualenv so the Gym CLI is on PATH\n"
+        "     (the `gym`/`ng_*` commands live in .venv, not on your global PATH):\n"
+        "\n         source .venv/bin/activate\n"
+        "         export HF_TOKEN=<your-huggingface-token>\n"
+        "\n     Then run the setup script (works on all installs):\n"
+        "\n         bash responses_api_agents/stirrup_agent/setup_scripts/gdpval.sh\n"
+        "\n     Or call a prepare CLI directly:\n"
+        "\n         gym eval prepare --benchmark gdpval        # newer installs\n"
+        "         ng_prepare_benchmark '+config_paths=[benchmarks/gdpval/config.yaml]'  # any install\n"
         "\n  2. Pass an explicit dataset path with --dataset <path-to.jsonl>.\n"
         "\nNote: the GDPVal dataset is gated on HuggingFace, so HF_TOKEN must be set\n"
         "and your account must have access to https://huggingface.co/datasets/openai/gdpval.\n"

From 846441eda371bc019c4cd9460570e271811a666d Mon Sep 17 00:00:00 2001
From: Virginia Wu <vadams@nvidia.com>
Date: Thu, 25 Jun 2026 17:47:45 -0700
Subject: [PATCH 4/8] shouldn't include occupation data file in repo

Signed-off-by: Virginia Wu <vadams@nvidia.com>
---
 .../data/occupation_distribution.json         | 442 ------------------
 1 file changed, 442 deletions(-)
 delete mode 100644 responses_api_agents/stirrup_agent/data/occupation_distribution.json

diff --git a/responses_api_agents/stirrup_agent/data/occupation_distribution.json b/responses_api_agents/stirrup_agent/data/occupation_distribution.json
deleted file mode 100644
index a91a899a0..000000000
--- a/responses_api_agents/stirrup_agent/data/occupation_distribution.json
+++ /dev/null
@@ -1,442 +0,0 @@
-{
-  "Accountants and Auditors": {
-    "percentage": 0.022727,
-    "task_ids": [
-      "83d10b06-26d1-4636-a32c-23f92c57f30b",
-      "7b08cd4d-df60-41ae-9102-8aaa49306ba2",
-      "7d7fc9a7-21a7-4b83-906f-416dea5ad04f",
-      "43dc9778-450b-4b46-b77e-b6d82b202035",
-      "ee09d943-5a11-430a-b7a2-971b4e9b01b5"
-    ]
-  },
-  "Administrative Services Managers": {
-    "percentage": 0.022727,
-    "task_ids": [
-      "f84ea6ac-8f9f-428c-b96c-d0884e30f7c7",
-      "a328feea-47db-4856-b4be-2bdc63dd88fb",
-      "27e8912c-8bd5-44ba-ad87-64066ea05264",
-      "17111c03-aac7-45c2-857d-c06d8223d6ad",
-      "c44e9b62-7cd8-4f72-8ad9-f8fbddb94083"
-    ]
-  },
-  "Audio and Video Technicians": {
-    "percentage": 0.022727,
-    "task_ids": [
-      "99ac6944-4ec6-4848-959c-a460ac705c6f",
-      "f9a1c16c-53fd-4c8f-88cc-5c325ec2f0bb",
-      "38889c3b-e3d4-49c8-816a-3cc8e5313aba",
-      "ff85ee58-bc9f-4aa2-806d-87edeabb1b81",
-      "4b894ae3-1f23-4560-b13d-07ed1132074e"
-    ]
-  },
-  "Buyers and Purchasing Agents": {
-    "percentage": 0.022727,
-    "task_ids": [
-      "1b1ade2d-f9f6-4a04-baa5-aa15012b53be",
-      "93b336f3-61f3-4287-86d2-87445e1e0f90",
-      "15ddd28d-8445-4baa-ac7f-f41372e1344e",
-      "24d1e93f-9018-45d4-b522-ad89dfd78079",
-      "05389f78-589a-473c-a4ae-67c61050bfca"
-    ]
-  },
-  "Child, Family, and School Social Workers": {
-    "percentage": 0.022727,
-    "task_ids": [
-      "575f8679-b4c1-47a2-8e96-d570d4ed9269",
-      "a74ead3b-f67d-4b1c-9116-f6bb81b29d4f",
-      "bbe0a93b-ebf0-40b0-98dc-8d9243099034",
-      "85d95ce5-b20c-41e2-834e-e788ce9622b6",
-      "76d10872-9ffa-4ede-83ee-e0f1ec5e2b8d"
-    ]
-  },
-  "Compliance Officers": {
-    "percentage": 0.022727,
-    "task_ids": [
-      "36d567ba-e205-4313-9756-931c6e4691fe",
-      "7bbfcfe9-132d-4194-82bb-d6f29d001b01",
-      "2696757c-1f8a-4959-8f0d-f5597b9e70fc",
-      "dfb4e0cd-a0b7-454e-b943-0dd586c2764c",
-      "4c18ebae-dfaa-4b76-b10c-61fcdf26734c"
-    ]
-  },
-  "Computer and Information Systems Managers": {
-    "percentage": 0.022727,
-    "task_ids": [
-      "cebf301e-5ea7-41ae-b117-ad8f43e7ac22",
-      "c2e8f271-7858-412f-b460-472463ad81d9",
-      "2ea2e5b5-257f-42e6-a7dc-93763f28b19d",
-      "c357f0e2-963d-4eb7-a6fa-3078fe55b3ba",
-      "a45bc83b-22f9-4def-8d89-9c5661b2b86f"
-    ]
-  },
-  "Concierges": {
-    "percentage": 0.022727,
-    "task_ids": [
-      "a10ec48c-168e-476c-8fe3-23b2a5f616ac",
-      "fccaa4a1-1c39-49ac-b701-55361a19966b",
-      "f5d428fd-b38e-41f0-8783-35423dab80f6",
-      "2fa8e956-7b35-4c13-95dc-027f02be318b",
-      "0e4fe8cd-16d0-4f41-8247-6385b4762582"
-    ]
-  },
-  "Counter and Rental Clerks": {
-    "percentage": 0.022727,
-    "task_ids": [
-      "a0ef404e-82a6-4507-bff1-633d7c8e0004",
-      "b7a5912e-0e63-41f5-8c22-9cdb8f46ab01",
-      "aa071045-bcb0-4164-bb85-97245d56287e",
-      "476db143-163a-4537-9e21-fe46adad703b",
-      "61f546a8-c374-467f-95cc-d0d9b5656eb6"
-    ]
-  },
-  "Customer Service Representatives": {
-    "percentage": 0.022727,
-    "task_ids": [
-      "f3351922-dbdd-45da-85c5-e7110696bbe5",
-      "61717508-4df7-41be-bf97-318dfb2475c0",
-      "0ed38524-a4ad-405f-9dee-7b2252659aad",
-      "87da214f-fd92-4c58-9854-f4d0d10adce0",
-      "d025a41c-c439-4ee1-bc79-dd5c94b27a2d"
-    ]
-  },
-  "Editors": {
-    "percentage": 0.022727,
-    "task_ids": [
-      "401a07f1-d57e-4bb0-889b-22de8c900f0e",
-      "afe56d05-dac8-47d7-a233-ad1d035ca5bd",
-      "9a8c8e28-ce76-408b-83c3-488422892e58",
-      "3a4c347c-4aec-43c7-9a54-eb1f816ab1f9",
-      "ec2fccc9-b7f6-4c73-bf51-896fdb433cec"
-    ]
-  },
-  "Film and Video Editors": {
-    "percentage": 0.022727,
-    "task_ids": [
-      "8c8fc328-69fc-4559-a13f-82087baef0a1",
-      "e222075d-5d62-4757-ae3c-e34b0846583b",
-      "c94452e4-39cd-4846-b73a-ab75933d1ad7",
-      "75401f7c-396d-406d-b08e-938874ad1045",
-      "a941b6d8-4289-4500-b45a-f8e4fc94a724"
-    ]
-  },
-  "Financial Managers": {
-    "percentage": 0.022727,
-    "task_ids": [
-      "a1963a68-1bea-4bb1-b7e0-145c92a57449",
-      "5f6c57dd-feb6-4e70-b152-4969d92d1608",
-      "b39a5aa7-cd1b-47ad-b249-90afd22f8f21",
-      "b78fd844-db76-448e-a783-5e9877cb74c2",
-      "4520f882-715a-482d-8e87-1cb3cbdfe975"
-    ]
-  },
-  "Financial and Investment Analysts": {
-    "percentage": 0.022727,
-    "task_ids": [
-      "8079e27d-b6f3-4f75-a9b5-db27903c798d",
-      "e21cd746-404d-4602-b9d2-01d2812c5b87",
-      "9e8607e7-a38a-491f-ace1-e5ea7dc477cb",
-      "c7d83f01-2874-4876-b7fd-52582ec99e1a",
-      "46b34f78-6c06-4416-87e2-77b6d8b20ce9"
-    ]
-  },
-  "First-Line Supervisors of Non-Retail Sales Workers": {
-    "percentage": 0.022727,
-    "task_ids": [
-      "ec591973-04d5-48c0-981c-1ab2fcec2dc1",
-      "62f04c2f-e0f7-4710-876c-54ee9c2e8256",
-      "3f821c2d-ab97-46ec-a0fb-b8f73c2682bc",
-      "e996036e-8287-4e7f-8d0a-90a57cb53c45",
-      "327fbc21-7d26-4964-bf7c-f4f41e55c54d"
-    ]
-  },
-  "First-Line Supervisors of Office and Administrative Support Workers": {
-    "percentage": 0.022727,
-    "task_ids": [
-      "6dcae3f5-bf1c-48e0-8b4b-23e6486a934c",
-      "1aecc095-4d76-4b89-b752-1a0f870502cd",
-      "0353ee0c-18b5-4ad3-88e8-e001d223e1d7",
-      "40a8c4b1-b169-4f92-a38b-7f79685037ec",
-      "4d1a8410-e9c5-4be5-ab43-cc55563c594c"
-    ]
-  },
-  "First-Line Supervisors of Police and Detectives": {
-    "percentage": 0.022727,
-    "task_ids": [
-      "8c823e32-537c-42b2-84ba-635d63c2853a",
-      "eb54f575-93f9-408b-b9e0-f1208a0b6759",
-      "11e1b169-5fb6-4d79-8a83-82ddf4987a85",
-      "a95a5829-34bb-40f3-993b-558aed6dcdef",
-      "22c0809b-f8db-489e-93b3-b4da225e3e0e"
-    ]
-  },
-  "First-Line Supervisors of Production and Operating Workers": {
-    "percentage": 0.022727,
-    "task_ids": [
-      "bf68f2ad-eac5-490a-adec-d847eb45bd6f",
-      "efca245f-c24f-4f75-a9d5-59201330ab7a",
-      "9e39df84-ac57-4c9b-a2e3-12b8abf2c797",
-      "68d8d901-dd0b-4a7e-bf9a-1074fddf1a96",
-      "1752cb53-5983-46b6-92ee-58ac85a11283"
-    ]
-  },
-  "First-Line Supervisors of Retail Sales Workers": {
-    "percentage": 0.022727,
-    "task_ids": [
-      "bd72994f-5659-4084-9fab-fc547d1efe3b",
-      "211d0093-2c64-4bd0-828c-0201f18924e7",
-      "d4525420-a427-4ef2-b4e9-2dcc2d31b3b6",
-      "45c6237b-f9c9-4526-9a8d-6a5c404624ec",
-      "cecac8f9-8203-4ebd-ad49-54436a8c4171"
-    ]
-  },
-  "General and Operations Managers": {
-    "percentage": 0.022727,
-    "task_ids": [
-      "8f9e8bcd-6102-40da-ab76-23f51d8b21fa",
-      "0fad6023-767b-42c1-a1b3-027cd4f583cb",
-      "02314fc6-a24e-42f4-a8cd-362cae0f0ec1",
-      "4d61a19a-8438-4d4c-9fc2-cf167e36dcd6",
-      "6436ff9e-c5f2-47ba-9aaa-49d89b0594ab"
-    ]
-  },
-  "Industrial Engineers": {
-    "percentage": 0.022727,
-    "task_ids": [
-      "8a7b6fca-60cc-4ae3-b649-971753cbf8b9",
-      "40a99a31-42d6-4f23-b3ec-8f591afe25b6",
-      "b9665ca1-4da4-4ff9-86f2-40b9a8683048",
-      "c6269101-fdc8-4602-b345-eac7597c0c81",
-      "be830ca0-b352-4658-a5bd-57139d6780ba"
-    ]
-  },
-  "Lawyers": {
-    "percentage": 0.022727,
-    "task_ids": [
-      "cd9efc18-d14a-4f69-8531-5d178a08084d",
-      "a97369c7-e5cf-40ca-99e8-d06f81c57d53",
-      "3f625cb2-f40e-4ead-8a97-6924356d5989",
-      "aad21e4c-1d43-45fc-899a-97754a1b1b63",
-      "8314d1b1-5b0f-42a4-b5d5-91c0867b0913"
-    ]
-  },
-  "Mechanical Engineers": {
-    "percentage": 0.022727,
-    "task_ids": [
-      "5e2b6aab-f9fb-4dd6-a1a5-874ef1743909",
-      "46fc494e-a24f-45ce-b099-851d5c181fd4",
-      "3940b7e7-ec4f-4cea-8097-3ab4cfdcaaa6",
-      "8077e700-2b31-402d-bd09-df4d33c39653",
-      "5a2d70da-0a42-4a6b-a3ca-763e03f070a5"
-    ]
-  },
-  "Medical Secretaries and Administrative Assistants": {
-    "percentage": 0.022727,
-    "task_ids": [
-      "f1be6436-ffff-4fee-9e66-d550291a1735",
-      "41f6ef59-88c9-4b2c-bcc7-9ceb88422f48",
-      "a0552909-bc66-4a3a-8970-ee0d17b49718",
-      "6d2c8e55-fe20-45c6-bdaf-93e676868503",
-      "4b98ccce-9e42-44e9-9115-6fc3e79de288"
-    ]
-  },
-  "Medical and Health Services Managers": {
-    "percentage": 0.022727,
-    "task_ids": [
-      "74d6e8b0-f334-4e7e-af55-c095d5d4d1a6",
-      "81db15ff-ceea-4f63-a1cd-06dc88114709",
-      "61b0946a-5c1c-4bf6-8607-84d7c7e0dfe0",
-      "61e7b9c6-0051-429f-a341-fda9b6578a84",
-      "c9bf9801-9640-45fa-8166-1ab01f2d98e4"
-    ]
-  },
-  "News Analysts, Reporters, and Journalists": {
-    "percentage": 0.022727,
-    "task_ids": [
-      "60221cd0-686e-4a08-985e-d9bb2fa18501",
-      "ef8719da-18e5-4bfe-b986-399652d77376",
-      "3baa0009-5a60-4ae8-ae99-4955cb328ff3",
-      "5d0feb24-e8b6-4ace-b64f-d5cd1a8b563d",
-      "6974adea-8326-43fa-8187-2724b15d9546"
-    ]
-  },
-  "Nurse Practitioners": {
-    "percentage": 0.022727,
-    "task_ids": [
-      "1a78e076-445e-4c5d-b8ce-387d2fe5e715",
-      "1b9ec237-bf9c-41f9-8fa9-0e685fcd93c6",
-      "0112fc9b-c3b2-4084-8993-5a4abb1f54f1",
-      "772e7524-174e-4c88-957e-6e510b61ea69",
-      "e6429658-4de1-42dd-a9e0-2d2b9b02fb10"
-    ]
-  },
-  "Order Clerks": {
-    "percentage": 0.022727,
-    "task_ids": [
-      "b5d2e6f1-62a2-433a-bcdd-95b260cdd860",
-      "f841ddcf-2a28-4f6d-bac3-61b607219d3e",
-      "47ef842d-8eac-4b90-bda8-dd934c228c96",
-      "1137e2bb-bdf9-4876-b572-f29b7de5e595",
-      "c3525d4d-2012-45df-853e-2d2a0e902991"
-    ]
-  },
-  "Personal Financial Advisors": {
-    "percentage": 0.022727,
-    "task_ids": [
-      "9a0d8d36-6233-4c76-9107-0d1f783c7340",
-      "664a42e5-3240-413a-9a57-ea93c6303269",
-      "feb5eefc-39f1-4451-9ef9-bffe011b71dd",
-      "3600de06-3f71-4e48-9480-e4828c579924",
-      "c657103b-b348-4496-a848-b2b7165d28b2"
-    ]
-  },
-  "Pharmacists": {
-    "percentage": 0.022727,
-    "task_ids": [
-      "91060ff0-3eb5-4ddf-9edb-f6758b95499e",
-      "8384083a-c31b-4194-80ba-4d335a444918",
-      "045aba2e-4093-42aa-ab7f-159cc538278c",
-      "f2986c1f-2bbf-4b83-bc93-624a9d617f45",
-      "ffed32d8-d192-4e3f-8cd4-eda5a730aec3"
-    ]
-  },
-  "Private Detectives and Investigators": {
-    "percentage": 0.022727,
-    "task_ids": [
-      "ae0c1093-5ea8-4b84-a81e-53ebf7a4321d",
-      "f9f82549-fdde-4462-aff8-e70fba5b8c66",
-      "57b2cdf2-ad62-4591-aa91-aad489740320",
-      "84322284-5c2c-4873-b507-b147449d209d",
-      "a46d5cd2-55fe-48fa-a4c6-6aaf6b9991b5"
-    ]
-  },
-  "Producers and Directors": {
-    "percentage": 0.022727,
-    "task_ids": [
-      "6241e678-4ba3-4831-b3c7-78412697febc",
-      "e14e32ba-d310-4d45-9b8a-6d73d0ece1ae",
-      "b1a79ce1-86b0-41fb-97dc-9206dfd7b044",
-      "e4f664ea-0e5c-4e4e-a0d3-a87a33da947a",
-      "a079d38f-c529-436a-beca-3e291f9e62a3"
-    ]
-  },
-  "Project Management Specialists": {
-    "percentage": 0.022727,
-    "task_ids": [
-      "02aa1805-c658-4069-8a6a-02dec146063a",
-      "fd6129bd-f095-429b-873c-dcc3137be2c3",
-      "ce864f41-8584-49ba-b24f-9c9104b47bf0",
-      "58ac1cc5-5754-4580-8c9c-8c67e1a9d619",
-      "3c19c6d1-672c-467a-8437-6fe21afb8eae"
-    ]
-  },
-  "Property, Real Estate, and Community Association Managers": {
-    "percentage": 0.022727,
-    "task_ids": [
-      "a99d85fc-eff8-48d2-a7d4-42a75d62f18d",
-      "55ddb773-23a4-454c-8704-d432fe1b99d9",
-      "1e5a1d7f-12c1-48c6-afd9-82257b3f2409",
-      "0419f1c3-d669-45d0-81cd-f4d5923b06a5",
-      "ed2bc14c-99ac-4a2a-8467-482a1a5d67f3"
-    ]
-  },
-  "Real Estate Brokers": {
-    "percentage": 0.022727,
-    "task_ids": [
-      "46bc7238-3501-4839-b989-e2bd47853676",
-      "2d06bc0a-89c6-4e89-9417-5ffe725c1bc6",
-      "fd3ad420-6f7d-43b1-a990-c0c5c047d071",
-      "0818571f-5ff7-4d39-9d2c-ced5ae44299e",
-      "6074bba3-7e3a-4b1c-b8c6-a15bb6695c3b"
-    ]
-  },
-  "Real Estate Sales Agents": {
-    "percentage": 0.022727,
-    "task_ids": [
-      "5ad0c554-a7a2-48cd-b41a-ebc1bff4a9de",
-      "11593a50-734d-4449-b5b4-f8986a133fd8",
-      "94925f49-36bc-42da-b45b-61078d329300",
-      "90f37ff3-e4ed-4a0b-94bb-bed0f7def1ef",
-      "d3d255b2-f5f2-4841-9f62-2083ec9ef3da"
-    ]
-  },
-  "Recreation Workers": {
-    "percentage": 0.022727,
-    "task_ids": [
-      "403b9234-6299-4b5f-a106-70c1bc11ec4c",
-      "1bff4551-1d54-4e37-b2e0-d5c3f2ea4a45",
-      "650adcb1-ed19-4f88-8117-77640f7b94b6",
-      "01d7e53e-0513-4109-a242-8ccaf442cd21",
-      "a73fbc98-90d4-4134-a54f-2b1d0c838791"
-    ]
-  },
-  "Registered Nurses": {
-    "percentage": 0.022727,
-    "task_ids": [
-      "0ec25916-1b5c-4bfe-93d3-4e103d860f3a",
-      "116e791e-890c-42b1-ba90-1db02e8bfd45",
-      "dd724c67-8118-4b99-ab50-4761af705c3b",
-      "7151c60a-d4cb-4fc4-8169-3d4cb446e6b9",
-      "90edba97-74f0-425a-8ff6-8b93182eb7cb"
-    ]
-  },
-  "Sales Managers": {
-    "percentage": 0.022727,
-    "task_ids": [
-      "b3573f20-5d3e-4954-948f-9461fda693d2",
-      "a69be28f-9a84-47c9-992e-b90446cdca9d",
-      "788d2bc6-82df-4dc7-8467-a0f31405dc14",
-      "74ed1dc7-1468-48a8-9071-58775c0d667a",
-      "69a8ef86-4e69-4fe2-9168-080f1e978e67"
-    ]
-  },
-  "Sales Representatives, Wholesale and Manufacturing, Except Technical and Scientific Products": {
-    "percentage": 0.022727,
-    "task_ids": [
-      "ab81b076-e5d8-473a-9bdb-7ea7c38f6ebc",
-      "d7cfae6f-4a82-4289-955e-c799dfe1e0f4",
-      "19403010-3e5c-494e-a6d3-13594e99f6af",
-      "7ed932dd-244f-4d61-bf02-1bc3bab1af14",
-      "105f8ad0-8dd2-422f-9e88-2be5fbd2b215"
-    ]
-  },
-  "Sales Representatives, Wholesale and Manufacturing, Technical and Scientific Products": {
-    "percentage": 0.022727,
-    "task_ids": [
-      "b57efde3-26d6-4742-bbff-2b63c43b4baa",
-      "15d37511-75c5-4c7f-81f1-16e00c0d95f3",
-      "bb863dd9-31c2-4f64-911a-ce11f457143b",
-      "fe0d3941-e32c-4bf1-a643-b566d2b4cb3c",
-      "6a900a40-8d2b-4064-a5b1-13a60bc173d8"
-    ]
-  },
-  "Securities, Commodities, and Financial Services Sales Agents": {
-    "percentage": 0.022727,
-    "task_ids": [
-      "9efbcd35-186d-49b6-ac24-28ee2bc9a263",
-      "1d4672c8-b0a7-488f-905f-9ab4e25a19f7",
-      "4de6a529-4f61-41a1-b2dc-64951ba03457",
-      "4c4dc603-c21c-4284-8fb1-1b827c1fddf4",
-      "bb499d9c-0263-4684-9238-75e8e86077b1"
-    ]
-  },
-  "Shipping, Receiving, and Inventory Clerks": {
-    "percentage": 0.022727,
-    "task_ids": [
-      "5349dd7b-bf0a-4544-9a17-75b7013767e6",
-      "a4a9195c-5ebe-4b8d-a0c2-4a6b7a49da8b",
-      "552b7dd0-96f4-437c-a749-0691e0e4b381",
-      "11dcc268-cb07-4d3a-a184-c6d7a19349bc",
-      "76418a2c-a3c0-4894-b89d-2493369135d9"
-    ]
-  },
-  "Software Developers": {
-    "percentage": 0.022727,
-    "task_ids": [
-      "0e386e32-df20-4d1f-b536-7159bc409ad5",
-      "7de33b48-5163-4f50-b5f3-8deea8185e57",
-      "854f3814-681c-4950-91ac-55b0db0e3781",
-      "4122f866-01fa-400b-904d-fa171cdab7c7",
-      "2c249e0f-4a8c-4f8e-b4f4-6508ba29b34f"
-    ]
-  }
-}

From 85ef58b756c8fa6eec6d932472a118a6d036d8ed Mon Sep 17 00:00:00 2001
From: Virginia Wu <vadams@nvidia.com>
Date: Thu, 25 Jun 2026 23:19:04 -0700
Subject: [PATCH 5/8] added multistage elo estimation

Signed-off-by: Virginia Wu <vadams@nvidia.com>
---
 resources_servers/gdpval/app.py               |  15 +-
 resources_servers/gdpval/multistage_elo.py    | 305 +++++++++
 .../gdpval/multistage_elo_driver.py           | 613 ++++++++++++++++++
 resources_servers/gdpval/tests/test_app.py    |  68 ++
 .../gdpval/tests/test_multistage_elo.py       | 185 ++++++
 .../tests/test_multistage_elo_driver.py       | 429 ++++++++++++
 6 files changed, 1614 insertions(+), 1 deletion(-)
 create mode 100644 resources_servers/gdpval/multistage_elo.py
 create mode 100644 resources_servers/gdpval/multistage_elo_driver.py
 create mode 100644 resources_servers/gdpval/tests/test_multistage_elo.py
 create mode 100644 resources_servers/gdpval/tests/test_multistage_elo_driver.py

diff --git a/resources_servers/gdpval/app.py b/resources_servers/gdpval/app.py
index 18463586a..6da29d77b 100644
--- a/resources_servers/gdpval/app.py
+++ b/resources_servers/gdpval/app.py
@@ -184,6 +184,12 @@ class GDPValVerifyRequest(BaseVerifyRequest):
     rubric_pretty: Optional[str] = None
     reference_file_urls: Optional[List[str]] = None
     deliverables_dir: Optional[str] = None
+    # Optional per-request filter (comparison mode): judge the eval deliverable
+    # only against this subset of the configured ``reference_models``. Unknown
+    # ids are ignored; ``None`` (default) judges against every configured
+    # reference. Used by the multi-stage ELO driver to select a different set of
+    # reference models per judgement stage without reconfiguring the server.
+    reference_ids: Optional[List[str]] = None
 
 
 class GDPValVerifyResponse(GDPValVerifyRequest, BaseVerifyResponse):
@@ -369,11 +375,18 @@ async def _verify_comparison(self, body: GDPValVerifyRequest) -> GDPValVerifyRes
 
         eval_task_dir = Path(body.deliverables_dir) if body.deliverables_dir else None
 
+        # Optional per-request reference subset (multi-stage ELO). When set, only
+        # the named references are judged this call; unknown ids are ignored.
+        active_references = self._references
+        if body.reference_ids is not None:
+            requested = set(body.reference_ids)
+            active_references = {rid: cfg for rid, cfg in self._references.items() if rid in requested}
+
         # Resolve, per reference model, the available (attempted) repeat dirs
         # for this task. A reference that has no deliverable for this task is
         # simply skipped — the eval model just isn't judged against it here.
         ref_dirs_by_id: Dict[str, List[Path]] = {}
-        for ref_id, ref_cfg in self._references.items():
+        for ref_id, ref_cfg in active_references.items():
             ref_task_root = Path(ref_cfg.deliverables_dir) / f"task_{body.task_id}"
             dirs = [d for d in _iter_ref_repeat_dirs(ref_task_root) if task_attempted(str(d))]
             if dirs:
diff --git a/resources_servers/gdpval/multistage_elo.py b/resources_servers/gdpval/multistage_elo.py
new file mode 100644
index 000000000..a8f4fccc1
--- /dev/null
+++ b/resources_servers/gdpval/multistage_elo.py
@@ -0,0 +1,305 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Multi-stage adaptive ELO estimation for GDPVal pairwise comparison.
+
+Instead of comparing the evaluated model against every reference model on all
+tasks, this runs a sequence of *stages*. Each stage:
+
+1. fixes a set of ``T`` tasks sampled from a task-distribution JSON file (see
+   ``responses_api_agents.stirrup_agent.task_distribution``),
+2. judges the evaluated model against a set of ``M`` reference models on those
+   tasks (delegated to an injected ``judge_stage`` callable),
+3. fits an anchored Bradley-Terry MLE ELO from that stage's win/loss/tie
+   battles (reusing ``comparison.calculate_mle_elo``), and
+4. uses that estimate to choose the ``M`` references for the next stage.
+
+Across stages, ``M`` typically shrinks (zooming in on references whose known
+ELO is closest to the evaluated model's current estimate) while ``T`` grows
+(spending the saved judge budget on a tighter final estimate).
+
+This module is intentionally **pure / server-agnostic**: the actual judging
+(running rollouts, calling ``/verify``, reading cached deliverables) is supplied
+by the caller as a ``judge_stage`` callable, so the staging/selection/ELO logic
+is unit-testable without any servers. The orchestration that wires this to the
+GDPVal servers lives in the driver (see the module docstring there).
+"""
+
+from __future__ import annotations
+
+import random
+from dataclasses import dataclass, field
+from typing import Callable, Dict, List, Mapping, Optional, Sequence
+
+from resources_servers.gdpval.comparison import calculate_mle_elo
+
+
+# A mapping ``ref_id -> {"wins": int, "losses": int, "ties": int,
+# "reference_elo": float}`` as produced (per task, then pooled) by the GDPVal
+# comparison verifier. This is the unit the ELO MLE is fit over.
+PerReferenceTotals = Dict[str, Dict[str, float]]
+
+# Signature of the injected judging step. Given the stage's fixed task ids and
+# the selected reference ids, return pooled per-reference win/loss/tie totals
+# for the evaluated model across those tasks.
+JudgeStageFn = Callable[[Sequence[str], Sequence[str]], PerReferenceTotals]
+
+
+@dataclass
+class StageSpec:
+    """Configuration for a single stage.
+
+    ``num_tasks`` is ``T`` (the number of tasks judged this stage). ``num_models``
+    is ``M`` (the number of reference models compared against); ``None`` means
+    "all available references" (used for the first, broad stage). ``seed`` makes
+    task sampling for this stage reproducible.
+    """
+
+    num_tasks: int
+    num_models: Optional[int] = None
+    seed: Optional[int] = None
+
+
+@dataclass
+class StageResult:
+    """Outcome of one stage."""
+
+    stage_index: int
+    task_ids: List[str]
+    reference_ids: List[str]
+    per_reference: PerReferenceTotals
+    eval_elo: Optional[float]
+    normalized_elo: Optional[float]
+    # Number of reference models included in this stage's ELO fit.
+    num_references: int
+
+
+@dataclass
+class MultiStageEloConfig:
+    """End-to-end configuration for a multi-stage ELO run."""
+
+    stages: List[StageSpec]
+    # ref_id -> known/anchor ELO. Both the MLE (anchors) and reference selection
+    # ("closest to the eval estimate") require these.
+    reference_elos: Dict[str, float]
+
+    # Task distribution source. When ``distribution_path`` is unset (or missing),
+    # the driver builds a distribution from ``dataset_path`` (or the default
+    # GDPVal dataset) grouped by ``column`` and caches it. See
+    # ``multistage_elo_driver.ensure_distribution``.
+    distribution_path: Optional[str] = None
+    dataset_path: Optional[str] = None
+
+    # Eval deliverables source. When set, pre-existing cached deliverables under
+    # this directory (``task_<id>/repeat_<n>/``) are reused instead of producing
+    # fresh rollouts. ``produce_missing`` controls whether tasks absent from the
+    # cache are produced on demand (True) or dropped from the stage (False).
+    eval_deliverables_dir: Optional[str] = None
+    produce_missing: bool = True
+
+    # Sampling behaviour across stages. ``nested_tasks=True`` makes each stage's task set
+    # a superset of the previous stage's, which is cheaper (reuses produced
+    # deliverables and judgments) but couples the stages' samples. The default
+    # (False) samples each stage independently: later stages draw fresh tasks, so
+    # the stages contribute more independent information to the ELO estimate.
+    nested_tasks: bool = False
+
+    selection: str = "closest"
+    column: List[str] = field(default_factory=lambda: ["occupation"])
+
+    def __post_init__(self) -> None:
+        if not self.stages:
+            raise ValueError("At least one stage is required.")
+        if self.selection != "closest":
+            raise ValueError(f"Unknown selection strategy: {self.selection!r}")
+
+
+# ---------------------------------------------------------------------------
+# Reference selection
+# ---------------------------------------------------------------------------
+
+
+def select_references(
+    reference_elos: Mapping[str, float],
+    eval_elo: Optional[float],
+    num_models: Optional[int],
+) -> List[str]:
+    """Choose reference ids for a stage.
+
+    Returns all references (sorted by id) when ``num_models`` is ``None`` or at
+    least the number of references, or when no estimate is available yet (the
+    first, broad stage). Otherwise returns the ``num_models`` references closest in
+    anchor ELO to ``eval_elo`` (ties broken by ``ref_id``; none if ``num_models <= 0``), sorted by id.
+    """
+    all_ids = sorted(reference_elos)
+    if num_models is None or eval_elo is None or num_models >= len(all_ids):
+        return all_ids
+    if num_models <= 0:
+        return []
+    ranked = sorted(all_ids, key=lambda rid: (abs(reference_elos[rid] - eval_elo), rid))
+    chosen = ranked[:num_models]
+    # Return in stable id order rather than distance order for readable output.
+    return sorted(chosen)
+
+
+# ---------------------------------------------------------------------------
+# Task planning
+# ---------------------------------------------------------------------------
+
+
+def plan_stage_task_ids(
+    distribution: Mapping[str, Mapping[str, object]],
+    stages: Sequence[StageSpec],
+    *,
+    rng: Optional[random.Random] = None,
+    nested: bool = True,
+) -> List[List[str]]:
+    """Pre-sample the task set for every stage from a task distribution.
+
+    Task selection is independent of any ELO estimate, so all stages' task sets
+    can be planned up front.
+
+    ``nested=True`` makes each stage's set a superset of the previous one. We get
+    this for free in a single draw: ``sample_task_ids`` samples without
+    replacement one task at a time, so a prefix of a large draw is identical to a
+    smaller draw made with the same RNG. We therefore draw once, sized to the
+    largest stage, and slice each stage's prefix from it — O(max T) work and
+    exactly proportional per stage, with nesting guaranteed. A single shared RNG
+    is used (per-stage ``seed`` only applies to independent sampling).
+
+    ``nested=False`` samples each stage independently, honoring its own ``seed``.
+    """
+    from responses_api_agents.stirrup_agent.task_distribution import sample_task_ids
+
+    base_rng = rng or random.Random()
+
+    if not nested:
+        return [
+            sample_task_ids(
+                distribution,
+                s.num_tasks,
+                rng=random.Random(s.seed) if s.seed is not None else base_rng,
+            )
+            for s in stages
+        ]
+
+    max_target = max(s.num_tasks for s in stages)
+    ordered = sample_task_ids(distribution, max_target, rng=base_rng)
+    return [list(ordered[: s.num_tasks]) for s in stages]
+
+
+# ---------------------------------------------------------------------------
+# ELO fitting
+# ---------------------------------------------------------------------------
+
+
+def fit_stage_elo(
+    per_reference: Mapping[str, Mapping[str, float]],
+    reference_elos: Mapping[str, float],
+) -> tuple[Optional[float], Optional[float], int]:
+    """Fit the eval model's ELO for a stage from per-reference battle totals.
+
+    A reference is included in the fit only if it has a known anchor ELO (from
+    ``reference_elos`` or a ``reference_elo`` recorded on its counts) and at
+    least one judged game (win + loss + tie > 0).
+
+    Returns ``(elo, normalized_elo, num_references)``:
+    - ``num_references`` is how many references met both criteria above and were
+      passed to the MLE.
+    - ``elo`` / ``normalized_elo`` are ``None`` when no reference qualified
+      (``num_references == 0``) or when the MLE itself could not produce a rating;
+      in the latter case ``num_references`` is still > 0.
+    """
+    battles: List[tuple[float, float, float, float]] = []
+    for ref_id, counts in per_reference.items():
+        ref_elo = reference_elos.get(ref_id, counts.get("reference_elo"))
+        if ref_elo is None:
+            continue
+        wins = float(counts.get("wins", 0) or 0)
+        losses = float(counts.get("losses", 0) or 0)
+        ties = float(counts.get("ties", 0) or 0)
+        if wins + losses + ties <= 0:
+            continue
+        battles.append((float(ref_elo), wins, losses, ties))
+
+    if not battles:
+        return None, None, 0
+
+    mle = calculate_mle_elo(battles)
+    if mle is None:
+        return None, None, len(battles)
+    elo, normalized = mle
+    return elo, normalized, len(battles)
+
+
+# ---------------------------------------------------------------------------
+# Runner
+# ---------------------------------------------------------------------------
+
+
+class MultiStageEloRunner:
+    """Drive the multi-stage ELO procedure.
+
+    ``run`` first plans every stage's task set up front (task selection does not
+    depend on any ELO estimate), then walks the stages sequentially: for each
+    stage it selects the references (closest known ELO to the running estimate),
+    judges the stage, fits the stage ELO, and threads that estimate into the next
+    stage's reference selection. Matchup judging is not the runner's concern; it
+    is supplied as ``judge_stage(task_ids, reference_ids) -> per_reference_totals``.
+
+    ``run`` returns one ``StageResult`` per stage; the last stage's ``eval_elo``
+    is the headline estimate.
+    """
+
+    def __init__(
+        self,
+        config: MultiStageEloConfig,
+        distribution: Mapping[str, Mapping[str, object]],
+        judge_stage: JudgeStageFn,
+        *,
+        rng: Optional[random.Random] = None,
+    ) -> None:
+        self.config = config
+        self.distribution = distribution
+        self.judge_stage = judge_stage
+        self.rng = rng or random.Random()
+
+    def run(self) -> List[StageResult]:
+        stage_task_sets = plan_stage_task_ids(
+            self.distribution,
+            self.config.stages,
+            rng=self.rng,
+            nested=self.config.nested_tasks,
+        )
+
+        results: List[StageResult] = []
+        eval_elo: Optional[float] = None
+        for index, stage in enumerate(self.config.stages):
+            reference_ids = select_references(self.config.reference_elos, eval_elo, stage.num_models)
+            task_ids = stage_task_sets[index]
+            per_reference = self.judge_stage(task_ids, reference_ids)
+            stage_elo, normalized, num_references = fit_stage_elo(per_reference, self.config.reference_elos)
+            if stage_elo is not None:
+                eval_elo = stage_elo
+            results.append(
+                StageResult(
+                    stage_index=index,
+                    task_ids=list(task_ids),
+                    reference_ids=list(reference_ids),
+                    per_reference=dict(per_reference),
+                    eval_elo=stage_elo,
+                    normalized_elo=normalized,
+                    num_references=num_references,
+                )
+            )
+        return results
diff --git a/resources_servers/gdpval/multistage_elo_driver.py b/resources_servers/gdpval/multistage_elo_driver.py
new file mode 100644
index 000000000..310c4d1d4
--- /dev/null
+++ b/resources_servers/gdpval/multistage_elo_driver.py
@@ -0,0 +1,613 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Driver that wires the multi-stage ELO logic to the GDPVal comparison server.
+
+This composes the pure staging logic in ``multistage_elo`` with the GDPVal
+resources server's ``/verify`` (comparison mode). For each stage it:
+
+1. asks the runner to select the stage's references (closest known ELO to the
+   current estimate) and fix the stage's sampled tasks,
+2. judges the evaluated model's cached deliverables against that reference
+   subset, one ``/verify`` call per (task, repeat) with the per-request
+   ``reference_ids`` filter,
+3. pools the per-reference win/loss/tie votes and fits the stage ELO.
+
+The evaluated model's deliverables are read from a directory laid out as
+``<eval_deliverables_dir>/task_<id>/repeat_<n>/`` (the same layout the Stirrup
+agent persists). Point ``eval_deliverables_dir`` at deliverables produced by an
+earlier run to score them with **zero rollouts**. Tasks missing from the cache
+are either produced on demand via an injected ``producer`` callback or reported,
+controlled by ``produce_missing``.
+
+The judging primitive ``verify_one`` is injected so the orchestration is
+testable without a running server; ``make_http_verify_one`` provides the real
+implementation that POSTs to the resources server.
+
+CLI usage (run from the repo root, against a running comparison server)::
+
+    python -m resources_servers.gdpval.multistage_elo_driver \\
+        --server-url http://localhost:8000 \\
+        --eval-deliverables-dir /path/to/eval/deliverables \\
+        --reference-elos '@refs.json' \\
+        --stage 5 --stage 88:4 \\
+        --output elo_summary.json
+
+where ``refs.json`` is ``{"<ref_id>": <elo>, ...}`` with ids matching the
+server's configured ``reference_models``. See ``--help`` for all flags.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import random
+import sys
+from pathlib import Path
+from typing import Any, Callable, Dict, List, Mapping, Optional, Sequence
+
+from resources_servers.gdpval.comparison import task_attempted
+from resources_servers.gdpval.multistage_elo import (
+    MultiStageEloConfig,
+    MultiStageEloRunner,
+    PerReferenceTotals,
+    StageResult,
+    StageSpec,
+)
+
+
+# verify_one(task_id, deliverables_dir, prompt, reference_ids) -> verify response dict
+VerifyOneFn = Callable[[str, str, str, Sequence[str]], Dict[str, Any]]
+# producer(task_ids) -> None: materialize eval deliverables for the given tasks.
+ProducerFn = Callable[[Sequence[str]], None]
+
+
+# ---------------------------------------------------------------------------
+# Dataset / distribution loading
+# ---------------------------------------------------------------------------
+
+
+# Default location for distributions this driver builds on demand. Lives under
+# the resources server's data dir so it is reachable from wherever the driver
+# runs and is easy to inspect/reuse across runs.
+DEFAULT_DISTRIBUTION_CACHE_DIR = Path(__file__).resolve().parent / "data" / "distributions"
+
+
+def load_distribution(path: str | Path) -> Dict[str, Dict[str, Any]]:
+    """Load a task-distribution JSON file produced by ``task_distribution.py``."""
+    with Path(path).open("r", encoding="utf-8") as handle:
+        data = json.load(handle)
+    if not isinstance(data, dict):
+        raise ValueError(f"Distribution file {path} must be a JSON object.")
+    return data
+
+
+def ensure_distribution(
+    distribution_path: Optional[str | Path] = None,
+    *,
+    dataset_path: Optional[str | Path] = None,
+    columns: Optional[Sequence[str]] = None,
+    cache_dir: Optional[str | Path] = None,
+) -> tuple[Dict[str, Dict[str, Any]], Path]:
+    """Return ``(distribution, path)``, building the distribution if needed.
+
+    If ``distribution_path`` exists it is loaded as-is. Otherwise a distribution
+    is built from ``dataset_path`` (or the default GDPVal dataset) grouped by
+    ``columns`` (default ``["occupation"]``) via ``task_distribution``, then saved
+    so subsequent runs reuse it. It is written to ``distribution_path`` when
+    given, else to ``<cache_dir>/<columns>_distribution.json`` (cache_dir
+    defaults to ``DEFAULT_DISTRIBUTION_CACHE_DIR``).
+    """
+    column_list = list(columns) if columns else ["occupation"]
+
+    if distribution_path is not None and Path(distribution_path).is_file():
+        return load_distribution(distribution_path), Path(distribution_path)
+
+    from responses_api_agents.stirrup_agent.task_distribution import (
+        build_distribution_from_dataset,
+        resolve_default_dataset,
+    )
+
+    resolved_dataset = Path(dataset_path) if dataset_path is not None else resolve_default_dataset()
+    if resolved_dataset is None:
+        raise FileNotFoundError(
+            "No distribution file was provided and no default GDPVal dataset could be found to "
+            "build one from. Provide distribution_path, pass dataset_path, or prepare the GDPVal "
+            "dataset (gym eval prepare --benchmark gdpval)."
+        )
+
+    distribution = build_distribution_from_dataset(resolved_dataset, column_list)
+
+    if distribution_path is not None:
+        out_path = Path(distribution_path)
+    else:
+        base = Path(cache_dir) if cache_dir is not None else DEFAULT_DISTRIBUTION_CACHE_DIR
+        out_path = base / f"{'_'.join(column_list)}_distribution.json"
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+    with out_path.open("w", encoding="utf-8") as handle:
+        json.dump(distribution, handle, indent=2, ensure_ascii=False)
+    print(
+        f"[multistage-elo] built task distribution over {column_list} from {resolved_dataset} -> {out_path}",
+        flush=True,
+    )
+    return distribution, out_path
+
+
+def load_task_prompts(jsonl_path: str | Path) -> Dict[str, str]:
+    """Map ``task_id -> prompt`` from a benchmark JSONL.
+
+    The prompt is needed when judging cached deliverables (the judge sees the
+    task description). Looks for ``prompt`` and ``task_id`` at the top level and,
+    failing that, under ``responses_create_params.metadata`` — covering both the
+    prepared benchmark layout and the metadata-nested layout.
+    """
+    prompts: Dict[str, str] = {}
+    with Path(jsonl_path).open("r", encoding="utf-8") as handle:
+        for line in handle:
+            line = line.strip()
+            if not line:
+                continue
+            row = json.loads(line)
+            meta = (row.get("responses_create_params") or {}).get("metadata") or {}
+            task_id = row.get("task_id") or meta.get("task_id")
+            prompt = row.get("prompt") or meta.get("prompt")
+            if task_id is not None:
+                prompts[str(task_id)] = prompt or ""
+
+    return prompts
+
+
+# ---------------------------------------------------------------------------
+# Cached-deliverable discovery
+# ---------------------------------------------------------------------------
+
+
+def task_repeat_dirs(eval_deliverables_dir: str | Path, task_id: str) -> List[Path]:
+    """Return attempted ``repeat_<n>`` dirs (or a flat task dir) for a task.
+
+    Mirrors the resources server's reference-repeat resolution: prefers
+    ``task_<id>/repeat_<n>/`` subdirs, falls back to a flat ``task_<id>/``, and
+    only returns dirs that look like a completed run (``finish_params.json``).
+    """
+    task_root = Path(eval_deliverables_dir) / f"task_{task_id}"
+    if not task_root.is_dir():
+        return []
+    repeats = sorted(p for p in task_root.iterdir() if p.is_dir() and p.name.startswith("repeat_"))
+    candidates = repeats or [task_root]
+    return [d for d in candidates if task_attempted(str(d))]
+
+
+def cached_task_ids(eval_deliverables_dir: str | Path) -> set:
+    """All task ids that have at least one attempted deliverable in the cache."""
+    root = Path(eval_deliverables_dir)
+    if not root.is_dir():
+        return set()
+    found = set()
+    for child in root.iterdir():
+        if child.is_dir() and child.name.startswith("task_"):
+            task_id = child.name[len("task_") :]
+            if task_repeat_dirs(eval_deliverables_dir, task_id):
+                found.add(task_id)
+    return found
+
+
+def check_coverage(eval_deliverables_dir: str | Path, task_ids: Sequence[str]) -> tuple[List[str], List[str]]:
+    """Split ``task_ids`` into ``(present, missing)`` against the cache."""
+    present, missing = [], []
+    for tid in task_ids:
+        (present if task_repeat_dirs(eval_deliverables_dir, tid) else missing).append(tid)
+
+    return present, missing
+
+
+# ---------------------------------------------------------------------------
+# Vote pooling
+# ---------------------------------------------------------------------------
+
+
+def pool_per_reference(verify_responses: Sequence[Mapping[str, Any]]) -> PerReferenceTotals:
+    """Sum ``per_reference`` win/loss/tie counts across many verify responses."""
+    totals: PerReferenceTotals = {}
+    for vr in verify_responses:
+        per_ref = vr.get("per_reference") or {}
+        for ref_id, counts in per_ref.items():
+            entry = totals.setdefault(ref_id, {"wins": 0, "losses": 0, "ties": 0, "reference_elo": None})
+            entry["wins"] += int(counts.get("wins", 0) or 0)
+            entry["losses"] += int(counts.get("losses", 0) or 0)
+            entry["ties"] += int(counts.get("ties", 0) or 0)
+            if entry["reference_elo"] is None:
+                entry["reference_elo"] = counts.get("reference_elo")
+
+    return totals
+
+
+# ---------------------------------------------------------------------------
+# judge_stage builder
+# ---------------------------------------------------------------------------
+
+
+def build_judge_stage(
+    verify_one: VerifyOneFn,
+    eval_deliverables_dir: str | Path,
+    task_prompts: Mapping[str, str],
+    *,
+    produce_missing: bool = True,
+    producer: Optional[ProducerFn] = None,
+):
+    """Build the ``judge_stage`` callable expected by ``MultiStageEloRunner``.
+
+    For each stage's tasks, judges the cached eval deliverables against the
+    selected references (one ``verify_one`` call per task-repeat) and pools the
+    per-reference votes. Missing tasks are produced via ``producer`` when given
+    (any still missing afterwards are skipped with a warning); without a producer,
+    ``produce_missing=True`` raises and ``produce_missing=False`` skips them with a warning.
+    """
+
+    def judge_stage(task_ids: Sequence[str], reference_ids: Sequence[str]) -> PerReferenceTotals:
+        present, missing = check_coverage(eval_deliverables_dir, task_ids)
+        if missing:
+            if producer is not None:
+                producer(missing)
+                present, missing = check_coverage(eval_deliverables_dir, task_ids)
+            if missing and produce_missing and producer is None:
+                raise FileNotFoundError(
+                    f"{len(missing)} task(s) have no cached eval deliverable under "
+                    f"{eval_deliverables_dir} (e.g. {missing[:3]}). Produce them first with an "
+                    f"execute_only run, pass a producer, or set produce_missing=False to skip them."
+                )
+            if missing:
+                print(
+                    f"[multistage-elo] WARNING: skipping {len(missing)} task(s) with no cached "
+                    f"deliverable (e.g. {missing[:3]})",
+                    flush=True,
+                )
+
+        responses: List[Dict[str, Any]] = []
+        for task_id in present:
+            prompt = task_prompts.get(task_id, "")
+            for repeat_dir in task_repeat_dirs(eval_deliverables_dir, task_id):
+                responses.append(verify_one(task_id, str(repeat_dir), prompt, list(reference_ids)))
+        return pool_per_reference(responses)
+
+    return judge_stage
+
+
+# ---------------------------------------------------------------------------
+# Real verify_one (HTTP)
+# ---------------------------------------------------------------------------
+
+
+def build_verify_request_body(
+    task_id: str,
+    deliverables_dir: str,
+    prompt: str,
+    reference_ids: Sequence[str],
+    *,
+    model: str = "eval",
+) -> Dict[str, Any]:
+    """Build a minimal comparison-mode ``/verify`` request body.
+
+    In comparison mode the judge reads deliverable files from ``deliverables_dir`` rather than
+    the response payload, so ``response`` is an empty placeholder labelled with ``model``.
+    """
+    return {
+        "responses_create_params": {"input": [], "model": model},
+        "response": {  # placeholder only: empty output, no tools
+            "id": f"multistage-{task_id}",
+            "created_at": 0,
+            "model": model,
+            "object": "response",
+            "output": [],
+            "parallel_tool_calls": False,
+            "tool_choice": "none",
+            "tools": [],
+        },
+        "task_id": task_id,
+        "prompt": prompt,
+        "deliverables_dir": deliverables_dir,
+        "reference_ids": list(reference_ids),  # copied into a plain list
+    }
+
+
+def make_http_verify_one(server_url: str, *, timeout: float = 1800.0, model: str = "eval") -> VerifyOneFn:
+    """Return a blocking ``verify_one`` that POSTs to a running resources server.
+
+    ``server_url`` is the resources server base URL (e.g. ``http://host:port``); ``/verify`` is
+    appended. Each call sends the JSON from ``build_verify_request_body`` (labelled ``model``),
+    waits up to ``timeout`` seconds and returns the decoded JSON reply. Stdlib ``urllib`` keeps
+    this standalone orchestration script free of async machinery.
+    """
+    import urllib.request
+
+    endpoint = server_url.rstrip("/") + "/verify"  # tolerate a trailing slash on the base URL
+
+    def verify_one(task_id: str, deliverables_dir: str, prompt: str, reference_ids: Sequence[str]) -> Dict[str, Any]:
+        body = build_verify_request_body(task_id, deliverables_dir, prompt, reference_ids, model=model)
+        data = json.dumps(body).encode("utf-8")
+        req = urllib.request.Request(endpoint, data=data, headers={"Content-Type": "application/json"})
+        with urllib.request.urlopen(req, timeout=timeout) as resp:
+            return json.loads(resp.read().decode("utf-8"))
+
+    return verify_one
+
+
+# ---------------------------------------------------------------------------
+# Top-level run
+# ---------------------------------------------------------------------------
+
+
+def run_multistage_elo(
+    config: MultiStageEloConfig,
+    verify_one: VerifyOneFn,
+    task_prompts: Mapping[str, str],
+    *,
+    rng=None,
+    producer: Optional[ProducerFn] = None,
+) -> List[StageResult]:
+    """Run the full multi-stage ELO procedure and return per-stage results.
+
+    ``config.eval_deliverables_dir`` must be set (``ValueError`` otherwise) — it is the source of the
+    eval model's (cached or produced) deliverables. ``rng`` goes to the runner, ``producer`` to the judge stage.
+    """
+    if not config.eval_deliverables_dir:
+        raise ValueError("config.eval_deliverables_dir must be set (source of eval deliverables).")
+
+    distribution, _ = ensure_distribution(  # second return value is not needed here
+        config.distribution_path,
+        dataset_path=config.dataset_path,
+        columns=config.column,
+    )
+    judge_stage = build_judge_stage(
+        verify_one,
+        config.eval_deliverables_dir,
+        task_prompts,
+        produce_missing=config.produce_missing,
+        producer=producer,
+    )
+    runner = MultiStageEloRunner(config, distribution, judge_stage, rng=rng)
+    return runner.run()
+
+
+def stage_results_to_dict(results: Sequence[StageResult]) -> Dict[str, Any]:
+    """Serialize stage results to a JSON-friendly dict; ``final_*`` mirror the last stage (None if empty)."""
+    final = results[-1] if results else None
+    return {
+        "final_eval_elo": final.eval_elo if final else None,
+        "final_normalized_elo": final.normalized_elo if final else None,
+        "num_stages": len(results),
+        "stages": [
+            {
+                "stage_index": r.stage_index,
+                "num_tasks": len(r.task_ids),
+                "reference_ids": r.reference_ids,
+                "eval_elo": r.eval_elo,
+                "normalized_elo": r.normalized_elo,
+                "num_references": r.num_references,
+                "per_reference": r.per_reference,
+                "task_ids": r.task_ids,
+            }
+            for r in results
+        ],
+    }
+
+
+# ---------------------------------------------------------------------------
+# CLI
+# ---------------------------------------------------------------------------
+
+
+DEFAULT_TASK_PROMPTS = "benchmarks/gdpval/data/gdpval_benchmark.jsonl"
+
+
+def _parse_stage(spec: str) -> StageSpec:
+    """Parse a ``--stage`` value ``num_tasks[:num_models[:seed]]`` into a StageSpec.
+
+    ``num_models`` may be ``all`` or empty for "all available references". Examples:
+    ``5`` (5 tasks, all refs), ``88:4`` (88 tasks, 4 closest refs), ``5:all:7`` (5 tasks, all
+    refs, seed 7). Raises ``argparse.ArgumentTypeError`` on non-integer or surplus fields.
+    """
+    parts = spec.split(":")
+    if not parts[0].strip():
+        raise argparse.ArgumentTypeError(f"Invalid --stage {spec!r}: num_tasks is required.")
+    if len(parts) > 3:  # reject e.g. "5:4:7:9" instead of silently ignoring the extra field
+        raise argparse.ArgumentTypeError(f"Invalid --stage {spec!r}: expected num_tasks[:num_models[:seed]].")
+    try:
+        num_tasks = int(parts[0])
+    except ValueError:
+        raise argparse.ArgumentTypeError(f"Invalid --stage {spec!r}: num_tasks must be an integer.")
+
+    num_models: Optional[int] = None
+    if len(parts) >= 2 and parts[1].strip() and parts[1].strip().lower() != "all":
+        try:
+            num_models = int(parts[1])
+        except ValueError:
+            raise argparse.ArgumentTypeError(f"Invalid --stage {spec!r}: num_models must be an integer or 'all'.")
+    seed: Optional[int] = None
+    if len(parts) >= 3 and parts[2].strip():
+        try:
+            seed = int(parts[2])
+        except ValueError:
+            raise argparse.ArgumentTypeError(f"Invalid --stage {spec!r}: seed must be an integer.")
+    return StageSpec(num_tasks=num_tasks, num_models=num_models, seed=seed)
+
+
+def _load_reference_elos(value: str) -> Dict[str, float]:
+    """Load reference ELOs from inline JSON or, if prefixed with ``@``, a JSON file.
+
+    Accepts ``{"ref_id": elo, ...}``; the ids must match the running server's ``reference_models``
+    ids. An unreadable ``@`` file is reported as ``argparse.ArgumentTypeError``, not a traceback.
+    """
+    try:
+        data = json.loads(Path(value[1:]).read_text(encoding="utf-8") if value.startswith("@") else value)
+    except OSError as exc:  # argparse's type= handling only catches ArgumentTypeError/TypeError/ValueError
+        raise argparse.ArgumentTypeError(f"--reference-elos could not read {value[1:]!r}: {exc}") from exc
+    if not isinstance(data, dict) or not data:
+        raise argparse.ArgumentTypeError("--reference-elos must be a non-empty JSON object of {ref_id: elo}.")
+    return {str(k): float(v) for k, v in data.items()}
+
+
+def _build_arg_parser() -> argparse.ArgumentParser:  # declares every CLI flag that main() reads
+    parser = argparse.ArgumentParser(
+        prog="multistage_elo",
+        description=(
+            "Run multi-stage adaptive ELO estimation for a model's GDPVal deliverables "
+            "against a running GDPVal comparison server."
+        ),
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog=(
+            "Example:\n"
+            "  python -m resources_servers.gdpval.multistage_elo_driver \\\n"
+            "    --server-url http://localhost:8000 \\\n"
+            "    --eval-deliverables-dir /path/to/eval/deliverables \\\n"
+            "    --reference-elos '@refs.json' \\\n"
+            "    --stage 5 --stage 88:4 \\\n"
+            "    --output elo_summary.json\n"
+        ),
+    )
+    parser.add_argument(
+        "--server-url",
+        required=True,
+        help="Base URL of the running GDPVal comparison-mode resources server (e.g. http://localhost:8000).",
+    )
+    parser.add_argument(
+        "--eval-deliverables-dir",
+        required=True,
+        help="Directory of the evaluated model's deliverables (task_<id>/repeat_<n>/ layout).",
+    )
+    parser.add_argument(
+        "--reference-elos",
+        required=True,
+        type=_load_reference_elos,  # parsed into {ref_id: float} at argument-parsing time
+        metavar="JSON",
+        help=(
+            "Reference anchor ELOs as inline JSON ('{\"ref\": 1500, ...}') or '@path.json'. "
+            "Keys must match the server's reference_models ids."
+        ),
+    )
+    parser.add_argument(
+        "--stage",
+        dest="stages",
+        action="append",  # each --stage occurrence adds one parsed StageSpec to args.stages
+        required=True,
+        type=_parse_stage,
+        metavar="N[:M[:SEED]]",
+        help=(
+            "A stage as num_tasks[:num_models[:seed]] (num_models 'all' or omitted = all references). "
+            "Repeat for multiple stages, e.g. --stage 5 --stage 88:4."
+        ),
+    )
+    parser.add_argument(
+        "--task-prompts",
+        default=DEFAULT_TASK_PROMPTS,
+        help=f"Benchmark JSONL mapping task_id -> prompt (default: {DEFAULT_TASK_PROMPTS}).",
+    )
+    parser.add_argument(
+        "--distribution",
+        default=None,
+        help="Existing task-distribution JSON to sample tasks from. If omitted, one is built and cached.",
+    )
+    parser.add_argument(
+        "--dataset",
+        default=None,
+        help="Dataset JSONL to build the distribution from when --distribution is not given (default: GDPVal).",
+    )
+    parser.add_argument(
+        "--column",
+        dest="columns",
+        action="append",
+        default=None,  # main() substitutes ["occupation"] when no --column is given
+        metavar="COLUMN",
+        help="Column(s) to group the distribution by when building one (default: occupation). Repeatable.",
+    )
+    parser.add_argument(
+        "--nested-tasks",
+        action="store_true",
+        help="Make each stage's task set a superset of the previous (default: independent per-stage sampling).",
+    )
+    parser.add_argument(
+        "--skip-missing",
+        action="store_true",
+        help="Drop tasks with no cached eval deliverable instead of erroring (sets produce_missing=False).",
+    )
+    parser.add_argument(
+        "--model",
+        default="eval",
+        help="Label for the evaluated model in verify requests (default: eval).",
+    )
+    parser.add_argument(
+        "--timeout",
+        type=float,
+        default=1800.0,
+        help="Per-request /verify timeout in seconds (default: 1800).",
+    )
+    parser.add_argument(
+        "--seed",
+        type=int,
+        default=None,
+        help="Top-level RNG seed for reproducible task sampling and reference selection.",
+    )
+    parser.add_argument(
+        "--output",
+        "-o",
+        default=None,
+        help="Path to write the JSON ELO summary. Defaults to stdout.",
+    )
+    return parser
+
+
+def main(argv: Optional[Sequence[str]] = None) -> int:  # CLI entry point; the return value is the exit code
+    parser = _build_arg_parser()
+    args = parser.parse_args(argv)
+
+    eval_dir = Path(args.eval_deliverables_dir)
+    if not eval_dir.is_dir():
+        print(f"Eval deliverables dir not found: {eval_dir}", file=sys.stderr)
+        return 2  # missing input path
+
+    prompts_path = Path(args.task_prompts)
+    if not prompts_path.is_file():
+        print(f"Task prompts JSONL not found: {prompts_path}", file=sys.stderr)
+        return 2  # missing input path
+
+    config = MultiStageEloConfig(
+        stages=list(args.stages),
+        reference_elos=args.reference_elos,
+        distribution_path=args.distribution,
+        dataset_path=args.dataset,
+        eval_deliverables_dir=str(eval_dir),
+        produce_missing=not args.skip_missing,  # --skip-missing turns missing deliverables into a skip
+        nested_tasks=args.nested_tasks,
+        column=list(args.columns) if args.columns else ["occupation"],
+    )
+
+    verify_one = make_http_verify_one(args.server_url, timeout=args.timeout, model=args.model)
+    task_prompts = load_task_prompts(prompts_path)
+    rng = random.Random(args.seed) if args.seed is not None else None  # seeded RNG only when --seed is given
+
+    results = run_multistage_elo(config, verify_one, task_prompts, rng=rng)  # no producer is wired in from the CLI
+    payload = json.dumps(stage_results_to_dict(results), indent=2, ensure_ascii=False)
+
+    if args.output:
+        out_path = Path(args.output)
+        out_path.parent.mkdir(parents=True, exist_ok=True)
+        out_path.write_text(payload + "\n", encoding="utf-8")
+        final = results[-1] if results else None
+        final_elo = final.eval_elo if final else None
+        print(f"Wrote ELO summary ({len(results)} stages, final_eval_elo={final_elo}) to {out_path}", file=sys.stderr)
+    else:
+        print(payload)  # no --output: the JSON summary goes to stdout
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/resources_servers/gdpval/tests/test_app.py b/resources_servers/gdpval/tests/test_app.py
index b2585b950..a61652f0e 100644
--- a/resources_servers/gdpval/tests/test_app.py
+++ b/resources_servers/gdpval/tests/test_app.py
@@ -595,6 +595,74 @@ def fake_run_trials(**_kwargs):
         assert resp.total_losses == 2
         assert resp.judge_response["reference_count"] == 2
 
+    @pytest.mark.asyncio
+    async def test_reference_ids_filter_judges_subset(self, tmp_path) -> None:
+        """``reference_ids`` on the verify request restricts judging to the named
+        references; ids not configured on the server are ignored."""
+        eval_dir = tmp_path / "eval" / "task_task-1" / "repeat_0"
+        eval_dir.mkdir(parents=True)
+        (eval_dir / "finish_params.json").write_text("{}")
+
+        ref_roots = {}
+        for ref_id in ("kimi", "gpt5"):
+            root = tmp_path / ref_id
+            td = root / "task_task-1"
+            td.mkdir(parents=True)
+            (td / "finish_params.json").write_text("{}")
+            ref_roots[ref_id] = root
+
+        server = _server(
+            reward_mode="comparison",
+            reference_models={
+                "kimi": {"deliverables_dir": str(ref_roots["kimi"]), "elo": 1290.0},
+                "gpt5": {"deliverables_dir": str(ref_roots["gpt5"]), "elo": 1320.0},
+            },
+            preconvert_office_to_pdf=False,
+            num_comparison_trials=4,
+        )
+
+        def fake_run_trials(**_kwargs):
+            return {"winner": "[[B]]", "win_count_a": 1, "win_count_b": 3, "tie_count": 0, "task_count": 4}
+
+        # Only judge against gpt5 (and an unknown id, which is ignored).
+        body = _verify_request(deliverables_dir=str(eval_dir), reference_ids=["gpt5", "nonexistent"])
+
+        with (
+            patch("resources_servers.gdpval.comparison.run_trials", side_effect=fake_run_trials),
+            patch("resources_servers.gdpval.app.get_server_url", return_value="http://localhost:9999"),
+            patch("resources_servers.gdpval.comparison.build_file_section", return_value=[]),
+            patch("openai.OpenAI", return_value=MagicMock()),
+        ):
+            resp = await server.verify(body)
+
+        assert set(resp.per_reference) == {"gpt5"}  # kimi was configured but not requested
+        assert resp.total_wins == 3  # equals the fake's win_count_b
+        assert resp.total_losses == 1  # equals the fake's win_count_a
+        assert resp.judge_response["reference_count"] == 1
+
+    @pytest.mark.asyncio
+    async def test_reference_ids_empty_yields_no_references(self, tmp_path) -> None:
+        """An empty ``reference_ids`` list selects no references → ``reference_missing`` error, reward 0."""
+        eval_dir = tmp_path / "eval" / "task_task-1" / "repeat_0"
+        eval_dir.mkdir(parents=True)
+        (eval_dir / "finish_params.json").write_text("{}")
+        root = tmp_path / "kimi"
+        (root / "task_task-1").mkdir(parents=True)
+        (root / "task_task-1" / "finish_params.json").write_text("{}")
+
+        server = _server(
+            reward_mode="comparison",
+            reference_models={"kimi": {"deliverables_dir": str(root), "elo": 1290.0}},
+            preconvert_office_to_pdf=False,
+        )
+        body = _verify_request(deliverables_dir=str(eval_dir), reference_ids=[])  # explicit empty list
+
+        with patch("resources_servers.gdpval.app.get_server_url", return_value="http://localhost:9999"):
+            resp = await server.verify(body)
+
+        assert resp.reward == 0.0
+        assert resp.judge_response == {"error": "reference_missing"}
+
     @staticmethod
     def _two_ref_server_and_body(tmp_path):
         eval_dir = tmp_path / "eval" / "task_task-1" / "repeat_0"
diff --git a/resources_servers/gdpval/tests/test_multistage_elo.py b/resources_servers/gdpval/tests/test_multistage_elo.py
new file mode 100644
index 000000000..4633cdda7
--- /dev/null
+++ b/resources_servers/gdpval/tests/test_multistage_elo.py
@@ -0,0 +1,185 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import random
+
+import pytest
+
+from resources_servers.gdpval.multistage_elo import (
+    MultiStageEloConfig,
+    MultiStageEloRunner,
+    StageSpec,
+    fit_stage_elo,
+    plan_stage_task_ids,
+    select_references,
+)
+
+
+def _dist(groups):
+    """Turn ``{key: [task_ids]}`` into ``{key: {"percentage", "task_ids"}}`` with size-proportional percentages."""
+    total = sum(len(v) for v in groups.values()) or 1  # "or 1" avoids dividing by zero for empty input
+    return {k: {"percentage": len(v) / total, "task_ids": list(v)} for k, v in groups.items()}
+
+
+class TestSelectReferences:  # select_references(reference_elos, eval_elo, num_models)
+    ELOS = {"a": 1000.0, "b": 1200.0, "c": 1300.0, "d": 1500.0}
+
+    def test_all_when_num_models_none(self) -> None:
+        assert select_references(self.ELOS, 1234.0, None) == ["a", "b", "c", "d"]
+
+    def test_all_when_eval_elo_none(self) -> None:
+        assert select_references(self.ELOS, None, 2) == ["a", "b", "c", "d"]
+
+    def test_all_when_num_models_exceeds_available(self) -> None:
+        assert select_references(self.ELOS, 1234.0, 10) == ["a", "b", "c", "d"]
+
+    def test_closest_subset(self) -> None:
+        # eval 1250 -> b(1200) and c(1300) are both 50 away, so both are picked; result is in id order.
+        assert select_references(self.ELOS, 1250.0, 2) == ["b", "c"]
+
+    def test_closest_single(self) -> None:
+        assert select_references(self.ELOS, 1490.0, 1) == ["d"]
+
+    def test_zero_models_returns_empty(self) -> None:
+        assert select_references(self.ELOS, 1250.0, 0) == []
+
+    def test_result_sorted_by_id(self) -> None:
+        chosen = select_references(self.ELOS, 1100.0, 3)
+        assert chosen == sorted(chosen)
+
+
+class TestPlanStageTaskIds:  # plan_stage_task_ids(distribution, stages, rng=..., nested=...)
+    def test_nested_is_superset(self) -> None:
+        dist = _dist({"x": [f"x{i}" for i in range(10)], "y": [f"y{i}" for i in range(10)]})
+        stages = [StageSpec(num_tasks=3), StageSpec(num_tasks=8)]
+        planned = plan_stage_task_ids(dist, stages, rng=random.Random(0), nested=True)
+        assert len(planned[0]) == 3
+        assert len(planned[1]) == 8
+        assert set(planned[0]).issubset(set(planned[1]))
+
+    def test_nested_no_duplicates(self) -> None:
+        dist = _dist({"x": [f"x{i}" for i in range(20)]})
+        stages = [StageSpec(num_tasks=5), StageSpec(num_tasks=12)]
+        planned = plan_stage_task_ids(dist, stages, rng=random.Random(1), nested=True)
+        assert len(planned[1]) == len(set(planned[1]))
+
+    def test_nested_capped_at_available(self) -> None:
+        dist = _dist({"x": ["a", "b", "c"]})
+        stages = [StageSpec(num_tasks=2), StageSpec(num_tasks=100)]  # asks for more tasks than exist
+        planned = plan_stage_task_ids(dist, stages, rng=random.Random(2), nested=True)
+        assert sorted(planned[1]) == ["a", "b", "c"]
+
+    def test_non_increasing_stage_reuses_prefix(self) -> None:
+        dist = _dist({"x": [f"x{i}" for i in range(10)]})
+        stages = [StageSpec(num_tasks=5), StageSpec(num_tasks=3)]  # second stage is smaller than the first
+        planned = plan_stage_task_ids(dist, stages, rng=random.Random(3), nested=True)
+        assert planned[1] == planned[0][:3]
+
+    def test_independent_sampling(self) -> None:
+        dist = _dist({"x": [f"x{i}" for i in range(50)]})
+        stages = [StageSpec(num_tasks=5, seed=1), StageSpec(num_tasks=5, seed=2)]
+        planned = plan_stage_task_ids(dist, stages, nested=False)
+        assert len(planned[0]) == 5 and len(planned[1]) == 5  # only sizes are checked here
+
+    def test_seed_reproducible(self) -> None:
+        dist = _dist({"x": [f"x{i}" for i in range(50)]})
+        stages = [StageSpec(num_tasks=7, seed=42)]
+        a = plan_stage_task_ids(dist, stages, nested=False)
+        b = plan_stage_task_ids(dist, stages, nested=False)
+        assert a == b
+
+
+class TestFitStageElo:  # fit_stage_elo(per_reference, reference_elos) -> (elo, normalized_elo, num_references)
+    ELOS = {"a": 1000.0, "b": 1400.0}
+
+    def test_no_battles_returns_none(self) -> None:
+        assert fit_stage_elo({}, self.ELOS) == (None, None, 0)
+
+    def test_zero_games_skipped(self) -> None:
+        per_ref = {"a": {"wins": 0, "losses": 0, "ties": 0}}
+        assert fit_stage_elo(per_ref, self.ELOS) == (None, None, 0)
+
+    def test_fits_elo_uses_config_anchor(self) -> None:
+        per_ref = {"a": {"wins": 5, "losses": 5, "ties": 0}}
+        elo, norm, n = fit_stage_elo(per_ref, self.ELOS)
+        # 50% win rate vs a single anchor -> eval elo ~= anchor elo.
+        assert n == 1
+        assert elo == pytest.approx(1000.0, abs=1.0)
+        assert norm == pytest.approx((elo - 500.0) / 2000.0)  # normalization expected by this test
+
+    def test_falls_back_to_recorded_reference_elo(self) -> None:
+        per_ref = {"z": {"wins": 5, "losses": 5, "ties": 0, "reference_elo": 1100.0}}
+        elo, _norm, n = fit_stage_elo(per_ref, {})  # no configured anchors: only the recorded elo is available
+        assert n == 1
+        assert elo == pytest.approx(1100.0, abs=1.0)
+
+    def test_multi_reference_battles(self) -> None:
+        per_ref = {
+            "a": {"wins": 8, "losses": 2, "ties": 0},
+            "b": {"wins": 2, "losses": 8, "ties": 0},
+        }
+        elo, _norm, n = fit_stage_elo(per_ref, self.ELOS)
+        assert n == 2
+        assert 1000.0 < elo < 1400.0
+
+
+class TestMultiStageEloRunner:  # config validation plus end-to-end runs with a stubbed judge_stage
+    def _config(self, **overrides):
+        base = dict(
+            distribution_path="unused.json",
+            stages=[StageSpec(num_tasks=3, num_models=None), StageSpec(num_tasks=6, num_models=2)],
+            reference_elos={"a": 1000.0, "b": 1200.0, "c": 1300.0, "d": 1500.0},
+        )
+        base.update(overrides)  # keyword overrides replace the defaults above
+        return MultiStageEloConfig(**base)
+
+    def test_requires_stages(self) -> None:
+        with pytest.raises(ValueError):
+            MultiStageEloConfig(distribution_path="x", stages=[], reference_elos={})
+
+    def test_unknown_selection_rejected(self) -> None:
+        with pytest.raises(ValueError):
+            MultiStageEloConfig(distribution_path="x", stages=[StageSpec(1)], reference_elos={}, selection="zzz")
+
+    def test_two_stage_flow_threads_elo_and_shrinks_refs(self) -> None:
+        dist = _dist({"x": [f"x{i}" for i in range(20)]})
+        seen_stage_refs = []  # reference ids passed to judge_stage, one entry per stage
+
+        def judge_stage(task_ids, reference_ids):
+            seen_stage_refs.append(list(reference_ids))
+            # Eval beats everyone 7-3 -> high elo estimate.
+            return {rid: {"wins": 7, "losses": 3, "ties": 0} for rid in reference_ids}
+
+        runner = MultiStageEloRunner(self._config(nested_tasks=True), dist, judge_stage, rng=random.Random(0))
+        results = runner.run()
+
+        assert len(results) == 2
+        # Stage 1 uses all references.
+        assert seen_stage_refs[0] == ["a", "b", "c", "d"]
+        # Stage 2 narrows to 2 references (closest to the stage-1 estimate).
+        assert len(seen_stage_refs[1]) == 2
+        assert set(seen_stage_refs[1]).issubset({"a", "b", "c", "d"})
+        # Nested task sets (nested_tasks=True): stage 2 superset of stage 1.
+        assert set(results[0].task_ids).issubset(set(results[1].task_ids))
+        assert results[1].eval_elo is not None
+
+    def test_stage_with_no_games_leaves_elo_unset(self) -> None:
+        dist = _dist({"x": [f"x{i}" for i in range(10)]})
+
+        def judge_stage(task_ids, reference_ids):
+            return {}  # no per-reference totals at all
+
+        cfg = self._config(stages=[StageSpec(num_tasks=2, num_models=None)])
+        results = MultiStageEloRunner(cfg, dist, judge_stage, rng=random.Random(0)).run()
+        assert results[0].eval_elo is None
+        assert results[0].num_references == 0
diff --git a/resources_servers/gdpval/tests/test_multistage_elo_driver.py b/resources_servers/gdpval/tests/test_multistage_elo_driver.py
new file mode 100644
index 000000000..0cce37910
--- /dev/null
+++ b/resources_servers/gdpval/tests/test_multistage_elo_driver.py
@@ -0,0 +1,429 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import json
+import random
+from pathlib import Path
+
+import pytest
+
+import resources_servers.gdpval.multistage_elo_driver as driver
+from resources_servers.gdpval.multistage_elo import MultiStageEloConfig, StageResult, StageSpec
+from resources_servers.gdpval.multistage_elo_driver import (
+    _load_reference_elos,
+    _parse_stage,
+    build_judge_stage,
+    build_verify_request_body,
+    cached_task_ids,
+    check_coverage,
+    ensure_distribution,
+    load_distribution,
+    load_task_prompts,
+    main,
+    pool_per_reference,
+    run_multistage_elo,
+    stage_results_to_dict,
+    task_repeat_dirs,
+)
+
+
+def _make_cache(root: Path, task_ids, repeats=("repeat_0",)):  # fake deliverables cache for tests
+    for tid in task_ids:
+        for rep in repeats:
+            d = root / f"task_{tid}" / rep  # layout: <root>/task_<id>/<repeat>/
+            d.mkdir(parents=True)
+            (d / "finish_params.json").write_text("{}")  # empty JSON marker file in each repeat dir
+
+
+def _dist(groups):  # {key: [task_ids]} -> {key: {"percentage", "task_ids"}} with size-proportional percentages
+    total = sum(len(v) for v in groups.values()) or 1  # "or 1" avoids dividing by zero for empty input
+    return {k: {"percentage": len(v) / total, "task_ids": list(v)} for k, v in groups.items()}
+
+
+class TestCacheDiscovery:  # task_repeat_dirs / cached_task_ids / check_coverage against a tmp cache
+    def test_task_repeat_dirs_lists_attempted_repeats(self, tmp_path: Path) -> None:
+        _make_cache(tmp_path, ["a"], repeats=("repeat_0", "repeat_1"))
+        dirs = task_repeat_dirs(tmp_path, "a")
+        assert [d.name for d in dirs] == ["repeat_0", "repeat_1"]
+
+    def test_task_repeat_dirs_skips_unattempted(self, tmp_path: Path) -> None:
+        (tmp_path / "task_a" / "repeat_0").mkdir(parents=True)  # no finish_params.json
+        assert task_repeat_dirs(tmp_path, "a") == []
+
+    def test_task_repeat_dirs_flat_layout(self, tmp_path: Path) -> None:
+        d = tmp_path / "task_a"
+        d.mkdir(parents=True)
+        (d / "finish_params.json").write_text("{}")  # marker directly in task_a, no repeat_<n> subdir
+        assert [p.name for p in task_repeat_dirs(tmp_path, "a")] == ["task_a"]
+
+    def test_missing_task_returns_empty(self, tmp_path: Path) -> None:
+        assert task_repeat_dirs(tmp_path, "ghost") == []
+
+    def test_cached_task_ids(self, tmp_path: Path) -> None:
+        _make_cache(tmp_path, ["a", "b"])
+        assert cached_task_ids(tmp_path) == {"a", "b"}
+
+    def test_cached_task_ids_missing_dir(self, tmp_path: Path) -> None:
+        assert cached_task_ids(tmp_path / "nope") == set()
+
+    def test_check_coverage(self, tmp_path: Path) -> None:
+        _make_cache(tmp_path, ["a", "c"])
+        present, missing = check_coverage(tmp_path, ["a", "b", "c"])
+        assert present == ["a", "c"]
+        assert missing == ["b"]
+
+
+class TestPoolPerReference:  # pool_per_reference merges per-response per_reference tallies
+    def test_sums_counts_and_keeps_elo(self) -> None:
+        responses = [
+            {"per_reference": {"a": {"wins": 2, "losses": 1, "ties": 0, "reference_elo": 1000.0}}},
+            {"per_reference": {"a": {"wins": 1, "losses": 0, "ties": 1, "reference_elo": 1000.0}}},
+        ]
+        pooled = pool_per_reference(responses)
+        assert pooled["a"]["wins"] == 3
+        assert pooled["a"]["losses"] == 1
+        assert pooled["a"]["ties"] == 1
+        assert pooled["a"]["reference_elo"] == 1000.0  # carried over, not summed
+
+    def test_handles_missing_per_reference(self) -> None:
+        assert pool_per_reference([{}, {"per_reference": None}]) == {}  # absent key and explicit None
+
+
+class TestLoaders:  # load_distribution and load_task_prompts file parsing
+    def test_load_distribution(self, tmp_path: Path) -> None:
+        p = tmp_path / "d.json"
+        p.write_text(json.dumps(_dist({"x": ["a"]})))
+        assert load_distribution(p)["x"]["task_ids"] == ["a"]
+
+    def test_load_distribution_rejects_non_object(self, tmp_path: Path) -> None:
+        p = tmp_path / "d.json"
+        p.write_text("[1,2,3]")  # valid JSON, but a list rather than an object
+        with pytest.raises(ValueError):
+            load_distribution(p)
+
+    def test_load_task_prompts_top_level(self, tmp_path: Path) -> None:
+        p = tmp_path / "b.jsonl"
+        p.write_text(json.dumps({"task_id": "a", "prompt": "do x"}) + "\n")
+        assert load_task_prompts(p) == {"a": "do x"}
+
+    def test_load_task_prompts_metadata_nested(self, tmp_path: Path) -> None:
+        p = tmp_path / "b.jsonl"
+        p.write_text(json.dumps({"responses_create_params": {"metadata": {"task_id": "a", "prompt": "y"}}}) + "\n")
+        assert load_task_prompts(p) == {"a": "y"}  # fields read from responses_create_params.metadata
+
+
+class TestEnsureDistribution:  # ensure_distribution: load an existing file or build one from a dataset
+    def test_loads_existing_file(self, tmp_path: Path) -> None:
+        p = tmp_path / "d.json"
+        p.write_text(json.dumps(_dist({"x": ["a"]})))
+        dist, path = ensure_distribution(str(p))
+        assert path == p
+        assert dist["x"]["task_ids"] == ["a"]
+
+    def test_builds_from_dataset_when_missing(self, tmp_path: Path) -> None:
+        dataset = tmp_path / "tasks.jsonl"
+        rows = [
+            {"task_id": "t1", "occupation": "Lawyer"},
+            {"task_id": "t2", "occupation": "Lawyer"},
+            {"task_id": "t3", "occupation": "Nurse"},
+        ]
+        dataset.write_text("\n".join(json.dumps(r) for r in rows) + "\n")
+        cache = tmp_path / "cache"
+
+        dist, path = ensure_distribution(None, dataset_path=str(dataset), cache_dir=str(cache))
+
+        assert path == cache / "occupation_distribution.json"  # file name used when no columns are passed
+        assert path.is_file()
+        assert dist["Lawyer"]["task_ids"] == ["t1", "t2"]
+        assert dist["Nurse"]["task_ids"] == ["t3"]
+
+    def test_writes_to_distribution_path_when_given(self, tmp_path: Path) -> None:
+        dataset = tmp_path / "tasks.jsonl"
+        dataset.write_text(json.dumps({"task_id": "t1", "occupation": "Lawyer"}) + "\n")
+        out = tmp_path / "sub" / "mydist.json"  # neither the file nor its parent dir exists yet
+
+        _dist_, path = ensure_distribution(str(out), dataset_path=str(dataset))
+
+        assert path == out
+        assert out.is_file()
+
+    def test_custom_columns_in_filename(self, tmp_path: Path) -> None:
+        dataset = tmp_path / "tasks.jsonl"
+        dataset.write_text(json.dumps({"task_id": "t1", "sector": "Legal", "occupation": "Lawyer"}) + "\n")
+        cache = tmp_path / "cache"
+        _dist_, path = ensure_distribution(
+            None, dataset_path=str(dataset), columns=["sector", "occupation"], cache_dir=str(cache)
+        )
+        assert path == cache / "sector_occupation_distribution.json"  # column names joined with "_"
+
+    def test_raises_when_no_dataset_available(self, tmp_path: Path, monkeypatch) -> None:
+        import responses_api_agents.stirrup_agent.task_distribution as td
+
+        monkeypatch.setattr(td, "DEFAULT_DATASET_CANDIDATES", (tmp_path / "missing.jsonl",))
+        with pytest.raises(FileNotFoundError):
+            ensure_distribution(None, cache_dir=str(tmp_path / "cache"))
+
+
+class TestBuildVerifyRequestBody:  # checks the top-level fields of the /verify request body
+    def test_includes_reference_ids_and_deliverables(self) -> None:
+        body = build_verify_request_body("t1", "/cache/task_t1/repeat_0", "prompt", ["a", "b"])
+        assert body["task_id"] == "t1"
+        assert body["deliverables_dir"] == "/cache/task_t1/repeat_0"
+        assert body["reference_ids"] == ["a", "b"]
+        assert body["prompt"] == "prompt"
+
+
+class TestBuildJudgeStage:  # build_judge_stage with a stubbed verify_one and a tmp deliverables cache
+    def test_judges_present_tasks_and_pools(self, tmp_path: Path) -> None:
+        _make_cache(tmp_path, ["a", "b"], repeats=("repeat_0", "repeat_1"))
+        calls = []
+
+        def fake_verify_one(task_id, deliverables_dir, prompt, reference_ids):
+            calls.append((task_id, Path(deliverables_dir).name, tuple(reference_ids)))
+            return {"per_reference": {reference_ids[0]: {"wins": 1, "losses": 0, "ties": 0, "reference_elo": 1000.0}}}
+
+        judge = build_judge_stage(fake_verify_one, tmp_path, {"a": "pa", "b": "pb"})
+        pooled = judge(["a", "b"], ["ref1"])
+        # 2 tasks x 2 repeats = 4 verify calls.
+        assert len(calls) == 4
+        assert pooled["ref1"]["wins"] == 4
+
+    def test_missing_raises_when_no_producer(self, tmp_path: Path) -> None:
+        _make_cache(tmp_path, ["a"])
+        judge = build_judge_stage(lambda *a: {}, tmp_path, {})  # produce_missing left at its default
+        with pytest.raises(FileNotFoundError):
+            judge(["a", "missing"], ["ref1"])
+
+    def test_missing_skipped_when_produce_missing_false(self, tmp_path: Path) -> None:
+        _make_cache(tmp_path, ["a"])
+
+        def fake_verify_one(task_id, deliverables_dir, prompt, reference_ids):
+            return {"per_reference": {"ref1": {"wins": 1, "losses": 0, "ties": 0, "reference_elo": 1000.0}}}
+
+        judge = build_judge_stage(fake_verify_one, tmp_path, {"a": ""}, produce_missing=False)
+        pooled = judge(["a", "missing"], ["ref1"])
+        assert pooled["ref1"]["wins"] == 1  # only the cached task "a" was judged
+
+    def test_producer_materializes_then_judges(self, tmp_path: Path) -> None:
+        _make_cache(tmp_path, ["a"])
+
+        def producer(task_ids):
+            _make_cache(tmp_path, list(task_ids))  # creates the deliverables that were reported missing
+
+        def fake_verify_one(task_id, deliverables_dir, prompt, reference_ids):
+            return {"per_reference": {"ref1": {"wins": 1, "losses": 0, "ties": 0, "reference_elo": 1000.0}}}
+
+        judge = build_judge_stage(fake_verify_one, tmp_path, {}, producer=producer)
+        pooled = judge(["a", "b"], ["ref1"])
+        assert pooled["ref1"]["wins"] == 2  # both tasks judged after production
+
+
+class TestRunMultistageElo:
+    def test_requires_eval_dir(self, tmp_path: Path) -> None:
+        cfg = MultiStageEloConfig(distribution_path="x.json", stages=[StageSpec(1)], reference_elos={"a": 1000.0})
+        with pytest.raises(ValueError):
+            run_multistage_elo(cfg, lambda *a: {}, {})
+
+    def test_end_to_end_with_fakes(self, tmp_path: Path) -> None:
+        # 30 cached tasks, 2-stage adaptive run with a fake judge.
+        task_ids = [f"t{i}" for i in range(30)]
+        _make_cache(tmp_path, task_ids)
+        dist_path = tmp_path / "dist.json"
+        dist_path.write_text(json.dumps(_dist({"x": task_ids})))
+
+        def fake_verify_one(task_id, deliverables_dir, prompt, reference_ids):
+            return {
+                "per_reference": {
+                    rid: {"wins": 7, "losses": 3, "ties": 0, "reference_elo": elo}
+                    for rid, elo in {"a": 1000.0, "b": 1200.0, "c": 1300.0, "d": 1500.0}.items()
+                    if rid in reference_ids
+                }
+            }
+
+        cfg = MultiStageEloConfig(
+            distribution_path=str(dist_path),
+            stages=[StageSpec(num_tasks=5, num_models=None), StageSpec(num_tasks=12, num_models=2)],
+            reference_elos={"a": 1000.0, "b": 1200.0, "c": 1300.0, "d": 1500.0},
+            eval_deliverables_dir=str(tmp_path),
+        )
+        results = run_multistage_elo(cfg, fake_verify_one, {t: "" for t in task_ids}, rng=random.Random(0))
+
+        assert len(results) == 2
+        assert results[0].reference_ids == ["a", "b", "c", "d"]
+        assert len(results[1].reference_ids) == 2
+        assert results[1].eval_elo is not None
+
+        summary = stage_results_to_dict(results)
+        assert summary["num_stages"] == 2
+        assert summary["final_eval_elo"] == results[1].eval_elo
+
+    def test_stage_results_to_dict_empty(self) -> None:
+        assert stage_results_to_dict([])["final_eval_elo"] is None
+
+
+class TestParseStage:
+    def test_tasks_only(self) -> None:
+        s = _parse_stage("5")
+        assert (s.num_tasks, s.num_models, s.seed) == (5, None, None)
+
+    def test_tasks_and_models(self) -> None:
+        s = _parse_stage("88:4")
+        assert (s.num_tasks, s.num_models, s.seed) == (88, 4, None)
+
+    def test_all_models_keyword_and_seed(self) -> None:
+        s = _parse_stage("5:all:7")
+        assert (s.num_tasks, s.num_models, s.seed) == (5, None, 7)
+
+    @pytest.mark.parametrize("bad", ["", "x", "5:y", "5:4:z"])
+    def test_invalid(self, bad: str) -> None:
+        import argparse
+
+        with pytest.raises(argparse.ArgumentTypeError):
+            _parse_stage(bad)
+
+
+class TestLoadReferenceElos:
+    def test_inline_json(self) -> None:
+        assert _load_reference_elos('{"a": 1500, "b": 1200}') == {"a": 1500.0, "b": 1200.0}
+
+    def test_from_file(self, tmp_path: Path) -> None:
+        f = tmp_path / "refs.json"
+        f.write_text(json.dumps({"a": 1000}))
+        assert _load_reference_elos(f"@{f}") == {"a": 1000.0}
+
+    @pytest.mark.parametrize("bad", ["[]", "{}", '"x"'])
+    def test_invalid(self, bad: str) -> None:
+        import argparse
+
+        with pytest.raises(argparse.ArgumentTypeError):
+            _load_reference_elos(bad)
+
+
+class TestCliMain:
+    def _setup(self, tmp_path: Path):
+        _make_cache(tmp_path, ["a", "b"])
+        prompts = tmp_path / "bench.jsonl"
+        prompts.write_text(json.dumps({"task_id": "a", "prompt": "p"}) + "\n")
+        refs = tmp_path / "refs.json"
+        refs.write_text(json.dumps({"a": 1000.0, "b": 1200.0}))
+        return prompts, refs
+
+    def test_main_writes_summary(self, tmp_path: Path, monkeypatch, capsys) -> None:
+        prompts, refs = self._setup(tmp_path)
+        captured = {}
+
+        def fake_run(config, verify_one, task_prompts, *, rng=None, producer=None):
+            captured["config"] = config
+            captured["rng"] = rng
+            return [
+                StageResult(
+                    stage_index=0,
+                    task_ids=["a"],
+                    reference_ids=["a", "b"],
+                    per_reference={},
+                    eval_elo=1234.0,
+                    normalized_elo=0.5,
+                    num_references=2,
+                )
+            ]
+
+        monkeypatch.setattr(driver, "run_multistage_elo", fake_run)
+        out = tmp_path / "summary.json"
+        rc = main(
+            [
+                "--server-url",
+                "http://localhost:9999",
+                "--eval-deliverables-dir",
+                str(tmp_path),
+                "--reference-elos",
+                f"@{refs}",
+                "--stage",
+                "5",
+                "--stage",
+                "12:1",
+                "--task-prompts",
+                str(prompts),
+                "--nested-tasks",
+                "--skip-missing",
+                "--seed",
+                "3",
+                "--output",
+                str(out),
+            ]
+        )
+        assert rc == 0
+        summary = json.loads(out.read_text())
+        assert summary["final_eval_elo"] == 1234.0
+        cfg = captured["config"]
+        assert [s.num_tasks for s in cfg.stages] == [5, 12]
+        assert cfg.stages[1].num_models == 1
+        assert cfg.nested_tasks is True
+        assert cfg.produce_missing is False
+        assert cfg.reference_elos == {"a": 1000.0, "b": 1200.0}
+        assert isinstance(captured["rng"], random.Random)
+
+    def test_main_to_stdout(self, tmp_path: Path, monkeypatch, capsys) -> None:
+        prompts, refs = self._setup(tmp_path)
+        monkeypatch.setattr(driver, "run_multistage_elo", lambda *a, **k: [])
+        rc = main(
+            [
+                "--server-url",
+                "http://localhost:9999",
+                "--eval-deliverables-dir",
+                str(tmp_path),
+                "--reference-elos",
+                f"@{refs}",
+                "--stage",
+                "5",
+                "--task-prompts",
+                str(prompts),
+            ]
+        )
+        assert rc == 0
+        assert json.loads(capsys.readouterr().out)["num_stages"] == 0
+
+    def test_main_missing_eval_dir(self, tmp_path: Path, capsys) -> None:
+        _, refs = self._setup(tmp_path)
+        rc = main(
+            [
+                "--server-url",
+                "http://x",
+                "--eval-deliverables-dir",
+                str(tmp_path / "nope"),
+                "--reference-elos",
+                f"@{refs}",
+                "--stage",
+                "5",
+            ]
+        )
+        assert rc == 2
+        assert "not found" in capsys.readouterr().err.lower()
+
+    def test_main_missing_prompts(self, tmp_path: Path, capsys) -> None:
+        _, refs = self._setup(tmp_path)
+        rc = main(
+            [
+                "--server-url",
+                "http://x",
+                "--eval-deliverables-dir",
+                str(tmp_path),
+                "--reference-elos",
+                f"@{refs}",
+                "--stage",
+                "5",
+                "--task-prompts",
+                str(tmp_path / "nope.jsonl"),
+            ]
+        )
+        assert rc == 2
+        assert "not found" in capsys.readouterr().err.lower()

From af7f34411b12678a1552d23778a0c1f99362ffef Mon Sep 17 00:00:00 2001
From: Virginia Wu <vadams@nvidia.com>
Date: Fri, 26 Jun 2026 17:06:27 -0700
Subject: [PATCH 6/8] multistage-elo E2E smoke test works

Signed-off-by: Virginia Wu <vadams@nvidia.com>
---
 multistage_refs.json                          |  7 ++
 .../configs/gdpval_comparison_smoketest.yaml  | 56 ++++++++++++
 resources_servers/gdpval/multistage_elo.py    | 28 ++++++
 .../gdpval/multistage_elo_driver.py           | 85 +++++++++++++++++--
 .../gdpval/tests/test_multistage_elo.py       | 21 +++++
 .../tests/test_multistage_elo_driver.py       | 17 +++-
 6 files changed, 207 insertions(+), 7 deletions(-)
 create mode 100644 multistage_refs.json
 create mode 100644 resources_servers/gdpval/configs/gdpval_comparison_smoketest.yaml

diff --git a/multistage_refs.json b/multistage_refs.json
new file mode 100644
index 000000000..e6e383192
--- /dev/null
+++ b/multistage_refs.json
@@ -0,0 +1,7 @@
+{
+  "glm51": 1259,
+  "minimax_m27": 1165,
+  "nemotron3_ultra_ga": 1168,
+  "kimi_k25": 1000,
+  "qwen35_397b": 956
+}
diff --git a/resources_servers/gdpval/configs/gdpval_comparison_smoketest.yaml b/resources_servers/gdpval/configs/gdpval_comparison_smoketest.yaml
new file mode 100644
index 000000000..4d2e2a0e6
--- /dev/null
+++ b/resources_servers/gdpval/configs/gdpval_comparison_smoketest.yaml
@@ -0,0 +1,56 @@
+# Scratch config for an end-to-end smoke test of the multi-stage ELO driver.
+#
+# Starts ONLY the two servers the driver needs: the GDPVal resources server in
+# comparison mode (with the five reference models baked in) and the Gemini judge
+# it calls. The Stirrup agent / policy model are intentionally omitted because
+# the driver judges *cached* deliverables and never runs the agent.
+#
+# Usage:
+#   export JUDGE_API_KEY=<your-key>
+#   gym env start "+config_paths=[resources_servers/gdpval/configs/gdpval_comparison_smoketest.yaml]"
+#
+# Delete this file when done; it is a local scratch config, not a committed env.
+
+# Gemini 3.1 Pro judge (proxied via the NVIDIA inference API).
+gdpval_judge_model:
+  responses_api_models:
+    openai_model:
+      entrypoint: app.py
+      openai_base_url: ${oc.env:JUDGE_BASE_URL,https://inference-api.nvidia.com/v1}
+      openai_api_key: ${oc.env:JUDGE_API_KEY,dummy}
+      openai_model: ${oc.env:JUDGE_MODEL_NAME,gcp/google/gemini-3.1-pro-preview}
+      max_concurrent_requests: 4
+
+# GDPVal resources server in multi-reference comparison mode.
+gdpval_resources_server:
+  resources_servers:
+    gdpval:
+      entrypoint: app.py
+      domain: other
+      verified: false
+      reward_mode: comparison
+      # Text deliverables (soap_note.txt/.md) need no office->pdf conversion, so
+      # disable preconvert to avoid the libreoffice dependency for the smoke test.
+      preconvert_office_to_pdf: false
+      num_comparison_trials: 1
+      reference_models:
+        glm51:
+          deliverables_dir: /lustre/fsw/portfolios/llmservice/users/vadams/Stirrup/output/gdpval/GLM5.1-GDPval-Reference-Outputs
+          elo: 1259
+        minimax_m27:
+          deliverables_dir: /lustre/fsw/portfolios/llmservice/users/agronskiy/nemo-evaluator-rundirs/ultra-v3/MiniMaxAI/MiniMax-M2.7/20260516_133717-5cf449e4fc2a6ffd/nemo_gym.0/artifacts/gdpval/deliverables_cache
+          elo: 1165
+        nemotron3_ultra_ga:
+          deliverables_dir: /lustre/fsw/portfolios/llmservice/users/vadams/Stirrup/output/gdpval/Nemotron-3-Ultra-GA-checkpoint-no-interleaved-reasoning-05-29-2026
+          elo: 1168
+        kimi_k25:
+          deliverables_dir: /lustre/fsw/portfolios/llmservice/users/vadams/Stirrup/output/gdpval/Kimi-K2.5-Thinking
+          elo: 1000
+        qwen35_397b:
+          deliverables_dir: /lustre/fsw/portfolios/llmservice/users/vadams/Stirrup/output/gdpval/Qwen3.5-397B
+          elo: 956
+      judge_model_server:
+        type: responses_api_models
+        name: gdpval_judge_model
+      judge_responses_create_params_overrides:
+        model: gcp/google/gemini-3.1-pro-preview
diff --git a/resources_servers/gdpval/multistage_elo.py b/resources_servers/gdpval/multistage_elo.py
index a8f4fccc1..ceaf28b48 100644
--- a/resources_servers/gdpval/multistage_elo.py
+++ b/resources_servers/gdpval/multistage_elo.py
@@ -268,11 +268,20 @@ def __init__(
         judge_stage: JudgeStageFn,
         *,
         rng: Optional[random.Random] = None,
+        on_event: Optional[Callable[[str, dict], None]] = None,
     ) -> None:
         self.config = config
         self.distribution = distribution
         self.judge_stage = judge_stage
         self.rng = rng or random.Random()
+        # Optional progress hook. Called as ``on_event(name, data)`` for the
+        # events "planned", "stage_start", and "stage_end". Kept as a callback so
+        # this module performs no I/O itself; the driver/CLI does the printing.
+        self.on_event = on_event
+
+    def _emit(self, name: str, **data: object) -> None:
+        if self.on_event is not None:
+            self.on_event(name, data)
 
     def run(self) -> List[StageResult]:
         stage_task_sets = plan_stage_task_ids(
@@ -281,16 +290,35 @@ def run(self) -> List[StageResult]:
             rng=self.rng,
             nested=self.config.nested_tasks,
         )
+        total_stages = len(self.config.stages)
+        self._emit("planned", stage_task_counts=[len(s) for s in stage_task_sets], total_stages=total_stages)
 
         results: List[StageResult] = []
         eval_elo: Optional[float] = None
         for index, stage in enumerate(self.config.stages):
             reference_ids = select_references(self.config.reference_elos, eval_elo, stage.num_models)
             task_ids = stage_task_sets[index]
+            self._emit(
+                "stage_start",
+                index=index,
+                total_stages=total_stages,
+                reference_ids=list(reference_ids),
+                num_tasks=len(task_ids),
+                prior_elo=eval_elo,
+            )
             per_reference = self.judge_stage(task_ids, reference_ids)
             stage_elo, normalized, num_references = fit_stage_elo(per_reference, self.config.reference_elos)
             if stage_elo is not None:
                 eval_elo = stage_elo
+            self._emit(
+                "stage_end",
+                index=index,
+                total_stages=total_stages,
+                eval_elo=stage_elo,
+                normalized_elo=normalized,
+                num_references=num_references,
+                per_reference=dict(per_reference),
+            )
             results.append(
                 StageResult(
                     stage_index=index,
diff --git a/resources_servers/gdpval/multistage_elo_driver.py b/resources_servers/gdpval/multistage_elo_driver.py
index 310c4d1d4..48a741857 100644
--- a/resources_servers/gdpval/multistage_elo_driver.py
+++ b/resources_servers/gdpval/multistage_elo_driver.py
@@ -44,7 +44,9 @@
         --output elo_summary.json
 
 where ``refs.json`` is ``{"<ref_id>": <elo>, ...}`` with ids matching the
-server's configured ``reference_models``. See ``--help`` for all flags.
+server's configured ``reference_models``. Each stage has a set number of 
+tasks and reference models set like ``--stage num_tasks:num_models``. 
+See ``--help`` for all flags.
 """
 
 from __future__ import annotations
@@ -243,6 +245,7 @@ def build_judge_stage(
     *,
     produce_missing: bool = True,
     producer: Optional[ProducerFn] = None,
+    progress: Optional[Callable[[int, int, str], None]] = None,
 ):
     """Build the ``judge_stage`` callable expected by ``MultiStageEloRunner``.
 
@@ -251,6 +254,9 @@ def build_judge_stage(
     per-reference votes. Missing tasks are produced via ``producer`` when given;
     otherwise ``produce_missing=True`` raises an actionable error and
     ``produce_missing=False`` drops them with a warning.
+
+    ``progress`` is an optional callback invoked as ``progress(done, total,
+    task_id)`` after each ``verify_one`` completes, for live status reporting.
     """
 
     def judge_stage(task_ids: Sequence[str], reference_ids: Sequence[str]) -> PerReferenceTotals:
@@ -272,11 +278,16 @@ def judge_stage(task_ids: Sequence[str], reference_ids: Sequence[str]) -> PerRef
                     flush=True,
                 )
 
+        # Flatten to (task_id, repeat_dir) units up front so progress can report
+        # an accurate done/total across all repeats in the stage.
+        units = [(tid, repeat_dir) for tid in present for repeat_dir in task_repeat_dirs(eval_deliverables_dir, tid)]
+        total = len(units)
         responses: List[Dict[str, Any]] = []
-        for task_id in present:
+        for done, (task_id, repeat_dir) in enumerate(units, start=1):
             prompt = task_prompts.get(task_id, "")
-            for repeat_dir in task_repeat_dirs(eval_deliverables_dir, task_id):
-                responses.append(verify_one(task_id, str(repeat_dir), prompt, list(reference_ids)))
+            responses.append(verify_one(task_id, str(repeat_dir), prompt, list(reference_ids)))
+            if progress is not None:
+                progress(done, total, task_id)
         return pool_per_reference(responses)
 
     return judge_stage
@@ -353,11 +364,17 @@ def run_multistage_elo(
     *,
     rng=None,
     producer: Optional[ProducerFn] = None,
+    on_event: Optional[Callable[[str, dict], None]] = None,
+    progress: Optional[Callable[[int, int, str], None]] = None,
 ) -> List[StageResult]:
     """Run the full multi-stage ELO procedure and return per-stage results.
 
     ``config.eval_deliverables_dir`` must be set — it is the source of the eval
     model's (cached or produced) deliverables.
+
+    ``on_event``/``progress`` are optional callbacks for live status reporting:
+    ``on_event`` receives stage-level events (see ``MultiStageEloRunner``) and
+    ``progress`` receives per-(task, repeat) judging progress.
     """
     if not config.eval_deliverables_dir:
         raise ValueError("config.eval_deliverables_dir must be set (source of eval deliverables).")
@@ -373,8 +390,9 @@ def run_multistage_elo(
         task_prompts,
         produce_missing=config.produce_missing,
         producer=producer,
+        progress=progress,
     )
-    runner = MultiStageEloRunner(config, distribution, judge_stage, rng=rng)
+    runner = MultiStageEloRunner(config, distribution, judge_stage, rng=rng, on_event=on_event)
     return runner.run()
 
 
@@ -556,6 +574,12 @@ def _build_arg_parser() -> argparse.ArgumentParser:
         default=None,
         help="Top-level RNG seed for reproducible task sampling and reference selection.",
     )
+    parser.add_argument(
+        "--quiet",
+        "-q",
+        action="store_true",
+        help="Suppress live per-stage / per-task progress output on stderr.",
+    )
     parser.add_argument(
         "--output",
         "-o",
@@ -565,6 +589,54 @@ def _build_arg_parser() -> argparse.ArgumentParser:
     return parser
 
 
+def _make_progress_printers():
+    """Return ``(on_event, progress)`` callbacks that print human-readable status to stderr.
+
+    ``on_event`` prints a banner at the start/end of each stage (selected
+    references, task count, fitted ELO); ``progress`` prints a per-(task, repeat)
+    counter as each ``/verify`` completes.
+    """
+
+    def on_event(name: str, data: dict) -> None:
+        if name == "planned":
+            counts = data.get("stage_task_counts", [])
+            print(
+                f"[multistage-elo] planned {data.get('total_stages')} stage(s); tasks per stage: {counts}",
+                file=sys.stderr,
+                flush=True,
+            )
+        elif name == "stage_start":
+            idx = int(data["index"]) + 1
+            total = data["total_stages"]
+            refs = data.get("reference_ids", [])
+            prior = data.get("prior_elo")
+            prior_str = f"{prior:.1f}" if isinstance(prior, (int, float)) else "n/a"
+            print(
+                f"[multistage-elo] stage {idx}/{total}: {data.get('num_tasks')} task(s) "
+                f"vs {len(refs)} ref(s) {refs} (prior ELO: {prior_str})",
+                file=sys.stderr,
+                flush=True,
+            )
+        elif name == "stage_end":
+            idx = int(data["index"]) + 1
+            total = data["total_stages"]
+            elo = data.get("eval_elo")
+            elo_str = f"{elo:.1f}" if isinstance(elo, (int, float)) else "unset (no games)"
+            print(
+                f"[multistage-elo] stage {idx}/{total} done: eval ELO = {elo_str} "
+                f"(fit over {data.get('num_references')} ref(s))",
+                file=sys.stderr,
+                flush=True,
+            )
+
+    def progress(done: int, total: int, task_id: str) -> None:
+        short = task_id[:18] + "…" if len(task_id) > 19 else task_id
+        end = "\n" if done == total else "\r"
+        print(f"[multistage-elo]   judged {done}/{total} (task {short})   ", end=end, file=sys.stderr, flush=True)
+
+    return on_event, progress
+
+
 def main(argv: Optional[Sequence[str]] = None) -> int:
     parser = _build_arg_parser()
     args = parser.parse_args(argv)
@@ -594,7 +666,8 @@ def main(argv: Optional[Sequence[str]] = None) -> int:
     task_prompts = load_task_prompts(prompts_path)
     rng = random.Random(args.seed) if args.seed is not None else None
 
-    results = run_multistage_elo(config, verify_one, task_prompts, rng=rng)
+    on_event, progress = (None, None) if args.quiet else _make_progress_printers()
+    results = run_multistage_elo(config, verify_one, task_prompts, rng=rng, on_event=on_event, progress=progress)
     payload = json.dumps(stage_results_to_dict(results), indent=2, ensure_ascii=False)
 
     if args.output:
diff --git a/resources_servers/gdpval/tests/test_multistage_elo.py b/resources_servers/gdpval/tests/test_multistage_elo.py
index 4633cdda7..d0b091321 100644
--- a/resources_servers/gdpval/tests/test_multistage_elo.py
+++ b/resources_servers/gdpval/tests/test_multistage_elo.py
@@ -183,3 +183,24 @@ def judge_stage(task_ids, reference_ids):
         results = MultiStageEloRunner(cfg, dist, judge_stage, rng=random.Random(0)).run()
         assert results[0].eval_elo is None
         assert results[0].num_references == 0
+
+    def test_on_event_emits_lifecycle_events(self) -> None:
+        dist = _dist({"x": [f"x{i}" for i in range(10)]})
+
+        def judge_stage(task_ids, reference_ids):
+            return {rid: {"wins": 6, "losses": 4, "ties": 0} for rid in reference_ids}
+
+        events = []
+        cfg = self._config(stages=[StageSpec(num_tasks=2, num_models=None), StageSpec(num_tasks=3, num_models=2)])
+        MultiStageEloRunner(
+            cfg, dist, judge_stage, rng=random.Random(0), on_event=lambda name, data: events.append((name, data))
+        ).run()
+
+        names = [n for n, _ in events]
+        assert names[0] == "planned"
+        assert names.count("stage_start") == 2
+        assert names.count("stage_end") == 2
+        # stage_start carries the selected references and task count.
+        first_start = next(d for n, d in events if n == "stage_start")
+        assert first_start["num_tasks"] == 2
+        assert first_start["reference_ids"] == ["a", "b", "c", "d"]
diff --git a/resources_servers/gdpval/tests/test_multistage_elo_driver.py b/resources_servers/gdpval/tests/test_multistage_elo_driver.py
index 0cce37910..09d5d80e3 100644
--- a/resources_servers/gdpval/tests/test_multistage_elo_driver.py
+++ b/resources_servers/gdpval/tests/test_multistage_elo_driver.py
@@ -227,6 +227,21 @@ def fake_verify_one(task_id, deliverables_dir, prompt, reference_ids):
         pooled = judge(["a", "b"], ["ref1"])
         assert pooled["ref1"]["wins"] == 2  # both tasks judged after production
 
+    def test_progress_callback_reports_each_unit(self, tmp_path: Path) -> None:
+        _make_cache(tmp_path, ["a", "b"], repeats=("repeat_0", "repeat_1"))
+
+        def fake_verify_one(task_id, deliverables_dir, prompt, reference_ids):
+            return {"per_reference": {"ref1": {"wins": 1, "losses": 0, "ties": 0, "reference_elo": 1000.0}}}
+
+        seen = []
+        judge = build_judge_stage(
+            fake_verify_one, tmp_path, {}, progress=lambda done, total, tid: seen.append((done, total, tid))
+        )
+        judge(["a", "b"], ["ref1"])
+        # 2 tasks x 2 repeats = 4 units; progress reports running done/total.
+        assert [s[0] for s in seen] == [1, 2, 3, 4]
+        assert all(s[1] == 4 for s in seen)
+
 
 class TestRunMultistageElo:
     def test_requires_eval_dir(self, tmp_path: Path) -> None:
@@ -322,7 +337,7 @@ def test_main_writes_summary(self, tmp_path: Path, monkeypatch, capsys) -> None:
         prompts, refs = self._setup(tmp_path)
         captured = {}
 
-        def fake_run(config, verify_one, task_prompts, *, rng=None, producer=None):
+        def fake_run(config, verify_one, task_prompts, *, rng=None, producer=None, on_event=None, progress=None):
             captured["config"] = config
             captured["rng"] = rng
             return [

From b11f4462e40ee0feea109046ab385f97920aa28a Mon Sep 17 00:00:00 2001
From: Virginia Wu <vadams@nvidia.com>
Date: Fri, 26 Jun 2026 17:29:44 -0700
Subject: [PATCH 7/8] trimmed white space

Signed-off-by: Virginia Wu <vadams@nvidia.com>
---
 resources_servers/gdpval/multistage_elo_driver.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/resources_servers/gdpval/multistage_elo_driver.py b/resources_servers/gdpval/multistage_elo_driver.py
index 48a741857..6d39c20fb 100644
--- a/resources_servers/gdpval/multistage_elo_driver.py
+++ b/resources_servers/gdpval/multistage_elo_driver.py
@@ -44,8 +44,8 @@
         --output elo_summary.json
 
 where ``refs.json`` is ``{"<ref_id>": <elo>, ...}`` with ids matching the
-server's configured ``reference_models``. Each stage has a set number of 
-tasks and reference models set like ``--stage num_tasks:num_models``. 
+server's configured ``reference_models``. Each ``--stage`` flag sets one stage's
+task count and number of reference models, as ``--stage num_tasks:num_models``.
 See ``--help`` for all flags.
 """
 

From ce40b2f8b70f18d80c9d808bef5554bf07fd9bff Mon Sep 17 00:00:00 2001
From: Virginia Wu <vadams@nvidia.com>
Date: Fri, 26 Jun 2026 17:33:51 -0700
Subject: [PATCH 8/8] removed smoke test yaml

Signed-off-by: Virginia Wu <vadams@nvidia.com>
---
 .../configs/gdpval_comparison_smoketest.yaml  | 56 -------------------
 1 file changed, 56 deletions(-)
 delete mode 100644 resources_servers/gdpval/configs/gdpval_comparison_smoketest.yaml

diff --git a/resources_servers/gdpval/configs/gdpval_comparison_smoketest.yaml b/resources_servers/gdpval/configs/gdpval_comparison_smoketest.yaml
deleted file mode 100644
index 4d2e2a0e6..000000000
--- a/resources_servers/gdpval/configs/gdpval_comparison_smoketest.yaml
+++ /dev/null
@@ -1,56 +0,0 @@
-# Scratch config for an end-to-end smoke test of the multi-stage ELO driver.
-#
-# Starts ONLY the two servers the driver needs: the GDPVal resources server in
-# comparison mode (with the five reference models baked in) and the Gemini judge
-# it calls. The Stirrup agent / policy model are intentionally omitted because
-# the driver judges *cached* deliverables and never runs the agent.
-#
-# Usage:
-#   export JUDGE_API_KEY=<your-key>
-#   gym env start "+config_paths=[resources_servers/gdpval/configs/gdpval_comparison_smoketest.yaml]"
-#
-# Delete this file when done; it is a local scratch config, not a committed env.
-
-# Gemini 3.1 Pro judge (proxied via the NVIDIA inference API).
-gdpval_judge_model:
-  responses_api_models:
-    openai_model:
-      entrypoint: app.py
-      openai_base_url: ${oc.env:JUDGE_BASE_URL,https://inference-api.nvidia.com/v1}
-      openai_api_key: ${oc.env:JUDGE_API_KEY,dummy}
-      openai_model: ${oc.env:JUDGE_MODEL_NAME,gcp/google/gemini-3.1-pro-preview}
-      max_concurrent_requests: 4
-
-# GDPVal resources server in multi-reference comparison mode.
-gdpval_resources_server:
-  resources_servers:
-    gdpval:
-      entrypoint: app.py
-      domain: other
-      verified: false
-      reward_mode: comparison
-      # Text deliverables (soap_note.txt/.md) need no office->pdf conversion, so
-      # disable preconvert to avoid the libreoffice dependency for the smoke test.
-      preconvert_office_to_pdf: false
-      num_comparison_trials: 1
-      reference_models:
-        glm51:
-          deliverables_dir: /lustre/fsw/portfolios/llmservice/users/vadams/Stirrup/output/gdpval/GLM5.1-GDPval-Reference-Outputs
-          elo: 1259
-        minimax_m27:
-          deliverables_dir: /lustre/fsw/portfolios/llmservice/users/agronskiy/nemo-evaluator-rundirs/ultra-v3/MiniMaxAI/MiniMax-M2.7/20260516_133717-5cf449e4fc2a6ffd/nemo_gym.0/artifacts/gdpval/deliverables_cache
-          elo: 1165
-        nemotron3_ultra_ga:
-          deliverables_dir: /lustre/fsw/portfolios/llmservice/users/vadams/Stirrup/output/gdpval/Nemotron-3-Ultra-GA-checkpoint-no-interleaved-reasoning-05-29-2026
-          elo: 1168
-        kimi_k25:
-          deliverables_dir: /lustre/fsw/portfolios/llmservice/users/vadams/Stirrup/output/gdpval/Kimi-K2.5-Thinking
-          elo: 1000
-        qwen35_397b:
-          deliverables_dir: /lustre/fsw/portfolios/llmservice/users/vadams/Stirrup/output/gdpval/Qwen3.5-397B
-          elo: 956
-      judge_model_server:
-        type: responses_api_models
-        name: gdpval_judge_model
-      judge_responses_create_params_overrides:
-        model: gcp/google/gemini-3.1-pro-preview