From 996d9e5d1de9b164219ec88d743d33240fd1113e Mon Sep 17 00:00:00 2001 From: Virginia Wu Date: Thu, 25 Jun 2026 17:06:58 -0700 Subject: [PATCH 1/8] added task distribution calculation code for gdpval and other datasets Signed-off-by: Virginia Wu --- .../stirrup_agent/task_distribution.py | 406 ++++++++++++++++++ .../tests/test_task_distribution.py | 269 ++++++++++++ 2 files changed, 675 insertions(+) create mode 100644 responses_api_agents/stirrup_agent/task_distribution.py create mode 100644 responses_api_agents/stirrup_agent/tests/test_task_distribution.py diff --git a/responses_api_agents/stirrup_agent/task_distribution.py b/responses_api_agents/stirrup_agent/task_distribution.py new file mode 100644 index 000000000..20f41fb45 --- /dev/null +++ b/responses_api_agents/stirrup_agent/task_distribution.py @@ -0,0 +1,406 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Build a task distribution over one or more dataset columns. + +A *distribution* groups every task in a dataset by the value(s) of one or +more metadata columns (e.g. 
``sector``, or ``sector`` + ``occupation``) and +records, for each group, the fraction of the dataset it covers and the list +of ``task_id``s that fall into it:: + + { + "Business, Finance & Operations": {"percentage": 0.05, "task_ids": ["a", "b"]}, + "Legal": {"percentage": 0.50, "task_ids": [...]}, + "Healthcare": {"percentage": 0.45, "task_ids": [...]} + } + +Datasets are the NeMo Gym Responses-API JSONL format: one task per line, with +the groupable columns living under ``responses_create_params.metadata``. + +The grouping logic is intentionally separated from the CLI so the resulting +distribution can later be reused to *sample* ``task_id``s (see +``sample_task_ids``). + +Usage:: + + # --dataset defaults to the prepared GDPVal dataset + # (benchmarks/gdpval/data/gdpval_benchmark.jsonl) when omitted. + python -m responses_api_agents.stirrup_agent.task_distribution \ + --column sector \ + --output sector_distribution.json + + # Composite key over multiple columns, explicit dataset: + python -m responses_api_agents.stirrup_agent.task_distribution \ + --dataset data/gdpval.jsonl --column sector --column occupation \ + --output sector_occupation_distribution.json +""" + +from __future__ import annotations + +import argparse +import json +import random +import sys +from pathlib import Path +from typing import Any, Dict, Iterable, Iterator, List, Mapping, Optional, Sequence + + +# Sentinel used when a row is missing one of the requested columns. +MISSING_VALUE = "" + +# Separator joining multiple column values into a single composite key. +DEFAULT_KEY_SEPARATOR = " | " + +# Repo root: this file is responses_api_agents/stirrup_agent/task_distribution.py. +_REPO_ROOT = Path(__file__).resolve().parents[2] + +# Candidate GDPVal dataset locations, in priority order. The first that exists +# is used when ``--dataset`` is not given. 
The prepared benchmark JSONL (written +# by ``gym eval prepare --benchmark gdpval``) is preferred; the agent-local +# ``data/gdpval.jsonl`` (written by setup_scripts/gdpval.sh) is a fallback. +# The synthetic ``example.jsonl`` is intentionally *not* a default so the +# command never silently computes a distribution over a single fake task. +DEFAULT_DATASET_CANDIDATES = ( + _REPO_ROOT / "benchmarks" / "gdpval" / "data" / "gdpval_benchmark.jsonl", + Path(__file__).resolve().parent / "data" / "gdpval.jsonl", +) + + +def resolve_default_dataset( + candidates: Optional[Sequence[Path]] = None, +) -> Optional[Path]: + """Return the first existing default GDPVal dataset, or ``None``. + + Used when the caller does not pass an explicit ``--dataset``; prefers the + prepared benchmark JSONL and falls back to agent-local datasets. + """ + if candidates is None: + candidates = DEFAULT_DATASET_CANDIDATES + for candidate in candidates: + if candidate.is_file(): + return candidate + return None + + +def _no_dataset_message() -> str: + """Actionable error shown when no dataset is specified and no default exists.""" + searched = "".join(f" - {c}\n" for c in DEFAULT_DATASET_CANDIDATES) + return ( + "No dataset specified and no default GDPVal dataset was found.\n" + f"\nSearched these default locations:\n{searched}" + "\nTo fix this, do one of the following:\n" + "\n 1. Prepare the GDPVal benchmark dataset (recommended). This downloads\n" + " the openai/gdpval dataset from HuggingFace and writes\n" + " benchmarks/gdpval/data/gdpval_benchmark.jsonl (220 tasks):\n" + "\n export HF_TOKEN=\n" + " gym eval prepare --benchmark gdpval\n" + "\n (or run: bash responses_api_agents/stirrup_agent/setup_scripts/gdpval.sh)\n" + "\n 2. 
Pass an explicit dataset path with --dataset .\n" + "\nNote: the GDPVal dataset is gated on HuggingFace, so HF_TOKEN must be set\n" + "and your account must have access to https://huggingface.co/datasets/openai/gdpval.\n" + ) + + +def iter_dataset_rows(dataset_path: str | Path) -> Iterator[Dict[str, Any]]: + """Yield parsed JSON objects from a Responses-API JSONL dataset. + + Blank lines are skipped; malformed lines raise ``ValueError`` with the + 1-based line number so the offending row is easy to find. + """ + path = Path(dataset_path) + with path.open("r", encoding="utf-8") as handle: + for line_no, line in enumerate(handle, start=1): + stripped = line.strip() + if not stripped: + continue + try: + yield json.loads(stripped) + except json.JSONDecodeError as exc: + raise ValueError(f"{path}:{line_no}: invalid JSON line: {exc}") from exc + + +def extract_metadata(row: Mapping[str, Any]) -> Dict[str, Any]: + """Return the ``responses_create_params.metadata`` dict for a row. + + Falls back to a top-level ``metadata`` key (and finally the row itself) + so the function also works on flatter dataset variants. + """ + params = row.get("responses_create_params") + if isinstance(params, Mapping): + metadata = params.get("metadata") + if isinstance(metadata, Mapping): + return dict(metadata) + metadata = row.get("metadata") + if isinstance(metadata, Mapping): + return dict(metadata) + return dict(row) + + +def compose_key( + metadata: Mapping[str, Any], + columns: Sequence[str], + *, + separator: str = DEFAULT_KEY_SEPARATOR, + missing_value: str = MISSING_VALUE, +) -> str: + """Build the distribution key for a row from one or more columns. + + Each column value is stringified; missing values become ``missing_value``. + Multiple columns are joined with ``separator`` into a composite key. 
+ """ + parts: List[str] = [] + for column in columns: + value = metadata.get(column, None) + if value is None: + parts.append(missing_value) + else: + parts.append(str(value)) + return separator.join(parts) + + +def build_distribution( + rows: Iterable[Mapping[str, Any]], + columns: Sequence[str], + *, + task_id_column: str = "task_id", + separator: str = DEFAULT_KEY_SEPARATOR, + missing_value: str = MISSING_VALUE, + precision: Optional[int] = 6, +) -> Dict[str, Dict[str, Any]]: + """Compute the task distribution across ``columns``. + + Returns a mapping ``key -> {"percentage": float, "task_ids": [...]}`` where + ``percentage`` is the fraction (0..1) of all tasks that share that key and + ``task_ids`` lists every matching task in first-seen order. The mapping is + ordered by descending ``percentage`` (ties broken by key) for readability. + + ``percentage`` values are rounded to ``precision`` decimal places when + ``precision`` is not ``None``. Note that rounding can make the percentages + sum to slightly more or less than 1.0; the unrounded fractions always sum + to 1.0. + """ + if not columns: + raise ValueError("At least one column is required to build a distribution.") + + grouped: Dict[str, List[str]] = {} + total = 0 + for index, row in enumerate(rows): + metadata = extract_metadata(row) + key = compose_key(metadata, columns, separator=separator, missing_value=missing_value) + task_id = metadata.get(task_id_column) + # Fall back to a positional id so every task is still counted/listed + # even when the dataset lacks an explicit task-id column. 
+ task_id_str = str(task_id) if task_id is not None else f"{task_id_column}_index_{index}" + grouped.setdefault(key, []).append(task_id_str) + total += 1 + + distribution: Dict[str, Dict[str, Any]] = {} + for key, task_ids in grouped.items(): + fraction = (len(task_ids) / total) if total else 0.0 + percentage = round(fraction, precision) if precision is not None else fraction + distribution[key] = {"percentage": percentage, "task_ids": task_ids} + + # Sort by descending share, then by key for stable, readable output. + ordered = dict( + sorted( + distribution.items(), + key=lambda item: (-len(item[1]["task_ids"]), item[0]), + ) + ) + return ordered + + +def build_distribution_from_dataset( + dataset_path: str | Path, + columns: Sequence[str], + *, + task_id_column: str = "task_id", + separator: str = DEFAULT_KEY_SEPARATOR, + missing_value: str = MISSING_VALUE, + precision: Optional[int] = 6, +) -> Dict[str, Dict[str, Any]]: + """Convenience wrapper: read a JSONL dataset and build its distribution.""" + return build_distribution( + iter_dataset_rows(dataset_path), + columns, + task_id_column=task_id_column, + separator=separator, + missing_value=missing_value, + precision=precision, + ) + + +def sample_task_ids( + distribution: Mapping[str, Mapping[str, Any]], + n: int, + *, + rng: Optional[random.Random] = None, + replace: bool = False, +) -> List[str]: + """Sample ``n`` ``task_id``s in proportion to a distribution's percentages. + + Each task id is drawn by first choosing a group weighted by its + ``percentage`` and then choosing a task id within that group. With + ``replace=False`` (default) the same task id is never returned twice and + ``n`` is capped at the total number of available task ids. + + This is the consumption-side counterpart to ``build_distribution`` and is + provided so the saved distribution file can directly drive task sampling. 
+ """ + if n <= 0: + return [] + rng = rng or random.Random() + + keys = list(distribution.keys()) + weights = [float(distribution[key].get("percentage", 0.0)) for key in keys] + if not keys or sum(weights) <= 0: + return [] + + if replace: + sampled: List[str] = [] + for _ in range(n): + (chosen_key,) = rng.choices(keys, weights=weights, k=1) + task_ids = list(distribution[chosen_key].get("task_ids", [])) + if not task_ids: + continue + sampled.append(rng.choice(task_ids)) + return sampled + + # Without replacement: track remaining ids per group and renormalise. + remaining: Dict[str, List[str]] = {key: list(distribution[key].get("task_ids", [])) for key in keys} + total_available = sum(len(ids) for ids in remaining.values()) + target = min(n, total_available) + + sampled = [] + while len(sampled) < target: + live_keys = [key for key in keys if remaining[key]] + live_weights = [float(distribution[key].get("percentage", 0.0)) for key in live_keys] + if not live_keys or sum(live_weights) <= 0: + break + (chosen_key,) = rng.choices(live_keys, weights=live_weights, k=1) + bucket = remaining[chosen_key] + idx = rng.randrange(len(bucket)) + sampled.append(bucket.pop(idx)) + return sampled + + +def _build_arg_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser( + prog="task_distribution", + description=( + "Build a JSON distribution of tasks across one or more dataset " + "columns (e.g. sector, occupation) from a Responses-API JSONL dataset." + ), + ) + parser.add_argument( + "--dataset", + default=None, + help=( + "Path to the input JSONL dataset (one task per line). If omitted, " + "defaults to the prepared GDPVal dataset " + "(benchmarks/gdpval/data/gdpval_benchmark.jsonl), falling back to " + "the agent-local data/gdpval.jsonl or data/example.jsonl." + ), + ) + parser.add_argument( + "--column", + dest="columns", + action="append", + required=True, + metavar="COLUMN", + help=( + "Metadata column to group by. 
Repeat to group by a composite key " + "(e.g. --column sector --column occupation)." + ), + ) + parser.add_argument( + "--output", + "-o", + default=None, + help="Path to write the distribution JSON. Defaults to stdout.", + ) + parser.add_argument( + "--task-id-column", + default="task_id", + help="Metadata column holding the task id (default: task_id).", + ) + parser.add_argument( + "--separator", + default=DEFAULT_KEY_SEPARATOR, + help=f"Separator joining multiple column values into one key (default: {DEFAULT_KEY_SEPARATOR!r}).", + ) + parser.add_argument( + "--missing-value", + default=MISSING_VALUE, + help=f"Placeholder for rows missing a column (default: {MISSING_VALUE!r}).", + ) + parser.add_argument( + "--precision", + type=int, + default=6, + help="Decimal places to round percentages to; use -1 for no rounding (default: 6).", + ) + parser.add_argument( + "--indent", + type=int, + default=2, + help="Indentation for the output JSON; use -1 for compact output (default: 2).", + ) + return parser + + +def main(argv: Optional[Sequence[str]] = None) -> int: + parser = _build_arg_parser() + args = parser.parse_args(argv) + + if args.dataset is not None: + dataset_path = Path(args.dataset) + if not dataset_path.is_file(): + print(f"Dataset not found: {dataset_path}", file=sys.stderr) + return 2 + else: + dataset_path = resolve_default_dataset() + if dataset_path is None: + print(_no_dataset_message(), file=sys.stderr) + return 2 + print(f"Using default dataset: {dataset_path}", file=sys.stderr) + + precision = None if args.precision is not None and args.precision < 0 else args.precision + indent = None if args.indent is not None and args.indent < 0 else args.indent + + distribution = build_distribution_from_dataset( + dataset_path, + args.columns, + task_id_column=args.task_id_column, + separator=args.separator, + missing_value=args.missing_value, + precision=precision, + ) + + payload = json.dumps(distribution, indent=indent, ensure_ascii=False) + if args.output: + 
out_path = Path(args.output) + out_path.parent.mkdir(parents=True, exist_ok=True) + out_path.write_text(payload + "\n", encoding="utf-8") + total_tasks = sum(len(entry["task_ids"]) for entry in distribution.values()) + print( + f"Wrote distribution over {args.columns} ({len(distribution)} groups, {total_tasks} tasks) to {out_path}", + file=sys.stderr, + ) + else: + print(payload) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/responses_api_agents/stirrup_agent/tests/test_task_distribution.py b/responses_api_agents/stirrup_agent/tests/test_task_distribution.py new file mode 100644 index 000000000..6e56e4423 --- /dev/null +++ b/responses_api_agents/stirrup_agent/tests/test_task_distribution.py @@ -0,0 +1,269 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import json +import random +from pathlib import Path + +import pytest + +from responses_api_agents.stirrup_agent import task_distribution as td +from responses_api_agents.stirrup_agent.task_distribution import ( + MISSING_VALUE, + build_distribution, + build_distribution_from_dataset, + compose_key, + extract_metadata, + iter_dataset_rows, + main, + resolve_default_dataset, + sample_task_ids, +) + + +def _row(task_id: str, **metadata) -> dict: + return {"responses_create_params": {"input": "", "metadata": {"task_id": task_id, **metadata}}} + + +def _write_jsonl(path: Path, rows) -> Path: + path.write_text("\n".join(json.dumps(r) for r in rows) + "\n", encoding="utf-8") + return path + + +class TestExtractMetadata: + def test_responses_create_params_metadata(self) -> None: + row = _row("t1", sector="Legal") + assert extract_metadata(row) == {"task_id": "t1", "sector": "Legal"} + + def test_top_level_metadata_fallback(self) -> None: + row = {"metadata": {"task_id": "t1", "sector": "Legal"}} + assert extract_metadata(row) == {"task_id": "t1", "sector": "Legal"} + + def test_row_itself_fallback(self) -> None: + row = {"task_id": "t1", "sector": "Legal"} + assert extract_metadata(row) == {"task_id": "t1", "sector": "Legal"} + + def test_non_mapping_params_falls_through(self) -> None: + row = {"responses_create_params": "oops", "metadata": {"task_id": "t1"}} + assert extract_metadata(row) == {"task_id": "t1"} + + +class TestComposeKey: + def test_single_column(self) -> None: + assert compose_key({"sector": "Legal"}, ["sector"]) == "Legal" + + def test_composite_key(self) -> None: + meta = {"sector": "Legal", "occupation": "Lawyer"} + assert compose_key(meta, ["sector", "occupation"]) == "Legal | Lawyer" + + def test_missing_value_placeholder(self) -> None: + assert compose_key({}, ["sector"]) == MISSING_VALUE + + def test_custom_separator(self) -> None: + meta = {"a": "x", "b": "y"} + assert compose_key(meta, ["a", "b"], separator="::") == "x::y" + + def 
test_non_string_value_is_stringified(self) -> None: + assert compose_key({"n": 5}, ["n"]) == "5" + + +class TestBuildDistribution: + def test_percentages_and_task_ids(self) -> None: + rows = [ + _row("a", sector="Legal"), + _row("b", sector="Legal"), + _row("c", sector="Healthcare"), + _row("d", sector="Finance"), + ] + dist = build_distribution(rows, ["sector"]) + assert dist["Legal"]["percentage"] == 0.5 + assert dist["Legal"]["task_ids"] == ["a", "b"] + assert dist["Healthcare"]["percentage"] == 0.25 + assert dist["Finance"]["task_ids"] == ["d"] + + def test_ordering_is_descending_by_share(self) -> None: + rows = [ + _row("a", sector="Legal"), + _row("b", sector="Legal"), + _row("c", sector="Healthcare"), + ] + assert list(build_distribution(rows, ["sector"]).keys()) == ["Legal", "Healthcare"] + + def test_percentages_sum_to_one_unrounded(self) -> None: + rows = [_row(str(i), sector=s) for i, s in enumerate(["a", "a", "b", "c", "c", "c", "d"])] + dist = build_distribution(rows, ["sector"], precision=None) + assert pytest.approx(sum(e["percentage"] for e in dist.values())) == 1.0 + + def test_composite_columns(self) -> None: + rows = [ + _row("a", sector="Legal", occupation="Lawyer"), + _row("b", sector="Legal", occupation="Paralegal"), + ] + dist = build_distribution(rows, ["sector", "occupation"]) + assert set(dist.keys()) == {"Legal | Lawyer", "Legal | Paralegal"} + + def test_empty_rows_yields_empty(self) -> None: + assert build_distribution([], ["sector"]) == {} + + def test_missing_column_grouped_under_placeholder(self) -> None: + rows = [_row("a"), _row("b", sector="Legal")] + dist = build_distribution(rows, ["sector"]) + assert MISSING_VALUE in dist + assert dist[MISSING_VALUE]["task_ids"] == ["a"] + + def test_missing_task_id_uses_positional_fallback(self) -> None: + rows = [{"responses_create_params": {"metadata": {"sector": "Legal"}}}] + dist = build_distribution(rows, ["sector"]) + assert dist["Legal"]["task_ids"] == ["task_id_index_0"] + + def 
test_requires_columns(self) -> None: + with pytest.raises(ValueError): + build_distribution([_row("a", sector="Legal")], []) + + def test_precision_rounding(self) -> None: + rows = [_row(str(i), sector="a" if i == 0 else "b") for i in range(3)] + dist = build_distribution(rows, ["sector"], precision=2) + assert dist["b"]["percentage"] == 0.67 + + +class TestIterAndDatasetWrapper: + def test_iter_skips_blank_lines(self, tmp_path: Path) -> None: + path = tmp_path / "d.jsonl" + path.write_text(json.dumps(_row("a", sector="Legal")) + "\n\n", encoding="utf-8") + assert len(list(iter_dataset_rows(path))) == 1 + + def test_iter_raises_on_bad_json(self, tmp_path: Path) -> None: + path = tmp_path / "d.jsonl" + path.write_text("{not json}\n", encoding="utf-8") + with pytest.raises(ValueError, match="invalid JSON"): + list(iter_dataset_rows(path)) + + def test_build_from_dataset(self, tmp_path: Path) -> None: + path = _write_jsonl(tmp_path / "d.jsonl", [_row("a", sector="Legal"), _row("b", sector="Legal")]) + dist = build_distribution_from_dataset(path, ["sector"]) + assert dist["Legal"]["percentage"] == 1.0 + + +class TestSampleTaskIds: + def _dist(self): + return { + "Legal": {"percentage": 0.5, "task_ids": ["a", "b"]}, + "Healthcare": {"percentage": 0.5, "task_ids": ["c", "d"]}, + } + + def test_zero_or_negative_returns_empty(self) -> None: + assert sample_task_ids(self._dist(), 0) == [] + assert sample_task_ids(self._dist(), -3) == [] + + def test_without_replacement_no_duplicates(self) -> None: + rng = random.Random(0) + sampled = sample_task_ids(self._dist(), 3, rng=rng) + assert len(sampled) == 3 + assert len(set(sampled)) == 3 + + def test_without_replacement_capped_at_total(self) -> None: + sampled = sample_task_ids(self._dist(), 100, rng=random.Random(1)) + assert sorted(sampled) == ["a", "b", "c", "d"] + + def test_with_replacement_allows_more_than_total(self) -> None: + sampled = sample_task_ids(self._dist(), 10, rng=random.Random(2), replace=True) + assert 
len(sampled) == 10 + + def test_empty_distribution_returns_empty(self) -> None: + assert sample_task_ids({}, 5) == [] + + def test_zero_weight_distribution_returns_empty(self) -> None: + dist = {"x": {"percentage": 0.0, "task_ids": ["a"]}} + assert sample_task_ids(dist, 5) == [] + assert sample_task_ids(dist, 5, replace=True) == [] + + def test_with_replacement_skips_empty_groups(self) -> None: + dist = {"x": {"percentage": 1.0, "task_ids": []}} + assert sample_task_ids(dist, 3, rng=random.Random(3), replace=True) == [] + + +class TestResolveDefaultDataset: + def test_returns_first_existing(self, tmp_path: Path) -> None: + missing = tmp_path / "missing.jsonl" + present = _write_jsonl(tmp_path / "present.jsonl", [_row("a", sector="Legal")]) + assert resolve_default_dataset([missing, present]) == present + + def test_priority_order(self, tmp_path: Path) -> None: + first = _write_jsonl(tmp_path / "first.jsonl", [_row("a", sector="Legal")]) + second = _write_jsonl(tmp_path / "second.jsonl", [_row("b", sector="Legal")]) + assert resolve_default_dataset([first, second]) == first + + def test_returns_none_when_nothing_exists(self, tmp_path: Path) -> None: + assert resolve_default_dataset([tmp_path / "a.jsonl", tmp_path / "b.jsonl"]) is None + + +class TestMain: + def test_uses_default_dataset_when_omitted(self, tmp_path: Path, capsys, monkeypatch) -> None: + default_ds = _write_jsonl(tmp_path / "gdpval.jsonl", [_row("a", sector="Legal")]) + monkeypatch.setattr(td, "DEFAULT_DATASET_CANDIDATES", (tmp_path / "missing.jsonl", default_ds)) + rc = main(["--column", "sector"]) + assert rc == 0 + captured = capsys.readouterr() + assert str(default_ds) in captured.err + assert json.loads(captured.out)["Legal"]["percentage"] == 1.0 + + def test_errors_when_no_default_and_none_specified(self, tmp_path: Path, capsys, monkeypatch) -> None: + monkeypatch.setattr(td, "DEFAULT_DATASET_CANDIDATES", (tmp_path / "missing.jsonl",)) + rc = main(["--column", "sector"]) + assert rc == 2 + err = 
capsys.readouterr().err + assert "no default gdpval dataset was found" in err.lower() + assert "gym eval prepare --benchmark gdpval" in err + assert "--dataset" in err + + def test_errors_when_specified_dataset_missing(self, tmp_path: Path, capsys) -> None: + rc = main(["--dataset", str(tmp_path / "nope.jsonl"), "--column", "sector"]) + assert rc == 2 + assert "Dataset not found" in capsys.readouterr().err + + def test_writes_output_file(self, tmp_path: Path, capsys) -> None: + dataset = _write_jsonl( + tmp_path / "d.jsonl", + [_row("a", sector="Legal"), _row("b", sector="Legal"), _row("c", sector="Healthcare")], + ) + out = tmp_path / "dist.json" + rc = main(["--dataset", str(dataset), "--column", "sector", "--output", str(out)]) + assert rc == 0 + data = json.loads(out.read_text()) + assert data["Legal"]["task_ids"] == ["a", "b"] + assert "3 tasks" in capsys.readouterr().err + + def test_stdout_when_no_output(self, tmp_path: Path, capsys) -> None: + dataset = _write_jsonl(tmp_path / "d.jsonl", [_row("a", sector="Legal")]) + rc = main(["--dataset", str(dataset), "--column", "sector"]) + assert rc == 0 + assert json.loads(capsys.readouterr().out)["Legal"]["percentage"] == 1.0 + + def test_no_rounding_and_compact(self, tmp_path: Path, capsys) -> None: + dataset = _write_jsonl( + tmp_path / "d.jsonl", [_row("a", sector="x"), _row("b", sector="y"), _row("c", sector="y")] + ) + rc = main(["--dataset", str(dataset), "--column", "sector", "--precision", "-1", "--indent", "-1"]) + assert rc == 0 + out = capsys.readouterr().out + assert "\n " not in out # compact (no indentation) + assert json.loads(out)["y"]["percentage"] == pytest.approx(2 / 3) + + def test_composite_columns_cli(self, tmp_path: Path, capsys) -> None: + dataset = _write_jsonl( + tmp_path / "d.jsonl", + [_row("a", sector="Legal", occupation="Lawyer")], + ) + rc = main(["--dataset", str(dataset), "--column", "sector", "--column", "occupation"]) + assert rc == 0 + assert "Legal | Lawyer" in 
json.loads(capsys.readouterr().out) From 9057f17d6973755082a90e8c304e55250dca4859 Mon Sep 17 00:00:00 2001 From: Virginia Wu Date: Thu, 25 Jun 2026 17:19:19 -0700 Subject: [PATCH 2/8] made task_distribution default to occupation column Signed-off-by: Virginia Wu --- .../stirrup_agent/task_distribution.py | 26 ++++++++++++++----- .../tests/test_task_distribution.py | 13 ++++++++++ 2 files changed, 33 insertions(+), 6 deletions(-) diff --git a/responses_api_agents/stirrup_agent/task_distribution.py b/responses_api_agents/stirrup_agent/task_distribution.py index 20f41fb45..9b963e7ea 100644 --- a/responses_api_agents/stirrup_agent/task_distribution.py +++ b/responses_api_agents/stirrup_agent/task_distribution.py @@ -33,8 +33,13 @@ Usage:: - # --dataset defaults to the prepared GDPVal dataset - # (benchmarks/gdpval/data/gdpval_benchmark.jsonl) when omitted. + # Full config defaults: the prepared GDPVal dataset + # (benchmarks/gdpval/data/gdpval_benchmark.jsonl) grouped by ``occupation``. + # Without --output the distribution is printed to stdout. + python -m responses_api_agents.stirrup_agent.task_distribution \ + --output occupation_distribution.json + + # --dataset defaults to the prepared GDPVal dataset when omitted. python -m responses_api_agents.stirrup_agent.task_distribution \ --column sector \ --output sector_distribution.json @@ -61,6 +66,9 @@ # Separator joining multiple column values into a single composite key. DEFAULT_KEY_SEPARATOR = " | " +# Column grouped on when ``--column`` is not specified. +DEFAULT_COLUMN = "occupation" + # Repo root: this file is responses_api_agents/stirrup_agent/task_distribution.py. _REPO_ROOT = Path(__file__).resolve().parents[2] @@ -316,11 +324,12 @@ def _build_arg_parser() -> argparse.ArgumentParser: "--column", dest="columns", action="append", - required=True, + default=None, metavar="COLUMN", help=( "Metadata column to group by. Repeat to group by a composite key " - "(e.g. --column sector --column occupation)." + "(e.g. 
--column sector --column occupation). " + f"Defaults to {DEFAULT_COLUMN!r} if not specified." ), ) parser.add_argument( @@ -375,12 +384,17 @@ def main(argv: Optional[Sequence[str]] = None) -> int: return 2 print(f"Using default dataset: {dataset_path}", file=sys.stderr) + columns = args.columns + if not columns: + columns = [DEFAULT_COLUMN] + print(f"No --column specified; defaulting to {DEFAULT_COLUMN!r}.", file=sys.stderr) + precision = None if args.precision is not None and args.precision < 0 else args.precision indent = None if args.indent is not None and args.indent < 0 else args.indent distribution = build_distribution_from_dataset( dataset_path, - args.columns, + columns, task_id_column=args.task_id_column, separator=args.separator, missing_value=args.missing_value, @@ -394,7 +408,7 @@ def main(argv: Optional[Sequence[str]] = None) -> int: out_path.write_text(payload + "\n", encoding="utf-8") total_tasks = sum(len(entry["task_ids"]) for entry in distribution.values()) print( - f"Wrote distribution over {args.columns} ({len(distribution)} groups, {total_tasks} tasks) to {out_path}", + f"Wrote distribution over {columns} ({len(distribution)} groups, {total_tasks} tasks) to {out_path}", file=sys.stderr, ) else: diff --git a/responses_api_agents/stirrup_agent/tests/test_task_distribution.py b/responses_api_agents/stirrup_agent/tests/test_task_distribution.py index 6e56e4423..6250e5194 100644 --- a/responses_api_agents/stirrup_agent/tests/test_task_distribution.py +++ b/responses_api_agents/stirrup_agent/tests/test_task_distribution.py @@ -226,6 +226,19 @@ def test_errors_when_no_default_and_none_specified(self, tmp_path: Path, capsys, assert "gym eval prepare --benchmark gdpval" in err assert "--dataset" in err + def test_defaults_to_occupation_column(self, tmp_path: Path, capsys) -> None: + dataset = _write_jsonl( + tmp_path / "d.jsonl", + [_row("a", occupation="Lawyer"), _row("b", occupation="Lawyer"), _row("c", occupation="Nurse")], + ) + rc = 
main(["--dataset", str(dataset)]) + assert rc == 0 + captured = capsys.readouterr() + assert "defaulting to 'occupation'" in captured.err + data = json.loads(captured.out) + assert data["Lawyer"]["task_ids"] == ["a", "b"] + assert data["Nurse"]["percentage"] == pytest.approx(1 / 3) + def test_errors_when_specified_dataset_missing(self, tmp_path: Path, capsys) -> None: rc = main(["--dataset", str(tmp_path / "nope.jsonl"), "--column", "sector"]) assert rc == 2 From e05d8e0e5f0405a20246a9f883c62c5d5f771c13 Mon Sep 17 00:00:00 2001 From: Virginia Wu Date: Thu, 25 Jun 2026 17:44:32 -0700 Subject: [PATCH 3/8] added occupation distribution Signed-off-by: Virginia Wu --- .../data/occupation_distribution.json | 442 ++++++++++++++++++ .../stirrup_agent/task_distribution.py | 16 +- 2 files changed, 453 insertions(+), 5 deletions(-) create mode 100644 responses_api_agents/stirrup_agent/data/occupation_distribution.json diff --git a/responses_api_agents/stirrup_agent/data/occupation_distribution.json b/responses_api_agents/stirrup_agent/data/occupation_distribution.json new file mode 100644 index 000000000..a91a899a0 --- /dev/null +++ b/responses_api_agents/stirrup_agent/data/occupation_distribution.json @@ -0,0 +1,442 @@ +{ + "Accountants and Auditors": { + "percentage": 0.022727, + "task_ids": [ + "83d10b06-26d1-4636-a32c-23f92c57f30b", + "7b08cd4d-df60-41ae-9102-8aaa49306ba2", + "7d7fc9a7-21a7-4b83-906f-416dea5ad04f", + "43dc9778-450b-4b46-b77e-b6d82b202035", + "ee09d943-5a11-430a-b7a2-971b4e9b01b5" + ] + }, + "Administrative Services Managers": { + "percentage": 0.022727, + "task_ids": [ + "f84ea6ac-8f9f-428c-b96c-d0884e30f7c7", + "a328feea-47db-4856-b4be-2bdc63dd88fb", + "27e8912c-8bd5-44ba-ad87-64066ea05264", + "17111c03-aac7-45c2-857d-c06d8223d6ad", + "c44e9b62-7cd8-4f72-8ad9-f8fbddb94083" + ] + }, + "Audio and Video Technicians": { + "percentage": 0.022727, + "task_ids": [ + "99ac6944-4ec6-4848-959c-a460ac705c6f", + "f9a1c16c-53fd-4c8f-88cc-5c325ec2f0bb", + 
"38889c3b-e3d4-49c8-816a-3cc8e5313aba", + "ff85ee58-bc9f-4aa2-806d-87edeabb1b81", + "4b894ae3-1f23-4560-b13d-07ed1132074e" + ] + }, + "Buyers and Purchasing Agents": { + "percentage": 0.022727, + "task_ids": [ + "1b1ade2d-f9f6-4a04-baa5-aa15012b53be", + "93b336f3-61f3-4287-86d2-87445e1e0f90", + "15ddd28d-8445-4baa-ac7f-f41372e1344e", + "24d1e93f-9018-45d4-b522-ad89dfd78079", + "05389f78-589a-473c-a4ae-67c61050bfca" + ] + }, + "Child, Family, and School Social Workers": { + "percentage": 0.022727, + "task_ids": [ + "575f8679-b4c1-47a2-8e96-d570d4ed9269", + "a74ead3b-f67d-4b1c-9116-f6bb81b29d4f", + "bbe0a93b-ebf0-40b0-98dc-8d9243099034", + "85d95ce5-b20c-41e2-834e-e788ce9622b6", + "76d10872-9ffa-4ede-83ee-e0f1ec5e2b8d" + ] + }, + "Compliance Officers": { + "percentage": 0.022727, + "task_ids": [ + "36d567ba-e205-4313-9756-931c6e4691fe", + "7bbfcfe9-132d-4194-82bb-d6f29d001b01", + "2696757c-1f8a-4959-8f0d-f5597b9e70fc", + "dfb4e0cd-a0b7-454e-b943-0dd586c2764c", + "4c18ebae-dfaa-4b76-b10c-61fcdf26734c" + ] + }, + "Computer and Information Systems Managers": { + "percentage": 0.022727, + "task_ids": [ + "cebf301e-5ea7-41ae-b117-ad8f43e7ac22", + "c2e8f271-7858-412f-b460-472463ad81d9", + "2ea2e5b5-257f-42e6-a7dc-93763f28b19d", + "c357f0e2-963d-4eb7-a6fa-3078fe55b3ba", + "a45bc83b-22f9-4def-8d89-9c5661b2b86f" + ] + }, + "Concierges": { + "percentage": 0.022727, + "task_ids": [ + "a10ec48c-168e-476c-8fe3-23b2a5f616ac", + "fccaa4a1-1c39-49ac-b701-55361a19966b", + "f5d428fd-b38e-41f0-8783-35423dab80f6", + "2fa8e956-7b35-4c13-95dc-027f02be318b", + "0e4fe8cd-16d0-4f41-8247-6385b4762582" + ] + }, + "Counter and Rental Clerks": { + "percentage": 0.022727, + "task_ids": [ + "a0ef404e-82a6-4507-bff1-633d7c8e0004", + "b7a5912e-0e63-41f5-8c22-9cdb8f46ab01", + "aa071045-bcb0-4164-bb85-97245d56287e", + "476db143-163a-4537-9e21-fe46adad703b", + "61f546a8-c374-467f-95cc-d0d9b5656eb6" + ] + }, + "Customer Service Representatives": { + "percentage": 0.022727, + "task_ids": [ + 
"f3351922-dbdd-45da-85c5-e7110696bbe5", + "61717508-4df7-41be-bf97-318dfb2475c0", + "0ed38524-a4ad-405f-9dee-7b2252659aad", + "87da214f-fd92-4c58-9854-f4d0d10adce0", + "d025a41c-c439-4ee1-bc79-dd5c94b27a2d" + ] + }, + "Editors": { + "percentage": 0.022727, + "task_ids": [ + "401a07f1-d57e-4bb0-889b-22de8c900f0e", + "afe56d05-dac8-47d7-a233-ad1d035ca5bd", + "9a8c8e28-ce76-408b-83c3-488422892e58", + "3a4c347c-4aec-43c7-9a54-eb1f816ab1f9", + "ec2fccc9-b7f6-4c73-bf51-896fdb433cec" + ] + }, + "Film and Video Editors": { + "percentage": 0.022727, + "task_ids": [ + "8c8fc328-69fc-4559-a13f-82087baef0a1", + "e222075d-5d62-4757-ae3c-e34b0846583b", + "c94452e4-39cd-4846-b73a-ab75933d1ad7", + "75401f7c-396d-406d-b08e-938874ad1045", + "a941b6d8-4289-4500-b45a-f8e4fc94a724" + ] + }, + "Financial Managers": { + "percentage": 0.022727, + "task_ids": [ + "a1963a68-1bea-4bb1-b7e0-145c92a57449", + "5f6c57dd-feb6-4e70-b152-4969d92d1608", + "b39a5aa7-cd1b-47ad-b249-90afd22f8f21", + "b78fd844-db76-448e-a783-5e9877cb74c2", + "4520f882-715a-482d-8e87-1cb3cbdfe975" + ] + }, + "Financial and Investment Analysts": { + "percentage": 0.022727, + "task_ids": [ + "8079e27d-b6f3-4f75-a9b5-db27903c798d", + "e21cd746-404d-4602-b9d2-01d2812c5b87", + "9e8607e7-a38a-491f-ace1-e5ea7dc477cb", + "c7d83f01-2874-4876-b7fd-52582ec99e1a", + "46b34f78-6c06-4416-87e2-77b6d8b20ce9" + ] + }, + "First-Line Supervisors of Non-Retail Sales Workers": { + "percentage": 0.022727, + "task_ids": [ + "ec591973-04d5-48c0-981c-1ab2fcec2dc1", + "62f04c2f-e0f7-4710-876c-54ee9c2e8256", + "3f821c2d-ab97-46ec-a0fb-b8f73c2682bc", + "e996036e-8287-4e7f-8d0a-90a57cb53c45", + "327fbc21-7d26-4964-bf7c-f4f41e55c54d" + ] + }, + "First-Line Supervisors of Office and Administrative Support Workers": { + "percentage": 0.022727, + "task_ids": [ + "6dcae3f5-bf1c-48e0-8b4b-23e6486a934c", + "1aecc095-4d76-4b89-b752-1a0f870502cd", + "0353ee0c-18b5-4ad3-88e8-e001d223e1d7", + "40a8c4b1-b169-4f92-a38b-7f79685037ec", + 
"4d1a8410-e9c5-4be5-ab43-cc55563c594c" + ] + }, + "First-Line Supervisors of Police and Detectives": { + "percentage": 0.022727, + "task_ids": [ + "8c823e32-537c-42b2-84ba-635d63c2853a", + "eb54f575-93f9-408b-b9e0-f1208a0b6759", + "11e1b169-5fb6-4d79-8a83-82ddf4987a85", + "a95a5829-34bb-40f3-993b-558aed6dcdef", + "22c0809b-f8db-489e-93b3-b4da225e3e0e" + ] + }, + "First-Line Supervisors of Production and Operating Workers": { + "percentage": 0.022727, + "task_ids": [ + "bf68f2ad-eac5-490a-adec-d847eb45bd6f", + "efca245f-c24f-4f75-a9d5-59201330ab7a", + "9e39df84-ac57-4c9b-a2e3-12b8abf2c797", + "68d8d901-dd0b-4a7e-bf9a-1074fddf1a96", + "1752cb53-5983-46b6-92ee-58ac85a11283" + ] + }, + "First-Line Supervisors of Retail Sales Workers": { + "percentage": 0.022727, + "task_ids": [ + "bd72994f-5659-4084-9fab-fc547d1efe3b", + "211d0093-2c64-4bd0-828c-0201f18924e7", + "d4525420-a427-4ef2-b4e9-2dcc2d31b3b6", + "45c6237b-f9c9-4526-9a8d-6a5c404624ec", + "cecac8f9-8203-4ebd-ad49-54436a8c4171" + ] + }, + "General and Operations Managers": { + "percentage": 0.022727, + "task_ids": [ + "8f9e8bcd-6102-40da-ab76-23f51d8b21fa", + "0fad6023-767b-42c1-a1b3-027cd4f583cb", + "02314fc6-a24e-42f4-a8cd-362cae0f0ec1", + "4d61a19a-8438-4d4c-9fc2-cf167e36dcd6", + "6436ff9e-c5f2-47ba-9aaa-49d89b0594ab" + ] + }, + "Industrial Engineers": { + "percentage": 0.022727, + "task_ids": [ + "8a7b6fca-60cc-4ae3-b649-971753cbf8b9", + "40a99a31-42d6-4f23-b3ec-8f591afe25b6", + "b9665ca1-4da4-4ff9-86f2-40b9a8683048", + "c6269101-fdc8-4602-b345-eac7597c0c81", + "be830ca0-b352-4658-a5bd-57139d6780ba" + ] + }, + "Lawyers": { + "percentage": 0.022727, + "task_ids": [ + "cd9efc18-d14a-4f69-8531-5d178a08084d", + "a97369c7-e5cf-40ca-99e8-d06f81c57d53", + "3f625cb2-f40e-4ead-8a97-6924356d5989", + "aad21e4c-1d43-45fc-899a-97754a1b1b63", + "8314d1b1-5b0f-42a4-b5d5-91c0867b0913" + ] + }, + "Mechanical Engineers": { + "percentage": 0.022727, + "task_ids": [ + "5e2b6aab-f9fb-4dd6-a1a5-874ef1743909", + 
"46fc494e-a24f-45ce-b099-851d5c181fd4", + "3940b7e7-ec4f-4cea-8097-3ab4cfdcaaa6", + "8077e700-2b31-402d-bd09-df4d33c39653", + "5a2d70da-0a42-4a6b-a3ca-763e03f070a5" + ] + }, + "Medical Secretaries and Administrative Assistants": { + "percentage": 0.022727, + "task_ids": [ + "f1be6436-ffff-4fee-9e66-d550291a1735", + "41f6ef59-88c9-4b2c-bcc7-9ceb88422f48", + "a0552909-bc66-4a3a-8970-ee0d17b49718", + "6d2c8e55-fe20-45c6-bdaf-93e676868503", + "4b98ccce-9e42-44e9-9115-6fc3e79de288" + ] + }, + "Medical and Health Services Managers": { + "percentage": 0.022727, + "task_ids": [ + "74d6e8b0-f334-4e7e-af55-c095d5d4d1a6", + "81db15ff-ceea-4f63-a1cd-06dc88114709", + "61b0946a-5c1c-4bf6-8607-84d7c7e0dfe0", + "61e7b9c6-0051-429f-a341-fda9b6578a84", + "c9bf9801-9640-45fa-8166-1ab01f2d98e4" + ] + }, + "News Analysts, Reporters, and Journalists": { + "percentage": 0.022727, + "task_ids": [ + "60221cd0-686e-4a08-985e-d9bb2fa18501", + "ef8719da-18e5-4bfe-b986-399652d77376", + "3baa0009-5a60-4ae8-ae99-4955cb328ff3", + "5d0feb24-e8b6-4ace-b64f-d5cd1a8b563d", + "6974adea-8326-43fa-8187-2724b15d9546" + ] + }, + "Nurse Practitioners": { + "percentage": 0.022727, + "task_ids": [ + "1a78e076-445e-4c5d-b8ce-387d2fe5e715", + "1b9ec237-bf9c-41f9-8fa9-0e685fcd93c6", + "0112fc9b-c3b2-4084-8993-5a4abb1f54f1", + "772e7524-174e-4c88-957e-6e510b61ea69", + "e6429658-4de1-42dd-a9e0-2d2b9b02fb10" + ] + }, + "Order Clerks": { + "percentage": 0.022727, + "task_ids": [ + "b5d2e6f1-62a2-433a-bcdd-95b260cdd860", + "f841ddcf-2a28-4f6d-bac3-61b607219d3e", + "47ef842d-8eac-4b90-bda8-dd934c228c96", + "1137e2bb-bdf9-4876-b572-f29b7de5e595", + "c3525d4d-2012-45df-853e-2d2a0e902991" + ] + }, + "Personal Financial Advisors": { + "percentage": 0.022727, + "task_ids": [ + "9a0d8d36-6233-4c76-9107-0d1f783c7340", + "664a42e5-3240-413a-9a57-ea93c6303269", + "feb5eefc-39f1-4451-9ef9-bffe011b71dd", + "3600de06-3f71-4e48-9480-e4828c579924", + "c657103b-b348-4496-a848-b2b7165d28b2" + ] + }, + "Pharmacists": { + 
"percentage": 0.022727, + "task_ids": [ + "91060ff0-3eb5-4ddf-9edb-f6758b95499e", + "8384083a-c31b-4194-80ba-4d335a444918", + "045aba2e-4093-42aa-ab7f-159cc538278c", + "f2986c1f-2bbf-4b83-bc93-624a9d617f45", + "ffed32d8-d192-4e3f-8cd4-eda5a730aec3" + ] + }, + "Private Detectives and Investigators": { + "percentage": 0.022727, + "task_ids": [ + "ae0c1093-5ea8-4b84-a81e-53ebf7a4321d", + "f9f82549-fdde-4462-aff8-e70fba5b8c66", + "57b2cdf2-ad62-4591-aa91-aad489740320", + "84322284-5c2c-4873-b507-b147449d209d", + "a46d5cd2-55fe-48fa-a4c6-6aaf6b9991b5" + ] + }, + "Producers and Directors": { + "percentage": 0.022727, + "task_ids": [ + "6241e678-4ba3-4831-b3c7-78412697febc", + "e14e32ba-d310-4d45-9b8a-6d73d0ece1ae", + "b1a79ce1-86b0-41fb-97dc-9206dfd7b044", + "e4f664ea-0e5c-4e4e-a0d3-a87a33da947a", + "a079d38f-c529-436a-beca-3e291f9e62a3" + ] + }, + "Project Management Specialists": { + "percentage": 0.022727, + "task_ids": [ + "02aa1805-c658-4069-8a6a-02dec146063a", + "fd6129bd-f095-429b-873c-dcc3137be2c3", + "ce864f41-8584-49ba-b24f-9c9104b47bf0", + "58ac1cc5-5754-4580-8c9c-8c67e1a9d619", + "3c19c6d1-672c-467a-8437-6fe21afb8eae" + ] + }, + "Property, Real Estate, and Community Association Managers": { + "percentage": 0.022727, + "task_ids": [ + "a99d85fc-eff8-48d2-a7d4-42a75d62f18d", + "55ddb773-23a4-454c-8704-d432fe1b99d9", + "1e5a1d7f-12c1-48c6-afd9-82257b3f2409", + "0419f1c3-d669-45d0-81cd-f4d5923b06a5", + "ed2bc14c-99ac-4a2a-8467-482a1a5d67f3" + ] + }, + "Real Estate Brokers": { + "percentage": 0.022727, + "task_ids": [ + "46bc7238-3501-4839-b989-e2bd47853676", + "2d06bc0a-89c6-4e89-9417-5ffe725c1bc6", + "fd3ad420-6f7d-43b1-a990-c0c5c047d071", + "0818571f-5ff7-4d39-9d2c-ced5ae44299e", + "6074bba3-7e3a-4b1c-b8c6-a15bb6695c3b" + ] + }, + "Real Estate Sales Agents": { + "percentage": 0.022727, + "task_ids": [ + "5ad0c554-a7a2-48cd-b41a-ebc1bff4a9de", + "11593a50-734d-4449-b5b4-f8986a133fd8", + "94925f49-36bc-42da-b45b-61078d329300", + 
"90f37ff3-e4ed-4a0b-94bb-bed0f7def1ef", + "d3d255b2-f5f2-4841-9f62-2083ec9ef3da" + ] + }, + "Recreation Workers": { + "percentage": 0.022727, + "task_ids": [ + "403b9234-6299-4b5f-a106-70c1bc11ec4c", + "1bff4551-1d54-4e37-b2e0-d5c3f2ea4a45", + "650adcb1-ed19-4f88-8117-77640f7b94b6", + "01d7e53e-0513-4109-a242-8ccaf442cd21", + "a73fbc98-90d4-4134-a54f-2b1d0c838791" + ] + }, + "Registered Nurses": { + "percentage": 0.022727, + "task_ids": [ + "0ec25916-1b5c-4bfe-93d3-4e103d860f3a", + "116e791e-890c-42b1-ba90-1db02e8bfd45", + "dd724c67-8118-4b99-ab50-4761af705c3b", + "7151c60a-d4cb-4fc4-8169-3d4cb446e6b9", + "90edba97-74f0-425a-8ff6-8b93182eb7cb" + ] + }, + "Sales Managers": { + "percentage": 0.022727, + "task_ids": [ + "b3573f20-5d3e-4954-948f-9461fda693d2", + "a69be28f-9a84-47c9-992e-b90446cdca9d", + "788d2bc6-82df-4dc7-8467-a0f31405dc14", + "74ed1dc7-1468-48a8-9071-58775c0d667a", + "69a8ef86-4e69-4fe2-9168-080f1e978e67" + ] + }, + "Sales Representatives, Wholesale and Manufacturing, Except Technical and Scientific Products": { + "percentage": 0.022727, + "task_ids": [ + "ab81b076-e5d8-473a-9bdb-7ea7c38f6ebc", + "d7cfae6f-4a82-4289-955e-c799dfe1e0f4", + "19403010-3e5c-494e-a6d3-13594e99f6af", + "7ed932dd-244f-4d61-bf02-1bc3bab1af14", + "105f8ad0-8dd2-422f-9e88-2be5fbd2b215" + ] + }, + "Sales Representatives, Wholesale and Manufacturing, Technical and Scientific Products": { + "percentage": 0.022727, + "task_ids": [ + "b57efde3-26d6-4742-bbff-2b63c43b4baa", + "15d37511-75c5-4c7f-81f1-16e00c0d95f3", + "bb863dd9-31c2-4f64-911a-ce11f457143b", + "fe0d3941-e32c-4bf1-a643-b566d2b4cb3c", + "6a900a40-8d2b-4064-a5b1-13a60bc173d8" + ] + }, + "Securities, Commodities, and Financial Services Sales Agents": { + "percentage": 0.022727, + "task_ids": [ + "9efbcd35-186d-49b6-ac24-28ee2bc9a263", + "1d4672c8-b0a7-488f-905f-9ab4e25a19f7", + "4de6a529-4f61-41a1-b2dc-64951ba03457", + "4c4dc603-c21c-4284-8fb1-1b827c1fddf4", + "bb499d9c-0263-4684-9238-75e8e86077b1" + ] + }, + "Shipping, 
Receiving, and Inventory Clerks": { + "percentage": 0.022727, + "task_ids": [ + "5349dd7b-bf0a-4544-9a17-75b7013767e6", + "a4a9195c-5ebe-4b8d-a0c2-4a6b7a49da8b", + "552b7dd0-96f4-437c-a749-0691e0e4b381", + "11dcc268-cb07-4d3a-a184-c6d7a19349bc", + "76418a2c-a3c0-4894-b89d-2493369135d9" + ] + }, + "Software Developers": { + "percentage": 0.022727, + "task_ids": [ + "0e386e32-df20-4d1f-b536-7159bc409ad5", + "7de33b48-5163-4f50-b5f3-8deea8185e57", + "854f3814-681c-4950-91ac-55b0db0e3781", + "4122f866-01fa-400b-904d-fa171cdab7c7", + "2c249e0f-4a8c-4f8e-b4f4-6508ba29b34f" + ] + } +} diff --git a/responses_api_agents/stirrup_agent/task_distribution.py b/responses_api_agents/stirrup_agent/task_distribution.py index 9b963e7ea..4f661cc96 100644 --- a/responses_api_agents/stirrup_agent/task_distribution.py +++ b/responses_api_agents/stirrup_agent/task_distribution.py @@ -33,7 +33,7 @@ Usage:: - # Full config defaults: the prepared GDPVal dataset + # Full defaults: the prepared GDPVal dataset (220 tasks) # (benchmarks/gdpval/data/gdpval_benchmark.jsonl) grouped by ``occupation``. # Without --output the distribution is printed to stdout. python -m responses_api_agents.stirrup_agent.task_distribution \ @@ -109,10 +109,16 @@ def _no_dataset_message() -> str: "\nTo fix this, do one of the following:\n" "\n 1. Prepare the GDPVal benchmark dataset (recommended). 
This downloads\n" " the openai/gdpval dataset from HuggingFace and writes\n" - " benchmarks/gdpval/data/gdpval_benchmark.jsonl (220 tasks):\n" - "\n export HF_TOKEN=\n" - " gym eval prepare --benchmark gdpval\n" - "\n (or run: bash responses_api_agents/stirrup_agent/setup_scripts/gdpval.sh)\n" + " benchmarks/gdpval/data/gdpval_benchmark.jsonl (220 tasks).\n" + "\n First activate the project virtualenv so the Gym CLI is on PATH\n" + " (the `gym`/`ng_*` commands live in .venv, not on your global PATH):\n" + "\n source .venv/bin/activate\n" + " export HF_TOKEN=\n" + "\n Then run the setup script (works on all installs):\n" + "\n bash responses_api_agents/stirrup_agent/setup_scripts/gdpval.sh\n" + "\n Or call a prepare CLI directly:\n" + "\n gym eval prepare --benchmark gdpval # newer installs\n" + " ng_prepare_benchmark '+config_paths=[benchmarks/gdpval/config.yaml]' # any install\n" "\n 2. Pass an explicit dataset path with --dataset .\n" "\nNote: the GDPVal dataset is gated on HuggingFace, so HF_TOKEN must be set\n" "and your account must have access to https://huggingface.co/datasets/openai/gdpval.\n" From 846441eda371bc019c4cd9460570e271811a666d Mon Sep 17 00:00:00 2001 From: Virginia Wu Date: Thu, 25 Jun 2026 17:47:45 -0700 Subject: [PATCH 4/8] shouldn't include occupation data file in repo Signed-off-by: Virginia Wu --- .../data/occupation_distribution.json | 442 ------------------ 1 file changed, 442 deletions(-) delete mode 100644 responses_api_agents/stirrup_agent/data/occupation_distribution.json diff --git a/responses_api_agents/stirrup_agent/data/occupation_distribution.json b/responses_api_agents/stirrup_agent/data/occupation_distribution.json deleted file mode 100644 index a91a899a0..000000000 --- a/responses_api_agents/stirrup_agent/data/occupation_distribution.json +++ /dev/null @@ -1,442 +0,0 @@ -{ - "Accountants and Auditors": { - "percentage": 0.022727, - "task_ids": [ - "83d10b06-26d1-4636-a32c-23f92c57f30b", - 
"7b08cd4d-df60-41ae-9102-8aaa49306ba2", - "7d7fc9a7-21a7-4b83-906f-416dea5ad04f", - "43dc9778-450b-4b46-b77e-b6d82b202035", - "ee09d943-5a11-430a-b7a2-971b4e9b01b5" - ] - }, - "Administrative Services Managers": { - "percentage": 0.022727, - "task_ids": [ - "f84ea6ac-8f9f-428c-b96c-d0884e30f7c7", - "a328feea-47db-4856-b4be-2bdc63dd88fb", - "27e8912c-8bd5-44ba-ad87-64066ea05264", - "17111c03-aac7-45c2-857d-c06d8223d6ad", - "c44e9b62-7cd8-4f72-8ad9-f8fbddb94083" - ] - }, - "Audio and Video Technicians": { - "percentage": 0.022727, - "task_ids": [ - "99ac6944-4ec6-4848-959c-a460ac705c6f", - "f9a1c16c-53fd-4c8f-88cc-5c325ec2f0bb", - "38889c3b-e3d4-49c8-816a-3cc8e5313aba", - "ff85ee58-bc9f-4aa2-806d-87edeabb1b81", - "4b894ae3-1f23-4560-b13d-07ed1132074e" - ] - }, - "Buyers and Purchasing Agents": { - "percentage": 0.022727, - "task_ids": [ - "1b1ade2d-f9f6-4a04-baa5-aa15012b53be", - "93b336f3-61f3-4287-86d2-87445e1e0f90", - "15ddd28d-8445-4baa-ac7f-f41372e1344e", - "24d1e93f-9018-45d4-b522-ad89dfd78079", - "05389f78-589a-473c-a4ae-67c61050bfca" - ] - }, - "Child, Family, and School Social Workers": { - "percentage": 0.022727, - "task_ids": [ - "575f8679-b4c1-47a2-8e96-d570d4ed9269", - "a74ead3b-f67d-4b1c-9116-f6bb81b29d4f", - "bbe0a93b-ebf0-40b0-98dc-8d9243099034", - "85d95ce5-b20c-41e2-834e-e788ce9622b6", - "76d10872-9ffa-4ede-83ee-e0f1ec5e2b8d" - ] - }, - "Compliance Officers": { - "percentage": 0.022727, - "task_ids": [ - "36d567ba-e205-4313-9756-931c6e4691fe", - "7bbfcfe9-132d-4194-82bb-d6f29d001b01", - "2696757c-1f8a-4959-8f0d-f5597b9e70fc", - "dfb4e0cd-a0b7-454e-b943-0dd586c2764c", - "4c18ebae-dfaa-4b76-b10c-61fcdf26734c" - ] - }, - "Computer and Information Systems Managers": { - "percentage": 0.022727, - "task_ids": [ - "cebf301e-5ea7-41ae-b117-ad8f43e7ac22", - "c2e8f271-7858-412f-b460-472463ad81d9", - "2ea2e5b5-257f-42e6-a7dc-93763f28b19d", - "c357f0e2-963d-4eb7-a6fa-3078fe55b3ba", - "a45bc83b-22f9-4def-8d89-9c5661b2b86f" - ] - }, - "Concierges": { - 
"percentage": 0.022727, - "task_ids": [ - "a10ec48c-168e-476c-8fe3-23b2a5f616ac", - "fccaa4a1-1c39-49ac-b701-55361a19966b", - "f5d428fd-b38e-41f0-8783-35423dab80f6", - "2fa8e956-7b35-4c13-95dc-027f02be318b", - "0e4fe8cd-16d0-4f41-8247-6385b4762582" - ] - }, - "Counter and Rental Clerks": { - "percentage": 0.022727, - "task_ids": [ - "a0ef404e-82a6-4507-bff1-633d7c8e0004", - "b7a5912e-0e63-41f5-8c22-9cdb8f46ab01", - "aa071045-bcb0-4164-bb85-97245d56287e", - "476db143-163a-4537-9e21-fe46adad703b", - "61f546a8-c374-467f-95cc-d0d9b5656eb6" - ] - }, - "Customer Service Representatives": { - "percentage": 0.022727, - "task_ids": [ - "f3351922-dbdd-45da-85c5-e7110696bbe5", - "61717508-4df7-41be-bf97-318dfb2475c0", - "0ed38524-a4ad-405f-9dee-7b2252659aad", - "87da214f-fd92-4c58-9854-f4d0d10adce0", - "d025a41c-c439-4ee1-bc79-dd5c94b27a2d" - ] - }, - "Editors": { - "percentage": 0.022727, - "task_ids": [ - "401a07f1-d57e-4bb0-889b-22de8c900f0e", - "afe56d05-dac8-47d7-a233-ad1d035ca5bd", - "9a8c8e28-ce76-408b-83c3-488422892e58", - "3a4c347c-4aec-43c7-9a54-eb1f816ab1f9", - "ec2fccc9-b7f6-4c73-bf51-896fdb433cec" - ] - }, - "Film and Video Editors": { - "percentage": 0.022727, - "task_ids": [ - "8c8fc328-69fc-4559-a13f-82087baef0a1", - "e222075d-5d62-4757-ae3c-e34b0846583b", - "c94452e4-39cd-4846-b73a-ab75933d1ad7", - "75401f7c-396d-406d-b08e-938874ad1045", - "a941b6d8-4289-4500-b45a-f8e4fc94a724" - ] - }, - "Financial Managers": { - "percentage": 0.022727, - "task_ids": [ - "a1963a68-1bea-4bb1-b7e0-145c92a57449", - "5f6c57dd-feb6-4e70-b152-4969d92d1608", - "b39a5aa7-cd1b-47ad-b249-90afd22f8f21", - "b78fd844-db76-448e-a783-5e9877cb74c2", - "4520f882-715a-482d-8e87-1cb3cbdfe975" - ] - }, - "Financial and Investment Analysts": { - "percentage": 0.022727, - "task_ids": [ - "8079e27d-b6f3-4f75-a9b5-db27903c798d", - "e21cd746-404d-4602-b9d2-01d2812c5b87", - "9e8607e7-a38a-491f-ace1-e5ea7dc477cb", - "c7d83f01-2874-4876-b7fd-52582ec99e1a", - "46b34f78-6c06-4416-87e2-77b6d8b20ce9" - ] - 
}, - "First-Line Supervisors of Non-Retail Sales Workers": { - "percentage": 0.022727, - "task_ids": [ - "ec591973-04d5-48c0-981c-1ab2fcec2dc1", - "62f04c2f-e0f7-4710-876c-54ee9c2e8256", - "3f821c2d-ab97-46ec-a0fb-b8f73c2682bc", - "e996036e-8287-4e7f-8d0a-90a57cb53c45", - "327fbc21-7d26-4964-bf7c-f4f41e55c54d" - ] - }, - "First-Line Supervisors of Office and Administrative Support Workers": { - "percentage": 0.022727, - "task_ids": [ - "6dcae3f5-bf1c-48e0-8b4b-23e6486a934c", - "1aecc095-4d76-4b89-b752-1a0f870502cd", - "0353ee0c-18b5-4ad3-88e8-e001d223e1d7", - "40a8c4b1-b169-4f92-a38b-7f79685037ec", - "4d1a8410-e9c5-4be5-ab43-cc55563c594c" - ] - }, - "First-Line Supervisors of Police and Detectives": { - "percentage": 0.022727, - "task_ids": [ - "8c823e32-537c-42b2-84ba-635d63c2853a", - "eb54f575-93f9-408b-b9e0-f1208a0b6759", - "11e1b169-5fb6-4d79-8a83-82ddf4987a85", - "a95a5829-34bb-40f3-993b-558aed6dcdef", - "22c0809b-f8db-489e-93b3-b4da225e3e0e" - ] - }, - "First-Line Supervisors of Production and Operating Workers": { - "percentage": 0.022727, - "task_ids": [ - "bf68f2ad-eac5-490a-adec-d847eb45bd6f", - "efca245f-c24f-4f75-a9d5-59201330ab7a", - "9e39df84-ac57-4c9b-a2e3-12b8abf2c797", - "68d8d901-dd0b-4a7e-bf9a-1074fddf1a96", - "1752cb53-5983-46b6-92ee-58ac85a11283" - ] - }, - "First-Line Supervisors of Retail Sales Workers": { - "percentage": 0.022727, - "task_ids": [ - "bd72994f-5659-4084-9fab-fc547d1efe3b", - "211d0093-2c64-4bd0-828c-0201f18924e7", - "d4525420-a427-4ef2-b4e9-2dcc2d31b3b6", - "45c6237b-f9c9-4526-9a8d-6a5c404624ec", - "cecac8f9-8203-4ebd-ad49-54436a8c4171" - ] - }, - "General and Operations Managers": { - "percentage": 0.022727, - "task_ids": [ - "8f9e8bcd-6102-40da-ab76-23f51d8b21fa", - "0fad6023-767b-42c1-a1b3-027cd4f583cb", - "02314fc6-a24e-42f4-a8cd-362cae0f0ec1", - "4d61a19a-8438-4d4c-9fc2-cf167e36dcd6", - "6436ff9e-c5f2-47ba-9aaa-49d89b0594ab" - ] - }, - "Industrial Engineers": { - "percentage": 0.022727, - "task_ids": [ - 
"8a7b6fca-60cc-4ae3-b649-971753cbf8b9", - "40a99a31-42d6-4f23-b3ec-8f591afe25b6", - "b9665ca1-4da4-4ff9-86f2-40b9a8683048", - "c6269101-fdc8-4602-b345-eac7597c0c81", - "be830ca0-b352-4658-a5bd-57139d6780ba" - ] - }, - "Lawyers": { - "percentage": 0.022727, - "task_ids": [ - "cd9efc18-d14a-4f69-8531-5d178a08084d", - "a97369c7-e5cf-40ca-99e8-d06f81c57d53", - "3f625cb2-f40e-4ead-8a97-6924356d5989", - "aad21e4c-1d43-45fc-899a-97754a1b1b63", - "8314d1b1-5b0f-42a4-b5d5-91c0867b0913" - ] - }, - "Mechanical Engineers": { - "percentage": 0.022727, - "task_ids": [ - "5e2b6aab-f9fb-4dd6-a1a5-874ef1743909", - "46fc494e-a24f-45ce-b099-851d5c181fd4", - "3940b7e7-ec4f-4cea-8097-3ab4cfdcaaa6", - "8077e700-2b31-402d-bd09-df4d33c39653", - "5a2d70da-0a42-4a6b-a3ca-763e03f070a5" - ] - }, - "Medical Secretaries and Administrative Assistants": { - "percentage": 0.022727, - "task_ids": [ - "f1be6436-ffff-4fee-9e66-d550291a1735", - "41f6ef59-88c9-4b2c-bcc7-9ceb88422f48", - "a0552909-bc66-4a3a-8970-ee0d17b49718", - "6d2c8e55-fe20-45c6-bdaf-93e676868503", - "4b98ccce-9e42-44e9-9115-6fc3e79de288" - ] - }, - "Medical and Health Services Managers": { - "percentage": 0.022727, - "task_ids": [ - "74d6e8b0-f334-4e7e-af55-c095d5d4d1a6", - "81db15ff-ceea-4f63-a1cd-06dc88114709", - "61b0946a-5c1c-4bf6-8607-84d7c7e0dfe0", - "61e7b9c6-0051-429f-a341-fda9b6578a84", - "c9bf9801-9640-45fa-8166-1ab01f2d98e4" - ] - }, - "News Analysts, Reporters, and Journalists": { - "percentage": 0.022727, - "task_ids": [ - "60221cd0-686e-4a08-985e-d9bb2fa18501", - "ef8719da-18e5-4bfe-b986-399652d77376", - "3baa0009-5a60-4ae8-ae99-4955cb328ff3", - "5d0feb24-e8b6-4ace-b64f-d5cd1a8b563d", - "6974adea-8326-43fa-8187-2724b15d9546" - ] - }, - "Nurse Practitioners": { - "percentage": 0.022727, - "task_ids": [ - "1a78e076-445e-4c5d-b8ce-387d2fe5e715", - "1b9ec237-bf9c-41f9-8fa9-0e685fcd93c6", - "0112fc9b-c3b2-4084-8993-5a4abb1f54f1", - "772e7524-174e-4c88-957e-6e510b61ea69", - "e6429658-4de1-42dd-a9e0-2d2b9b02fb10" - ] - }, - 
"Order Clerks": { - "percentage": 0.022727, - "task_ids": [ - "b5d2e6f1-62a2-433a-bcdd-95b260cdd860", - "f841ddcf-2a28-4f6d-bac3-61b607219d3e", - "47ef842d-8eac-4b90-bda8-dd934c228c96", - "1137e2bb-bdf9-4876-b572-f29b7de5e595", - "c3525d4d-2012-45df-853e-2d2a0e902991" - ] - }, - "Personal Financial Advisors": { - "percentage": 0.022727, - "task_ids": [ - "9a0d8d36-6233-4c76-9107-0d1f783c7340", - "664a42e5-3240-413a-9a57-ea93c6303269", - "feb5eefc-39f1-4451-9ef9-bffe011b71dd", - "3600de06-3f71-4e48-9480-e4828c579924", - "c657103b-b348-4496-a848-b2b7165d28b2" - ] - }, - "Pharmacists": { - "percentage": 0.022727, - "task_ids": [ - "91060ff0-3eb5-4ddf-9edb-f6758b95499e", - "8384083a-c31b-4194-80ba-4d335a444918", - "045aba2e-4093-42aa-ab7f-159cc538278c", - "f2986c1f-2bbf-4b83-bc93-624a9d617f45", - "ffed32d8-d192-4e3f-8cd4-eda5a730aec3" - ] - }, - "Private Detectives and Investigators": { - "percentage": 0.022727, - "task_ids": [ - "ae0c1093-5ea8-4b84-a81e-53ebf7a4321d", - "f9f82549-fdde-4462-aff8-e70fba5b8c66", - "57b2cdf2-ad62-4591-aa91-aad489740320", - "84322284-5c2c-4873-b507-b147449d209d", - "a46d5cd2-55fe-48fa-a4c6-6aaf6b9991b5" - ] - }, - "Producers and Directors": { - "percentage": 0.022727, - "task_ids": [ - "6241e678-4ba3-4831-b3c7-78412697febc", - "e14e32ba-d310-4d45-9b8a-6d73d0ece1ae", - "b1a79ce1-86b0-41fb-97dc-9206dfd7b044", - "e4f664ea-0e5c-4e4e-a0d3-a87a33da947a", - "a079d38f-c529-436a-beca-3e291f9e62a3" - ] - }, - "Project Management Specialists": { - "percentage": 0.022727, - "task_ids": [ - "02aa1805-c658-4069-8a6a-02dec146063a", - "fd6129bd-f095-429b-873c-dcc3137be2c3", - "ce864f41-8584-49ba-b24f-9c9104b47bf0", - "58ac1cc5-5754-4580-8c9c-8c67e1a9d619", - "3c19c6d1-672c-467a-8437-6fe21afb8eae" - ] - }, - "Property, Real Estate, and Community Association Managers": { - "percentage": 0.022727, - "task_ids": [ - "a99d85fc-eff8-48d2-a7d4-42a75d62f18d", - "55ddb773-23a4-454c-8704-d432fe1b99d9", - "1e5a1d7f-12c1-48c6-afd9-82257b3f2409", - 
"0419f1c3-d669-45d0-81cd-f4d5923b06a5", - "ed2bc14c-99ac-4a2a-8467-482a1a5d67f3" - ] - }, - "Real Estate Brokers": { - "percentage": 0.022727, - "task_ids": [ - "46bc7238-3501-4839-b989-e2bd47853676", - "2d06bc0a-89c6-4e89-9417-5ffe725c1bc6", - "fd3ad420-6f7d-43b1-a990-c0c5c047d071", - "0818571f-5ff7-4d39-9d2c-ced5ae44299e", - "6074bba3-7e3a-4b1c-b8c6-a15bb6695c3b" - ] - }, - "Real Estate Sales Agents": { - "percentage": 0.022727, - "task_ids": [ - "5ad0c554-a7a2-48cd-b41a-ebc1bff4a9de", - "11593a50-734d-4449-b5b4-f8986a133fd8", - "94925f49-36bc-42da-b45b-61078d329300", - "90f37ff3-e4ed-4a0b-94bb-bed0f7def1ef", - "d3d255b2-f5f2-4841-9f62-2083ec9ef3da" - ] - }, - "Recreation Workers": { - "percentage": 0.022727, - "task_ids": [ - "403b9234-6299-4b5f-a106-70c1bc11ec4c", - "1bff4551-1d54-4e37-b2e0-d5c3f2ea4a45", - "650adcb1-ed19-4f88-8117-77640f7b94b6", - "01d7e53e-0513-4109-a242-8ccaf442cd21", - "a73fbc98-90d4-4134-a54f-2b1d0c838791" - ] - }, - "Registered Nurses": { - "percentage": 0.022727, - "task_ids": [ - "0ec25916-1b5c-4bfe-93d3-4e103d860f3a", - "116e791e-890c-42b1-ba90-1db02e8bfd45", - "dd724c67-8118-4b99-ab50-4761af705c3b", - "7151c60a-d4cb-4fc4-8169-3d4cb446e6b9", - "90edba97-74f0-425a-8ff6-8b93182eb7cb" - ] - }, - "Sales Managers": { - "percentage": 0.022727, - "task_ids": [ - "b3573f20-5d3e-4954-948f-9461fda693d2", - "a69be28f-9a84-47c9-992e-b90446cdca9d", - "788d2bc6-82df-4dc7-8467-a0f31405dc14", - "74ed1dc7-1468-48a8-9071-58775c0d667a", - "69a8ef86-4e69-4fe2-9168-080f1e978e67" - ] - }, - "Sales Representatives, Wholesale and Manufacturing, Except Technical and Scientific Products": { - "percentage": 0.022727, - "task_ids": [ - "ab81b076-e5d8-473a-9bdb-7ea7c38f6ebc", - "d7cfae6f-4a82-4289-955e-c799dfe1e0f4", - "19403010-3e5c-494e-a6d3-13594e99f6af", - "7ed932dd-244f-4d61-bf02-1bc3bab1af14", - "105f8ad0-8dd2-422f-9e88-2be5fbd2b215" - ] - }, - "Sales Representatives, Wholesale and Manufacturing, Technical and Scientific Products": { - "percentage": 
0.022727, - "task_ids": [ - "b57efde3-26d6-4742-bbff-2b63c43b4baa", - "15d37511-75c5-4c7f-81f1-16e00c0d95f3", - "bb863dd9-31c2-4f64-911a-ce11f457143b", - "fe0d3941-e32c-4bf1-a643-b566d2b4cb3c", - "6a900a40-8d2b-4064-a5b1-13a60bc173d8" - ] - }, - "Securities, Commodities, and Financial Services Sales Agents": { - "percentage": 0.022727, - "task_ids": [ - "9efbcd35-186d-49b6-ac24-28ee2bc9a263", - "1d4672c8-b0a7-488f-905f-9ab4e25a19f7", - "4de6a529-4f61-41a1-b2dc-64951ba03457", - "4c4dc603-c21c-4284-8fb1-1b827c1fddf4", - "bb499d9c-0263-4684-9238-75e8e86077b1" - ] - }, - "Shipping, Receiving, and Inventory Clerks": { - "percentage": 0.022727, - "task_ids": [ - "5349dd7b-bf0a-4544-9a17-75b7013767e6", - "a4a9195c-5ebe-4b8d-a0c2-4a6b7a49da8b", - "552b7dd0-96f4-437c-a749-0691e0e4b381", - "11dcc268-cb07-4d3a-a184-c6d7a19349bc", - "76418a2c-a3c0-4894-b89d-2493369135d9" - ] - }, - "Software Developers": { - "percentage": 0.022727, - "task_ids": [ - "0e386e32-df20-4d1f-b536-7159bc409ad5", - "7de33b48-5163-4f50-b5f3-8deea8185e57", - "854f3814-681c-4950-91ac-55b0db0e3781", - "4122f866-01fa-400b-904d-fa171cdab7c7", - "2c249e0f-4a8c-4f8e-b4f4-6508ba29b34f" - ] - } -} From 85ef58b756c8fa6eec6d932472a118a6d036d8ed Mon Sep 17 00:00:00 2001 From: Virginia Wu Date: Thu, 25 Jun 2026 23:19:04 -0700 Subject: [PATCH 5/8] added multistage elo estimation Signed-off-by: Virginia Wu --- resources_servers/gdpval/app.py | 15 +- resources_servers/gdpval/multistage_elo.py | 305 +++++++++ .../gdpval/multistage_elo_driver.py | 613 ++++++++++++++++++ resources_servers/gdpval/tests/test_app.py | 68 ++ .../gdpval/tests/test_multistage_elo.py | 185 ++++++ .../tests/test_multistage_elo_driver.py | 429 ++++++++++++ 6 files changed, 1614 insertions(+), 1 deletion(-) create mode 100644 resources_servers/gdpval/multistage_elo.py create mode 100644 resources_servers/gdpval/multistage_elo_driver.py create mode 100644 resources_servers/gdpval/tests/test_multistage_elo.py create mode 100644 
resources_servers/gdpval/tests/test_multistage_elo_driver.py diff --git a/resources_servers/gdpval/app.py b/resources_servers/gdpval/app.py index 18463586a..6da29d77b 100644 --- a/resources_servers/gdpval/app.py +++ b/resources_servers/gdpval/app.py @@ -184,6 +184,12 @@ class GDPValVerifyRequest(BaseVerifyRequest): rubric_pretty: Optional[str] = None reference_file_urls: Optional[List[str]] = None deliverables_dir: Optional[str] = None + # Optional per-request filter (comparison mode): judge the eval deliverable + # only against this subset of the configured ``reference_models``. Unknown + # ids are ignored; ``None`` (default) judges against every configured + # reference. Used by the multi-stage ELO driver to select a different set of + # reference models per judgement stage without reconfiguring the server. + reference_ids: Optional[List[str]] = None class GDPValVerifyResponse(GDPValVerifyRequest, BaseVerifyResponse): @@ -369,11 +375,18 @@ async def _verify_comparison(self, body: GDPValVerifyRequest) -> GDPValVerifyRes eval_task_dir = Path(body.deliverables_dir) if body.deliverables_dir else None + # Optional per-request reference subset (multi-stage ELO). When set, only + # the named references are judged this call; unknown ids are ignored. + active_references = self._references + if body.reference_ids is not None: + requested = set(body.reference_ids) + active_references = {rid: cfg for rid, cfg in self._references.items() if rid in requested} + # Resolve, per reference model, the available (attempted) repeat dirs # for this task. A reference that has no deliverable for this task is # simply skipped — the eval model just isn't judged against it here.
ref_dirs_by_id: Dict[str, List[Path]] = {} - for ref_id, ref_cfg in self._references.items(): + for ref_id, ref_cfg in active_references.items(): ref_task_root = Path(ref_cfg.deliverables_dir) / f"task_{body.task_id}" dirs = [d for d in _iter_ref_repeat_dirs(ref_task_root) if task_attempted(str(d))] if dirs: diff --git a/resources_servers/gdpval/multistage_elo.py b/resources_servers/gdpval/multistage_elo.py new file mode 100644 index 000000000..a8f4fccc1 --- /dev/null +++ b/resources_servers/gdpval/multistage_elo.py @@ -0,0 +1,305 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Multi-stage adaptive ELO estimation for GDPVal pairwise comparison. + +Instead of comparing the evaluated model against every reference model on all +tasks, this runs a sequence of *stages*. Each stage: + +1. fixes a set of ``T`` tasks sampled from a task-distribution JSON file (see + ``responses_api_agents.stirrup_agent.task_distribution``), +2. judges the evaluated model against a set of ``M`` reference models on those + tasks (delegated to an injected ``judge_stage`` callable), +3. fits an anchored Bradley-Terry MLE ELO from that stage's win/loss/tie + battles (reusing ``comparison.calculate_mle_elo``), and +4. uses that estimate to choose the ``M`` references for the next stage. 
+ +Across stages, ``M`` typically shrinks (zooming in on references whose known +ELO is closest to the evaluated model's current estimate) while ``T`` grows +(spending the saved judge budget on a tighter final estimate). + +This module is intentionally **pure / server-agnostic**: the actual judging +(running rollouts, calling ``/verify``, reading cached deliverables) is supplied +by the caller as a ``judge_stage`` callable, so the staging/selection/ELO logic +is unit-testable without any servers. The orchestration that wires this to the +GDPVal servers lives in the driver (see the module docstring there). +""" + +from __future__ import annotations + +import random +from dataclasses import dataclass, field +from typing import Callable, Dict, List, Mapping, Optional, Sequence + +from resources_servers.gdpval.comparison import calculate_mle_elo + + +# A mapping ``ref_id -> {"wins": int, "losses": int, "ties": int, +# "reference_elo": float}`` as produced (per task, then pooled) by the GDPVal +# comparison verifier. This is the unit the ELO MLE is fit over. +PerReferenceTotals = Dict[str, Dict[str, float]] + +# Signature of the injected judging step. Given the stage's fixed task ids and +# the selected reference ids, return pooled per-reference win/loss/tie totals +# for the evaluated model across those tasks. +JudgeStageFn = Callable[[Sequence[str], Sequence[str]], PerReferenceTotals] + + +@dataclass +class StageSpec: + """Configuration for a single stage. + + ``num_tasks`` is ``T`` (the number of tasks judged this stage). ``num_models`` + is ``M`` (the number of reference models compared against); ``None`` means + "all available references" (used for the first, broad stage). ``seed`` makes + task sampling for this stage reproducible. 
+ """ + + num_tasks: int + num_models: Optional[int] = None + seed: Optional[int] = None + + +@dataclass +class StageResult: + """Outcome of one stage.""" + + stage_index: int + task_ids: List[str] + reference_ids: List[str] + per_reference: PerReferenceTotals + eval_elo: Optional[float] + normalized_elo: Optional[float] + # Number of reference models included in this stage's ELO fit. + num_references: int + + +@dataclass +class MultiStageEloConfig: + """End-to-end configuration for a multi-stage ELO run.""" + + stages: List[StageSpec] + # ref_id -> known/anchor ELO. Both the MLE (anchors) and reference selection + # ("closest to the eval estimate") require these. + reference_elos: Dict[str, float] + + # Task distribution source. When ``distribution_path`` is unset (or missing), + # the driver builds a distribution from ``dataset_path`` (or the default + # GDPVal dataset) grouped by ``column`` and caches it. See + # ``multistage_elo_driver.ensure_distribution``. + distribution_path: Optional[str] = None + dataset_path: Optional[str] = None + + # Eval deliverables source. When set, pre-existing cached deliverables under + # this directory (``task_/repeat_/``) are reused instead of producing + # fresh rollouts. ``produce_missing`` controls whether tasks absent from the + # cache are produced on demand (True) or dropped from the stage (False). + eval_deliverables_dir: Optional[str] = None + produce_missing: bool = True + + # Sampling behaviour across stages. ``nested=True`` makes each stage's task set + # a superset of the previous stage's, which is cheaper (reuses produced + # deliverables and judgments) but couples the stages' samples. The default + # (False) samples each stage independently: later stages draw fresh tasks, so + # the stages contribute more independent information to the ELO estimate. 
def select_references(
    reference_elos: Mapping[str, float],
    eval_elo: Optional[float],
    num_models: Optional[int],
) -> List[str]:
    """Pick the reference ids a stage should be judged against.

    Args:
        reference_elos: ``ref_id -> anchor ELO`` for every known reference.
        eval_elo: Current ELO estimate of the evaluated model, or ``None``
            when no estimate exists yet.
        num_models: How many references to keep; ``None`` means "all".

    Returns:
        Reference ids in ascending id order. Every id is returned when either
        ``num_models`` or ``eval_elo`` is ``None``, or when ``num_models`` is at
        least the number of references. A non-positive ``num_models`` yields an
        empty list. Otherwise the ``num_models`` references whose anchor ELO is
        nearest to ``eval_elo`` are returned, with equal distances resolved by
        id so the choice is deterministic.
    """
    candidates = sorted(reference_elos)
    keep_everything = num_models is None or eval_elo is None
    if keep_everything or num_models >= len(candidates):
        return candidates
    if num_models <= 0:
        return []

    def distance_then_id(ref_id: str):
        # Primary key: gap to the running estimate; secondary key: the id.
        return abs(reference_elos[ref_id] - eval_elo), ref_id

    candidates.sort(key=distance_then_id)
    nearest = candidates[:num_models]
    # Report in id order (not distance order) so output is stable to read.
    nearest.sort()
    return nearest
def fit_stage_elo(
    per_reference: Mapping[str, Mapping[str, float]],
    reference_elos: Mapping[str, float],
) -> tuple[Optional[float], Optional[float], int]:
    """Estimate the evaluated model's ELO for one stage.

    Each entry of ``per_reference`` carries win/loss/tie counts against one
    reference. An entry contributes a battle only when an anchor ELO is known
    for it (looked up in ``reference_elos`` first, else the ``reference_elo``
    stored on the counts) and it has a positive number of games.

    Args:
        per_reference: ``ref_id -> {"wins", "losses", "ties", "reference_elo"}``.
        reference_elos: ``ref_id -> anchor ELO``.

    Returns:
        ``(elo, normalized_elo, num_references)``. ``num_references`` counts
        the battles handed to ``calculate_mle_elo``. Both ratings are ``None``
        when nothing qualified (count is 0) or when ``calculate_mle_elo``
        returned ``None`` (count stays positive).
    """

    def as_games(counts: Mapping[str, float], outcome: str) -> float:
        # Missing, ``None`` and zero counts all collapse to 0.0.
        return float(counts.get(outcome, 0) or 0)

    qualifying: List[tuple[float, float, float, float]] = []
    for ref_id, counts in per_reference.items():
        recorded_anchor = counts.get("reference_elo")
        anchor = reference_elos.get(ref_id, recorded_anchor)
        if anchor is None:
            continue
        wins = as_games(counts, "wins")
        losses = as_games(counts, "losses")
        ties = as_games(counts, "ties")
        if wins + losses + ties <= 0:
            continue
        qualifying.append((float(anchor), wins, losses, ties))

    used = len(qualifying)
    if used == 0:
        return None, None, 0

    fitted = calculate_mle_elo(qualifying)
    if fitted is None:
        return None, None, used
    rating, normalized_rating = fitted
    return rating, normalized_rating, used
+ """ + + def __init__( + self, + config: MultiStageEloConfig, + distribution: Mapping[str, Mapping[str, object]], + judge_stage: JudgeStageFn, + *, + rng: Optional[random.Random] = None, + ) -> None: + self.config = config + self.distribution = distribution + self.judge_stage = judge_stage + self.rng = rng or random.Random() + + def run(self) -> List[StageResult]: + stage_task_sets = plan_stage_task_ids( + self.distribution, + self.config.stages, + rng=self.rng, + nested=self.config.nested_tasks, + ) + + results: List[StageResult] = [] + eval_elo: Optional[float] = None + for index, stage in enumerate(self.config.stages): + reference_ids = select_references(self.config.reference_elos, eval_elo, stage.num_models) + task_ids = stage_task_sets[index] + per_reference = self.judge_stage(task_ids, reference_ids) + stage_elo, normalized, num_references = fit_stage_elo(per_reference, self.config.reference_elos) + if stage_elo is not None: + eval_elo = stage_elo + results.append( + StageResult( + stage_index=index, + task_ids=list(task_ids), + reference_ids=list(reference_ids), + per_reference=dict(per_reference), + eval_elo=stage_elo, + normalized_elo=normalized, + num_references=num_references, + ) + ) + return results diff --git a/resources_servers/gdpval/multistage_elo_driver.py b/resources_servers/gdpval/multistage_elo_driver.py new file mode 100644 index 000000000..310c4d1d4 --- /dev/null +++ b/resources_servers/gdpval/multistage_elo_driver.py @@ -0,0 +1,613 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +"""Driver that wires the multi-stage ELO logic to the GDPVal comparison server. + +This composes the pure staging logic in ``multistage_elo`` with the GDPVal +resources server's ``/verify`` (comparison mode). For each stage it: + +1. asks the runner to select the stage's references (closest known ELO to the + current estimate) and fix the stage's sampled tasks, +2. judges the evaluated model's cached deliverables against that reference + subset, one ``/verify`` call per (task, repeat) with the per-request + ``reference_ids`` filter, +3. pools the per-reference win/loss/tie votes and fits the stage ELO. + +The evaluated model's deliverables are read from a directory laid out as +``{eval_deliverables_dir}/task_{task_id}/repeat_{n}/`` (the same layout the Stirrup +agent persists). Point ``eval_deliverables_dir`` at deliverables produced by an +earlier run to score them with **zero rollouts**. Tasks missing from the cache +are either produced on demand via an injected ``producer`` callback or reported, +controlled by ``produce_missing``. + +The judging primitive ``verify_one`` is injected so the orchestration is +testable without a running server; ``make_http_verify_one`` provides the real +implementation that POSTs to the resources server. + +CLI usage (run from the repo root, against a running comparison server):: + + python -m resources_servers.gdpval.multistage_elo_driver \\ + --server-url http://localhost:8000 \\ + --eval-deliverables-dir /path/to/eval/deliverables \\ + --reference-elos '@refs.json' \\ + --stage 5 --stage 88:4 \\ + --output elo_summary.json + +where ``refs.json`` is ``{"ref_id": elo, ...}`` with ids matching the +server's configured ``reference_models``. See ``--help`` for all flags.
def load_distribution(path: str | Path) -> Dict[str, Dict[str, Any]]:
    """Read a task-distribution JSON file and return its top-level object.

    Args:
        path: Location of the JSON file (as written by ``task_distribution.py``).

    Returns:
        The decoded JSON object.

    Raises:
        ValueError: If the file decodes to anything other than a JSON object.
    """
    decoded = json.loads(Path(path).read_text(encoding="utf-8"))
    if isinstance(decoded, dict):
        return decoded
    raise ValueError(f"Distribution file {path} must be a JSON object.")
def load_task_prompts(jsonl_path: str | Path) -> Dict[str, str]:
    """Build a ``task_id -> prompt`` lookup from a benchmark JSONL file.

    Each non-blank line is decoded as one JSON record. ``task_id`` and
    ``prompt`` are taken from the record's top level when truthy there, and
    otherwise from ``responses_create_params.metadata``. Records for which no
    task id is found are skipped; a missing or empty prompt is stored as ``""``.

    Args:
        jsonl_path: Path to the JSONL file.

    Returns:
        Mapping from stringified task id to prompt text. Later lines overwrite
        earlier ones that share a task id.
    """

    def top_level_or_metadata(record, metadata, key):
        # Top-level value wins when truthy; otherwise use the nested one.
        return record.get(key) or metadata.get(key)

    by_task: Dict[str, str] = {}
    with Path(jsonl_path).open("r", encoding="utf-8") as stream:
        for raw_line in stream:
            text = raw_line.strip()
            if text:
                record = json.loads(text)
                create_params = record.get("responses_create_params") or {}
                metadata = create_params.get("metadata") or {}
                task_id = top_level_or_metadata(record, metadata, "task_id")
                if task_id is not None:
                    by_task[str(task_id)] = top_level_or_metadata(record, metadata, "prompt") or ""
    return by_task
def pool_per_reference(verify_responses: Sequence[Mapping[str, Any]]) -> PerReferenceTotals:
    """Accumulate per-reference win/loss/tie counts over several verify responses.

    Args:
        verify_responses: Verify response mappings; each may carry a
            ``per_reference`` mapping of ``ref_id -> counts``. A missing or
            falsy ``per_reference`` contributes nothing.

    Returns:
        ``ref_id -> {"wins", "losses", "ties", "reference_elo"}``, where the
        three counts are integer sums (missing/``None`` counts add 0) and
        ``reference_elo`` is the first non-``None`` value seen for that
        reference, or ``None`` if none was seen.
    """
    pooled: PerReferenceTotals = {}
    for response in verify_responses:
        for ref_id, counts in (response.get("per_reference") or {}).items():
            if ref_id not in pooled:
                pooled[ref_id] = {"wins": 0, "losses": 0, "ties": 0, "reference_elo": None}
            bucket = pooled[ref_id]
            for outcome in ("wins", "losses", "ties"):
                bucket[outcome] += int(counts.get(outcome, 0) or 0)
            # Keep the first anchor ELO that was actually reported.
            if bucket["reference_elo"] is None:
                bucket["reference_elo"] = counts.get("reference_elo")
    return pooled
--------------------------------------------------------------------------- +# judge_stage builder +# --------------------------------------------------------------------------- + + +def build_judge_stage( + verify_one: VerifyOneFn, + eval_deliverables_dir: str | Path, + task_prompts: Mapping[str, str], + *, + produce_missing: bool = True, + producer: Optional[ProducerFn] = None, +): + """Build the ``judge_stage`` callable expected by ``MultiStageEloRunner``. + + For each stage's tasks, judges the cached eval deliverables against the + selected references (one ``verify_one`` call per task-repeat) and pools the + per-reference votes. Missing tasks are produced via ``producer`` when given; + otherwise ``produce_missing=True`` raises an actionable error and + ``produce_missing=False`` drops them with a warning. + """ + + def judge_stage(task_ids: Sequence[str], reference_ids: Sequence[str]) -> PerReferenceTotals: + present, missing = check_coverage(eval_deliverables_dir, task_ids) + if missing: + if producer is not None: + producer(missing) + present, missing = check_coverage(eval_deliverables_dir, task_ids) + if missing and produce_missing and producer is None: + raise FileNotFoundError( + f"{len(missing)} task(s) have no cached eval deliverable under " + f"{eval_deliverables_dir} (e.g. {missing[:3]}). Produce them first with an " + f"execute_only run, pass a producer, or set produce_missing=False to skip them." + ) + if missing: + print( + f"[multistage-elo] WARNING: skipping {len(missing)} task(s) with no cached " + f"deliverable (e.g. 
def build_verify_request_body(
    task_id: str,
    deliverables_dir: str,
    prompt: str,
    reference_ids: Sequence[str],
    *,
    model: str = "eval",
) -> Dict[str, Any]:
    """Assemble the JSON body for a comparison-mode ``/verify`` request.

    The ``response`` member is a fixed placeholder with empty ``output`` and
    ``tools``; the fields that vary per call are the task id, prompt,
    deliverables directory and reference-id filter.

    Args:
        task_id: Task being judged; also embedded in the placeholder response id.
        deliverables_dir: Directory holding the evaluated model's deliverables.
        prompt: Task prompt text.
        reference_ids: Reference ids to judge against (copied into a new list).
        model: Model label written to both ``responses_create_params`` and the
            placeholder response.

    Returns:
        A new dict ready to be JSON-encoded.
    """
    placeholder_response: Dict[str, Any] = {
        "id": f"multistage-{task_id}",
        "created_at": 0,
        "model": model,
        "object": "response",
        "output": [],
        "parallel_tool_calls": False,
        "tool_choice": "none",
        "tools": [],
    }
    request: Dict[str, Any] = {
        "responses_create_params": {"input": [], "model": model},
        "response": placeholder_response,
    }
    # Appended in this order so the serialized key order is unchanged.
    request["task_id"] = task_id
    request["prompt"] = prompt
    request["deliverables_dir"] = deliverables_dir
    request["reference_ids"] = list(reference_ids)
    return request
+ """ + import urllib.request + + endpoint = server_url.rstrip("/") + "/verify" + + def verify_one(task_id: str, deliverables_dir: str, prompt: str, reference_ids: Sequence[str]) -> Dict[str, Any]: + body = build_verify_request_body(task_id, deliverables_dir, prompt, reference_ids, model=model) + data = json.dumps(body).encode("utf-8") + req = urllib.request.Request(endpoint, data=data, headers={"Content-Type": "application/json"}) + with urllib.request.urlopen(req, timeout=timeout) as resp: + return json.loads(resp.read().decode("utf-8")) + + return verify_one + + +# --------------------------------------------------------------------------- +# Top-level run +# --------------------------------------------------------------------------- + + +def run_multistage_elo( + config: MultiStageEloConfig, + verify_one: VerifyOneFn, + task_prompts: Mapping[str, str], + *, + rng=None, + producer: Optional[ProducerFn] = None, +) -> List[StageResult]: + """Run the full multi-stage ELO procedure and return per-stage results. + + ``config.eval_deliverables_dir`` must be set — it is the source of the eval + model's (cached or produced) deliverables. 
def stage_results_to_dict(results: Sequence[StageResult]) -> Dict[str, Any]:
    """Turn a list of stage results into a JSON-serializable summary.

    Args:
        results: Stage results in execution order.

    Returns:
        A dict with the last stage's ``eval_elo`` / ``normalized_elo`` as the
        headline values (``None`` when ``results`` is empty), the number of
        stages, and one entry per stage under ``"stages"``.
    """

    def describe(stage) -> Dict[str, Any]:
        # One entry of the "stages" list.
        return {
            "stage_index": stage.stage_index,
            "num_tasks": len(stage.task_ids),
            "reference_ids": stage.reference_ids,
            "eval_elo": stage.eval_elo,
            "normalized_elo": stage.normalized_elo,
            "num_references": stage.num_references,
            "per_reference": stage.per_reference,
            "task_ids": stage.task_ids,
        }

    headline_elo = None
    headline_normalized = None
    if results and results[-1]:
        last_stage = results[-1]
        headline_elo = last_stage.eval_elo
        headline_normalized = last_stage.normalized_elo

    summary: Dict[str, Any] = {}
    summary["final_eval_elo"] = headline_elo
    summary["final_normalized_elo"] = headline_normalized
    summary["num_stages"] = len(results)
    summary["stages"] = list(map(describe, results))
    return summary
+ """ + parts = spec.split(":") + if not parts or not parts[0].strip(): + raise argparse.ArgumentTypeError(f"Invalid --stage {spec!r}: num_tasks is required.") + try: + num_tasks = int(parts[0]) + except ValueError: + raise argparse.ArgumentTypeError(f"Invalid --stage {spec!r}: num_tasks must be an integer.") + + num_models: Optional[int] = None + if len(parts) >= 2 and parts[1].strip() and parts[1].strip().lower() != "all": + try: + num_models = int(parts[1]) + except ValueError: + raise argparse.ArgumentTypeError(f"Invalid --stage {spec!r}: num_models must be an integer or 'all'.") + + seed: Optional[int] = None + if len(parts) >= 3 and parts[2].strip(): + try: + seed = int(parts[2]) + except ValueError: + raise argparse.ArgumentTypeError(f"Invalid --stage {spec!r}: seed must be an integer.") + + return StageSpec(num_tasks=num_tasks, num_models=num_models, seed=seed) + + +def _load_reference_elos(value: str) -> Dict[str, float]: + """Load reference ELOs from inline JSON or, if prefixed with ``@``, a JSON file. + + Accepts ``{"ref_id": elo, ...}``. The ids must match the running server's + ``reference_models`` ids. + """ + text = value + if value.startswith("@"): + text = Path(value[1:]).read_text(encoding="utf-8") + data = json.loads(text) + if not isinstance(data, dict) or not data: + raise argparse.ArgumentTypeError("--reference-elos must be a non-empty JSON object of {ref_id: elo}.") + return {str(k): float(v) for k, v in data.items()} + + +def _build_arg_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser( + prog="multistage_elo", + description=( + "Run multi-stage adaptive ELO estimation for a model's GDPVal deliverables " + "against a running GDPVal comparison server." 
+ ), + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=( + "Example:\n" + " python -m resources_servers.gdpval.multistage_elo_driver \\\n" + " --server-url http://localhost:8000 \\\n" + " --eval-deliverables-dir /path/to/eval/deliverables \\\n" + " --reference-elos '@refs.json' \\\n" + " --stage 5 --stage 88:4 \\\n" + " --output elo_summary.json\n" + ), + ) + parser.add_argument( + "--server-url", + required=True, + help="Base URL of the running GDPVal comparison-mode resources server (e.g. http://localhost:8000).", + ) + parser.add_argument( + "--eval-deliverables-dir", + required=True, + help="Directory of the evaluated model's deliverables (task_/repeat_/ layout).", + ) + parser.add_argument( + "--reference-elos", + required=True, + type=_load_reference_elos, + metavar="JSON", + help=( + "Reference anchor ELOs as inline JSON ('{\"ref\": 1500, ...}') or '@path.json'. " + "Keys must match the server's reference_models ids." + ), + ) + parser.add_argument( + "--stage", + dest="stages", + action="append", + required=True, + type=_parse_stage, + metavar="N[:M[:SEED]]", + help=( + "A stage as num_tasks[:num_models[:seed]] (num_models 'all' or omitted = all references). " + "Repeat for multiple stages, e.g. --stage 5 --stage 88:4." + ), + ) + parser.add_argument( + "--task-prompts", + default=DEFAULT_TASK_PROMPTS, + help=f"Benchmark JSONL mapping task_id -> prompt (default: {DEFAULT_TASK_PROMPTS}).", + ) + parser.add_argument( + "--distribution", + default=None, + help="Existing task-distribution JSON to sample tasks from. If omitted, one is built and cached.", + ) + parser.add_argument( + "--dataset", + default=None, + help="Dataset JSONL to build the distribution from when --distribution is not given (default: GDPVal).", + ) + parser.add_argument( + "--column", + dest="columns", + action="append", + default=None, + metavar="COLUMN", + help="Column(s) to group the distribution by when building one (default: occupation). 
def main(argv: Optional[Sequence[str]] = None) -> int:
    """Command-line entry point for the multi-stage ELO driver.

    Parses the arguments, checks that the eval deliverables directory and the
    task-prompts JSONL exist, runs the multi-stage ELO procedure against the
    server at ``--server-url``, and emits the JSON summary either to
    ``--output`` (with a one-line note on stderr) or to stdout.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read ``sys.argv``.

    Returns:
        ``0`` on success, ``2`` when a required input path does not exist.
    """
    opts = _build_arg_parser().parse_args(argv)

    # Fail fast, in this order, on inputs that must already exist on disk.
    deliverables_root = Path(opts.eval_deliverables_dir)
    prompts_file = Path(opts.task_prompts)
    preflight = (
        (deliverables_root.is_dir, f"Eval deliverables dir not found: {deliverables_root}"),
        (prompts_file.is_file, f"Task prompts JSONL not found: {prompts_file}"),
    )
    for exists, complaint in preflight:
        if not exists():
            print(complaint, file=sys.stderr)
            return 2

    group_columns = list(opts.columns) if opts.columns else ["occupation"]
    config = MultiStageEloConfig(
        stages=list(opts.stages),
        reference_elos=opts.reference_elos,
        distribution_path=opts.distribution,
        dataset_path=opts.dataset,
        eval_deliverables_dir=str(deliverables_root),
        # --skip-missing is the inverse of produce_missing.
        produce_missing=not opts.skip_missing,
        nested_tasks=opts.nested_tasks,
        column=group_columns,
    )

    verify_one = make_http_verify_one(opts.server_url, timeout=opts.timeout, model=opts.model)
    task_prompts = load_task_prompts(prompts_file)
    rng = None if opts.seed is None else random.Random(opts.seed)

    results = run_multistage_elo(config, verify_one, task_prompts, rng=rng)
    payload = json.dumps(stage_results_to_dict(results), indent=2, ensure_ascii=False)

    if not opts.output:
        print(payload)
        return 0

    destination = Path(opts.output)
    destination.parent.mkdir(parents=True, exist_ok=True)
    destination.write_text(payload + "\n", encoding="utf-8")
    last_stage = results[-1] if results else None
    final_elo = last_stage.eval_elo if last_stage else None
    print(
        f"Wrote ELO summary ({len(results)} stages, final_eval_elo={final_elo}) to {destination}",
        file=sys.stderr,
    )
    return 0
ignored). + body = _verify_request(deliverables_dir=str(eval_dir), reference_ids=["gpt5", "nonexistent"]) + + with ( + patch("resources_servers.gdpval.comparison.run_trials", side_effect=fake_run_trials), + patch("resources_servers.gdpval.app.get_server_url", return_value="http://localhost:9999"), + patch("resources_servers.gdpval.comparison.build_file_section", return_value=[]), + patch("openai.OpenAI", return_value=MagicMock()), + ): + resp = await server.verify(body) + + assert set(resp.per_reference) == {"gpt5"} + assert resp.total_wins == 3 + assert resp.total_losses == 1 + assert resp.judge_response["reference_count"] == 1 + + @pytest.mark.asyncio + async def test_reference_ids_empty_yields_no_references(self, tmp_path) -> None: + """An empty ``reference_ids`` list judges against nothing → reference_missing.""" + eval_dir = tmp_path / "eval" / "task_task-1" / "repeat_0" + eval_dir.mkdir(parents=True) + (eval_dir / "finish_params.json").write_text("{}") + root = tmp_path / "kimi" + (root / "task_task-1").mkdir(parents=True) + (root / "task_task-1" / "finish_params.json").write_text("{}") + + server = _server( + reward_mode="comparison", + reference_models={"kimi": {"deliverables_dir": str(root), "elo": 1290.0}}, + preconvert_office_to_pdf=False, + ) + body = _verify_request(deliverables_dir=str(eval_dir), reference_ids=[]) + + with patch("resources_servers.gdpval.app.get_server_url", return_value="http://localhost:9999"): + resp = await server.verify(body) + + assert resp.reward == 0.0 + assert resp.judge_response == {"error": "reference_missing"} + @staticmethod def _two_ref_server_and_body(tmp_path): eval_dir = tmp_path / "eval" / "task_task-1" / "repeat_0" diff --git a/resources_servers/gdpval/tests/test_multistage_elo.py b/resources_servers/gdpval/tests/test_multistage_elo.py new file mode 100644 index 000000000..4633cdda7 --- /dev/null +++ b/resources_servers/gdpval/tests/test_multistage_elo.py @@ -0,0 +1,185 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import random + +import pytest + +from resources_servers.gdpval.multistage_elo import ( + MultiStageEloConfig, + MultiStageEloRunner, + StageSpec, + fit_stage_elo, + plan_stage_task_ids, + select_references, +) + + +def _dist(groups): + """groups: {key: [task_ids]} -> distribution dict with proportional pct.""" + total = sum(len(v) for v in groups.values()) or 1 + return {k: {"percentage": len(v) / total, "task_ids": list(v)} for k, v in groups.items()} + + +class TestSelectReferences: + ELOS = {"a": 1000.0, "b": 1200.0, "c": 1300.0, "d": 1500.0} + + def test_all_when_num_models_none(self) -> None: + assert select_references(self.ELOS, 1234.0, None) == ["a", "b", "c", "d"] + + def test_all_when_eval_elo_none(self) -> None: + assert select_references(self.ELOS, None, 2) == ["a", "b", "c", "d"] + + def test_all_when_num_models_exceeds_available(self) -> None: + assert select_references(self.ELOS, 1234.0, 10) == ["a", "b", "c", "d"] + + def test_closest_subset(self) -> None: + # eval 1250 -> closest are c(1300,50) and b(1200,50); tie broken by id. 
+ assert select_references(self.ELOS, 1250.0, 2) == ["b", "c"] + + def test_closest_single(self) -> None: + assert select_references(self.ELOS, 1490.0, 1) == ["d"] + + def test_zero_models_returns_empty(self) -> None: + assert select_references(self.ELOS, 1250.0, 0) == [] + + def test_result_sorted_by_id(self) -> None: + chosen = select_references(self.ELOS, 1100.0, 3) + assert chosen == sorted(chosen) + + +class TestPlanStageTaskIds: + def test_nested_is_superset(self) -> None: + dist = _dist({"x": [f"x{i}" for i in range(10)], "y": [f"y{i}" for i in range(10)]}) + stages = [StageSpec(num_tasks=3), StageSpec(num_tasks=8)] + planned = plan_stage_task_ids(dist, stages, rng=random.Random(0), nested=True) + assert len(planned[0]) == 3 + assert len(planned[1]) == 8 + assert set(planned[0]).issubset(set(planned[1])) + + def test_nested_no_duplicates(self) -> None: + dist = _dist({"x": [f"x{i}" for i in range(20)]}) + stages = [StageSpec(num_tasks=5), StageSpec(num_tasks=12)] + planned = plan_stage_task_ids(dist, stages, rng=random.Random(1), nested=True) + assert len(planned[1]) == len(set(planned[1])) + + def test_nested_capped_at_available(self) -> None: + dist = _dist({"x": ["a", "b", "c"]}) + stages = [StageSpec(num_tasks=2), StageSpec(num_tasks=100)] + planned = plan_stage_task_ids(dist, stages, rng=random.Random(2), nested=True) + assert sorted(planned[1]) == ["a", "b", "c"] + + def test_non_increasing_stage_reuses_prefix(self) -> None: + dist = _dist({"x": [f"x{i}" for i in range(10)]}) + stages = [StageSpec(num_tasks=5), StageSpec(num_tasks=3)] + planned = plan_stage_task_ids(dist, stages, rng=random.Random(3), nested=True) + assert planned[1] == planned[0][:3] + + def test_independent_sampling(self) -> None: + dist = _dist({"x": [f"x{i}" for i in range(50)]}) + stages = [StageSpec(num_tasks=5, seed=1), StageSpec(num_tasks=5, seed=2)] + planned = plan_stage_task_ids(dist, stages, nested=False) + assert len(planned[0]) == 5 and len(planned[1]) == 5 + + def 
test_seed_reproducible(self) -> None: + dist = _dist({"x": [f"x{i}" for i in range(50)]}) + stages = [StageSpec(num_tasks=7, seed=42)] + a = plan_stage_task_ids(dist, stages, nested=False) + b = plan_stage_task_ids(dist, stages, nested=False) + assert a == b + + +class TestFitStageElo: + ELOS = {"a": 1000.0, "b": 1400.0} + + def test_no_battles_returns_none(self) -> None: + assert fit_stage_elo({}, self.ELOS) == (None, None, 0) + + def test_zero_games_skipped(self) -> None: + per_ref = {"a": {"wins": 0, "losses": 0, "ties": 0}} + assert fit_stage_elo(per_ref, self.ELOS) == (None, None, 0) + + def test_fits_elo_uses_config_anchor(self) -> None: + per_ref = {"a": {"wins": 5, "losses": 5, "ties": 0}} + elo, norm, n = fit_stage_elo(per_ref, self.ELOS) + # 50% win rate vs a single anchor -> eval elo ~= anchor elo. + assert n == 1 + assert elo == pytest.approx(1000.0, abs=1.0) + assert norm == pytest.approx((elo - 500.0) / 2000.0) + + def test_falls_back_to_recorded_reference_elo(self) -> None: + per_ref = {"z": {"wins": 5, "losses": 5, "ties": 0, "reference_elo": 1100.0}} + elo, _norm, n = fit_stage_elo(per_ref, {}) + assert n == 1 + assert elo == pytest.approx(1100.0, abs=1.0) + + def test_multi_reference_battles(self) -> None: + per_ref = { + "a": {"wins": 8, "losses": 2, "ties": 0}, + "b": {"wins": 2, "losses": 8, "ties": 0}, + } + elo, _norm, n = fit_stage_elo(per_ref, self.ELOS) + assert n == 2 + assert 1000.0 < elo < 1400.0 + + +class TestMultiStageEloRunner: + def _config(self, **overrides): + base = dict( + distribution_path="unused.json", + stages=[StageSpec(num_tasks=3, num_models=None), StageSpec(num_tasks=6, num_models=2)], + reference_elos={"a": 1000.0, "b": 1200.0, "c": 1300.0, "d": 1500.0}, + ) + base.update(overrides) + return MultiStageEloConfig(**base) + + def test_requires_stages(self) -> None: + with pytest.raises(ValueError): + MultiStageEloConfig(distribution_path="x", stages=[], reference_elos={}) + + def test_unknown_selection_rejected(self) -> 
None: + with pytest.raises(ValueError): + MultiStageEloConfig(distribution_path="x", stages=[StageSpec(1)], reference_elos={}, selection="zzz") + + def test_two_stage_flow_threads_elo_and_shrinks_refs(self) -> None: + dist = _dist({"x": [f"x{i}" for i in range(20)]}) + seen_stage_refs = [] + + def judge_stage(task_ids, reference_ids): + seen_stage_refs.append(list(reference_ids)) + # Eval beats everyone 7-3 -> high elo estimate. + return {rid: {"wins": 7, "losses": 3, "ties": 0} for rid in reference_ids} + + runner = MultiStageEloRunner(self._config(nested_tasks=True), dist, judge_stage, rng=random.Random(0)) + results = runner.run() + + assert len(results) == 2 + # Stage 1 uses all references. + assert seen_stage_refs[0] == ["a", "b", "c", "d"] + # Stage 2 narrows to 2 references (closest to the stage-1 estimate). + assert len(seen_stage_refs[1]) == 2 + assert set(seen_stage_refs[1]).issubset({"a", "b", "c", "d"}) + # Nested task sets (nested_tasks=True): stage 2 superset of stage 1. + assert set(results[0].task_ids).issubset(set(results[1].task_ids)) + assert results[1].eval_elo is not None + + def test_stage_with_no_games_leaves_elo_unset(self) -> None: + dist = _dist({"x": [f"x{i}" for i in range(10)]}) + + def judge_stage(task_ids, reference_ids): + return {} + + cfg = self._config(stages=[StageSpec(num_tasks=2, num_models=None)]) + results = MultiStageEloRunner(cfg, dist, judge_stage, rng=random.Random(0)).run() + assert results[0].eval_elo is None + assert results[0].num_references == 0 diff --git a/resources_servers/gdpval/tests/test_multistage_elo_driver.py b/resources_servers/gdpval/tests/test_multistage_elo_driver.py new file mode 100644 index 000000000..0cce37910 --- /dev/null +++ b/resources_servers/gdpval/tests/test_multistage_elo_driver.py @@ -0,0 +1,429 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import json +import random +from pathlib import Path + +import pytest + +import resources_servers.gdpval.multistage_elo_driver as driver +from resources_servers.gdpval.multistage_elo import MultiStageEloConfig, StageResult, StageSpec +from resources_servers.gdpval.multistage_elo_driver import ( + _load_reference_elos, + _parse_stage, + build_judge_stage, + build_verify_request_body, + cached_task_ids, + check_coverage, + ensure_distribution, + load_distribution, + load_task_prompts, + main, + pool_per_reference, + run_multistage_elo, + stage_results_to_dict, + task_repeat_dirs, +) + + +def _make_cache(root: Path, task_ids, repeats=("repeat_0",)): + for tid in task_ids: + for rep in repeats: + d = root / f"task_{tid}" / rep + d.mkdir(parents=True) + (d / "finish_params.json").write_text("{}") + + +def _dist(groups): + total = sum(len(v) for v in groups.values()) or 1 + return {k: {"percentage": len(v) / total, "task_ids": list(v)} for k, v in groups.items()} + + +class TestCacheDiscovery: + def test_task_repeat_dirs_lists_attempted_repeats(self, tmp_path: Path) -> None: + _make_cache(tmp_path, ["a"], repeats=("repeat_0", "repeat_1")) + dirs = task_repeat_dirs(tmp_path, "a") + assert [d.name for d in dirs] == ["repeat_0", "repeat_1"] + + def test_task_repeat_dirs_skips_unattempted(self, tmp_path: Path) -> None: + (tmp_path / "task_a" / "repeat_0").mkdir(parents=True) # no finish_params.json + assert 
task_repeat_dirs(tmp_path, "a") == [] + + def test_task_repeat_dirs_flat_layout(self, tmp_path: Path) -> None: + d = tmp_path / "task_a" + d.mkdir(parents=True) + (d / "finish_params.json").write_text("{}") + assert [p.name for p in task_repeat_dirs(tmp_path, "a")] == ["task_a"] + + def test_missing_task_returns_empty(self, tmp_path: Path) -> None: + assert task_repeat_dirs(tmp_path, "ghost") == [] + + def test_cached_task_ids(self, tmp_path: Path) -> None: + _make_cache(tmp_path, ["a", "b"]) + assert cached_task_ids(tmp_path) == {"a", "b"} + + def test_cached_task_ids_missing_dir(self, tmp_path: Path) -> None: + assert cached_task_ids(tmp_path / "nope") == set() + + def test_check_coverage(self, tmp_path: Path) -> None: + _make_cache(tmp_path, ["a", "c"]) + present, missing = check_coverage(tmp_path, ["a", "b", "c"]) + assert present == ["a", "c"] + assert missing == ["b"] + + +class TestPoolPerReference: + def test_sums_counts_and_keeps_elo(self) -> None: + responses = [ + {"per_reference": {"a": {"wins": 2, "losses": 1, "ties": 0, "reference_elo": 1000.0}}}, + {"per_reference": {"a": {"wins": 1, "losses": 0, "ties": 1, "reference_elo": 1000.0}}}, + ] + pooled = pool_per_reference(responses) + assert pooled["a"]["wins"] == 3 + assert pooled["a"]["losses"] == 1 + assert pooled["a"]["ties"] == 1 + assert pooled["a"]["reference_elo"] == 1000.0 + + def test_handles_missing_per_reference(self) -> None: + assert pool_per_reference([{}, {"per_reference": None}]) == {} + + +class TestLoaders: + def test_load_distribution(self, tmp_path: Path) -> None: + p = tmp_path / "d.json" + p.write_text(json.dumps(_dist({"x": ["a"]}))) + assert load_distribution(p)["x"]["task_ids"] == ["a"] + + def test_load_distribution_rejects_non_object(self, tmp_path: Path) -> None: + p = tmp_path / "d.json" + p.write_text("[1,2,3]") + with pytest.raises(ValueError): + load_distribution(p) + + def test_load_task_prompts_top_level(self, tmp_path: Path) -> None: + p = tmp_path / "b.jsonl" + 
p.write_text(json.dumps({"task_id": "a", "prompt": "do x"}) + "\n") + assert load_task_prompts(p) == {"a": "do x"} + + def test_load_task_prompts_metadata_nested(self, tmp_path: Path) -> None: + p = tmp_path / "b.jsonl" + p.write_text(json.dumps({"responses_create_params": {"metadata": {"task_id": "a", "prompt": "y"}}}) + "\n") + assert load_task_prompts(p) == {"a": "y"} + + +class TestEnsureDistribution: + def test_loads_existing_file(self, tmp_path: Path) -> None: + p = tmp_path / "d.json" + p.write_text(json.dumps(_dist({"x": ["a"]}))) + dist, path = ensure_distribution(str(p)) + assert path == p + assert dist["x"]["task_ids"] == ["a"] + + def test_builds_from_dataset_when_missing(self, tmp_path: Path) -> None: + dataset = tmp_path / "tasks.jsonl" + rows = [ + {"task_id": "t1", "occupation": "Lawyer"}, + {"task_id": "t2", "occupation": "Lawyer"}, + {"task_id": "t3", "occupation": "Nurse"}, + ] + dataset.write_text("\n".join(json.dumps(r) for r in rows) + "\n") + cache = tmp_path / "cache" + + dist, path = ensure_distribution(None, dataset_path=str(dataset), cache_dir=str(cache)) + + assert path == cache / "occupation_distribution.json" + assert path.is_file() + assert dist["Lawyer"]["task_ids"] == ["t1", "t2"] + assert dist["Nurse"]["task_ids"] == ["t3"] + + def test_writes_to_distribution_path_when_given(self, tmp_path: Path) -> None: + dataset = tmp_path / "tasks.jsonl" + dataset.write_text(json.dumps({"task_id": "t1", "occupation": "Lawyer"}) + "\n") + out = tmp_path / "sub" / "mydist.json" + + _dist_, path = ensure_distribution(str(out), dataset_path=str(dataset)) + + assert path == out + assert out.is_file() + + def test_custom_columns_in_filename(self, tmp_path: Path) -> None: + dataset = tmp_path / "tasks.jsonl" + dataset.write_text(json.dumps({"task_id": "t1", "sector": "Legal", "occupation": "Lawyer"}) + "\n") + cache = tmp_path / "cache" + _dist_, path = ensure_distribution( + None, dataset_path=str(dataset), columns=["sector", "occupation"], 
cache_dir=str(cache) + ) + assert path == cache / "sector_occupation_distribution.json" + + def test_raises_when_no_dataset_available(self, tmp_path: Path, monkeypatch) -> None: + import responses_api_agents.stirrup_agent.task_distribution as td + + monkeypatch.setattr(td, "DEFAULT_DATASET_CANDIDATES", (tmp_path / "missing.jsonl",)) + with pytest.raises(FileNotFoundError): + ensure_distribution(None, cache_dir=str(tmp_path / "cache")) + + +class TestBuildVerifyRequestBody: + def test_includes_reference_ids_and_deliverables(self) -> None: + body = build_verify_request_body("t1", "/cache/task_t1/repeat_0", "prompt", ["a", "b"]) + assert body["task_id"] == "t1" + assert body["deliverables_dir"] == "/cache/task_t1/repeat_0" + assert body["reference_ids"] == ["a", "b"] + assert body["prompt"] == "prompt" + + +class TestBuildJudgeStage: + def test_judges_present_tasks_and_pools(self, tmp_path: Path) -> None: + _make_cache(tmp_path, ["a", "b"], repeats=("repeat_0", "repeat_1")) + calls = [] + + def fake_verify_one(task_id, deliverables_dir, prompt, reference_ids): + calls.append((task_id, Path(deliverables_dir).name, tuple(reference_ids))) + return {"per_reference": {reference_ids[0]: {"wins": 1, "losses": 0, "ties": 0, "reference_elo": 1000.0}}} + + judge = build_judge_stage(fake_verify_one, tmp_path, {"a": "pa", "b": "pb"}) + pooled = judge(["a", "b"], ["ref1"]) + # 2 tasks x 2 repeats = 4 verify calls. 
+ assert len(calls) == 4 + assert pooled["ref1"]["wins"] == 4 + + def test_missing_raises_when_no_producer(self, tmp_path: Path) -> None: + _make_cache(tmp_path, ["a"]) + judge = build_judge_stage(lambda *a: {}, tmp_path, {}) + with pytest.raises(FileNotFoundError): + judge(["a", "missing"], ["ref1"]) + + def test_missing_skipped_when_produce_missing_false(self, tmp_path: Path) -> None: + _make_cache(tmp_path, ["a"]) + + def fake_verify_one(task_id, deliverables_dir, prompt, reference_ids): + return {"per_reference": {"ref1": {"wins": 1, "losses": 0, "ties": 0, "reference_elo": 1000.0}}} + + judge = build_judge_stage(fake_verify_one, tmp_path, {"a": ""}, produce_missing=False) + pooled = judge(["a", "missing"], ["ref1"]) + assert pooled["ref1"]["wins"] == 1 + + def test_producer_materializes_then_judges(self, tmp_path: Path) -> None: + _make_cache(tmp_path, ["a"]) + + def producer(task_ids): + _make_cache(tmp_path, list(task_ids)) + + def fake_verify_one(task_id, deliverables_dir, prompt, reference_ids): + return {"per_reference": {"ref1": {"wins": 1, "losses": 0, "ties": 0, "reference_elo": 1000.0}}} + + judge = build_judge_stage(fake_verify_one, tmp_path, {}, producer=producer) + pooled = judge(["a", "b"], ["ref1"]) + assert pooled["ref1"]["wins"] == 2 # both tasks judged after production + + +class TestRunMultistageElo: + def test_requires_eval_dir(self, tmp_path: Path) -> None: + cfg = MultiStageEloConfig(distribution_path="x.json", stages=[StageSpec(1)], reference_elos={"a": 1000.0}) + with pytest.raises(ValueError): + run_multistage_elo(cfg, lambda *a: {}, {}) + + def test_end_to_end_with_fakes(self, tmp_path: Path) -> None: + # 30 cached tasks, 2-stage adaptive run with a fake judge. 
+ task_ids = [f"t{i}" for i in range(30)] + _make_cache(tmp_path, task_ids) + dist_path = tmp_path / "dist.json" + dist_path.write_text(json.dumps(_dist({"x": task_ids}))) + + def fake_verify_one(task_id, deliverables_dir, prompt, reference_ids): + return { + "per_reference": { + rid: {"wins": 7, "losses": 3, "ties": 0, "reference_elo": elo} + for rid, elo in {"a": 1000.0, "b": 1200.0, "c": 1300.0, "d": 1500.0}.items() + if rid in reference_ids + } + } + + cfg = MultiStageEloConfig( + distribution_path=str(dist_path), + stages=[StageSpec(num_tasks=5, num_models=None), StageSpec(num_tasks=12, num_models=2)], + reference_elos={"a": 1000.0, "b": 1200.0, "c": 1300.0, "d": 1500.0}, + eval_deliverables_dir=str(tmp_path), + ) + results = run_multistage_elo(cfg, fake_verify_one, {t: "" for t in task_ids}, rng=random.Random(0)) + + assert len(results) == 2 + assert results[0].reference_ids == ["a", "b", "c", "d"] + assert len(results[1].reference_ids) == 2 + assert results[1].eval_elo is not None + + summary = stage_results_to_dict(results) + assert summary["num_stages"] == 2 + assert summary["final_eval_elo"] == results[1].eval_elo + + def test_stage_results_to_dict_empty(self) -> None: + assert stage_results_to_dict([])["final_eval_elo"] is None + + +class TestParseStage: + def test_tasks_only(self) -> None: + s = _parse_stage("5") + assert (s.num_tasks, s.num_models, s.seed) == (5, None, None) + + def test_tasks_and_models(self) -> None: + s = _parse_stage("88:4") + assert (s.num_tasks, s.num_models, s.seed) == (88, 4, None) + + def test_all_models_keyword_and_seed(self) -> None: + s = _parse_stage("5:all:7") + assert (s.num_tasks, s.num_models, s.seed) == (5, None, 7) + + @pytest.mark.parametrize("bad", ["", "x", "5:y", "5:4:z"]) + def test_invalid(self, bad: str) -> None: + import argparse + + with pytest.raises(argparse.ArgumentTypeError): + _parse_stage(bad) + + +class TestLoadReferenceElos: + def test_inline_json(self) -> None: + assert _load_reference_elos('{"a": 
1500, "b": 1200}') == {"a": 1500.0, "b": 1200.0} + + def test_from_file(self, tmp_path: Path) -> None: + f = tmp_path / "refs.json" + f.write_text(json.dumps({"a": 1000})) + assert _load_reference_elos(f"@{f}") == {"a": 1000.0} + + @pytest.mark.parametrize("bad", ["[]", "{}", '"x"']) + def test_invalid(self, bad: str) -> None: + import argparse + + with pytest.raises(argparse.ArgumentTypeError): + _load_reference_elos(bad) + + +class TestCliMain: + def _setup(self, tmp_path: Path): + _make_cache(tmp_path, ["a", "b"]) + prompts = tmp_path / "bench.jsonl" + prompts.write_text(json.dumps({"task_id": "a", "prompt": "p"}) + "\n") + refs = tmp_path / "refs.json" + refs.write_text(json.dumps({"a": 1000.0, "b": 1200.0})) + return prompts, refs + + def test_main_writes_summary(self, tmp_path: Path, monkeypatch, capsys) -> None: + prompts, refs = self._setup(tmp_path) + captured = {} + + def fake_run(config, verify_one, task_prompts, *, rng=None, producer=None): + captured["config"] = config + captured["rng"] = rng + return [ + StageResult( + stage_index=0, + task_ids=["a"], + reference_ids=["a", "b"], + per_reference={}, + eval_elo=1234.0, + normalized_elo=0.5, + num_references=2, + ) + ] + + monkeypatch.setattr(driver, "run_multistage_elo", fake_run) + out = tmp_path / "summary.json" + rc = main( + [ + "--server-url", + "http://localhost:9999", + "--eval-deliverables-dir", + str(tmp_path), + "--reference-elos", + f"@{refs}", + "--stage", + "5", + "--stage", + "12:1", + "--task-prompts", + str(prompts), + "--nested-tasks", + "--skip-missing", + "--seed", + "3", + "--output", + str(out), + ] + ) + assert rc == 0 + summary = json.loads(out.read_text()) + assert summary["final_eval_elo"] == 1234.0 + cfg = captured["config"] + assert [s.num_tasks for s in cfg.stages] == [5, 12] + assert cfg.stages[1].num_models == 1 + assert cfg.nested_tasks is True + assert cfg.produce_missing is False + assert cfg.reference_elos == {"a": 1000.0, "b": 1200.0} + assert 
isinstance(captured["rng"], random.Random) + + def test_main_to_stdout(self, tmp_path: Path, monkeypatch, capsys) -> None: + prompts, refs = self._setup(tmp_path) + monkeypatch.setattr(driver, "run_multistage_elo", lambda *a, **k: []) + rc = main( + [ + "--server-url", + "http://localhost:9999", + "--eval-deliverables-dir", + str(tmp_path), + "--reference-elos", + f"@{refs}", + "--stage", + "5", + "--task-prompts", + str(prompts), + ] + ) + assert rc == 0 + assert json.loads(capsys.readouterr().out)["num_stages"] == 0 + + def test_main_missing_eval_dir(self, tmp_path: Path, capsys) -> None: + _, refs = self._setup(tmp_path) + rc = main( + [ + "--server-url", + "http://x", + "--eval-deliverables-dir", + str(tmp_path / "nope"), + "--reference-elos", + f"@{refs}", + "--stage", + "5", + ] + ) + assert rc == 2 + assert "not found" in capsys.readouterr().err.lower() + + def test_main_missing_prompts(self, tmp_path: Path, capsys) -> None: + _, refs = self._setup(tmp_path) + rc = main( + [ + "--server-url", + "http://x", + "--eval-deliverables-dir", + str(tmp_path), + "--reference-elos", + f"@{refs}", + "--stage", + "5", + "--task-prompts", + str(tmp_path / "nope.jsonl"), + ] + ) + assert rc == 2 + assert "not found" in capsys.readouterr().err.lower() From af7f34411b12678a1552d23778a0c1f99362ffef Mon Sep 17 00:00:00 2001 From: Virginia Wu Date: Fri, 26 Jun 2026 17:06:27 -0700 Subject: [PATCH 6/8] multistage-elo E2E smoke test works Signed-off-by: Virginia Wu --- multistage_refs.json | 7 ++ .../configs/gdpval_comparison_smoketest.yaml | 56 ++++++++++++ resources_servers/gdpval/multistage_elo.py | 28 ++++++ .../gdpval/multistage_elo_driver.py | 85 +++++++++++++++++-- .../gdpval/tests/test_multistage_elo.py | 21 +++++ .../tests/test_multistage_elo_driver.py | 17 +++- 6 files changed, 207 insertions(+), 7 deletions(-) create mode 100644 multistage_refs.json create mode 100644 resources_servers/gdpval/configs/gdpval_comparison_smoketest.yaml diff --git a/multistage_refs.json 
b/multistage_refs.json new file mode 100644 index 000000000..e6e383192 --- /dev/null +++ b/multistage_refs.json @@ -0,0 +1,7 @@ +{ + "glm51": 1259, + "minimax_m27": 1165, + "nemotron3_ultra_ga": 1168, + "kimi_k25": 1000, + "qwen35_397b": 956 +} diff --git a/resources_servers/gdpval/configs/gdpval_comparison_smoketest.yaml b/resources_servers/gdpval/configs/gdpval_comparison_smoketest.yaml new file mode 100644 index 000000000..4d2e2a0e6 --- /dev/null +++ b/resources_servers/gdpval/configs/gdpval_comparison_smoketest.yaml @@ -0,0 +1,56 @@ +# Scratch config for an end-to-end smoke test of the multi-stage ELO driver. +# +# Starts ONLY the two servers the driver needs: the GDPVal resources server in +# comparison mode (with the five reference models baked in) and the Gemini judge +# it calls. The Stirrup agent / policy model are intentionally omitted because +# the driver judges *cached* deliverables and never runs the agent. +# +# Usage: +# export JUDGE_API_KEY= +# gym env start "+config_paths=[resources_servers/gdpval/configs/gdpval_comparison_smoketest.yaml]" +# +# Delete this file when done; it is a local scratch config, not a committed env. + +# Gemini 3.1 Pro judge (proxied via the NVIDIA inference API). +gdpval_judge_model: + responses_api_models: + openai_model: + entrypoint: app.py + openai_base_url: ${oc.env:JUDGE_BASE_URL,https://inference-api.nvidia.com/v1} + openai_api_key: ${oc.env:JUDGE_API_KEY,dummy} + openai_model: ${oc.env:JUDGE_MODEL_NAME,gcp/google/gemini-3.1-pro-preview} + max_concurrent_requests: 4 + +# GDPVal resources server in multi-reference comparison mode. +gdpval_resources_server: + resources_servers: + gdpval: + entrypoint: app.py + domain: other + verified: false + reward_mode: comparison + # Text deliverables (soap_note.txt/.md) need no office->pdf conversion, so + # disable preconvert to avoid the libreoffice dependency for the smoke test. 
+ preconvert_office_to_pdf: false + num_comparison_trials: 1 + reference_models: + glm51: + deliverables_dir: /lustre/fsw/portfolios/llmservice/users/vadams/Stirrup/output/gdpval/GLM5.1-GDPval-Reference-Outputs + elo: 1259 + minimax_m27: + deliverables_dir: /lustre/fsw/portfolios/llmservice/users/agronskiy/nemo-evaluator-rundirs/ultra-v3/MiniMaxAI/MiniMax-M2.7/20260516_133717-5cf449e4fc2a6ffd/nemo_gym.0/artifacts/gdpval/deliverables_cache + elo: 1165 + nemotron3_ultra_ga: + deliverables_dir: /lustre/fsw/portfolios/llmservice/users/vadams/Stirrup/output/gdpval/Nemotron-3-Ultra-GA-checkpoint-no-interleaved-reasoning-05-29-2026 + elo: 1168 + kimi_k25: + deliverables_dir: /lustre/fsw/portfolios/llmservice/users/vadams/Stirrup/output/gdpval/Kimi-K2.5-Thinking + elo: 1000 + qwen35_397b: + deliverables_dir: /lustre/fsw/portfolios/llmservice/users/vadams/Stirrup/output/gdpval/Qwen3.5-397B + elo: 956 + judge_model_server: + type: responses_api_models + name: gdpval_judge_model + judge_responses_create_params_overrides: + model: gcp/google/gemini-3.1-pro-preview diff --git a/resources_servers/gdpval/multistage_elo.py b/resources_servers/gdpval/multistage_elo.py index a8f4fccc1..ceaf28b48 100644 --- a/resources_servers/gdpval/multistage_elo.py +++ b/resources_servers/gdpval/multistage_elo.py @@ -268,11 +268,20 @@ def __init__( judge_stage: JudgeStageFn, *, rng: Optional[random.Random] = None, + on_event: Optional[Callable[[str, dict], None]] = None, ) -> None: self.config = config self.distribution = distribution self.judge_stage = judge_stage self.rng = rng or random.Random() + # Optional progress hook. Called as ``on_event(name, data)`` for the + # events "planned", "stage_start", and "stage_end". Kept as a callback so + # this module performs no I/O itself; the driver/CLI does the printing. 
+ self.on_event = on_event + + def _emit(self, name: str, **data: object) -> None: + if self.on_event is not None: + self.on_event(name, data) def run(self) -> List[StageResult]: stage_task_sets = plan_stage_task_ids( @@ -281,16 +290,35 @@ def run(self) -> List[StageResult]: rng=self.rng, nested=self.config.nested_tasks, ) + total_stages = len(self.config.stages) + self._emit("planned", stage_task_counts=[len(s) for s in stage_task_sets], total_stages=total_stages) results: List[StageResult] = [] eval_elo: Optional[float] = None for index, stage in enumerate(self.config.stages): reference_ids = select_references(self.config.reference_elos, eval_elo, stage.num_models) task_ids = stage_task_sets[index] + self._emit( + "stage_start", + index=index, + total_stages=total_stages, + reference_ids=list(reference_ids), + num_tasks=len(task_ids), + prior_elo=eval_elo, + ) per_reference = self.judge_stage(task_ids, reference_ids) stage_elo, normalized, num_references = fit_stage_elo(per_reference, self.config.reference_elos) if stage_elo is not None: eval_elo = stage_elo + self._emit( + "stage_end", + index=index, + total_stages=total_stages, + eval_elo=stage_elo, + normalized_elo=normalized, + num_references=num_references, + per_reference=dict(per_reference), + ) results.append( StageResult( stage_index=index, diff --git a/resources_servers/gdpval/multistage_elo_driver.py b/resources_servers/gdpval/multistage_elo_driver.py index 310c4d1d4..48a741857 100644 --- a/resources_servers/gdpval/multistage_elo_driver.py +++ b/resources_servers/gdpval/multistage_elo_driver.py @@ -44,7 +44,9 @@ --output elo_summary.json where ``refs.json`` is ``{"": , ...}`` with ids matching the -server's configured ``reference_models``. See ``--help`` for all flags. +server's configured ``reference_models``. Each stage has a set number of +tasks and reference models set like ``--stage num_tasks:num_models``. +See ``--help`` for all flags. 
""" from __future__ import annotations @@ -243,6 +245,7 @@ def build_judge_stage( *, produce_missing: bool = True, producer: Optional[ProducerFn] = None, + progress: Optional[Callable[[int, int, str], None]] = None, ): """Build the ``judge_stage`` callable expected by ``MultiStageEloRunner``. @@ -251,6 +254,9 @@ def build_judge_stage( per-reference votes. Missing tasks are produced via ``producer`` when given; otherwise ``produce_missing=True`` raises an actionable error and ``produce_missing=False`` drops them with a warning. + + ``progress`` is an optional callback invoked as ``progress(done, total, + task_id)`` after each ``verify_one`` completes, for live status reporting. """ def judge_stage(task_ids: Sequence[str], reference_ids: Sequence[str]) -> PerReferenceTotals: @@ -272,11 +278,16 @@ def judge_stage(task_ids: Sequence[str], reference_ids: Sequence[str]) -> PerRef flush=True, ) + # Flatten to (task_id, repeat_dir) units up front so progress can report + # an accurate done/total across all repeats in the stage. 
+ units = [(tid, repeat_dir) for tid in present for repeat_dir in task_repeat_dirs(eval_deliverables_dir, tid)] + total = len(units) responses: List[Dict[str, Any]] = [] - for task_id in present: + for done, (task_id, repeat_dir) in enumerate(units, start=1): prompt = task_prompts.get(task_id, "") - for repeat_dir in task_repeat_dirs(eval_deliverables_dir, task_id): - responses.append(verify_one(task_id, str(repeat_dir), prompt, list(reference_ids))) + responses.append(verify_one(task_id, str(repeat_dir), prompt, list(reference_ids))) + if progress is not None: + progress(done, total, task_id) return pool_per_reference(responses) return judge_stage @@ -353,11 +364,17 @@ def run_multistage_elo( *, rng=None, producer: Optional[ProducerFn] = None, + on_event: Optional[Callable[[str, dict], None]] = None, + progress: Optional[Callable[[int, int, str], None]] = None, ) -> List[StageResult]: """Run the full multi-stage ELO procedure and return per-stage results. ``config.eval_deliverables_dir`` must be set — it is the source of the eval model's (cached or produced) deliverables. + + ``on_event``/``progress`` are optional callbacks for live status reporting: + ``on_event`` receives stage-level events (see ``MultiStageEloRunner``) and + ``progress`` receives per-(task, repeat) judging progress. 
""" if not config.eval_deliverables_dir: raise ValueError("config.eval_deliverables_dir must be set (source of eval deliverables).") @@ -373,8 +390,9 @@ def run_multistage_elo( task_prompts, produce_missing=config.produce_missing, producer=producer, + progress=progress, ) - runner = MultiStageEloRunner(config, distribution, judge_stage, rng=rng) + runner = MultiStageEloRunner(config, distribution, judge_stage, rng=rng, on_event=on_event) return runner.run() @@ -556,6 +574,12 @@ def _build_arg_parser() -> argparse.ArgumentParser: default=None, help="Top-level RNG seed for reproducible task sampling and reference selection.", ) + parser.add_argument( + "--quiet", + "-q", + action="store_true", + help="Suppress live per-stage / per-task progress output on stderr.", + ) parser.add_argument( "--output", "-o", @@ -565,6 +589,54 @@ def _build_arg_parser() -> argparse.ArgumentParser: return parser +def _make_progress_printers(): + """Return ``(on_event, progress)`` callbacks that print human-readable status to stderr. + + ``on_event`` prints a banner at the start/end of each stage (selected + references, task count, fitted ELO); ``progress`` prints a per-(task, repeat) + counter as each ``/verify`` completes. 
+ """ + + def on_event(name: str, data: dict) -> None: + if name == "planned": + counts = data.get("stage_task_counts", []) + print( + f"[multistage-elo] planned {data.get('total_stages')} stage(s); tasks per stage: {counts}", + file=sys.stderr, + flush=True, + ) + elif name == "stage_start": + idx = int(data["index"]) + 1 + total = data["total_stages"] + refs = data.get("reference_ids", []) + prior = data.get("prior_elo") + prior_str = f"{prior:.1f}" if isinstance(prior, (int, float)) else "n/a" + print( + f"[multistage-elo] stage {idx}/{total}: {data.get('num_tasks')} task(s) " + f"vs {len(refs)} ref(s) {refs} (prior ELO: {prior_str})", + file=sys.stderr, + flush=True, + ) + elif name == "stage_end": + idx = int(data["index"]) + 1 + total = data["total_stages"] + elo = data.get("eval_elo") + elo_str = f"{elo:.1f}" if isinstance(elo, (int, float)) else "unset (no games)" + print( + f"[multistage-elo] stage {idx}/{total} done: eval ELO = {elo_str} " + f"(fit over {data.get('num_references')} ref(s))", + file=sys.stderr, + flush=True, + ) + + def progress(done: int, total: int, task_id: str) -> None: + short = task_id[:18] + "…" if len(task_id) > 19 else task_id + end = "\n" if done == total else "\r" + print(f"[multistage-elo] judged {done}/{total} (task {short}) ", end=end, file=sys.stderr, flush=True) + + return on_event, progress + + def main(argv: Optional[Sequence[str]] = None) -> int: parser = _build_arg_parser() args = parser.parse_args(argv) @@ -594,7 +666,8 @@ def main(argv: Optional[Sequence[str]] = None) -> int: task_prompts = load_task_prompts(prompts_path) rng = random.Random(args.seed) if args.seed is not None else None - results = run_multistage_elo(config, verify_one, task_prompts, rng=rng) + on_event, progress = (None, None) if args.quiet else _make_progress_printers() + results = run_multistage_elo(config, verify_one, task_prompts, rng=rng, on_event=on_event, progress=progress) payload = json.dumps(stage_results_to_dict(results), indent=2, 
ensure_ascii=False) if args.output: diff --git a/resources_servers/gdpval/tests/test_multistage_elo.py b/resources_servers/gdpval/tests/test_multistage_elo.py index 4633cdda7..d0b091321 100644 --- a/resources_servers/gdpval/tests/test_multistage_elo.py +++ b/resources_servers/gdpval/tests/test_multistage_elo.py @@ -183,3 +183,24 @@ def judge_stage(task_ids, reference_ids): results = MultiStageEloRunner(cfg, dist, judge_stage, rng=random.Random(0)).run() assert results[0].eval_elo is None assert results[0].num_references == 0 + + def test_on_event_emits_lifecycle_events(self) -> None: + dist = _dist({"x": [f"x{i}" for i in range(10)]}) + + def judge_stage(task_ids, reference_ids): + return {rid: {"wins": 6, "losses": 4, "ties": 0} for rid in reference_ids} + + events = [] + cfg = self._config(stages=[StageSpec(num_tasks=2, num_models=None), StageSpec(num_tasks=3, num_models=2)]) + MultiStageEloRunner( + cfg, dist, judge_stage, rng=random.Random(0), on_event=lambda name, data: events.append((name, data)) + ).run() + + names = [n for n, _ in events] + assert names[0] == "planned" + assert names.count("stage_start") == 2 + assert names.count("stage_end") == 2 + # stage_start carries the selected references and task count. 
+ first_start = next(d for n, d in events if n == "stage_start") + assert first_start["num_tasks"] == 2 + assert first_start["reference_ids"] == ["a", "b", "c", "d"] diff --git a/resources_servers/gdpval/tests/test_multistage_elo_driver.py b/resources_servers/gdpval/tests/test_multistage_elo_driver.py index 0cce37910..09d5d80e3 100644 --- a/resources_servers/gdpval/tests/test_multistage_elo_driver.py +++ b/resources_servers/gdpval/tests/test_multistage_elo_driver.py @@ -227,6 +227,21 @@ def fake_verify_one(task_id, deliverables_dir, prompt, reference_ids): pooled = judge(["a", "b"], ["ref1"]) assert pooled["ref1"]["wins"] == 2 # both tasks judged after production + def test_progress_callback_reports_each_unit(self, tmp_path: Path) -> None: + _make_cache(tmp_path, ["a", "b"], repeats=("repeat_0", "repeat_1")) + + def fake_verify_one(task_id, deliverables_dir, prompt, reference_ids): + return {"per_reference": {"ref1": {"wins": 1, "losses": 0, "ties": 0, "reference_elo": 1000.0}}} + + seen = [] + judge = build_judge_stage( + fake_verify_one, tmp_path, {}, progress=lambda done, total, tid: seen.append((done, total, tid)) + ) + judge(["a", "b"], ["ref1"]) + # 2 tasks x 2 repeats = 4 units; progress reports running done/total. 
+ assert [s[0] for s in seen] == [1, 2, 3, 4] + assert all(s[1] == 4 for s in seen) + class TestRunMultistageElo: def test_requires_eval_dir(self, tmp_path: Path) -> None: @@ -322,7 +337,7 @@ def test_main_writes_summary(self, tmp_path: Path, monkeypatch, capsys) -> None: prompts, refs = self._setup(tmp_path) captured = {} - def fake_run(config, verify_one, task_prompts, *, rng=None, producer=None): + def fake_run(config, verify_one, task_prompts, *, rng=None, producer=None, on_event=None, progress=None): captured["config"] = config captured["rng"] = rng return [ From b11f4462e40ee0feea109046ab385f97920aa28a Mon Sep 17 00:00:00 2001 From: Virginia Wu Date: Fri, 26 Jun 2026 17:29:44 -0700 Subject: [PATCH 7/8] trimmed white space Signed-off-by: Virginia Wu --- resources_servers/gdpval/multistage_elo_driver.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/resources_servers/gdpval/multistage_elo_driver.py b/resources_servers/gdpval/multistage_elo_driver.py index 48a741857..6d39c20fb 100644 --- a/resources_servers/gdpval/multistage_elo_driver.py +++ b/resources_servers/gdpval/multistage_elo_driver.py @@ -44,8 +44,8 @@ --output elo_summary.json where ``refs.json`` is ``{"": , ...}`` with ids matching the -server's configured ``reference_models``. Each stage has a set number of -tasks and reference models set like ``--stage num_tasks:num_models``. +server's configured ``reference_models``. Each stage has a set number of +tasks and reference models set like ``--stage num_tasks:num_models``. See ``--help`` for all flags. 
""" From ce40b2f8b70f18d80c9d808bef5554bf07fd9bff Mon Sep 17 00:00:00 2001 From: Virginia Wu Date: Fri, 26 Jun 2026 17:33:51 -0700 Subject: [PATCH 8/8] removed smoked test yaml Signed-off-by: Virginia Wu --- .../configs/gdpval_comparison_smoketest.yaml | 56 ------------------- 1 file changed, 56 deletions(-) delete mode 100644 resources_servers/gdpval/configs/gdpval_comparison_smoketest.yaml diff --git a/resources_servers/gdpval/configs/gdpval_comparison_smoketest.yaml b/resources_servers/gdpval/configs/gdpval_comparison_smoketest.yaml deleted file mode 100644 index 4d2e2a0e6..000000000 --- a/resources_servers/gdpval/configs/gdpval_comparison_smoketest.yaml +++ /dev/null @@ -1,56 +0,0 @@ -# Scratch config for an end-to-end smoke test of the multi-stage ELO driver. -# -# Starts ONLY the two servers the driver needs: the GDPVal resources server in -# comparison mode (with the five reference models baked in) and the Gemini judge -# it calls. The Stirrup agent / policy model are intentionally omitted because -# the driver judges *cached* deliverables and never runs the agent. -# -# Usage: -# export JUDGE_API_KEY= -# gym env start "+config_paths=[resources_servers/gdpval/configs/gdpval_comparison_smoketest.yaml]" -# -# Delete this file when done; it is a local scratch config, not a committed env. - -# Gemini 3.1 Pro judge (proxied via the NVIDIA inference API). -gdpval_judge_model: - responses_api_models: - openai_model: - entrypoint: app.py - openai_base_url: ${oc.env:JUDGE_BASE_URL,https://inference-api.nvidia.com/v1} - openai_api_key: ${oc.env:JUDGE_API_KEY,dummy} - openai_model: ${oc.env:JUDGE_MODEL_NAME,gcp/google/gemini-3.1-pro-preview} - max_concurrent_requests: 4 - -# GDPVal resources server in multi-reference comparison mode. 
-gdpval_resources_server: - resources_servers: - gdpval: - entrypoint: app.py - domain: other - verified: false - reward_mode: comparison - # Text deliverables (soap_note.txt/.md) need no office->pdf conversion, so - # disable preconvert to avoid the libreoffice dependency for the smoke test. - preconvert_office_to_pdf: false - num_comparison_trials: 1 - reference_models: - glm51: - deliverables_dir: /lustre/fsw/portfolios/llmservice/users/vadams/Stirrup/output/gdpval/GLM5.1-GDPval-Reference-Outputs - elo: 1259 - minimax_m27: - deliverables_dir: /lustre/fsw/portfolios/llmservice/users/agronskiy/nemo-evaluator-rundirs/ultra-v3/MiniMaxAI/MiniMax-M2.7/20260516_133717-5cf449e4fc2a6ffd/nemo_gym.0/artifacts/gdpval/deliverables_cache - elo: 1165 - nemotron3_ultra_ga: - deliverables_dir: /lustre/fsw/portfolios/llmservice/users/vadams/Stirrup/output/gdpval/Nemotron-3-Ultra-GA-checkpoint-no-interleaved-reasoning-05-29-2026 - elo: 1168 - kimi_k25: - deliverables_dir: /lustre/fsw/portfolios/llmservice/users/vadams/Stirrup/output/gdpval/Kimi-K2.5-Thinking - elo: 1000 - qwen35_397b: - deliverables_dir: /lustre/fsw/portfolios/llmservice/users/vadams/Stirrup/output/gdpval/Qwen3.5-397B - elo: 956 - judge_model_server: - type: responses_api_models - name: gdpval_judge_model - judge_responses_create_params_overrides: - model: gcp/google/gemini-3.1-pro-preview