From 92bdbbd1fc78dd85ad01328b6bde3cb084f89b8c Mon Sep 17 00:00:00 2001 From: ceej640 <42260127+Ceej640@users.noreply.github.com> Date: Sat, 30 May 2026 22:16:12 -0400 Subject: [PATCH 1/3] Seed copilot workflow benchmarks --- benchmarks/__init__.py | 8 + benchmarks/evaluator.py | 297 ++++++++++++++++++++++++ benchmarks/mock_client.py | 103 ++++++++ benchmarks/runner.py | 55 +++++ benchmarks/tasks/copilot_workflows.json | 91 ++++++++ docs/copilot-benchmarks.md | 59 +++++ tests/test_copilot_benchmarks.py | 106 +++++++++ 7 files changed, 719 insertions(+) create mode 100644 benchmarks/evaluator.py create mode 100644 benchmarks/mock_client.py create mode 100644 benchmarks/tasks/copilot_workflows.json create mode 100644 docs/copilot-benchmarks.md create mode 100644 tests/test_copilot_benchmarks.py diff --git a/benchmarks/__init__.py b/benchmarks/__init__.py index 3d992d25..3397845c 100644 --- a/benchmarks/__init__.py +++ b/benchmarks/__init__.py @@ -4,4 +4,12 @@ Evaluation framework for measuring agent and CV subagent performance. """ +from .evaluator import BenchmarkTask, CopilotBenchmarkEvaluator, load_tasks + __version__ = "0.9.2" # Keep in sync with gently/__init__.py __version__ + +__all__ = [ + "BenchmarkTask", + "CopilotBenchmarkEvaluator", + "load_tasks", +] diff --git a/benchmarks/evaluator.py b/benchmarks/evaluator.py new file mode 100644 index 00000000..42dd7f00 --- /dev/null +++ b/benchmarks/evaluator.py @@ -0,0 +1,297 @@ +"""Deterministic copilot benchmark task scoring. + +This module scores recorded/planned tool traces against benchmark task +definitions. It does not call an LLM; callers can feed traces from a dry-run +agent, a replay harness, or hand-authored regression cases. 
+""" + +from __future__ import annotations + +import json +from dataclasses import dataclass, field +from datetime import datetime +from pathlib import Path +from typing import Any, Dict, Iterable, List, Mapping, Optional, Sequence + + +DEFAULT_TASKS_PATH = Path(__file__).parent / "tasks" / "copilot_workflows.json" + + +@dataclass(frozen=True) +class BenchmarkTask: + """One expected copilot workflow.""" + + id: str + category: str + prompt: str + expected_tools: List[str] + expected_params: Mapping[str, Mapping[str, Any]] = field(default_factory=dict) + expected_recovery_tools: List[str] = field(default_factory=list) + failure_scenario: Optional[str] = None + max_tool_calls: Optional[int] = None + tags: List[str] = field(default_factory=list) + weight: float = 1.0 + + @classmethod + def from_dict(cls, data: Mapping[str, Any]) -> "BenchmarkTask": + return cls( + id=str(data["id"]), + category=str(data["category"]), + prompt=str(data["prompt"]), + expected_tools=list(data.get("expected_tools") or []), + expected_params=data.get("expected_params") or {}, + expected_recovery_tools=list(data.get("expected_recovery_tools") or []), + failure_scenario=data.get("failure_scenario"), + max_tool_calls=data.get("max_tool_calls"), + tags=list(data.get("tags") or []), + weight=float(data.get("weight", 1.0)), + ) + + +@dataclass(frozen=True) +class BenchmarkResult: + """Score for one benchmark task.""" + + task_id: str + category: str + prompt: str + expected_tools: List[str] + actual_tools: List[str] + completion_score: float + parameter_score: float + efficiency_score: float + error_handling_score: float + total_score: float + errors: List[str] = field(default_factory=list) + + @property + def passed(self) -> bool: + return self.total_score >= 0.85 and not self.errors + + def to_dict(self) -> Dict[str, Any]: + return { + "task_id": self.task_id, + "category": self.category, + "prompt": self.prompt, + "expected_tools": self.expected_tools, + "actual_tools": self.actual_tools, + 
"scores": { + "completion": self.completion_score, + "parameters": self.parameter_score, + "efficiency": self.efficiency_score, + "error_handling": self.error_handling_score, + "total": self.total_score, + }, + "passed": self.passed, + "errors": self.errors, + } + + +@dataclass(frozen=True) +class BenchmarkReport: + """Aggregate benchmark run summary.""" + + timestamp: str + num_tasks: int + num_passed: int + average_score: float + category_scores: Mapping[str, float] + results: List[BenchmarkResult] + metadata: Mapping[str, Any] = field(default_factory=dict) + + def to_dict(self) -> Dict[str, Any]: + return { + "timestamp": self.timestamp, + "summary": { + "num_tasks": self.num_tasks, + "num_passed": self.num_passed, + "pass_rate": self.num_passed / self.num_tasks if self.num_tasks else 0.0, + "average_score": self.average_score, + "category_scores": dict(self.category_scores), + }, + "metadata": dict(self.metadata), + "results": [result.to_dict() for result in self.results], + } + + +def load_tasks(path: Path = DEFAULT_TASKS_PATH, tags: Optional[Iterable[str]] = None) -> List[BenchmarkTask]: + """Load benchmark tasks from a JSON task suite.""" + data = json.loads(Path(path).read_text(encoding="utf-8")) + tasks = [BenchmarkTask.from_dict(item) for item in data.get("tasks", [])] + if tags is None: + return tasks + + wanted = set(tags) + return [task for task in tasks if wanted.intersection(task.tags)] + + +def _tool_name(call: Mapping[str, Any]) -> Optional[str]: + return call.get("name") or call.get("tool") or call.get("tool_name") + + +def _tool_input(call: Mapping[str, Any]) -> Mapping[str, Any]: + payload = call.get("input") + if payload is None: + payload = call.get("params") + if payload is None: + payload = call.get("arguments") + return payload or {} + + +def _ordered_match_score(expected: Sequence[str], actual: Sequence[str]) -> float: + if not expected: + return 1.0 + + cursor = 0 + matched = 0 + for expected_name in expected: + for index in range(cursor, 
len(actual)): + if actual[index] == expected_name: + matched += 1 + cursor = index + 1 + break + return matched / len(expected) + + +class CopilotBenchmarkEvaluator: + """Score copilot tool traces against workflow benchmark tasks.""" + + def __init__(self, tasks: Optional[Sequence[BenchmarkTask]] = None): + self.tasks = list(tasks) if tasks is not None else load_tasks() + + def evaluate_task( + self, + task: BenchmarkTask, + tool_calls: Sequence[Mapping[str, Any]], + *, + error: Optional[str] = None, + ) -> BenchmarkResult: + actual_tools = [name for name in (_tool_name(call) for call in tool_calls) if name] + errors: List[str] = [] + + completion_score = _ordered_match_score(task.expected_tools, actual_tools) + if completion_score < 1.0: + missing = [name for name in task.expected_tools if name not in actual_tools] + errors.extend(f"missing expected tool: {name}" for name in missing) + + parameter_score = self._parameter_score(task, tool_calls, errors) + efficiency_score = self._efficiency_score(task, actual_tools) + error_handling_score = self._error_handling_score(task, actual_tools, error) + if error: + errors.append(error) + + total_score = ( + 0.45 * completion_score + + 0.25 * parameter_score + + 0.15 * efficiency_score + + 0.15 * error_handling_score + ) + + return BenchmarkResult( + task_id=task.id, + category=task.category, + prompt=task.prompt, + expected_tools=task.expected_tools, + actual_tools=actual_tools, + completion_score=round(completion_score, 4), + parameter_score=round(parameter_score, 4), + efficiency_score=round(efficiency_score, 4), + error_handling_score=round(error_handling_score, 4), + total_score=round(total_score, 4), + errors=errors, + ) + + def evaluate_traces( + self, + traces_by_task_id: Mapping[str, Sequence[Mapping[str, Any]]], + *, + errors_by_task_id: Optional[Mapping[str, str]] = None, + ) -> BenchmarkReport: + errors_by_task_id = errors_by_task_id or {} + results = [ + self.evaluate_task( + task, + 
traces_by_task_id.get(task.id, []), + error=errors_by_task_id.get(task.id), + ) + for task in self.tasks + ] + return self._report(results) + + def _parameter_score( + self, + task: BenchmarkTask, + tool_calls: Sequence[Mapping[str, Any]], + errors: List[str], + ) -> float: + if not task.expected_params: + return 1.0 + + checks = 0 + passed = 0 + calls_by_name: Dict[str, List[Mapping[str, Any]]] = {} + for call in tool_calls: + name = _tool_name(call) + if name: + calls_by_name.setdefault(name, []).append(call) + + for tool_name, expected_params in task.expected_params.items(): + calls = calls_by_name.get(tool_name) or [] + if not calls: + checks += len(expected_params) + errors.append(f"missing params because tool was not called: {tool_name}") + continue + actual_params = _tool_input(calls[0]) + for key, expected_value in expected_params.items(): + checks += 1 + if actual_params.get(key) == expected_value: + passed += 1 + else: + errors.append( + f"{tool_name}.{key}: expected {expected_value!r}, " + f"got {actual_params.get(key)!r}" + ) + + return passed / checks if checks else 1.0 + + def _efficiency_score(self, task: BenchmarkTask, actual_tools: Sequence[str]) -> float: + if not actual_tools: + return 1.0 if not task.expected_tools else 0.0 + if task.max_tool_calls is not None and len(actual_tools) > task.max_tool_calls: + return task.max_tool_calls / len(actual_tools) + optimal = max(len(task.expected_tools), 1) + return min(1.0, optimal / len(actual_tools)) + + def _error_handling_score( + self, + task: BenchmarkTask, + actual_tools: Sequence[str], + error: Optional[str], + ) -> float: + if not task.failure_scenario: + return 0.0 if error else 1.0 + if error: + return 0.0 + if not task.expected_recovery_tools: + return 1.0 + return _ordered_match_score(task.expected_recovery_tools, actual_tools) + + def _report(self, results: Sequence[BenchmarkResult]) -> BenchmarkReport: + category_totals: Dict[str, List[float]] = {} + for result in results: + 
category_totals.setdefault(result.category, []).append(result.total_score) + + category_scores = { + category: round(sum(scores) / len(scores), 4) + for category, scores in category_totals.items() + } + average = sum(result.total_score for result in results) / len(results) if results else 0.0 + return BenchmarkReport( + timestamp=datetime.now().isoformat(), + num_tasks=len(results), + num_passed=sum(1 for result in results if result.passed), + average_score=round(average, 4), + category_scores=category_scores, + results=list(results), + metadata={"task_count": len(self.tasks)}, + ) diff --git a/benchmarks/mock_client.py b/benchmarks/mock_client.py new file mode 100644 index 00000000..4697514d --- /dev/null +++ b/benchmarks/mock_client.py @@ -0,0 +1,103 @@ +"""Mock hardware client for copilot benchmark runs.""" + +from __future__ import annotations + +from collections import defaultdict, deque +from typing import Any, Deque, Dict, List, Mapping, Optional, Tuple + + +class MockQueueServerClient: + """Scriptable fake for benchmark scenarios. + + The class mirrors the async shape of the diSPIM queue/server client methods + used by tools. It records calls and lets benchmark tasks configure success + responses or failure scenarios without touching physical hardware. 
+ """ + + def __init__( + self, + *, + stage_position: Tuple[float, float] = (0.0, 0.0), + has_sam: bool = True, + ): + self.stage_position = stage_position + self.has_sam = has_sam + self.calls: List[Dict[str, Any]] = [] + self._responses: Dict[str, Deque[Any]] = defaultdict(deque) + self._failures: Dict[str, Exception] = {} + + def script_response(self, method: str, *responses: Any) -> None: + self._responses[method].extend(responses) + + def fail(self, method: str, error: Exception) -> None: + self._failures[method] = error + + def clear_failure(self, method: str) -> None: + self._failures.pop(method, None) + + def reset_calls(self) -> None: + self.calls.clear() + + def recorded_calls(self, method: Optional[str] = None) -> List[Dict[str, Any]]: + if method is None: + return list(self.calls) + return [call for call in self.calls if call["method"] == method] + + def _record(self, method: str, **payload: Any) -> None: + self.calls.append({"method": method, **payload}) + + def _response(self, method: str, default: Any) -> Any: + if method in self._failures: + raise self._failures[method] + if self._responses[method]: + response = self._responses[method].popleft() + if isinstance(response, Exception): + raise response + if callable(response): + return response() + return response + return default + + async def get_stage_position(self) -> Tuple[float, float]: + self._record("get_stage_position") + return self._response("get_stage_position", self.stage_position) + + async def move_to_position(self, x: float, y: float) -> Mapping[str, Any]: + self._record("move_to_position", x=x, y=y) + self.stage_position = (float(x), float(y)) + return self._response( + "move_to_position", + {"success": True, "x": self.stage_position[0], "y": self.stage_position[1]}, + ) + + async def detect_embryos(self, **kwargs: Any) -> Mapping[str, Any]: + self._record("detect_embryos", **kwargs) + return self._response("detect_embryos", {"success": True, "embryos": []}) + + async def 
capture_bottom_image(self, **kwargs: Any) -> Mapping[str, Any]: + self._record("capture_bottom_image", **kwargs) + return self._response( + "capture_bottom_image", + {"success": True, "image": [[0]], "stage_position": self.stage_position}, + ) + + async def capture_for_marking(self, **kwargs: Any) -> Mapping[str, Any]: + self._record("capture_for_marking", **kwargs) + return self._response( + "capture_for_marking", + {"success": True, "image": [[0]], "stage_position": self.stage_position}, + ) + + async def acquire_volume(self, **kwargs: Any) -> Mapping[str, Any]: + self._record("acquire_volume", **kwargs) + return self._response( + "acquire_volume", + {"success": True, "volume": None, "shape": (0,), **kwargs}, + ) + + async def capture_lightsheet_image(self, **kwargs: Any) -> Mapping[str, Any]: + self._record("capture_lightsheet_image", **kwargs) + return self._response( + "capture_lightsheet_image", + {"success": True, "image": [[0]], "shape": (1, 1), **kwargs}, + ) diff --git a/benchmarks/runner.py b/benchmarks/runner.py index 4d6d847a..ad2f832c 100644 --- a/benchmarks/runner.py +++ b/benchmarks/runner.py @@ -64,6 +64,50 @@ async def run_agent_benchmark(args): return 0 +def run_copilot_benchmark(args): + """Score copilot workflow traces against the standard task suite.""" + from .evaluator import CopilotBenchmarkEvaluator, load_tasks + + tags = args.tags.split(",") if args.tags else None + tasks = load_tasks(tags=tags) + evaluator = CopilotBenchmarkEvaluator(tasks=tasks) + + if not args.trace: + logger.info("=" * 60) + logger.info("COPILOT WORKFLOW BENCHMARK TASKS") + logger.info("=" * 60) + for task in tasks: + logger.info("[%s] %s", task.id, task.prompt) + logger.info(" category=%s expected=%s", task.category, task.expected_tools) + logger.info("") + logger.info("Pass --trace path/to/traces.json to score a run.") + return 0 + + with open(args.trace, encoding="utf-8") as f: + trace_data = json.load(f) + traces = trace_data.get("traces", trace_data) + + report = 
evaluator.evaluate_traces(traces) + payload = report.to_dict() + + logger.info("=" * 60) + logger.info("COPILOT WORKFLOW BENCHMARK") + logger.info("=" * 60) + logger.info("Tasks: %d", report.num_tasks) + logger.info("Pass rate: %.1f%%", payload["summary"]["pass_rate"] * 100) + logger.info("Average score: %.1f%%", report.average_score * 100) + for category, score in report.category_scores.items(): + logger.info(" %s: %.1f%%", category, score * 100) + + if args.output: + output = Path(args.output) + output.parent.mkdir(parents=True, exist_ok=True) + output.write_text(json.dumps(payload, indent=2), encoding="utf-8") + logger.info("Wrote report: %s", output) + + return 0 + + def compare_reports(args): """Compare two benchmark reports""" from .agent.evaluator import BenchmarkReport, compare_reports as _compare @@ -127,6 +171,15 @@ def main(): agent_parser.add_argument("--run", action="store_true", help="Actually run (vs dry-run)") agent_parser.add_argument("--output", help="Output file for results") + # Copilot workflow benchmark + copilot_parser = subparsers.add_parser( + "copilot", + help="List or score deterministic copilot workflow benchmarks", + ) + copilot_parser.add_argument("--tags", help="Comma-separated tags to filter") + copilot_parser.add_argument("--trace", help="JSON mapping task ids to tool-call traces") + copilot_parser.add_argument("--output", help="Output file for scored report") + # Compare reports compare_parser = subparsers.add_parser("compare", help="Compare two reports") compare_parser.add_argument("before", help="Before report JSON") @@ -136,6 +189,8 @@ def main(): if args.command == "agent": return asyncio.run(run_agent_benchmark(args)) + elif args.command == "copilot": + return run_copilot_benchmark(args) elif args.command == "compare": return compare_reports(args) else: diff --git a/benchmarks/tasks/copilot_workflows.json b/benchmarks/tasks/copilot_workflows.json new file mode 100644 index 00000000..44cc669a --- /dev/null +++ 
b/benchmarks/tasks/copilot_workflows.json @@ -0,0 +1,91 @@ +{ + "version": "0.1", + "description": "Seed task suite for deterministic copilot workflow evaluation.", + "tasks": [ + { + "id": "navigation_move_to_embryo", + "category": "navigation", + "prompt": "Move to embryo 2.", + "expected_tools": ["move_to_embryo"], + "expected_params": { + "move_to_embryo": {"embryo_id": "embryo_2"} + }, + "max_tool_calls": 2, + "tags": ["navigation", "embryo"] + }, + { + "id": "navigation_center_brightest", + "category": "navigation", + "prompt": "Find and center the brightest embryo.", + "expected_tools": ["detect_embryos", "move_to_embryo"], + "max_tool_calls": 4, + "tags": ["navigation", "detection"] + }, + { + "id": "acquisition_volume_single_embryo", + "category": "acquisition", + "prompt": "Acquire a volume of embryo 1.", + "expected_tools": ["acquire_volume"], + "expected_params": { + "acquire_volume": {"embryo_id": "embryo_1"} + }, + "max_tool_calls": 2, + "tags": ["acquisition", "volume"] + }, + { + "id": "acquisition_start_one_hour_timelapse", + "category": "acquisition", + "prompt": "Start a timelapse for one hour.", + "expected_tools": ["start_adaptive_timelapse"], + "expected_params": { + "start_adaptive_timelapse": {"duration_minutes": 60} + }, + "max_tool_calls": 3, + "tags": ["acquisition", "timelapse"] + }, + { + "id": "analysis_find_hatching_embryo", + "category": "analysis", + "prompt": "Find the hatching embryo.", + "expected_tools": ["query_embryo_status"], + "max_tool_calls": 3, + "tags": ["analysis", "hatching"] + }, + { + "id": "analysis_measure_embryo_sizes", + "category": "analysis", + "prompt": "Measure embryo sizes.", + "expected_tools": ["detect_embryos"], + "max_tool_calls": 3, + "tags": ["analysis", "measurement"] + }, + { + "id": "multi_step_calibrate_all_then_timelapse", + "category": "multi_step", + "prompt": "Calibrate all embryos and start a timelapse.", + "expected_tools": ["detect_embryos", "calibrate_all_embryos", 
"start_adaptive_timelapse"], + "max_tool_calls": 6, + "tags": ["multi_step", "calibration", "timelapse"] + }, + { + "id": "error_recovery_stage_limit", + "category": "error_recovery", + "prompt": "Recover from a stage limit error while moving to embryo 3.", + "expected_tools": ["get_stage_position", "move_to_embryo"], + "expected_recovery_tools": ["get_stage_position"], + "failure_scenario": "stage_limit", + "max_tool_calls": 5, + "tags": ["error_recovery", "navigation"] + }, + { + "id": "error_recovery_failed_detection", + "category": "error_recovery", + "prompt": "Recover from a failed embryo detection.", + "expected_tools": ["view_image", "detect_embryos"], + "expected_recovery_tools": ["view_image"], + "failure_scenario": "failed_detection", + "max_tool_calls": 5, + "tags": ["error_recovery", "detection"] + } + ] +} diff --git a/docs/copilot-benchmarks.md b/docs/copilot-benchmarks.md new file mode 100644 index 00000000..349afc91 --- /dev/null +++ b/docs/copilot-benchmarks.md @@ -0,0 +1,59 @@ +# Copilot Benchmarks + +The copilot benchmark seed defines standard microscopy workflow tasks and +scores recorded tool-call traces without requiring live hardware or an LLM call. + +Task definitions live in `benchmarks/tasks/copilot_workflows.json`. Each task +declares: + +- `category`: navigation, acquisition, analysis, multi_step, or error_recovery. +- `prompt`: the user request being evaluated. +- `expected_tools`: the ordered tool sequence the copilot should choose. +- `expected_params`: exact parameter checks for important tool calls. +- `max_tool_calls`: an efficiency budget. +- `failure_scenario` and `expected_recovery_tools` for recovery benchmarks. + +## Scoring + +`benchmarks.evaluator.CopilotBenchmarkEvaluator` computes four component +scores: + +- completion: expected tools were called in order. +- parameters: expected tool parameters matched. +- efficiency: tool-call count stayed within the task budget. 
+- error handling: recovery tasks used expected recovery tools and completed. + +The aggregate score is weighted toward task completion while still surfacing +parameter, efficiency, and recovery regressions. + +## Example + +```python +from benchmarks.evaluator import CopilotBenchmarkEvaluator + +traces = { + "acquisition_volume_single_embryo": [ + {"name": "acquire_volume", "input": {"embryo_id": "embryo_1"}}, + ], +} + +report = CopilotBenchmarkEvaluator().evaluate_traces(traces) +print(report.to_dict()["summary"]) +``` + +The evaluator is intentionally trace-based. A future runner can collect those +traces from a dry-run copilot, replay harness, or live session transcript, then +feed them through the same scoring code. + +## Mock Hardware + +Use `benchmarks.mock_client.MockQueueServerClient` when a benchmark needs +deterministic device responses: + +```python +client = MockQueueServerClient(stage_position=(10.0, 20.0)) +client.script_response("detect_embryos", {"success": True, "embryos": []}) +``` + +The mock records all method calls so benchmark traces can be compared with the +expected tool sequence. 
diff --git a/tests/test_copilot_benchmarks.py b/tests/test_copilot_benchmarks.py new file mode 100644 index 00000000..d7082697 --- /dev/null +++ b/tests/test_copilot_benchmarks.py @@ -0,0 +1,106 @@ +import pytest + +from benchmarks.evaluator import BenchmarkTask, CopilotBenchmarkEvaluator, load_tasks +from benchmarks.mock_client import MockQueueServerClient + + +def test_default_tasks_cover_required_categories(): + tasks = load_tasks() + categories = {task.category for task in tasks} + + assert { + "navigation", + "acquisition", + "analysis", + "multi_step", + "error_recovery", + }.issubset(categories) + + +def test_evaluator_scores_expected_tool_sequence_and_params(): + task = BenchmarkTask( + id="volume", + category="acquisition", + prompt="Acquire a volume of embryo 1.", + expected_tools=["acquire_volume"], + expected_params={"acquire_volume": {"embryo_id": "embryo_1"}}, + max_tool_calls=2, + ) + evaluator = CopilotBenchmarkEvaluator(tasks=[task]) + + result = evaluator.evaluate_task( + task, + [{"name": "acquire_volume", "input": {"embryo_id": "embryo_1"}}], + ) + + assert result.passed + assert result.total_score == pytest.approx(1.0) + + +def test_evaluator_penalizes_missing_tools_and_extra_calls(): + task = BenchmarkTask( + id="move", + category="navigation", + prompt="Move to embryo 2.", + expected_tools=["move_to_embryo"], + expected_params={"move_to_embryo": {"embryo_id": "embryo_2"}}, + max_tool_calls=1, + ) + evaluator = CopilotBenchmarkEvaluator(tasks=[task]) + + result = evaluator.evaluate_task( + task, + [ + {"name": "get_stage_position", "input": {}}, + {"name": "move_stage", "input": {"x": 100.0, "y": 200.0}}, + ], + ) + + assert not result.passed + assert result.completion_score == 0.0 + assert result.efficiency_score == 0.5 + assert "missing expected tool: move_to_embryo" in result.errors + + +def test_evaluator_reports_category_scores(): + tasks = [ + BenchmarkTask("ok", "navigation", "Move", ["move_stage"]), + BenchmarkTask("bad", "analysis", 
"Analyze", ["query_embryo_status"]), + ] + evaluator = CopilotBenchmarkEvaluator(tasks=tasks) + + report = evaluator.evaluate_traces( + { + "ok": [{"name": "move_stage", "input": {}}], + "bad": [], + } + ) + + assert report.num_tasks == 2 + assert report.category_scores["navigation"] == 1.0 + assert report.category_scores["analysis"] < 1.0 + + +@pytest.mark.asyncio +async def test_mock_client_records_scripted_responses(): + client = MockQueueServerClient(stage_position=(10.0, 20.0)) + client.script_response("detect_embryos", {"success": True, "embryos": ["e1"]}) + + await client.move_to_position(100.0, 200.0) + result = await client.detect_embryos() + + assert result["embryos"] == ["e1"] + assert client.recorded_calls("move_to_position") == [ + {"method": "move_to_position", "x": 100.0, "y": 200.0} + ] + + +@pytest.mark.asyncio +async def test_mock_client_can_script_failures(): + client = MockQueueServerClient() + client.fail("move_to_position", RuntimeError("stage limit")) + + with pytest.raises(RuntimeError, match="stage limit"): + await client.move_to_position(999999.0, 0.0) + + assert client.recorded_calls("move_to_position")[0]["x"] == 999999.0 From eb3d3bbc030bd23031da8e965428e396ef4cd5a5 Mon Sep 17 00:00:00 2001 From: Johnson Date: Mon, 1 Jun 2026 00:40:22 -0400 Subject: [PATCH 2/3] Frame agent workflow benchmarks conceptually --- benchmarks/__init__.py | 8 +- benchmarks/evaluator.py | 15 ++-- benchmarks/mock_client.py | 2 +- benchmarks/runner.py | 36 +++++--- ...ot_workflows.json => agent_workflows.json} | 2 +- docs/agent-workflow-benchmarks.md | 89 +++++++++++++++++++ docs/copilot-benchmarks.md | 59 ------------ ...s.py => test_agent_workflow_benchmarks.py} | 8 +- 8 files changed, 134 insertions(+), 85 deletions(-) rename benchmarks/tasks/{copilot_workflows.json => agent_workflows.json} (97%) create mode 100644 docs/agent-workflow-benchmarks.md delete mode 100644 docs/copilot-benchmarks.md rename tests/{test_copilot_benchmarks.py => 
test_agent_workflow_benchmarks.py} (91%) diff --git a/benchmarks/__init__.py b/benchmarks/__init__.py index 3397845c..0cf5a11d 100644 --- a/benchmarks/__init__.py +++ b/benchmarks/__init__.py @@ -4,12 +4,18 @@ Evaluation framework for measuring agent and CV subagent performance. """ -from .evaluator import BenchmarkTask, CopilotBenchmarkEvaluator, load_tasks +from .evaluator import ( + AgentWorkflowBenchmarkEvaluator, + BenchmarkTask, + CopilotBenchmarkEvaluator, + load_tasks, +) __version__ = "0.9.2" # Keep in sync with gently/__init__.py __version__ __all__ = [ "BenchmarkTask", + "AgentWorkflowBenchmarkEvaluator", "CopilotBenchmarkEvaluator", "load_tasks", ] diff --git a/benchmarks/evaluator.py b/benchmarks/evaluator.py index 42dd7f00..4aa94759 100644 --- a/benchmarks/evaluator.py +++ b/benchmarks/evaluator.py @@ -1,4 +1,4 @@ -"""Deterministic copilot benchmark task scoring. +"""Deterministic agent workflow benchmark task scoring. This module scores recorded/planned tool traces against benchmark task definitions. 
It does not call an LLM; callers can feed traces from a dry-run @@ -14,12 +14,12 @@ from typing import Any, Dict, Iterable, List, Mapping, Optional, Sequence -DEFAULT_TASKS_PATH = Path(__file__).parent / "tasks" / "copilot_workflows.json" +DEFAULT_TASKS_PATH = Path(__file__).parent / "tasks" / "agent_workflows.json" @dataclass(frozen=True) class BenchmarkTask: - """One expected copilot workflow.""" + """One expected Gently agent workflow.""" id: str category: str @@ -153,8 +153,8 @@ def _ordered_match_score(expected: Sequence[str], actual: Sequence[str]) -> floa return matched / len(expected) -class CopilotBenchmarkEvaluator: - """Score copilot tool traces against workflow benchmark tasks.""" +class AgentWorkflowBenchmarkEvaluator: + """Score Gently agent tool traces against workflow benchmark tasks.""" def __init__(self, tasks: Optional[Sequence[BenchmarkTask]] = None): self.tasks = list(tasks) if tasks is not None else load_tasks() @@ -295,3 +295,8 @@ def _report(self, results: Sequence[BenchmarkResult]) -> BenchmarkReport: results=list(results), metadata={"task_count": len(self.tasks)}, ) + + +# Backward-compatible alias for older callers while the benchmark terminology +# moves away from "copilot". 
+CopilotBenchmarkEvaluator = AgentWorkflowBenchmarkEvaluator diff --git a/benchmarks/mock_client.py b/benchmarks/mock_client.py index 4697514d..6109c0ac 100644 --- a/benchmarks/mock_client.py +++ b/benchmarks/mock_client.py @@ -1,4 +1,4 @@ -"""Mock hardware client for copilot benchmark runs.""" +"""Mock hardware client for agent workflow benchmark runs.""" from __future__ import annotations diff --git a/benchmarks/runner.py b/benchmarks/runner.py index ad2f832c..8784df8e 100644 --- a/benchmarks/runner.py +++ b/benchmarks/runner.py @@ -64,17 +64,17 @@ async def run_agent_benchmark(args): return 0 -def run_copilot_benchmark(args): - """Score copilot workflow traces against the standard task suite.""" - from .evaluator import CopilotBenchmarkEvaluator, load_tasks +def run_workflow_benchmark(args): + """Score agent workflow traces against the standard task suite.""" + from .evaluator import AgentWorkflowBenchmarkEvaluator, load_tasks tags = args.tags.split(",") if args.tags else None tasks = load_tasks(tags=tags) - evaluator = CopilotBenchmarkEvaluator(tasks=tasks) + evaluator = AgentWorkflowBenchmarkEvaluator(tasks=tasks) if not args.trace: logger.info("=" * 60) - logger.info("COPILOT WORKFLOW BENCHMARK TASKS") + logger.info("AGENT WORKFLOW BENCHMARK TASKS") logger.info("=" * 60) for task in tasks: logger.info("[%s] %s", task.id, task.prompt) @@ -91,7 +91,7 @@ def run_copilot_benchmark(args): payload = report.to_dict() logger.info("=" * 60) - logger.info("COPILOT WORKFLOW BENCHMARK") + logger.info("AGENT WORKFLOW BENCHMARK") logger.info("=" * 60) logger.info("Tasks: %d", report.num_tasks) logger.info("Pass rate: %.1f%%", payload["summary"]["pass_rate"] * 100) @@ -171,14 +171,22 @@ def main(): agent_parser.add_argument("--run", action="store_true", help="Actually run (vs dry-run)") agent_parser.add_argument("--output", help="Output file for results") - # Copilot workflow benchmark - copilot_parser = subparsers.add_parser( + # Agent workflow benchmark + workflow_parser 
= subparsers.add_parser( + "workflow", + help="List or score deterministic agent workflow benchmarks", + ) + workflow_parser.add_argument("--tags", help="Comma-separated tags to filter") + workflow_parser.add_argument("--trace", help="JSON mapping task ids to tool-call traces") + workflow_parser.add_argument("--output", help="Output file for scored report") + + legacy_parser = subparsers.add_parser( "copilot", - help="List or score deterministic copilot workflow benchmarks", + help=argparse.SUPPRESS, ) - copilot_parser.add_argument("--tags", help="Comma-separated tags to filter") - copilot_parser.add_argument("--trace", help="JSON mapping task ids to tool-call traces") - copilot_parser.add_argument("--output", help="Output file for scored report") + legacy_parser.add_argument("--tags", help=argparse.SUPPRESS) + legacy_parser.add_argument("--trace", help=argparse.SUPPRESS) + legacy_parser.add_argument("--output", help=argparse.SUPPRESS) # Compare reports compare_parser = subparsers.add_parser("compare", help="Compare two reports") @@ -189,8 +197,8 @@ def main(): if args.command == "agent": return asyncio.run(run_agent_benchmark(args)) - elif args.command == "copilot": - return run_copilot_benchmark(args) + elif args.command in {"workflow", "copilot"}: + return run_workflow_benchmark(args) elif args.command == "compare": return compare_reports(args) else: diff --git a/benchmarks/tasks/copilot_workflows.json b/benchmarks/tasks/agent_workflows.json similarity index 97% rename from benchmarks/tasks/copilot_workflows.json rename to benchmarks/tasks/agent_workflows.json index 44cc669a..1ad3c35f 100644 --- a/benchmarks/tasks/copilot_workflows.json +++ b/benchmarks/tasks/agent_workflows.json @@ -1,6 +1,6 @@ { "version": "0.1", - "description": "Seed task suite for deterministic copilot workflow evaluation.", + "description": "Seed task suite for deterministic Gently agent workflow evaluation.", "tasks": [ { "id": "navigation_move_to_embryo", diff --git 
a/docs/agent-workflow-benchmarks.md b/docs/agent-workflow-benchmarks.md new file mode 100644 index 00000000..be75615b --- /dev/null +++ b/docs/agent-workflow-benchmarks.md @@ -0,0 +1,89 @@ +# Agent Workflow Benchmarks + +The benchmark concept comes before the runner: Gently should be measured on +whether it can turn a scientist's intent into a safe, inspectable, and useful +experimental trace. A scripted workflow that merely calls the expected tools is +not enough if the trace is unsafe, scientifically thin, or impossible for a +human operator to understand. + +## Measurement Contract + +Each benchmark task should state: + +- the scientific intent being tested +- the microscope or bench context needed to satisfy that intent +- the sample state assumptions and failure modes +- the safety constraints that must never be violated +- the expected operator-facing evidence at the end of the run +- the allowed tool-call or latency budget + +Scores should cover these dimensions: + +- task completion: the requested experimental state was reached +- scientific validity: controls, constraints, and decision points are present +- hardware safety: unsafe motion, illumination, and device states are avoided +- trace quality: a human can reconstruct what happened and why +- efficiency: the agent avoided unnecessary tool calls and retries +- robustness: missing data, failed tools, and stale state were handled +- operator experience: the workflow needed few unnecessary clarifications +- generalization: the same concept works across imaging, bench, genetics, and + analysis tasks + +## Seed Task Suite + +Task definitions live in `benchmarks/tasks/agent_workflows.json`. Each task +declares: + +- `category`: navigation, acquisition, analysis, multi_step, or error_recovery. +- `prompt`: the user request being evaluated. +- `expected_tools`: the ordered tool sequence the Gently agent should choose. +- `expected_params`: exact parameter checks for important tool calls. 
+- `max_tool_calls`: an efficiency budget. +- `failure_scenario` and `expected_recovery_tools` for recovery benchmarks. + +## Scoring + +`benchmarks.evaluator.AgentWorkflowBenchmarkEvaluator` computes four initial +component scores: + +- completion: expected tools were called in order. +- parameters: expected tool parameters matched. +- efficiency: tool-call count stayed within the task budget. +- error handling: recovery tasks used expected recovery tools and completed. + +These are a first trace-based subset of the measurement contract above. They +are useful for deterministic regressions, but should not be treated as a full +quality benchmark until safety, scientific validity, trace quality, and +operator experience have corresponding evaluators. + +## Example + +```python +from benchmarks.evaluator import AgentWorkflowBenchmarkEvaluator + +traces = { + "acquisition_volume_single_embryo": [ + {"name": "acquire_volume", "input": {"embryo_id": "embryo_1"}}, + ], +} + +report = AgentWorkflowBenchmarkEvaluator().evaluate_traces(traces) +print(report.to_dict()["summary"]) +``` + +The evaluator is intentionally trace-based. A future runner can collect those +traces from a dry-run Gently agent, replay harness, or live session transcript, +then feed them through the same scoring code. + +## Mock Hardware + +Use `benchmarks.mock_client.MockQueueServerClient` when a benchmark needs +deterministic device responses: + +```python +client = MockQueueServerClient(stage_position=(10.0, 20.0)) +client.script_response("detect_embryos", {"success": True, "embryos": []}) +``` + +The mock records all method calls so benchmark traces can be compared with the +expected tool sequence. 
diff --git a/docs/copilot-benchmarks.md b/docs/copilot-benchmarks.md deleted file mode 100644 index 349afc91..00000000 --- a/docs/copilot-benchmarks.md +++ /dev/null @@ -1,59 +0,0 @@ -# Copilot Benchmarks - -The copilot benchmark seed defines standard microscopy workflow tasks and -scores recorded tool-call traces without requiring live hardware or an LLM call. - -Task definitions live in `benchmarks/tasks/copilot_workflows.json`. Each task -declares: - -- `category`: navigation, acquisition, analysis, multi_step, or error_recovery. -- `prompt`: the user request being evaluated. -- `expected_tools`: the ordered tool sequence the copilot should choose. -- `expected_params`: exact parameter checks for important tool calls. -- `max_tool_calls`: an efficiency budget. -- `failure_scenario` and `expected_recovery_tools` for recovery benchmarks. - -## Scoring - -`benchmarks.evaluator.CopilotBenchmarkEvaluator` computes four component -scores: - -- completion: expected tools were called in order. -- parameters: expected tool parameters matched. -- efficiency: tool-call count stayed within the task budget. -- error handling: recovery tasks used expected recovery tools and completed. - -The aggregate score is weighted toward task completion while still surfacing -parameter, efficiency, and recovery regressions. - -## Example - -```python -from benchmarks.evaluator import CopilotBenchmarkEvaluator - -traces = { - "acquisition_volume_single_embryo": [ - {"name": "acquire_volume", "input": {"embryo_id": "embryo_1"}}, - ], -} - -report = CopilotBenchmarkEvaluator().evaluate_traces(traces) -print(report.to_dict()["summary"]) -``` - -The evaluator is intentionally trace-based. A future runner can collect those -traces from a dry-run copilot, replay harness, or live session transcript, then -feed them through the same scoring code. 
- -## Mock Hardware - -Use `benchmarks.mock_client.MockQueueServerClient` when a benchmark needs -deterministic device responses: - -```python -client = MockQueueServerClient(stage_position=(10.0, 20.0)) -client.script_response("detect_embryos", {"success": True, "embryos": []}) -``` - -The mock records all method calls so benchmark traces can be compared with the -expected tool sequence. diff --git a/tests/test_copilot_benchmarks.py b/tests/test_agent_workflow_benchmarks.py similarity index 91% rename from tests/test_copilot_benchmarks.py rename to tests/test_agent_workflow_benchmarks.py index d7082697..fbfd7c12 100644 --- a/tests/test_copilot_benchmarks.py +++ b/tests/test_agent_workflow_benchmarks.py @@ -1,6 +1,6 @@ import pytest -from benchmarks.evaluator import BenchmarkTask, CopilotBenchmarkEvaluator, load_tasks +from benchmarks.evaluator import AgentWorkflowBenchmarkEvaluator, BenchmarkTask, load_tasks from benchmarks.mock_client import MockQueueServerClient @@ -26,7 +26,7 @@ def test_evaluator_scores_expected_tool_sequence_and_params(): expected_params={"acquire_volume": {"embryo_id": "embryo_1"}}, max_tool_calls=2, ) - evaluator = CopilotBenchmarkEvaluator(tasks=[task]) + evaluator = AgentWorkflowBenchmarkEvaluator(tasks=[task]) result = evaluator.evaluate_task( task, @@ -46,7 +46,7 @@ def test_evaluator_penalizes_missing_tools_and_extra_calls(): expected_params={"move_to_embryo": {"embryo_id": "embryo_2"}}, max_tool_calls=1, ) - evaluator = CopilotBenchmarkEvaluator(tasks=[task]) + evaluator = AgentWorkflowBenchmarkEvaluator(tasks=[task]) result = evaluator.evaluate_task( task, @@ -67,7 +67,7 @@ def test_evaluator_reports_category_scores(): BenchmarkTask("ok", "navigation", "Move", ["move_stage"]), BenchmarkTask("bad", "analysis", "Analyze", ["query_embryo_status"]), ] - evaluator = CopilotBenchmarkEvaluator(tasks=tasks) + evaluator = AgentWorkflowBenchmarkEvaluator(tasks=tasks) report = evaluator.evaluate_traces( { From 
3f282c98b402c18366d878bb615a38ab2a8a1d0e Mon Sep 17 00:00:00 2001 From: Johnson Date: Mon, 1 Jun 2026 01:38:31 -0400 Subject: [PATCH 3/3] Add benchmark review rubric fields --- benchmarks/evaluator.py | 35 ++++++ benchmarks/runner.py | 13 +++ benchmarks/tasks/agent_workflows.json | 138 ++++++++++++++++++++++++ docs/agent-workflow-benchmarks.md | 12 +++ tests/test_agent_workflow_benchmarks.py | 37 +++++++ 5 files changed, 235 insertions(+) diff --git a/benchmarks/evaluator.py b/benchmarks/evaluator.py index 4aa94759..00ff7846 100644 --- a/benchmarks/evaluator.py +++ b/benchmarks/evaluator.py @@ -28,6 +28,11 @@ class BenchmarkTask: expected_params: Mapping[str, Mapping[str, Any]] = field(default_factory=dict) expected_recovery_tools: List[str] = field(default_factory=list) failure_scenario: Optional[str] = None + safety_constraints: List[str] = field(default_factory=list) + scientific_validity: List[str] = field(default_factory=list) + trace_quality_checks: List[str] = field(default_factory=list) + operator_experience_checks: List[str] = field(default_factory=list) + expected_evidence: List[str] = field(default_factory=list) max_tool_calls: Optional[int] = None tags: List[str] = field(default_factory=list) weight: float = 1.0 @@ -42,6 +47,11 @@ def from_dict(cls, data: Mapping[str, Any]) -> "BenchmarkTask": expected_params=data.get("expected_params") or {}, expected_recovery_tools=list(data.get("expected_recovery_tools") or []), failure_scenario=data.get("failure_scenario"), + safety_constraints=list(data.get("safety_constraints") or []), + scientific_validity=list(data.get("scientific_validity") or []), + trace_quality_checks=list(data.get("trace_quality_checks") or []), + operator_experience_checks=list(data.get("operator_experience_checks") or []), + expected_evidence=list(data.get("expected_evidence") or []), max_tool_calls=data.get("max_tool_calls"), tags=list(data.get("tags") or []), weight=float(data.get("weight", 1.0)), @@ -63,11 +73,16 @@ class 
BenchmarkResult: error_handling_score: float total_score: float errors: List[str] = field(default_factory=list) + review_checklist: Mapping[str, List[str]] = field(default_factory=dict) @property def passed(self) -> bool: return self.total_score >= 0.85 and not self.errors + @property + def manual_review_required(self) -> bool: + return any(self.review_checklist.values()) + def to_dict(self) -> Dict[str, Any]: return { "task_id": self.task_id, @@ -84,6 +99,12 @@ def to_dict(self) -> Dict[str, Any]: }, "passed": self.passed, "errors": self.errors, + "manual_review_required": self.manual_review_required, + "review_checklist": { + name: list(checks) + for name, checks in self.review_checklist.items() + if checks + }, } @@ -108,6 +129,9 @@ def to_dict(self) -> Dict[str, Any]: "pass_rate": self.num_passed / self.num_tasks if self.num_tasks else 0.0, "average_score": self.average_score, "category_scores": dict(self.category_scores), + "manual_review_tasks": sum( + 1 for result in self.results if result.manual_review_required + ), }, "metadata": dict(self.metadata), "results": [result.to_dict() for result in self.results], @@ -153,6 +177,16 @@ def _ordered_match_score(expected: Sequence[str], actual: Sequence[str]) -> floa return matched / len(expected) +def _review_checklist(task: BenchmarkTask) -> Dict[str, List[str]]: + return { + "safety_constraints": list(task.safety_constraints), + "scientific_validity": list(task.scientific_validity), + "trace_quality": list(task.trace_quality_checks), + "operator_experience": list(task.operator_experience_checks), + "expected_evidence": list(task.expected_evidence), + } + + class AgentWorkflowBenchmarkEvaluator: """Score Gently agent tool traces against workflow benchmark tasks.""" @@ -199,6 +233,7 @@ def evaluate_task( error_handling_score=round(error_handling_score, 4), total_score=round(total_score, 4), errors=errors, + review_checklist=_review_checklist(task), ) def evaluate_traces( diff --git a/benchmarks/runner.py 
b/benchmarks/runner.py index 8784df8e..d2b61dbc 100644 --- a/benchmarks/runner.py +++ b/benchmarks/runner.py @@ -79,6 +79,18 @@ def run_workflow_benchmark(args): for task in tasks: logger.info("[%s] %s", task.id, task.prompt) logger.info(" category=%s expected=%s", task.category, task.expected_tools) + checklist_items = sum( + len(items) + for items in [ + task.safety_constraints, + task.scientific_validity, + task.trace_quality_checks, + task.operator_experience_checks, + task.expected_evidence, + ] + ) + if checklist_items: + logger.info(" manual review checks=%d", checklist_items) logger.info("") logger.info("Pass --trace path/to/traces.json to score a run.") return 0 @@ -96,6 +108,7 @@ def run_workflow_benchmark(args): logger.info("Tasks: %d", report.num_tasks) logger.info("Pass rate: %.1f%%", payload["summary"]["pass_rate"] * 100) logger.info("Average score: %.1f%%", report.average_score * 100) + logger.info("Manual review tasks: %d", payload["summary"]["manual_review_tasks"]) for category, score in report.category_scores.items(): logger.info(" %s: %.1f%%", category, score * 100) diff --git a/benchmarks/tasks/agent_workflows.json b/benchmarks/tasks/agent_workflows.json index 1ad3c35f..5de4ffad 100644 --- a/benchmarks/tasks/agent_workflows.json +++ b/benchmarks/tasks/agent_workflows.json @@ -10,6 +10,22 @@ "expected_params": { "move_to_embryo": {"embryo_id": "embryo_2"} }, + "safety_constraints": [ + "Do not command motion outside calibrated stage bounds.", + "Verify the requested embryo id resolves before moving." + ], + "scientific_validity": [ + "The move target is tied to the active sample map." + ], + "trace_quality_checks": [ + "Trace records the requested embryo id and final stage target." + ], + "operator_experience_checks": [ + "Operator can confirm which embryo was selected." + ], + "expected_evidence": [ + "Stage move result or final position is present." 
+ ], "max_tool_calls": 2, "tags": ["navigation", "embryo"] }, @@ -18,6 +34,21 @@ "category": "navigation", "prompt": "Find and center the brightest embryo.", "expected_tools": ["detect_embryos", "move_to_embryo"], + "safety_constraints": [ + "Do not move until detection returns a candidate within calibrated bounds." + ], + "scientific_validity": [ + "Brightness selection is based on current image evidence, not stale state." + ], + "trace_quality_checks": [ + "Trace records detected candidates and the selection reason." + ], + "operator_experience_checks": [ + "Operator can see why the brightest embryo was chosen." + ], + "expected_evidence": [ + "Detection result and selected embryo id are present." + ], "max_tool_calls": 4, "tags": ["navigation", "detection"] }, @@ -29,6 +60,21 @@ "expected_params": { "acquire_volume": {"embryo_id": "embryo_1"} }, + "safety_constraints": [ + "Respect configured illumination and motion limits during acquisition." + ], + "scientific_validity": [ + "Acquisition is associated with the requested embryo and imaging objective." + ], + "trace_quality_checks": [ + "Trace records volume settings and artifact destination." + ], + "operator_experience_checks": [ + "Operator can find the resulting volume without reading internal logs." + ], + "expected_evidence": [ + "Volume artifact metadata is present." + ], "max_tool_calls": 2, "tags": ["acquisition", "volume"] }, @@ -40,6 +86,21 @@ "expected_params": { "start_adaptive_timelapse": {"duration_minutes": 60} }, + "safety_constraints": [ + "Timelapse duration and illumination remain within configured limits." + ], + "scientific_validity": [ + "The plan preserves timepoint cadence and sample identity." + ], + "trace_quality_checks": [ + "Trace records duration, cadence, and adaptive decision points." + ], + "operator_experience_checks": [ + "Operator can inspect the active timelapse state." + ], + "expected_evidence": [ + "Timelapse session metadata is present." 
+ ], "max_tool_calls": 3, "tags": ["acquisition", "timelapse"] }, @@ -48,6 +109,21 @@ "category": "analysis", "prompt": "Find the hatching embryo.", "expected_tools": ["query_embryo_status"], + "safety_constraints": [ + "Analysis does not alter microscope state." + ], + "scientific_validity": [ + "Hatching status is derived from current embryo annotations or observations." + ], + "trace_quality_checks": [ + "Trace records the queried status field and matching embryo." + ], + "operator_experience_checks": [ + "Operator can see the reason the embryo was classified as hatching." + ], + "expected_evidence": [ + "Embryo status result is present." + ], "max_tool_calls": 3, "tags": ["analysis", "hatching"] }, @@ -56,6 +132,21 @@ "category": "analysis", "prompt": "Measure embryo sizes.", "expected_tools": ["detect_embryos"], + "safety_constraints": [ + "Measurement pass does not move hardware unless explicitly requested." + ], + "scientific_validity": [ + "Size estimates come from the current image or declared image source." + ], + "trace_quality_checks": [ + "Trace records detected embryos and size units." + ], + "operator_experience_checks": [ + "Operator can identify which embryos were measured." + ], + "expected_evidence": [ + "Per-embryo size measurements are present." + ], "max_tool_calls": 3, "tags": ["analysis", "measurement"] }, @@ -64,6 +155,23 @@ "category": "multi_step", "prompt": "Calibrate all embryos and start a timelapse.", "expected_tools": ["detect_embryos", "calibrate_all_embryos", "start_adaptive_timelapse"], + "safety_constraints": [ + "Calibration moves remain within stage limits.", + "Timelapse starts only after calibration completes or is explicitly skipped." + ], + "scientific_validity": [ + "Every embryo used in the timelapse has a calibration record.", + "The timelapse plan preserves the requested sample set." + ], + "trace_quality_checks": [ + "Trace records detection, calibration outcomes, and timelapse start." 
+ ], + "operator_experience_checks": [ + "Operator can see which embryos are included and which failed calibration." + ], + "expected_evidence": [ + "Calibration records and timelapse session metadata are present." + ], "max_tool_calls": 6, "tags": ["multi_step", "calibration", "timelapse"] }, @@ -74,6 +182,21 @@ "expected_tools": ["get_stage_position", "move_to_embryo"], "expected_recovery_tools": ["get_stage_position"], "failure_scenario": "stage_limit", + "safety_constraints": [ + "After a stage-limit error, do not retry the same unsafe target blindly." + ], + "scientific_validity": [ + "Recovery preserves the requested embryo target or reports it cannot be reached." + ], + "trace_quality_checks": [ + "Trace records the error, current position, and recovery decision." + ], + "operator_experience_checks": [ + "Operator can tell whether motion was recovered or stopped." + ], + "expected_evidence": [ + "Stage position query after the failure is present." + ], "max_tool_calls": 5, "tags": ["error_recovery", "navigation"] }, @@ -84,6 +207,21 @@ "expected_tools": ["view_image", "detect_embryos"], "expected_recovery_tools": ["view_image"], "failure_scenario": "failed_detection", + "safety_constraints": [ + "Failed detection recovery does not start acquisition on an unknown target." + ], + "scientific_validity": [ + "Recovery obtains fresh image evidence before retrying detection." + ], + "trace_quality_checks": [ + "Trace records the failed detection and image used for retry." + ], + "operator_experience_checks": [ + "Operator can see why detection was retried." + ], + "expected_evidence": [ + "Image view result and retry detection result are present." 
+ ], "max_tool_calls": 5, "tags": ["error_recovery", "detection"] } diff --git a/docs/agent-workflow-benchmarks.md b/docs/agent-workflow-benchmarks.md index be75615b..ca71e5ef 100644 --- a/docs/agent-workflow-benchmarks.md +++ b/docs/agent-workflow-benchmarks.md @@ -40,6 +40,13 @@ declares: - `expected_params`: exact parameter checks for important tool calls. - `max_tool_calls`: an efficiency budget. - `failure_scenario` and `expected_recovery_tools` for recovery benchmarks. +- `safety_constraints`: hardware or sample-safety requirements that must be + checked by a reviewer. +- `scientific_validity`: checks for whether the run makes scientific sense. +- `trace_quality_checks`: evidence needed to reconstruct what happened and why. +- `operator_experience_checks`: checks that the operator can understand or act + on the result. +- `expected_evidence`: artifacts or metadata that should exist after the run. ## Scoring @@ -56,6 +63,11 @@ are useful for deterministic regressions, but should not be treated as a full quality benchmark until safety, scientific validity, trace quality, and operator experience have corresponding evaluators. +Until those evaluators exist, each scored result also carries a +`review_checklist` mapping and a `manual_review_required` flag. A trace can +pass the deterministic score while still requiring human review of the listed +safety, scientific, trace-quality, operator-experience, and evidence checks. 
+ ## Example ```python diff --git a/tests/test_agent_workflow_benchmarks.py b/tests/test_agent_workflow_benchmarks.py index fbfd7c12..30e208e9 100644 --- a/tests/test_agent_workflow_benchmarks.py +++ b/tests/test_agent_workflow_benchmarks.py @@ -81,6 +81,43 @@ def test_evaluator_reports_category_scores(): assert report.category_scores["analysis"] < 1.0 +def test_evaluator_reports_manual_review_checklist(): + task = BenchmarkTask( + id="safe_volume", + category="acquisition", + prompt="Acquire a safe volume.", + expected_tools=["acquire_volume"], + safety_constraints=["Respect the configured illumination limit."], + scientific_validity=["Record the embryo id and imaging objective."], + trace_quality_checks=["Trace includes the acquisition reason."], + operator_experience_checks=["Operator can see the final volume path."], + expected_evidence=["Volume artifact metadata is present."], + ) + evaluator = AgentWorkflowBenchmarkEvaluator(tasks=[task]) + + report = evaluator.evaluate_traces( + {"safe_volume": [{"name": "acquire_volume", "input": {}}]} + ) + result = report.results[0].to_dict() + + assert result["manual_review_required"] is True + assert result["review_checklist"]["safety_constraints"] == [ + "Respect the configured illumination limit." + ] + assert result["review_checklist"]["expected_evidence"] == [ + "Volume artifact metadata is present." + ] + assert report.to_dict()["summary"]["manual_review_tasks"] == 1 + + +def test_default_tasks_include_review_rubric_fields(): + task = next(task for task in load_tasks() if task.id == "multi_step_calibrate_all_then_timelapse") + + assert task.safety_constraints + assert task.scientific_validity + assert task.expected_evidence + + @pytest.mark.asyncio async def test_mock_client_records_scripted_responses(): client = MockQueueServerClient(stage_position=(10.0, 20.0))