diff --git a/benchmarks/__init__.py b/benchmarks/__init__.py
index 3d992d25..0cf5a11d 100644
--- a/benchmarks/__init__.py
+++ b/benchmarks/__init__.py
@@ -4,4 +4,18 @@
 Evaluation framework for measuring agent and CV subagent performance.
 """
 
+from .evaluator import (
+    AgentWorkflowBenchmarkEvaluator,
+    BenchmarkTask,
+    CopilotBenchmarkEvaluator,
+    load_tasks,
+)
+
 __version__ = "0.9.2"  # Keep in sync with gently/__init__.py __version__
+
+__all__ = [
+    "BenchmarkTask",
+    "AgentWorkflowBenchmarkEvaluator",
+    "CopilotBenchmarkEvaluator",
+    "load_tasks",
+]
diff --git a/benchmarks/evaluator.py b/benchmarks/evaluator.py
new file mode 100644
index 00000000..00ff7846
--- /dev/null
+++ b/benchmarks/evaluator.py
@@ -0,0 +1,337 @@
+"""Deterministic agent workflow benchmark task scoring.
+
+This module scores recorded/planned tool traces against benchmark task
+definitions. It does not call an LLM; callers can feed traces from a dry-run
+agent, a replay harness, or hand-authored regression cases.
+"""
+
+from __future__ import annotations
+
+import json
+from dataclasses import dataclass, field
+from datetime import datetime
+from pathlib import Path
+from typing import Any, Dict, Iterable, List, Mapping, Optional, Sequence
+
+
+DEFAULT_TASKS_PATH = Path(__file__).parent / "tasks" / "agent_workflows.json"
+
+
+@dataclass(frozen=True)
+class BenchmarkTask:
+    """Definition of a single expected Gently agent workflow.
+
+    Holds the prompt, the tools (and optional exact parameters) the agent is
+    expected to use, recovery expectations for failure scenarios, and the
+    free-text checklists that are carried into results for manual review.
+    """
+
+    # Identity and request.
+    id: str
+    category: str
+    prompt: str
+    # Ordered tool names the trace is matched against.
+    expected_tools: List[str]
+    # Tool name -> {parameter name: exact expected value}.
+    expected_params: Mapping[str, Mapping[str, Any]] = field(default_factory=dict)
+    # Recovery expectations, only consulted when failure_scenario is set.
+    expected_recovery_tools: List[str] = field(default_factory=list)
+    failure_scenario: Optional[str] = None
+    # Free-text checklists; not scored automatically.
+    safety_constraints: List[str] = field(default_factory=list)
+    scientific_validity: List[str] = field(default_factory=list)
+    trace_quality_checks: List[str] = field(default_factory=list)
+    operator_experience_checks: List[str] = field(default_factory=list)
+    expected_evidence: List[str] = field(default_factory=list)
+    # Tool-call budget used by the efficiency score.
+    max_tool_calls: Optional[int] = None
+    tags: List[str] = field(default_factory=list)
+    weight: float = 1.0
+
+    @classmethod
+    def from_dict(cls, data: Mapping[str, Any]) -> "BenchmarkTask":
+        """Build a task from one decoded JSON task entry.
+
+        ``id``, ``category`` and ``prompt`` are required (``KeyError`` when
+        absent) and are coerced to ``str``. Every other key is optional: list
+        fields fall back to an empty list, ``expected_params`` to an empty
+        dict, and ``weight`` to ``1.0``.
+        """
+
+        def as_list(key: str) -> List[Any]:
+            # A missing key and an explicit null both become an empty list.
+            return list(data.get(key) or [])
+
+        required = {name: str(data[name]) for name in ("id", "category", "prompt")}
+        return cls(
+            expected_tools=as_list("expected_tools"),
+            expected_params=data.get("expected_params") or {},
+            expected_recovery_tools=as_list("expected_recovery_tools"),
+            failure_scenario=data.get("failure_scenario"),
+            safety_constraints=as_list("safety_constraints"),
+            scientific_validity=as_list("scientific_validity"),
+            trace_quality_checks=as_list("trace_quality_checks"),
+            operator_experience_checks=as_list("operator_experience_checks"),
+            expected_evidence=as_list("expected_evidence"),
+            max_tool_calls=data.get("max_tool_calls"),
+            tags=as_list("tags"),
+            weight=float(data.get("weight", 1.0)),
+            **required,
+        )
+
+
+@dataclass(frozen=True)
+class BenchmarkResult:
+    """Scores and diagnostics for one evaluated benchmark task."""
+
+    task_id: str
+    category: str
+    prompt: str
+    expected_tools: List[str]
+    actual_tools: List[str]
+    completion_score: float
+    parameter_score: float
+    efficiency_score: float
+    error_handling_score: float
+    total_score: float
+    # Human-readable reasons the trace fell short; any entry fails the task.
+    errors: List[str] = field(default_factory=list)
+    # Checklist name -> items a reviewer still has to confirm by hand.
+    review_checklist: Mapping[str, List[str]] = field(default_factory=dict)
+
+    @property
+    def passed(self) -> bool:
+        """True when there are no errors and the total score is at least 0.85."""
+        if self.errors:
+            return False
+        return self.total_score >= 0.85
+
+    @property
+    def manual_review_required(self) -> bool:
+        """True when at least one review checklist has an item in it."""
+        for checks in self.review_checklist.values():
+            if checks:
+                return True
+        return False
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Return a plain-dict view of the result; empty checklists are omitted."""
+        scores = {
+            "completion": self.completion_score,
+            "parameters": self.parameter_score,
+            "efficiency": self.efficiency_score,
+            "error_handling": self.error_handling_score,
+            "total": self.total_score,
+        }
+        populated_checklists: Dict[str, List[str]] = {}
+        for name, checks in self.review_checklist.items():
+            if checks:
+                populated_checklists[name] = list(checks)
+
+        return {
+            "task_id": self.task_id,
+            "category": self.category,
+            "prompt": self.prompt,
+            "expected_tools": self.expected_tools,
+            "actual_tools": self.actual_tools,
+            "scores": scores,
+            "passed": self.passed,
+            "errors": self.errors,
+            "manual_review_required": self.manual_review_required,
+            "review_checklist": populated_checklists,
+        }
+
+
+@dataclass(frozen=True)
+class BenchmarkReport:
+    """Summary of one benchmark run plus its per-task results."""
+
+    timestamp: str
+    num_tasks: int
+    num_passed: int
+    average_score: float
+    # Category name -> mean total score of the tasks in that category.
+    category_scores: Mapping[str, float]
+    results: List[BenchmarkResult]
+    metadata: Mapping[str, Any] = field(default_factory=dict)
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Return a plain-dict view of the report.
+
+        ``pass_rate`` is ``0.0`` for an empty run so the division is never
+        attempted with a zero task count.
+        """
+        if self.num_tasks:
+            pass_rate = self.num_passed / self.num_tasks
+        else:
+            pass_rate = 0.0
+
+        needs_review = 0
+        for result in self.results:
+            if result.manual_review_required:
+                needs_review += 1
+
+        summary = {
+            "num_tasks": self.num_tasks,
+            "num_passed": self.num_passed,
+            "pass_rate": pass_rate,
+            "average_score": self.average_score,
+            "category_scores": dict(self.category_scores),
+            "manual_review_tasks": needs_review,
+        }
+        return {
+            "timestamp": self.timestamp,
+            "summary": summary,
+            "metadata": dict(self.metadata),
+            "results": [result.to_dict() for result in self.results],
+        }
+
+
+def load_tasks(path: Path = DEFAULT_TASKS_PATH, tags: Optional[Iterable[str]] = None) -> List[BenchmarkTask]:
+    """Load benchmark tasks from a JSON task suite.
+
+    Args:
+        path: JSON file whose top-level ``tasks`` key lists task entries.
+        tags: When given, keep only tasks sharing at least one tag with it.
+            A single string is treated as one tag. ``None`` disables
+            filtering.
+
+    Returns:
+        The (optionally filtered) tasks in file order.
+    """
+    data = json.loads(Path(path).read_text(encoding="utf-8"))
+    tasks = [BenchmarkTask.from_dict(item) for item in data.get("tasks", [])]
+    if tags is None:
+        return tasks
+
+    # A bare string is one tag; set("nav") would split it into characters and
+    # silently filter out every task.
+    wanted = {tags} if isinstance(tags, str) else set(tags)
+    return [task for task in tasks if wanted.intersection(task.tags)]
+
+
+def _tool_name(call: Mapping[str, Any]) -> Optional[str]:
+    """Return the tool name of a recorded call.
+
+    The ``name``, ``tool`` and ``tool_name`` keys are tried in that order and
+    the first truthy value wins.
+    """
+    value = None
+    for key in ("name", "tool", "tool_name"):
+        value = call.get(key)
+        if value:
+            return value
+    # Nothing truthy was found: hand back whatever the last key held.
+    return value
+
+
+def _tool_input(call: Mapping[str, Any]) -> Mapping[str, Any]:
+    """Return the argument mapping of one recorded tool call.
+
+    The payload is taken from the first of ``input``, ``params`` or
+    ``arguments`` that is not ``None``. A string payload is decoded as JSON,
+    since traces may store arguments as a JSON-encoded string. Anything that
+    does not end up as a mapping yields an empty dict, so parameter checks
+    report a mismatch instead of raising on ``.get``.
+    """
+    payload = None
+    for key in ("input", "params", "arguments"):
+        payload = call.get(key)
+        if payload is not None:
+            break
+
+    if isinstance(payload, str):
+        try:
+            payload = json.loads(payload)
+        except ValueError:
+            # Not JSON, so there are no named parameters to compare against.
+            return {}
+    return payload if isinstance(payload, Mapping) else {}
+
+
+def _ordered_match_score(expected: Sequence[str], actual: Sequence[str]) -> float:
+    """Return the fraction of ``expected`` names found in order within ``actual``.
+
+    Each expected name is searched for from just after the previous match. A
+    name that is not found is skipped without moving the search position. An
+    empty ``expected`` scores ``1.0``.
+    """
+    if not expected:
+        return 1.0
+
+    search_from = 0
+    hits = 0
+    for wanted in expected:
+        try:
+            found_at = actual.index(wanted, search_from)
+        except ValueError:
+            continue
+        hits += 1
+        search_from = found_at + 1
+    return hits / len(expected)
+
+
+def _review_checklist(task: BenchmarkTask) -> Dict[str, List[str]]:
+    """Copy the task's manual-review lists into a checklist keyed by topic."""
+    # (checklist key, task attribute) pairs, in output order.
+    sources = (
+        ("safety_constraints", "safety_constraints"),
+        ("scientific_validity", "scientific_validity"),
+        ("trace_quality", "trace_quality_checks"),
+        ("operator_experience", "operator_experience_checks"),
+        ("expected_evidence", "expected_evidence"),
+    )
+    return {label: list(getattr(task, attribute)) for label, attribute in sources}
+
+
+class AgentWorkflowBenchmarkEvaluator:
+    """Score Gently agent tool traces against workflow benchmark tasks.
+
+    Each task is graded on four parts (ordered tool completion, exact
+    parameter matches, tool-call efficiency and error handling) that are
+    combined with fixed weights of 0.45 / 0.25 / 0.15 / 0.15.
+    """
+
+    def __init__(self, tasks: Optional[Sequence[BenchmarkTask]] = None):
+        """Use ``tasks`` when given, otherwise load the default task suite."""
+        self.tasks = list(tasks) if tasks is not None else load_tasks()
+
+    def evaluate_task(
+        self,
+        task: BenchmarkTask,
+        tool_calls: Sequence[Mapping[str, Any]],
+        *,
+        error: Optional[str] = None,
+    ) -> BenchmarkResult:
+        """Score one trace against one task.
+
+        Args:
+            task: The task definition to grade against.
+            tool_calls: Recorded calls in execution order; calls without a
+                recognisable tool name are ignored.
+            error: Run-level error message, if the run failed.
+
+        Returns:
+            A result with the four component scores, the weighted total and
+            every reason the trace fell short.
+        """
+        actual_tools = [name for name in (_tool_name(call) for call in tool_calls) if name]
+        errors: List[str] = []
+
+        completion_score = _ordered_match_score(task.expected_tools, actual_tools)
+        if completion_score < 1.0:
+            missing = [name for name in task.expected_tools if name not in actual_tools]
+            errors.extend(f"missing expected tool: {name}" for name in missing)
+            if not missing:
+                # Every tool was called, but not in the expected sequence (or
+                # not often enough). Without an error entry such a trace can
+                # reach the pass threshold and be reported as passed.
+                errors.append(
+                    f"expected tools not called in order: expected "
+                    f"{list(task.expected_tools)!r}, got {actual_tools!r}"
+                )
+
+        parameter_score = self._parameter_score(task, tool_calls, errors)
+        efficiency_score = self._efficiency_score(task, actual_tools)
+        error_handling_score = self._error_handling_score(task, actual_tools, error)
+        if error:
+            errors.append(error)
+
+        total_score = (
+            0.45 * completion_score
+            + 0.25 * parameter_score
+            + 0.15 * efficiency_score
+            + 0.15 * error_handling_score
+        )
+
+        return BenchmarkResult(
+            task_id=task.id,
+            category=task.category,
+            prompt=task.prompt,
+            expected_tools=task.expected_tools,
+            actual_tools=actual_tools,
+            completion_score=round(completion_score, 4),
+            parameter_score=round(parameter_score, 4),
+            efficiency_score=round(efficiency_score, 4),
+            error_handling_score=round(error_handling_score, 4),
+            total_score=round(total_score, 4),
+            errors=errors,
+            review_checklist=_review_checklist(task),
+        )
+
+    def evaluate_traces(
+        self,
+        traces_by_task_id: Mapping[str, Sequence[Mapping[str, Any]]],
+        *,
+        errors_by_task_id: Optional[Mapping[str, str]] = None,
+    ) -> BenchmarkReport:
+        """Score every configured task and aggregate the results.
+
+        A task with no entry in ``traces_by_task_id`` is scored against an
+        empty trace; trace ids that match no configured task are ignored.
+        """
+        errors_by_task_id = errors_by_task_id or {}
+        results = [
+            self.evaluate_task(
+                task,
+                traces_by_task_id.get(task.id, []),
+                error=errors_by_task_id.get(task.id),
+            )
+            for task in self.tasks
+        ]
+        return self._report(results)
+
+    def _parameter_score(
+        self,
+        task: BenchmarkTask,
+        tool_calls: Sequence[Mapping[str, Any]],
+        errors: List[str],
+    ) -> float:
+        """Return the fraction of expected parameters that match exactly.
+
+        Only the first call of each expected tool is inspected. Mismatches
+        and uncalled tools are appended to ``errors`` (mutated in place). A
+        task without expected parameters scores ``1.0``.
+        """
+        if not task.expected_params:
+            return 1.0
+
+        checks = 0
+        passed = 0
+        calls_by_name: Dict[str, List[Mapping[str, Any]]] = {}
+        for call in tool_calls:
+            name = _tool_name(call)
+            if name:
+                calls_by_name.setdefault(name, []).append(call)
+
+        for tool_name, expected_params in task.expected_params.items():
+            calls = calls_by_name.get(tool_name) or []
+            if not calls:
+                # Every parameter of an uncalled tool counts as a failed check.
+                checks += len(expected_params)
+                errors.append(f"missing params because tool was not called: {tool_name}")
+                continue
+            actual_params = _tool_input(calls[0])
+            for key, expected_value in expected_params.items():
+                checks += 1
+                if actual_params.get(key) == expected_value:
+                    passed += 1
+                else:
+                    errors.append(
+                        f"{tool_name}.{key}: expected {expected_value!r}, "
+                        f"got {actual_params.get(key)!r}"
+                    )
+
+        return passed / checks if checks else 1.0
+
+    def _efficiency_score(self, task: BenchmarkTask, actual_tools: Sequence[str]) -> float:
+        """Return ``1.0`` within the tool-call budget, ``budget / calls`` above it.
+
+        The budget is ``task.max_tool_calls`` when set, otherwise the number
+        of expected tools (at least one). An empty trace scores ``1.0`` only
+        when no tools were expected.
+
+        Using one budget for both cases keeps the score non-increasing in the
+        number of calls. Scoring within-budget traces against the expected
+        tool count instead made an over-budget trace outscore a shorter one.
+        """
+        if not actual_tools:
+            return 1.0 if not task.expected_tools else 0.0
+
+        budget = task.max_tool_calls
+        if budget is None:
+            budget = max(len(task.expected_tools), 1)
+        # Clamp so a nonsensical negative budget cannot yield a negative score.
+        return max(0.0, min(1.0, budget / len(actual_tools)))
+
+    def _error_handling_score(
+        self,
+        task: BenchmarkTask,
+        actual_tools: Sequence[str],
+        error: Optional[str],
+    ) -> float:
+        """Return the error-handling score for the trace.
+
+        Any run-level ``error`` scores ``0.0``. Otherwise the score is
+        ``1.0``, unless the task declares a failure scenario with recovery
+        tools, in which case it is the ordered match of those tools.
+        """
+        if not task.failure_scenario:
+            return 0.0 if error else 1.0
+        if error:
+            return 0.0
+        if not task.expected_recovery_tools:
+            return 1.0
+        return _ordered_match_score(task.expected_recovery_tools, actual_tools)
+
+    def _report(self, results: Sequence[BenchmarkResult]) -> BenchmarkReport:
+        """Aggregate per-task results into a report with per-category means."""
+        category_totals: Dict[str, List[float]] = {}
+        for result in results:
+            category_totals.setdefault(result.category, []).append(result.total_score)
+
+        category_scores = {
+            category: round(sum(scores) / len(scores), 4)
+            for category, scores in category_totals.items()
+        }
+        average = sum(result.total_score for result in results) / len(results) if results else 0.0
+        return BenchmarkReport(
+            timestamp=datetime.now().isoformat(),
+            num_tasks=len(results),
+            num_passed=sum(1 for result in results if result.passed),
+            average_score=round(average, 4),
+            category_scores=category_scores,
+            results=list(results),
+            metadata={"task_count": len(self.tasks)},
+        )
+
+
+# Backward-compatible alias for older callers while the benchmark terminology
+# moves away from "copilot".
+CopilotBenchmarkEvaluator = AgentWorkflowBenchmarkEvaluator
diff --git a/benchmarks/mock_client.py b/benchmarks/mock_client.py
new file mode 100644
index 00000000..6109c0ac
--- /dev/null
+++ b/benchmarks/mock_client.py
@@ -0,0 +1,103 @@
+"""Mock hardware client for agent workflow benchmark runs."""
+
+from __future__ import annotations
+
+from collections import defaultdict, deque
+from typing import Any, Deque, Dict, List, Mapping, Optional, Tuple
+
+
+class MockQueueServerClient:
+    """Scriptable fake for benchmark scenarios.
+
+    The class mirrors the async shape of the diSPIM queue/server client methods
+    used by tools. It records calls and lets benchmark tasks configure success
+    responses or failure scenarios without touching physical hardware.
+    """
+
+    def __init__(
+        self,
+        *,
+        stage_position: Tuple[float, float] = (0.0, 0.0),
+        has_sam: bool = True,
+    ):
+        """Start with the given stage position and no scripted behaviour."""
+        self.stage_position = stage_position
+        self.has_sam = has_sam
+        # Every call made through the async methods, in order.
+        self.calls: List[Dict[str, Any]] = []
+        # Method name -> queue of one-shot responses, consumed front first.
+        self._responses: Dict[str, Deque[Any]] = defaultdict(deque)
+        # Method name -> exception raised on every call until cleared.
+        self._failures: Dict[str, Exception] = {}
+
+    def script_response(self, method: str, *responses: Any) -> None:
+        """Queue one-shot responses for ``method``.
+
+        An ``Exception`` instance is raised when its turn comes, a callable is
+        invoked and its result returned, anything else is returned as-is.
+        """
+        self._responses[method].extend(responses)
+
+    def fail(self, method: str, error: Exception) -> None:
+        """Make every call to ``method`` raise ``error`` until cleared."""
+        self._failures[method] = error
+
+    def clear_failure(self, method: str) -> None:
+        """Remove a failure set with :meth:`fail`; a no-op when none is set."""
+        self._failures.pop(method, None)
+
+    def reset_calls(self) -> None:
+        """Forget all recorded calls."""
+        self.calls.clear()
+
+    def recorded_calls(self, method: Optional[str] = None) -> List[Dict[str, Any]]:
+        """Return recorded calls, optionally restricted to one method name."""
+        if method is None:
+            return list(self.calls)
+        return [call for call in self.calls if call["method"] == method]
+
+    def _record(self, method: str, **payload: Any) -> None:
+        """Append one call record made of the method name and its arguments."""
+        self.calls.append({"method": method, **payload})
+
+    def _response(self, method: str, default: Any) -> Any:
+        """Resolve the outcome of a call to ``method``.
+
+        A failure set with :meth:`fail` takes precedence, then the next
+        scripted response, then ``default``.
+        """
+        if method in self._failures:
+            raise self._failures[method]
+        if self._responses[method]:
+            response = self._responses[method].popleft()
+            if isinstance(response, Exception):
+                raise response
+            if callable(response):
+                return response()
+            return response
+        return default
+
+    async def get_stage_position(self) -> Tuple[float, float]:
+        """Return the scripted response, or the current stage position."""
+        self._record("get_stage_position")
+        return self._response("get_stage_position", self.stage_position)
+
+    async def move_to_position(self, x: float, y: float) -> Mapping[str, Any]:
+        """Record a move request and, when it succeeds, update the stage position.
+
+        The position is left untouched when the call raises or when the
+        response is a mapping whose ``success`` entry is falsy, so a later
+        ``get_stage_position`` does not report a target that was never
+        reached.
+        """
+        self._record("move_to_position", x=x, y=y)
+        target = (float(x), float(y))
+        # Resolve the outcome first: this raises for scripted failures.
+        response = self._response(
+            "move_to_position",
+            {"success": True, "x": target[0], "y": target[1]},
+        )
+        rejected = isinstance(response, Mapping) and not response.get("success", True)
+        if not rejected:
+            self.stage_position = target
+        return response
+
+    async def detect_embryos(self, **kwargs: Any) -> Mapping[str, Any]:
+        """Return the scripted response, or a success with no embryos."""
+        self._record("detect_embryos", **kwargs)
+        return self._response("detect_embryos", {"success": True, "embryos": []})
+
+    async def capture_bottom_image(self, **kwargs: Any) -> Mapping[str, Any]:
+        """Return the scripted response, or a placeholder image result."""
+        self._record("capture_bottom_image", **kwargs)
+        return self._response(
+            "capture_bottom_image",
+            {"success": True, "image": [[0]], "stage_position": self.stage_position},
+        )
+
+    async def capture_for_marking(self, **kwargs: Any) -> Mapping[str, Any]:
+        """Return the scripted response, or a placeholder image result."""
+        self._record("capture_for_marking", **kwargs)
+        return self._response(
+            "capture_for_marking",
+            {"success": True, "image": [[0]], "stage_position": self.stage_position},
+        )
+
+    async def acquire_volume(self, **kwargs: Any) -> Mapping[str, Any]:
+        """Return the scripted response, or an empty volume echoing ``kwargs``."""
+        self._record("acquire_volume", **kwargs)
+        return self._response(
+            "acquire_volume",
+            {"success": True, "volume": None, "shape": (0,), **kwargs},
+        )
+
+    async def capture_lightsheet_image(self, **kwargs: Any) -> Mapping[str, Any]:
+        """Return the scripted response, or a placeholder image echoing ``kwargs``."""
+        self._record("capture_lightsheet_image", **kwargs)
+        return self._response(
+            "capture_lightsheet_image",
+            {"success": True, "image": [[0]], "shape": (1, 1), **kwargs},
+        )
diff --git a/benchmarks/runner.py b/benchmarks/runner.py
index 4d6d847a..d2b61dbc 100644
--- a/benchmarks/runner.py
+++ b/benchmarks/runner.py
@@ -64,6 +64,63 @@ async def run_agent_benchmark(args):
     return 0
 
 
+def run_workflow_benchmark(args):
+    """Score agent workflow traces against the standard task suite.
+
+    Without ``args.trace`` the selected tasks are listed. With it, the trace
+    file is scored, a summary is logged and, when ``args.output`` is set, the
+    full report is written as JSON.
+
+    Returns:
+        ``0`` on success, ``1`` when the trace file does not hold a JSON
+        object mapping task ids to tool-call traces.
+    """
+    from .evaluator import AgentWorkflowBenchmarkEvaluator, load_tasks
+
+    # Tolerate "a, b" and stray commas: an unstripped " b" never matches a tag.
+    tags = None
+    if args.tags:
+        tags = [tag.strip() for tag in args.tags.split(",") if tag.strip()] or None
+    tasks = load_tasks(tags=tags)
+    evaluator = AgentWorkflowBenchmarkEvaluator(tasks=tasks)
+
+    if not args.trace:
+        logger.info("=" * 60)
+        logger.info("AGENT WORKFLOW BENCHMARK TASKS")
+        logger.info("=" * 60)
+        for task in tasks:
+            logger.info("[%s] %s", task.id, task.prompt)
+            logger.info("    category=%s expected=%s", task.category, task.expected_tools)
+            checklist_items = sum(
+                len(items)
+                for items in [
+                    task.safety_constraints,
+                    task.scientific_validity,
+                    task.trace_quality_checks,
+                    task.operator_experience_checks,
+                    task.expected_evidence,
+                ]
+            )
+            if checklist_items:
+                logger.info("    manual review checks=%d", checklist_items)
+        logger.info("")
+        logger.info("Pass --trace path/to/traces.json to score a run.")
+        return 0
+
+    with open(args.trace, encoding="utf-8") as f:
+        trace_data = json.load(f)
+    # Accept either {"traces": {...}} or the bare {task_id: [...]} mapping.
+    traces = trace_data.get("traces", trace_data) if isinstance(trace_data, dict) else None
+    if not isinstance(traces, dict):
+        logger.error(
+            "Trace file %s must contain a JSON object mapping task ids to tool-call traces",
+            args.trace,
+        )
+        return 1
+
+    # A mistyped task id would otherwise be dropped and its task scored as empty.
+    known_ids = {task.id for task in tasks}
+    unknown_ids = sorted(str(task_id) for task_id in traces if task_id not in known_ids)
+    if unknown_ids:
+        logger.warning("Ignoring traces for unknown task ids: %s", ", ".join(unknown_ids))
+
+    report = evaluator.evaluate_traces(traces)
+    payload = report.to_dict()
+
+    logger.info("=" * 60)
+    logger.info("AGENT WORKFLOW BENCHMARK")
+    logger.info("=" * 60)
+    logger.info("Tasks: %d", report.num_tasks)
+    logger.info("Pass rate: %.1f%%", payload["summary"]["pass_rate"] * 100)
+    logger.info("Average score: %.1f%%", report.average_score * 100)
+    logger.info("Manual review tasks: %d", payload["summary"]["manual_review_tasks"])
+    for category, score in report.category_scores.items():
+        logger.info("  %s: %.1f%%", category, score * 100)
+
+    if args.output:
+        output = Path(args.output)
+        output.parent.mkdir(parents=True, exist_ok=True)
+        output.write_text(json.dumps(payload, indent=2), encoding="utf-8")
+        logger.info("Wrote report: %s", output)
+
+    return 0
+
+
 def compare_reports(args):
     """Compare two benchmark reports"""
     from .agent.evaluator import BenchmarkReport, compare_reports as _compare
@@ -127,6 +184,23 @@ def main():
     agent_parser.add_argument("--run", action="store_true", help="Actually run (vs dry-run)")
     agent_parser.add_argument("--output", help="Output file for results")
 
+    # Agent workflow benchmark
+    workflow_parser = subparsers.add_parser(
+        "workflow",
+        help="List or score deterministic agent workflow benchmarks",
+    )
+    workflow_parser.add_argument("--tags", help="Comma-separated tags to filter")
+    workflow_parser.add_argument("--trace", help="JSON mapping task ids to tool-call traces")
+    workflow_parser.add_argument("--output", help="Output file for scored report")
+
+    legacy_parser = subparsers.add_parser(
+        "copilot",
+        help=argparse.SUPPRESS,
+    )
+    legacy_parser.add_argument("--tags", help=argparse.SUPPRESS)
+    legacy_parser.add_argument("--trace", help=argparse.SUPPRESS)
+    legacy_parser.add_argument("--output", help=argparse.SUPPRESS)
+
     # Compare reports
     compare_parser = subparsers.add_parser("compare", help="Compare two reports")
     compare_parser.add_argument("before", help="Before report JSON")
@@ -136,6 +210,8 @@ def main():
 
     if args.command == "agent":
         return asyncio.run(run_agent_benchmark(args))
+    elif args.command in {"workflow", "copilot"}:
+        return run_workflow_benchmark(args)
     elif args.command == "compare":
         return compare_reports(args)
     else:
diff --git a/benchmarks/tasks/agent_workflows.json b/benchmarks/tasks/agent_workflows.json
new file mode 100644
index 00000000..5de4ffad
--- /dev/null
+++ b/benchmarks/tasks/agent_workflows.json
@@ -0,0 +1,229 @@
+{
+  "version": "0.1",
+  "description": "Seed task suite for deterministic Gently agent workflow evaluation.",
+  "tasks": [
+    {
+      "id": "navigation_move_to_embryo",
+      "category": "navigation",
+      "prompt": "Move to embryo 2.",
+      "expected_tools": ["move_to_embryo"],
+      "expected_params": {
+        "move_to_embryo": {"embryo_id": "embryo_2"}
+      },
+      "safety_constraints": [
+        "Do not command motion outside calibrated stage bounds.",
+        "Verify the requested embryo id resolves before moving."
+      ],
+      "scientific_validity": [
+        "The move target is tied to the active sample map."
+      ],
+      "trace_quality_checks": [
+        "Trace records the requested embryo id and final stage target."
+      ],
+      "operator_experience_checks": [
+        "Operator can confirm which embryo was selected."
+      ],
+      "expected_evidence": [
+        "Stage move result or final position is present."
+      ],
+      "max_tool_calls": 2,
+      "tags": ["navigation", "embryo"]
+    },
+    {
+      "id": "navigation_center_brightest",
+      "category": "navigation",
+      "prompt": "Find and center the brightest embryo.",
+      "expected_tools": ["detect_embryos", "move_to_embryo"],
+      "safety_constraints": [
+        "Do not move until detection returns a candidate within calibrated bounds."
+      ],
+      "scientific_validity": [
+        "Brightness selection is based on current image evidence, not stale state."
+      ],
+      "trace_quality_checks": [
+        "Trace records detected candidates and the selection reason."
+      ],
+      "operator_experience_checks": [
+        "Operator can see why the brightest embryo was chosen."
+      ],
+      "expected_evidence": [
+        "Detection result and selected embryo id are present."
+      ],
+      "max_tool_calls": 4,
+      "tags": ["navigation", "detection"]
+    },
+    {
+      "id": "acquisition_volume_single_embryo",
+      "category": "acquisition",
+      "prompt": "Acquire a volume of embryo 1.",
+      "expected_tools": ["acquire_volume"],
+      "expected_params": {
+        "acquire_volume": {"embryo_id": "embryo_1"}
+      },
+      "safety_constraints": [
+        "Respect configured illumination and motion limits during acquisition."
+      ],
+      "scientific_validity": [
+        "Acquisition is associated with the requested embryo and imaging objective."
+      ],
+      "trace_quality_checks": [
+        "Trace records volume settings and artifact destination."
+      ],
+      "operator_experience_checks": [
+        "Operator can find the resulting volume without reading internal logs."
+      ],
+      "expected_evidence": [
+        "Volume artifact metadata is present."
+      ],
+      "max_tool_calls": 2,
+      "tags": ["acquisition", "volume"]
+    },
+    {
+      "id": "acquisition_start_one_hour_timelapse",
+      "category": "acquisition",
+      "prompt": "Start a timelapse for one hour.",
+      "expected_tools": ["start_adaptive_timelapse"],
+      "expected_params": {
+        "start_adaptive_timelapse": {"duration_minutes": 60}
+      },
+      "safety_constraints": [
+        "Timelapse duration and illumination remain within configured limits."
+      ],
+      "scientific_validity": [
+        "The plan preserves timepoint cadence and sample identity."
+      ],
+      "trace_quality_checks": [
+        "Trace records duration, cadence, and adaptive decision points."
+      ],
+      "operator_experience_checks": [
+        "Operator can inspect the active timelapse state."
+      ],
+      "expected_evidence": [
+        "Timelapse session metadata is present."
+      ],
+      "max_tool_calls": 3,
+      "tags": ["acquisition", "timelapse"]
+    },
+    {
+      "id": "analysis_find_hatching_embryo",
+      "category": "analysis",
+      "prompt": "Find the hatching embryo.",
+      "expected_tools": ["query_embryo_status"],
+      "safety_constraints": [
+        "Analysis does not alter microscope state."
+      ],
+      "scientific_validity": [
+        "Hatching status is derived from current embryo annotations or observations."
+      ],
+      "trace_quality_checks": [
+        "Trace records the queried status field and matching embryo."
+      ],
+      "operator_experience_checks": [
+        "Operator can see the reason the embryo was classified as hatching."
+      ],
+      "expected_evidence": [
+        "Embryo status result is present."
+      ],
+      "max_tool_calls": 3,
+      "tags": ["analysis", "hatching"]
+    },
+    {
+      "id": "analysis_measure_embryo_sizes",
+      "category": "analysis",
+      "prompt": "Measure embryo sizes.",
+      "expected_tools": ["detect_embryos"],
+      "safety_constraints": [
+        "Measurement pass does not move hardware unless explicitly requested."
+      ],
+      "scientific_validity": [
+        "Size estimates come from the current image or declared image source."
+      ],
+      "trace_quality_checks": [
+        "Trace records detected embryos and size units."
+      ],
+      "operator_experience_checks": [
+        "Operator can identify which embryos were measured."
+      ],
+      "expected_evidence": [
+        "Per-embryo size measurements are present."
+      ],
+      "max_tool_calls": 3,
+      "tags": ["analysis", "measurement"]
+    },
+    {
+      "id": "multi_step_calibrate_all_then_timelapse",
+      "category": "multi_step",
+      "prompt": "Calibrate all embryos and start a timelapse.",
+      "expected_tools": ["detect_embryos", "calibrate_all_embryos", "start_adaptive_timelapse"],
+      "safety_constraints": [
+        "Calibration moves remain within stage limits.",
+        "Timelapse starts only after calibration completes or is explicitly skipped."
+      ],
+      "scientific_validity": [
+        "Every embryo used in the timelapse has a calibration record.",
+        "The timelapse plan preserves the requested sample set."
+      ],
+      "trace_quality_checks": [
+        "Trace records detection, calibration outcomes, and timelapse start."
+      ],
+      "operator_experience_checks": [
+        "Operator can see which embryos are included and which failed calibration."
+      ],
+      "expected_evidence": [
+        "Calibration records and timelapse session metadata are present."
+      ],
+      "max_tool_calls": 6,
+      "tags": ["multi_step", "calibration", "timelapse"]
+    },
+    {
+      "id": "error_recovery_stage_limit",
+      "category": "error_recovery",
+      "prompt": "Recover from a stage limit error while moving to embryo 3.",
+      "expected_tools": ["get_stage_position", "move_to_embryo"],
+      "expected_recovery_tools": ["get_stage_position"],
+      "failure_scenario": "stage_limit",
+      "safety_constraints": [
+        "After a stage-limit error, do not retry the same unsafe target blindly."
+      ],
+      "scientific_validity": [
+        "Recovery preserves the requested embryo target or reports it cannot be reached."
+      ],
+      "trace_quality_checks": [
+        "Trace records the error, current position, and recovery decision."
+      ],
+      "operator_experience_checks": [
+        "Operator can tell whether motion was recovered or stopped."
+      ],
+      "expected_evidence": [
+        "Stage position query after the failure is present."
+      ],
+      "max_tool_calls": 5,
+      "tags": ["error_recovery", "navigation"]
+    },
+    {
+      "id": "error_recovery_failed_detection",
+      "category": "error_recovery",
+      "prompt": "Recover from a failed embryo detection.",
+      "expected_tools": ["view_image", "detect_embryos"],
+      "expected_recovery_tools": ["view_image"],
+      "failure_scenario": "failed_detection",
+      "safety_constraints": [
+        "Failed detection recovery does not start acquisition on an unknown target."
+      ],
+      "scientific_validity": [
+        "Recovery obtains fresh image evidence before retrying detection."
+      ],
+      "trace_quality_checks": [
+        "Trace records the failed detection and image used for retry."
+      ],
+      "operator_experience_checks": [
+        "Operator can see why detection was retried."
+      ],
+      "expected_evidence": [
+        "Image view result and retry detection result are present."
+      ],
+      "max_tool_calls": 5,
+      "tags": ["error_recovery", "detection"]
+    }
+  ]
+}
diff --git a/docs/agent-workflow-benchmarks.md b/docs/agent-workflow-benchmarks.md
new file mode 100644
index 00000000..ca71e5ef
--- /dev/null
+++ b/docs/agent-workflow-benchmarks.md
@@ -0,0 +1,101 @@
+# Agent Workflow Benchmarks
+
+The benchmark concept comes before the runner: Gently should be measured on
+whether it can turn a scientist's intent into a safe, inspectable, and useful
+experimental trace. A scripted workflow that merely calls the expected tools is
+not enough if the trace is unsafe, scientifically thin, or impossible for a
+human operator to understand.
+
+## Measurement Contract
+
+Each benchmark task should state:
+
+- the scientific intent being tested
+- the microscope or bench context needed to satisfy that intent
+- the sample state assumptions and failure modes
+- the safety constraints that must never be violated
+- the expected operator-facing evidence at the end of the run
+- the allowed tool-call or latency budget
+
+Scores should cover these dimensions:
+
+- task completion: the requested experimental state was reached
+- scientific validity: controls, constraints, and decision points are present
+- hardware safety: unsafe motion, illumination, and device states are avoided
+- trace quality: a human can reconstruct what happened and why
+- efficiency: the agent avoided unnecessary tool calls and retries
+- robustness: missing data, failed tools, and stale state were handled
+- operator experience: the workflow avoided unnecessary clarifications
+- generalization: the same concept works across imaging, bench, genetics, and
+  analysis tasks
+
+## Seed Task Suite
+
+Task definitions live in `benchmarks/tasks/agent_workflows.json`. Each task
+declares:
+
+- `category`: navigation, acquisition, analysis, multi_step, or error_recovery.
+- `prompt`: the user request being evaluated.
+- `expected_tools`: the ordered tool sequence the Gently agent should choose.
+- `expected_params`: exact parameter checks for important tool calls.
+- `max_tool_calls`: an efficiency budget.
+- `failure_scenario` and `expected_recovery_tools` for recovery benchmarks.
+- `safety_constraints`: hardware or sample-safety requirements that must be
+  checked by a reviewer.
+- `scientific_validity`: checks for whether the run makes scientific sense.
+- `trace_quality_checks`: evidence needed to reconstruct what happened and why.
+- `operator_experience_checks`: checks that the operator can understand or act
+  on the result.
+- `expected_evidence`: artifacts or metadata that should exist after the run.
+
+## Scoring
+
+`benchmarks.evaluator.AgentWorkflowBenchmarkEvaluator` computes four initial
+component scores:
+
+- completion: expected tools were called in order.
+- parameters: expected tool parameters matched.
+- efficiency: tool-call count stayed within the task budget.
+- error handling: recovery tasks used expected recovery tools and completed.
+
+These are a first trace-based subset of the measurement contract above. They
+are useful for deterministic regressions, but should not be treated as a full
+quality benchmark until safety, scientific validity, trace quality, and
+operator experience have corresponding evaluators.
+
+Until those evaluators exist, each scored result also carries a
+`review_checklist` and a `manual_review_required` flag. A trace can pass the
+deterministic score while still requiring human review of the listed safety,
+scientific, trace-quality, operator-experience, and evidence checks.
+
+## Example
+
+```python
+from benchmarks.evaluator import AgentWorkflowBenchmarkEvaluator
+
+traces = {
+    "acquisition_volume_single_embryo": [
+        {"name": "acquire_volume", "input": {"embryo_id": "embryo_1"}},
+    ],
+}
+
+report = AgentWorkflowBenchmarkEvaluator().evaluate_traces(traces)
+print(report.to_dict()["summary"])
+```
+
+The evaluator is intentionally trace-based. A future runner can collect those
+traces from a dry-run Gently agent, replay harness, or live session transcript,
+then feed them through the same scoring code.
+
+## Mock Hardware
+
+Use `benchmarks.mock_client.MockQueueServerClient` when a benchmark needs
+deterministic device responses:
+
+```python
+client = MockQueueServerClient(stage_position=(10.0, 20.0))
+client.script_response("detect_embryos", {"success": True, "embryos": []})
+```
+
+The mock records all method calls so benchmark traces can be compared with the
+expected tool sequence.
diff --git a/tests/test_agent_workflow_benchmarks.py b/tests/test_agent_workflow_benchmarks.py
new file mode 100644
index 00000000..30e208e9
--- /dev/null
+++ b/tests/test_agent_workflow_benchmarks.py
@@ -0,0 +1,143 @@
+import pytest
+
+from benchmarks.evaluator import AgentWorkflowBenchmarkEvaluator, BenchmarkTask, load_tasks
+from benchmarks.mock_client import MockQueueServerClient
+
+
+def test_default_tasks_cover_required_categories():
+    tasks = load_tasks()
+    categories = {task.category for task in tasks}
+
+    assert {
+        "navigation",
+        "acquisition",
+        "analysis",
+        "multi_step",
+        "error_recovery",
+    }.issubset(categories)
+
+
+def test_evaluator_scores_expected_tool_sequence_and_params():
+    task = BenchmarkTask(
+        id="volume",
+        category="acquisition",
+        prompt="Acquire a volume of embryo 1.",
+        expected_tools=["acquire_volume"],
+        expected_params={"acquire_volume": {"embryo_id": "embryo_1"}},
+        max_tool_calls=2,
+    )
+    evaluator = AgentWorkflowBenchmarkEvaluator(tasks=[task])
+
+    result = evaluator.evaluate_task(
+        task,
+        [{"name": "acquire_volume", "input": {"embryo_id": "embryo_1"}}],
+    )
+
+    assert result.passed
+    assert result.total_score == pytest.approx(1.0)
+
+
+def test_evaluator_penalizes_missing_tools_and_extra_calls():
+    task = BenchmarkTask(
+        id="move",
+        category="navigation",
+        prompt="Move to embryo 2.",
+        expected_tools=["move_to_embryo"],
+        expected_params={"move_to_embryo": {"embryo_id": "embryo_2"}},
+        max_tool_calls=1,
+    )
+    evaluator = AgentWorkflowBenchmarkEvaluator(tasks=[task])
+
+    result = evaluator.evaluate_task(
+        task,
+        [
+            {"name": "get_stage_position", "input": {}},
+            {"name": "move_stage", "input": {"x": 100.0, "y": 200.0}},
+        ],
+    )
+
+    assert not result.passed
+    assert result.completion_score == 0.0
+    assert result.efficiency_score == 0.5
+    assert "missing expected tool: move_to_embryo" in result.errors
+
+
+def test_evaluator_reports_category_scores():
+    tasks = [
+        BenchmarkTask("ok", "navigation", "Move", ["move_stage"]),
+        BenchmarkTask("bad", "analysis", "Analyze", ["query_embryo_status"]),
+    ]
+    evaluator = AgentWorkflowBenchmarkEvaluator(tasks=tasks)
+
+    report = evaluator.evaluate_traces(
+        {
+            "ok": [{"name": "move_stage", "input": {}}],
+            "bad": [],
+        }
+    )
+
+    assert report.num_tasks == 2
+    assert report.category_scores["navigation"] == 1.0
+    assert report.category_scores["analysis"] < 1.0
+
+
+def test_evaluator_reports_manual_review_checklist():
+    task = BenchmarkTask(
+        id="safe_volume",
+        category="acquisition",
+        prompt="Acquire a safe volume.",
+        expected_tools=["acquire_volume"],
+        safety_constraints=["Respect the configured illumination limit."],
+        scientific_validity=["Record the embryo id and imaging objective."],
+        trace_quality_checks=["Trace includes the acquisition reason."],
+        operator_experience_checks=["Operator can see the final volume path."],
+        expected_evidence=["Volume artifact metadata is present."],
+    )
+    evaluator = AgentWorkflowBenchmarkEvaluator(tasks=[task])
+
+    report = evaluator.evaluate_traces(
+        {"safe_volume": [{"name": "acquire_volume", "input": {}}]}
+    )
+    result = report.results[0].to_dict()
+
+    assert result["manual_review_required"] is True
+    assert result["review_checklist"]["safety_constraints"] == [
+        "Respect the configured illumination limit."
+    ]
+    assert result["review_checklist"]["expected_evidence"] == [
+        "Volume artifact metadata is present."
+    ]
+    assert report.to_dict()["summary"]["manual_review_tasks"] == 1
+
+
+def test_default_tasks_include_review_rubric_fields():
+    task = next(task for task in load_tasks() if task.id == "multi_step_calibrate_all_then_timelapse")
+
+    assert task.safety_constraints
+    assert task.scientific_validity
+    assert task.expected_evidence
+
+
+@pytest.mark.asyncio
+async def test_mock_client_records_scripted_responses():
+    client = MockQueueServerClient(stage_position=(10.0, 20.0))
+    client.script_response("detect_embryos", {"success": True, "embryos": ["e1"]})
+
+    await client.move_to_position(100.0, 200.0)
+    result = await client.detect_embryos()
+
+    assert result["embryos"] == ["e1"]
+    assert client.recorded_calls("move_to_position") == [
+        {"method": "move_to_position", "x": 100.0, "y": 200.0}
+    ]
+
+
+@pytest.mark.asyncio
+async def test_mock_client_can_script_failures():
+    client = MockQueueServerClient()
+    client.fail("move_to_position", RuntimeError("stage limit"))
+
+    with pytest.raises(RuntimeError, match="stage limit"):
+        await client.move_to_position(999999.0, 0.0)
+
+    assert client.recorded_calls("move_to_position")[0]["x"] == 999999.0