gently-project · ceej640 · Jun 1, 2026 · Jun 1, 2026 · Jun 1, 2026 · Jun 1, 2026
diff --git a/benchmarks/runner.py b/benchmarks/runner.py
@@ -64,6 +64,35 @@ async def run_agent_benchmark(args):
     return 0
 
 
+async def run_structured_plan_benchmark(args):
+    """Run the offline structured-plan replay; return 0 on pass, 1 on failure."""
+    import tempfile
+
+    from .structured_plan_replay import run_structured_plan_replay
+
+    if args.workdir:
+        payload = await run_structured_plan_replay(Path(args.workdir))
+    else:  # no --workdir: replay in a throwaway directory that is removed on exit
+        with tempfile.TemporaryDirectory(prefix="gently-structured-plan-") as tmp:
+            payload = await run_structured_plan_replay(Path(tmp))
+
+    logger.info("=" * 60)
+    logger.info("STRUCTURED PLAN REPLAY")
+    logger.info("=" * 60)
+    logger.info("Passed: %s", payload["passed"])
+    logger.info("Elapsed: %.3f ms", payload["elapsed_ms"])
+    logger.info("Expected: %s", payload["expected_counts"])
+    logger.info("Actual:   %s", payload["actual_counts"])
+
+    if args.output:
+        output = Path(args.output)
+        output.parent.mkdir(parents=True, exist_ok=True)
+        output.write_text(json.dumps(payload, indent=2), encoding="utf-8")
+        logger.info("Wrote report: %s", output)
+
+    return 0 if payload["passed"] else 1
+
+
 def compare_reports(args):
     """Compare two benchmark reports"""
     from .agent.evaluator import BenchmarkReport, compare_reports as _compare
@@ -132,10 +161,20 @@ def main():
     compare_parser.add_argument("before", help="Before report JSON")
     compare_parser.add_argument("after", help="After report JSON")
 
+    # Structured plan replay
+    plan_parser = subparsers.add_parser(
+        "structured-plan",
+        help="Run the offline structured-plan replay benchmark",
+    )
+    plan_parser.add_argument("--workdir", help="Directory for the temporary ContextStore database")
+    plan_parser.add_argument("--output", help="Output file for the JSON report")
+
     args = parser.parse_args()
 
     if args.command == "agent":
         return asyncio.run(run_agent_benchmark(args))
+    elif args.command == "structured-plan":
+        return asyncio.run(run_structured_plan_benchmark(args))
     elif args.command == "compare":
         return compare_reports(args)
     else:

diff --git a/benchmarks/structured_plan_replay.py b/benchmarks/structured_plan_replay.py
@@ -0,0 +1,181 @@
+"""Offline replay benchmark for structured plan generation.
+
+This benchmark exercises the `create_structured_plan` tool against a fresh
+ContextStore. It is intentionally deterministic: no LLM, browser session, or
+microscope connection is required.
+"""
+
+from __future__ import annotations
+
+import argparse
+import asyncio
+import json
+import tempfile
+import time
+from pathlib import Path
+from types import SimpleNamespace
+from typing import Any, Dict, Mapping, Optional
+
+from gently.harness.memory.store import ContextStore
+from gently.harness.tools.registry import get_tool_registry
+
+# Import registers the plan-mode tools with the global registry.
+from gently.harness.plan_mode.tools import planning  # noqa: F401
+
+
+DEFAULT_PLAN_PAYLOAD: Dict[str, Any] = {  # used when the replay gets no payload
+    "description": "Safe F-drive focus finding",
+    "shorthand": "fdrive-focus-2026",
+    "target": "Locate embryos and decide whether a safe timelapse can start",
+    "phases": [
+        {"key": "setup", "description": "Setup and calibration"},
+        {"key": "run", "description": "Timelapse readiness"},
+    ],
+    "items": [
+        {
+            "key": "find",
+            "phase": "setup",
+            "type": "imaging",
+            "title": "Locate embryos in XY",
+            "spec": {
+                "sample_prep": "poly-lysine slide",
+                "num_embryos": 3,
+            },
+            "plan_context": {
+                "technical": "Bottom overview camera locates embryos in XY.",
+                "experimental": "Ryan and Brie verify positions before focus approach.",
+                "theoretical": "Embryos remain comparable after mounting.",
+                "conceptual": "Keep microscope support visible to the biologist.",
+                "constraints": ["Avoid overtravel toward glass"],
+            },
+        },
+        {
+            "key": "calibrate",
+            "phase": "setup",
+            "type": "imaging",
+            "title": "Calibrate galvo-piezo per embryo",
+            "depends_on": ["find"],  # the "key" of the first item above
+        },
+        {
+            "key": "decide",
+            "phase": "run",
+            "task_class": "decision_point",  # given via task_class, not "type"
+            "title": "Decide whether to start timelapse",
+            "depends_on": ["calibrate"],  # the "key" of the second item
+        },
+    ],
+}
+
+
+def _count_plan_records(store: ContextStore, campaign_id: str) -> Dict[str, int]:
+    """Tally the phases, plan items, and dependency links stored for a campaign.
+
+    Items are read from the root campaign and from each subcampaign the store
+    returns for it; "campaigns" is always 1 (the root that was passed in).
+    """
+    subcampaigns = store.get_subcampaigns(campaign_id)
+    item_total = link_total = 0
+    for owner_id in [campaign_id, *(sub.id for sub in subcampaigns)]:
+        for record in store.get_plan_items(campaign_id=owner_id):
+            item_total += 1
+            link_total += len(store.get_plan_item_dependencies(record.id))
+    return {
+        "campaigns": 1,
+        "phases": len(subcampaigns),
+        "items": item_total,
+        "dependencies": link_total,
+    }
+
+
+def _expected_dependency_count(items: list[Mapping[str, Any]]) -> int:
+    """Count the dependency links declared by the payload items."""
+    declared = (entry.get("depends_on") or [] for entry in items)
+    # A bare string is one link; a missing or empty ``depends_on`` adds nothing.
+    link_counts = (1 if isinstance(refs, str) else len(refs) for refs in declared)
+    return sum(link_counts)
+
+
+async def run_structured_plan_replay(
+    workdir: Path,
+    payload: Optional[Mapping[str, Any]] = None,
+) -> Dict[str, Any]:
+    """Run the deterministic structured-plan replay and return a JSON report.
+
+    Executes ``create_structured_plan`` once against the ContextStore database in
+    *workdir* and compares stored record counts with the payload's own counts.
+    """
+    import copy
+
+    workdir = Path(workdir)
+    workdir.mkdir(parents=True, exist_ok=True)
+    # Deep-copy and tally expectations up front so the tool cannot alter either.
+    plan_payload = copy.deepcopy(dict(payload or DEFAULT_PLAN_PAYLOAD))
+    expected_counts = {
+        "campaigns": 1,
+        "phases": len(plan_payload.get("phases") or []),
+        "items": len(plan_payload.get("items") or []),
+        "dependencies": _expected_dependency_count(plan_payload.get("items") or []),
+    }
+    # NOTE(review): a db left in a reused workdir is presumably reopened — confirm.
+    store = ContextStore(workdir / "structured_plan_replay.db")
+    started = time.perf_counter()
+    try:
+        result = await get_tool_registry().execute(
+            "create_structured_plan",
+            plan_payload,
+            {"agent": SimpleNamespace(context_store=store)},
+        )
+        elapsed_ms = round((time.perf_counter() - started) * 1000, 3)
+        campaign = store.resolve_campaign(str(plan_payload["shorthand"]))
+        counts = {"campaigns": 0, "phases": 0, "items": 0, "dependencies": 0}
+        if campaign is not None:
+            counts = _count_plan_records(store, campaign.id)
+        tool_failed = str(result).startswith("Error:")
+        passed = campaign is not None and not tool_failed and counts == expected_counts
+
+        return {
+            "benchmark": "structured_plan_replay",
+            "passed": passed,
+            "elapsed_ms": elapsed_ms,
+            "tool_calls": [
+                {
+                    "name": "create_structured_plan",
+                    "phases": expected_counts["phases"],
+                    "items": expected_counts["items"],
+                    "dependencies": expected_counts["dependencies"],
+                }
+            ],
+            "expected_counts": expected_counts,
+            "actual_counts": counts,
+            "campaign_id": campaign.id if campaign else None,
+            "tool_result_excerpt": str(result).splitlines()[:8],
+        }
+    finally:
+        store.close()
+
+
+async def _run_cli(args: argparse.Namespace) -> int:
+    """Run the replay for the CLI, print the JSON report, and return the exit code."""
+    if not args.workdir:
+        with tempfile.TemporaryDirectory(prefix="gently-structured-plan-") as tmpdir:
+            report = await run_structured_plan_replay(Path(tmpdir))
+    else:
+        report = await run_structured_plan_replay(Path(args.workdir))
+    rendered = json.dumps(report, indent=2)
+    if args.output:
+        destination = Path(args.output)
+        destination.parent.mkdir(parents=True, exist_ok=True)
+        destination.write_text(rendered, encoding="utf-8")
+    print(rendered)
+    return int(not report["passed"])
+
+
+def main() -> int:  # CLI entry point: parse flags, run the replay, return its exit code
+    parser = argparse.ArgumentParser(description="Run the structured plan replay benchmark")
+    parser.add_argument("--workdir", help="Directory for the temporary ContextStore database")
+    parser.add_argument("--output", help="Optional JSON report path")
+    return asyncio.run(_run_cli(parser.parse_args()))  # 0 if the replay passed, else 1
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())  # propagate the pass/fail status as the process exit code
diff --git a/docs/planning-context-hierarchy.md b/docs/planning-context-hierarchy.md
@@ -0,0 +1,120 @@
+# Planning Context Hierarchy
+
+This note maps the smart-microscopy context hierarchy onto Gently plan mode.
+It responds to the PR #23 discussion about making the planning layer the
+biologist's primary way to interact with a DiSPIM, an organism, and later other
+experimental modalities.
+
+## Source Framework
+
+Kesavan and Nordenfelt describe smart microscopy as a shift from passive data
+collection toward active scientific collaboration. Their framework highlights
+hierarchical context integration across four levels: technical, experimental,
+theoretical, and conceptual context.
+
+Reference:
+P. S. Kesavan and P. Nordenfelt, "From observation to understanding: A
+multi-agent framework for smart microscopy," Journal of Microscopy, 2026,
+doi: 10.1111/jmi.70063.
+
+Preprint:
+https://arxiv.org/abs/2505.20466
+
+## Gently Plan Item Mapping
+
+Every important plan item can now carry `plan_context` alongside its executable
+or measurable `spec`.
+
+`technical`
+: Instrument, sample, calibration, dataflow, and safety state. For the DiSPIM
+  embryo workflow this includes bottom-overview XY finding, F/head-axis
+  alignment, stage Z/head approach, `calibration_tools.py`, galvo-piezo
+  calibration, detector state, and timelapse settings.
+
+`experimental`
+: Operator workflow, sample prep, controls, and user constraints. For the
+  immediate Ryan/Brie workflow this means locate embryos, align the F/head axis,
+  confirm calibration per embryo coordinate, then decide on timelapse.
+
+`theoretical`
+: The biological model, developmental process, mechanism, or measurement
+  hypothesis that gives the image data meaning.
+
+`conceptual`
+: The higher-level scientific or human-instrument objective. This is the layer
+  where the biologist should be able to say what they are trying to understand
+  without first phrasing it as device commands.
+
+Additional fields:
+- `sample_entity`: the organism, embryo, tissue, region, or other entity being
+  acted on or observed.
+- `operator_context`: who is operating, calibrating, approving, or using the
+  plan and what they need from the system.
+- `constraints`: safety, timing, phototoxicity, calibration, or workflow
+  constraints that must stay visible during planning.
+- `success_question`: the question that should be answerable if the item works.
+
+## Example
+
+```json
+{
+  "technical": "Embryos found in XY with the bottom overview camera; F/head axis aligned; calibration confirmed before timelapse.",
+  "experimental": "Ryan or Brie prepares poly-lysine-mounted C. elegans embryos, checks focus approach, and approves the first timelapse.",
+  "theoretical": "Embryo developmental timing and morphology should remain interpretable across the planned imaging window.",
+  "conceptual": "Let the biologist plan at the embryo-development level while Gently keeps the DiSPIM operations explicit.",
+  "sample_entity": "C. elegans embryos on a poly-lysine-coated glass slide",
+  "operator_context": "Ryan/Brie immediate DiSPIM users; Gently may automate calibration steps only after safety assumptions are explicit.",
+  "constraints": [
+    "Confirm calibration for each embryo coordinate before timelapse",
+    "State F-drive/head-axis focus-finding assumptions before lowering toward sample focus",
+    "Avoid overtravel beyond embryo focus toward the glass slide"
+  ],
+  "success_question": "Can the plan acquire reliable embryo timelapse data without hiding calibration or focus-safety assumptions?"
+}
+```
+
+## Validation Behavior
+
+Plan validation now warns, without blocking execution, when imaging items do not
+carry the four context layers. It also warns when a DiSPIM embryo timelapse plan
+does not state calibration and F-drive/head-axis focus-safety assumptions in the
+technical context or constraints.
+
+The warnings are deliberate: they keep the existing planning system usable while
+making the missing planning structure visible for iteration.
+
+## Structured Generation
+
+Plan mode also provides `create_structured_plan` for creating the full
+campaign/phase/task hierarchy in one operation. It accepts:
+
+- root campaign description, shorthand, and target
+- phase objects with local keys
+- typed task objects using `type` or `task_class`
+- local dependency references between task keys
+- task `spec`, `references`, `estimated_days`, and `plan_context`
+
+This keeps campaign -> phase -> task construction explicit while reducing the
+tool-call chatter that occurs when the agent creates each campaign, item, and
+dependency separately. The lower-level tools remain available for incremental
+edits after the initial plan outline exists.
+
+## Offline Replay Benchmark
+
+The structured-plan path has a deterministic replay benchmark:
+
+```bash
+python -m benchmarks.runner structured-plan
+```
+
+or directly:
+
+```bash
+python -m benchmarks.structured_plan_replay
+```
+
+The replay creates a fresh local `ContextStore`, executes one
+`create_structured_plan` tool call, and verifies the expected campaign, two
+phases, three plan items, and two dependencies. It does not call an LLM, browser,
+or microscope server, so it can be used as a regression check for the planning
+tool shape even before a full agent benchmark exists.