Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 39 additions & 0 deletions benchmarks/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,35 @@ async def run_agent_benchmark(args):
return 0


async def run_structured_plan_benchmark(args):
    """Run the offline structured-plan replay benchmark.

    Reads ``args.workdir`` and ``args.output``. When a workdir is supplied the
    replay runs there; otherwise (or if that run yields ``None``) it runs in a
    throwaway temporary directory. A summary is logged, the report is
    optionally written as JSON to ``args.output``, and the function returns
    ``0`` when the report's ``passed`` flag is truthy, else ``1``.
    """
    from .structured_plan_replay import run_structured_plan_replay

    report = None
    if args.workdir:
        report = await run_structured_plan_replay(Path(args.workdir))
    if report is None:
        import tempfile

        with tempfile.TemporaryDirectory(prefix="gently-structured-plan-") as scratch:
            report = await run_structured_plan_replay(Path(scratch))

    # Banner followed by one log line per headline field of the report.
    rule = "=" * 60
    for banner_line in (rule, "STRUCTURED PLAN REPLAY", rule):
        logger.info(banner_line)
    for template, field in (
        ("Passed: %s", "passed"),
        ("Elapsed: %.3f ms", "elapsed_ms"),
        ("Expected: %s", "expected_counts"),
        ("Actual: %s", "actual_counts"),
    ):
        logger.info(template, report[field])

    destination = args.output
    if destination:
        report_path = Path(destination)
        report_path.parent.mkdir(parents=True, exist_ok=True)
        report_path.write_text(json.dumps(report, indent=2), encoding="utf-8")
        logger.info("Wrote report: %s", report_path)

    # Exit-status convention: 0 on pass, 1 on failure.
    return int(not report["passed"])


def compare_reports(args):
"""Compare two benchmark reports"""
from .agent.evaluator import BenchmarkReport, compare_reports as _compare
Expand Down Expand Up @@ -132,10 +161,20 @@ def main():
compare_parser.add_argument("before", help="Before report JSON")
compare_parser.add_argument("after", help="After report JSON")

# Structured plan replay
plan_parser = subparsers.add_parser(
"structured-plan",
help="Run the offline structured-plan replay benchmark",
)
plan_parser.add_argument("--workdir", help="Directory for the temporary ContextStore database")
plan_parser.add_argument("--output", help="Output file for the JSON report")

args = parser.parse_args()

if args.command == "agent":
return asyncio.run(run_agent_benchmark(args))
elif args.command == "structured-plan":
return asyncio.run(run_structured_plan_benchmark(args))
elif args.command == "compare":
return compare_reports(args)
else:
Expand Down
181 changes: 181 additions & 0 deletions benchmarks/structured_plan_replay.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,181 @@
"""Offline replay benchmark for structured plan generation.

This benchmark exercises the `create_structured_plan` tool against a fresh
ContextStore. It is intentionally deterministic: no LLM, browser session, or
microscope connection is required.
"""

from __future__ import annotations

import argparse
import asyncio
import copy
import json
import tempfile
import time
from pathlib import Path
from types import SimpleNamespace
from typing import Any, Dict, Mapping, Optional

from gently.harness.memory.store import ContextStore
from gently.harness.tools.registry import get_tool_registry

# Import registers the plan-mode tools with the global registry.
from gently.harness.plan_mode.tools import planning # noqa: F401


# Default arguments replayed through the `create_structured_plan` tool.
# The payload declares two phases ("setup", "run"), three items, and two
# dependency edges (calibrate -> find, decide -> calibrate); the replay derives
# its expected record counts from these lists.
DEFAULT_PLAN_PAYLOAD: Dict[str, Any] = {
    "description": "Safe F-drive focus finding",
    "shorthand": "fdrive-focus-2026",
    "target": "Locate embryos and decide whether a safe timelapse can start",
    "phases": [
        {"key": "setup", "description": "Setup and calibration"},
        {"key": "run", "description": "Timelapse readiness"},
    ],
    "items": [
        # Only this item carries a `spec` and a `plan_context`.
        {
            "key": "find",
            "phase": "setup",
            "type": "imaging",
            "title": "Locate embryos in XY",
            "spec": {
                "sample_prep": "poly-lysine slide",
                "num_embryos": 3,
            },
            "plan_context": {
                "technical": "Bottom overview camera locates embryos in XY.",
                "experimental": "Ryan and Brie verify positions before focus approach.",
                "theoretical": "Embryos remain comparable after mounting.",
                "conceptual": "Keep microscope support visible to the biologist.",
                "constraints": ["Avoid overtravel toward glass"],
            },
        },
        {
            "key": "calibrate",
            "phase": "setup",
            "type": "imaging",
            "title": "Calibrate galvo-piezo per embryo",
            "depends_on": ["find"],
        },
        # This item is typed through `task_class` rather than `type`.
        {
            "key": "decide",
            "phase": "run",
            "task_class": "decision_point",
            "title": "Decide whether to start timelapse",
            "depends_on": ["calibrate"],
        },
    ],
}


def _count_plan_records(store: ContextStore, campaign_id: str) -> Dict[str, int]:
phases = store.get_subcampaigns(campaign_id)
campaign_ids = [campaign_id] + [phase.id for phase in phases]

items = []
dependency_count = 0
for cid in campaign_ids:
campaign_items = store.get_plan_items(campaign_id=cid)
items.extend(campaign_items)
for item in campaign_items:
dependency_count += len(store.get_plan_item_dependencies(item.id))

return {
"campaigns": 1,
"phases": len(phases),
"items": len(items),
"dependencies": dependency_count,
}


def _expected_dependency_count(items: list[Mapping[str, Any]]) -> int:
count = 0
for item in items:
raw = item.get("depends_on") or []
count += 1 if isinstance(raw, str) else len(raw)
return count


async def run_structured_plan_replay(
    workdir: Path,
    payload: Optional[Mapping[str, Any]] = None,
) -> Dict[str, Any]:
    """Run the deterministic structured-plan replay and return a JSON report.

    Executes the ``create_structured_plan`` tool once against a
    ``ContextStore`` opened at ``workdir / "structured_plan_replay.db"`` and
    compares the records found in the store with the counts implied by the
    payload.

    Args:
        workdir: Directory that holds the store database; created if missing.
            NOTE(review): the database file name is fixed, so reusing a
            workdir presumably reuses any existing database -- confirm
            against ``ContextStore``.
        payload: Tool arguments to replay. ``DEFAULT_PLAN_PAYLOAD`` is used
            when this is ``None`` or empty.

    Returns:
        A JSON-serialisable dict with keys ``benchmark``, ``passed``,
        ``elapsed_ms``, ``tool_calls``, ``expected_counts``,
        ``actual_counts``, ``campaign_id`` and ``tool_result_excerpt``.

    Raises:
        KeyError: If the payload has no ``"shorthand"`` entry.
    """
    workdir = Path(workdir)
    workdir.mkdir(parents=True, exist_ok=True)

    # Deep copy: a shallow dict() would still share the nested "phases" and
    # "items" lists with the module-level default (or the caller's mapping),
    # letting the tool under test mutate them in place.
    plan_payload = copy.deepcopy(dict(payload or DEFAULT_PLAN_PAYLOAD))

    # Snapshot the expectation *before* the tool sees the payload, so that any
    # in-place normalisation done by the tool cannot skew the comparison.
    payload_items = plan_payload.get("items") or []
    expected_counts = {
        "campaigns": 1,
        "phases": len(plan_payload.get("phases") or []),
        "items": len(payload_items),
        "dependencies": _expected_dependency_count(payload_items),
    }

    store = ContextStore(workdir / "structured_plan_replay.db")
    # The tool only receives an object exposing `context_store`.
    agent = SimpleNamespace(context_store=store)
    started = time.perf_counter()
    try:
        result = await get_tool_registry().execute(
            "create_structured_plan",
            plan_payload,
            {"agent": agent},
        )
        # Only the tool call itself is timed, not the verification queries.
        elapsed_ms = round((time.perf_counter() - started) * 1000, 3)

        campaign = store.resolve_campaign(str(plan_payload["shorthand"]))
        counts = (
            _count_plan_records(store, campaign.id)
            if campaign is not None
            else {"campaigns": 0, "phases": 0, "items": 0, "dependencies": 0}
        )
        passed = (
            campaign is not None
            and not str(result).startswith("Error:")
            and counts == expected_counts
        )

        return {
            "benchmark": "structured_plan_replay",
            "passed": passed,
            "elapsed_ms": elapsed_ms,
            "tool_calls": [
                {
                    "name": "create_structured_plan",
                    "phases": expected_counts["phases"],
                    "items": expected_counts["items"],
                    "dependencies": expected_counts["dependencies"],
                }
            ],
            "expected_counts": expected_counts,
            "actual_counts": counts,
            "campaign_id": campaign.id if campaign else None,
            # Keep the report compact: first eight lines of the tool output.
            "tool_result_excerpt": str(result).splitlines()[:8],
        }
    finally:
        # Close the store on success and on failure alike.
        store.close()


async def _run_cli(args: argparse.Namespace) -> int:
    """Run the replay for the parsed CLI arguments and print the JSON report.

    Uses ``args.workdir`` when given, otherwise a temporary directory. The
    report is written to ``args.output`` when that is set, and is always
    printed to stdout. Returns ``0`` if the report passed, else ``1``.
    """
    if not args.workdir:
        with tempfile.TemporaryDirectory(prefix="gently-structured-plan-") as scratch:
            report = await run_structured_plan_replay(Path(scratch))
    else:
        report = await run_structured_plan_replay(Path(args.workdir))

    rendered = json.dumps(report, indent=2)
    if args.output:
        destination = Path(args.output)
        destination.parent.mkdir(parents=True, exist_ok=True)
        destination.write_text(rendered, encoding="utf-8")
    print(rendered)

    # Exit-status convention: 0 on pass, 1 on failure.
    return int(not report["passed"])


def main() -> int:
    """Parse the command line, run the replay, and return its exit status."""
    cli = argparse.ArgumentParser(description="Run the structured plan replay benchmark")
    for flag, help_text in (
        ("--workdir", "Directory for the temporary ContextStore database"),
        ("--output", "Optional JSON report path"),
    ):
        cli.add_argument(flag, help=help_text)

    namespace = cli.parse_args()
    return asyncio.run(_run_cli(namespace))


# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
120 changes: 120 additions & 0 deletions docs/planning-context-hierarchy.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
# Planning Context Hierarchy

This note maps the smart-microscopy context hierarchy into Gently plan mode.
It responds to the PR #23 discussion about making the planning layer the
biologist's primary way to interact with a DiSPIM, an organism, and later other
experimental modalities.

## Source Framework

Kesavan and Nordenfelt describe smart microscopy as a shift from passive data
collection toward active scientific collaboration. Their framework highlights
hierarchical context integration across four levels: technical, experimental,
theoretical, and conceptual context.

Reference:
P. S. Kesavan and P. Nordenfelt, "From observation to understanding: A
multi-agent framework for smart microscopy," Journal of Microscopy, 2026,
doi: 10.1111/jmi.70063.

Preprint:
https://arxiv.org/abs/2505.20466

## Gently Plan Item Mapping

Every important plan item can now carry `plan_context` alongside its executable
or measurable `spec`.

`technical`
: Instrument, sample, calibration, dataflow, and safety state. For the DiSPIM
embryo workflow this includes bottom-overview XY finding, F/head-axis
alignment, stage Z/head approach, `calibration_tools.py`, galvo-piezo
calibration, detector state, and timelapse settings.

`experimental`
: Operator workflow, sample prep, controls, and user constraints. For the
immediate Ryan/Brie workflow this means locate embryos, align the F/head axis,
confirm calibration per embryo coordinate, then decide on timelapse.

`theoretical`
: The biological model, developmental process, mechanism, or measurement
hypothesis that gives the image data meaning.

`conceptual`
: The higher-level scientific or human-instrument objective. This is the layer
where the biologist should be able to say what they are trying to understand
without first phrasing it as device commands.

Additional fields:
- `sample_entity`: the organism, embryo, tissue, region, or other entity being
acted on or observed.
- `operator_context`: who is operating, calibrating, approving, or using the
plan and what they need from the system.
- `constraints`: safety, timing, phototoxicity, calibration, or workflow
constraints that must stay visible during planning.
- `success_question`: the question that should be answerable if the item works.

## Example

```json
{
"technical": "Embryos found in XY with the bottom overview camera; F/head axis aligned; calibration confirmed before timelapse.",
"experimental": "Ryan or Brie prepares poly-lysine-mounted C. elegans embryos, checks focus approach, and approves the first timelapse.",
"theoretical": "Embryo developmental timing and morphology should remain interpretable across the planned imaging window.",
"conceptual": "Let the biologist plan at the embryo-development level while Gently keeps the DiSPIM operations explicit.",
"sample_entity": "C. elegans embryos on a poly-lysine-coated glass slide",
"operator_context": "Ryan/Brie immediate DiSPIM users; Gently may automate calibration steps only after safety assumptions are explicit.",
"constraints": [
"Confirm calibration for each embryo coordinate before timelapse",
"State F-drive/head-axis focus-finding assumptions before lowering toward sample focus",
"Avoid overtravel beyond embryo focus toward the glass slide"
],
"success_question": "Can the plan acquire reliable embryo timelapse data without hiding calibration or focus-safety assumptions?"
}
```

## Validation Behavior

Plan validation now warns, without blocking execution, when imaging items do not
carry the four context layers. It also warns when a DiSPIM embryo timelapse plan
does not state calibration and F-drive/head-axis focus-safety assumptions in the
technical context or constraints.

The warnings are deliberate: they keep the existing planning system usable while
making the missing planning structure visible for iteration.

## Structured Generation

Plan mode also provides `create_structured_plan` for creating the full
campaign/phase/task hierarchy in one operation. It accepts:

- root campaign description, shorthand, and target
- phase objects with local keys
- typed task objects using `type` or `task_class`
- local dependency references between task keys
- task `spec`, `references`, `estimated_days`, and `plan_context`

This keeps campaign -> phase -> task construction explicit while reducing the
tool-call chatter that occurs when the agent creates each campaign, item, and
dependency separately. The lower-level tools remain available for incremental
edits after the initial plan outline exists.

## Offline Replay Benchmark

The structured-plan path has a deterministic replay benchmark:

```bash
python -m benchmarks.runner structured-plan
```

or directly:

```bash
python -m benchmarks.structured_plan_replay
```

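The replay creates a fresh local `ContextStore`, executes one
`create_structured_plan` tool call, and verifies the expected campaign, two
phases, three plan items, and two dependencies. It does not call an LLM, browser,
or microscope server, so it can be used as a regression check for the planning
tool shape even before a full agent benchmark exists.

Both commands accept `--workdir` (the directory in which the `ContextStore`
database is created; a temporary directory is used when it is omitted) and
`--output` (a path to write the JSON report to). The direct entry point exits
with status 0 when the replay passes and 1 otherwise.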
Loading