From 4b6c32179dcb4cc01f1aa257c33ab90d90eabcfa Mon Sep 17 00:00:00 2001
From: ceej640 <42260127+Ceej640@users.noreply.github.com>
Date: Sun, 31 May 2026 20:23:40 -0400
Subject: [PATCH 1/4] Shape plan mode around microscope context hierarchy

---
 docs/planning-context-hierarchy.md           |  84 +++++++++++++++
 gently/harness/memory/_plans.py              |  76 +++++++++----
 gently/harness/memory/file_store.py          |  49 +++++++++
 gently/harness/memory/model.py               |  19 ++++
 gently/harness/memory/store.py               |   4 +
 gently/harness/plan_mode/prompt.py           |  37 +++++++
 gently/harness/plan_mode/tools/planning.py   |  76 ++++++++++++-
 gently/harness/plan_mode/tools/validation.py | 107 +++++++++++++++++++
 tests/test_context_store.py                  |  68 ++++++++++++
 tests/test_plan_context_validation.py        |  61 +++++++++++
 10 files changed, 559 insertions(+), 22 deletions(-)
 create mode 100644 docs/planning-context-hierarchy.md
 create mode 100644 tests/test_plan_context_validation.py

diff --git a/docs/planning-context-hierarchy.md b/docs/planning-context-hierarchy.md
new file mode 100644
index 00000000..57c48038
--- /dev/null
+++ b/docs/planning-context-hierarchy.md
@@ -0,0 +1,84 @@
+# Planning Context Hierarchy
+
+This note maps the smart-microscopy context hierarchy into Gently plan mode.
+It responds to the PR #23 discussion about making the planning layer the
+biologist's primary way to interact with a DiSPIM, an organism, and later other
+experimental modalities.
+
+## Source Framework
+
+Kesavan and Nordenfelt describe smart microscopy as a shift from passive data
+collection toward active scientific collaboration. Their framework highlights
+hierarchical context integration across four levels: technical, experimental,
+theoretical, and conceptual context.
+
+Reference:
+P. S. Kesavan and P. Nordenfelt, "From observation to understanding: A
+multi-agent framework for smart microscopy," Journal of Microscopy, 2026,
+doi: 10.1111/jmi.70063.
+
+Preprint:
+https://arxiv.org/abs/2505.20466
+
+## Gently Plan Item Mapping
+
+Every important plan item can now carry `plan_context` alongside its executable
+or measurable `spec`.
+
+`technical`
+: Instrument, sample, calibration, dataflow, and safety state. For the DiSPIM
+  embryo workflow this includes bottom-overview XY finding, F/head-axis
+  alignment, stage Z/head approach, `calibration_tools.py`, galvo-piezo
+  calibration, detector state, and timelapse settings.
+
+`experimental`
+: Operator workflow, sample prep, controls, and user constraints. For the
+  immediate Ryan/Brie workflow this means locate embryos, align the F/head axis,
+  confirm calibration per embryo coordinate, then decide on timelapse.
+
+`theoretical`
+: The biological model, developmental process, mechanism, or measurement
+  hypothesis that gives the image data meaning.
+
+`conceptual`
+: The higher-level scientific or human-instrument objective. This is the layer
+  where the biologist should be able to say what they are trying to understand
+  without first phrasing it as device commands.
+
+Additional fields:
+- `sample_entity`: the organism, embryo, tissue, region, or other entity being
+  acted on or observed.
+- `operator_context`: who is operating, calibrating, approving, or using the
+  plan and what they need from the system.
+- `constraints`: safety, timing, phototoxicity, calibration, or workflow
+  constraints that must stay visible during planning.
+- `success_question`: the question that should be answerable if the item works.
+
+## Example
+
+```json
+{
+  "technical": "Embryos found in XY with the bottom overview camera; F/head axis aligned; calibration confirmed before timelapse.",
+  "experimental": "Ryan or Brie prepares poly-lysine-mounted C. elegans embryos, checks focus approach, and approves the first timelapse.",
+  "theoretical": "Embryo developmental timing and morphology should remain interpretable across the planned imaging window.",
+  "conceptual": "Let the biologist plan at the embryo-development level while Gently keeps the DiSPIM operations explicit.",
+  "sample_entity": "C. elegans embryos on a poly-lysine-coated glass slide",
+  "operator_context": "Ryan/Brie immediate DiSPIM users; Gently may automate calibration steps only after safety assumptions are explicit.",
+  "constraints": [
+    "Confirm calibration for each embryo coordinate before timelapse",
+    "State F-drive/head-axis focus-finding assumptions before lowering toward sample focus",
+    "Avoid overtravel beyond embryo focus toward the glass slide"
+  ],
+  "success_question": "Can the plan acquire reliable embryo timelapse data without hiding calibration or focus-safety assumptions?"
+}
+```
+
+## Validation Behavior
+
+Plan validation now warns, without blocking execution, when imaging items do not
+carry the four context layers. It also warns when a DiSPIM embryo timelapse plan
+does not state calibration and F-drive/head-axis focus-safety assumptions in the
+technical context or constraints.
+
+The warnings are deliberate: they keep the existing planning system usable while
+making the missing planning structure visible for iteration.
diff --git a/gently/harness/memory/_plans.py b/gently/harness/memory/_plans.py
index 0a872e2f..8bb95ab0 100644
--- a/gently/harness/memory/_plans.py
+++ b/gently/harness/memory/_plans.py
@@ -8,12 +8,14 @@
 import json
 import logging
 import sqlite3
+import dataclasses as _dc
 from datetime import datetime
 from typing import Any, Dict, List, Optional
 
 from .model import (
     BenchSpec,
     ImagingSpec,
+    PlanContext,
     PlanItem,
     PlanItemStatus,
     PlanItemType,
@@ -22,6 +24,46 @@
 logger = logging.getLogger(__name__)
 
 
+def _dataclass_to_dict(obj) -> Dict:
+    """Return a sparse dict of *obj*'s dataclass fields.
+
+    Fields whose value is ``None`` or an empty list/dict are left out;
+    other falsy values such as ``0``, ``False`` and ``""`` are kept.
+    """
+    def _is_blank(value) -> bool:
+        return value is None or (isinstance(value, (list, dict)) and not value)
+
+    pairs = ((spec.name, getattr(obj, spec.name)) for spec in _dc.fields(obj))
+    return {name: value for name, value in pairs if not _is_blank(value)}
+
+
+def _coerce_plan_context(data) -> Optional[PlanContext]:
+    """Build a PlanContext from stored dict data while ignoring unknown keys.
+
+    Returns None for empty or non-dict input. ``constraints`` is normalised
+    to a list of strings so later ``', '.join(...)`` formatting cannot fail.
+    """
+    if isinstance(data, PlanContext):
+        return data
+    if not data or not isinstance(data, dict):
+        return None
+    valid = {f.name for f in _dc.fields(PlanContext)}
+    kwargs = {k: v for k, v in data.items() if k in valid}
+    raw = kwargs.get("constraints") or []
+    # A lone scalar (str, number, ...) is one constraint; list(5) would raise.
+    if isinstance(raw, str) or not hasattr(raw, "__iter__"):
+        raw = [raw]
+    kwargs["constraints"] = [str(c) for c in raw if c is not None]
+    return PlanContext(**kwargs)
+
+
+def _serialize_plan_context(data) -> Optional[Dict]:
+    """Normalise *data* via PlanContext and return its sparse dict, or None."""
+    normalised = _coerce_plan_context(data)
+    # A PlanContext instance is never falsy, so this only screens out None.
+    return _dataclass_to_dict(normalised) if normalised else None
+
+
 class PlansMixin:
     """Plan items, templates, snapshots, and dependency management."""
 
@@ -43,6 +85,7 @@ def create_plan_item(
         item_id: Optional[str] = None,
         references: Optional[List[Dict]] = None,
         estimated_days: Optional[int] = None,
+        plan_context: Optional[Dict] = None,
     ) -> str:
         """Create a plan item. Returns its ID.
 
@@ -64,12 +107,13 @@ def create_plan_item(
         with self._tx():
             self._conn.execute(
                 "INSERT INTO plan_items "
-                "(id, campaign_id, type, title, description, spec, inherit_from, "
+                "(id, campaign_id, type, title, description, spec, plan_context, inherit_from, "
                 " planned_session_id, estimated_days, phase_order, \"references\", status, created_at, updated_at) "
-                "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, 'planned', ?, ?)",
+                "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, 'planned', ?, ?)",
                 (
                     pid, campaign_id, type, title, description,
                     json.dumps(spec) if spec else None,
+                    json.dumps(_serialize_plan_context(plan_context)) if plan_context else None,
                     inherit_from, planned_session_id, estimated_days, phase_order,
                     json.dumps(references) if references else None,
                     now, now,
@@ -307,6 +351,7 @@ def update_plan_item(
         campaign_id: Optional[str] = None,
         references: Optional[List[Dict]] = None,
         estimated_days: Optional[int] = None,
+        plan_context: Optional[Dict] = None,
     ):
         """Update a plan item. Only non-None values are applied."""
         now = self._now()
@@ -336,6 +381,9 @@ def update_plan_item(
         if references is not None:
             updates.append("\"references\" = ?")
             values.append(json.dumps(references))
+        if plan_context is not None:
+            updates.append("plan_context = ?")
+            values.append(json.dumps(_serialize_plan_context(plan_context)) if plan_context else None)
         if not updates:
             return
         updates.append("updated_at = ?")
@@ -589,21 +637,9 @@ def _serialize_campaign_tree(self, campaign_id: str) -> Dict:
             }
             # Serialize spec
             if item.imaging_spec:
-                import dataclasses as _dc
-                spec_dict = {}
-                for f in _dc.fields(item.imaging_spec):
-                    val = getattr(item.imaging_spec, f.name)
-                    if val is not None:
-                        spec_dict[f.name] = val
-                item_data["spec"] = spec_dict
+                item_data["spec"] = _dataclass_to_dict(item.imaging_spec)
             elif item.bench_spec:
-                import dataclasses as _dc
-                spec_dict = {}
-                for f in _dc.fields(item.bench_spec):
-                    val = getattr(item.bench_spec, f.name)
-                    if val is not None:
-                        spec_dict[f.name] = val
-                item_data["spec"] = spec_dict
+                item_data["spec"] = _dataclass_to_dict(item.bench_spec)
 
             # Dependencies as relative indices within this campaign's items
             if item.depends_on:
@@ -616,6 +652,8 @@ def _serialize_campaign_tree(self, campaign_id: str) -> Dict:
 
             if item.references:
                 item_data["references"] = item.references
+            if item.plan_context:
+                item_data["plan_context"] = _dataclass_to_dict(item.plan_context)
 
             serialized_items.append(item_data)
 
@@ -711,6 +749,7 @@ def _instantiate_template_tree(
                 spec=spec,
                 phase_order=item_data.get("phase_order", -1),
                 references=item_data.get("references"),
+                plan_context=item_data.get("plan_context"),
             )
             new_item_ids.append(item_id)
 
@@ -931,19 +970,19 @@ def _row_to_plan_item(self, row: sqlite3.Row) -> PlanItem:
 
         if spec_data:
             if item_type == PlanItemType.IMAGING:
-                import dataclasses as _dc
                 valid = {f.name for f in _dc.fields(ImagingSpec)}
                 imaging_spec = ImagingSpec(**{
                     k: v for k, v in spec_data.items() if k in valid
                 })
             else:
-                import dataclasses as _dc
                 valid = {f.name for f in _dc.fields(BenchSpec)}
                 bench_spec = BenchSpec(**{
                     k: v for k, v in spec_data.items() if k in valid
                 })
 
         references = json.loads(d["references"]) if d.get("references") else []
+        plan_context_data = json.loads(d["plan_context"]) if d.get("plan_context") else None
+        plan_context = _coerce_plan_context(plan_context_data)
 
         return PlanItem(
             id=item_id,
@@ -957,6 +996,7 @@ def _row_to_plan_item(self, row: sqlite3.Row) -> PlanItem:
             claimed_by=d.get("claimed_by"),
             claimed_by_hostname=d.get("claimed_by_hostname"),
             references=references,
+            plan_context=plan_context,
             imaging_spec=imaging_spec,
             bench_spec=bench_spec,
             planned_session_id=d.get("planned_session_id"),
diff --git a/gently/harness/memory/file_store.py b/gently/harness/memory/file_store.py
index f8f797bb..ad7c530c 100644
--- a/gently/harness/memory/file_store.py
+++ b/gently/harness/memory/file_store.py
@@ -59,6 +59,7 @@
     Observation,
     PlannedSession,
     PlannedSessionStatus,
+    PlanContext,
     PlanItem,
     PlanItemStatus,
     PlanItemType,
@@ -1097,6 +1098,7 @@ def create_plan_item(
         item_id: Optional[str] = None,
         references: Optional[List[Dict]] = None,
         estimated_days: Optional[int] = None,
+        plan_context: Optional[Dict] = None,
     ) -> str:
         pid = item_id or self._gen_id()
         now = self._now()
@@ -1125,6 +1127,7 @@ def create_plan_item(
             "estimated_days": estimated_days,
             "phase_order": phase_order,
             "references": references,
+            "plan_context": self._serialise_plan_context(plan_context),
             "depends_on": depends_on or [],
             "claimed_by": None,
             "claimed_by_hostname": None,
@@ -1277,6 +1280,7 @@ def update_plan_item(
         campaign_id: Optional[str] = None,
         references: Optional[List[Dict]] = None,
         estimated_days: Optional[int] = None,
+        plan_context: Optional[Dict] = None,
     ):
         loc = self._find_plan_item_location(item_id)
         if not loc:
@@ -1302,6 +1306,8 @@ def update_plan_item(
             item["phase_order"] = phase_order
         if references is not None:
             item["references"] = references
+        if plan_context is not None:
+            item["plan_context"] = self._serialise_plan_context(plan_context)
         if campaign_id is not None and campaign_id != old_campaign_id:
             # Move item to a different campaign
             items.pop(idx)
@@ -1545,6 +1551,8 @@ def _serialize_campaign_tree(self, campaign_id: str) -> Dict:
 
             if item.references:
                 item_data["references"] = item.references
+            if item.plan_context:
+                item_data["plan_context"] = self._dataclass_to_sparse_dict(item.plan_context)
 
             serialized_items.append(item_data)
 
@@ -1643,6 +1651,7 @@ def _instantiate_template_tree(
                 spec=spec,
                 phase_order=item_data.get("phase_order", -1),
                 references=item_data.get("references"),
+                plan_context=item_data.get("plan_context"),
             )
             new_item_ids.append(item_id)
 
@@ -2448,6 +2457,44 @@ def _dict_to_planned_session(d: Dict) -> PlannedSession:
             updated_at=datetime.fromisoformat(d["updated_at"]) if isinstance(d.get("updated_at"), str) else d.get("updated_at", datetime.now()),
         )
 
+    @staticmethod
+    def _dataclass_to_sparse_dict(obj) -> Dict:
+        """Return *obj*'s dataclass fields as a dict without empty entries."""
+        names = [spec.name for spec in dataclasses.fields(obj)]
+        pairs = ((name, getattr(obj, name)) for name in names)
+        return {
+            name: value
+            for name, value in pairs
+            # Only None and empty list/dict are dropped; 0, False and "" stay.
+            if value is not None and not (isinstance(value, (list, dict)) and not value)
+        }
+
+    @staticmethod
+    def _dict_to_plan_context(data) -> Optional[PlanContext]:
+        """Build a PlanContext from a stored dict; None for empty/non-dict input.
+
+        Unknown keys are ignored and ``constraints`` becomes a list of strings.
+        """
+        if isinstance(data, PlanContext):
+            return data
+        if not data or not isinstance(data, dict):
+            return None
+        valid = {f.name for f in dataclasses.fields(PlanContext)}
+        kwargs = {k: v for k, v in data.items() if k in valid}
+        raw = kwargs.get("constraints") or []
+        # A lone scalar (str, number, ...) is one constraint; list(5) would raise.
+        if isinstance(raw, str) or not hasattr(raw, "__iter__"):
+            raw = [raw]
+        kwargs["constraints"] = [str(c) for c in raw if c is not None]
+        return PlanContext(**kwargs)
+
+    @staticmethod
+    def _serialise_plan_context(data) -> Optional[Dict]:
+        """Normalise *data* through PlanContext; return its sparse dict or None."""
+        normalised = FileContextStore._dict_to_plan_context(data)
+        # A PlanContext instance is never falsy, so this only screens out None.
+        return FileContextStore._dataclass_to_sparse_dict(normalised) if normalised else None
+
     @staticmethod
     def _dict_to_plan_item(d: Dict) -> PlanItem:
         item_type = PlanItemType(d["type"])
@@ -2468,6 +2515,7 @@ def _dict_to_plan_item(d: Dict) -> PlanItem:
                 })
 
         references = d.get("references") or []
+        plan_context = FileContextStore._dict_to_plan_context(d.get("plan_context"))
 
         return PlanItem(
             id=d["id"],
@@ -2481,6 +2529,7 @@ def _dict_to_plan_item(d: Dict) -> PlanItem:
             claimed_by=d.get("claimed_by"),
             claimed_by_hostname=d.get("claimed_by_hostname"),
             references=references,
+            plan_context=plan_context,
             imaging_spec=imaging_spec,
             bench_spec=bench_spec,
             planned_session_id=d.get("planned_session_id"),
diff --git a/gently/harness/memory/model.py b/gently/harness/memory/model.py
index d9dc9a4e..80cc9bb1 100644
--- a/gently/harness/memory/model.py
+++ b/gently/harness/memory/model.py
@@ -240,6 +240,24 @@ class BenchSpec:
     notes: Optional[str] = None
 
 
+@dataclass
+class PlanContext:
+    """
+    Multi-level microscope thought context for a plan item.
+
+    Based on the technical, experimental, theoretical, and conceptual
+    hierarchy used to connect microscope operations to biological reasoning.
+    """
+    technical: Optional[str] = None  # instrument, sample, calibration, dataflow, safety state
+    experimental: Optional[str] = None  # operator workflow, sample prep, controls, user constraints
+    theoretical: Optional[str] = None  # biological model or measurement hypothesis
+    conceptual: Optional[str] = None  # higher-level scientific or human-instrument objective
+    sample_entity: Optional[str] = None  # organism, embryo, tissue, or region acted on/observed
+    operator_context: Optional[str] = None  # who operates/approves the plan and what they need
+    constraints: List[str] = field(default_factory=list)  # safety/timing/workflow limits
+    success_question: Optional[str] = None  # question answerable if the item works
+
+
 @dataclass
 class PlanItem:
     """
@@ -263,6 +281,7 @@ class PlanItem:
     claimed_by: Optional[str] = None         # instance_id of claiming node
     claimed_by_hostname: Optional[str] = None # human-readable hostname
     references: List[Dict[str, str]] = field(default_factory=list)  # Source citations
+    plan_context: Optional[PlanContext] = None  # Microscope thought hierarchy
 
     # Specifications (type-dependent)
     imaging_spec: Optional[ImagingSpec] = None
diff --git a/gently/harness/memory/store.py b/gently/harness/memory/store.py
index f3ae128f..91b14c75 100644
--- a/gently/harness/memory/store.py
+++ b/gently/harness/memory/store.py
@@ -184,6 +184,7 @@
     status TEXT DEFAULT 'planned',
     outcome TEXT,
     spec TEXT,
+    plan_context TEXT,
     inherit_from TEXT,
     planned_session_id TEXT,
     session_id TEXT,
@@ -298,6 +299,9 @@ def _migrate(self, conn: sqlite3.Connection):
         if "references" not in pi_cols:
             conn.execute('ALTER TABLE plan_items ADD COLUMN "references" TEXT')
             logger.info("Migration: added 'references' column to plan_items")
+        if "plan_context" not in pi_cols:
+            conn.execute("ALTER TABLE plan_items ADD COLUMN plan_context TEXT")
+            logger.info("Migration: added 'plan_context' column to plan_items")
 
         # Mesh campaign coordination columns
         camp_cols = {row[1] for row in conn.execute("PRAGMA table_info(campaigns)").fetchall()}
diff --git a/gently/harness/plan_mode/prompt.py b/gently/harness/plan_mode/prompt.py
index 19ea52b7..d7904afe 100644
--- a/gently/harness/plan_mode/prompt.py
+++ b/gently/harness/plan_mode/prompt.py
@@ -37,6 +37,38 @@
 - **Decision points**: Gates between phases where results determine next steps
 - **Dependencies**: What must complete before something else can start
 - **Success criteria**: How to know if each step worked
+- **Microscope thought context**: For each important task, record the
+  technical, experimental, theoretical, and conceptual context that connects
+  microscope operations to the scientific intent.
+
+## Microscope Thought Context
+
+Use the hierarchy from Kesavan and Nordenfelt's smart microscopy framework:
+
+- **Technical**: Instrument, sample, calibration, dataflow, and safety state.
+  For DiSPIM embryo work this includes bottom-overview XY finding, F/head-axis
+  alignment, stage Z/head approach, galvo-piezo calibration, piezo/galvo
+  settings, detector state, and timelapse acquisition.
+- **Experimental**: Operator workflow, sample prep, control structure, user
+  constraints, and the current point in the run. For the immediate DiSPIM work,
+  account for Ryan and Brie locating embryos, aligning F/head axis, calibrating
+  per embryo, and then deciding whether to start timelapse.
+- **Theoretical**: The biological model, mechanism, pathway, developmental
+  stage, or measurement hypothesis that makes the observations meaningful.
+- **Conceptual**: The higher-level goal for the interaction between biologist,
+  microscope, and sample environment.
+
+When you create or revise plan items, use the `plan_context` parameter in
+addition to `spec`. It accepts `technical`, `experimental`, `theoretical`,
+`conceptual`, `sample_entity`, `operator_context`, `constraints`, and
+`success_question`.
+
+For DiSPIM C. elegans embryo timelapse plans, make calibration state explicit:
+bottom overview locates embryos in XY, the operator or Gently aligns the
+F/head-axis approach, calibration is confirmed for the embryo coordinates, and
+only then should the plan choose timelapse settings. If a plan lowers the SPIM
+head toward sample focus, include F-drive/head-axis focus-finding and glass-slide
+safety assumptions in `plan_context.technical` or `plan_context.constraints`.
 
 ## When Proposing Imaging Sessions
 
@@ -70,6 +102,11 @@
 3. Set dependencies between items
 4. Present the full plan for review with propose_plan
 
+When creating plan items, attach both:
+- `spec` for executable or measurable parameters.
+- `plan_context` for the four-layer planning context that a biologist can use
+  as the primary interaction layer over the microscope and sample.
+
 IMPORTANT: ALWAYS use ask_user_choice when asking the researcher questions. Never
 present options as text lists.
 
diff --git a/gently/harness/plan_mode/tools/planning.py b/gently/harness/plan_mode/tools/planning.py
index 2c33d43c..5a695f6b 100644
--- a/gently/harness/plan_mode/tools/planning.py
+++ b/gently/harness/plan_mode/tools/planning.py
@@ -84,7 +84,10 @@ async def create_campaign(
         "Use estimated_days to indicate how many days this task takes "
         "(e.g. 1 for a quick imaging session, 14 for strain expansion). "
         "Use references to cite literature, databases, or other sources "
-        "(each with source, citation, and optional id/note)."
+        "(each with source, citation, and optional id/note). "
+        "Use plan_context to capture the microscope thought hierarchy: "
+        "technical, experimental, theoretical, and conceptual context, plus "
+        "sample_entity, operator_context, constraints, and success_question."
     ),
     category=ToolCategory.UTILITY,
     examples=[
@@ -120,6 +123,7 @@ async def create_plan_item(
     phase_order: int = -1,
     references: List[Dict] = None,
     estimated_days: int = None,
+    plan_context: Dict = None,
     context: Dict = None,
 ) -> str:
     """Create a plan item within a campaign/phase.
@@ -155,6 +159,7 @@ async def create_plan_item(
         phase_order=phase_order,
         references=references,
         estimated_days=estimated_days,
+        plan_context=plan_context,
     )
 
     # Include the human-friendly task number in the response
@@ -191,8 +196,9 @@ async def create_plan_item(
     name="update_plan_item",
     description=(
         "Update an existing plan item — change status, title, description, "
-        "outcome, spec, or references. Use this to mark items as completed, "
-        "skipped, update imaging specifications, or attach source citations."
+        "outcome, spec, plan_context, or references. Use this to mark items "
+        "as completed, skipped, update imaging specifications, attach source "
+        "citations, or refine the microscope thought hierarchy."
     ),
     category=ToolCategory.UTILITY,
 )
@@ -205,6 +211,7 @@ async def update_plan_item(
     spec: Dict = None,
     references: List[Dict] = None,
     estimated_days: int = None,
+    plan_context: Dict = None,
     campaign_id: str = None,
     context: Dict = None,
 ) -> str:
@@ -235,6 +242,7 @@ async def update_plan_item(
         spec=spec,
         references=references,
         estimated_days=estimated_days,
+        plan_context=plan_context,
     )
     changes = []
     if status:
@@ -247,6 +255,8 @@ async def update_plan_item(
         changes.append(f"title -> {title}")
     if references:
         changes.append(f"{len(references)} references attached")
+    if plan_context is not None:
+        changes.append("plan context updated")
     return f"Updated plan item '{item.title}' ({resolved_id}): {', '.join(changes) or 'updated'}"
 
 
@@ -412,6 +422,31 @@ async def propose_plan(
     return "\n".join(lines)
 
 
+def _plan_context_rows(plan_context, indent: str = "   ") -> List[str]:
+    """Format microscope thought hierarchy fields as indented text rows ([] if none)."""
+    if not plan_context:
+        return []
+
+    labels = {
+        "technical": "Technical",
+        "experimental": "Experimental",
+        "theoretical": "Theoretical",
+        "conceptual": "Conceptual",
+        "sample_entity": "Sample entity",
+        "operator_context": "Operator context",
+        "success_question": "Success question",
+    }
+
+    found = ((label, getattr(plan_context, attr, None)) for attr, label in labels.items())
+    rows = [f"{indent}Microscope thought context:"]
+    rows += [f"{indent}  {label}: {val}" for label, val in found if val]
+    constraints = getattr(plan_context, "constraints", None) or []
+    if constraints:
+        # str() keeps a non-string constraint from breaking the join.
+        rows.append(f"{indent}  Constraints: {', '.join(map(str, constraints))}")
+    return rows
+
+
 def _format_plan_item(item, store, task_num: str = "") -> str:
     """Format a single plan item for display."""
     from gently.harness.memory.model import PlanItemStatus
@@ -473,6 +508,8 @@ def _format_plan_item(item, store, task_num: str = "") -> str:
         if spec.success_criteria:
             details.append(f"   Criteria: {spec.success_criteria}")
 
+    details.extend(_plan_context_rows(item.plan_context))
+
     if item.depends_on:
         dep_items = [store.get_plan_item(d) for d in item.depends_on]
         dep_names = [d.title for d in dep_items if d]
@@ -1131,6 +1168,32 @@ def _export_date() -> str:
     return datetime.now().strftime("%Y-%m-%d")
 
 
+def _export_plan_context(plan_context) -> List[str]:
+    """Render plan context as a markdown bullet section for the export ([] if none)."""
+    if not plan_context:
+        return []
+
+    labels = {
+        "technical": "Technical",
+        "experimental": "Experimental",
+        "theoretical": "Theoretical",
+        "conceptual": "Conceptual",
+        "sample_entity": "Sample entity",
+        "operator_context": "Operator context",
+        "success_question": "Success question",
+    }
+
+    found = ((label, getattr(plan_context, attr, None)) for attr, label in labels.items())
+    lines = ["**Microscope Thought Context:**"]
+    lines += [f"- {label}: {val}" for label, val in found if val]
+    constraints = getattr(plan_context, "constraints", None) or []
+    if constraints:
+        # str() keeps a non-string constraint from breaking the join.
+        lines.append(f"- Constraints: {', '.join(map(str, constraints))}")
+    lines.append("")  # blank line ends the markdown section
+    return lines
+
+
 def _export_item(item, store, num: str) -> List[str]:
     """Format a plan item for the export document."""
     from gently.harness.memory.model import PlanItemStatus
@@ -1197,6 +1260,8 @@ def _export_item(item, store, num: str) -> List[str]:
             lines.append(f"- Success criteria: {bs.success_criteria}")
         lines.append("")
 
+    lines.extend(_export_plan_context(item.plan_context))
+
     if item.depends_on:
         dep_items = [store.get_plan_item(d) for d in item.depends_on]
         dep_names = [d.title for d in dep_items if d]
@@ -1231,7 +1296,7 @@ async def validate_plan_for_export(campaign_id: str, store) -> str:
     from .validation import (
         HARDWARE_LIMITS, CONTROL_KEYWORDS,
         _check_dependency_cycles, _stage_order, _normalise_stage,
-        _get_temp_factor, STAGE_TIMING_20C,
+        _get_temp_factor, STAGE_TIMING_20C, _collect_context_warnings,
     )
 
     items = store.get_plan_items(campaign_id=campaign_id, include_children=True)
@@ -1270,6 +1335,9 @@ async def validate_plan_for_export(campaign_id: str, store) -> str:
                 if hi is not None and val > hi:
                     issues.append(f"- **Error:** {label} — {field_name}={val} exceeds max {hi}")
 
+        for warning in _collect_context_warnings(label, item, spec):
+            issues.append(f"- **Warning:** {warning}")
+
     cycle_errors = _check_dependency_cycles(items)
     for cyc in cycle_errors:
         issues.append(f"- **Error:** {cyc}")
diff --git a/gently/harness/plan_mode/tools/validation.py b/gently/harness/plan_mode/tools/validation.py
index 8e09802a..f6c955f5 100644
--- a/gently/harness/plan_mode/tools/validation.py
+++ b/gently/harness/plan_mode/tools/validation.py
@@ -47,6 +47,21 @@
 
 CONTROL_KEYWORDS = {"control", "wildtype", "n2", "wt", "wild-type", "wild type"}
 
+CONTEXT_LAYERS = ("technical", "experimental", "theoretical", "conceptual")  # layers checked on plan_context
+
+EMBRYO_CONTEXT_KEYWORDS = {  # lowercase substrings that mark item text as embryo-related
+    "embryo", "embryos", "egg", "eggs", "c. elegans", "celegans",
+}
+
+TIMELAPSE_CONTEXT_KEYWORDS = {  # lowercase substrings that mark item text as timelapse-like
+    "timelapse", "time-lapse", "time lapse", "longitudinal", "development",
+}
+
+DISPIM_SAFETY_KEYWORDS = {  # matched by substring; a single hit satisfies the safety check
+    "calibration", "calibrate", "f-drive", "f drive", "head axis",
+    "focus", "glass", "slide", "poly-lysine", "poly lysine",
+}
+
 
 # ---------------------------------------------------------------------------
 # Helpers
@@ -139,6 +154,96 @@ def _normalise_stage(name: str) -> Optional[str]:
     return aliases.get(low.replace(" ", ""))
 
 
+def _missing_context_layers(item) -> List[str]:
+    """List the CONTEXT_LAYERS entries that *item*'s plan_context leaves empty.
+
+    An item without any plan_context is missing every layer.
+    """
+    ctx = getattr(item, "plan_context", None)
+    if ctx:
+        return [name for name in CONTEXT_LAYERS if not getattr(ctx, name, None)]
+    return list(CONTEXT_LAYERS)
+
+
+def _plan_context_text(item) -> str:
+    """Flatten every populated plan_context field into one lowercase string."""
+    ctx = getattr(item, "plan_context", None)
+    if not ctx:
+        return ""
+    scalar_fields = (
+        "technical", "experimental", "theoretical", "conceptual",
+        "sample_entity", "operator_context", "success_question",
+    )
+    values = (getattr(ctx, name, None) for name in scalar_fields)
+    chunks = [str(value) for value in values if value]
+    # Constraints are appended last, one chunk per entry.
+    chunks += [str(entry) for entry in getattr(ctx, "constraints", []) or []]
+    return " ".join(chunks).lower()
+
+
+def _item_text(item, spec=None) -> str:
+    """Join the item's free text and spec hints into one lowercase string."""
+    spec_attrs = (
+        "strain", "genotype", "reporter", "sample_prep", "target_window",
+        "start_stage", "stop_condition", "success_criteria", "comparison_to",
+    )
+    chunks = [item.title, item.description, item.outcome]
+    if spec:
+        found = (getattr(spec, name, None) for name in spec_attrs)
+        chunks.extend(str(value) for value in found if value)
+        # Populated numeric spec fields contribute a matching keyword.
+        if spec.num_embryos:
+            chunks.append("embryo")
+        if spec.interval_s or spec.adaptive_intervals:
+            chunks.append("timelapse")
+    return " ".join(chunk for chunk in chunks if chunk).lower()
+
+
+def _needs_dispim_focus_context(item, spec=None) -> bool:
+    """Heuristic: is this imaging item a DiSPIM-style embryo timelapse?"""
+    if item.type.value != "imaging" or not spec:
+        return False
+    haystack = _item_text(item, spec)
+
+    def mentions(keywords) -> bool:
+        return any(keyword in haystack for keyword in keywords)
+
+    has_embryo = spec.num_embryos is not None or mentions(EMBRYO_CONTEXT_KEYWORDS)
+    has_timelapse = (
+        spec.interval_s is not None
+        or spec.adaptive_intervals is not None
+        or mentions(TIMELAPSE_CONTEXT_KEYWORDS)
+    )
+    # "spim" is a substring of "dispim"; both literals are kept as in the original check.
+    has_scan = spec.galvo_amplitude is not None or spec.piezo_amplitude_um is not None
+    looks_dispim = has_scan or mentions(("dispim", "spim"))
+    return has_embryo and has_timelapse and (looks_dispim or spec.num_slices is not None)
+
+
+def _has_dispim_focus_safety_context(item) -> bool:
+    """Return True if technical context or constraints mention a safety keyword."""
+    ctx = getattr(item, "plan_context", None)
+    # Only the fields the warning names count; a sample_entity "glass slide" must not.
+    parts = [getattr(ctx, "technical", None), *(getattr(ctx, "constraints", None) or [])]
+    text = " ".join(str(p) for p in parts if p).lower()
+    return any(kw in text for kw in DISPIM_SAFETY_KEYWORDS)
+
+
+def _collect_context_warnings(label: str, item, spec=None) -> List[str]:
+    """Return the non-blocking plan-context warnings for one plan item."""
+    warnings: List[str] = []
+    missing = _missing_context_layers(item) if item.type.value == "imaging" else []
+    if missing:
+        warnings.append(f"{label}: missing microscope thought context layers: {', '.join(missing)}")
+    if _needs_dispim_focus_context(item, spec) and not _has_dispim_focus_safety_context(item):
+        warnings.append(
+            f"{label}: DiSPIM embryo timelapse should state calibration, "
+            "F-drive/head-axis focus finding, and glass-slide safety assumptions "
+            "in plan_context.technical or plan_context.constraints before acquisition."
+        )
+    return warnings
+
+
 # ---------------------------------------------------------------------------
 # Tool
 # ---------------------------------------------------------------------------
@@ -282,6 +387,8 @@ async def validate_plan(
                     f"{label}: missing imaging spec fields: {', '.join(missing_fields)}"
                 )
 
+        warnings.extend(_collect_context_warnings(label, item, spec))
+
     # ------------------------------------------------------------------
     # Plan-level checks
     # ------------------------------------------------------------------
diff --git a/tests/test_context_store.py b/tests/test_context_store.py
index 5bb8931d..fe0ccf4f 100644
--- a/tests/test_context_store.py
+++ b/tests/test_context_store.py
@@ -10,6 +10,7 @@
     Confidence,
     Learning,
     Observation,
+    PlanContext,
     PlanItemStatus,
     Significance,
 )
@@ -216,6 +217,73 @@ def test_list_plan_items_by_type(self, context_store):
         imaging_items = context_store.get_plan_items(campaign_id=cid, type="imaging")
         assert len(imaging_items) == 2
 
+    def test_plan_item_preserves_microscope_thought_context(self, context_store):
+        cid = context_store.create_campaign(description="C1")
+        item_id = context_store.create_plan_item(
+            campaign_id=cid,
+            type="imaging",
+            title="Embryo timelapse",
+            plan_context={
+                "technical": "Bottom overview XY, F/head-axis alignment, calibration first.",
+                "experimental": "Ryan/Brie locate embryos and approve timelapse.",
+                "theoretical": "Developmental timing remains interpretable.",
+                "conceptual": "Biologist plans at the embryo-development level.",
+                "sample_entity": "C. elegans embryo",
+                "operator_context": "Immediate DiSPIM users Ryan and Brie",
+                "constraints": ["Avoid head-axis overtravel near glass"],
+                "success_question": "Can focus and calibration stay explicit?",
+            },
+        )
+        # The context was passed as a dict; it should read back as a PlanContext.
+        item = context_store.get_plan_item(item_id)
+        assert isinstance(item.plan_context, PlanContext)
+        assert item.plan_context.technical.startswith("Bottom overview")
+        assert item.plan_context.constraints == ["Avoid head-axis overtravel near glass"]
+        # Layers supplied through update_plan_item should be returned on the next read.
+        context_store.update_plan_item(
+            item_id,
+            plan_context={
+                "technical": "Calibration confirmed before timelapse.",
+                "experimental": "Operator approves first acquisition.",
+                "theoretical": "Reporter dynamics remain measurable.",
+                "conceptual": "Planning layer governs microscope interaction.",
+            },
+        )
+        updated = context_store.get_plan_item(item_id)
+        assert updated.plan_context.technical == "Calibration confirmed before timelapse."
+        assert updated.plan_context.conceptual == "Planning layer governs microscope interaction."
+        # Applying a saved template should carry plan_context over to the cloned item.
+        template_id = context_store.save_plan_template(
+            "context-template",
+            "Plan context template",
+            cid,
+        )
+        new_cid = context_store.apply_plan_template(template_id)
+        cloned = context_store.get_plan_items(campaign_id=new_cid)[0]
+        assert cloned.plan_context.theoretical == "Reporter dynamics remain measurable."
+
+
+class TestFileContextPlanItems:
+    def test_file_context_store_preserves_microscope_thought_context(self, file_context_store):
+        cid = file_context_store.create_campaign(description="C1")
+        item_id = file_context_store.create_plan_item(
+            campaign_id=cid,
+            type="imaging",
+            title="Embryo timelapse",
+            plan_context={
+                "technical": "F-drive focus finding and calibration before timelapse.",
+                "experimental": "Ryan/Brie workflow.",
+                "theoretical": "Developmental timing remains interpretable.",
+                "conceptual": "Biologist plans at embryo level.",
+                "constraints": ["Avoid overtravel toward glass"],
+            },
+        )
+        # The context was passed as a dict; it should read back as a PlanContext.
+        item = file_context_store.get_plan_item(item_id)
+        assert isinstance(item.plan_context, PlanContext)
+        assert item.plan_context.technical.startswith("F-drive focus")
+        assert item.plan_context.constraints == ["Avoid overtravel toward glass"]
+
 
 class TestObservations:
     def test_add_and_retrieve(self, context_store):
diff --git a/tests/test_plan_context_validation.py b/tests/test_plan_context_validation.py
new file mode 100644
index 00000000..4ea8c066
--- /dev/null
+++ b/tests/test_plan_context_validation.py
@@ -0,0 +1,61 @@
+from gently.harness.memory.model import (
+    ImagingSpec,
+    PlanContext,
+    PlanItem,
+    PlanItemType,
+)
+from gently.harness.plan_mode.tools.validation import _collect_context_warnings
+
+
+def _embryo_timelapse_item(plan_context=None):
+    """Build a DiSPIM embryo timelapse item, optionally with a plan context."""
+    # Embryo count, interval, and slice count are all set on the imaging spec.
+    spec_fields = {"num_slices": 80, "interval_s": 120, "num_embryos": 4}
+    spec = ImagingSpec(sample_prep="Embryos on poly-lysine-coated glass slide", **spec_fields)
+
+    return PlanItem(
+        id="item-1",
+        campaign_id="campaign-1",
+        type=PlanItemType.IMAGING,
+        title="DiSPIM embryo timelapse",
+        imaging_spec=spec,
+        plan_context=plan_context,
+    )
+
+
+def test_dispim_embryo_timelapse_warns_without_focus_safety_context():
+    item = _embryo_timelapse_item()
+    warnings = _collect_context_warnings(
+        "[imaging] 'DiSPIM embryo timelapse'",
+        item,
+        item.imaging_spec,
+    )
+    # No plan_context was supplied, so both advisory warnings are expected.
+    assert any("missing microscope thought context layers" in w for w in warnings)
+    assert any("F-drive/head-axis focus finding" in w for w in warnings)
+
+
+def test_dispim_embryo_timelapse_accepts_explicit_focus_safety_context():
+    item = _embryo_timelapse_item(
+        PlanContext(
+            technical=(
+                "Bottom overview XY, F-drive focus finding, "
+                "and calibration before timelapse."
+            ),
+            experimental="Ryan/Brie align F/head axis before acquisition.",
+            theoretical="Developmental progression remains interpretable.",
+            conceptual="Biologist plans at embryo level while Gently manages DiSPIM details.",
+            constraints=[
+                "Confirm calibration before lowering the SPIM head toward focus",
+                "Avoid overtravel beyond embryo focus toward the glass slide",
+            ],
+        )
+    )
+
+    warnings = _collect_context_warnings(
+        "[imaging] 'DiSPIM embryo timelapse'",
+        item,
+        item.imaging_spec,
+    )
+    # All four layers plus explicit safety constraints: no warning should remain.
+    assert warnings == []

From aa79f0f038c5336399f6fc2bcac285bb8b2bf00f Mon Sep 17 00:00:00 2001
From: ceej640 <42260127+Ceej640@users.noreply.github.com>
Date: Mon, 1 Jun 2026 00:16:10 -0400
Subject: [PATCH 2/4] Fix web template rendering with current Starlette

---
 gently/ui/web/routes/auth_routes.py | 6 +++++-
 gently/ui/web/routes/pages.py       | 6 ++++--
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/gently/ui/web/routes/auth_routes.py b/gently/ui/web/routes/auth_routes.py
index e7db5500..431aa9f0 100644
--- a/gently/ui/web/routes/auth_routes.py
+++ b/gently/ui/web/routes/auth_routes.py
@@ -31,7 +31,11 @@ async def login_page(request: Request):
             return RedirectResponse("/", status_code=302)
         if current_username(request):
             return RedirectResponse("/", status_code=302)
-        return server.templates.TemplateResponse("login.html", {"request": request})
+        return server.templates.TemplateResponse(
+            request,
+            "login.html",
+            {"request": request},
+        )
 
     @router.post("/api/auth/login")
     async def login(request: Request):
diff --git a/gently/ui/web/routes/pages.py b/gently/ui/web/routes/pages.py
index 0cff7d82..65835ac4 100644
--- a/gently/ui/web/routes/pages.py
+++ b/gently/ui/web/routes/pages.py
@@ -16,8 +16,9 @@ async def index(request: Request):
         chat window's "Sign in" affordance), not a gate on the page itself.
         """
         return server.templates.TemplateResponse(
+            request,
             "index.html",
-            {"request": request, "active_section": "embryos", "is_live": True}
+            {"request": request, "active_section": "embryos", "is_live": True},
         )
 
     # Standalone URLs redirect to SPA with hash fragment for tab routing
@@ -37,8 +38,9 @@ async def plan_review_page(campaign_id: str):
     async def settings_page(request: Request):
         """Serve the dashboard settings page"""
         return server.templates.TemplateResponse(
+            request,
             "settings.html",
-            {"request": request}
+            {"request": request},
         )
 
     return router

From 301bffb446927f56155d59537c40a465bc7d7783 Mon Sep 17 00:00:00 2001
From: Johnson <johnsonc5@hhmi.org>
Date: Mon, 1 Jun 2026 01:14:04 -0400
Subject: [PATCH 3/4] Add structured plan generation tool

---
 docs/planning-context-hierarchy.md         |  16 ++
 gently/harness/plan_mode/prompt.py         |  11 +-
 gently/harness/plan_mode/tools/planning.py | 246 ++++++++++++++++++++-
 tests/test_structured_plan_tool.py         |  87 ++++++++
 4 files changed, 350 insertions(+), 10 deletions(-)
 create mode 100644 tests/test_structured_plan_tool.py

diff --git a/docs/planning-context-hierarchy.md b/docs/planning-context-hierarchy.md
index 57c48038..b257e571 100644
--- a/docs/planning-context-hierarchy.md
+++ b/docs/planning-context-hierarchy.md
@@ -82,3 +82,19 @@ technical context or constraints.
 
 The warnings are deliberate: they keep the existing planning system usable while
 making the missing planning structure visible for iteration.
+
+## Structured Generation
+
+Plan mode also provides `create_structured_plan` for creating the full
+campaign/phase/task hierarchy in one operation. It accepts:
+
+- root campaign description, shorthand, and target
+- phase objects with local keys
+- typed task objects using `type` or `task_class`
+- local dependency references between task keys
+- task `spec`, `references`, `estimated_days`, and `plan_context`
+
+This keeps campaign -> phase -> task construction explicit while reducing the
+tool-call chatter that occurs when the agent creates each campaign, item, and
+dependency separately. The lower-level tools remain available for incremental
+edits after the initial plan outline exists.
diff --git a/gently/harness/plan_mode/prompt.py b/gently/harness/plan_mode/prompt.py
index d7904afe..8f71cf6e 100644
--- a/gently/harness/plan_mode/prompt.py
+++ b/gently/harness/plan_mode/prompt.py
@@ -96,11 +96,12 @@
 
 ## Output Format
 
-Use the plan tools to build the plan:
-1. First create campaigns (top-level + phase sub-campaigns)
-2. Then create plan items within each phase
-3. Set dependencies between items
-4. Present the full plan for review with propose_plan
+Use the plan tools to build the plan. When the campaign/phase/task structure is
+known, prefer `create_structured_plan` so the campaign, phases, typed tasks,
+dependencies, specs, references, and plan_context are created in one deliberate
+operation. Use the lower-level tools (`create_campaign`, `create_plan_item`,
+`link_plan_items`, `propose_plan`) for incremental edits, small additions, or
+follow-up revisions.
 
 When creating plan items, attach both:
 - `spec` for executable or measurable parameters.
diff --git a/gently/harness/plan_mode/tools/planning.py b/gently/harness/plan_mode/tools/planning.py
index 5a695f6b..4ef5463a 100644
--- a/gently/harness/plan_mode/tools/planning.py
+++ b/gently/harness/plan_mode/tools/planning.py
@@ -17,6 +17,55 @@
 # Campaign / Phase Management
 # ---------------------------------------------------------------------------
 
+
+def _normalize_shorthand_year(shorthand: Optional[str]) -> Optional[str]:
+    """Rewrite a trailing "-20YY" suffix to the current year; pass falsy values through."""
+    import re
+    from datetime import datetime
+
+    if not shorthand:
+        return shorthand
+    return re.sub(r"-20\d{2}$", f"-{datetime.now().year}", shorthand)
+
+
+def _normalize_plan_item_type(value: Optional[str]) -> str:
+    """Return the canonical PlanItemType value for a task class name.
+
+    Raises ValueError when the name is neither a known type nor an alias.
+    """
+    from gently.harness.memory.model import PlanItemType
+
+    decision_aliases = ("decision", "decision-point", "decision point")
+    normalized = (value or "").strip().lower()
+    if normalized in decision_aliases:
+        normalized = "decision_point"
+    valid = {member.value for member in PlanItemType}
+    if normalized not in valid:
+        raise ValueError(f"type must be one of {sorted(valid)}")
+    return normalized
+
+
+def _phase_key(entry: Dict, index: int) -> str:
+    """Local phase key: first truthy of key, id, name; else the given index."""
+    return str(next(filter(None, map(entry.get, ("key", "id", "name"))), index))
+
+
+def _item_key(entry: Dict, index: int) -> str:
+    """Local item key: first truthy of key, id, title; else the given index."""
+    return str(next(filter(None, map(entry.get, ("key", "id", "title"))), index))
+
+
+def _lookup_local_id(ref: str, item_ids: Dict[str, str]) -> Optional[str]:
+    """Resolve a dependency reference against item keys or existing IDs.
+
+    Unmatched refs come back as strings (None if empty) for the caller to resolve.
+    """
+    ref_s = str(ref)
+    for candidate in (ref, ref_s, ref_s[1:] if ref_s.startswith("$") else ref_s):
+        if candidate in item_ids:
+            return item_ids[candidate]
+    return ref_s or None
+
 @tool(
     name="create_campaign",
     description=(
@@ -50,11 +99,7 @@ async def create_campaign(
         return "Error: Context store not available"
 
     # Fix year in shorthand — models sometimes hallucinate the wrong year
-    if shorthand:
-        import re
-        from datetime import datetime
-        current_year = str(datetime.now().year)
-        shorthand = re.sub(r'-20\d{2}$', f'-{current_year}', shorthand)
+    shorthand = _normalize_shorthand_year(shorthand)
 
     store = agent.context_store
     cid = store.create_campaign(
@@ -68,6 +113,191 @@ async def create_campaign(
     return f"Created campaign '{description}' (id: {cid})"
 
 
+@tool(
+    name="create_structured_plan",
+    description=(
+        "Create a complete campaign/phase/task plan in one operation. Use this "
+        "when the plan shape is known and you would otherwise call "
+        "create_campaign, create_plan_item, link_plan_items, and propose_plan "
+        "many times. Phases are a list of objects with description and optional "
+        "key/shorthand/target. Items are objects with key, phase, type or "
+        "task_class, title, description, spec, plan_context, references, "
+        "estimated_days, and depends_on. Dependency refs may point to item keys."
+    ),
+    category=ToolCategory.UTILITY,
+    examples=[
+        ToolExample(
+            user_query="Create a two-phase F-drive focus-finding plan",
+            tool_input={
+                "description": "Safe DiSPIM F-drive focus finding",
+                "shorthand": "fdrive-focus-2026",
+                "target": "Find embryos, calibrate safely, and decide whether timelapse can start",
+                "phases": [
+                    {"key": "setup", "description": "Setup and calibration"},
+                    {"key": "decision", "description": "Timelapse readiness decision"},
+                ],
+                "items": [
+                    {
+                        "key": "find",
+                        "phase": "setup",
+                        "type": "imaging",
+                        "title": "Locate embryos in XY",
+                        "spec": {"sample_prep": "poly-lysine slide"},
+                        "plan_context": {
+                            "technical": "Use bottom overview camera for XY finding.",
+                            "constraints": ["Do not approach glass before focus safety is confirmed"],
+                        },
+                    },
+                    {
+                        "key": "go",
+                        "phase": "decision",
+                        "task_class": "decision_point",
+                        "title": "Decide whether to start timelapse",
+                        "depends_on": ["find"],
+                    },
+                ],
+            },
+        ),
+    ],
+)
+async def create_structured_plan(
+    description: str,
+    shorthand: str = None,
+    target: str = None,
+    phases: List[Dict] = None,
+    items: List[Dict] = None,
+    present_plan: bool = True,
+    context: Dict = None,
+) -> str:
+    """Create a full campaign/phase/task hierarchy from a single outline.
+
+    This reduces tool-call chatter during plan synthesis while preserving the
+    same Campaign and PlanItem records used by the rest of plan mode.
+
+    Phases are validated before any record is written; item and dependency
+    problems are collected and reported as warnings in the result. Returns a
+    text summary, or an "Error: ..." string when nothing was created.
+    """
+    agent = context.get("agent") if context else None
+    if not agent or not hasattr(agent, "context_store") or not agent.context_store:
+        return "Error: Context store not available"
+
+    store = agent.context_store
+    phases = phases or []
+    items = items or []
+
+    # Validate phases first so a bad entry cannot leave a half-built campaign.
+    phase_descriptions: List[str] = []
+    for idx, phase in enumerate(phases, 1):
+        if not isinstance(phase, dict):
+            return f"Error: phase {idx} must be an object"
+        phase_description = (phase.get("description") or phase.get("title") or "").strip()
+        if not phase_description:
+            return f"Error: phase {idx} needs a description"
+        phase_descriptions.append(phase_description)
+
+    campaign_id = store.create_campaign(
+        description=description,
+        shorthand=_normalize_shorthand_year(shorthand),
+        target=target,
+    )
+    campaign = store.get_campaign(campaign_id)
+
+    phase_ids: Dict[str, str] = {}
+    for idx, (phase, phase_description) in enumerate(zip(phases, phase_descriptions), 1):
+        phase_id = store.create_campaign(
+            description=phase_description,
+            shorthand=_normalize_shorthand_year(phase.get("shorthand")),
+            target=phase.get("target"),
+            parent_id=campaign_id,
+        )
+        phase_ids[_phase_key(phase, idx)] = phase_id
+        # The positional alias only fills a free slot, so explicit keys win.
+        phase_ids.setdefault(str(idx), phase_id)
+
+    item_ids: Dict[str, str] = {}
+    pending_dependencies: List[tuple] = []  # (item_id, title, dependency refs)
+    errors: List[str] = []
+
+    for idx, item in enumerate(items, 1):
+        if not isinstance(item, dict):
+            errors.append(f"item {idx} must be an object")
+            continue
+        title = (item.get("title") or "").strip()
+        if not title:
+            errors.append(f"item {idx} needs a title")
+            continue
+
+        try:
+            item_type = _normalize_plan_item_type(
+                item.get("type") or item.get("task_class") or item.get("class")
+            )
+            phase_order_raw = item.get("phase_order", -1)
+            phase_order = -1 if phase_order_raw is None else int(phase_order_raw)
+        except (TypeError, ValueError) as exc:
+            errors.append(f"{title}: {exc}")
+            continue
+
+        phase_ref = item.get("phase")
+        target_campaign_id = campaign_id
+        if phase_ref is not None:
+            target_campaign_id = phase_ids.get(str(phase_ref))
+            if not target_campaign_id:
+                errors.append(f"{title}: unknown phase reference {phase_ref!r}")
+                continue
+
+        item_id = store.create_plan_item(
+            campaign_id=target_campaign_id,
+            type=item_type,
+            title=title,
+            description=item.get("description"),
+            spec=item.get("spec"),
+            inherit_from=item.get("inherit_from"),
+            phase_order=phase_order,
+            references=item.get("references"),
+            estimated_days=item.get("estimated_days"),
+            plan_context=item.get("plan_context"),
+        )
+        item_ids[_item_key(item, idx)] = item_id
+        item_ids.setdefault(str(idx), item_id)
+        depends_raw = item.get("depends_on") or []
+        depends = [depends_raw] if isinstance(depends_raw, (str, int)) else list(depends_raw)
+        pending_dependencies.append((item_id, title, depends))
+
+    linked = 0
+    local_item_ids = set(item_ids.values())
+    for item_id, title, refs in pending_dependencies:
+        for ref in refs:
+            dep_id = _lookup_local_id(str(ref), item_ids)
+            if dep_id and dep_id not in local_item_ids:
+                existing = store.resolve_plan_item(dep_id, campaign_id=campaign_id)
+                dep_id = existing.id if existing else None
+            if not dep_id:
+                errors.append(f"{title}: unknown dependency reference {ref!r}")
+                continue
+            try:
+                store.add_plan_item_dependency(item_id, dep_id)
+                linked += 1
+            except Exception as exc:
+                errors.append(f"{title}: could not link dependency {ref!r}: {exc}")
+
+    lines = [
+        f"Created structured plan '{description}' (id: {campaign_id})",
+        f"Phases: {len(phases)}",
+        f"Items: {len(pending_dependencies)}",
+        f"Dependencies: {linked}",
+    ]
+    if errors:
+        lines.append("Warnings:")
+        lines.extend(f"- {err}" for err in errors)
+    if present_plan and campaign:
+        lines.append("")
+        lines.append(_render_plan(campaign, store))
+    else:
+        lines.append("Call propose_plan to review the plan.")
+    return "\n".join(lines)
+
+
 # ---------------------------------------------------------------------------
 # Plan Item Management
 # ---------------------------------------------------------------------------
@@ -369,6 +599,12 @@ async def propose_plan(
         return f"Campaign '{campaign_id}' not found. Try a shorthand, name, or UUID."
     campaign_id = campaign.id
 
+    return _render_plan(campaign, store)
+
+
+def _render_plan(campaign, store) -> str:
+    """Render the full plan for review from an already resolved campaign."""
+    campaign_id = campaign.id
     lines = []
     lines.append(f"{'=' * 55}")
     lines.append(f" EXPERIMENTAL PLAN: {campaign.description}")
diff --git a/tests/test_structured_plan_tool.py b/tests/test_structured_plan_tool.py
new file mode 100644
index 00000000..5feef993
--- /dev/null
+++ b/tests/test_structured_plan_tool.py
@@ -0,0 +1,87 @@
+from types import SimpleNamespace
+
+import pytest
+
+from gently.harness.memory.model import PlanContext, PlanItemType
+from gently.harness.tools.registry import get_tool_registry
+
+# Import registers the plan-mode tools with the global registry.
+from gently.harness.plan_mode.tools import planning  # noqa: F401
+
+
+@pytest.mark.asyncio
+async def test_create_structured_plan_builds_phases_items_and_dependencies(context_store):
+    from datetime import datetime
+    agent = SimpleNamespace(context_store=context_store)
+    result = await get_tool_registry().execute(
+        "create_structured_plan",
+        {
+            "description": "Safe F-drive focus finding",
+            "shorthand": "fdrive-focus-2026",
+            "target": "Locate embryos and decide whether a safe timelapse can start",
+            "phases": [
+                {"key": "setup", "description": "Setup and calibration"},
+                {"key": "run", "description": "Timelapse readiness"},
+            ],
+            "items": [
+                {
+                    "key": "find",
+                    "phase": "setup",
+                    "type": "imaging",
+                    "title": "Locate embryos in XY",
+                    "spec": {
+                        "sample_prep": "poly-lysine slide",
+                        "num_embryos": 3,
+                    },
+                    "plan_context": {
+                        "technical": "Bottom overview camera locates embryos in XY.",
+                        "experimental": "Ryan and Brie verify positions before focus approach.",
+                        "theoretical": "Embryos remain comparable after mounting.",
+                        "conceptual": "Keep microscope support visible to the biologist.",
+                        "constraints": ["Avoid overtravel toward glass"],
+                    },
+                },
+                {
+                    "key": "calibrate",
+                    "phase": "setup",
+                    "type": "imaging",
+                    "title": "Calibrate galvo-piezo per embryo",
+                    "depends_on": ["find"],
+                },
+                {
+                    "key": "decide",
+                    "phase": "run",
+                    "task_class": "decision_point",
+                    "title": "Decide whether to start timelapse",
+                    "depends_on": ["calibrate"],
+                },
+            ],
+        },
+        {"agent": agent},
+    )
+    # The tool rewrites the shorthand's trailing year to the current year.
+    campaign = context_store.resolve_campaign(f"fdrive-focus-{datetime.now().year}")
+    assert campaign is not None
+    phases = context_store.get_subcampaigns(campaign.id)
+    assert [phase.description for phase in phases] == [
+        "Setup and calibration",
+        "Timelapse readiness",
+    ]
+
+    setup_items = context_store.get_plan_items(campaign_id=phases[0].id)
+    run_items = context_store.get_plan_items(campaign_id=phases[1].id)
+    assert [item.title for item in setup_items] == [
+        "Locate embryos in XY",
+        "Calibrate galvo-piezo per embryo",
+    ]
+    assert run_items[0].type == PlanItemType.DECISION_POINT
+    assert setup_items[0].type == PlanItemType.IMAGING
+    assert isinstance(setup_items[0].plan_context, PlanContext)
+    assert setup_items[0].plan_context.constraints == ["Avoid overtravel toward glass"]
+
+    assert context_store.get_plan_item_dependencies(setup_items[1].id) == [setup_items[0].id]
+    assert context_store.get_plan_item_dependencies(run_items[0].id) == [setup_items[1].id]
+    assert "Created structured plan" in result
+    assert "Phases: 2" in result
+    assert "Items: 3" in result
+    assert "EXPERIMENTAL PLAN" in result

From 9f9ebe1abd115a91b5cdb7ef6753a033cd83b313 Mon Sep 17 00:00:00 2001
From: Johnson <johnsonc5@hhmi.org>
Date: Mon, 1 Jun 2026 01:43:01 -0400
Subject: [PATCH 4/4] Add offline structured plan replay benchmark

---
 benchmarks/runner.py                 |  39 ++++++
 benchmarks/structured_plan_replay.py | 181 +++++++++++++++++++++++++++
 docs/planning-context-hierarchy.md   |  20 +++
 tests/test_structured_plan_replay.py |  27 ++++
 4 files changed, 267 insertions(+)
 create mode 100644 benchmarks/structured_plan_replay.py
 create mode 100644 tests/test_structured_plan_replay.py

diff --git a/benchmarks/runner.py b/benchmarks/runner.py
index 4d6d847a..fa03eeab 100644
--- a/benchmarks/runner.py
+++ b/benchmarks/runner.py
@@ -64,6 +64,35 @@ async def run_agent_benchmark(args):
     return 0
 
 
+async def run_structured_plan_benchmark(args):
+    """Run the structured-plan replay, log the result, and return 0 on pass else 1."""
+    from .structured_plan_replay import run_structured_plan_replay
+
+    payload = None
+    if args.workdir:
+        payload = await run_structured_plan_replay(Path(args.workdir))
+    if payload is None:
+        import tempfile
+        with tempfile.TemporaryDirectory(prefix="gently-structured-plan-") as tmp:
+            payload = await run_structured_plan_replay(Path(tmp))
+
+    banner = "=" * 60
+    for message in (banner, "STRUCTURED PLAN REPLAY", banner):
+        logger.info(message)
+    logger.info("Passed: %s", payload["passed"])
+    logger.info("Elapsed: %.3f ms", payload["elapsed_ms"])
+    logger.info("Expected: %s", payload["expected_counts"])
+    logger.info("Actual:   %s", payload["actual_counts"])
+
+    if args.output:
+        report_path = Path(args.output)
+        report_path.parent.mkdir(parents=True, exist_ok=True)
+        report_path.write_text(json.dumps(payload, indent=2), encoding="utf-8")
+        logger.info("Wrote report: %s", report_path)
+
+    return 0 if payload["passed"] else 1
+
+
 def compare_reports(args):
     """Compare two benchmark reports"""
     from .agent.evaluator import BenchmarkReport, compare_reports as _compare
@@ -132,10 +161,20 @@ def main():
     compare_parser.add_argument("before", help="Before report JSON")
     compare_parser.add_argument("after", help="After report JSON")
 
+    # Structured plan replay
+    plan_parser = subparsers.add_parser(
+        "structured-plan",
+        help="Run the offline structured-plan replay benchmark",
+    )
+    plan_parser.add_argument("--workdir", help="Directory for the temporary ContextStore database")
+    plan_parser.add_argument("--output", help="Output file for the JSON report")
+
     args = parser.parse_args()
 
     if args.command == "agent":
         return asyncio.run(run_agent_benchmark(args))
+    elif args.command == "structured-plan":
+        return asyncio.run(run_structured_plan_benchmark(args))
     elif args.command == "compare":
         return compare_reports(args)
     else:
diff --git a/benchmarks/structured_plan_replay.py b/benchmarks/structured_plan_replay.py
new file mode 100644
index 00000000..934f7984
--- /dev/null
+++ b/benchmarks/structured_plan_replay.py
@@ -0,0 +1,181 @@
+"""Offline replay benchmark for structured plan generation.
+
+This benchmark exercises the `create_structured_plan` tool against a fresh
+ContextStore. It is intentionally deterministic: no LLM, browser session, or
+microscope connection is required.
+"""
+
+from __future__ import annotations
+
+import argparse
+import asyncio
+import json
+import tempfile
+import time
+from pathlib import Path
+from types import SimpleNamespace
+from typing import Any, Dict, Mapping, Optional
+
+from gently.harness.memory.store import ContextStore
+from gently.harness.tools.registry import get_tool_registry
+
+# Import registers the plan-mode tools with the global registry.
+from gently.harness.plan_mode.tools import planning  # noqa: F401
+
+
+DEFAULT_PLAN_PAYLOAD: Dict[str, Any] = {  # default arguments replayed into create_structured_plan
+    "description": "Safe F-drive focus finding",
+    "shorthand": "fdrive-focus-2026",  # the replay later resolves the campaign by this value
+    "target": "Locate embryos and decide whether a safe timelapse can start",
+    "phases": [  # two phases; the replay expects a matching stored phase count
+        {"key": "setup", "description": "Setup and calibration"},
+        {"key": "run", "description": "Timelapse readiness"},
+    ],
+    "items": [  # three items; only the last two declare depends_on
+        {
+            "key": "find",
+            "phase": "setup",
+            "type": "imaging",
+            "title": "Locate embryos in XY",
+            "spec": {
+                "sample_prep": "poly-lysine slide",
+                "num_embryos": 3,
+            },
+            "plan_context": {
+                "technical": "Bottom overview camera locates embryos in XY.",
+                "experimental": "Ryan and Brie verify positions before focus approach.",
+                "theoretical": "Embryos remain comparable after mounting.",
+                "conceptual": "Keep microscope support visible to the biologist.",
+                "constraints": ["Avoid overtravel toward glass"],
+            },
+        },
+        {
+            "key": "calibrate",
+            "phase": "setup",
+            "type": "imaging",
+            "title": "Calibrate galvo-piezo per embryo",
+            "depends_on": ["find"],  # first of the two dependencies the replay expects
+        },
+        {
+            "key": "decide",
+            "phase": "run",
+            "task_class": "decision_point",  # NOTE(review): other items use "type" here - confirm intended
+            "title": "Decide whether to start timelapse",
+            "depends_on": ["calibrate"],  # second of the two dependencies the replay expects
+        },
+    ],
+}
+
+
+def _count_plan_records(store: ContextStore, campaign_id: str) -> Dict[str, int]:
+    """Tally stored phases, plan items, and dependencies for one campaign.
+
+    Items are read from the campaign itself and from every subcampaign the
+    store reports; the ``campaigns`` entry is always 1.
+    """
+    subcampaigns = store.get_subcampaigns(campaign_id)
+    collected, edge_total = [], 0
+    for scope in [campaign_id, *(sub.id for sub in subcampaigns)]:
+        batch = store.get_plan_items(campaign_id=scope)
+        collected += batch
+        edge_total += sum(len(store.get_plan_item_dependencies(rec.id)) for rec in batch)
+    return {
+        "campaigns": 1,
+        "phases": len(subcampaigns),
+        "items": len(collected),
+        "dependencies": edge_total,
+    }
+
+
+def _expected_dependency_count(items: list[Mapping[str, Any]]) -> int:
+    """Return how many dependency edges the payload items declare."""
+    # A missing or empty ``depends_on`` contributes nothing; a bare string
+    # names exactly one prerequisite, anything else is sized with ``len``.
+    declared = (entry.get("depends_on") or [] for entry in items)
+    return sum(1 if isinstance(deps, str) else len(deps) for deps in declared)
+
+
+async def run_structured_plan_replay(
+    workdir: Path,
+    payload: Optional[Mapping[str, Any]] = None,
+) -> Dict[str, Any]:
+    """Run the deterministic structured-plan replay and return a JSON report.
+
+    ``workdir`` is created if missing; a falsy ``payload`` selects
+    ``DEFAULT_PLAN_PAYLOAD``. ``passed`` requires a resolvable campaign, a
+    non-``Error:`` tool result, and stored counts equal to the declared ones.
+    """
+    import copy
+
+    workdir = Path(workdir)
+    workdir.mkdir(parents=True, exist_ok=True)
+    # Deep copy: the tool must not mutate the caller's payload or the shared default.
+    plan_payload = copy.deepcopy(dict(payload or DEFAULT_PLAN_PAYLOAD))
+    store = ContextStore(workdir / "structured_plan_replay.db")
+    agent = SimpleNamespace(context_store=store)
+    started = time.perf_counter()
+    try:
+        result = await get_tool_registry().execute(
+            "create_structured_plan", plan_payload, {"agent": agent}
+        )
+        elapsed_ms = round((time.perf_counter() - started) * 1000, 3)
+        campaign = store.resolve_campaign(str(plan_payload["shorthand"]))
+        if campaign is None:
+            counts = {"campaigns": 0, "phases": 0, "items": 0, "dependencies": 0}
+        else:
+            counts = _count_plan_records(store, campaign.id)
+        declared_items = plan_payload.get("items") or []
+        expected_counts = {
+            "campaigns": 1,
+            "phases": len(plan_payload.get("phases") or []),
+            "items": len(declared_items),
+            "dependencies": _expected_dependency_count(declared_items),
+        }
+        tool_ok = not str(result).startswith("Error:")
+        passed = campaign is not None and tool_ok and counts == expected_counts
+        return {
+            "benchmark": "structured_plan_replay",
+            "passed": passed,
+            "elapsed_ms": elapsed_ms,
+            "tool_calls": [
+                {
+                    "name": "create_structured_plan",
+                    "phases": expected_counts["phases"],
+                    "items": expected_counts["items"],
+                    "dependencies": expected_counts["dependencies"],
+                }
+            ],
+            "expected_counts": expected_counts,
+            "actual_counts": counts,
+            "campaign_id": campaign.id if campaign else None,
+            "tool_result_excerpt": str(result).splitlines()[:8],
+        }
+    finally:
+        store.close()
+
+
+async def _run_cli(args: argparse.Namespace) -> int:
+    """Run the replay for parsed CLI ``args``; return 0 if it passed, else 1."""
+    if not args.workdir:
+        with tempfile.TemporaryDirectory(prefix="gently-structured-plan-") as scratch:
+            report = await run_structured_plan_replay(Path(scratch))
+    else:
+        report = await run_structured_plan_replay(Path(args.workdir))
+    rendered = json.dumps(report, indent=2)
+    if args.output:
+        destination = Path(args.output)
+        destination.parent.mkdir(parents=True, exist_ok=True)
+        destination.write_text(rendered, encoding="utf-8")
+    print(rendered)
+    return int(not report["passed"])
+
+
+def main() -> int:  # process exit status: 0 when the replay passed, else 1
+    cli = argparse.ArgumentParser(description="Run the structured plan replay benchmark")
+    cli.add_argument("--workdir", help="Directory for the temporary ContextStore database")
+    cli.add_argument("--output", help="Optional JSON report path")
+    return asyncio.run(_run_cli(cli.parse_args()))
+
+
+if __name__ == "__main__":  # executed as a script or via ``python -m``
+    raise SystemExit(main())  # hand main()'s return value to the interpreter as the exit code
diff --git a/docs/planning-context-hierarchy.md b/docs/planning-context-hierarchy.md
index b257e571..9e8632a8 100644
--- a/docs/planning-context-hierarchy.md
+++ b/docs/planning-context-hierarchy.md
@@ -98,3 +98,23 @@ This keeps campaign -> phase -> task construction explicit while reducing the
 tool-call chatter that occurs when the agent creates each campaign, item, and
 dependency separately. The lower-level tools remain available for incremental
 edits after the initial plan outline exists.
+
+## Offline Replay Benchmark
+
+The structured-plan path has a deterministic replay benchmark:
+
+```bash
+python -m benchmarks.runner structured-plan
+```
+
+or directly:
+
+```bash
+python -m benchmarks.structured_plan_replay
+```
+
+The replay creates a fresh local `ContextStore`, executes one
+`create_structured_plan` tool call, and verifies the expected campaign, two
+phases, three plan items, and two dependencies. It does not call an LLM, browser,
+or microscope server, so it can be used as a regression check for the planning
+tool shape even before a full agent benchmark exists.
diff --git a/tests/test_structured_plan_replay.py b/tests/test_structured_plan_replay.py
new file mode 100644
index 00000000..724d69b6
--- /dev/null
+++ b/tests/test_structured_plan_replay.py
@@ -0,0 +1,27 @@
+import pytest
+
+from benchmarks.structured_plan_replay import run_structured_plan_replay
+
+
+@pytest.mark.asyncio
+async def test_structured_plan_replay_scores_created_plan(tmp_path):
+    """Replaying the default payload yields one campaign, 2 phases, 3 items, 2 deps."""
+    outcome = await run_structured_plan_replay(tmp_path)
+    # Per-call totals the default payload is expected to produce.
+    tallies = {
+        "phases": 2,
+        "items": 3,
+        "dependencies": 2,
+    }
+
+    # Identity and verdict of the report.
+    assert outcome["benchmark"] == "structured_plan_replay"
+    assert outcome["passed"] is True
+
+    # The single recorded tool call carries the declared totals.
+    assert outcome["tool_calls"] == [{"name": "create_structured_plan", **tallies}]
+
+    # Stored records agree with the declaration.
+    assert outcome["expected_counts"] == outcome["actual_counts"]
+    assert outcome["actual_counts"] == {"campaigns": 1, **tallies}
+    assert any("Created structured plan" in line for line in outcome["tool_result_excerpt"])