From 4b6c32179dcb4cc01f1aa257c33ab90d90eabcfa Mon Sep 17 00:00:00 2001 From: ceej640 <42260127+Ceej640@users.noreply.github.com> Date: Sun, 31 May 2026 20:23:40 -0400 Subject: [PATCH 1/4] Shape plan mode around microscope context hierarchy --- docs/planning-context-hierarchy.md | 84 +++++++++++++++ gently/harness/memory/_plans.py | 76 +++++++++---- gently/harness/memory/file_store.py | 49 +++++++++ gently/harness/memory/model.py | 19 ++++ gently/harness/memory/store.py | 4 + gently/harness/plan_mode/prompt.py | 37 +++++++ gently/harness/plan_mode/tools/planning.py | 76 ++++++++++++- gently/harness/plan_mode/tools/validation.py | 107 +++++++++++++++++++ tests/test_context_store.py | 68 ++++++++++++ tests/test_plan_context_validation.py | 61 +++++++++++ 10 files changed, 559 insertions(+), 22 deletions(-) create mode 100644 docs/planning-context-hierarchy.md create mode 100644 tests/test_plan_context_validation.py diff --git a/docs/planning-context-hierarchy.md b/docs/planning-context-hierarchy.md new file mode 100644 index 00000000..57c48038 --- /dev/null +++ b/docs/planning-context-hierarchy.md @@ -0,0 +1,84 @@ +# Planning Context Hierarchy + +This note maps the smart-microscopy context hierarchy into Gently plan mode. +It responds to the PR #23 discussion about making the planning layer the +biologist's primary way to interact with a DiSPIM, an organism, and later other +experimental modalities. + +## Source Framework + +Kesavan and Nordenfelt describe smart microscopy as a shift from passive data +collection toward active scientific collaboration. Their framework highlights +hierarchical context integration across four levels: technical, experimental, +theoretical, and conceptual context. + +Reference: +P. S. Kesavan and P. Nordenfelt, "From observation to understanding: A +multi-agent framework for smart microscopy," Journal of Microscopy, 2026, +doi: 10.1111/jmi.70063. + +Preprint: +https://arxiv.org/abs/2505.20466 + +## Gently Plan Item Mapping + +Every important plan item can now carry `plan_context` alongside its executable +or measurable `spec`. + +`technical` +: Instrument, sample, calibration, dataflow, and safety state. For the DiSPIM + embryo workflow this includes bottom-overview XY finding, F/head-axis + alignment, stage Z/head approach, `calibration_tools.py`, galvo-piezo + calibration, detector state, and timelapse settings. + +`experimental` +: Operator workflow, sample prep, controls, and user constraints. For the + immediate Ryan/Brie workflow this means locate embryos, align the F/head axis, + confirm calibration per embryo coordinate, then decide on timelapse. + +`theoretical` +: The biological model, developmental process, mechanism, or measurement + hypothesis that gives the image data meaning. + +`conceptual` +: The higher-level scientific or human-instrument objective. This is the layer + where the biologist should be able to say what they are trying to understand + without first phrasing it as device commands. + +Additional fields: +- `sample_entity`: the organism, embryo, tissue, region, or other entity being + acted on or observed. +- `operator_context`: who is operating, calibrating, approving, or using the + plan and what they need from the system. +- `constraints`: safety, timing, phototoxicity, calibration, or workflow + constraints that must stay visible during planning. +- `success_question`: the question that should be answerable if the item works. + +## Example + +```json +{ + "technical": "Embryos found in XY with the bottom overview camera; F/head axis aligned; calibration confirmed before timelapse.", + "experimental": "Ryan or Brie prepares poly-lysine-mounted C. elegans embryos, checks focus approach, and approves the first timelapse.", + "theoretical": "Embryo developmental timing and morphology should remain interpretable across the planned imaging window.", + "conceptual": "Let the biologist plan at the embryo-development level while Gently keeps the DiSPIM operations explicit.", + "sample_entity": "C. elegans embryos on a poly-lysine-coated glass slide", + "operator_context": "Ryan/Brie immediate DiSPIM users; Gently may automate calibration steps only after safety assumptions are explicit.", + "constraints": [ + "Confirm calibration for each embryo coordinate before timelapse", + "State F-drive/head-axis focus-finding assumptions before lowering toward sample focus", + "Avoid overtravel beyond embryo focus toward the glass slide" + ], + "success_question": "Can the plan acquire reliable embryo timelapse data without hiding calibration or focus-safety assumptions?" +} +``` + +## Validation Behavior + +Plan validation now warns, without blocking execution, when imaging items do not +carry the four context layers. It also warns when a DiSPIM embryo timelapse plan +does not state calibration and F-drive/head-axis focus-safety assumptions in the +technical context or constraints. + +The warnings are deliberate: they keep the existing planning system usable while +making the missing planning structure visible for iteration. diff --git a/gently/harness/memory/_plans.py b/gently/harness/memory/_plans.py index 0a872e2f..8bb95ab0 100644 --- a/gently/harness/memory/_plans.py +++ b/gently/harness/memory/_plans.py @@ -8,12 +8,14 @@ import json import logging import sqlite3 +import dataclasses as _dc from datetime import datetime from typing import Any, Dict, List, Optional from .model import ( BenchSpec, ImagingSpec, + PlanContext, PlanItem, PlanItemStatus, PlanItemType, @@ -22,6 +24,46 @@ logger = logging.getLogger(__name__) +def _dataclass_to_dict(obj) -> Dict: + """Serialize a dataclass, omitting empty fields.""" + data: Dict[str, Any] = {} + for f in _dc.fields(obj): + val = getattr(obj, f.name) + if val is None: + continue + if isinstance(val, (list, dict)) and not val: + continue + data[f.name] = val + return data + + +def _coerce_plan_context(data) -> Optional[PlanContext]: + """Build a PlanContext from stored dict data while ignoring unknown keys.""" + if not data: + return None + if isinstance(data, PlanContext): + return data + if not isinstance(data, dict): + return None + valid = {f.name for f in _dc.fields(PlanContext)} + kwargs = {k: v for k, v in data.items() if k in valid} + constraints = kwargs.get("constraints") + if constraints is None: + kwargs["constraints"] = [] + elif isinstance(constraints, str): + kwargs["constraints"] = [constraints] + elif not isinstance(constraints, list): + kwargs["constraints"] = list(constraints) + return PlanContext(**kwargs) + + +def _serialize_plan_context(data) -> Optional[Dict]: + context = _coerce_plan_context(data) + if not context: + return None + return _dataclass_to_dict(context) + + class PlansMixin: """Plan items, templates, snapshots, and dependency management.""" @@ -43,6 +85,7 @@ def create_plan_item( item_id: Optional[str] = None, references: Optional[List[Dict]] = None, estimated_days: Optional[int] = None, + plan_context: Optional[Dict] = None, ) -> str: """Create a plan item. Returns its ID. @@ -64,12 +107,13 @@ def create_plan_item( with self._tx(): self._conn.execute( "INSERT INTO plan_items " - "(id, campaign_id, type, title, description, spec, inherit_from, " + "(id, campaign_id, type, title, description, spec, plan_context, inherit_from, " " planned_session_id, estimated_days, phase_order, \"references\", status, created_at, updated_at) " - "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, 'planned', ?, ?)", + "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, 'planned', ?, ?)", ( pid, campaign_id, type, title, description, json.dumps(spec) if spec else None, + json.dumps(_serialize_plan_context(plan_context)) if plan_context else None, inherit_from, planned_session_id, estimated_days, phase_order, json.dumps(references) if references else None, now, now, @@ -307,6 +351,7 @@ def update_plan_item( campaign_id: Optional[str] = None, references: Optional[List[Dict]] = None, estimated_days: Optional[int] = None, + plan_context: Optional[Dict] = None, ): """Update a plan item. Only non-None values are applied.""" now = self._now() @@ -336,6 +381,9 @@ def update_plan_item( if references is not None: updates.append("\"references\" = ?") values.append(json.dumps(references)) + if plan_context is not None: + updates.append("plan_context = ?") + values.append(json.dumps(_serialize_plan_context(plan_context)) if plan_context else None) if not updates: return updates.append("updated_at = ?") @@ -589,21 +637,9 @@ def _serialize_campaign_tree(self, campaign_id: str) -> Dict: } # Serialize spec if item.imaging_spec: - import dataclasses as _dc - spec_dict = {} - for f in _dc.fields(item.imaging_spec): - val = getattr(item.imaging_spec, f.name) - if val is not None: - spec_dict[f.name] = val - item_data["spec"] = spec_dict + item_data["spec"] = _dataclass_to_dict(item.imaging_spec) elif item.bench_spec: - import dataclasses as _dc - spec_dict = {} - for f in _dc.fields(item.bench_spec): - val = getattr(item.bench_spec, f.name) - if val is not None: - spec_dict[f.name] = val - item_data["spec"] = spec_dict + item_data["spec"] = _dataclass_to_dict(item.bench_spec) # Dependencies as relative indices within this campaign's items if item.depends_on: @@ -616,6 +652,8 @@ def _serialize_campaign_tree(self, campaign_id: str) -> Dict: if item.references: item_data["references"] = item.references + if item.plan_context: + item_data["plan_context"] = _dataclass_to_dict(item.plan_context) serialized_items.append(item_data) @@ -711,6 +749,7 @@ def _instantiate_template_tree( spec=spec, phase_order=item_data.get("phase_order", -1), references=item_data.get("references"), + plan_context=item_data.get("plan_context"), ) new_item_ids.append(item_id) @@ -931,19 +970,19 @@ def _row_to_plan_item(self, row: sqlite3.Row) -> PlanItem: if spec_data: if item_type == PlanItemType.IMAGING: - import dataclasses as _dc valid = {f.name for f in _dc.fields(ImagingSpec)} imaging_spec = ImagingSpec(**{ k: v for k, v in spec_data.items() if k in valid }) else: - import dataclasses as _dc valid = {f.name for f in _dc.fields(BenchSpec)} bench_spec = BenchSpec(**{ k: v for k, v in spec_data.items() if k in valid }) references = json.loads(d["references"]) if d.get("references") else [] + plan_context_data = json.loads(d["plan_context"]) if d.get("plan_context") else None + plan_context = _coerce_plan_context(plan_context_data) return PlanItem( id=item_id, @@ -957,6 +996,7 @@ def _row_to_plan_item(self, row: sqlite3.Row) -> PlanItem: claimed_by=d.get("claimed_by"), claimed_by_hostname=d.get("claimed_by_hostname"), references=references, + plan_context=plan_context, imaging_spec=imaging_spec, bench_spec=bench_spec, planned_session_id=d.get("planned_session_id"), diff --git a/gently/harness/memory/file_store.py b/gently/harness/memory/file_store.py index f8f797bb..ad7c530c 100644 --- a/gently/harness/memory/file_store.py +++ b/gently/harness/memory/file_store.py @@ -59,6 +59,7 @@ Observation, PlannedSession, PlannedSessionStatus, + PlanContext, PlanItem, PlanItemStatus, PlanItemType, @@ -1097,6 +1098,7 @@ def create_plan_item( item_id: Optional[str] = None, references: Optional[List[Dict]] = None, estimated_days: Optional[int] = None, + plan_context: Optional[Dict] = None, ) -> str: pid = item_id or self._gen_id() now = self._now() @@ -1125,6 +1127,7 @@ def create_plan_item( "estimated_days": estimated_days, "phase_order": phase_order, "references": references, + "plan_context": self._serialise_plan_context(plan_context), "depends_on": depends_on or [], "claimed_by": None, "claimed_by_hostname": None, @@ -1277,6 +1280,7 @@ def update_plan_item( campaign_id: Optional[str] = None, references: Optional[List[Dict]] = None, estimated_days: Optional[int] = None, + plan_context: Optional[Dict] = None, ): loc = self._find_plan_item_location(item_id) if not loc: @@ -1302,6 +1306,8 @@ def update_plan_item( item["phase_order"] = phase_order if references is not None: item["references"] = references + if plan_context is not None: + item["plan_context"] = self._serialise_plan_context(plan_context) if campaign_id is not None and campaign_id != old_campaign_id: # Move item to a different campaign items.pop(idx) @@ -1545,6 +1551,8 @@ def _serialize_campaign_tree(self, campaign_id: str) -> Dict: if item.references: item_data["references"] = item.references + if item.plan_context: + item_data["plan_context"] = self._dataclass_to_sparse_dict(item.plan_context) serialized_items.append(item_data) @@ -1643,6 +1651,7 @@ def _instantiate_template_tree( spec=spec, phase_order=item_data.get("phase_order", -1), references=item_data.get("references"), + plan_context=item_data.get("plan_context"), ) new_item_ids.append(item_id) @@ -2448,6 +2457,44 @@ def _dict_to_planned_session(d: Dict) -> PlannedSession: updated_at=datetime.fromisoformat(d["updated_at"]) if isinstance(d.get("updated_at"), str) else d.get("updated_at", datetime.now()), ) + @staticmethod + def _dataclass_to_sparse_dict(obj) -> Dict: + data: Dict[str, Any] = {} + for f in dataclasses.fields(obj): + val = getattr(obj, f.name) + if val is None: + continue + if isinstance(val, (list, dict)) and not val: + continue + data[f.name] = val + return data + + @staticmethod + def _dict_to_plan_context(data) -> Optional[PlanContext]: + if not data: + return None + if isinstance(data, PlanContext): + return data + if not isinstance(data, dict): + return None + valid = {f.name for f in dataclasses.fields(PlanContext)} + kwargs = {k: v for k, v in data.items() if k in valid} + constraints = kwargs.get("constraints") + if constraints is None: + kwargs["constraints"] = [] + elif isinstance(constraints, str): + kwargs["constraints"] = [constraints] + elif not isinstance(constraints, list): + kwargs["constraints"] = list(constraints) + return PlanContext(**kwargs) + + @staticmethod + def _serialise_plan_context(data) -> Optional[Dict]: + context = FileContextStore._dict_to_plan_context(data) + if not context: + return None + return FileContextStore._dataclass_to_sparse_dict(context) + @staticmethod def _dict_to_plan_item(d: Dict) -> PlanItem: item_type = PlanItemType(d["type"]) @@ -2468,6 +2515,7 @@ def _dict_to_plan_item(d: Dict) -> PlanItem: }) references = d.get("references") or [] + plan_context = FileContextStore._dict_to_plan_context(d.get("plan_context")) return PlanItem( id=d["id"], @@ -2481,6 +2529,7 @@ def _dict_to_plan_item(d: Dict) -> PlanItem: claimed_by=d.get("claimed_by"), claimed_by_hostname=d.get("claimed_by_hostname"), references=references, + plan_context=plan_context, imaging_spec=imaging_spec, bench_spec=bench_spec, planned_session_id=d.get("planned_session_id"), diff --git a/gently/harness/memory/model.py b/gently/harness/memory/model.py index d9dc9a4e..80cc9bb1 100644 --- a/gently/harness/memory/model.py +++ b/gently/harness/memory/model.py @@ -240,6 +240,24 @@ class BenchSpec: notes: Optional[str] = None +@dataclass +class PlanContext: + """ + Multi-level microscope thought context for a plan item. + + Based on the technical, experimental, theoretical, and conceptual + hierarchy used to connect microscope operations to biological reasoning. + """ + technical: Optional[str] = None + experimental: Optional[str] = None + theoretical: Optional[str] = None + conceptual: Optional[str] = None + sample_entity: Optional[str] = None + operator_context: Optional[str] = None + constraints: List[str] = field(default_factory=list) + success_question: Optional[str] = None + + @dataclass class PlanItem: """ @@ -263,6 +281,7 @@ class PlanItem: claimed_by: Optional[str] = None # instance_id of claiming node claimed_by_hostname: Optional[str] = None # human-readable hostname references: List[Dict[str, str]] = field(default_factory=list) # Source citations + plan_context: Optional[PlanContext] = None # Microscope thought hierarchy # Specifications (type-dependent) imaging_spec: Optional[ImagingSpec] = None diff --git a/gently/harness/memory/store.py b/gently/harness/memory/store.py index f3ae128f..91b14c75 100644 --- a/gently/harness/memory/store.py +++ b/gently/harness/memory/store.py @@ -184,6 +184,7 @@ status TEXT DEFAULT 'planned', outcome TEXT, spec TEXT, + plan_context TEXT, inherit_from TEXT, planned_session_id TEXT, session_id TEXT, @@ -298,6 +299,9 @@ def _migrate(self, conn: sqlite3.Connection): if "references" not in pi_cols: conn.execute('ALTER TABLE plan_items ADD COLUMN "references" TEXT') logger.info("Migration: added 'references' column to plan_items") + if "plan_context" not in pi_cols: + conn.execute("ALTER TABLE plan_items ADD COLUMN plan_context TEXT") + logger.info("Migration: added 'plan_context' column to plan_items") # Mesh campaign coordination columns camp_cols = {row[1] for row in conn.execute("PRAGMA table_info(campaigns)").fetchall()} diff --git a/gently/harness/plan_mode/prompt.py b/gently/harness/plan_mode/prompt.py index 19ea52b7..d7904afe 100644 --- a/gently/harness/plan_mode/prompt.py +++ b/gently/harness/plan_mode/prompt.py @@ -37,6 +37,38 @@ - **Decision points**: Gates between phases where results determine next steps - **Dependencies**: What must complete before something else can start - **Success criteria**: How to know if each step worked +- **Microscope thought context**: For each important task, record the + technical, experimental, theoretical, and conceptual context that connects + microscope operations to the scientific intent. + +## Microscope Thought Context + +Use the hierarchy from Kesavan and Nordenfelt's smart microscopy framework: + +- **Technical**: Instrument, sample, calibration, dataflow, and safety state. + For DiSPIM embryo work this includes bottom-overview XY finding, F/head-axis + alignment, stage Z/head approach, galvo-piezo calibration, piezo/galvo + settings, detector state, and timelapse acquisition. +- **Experimental**: Operator workflow, sample prep, control structure, user + constraints, and the current point in the run. For the immediate DiSPIM work, + account for Ryan and Brie locating embryos, aligning F/head axis, calibrating + per embryo, and then deciding whether to start timelapse. +- **Theoretical**: The biological model, mechanism, pathway, developmental + stage, or measurement hypothesis that makes the observations meaningful. +- **Conceptual**: The higher-level goal for the interaction between biologist, + microscope, and sample environment. + +When you create or revise plan items, use the `plan_context` parameter in +addition to `spec`. It accepts `technical`, `experimental`, `theoretical`, +`conceptual`, `sample_entity`, `operator_context`, `constraints`, and +`success_question`. + +For DiSPIM C. elegans embryo timelapse plans, make calibration state explicit: +bottom overview locates embryos in XY, the operator or Gently aligns the +F/head-axis approach, calibration is confirmed for the embryo coordinates, and +only then should the plan choose timelapse settings. If a plan lowers the SPIM +head toward sample focus, include F-drive/head-axis focus-finding and glass-slide +safety assumptions in `plan_context.technical` or `plan_context.constraints`. ## When Proposing Imaging Sessions @@ -70,6 +102,11 @@ 3. Set dependencies between items 4. Present the full plan for review with propose_plan +When creating plan items, attach both: +- `spec` for executable or measurable parameters. +- `plan_context` for the four-layer planning context that a biologist can use + as the primary interaction layer over the microscope and sample. + IMPORTANT: ALWAYS use ask_user_choice when asking the researcher questions. Never present options as text lists. diff --git a/gently/harness/plan_mode/tools/planning.py b/gently/harness/plan_mode/tools/planning.py index 2c33d43c..5a695f6b 100644 --- a/gently/harness/plan_mode/tools/planning.py +++ b/gently/harness/plan_mode/tools/planning.py @@ -84,7 +84,10 @@ async def create_campaign( "Use estimated_days to indicate how many days this task takes " "(e.g. 1 for a quick imaging session, 14 for strain expansion). " "Use references to cite literature, databases, or other sources " - "(each with source, citation, and optional id/note)." + "(each with source, citation, and optional id/note). " + "Use plan_context to capture the microscope thought hierarchy: " + "technical, experimental, theoretical, and conceptual context, plus " + "sample_entity, operator_context, constraints, and success_question." ), category=ToolCategory.UTILITY, examples=[ @@ -120,6 +123,7 @@ async def create_plan_item( phase_order: int = -1, references: List[Dict] = None, estimated_days: int = None, + plan_context: Dict = None, context: Dict = None, ) -> str: """Create a plan item within a campaign/phase. @@ -155,6 +159,7 @@ async def create_plan_item( phase_order=phase_order, references=references, estimated_days=estimated_days, + plan_context=plan_context, ) # Include the human-friendly task number in the response @@ -191,8 +196,9 @@ async def create_plan_item( name="update_plan_item", description=( "Update an existing plan item — change status, title, description, " - "outcome, spec, or references. Use this to mark items as completed, " - "skipped, update imaging specifications, or attach source citations." + "outcome, spec, plan_context, or references. Use this to mark items " + "as completed, skipped, update imaging specifications, attach source " + "citations, or refine the microscope thought hierarchy." ), category=ToolCategory.UTILITY, ) @@ -205,6 +211,7 @@ async def update_plan_item( spec: Dict = None, references: List[Dict] = None, estimated_days: int = None, + plan_context: Dict = None, campaign_id: str = None, context: Dict = None, ) -> str: @@ -235,6 +242,7 @@ async def update_plan_item( spec=spec, references=references, estimated_days=estimated_days, + plan_context=plan_context, ) changes = [] if status: @@ -247,6 +255,8 @@ async def update_plan_item( changes.append(f"title -> {title}") if references: changes.append(f"{len(references)} references attached") + if plan_context is not None: + changes.append("plan context updated") return f"Updated plan item '{item.title}' ({resolved_id}): {', '.join(changes) or 'updated'}" @@ -412,6 +422,31 @@ async def propose_plan( return "\n".join(lines) +def _plan_context_rows(plan_context, indent: str = " ") -> List[str]: + """Format microscope thought hierarchy fields for text output.""" + if not plan_context: + return [] + + rows = [f"{indent}Microscope thought context:"] + for attr, label in [ + ("technical", "Technical"), + ("experimental", "Experimental"), + ("theoretical", "Theoretical"), + ("conceptual", "Conceptual"), + ("sample_entity", "Sample entity"), + ("operator_context", "Operator context"), + ("success_question", "Success question"), + ]: + val = getattr(plan_context, attr, None) + if val: + rows.append(f"{indent} {label}: {val}") + + constraints = getattr(plan_context, "constraints", None) or [] + if constraints: + rows.append(f"{indent} Constraints: {', '.join(constraints)}") + return rows + + def _format_plan_item(item, store, task_num: str = "") -> str: """Format a single plan item for display.""" from gently.harness.memory.model import PlanItemStatus @@ -473,6 +508,8 @@ def _format_plan_item(item, store, task_num: str = "") -> str: if spec.success_criteria: details.append(f" Criteria: {spec.success_criteria}") + details.extend(_plan_context_rows(item.plan_context)) + if item.depends_on: dep_items = [store.get_plan_item(d) for d in item.depends_on] dep_names = [d.title for d in dep_items if d] @@ -1131,6 +1168,32 @@ def _export_date() -> str: return datetime.now().strftime("%Y-%m-%d") +def _export_plan_context(plan_context) -> List[str]: + if not plan_context: + return [] + + lines = ["**Microscope Thought Context:**"] + for attr, label in [ + ("technical", "Technical"), + ("experimental", "Experimental"), + ("theoretical", "Theoretical"), + ("conceptual", "Conceptual"), + ("sample_entity", "Sample entity"), + ("operator_context", "Operator context"), + ("success_question", "Success question"), + ]: + val = getattr(plan_context, attr, None) + if val: + lines.append(f"- {label}: {val}") + + constraints = getattr(plan_context, "constraints", None) or [] + if constraints: + lines.append(f"- Constraints: {', '.join(constraints)}") + + lines.append("") + return lines + + def _export_item(item, store, num: str) -> List[str]: """Format a plan item for the export document.""" from gently.harness.memory.model import PlanItemStatus @@ -1197,6 +1260,8 @@ def _export_item(item, store, num: str) -> List[str]: lines.append(f"- Success criteria: {bs.success_criteria}") lines.append("") + lines.extend(_export_plan_context(item.plan_context)) + if item.depends_on: dep_items = [store.get_plan_item(d) for d in item.depends_on] dep_names = [d.title for d in dep_items if d] @@ -1231,7 +1296,7 @@ async def validate_plan_for_export(campaign_id: str, store) -> str: from .validation import ( HARDWARE_LIMITS, CONTROL_KEYWORDS, _check_dependency_cycles, _stage_order, _normalise_stage, - _get_temp_factor, STAGE_TIMING_20C, + _get_temp_factor, STAGE_TIMING_20C, _collect_context_warnings, ) items = store.get_plan_items(campaign_id=campaign_id, include_children=True) @@ -1270,6 +1335,9 @@ async def validate_plan_for_export(campaign_id: str, store) -> str: if hi is not None and val > hi: issues.append(f"- **Error:** {label} — {field_name}={val} exceeds max {hi}") + for warning in _collect_context_warnings(label, item, spec): + issues.append(f"- **Warning:** {warning}") + cycle_errors = _check_dependency_cycles(items) for cyc in cycle_errors: issues.append(f"- **Error:** {cyc}") diff --git a/gently/harness/plan_mode/tools/validation.py b/gently/harness/plan_mode/tools/validation.py index 8e09802a..f6c955f5 100644 --- a/gently/harness/plan_mode/tools/validation.py +++ b/gently/harness/plan_mode/tools/validation.py @@ -47,6 +47,21 @@ CONTROL_KEYWORDS = {"control", "wildtype", "n2", "wt", "wild-type", "wild type"} +CONTEXT_LAYERS = ("technical", "experimental", "theoretical", "conceptual") + +EMBRYO_CONTEXT_KEYWORDS = { + "embryo", "embryos", "egg", "eggs", "c. elegans", "celegans", +} + +TIMELAPSE_CONTEXT_KEYWORDS = { + "timelapse", "time-lapse", "time lapse", "longitudinal", "development", +} + +DISPIM_SAFETY_KEYWORDS = { + "calibration", "calibrate", "f-drive", "f drive", "head axis", + "focus", "glass", "slide", "poly-lysine", "poly lysine", +} + # --------------------------------------------------------------------------- # Helpers @@ -139,6 +154,96 @@ def _normalise_stage(name: str) -> Optional[str]: return aliases.get(low.replace(" ", "")) +def _missing_context_layers(item) -> List[str]: + """Return required thought-context layers that are missing.""" + plan_context = getattr(item, "plan_context", None) + if not plan_context: + return list(CONTEXT_LAYERS) + return [ + layer for layer in CONTEXT_LAYERS + if not getattr(plan_context, layer, None) + ] + + +def _plan_context_text(item) -> str: + plan_context = getattr(item, "plan_context", None) + if not plan_context: + return "" + parts = [] + for attr in ( + "technical", "experimental", "theoretical", "conceptual", + "sample_entity", "operator_context", "success_question", + ): + val = getattr(plan_context, attr, None) + if val: + parts.append(str(val)) + parts.extend(str(v) for v in getattr(plan_context, "constraints", []) or []) + return " ".join(parts).lower() + + +def _item_text(item, spec=None) -> str: + parts = [item.title, item.description, item.outcome] + if spec: + for attr in ( + "strain", "genotype", "reporter", "sample_prep", "target_window", + "start_stage", "stop_condition", "success_criteria", + "comparison_to", + ): + val = getattr(spec, attr, None) + if val: + parts.append(str(val)) + if spec.num_embryos: + parts.append("embryo") + if spec.interval_s or spec.adaptive_intervals: + parts.append("timelapse") + return " ".join(filter(None, parts)).lower() + + +def _needs_dispim_focus_context(item, spec=None) -> bool: + if item.type.value != "imaging" or not spec: + return False + text = _item_text(item, spec) + embryo_like = spec.num_embryos is not None or any( + kw in text for kw in EMBRYO_CONTEXT_KEYWORDS + ) + timelapse_like = ( + spec.interval_s is not None + or spec.adaptive_intervals is not None + or any(kw in text for kw in TIMELAPSE_CONTEXT_KEYWORDS) + ) + dispim_like = ( + spec.galvo_amplitude is not None + or spec.piezo_amplitude_um is not None + or "dispim" in text + or "spim" in text + ) + return embryo_like and timelapse_like and (dispim_like or spec.num_slices is not None) + + +def _has_dispim_focus_safety_context(item) -> bool: + context_text = _plan_context_text(item) + return any(kw in context_text for kw in DISPIM_SAFETY_KEYWORDS) + + +def _collect_context_warnings(label: str, item, spec=None) -> List[str]: + warnings: List[str] = [] + if item.type.value == "imaging": + missing_layers = _missing_context_layers(item) + if missing_layers: + warnings.append( + f"{label}: missing microscope thought context layers: " + f"{', '.join(missing_layers)}" + ) + + if _needs_dispim_focus_context(item, spec) and not _has_dispim_focus_safety_context(item): + warnings.append( + f"{label}: DiSPIM embryo timelapse should state calibration, " + "F-drive/head-axis focus finding, and glass-slide safety assumptions " + "in plan_context.technical or plan_context.constraints before acquisition." + ) + return warnings + + # --------------------------------------------------------------------------- # Tool # --------------------------------------------------------------------------- @@ -282,6 +387,8 @@ async def validate_plan( f"{label}: missing imaging spec fields: {', '.join(missing_fields)}" ) + warnings.extend(_collect_context_warnings(label, item, spec)) + # ------------------------------------------------------------------ # Plan-level checks # ------------------------------------------------------------------ diff --git a/tests/test_context_store.py b/tests/test_context_store.py index 5bb8931d..fe0ccf4f 100644 --- a/tests/test_context_store.py +++ b/tests/test_context_store.py @@ -10,6 +10,7 @@ Confidence, Learning, Observation, + PlanContext, PlanItemStatus, Significance, ) @@ -216,6 +217,73 @@ def test_list_plan_items_by_type(self, context_store): imaging_items = context_store.get_plan_items(campaign_id=cid, type="imaging") assert len(imaging_items) == 2 + def test_plan_item_preserves_microscope_thought_context(self, context_store): + cid = context_store.create_campaign(description="C1") + item_id = context_store.create_plan_item( + campaign_id=cid, + type="imaging", + title="Embryo timelapse", + plan_context={ + "technical": "Bottom overview XY, F/head-axis alignment, calibration first.", + "experimental": "Ryan/Brie locate embryos and approve timelapse.", + "theoretical": "Developmental timing remains interpretable.", + "conceptual": "Biologist plans at the embryo-development level.", + "sample_entity": "C. elegans embryo", + "operator_context": "Immediate DiSPIM users Ryan and Brie", + "constraints": ["Avoid head-axis overtravel near glass"], + "success_question": "Can focus and calibration stay explicit?", + }, + ) + + item = context_store.get_plan_item(item_id) + assert isinstance(item.plan_context, PlanContext) + assert item.plan_context.technical.startswith("Bottom overview") + assert item.plan_context.constraints == ["Avoid head-axis overtravel near glass"] + + context_store.update_plan_item( + item_id, + plan_context={ + "technical": "Calibration confirmed before timelapse.", + "experimental": "Operator approves first acquisition.", + "theoretical": "Reporter dynamics remain measurable.", + "conceptual": "Planning layer governs microscope interaction.", + }, + ) + updated = context_store.get_plan_item(item_id) + assert updated.plan_context.technical == "Calibration confirmed before timelapse." + assert updated.plan_context.conceptual == "Planning layer governs microscope interaction." + + template_id = context_store.save_plan_template( + "context-template", + "Plan context template", + cid, + ) + new_cid = context_store.apply_plan_template(template_id) + cloned = context_store.get_plan_items(campaign_id=new_cid)[0] + assert cloned.plan_context.theoretical == "Reporter dynamics remain measurable." + + +class TestFileContextPlanItems: + def test_file_context_store_preserves_microscope_thought_context(self, file_context_store): + cid = file_context_store.create_campaign(description="C1") + item_id = file_context_store.create_plan_item( + campaign_id=cid, + type="imaging", + title="Embryo timelapse", + plan_context={ + "technical": "F-drive focus finding and calibration before timelapse.", + "experimental": "Ryan/Brie workflow.", + "theoretical": "Developmental timing remains interpretable.", + "conceptual": "Biologist plans at embryo level.", + "constraints": ["Avoid overtravel toward glass"], + }, + ) + + item = file_context_store.get_plan_item(item_id) + assert isinstance(item.plan_context, PlanContext) + assert item.plan_context.technical.startswith("F-drive focus") + assert item.plan_context.constraints == ["Avoid overtravel toward glass"] + class TestObservations: def test_add_and_retrieve(self, context_store): diff --git a/tests/test_plan_context_validation.py b/tests/test_plan_context_validation.py new file mode 100644 index 00000000..4ea8c066 --- /dev/null +++ b/tests/test_plan_context_validation.py @@ -0,0 +1,61 @@ +from gently.harness.memory.model import ( + ImagingSpec, + PlanContext, + PlanItem, + PlanItemType, +) +from gently.harness.plan_mode.tools.validation import _collect_context_warnings + + +def _embryo_timelapse_item(plan_context=None): + return PlanItem( + id="item-1", + campaign_id="campaign-1", + type=PlanItemType.IMAGING, + title="DiSPIM embryo timelapse", + imaging_spec=ImagingSpec( + num_slices=80, + interval_s=120, + num_embryos=4, + sample_prep="Embryos on poly-lysine-coated glass slide", + ), + plan_context=plan_context, + ) + + +def test_dispim_embryo_timelapse_warns_without_focus_safety_context(): + item = _embryo_timelapse_item() + warnings = _collect_context_warnings( + "[imaging] 'DiSPIM embryo timelapse'", + item, + item.imaging_spec, + ) + + assert any("missing microscope thought context layers" in w for w in warnings) + assert any("F-drive/head-axis focus finding" in w for w in warnings) + + +def test_dispim_embryo_timelapse_accepts_explicit_focus_safety_context(): + item = _embryo_timelapse_item( + PlanContext( + technical=( + "Bottom overview XY, F-drive focus finding, " + "and calibration before timelapse." + ), + experimental="Ryan/Brie align F/head axis before acquisition.", + theoretical="Developmental progression remains interpretable.", + conceptual="Biologist plans at embryo level while Gently manages DiSPIM details.", + constraints=[ + "Confirm calibration before lowering the SPIM head toward focus", + "Avoid overtravel beyond embryo focus toward the glass slide", + ], + ) + ) + + warnings = _collect_context_warnings( + "[imaging] 'DiSPIM embryo timelapse'", + item, + item.imaging_spec, + ) + + assert warnings == [] From aa79f0f038c5336399f6fc2bcac285bb8b2bf00f Mon Sep 17 00:00:00 2001 From: ceej640 <42260127+Ceej640@users.noreply.github.com> Date: Mon, 1 Jun 2026 00:16:10 -0400 Subject: [PATCH 2/4] Fix web template rendering with current Starlette --- gently/ui/web/routes/auth_routes.py | 6 +++++- gently/ui/web/routes/pages.py | 6 ++++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/gently/ui/web/routes/auth_routes.py b/gently/ui/web/routes/auth_routes.py index e7db5500..431aa9f0 100644 --- a/gently/ui/web/routes/auth_routes.py +++ b/gently/ui/web/routes/auth_routes.py @@ -31,7 +31,11 @@ async def login_page(request: Request): return RedirectResponse("/", status_code=302) if current_username(request): return RedirectResponse("/", status_code=302) - return server.templates.TemplateResponse("login.html", {"request": request}) + return server.templates.TemplateResponse( + request, + "login.html", + {"request": request}, + ) @router.post("/api/auth/login") async def login(request: Request): diff --git a/gently/ui/web/routes/pages.py b/gently/ui/web/routes/pages.py index 0cff7d82..65835ac4 100644 --- a/gently/ui/web/routes/pages.py +++ b/gently/ui/web/routes/pages.py @@ -16,8 +16,9 @@ async def index(request: Request): chat window's "Sign in" affordance), not a gate on the page itself. """ return server.templates.TemplateResponse( + request, "index.html", - {"request": request, "active_section": "embryos", "is_live": True} + {"request": request, "active_section": "embryos", "is_live": True}, ) # Standalone URLs redirect to SPA with hash fragment for tab routing @@ -37,8 +38,9 @@ async def plan_review_page(campaign_id: str): async def settings_page(request: Request): """Serve the dashboard settings page""" return server.templates.TemplateResponse( + request, "settings.html", - {"request": request} + {"request": request}, ) return router From 301bffb446927f56155d59537c40a465bc7d7783 Mon Sep 17 00:00:00 2001 From: Johnson Date: Mon, 1 Jun 2026 01:14:04 -0400 Subject: [PATCH 3/4] Add structured plan generation tool --- docs/planning-context-hierarchy.md | 16 ++ gently/harness/plan_mode/prompt.py | 11 +- gently/harness/plan_mode/tools/planning.py | 246 ++++++++++++++++++++- tests/test_structured_plan_tool.py | 87 ++++++++ 4 files changed, 350 insertions(+), 10 deletions(-) create mode 100644 tests/test_structured_plan_tool.py diff --git a/docs/planning-context-hierarchy.md b/docs/planning-context-hierarchy.md index 57c48038..b257e571 100644 --- a/docs/planning-context-hierarchy.md +++ b/docs/planning-context-hierarchy.md @@ -82,3 +82,19 @@ technical context or constraints. The warnings are deliberate: they keep the existing planning system usable while making the missing planning structure visible for iteration. + +## Structured Generation + +Plan mode also provides `create_structured_plan` for creating the full +campaign/phase/task hierarchy in one operation. It accepts: + +- root campaign description, shorthand, and target +- phase objects with local keys +- typed task objects using `type` or `task_class` +- local dependency references between task keys +- task `spec`, `references`, `estimated_days`, and `plan_context` + +This keeps campaign -> phase -> task construction explicit while reducing the +tool-call chatter that occurs when the agent creates each campaign, item, and +dependency separately. The lower-level tools remain available for incremental +edits after the initial plan outline exists. diff --git a/gently/harness/plan_mode/prompt.py b/gently/harness/plan_mode/prompt.py index d7904afe..8f71cf6e 100644 --- a/gently/harness/plan_mode/prompt.py +++ b/gently/harness/plan_mode/prompt.py @@ -96,11 +96,12 @@ ## Output Format -Use the plan tools to build the plan: -1. First create campaigns (top-level + phase sub-campaigns) -2. Then create plan items within each phase -3. Set dependencies between items -4. Present the full plan for review with propose_plan +Use the plan tools to build the plan. When the campaign/phase/task structure is +known, prefer `create_structured_plan` so the campaign, phases, typed tasks, +dependencies, specs, references, and plan_context are created in one deliberate +operation. Use the lower-level tools (`create_campaign`, `create_plan_item`, +`link_plan_items`, `propose_plan`) for incremental edits, small additions, or +follow-up revisions. When creating plan items, attach both: - `spec` for executable or measurable parameters. diff --git a/gently/harness/plan_mode/tools/planning.py b/gently/harness/plan_mode/tools/planning.py index 5a695f6b..4ef5463a 100644 --- a/gently/harness/plan_mode/tools/planning.py +++ b/gently/harness/plan_mode/tools/planning.py @@ -17,6 +17,55 @@ # Campaign / Phase Management # --------------------------------------------------------------------------- + +def _normalize_shorthand_year(shorthand: Optional[str]) -> Optional[str]: + """Keep generated campaign labels aligned with the current year.""" + if not shorthand: + return shorthand + import re + from datetime import datetime + current_year = str(datetime.now().year) + return re.sub(r"-20\d{2}$", f"-{current_year}", shorthand) + + +def _normalize_plan_item_type(value: Optional[str]) -> str: + """Validate and normalize plan item task classes.""" + from gently.harness.memory.model import PlanItemType + + item_type = (value or "").strip().lower() + aliases = { + "decision": "decision_point", + "decision-point": "decision_point", + "decision point": "decision_point", + } + item_type = aliases.get(item_type, item_type) + valid = {member.value for member in PlanItemType} + if item_type not in valid: + raise ValueError(f"type must be one of {sorted(valid)}") + return item_type + + +def _phase_key(entry: Dict, index: int) -> str: + """Return a stable local key for a phase entry.""" + return str(entry.get("key") or entry.get("id") or entry.get("name") or index) + + +def _item_key(entry: Dict, index: int) -> str: + """Return a stable local key for an item entry.""" + return str(entry.get("key") or entry.get("id") or entry.get("title") or index) + + +def _lookup_local_id(ref: str, item_ids: Dict[str, str]) -> Optional[str]: + """Resolve a dependency reference against item keys or existing IDs.""" + if ref in item_ids: + return item_ids[ref] + ref_s = str(ref) + if ref_s in item_ids: + return item_ids[ref_s] + if ref_s.startswith("$") and ref_s[1:] in item_ids: + return item_ids[ref_s[1:]] + return ref_s or None + @tool( name="create_campaign", description=( @@ -50,11 +99,7 @@ async def create_campaign( return "Error: Context store not available" # Fix year in shorthand — models sometimes hallucinate the wrong year - if shorthand: - import re - from datetime import datetime - current_year = str(datetime.now().year) - shorthand = re.sub(r'-20\d{2}$', f'-{current_year}', shorthand) + shorthand = _normalize_shorthand_year(shorthand) store = agent.context_store cid = store.create_campaign( @@ -68,6 +113,191 @@ async def create_campaign( return f"Created campaign '{description}' (id: {cid})" +@tool( + name="create_structured_plan", + description=( + "Create a complete campaign/phase/task plan in one operation. Use this " + "when the plan shape is known and you would otherwise call " + "create_campaign, create_plan_item, link_plan_items, and propose_plan " + "many times. Phases are a list of objects with description and optional " + "key/shorthand/target. Items are objects with key, phase, type or " + "task_class, title, description, spec, plan_context, references, " + "estimated_days, and depends_on. Dependency refs may point to item keys." + ), + category=ToolCategory.UTILITY, + examples=[ + ToolExample( + user_query="Create a two-phase F-drive focus-finding plan", + tool_input={ + "description": "Safe DiSPIM F-drive focus finding", + "shorthand": "fdrive-focus-2026", + "target": "Find embryos, calibrate safely, and decide whether timelapse can start", + "phases": [ + {"key": "setup", "description": "Setup and calibration"}, + {"key": "decision", "description": "Timelapse readiness decision"}, + ], + "items": [ + { + "key": "find", + "phase": "setup", + "type": "imaging", + "title": "Locate embryos in XY", + "spec": {"sample_prep": "poly-lysine slide"}, + "plan_context": { + "technical": "Use bottom overview camera for XY finding.", + "constraints": ["Do not approach glass before focus safety is confirmed"], + }, + }, + { + "key": "go", + "phase": "decision", + "task_class": "decision_point", + "title": "Decide whether to start timelapse", + "depends_on": ["find"], + }, + ], + }, + ), + ], +) +async def create_structured_plan( + description: str, + shorthand: str = None, + target: str = None, + phases: List[Dict] = None, + items: List[Dict] = None, + present_plan: bool = True, + context: Dict = None, +) -> str: + """Create a full campaign/phase/task hierarchy from a single outline. + + This reduces tool-call chatter during plan synthesis while preserving the + same Campaign and PlanItem records used by the rest of plan mode. + """ + agent = context.get("agent") if context else None + if not agent or not hasattr(agent, "context_store") or not agent.context_store: + return "Error: Context store not available" + + store = agent.context_store + phases = phases or [] + items = items or [] + + campaign_id = store.create_campaign( + description=description, + shorthand=_normalize_shorthand_year(shorthand), + target=target, + ) + campaign = store.get_campaign(campaign_id) + + phase_ids: Dict[str, str] = {} + phase_count = 0 + for idx, phase in enumerate(phases, 1): + if not isinstance(phase, dict): + return f"Error: phase {idx} must be an object" + phase_description = (phase.get("description") or phase.get("title") or "").strip() + if not phase_description: + return f"Error: phase {idx} needs a description" + phase_id = store.create_campaign( + description=phase_description, + shorthand=_normalize_shorthand_year(phase.get("shorthand")), + target=phase.get("target"), + parent_id=campaign_id, + ) + key = _phase_key(phase, idx) + phase_ids[key] = phase_id + phase_ids[str(idx)] = phase_id + phase_count += 1 + + item_ids: Dict[str, str] = {} + item_count = 0 + pending_dependencies: List[tuple[str, List[str]]] = [] + errors: List[str] = [] + + for idx, item in enumerate(items, 1): + if not isinstance(item, dict): + errors.append(f"item {idx} must be an object") + continue + + title = (item.get("title") or "").strip() + if not title: + errors.append(f"item {idx} needs a title") + continue + + try: + item_type = _normalize_plan_item_type( + item.get("type") or item.get("task_class") or item.get("class") + ) + except ValueError as exc: + errors.append(f"{title}: {exc}") + continue + + phase_ref = item.get("phase") + target_campaign_id = campaign_id + if phase_ref is not None: + target_campaign_id = phase_ids.get(str(phase_ref)) + if not target_campaign_id: + errors.append(f"{title}: unknown phase reference {phase_ref!r}") + continue + + phase_order_raw = item.get("phase_order", -1) + phase_order = -1 if phase_order_raw is None else int(phase_order_raw) + item_id = store.create_plan_item( + campaign_id=target_campaign_id, + type=item_type, + title=title, + description=item.get("description"), + spec=item.get("spec"), + inherit_from=item.get("inherit_from"), + phase_order=phase_order, + references=item.get("references"), + estimated_days=item.get("estimated_days"), + plan_context=item.get("plan_context"), + ) + key = _item_key(item, idx) + item_ids[key] = item_id + item_ids[str(idx)] = item_id + item_count += 1 + depends_raw = item.get("depends_on") or [] + depends = [depends_raw] if isinstance(depends_raw, str) else list(depends_raw) + pending_dependencies.append((item_id, depends)) + + linked = 0 + local_item_ids = set(item_ids.values()) + for item_id, refs in pending_dependencies: + for ref in refs: + dep_id = _lookup_local_id(str(ref), item_ids) + if dep_id and dep_id not in local_item_ids: + existing = store.resolve_plan_item(dep_id, campaign_id=campaign_id) + if not existing: + errors.append(f"{item_id}: unknown dependency reference {ref!r}") + continue + dep_id = existing.id + if not dep_id: + errors.append(f"{item_id}: unknown dependency reference {ref!r}") + continue + try: + store.add_plan_item_dependency(item_id, dep_id) + linked += 1 + except Exception as exc: + errors.append(f"{item_id}: could not link dependency {ref!r}: {exc}") + + lines = [ + f"Created structured plan '{description}' (id: {campaign_id})", + f"Phases: {phase_count}", + f"Items: {item_count}", + f"Dependencies: {linked}", + ] + if errors: + lines.append("Warnings:") + lines.extend(f"- {err}" for err in errors) + if present_plan and campaign: + lines.append("") + lines.append(_render_plan(campaign, store)) + else: + lines.append("Call propose_plan to review the plan.") + return "\n".join(lines) + + # --------------------------------------------------------------------------- # Plan Item Management # --------------------------------------------------------------------------- @@ -369,6 +599,12 @@ async def propose_plan( return f"Campaign '{campaign_id}' not found. Try a shorthand, name, or UUID." campaign_id = campaign.id + return _render_plan(campaign, store) + + +def _render_plan(campaign, store) -> str: + """Render the full plan for review from an already resolved campaign.""" + campaign_id = campaign.id lines = [] lines.append(f"{'=' * 55}") lines.append(f" EXPERIMENTAL PLAN: {campaign.description}") diff --git a/tests/test_structured_plan_tool.py b/tests/test_structured_plan_tool.py new file mode 100644 index 00000000..5feef993 --- /dev/null +++ b/tests/test_structured_plan_tool.py @@ -0,0 +1,87 @@ +from types import SimpleNamespace + +import pytest + +from gently.harness.memory.model import PlanContext, PlanItemType +from gently.harness.tools.registry import get_tool_registry + +# Import registers the plan-mode tools with the global registry. +from gently.harness.plan_mode.tools import planning # noqa: F401 + + +@pytest.mark.asyncio +async def test_create_structured_plan_builds_phases_items_and_dependencies(context_store): + agent = SimpleNamespace(context_store=context_store) + + result = await get_tool_registry().execute( + "create_structured_plan", + { + "description": "Safe F-drive focus finding", + "shorthand": "fdrive-focus-2026", + "target": "Locate embryos and decide whether a safe timelapse can start", + "phases": [ + {"key": "setup", "description": "Setup and calibration"}, + {"key": "run", "description": "Timelapse readiness"}, + ], + "items": [ + { + "key": "find", + "phase": "setup", + "type": "imaging", + "title": "Locate embryos in XY", + "spec": { + "sample_prep": "poly-lysine slide", + "num_embryos": 3, + }, + "plan_context": { + "technical": "Bottom overview camera locates embryos in XY.", + "experimental": "Ryan and Brie verify positions before focus approach.", + "theoretical": "Embryos remain comparable after mounting.", + "conceptual": "Keep microscope support visible to the biologist.", + "constraints": ["Avoid overtravel toward glass"], + }, + }, + { + "key": "calibrate", + "phase": "setup", + "type": "imaging", + "title": "Calibrate galvo-piezo per embryo", + "depends_on": ["find"], + }, + { + "key": "decide", + "phase": "run", + "task_class": "decision_point", + "title": "Decide whether to start timelapse", + "depends_on": ["calibrate"], + }, + ], + }, + {"agent": agent}, + ) + + campaign = context_store.resolve_campaign("fdrive-focus-2026") + assert campaign is not None + phases = context_store.get_subcampaigns(campaign.id) + assert [phase.description for phase in phases] == [ + "Setup and calibration", + "Timelapse readiness", + ] + + setup_items = context_store.get_plan_items(campaign_id=phases[0].id) + run_items = context_store.get_plan_items(campaign_id=phases[1].id) + assert [item.title for item in setup_items] == [ + "Locate embryos in XY", + "Calibrate galvo-piezo per embryo", + ] + assert run_items[0].type == PlanItemType.DECISION_POINT + assert setup_items[0].type == PlanItemType.IMAGING + assert isinstance(setup_items[0].plan_context, PlanContext) + assert setup_items[0].plan_context.constraints == ["Avoid overtravel toward glass"] + + assert context_store.get_plan_item_dependencies(setup_items[1].id) == [setup_items[0].id] + assert context_store.get_plan_item_dependencies(run_items[0].id) == [setup_items[1].id] + assert "Created structured plan" in result + assert "Phases: 2" in result + assert "Items: 3" in result + assert "EXPERIMENTAL PLAN" in result From 9f9ebe1abd115a91b5cdb7ef6753a033cd83b313 Mon Sep 17 00:00:00 2001 From: Johnson Date: Mon, 1 Jun 2026 01:43:01 -0400 Subject: [PATCH 4/4] Add offline structured plan replay benchmark --- benchmarks/runner.py | 39 ++++++ benchmarks/structured_plan_replay.py | 181 +++++++++++++++++++++++++++ docs/planning-context-hierarchy.md | 20 +++ tests/test_structured_plan_replay.py | 27 ++++ 4 files changed, 267 insertions(+) create mode 100644 benchmarks/structured_plan_replay.py create mode 100644 tests/test_structured_plan_replay.py diff --git a/benchmarks/runner.py b/benchmarks/runner.py index 4d6d847a..fa03eeab 100644 --- a/benchmarks/runner.py +++ b/benchmarks/runner.py @@ -64,6 +64,35 @@ async def run_agent_benchmark(args): return 0 +async def run_structured_plan_benchmark(args): + """Run the offline structured-plan replay benchmark.""" + from .structured_plan_replay import run_structured_plan_replay + + report = await run_structured_plan_replay(Path(args.workdir)) if args.workdir else None + if report is None: + import tempfile + + with tempfile.TemporaryDirectory(prefix="gently-structured-plan-") as tmp: + report = await run_structured_plan_replay(Path(tmp)) + + payload = report + logger.info("=" * 60) + logger.info("STRUCTURED PLAN REPLAY") + logger.info("=" * 60) + logger.info("Passed: %s", payload["passed"]) + logger.info("Elapsed: %.3f ms", payload["elapsed_ms"]) + logger.info("Expected: %s", payload["expected_counts"]) + logger.info("Actual: %s", payload["actual_counts"]) + + if args.output: + output = Path(args.output) + output.parent.mkdir(parents=True, exist_ok=True) + output.write_text(json.dumps(payload, indent=2), encoding="utf-8") + logger.info("Wrote report: %s", output) + + return 0 if payload["passed"] else 1 + + def compare_reports(args): """Compare two benchmark reports""" from .agent.evaluator import BenchmarkReport, compare_reports as _compare @@ -132,10 +161,20 @@ def main(): compare_parser.add_argument("before", help="Before report JSON") compare_parser.add_argument("after", help="After report JSON") + # Structured plan replay + plan_parser = subparsers.add_parser( + "structured-plan", + help="Run the offline structured-plan replay benchmark", + ) + plan_parser.add_argument("--workdir", help="Directory for the temporary ContextStore database") + plan_parser.add_argument("--output", help="Output file for the JSON report") + args = parser.parse_args() if args.command == "agent": return asyncio.run(run_agent_benchmark(args)) + elif args.command == "structured-plan": + return asyncio.run(run_structured_plan_benchmark(args)) elif args.command == "compare": return compare_reports(args) else: diff --git a/benchmarks/structured_plan_replay.py b/benchmarks/structured_plan_replay.py new file mode 100644 index 00000000..934f7984 --- /dev/null +++ b/benchmarks/structured_plan_replay.py @@ -0,0 +1,181 @@ +"""Offline replay benchmark for structured plan generation. + +This benchmark exercises the `create_structured_plan` tool against a fresh +ContextStore. It is intentionally deterministic: no LLM, browser session, or +microscope connection is required. +""" + +from __future__ import annotations + +import argparse +import asyncio +import json +import tempfile +import time +from pathlib import Path +from types import SimpleNamespace +from typing import Any, Dict, Mapping, Optional + +from gently.harness.memory.store import ContextStore +from gently.harness.tools.registry import get_tool_registry + +# Import registers the plan-mode tools with the global registry. +from gently.harness.plan_mode.tools import planning # noqa: F401 + + +DEFAULT_PLAN_PAYLOAD: Dict[str, Any] = { + "description": "Safe F-drive focus finding", + "shorthand": "fdrive-focus-2026", + "target": "Locate embryos and decide whether a safe timelapse can start", + "phases": [ + {"key": "setup", "description": "Setup and calibration"}, + {"key": "run", "description": "Timelapse readiness"}, + ], + "items": [ + { + "key": "find", + "phase": "setup", + "type": "imaging", + "title": "Locate embryos in XY", + "spec": { + "sample_prep": "poly-lysine slide", + "num_embryos": 3, + }, + "plan_context": { + "technical": "Bottom overview camera locates embryos in XY.", + "experimental": "Ryan and Brie verify positions before focus approach.", + "theoretical": "Embryos remain comparable after mounting.", + "conceptual": "Keep microscope support visible to the biologist.", + "constraints": ["Avoid overtravel toward glass"], + }, + }, + { + "key": "calibrate", + "phase": "setup", + "type": "imaging", + "title": "Calibrate galvo-piezo per embryo", + "depends_on": ["find"], + }, + { + "key": "decide", + "phase": "run", + "task_class": "decision_point", + "title": "Decide whether to start timelapse", + "depends_on": ["calibrate"], + }, + ], +} + + +def _count_plan_records(store: ContextStore, campaign_id: str) -> Dict[str, int]: + phases = store.get_subcampaigns(campaign_id) + campaign_ids = [campaign_id] + [phase.id for phase in phases] + + items = [] + dependency_count = 0 + for cid in campaign_ids: + campaign_items = store.get_plan_items(campaign_id=cid) + items.extend(campaign_items) + for item in campaign_items: + dependency_count += len(store.get_plan_item_dependencies(item.id)) + + return { + "campaigns": 1, + "phases": len(phases), + "items": len(items), + "dependencies": dependency_count, + } + + +def _expected_dependency_count(items: list[Mapping[str, Any]]) -> int: + count = 0 + for item in items: + raw = item.get("depends_on") or [] + count += 1 if isinstance(raw, str) else len(raw) + return count + + +async def run_structured_plan_replay( + workdir: Path, + payload: Optional[Mapping[str, Any]] = None, +) -> Dict[str, Any]: + """Run the deterministic structured-plan replay and return a JSON report.""" + workdir = Path(workdir) + workdir.mkdir(parents=True, exist_ok=True) + plan_payload = dict(payload or DEFAULT_PLAN_PAYLOAD) + + store = ContextStore(workdir / "structured_plan_replay.db") + agent = SimpleNamespace(context_store=store) + started = time.perf_counter() + try: + result = await get_tool_registry().execute( + "create_structured_plan", + plan_payload, + {"agent": agent}, + ) + elapsed_ms = round((time.perf_counter() - started) * 1000, 3) + + campaign = store.resolve_campaign(str(plan_payload["shorthand"])) + counts = ( + _count_plan_records(store, campaign.id) + if campaign is not None + else {"campaigns": 0, "phases": 0, "items": 0, "dependencies": 0} + ) + expected_counts = { + "campaigns": 1, + "phases": len(plan_payload.get("phases") or []), + "items": len(plan_payload.get("items") or []), + "dependencies": _expected_dependency_count(plan_payload.get("items") or []), + } + passed = ( + campaign is not None + and not str(result).startswith("Error:") + and counts == expected_counts + ) + + return { + "benchmark": "structured_plan_replay", + "passed": passed, + "elapsed_ms": elapsed_ms, + "tool_calls": [ + { + "name": "create_structured_plan", + "phases": expected_counts["phases"], + "items": expected_counts["items"], + "dependencies": expected_counts["dependencies"], + } + ], + "expected_counts": expected_counts, + "actual_counts": counts, + "campaign_id": campaign.id if campaign else None, + "tool_result_excerpt": str(result).splitlines()[:8], + } + finally: + store.close() + + +async def _run_cli(args: argparse.Namespace) -> int: + if args.workdir: + report = await run_structured_plan_replay(Path(args.workdir)) + else: + with tempfile.TemporaryDirectory(prefix="gently-structured-plan-") as tmp: + report = await run_structured_plan_replay(Path(tmp)) + + payload = json.dumps(report, indent=2) + if args.output: + output = Path(args.output) + output.parent.mkdir(parents=True, exist_ok=True) + output.write_text(payload, encoding="utf-8") + print(payload) + return 0 if report["passed"] else 1 + + +def main() -> int: + parser = argparse.ArgumentParser(description="Run the structured plan replay benchmark") + parser.add_argument("--workdir", help="Directory for the temporary ContextStore database") + parser.add_argument("--output", help="Optional JSON report path") + return asyncio.run(_run_cli(parser.parse_args())) + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/docs/planning-context-hierarchy.md b/docs/planning-context-hierarchy.md index b257e571..9e8632a8 100644 --- a/docs/planning-context-hierarchy.md +++ b/docs/planning-context-hierarchy.md @@ -98,3 +98,23 @@ This keeps campaign -> phase -> task construction explicit while reducing the tool-call chatter that occurs when the agent creates each campaign, item, and dependency separately. The lower-level tools remain available for incremental edits after the initial plan outline exists. + +## Offline Replay Benchmark + +The structured-plan path has a deterministic replay benchmark: + +```bash +python -m benchmarks.runner structured-plan +``` + +or directly: + +```bash +python -m benchmarks.structured_plan_replay +``` + +The replay creates a fresh local `ContextStore`, executes one +`create_structured_plan` tool call, and verifies the expected campaign, two +phases, three plan items, and two dependencies. It does not call an LLM, browser, +or microscope server, so it can be used as a regression check for the planning +tool shape even before a full agent benchmark exists. diff --git a/tests/test_structured_plan_replay.py b/tests/test_structured_plan_replay.py new file mode 100644 index 00000000..724d69b6 --- /dev/null +++ b/tests/test_structured_plan_replay.py @@ -0,0 +1,27 @@ +import pytest + +from benchmarks.structured_plan_replay import run_structured_plan_replay + + +@pytest.mark.asyncio +async def test_structured_plan_replay_scores_created_plan(tmp_path): + report = await run_structured_plan_replay(tmp_path) + + assert report["benchmark"] == "structured_plan_replay" + assert report["passed"] is True + assert report["tool_calls"] == [ + { + "name": "create_structured_plan", + "phases": 2, + "items": 3, + "dependencies": 2, + } + ] + assert report["expected_counts"] == report["actual_counts"] + assert report["actual_counts"] == { + "campaigns": 1, + "phases": 2, + "items": 3, + "dependencies": 2, + } + assert any("Created structured plan" in line for line in report["tool_result_excerpt"])