diff --git a/docs/trajectory-debugging.md b/docs/trajectory-debugging.md new file mode 100644 index 00000000..fb01b20d --- /dev/null +++ b/docs/trajectory-debugging.md @@ -0,0 +1,66 @@ +# Trajectory Debugging + +Gently already captures several useful session artifacts, including +`events.jsonl`, `decisions.jsonl`, `timeline.jsonl`, perception traces, and +interaction logs. The debug exporter packages those artifacts into a compact +context bundle for a coding agent. + +## Create a Bundle + +```shell +python -m gently.debug --session abc12345 --annotate "should query embryo position before moving" +``` + +Options: + +- `--root`: storage root, defaulting to `GENTLY_STORAGE_PATH` or `D:/Gently3`. +- `--output-dir`: explicit destination for the bundle. +- `--max-records`: number of transcript excerpt records to include. + +The command writes: + +- `debug_context.md`: prompt/context for a coding agent. +- `artifacts.json`: artifact inventory and source-file hints. +- `transcript_excerpt.jsonl`: compact tail records from event, decision, + timeline, and interaction logs. +- `profile_summary.json`: profiler span counts, duration by component, and + slowest spans when `profile.jsonl` or `profile_spans.jsonl` exists. +- `source_files.txt`: source files inferred from tool calls in the logs. + +## Profiler Span Format + +Runtime profilers can write append-only JSONL records to either `profile.jsonl` +or `profile_spans.jsonl` in the session directory. The exporter recognizes +records with these fields: + +- `timestamp` or `start_time` +- `component`, `subsystem`, `agent`, or `tool_name` +- `operation`, `name`, `tool_name`, or `event` +- `duration_ms`, `elapsed_ms`, `wall_ms`, or `duration_s` +- optional `status` or `outcome` + +The schema is deliberately permissive so LLM calls, tool calls, hardware queue +waits, perception steps, file I/O, and UI/WebSocket events can all be summarized +without forcing them into one runtime dependency. 
+ +Gently now records tool-call spans automatically when a running agent has an +active FileStore session. Those spans are appended to: + +```text +<session_dir>/profile_spans.jsonl +``` + +Set `GENTLY_PROFILE_PATH` to redirect spans to a specific JSONL file during +tests or custom launches. + +## Workflow + +1. Run or replay a Gently agent scenario until the behavior diverges from what was + expected. +2. Export the debug bundle with an annotation describing the expected behavior. +3. Give the bundle to a coding agent with access to the repo. +4. Ask for a root-cause analysis, a targeted fix, and an offline regression + test. + +The exporter does not require live hardware and does not copy large image or +volume payloads into the bundle. diff --git a/gently/debug/__init__.py b/gently/debug/__init__.py new file mode 100644 index 00000000..619404d4 --- /dev/null +++ b/gently/debug/__init__.py @@ -0,0 +1,9 @@ +"""Debug export helpers for trajectory-guided agent development.""" + +from .analyzer import DebugBundle, prepare_debug_context, resolve_session_dir + +__all__ = [ + "DebugBundle", + "prepare_debug_context", + "resolve_session_dir", +] diff --git a/gently/debug/__main__.py b/gently/debug/__main__.py new file mode 100644 index 00000000..fb0cedaf --- /dev/null +++ b/gently/debug/__main__.py @@ -0,0 +1,7 @@ +"""Command-line entry point for ``python -m gently.debug``.""" + +from .analyzer import main + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/gently/debug/analyzer.py b/gently/debug/analyzer.py new file mode 100644 index 00000000..4a0795dc --- /dev/null +++ b/gently/debug/analyzer.py @@ -0,0 +1,477 @@ +"""Prepare trajectory-debugging context for coding agents.""" + +from __future__ import annotations + +import argparse +import json +import os +import re +from dataclasses import asdict, dataclass, field +from datetime import datetime +from pathlib import Path +from typing import Any, Dict, Iterable, List, Mapping, Optional, Sequence, Set, Tuple + +
# Markdown preamble that opens every generated ``debug_context.md``.
PROMPT_TEMPLATE = Path(__file__).parent / "prompts" / "debugging_prompt.md"
# Captures the value of any ``name="..."`` / ``name='...'`` assignment; used to
# map tool names found in logs back to the module that declares them.
_TOOL_DECORATOR_RE = re.compile(r"name\s*=\s*['\"]([^'\"]+)['\"]")


@dataclass(frozen=True)
class ArtifactSummary:
    """Small summary of a session artifact included in a debug bundle."""

    kind: str  # logical artifact type, e.g. "events" or "perception_trace"
    path: str  # filesystem location, kept as a plain string for JSON output
    exists: bool  # False when the expected file was not found
    bytes: int = 0  # size on disk; stays 0 for missing artifacts
    lines: int = 0  # line count; stays 0 for missing or non-text artifacts


@dataclass(frozen=True)
class DebugBundle:
    """Paths and metadata for a generated debug export."""

    session_id: str
    session_dir: str
    output_dir: str
    annotation: Optional[str]
    artifacts: List[ArtifactSummary] = field(default_factory=list)
    source_files: List[str] = field(default_factory=list)
    profile_summary: Mapping[str, Any] = field(default_factory=dict)

    def to_dict(self) -> Dict[str, Any]:
        """Return a JSON-serializable ``dict`` view of the bundle."""
        # ``asdict`` already recurses into the nested ``ArtifactSummary``
        # entries, so only the mapping needs normalizing to a plain dict.
        data = asdict(self)
        data["profile_summary"] = dict(self.profile_summary)
        return data


def resolve_session_dir(session: str, root: Optional[Path] = None) -> Tuple[str, Path]:
    """Resolve a session id/prefix or direct path to a session directory.

    Args:
        session: Path of an existing session directory, a full session id, or
            a prefix of a session id.
        root: Storage root to search. Defaults to ``GENTLY_STORAGE_PATH`` and
            then to ``D:/Gently3``.

    Returns:
        ``(session_id, session_dir)``.

    Raises:
        ValueError: If ``session`` is blank, or if it is a prefix of several
            session ids and equal to none of them.
        FileNotFoundError: If no session matches or its directory is missing.
    """
    # ``Path("")`` is the current directory, so a blank argument would
    # otherwise "resolve" to the working directory with an empty session id.
    if not str(session).strip():
        raise ValueError("session must be a non-empty id, id prefix, or directory path")

    direct = Path(session)
    if direct.is_dir():
        return direct.name, direct

    # Imported lazily so that resolving a direct path works without FileStore.
    from gently.core.file_store import FileStore

    root_path = Path(root or os.environ.get("GENTLY_STORAGE_PATH", "D:/Gently3"))
    store = FileStore(root_path)
    known_ids = [str(item.get("session_id", "")) for item in store.list_sessions()]
    matches = [candidate for candidate in known_ids if candidate.startswith(session)]
    if not matches:
        raise FileNotFoundError(f"No session matching {session!r} under {root_path}")
    if session in matches:
        # A full id wins even when it is also a prefix of longer ids.
        matches = [session]
    elif len(matches) > 1:
        ids = ", ".join(matches)
        raise ValueError(f"Multiple sessions match {session!r}: {ids}")

    session_id = matches[0]
    session_dir = store._session_dir(session_id)
    if session_dir is None or not session_dir.exists():
        raise FileNotFoundError(f"Session directory not found for {session_id}")
    return session_id, session_dir
def prepare_debug_context(
    session: str,
    *,
    root: Optional[Path] = None,
    output_dir: Optional[Path] = None,
    annotation: Optional[str] = None,
    max_records: int = 80,
) -> DebugBundle:
    """Create a debug bundle for a session and return its metadata.

    Args:
        session: Session id, id prefix, or session directory path.
        root: Storage root; when omitted it is derived from the resolved
            session directory.
        output_dir: Destination directory. Defaults to
            ``<session_dir>/debug_exports/<YYYYmmdd_HHMMSS>``.
        annotation: Free-text note describing the expected behavior.
        max_records: Upper bound on transcript excerpt records.

    Returns:
        The ``DebugBundle`` describing what was written.

    Side effects:
        Creates ``output_dir`` and writes ``artifacts.json``,
        ``source_files.txt``, ``transcript_excerpt.jsonl``,
        ``profile_summary.json`` and ``debug_context.md`` into it.
    """
    session_id, session_dir = resolve_session_dir(session, root=root)

    if root is not None:
        root_path = Path(root)
    elif len(session_dir.parents) > 1:
        # NOTE(review): assumes session directories sit two levels below the
        # storage root -- confirm against the FileStore layout.
        root_path = session_dir.parents[1]
    else:
        root_path = session_dir.parent

    if output_dir is None:
        stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        output_dir = session_dir / "debug_exports" / stamp
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    artifacts = collect_artifacts(session_dir, root_path=root_path, session_id=session_id)
    source_files = infer_relevant_source_files(session_dir, artifacts)
    transcript_records = collect_transcript_excerpt(artifacts, max_records=max_records)
    profile_summary = summarize_profile_records(artifacts)

    bundle = DebugBundle(
        session_id=session_id,
        session_dir=str(session_dir),
        output_dir=str(output_dir),
        annotation=annotation,
        artifacts=artifacts,
        source_files=source_files,
        profile_summary=profile_summary,
    )

    (output_dir / "artifacts.json").write_text(
        json.dumps(bundle.to_dict(), indent=2),
        encoding="utf-8",
    )
    (output_dir / "source_files.txt").write_text(
        "\n".join(source_files) + ("\n" if source_files else ""),
        encoding="utf-8",
    )
    _write_jsonl(output_dir / "transcript_excerpt.jsonl", transcript_records)
    (output_dir / "profile_summary.json").write_text(
        json.dumps(profile_summary, indent=2),
        encoding="utf-8",
    )
    # Written last: the prompt references the files produced above.
    (output_dir / "debug_context.md").write_text(
        build_debug_prompt(bundle, transcript_records),
        encoding="utf-8",
    )
    return bundle


def collect_artifacts(
    session_dir: Path,
    *,
    root_path: Optional[Path] = None,
    session_id: Optional[str] = None,
    max_per_kind: int = 25,
) -> List[ArtifactSummary]:
    """Collect known session artifacts without reading large binary payloads.

    Args:
        session_dir: Directory holding the session's log files.
        root_path: Storage root; together with ``session_id`` it locates the
            per-session file under ``interaction_logs/``.
        session_id: Session identifier used for that lookup.
        max_per_kind: Cap on how many perception traces and how many
            prediction logs are listed (first N in sorted path order).

    Returns:
        One ``ArtifactSummary`` per candidate path, including missing ones.
    """
    fixed_names = [
        ("session", "session.yaml"),
        ("events", "events.jsonl"),
        ("decisions", "decisions.jsonl"),
        ("timeline", "timeline.jsonl"),
        ("profile", "profile.jsonl"),
        ("profile_spans", "profile_spans.jsonl"),
        ("interaction_log", "interaction_log.jsonl"),
    ]
    candidates = [(kind, session_dir / filename) for kind, filename in fixed_names]
    if root_path is not None and session_id:
        candidates.append(
            ("interaction_logger", root_path / "interaction_logs" / f"{session_id}.jsonl")
        )

    cap = max(0, max_per_kind)
    globbed_kinds = (
        ("perception_trace", "embryos/*/traces/*.json"),
        ("predictions", "embryos/*/predictions.jsonl"),
    )
    for kind, pattern in globbed_kinds:
        # Sorted so the capped selection is stable between runs.
        candidates.extend((kind, match) for match in sorted(session_dir.glob(pattern))[:cap])

    return [_summarize_artifact(kind, path) for kind, path in candidates]
def collect_transcript_excerpt(
    artifacts: Sequence[ArtifactSummary],
    *,
    max_records: int = 80,
) -> List[Dict[str, Any]]:
    """Read tail records from text/jsonl artifacts for compact debugging.

    The record budget is shared evenly between the transcript artifacts that
    actually exist, so missing logs do not shrink the excerpt.

    Args:
        artifacts: Artifact summaries for the session.
        max_records: Maximum number of records to return; values ``<= 0``
            yield an empty list.

    Returns:
        ``{"artifact": kind, "record": parsed_line}`` dicts in artifact order.
    """
    if max_records <= 0:
        # Without this guard ``records[-0:]`` would return every record.
        return []

    text_kinds = {"events", "decisions", "timeline", "interaction_log", "interaction_logger"}
    readable = [a for a in artifacts if a.exists and a.kind in text_kinds]
    if not readable:
        return []

    per_file = max(1, max_records // len(readable))
    records: List[Dict[str, Any]] = [
        {"artifact": artifact.kind, "record": record}
        for artifact in readable
        for record in _read_jsonl_tail(Path(artifact.path), per_file)
    ]
    # More files than budget still yields one record each; trim to the cap.
    return records[-max_records:]


def summarize_profile_records(
    artifacts: Sequence[ArtifactSummary],
    *,
    max_records: int = 1000,
    max_slowest: int = 10,
) -> Dict[str, Any]:
    """Summarize profiler span logs for the debug bundle.

    Args:
        artifacts: Artifact summaries; only existing ``profile`` and
            ``profile_spans`` entries are read.
        max_records: Tail length read from each profile file.
        max_slowest: Number of slowest spans to report; negative values are
            treated as 0.

    Returns:
        A dict with ``span_count``, ``duration_by_component_ms`` (rounded to
        3 decimals, sorted by component) and ``slowest_spans``.
    """
    profile_kinds = {"profile", "profile_spans"}
    spans: List[Dict[str, Any]] = []
    totals: Dict[str, float] = {}

    for artifact in artifacts:
        if not artifact.exists or artifact.kind not in profile_kinds:
            continue
        for record in _read_jsonl_tail(Path(artifact.path), max_records):
            if not isinstance(record, Mapping):
                continue
            # The schema is permissive: accept the first populated alias.
            component = str(
                record.get("component")
                or record.get("subsystem")
                or record.get("agent")
                or record.get("tool_name")
                or "unknown"
            )
            operation = str(
                record.get("operation")
                or record.get("name")
                or record.get("tool_name")
                or record.get("event")
                or "unknown"
            )
            duration_ms = _duration_ms(record)
            spans.append(
                {
                    "artifact": artifact.kind,
                    "timestamp": record.get("timestamp") or record.get("start_time"),
                    "component": component,
                    "operation": operation,
                    "duration_ms": duration_ms,
                    "status": record.get("status") or record.get("outcome"),
                }
            )
            if duration_ms is not None:
                totals[component] = totals.get(component, 0.0) + duration_ms

    timed = [span for span in spans if span["duration_ms"] is not None]
    timed.sort(key=lambda span: span["duration_ms"], reverse=True)
    return {
        "span_count": len(spans),
        "duration_by_component_ms": {
            component: round(total, 3) for component, total in sorted(totals.items())
        },
        # Clamp so a negative limit cannot slice from the wrong end.
        "slowest_spans": timed[: max(0, max_slowest)],
    }
def infer_relevant_source_files(
    session_dir: Path,
    artifacts: Sequence[ArtifactSummary],
    *,
    repo_root: Optional[Path] = None,
) -> List[str]:
    """Infer relevant source files from tool names found in session logs.

    Args:
        session_dir: Session directory; the presence of ``events.jsonl`` adds
            the event capture/replay modules.
        artifacts: Artifact summaries whose logs are scanned for tool names.
        repo_root: Repository root; defaults to two directories above this
            module's directory.

    Returns:
        Sorted, de-duplicated POSIX-style paths relative to ``repo_root``.
    """
    repo_root = repo_root or Path(__file__).resolve().parents[2]
    tool_names = extract_tool_names(artifacts)
    source_index = build_tool_source_index(repo_root)

    files: Set[str] = {
        source_index[name].relative_to(repo_root).as_posix()
        for name in tool_names
        if source_index.get(name)
    }
    if tool_names:
        # Any tool call also implicates the orchestration layer.
        files.update(
            (
                "gently/app/agent.py",
                "gently/harness/conversation.py",
                "gently/eval/decision_log.py",
            )
        )
    if (session_dir / "events.jsonl").exists():
        files.update(("gently/eval/event_capture.py", "gently/eval/event_replay.py"))

    return sorted(files)


def extract_tool_names(artifacts: Sequence[ArtifactSummary]) -> Set[str]:
    """Extract tool names from decision and interaction logs.

    Only the last 200 lines of each existing ``decisions``,
    ``interaction_log`` and ``interaction_logger`` artifact are scanned.
    """
    scanned_kinds = {"decisions", "interaction_log", "interaction_logger"}
    names: Set[str] = set()
    for artifact in artifacts:
        if artifact.exists and artifact.kind in scanned_kinds:
            for record in _read_jsonl_tail(Path(artifact.path), 200):
                names |= _find_tool_names(record)
    return names


def build_tool_source_index(repo_root: Path) -> Dict[str, Path]:
    """Map @tool decorator names to source files.

    Scans the ``*.py`` files directly inside ``gently/app/tools``. Files are
    visited in sorted order so that a name declared in several modules always
    maps to the same file; unreadable files are skipped.

    Returns:
        ``{tool_name: path}``; empty when the tools directory is absent.
    """
    index: Dict[str, Path] = {}
    tools_dir = repo_root / "gently" / "app" / "tools"
    if not tools_dir.exists():
        return index
    for path in sorted(tools_dir.glob("*.py")):
        try:
            text = path.read_text(encoding="utf-8", errors="replace")
        except OSError:
            # Source hints are best-effort; one bad file must not abort the export.
            continue
        for match in _TOOL_DECORATOR_RE.finditer(text):
            index.setdefault(match.group(1), path)
    return index


def build_debug_prompt(
    bundle: DebugBundle,
    transcript_records: Sequence[Mapping[str, Any]],
) -> str:
    """Build the markdown context handed to a coding agent.

    Args:
        bundle: Metadata of the export being written.
        transcript_records: Records written to ``transcript_excerpt.jsonl``;
            only their count is mentioned.

    Returns:
        The prompt template followed by session, artifact, source-file,
        transcript and profile sections.
    """
    try:
        template = PROMPT_TEMPLATE.read_text(encoding="utf-8")
    except OSError:
        # The other bundle files are already on disk at this point; degrade
        # to a bare title rather than failing when the template is missing.
        template = "# Trajectory Debugging Prompt\n"

    present = [artifact for artifact in bundle.artifacts if artifact.exists]
    absent = [artifact for artifact in bundle.artifacts if not artifact.exists]
    artifact_lines = [
        f"- {artifact.kind}: `{artifact.path}` ({artifact.lines} lines, {artifact.bytes} bytes)"
        for artifact in present
    ]
    missing_lines = [f"- {artifact.kind}: `{artifact.path}`" for artifact in absent]
    source_lines = [f"- `{path}`" for path in bundle.source_files]

    sections = [
        template,
        "",
        "## Session",
        "",
        f"- Session id: `{bundle.session_id}`",
        f"- Session directory: `{bundle.session_dir}`",
        f"- Annotation: {bundle.annotation or '(none supplied)'}",
        "",
        "## Included Artifacts",
        "",
        *(artifact_lines or ["- (none found)"]),
        "",
        "## Missing Expected Artifacts",
        "",
        *(missing_lines or ["- (none)"]),
        "",
        "## Relevant Source Files",
        "",
        *(source_lines or ["- (no tool-specific source files inferred)"]),
        "",
        "## Transcript Excerpt",
        "",
        f"`transcript_excerpt.jsonl` contains {len(transcript_records)} compact records.",
        "",
        "## Profile Summary",
        "",
        _format_profile_summary(bundle.profile_summary),
        "",
        "## Suggested Debugging Output",
        "",
        "1. Root cause.",
        "2. Smallest code or prompt fix.",
        "3. Offline regression test.",
        "4. Any live-hardware validation that remains necessary.",
        "",
    ]
    return "\n".join(sections)
def _summarize_artifact(kind: str, path: Path) -> ArtifactSummary:
    """Return size and line-count metadata for one artifact path.

    Lines are counted only for text-like suffixes. A path that cannot be
    stat-ed is reported as missing.
    """
    try:
        # A single stat() call avoids the exists()/stat() race.
        stat = path.stat()
    except OSError:
        return ArtifactSummary(kind=kind, path=str(path), exists=False)

    line_count = 0
    if path.suffix.lower() in {".jsonl", ".yaml", ".yml", ".json", ".md", ".txt"}:
        try:
            with path.open("r", encoding="utf-8", errors="replace") as f:
                line_count = sum(1 for _ in f)
        except OSError:
            line_count = 0
    return ArtifactSummary(
        kind=kind,
        path=str(path),
        exists=True,
        bytes=stat.st_size,
        lines=line_count,
    )


def _read_jsonl_tail(path: Path, max_lines: int) -> List[Any]:
    """Parse the last ``max_lines`` non-blank lines of a JSONL file.

    The file is streamed, so only the tail is held in memory. Lines that are
    not valid JSON are returned as ``{"raw": line}``.

    Args:
        path: File to read.
        max_lines: Number of trailing non-blank lines to keep; ``<= 0``
            returns an empty list.

    Returns:
        Parsed records in file order; empty if the file cannot be read.
    """
    if max_lines <= 0:
        # ``lines[-0:]`` would select the whole file, so handle 0 explicitly.
        return []

    from collections import deque

    try:
        with path.open("r", encoding="utf-8", errors="replace") as f:
            # Iterating the file splits on newlines only, so separators such
            # as U+2028 inside a JSON string do not break a record apart.
            tail = deque((line for line in f if line.strip()), maxlen=max_lines)
    except OSError:
        return []

    out: List[Any] = []
    for line in tail:
        raw = line.rstrip("\r\n")
        try:
            out.append(json.loads(raw))
        except json.JSONDecodeError:
            out.append({"raw": raw})
    return out


def _find_tool_names(value: Any) -> Set[str]:
    """Recursively collect tool names from a parsed log record.

    A string ``tool_name`` always counts. A string ``name`` counts only when
    the same mapping also has ``input``, ``arguments`` or ``params``.
    """
    found: Set[str] = set()
    if isinstance(value, Mapping):
        tool_name = value.get("tool_name")
        if isinstance(tool_name, str):
            found.add(tool_name)
        name = value.get("name")
        if isinstance(name, str) and any(
            key in value for key in ("input", "arguments", "params")
        ):
            found.add(name)
        children: Iterable[Any] = value.values()
    elif isinstance(value, (list, tuple)):
        children = value
    else:
        return found

    for child in children:
        found |= _find_tool_names(child)
    return found


def _duration_ms(record: Mapping[str, Any]) -> Optional[float]:
    """Return the span duration in milliseconds, rounded to 3 decimals.

    Tries ``duration_ms``, ``elapsed_ms``, ``wall_ms`` and then ``duration_s``
    (converted to ms). A missing, boolean, non-numeric or non-finite value
    falls through to the next alias.

    Returns:
        The duration, or ``None`` when no alias holds a usable number.
    """
    import math

    aliases = (
        ("duration_ms", 1.0),
        ("elapsed_ms", 1.0),
        ("wall_ms", 1.0),
        ("duration_s", 1000.0),
    )
    for key, to_ms in aliases:
        raw = record.get(key)
        if raw is None or isinstance(raw, bool):
            continue
        try:
            value = float(raw)
        except (TypeError, ValueError):
            continue
        if math.isfinite(value):
            return round(value * to_ms, 3)
    return None


def _format_profile_summary(summary: Mapping[str, Any]) -> str:
    """Render a profile summary as markdown text.

    Lists per-component totals and at most the five slowest spans.
    """
    if not summary or not summary.get("span_count"):
        return "No profiler spans were found."

    lines = [f"Profiler spans: {summary.get('span_count')}"]

    durations = summary.get("duration_by_component_ms") or {}
    if durations:
        lines += ["", "Duration by component:"]
        lines += [f"- {component}: {duration} ms" for component, duration in durations.items()]

    slowest = summary.get("slowest_spans") or []
    if slowest:
        lines += ["", "Slowest spans:"]
        lines += [
            f"- {span.get('component')}.{span.get('operation')}: "
            f"{span.get('duration_ms')} ms"
            for span in slowest[:5]
        ]
    return "\n".join(lines)


def _write_jsonl(path: Path, records: Iterable[Mapping[str, Any]]) -> None:
    """Write ``records`` to ``path`` as one JSON document per line.

    Values that are not JSON-serializable are written as ``str(value)``.
    """
    with path.open("w", encoding="utf-8") as f:
        f.writelines(json.dumps(record, default=str) + "\n" for record in records)


def main(argv: Optional[Sequence[str]] = None) -> int:
    """Run the ``python -m gently.debug`` command line.

    Args:
        argv: Argument list; ``None`` lets argparse read ``sys.argv``.

    Returns:
        0 after printing the bundle directory, or 1 when the session cannot
        be resolved (the reason is printed to stderr).
    """
    import sys

    parser = argparse.ArgumentParser(
        description="Prepare a trajectory-debugging context bundle for a Gently session."
    )
    parser.add_argument("--session", required=True, help="Session id/prefix or session directory")
    parser.add_argument("--root", default=None, help="Gently storage root")
    parser.add_argument("--output-dir", default=None, help="Debug bundle output directory")
    parser.add_argument("--annotate", default=None, help="Expected behavior or failure note")
    parser.add_argument("--max-records", type=int, default=80, help="Transcript records to include")
    args = parser.parse_args(argv)

    try:
        bundle = prepare_debug_context(
            args.session,
            root=Path(args.root) if args.root else None,
            output_dir=Path(args.output_dir) if args.output_dir else None,
            annotation=args.annotate,
            max_records=args.max_records,
        )
    except (FileNotFoundError, ValueError) as exc:
        # Unknown or ambiguous session: report it without a traceback.
        print(f"error: {exc}", file=sys.stderr)
        return 1
    print(bundle.output_dir)
    return 0
import json
import logging
import os
import socket
from datetime import datetime
from pathlib import Path
from typing import Any, Mapping, Optional

logger = logging.getLogger(__name__)


def resolve_profile_path(context: Optional[Mapping[str, Any]]) -> Optional[Path]:
    """Resolve where runtime profile spans should be written.

    Resolution order:

    1. ``GENTLY_PROFILE_PATH`` (a leading ``~`` is expanded).
    2. ``context["agent"].profile_path`` when set.
    3. ``profile_spans.jsonl`` inside the directory returned by
       ``agent.store._session_dir(agent.session_id)``.

    Returns:
        The target path, or ``None`` when nothing applies.
    """
    explicit = os.environ.get("GENTLY_PROFILE_PATH", "").strip()
    if explicit:
        # os.path.expanduser leaves the value unchanged if no home is known.
        return Path(os.path.expanduser(explicit))

    if not isinstance(context, Mapping):
        return None
    agent = context.get("agent")
    if agent is None:
        return None

    direct = getattr(agent, "profile_path", None)
    if direct:
        return Path(direct)

    session_id = getattr(agent, "session_id", None)
    store = getattr(agent, "store", None)
    if not session_id or store is None or not hasattr(store, "_session_dir"):
        return None
    try:
        session_dir = store._session_dir(session_id)
    except Exception:
        # Profiling must never break the caller; treat lookup errors as "no path".
        return None
    if session_dir is None:
        return None
    return Path(session_dir) / "profile_spans.jsonl"


def record_profile_span(
    context: Optional[Mapping[str, Any]],
    *,
    component: str,
    operation: str,
    duration_ms: float,
    status: str,
    metadata: Optional[Mapping[str, Any]] = None,
) -> None:
    """Append one profile span, best-effort and non-fatal.

    Args:
        context: Execution context used to locate the span file.
        component: Subsystem that produced the span.
        operation: Name of the timed operation.
        duration_ms: Elapsed time in milliseconds; stored rounded to 3 decimals.
        status: Outcome label.
        metadata: Extra fields merged into the record. Keys that collide with
            the core fields (``timestamp``, ``hostname``, ``component``,
            ``operation``, ``duration_ms``, ``status``) are ignored.

    Any failure, including an unusable ``duration_ms``, is logged at debug
    level and swallowed.
    """
    path: Optional[Path] = None
    try:
        path = resolve_profile_path(context)
        if path is None:
            return

        record = {
            "timestamp": datetime.now().isoformat(timespec="milliseconds"),
            "hostname": socket.gethostname(),
            "component": component,
            "operation": operation,
            "duration_ms": round(float(duration_ms), 3),
            "status": status,
        }
        if metadata:
            # setdefault keeps the core fields authoritative.
            for key, value in dict(metadata).items():
                record.setdefault(key, value)

        path.parent.mkdir(parents=True, exist_ok=True)
        with path.open("a", encoding="utf-8") as f:
            f.write(json.dumps(record, default=str) + "\n")
    except Exception as exc:
        logger.debug("failed to write profile span to %s: %s", path, exc)
Use the attached session +artifacts, transcript excerpt, expected behavior annotation, and relevant source +files to identify the smallest code or prompt change that would make the agent +behave correctly. + +Focus on: + +- What the user expected. +- What tool calls or events actually happened. +- Whether the agent had enough context to choose the expected action. +- Which tool descriptions, prompt sections, or orchestration code shaped the + decision. +- A targeted fix and a regression test that would catch the issue next time. + +Do not assume live hardware is available. Prefer fixes that can be verified with +offline traces, mock clients, or deterministic unit tests. diff --git a/gently/harness/tools/registry.py b/gently/harness/tools/registry.py index 48e1cf4f..eb7202c6 100644 --- a/gently/harness/tools/registry.py +++ b/gently/harness/tools/registry.py @@ -425,6 +425,8 @@ async def execute(self, tool_name: str, tool_input: Dict, context: Dict = None) else: exec_context = self._context + start_time = time.time() + # Hybrid-autonomy backstop: during an autonomous (wake) turn, a small set # of irreversible tools (laser-on, embryo termination, stopping the run) # must NEVER execute without a human — even if the model tries to call @@ -436,6 +438,7 @@ async def execute(self, tool_name: str, tool_input: Dict, context: Dict = None) blocked = getattr(_agent, '_autonomous_blocked_tools', None) or () if tool_name in blocked: logger.info("Autonomy backstop blocked '%s' (irreversible)", tool_name) + _record_tool_span(exec_context, tool, start_time, "blocked") return (f"'{tool_name}' is an irreversible action and cannot run " f"autonomously. 
def _record_tool_span(
    context: Dict[str, Any],
    tool: ToolDefinition,
    start_time: float,
    status: str,
    *,
    error: Optional[str] = None,
) -> None:
    """Record one best-effort tool-call profiler span.

    Args:
        context: Execution context forwarded to the profiler.
        tool: Definition of the tool that was (or was about to be) run.
        start_time: ``time.time()`` value captured when execution started.
        status: Outcome label stored with the span.
        error: Optional error text added to the span metadata.

    Never raises: any failure is logged at debug level.
    """
    try:
        # Imported lazily; a failing import is handled like any other error.
        from gently.debug.profiler import record_profile_span

        # time.time() is wall-clock and can step backwards; never report a
        # negative duration.
        elapsed_ms = max(0.0, (time.time() - start_time) * 1000.0)
        metadata = {
            "tool_name": tool.name,
            "category": tool.category.name.lower(),
            "requires_microscope": tool.requires_microscope,
        }
        if error:
            metadata["error"] = error
        record_profile_span(
            context,
            component="tool",
            operation=tool.name,
            duration_ms=elapsed_ms,
            status=status,
            metadata=metadata,
        )
    except Exception:
        logger.debug("failed to record tool profiler span", exc_info=True)
import json
from pathlib import Path

from gently.core.file_store import FileStore
from gently.debug import prepare_debug_context, resolve_session_dir


def _write_jsonl(path: Path, records):
    """Serialize *records* to *path*, one JSON document per line."""
    serialized = [json.dumps(record) + "\n" for record in records]
    path.write_text("".join(serialized), encoding="utf-8")


def test_prepare_debug_context_exports_session_bundle(tmp_path):
    """Exporting a populated session writes every bundle file with the expected content."""
    store = FileStore(tmp_path)
    store.create_session("abc12345", name="debug test")
    session_dir = store._session_dir("abc12345")
    assert session_dir is not None

    decision = {
        "timestamp": "2026-05-30T12:00:00",
        "agent": "production",
        "trigger": "user_message",
        "tool_calls": [
            {"name": "acquire_volume", "input": {"embryo_id": "embryo_1"}}
        ],
    }
    llm_span = {
        "timestamp": "2026-05-30T12:00:01",
        "component": "llm",
        "operation": "agent_turn",
        "duration_ms": 1250.0,
        "status": "ok",
    }
    tool_span = {
        "timestamp": "2026-05-30T12:00:02",
        "component": "tool",
        "operation": "acquire_volume",
        "duration_ms": 320.5,
        "status": "ok",
    }
    _write_jsonl(session_dir / "decisions.jsonl", [decision])
    _write_jsonl(
        session_dir / "events.jsonl",
        [{"event_type": "STAGE_MOVED", "data": {"x": 1}}],
    )
    _write_jsonl(session_dir / "profile.jsonl", [llm_span, tool_span])

    bundle = prepare_debug_context(
        "abc12345",
        root=tmp_path,
        output_dir=tmp_path / "debug_out",
        annotation="should check stored position before acquisition",
    )

    output_dir = Path(bundle.output_dir)

    def read(name):
        return (output_dir / name).read_text(encoding="utf-8")

    context = read("debug_context.md")
    source_files = read("source_files.txt")
    artifacts = json.loads(read("artifacts.json"))
    profile = json.loads(read("profile_summary.json"))
    transcript = read("transcript_excerpt.jsonl")

    assert "should check stored position" in context
    assert "Profile Summary" in context
    assert "llm.agent_turn" in context
    assert "gently/app/tools/acquisition_tools.py" in source_files
    assert artifacts["session_id"] == "abc12345"
    assert artifacts["profile_summary"]["span_count"] == 2
    assert profile["duration_by_component_ms"]["llm"] == 1250.0
    assert profile["slowest_spans"][0]["operation"] == "agent_turn"
    assert "acquire_volume" in transcript
"Profile Summary" in context + assert "llm.agent_turn" in context + assert "gently/app/tools/acquisition_tools.py" in source_files + assert artifacts["session_id"] == "abc12345" + assert artifacts["profile_summary"]["span_count"] == 2 + assert profile["duration_by_component_ms"]["llm"] == 1250.0 + assert profile["slowest_spans"][0]["operation"] == "agent_turn" + assert "acquire_volume" in transcript + + +def test_resolve_session_dir_accepts_prefix(tmp_path): + store = FileStore(tmp_path) + store.create_session("prefix123", name="debug test") + + session_id, session_dir = resolve_session_dir("prefix", root=tmp_path) + + assert session_id == "prefix123" + assert session_dir.exists() diff --git a/tests/test_tool_registry.py b/tests/test_tool_registry.py index 273361bb..a78cd3f9 100644 --- a/tests/test_tool_registry.py +++ b/tests/test_tool_registry.py @@ -14,9 +14,12 @@ """ import asyncio +import json +from types import SimpleNamespace import pytest +from gently.core.file_store import FileStore from gently.harness.tools.registry import ( ToolRegistry, ToolCategory, @@ -207,3 +210,27 @@ async def async_greeter(name: str) -> str: async def test_execute_nonexistent_tool(self, registry): with pytest.raises(ValueError, match="Unknown tool"): await registry.execute("nonexistent", {}) + + @pytest.mark.asyncio + async def test_execute_records_profile_span_for_session_tool(self, registry, tmp_path): + def adder(a: int, b: int) -> str: + return str(a + b) + + store = FileStore(tmp_path) + store.create_session("abc12345", name="profile") + agent = SimpleNamespace(store=store, session_id="abc12345") + registry.register_function(adder, name="adder", category=ToolCategory.UTILITY) + + result = await registry.execute("adder", {"a": 2, "b": 5}, {"agent": agent}) + + profile_path = store._session_dir("abc12345") / "profile_spans.jsonl" + records = [ + json.loads(line) + for line in profile_path.read_text(encoding="utf-8").splitlines() + ] + assert result == "7" + assert 
records[-1]["component"] == "tool" + assert records[-1]["operation"] == "adder" + assert records[-1]["tool_name"] == "adder" + assert records[-1]["status"] == "ok" + assert records[-1]["duration_ms"] >= 0