From 484c2a974feb1361a5c52ab04db60728d0ab8d55 Mon Sep 17 00:00:00 2001 From: Ivan Nikolic Date: Thu, 11 Jun 2026 18:48:47 +0800 Subject: [PATCH 01/10] pure modules: aligned ticks, dual offline/live execution, backpressure + health MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A PureModule declares when it runs (one tick() input), how every other input is sampled at that moment (latest/interpolate/window), and one pure step() bound to inputs by parameter name. The same class runs live on pubsub ports (NullStore-bridged, recording = store choice) or offline over stored memory2 streams via Module.over() — lazy, exact, chainable into other modules. Live ticks flow through a selectable BackpressureBuffer (KeepLast default); every queue in the path is bounded, incl. a max_pending cap so a dead interpolate() input can't accumulate ticks. Health follows "drops are metrics, not errors": per-reason drop counters, a 1 Hz _health stream in the module store, one warmup line comparing observed vs expected input rates, transition-logged DEGRADED/STALLED contract messages with throttled reminders, and strict mode for offline replay determinism. 
--- dimos/memory2/architecture.md | 2 + dimos/memory2/buffer.py | 15 + dimos/memory2/health.py | 328 +++++++++++++++++ dimos/memory2/puremodule.md | 218 +++++++++++ dimos/memory2/puremodule.py | 612 +++++++++++++++++++++++++++++++ dimos/memory2/test_health.py | 270 ++++++++++++++ dimos/memory2/test_puremodule.py | 562 ++++++++++++++++++++++++++++ dimos/memory2/tick.py | 443 ++++++++++++++++++++++ dimos/spec/pure_modules.py | 24 ++ 9 files changed, 2474 insertions(+) create mode 100644 dimos/memory2/health.py create mode 100644 dimos/memory2/puremodule.md create mode 100644 dimos/memory2/puremodule.py create mode 100644 dimos/memory2/test_health.py create mode 100644 dimos/memory2/test_puremodule.py create mode 100644 dimos/memory2/tick.py create mode 100644 dimos/spec/pure_modules.py diff --git a/dimos/memory2/architecture.md b/dimos/memory2/architecture.md index c4a90a7085..7135cfc5ce 100644 --- a/dimos/memory2/architecture.md +++ b/dimos/memory2/architecture.md @@ -32,6 +32,8 @@ Supporting Systems: | `transform.py` | Transformer ABC, FnTransformer, FnIterTransformer, QualityWindow | | `buffer.py` | Backpressure buffers for live mode (KeepLast, Bounded, Unbounded) | | `embed.py` | EmbedImages / EmbedText transformers | +| `tick.py` | Tick assembly — samplers (tick/latest/interpolate/window) + TickMachine | +| `puremodule.py`| PureModule — pure `step` over aligned inputs; live ports or offline `over()` ([docs](puremodule.md)) | ## Subpackages diff --git a/dimos/memory2/buffer.py b/dimos/memory2/buffer.py index 49814eb6dc..9996890871 100644 --- a/dimos/memory2/buffer.py +++ b/dimos/memory2/buffer.py @@ -71,6 +71,15 @@ def close(self) -> None: @abstractmethod def __len__(self) -> int: ... + def clone(self) -> BackpressureBuffer[T]: + """Fresh, empty buffer with the same policy. + + Lets a buffer instance serve as a *template* (e.g. a class-level + ``backpressure = Bounded(8)``) while each consumer gets its own + independent state. 
Subclasses with constructor args must override. + """ + return type(self)() + def __iter__(self) -> Iterator[T]: """Yield items until the buffer is closed.""" while True: @@ -138,6 +147,9 @@ def __init__(self, maxlen: int) -> None: self._closed = False self._cond = threading.Condition() + def clone(self) -> Bounded[T]: + return Bounded(self._buf.maxlen or 0) + def put(self, item: T) -> bool: with self._cond: if self._closed: @@ -178,6 +190,9 @@ def __init__(self, maxlen: int) -> None: self._closed = False self._cond = threading.Condition() + def clone(self) -> DropNew[T]: + return DropNew(self._maxlen) + def put(self, item: T) -> bool: with self._cond: if self._closed or len(self._buf) >= self._maxlen: diff --git a/dimos/memory2/health.py b/dimos/memory2/health.py new file mode 100644 index 0000000000..999caa577c --- /dev/null +++ b/dimos/memory2/health.py @@ -0,0 +1,328 @@ +# Copyright 2026 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Health monitoring for tick-based modules — drops are metrics, not errors. + +Under ``KeepLast`` backpressure a controller dropping most of its ticks is +the system *working as designed*, so per-drop warnings are wrong. The +mature ladder implemented here: + +1. **Count always** — ticks resolved/stepped, drops by reason + (``backpressure``, ``missing_input``), step latency, input rates, ages + of consumed values, output rates. +2. 
**Report continuously** — an aggregated :class:`Health` snapshot every + ``interval_s``, handed to a sink (the module appends it to a + ``_health`` memory2 stream: live-only on a NullStore, recorded next to + the data it explains on a storage-backed store). +3. **Log on transitions** — one warmup line comparing observed input rates + to declared expectations, one line on entering ``DEGRADED``/``STALLED`` + (with the violated contracts), a throttled reminder while unhealthy, + one line on recovery. + +Contracts come from two places, deliberately split: + +- *semantic tolerances in code* — ``latest(max_age=...)`` already declares + "older than this is unacceptable"; its violation shows up as + ``missing_input`` drops; +- *rates in deployment config* — ``expected_hz`` per input and + ``min_output_hz``, because sim, replay, and the robot legitimately + differ. + +The monitor is pure bookkeeping with an injectable clock — no threads, no +streams — so the contract messages are unit-testable. +""" + +from __future__ import annotations + +from collections import deque +from dataclasses import dataclass, field +import threading +from typing import TYPE_CHECKING, Any + +from dimos.utils.logging_config import setup_logger + +if TYPE_CHECKING: + from collections.abc import Callable + +logger = setup_logger() + +OK = "OK" +DEGRADED = "DEGRADED" +STALLED = "STALLED" + + +def _percentile(values: list[float], q: float) -> float: + if not values: + return 0.0 + vals = sorted(values) + i = min(len(vals) - 1, max(0, round(q * (len(vals) - 1)))) + return vals[i] + + +@dataclass(frozen=True) +class Health: + """One aggregated health snapshot (the payload of the ``_health`` stream).""" + + ts: float + state: str # OK | DEGRADED | STALLED + violations: tuple[str, ...] 
+ metrics: dict[str, float] + + +@dataclass +class _Window: + """Counters accumulated since the last report.""" + + inputs: dict[str, int] = field(default_factory=dict) + resolved: int = 0 + queued: int = 0 # viable ticks handed to the backpressure buffer + stepped: int = 0 + emitted: int = 0 # steps that produced at least one output + missing: dict[str, int] = field(default_factory=dict) # drops by field + blocked: int = 0 # ticks evicted while waiting for interpolation brackets + outputs: dict[str, int] = field(default_factory=dict) + + +class HealthMonitor: + """Counts tick-loop events and turns them into contract messages. + + Thread-safe; all hooks are O(1). Call :meth:`maybe_report` + opportunistically from the worker loops — it no-ops until + ``interval_s`` has elapsed. + """ + + def __init__( + self, + name: str, + *, + expected_hz: dict[str, float] | None = None, + min_output_hz: float | None = None, + interval_s: float = 1.0, + warmup_s: float = 5.0, + unhealthy_log_every_s: float = 10.0, + stall_after_s: float = 5.0, + rate_tolerance: float = 0.5, + sink: Callable[[Health], None] | None = None, + clock: Callable[[], float] | None = None, + ) -> None: + import time + + self.name = name + self.expected_hz = dict(expected_hz or {}) + self.min_output_hz = min_output_hz + self.interval_s = interval_s + self.warmup_s = warmup_s + self.unhealthy_log_every_s = unhealthy_log_every_s + self.rate_tolerance = rate_tolerance + self.sink = sink + self.clock = clock if clock is not None else time.time + # A stall must persist this many reporting windows before we call it — + # a single zero-step window is normal for a step slower than the interval. 
+ self._stall_windows = max(1, round(stall_after_s / interval_s)) + self._zero_step_windows = 0 + self._zero_resolve_windows = 0 + + self._lock = threading.Lock() + self._t0 = self.clock() + self._last_report = self._t0 + self._last_unhealthy_log = 0.0 + self._warmup_done = False + self._state = OK + self._state_since = self._t0 + self._win = _Window() + self._total_inputs: dict[str, int] = {} + self._step_ms: deque[float] = deque(maxlen=256) + self._ages: dict[str, deque[float]] = {} + self._buffer_len: Callable[[], int] = lambda: 0 + self._pending_len: Callable[[], int] = lambda: 0 + + # -- hooks (called from the tick loops) ----------------------------------- + + def attach_gauges(self, buffer_len: Callable[[], int], pending_len: Callable[[], int]) -> None: + """Wire gauges sampled at report time (buffer depth, alignment-pending).""" + self._buffer_len = buffer_len + self._pending_len = pending_len + + def on_input(self, name: str) -> None: + with self._lock: + self._win.inputs[name] = self._win.inputs.get(name, 0) + 1 + self._total_inputs[name] = self._total_inputs.get(name, 0) + 1 + + def on_resolved(self) -> None: + with self._lock: + self._win.resolved += 1 + + def on_missing(self, fields: list[str]) -> None: + """A resolved tick was dropped — required inputs missing.""" + with self._lock: + for f in fields: + self._win.missing[f] = self._win.missing.get(f, 0) + 1 + + def on_queued(self) -> None: + """A viable tick was handed to the backpressure buffer.""" + with self._lock: + self._win.queued += 1 + + def on_blocked(self, n: int) -> None: + """*n* ticks were evicted while waiting for interpolation brackets.""" + if n: + with self._lock: + self._win.blocked += n + + def on_step(self, duration_s: float, ages: dict[str, float], emitted: bool) -> None: + with self._lock: + self._win.stepped += 1 + if emitted: + self._win.emitted += 1 + self._step_ms.append(duration_s * 1000.0) + for name, age in ages.items(): + self._ages.setdefault(name, 
deque(maxlen=256)).append(age) + + def on_output(self, name: str) -> None: + with self._lock: + self._win.outputs[name] = self._win.outputs.get(name, 0) + 1 + + # -- reporting --------------------------------------------------------------- + + def maybe_report(self) -> Health | None: + """Emit a snapshot if ``interval_s`` elapsed; otherwise no-op.""" + now = self.clock() + with self._lock: + if now - self._last_report < self.interval_s: + return None + win, self._win = self._win, _Window() + dt = now - self._last_report + self._last_report = now + return self._report(win, dt, now) + + def _report(self, win: _Window, dt: float, now: float) -> Health: + metrics = self._metrics(win, dt) + + if not self._warmup_done and now - self._t0 >= self.warmup_s: + self._warmup_done = True + self._log_warmup(now) + + violations = self._violations(win, dt, metrics) if self._warmup_done else [] + state = self._next_state(win, violations) + self._transition(state, violations, now) + + health = Health(ts=now, state=state, violations=tuple(violations), metrics=metrics) + if self.sink is not None: + try: + self.sink(health) + except Exception: + logger.exception("%s: health sink failed", self.name) + return health + + def _metrics(self, win: _Window, dt: float) -> dict[str, float]: + m = { + "ticks_resolved_hz": win.resolved / dt, + "ticks_stepped_hz": win.stepped / dt, + "drops_backpressure_hz": max(0, win.queued - win.stepped - self._buffer_len()) / dt, + "drops_missing_hz": sum(win.missing.values()) / dt, + "drops_blocked_hz": win.blocked / dt, + "emitted_hz": win.emitted / dt, + "step_p50_ms": _percentile(list(self._step_ms), 0.50), + "step_p99_ms": _percentile(list(self._step_ms), 0.99), + "buffer_len": float(self._buffer_len()), + "pending_len": float(self._pending_len()), + } + for name, n in win.inputs.items(): + m[f"in_{name}_hz"] = n / dt + for name, n in win.outputs.items(): + m[f"out_{name}_hz"] = n / dt + for name, ages in self._ages.items(): + m[f"age_{name}_p99_s"] = 
_percentile(list(ages), 0.99) + return m + + def _violations(self, win: _Window, dt: float, metrics: dict[str, float]) -> list[str]: + v: list[str] = [] + for name, expected in self.expected_hz.items(): + observed = win.inputs.get(name, 0) / dt + if observed < expected * self.rate_tolerance: + v.append(f"input '{name}' at {observed:.1f} Hz, expected {expected:g} Hz") + if self.min_output_hz is not None: + emitted_hz = win.emitted / dt + if emitted_hz < self.min_output_hz: + v.append(f"output {emitted_hz:.1f} Hz < contract {self.min_output_hz:g} Hz") + for name, n in win.missing.items(): + if win.resolved and n / win.resolved > 0.5: + v.append(f"input '{name}' missing on {n}/{win.resolved} ticks (stale or dead?)") + return v + + def _next_state(self, win: _Window, violations: list[str]) -> str: + if win.stepped > 0: + self._zero_step_windows = 0 + elif win.queued > 0 or self._buffer_len() > 0: + self._zero_step_windows += 1 + if win.resolved > 0: + self._zero_resolve_windows = 0 + elif sum(win.inputs.values()) > 0 and self._pending_len() > 0: + self._zero_resolve_windows += 1 + + if not self._warmup_done: + return self._state + if self._zero_step_windows >= self._stall_windows: + violations.insert(0, "ticks queued but none stepped (step stuck?)") + return STALLED + if self._zero_resolve_windows >= self._stall_windows: + violations.insert(0, "inputs flowing but no ticks resolving (interpolate input dead?)") + return STALLED + return DEGRADED if violations else OK + + def _transition(self, state: str, violations: list[str], now: float) -> None: + if state != self._state: + if state == OK: + logger.info("%s %s: recovered after %.0fs", self.name, OK, now - self._state_since) + else: + logger.warning("%s %s: %s", self.name, state, "; ".join(violations)) + self._state = state + self._state_since = now + self._last_unhealthy_log = now + elif state != OK and now - self._last_unhealthy_log >= self.unhealthy_log_every_s: + logger.warning( + "%s still %s (%.0fs): %s", + 
self.name, + state, + now - self._state_since, + "; ".join(violations), + ) + self._last_unhealthy_log = now + + def _log_warmup(self, now: float) -> None: + if not self.expected_hz: + return + elapsed = now - self._t0 + parts = [] + for name, expected in sorted(self.expected_hz.items()): + observed = self._total_inputs.get(name, 0) / elapsed + note = "" if observed >= expected * self.rate_tolerance else " — LOW" + parts.append(f"{name} {observed:.1f} Hz (expected {expected:g}{note})") + logger.info("%s warmup: %s", self.name, ", ".join(parts)) + + @property + def state(self) -> str: + return self._state + + def counters(self) -> dict[str, Any]: + """Current window counters — for tests and debugging.""" + with self._lock: + w = self._win + return { + "inputs": dict(w.inputs), + "resolved": w.resolved, + "queued": w.queued, + "stepped": w.stepped, + "missing": dict(w.missing), + } diff --git a/dimos/memory2/puremodule.md b/dimos/memory2/puremodule.md new file mode 100644 index 0000000000..8120382b70 --- /dev/null +++ b/dimos/memory2/puremodule.md @@ -0,0 +1,218 @@ +# Pure Modules + +A `PureModule` separates a module into two declarations and one pure function: + +- **when it runs** — one input marked `tick()` fires the ticks; +- **how every other input is sampled at that moment** — `latest()`, + `interpolate()`, `window()`; +- **what it computes** — `step()`, a pure function of the aligned inputs + (and, optionally, an explicit recurrent state). + +Because `step` never touches ports, threads, or `self`-state, the same class +runs **live** on pubsub ports and **offline** over stored memory2 streams — +and can't tell the difference. That property is what buys replay, +time-travel debugging, restarts that resume where they left off, migration +across processes/machines, and parallel execution. 
+ +```python skip +from dimos.core.stream import In, Out +from dimos.memory2.puremodule import PureModule, tick, interpolate, latest + +class Follower(PureModule): + image: In[Image] = tick() # 30 fps -> 30 ticks/s + pose: In[PoseStamped] = interpolate() # 50 Hz, slerped to frame time + imu: In[Imu] = latest(max_age=0.1) # newest, None if stale + + cmd_vel: Out[Twist] + + def step(self, image: Image, pose: PoseStamped, imu: Imu | None) -> Twist: + return chase(image, pose) +``` + +## The alignment language + +Sensors don't share a clock: cameras run at 30 fps, odometry at 50 Hz, IMUs +at 200 Hz. "Call the module with an image and *the pose at image time*" +requires a policy, and the policy is the whole declaration: + +| Sampler | Value at tick time `t` | Delays the tick? | +|---|---|---| +| `tick()` | the observation that fired the tick | — (it *is* the tick) | +| `latest(max_age=None)` | newest obs with `ts <= t`; missing if older than `max_age` | never | +| `interpolate(tolerance=0.5)` | lerp/slerp between the obs bracketing `t` | live: until the next obs arrives (~one sample period) | +| `window(seconds)` | list of obs in `(t - seconds, t]` | never | + +An `In` port with no sampler defaults to `latest()`. `interpolate()` +understands numbers, `Pose`, and `PoseStamped` (position lerp + quaternion +slerp; observation poses are interpolated too); other types degrade to +nearest-neighbor. When no bracket exists (stream ended, tick before the +first sample) it falls back to the nearest observation within `tolerance`. 
+ +Common "at what state do we call the module" intents translate as: + +| Intent | Declaration | +|---|---| +| tick on every image, poses interpolated | `image = tick()`, `pose = interpolate()` | +| tick on every pose, image latest-or-None | `pose = tick()`, `image = latest()`, param `image: Image \| None` | +| tick on every image, all IMU since 100ms before | `imu = window(0.1)`, param `imu: list[Imu]` | + +## Binding rules + +`step` parameters bind to inputs **by name**; the annotation picks the shape: + +- `Image` → `obs.data`; `Observation[Image]` → the full observation + (`ts`, `pose`, `tags`); +- `X | None` → missing becomes `None`; missing on a non-optional + parameter **drops the tick**; +- `window()` inputs: `list[X]` or `list[Observation[X]]`; +- reserved names: `ts: float` is the tick time; `state` (first parameter) + makes the module a Mealy machine. + +Outputs: with one `Out` port, return the value, or `None` to emit nothing +(ticks double as filters); with several, return `{port_name: value}`. + +## Offline: develop on recorded memory + +Record a session (any storage-backed store), then iterate on the module in +a notebook — no LCM, no processes, deterministic: + +```python skip +from dimos.memory2.store.sqlite import SqliteStore + +db = SqliteStore(path="walk_2026_06_11.db") + +out = Follower.over(image=db.streams.image, pose=db.streams.pose, + imu=db.streams.imu) + +out.to_list() # run it +out.map_data(lambda t: t.linear.x).to_list() # poke at results +out.save(db.stream("cmd_vel_v2")).drain() # or persist them +``` + +`over()` composes with the whole stream API — replay a slice with +`db.streams.image.after(t0).before(t1)`, downsample, quality-filter, etc. +Called on the class, `over` needs no module machinery at all +(use `Follower.offline(some_config=...).over(...)` when the step reads `self.config`). +Don't pass `.live()` streams to `over()` — deploy the module for that.
+ +Offline alignment is *exact*: events are merged in timestamp order, so a +run over the same recording produces the same ticks every time. + +## Live: the same class on ports + +Deployed in a blueprint, each `In` port feeds a memory2 stream in the +module's store and ticks run on a worker thread; outputs publish to the +`Out` ports. The store is a `NullStore` by default — inputs behave as +live-only streams (`.map`/`.transform`/`.live()` work; history and search +are empty). Override `make_store()` to return a `SqliteStore` and the +module records **every input and output** while it runs — recording is a +deployment choice, not module code. (This is the path to subsuming +`Recorder`: a recorder is a PureModule deployment with a storage-backed +store and no step.) + +Live alignment is best-effort: observations are timestamped on arrival +(`msg.ts` when present) and slight cross-stream jitter is tolerated; +`interpolate()` inputs add one sample period of latency to each tick. + +## Backpressure: the tick is the unit of load + +The system has two regimes, and the store converts between them: + +- **Pull (offline / stored streams)** — backpressure is intrinsic. The + consumer's iteration is the clock; a chained pipeline computes one tick + at a time and nothing accumulates beyond the (pruned) alignment buffers. +- **Push (live ports)** — sensors can't be paused, so backpressure is a + declared *drop/coalesce policy*, and the natural unit is the tick: + secondaries are cheap to ingest, all the expense is `step()`. 
+ +Live, resolved ticks flow through a `BackpressureBuffer` between the +alignment thread and the step thread: + +```python skip +class Follower(PureModule): + backpressure = KeepLast() # default — controller semantics + # backpressure = Unbounded() # recorder/indexer semantics: never drop + # backpressure = Bounded(8) # bounded queue, drops oldest +``` + +`KeepLast` means a slow step always processes the *freshest* tick and the +skipped ones are counted — for a 30 fps camera and a 100 ms step, +dropping ~2/3 of ticks is the system working as designed. Every queue in +the path is bounded (unless you choose `Unbounded()`): the tick buffer by policy, alignment buffers by +pruning, and ticks waiting for interpolation brackets by +`max_pending_ticks` (config, default 64) so a dead `interpolate()` input +can't accumulate ticks forever (evictions count as `drops_blocked`). + +One honest consequence: with drops, a live run processes a *subsample* of +triggers, so replaying raw inputs offline (which processes all of them) +diverges for stateful modules. Exact replay-of-a-run requires recording +the resolved tick rows — a designed next step, not yet built. + +## Health: drops are metrics, not errors + +Per-drop warnings at sensor rate are noise. The module follows the +mature ladder instead — count always, report continuously, log on +transitions, alert on *contracts*: + +- **Counters** (always): ticks resolved/stepped, drops by reason + (`backpressure`, `missing_input`, `blocked`), step p50/p99, per-input + observed Hz, age of consumed `latest()` values, output rates. +- **`_health` stream**: an aggregated snapshot every + `health_interval_s` (1 s) appended to the module store — live-only on a + NullStore, *recorded next to the data it explains* on a SqliteStore, so + a post-incident notebook plots drop ratio against the very frames that + were dropped.
+- **Contracts** are split deliberately: semantic tolerances live in the + declaration (`latest(max_age=…)`, `interpolate(tolerance=…)`); *rates* + live in deployment config (`expected_hz={"pose": 50}`, + `min_output_hz=10`) because sim, replay, and the robot differ. +- **Messages**: one warmup line after `health_warmup_s` comparing + observed input rates to expectations ("pose 12.1 Hz (expected 50 — + LOW)"); one WARN on entering `DEGRADED`/`STALLED` naming the violated + contracts; a throttled reminder every `unhealthy_log_every_s` while + unhealthy; one INFO on recovery with the outage duration. Stalls + (`STALLED`) distinguish "ticks queued but none stepped (step stuck?)" + from "inputs flowing but no ticks resolving (interpolate input dead?)" + and must persist `stall_after_s` before firing — a single slow step is + not a stall. + +The real SLO is output freshness and rate, not drop count — alert on the +contract, read the drop counters to diagnose *why*. Offline, `over()` +logs a per-field drop summary, and `_strict=True` raises on the first +drop instead (replay determinism tests should fail loudly). + +## State, explicitly + +If a module needs recurrence (gait phase, filters, RNN hidden state), +declare it — don't hide it in `self`: + +```python skip +class GaitController(PureModule): + pose: In[PoseStamped] = tick() + cmd_vel: Out[Twist] + + initial_state = GaitState(phase=0.0) + + def step(self, state: GaitState, pose: PoseStamped) -> tuple[GaitState, Twist]: + ... + return new_state, twist +``` + +The runtime threads the state through the ticks (it's `scan` over the tick +stream). Because state is a value, snapshotting it per tick gives +time-rewind and live migration — the snapshot stream is the designed next +step, not yet built. + +## Not yet designed / deliberately deferred + +- `every(hz)` clock triggers and multi-input triggers (`on_any`) — only + one `tick()` input for now. 
+- State snapshot streams (time-travel, suspend/revive, migration) — the + Mealy form is the hook; persistence isn't wired yet. +- A live timeout policy for `interpolate()` when its input dies (currently + ticks wait; on shutdown they resolve via the nearest-fallback). +- Modules that *query* memory (semantic search) — that's an impure + capability and stays on `MemoryModule` for now. +- `Annotated[In[X], sampler]` syntax — rejected for now because core + `Module` introspection doesn't unwrap `Annotated`; the default-value + syntax is canonical. diff --git a/dimos/memory2/puremodule.py b/dimos/memory2/puremodule.py new file mode 100644 index 0000000000..91954eb0a0 --- /dev/null +++ b/dimos/memory2/puremodule.py @@ -0,0 +1,612 @@ +# Copyright 2026 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""PureModule — a module whose core is a pure function of aligned inputs. + +Instead of subscribing to ports and mutating ``self``, a PureModule +declares *when* it runs and *how* each input is sampled at that moment, +then implements one pure ``step``. The same class runs live on pubsub +ports or offline over stored memory2 streams — ``step`` cannot tell the +difference, which is what buys replay, time-travel debugging, migration, +and parallelism. 
+ +:: + + class Follower(PureModule): + image: In[Image] = tick() # fires the ticks + pose: In[PoseStamped] = interpolate() # slerped to frame time + imu: In[Imu] = latest(max_age=0.1) # newest, or None if stale + + cmd_vel: Out[Twist] + + def step(self, image: Image, pose: PoseStamped, imu: Imu | None) -> Twist: + return chase(image, pose) + +``step`` parameters are bound by name to the declared inputs: + +- the parameter's annotation picks the payload — ``Image`` gets + ``obs.data``, ``Observation[Image]`` gets the full observation, + ``window()`` inputs take ``list[X]`` or ``list[Observation[X]]``; +- ``X | None`` means a missing value is passed as ``None``; a missing + value for a non-optional parameter *drops the tick*; +- reserved names: ``ts: float`` receives the tick time; declaring + ``state`` makes the module a Mealy machine — ``step(self, state, ...)`` + must return ``(new_state, output)`` and the initial state comes from + the ``initial_state`` attribute. + +Outputs: with one ``Out`` port, return the value (or ``None`` to emit +nothing); with several, return ``{port_name: value}``. + +**Live** (``module.start()``): each ``In`` port feeds a memory2 stream in +the module's store — a :class:`~dimos.memory2.store.null.NullStore` by +default, so inputs behave as live-only streams (no history, queries come +back empty). Override :meth:`make_store` to return a storage-backed store +and the module records every input *and* output as a side effect of +running. Outputs publish to the ``Out`` ports. + +**Offline** (:meth:`over`): run the same module over stored streams — +no ports, no transports, no threads, just lazy iteration:: + + store = SqliteStore(path="walk.db") + out = Follower.over(image=store.streams.image, pose=store.streams.pose, + imu=store.streams.imu) + out.to_list() # or .save(...), .map(...) 
+ +Called on the class, ``over`` builds a machinery-free instance via +:meth:`offline` (pass config there when needed: +``Follower.offline(gain=2.0).over(...)``). + +Alignment semantics live in :mod:`dimos.memory2.tick`. +""" + +from __future__ import annotations + +from dataclasses import dataclass +import inspect +import queue +import threading +import time +import types +import typing +from typing import TYPE_CHECKING, Any, Union, get_args, get_origin, get_type_hints + +from pydantic import Field +from reactivex.disposable import Disposable + +from dimos.core.core import rpc +from dimos.core.module import Module, ModuleConfig +from dimos.core.resource import CompositeResource +from dimos.core.stream import In, Out +from dimos.memory2.buffer import BackpressureBuffer, ClosedError, KeepLast +from dimos.memory2.health import Health, HealthMonitor +from dimos.memory2.store.null import NullStore +from dimos.memory2.tick import ( + MISSING, + Latest, + Sampler, + Tick, + TickMachine, + Window, + interpolate, + latest, + merge_events, + tick, + window, +) +from dimos.memory2.type.observation import Observation +from dimos.protocol.service.spec import Configurable +from dimos.utils.logging_config import setup_logger + +if TYPE_CHECKING: + from dimos.memory2.store.base import Store + from dimos.memory2.stream import Stream + +logger = setup_logger() + +__all__ = ["PureModule", "interpolate", "latest", "tick", "window"] + +_STOP = object() + + +@dataclass(frozen=True) +class _Param: + name: str + kind: str # "ts" | "trigger" | "field" + optional: bool + wants_obs: bool + + +@dataclass(frozen=True) +class _Plan: + trigger: str + samplers: dict[str, Sampler] # secondaries only (no trigger) + ins: dict[str, type] + outs: dict[str, type] + params: tuple[_Param, ...] 
+ stateful: bool + + +def _unwrap_optional(ann: Any) -> tuple[Any, bool]: + if get_origin(ann) in (Union, types.UnionType): + args = [a for a in get_args(ann) if a is not type(None)] + if len(args) == 1: + return args[0], True + return ann, False + + +def _wants_obs(ann: Any) -> bool: + return ann is Observation or get_origin(ann) is Observation + + +class _class_or_instance: + """Method usable on the class (auto-creating an offline instance) or an instance. + + ``Walker.over(...)`` in a notebook needs no module machinery; + ``walker.over(...)`` reuses a configured instance. + """ + + def __init__(self, fn: Any) -> None: + self.fn = fn + + def __get__(self, obj: Any, objtype: type[PureModule] | None = None) -> Any: + if obj is None: + + def bound(*args: Any, **kwargs: Any) -> Any: + assert objtype is not None + return self.fn(objtype.offline(), *args, **kwargs) + + return bound + return types.MethodType(self.fn, obj) + + +class PureModuleConfig(ModuleConfig): + """Deployment-side contracts and health reporting knobs. + + Semantic tolerances (``max_age``, ``tolerance``) live in the module's + sampler declarations; *rates* live here because sim, replay, and the + robot legitimately differ. + """ + + expected_hz: dict[str, float] = Field(default_factory=dict) + """Expected arrival rate per input — checked once after warmup, then + continuously (violation below 50% of expected).""" + + min_output_hz: float | None = None + """Contract: rate of ticks that emit at least one output.""" + + health_interval_s: float = 1.0 + health_warmup_s: float = 5.0 + unhealthy_log_every_s: float = 10.0 + stall_after_s: float = 5.0 + health_stream: bool = True + """Append 1 Hz aggregated Health snapshots to a ``_health`` stream in + the module store (live-only on NullStore, recorded on SqliteStore).""" + + max_pending_ticks: int = 64 + """Cap on live ticks awaiting interpolation brackets — bounds memory + when an ``interpolate()`` input dies; evictions count as + ``drops_blocked``.
Offline ``over()`` is uncapped (exact).""" + + +class PureModule(Module): + """Base class for modules implementing a pure ``step`` over aligned inputs. + + See the module docstring for the full declaration language. + """ + + config: PureModuleConfig + + initial_state: Any = None + + backpressure: BackpressureBuffer[Any] = KeepLast() + """Policy for resolved ticks awaiting ``step`` when deployed live. + + ``KeepLast()`` (default) always steps the freshest tick — controller + semantics; skipped ticks are counted as ``drops_backpressure``. Use + ``Unbounded()`` for must-process-everything consumers (recorders, + indexers) or ``Bounded(n)``/``DropNew(n)`` in between. The instance is + a template — each ``start()`` gets a fresh ``clone()``.""" + + def step(self, *args: Any, **kwargs: Any) -> Any: # pragma: no cover - overridden + raise NotImplementedError(f"{type(self).__name__} must define step()") + + # -- plan ----------------------------------------------------------------- + + @classmethod + def _plan(cls) -> _Plan: + plan = cls.__dict__.get("_cached_plan") + if plan is None: + plan = cls._build_plan() + cls._cached_plan = plan # type: ignore[attr-defined] + return plan + + @classmethod + def _build_plan(cls) -> _Plan: + if cls.step is PureModule.step: + raise TypeError(f"{cls.__name__} must define step()") + + hints = get_type_hints(cls, include_extras=True) + ins: dict[str, type] = {} + outs: dict[str, type] = {} + samplers: dict[str, Sampler] = {} + trigger: str | None = None + + for name, ann in hints.items(): + if get_origin(ann) is typing.Annotated: + inner = get_args(ann)[0] + if get_origin(inner) in (In, Out): + raise TypeError( + f"{cls.__name__}.{name}: Annotated ports are not supported " + f"(Module won't create the port) — declare the sampler as a " + f"default instead: `{name}: In[...]
= latest()`" + ) + continue + origin = get_origin(ann) + if origin is In: + if name in ("ts", "state"): + raise TypeError(f"{cls.__name__}.{name}: 'ts' and 'state' are reserved names") + ins[name] = (get_args(ann) or (object,))[0] + sampler = inspect.getattr_static(cls, name, None) + if isinstance(sampler, Tick): + if trigger is not None: + raise TypeError( + f"{cls.__name__}: multiple tick() inputs ({trigger!r}, {name!r}) — " + f"exactly one input must fire the ticks" + ) + trigger = name + elif isinstance(sampler, Sampler): + samplers[name] = sampler + else: + samplers[name] = latest() + elif origin is Out: + outs[name] = (get_args(ann) or (object,))[0] + + if trigger is None: + raise TypeError( + f"{cls.__name__}: no tick() input — mark exactly one input with " + f"`name: In[...] = tick()` to define when the module steps" + ) + + sig = inspect.signature(cls.step) + try: + step_hints = get_type_hints(cls.step, include_extras=True) + except (NameError, AttributeError, TypeError): + step_hints = {} + + params: list[_Param] = [] + stateful = False + names = [n for n in sig.parameters if n != "self"] + for name in names: + if name == "state": + if names[0] != "state": + raise TypeError(f"{cls.__name__}.step: 'state' must be the first parameter") + stateful = True + continue + if name == "ts": + params.append(_Param("ts", "ts", optional=False, wants_obs=False)) + continue + if name not in ins: + raise TypeError( + f"{cls.__name__}.step parameter {name!r} doesn't match an input — " + f"declared inputs: {sorted(ins)} (reserved: ts, state)" + ) + ann, optional = _unwrap_optional(step_hints.get(name, Any)) + if isinstance(samplers.get(name), Window): + args = get_args(ann) + wants = bool(args) and _wants_obs(args[0]) + else: + wants = _wants_obs(ann) + kind = "trigger" if name == trigger else "field" + params.append(_Param(name, kind, optional=optional, wants_obs=wants)) + + return _Plan( + trigger=trigger, + samplers=samplers, + ins=ins, + outs=outs, + 
params=tuple(params), + stateful=stateful, + ) + + # -- binding & dispatch ----------------------------------------------------- + + def _bind( + self, plan: _Plan, tobs: Observation[Any], row: dict[str, Any] + ) -> tuple[dict[str, Any] | None, dict[str, float], list[str]]: + """Build step kwargs for one tick. + + Returns ``(kwargs, ages, missing)`` — kwargs is None when the tick + must drop, ``missing`` names the required fields that were absent, + ``ages`` is staleness of consumed ``latest()`` values (tick time + minus observation time), for health reporting. + """ + kwargs: dict[str, Any] = {} + ages: dict[str, float] = {} + missing: list[str] = [] + for p in plan.params: + if p.kind == "ts": + kwargs[p.name] = tobs.ts + elif p.kind == "trigger": + kwargs[p.name] = tobs if p.wants_obs else tobs.data + else: + val = row[p.name] + if val is MISSING: + if not p.optional: + missing.append(p.name) + kwargs[p.name] = None + elif isinstance(val, list): + kwargs[p.name] = val if p.wants_obs else [o.data for o in val] + else: + if isinstance(plan.samplers.get(p.name), Latest): + ages[p.name] = tobs.ts - val.ts + kwargs[p.name] = val if p.wants_obs else val.data + if missing: + return None, ages, missing + return kwargs, ages, missing + + def _invoke( + self, plan: _Plan, state: Any, kwargs: dict[str, Any] + ) -> tuple[Any, dict[str, Any]]: + """Run step; returns (new_state, {out_name: value}).""" + if plan.stateful: + result = self.step(state, **kwargs) + if not (isinstance(result, tuple) and len(result) == 2): + raise TypeError( + f"{type(self).__name__}.step declares 'state' so it must " + f"return (new_state, output), got {type(result).__name__}" + ) + state, out = result + else: + out = self.step(**kwargs) + + if out is None or not plan.outs: + return state, {} + if len(plan.outs) == 1: + return state, {next(iter(plan.outs)): out} + if not isinstance(out, dict): + raise TypeError( + f"{type(self).__name__}.step must return a dict over its Out ports " + 
f"{sorted(plan.outs)}, got {type(out).__name__}" + ) + unknown = set(out) - set(plan.outs) + if unknown: + raise TypeError( + f"{type(self).__name__}.step returned unknown outputs {sorted(unknown)}" + ) + return state, out + + # -- offline ------------------------------------------------------------------ + + @classmethod + def offline(cls, **config: Any) -> PureModule: + """Construct without module machinery (no event loop, RPC, or ports). + + Enough of an instance to run :meth:`over` — ``self.config`` works, + live deployment doesn't. This is what notebooks want. + """ + self = cls.__new__(cls) + Configurable.__init__(self, **config) + CompositeResource.__init__(self) + return self + + @_class_or_instance + def over(self, _strict: bool = False, **streams: Any) -> Stream[Any]: + """Run this module over stored/finite streams; returns a lazy output stream. + + Pass one memory2 stream per declared input, by name. The result + is a regular lazy :class:`~dimos.memory2.stream.Stream` — chain + ``.to_list()``, ``.save(...)``, ``.map_data(...)`` etc. With one + ``Out`` port the observations carry the output values; with + several they carry ``{port_name: value}`` dicts. Output + observations derive from the trigger observation (its ``ts``, + ``pose``, ``tags``). + + Each stream must iterate in ascending ``ts`` (stored streams in + insertion order do; otherwise prepend ``.order_by("ts")``). Don't + pass ``.live()`` streams — deploy the module for that. + + ``_strict=True`` raises on the first tick dropped for missing + required inputs instead of counting it — offline a drop usually + means a data or declaration bug, and replay-determinism tests + should fail loudly. 
+ """ + plan = self._plan() + missing = set(plan.ins) - set(streams) + extra = set(streams) - set(plan.ins) + if missing or extra: + raise TypeError( + f"{type(self).__name__}.over() inputs mismatch — " + f"missing: {sorted(missing) or '—'}, unknown: {sorted(extra) or '—'}" + ) + + secondaries = {name: streams[name] for name in plan.samplers} + + def _run(upstream: Any) -> Any: + machine = TickMachine(plan.trigger, plan.samplers) + state = self.initial_state + dropped: dict[str, int] = {} + + def emit(resolved: list[tuple[Observation[Any], dict[str, Any]]]) -> Any: + nonlocal state + for tobs, row in resolved: + kwargs, _ages, missing_fields = self._bind(plan, tobs, row) + if kwargs is None: + if _strict: + raise ValueError( + f"{type(self).__name__}: tick at ts={tobs.ts} missing " + f"required inputs {missing_fields} (strict mode)" + ) + for f in missing_fields: + dropped[f] = dropped.get(f, 0) + 1 + continue + state, outs = self._invoke(plan, state, kwargs) + if not outs: + continue + if len(plan.outs) <= 1: + yield tobs.derive(data=next(iter(outs.values()))) + else: + yield tobs.derive(data=outs) + + for name, obs in merge_events(plan.trigger, upstream, secondaries): + yield from emit(machine.process(name, obs)) + yield from emit(machine.flush()) + if dropped: + logger.info( + "%s: dropped %d ticks with missing required inputs (%s)", + type(self).__name__, + sum(dropped.values()), + ", ".join(f"{f}: {n}" for f, n in sorted(dropped.items())), + ) + + trigger_stream = streams[plan.trigger] + return trigger_stream.transform(_run) # type: ignore[no-any-return] + + # -- live ----------------------------------------------------------------------- + + def make_store(self) -> Store: + """Store bridging the ports to memory2 streams when deployed. + + Default is a :class:`NullStore` — inputs/outputs behave as + live-only streams with no history. Return a storage-backed store + (e.g. ``SqliteStore``) to record every input and output of the + running module. 
+ """ + return NullStore() + + @rpc + def start(self) -> None: + super().start() + plan = self._plan() + cfg = self.config + + store = self.register_disposable(self.make_store()) + store.start() + self._streams = {name: store.stream(name, port.type) for name, port in self.inputs.items()} + self._out_streams = { + name: store.stream(name, port.type) for name, port in self.outputs.items() + } + + health_stream = store.stream("_health", dict) if cfg.health_stream else None + + def _sink(h: Health) -> None: + assert health_stream is not None + health_stream.append( + h.metrics, ts=h.ts, tags={"state": h.state, "violations": list(h.violations)} + ) + + monitor = HealthMonitor( + str(self), + expected_hz=cfg.expected_hz, + min_output_hz=cfg.min_output_hz, + interval_s=cfg.health_interval_s, + warmup_s=cfg.health_warmup_s, + unhealthy_log_every_s=cfg.unhealthy_log_every_s, + stall_after_s=cfg.stall_after_s, + sink=_sink if health_stream is not None else None, + ) + self.health_monitor = monitor + + q: queue.SimpleQueue[Any] = queue.SimpleQueue() + self._queue = q + ticks = self._tick_buffer = self.backpressure.clone() + machine = TickMachine(plan.trigger, plan.samplers, max_pending=cfg.max_pending_ticks) + monitor.attach_gauges( + buffer_len=lambda: len(ticks), pending_len=lambda: len(machine.pending) + ) + + for name, port in self.inputs.items(): + if name not in plan.ins: + continue + + def _on_msg(msg: Any, _name: str = name) -> None: + ts = getattr(msg, "ts", None) or time.time() + self._streams[_name].append(msg, ts=ts) + monitor.on_input(_name) + q.put((_name, Observation(ts=ts, data_type=type(msg), _data=msg))) + + self.register_disposable(Disposable(port.subscribe(_on_msg))) + + def _align_loop() -> None: + """Drain raw events fast; resolve + bind ticks; never blocks on step.""" + blocked_seen = 0 + while True: + try: + item = q.get(timeout=cfg.health_interval_s) + except queue.Empty: + monitor.maybe_report() + continue + try: + resolved = machine.flush() if item 
is _STOP else machine.process(*item) + if machine.blocked_dropped > blocked_seen: + monitor.on_blocked(machine.blocked_dropped - blocked_seen) + blocked_seen = machine.blocked_dropped + for tobs, row in resolved: + monitor.on_resolved() + kwargs, ages, missing = self._bind(plan, tobs, row) + if kwargs is None: + monitor.on_missing(missing) + continue + monitor.on_queued() + ticks.put((tobs, kwargs, ages)) + except Exception: + logger.exception("%s: alignment failed for an event", self) + monitor.maybe_report() + if item is _STOP: + ticks.close() + return + + def _step_loop() -> None: + """Pull ticks at step pace — the backpressure policy decides what it sees.""" + state = self.initial_state + while True: + try: + tobs, kwargs, ages = ticks.take(timeout=cfg.health_interval_s) + except TimeoutError: + monitor.maybe_report() + continue + except ClosedError: + return + t0 = time.perf_counter() + try: + state, outs = self._invoke(plan, state, kwargs) + except Exception: + logger.exception("%s.step failed for tick ts=%s", self, tobs.ts) + continue + monitor.on_step(time.perf_counter() - t0, ages, emitted=bool(outs)) + for out_name, value in outs.items(): + monitor.on_output(out_name) + try: + self.outputs[out_name].publish(value) + self._out_streams[out_name].append(value, ts=tobs.ts) + except Exception: + logger.exception("%s: publishing %s failed", self, out_name) + monitor.maybe_report() + + self._threads = [ + threading.Thread(target=_align_loop, name=f"{self}.align", daemon=True), + threading.Thread(target=_step_loop, name=f"{self}.step", daemon=True), + ] + for t in self._threads: + t.start() + + def _shutdown() -> None: + q.put(_STOP) + for t in self._threads: + t.join(2.0) + + self.register_disposable(Disposable(_shutdown)) + + @rpc + def stop(self) -> None: + super().stop() diff --git a/dimos/memory2/test_health.py b/dimos/memory2/test_health.py new file mode 100644 index 0000000000..057bfe59ee --- /dev/null +++ b/dimos/memory2/test_health.py @@ -0,0 +1,270 @@ 
+# Copyright 2026 Dimensional Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""HealthMonitor — contract messages, states, and drop accounting (fake clock)."""
+
+from __future__ import annotations
+
+from typing import Any
+
+import pytest
+
+import dimos.memory2.health
+from dimos.memory2.health import DEGRADED, OK, STALLED, Health, HealthMonitor
+
+
+class LogSpy:
+    """Records formatted log lines — independent of logging configuration."""
+
+    def __init__(self) -> None:
+        self.lines: list[tuple[str, str]] = []  # (level, formatted message) in call order
+
+    def _record(self, level: str, msg: str, *args: object) -> None:
+        self.lines.append((level, msg % args if args else msg))  # %-format eagerly
+
+    def info(self, msg: str, *args: object) -> None:
+        self._record("INFO", msg, *args)
+
+    def warning(self, msg: str, *args: object) -> None:
+        self._record("WARNING", msg, *args)
+
+    def exception(self, msg: str, *args: object) -> None:
+        self._record("ERROR", msg, *args)
+
+    def at(self, level: str) -> list[str]:
+        return [line for lvl, line in self.lines if lvl == level]
+
+
+@pytest.fixture
+def logspy(monkeypatch: pytest.MonkeyPatch) -> LogSpy:
+    spy = LogSpy()
+    monkeypatch.setattr(dimos.memory2.health, "logger", spy)  # swap the health module's logger
+    return spy
+
+
+class Clock:  # fake clock: time only moves when a test calls advance()
+    def __init__(self) -> None:
+        self.t = 0.0
+
+    def __call__(self) -> float:
+        return self.t
+
+    def advance(self, dt: float) -> float:
+        self.t += dt
+        return self.t
+
+
+def make(clock: Clock, **kwargs: Any) -> tuple[HealthMonitor, list[Health]]:
+    snaps: list[Health] = []  # every snapshot the monitor hands to its sink
+    kwargs.setdefault("interval_s", 1.0)
+    kwargs.setdefault("warmup_s", 0.0)  # most tests skip warmup gating
+    m = HealthMonitor("mod", clock=clock, sink=snaps.append, **kwargs)
+    return m, snaps
+
+
+def report(m: HealthMonitor, clock: Clock) -> Health:
+    clock.advance(1.0)  # one full window at the interval_s default set in make()
+    h = m.maybe_report()
+    assert h is not None
+    return h
+
+
+def test_reports_only_after_interval() -> None:
+    clock = Clock()
+    m, snaps = make(clock)
+    assert m.maybe_report() is None  # interval not elapsed
+    clock.advance(0.5)
+    assert m.maybe_report() is None
+    clock.advance(0.6)
+    assert m.maybe_report() is not None
+    assert len(snaps) == 1
+
+
+def test_healthy_steady_state_with_expected_drops() -> None:
+    """KeepLast dropping most ticks is OK as long as contracts hold."""
+    clock = Clock()
+    m, _ = make(clock, min_output_hz=5.0)
+    for _ in range(30):  # 30 ticks queued...
+        m.on_resolved()
+        m.on_queued()
+    for _ in range(10):  # ...10 stepped (KeepLast ate 20)
+        m.on_step(0.05, {}, emitted=True)
+
+    h = report(m, clock)
+    assert h.state == OK  # drops alone are not a violation
+    assert h.metrics["drops_backpressure_hz"] == 20.0
+    assert h.metrics["ticks_stepped_hz"] == 10.0
+
+
+def test_output_contract_violation_and_recovery() -> None:
+    clock = Clock()
+    m, _ = make(clock, min_output_hz=10.0)
+
+    for _ in range(3):
+        m.on_step(0.01, {}, emitted=True)
+    h = report(m, clock)
+    assert h.state == DEGRADED
+    assert any("output 3.0 Hz < contract 10 Hz" in v for v in h.violations)
+
+    for _ in range(12):
+        m.on_step(0.01, {}, emitted=True)
+    h = report(m, clock)
+    assert h.state == OK
+    assert h.violations == ()
+
+
+def test_input_rate_violation_mentions_rates() -> None:
+    clock = Clock()
+    m, _ = make(clock, expected_hz={"pose": 50.0})
+    for _ in range(12):
+        m.on_input("pose")
+    h = report(m, clock)
+    assert h.state == DEGRADED
+    assert any("'pose' at 12.0 Hz, expected 50 Hz" in v for v in h.violations)
+
+
+def test_missing_input_majority_is_a_violation() -> None:
+    clock = Clock()
+    m, _ = make(clock)
+    for _ in range(10):
+        m.on_resolved()
+    for _ in range(8):
+        m.on_missing(["imu"])
+    h = report(m, clock)
+    assert h.state == DEGRADED
+    assert any("'imu' missing on 8/10 ticks" in v for v in h.violations)
+
+
+def test_stall_requires_persistence() -> None:
+    clock = Clock()
+    m, _ = make(clock, stall_after_s=3.0)
+
+    # Ticks queue but nothing steps. One window: not yet a stall.
+    m.on_resolved()
+    m.on_queued()
+    assert report(m, clock).state == OK
+
+    # Buffer still holding, two more zero-step windows -> STALLED.
+    m.attach_gauges(buffer_len=lambda: 1, pending_len=lambda: 0)
+    assert report(m, clock).state == OK
+    h = report(m, clock)
+    assert h.state == STALLED
+    assert any("none stepped" in v for v in h.violations)
+
+    # A step recovers it.
+    m.on_step(0.01, {}, emitted=True)
+    m.attach_gauges(buffer_len=lambda: 0, pending_len=lambda: 0)
+    assert report(m, clock).state == OK
+
+
+def test_alignment_stall_when_inputs_flow_but_nothing_resolves() -> None:
+    clock = Clock()
+    m, _ = make(clock, stall_after_s=2.0)
+    m.attach_gauges(buffer_len=lambda: 0, pending_len=lambda: 3)
+    for _ in range(2):  # two 1 s windows with inputs arriving but no on_resolved()
+        for _ in range(5):
+            m.on_input("image")
+        clock.advance(1.0)
+        h = m.maybe_report()
+        assert h is not None
+    assert h.state == STALLED
+    assert any("no ticks resolving" in v for v in h.violations)
+
+
+def test_warmup_gates_violations() -> None:
+    clock = Clock()
+    m, _ = make(clock, warmup_s=3.0, expected_hz={"image": 30.0})
+    h = report(m, clock)  # t=1.0 — warming up, no violations yet
+    assert h.state == OK and h.violations == ()
+    report(m, clock)
+    h = report(m, clock)  # t=3.0 — warmup done, contract active
+    assert h.state == DEGRADED
+
+
+def test_staleness_ages_reported() -> None:
+    clock = Clock()
+    m, _ = make(clock)
+    m.on_step(0.01, {"imu": 0.02}, emitted=True)
+    m.on_step(0.01, {"imu": 0.09}, emitted=True)
+    h = report(m, clock)
+    assert h.metrics["age_imu_p99_s"] == 0.09
+    assert h.metrics["step_p50_ms"] == 10.0
+
+
+def test_blocked_drops_metric() -> None:
+    clock = Clock()
+    m, _ = make(clock)
+    m.on_blocked(7)
+    h = report(m, clock)
+    assert h.metrics["drops_blocked_hz"] == 7.0
+
+
+# -- contract messages (log lines) ---------------------------------------------------
+
+
+def test_degraded_logged_once_then_throttled(logspy: LogSpy) -> None:
+    clock = Clock()
+    m, _ = make(clock, min_output_hz=10.0, unhealthy_log_every_s=10.0)
+
+    report(m, clock)  # violation -> DEGRADED, logged
+    warns = logspy.at("WARNING")
+    assert len(warns) == 1
+    assert "mod DEGRADED: output 0.0 Hz < contract 10 Hz" in warns[0]
+
+    report(m, clock)  # still degraded, inside throttle window -> no new line
+    assert len(logspy.at("WARNING")) == 1
+
+    clock.advance(10.0)  # throttle elapsed -> one reminder with duration
+    m.maybe_report()
+    warns = logspy.at("WARNING")
+    assert len(warns) == 2
+    assert "still DEGRADED" in warns[1]
+
+
+def test_recovery_logged_with_duration(logspy: LogSpy) -> None:
+    clock = Clock()
+    m, _ = make(clock, min_output_hz=10.0)
+    report(m, clock)  # DEGRADED
+    for _ in range(12):
+        m.on_step(0.01, {}, emitted=True)
+    report(m, clock)  # back to OK
+    infos = logspy.at("INFO")
+    assert any("mod OK: recovered after" in line for line in infos)
+
+
+def test_warmup_line_reports_observed_vs_expected(logspy: LogSpy) -> None:
+    clock = Clock()
+    m, _ = make(clock, warmup_s=2.0, expected_hz={"image": 30.0, "pose": 50.0})
+    for _ in range(60):
+        m.on_input("image")  # 30 Hz over the 2s warmup
+    for _ in range(10):
+        m.on_input("pose")  # 5 Hz — far below 50
+    report(m, clock)
+    report(m, clock)  # t=2.0 -> warmup line emitted exactly once
+    report(m, clock)
+    warmups = [line for line in logspy.at("INFO") if "warmup" in line]
+    assert len(warmups) == 1
+    assert "image 30.0 Hz (expected 30)" in warmups[0]
+    assert "pose 5.0 Hz (expected 50 — LOW)" in warmups[0]
+
+
+def test_stall_message_names_the_suspect(logspy: LogSpy) -> None:
+    clock = Clock()
+    m, _ = make(clock, stall_after_s=1.0)
+    m.attach_gauges(buffer_len=lambda: 1, pending_len=lambda: 0)  # a tick sits in the buffer
+    m.on_resolved()
+    m.on_queued()
+    h = report(m, clock)
+    assert h.state == STALLED
+    assert any("STALLED" in line and "step stuck" in line for line in logspy.at("WARNING"))
diff --git a/dimos/memory2/test_puremodule.py b/dimos/memory2/test_puremodule.py
new file mode 100644
index 0000000000..3d63a2b80e
--- /dev/null
+++ b/dimos/memory2/test_puremodule.py
@@ -0,0 +1,562 @@
+# Copyright 2026 Dimensional Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""PureModule — offline alignment, binding rules, and live wiring."""
+
+from __future__ import annotations
+
+import math
+import threading
+import time
+from typing import TYPE_CHECKING, Annotated, Any
+
+import pytest
+
+from dimos.core.stream import In, Out
+from dimos.memory2.puremodule import PureModule, interpolate, latest, tick, window
+from dimos.memory2.store.memory import MemoryStore
+from dimos.memory2.tick import Interpolate, Latest, TickMachine, Window
+from dimos.memory2.type.observation import Observation
+from dimos.msgs.geometry_msgs.Pose import Pose
+
+if TYPE_CHECKING:
+    from collections.abc import Callable, Iterator
+
+    from dimos.memory2.stream import Stream
+
+# -- helpers -----------------------------------------------------------------
+
+
+@pytest.fixture
+def store() -> Iterator[MemoryStore]:
+    s = MemoryStore()
+    s.start()
+    yield s
+    s.dispose()  # fixture teardown: runs after the test using the store returns
+
+
+def fill(stream: Stream[Any], pairs: list[tuple[float, Any]]) -> Stream[Any]:
+    for ts, data in pairs:  # pairs are (ts, data); appended in the order given
+        stream.append(data, ts=ts)
+    return stream
+
+
+def obs(ts: float, data: Any) -> Observation[Any]:
+    return Observation(ts=ts, data_type=type(data), _data=data)
+
+
+# -- offline: interpolation to tick time ---------------------------------------
+
+
+class PoseEcho(PureModule):
+    camera: In[int] = tick()  # trigger input: tick() marks the input that fires the steps
+    pose: In[float] = interpolate()  # secondary input, declared with interpolate()
+
+    sampled: Out[float]
+
+    def step(self, camera: int, pose: float, ts: float) -> float:
+        assert isinstance(camera, int)
+        return pose
+
+
+def test_interpolates_to_tick_time(store: MemoryStore) -> None:
+    # camera at 10 Hz offset by 5ms; pose at 25 Hz with data = 10 * ts
+    camera = fill(store.stream("camera", int), [(0.005 + i / 10, i) for i in range(9)])
+    pose = fill(store.stream("pose", float), [(i / 25, 10 * (i / 25)) for i in range(26)])
+
+    out = PoseEcho.over(camera=camera, pose=pose).to_list()
+
+    assert len(out) == 9  # every camera frame ticked
+    for o in out:
+        assert o.data == pytest.approx(10 * o.ts, abs=1e-9)  # exact at frame time
+
+
+def test_output_derives_from_trigger(store: MemoryStore) -> None: + camera = fill(store.stream("camera", int), [(0.105, 7)]) + pose = fill(store.stream("pose", float), [(0.0, 0.0), (0.2, 2.0)]) + + (o,) = PoseEcho.over(camera=camera, pose=pose).to_list() + assert o.ts == 0.105 + assert o.data == pytest.approx(1.05) + + +# -- offline: latest / optional / dropping --------------------------------------- + + +class WithImu(PureModule): + camera: In[int] = tick() + imu: In[float] = latest(max_age=0.05) + + out: Out[float] + + def step(self, camera: int, imu: float | None) -> float: + return -1.0 if imu is None else imu + + +def test_latest_respects_max_age(store: MemoryStore) -> None: + camera = fill(store.stream("camera", int), [(0.01, 0), (0.2, 1)]) + imu = fill(store.stream("imu", float), [(0.0, 5.0)]) + + out = WithImu.over(camera=camera, imu=imu).to_list() + assert [o.data for o in out] == [5.0, -1.0] # fresh, then stale -> None + + +class RequiredImu(PureModule): + camera: In[int] = tick() + imu: In[float] = latest(max_age=0.05) + + out: Out[float] + + def step(self, camera: int, imu: float) -> float: + return imu + + +def test_missing_required_input_drops_tick(store: MemoryStore) -> None: + camera = fill(store.stream("camera", int), [(0.01, 0), (0.2, 1)]) + imu = fill(store.stream("imu", float), [(0.0, 5.0)]) + + out = RequiredImu.over(camera=camera, imu=imu).to_list() + assert [o.data for o in out] == [5.0] # stale tick dropped + + +def test_strict_mode_raises_on_drop(store: MemoryStore) -> None: + camera = fill(store.stream("camera", int), [(0.01, 0), (0.2, 1)]) + imu = fill(store.stream("imu", float), [(0.0, 5.0)]) + + with pytest.raises(ValueError, match=r"missing required inputs \['imu'\]"): + RequiredImu.over(_strict=True, camera=camera, imu=imu).to_list() + + +def test_interpolate_tolerance_gates_fallback(store: MemoryStore) -> None: + class Tight(PureModule): + camera: In[int] = tick() + pose: In[float] = interpolate(tolerance=0.01) + out: Out[float] + 
+ def step(self, camera: int, pose: float) -> float: + return pose + + camera = fill(store.stream("camera", int), [(0.0, 0), (0.1, 1), (0.45, 2)]) + pose = fill(store.stream("pose", float), [(0.4, 4.0), (0.5, 5.0)]) + + out = Tight.over(camera=camera, pose=pose).to_list() + # 0.0/0.1 precede all poses by > tolerance -> dropped; 0.45 brackets fine + assert [o.data for o in out] == [pytest.approx(4.5)] + + +# -- offline: window -------------------------------------------------------------- + + +class ImuBatch(PureModule): + camera: In[int] = tick() + imu: In[float] = window(0.1) + + out: Out[int] + + def step(self, camera: int, imu: list[float]) -> int: + return len(imu) + + +def test_window_collects_trailing_samples(store: MemoryStore) -> None: + camera = fill(store.stream("camera", int), [(0.05, 0), (0.1, 1)]) + imu = fill(store.stream("imu", float), [(0.0, 0.0), (0.03, 1.0), (0.06, 2.0), (0.09, 3.0)]) + + out = ImuBatch.over(camera=camera, imu=imu).to_list() + # (-0.05, 0.05] -> {0.0, 0.03}; (0.0, 0.1] -> {0.03, 0.06, 0.09} (left edge exclusive) + assert [o.data for o in out] == [2, 3] + + +# -- offline: pose slerp ------------------------------------------------------------ + + +class PoseSampler(PureModule): + camera: In[int] = tick() + pose: In[Pose] = interpolate() + + out: Out[Pose] + + def step(self, camera: int, pose: Pose) -> Pose: + return pose + + +def test_pose_lerp_and_slerp(store: MemoryStore) -> None: + identity = Pose(0, 0, 0, 0, 0, 0, 1) + z90 = Pose(1, 0, 0, 0, 0, math.sin(math.pi / 4), math.cos(math.pi / 4)) + + camera = fill(store.stream("camera", int), [(0.5, 0)]) + pose = store.stream("pose", Pose) + pose.append(identity, ts=0.0) + pose.append(z90, ts=1.0) + + (o,) = PoseSampler.over(camera=camera, pose=pose).to_list() + mid = o.data + assert mid.position.x == pytest.approx(0.5) + z45 = Pose(0, 0, 0, 0, 0, math.sin(math.pi / 8), math.cos(math.pi / 8)) + assert mid.orientation.angle_to(z45.orientation) == pytest.approx(0.0, abs=1e-6) + + +# -- 
offline: state, multi-out, observation binding ------------------------------------- + + +class Counter(PureModule): + camera: In[int] = tick() + out: Out[int] + + initial_state = 0 + + def step(self, state: int, camera: int) -> tuple[int, int]: + return state + 1, state + + +def test_stateful_threading(store: MemoryStore) -> None: + camera = fill(store.stream("camera", int), [(i / 10, i) for i in range(5)]) + + out = Counter.over(camera=camera).to_list() + assert [o.data for o in out] == [0, 1, 2, 3, 4] + + +class TwoOut(PureModule): + camera: In[int] = tick() + + doubled: Out[int] + parity: Out[str] + + def step(self, camera: int) -> dict[str, Any]: + return {"doubled": camera * 2, "parity": "even" if camera % 2 == 0 else "odd"} + + +def test_multi_out_returns_dict_rows(store: MemoryStore) -> None: + camera = fill(store.stream("camera", int), [(0.1, 1), (0.2, 2)]) + + out = TwoOut.over(camera=camera).to_list() + assert [o.data for o in out] == [ + {"doubled": 2, "parity": "odd"}, + {"doubled": 4, "parity": "even"}, + ] + + +class WantsObs(PureModule): + camera: In[int] = tick() + pose: In[float] = latest() + + out: Out[float] + + def step(self, camera: Observation[int], pose: Observation[float]) -> float: + assert isinstance(camera, Observation) + assert isinstance(pose, Observation) + return pose.ts + + +def test_observation_annotation_binds_full_obs(store: MemoryStore) -> None: + camera = fill(store.stream("camera", int), [(0.1, 0)]) + pose = fill(store.stream("pose", float), [(0.07, 1.0)]) + + (o,) = WantsObs.over(camera=camera, pose=pose).to_list() + assert o.data == 0.07 + + +def test_none_output_emits_nothing(store: MemoryStore) -> None: + class EvenOnly(PureModule): + camera: In[int] = tick() + out: Out[int] + + def step(self, camera: int) -> int | None: + return camera if camera % 2 == 0 else None + + camera = fill(store.stream("camera", int), [(0.1, 1), (0.2, 2), (0.3, 3), (0.4, 4)]) + out = EvenOnly.over(camera=camera).to_list() + assert [o.data for o 
in out] == [2, 4] + + +# -- plan validation ----------------------------------------------------------------- + + +def test_no_tick_input_is_an_error() -> None: + class NoTick(PureModule): + camera: In[int] + out: Out[int] + + def step(self, camera: int) -> int: + return camera + + with pytest.raises(TypeError, match="no tick"): + NoTick._plan() + + +def test_two_tick_inputs_is_an_error() -> None: + class TwoTicks(PureModule): + a: In[int] = tick() + b: In[int] = tick() + out: Out[int] + + def step(self, a: int) -> int: + return a + + with pytest.raises(TypeError, match="multiple tick"): + TwoTicks._plan() + + +def test_unknown_step_param_is_an_error() -> None: + class Typo(PureModule): + camera: In[int] = tick() + out: Out[int] + + def step(self, camera: int, poze: float) -> int: + return camera + + with pytest.raises(TypeError, match="poze"): + Typo._plan() + + +def test_annotated_port_is_an_error() -> None: + class Annot(PureModule): + camera: In[int] = tick() + pose: Annotated[In[float], latest()] + out: Out[int] + + def step(self, camera: int) -> int: + return camera + + with pytest.raises(TypeError, match="Annotated"): + Annot._plan() + + +def test_over_validates_stream_names(store: MemoryStore) -> None: + camera = store.stream("camera", int) + with pytest.raises(TypeError, match="mismatch"): + PoseEcho.over(camera=camera) # missing pose + + +# -- live arrival order (machine level) -------------------------------------------------- + + +def test_machine_blocks_until_bracketed() -> None: + m = TickMachine("camera", {"pose": Interpolate(tolerance=0.5)}) + + assert m.process("camera", obs(1.0, 0)) == [] # no pose at all yet + assert m.process("pose", obs(0.9, 9.0)) == [] # left only — still blocked + rows = m.process("pose", obs(1.1, 11.0)) # right arrives — resolves + [(tobs, row)] = rows + assert tobs.ts == 1.0 + assert row["pose"].data == pytest.approx(10.0) + + +def test_machine_flush_resolves_pending_via_fallback() -> None: + m = TickMachine("camera", 
def test_machine_window_buffer_bounded() -> None:
    """A window() input's buffer is pruned as ticks resolve instead of growing forever.

    1000 imu samples are fed with a camera tick on every 100th one; the imu
    buffer must end up far smaller than the number of samples fed in.
    """
    m = TickMachine("camera", {"imu": Window(0.1)})
    for i in range(1000):
        m.process("imu", obs(i * 0.01, float(i)))
        if i % 100 == 0:
            # Tick shares its timestamp with the imu sample just fed.
            m.process("camera", obs(i * 0.01, i))
    assert len(m.fields["imu"].buf) < 300  # window + slack, not the full history
bp.blueprints + names = {s.name for s in atom.streams} + assert {"camera", "pose", "sampled"} <= names + + +@pytest.mark.tool +def test_live_wiring_end_to_end() -> None: + from dimos.core.transport import pLCMTransport + + class LiveEcho(PureModule): + frame: In[int] = tick() + gain: In[float] # no sampler -> latest() + + out: Out[float] + + def step(self, frame: int, gain: float | None) -> float: + return frame * (gain if gain is not None else 1.0) + + module = LiveEcho() + module.frame.transport = pLCMTransport("/test/pm/frame") + module.gain.transport = pLCMTransport("/test/pm/gain") + module.out.transport = pLCMTransport("/test/pm/out") + + received: list[float] = [] + done = threading.Event() + + def _collect(msg: float) -> None: + received.append(msg) + done.set() + + unsub = module.out.subscribe(_collect) + + module.start() + try: + module.gain.transport.publish(2.0) + time.sleep(0.2) # ensure gain arrives (and is timestamped) before the frame + module.frame.transport.publish(21) + assert done.wait(timeout=5.0), f"timed out, received={received}" + assert received == [42.0] + finally: + unsub() + module.stop() + + +def _await(condition: Callable[[], bool], timeout: float = 5.0) -> bool: + """Bounded wait on a cheap condition — no fixed sleeps in assertions.""" + deadline = time.monotonic() + timeout + while time.monotonic() < deadline: + if condition(): + return True + time.sleep(0.001) + return bool(condition()) + + +class _GatedSteps: + """Lockstep harness: the test releases each step() completion explicitly.""" + + def __init__(self) -> None: + self.entered = threading.Semaphore(0) + self.release = threading.Semaphore(0) + + def gate(self) -> None: # called inside step() + self.entered.release() + assert self.release.acquire(timeout=5.0), "test never released the step" + + def step_once(self) -> bool: + """Wait for step() to start, then let it finish.""" + ok = self.entered.acquire(timeout=5.0) + self.release.release() + return ok + + def unblock(self) -> 
None: + self.release.release() + self.release.release() + + +@pytest.mark.tool +def test_live_backpressure_keeplast_skips_stale_ticks() -> None: + """KeepLast: while a step is busy, later ticks coalesce to the freshest one.""" + from dimos.core.transport import pLCMTransport + + gates = _GatedSteps() + + class Gated(PureModule): + frame: In[int] = tick() + out: Out[int] + + def step(self, frame: int) -> int: + gates.gate() + return frame + + module = Gated() + module.frame.transport = pLCMTransport("/test/bp/frame") + module.out.transport = pLCMTransport("/test/bp/out") + outs: list[int] = [] + unsub = module.out.subscribe(outs.append) + + module.start() + try: + # Inject events directly into the raw event queue — deterministic order, + # explicit timestamps, no transport timing involved. + module._queue.put(("frame", obs(1.0, 1))) + assert gates.entered.acquire(timeout=5.0) # step(1) started and is now held + + for i in range(2, 11): + module._queue.put(("frame", obs(float(i), i))) + # Alignment digests all events while step(1) is held: KeepLast must + # coalesce ticks 2..10 down to ONE buffered tick (the freshest). 
@pytest.mark.tool
def test_live_backpressure_unbounded_processes_everything() -> None:
    """Unbounded backpressure: every queued tick reaches step(), in arrival order."""
    from dimos.core.transport import pLCMTransport
    from dimos.memory2.buffer import Unbounded

    gates = _GatedSteps()

    class GatedAll(PureModule):
        frame: In[int] = tick()
        out: Out[int]
        backpressure = Unbounded()

        def step(self, frame: int) -> int:
            # Each step parks here until the test grants it a permit.
            gates.gate()
            return frame

    module = GatedAll()
    module.frame.transport = pLCMTransport("/test/bp2/frame")
    module.out.transport = pLCMTransport("/test/bp2/out")
    outs: list[int] = []
    unsub = module.out.subscribe(outs.append)

    module.start()
    try:
        # Queue all six ticks up front, bypassing the transport.
        for i in range(1, 7):
            module._queue.put(("frame", obs(float(i), i)))
        # Release the steps one at a time; each must actually have started.
        for _ in range(6):
            assert gates.step_once()
        assert _await(lambda: len(outs) == 6)
        assert outs == [1, 2, 3, 4, 5, 6]  # every tick, in order
        assert len(module._tick_buffer) == 0  # drained, not accumulating
    finally:
        gates.unblock()
        unsub()
        module.stop()
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tick assembly — align N observation streams into per-tick input rows. + +A :class:`~dimos.memory2.puremodule.PureModule` declares *when* it runs +(one ``tick()`` input) and *how* every other input is sampled at that +moment (``latest()``, ``interpolate()``, ``window()``). This module holds +the samplers and the :class:`TickMachine` that does the alignment. + +The machine is a plain event-in/rows-out state machine — no threads, no +streams — so the exact same code drives both execution modes: + +- **offline** (stored streams): events arrive in ascending-``ts`` order + via a heap-merge; alignment is exact and deterministic. +- **live** (pubsub ports): events arrive in wall-clock order from a + queue; alignment is best-effort under arrival jitter. + +Samplers answer "what is the value of this input at time t?": + +- ``tick()`` — the input that *defines* t; each observation fires a tick. +- ``latest(max_age=None)`` — newest observation with ``ts <= t`` + (zero-order hold). Never delays a tick. +- ``interpolate(tolerance=0.5)`` — bracket t between the surrounding + observations and interpolate (lerp for numbers, lerp+slerp for poses). + Needs one observation with ``ts >= t``, so a live tick waits for the + next sample on this input (~one sample period of latency). When no + bracket is possible (stream ended / not yet started), falls back to the + nearest observation within ``tolerance`` seconds. +- ``window(seconds)`` — every observation with ``t - seconds < ts <= t``, + as a list (e.g. all IMU samples since 0.1s before the frame). + +A missing value (e.g. 
``latest`` older than ``max_age``) becomes ``None`` +when the step parameter is typed ``X | None``, otherwise the tick is +dropped. +""" + +from __future__ import annotations + +from abc import ABC, abstractmethod +import bisect +from dataclasses import dataclass, field +import heapq +import logging +from typing import TYPE_CHECKING, Any + +from dimos.msgs.geometry_msgs.Pose import Pose +from dimos.msgs.geometry_msgs.PoseStamped import PoseStamped + +if TYPE_CHECKING: + from collections.abc import Iterable, Iterator + + from dimos.memory2.type.observation import Observation + +logger = logging.getLogger(__name__) + + +class _Blocked: + """Sentinel: this tick must wait for a future observation (live mode).""" + + __slots__ = () + + def __repr__(self) -> str: + return "" + + +class _Missing: + """Sentinel: no value available for this input at this tick.""" + + __slots__ = () + + def __repr__(self) -> str: + return "" + + +BLOCKED = _Blocked() +MISSING = _Missing() + + +# -- Interpolation ---------------------------------------------------------- + + +def _lerp(a: float, b: float, alpha: float) -> float: + return a + (b - a) * alpha + + +def _slerp( + qa: tuple[float, float, float, float], + qb: tuple[float, float, float, float], + alpha: float, +) -> tuple[float, float, float, float]: + """Spherical lerp between two (x, y, z, w) quaternions.""" + import math + + ax, ay, az, aw = qa + bx, by, bz, bw = qb + dot = ax * bx + ay * by + az * bz + aw * bw + if dot < 0.0: # take the short path + bx, by, bz, bw = -bx, -by, -bz, -bw + dot = -dot + if dot > 0.9995: # nearly parallel — nlerp to avoid div-by-~0 + x, y, z, w = ( + _lerp(ax, bx, alpha), + _lerp(ay, by, alpha), + _lerp(az, bz, alpha), + _lerp(aw, bw, alpha), + ) + n = math.sqrt(x * x + y * y + z * z + w * w) + return (x / n, y / n, z / n, w / n) + theta = math.acos(min(1.0, dot)) + s = math.sin(theta) + wa = math.sin((1.0 - alpha) * theta) / s + wb = math.sin(alpha * theta) / s + return ( + ax * wa + bx * wb, + ay * 
def interp_data(a: Any, b: Any, alpha: float, t: float) -> Any:
    """Blend two data values at fraction *alpha*, or pick the nearer one.

    Numbers are lerped. Two :class:`PoseStamped` values give a new
    ``PoseStamped`` stamped ``ts=t`` (lerp position, slerp orientation).
    Two :class:`Pose` values give a new ``Pose`` the same way. Booleans and
    every other type are not blended: ``a`` is returned when ``alpha < 0.5``,
    otherwise ``b``.
    """
    endpoints = (a, b)
    nearer = a if alpha < 0.5 else b

    def both(kind: Any) -> bool:
        return all(isinstance(value, kind) for value in endpoints)

    # bool is a subclass of int, so it has to be ruled out before the lerp.
    if any(isinstance(value, bool) for value in endpoints):
        return nearer
    if both((int, float)):
        return _lerp(float(a), float(b), alpha)
    # PoseStamped is tested before Pose, same order as always.
    if both(PoseStamped):
        x, y, z, qx, qy, qz, qw = _interp_pose_tuple(a, b, alpha)
        return PoseStamped(ts=t, position=(x, y, z), orientation=(qx, qy, qz, qw))
    if both(Pose):
        blended = _interp_pose_tuple(a, b, alpha)
        return Pose(*blended)
    return nearer
class Latest(Sampler):
    """Newest observation with ``ts <= t``; MISSING if none (or older than max_age).

    Args:
        max_age: Maximum allowed ``t - obs.ts`` in seconds, or ``None`` for
            no staleness limit.

    Raises:
        ValueError: If ``max_age`` is given and is not strictly positive.
    """

    def __init__(self, max_age: float | None = None) -> None:
        if max_age is not None and max_age <= 0:
            raise ValueError(f"latest(max_age) requires max_age > 0, got {max_age}")
        self.max_age = max_age

    def sample(self, buf: list[Observation[Any]], t: float, exhausted: bool) -> Any:
        """Return the held observation at *t*, or MISSING.

        ``exhausted`` is ignored: this sampler never waits for future data.
        """
        # Bisect the buffer itself via key= rather than first copying every
        # timestamp into a throwaway list, which would make each sample O(n).
        i = bisect.bisect_right(buf, t, key=lambda o: o.ts) - 1
        if i < 0:
            return MISSING  # nothing at or before t
        obs = buf[i]
        if self.max_age is not None and t - obs.ts > self.max_age:
            return MISSING  # newest candidate is too stale
        return obs

    def __repr__(self) -> str:
        return f"Latest(max_age={self.max_age})"
class Window(Sampler):
    """All observations with ``t - seconds < ts <= t``, as a list (may be empty).

    Args:
        seconds: Length of the trailing window in seconds.

    Raises:
        ValueError: If ``seconds`` is not strictly positive.
    """

    def __init__(self, seconds: float) -> None:
        if seconds <= 0:
            raise ValueError(f"window(seconds) requires seconds > 0, got {seconds}")
        self.seconds = seconds

    def sample(self, buf: list[Observation[Any]], t: float, exhausted: bool) -> Any:
        """Return a new list with the observations inside the window ending at *t*.

        ``exhausted`` is ignored: this sampler never waits for future data.
        """

        def ts_of(o: Observation[Any]) -> float:
            return o.ts

        # Bisect the buffer itself via key= rather than first copying every
        # timestamp into a throwaway list, which would make each sample O(n).
        lo = bisect.bisect_right(buf, t - self.seconds, key=ts_of)
        # The upper edge cannot lie left of the lower one, so start there.
        hi = bisect.bisect_right(buf, t, lo=lo, key=ts_of)
        return buf[lo:hi]  # slicing already yields an independent list

    def min_keep_ts(self, bound: float) -> float:
        """Keep one full window of history before *bound*."""
        return bound - self.seconds

    def __repr__(self) -> str:
        return f"Window(seconds={self.seconds})"
@dataclass
class _FieldState:
    """Per-input state held by :class:`TickMachine`: a sampler plus its buffer."""

    # How this input is read at a tick time.
    sampler: Sampler
    # Observations received so far; TickMachine.process keeps them ts-sorted
    # so samplers can bisect.
    buf: list[Observation[Any]] = field(default_factory=list)
    # True once no more observations will arrive (set by end_of_stream/flush).
    exhausted: bool = False
def _resolve(self) -> list[tuple[Observation[Any], dict[str, Any]]]:
    """Resolve pending ticks oldest-first, stopping at the first blocked one.

    Returns one ``(trigger_obs, row)`` pair per resolved tick, where ``row``
    maps each input name to whatever its sampler returned. Buffers are
    pruned once at the end of every call.
    """
    rows: list[tuple[Observation[Any], dict[str, Any]]] = []
    while self.pending:
        tobs = self.pending[0]
        row: dict[str, Any] = {}
        blocked = False
        for name, fs in self.fields.items():
            val = fs.sampler.sample(fs.buf, tobs.ts, fs.exhausted)
            if val is BLOCKED:
                # One blocked input holds the whole tick; the partial row is
                # discarded and rebuilt on the next call.
                blocked = True
                break
            row[name] = val
        if blocked:
            # Head-of-line blocking: later ticks wait behind the oldest one,
            # so output order always equals trigger order.
            break
        self.pending.pop(0)
        self._last_t = tobs.ts
        rows.append((tobs, row))
    # Runs even when nothing resolved, so buffers are trimmed on every event.
    self._prune()
    return rows
def merge_events(
    trigger_name: str,
    trigger_iter: Iterator[Observation[Any]],
    streams: dict[str, Iterable[Observation[Any]]],
) -> Iterator[tuple[str, Observation[Any]]]:
    """Lazily interleave several observation feeds into one ``(name, obs)`` feed.

    The result is ordered by ascending ``ts``. When timestamps tie, every
    non-trigger observation is yielded before the trigger's, so a tick can
    see same-timestamp data from the other inputs. Each feed must already
    iterate in ascending ``ts`` (stored streams do in insertion order; put
    ``.order_by("ts")`` in front otherwise).
    """

    def keyed(name: str, source: Iterable[Observation[Any]], rank: int) -> Iterator[tuple[Any, ...]]:
        # Sort key is (ts, rank, name); the observation rides along last.
        for item in source:
            yield item.ts, rank, name, item

    secondary = [keyed(name, source, 0) for name, source in streams.items()]
    trigger = keyed(trigger_name, trigger_iter, 1)  # rank 1 sorts after rank 0 on ties
    merged = heapq.merge(*secondary, trigger)
    yield from ((name, item) for _, _, name, item in merged)
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Pure modules — re-exported from :mod:`dimos.memory2.puremodule`. + +A PureModule's core is a pure ``step`` over inputs aligned to a tick; the +same class runs live on pubsub ports or offline over stored memory2 +streams. See the implementation module for the declaration language. +""" + +from dimos.memory2.puremodule import PureModule, interpolate, latest, tick, window + +__all__ = ["PureModule", "interpolate", "latest", "tick", "window"] From 04176113bd0761147722e1f8066d633df511b9f6 Mon Sep 17 00:00:00 2001 From: Ivan Nikolic Date: Thu, 11 Jun 2026 18:57:48 +0800 Subject: [PATCH 02/10] docs: gentle runnable intro to pure modules Executable walkthrough (md-babel) in docs/usage/pure_modules.md: synthetic recording, first module with interpolation proof, sampler language, missing-data policy, explicit state, module chaining, live deployment notes, and a fake-clock health contract demo. Linked from the usage TOC and the puremodule.md reference. --- dimos/memory2/puremodule.md | 4 + docs/usage/README.md | 1 + docs/usage/pure_modules.md | 343 ++++++++++++++++++++++++++++++++++++ 3 files changed, 348 insertions(+) create mode 100644 docs/usage/pure_modules.md diff --git a/dimos/memory2/puremodule.md b/dimos/memory2/puremodule.md index 8120382b70..e83782e912 100644 --- a/dimos/memory2/puremodule.md +++ b/dimos/memory2/puremodule.md @@ -1,5 +1,9 @@ # Pure Modules +> New here? Start with the +> [gentle, runnable introduction](/docs/usage/pure_modules.md) — this page +> is the reference. 
+ A `PureModule` separates a module into two declarations and one pure function: - **when it runs** — one input marked `tick()` fires the ticks; diff --git a/docs/usage/README.md b/docs/usage/README.md index 071b6fc0b2..81ad6bbc23 100644 --- a/docs/usage/README.md +++ b/docs/usage/README.md @@ -5,6 +5,7 @@ This page explains general concepts. ## Table of Contents - [Modules](/docs/usage/modules.md): The primary units of deployment in DimOS, modules run in parallel and are python classes. +- [Pure Modules](/docs/usage/pure_modules.md): Modules as pure functions over time-aligned inputs — the same class runs live or over recordings. - [Streams](/docs/usage/sensor_streams/README.md): How modules communicate, a Pub / Sub system. - [Blueprints](/docs/usage/blueprints.md): a way to group modules together and define their connections to each other. - [RPC](/docs/usage/blueprints.md#calling-the-methods-of-other-modules): how one module can call a method on another module (arguments get serialized to JSON-like binary data). diff --git a/docs/usage/pure_modules.md b/docs/usage/pure_modules.md new file mode 100644 index 0000000000..56ed8f8d96 --- /dev/null +++ b/docs/usage/pure_modules.md @@ -0,0 +1,343 @@ +# Pure Modules + +A traditional robotics module subscribes to topics, keeps state on `self`, +and publishes from callbacks. It works — but the interesting logic ends up +welded to live infrastructure: you can't run it on yesterday's recording, +you can't unit-test it without a pub/sub bus, and two runs over the same +data won't reproduce the same behavior. + +A `PureModule` splits the same job into three declarations: + +- **when it runs** — one input marked `tick()` fires the ticks; +- **how every other input is sampled at that moment** — `latest()`, + `interpolate()`, `window()`; +- **what it computes** — `step()`, a pure function of the aligned inputs. 
+ +```diagon mode=GraphDAG +camera -> align +pose -> align +imu -> align +align -> tick +tick -> step +step -> outputs +``` + +```results +┌──────┐┌────┐┌───┐ +│camera││pose││imu│ +└┬─────┘└┬───┘└┬──┘ +┌▽───────▽─────▽┐ +│align │ +└┬──────────────┘ +┌▽───┐ +│tick│ +└┬───┘ +┌▽───┐ +│step│ +└┬───┘ +┌▽──────┐ +│outputs│ +└───────┘ +``` + +Because `step` never touches ports, threads, or `self`, the same class runs +**live** on pub/sub ports and **offline** over stored +[memory2 streams](/dimos/memory2/intro.md) — and cannot tell the +difference. Replay, time-travel debugging, migration, and parallel +execution stop being features you build into a module and become +properties of the runtime. + +This page walks through the offline experience, which is also how you +develop: record once, then iterate in a plain Python session. + +## A tiny robot, recorded + +Let's fabricate two seconds of robot data — in real life this is a +`SqliteStore` recorded on the robot, here an in-memory store is enough. +The camera runs at 10 fps, the pose at 25 Hz (driving +x at exactly +1 m/s), and the IMU at 100 Hz: + +```python session=pure ansi=false no-result +import logging +logging.disable(logging.CRITICAL) # keep doc output clean + +from dimos.memory2.store.memory import MemoryStore +from dimos.msgs.geometry_msgs.Pose import Pose + +store = MemoryStore() +store.start() + +camera = store.stream("camera", str) +pose = store.stream("pose", Pose) +imu = store.stream("imu", float) + +for i in range(20): # 10 fps, offset 5 ms from the pose clock + camera.append(f"frame-{i:02d}", ts=0.005 + i * 0.1) + +for i in range(51): # 25 Hz, x = t because the robot drives 1 m/s + t = i * 0.04 + pose.append(Pose(t, 0, 0, 0, 0, 0, 1), ts=t) + +for i in range(201): # 100 Hz + imu.append(0.1 * (i % 5), ts=i * 0.01) +``` + +Notice the three streams don't share a clock — no two observations land on +the same timestamp. That's the reality pure modules are designed around. 
+ +## Your first pure module + +The declaration reads top to bottom: tick on every camera frame, +interpolate the pose *to the frame's capture time*, batch all IMU samples +from the last 100 ms: + +```python session=pure ansi=false +from dimos.core.stream import In, Out +from dimos.memory2.puremodule import PureModule, tick, interpolate, window + +class Snapshot(PureModule): + image: In[str] = tick() + pose: In[Pose] = interpolate() + imu: In[float] = window(0.1) + + described: Out[str] + + def step(self, image: str, pose: Pose, imu: list[float], ts: float) -> str: + return f"{image} at x={pose.position.x:.3f}m with {len(imu)} imu samples" + +out = Snapshot.over(image=camera, pose=pose, imu=imu) +for o in out.to_list()[:4]: + print(f"t={o.ts:.3f} {o.data}") +``` + +```results +t=0.005 frame-00 at x=0.005m with 1 imu samples +t=0.105 frame-01 at x=0.105m with 10 imu samples +t=0.205 frame-02 at x=0.205m with 10 imu samples +t=0.305 frame-03 at x=0.305m with 10 imu samples +``` + +Two things to notice: + +- `step` parameters bind to the inputs **by name**, and the annotations + are plain types — this is just a function. You can call + `Snapshot.offline().step("f", Pose(1,0,0,0,0,0,1), [], 0.0)` in a test + with no infrastructure at all. +- `x` equals the tick time exactly. The robot drives 1 m/s, so a pose + *interpolated to the frame's capture time* must satisfy `x == t`. The + 25 Hz pose stream never sampled those instants — alignment built them. + +`over()` returns a regular lazy stream: `.to_list()` runs it, `.save()` +persists it, filters slice it. Nothing executed until we asked. 
+ +## The sampler language + +| Sampler | Value at tick time `t` | +|---|---| +| `tick()` | the observation that fired the tick | +| `latest(max_age=None)` | newest observation with `ts <= t` (hold), missing if stale | +| `interpolate(tolerance=0.5)` | lerp/slerp between the observations bracketing `t` | +| `window(seconds)` | every observation in `(t - seconds, t]`, as a list | + +"At what state do we call the module" is always some combination of these. +Tick on poses with the latest image instead? Swap the samplers — the step +doesn't change shape: + +```python session=pure ansi=false +from dimos.memory2.puremodule import latest + +class PoseDriven(PureModule): + pose: In[Pose] = tick() # 25 ticks/s now + image: In[str] = latest() # most recent frame, whatever it is + + described: Out[str] + + def step(self, pose: Pose, image: str | None) -> str: + return f"x={pose.position.x:.2f} sees {image or 'nothing yet'}" + +rows = PoseDriven.over(pose=pose, image=camera).to_list() +print(rows[0].data) # first tick precedes the first frame +print(rows[-1].data) +``` + +```results +x=0.00 sees nothing yet +x=2.00 sees frame-19 +``` + +## When data is missing + +The step's *type signature* is the policy. `image: str | None` above said +"missing is fine, give me None". 
A non-optional parameter instead **drops +the tick** — the module simply doesn't run without its required inputs: + +```python session=pure ansi=false +gps = store.stream("gps", str) +for i in range(3): + gps.append(f"fix-{i}", ts=i * 0.4) # gps dies after t=0.8 + +class NeedsGps(PureModule): + image: In[str] = tick() + gps: In[str] = latest(max_age=0.5) # a fix older than 0.5s is no fix + + described: Out[str] + + def step(self, image: str, gps: str) -> str: # gps required + return f"{image} located via {gps}" + +located = NeedsGps.over(image=camera, gps=gps).to_list() +print(f"{located[-1].data} <- last locatable frame") +print(f"{len(located)} of 20 frames located; the rest were dropped") +``` + +```results +frame-12 located via fix-2 <- last locatable frame +13 of 20 frames located; the rest were dropped +``` + +Offline, drops are summarized in a log line (and `over(_strict=True)` +raises instead — replay tests should fail loudly). Live, they're counted +per reason in the module's health stream. + +## State, without `self` + +Recurrent state (filters, gait phase, anything Kalman-shaped) is declared, +not hidden: name the first parameter `state` and return +`(new_state, output)`. Returning `None` as the output emits nothing — so +ticks double as filters: + +```python session=pure ansi=false +class SpeedEstimator(PureModule): + pose: In[Pose] = tick() + speed: Out[float] + + initial_state = None # (previous ts, previous x) + + def step(self, state, pose: Pose, ts: float): + if state is None: + return (ts, pose.position.x), None # first tick: just remember + prev_ts, prev_x = state + v = (pose.position.x - prev_x) / (ts - prev_ts) + return (ts, pose.position.x), v + +speeds = SpeedEstimator.over(pose=pose) +values = [o.data for o in speeds.to_list()] +print(f"{len(values)} estimates, all {min(values):.2f}..{max(values):.2f} m/s") +``` + +```results +50 estimates, all 1.00..1.00 m/s +``` + +The runtime threads the state through the ticks. 
Because it's an explicit +value rather than attributes on `self`, a snapshot of it *is* the module's +full resume point — which is what makes restarts, migration, and +time-rewind mechanical. + +## Modules compose like streams + +A module's output stream is a normal memory2 stream, so modules chain into +pipelines — with stream operators slotting in between — and results save +back into the store next to the data they came from: + +```python session=pure ansi=false +class SpeedAlert(PureModule): + speed: In[float] = tick() + alert: Out[str] + + def step(self, speed: float, ts: float) -> str | None: + return f"speeding at t={ts:.2f}" if speed > 0.9 else None + +alerts = SpeedAlert.over(speed=SpeedEstimator.over(pose=pose)) + +saved = store.stream("alerts", str) +alerts.save(saved).drain() +print(f"{saved.count()} alerts stored, e.g. {saved.last().data!r}") +``` + +```results +50 alerts stored, e.g. 'speeding at t=2.00' +``` + +The whole chain ran in one lazy pass: each pose tick flowed through +`SpeedEstimator`, became a speed, ticked `SpeedAlert`, and landed in the +store. + +## The same class, live + +Deployment is the part the module never sees. In a blueprint, each `In` +becomes a pub/sub port feeding a stream in the module's store, `step` runs +on a worker thread, and outputs publish to the `Out` ports: + +```python skip +class Follower(PureModule): + image: In[Image] = tick() + pose: In[PoseStamped] = interpolate() + cmd_vel: Out[Twist] + + backpressure = KeepLast() # slow step? always process the freshest tick + + def step(self, image: Image, pose: PoseStamped) -> Twist: + return chase(image, pose) + +blueprint.add(Follower, expected_hz={"image": 30, "pose": 50}, min_output_hz=10) +``` + +Two deployment choices matter: + +- **The store.** Default is a `NullStore` — inputs/outputs behave as + live-only streams. 
Swap in a `SqliteStore` and the running module + records every input and output as a side effect; develop against that + recording with `over()` the next morning. +- **The backpressure policy.** `KeepLast()` (default) gives controller + semantics: a slow step always sees the freshest tick and skipped ticks + are *counted, not warned about* — for a 30 fps camera and a 100 ms + step, dropping two thirds of ticks is the system working as designed. + `Unbounded()` gives recorder semantics: never drop. + +## Contracts, not log spam + +Health is judged against declared contracts — input rates, output rate — +and reported as state transitions, not per-drop warnings. The monitor is +plain bookkeeping, so we can drive a fake deployment right here with a +fake clock: + +```python session=pure ansi=false +from dimos.memory2.health import HealthMonitor + +t = 0.0 +monitor = HealthMonitor("follower", min_output_hz=10.0, warmup_s=0.0, + interval_s=1.0, clock=lambda: t) + +for _ in range(3): # a bad second: only 3 outputs against a 10 Hz contract + monitor.on_step(duration_s=0.12, ages={}, emitted=True) +t += 1.0 +health = monitor.maybe_report() +print(health.state, "-", health.violations[0]) + +for _ in range(15): # a good second + monitor.on_step(duration_s=0.05, ages={}, emitted=True) +t += 1.0 +print(monitor.maybe_report().state) +``` + +```results +DEGRADED - output 3.0 Hz < contract 10 Hz +OK +``` + +Deployed, those same snapshots append to a `_health` stream in the module +store every second (drop rates by reason, step p50/p99, input staleness), +transitions log once with the violated contract, and a recording captures +the health stream *next to the data it explains* — so a post-incident +notebook can plot the drop ratio against the very frames that were +dropped. + +## Where next + +- [Pure modules reference](/dimos/memory2/puremodule.md) — the full + declaration language, binding rules, backpressure and health design. 
+- [Memory intro](/dimos/memory2/intro.md) — the stream API `over()` + returns. +- [Temporal alignment](/docs/usage/data_streams/temporal_alignment.md) — + the rx-level alignment this generalizes. From 2b855f5f5d37a85f02236d4b3472e4395f9ceb82 Mon Sep 17 00:00:00 2001 From: Ivan Nikolic Date: Thu, 11 Jun 2026 19:26:53 +0800 Subject: [PATCH 03/10] experiment: marker detection as a PureModule, parallel to the stream module MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit MarkerDetectionPureModule sits next to MarkerDetectionStreamModule (untouched): the per-frame TF lookup becomes a declared interpolate(tolerance=0.5) camera_pose input, QualityWindow/SpeedLimit config knobs become upstream stream composition, and the emit_empty_frames sentinel plumbing reduces to step() returning the (possibly empty) Detection3DArray. smoothing_window is deliberately out of scope — it is recurrent state and maps to an explicit Mealy state parameter. Also: PureModule.offline() now returns Self. 
--- dimos/memory2/puremodule.py | 8 +- .../fiducial/marker_detection_pure_module.py | 170 ++++++++++++++++++ .../test_marker_detection_pure_module.py | 137 ++++++++++++++ 3 files changed, 314 insertions(+), 1 deletion(-) create mode 100644 dimos/perception/fiducial/marker_detection_pure_module.py create mode 100644 dimos/perception/fiducial/test_marker_detection_pure_module.py diff --git a/dimos/memory2/puremodule.py b/dimos/memory2/puremodule.py index 91954eb0a0..79fc87d4aa 100644 --- a/dimos/memory2/puremodule.py +++ b/dimos/memory2/puremodule.py @@ -75,12 +75,18 @@ def step(self, image: Image, pose: PoseStamped, imu: Imu | None) -> Twist: from dataclasses import dataclass import inspect import queue +import sys import threading import time import types import typing from typing import TYPE_CHECKING, Any, Union, get_args, get_origin, get_type_hints +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self + from pydantic import Field from reactivex.disposable import Disposable @@ -389,7 +395,7 @@ def _invoke( # -- offline ------------------------------------------------------------------ @classmethod - def offline(cls, **config: Any) -> PureModule: + def offline(cls, **config: Any) -> Self: """Construct without module machinery (no event loop, RPC, or ports). Enough of an instance to run :meth:`over` — ``self.config`` works, diff --git a/dimos/perception/fiducial/marker_detection_pure_module.py b/dimos/perception/fiducial/marker_detection_pure_module.py new file mode 100644 index 0000000000..7cd01a6ea8 --- /dev/null +++ b/dimos/perception/fiducial/marker_detection_pure_module.py @@ -0,0 +1,170 @@ +# Copyright 2026 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Marker detection as a :class:`~dimos.memory2.puremodule.PureModule`. + +Experimental parallel of :class:`MarkerDetectionStreamModule` — the +original is untouched; this file is the same job re-expressed in the pure +declaration language: + +- **Camera pose is an input, not a TF lookup.** The stream module calls + ``self.tf.get(world, optical, time_point=ts, tolerance=0.5)`` per frame; + here that is ``camera_pose: In[PoseStamped] = interpolate(tolerance=0.5)`` + — the alignment runtime produces the camera-in-world pose *at the + frame's capture time*, live or from a recording, and the detection logic + never knows TF exists. +- **Frame gating is composition, not configuration.** ``QualityWindow`` / + ``SpeedLimit`` knobs from the stream module's config are not knobs here: + gate upstream instead, e.g. + ``module.over(color_image=imgs.transform(QualityWindow(...)), ...)`` + offline, or chain a gating module in front when deployed. +- **Empty frames need no sentinel plumbing.** The stream module threads an + ``emit_empty_frames`` flag plus ``MarkersPerFrame`` bookkeeping to emit + one array per processed frame; here ``step`` simply returns the + (possibly empty) ``Detection3DArray``. + +Knowingly out of scope: ``smoothing_window`` pose averaging — that is +recurrent state (per-marker sliding buffers + track ids) and belongs in an +explicit Mealy ``state`` parameter; left out of this experiment to keep +the core comparable. The intrinsics→OpenCV calibration cache kept on the +instance is memoization of config, not behavioral state. 
+""" + +from __future__ import annotations + +from typing import Any + +import numpy as np +from pydantic import Field + +from dimos.core.stream import In, Out +from dimos.memory2.puremodule import PureModule, PureModuleConfig, interpolate, tick +from dimos.msgs.geometry_msgs.PoseStamped import PoseStamped +from dimos.msgs.geometry_msgs.Quaternion import Quaternion +from dimos.msgs.geometry_msgs.Transform import Transform +from dimos.msgs.geometry_msgs.Vector3 import Vector3 +from dimos.msgs.sensor_msgs.CameraInfo import CameraInfo +from dimos.msgs.sensor_msgs.Image import Image +from dimos.msgs.vision_msgs.Detection3DArray import Detection3DArray +from dimos.perception.detection.type.detection3d.imageDetections3D import ImageDetections3D +from dimos.perception.fiducial.marker_detect import detect_markers_in_image +from dimos.perception.fiducial.marker_pose import ( + camera_info_to_cv_matrices, + camera_optical_frame_id, + create_aruco_detector, + is_fisheye_model, +) +from dimos.perception.fiducial.marker_transformer import _camera_info_key +from dimos.utils.logging_config import setup_logger + +logger = setup_logger() + + +class MarkerDetectionPureModuleConfig(PureModuleConfig): + """Configuration for :class:`MarkerDetectionPureModule`.""" + + world_frame: str = "world" + aruco_dictionary: str = "DICT_APRILTAG_36h11" + marker_length_m: float = Field( + ..., gt=0.0, description="Physical square marker edge length in meters." 
+ ) + camera_info: CameraInfo | None = None + + +class MarkerDetectionPureModule(PureModule): + """One marker-detection pass per camera frame, posed via aligned inputs.""" + + config: MarkerDetectionPureModuleConfig + + color_image: In[Image] = tick() + camera_pose: In[PoseStamped] = interpolate(tolerance=0.5) + """World ← camera-optical pose; interpolated to each frame's capture time.""" + + detections: Out[Detection3DArray] + + # Lazy per-intrinsics calibration cache (class defaults: `offline()` + # constructs instances without running __init__). + _calibration: tuple[Any, np.ndarray, np.ndarray] | None = None + _calibration_key: tuple[Any, ...] | None = None + _warned_distortion: bool = False + + def _resolve_calibration(self, info: CameraInfo) -> tuple[Any, np.ndarray, np.ndarray]: + key = _camera_info_key(info) + if key != self._calibration_key or self._calibration is None: + model = (info.distortion_model or "").strip().lower() + if model not in ("", "plumb_bob") and not is_fisheye_model(model): + if not self._warned_distortion: + logger.warning( + "MarkerDetectionPureModule: distortion_model=%r may be unsupported; " + "using D as-is.", + info.distortion_model, + ) + self._warned_distortion = True + camera_matrix, dist_coeffs = camera_info_to_cv_matrices(info) + detector = create_aruco_detector(self.config.aruco_dictionary) + self._calibration = (detector, camera_matrix, dist_coeffs) + self._calibration_key = key + return self._calibration + + def step( + self, color_image: Image, camera_pose: PoseStamped, ts: float + ) -> Detection3DArray | None: + info = self.config.camera_info + if info is None: + logger.debug("MarkerDetectionPureModule: no CameraInfo configured; skipping frame") + return None + if ( + info.width + and info.height + and (color_image.width != info.width or color_image.height != info.height) + ): + logger.debug( + "MarkerDetectionPureModule: image %sx%s != CameraInfo %sx%s; skipping frame", + color_image.width, + color_image.height, + 
info.width, + info.height, + ) + return None + + detector, camera_matrix, dist_coeffs = self._resolve_calibration(info) + world_t_optical = Transform( + translation=Vector3( + camera_pose.position.x, camera_pose.position.y, camera_pose.position.z + ), + rotation=Quaternion( + camera_pose.orientation.x, + camera_pose.orientation.y, + camera_pose.orientation.z, + camera_pose.orientation.w, + ), + frame_id=self.config.world_frame, + child_frame_id=camera_optical_frame_id(color_image, info), + ts=ts, + ) + + markers = detect_markers_in_image( + color_image, + camera_info=info, + world_T_optical=world_t_optical, + marker_length_m=self.config.marker_length_m, + aruco_dictionary=self.config.aruco_dictionary, + world_frame=self.config.world_frame, + detector=detector, + camera_matrix=camera_matrix, + dist_coeffs=dist_coeffs, + ) + return ImageDetections3D(color_image, markers).to_ros_detection3d_array( + frame_id=self.config.world_frame + ) diff --git a/dimos/perception/fiducial/test_marker_detection_pure_module.py b/dimos/perception/fiducial/test_marker_detection_pure_module.py new file mode 100644 index 0000000000..0b022499d1 --- /dev/null +++ b/dimos/perception/fiducial/test_marker_detection_pure_module.py @@ -0,0 +1,137 @@ +# Copyright 2026 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import annotations + +from typing import TYPE_CHECKING + +import pytest + +pytest.importorskip("cv2.aruco") + +from dimos.memory2.store.memory import MemoryStore +from dimos.memory2.tick import Interpolate, Tick +from dimos.memory2.transform import QualityWindow +from dimos.msgs.geometry_msgs.PoseStamped import PoseStamped +from dimos.msgs.sensor_msgs.Image import Image +from dimos.perception.fiducial.marker_detection_pure_module import MarkerDetectionPureModule +from dimos.perception.fiducial.test_helpers import blank_image, camera_info, synthetic_marker_image + +if TYPE_CHECKING: + from collections.abc import Iterator + + from dimos.memory2.stream import Stream + +MARKER_LENGTH_M = 0.18 + + +def identity_pose(ts: float) -> PoseStamped: + return PoseStamped(ts=ts, position=(0.0, 0.0, 0.0), orientation=(0.0, 0.0, 0.0, 1.0)) + + +@pytest.fixture +def store() -> Iterator[MemoryStore]: + with MemoryStore() as s: + yield s + + +def fill_poses( + store: MemoryStore, start: float, stop: float, hz: float = 25.0 +) -> Stream[PoseStamped]: + poses = store.stream("camera_pose", PoseStamped) + t = start + while t <= stop: + poses.append(identity_pose(t), ts=t) + t += 1.0 / hz + return poses + + +def module() -> MarkerDetectionPureModule: + return MarkerDetectionPureModule.offline( + marker_length_m=MARKER_LENGTH_M, camera_info=camera_info() + ) + + +def test_plan_declares_tick_and_interpolated_pose() -> None: + plan = MarkerDetectionPureModule._plan() + assert plan.trigger == "color_image" + assert isinstance(plan.samplers["camera_pose"], Interpolate) + assert plan.samplers["camera_pose"].tolerance == 0.5 # parity: tf_lookup_tolerance + assert set(plan.outs) == {"detections"} + assert not isinstance(plan.samplers.get("color_image"), Tick) # trigger isn't sampled + + +def test_detects_marker_and_emits_empty_frames(store: MemoryStore) -> None: + images = store.stream("color_image", Image) + marker_image = synthetic_marker_image(7, ts=10.0) + 
images.append(marker_image, ts=10.0) + images.append(blank_image(ts=11.0), ts=11.0) + poses = fill_poses(store, 9.9, 11.1) + + out = [o.data for o in module().over(color_image=images, camera_pose=poses).to_list()] + + assert len(out) == 2 # one array per frame, empty frames included + assert out[0].detections_length == 1 + assert out[0].detections[0].id == "7" + assert out[0].detections[0].results[0].hypothesis.class_id == "DICT_APRILTAG_36h11:7" + assert out[0].detections[0].bbox.size.x == pytest.approx(MARKER_LENGTH_M) + assert out[0].header.frame_id == "world" + assert out[1].detections_length == 0 + assert out[1].detections == [] + + +def test_quality_gating_composes_upstream(store: MemoryStore) -> None: + """The stream module's QualityWindow config knob becomes stream composition.""" + images = store.stream("color_image", Image) + images.append(blank_image(ts=10.0), ts=10.0) # featureless -> low sharpness + images.append(synthetic_marker_image(7, ts=10.5), ts=10.5) # edges -> sharp + poses = fill_poses(store, 9.9, 11.0) + + gated: Stream[Image] = images.transform(QualityWindow(lambda img: img.sharpness, window=2.0)) + out = module().over(color_image=gated, camera_pose=poses).to_list() + + assert len(out) == 1 # only the best frame in the window ticked + assert out[0].data.detections_length == 1 + + +def test_frame_without_camera_pose_is_dropped(store: MemoryStore) -> None: + images = store.stream("color_image", Image) + images.append(synthetic_marker_image(7, ts=10.0), ts=10.0) + images.append(blank_image(ts=20.0), ts=20.0) # far outside pose coverage + poses = fill_poses(store, 9.9, 10.1) + + out = module().over(color_image=images, camera_pose=poses).to_list() + + assert [o.ts for o in out] == [10.0] # unposed frame dropped, not mislocated + + with pytest.raises(ValueError, match="missing required inputs"): + module().over(_strict=True, color_image=images, camera_pose=poses).to_list() + + +def test_without_camera_info_emits_nothing(store: MemoryStore) -> 
None: + images = store.stream("color_image", Image) + images.append(synthetic_marker_image(7, ts=10.0), ts=10.0) + poses = fill_poses(store, 9.9, 10.1) + + bare = MarkerDetectionPureModule.offline(marker_length_m=MARKER_LENGTH_M) + assert bare.over(color_image=images, camera_pose=poses).to_list() == [] + + +def test_step_is_directly_callable() -> None: + """The pure core needs no streams, store, or ports at all.""" + m = module() + result = m.step(synthetic_marker_image(7, ts=10.0), identity_pose(10.0), ts=10.0) + assert result is not None + assert result.detections_length == 1 + assert result.detections[0].id == "7" From 012a8885f50e15c4706cc978b556a7400b27e217 Mon Sep 17 00:00:00 2001 From: Ivan Nikolic Date: Thu, 11 Jun 2026 20:46:21 +0800 Subject: [PATCH 04/10] docs: document multi-output pure modules MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Runnable Navigator example in the gentle intro (two Out ports, partial emission, slicing one output back out of the offline dict rows) and a proper Outputs section in the reference covering single/multi/None/ partial semantics, where outputs land per mode, and the dict-row vs per-port asymmetry as a known open point. Also fixes a map_data misuse hiding in a skip block — the callable receives the observation, not data. --- dimos/memory2/puremodule.md | 21 +++++++++++++--- docs/usage/pure_modules.md | 48 +++++++++++++++++++++++++++++++++++++ 2 files changed, 66 insertions(+), 3 deletions(-) diff --git a/dimos/memory2/puremodule.md b/dimos/memory2/puremodule.md index e83782e912..901bb0a759 100644 --- a/dimos/memory2/puremodule.md +++ b/dimos/memory2/puremodule.md @@ -72,8 +72,23 @@ Your "at what state do we call the module" examples translate as: - reserved names: `ts: float` is the tick time; `state` (first parameter) makes the module a Mealy machine. 
-Outputs: one `Out` port — return the value, or `None` to emit nothing -(ticks double as filters); several — return `{port_name: value}`. +## Outputs + +- **One `Out` port** — `step` returns the value; returning `None` emits + nothing, so ticks double as filters. +- **Several `Out` ports** — return `{port_name: value}`. A *partial* dict + is allowed (omitted ports stay quiet that tick — e.g. a command every + tick, an alert occasionally); unknown keys raise `TypeError`. +- **No `Out` ports** — the return value is ignored. + +Where outputs land differs by mode: **live**, every dict entry publishes +to its own port (and is appended to that port's stream in the module +store); **offline**, `over()` yields one observation per tick — the bare +value for single-output modules, the `{port: value}` dict for +multi-output ones (slice one output back out with +`.filter(lambda o: "alerts" in o.data).map_data(lambda o: o.data["alerts"])`). +That dict-row asymmetry is a known open point — the planned alternative +is a run handle exposing one store-backed stream per output. ## Offline: develop on recorded memory @@ -89,7 +104,7 @@ out = Follower.over(image=db.streams.image, pose=db.streams.pose, imu=db.streams.imu) out.to_list() # run it -out.map_data(lambda t: t.linear.x).to_list() # poke at results +out.map_data(lambda o: o.data.linear.x).to_list() # poke at results out.save(db.stream("cmd_vel_v2")).drain() # or persist them ``` diff --git a/docs/usage/pure_modules.md b/docs/usage/pure_modules.md index 56ed8f8d96..e128b0e0c3 100644 --- a/docs/usage/pure_modules.md +++ b/docs/usage/pure_modules.md @@ -234,6 +234,54 @@ value rather than attributes on `self`, a snapshot of it *is* the module's full resume point — which is what makes restarts, migration, and time-rewind mechanical. +## Multiple outputs + +Declare several `Out` ports and return a dict keyed by port name. 
Emitting +a **subset** is allowed — each port fires independently, so a module can +publish a command every tick and an alert only sometimes: + +```python session=pure ansi=false +class Navigator(PureModule): + pose: In[Pose] = tick() + + cmd: Out[str] + alerts: Out[str] + + def step(self, pose: Pose, ts: float) -> dict: + outs = {"cmd": f"forward x={pose.position.x:.2f}"} + if pose.position.x > 1.8: + outs["alerts"] = f"approaching boundary at t={ts:.2f}" + return outs + +rows = Navigator.over(pose=pose).to_list() +fired = [r for r in rows if "alerts" in r.data] +print(f"{len(rows)} ticks; alerts on {len(fired)}; last: {rows[-1].data}") +``` + +```results +51 ticks; alerts on 5; last: {'cmd': 'forward x=2.00', 'alerts': 'approaching boundary at t=2.00'} +``` + +The rules: one `Out` port → return the bare value (`None` emits nothing); +several → return `{port: value}` (missing keys stay quiet, unknown keys +raise). Deployed live, each entry publishes to its own port. Offline, +`over()` yields one observation per tick whose data is the dict — slice a +single output back out with a filter and a map: + +```python session=pure ansi=false +boundary_alerts = Navigator.over(pose=pose) \ + .filter(lambda o: "alerts" in o.data) \ + .map_data(lambda o: o.data["alerts"]) +print(boundary_alerts.last().data) +``` + +```results +approaching boundary at t=2.00 +``` + +(That dict-row shape is a known asymmetry with the live per-port streams — +a run handle with one stream per output is on the design table.) 
+ ## Modules compose like streams A module's output stream is a normal memory2 stream, so modules chain into From daca1d028281f4c3a64fb548677f5fb10766a034 Mon Sep 17 00:00:00 2001 From: Ivan Nikolic Date: Thu, 11 Jun 2026 20:59:00 +0800 Subject: [PATCH 05/10] docs: consolidate pure modules into one user doc + design notes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit docs/usage/pure_modules.md is now the single user-facing document: runnable tutorial followed by a Reference part with the exact rules (declaration/binding, alignment semantics, outputs, over(), deployment knobs, health). dimos/memory2/puremodule.md shrinks to design notes — the why behind ticks/backpressure/health, replay fidelity under drops, the state persistence journal plan, the run-handle plan, and the deferred list. No more duplicated tables; architecture.md links both. --- dimos/memory2/architecture.md | 3 +- dimos/memory2/puremodule.md | 355 ++++++++++++---------------------- docs/usage/pure_modules.md | 128 +++++++++++- 3 files changed, 246 insertions(+), 240 deletions(-) diff --git a/dimos/memory2/architecture.md b/dimos/memory2/architecture.md index 7135cfc5ce..052d4b1d81 100644 --- a/dimos/memory2/architecture.md +++ b/dimos/memory2/architecture.md @@ -33,7 +33,8 @@ Supporting Systems: | `buffer.py` | Backpressure buffers for live mode (KeepLast, Bounded, Unbounded) | | `embed.py` | EmbedImages / EmbedText transformers | | `tick.py` | Tick assembly — samplers (tick/latest/interpolate/window) + TickMachine | -| `puremodule.py`| PureModule — pure `step` over aligned inputs; live ports or offline `over()` ([docs](puremodule.md)) | +| `puremodule.py`| PureModule — pure `step` over aligned inputs; live ports or offline `over()` ([usage](/docs/usage/pure_modules.md), [design notes](puremodule.md)) | +| `health.py` | HealthMonitor — drop counters, contracts, `_health` stream for PureModule | ## Subpackages diff --git a/dimos/memory2/puremodule.md 
b/dimos/memory2/puremodule.md index 901bb0a759..4bdfe93ffb 100644 --- a/dimos/memory2/puremodule.md +++ b/dimos/memory2/puremodule.md @@ -1,237 +1,130 @@ -# Pure Modules - -> New here? Start with the -> [gentle, runnable introduction](/docs/usage/pure_modules.md) — this page -> is the reference. - -A `PureModule` separates a module into two declarations and one pure function: - -- **when it runs** — one input marked `tick()` fires the ticks; -- **how every other input is sampled at that moment** — `latest()`, - `interpolate()`, `window()`; -- **what it computes** — `step()`, a pure function of the aligned inputs - (and, optionally, an explicit recurrent state). - -Because `step` never touches ports, threads, or `self`-state, the same class -runs **live** on pubsub ports and **offline** over stored memory2 streams — -and can't tell the difference. That property is what buys replay, -time-travel debugging, restarts that resume where they left off, migration -across processes/machines, and parallel execution. - -```python skip -from dimos.core.stream import In, Out -from dimos.memory2.puremodule import PureModule, tick, interpolate, latest - -class Follower(PureModule): - image: In[Image] = tick() # 30 fps -> 30 ticks/s - pose: In[PoseStamped] = interpolate() # 50 Hz, slerped to frame time - imu: In[Imu] = latest(max_age=0.1) # newest, None if stale - - cmd_vel: Out[Twist] - - def step(self, image: Image, pose: PoseStamped, imu: Imu | None) -> Twist: - return chase(image, pose) -``` - -## The alignment language - -Sensors don't share a clock: cameras run at 30 fps, odometry at 50 Hz, IMUs -at 200 Hz. "Call the module with an image and *the pose at image time*" -requires a policy, and the policy is the whole declaration: - -| Sampler | Value at tick time `t` | Delays the tick? 
| -|---|---|---| -| `tick()` | the observation that fired the tick | — (it *is* the tick) | -| `latest(max_age=None)` | newest obs with `ts <= t`; missing if older than `max_age` | never | -| `interpolate(tolerance=0.5)` | lerp/slerp between the obs bracketing `t` | live: until the next obs arrives (~one sample period) | -| `window(seconds)` | list of obs in `(t - seconds, t]` | never | - -An `In` port with no sampler defaults to `latest()`. `interpolate()` -understands numbers, `Pose`, and `PoseStamped` (position lerp + quaternion -slerp; observation poses are interpolated too); other types degrade to -nearest-neighbor. When no bracket exists (stream ended, tick before the -first sample) it falls back to the nearest observation within `tolerance`. - -Your "at what state do we call the module" examples translate as: - -| Intent | Declaration | -|---|---| -| tick on every image, poses interpolated | `image = tick()`, `pose = interpolate()` | -| tick on every pose, image latest-or-None | `pose = tick()`, `image = latest()`, param `image: Image \| None` | -| tick on every image, all IMU since 100ms before | `imu = window(0.1)`, param `imu: list[Imu]` | - -## Binding rules - -`step` parameters bind to inputs **by name**; the annotation picks the shape: - -- `Image` → `obs.data`; `Observation[Image]` → the full observation - (`ts`, `pose`, `tags`); -- `X | None` → missing becomes `None`; missing on a non-optional - parameter **drops the tick**; -- `window()` inputs: `list[X]` or `list[Observation[X]]`; -- reserved names: `ts: float` is the tick time; `state` (first parameter) - makes the module a Mealy machine. - -## Outputs - -- **One `Out` port** — `step` returns the value; returning `None` emits - nothing, so ticks double as filters. -- **Several `Out` ports** — return `{port_name: value}`. A *partial* dict - is allowed (omitted ports stay quiet that tick — e.g. a command every - tick, an alert occasionally); unknown keys raise `TypeError`. 
-- **No `Out` ports** — the return value is ignored. - -Where outputs land differs by mode: **live**, every dict entry publishes -to its own port (and is appended to that port's stream in the module -store); **offline**, `over()` yields one observation per tick — the bare -value for single-output modules, the `{port: value}` dict for -multi-output ones (slice one output back out with -`.filter(lambda o: "alerts" in o.data).map_data(lambda o: o.data["alerts"])`). -That dict-row asymmetry is a known open point — the planned alternative -is a run handle exposing one store-backed stream per output. - -## Offline: develop on recorded memory - -Record a session (any storage-backed store), then iterate on the module in -a notebook — no LCM, no processes, deterministic: - -```python skip -from dimos.memory2.store.sqlite import SqliteStore - -db = SqliteStore(path="walk_2026_06_11.db") - -out = Follower.over(image=db.streams.image, pose=db.streams.pose, - imu=db.streams.imu) - -out.to_list() # run it -out.map_data(lambda o: o.data.linear.x).to_list() # poke at results -out.save(db.stream("cmd_vel_v2")).drain() # or persist them -``` - -`over()` composes with the whole stream API — replay a slice with -`db.streams.image.after(t0).before(t1)`, downsample, quality-filter, etc. -Called on the class, `over` needs no module machinery at all -(`Follower.offline(some_config=...)` when the step reads `self.config`). -Don't pass `.live()` streams to `over()` — deploy the module for that. - -Offline alignment is *exact*: events are merged in timestamp order, so a -run over the same recording produces the same ticks every time. - -## Live: the same class on ports - -Deployed in a blueprint, each `In` port feeds a memory2 stream in the -module's store and ticks run on a worker thread; outputs publish to the -`Out` ports. The store is a `NullStore` by default — inputs behave as -live-only streams (`.map`/`.transform`/`.live()` work; history and search -are empty). 
Override `make_store()` to return a `SqliteStore` and the -module records **every input and output** while it runs — recording is a -deployment choice, not module code. (This is the path to subsuming -`Recorder`: a recorder is a PureModule deployment with a storage-backed -store and no step.) - -Live alignment is best-effort: observations are timestamped on arrival -(`msg.ts` when present) and slight cross-stream jitter is tolerated; -`interpolate()` inputs add one sample period of latency to each tick. +# Pure Modules — design notes + +Usage, tutorial, and the exact rules live in +[docs/usage/pure_modules.md](/docs/usage/pure_modules.md). This page +records the *why* — the reasoning behind the design decisions and the +plans for what isn't built yet. Read it before changing the +implementation in [puremodule.py](puremodule.py) / [tick.py](tick.py) / +[health.py](health.py). + +## Why ticks + +If modules are stateless — or their state is fed externally, react/redux +style — replay, time-travel, live migration, restarts that resume, and +parallel execution stop being features built into each module and become +properties of the runtime. The blocker for robotics is that "call the +module on its inputs" is ill-posed when sensors don't share a clock: the +declaration must say *when* the module runs (the tick) and *how* every +other input is sampled at that moment (`latest` / `interpolate` / +`window`). The sampler language is the smallest vocabulary we found that +covers the real cases (pose-at-image-time, hold-with-expiry, IMU +batching); `every(hz)` clock ticks and multi-input triggers are deferred +until a concrete module needs them. + +One machine drives both modes: the `TickMachine` is a plain +events-in/rows-out state machine, fed from a timestamp-ordered merge +offline (exact, deterministic) and from an arrival-ordered queue live +(best-effort under jitter). Keeping it free of threads and streams is +what makes alignment unit-testable. 
## Backpressure: the tick is the unit of load -The system has two regimes, and the store converts between them: - -- **Pull (offline / stored streams)** — backpressure is intrinsic. The - consumer's iteration is the clock; a chained pipeline computes one tick - at a time and nothing accumulates beyond the (pruned) alignment buffers. -- **Push (live ports)** — sensors can't be paused, so backpressure is a - declared *drop/coalesce policy*, and the natural unit is the tick: - secondaries are cheap to ingest, all the expense is `step()`. - -Live, resolved ticks flow through a `BackpressureBuffer` between the -alignment thread and the step thread: - -```python skip -class Follower(PureModule): - backpressure = KeepLast() # default — controller semantics - # backpressure = Unbounded() # recorder/indexer semantics: never drop - # backpressure = Bounded(8) # bounded queue, drops oldest -``` - -`KeepLast` means a slow step always processes the *freshest* tick and the -skipped ones are counted — for a 30 fps camera and a 100 ms step, -dropping ~2/3 of ticks is the system working as designed. Every queue in -the path is bounded: the tick buffer by policy, alignment buffers by -pruning, and ticks waiting for interpolation brackets by -`max_pending_ticks` (config, default 64) so a dead `interpolate()` input -can't accumulate ticks forever (evictions count as `drops_blocked`). - -One honest consequence: with drops, a live run processes a *subsample* of -triggers, so replaying raw inputs offline (which processes all of them) -diverges for stateful modules. Exact replay-of-a-run requires recording -the resolved tick rows — designed next step, not built. +The system has two regimes, and the store converts between them. Pull +(offline): backpressure is intrinsic — the consumer's iteration is the +clock and nothing accumulates beyond pruned alignment buffers. 
Push +(live): sensors can't be paused, so backpressure must be a declared +drop/coalesce policy, and the tick is the right unit — secondaries are +cheap to ingest, all the expense is `step()`. Hence the +`BackpressureBuffer` between the alignment thread and the step thread, +speaking the existing `buffer.py` vocabulary rather than inventing one. + +The invariant to preserve when changing the live path: **every queue is +bounded** — the tick buffer by policy, alignment buffers by pruning +(including the dead-trigger case, 1 s arrival-jitter slack), pending +ticks by `max_pending_ticks` (a dead `interpolate()` input must not +accumulate ticks), and the monitor's reservoirs by fixed-size deques. ## Health: drops are metrics, not errors -Per-drop warnings at sensor rate are noise. The module follows the -mature ladder instead — count always, report continuously, log on -transitions, alert on *contracts*: - -- **Counters** (always): ticks resolved/stepped, drops by reason - (`backpressure`, `missing_input`, `blocked`), step p50/p99, per-input - observed Hz, age of consumed `latest()` values, output rates. -- **`_health` stream**: an aggregated snapshot every - `health_interval_s` (1 s) appended to the module store — live-only on a - NullStore, *recorded next to the data it explains* on a SqliteStore, so - a post-incident notebook plots drop ratio against the very frames that - were dropped. -- **Contracts** are split deliberately: semantic tolerances live in the - declaration (`latest(max_age=…)`, `interpolate(tolerance=…)`); *rates* - live in deployment config (`expected_hz={"pose": 50}`, - `min_output_hz=10`) because sim, replay, and the robot differ. 
-- **Messages**: one warmup line after `health_warmup_s` comparing - observed input rates to expectations ("pose 12.1 Hz (expected 50 — - LOW)"); one WARN on entering `DEGRADED`/`STALLED` naming the violated - contracts; a throttled reminder every `unhealthy_log_every_s` while - unhealthy; one INFO on recovery with the outage duration. Stalls - (`STALLED`) distinguish "ticks queued but none stepped (step stuck?)" - from "inputs flowing but no ticks resolving (interpolate input dead?)" - and must persist `stall_after_s` before firing — a single slow step is - not a stall. - -The real SLO is output freshness and rate, not drop count — alert on the -contract, read the drop counters to diagnose *why*. Offline, `over()` -logs a per-field drop summary, and `_strict=True` raises on the first -drop instead (replay determinism tests should fail loudly). - -## State, explicitly - -If a module needs recurrence (gait phase, filters, RNN hidden state), -declare it — don't hide it in `self`: - -```python skip -class GaitController(PureModule): - pose: In[PoseStamped] = tick() - cmd_vel: Out[Twist] - - initial_state = GaitState(phase=0.0) - - def step(self, state: GaitState, pose: PoseStamped) -> tuple[GaitState, Twist]: - ... - return new_state, twist -``` - -The runtime threads the state through the ticks (it's `scan` over the tick -stream). Because state is a value, snapshotting it per tick gives -time-rewind and live migration — the snapshot stream is the designed next -step, not yet built. - -## Not yet designed / deliberately deferred - -- `every(hz)` clock triggers and multi-input triggers (`on_any`) — only - one `tick()` input for now. -- State snapshot streams (time-travel, suspend/revive, migration) — the - Mealy form is the hook; persistence isn't wired yet. -- A live timeout policy for `interpolate()` when its input dies (currently - ticks wait; on shutdown they resolve via the nearest-fallback). 
-- Modules that *query* memory (semantic search) — that's an impure - capability and stays on `MemoryModule` for now. -- `Annotated[In[X], sampler]` syntax — rejected for now because core - `Module` introspection doesn't unwrap `Annotated`; the default-value - syntax is canonical. +Under `KeepLast` a controller dropping most ticks is the system working +as designed, so per-drop warnings are categorically wrong. The ladder: +count always (by reason — `backpressure`, `missing_input`, `blocked` are +three different problems: slow step, dead sensor, clock skew), report +continuously (the `_health` stream rides the same store as the data, so +recordings capture health next to the frames it explains), log on state +transitions only, alert on declared contracts. The real SLO is output +freshness and rate; drop counters are diagnosis. Contracts split +deliberately: semantic tolerances (`max_age`, `tolerance`) belong in the +declaration because they're algorithm truths; rates (`expected_hz`, +`min_output_hz`) belong in deployment config because sim, replay, and +the robot legitimately differ. + +## Replay fidelity under drops (planned: record tick rows) + +With a dropping policy, a live run processes a *subsample* of triggers, +so replaying raw inputs offline (which processes all of them) diverges +for stateful modules. The fix is to record the **resolved tick rows** — +the aligned inputs actually consumed — making replay-of-a-run exact by +construction, drops and all. This is the prerequisite for trusting +time-travel on stateful modules and should land before production +relies on them. + +## State persistence (planned: the journal design) + +Today, Mealy state lives in a loop variable — initialized from +`initial_state`, threaded by the runtime, gone when the run ends. +Deliberately not on `self` (concurrent `over()` runs stay independent), +deliberately not yet persisted. 
The plan: + +- **Snapshots are a stream.** The runtime appends post-tick state to a + `_state` stream in the module store (like `_health`), on a cadence + policy — every tick for small states, every N seconds for big ones, + on-stop minimum. Store choice = persistence policy; codecs, ts + indexing, and replay tooling already exist. +- **The DB is a journal, not the hot path.** The working copy stays in + memory; appends are write-through; reads happen only at start or seek. + No round-trips inside a control loop. +- **What it buys**: resume (`start()` loads `_state.last()` under a + `resume` config), migration (the snapshot is a value in a file), + time-travel (snapshots are checkpoints, the tick log is the WAL — + `state = fold(step, ticks)`, seek = load snapshot ≤ T + replay), + counterfactual debugging (replay from a snapshot with edited inputs or + edited step code). `state` is a reserved input name, so + `over(state=snapshot, pose=db.pose.after(t0))` is collision-free. +- **Contract on state values**: plain serializable data (dataclass / + LCM message / numpy — an LCM-typed state gets cross-language replay), + treated as immutable (`step` returns new state; serializing at append + time is the aliasing fix), sized for its cadence. +- **Endgame**: keyed state (e.g. per-marker buffers) shards — the + runtime partitions ticks by key across processes, each owning a + shard. Only possible because state is a declared value the runtime + owns. + +## Multi-output offline shape (planned: run handle) + +Offline, multi-output modules yield `{port: value}` dict rows while live +publishes per-port — an asymmetry. The planned fix is a run handle: +`run = M.over(..., store=...)` executes once into a store and exposes one +stream per output (`run.detections`, `run.alerts`), independently +re-iterable and queryable. 
Materializing through a store also makes +offline structurally identical to live (both are "module + store") and is +the substrate a future module-graph would build on. The lazy dict-row +form stays for single-pass pipelines. + +## Deliberately deferred + +- `every(hz)` clock triggers and multi-input triggers (`on_any`). +- A live timeout policy for `interpolate()` when its input dies + (currently ticks wait until evicted; shutdown resolves via the + nearest-fallback). +- Live-side input gating — offline, gating composes onto the input + stream (`over(color_image=imgs.transform(QualityWindow(...)))`); live + has no per-port hook yet (chain a gating module). Possibly + `tick(via=...)`. +- Modules that *query* memory (semantic search) — impure capability, + stays on `MemoryModule`. +- `Annotated[In[X], sampler]` syntax — core `Module` introspection + doesn't unwrap `Annotated`, so ports would silently not be created; + the default-value syntax is canonical. +- `Recorder` subsumption — a recorder is a PureModule deployment with a + storage-backed store and no step; fold once the API is stable. diff --git a/docs/usage/pure_modules.md b/docs/usage/pure_modules.md index e128b0e0c3..cd52797fd7 100644 --- a/docs/usage/pure_modules.md +++ b/docs/usage/pure_modules.md @@ -132,12 +132,12 @@ persists it, filters slice it. Nothing executed until we asked. 
## The sampler language -| Sampler | Value at tick time `t` | -|---|---| -| `tick()` | the observation that fired the tick | -| `latest(max_age=None)` | newest observation with `ts <= t` (hold), missing if stale | -| `interpolate(tolerance=0.5)` | lerp/slerp between the observations bracketing `t` | -| `window(seconds)` | every observation in `(t - seconds, t]`, as a list | +| Sampler | Value at tick time `t` | +|------------------------------|------------------------------------------------------------| +| `tick()` | the observation that fired the tick | +| `latest(max_age=None)` | newest observation with `ts <= t` (hold), missing if stale | +| `interpolate(tolerance=0.5)` | lerp/slerp between the observations bracketing `t` | +| `window(seconds)` | every observation in `(t - seconds, t]`, as a list | "At what state do we call the module" is always some combination of these. Tick on poses with the latest image instead? Swap the samplers — the step @@ -381,10 +381,122 @@ the health stream *next to the data it explains* — so a post-incident notebook can plot the drop ratio against the very frames that were dropped. +--- + +# Reference + +The tutorial above shows the feel; this part states the exact rules. + +## Declaration & binding + +One input must carry `tick()`; an `In` port with no sampler defaults to +`latest()`. Input ports may not be named `ts` or `state` (reserved). +`step` parameters bind to inputs **by name**; the annotation picks the +shape: + +- `Image` → `obs.data`; `Observation[Image]` → the full observation + (`ts`, `pose`, `tags`); +- `X | None` → missing becomes `None`; missing on a non-optional + parameter **drops the tick**; +- `window()` inputs: `list[X]` or `list[Observation[X]]`; +- reserved names: `ts: float` is the tick time; `state` (first parameter) + makes the module a Mealy machine — `step(self, state, ...)` must return + `(new_state, output)`, with the initial value from the `initial_state` + attribute. 
+ +Declarations are validated at first use: a missing/duplicate `tick()`, +a `step` parameter that matches no input, or an +`Annotated[In[X], sampler]` port (unsupported — use the default-value +syntax) all raise `TypeError`. + +## Alignment semantics + +| Sampler | Value at tick time `t` | Delays the tick? | +|------------------------------|------------------------------------------------------------|-------------------------------------------------------| +| `tick()` | the observation that fired the tick | — (it *is* the tick) | +| `latest(max_age=None)` | newest obs with `ts <= t`; missing if older than `max_age` | never | +| `interpolate(tolerance=0.5)` | lerp/slerp between the obs bracketing `t` | live: until the next obs arrives (~one sample period) | +| `window(seconds)` | list of obs in `(t - seconds, t]` | never | + +`interpolate()` understands numbers, `Pose`, and `PoseStamped` (position +lerp + quaternion slerp; observation poses are interpolated too); other +types degrade to nearest-neighbor. When no bracket exists (stream ended, +tick before the first sample) it falls back to the nearest observation +within `tolerance`, else the value is missing. + +**Offline is exact**: events are merged in timestamp order, so a run over +the same recording produces the same ticks every time. **Live is +best-effort**: observations are timestamped on arrival (`msg.ts` when +present), slight cross-stream jitter is tolerated, and `interpolate()` +inputs add one sample period of latency to each tick. + +## Outputs + +- **One `Out` port** — `step` returns the value; returning `None` emits + nothing, so ticks double as filters. +- **Several `Out` ports** — return `{port_name: value}`. A *partial* dict + is allowed (omitted ports stay quiet that tick); unknown keys raise + `TypeError`. +- **No `Out` ports** — the return value is ignored. + +Live, every dict entry publishes to its own port (and is appended to that +port's stream in the module store). 
Offline, `over()` yields one +observation per tick — the bare value for single-output modules, the +`{port: value}` dict for multi-output ones. Output observations derive +from the trigger observation (its `ts`, `pose`, `tags`). + +## Running offline: `over()` + +Pass one stream per declared input, by name; each must iterate in +ascending `ts` (stored streams in insertion order do; otherwise prepend +`.order_by("ts")`). Don't pass `.live()` streams — deploy the module for +that. Called on the class, `over` builds a machinery-free instance via +`offline()` (pass config there: `Follower.offline(gain=2.0).over(...)`). +Drops are summarized in a log line; `over(_strict=True)` raises on the +first tick dropped for missing required inputs — replay determinism tests +should fail loudly. + +## Running live: deployment knobs + +The store: `NullStore` by default — inputs/outputs behave as live-only +streams (`.map`/`.transform`/`.live()` work; history and search are +empty). Override `make_store()` to return a `SqliteStore` and the module +records **every input and output** while it runs — recording is a +deployment choice, not module code. + +Backpressure: live, resolved ticks flow through a `BackpressureBuffer` +between the alignment thread and the step thread — + +| Policy | Semantics | +|---|---| +| `KeepLast()` (default) | controller: always step the freshest tick, count the skipped | +| `Unbounded()` | recorder/indexer: never drop, memory-bounded only by consumption | +| `Bounded(n)` / `DropNew(n)` | bounded queue dropping oldest / rejecting newest | + +Every queue in the path is bounded: the tick buffer by policy, alignment +buffers by pruning, and ticks waiting for interpolation brackets by +`max_pending_ticks` (config, default 64) so a dead `interpolate()` input +can't accumulate ticks forever (evictions count as `drops_blocked`). 
+ +Health: counters always (ticks resolved/stepped, drops by reason, step +p50/p99, per-input Hz, ages of consumed `latest()` values, output rates); +a `_health` stream snapshot every `health_interval_s`; one warmup line +comparing observed input rates to `expected_hz`; transition-logged +`DEGRADED`/`STALLED` with the violated contracts, throttled reminders +every `unhealthy_log_every_s`, recovery logged with duration. Stalls must +persist `stall_after_s` and distinguish "ticks queued but none stepped +(step stuck?)" from "inputs flowing but no ticks resolving (interpolate +input dead?)". Contracts split deliberately: semantic tolerances live in +the declaration (`max_age`, `tolerance`); *rates* live in deployment +config (`expected_hz`, `min_output_hz`) because sim, replay, and the +robot differ. The real SLO is output freshness and rate — alert on the +contract, read drop counters to diagnose why. + ## Where next -- [Pure modules reference](/dimos/memory2/puremodule.md) — the full - declaration language, binding rules, backpressure and health design. +- [Design notes](/dimos/memory2/puremodule.md) — the why behind + backpressure and health, replay fidelity under drops, the state + persistence plan, and the deferred list. - [Memory intro](/dimos/memory2/intro.md) — the stream API `over()` returns. - [Temporal alignment](/docs/usage/data_streams/temporal_alignment.md) — From 359b38ff7610149fb8407f7e94f16b1b9a5504be Mon Sep 17 00:00:00 2001 From: Ivan Nikolic Date: Thu, 11 Jun 2026 21:03:59 +0800 Subject: [PATCH 06/10] docs: clarify the health monitor demo is what a module builds internally The detached HealthMonitor in the contracts section looked like it was monitoring something; say explicitly that modules construct their own from config at start() (module.health_monitor), and the demo hand-feeds one to show the messages a deployed Follower would produce. 
--- docs/usage/pure_modules.md | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/docs/usage/pure_modules.md b/docs/usage/pure_modules.md index cd52797fd7..05dd1c58d7 100644 --- a/docs/usage/pure_modules.md +++ b/docs/usage/pure_modules.md @@ -346,18 +346,24 @@ Two deployment choices matter: ## Contracts, not log spam Health is judged against declared contracts — input rates, output rate — -and reported as state transitions, not per-drop warnings. The monitor is -plain bookkeeping, so we can drive a fake deployment right here with a -fake clock: +and reported as state transitions, not per-drop warnings. You never build +a monitor yourself: when a module starts, it constructs one from its own +config (`min_output_hz`, `expected_hz`, ...) and feeds it from the tick +loop; it lives at `module.health_monitor` and its snapshots land in the +`_health` stream. But since it's plain bookkeeping with an injectable +clock, we can hand-feed a detached one right here and watch the exact +messages a deployed `Follower` would produce — the first argument is just +the module name stamped on them: ```python session=pure ansi=false from dimos.memory2.health import HealthMonitor +# what Follower's start() builds internally, driven by hand: t = 0.0 monitor = HealthMonitor("follower", min_output_hz=10.0, warmup_s=0.0, interval_s=1.0, clock=lambda: t) -for _ in range(3): # a bad second: only 3 outputs against a 10 Hz contract +for _ in range(3): # a bad second: step emitted only 3 outputs vs the 10 Hz contract monitor.on_step(duration_s=0.12, ages={}, emitted=True) t += 1.0 health = monitor.maybe_report() From 58c448f0429c3155d9a0d470fbfc9ae2df6e96f3 Mon Sep 17 00:00:00 2001 From: Ivan Nikolic Date: Fri, 12 Jun 2026 10:37:53 +0800 Subject: [PATCH 07/10] pure modules: ratio/latency contracts, tick latency metric, output writer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Health contracts gain scale-free forms: 
max_drop_ratio (the step keeps up, independent of deployment rates), max_missing_ratio (the hardcoded >50% staleness rule made configurable), and max_tick_latency_s over a new end-to-end latency metric — trigger arrival to outputs published, covering queue wait + alignment + buffer + step. Latency is the felt consequence of queue growth under any backpressure policy, so queue depth stays a diagnostic gauge rather than a contract. Ratio contracts evaluate only above ratio_min_samples: tiny windows are noise and zero traffic passes vacuously, so the absolute contracts remain the liveness floor. Multi-output ergonomics: the reserved `out` writer parameter on step — assignment emits, skipping a port stays quiet, unknown ports raise at the assignment line, last write wins. With `out`, stateless steps return None and stateful steps return just new_state (no more (state, dict) tuple). The dict return stays as the low-level equivalent; single-output bare return is unchanged. Outputs is exported for annotation and is the seam for typed Out bundles later — design notes record the agreed bundle/structural-wiring direction, the flat-syntax-as-anonymous-bundle compatibility rule, and the contracts-on-inputs-vs-outputs rationale. Docs: health/contracts section rewritten from the deployment perspective around a captured log transcript; contracts table in the reference; Navigator example switched to the writer form. 
--- dimos/memory2/health.py | 47 +++++++++- dimos/memory2/puremodule.md | 77 ++++++++++++++++ dimos/memory2/puremodule.py | 114 ++++++++++++++++++++--- dimos/memory2/test_health.py | 60 ++++++++++++ dimos/memory2/test_puremodule.py | 95 ++++++++++++++++++- docs/usage/pure_modules.md | 152 ++++++++++++++++++------------- 6 files changed, 469 insertions(+), 76 deletions(-) diff --git a/dimos/memory2/health.py b/dimos/memory2/health.py index 999caa577c..99ff9628f7 100644 --- a/dimos/memory2/health.py +++ b/dimos/memory2/health.py @@ -92,6 +92,7 @@ class _Window: missing: dict[str, int] = field(default_factory=dict) # drops by field blocked: int = 0 # ticks evicted while waiting for interpolation brackets outputs: dict[str, int] = field(default_factory=dict) + latencies_s: list[float] = field(default_factory=list) # trigger ts -> publish class HealthMonitor: @@ -108,6 +109,10 @@ def __init__( *, expected_hz: dict[str, float] | None = None, min_output_hz: float | None = None, + max_drop_ratio: float | None = None, + max_tick_latency_s: float | None = None, + max_missing_ratio: float = 0.5, + ratio_min_samples: int = 10, interval_s: float = 1.0, warmup_s: float = 5.0, unhealthy_log_every_s: float = 10.0, @@ -121,6 +126,13 @@ def __init__( self.name = name self.expected_hz = dict(expected_hz or {}) self.min_output_hz = min_output_hz + self.max_drop_ratio = max_drop_ratio + self.max_tick_latency_s = max_tick_latency_s + self.max_missing_ratio = max_missing_ratio + # Ratio contracts are vacuous on tiny windows (3 drops of 4 ticks = + # 75%) and at zero traffic — they only evaluate at this many samples. + # Absolute contracts (min_output_hz, expected_hz) remain the liveness floor. 
+ self.ratio_min_samples = ratio_min_samples self.interval_s = interval_s self.warmup_s = warmup_s self.unhealthy_log_every_s = unhealthy_log_every_s @@ -143,6 +155,7 @@ def __init__( self._win = _Window() self._total_inputs: dict[str, int] = {} self._step_ms: deque[float] = deque(maxlen=256) + self._latency_ms: deque[float] = deque(maxlen=256) self._ages: dict[str, deque[float]] = {} self._buffer_len: Callable[[], int] = lambda: 0 self._pending_len: Callable[[], int] = lambda: 0 @@ -180,12 +193,25 @@ def on_blocked(self, n: int) -> None: with self._lock: self._win.blocked += n - def on_step(self, duration_s: float, ages: dict[str, float], emitted: bool) -> None: + def on_step( + self, + duration_s: float, + ages: dict[str, float], + emitted: bool, + latency_s: float | None = None, + ) -> None: + """One step completed. ``latency_s`` is end-to-end tick latency — + trigger timestamp to outputs published — covering queue wait, + alignment wait, buffer wait, and the step itself.""" with self._lock: self._win.stepped += 1 if emitted: self._win.emitted += 1 self._step_ms.append(duration_s * 1000.0) + if latency_s is not None: + self._latency_ms.append(latency_s * 1000.0) + if len(self._win.latencies_s) < 4096: + self._win.latencies_s.append(latency_s) for name, age in ages.items(): self._ages.setdefault(name, deque(maxlen=256)).append(age) @@ -235,6 +261,8 @@ def _metrics(self, win: _Window, dt: float) -> dict[str, float]: "emitted_hz": win.emitted / dt, "step_p50_ms": _percentile(list(self._step_ms), 0.50), "step_p99_ms": _percentile(list(self._step_ms), 0.99), + "tick_latency_p50_ms": _percentile(list(self._latency_ms), 0.50), + "tick_latency_p99_ms": _percentile(list(self._latency_ms), 0.99), "buffer_len": float(self._buffer_len()), "pending_len": float(self._pending_len()), } @@ -256,8 +284,23 @@ def _violations(self, win: _Window, dt: float, metrics: dict[str, float]) -> lis emitted_hz = win.emitted / dt if emitted_hz < self.min_output_hz: v.append(f"output 
{emitted_hz:.1f} Hz < contract {self.min_output_hz:g} Hz") + if self.max_drop_ratio is not None and win.queued >= self.ratio_min_samples: + dropped = max(0, win.queued - win.stepped - self._buffer_len()) + ratio = dropped / win.queued + if ratio > self.max_drop_ratio: + v.append( + f"backpressure drop ratio {ratio:.0%} > contract {self.max_drop_ratio:.0%} " + f"(step not keeping up)" + ) + if self.max_tick_latency_s is not None and len(win.latencies_s) >= self.ratio_min_samples: + p99 = _percentile(win.latencies_s, 0.99) + if p99 > self.max_tick_latency_s: + v.append( + f"tick latency p99 {p99 * 1000:.0f} ms > contract " + f"{self.max_tick_latency_s * 1000:.0f} ms" + ) for name, n in win.missing.items(): - if win.resolved and n / win.resolved > 0.5: + if win.resolved >= self.ratio_min_samples and n / win.resolved > self.max_missing_ratio: v.append(f"input '{name}' missing on {n}/{win.resolved} ticks (stale or dead?)") return v diff --git a/dimos/memory2/puremodule.md b/dimos/memory2/puremodule.md index 4bdfe93ffb..06eeb626f1 100644 --- a/dimos/memory2/puremodule.md +++ b/dimos/memory2/puremodule.md @@ -59,6 +59,52 @@ declaration because they're algorithm truths; rates (`expected_hz`, `min_output_hz`) belong in deployment config because sim, replay, and the robot legitimately differ. +### Contracts on inputs vs outputs + +Both exist, with different roles: + +- **Output contracts are promises** — the module's SLO to its consumers + ("commands at ≥ 10 Hz from fresh data"). These are what paging should + key on: their violation means *I am failing whoever depends on me*. +- **Input expectations are assumptions** — dependencies on upstream the + module cannot fix, only attribute ("pose at 12 Hz, expected 50"). They + exist for the warmup check and for *blame*: when the output contract + breaks, input expectations turn "too slow" into "because pose is + starved", distinguishing slow-step / starved-trigger / dead-interpolate + causes. 
+ +In a module graph, B's input expectation on A's output duplicates A's +output contract — input expectations really earn their keep at the +*edges*, where the producer (a sensor driver) has no health of its own +and the first consumer hosts its contract. The graph-era resolution is +to attach rate contracts to *streams* (declared once, checked at the +producer when possible, at the first consumer otherwise); per-module +`expected_hz` is the pragmatic stand-in until then. + +### Absolute vs ratio vs latency contracts + +Absolute rates (`min_output_hz`, `expected_hz`) bake the deployment's +sensor rates into the contract; ratio contracts (`max_drop_ratio`, +`max_missing_ratio`) are scale-free — "the step keeps up, with headroom" +survives a camera swap unchanged. But ratios are vacuous at zero traffic +(a dead camera produces zero drops) and noisy on tiny windows, so they +only evaluate above `ratio_min_samples` and the absolute contracts remain +the liveness floor — both kinds, different jobs. + +Queue *depth* was considered and rejected as a contract: `KeepLast` depth +is 0/1 by construction, `Bounded(n)` at depth n just means drops (already +counted), and for `Unbounded` the fear isn't depth but *growth* — whose +felt consequence is latency. Hence `max_tick_latency_s` instead: p99 of +trigger-arrival → outputs-published, meaningful under every policy, +subsuming depth (which stays an exported gauge for diagnosis). + +Two known refinements deferred until needed: the health *state* should +arguably be driven by output contracts only (inputs below expectation +while outputs still meet contract is "at risk", not degraded — today +both trip `DEGRADED`); and `min_output_hz` should become per-port +(`{"cmd": 10}`) once real multi-output modules exist — partial emission +makes a single number wrong for deliberately sparse ports like alerts. 
+ ## Replay fidelity under drops (planned: record tick rows) With a dropping policy, a live run processes a *subsample* of triggers, @@ -100,6 +146,37 @@ deliberately not yet persisted. The plan: shard. Only possible because state is a declared value the runtime owns. +## Output declaration (agreed direction: writer now, bundles later) + +Single-output modules keep the flat root declaration and bare return — +that's the dominant case and it stays terse. Multi-output modules use a +per-tick **writer**: a reserved `out` parameter on `step`; assignment +emits (`out.cmd = ...`), skipping a port means staying quiet, unknown +ports raise at the assignment line, last write wins. With `out` declared, +stateless steps return `None` and stateful steps return just `new_state` +— dissolving the old `(state, dict)` tuple. The raw `{port: value}` dict +return remains accepted as the low-level equivalent. `out` joins +`ts`/`state` as reserved input names. The writer stays referentially +pure: fresh per tick, collected immediately — the return value passed +inside-out. + +The typed future is **bundles**: a nested `class Out(Bundle)` is both the +port declaration (ports synthesized from its annotations) and the +writer's static type (`out.cmd = 5` becomes a mypy error). Inputs get the +same treatment (`class In(Bundle)` with samplers as field defaults — +which is the original InputState idea), with mixed binding allowed: spread +inputs by name for small modules, take the whole bundle for big ones. The +key compatibility rule: **today's flat syntax is an anonymous bundle** — +flat declarations compile to implicit `In`/`Out` bundles at plan time, so +nothing migrates and structural wiring works on every existing module. +Bundles then subsume `dimos/spec/` Protocols as real, connectable, +type-checked interfaces (`bp.connect(cam.o, nav.i)` matched by name+type; +interface inheritance like `DepthCameraOut(CameraOut)`), which is the +blueprint-rethink track. 
Note: the class-level-`None` port attribute +dance in `Module.__init_subclass__` cites Dask actor proxies — dimos no +longer uses Dask, so that constraint is gone and the real serialization +surface for any port rework is `RemoteIn`/`RemoteOut` over LCM. + ## Multi-output offline shape (planned: run handle) Offline, multi-output modules yield `{port: value}` dict rows while live diff --git a/dimos/memory2/puremodule.py b/dimos/memory2/puremodule.py index 79fc87d4aa..ef9ce9a29d 100644 --- a/dimos/memory2/puremodule.py +++ b/dimos/memory2/puremodule.py @@ -40,13 +40,23 @@ def step(self, image: Image, pose: PoseStamped, imu: Imu | None) -> Twist: ``window()`` inputs take ``list[X]`` or ``list[Observation[X]]``; - ``X | None`` means a missing value is passed as ``None``; a missing value for a non-optional parameter *drops the tick*; -- reserved names: ``ts: float`` receives the tick time; declaring - ``state`` makes the module a Mealy machine — ``step(self, state, ...)`` - must return ``(new_state, output)`` and the initial state comes from - the ``initial_state`` attribute. +- reserved names: ``ts: float`` receives the tick time; ``out`` receives + a per-tick output writer; declaring ``state`` (first parameter) makes + the module a Mealy machine with the initial value from the + ``initial_state`` attribute. Outputs: with one ``Out`` port, return the value (or ``None`` to emit -nothing); with several, return ``{port_name: value}``. +nothing). With several, declare ``out`` and assign — set = emit, skip = +quiet, unknown ports raise at the assignment:: + + def step(self, pose: Pose, out: Outputs) -> None: + out.cmd_vel = drive(pose) + if blocked: + out.alerts = "obstacle" + +With ``out`` declared, stateless steps return ``None`` and stateful steps +return just ``new_state`` (no tuple). Returning ``{port_name: value}`` +instead of using the writer is also accepted. 
**Live** (``module.start()``): each ``In`` port feeds a memory2 stream in the module's store — a :class:`~dimos.memory2.store.null.NullStore` by @@ -120,7 +130,7 @@ def step(self, image: Image, pose: PoseStamped, imu: Imu | None) -> Twist: logger = setup_logger() -__all__ = ["PureModule", "interpolate", "latest", "tick", "window"] +__all__ = ["Outputs", "PureModule", "interpolate", "latest", "tick", "window"] _STOP = object() @@ -141,6 +151,36 @@ class _Plan: outs: dict[str, type] params: tuple[_Param, ...] stateful: bool + uses_out: bool # step declares the reserved `out` writer parameter + + +class Outputs: + """Per-tick output writer — assignment emits; skipping a port stays quiet. + + Handed to ``step`` as the reserved ``out`` parameter. Fresh per tick + and collected right after the call, so the step stays referentially + pure — this is the return value, passed inside-out. Unknown ports + raise at the assignment line; reassignment last-wins. + """ + + __slots__ = ("_allowed", "_values") + + def __init__(self, allowed: frozenset[str]) -> None: + object.__setattr__(self, "_allowed", allowed) + object.__setattr__(self, "_values", {}) + + def __setattr__(self, name: str, value: Any) -> None: + if name not in self._allowed: + raise AttributeError( + f"unknown output {name!r} — declared Out ports: {sorted(self._allowed)}" + ) + self._values[name] = value + + def __getattr__(self, name: str) -> Any: + try: + return self._values[name] + except KeyError: + raise AttributeError(name) from None def _unwrap_optional(ann: Any) -> tuple[Any, bool]: @@ -189,7 +229,26 @@ class PureModuleConfig(ModuleConfig): continuously (violation below 50% of expected).""" min_output_hz: float | None = None - """Contract: rate of ticks that emit at least one output.""" + """Contract: rate of ticks that emit at least one output. 
Absolute — + the liveness floor that ratio contracts can't provide.""" + + max_drop_ratio: float | None = None + """Contract: fraction of viable ticks skipped by backpressure. + Scale-free "the step keeps up" — independent of deployment rates.""" + + max_tick_latency_s: float | None = None + """Contract: p99 end-to-end latency, trigger arrival to outputs + published. Meaningful under every backpressure policy (under + ``Unbounded`` queue growth shows up here first).""" + + max_missing_ratio: float = Field(0.5, gt=0.0, le=1.0) + """Per-input staleness contract: fraction of resolved ticks where the + input was missing before it's flagged.""" + + ratio_min_samples: int = Field(10, ge=1) + """Ratio contracts only evaluate on windows with at least this many + samples — tiny windows make ratios noise, zero traffic makes them + vacuously pass.""" health_interval_s: float = 1.0 health_warmup_s: float = 5.0 @@ -260,8 +319,10 @@ def _build_plan(cls) -> _Plan: continue origin = get_origin(ann) if origin is In: - if name in ("ts", "state"): - raise TypeError(f"{cls.__name__}.{name}: 'ts' and 'state' are reserved names") + if name in ("ts", "state", "out"): + raise TypeError( + f"{cls.__name__}.{name}: 'ts', 'state' and 'out' are reserved names" + ) ins[name] = (get_args(ann) or (object,))[0] sampler = inspect.getattr_static(cls, name, None) if isinstance(sampler, Tick): @@ -292,6 +353,7 @@ def _build_plan(cls) -> _Plan: params: list[_Param] = [] stateful = False + uses_out = False names = [n for n in sig.parameters if n != "self"] for name in names: if name == "state": @@ -302,10 +364,13 @@ def _build_plan(cls) -> _Plan: if name == "ts": params.append(_Param("ts", "ts", optional=False, wants_obs=False)) continue + if name == "out": + uses_out = True + continue if name not in ins: raise TypeError( f"{cls.__name__}.step parameter {name!r} doesn't match an input — " - f"declared inputs: {sorted(ins)} (reserved: ts, state)" + f"declared inputs: {sorted(ins)} (reserved: ts, state, 
out)" ) ann, optional = _unwrap_optional(step_hints.get(name, Any)) if isinstance(samplers.get(name), Window): @@ -323,6 +388,7 @@ def _build_plan(cls) -> _Plan: outs=outs, params=tuple(params), stateful=stateful, + uses_out=uses_out, ) # -- binding & dispatch ----------------------------------------------------- @@ -365,6 +431,20 @@ def _invoke( self, plan: _Plan, state: Any, kwargs: dict[str, Any] ) -> tuple[Any, dict[str, Any]]: """Run step; returns (new_state, {out_name: value}).""" + if plan.uses_out: + writer = Outputs(frozenset(plan.outs)) + if plan.stateful: + # With the writer, the return value is just the new state. + state = self.step(state, out=writer, **kwargs) + else: + ret = self.step(out=writer, **kwargs) + if ret is not None: + raise TypeError( + f"{type(self).__name__}.step declares 'out' so outputs are " + f"set on it — return None, got {type(ret).__name__}" + ) + return state, dict(writer._values) + if plan.stateful: result = self.step(state, **kwargs) if not (isinstance(result, tuple) and len(result) == 2): @@ -515,6 +595,10 @@ def _sink(h: Health) -> None: str(self), expected_hz=cfg.expected_hz, min_output_hz=cfg.min_output_hz, + max_drop_ratio=cfg.max_drop_ratio, + max_tick_latency_s=cfg.max_tick_latency_s, + max_missing_ratio=cfg.max_missing_ratio, + ratio_min_samples=cfg.ratio_min_samples, interval_s=cfg.health_interval_s, warmup_s=cfg.health_warmup_s, unhealthy_log_every_s=cfg.unhealthy_log_every_s, @@ -589,7 +673,7 @@ def _step_loop() -> None: except Exception: logger.exception("%s.step failed for tick ts=%s", self, tobs.ts) continue - monitor.on_step(time.perf_counter() - t0, ages, emitted=bool(outs)) + duration = time.perf_counter() - t0 for out_name, value in outs.items(): monitor.on_output(out_name) try: @@ -597,6 +681,14 @@ def _step_loop() -> None: self._out_streams[out_name].append(value, ts=tobs.ts) except Exception: logger.exception("%s: publishing %s failed", self, out_name) + # Live observation ts is arrival wall-clock, so this 
spans the + # whole path: queue wait + alignment + buffer wait + step + publish. + monitor.on_step( + duration, + ages, + emitted=bool(outs), + latency_s=max(0.0, time.time() - tobs.ts), + ) monitor.maybe_report() self._threads = [ diff --git a/dimos/memory2/test_health.py b/dimos/memory2/test_health.py index 057bfe59ee..e1c97ce4c6 100644 --- a/dimos/memory2/test_health.py +++ b/dimos/memory2/test_health.py @@ -210,6 +210,66 @@ def test_blocked_drops_metric() -> None: assert h.metrics["drops_blocked_hz"] == 7.0 +# -- ratio & latency contracts --------------------------------------------------- + + +def test_drop_ratio_contract() -> None: + clock = Clock() + m, _ = make(clock, max_drop_ratio=0.5) + for _ in range(20): + m.on_resolved() + m.on_queued() + for _ in range(5): # 15 of 20 skipped -> 75% > 50% contract + m.on_step(0.01, {}, emitted=True) + h = report(m, clock) + assert h.state == DEGRADED + assert any("drop ratio 75% > contract 50%" in v for v in h.violations) + + for _ in range(20): # keeping up again + m.on_resolved() + m.on_queued() + m.on_step(0.01, {}, emitted=True) + assert report(m, clock).state == OK + + +def test_ratio_contracts_need_min_samples() -> None: + """Tiny windows and zero traffic must not trip ratio contracts.""" + clock = Clock() + m, _ = make(clock, max_drop_ratio=0.1, max_tick_latency_s=0.1) + + assert report(m, clock).state == OK # zero traffic: vacuous, stays OK + + for _ in range(4): # 3 of 4 dropped = 75%, but below ratio_min_samples=10 + m.on_queued() + m.on_step(0.01, {}, emitted=True, latency_s=5.0) # 1 latency sample, ditto + assert report(m, clock).state == OK + + +def test_tick_latency_contract_and_metric() -> None: + clock = Clock() + m, _ = make(clock, max_tick_latency_s=0.5) + for _ in range(12): + m.on_step(0.01, {}, emitted=True, latency_s=1.2) + h = report(m, clock) + assert h.state == DEGRADED + assert any("tick latency p99 1200 ms > contract 500 ms" in v for v in h.violations) + assert 
h.metrics["tick_latency_p99_ms"] == 1200.0 + + for _ in range(12): + m.on_step(0.01, {}, emitted=True, latency_s=0.02) + assert report(m, clock).state == OK + + +def test_missing_ratio_is_configurable() -> None: + clock = Clock() + m, _ = make(clock, max_missing_ratio=0.9) + for _ in range(10): + m.on_resolved() + for _ in range(8): # 80% missing, under the 90% contract + m.on_missing(["imu"]) + assert report(m, clock).state == OK + + # -- contract messages (log lines) --------------------------------------------------- diff --git a/dimos/memory2/test_puremodule.py b/dimos/memory2/test_puremodule.py index 3d63a2b80e..9259e63503 100644 --- a/dimos/memory2/test_puremodule.py +++ b/dimos/memory2/test_puremodule.py @@ -24,7 +24,7 @@ import pytest from dimos.core.stream import In, Out -from dimos.memory2.puremodule import PureModule, interpolate, latest, tick, window +from dimos.memory2.puremodule import Outputs, PureModule, interpolate, latest, tick, window from dimos.memory2.store.memory import MemoryStore from dimos.memory2.tick import Interpolate, Latest, TickMachine, Window from dimos.memory2.type.observation import Observation @@ -246,6 +246,98 @@ def test_multi_out_returns_dict_rows(store: MemoryStore) -> None: ] +class WriterNav(PureModule): + camera: In[int] = tick() + + cmd: Out[str] + alerts: Out[str] + + def step(self, camera: int, out: Outputs) -> None: + out.cmd = f"go {camera}" + if camera % 2 == 0: + out.alerts = f"even {camera}" + + +def test_out_writer_partial_emission(store: MemoryStore) -> None: + camera = fill(store.stream("camera", int), [(0.1, 1), (0.2, 2)]) + + out = WriterNav.over(camera=camera).to_list() + assert [o.data for o in out] == [ + {"cmd": "go 1"}, + {"cmd": "go 2", "alerts": "even 2"}, + ] + + +def test_out_writer_rejects_unknown_port(store: MemoryStore) -> None: + class Typo(PureModule): + camera: In[int] = tick() + cmd: Out[str] + + def step(self, camera: int, out: Outputs) -> None: + out.cmdd = "oops" + + camera = 
fill(store.stream("camera", int), [(0.1, 1)]) + with pytest.raises(AttributeError, match=r"unknown output 'cmdd'.*\['cmd'\]"): + Typo.over(camera=camera).to_list() + + +def test_out_writer_stateful_returns_bare_state(store: MemoryStore) -> None: + class Counter(PureModule): + camera: In[int] = tick() + count: Out[int] + + initial_state = 0 + + def step(self, state: int, camera: int, out: Outputs) -> int: + out.count = state + return state + 1 # no tuple — the writer carries the outputs + + camera = fill(store.stream("camera", int), [(i / 10, i) for i in range(4)]) + out = Counter.over(camera=camera).to_list() + assert [o.data for o in out] == [0, 1, 2, 3] + + +def test_out_writer_forbids_returning_values(store: MemoryStore) -> None: + class Confused(PureModule): + camera: In[int] = tick() + cmd: Out[str] + + def step(self, camera: int, out: Outputs) -> str: + out.cmd = "go" + return "also go" + + camera = fill(store.stream("camera", int), [(0.1, 1)]) + with pytest.raises(TypeError, match="declares 'out'"): + Confused.over(camera=camera).to_list() + + +def test_out_writer_last_write_wins(store: MemoryStore) -> None: + class Rewrites(PureModule): + camera: In[int] = tick() + cmd: Out[str] + + def step(self, camera: int, out: Outputs) -> None: + out.cmd = "draft" + out.cmd = "final" + + camera = fill(store.stream("camera", int), [(0.1, 1)]) + (o,) = Rewrites.over(camera=camera).to_list() + assert o.data == "final" + + +def test_input_named_out_is_an_error() -> None: + class BadPort(PureModule): + camera: In[int] = tick() + out: In[float] + cmd: Out[int] + + def step(self, camera: int) -> int: + return camera + + with pytest.raises(TypeError, match="reserved names"): + BadPort._plan() + + class WantsObs(PureModule): camera: In[int] = tick() pose: In[float] = latest() @@ -556,6 +648,7 @@ def step(self, frame: int) -> int: assert _await(lambda: len(outs) == 6) assert outs == [1, 2, 3, 4, 5, 6] # every tick, in order assert len(module._tick_buffer) == 0 # drained, not 
accumulating + assert _await(lambda: len(module.health_monitor._latency_ms) == 6) # e2e latency measured finally: gates.unblock() unsub() diff --git a/docs/usage/pure_modules.md b/docs/usage/pure_modules.md index 05dd1c58d7..30884d357f 100644 --- a/docs/usage/pure_modules.md +++ b/docs/usage/pure_modules.md @@ -236,22 +236,24 @@ time-rewind mechanical. ## Multiple outputs -Declare several `Out` ports and return a dict keyed by port name. Emitting -a **subset** is allowed — each port fires independently, so a module can -publish a command every tick and an alert only sometimes: +Declare several `Out` ports and ask for the **output writer** — the +reserved `out` parameter. Assignment emits; skipping a port keeps it +quiet that tick, so a module can publish a command every tick and an +alert only sometimes: ```python session=pure ansi=false +from dimos.memory2.puremodule import Outputs + class Navigator(PureModule): pose: In[Pose] = tick() cmd: Out[str] alerts: Out[str] - def step(self, pose: Pose, ts: float) -> dict: - outs = {"cmd": f"forward x={pose.position.x:.2f}"} + def step(self, pose: Pose, ts: float, out: Outputs) -> None: + out.cmd = f"forward x={pose.position.x:.2f}" if pose.position.x > 1.8: - outs["alerts"] = f"approaching boundary at t={ts:.2f}" - return outs + out.alerts = f"approaching boundary at t={ts:.2f}" rows = Navigator.over(pose=pose).to_list() fired = [r for r in rows if "alerts" in r.data] @@ -263,10 +265,13 @@ print(f"{len(rows)} ticks; alerts on {len(fired)}; last: {rows[-1].data}") ``` The rules: one `Out` port → return the bare value (`None` emits nothing); -several → return `{port: value}` (missing keys stay quiet, unknown keys -raise). Deployed live, each entry publishes to its own port. 
Offline, -`over()` yields one observation per tick whose data is the dict — slice a -single output back out with a map: +several → assign on `out` (an undeclared port raises at that line; +reassignment last-wins; with `out` declared, stateless steps return +`None` and stateful steps return just the new state — no tuple). +Returning a `{port: value}` dict instead of the writer is also accepted. +Deployed live, each emission publishes to its own port. Offline, `over()` +yields one observation per tick whose data is the `{port: value}` dict — +slice a single output back out with a map: ```python session=pure ansi=false boundary_alerts = Navigator.over(pose=pose) \ @@ -345,47 +350,52 @@ Two deployment choices matter: ## Contracts, not log spam -Health is judged against declared contracts — input rates, output rate — -and reported as state transitions, not per-drop warnings. You never build -a monitor yourself: when a module starts, it constructs one from its own -config (`min_output_hz`, `expected_hz`, ...) and feeds it from the tick -loop; it lives at `module.health_monitor` and its snapshots land in the -`_health` stream. But since it's plain bookkeeping with an injectable -clock, we can hand-feed a detached one right here and watch the exact -messages a deployed `Follower` would produce — the first argument is just -the module name stamped on them: +Your module contains **zero health code** — health is judged against +contracts you declare at deployment, and reported as state transitions, +never per-drop warnings. You already wrote one contract without noticing: +`latest(max_age=0.1)` *is* the statement "data older than 100 ms is +unacceptable". 
The other contracts are rates, and they're two numbers in +the module config — deployment-side, because the robot, sim, and replay +legitimately differ: -```python session=pure ansi=false -from dimos.memory2.health import HealthMonitor - -# what Follower's start() builds internally, driven by hand: -t = 0.0 -monitor = HealthMonitor("follower", min_output_hz=10.0, warmup_s=0.0, - interval_s=1.0, clock=lambda: t) - -for _ in range(3): # a bad second: step emitted only 3 outputs vs the 10 Hz contract - monitor.on_step(duration_s=0.12, ages={}, emitted=True) -t += 1.0 -health = monitor.maybe_report() -print(health.state, "-", health.violations[0]) - -for _ in range(15): # a good second - monitor.on_step(duration_s=0.05, ages={}, emitted=True) -t += 1.0 -print(monitor.maybe_report().state) +```python skip +module = Follower( + expected_hz={"frame": 30}, # "the camera should arrive at 30 Hz" + min_output_hz=10.0, # "I must emit commands at >= 10 Hz" +) +module.start() ``` -```results -DEGRADED - output 3.0 Hz < contract 10 Hz -OK +From here everything is observed, not coded. Suppose the camera delivers +5 Hz instead of 30, then recovers — this is the module's complete log +output (captured from a real run): + +``` +--- camera misbehaving: 5 Hz instead of 30 --- +[inf] Follower warmup: frame 5.7 Hz (expected 30 — LOW) +[war] Follower DEGRADED: input 'frame' at 5.7 Hz, expected 30 Hz; output 4.7 Hz < contract 10 Hz +[war] Follower still DEGRADED (2s): input 'frame' at 5.0 Hz, expected 30 Hz; ... +--- camera recovers to 30 Hz --- +[inf] Follower OK: recovered after 4s ``` -Deployed, those same snapshots append to a `_health` stream in the module -store every second (drop rates by reason, step p50/p99, input staleness), -transitions log once with the violated contract, and a recording captures -the health stream *next to the data it explains* — so a post-incident -notebook can plot the drop ratio against the very frames that were -dropped. 
+The interaction model: one **warmup** line shortly after start compares +every declared rate to reality (this alone catches miswired or +misconfigured sensors); one WARN on entering `DEGRADED`/`STALLED` naming +exactly the violated contracts; a throttled reminder while it persists; +one INFO with the outage duration on recovery. And the part that makes it +livable: a healthy module skipping two thirds of its frames under +`KeepLast` backpressure logs **nothing** — expected drops are counters, +not warnings. + +When logs aren't enough, the same information is queryable: +`module.health_monitor.state` gives the current `OK`/`DEGRADED`/`STALLED` +in process, and a `_health` stream in the module store receives an +aggregated metrics snapshot every second (drop rates by reason, step +p50/p99, input staleness, observed Hz) — subscribe to it live, or deploy +with a `SqliteStore` and the health history is recorded *next to the data +it explains*, so a post-incident notebook can plot the drop ratio against +the very frames that were dropped. --- @@ -440,9 +450,12 @@ inputs add one sample period of latency to each tick. - **One `Out` port** — `step` returns the value; returning `None` emits nothing, so ticks double as filters. -- **Several `Out` ports** — return `{port_name: value}`. A *partial* dict - is allowed (omitted ports stay quiet that tick); unknown keys raise - `TypeError`. +- **Several `Out` ports** — declare the reserved `out` parameter and + assign (`out.cmd = ...`): set = emit, skip = quiet, undeclared ports + raise at the assignment line, reassignment last-wins. With `out`, + stateless steps return `None`; stateful steps return just the new + state. Returning a partial `{port_name: value}` dict is the accepted + low-level equivalent (unknown keys raise `TypeError`). - **No `Out` ports** — the return value is ignored. 
Live, every dict entry publishes to its own port (and is appended to that @@ -485,18 +498,33 @@ buffers by pruning, and ticks waiting for interpolation brackets by can't accumulate ticks forever (evictions count as `drops_blocked`). Health: counters always (ticks resolved/stepped, drops by reason, step -p50/p99, per-input Hz, ages of consumed `latest()` values, output rates); -a `_health` stream snapshot every `health_interval_s`; one warmup line -comparing observed input rates to `expected_hz`; transition-logged -`DEGRADED`/`STALLED` with the violated contracts, throttled reminders -every `unhealthy_log_every_s`, recovery logged with duration. Stalls must -persist `stall_after_s` and distinguish "ticks queued but none stepped -(step stuck?)" from "inputs flowing but no ticks resolving (interpolate -input dead?)". Contracts split deliberately: semantic tolerances live in -the declaration (`max_age`, `tolerance`); *rates* live in deployment -config (`expected_hz`, `min_output_hz`) because sim, replay, and the -robot differ. The real SLO is output freshness and rate — alert on the -contract, read drop counters to diagnose why. +p50/p99, end-to-end tick latency p50/p99, per-input Hz, ages of consumed +`latest()` values, output rates); a `_health` stream snapshot every +`health_interval_s`; one warmup line comparing observed input rates to +`expected_hz`; transition-logged `DEGRADED`/`STALLED` with the violated +contracts, throttled reminders every `unhealthy_log_every_s`, recovery +logged with duration. Stalls must persist `stall_after_s` and distinguish +"ticks queued but none stepped (step stuck?)" from "inputs flowing but no +ticks resolving (interpolate input dead?)". 
+ +The contracts: + +| Config | Contract | Kind | +|---|---|---| +| `expected_hz={"pose": 50}` | input arrives at its declared rate | absolute (liveness) | +| `min_output_hz=10` | ticks emit outputs at this rate | absolute (liveness) | +| `max_drop_ratio=0.8` | step keeps up: ≤ this fraction of viable ticks skipped by backpressure | ratio (scale-free) | +| `max_missing_ratio=0.5` | per input: ≤ this fraction of ticks with the input missing | ratio (scale-free) | +| `max_tick_latency_s=0.2` | p99 trigger-arrival → outputs-published; covers queue growth under any policy | latency | + +Ratio and latency contracts only evaluate on windows with at least +`ratio_min_samples` samples — tiny windows make ratios noise, and at zero +traffic they'd pass vacuously, which is why the absolute contracts remain +the liveness floor. Contracts split deliberately: semantic tolerances +live in the declaration (`max_age`, `tolerance`); rates and ratios live +in deployment config because sim, replay, and the robot differ. The real +SLO is output freshness and rate — alert on the contract, read drop +counters to diagnose why. ## Where next From df818c42ed07328c310182b43a8c05877d1b5500 Mon Sep 17 00:00:00 2001 From: Ivan Nikolic Date: Fri, 12 Jun 2026 17:38:15 +0800 Subject: [PATCH 08/10] docs: scale-free contract demo, lazy projection note, I/O bundle design MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Usage doc gains the second captured transcript — a heavy step that keeps the absolute 10 Hz contract while drop-ratio and tick-latency catch it — plus ratio/latency examples in the deployment block and recompute semantics for projected multi-output streams. Design notes: the agreed nested In/Out bundle design (annotation injection, nested-XOR-flat rule, binding by annotation, tick-row synergy, stated trade-offs). 
--- dimos/memory2/puremodule.md | 72 ++++++++++++++++++++++++++++--------- docs/usage/pure_modules.md | 22 ++++++++++-- 2 files changed, 76 insertions(+), 18 deletions(-) diff --git a/dimos/memory2/puremodule.md b/dimos/memory2/puremodule.md index 06eeb626f1..66a2b95ca5 100644 --- a/dimos/memory2/puremodule.md +++ b/dimos/memory2/puremodule.md @@ -160,22 +160,62 @@ return remains accepted as the low-level equivalent. `out` joins pure: fresh per tick, collected immediately — the return value passed inside-out. -The typed future is **bundles**: a nested `class Out(Bundle)` is both the -port declaration (ports synthesized from its annotations) and the -writer's static type (`out.cmd = 5` becomes a mypy error). Inputs get the -same treatment (`class In(Bundle)` with samplers as field defaults — -which is the original InputState idea), with mixed binding allowed: spread -inputs by name for small modules, take the whole bundle for big ones. The -key compatibility rule: **today's flat syntax is an anonymous bundle** — -flat declarations compile to implicit `In`/`Out` bundles at plan time, so -nothing migrates and structural wiring works on every existing module. -Bundles then subsume `dimos/spec/` Protocols as real, connectable, -type-checked interfaces (`bp.connect(cam.o, nav.i)` matched by name+type; -interface inheritance like `DepthCameraOut(CameraOut)`), which is the -blueprint-rethink track. Note: the class-level-`None` port attribute -dance in `Module.__init_subclass__` cites Dask actor proxies — dimos no -longer uses Dask, so that constraint is gone and the real serialization -surface for any port rework is `RemoteIn`/`RemoteOut` over LCM. 
+The typed next step is **nested I/O bundles** (agreed design, not built): + +```python skip +class Navigator(PureModule): + class In: + pose: PoseStamped = tick() + image: Image | None = latest() # optionality lives on the field now + imu: list[Imu] = window(0.1) + + class Out: + cmd_vel: Twist + alerts: str + + def step(self, i: In, out: Out) -> None: + out.cmd_vel = drive(i.pose, i.imu) + if i.image is not None and blocked(i.image): + out.alerts = "obstacle" +``` + +One declaration per port doing three jobs: port synthesis, +static typing, and the plan's I/O table. The mechanics and rules: + +- **No core changes**: `PureModule.__init_subclass__` injects + `name: In[T]` / `name: Out[T]` entries into `cls.__annotations__` + *before* chaining to `Module.__init_subclass__`, so the existing port + machinery creates real ports and blueprint/LCM wiring is unchanged at + runtime. +- **Static typing lands where the code is**: `out: Out` makes every + assignment mypy-checked against the nested class; `i: In` types every + read. The runtime objects stay the validated writer / a per-tick row + object — the annotations do the static work. +- **Nested XOR flat, per direction**: a nested class named `In`/`Out` + shadows the imported port types for any flat declaration written below + it — so a module uses one form per direction, validated at plan time. + Flat stays canonical for simple modules. +- **Binding detected by annotation**: a single step param annotated with + the nested `In` class receives the whole bundle (with `i.ts` reserved); + otherwise params spread by name as today — both supported, fields are + names either way. +- **In-bundle unifications**: optionality, `Observation[X]` access, and + window list types all move onto the field annotation — today they're + split between the port declaration and the step signature. 
+- **Tick-row synergy**: the `In` bundle instance *is* the resolved tick + row — recording tick rows (replay fidelity) becomes serializing these, + and a recorded tick replays as a direct `step(i, out)` call. +- **Trade-off, stated**: synthesized ports aren't statically visible to + *external* code (`module.cmd_vel` is runtime-only for mypy); modules + whose ports are hand-wired in typed code keep the flat form. +- Declaration reuse via inheritance (`class In(CameraFeed)`) works + without touching wiring; full structural connect + (`bp.connect(cam.o, nav.i)`) and subsuming `dimos/spec/` Protocols + remain the separate blueprint-rethink track. Note: the + class-level-`None` port attribute dance in `Module.__init_subclass__` + cites Dask actor proxies — dimos no longer uses Dask, so that + constraint is gone and the real serialization surface for any port + rework is `RemoteIn`/`RemoteOut` over LCM. ## Multi-output offline shape (planned: run handle) diff --git a/docs/usage/pure_modules.md b/docs/usage/pure_modules.md index 30884d357f..29b59cadb5 100644 --- a/docs/usage/pure_modules.md +++ b/docs/usage/pure_modules.md @@ -284,8 +284,11 @@ print(boundary_alerts.last().data) approaching boundary at t=2.00 ``` -(That dict-row shape is a known asymmetry with the live per-port streams — -a run handle with one stream per output is on the design table.) +Each projected stream is lazy, so iterating two outputs separately runs +the module twice — the same recompute-on-reiterate semantics as any +memory2 transform stream. To run once and consume many times, use the +stream API's own materialization: `.save()` the row stream into a store, +then project from the stored rows. 
## Modules compose like streams @@ -362,6 +365,8 @@ legitimately differ: module = Follower( expected_hz={"frame": 30}, # "the camera should arrive at 30 Hz" min_output_hz=10.0, # "I must emit commands at >= 10 Hz" + max_drop_ratio=0.5, # "skip at most half my frames" (scale-free) + max_tick_latency_s=0.1, # "commands come from <= 100ms-old frames" ) module.start() ``` @@ -379,6 +384,19 @@ output (captured from a real run): [inf] Follower OK: recovered after 4s ``` +A different failure: the step gets heavy. It still clears the absolute +10 Hz output contract — that's the trap with absolute rates — but the +scale-free contracts catch it (also a real captured run): + +``` +--- healthy: 30 Hz camera, fast step --- +[inf] Follower warmup: frame 29.2 Hz (expected 30) +--- step gets slow (80 ms): keeps the 10 Hz output contract, but... --- +[war] Follower DEGRADED: backpressure drop ratio 53% > contract 50% (step not keeping up); tick latency p99 112 ms > contract 100 ms +--- step recovers --- +[inf] Follower OK: recovered after 4s +``` + The interaction model: one **warmup** line shortly after start compares every declared rate to reality (this alone catches miswired or misconfigured sensors); one WARN on entering `DEGRADED`/`STALLED` naming From d5f9e8aa4e441f9cfd57a19f7be96ed2b7c968dc Mon Sep 17 00:00:00 2001 From: Ivan Nikolic Date: Sun, 14 Jun 2026 14:15:25 +0800 Subject: [PATCH 09/10] pure modules iteration --- dimos/memory2/health.py | 80 ++++++++++--- dimos/memory2/puremodule.md | 38 ++++-- dimos/memory2/puremodule.py | 197 +++++++++++++++++++++---------- dimos/memory2/test_health.py | 62 +++++++++- dimos/memory2/test_puremodule.py | 116 +++++++++++++++++- dimos/memory2/tick.py | 81 ++++++++++--- docs/usage/pure_modules.md | 123 ++++++++++++++++--- 7 files changed, 570 insertions(+), 127 deletions(-) diff --git a/dimos/memory2/health.py b/dimos/memory2/health.py index 99ff9628f7..b86898c037 100644 --- a/dimos/memory2/health.py +++ b/dimos/memory2/health.py @@ 
-50,6 +50,8 @@ import threading from typing import TYPE_CHECKING, Any +from pydantic import BaseModel, Field + from dimos.utils.logging_config import setup_logger if TYPE_CHECKING: @@ -62,6 +64,37 @@ STALLED = "STALLED" +class ModuleContracts(BaseModel): + """Module-wide contracts: the tick loop's promises, not any one port's.""" + + min_output_hz: float | None = Field(default=None, gt=0.0) + """Absolute liveness floor: rate of ticks that emit at least one output.""" + + max_drop_ratio: float | None = Field(default=None, gt=0.0, le=1.0) + """Scale-free "the step keeps up": fraction of viable ticks skipped by backpressure.""" + + max_tick_latency_s: float | None = Field(default=None, gt=0.0) + """p99 end-to-end latency, trigger arrival to outputs published.""" + + max_missing_ratio: float = Field(default=0.5, gt=0.0, le=1.0) + """Global default per-input missing threshold (per-input contracts override).""" + + +class HealthConfig(BaseModel): + """Monitor mechanics — how health is measured and reported, not what it promises.""" + + interval_s: float = Field(default=1.0, gt=0.0) + warmup_s: float = Field(default=5.0, ge=0.0) + unhealthy_log_every_s: float = Field(default=10.0, gt=0.0) + stall_after_s: float = Field(default=5.0, gt=0.0) + ratio_min_samples: int = Field(default=10, ge=1) + """Ratio/latency contracts only evaluate on windows with at least this + many samples — tiny windows are noise, zero traffic passes vacuously.""" + + stream: bool = True + """Append snapshots to a ``_health`` stream in the module store.""" + + def _percentile(values: list[float], q: float) -> float: if not values: return 0.0 @@ -107,41 +140,45 @@ def __init__( self, name: str, *, + contracts: ModuleContracts | None = None, + health: HealthConfig | None = None, expected_hz: dict[str, float] | None = None, - min_output_hz: float | None = None, - max_drop_ratio: float | None = None, - max_tick_latency_s: float | None = None, - max_missing_ratio: float = 0.5, - ratio_min_samples: int = 
10, - interval_s: float = 1.0, - warmup_s: float = 5.0, - unhealthy_log_every_s: float = 10.0, - stall_after_s: float = 5.0, + out_min_hz: dict[str, float] | None = None, + missing_ratio_by_input: dict[str, float] | None = None, rate_tolerance: float = 0.5, sink: Callable[[Health], None] | None = None, clock: Callable[[], float] | None = None, ) -> None: + """``contracts`` = module-wide promises; ``health`` = reporting + mechanics; the three dicts are the *resolved* per-port contracts + (class declarations already merged with deployment overrides by + the caller).""" import time + contracts = contracts if contracts is not None else ModuleContracts() + cfg = health if health is not None else HealthConfig() + self.name = name self.expected_hz = dict(expected_hz or {}) - self.min_output_hz = min_output_hz - self.max_drop_ratio = max_drop_ratio - self.max_tick_latency_s = max_tick_latency_s - self.max_missing_ratio = max_missing_ratio + self.min_output_hz = contracts.min_output_hz + self.out_min_hz = dict(out_min_hz or {}) + self.max_drop_ratio = contracts.max_drop_ratio + self.max_tick_latency_s = contracts.max_tick_latency_s + self.max_missing_ratio = contracts.max_missing_ratio + self.missing_ratio_by_input = dict(missing_ratio_by_input or {}) # Ratio contracts are vacuous on tiny windows (3 drops of 4 ticks = # 75%) and at zero traffic — they only evaluate at this many samples. # Absolute contracts (min_output_hz, expected_hz) remain the liveness floor. 
- self.ratio_min_samples = ratio_min_samples - self.interval_s = interval_s - self.warmup_s = warmup_s - self.unhealthy_log_every_s = unhealthy_log_every_s + self.ratio_min_samples = cfg.ratio_min_samples + self.interval_s = cfg.interval_s + self.warmup_s = cfg.warmup_s + self.unhealthy_log_every_s = cfg.unhealthy_log_every_s self.rate_tolerance = rate_tolerance self.sink = sink self.clock = clock if clock is not None else time.time # A stall must persist this many reporting windows before we call it — # a single zero-step window is normal for a step slower than the interval. - self._stall_windows = max(1, round(stall_after_s / interval_s)) + self._stall_windows = max(1, round(cfg.stall_after_s / cfg.interval_s)) self._zero_step_windows = 0 self._zero_resolve_windows = 0 @@ -284,6 +321,10 @@ def _violations(self, win: _Window, dt: float, metrics: dict[str, float]) -> lis emitted_hz = win.emitted / dt if emitted_hz < self.min_output_hz: v.append(f"output {emitted_hz:.1f} Hz < contract {self.min_output_hz:g} Hz") + for name, min_hz in self.out_min_hz.items(): + port_hz = win.outputs.get(name, 0) / dt + if port_hz < min_hz: + v.append(f"output '{name}' at {port_hz:.1f} Hz < contract {min_hz:g} Hz") if self.max_drop_ratio is not None and win.queued >= self.ratio_min_samples: dropped = max(0, win.queued - win.stepped - self._buffer_len()) ratio = dropped / win.queued @@ -300,7 +341,8 @@ def _violations(self, win: _Window, dt: float, metrics: dict[str, float]) -> lis f"{self.max_tick_latency_s * 1000:.0f} ms" ) for name, n in win.missing.items(): - if win.resolved >= self.ratio_min_samples and n / win.resolved > self.max_missing_ratio: + threshold = self.missing_ratio_by_input.get(name, self.max_missing_ratio) + if win.resolved >= self.ratio_min_samples and n / win.resolved > threshold: v.append(f"input '{name}' missing on {n}/{win.resolved} ticks (stale or dead?)") return v diff --git a/dimos/memory2/puremodule.md b/dimos/memory2/puremodule.md index 
66a2b95ca5..6df369b7e6 100644 --- a/dimos/memory2/puremodule.md +++ b/dimos/memory2/puremodule.md @@ -55,8 +55,9 @@ recordings capture health next to the frames it explains), log on state transitions only, alert on declared contracts. The real SLO is output freshness and rate; drop counters are diagnosis. Contracts split deliberately: semantic tolerances (`max_age`, `tolerance`) belong in the -declaration because they're algorithm truths; rates (`expected_hz`, -`min_output_hz`) belong in deployment config because sim, replay, and +declaration because they're algorithm truths; rate *overrides* +(`inputs={...}`, `outputs={...}`, `contracts={...}`) belong in deployment +config because sim, replay, and the robot legitimately differ. ### Contracts on inputs vs outputs @@ -78,12 +79,12 @@ output contract — input expectations really earn their keep at the *edges*, where the producer (a sensor driver) has no health of its own and the first consumer hosts its contract. The graph-era resolution is to attach rate contracts to *streams* (declared once, checked at the -producer when possible, at the first consumer otherwise); per-module -`expected_hz` is the pragmatic stand-in until then. +producer when possible, at the first consumer otherwise); +per-module input expectations are the pragmatic stand-in until then. ### Absolute vs ratio vs latency contracts -Absolute rates (`min_output_hz`, `expected_hz`) bake the deployment's +Absolute rates (`contracts.min_output_hz`, per-input `expect_hz`) bake the deployment's sensor rates into the contract; ratio contracts (`max_drop_ratio`, `max_missing_ratio`) are scale-free — "the step keeps up, with headroom" survives a camera swap unchanged. But ratios are vacuous at zero traffic @@ -98,12 +99,27 @@ felt consequence is latency. Hence `max_tick_latency_s` instead: p99 of trigger-arrival → outputs-published, meaningful under every policy, subsuming depth (which stays an exported gauge for diagnosis). 
-Two known refinements deferred until needed: the health *state* should -arguably be driven by output contracts only (inputs below expectation -while outputs still meet contract is "at risk", not degraded — today -both trip `DEGRADED`); and `min_output_hz` should become per-port -(`{"cmd": 10}`) once real multi-output modules exist — partial emission -makes a single number wrong for deliberately sparse ports like alerts. +Per-port contracts are **class declarations only**: samplers take +`expect_hz`/`max_missing_ratio`, `Out` ports take `contract(min_hz=...)` +as their default value — the rates the module was built for, declared +where the port is. Per-port *deployment* overrides (an `inputs=`/ +`outputs=` config layer with shorthand coercion) were built and then +removed: three ways to declare one thing, a precedence rule, and a +typing wart, for a feature with no consumer — YAGNI. Re-adding is cheap +if a real deployment needs it (the monitor consumes resolved dicts +either way). Per-port `contract(min_hz=...)` resolves the former +deferred item — `contracts.min_output_hz` counted ticks emitting +*anything*, which deliberately sparse ports made meaningless. + +Deployment config keeps what genuinely varies per deployment, structured +not flat: `contracts` (module-wide promises) and `health` (reporting +mechanics) — sub-models living in `health.py`, consumed by +`HealthMonitor` directly, one source of truth. + +One refinement still deferred: the health *state* should arguably be +driven by output contracts only (inputs below expectation while outputs +still meet contract is "at risk", not degraded — today both trip +`DEGRADED`). 
## Replay fidelity under drops (planned: record tick rows) diff --git a/dimos/memory2/puremodule.py b/dimos/memory2/puremodule.py index ef9ce9a29d..e23403f809 100644 --- a/dimos/memory2/puremodule.py +++ b/dimos/memory2/puremodule.py @@ -83,6 +83,7 @@ def step(self, pose: Pose, out: Outputs) -> None: from __future__ import annotations from dataclasses import dataclass +import functools import inspect import queue import sys @@ -105,7 +106,7 @@ def step(self, pose: Pose, out: Outputs) -> None: from dimos.core.resource import CompositeResource from dimos.core.stream import In, Out from dimos.memory2.buffer import BackpressureBuffer, ClosedError, KeepLast -from dimos.memory2.health import Health, HealthMonitor +from dimos.memory2.health import Health, HealthConfig, HealthMonitor, ModuleContracts from dimos.memory2.store.null import NullStore from dimos.memory2.tick import ( MISSING, @@ -130,7 +131,16 @@ def step(self, pose: Pose, out: Outputs) -> None: logger = setup_logger() -__all__ = ["Outputs", "PureModule", "interpolate", "latest", "tick", "window"] +__all__ = [ + "OutContract", + "Outputs", + "PureModule", + "contract", + "interpolate", + "latest", + "tick", + "window", +] _STOP = object() @@ -152,6 +162,34 @@ class _Plan: params: tuple[_Param, ...] 
stateful: bool uses_out: bool # step declares the reserved `out` writer parameter + expect_hz: dict[str, float] # class-declared per-input rates + missing_ratio: dict[str, float] # class-declared per-input missing thresholds + out_min_hz: dict[str, float] # class-declared per-output rate contracts + + +class OutContract: + """Class-level health contract on an ``Out`` port (see :func:`contract`).""" + + def __init__(self, min_hz: float | None = None) -> None: + if min_hz is not None and min_hz <= 0: + raise ValueError(f"contract(min_hz) requires min_hz > 0, got {min_hz}") + self.min_hz = min_hz + + def __repr__(self) -> str: + return f"OutContract(min_hz={self.min_hz})" + + +def contract(min_hz: float | None = None) -> Any: + """Declare a per-output health contract as the port's default value:: + + cmd: Out[Twist] = contract(min_hz=10) # this port must emit >= 10 Hz + alerts: Out[str] # sparse by design — no contract + + Class-level declaration — per-port contracts live here, not in + deployment config. Typed ``Any`` so it can sit on an ``Out[X]`` + annotation. + """ + return OutContract(min_hz=min_hz) class Outputs: @@ -217,51 +255,34 @@ def bound(*args: Any, **kwargs: Any) -> Any: class PureModuleConfig(ModuleConfig): - """Deployment-side contracts and health reporting knobs. - - Semantic tolerances (``max_age``, ``tolerance``) live in the module's - sampler declarations; *rates* live here because sim, replay, and the - robot legitimately differ. + """Deployment-side module-wide contracts and health mechanics. + + Per-port contracts live in the module's class declaration only — + samplers (``tick(expect_hz=30)``, ``latest(max_missing_ratio=...)``) + and ``contract(min_hz=...)`` on ``Out`` ports. Per-port *deployment* + overrides were tried and removed (no consumer; three ways to say one + thing) — re-add when a real deployment needs them. 
This config + carries what genuinely varies per deployment:: + + Follower( + contracts={"max_drop_ratio": 0.8}, + health={"warmup_s": 1.0}, + ) """ - expected_hz: dict[str, float] = Field(default_factory=dict) - """Expected arrival rate per input — checked once after warmup, then - continuously (violation below 50% of expected).""" - - min_output_hz: float | None = None - """Contract: rate of ticks that emit at least one output. Absolute — - the liveness floor that ratio contracts can't provide.""" - - max_drop_ratio: float | None = None - """Contract: fraction of viable ticks skipped by backpressure. - Scale-free "the step keeps up" — independent of deployment rates.""" - - max_tick_latency_s: float | None = None - """Contract: p99 end-to-end latency, trigger arrival to outputs - published. Meaningful under every backpressure policy (under - ``Unbounded`` queue growth shows up here first).""" - - max_missing_ratio: float = Field(0.5, gt=0.0, le=1.0) - """Per-input staleness contract: fraction of resolved ticks where the - input was missing before it's flagged.""" + contracts: ModuleContracts = Field(default_factory=ModuleContracts) + """Module-wide promises: ``min_output_hz``, ``max_drop_ratio``, + ``max_tick_latency_s``, global ``max_missing_ratio``.""" - ratio_min_samples: int = Field(10, ge=1) - """Ratio contracts only evaluate on windows with at least this many - samples — tiny windows make ratios noise, zero traffic makes them - vacuously pass.""" - - health_interval_s: float = 1.0 - health_warmup_s: float = 5.0 - unhealthy_log_every_s: float = 10.0 - stall_after_s: float = 5.0 - health_stream: bool = True + health: HealthConfig = Field(default_factory=HealthConfig) + """Reporting mechanics: ``interval_s``, ``warmup_s``, + ``unhealthy_log_every_s``, ``stall_after_s``, ``ratio_min_samples``, + ``stream``.""" max_pending_ticks: int = 64 """Cap on live ticks awaiting interpolation brackets — bounds memory when an ``interpolate()`` input dies; evictions count as 
``drops_blocked``. Offline ``over()`` is uncapped (exact).""" - """Append 1 Hz aggregated Health snapshots to a ``_health`` stream in - the module store (live-only on NullStore, recorded on SqliteStore).""" class PureModule(Module): @@ -283,6 +304,23 @@ class PureModule(Module): indexers) or ``Bounded(n)``/``DropNew(n)`` in between. The instance is a template — each ``start()`` gets a fresh ``clone()``.""" + input_sources: dict[str, Any] | None = None + """Per-input source overrides for the live runner — the live↔stored switch. + + Set before ``start()``: ``{input_name: source}`` where a source is + anything with ``.observable()`` (a :class:`ReplayStream` for + wall-clock-paced recordings, a stored memory2 stream for + fast-as-possible feeding) or a raw RxPY observable. Inputs not listed + keep their pub/sub port. Sources emitting :class:`Observation` keep + their recorded timestamps; raw payloads are stamped like port arrivals + (``msg.ts`` if present, else now). A module with *every* input sourced + needs no transports at all:: + + m = Navigator() + m.input_sources = {"pose": db.replay(speed=2.0).streams.pose} + m.start() # same module, fed from a recording, paced 2x + """ + def step(self, *args: Any, **kwargs: Any) -> Any: # pragma: no cover - overridden raise NotImplementedError(f"{type(self).__name__} must define step()") @@ -306,6 +344,9 @@ def _build_plan(cls) -> _Plan: outs: dict[str, type] = {} samplers: dict[str, Sampler] = {} trigger: str | None = None + expect_hz: dict[str, float] = {} + missing_ratio: dict[str, float] = {} + out_min_hz: dict[str, float] = {} for name, ann in hints.items(): if get_origin(ann) is typing.Annotated: @@ -336,8 +377,17 @@ def _build_plan(cls) -> _Plan: samplers[name] = sampler else: samplers[name] = latest() + declared = sampler if isinstance(sampler, Sampler) else None + if declared is not None: + if declared.expect_hz is not None: + expect_hz[name] = declared.expect_hz + if declared.max_missing_ratio is not None: + 
missing_ratio[name] = declared.max_missing_ratio elif origin is Out: outs[name] = (get_args(ann) or (object,))[0] + marker = inspect.getattr_static(cls, name, None) + if isinstance(marker, OutContract) and marker.min_hz is not None: + out_min_hz[name] = marker.min_hz if trigger is None: raise TypeError( @@ -389,6 +439,9 @@ def _build_plan(cls) -> _Plan: params=tuple(params), stateful=stateful, uses_out=uses_out, + expect_hz=expect_hz, + missing_ratio=missing_ratio, + out_min_hz=out_min_hz, ) # -- binding & dispatch ----------------------------------------------------- @@ -583,7 +636,7 @@ def start(self) -> None: name: store.stream(name, port.type) for name, port in self.outputs.items() } - health_stream = store.stream("_health", dict) if cfg.health_stream else None + health_stream = store.stream("_health", dict) if cfg.health.stream else None def _sink(h: Health) -> None: assert health_stream is not None @@ -593,16 +646,12 @@ def _sink(h: Health) -> None: monitor = HealthMonitor( str(self), - expected_hz=cfg.expected_hz, - min_output_hz=cfg.min_output_hz, - max_drop_ratio=cfg.max_drop_ratio, - max_tick_latency_s=cfg.max_tick_latency_s, - max_missing_ratio=cfg.max_missing_ratio, - ratio_min_samples=cfg.ratio_min_samples, - interval_s=cfg.health_interval_s, - warmup_s=cfg.health_warmup_s, - unhealthy_log_every_s=cfg.unhealthy_log_every_s, - stall_after_s=cfg.stall_after_s, + contracts=cfg.contracts, + health=cfg.health, + # per-port contracts come from the class declaration (the plan) + expected_hz=plan.expect_hz, + out_min_hz=plan.out_min_hz, + missing_ratio_by_input=plan.missing_ratio, sink=_sink if health_stream is not None else None, ) self.health_monitor = monitor @@ -615,24 +664,52 @@ def _sink(h: Health) -> None: buffer_len=lambda: len(ticks), pending_len=lambda: len(machine.pending) ) + sources = dict(self.input_sources or {}) + unknown_sources = set(sources) - set(plan.ins) + if unknown_sources: + raise TypeError( + f"{type(self).__name__}.input_sources has 
unknown inputs " + f"{sorted(unknown_sources)} — declared: {sorted(plan.ins)}" + ) + + def _ingest(name: str, item: Any) -> None: + """Feed one arrival — a raw payload (port/replay) or an Observation.""" + if isinstance(item, Observation): + obs = item # sourced from a store: keep the recorded ts + self._streams[name].append(obs.data, ts=obs.ts) + else: + ts = getattr(item, "ts", None) or time.time() + self._streams[name].append(item, ts=ts) + obs = Observation(ts=ts, data_type=type(item), _data=item) + monitor.on_input(name) + q.put((name, obs)) + + def _source_error(name: str, e: Exception) -> None: + logger.exception("%s: input source %r failed: %s", self, name, e) + for name, port in self.inputs.items(): if name not in plan.ins: continue - - def _on_msg(msg: Any, _name: str = name) -> None: - ts = getattr(msg, "ts", None) or time.time() - self._streams[_name].append(msg, ts=ts) - monitor.on_input(_name) - q.put((_name, Observation(ts=ts, data_type=type(msg), _data=msg))) - - self.register_disposable(Disposable(port.subscribe(_on_msg))) + source = sources.get(name) + if source is None: + self.register_disposable( + Disposable(port.subscribe(functools.partial(_ingest, name))) + ) + else: + observable = source.observable() if hasattr(source, "observable") else source + self.register_disposable( + observable.subscribe( + on_next=functools.partial(_ingest, name), + on_error=functools.partial(_source_error, name), + ) + ) def _align_loop() -> None: """Drain raw events fast; resolve + bind ticks; never blocks on step.""" blocked_seen = 0 while True: try: - item = q.get(timeout=cfg.health_interval_s) + item = q.get(timeout=cfg.health.interval_s) except queue.Empty: monitor.maybe_report() continue @@ -661,7 +738,7 @@ def _step_loop() -> None: state = self.initial_state while True: try: - tobs, kwargs, ages = ticks.take(timeout=cfg.health_interval_s) + tobs, kwargs, ages = ticks.take(timeout=cfg.health.interval_s) except TimeoutError: monitor.maybe_report() continue diff 
--git a/dimos/memory2/test_health.py b/dimos/memory2/test_health.py index e1c97ce4c6..9add4325b5 100644 --- a/dimos/memory2/test_health.py +++ b/dimos/memory2/test_health.py @@ -21,7 +21,15 @@ import pytest import dimos.memory2.health -from dimos.memory2.health import DEGRADED, OK, STALLED, Health, HealthMonitor +from dimos.memory2.health import ( + DEGRADED, + OK, + STALLED, + Health, + HealthConfig, + HealthMonitor, + ModuleContracts, +) class LogSpy: @@ -65,11 +73,32 @@ def advance(self, dt: float) -> float: return self.t +_CONTRACT_KEYS = {"min_output_hz", "max_drop_ratio", "max_tick_latency_s", "max_missing_ratio"} +_HEALTH_KEYS = { + "interval_s", + "warmup_s", + "unhealthy_log_every_s", + "stall_after_s", + "ratio_min_samples", + "stream", +} + + def make(clock: Clock, **kwargs: Any) -> tuple[HealthMonitor, list[Health]]: + """Build a monitor from flat kwargs, routed into the structured sub-configs.""" snaps: list[Health] = [] - kwargs.setdefault("interval_s", 1.0) - kwargs.setdefault("warmup_s", 0.0) # most tests skip warmup gating - m = HealthMonitor("mod", clock=clock, sink=snaps.append, **kwargs) + health_kwargs = {k: v for k, v in kwargs.items() if k in _HEALTH_KEYS} + health_kwargs.setdefault("interval_s", 1.0) + health_kwargs.setdefault("warmup_s", 0.0) # most tests skip warmup gating + rest = {k: v for k, v in kwargs.items() if k not in _CONTRACT_KEYS | _HEALTH_KEYS} + m = HealthMonitor( + "mod", + clock=clock, + sink=snaps.append, + contracts=ModuleContracts(**{k: v for k, v in kwargs.items() if k in _CONTRACT_KEYS}), + health=HealthConfig(**health_kwargs), + **rest, + ) return m, snaps @@ -270,6 +299,31 @@ def test_missing_ratio_is_configurable() -> None: assert report(m, clock).state == OK +def test_per_output_contract_only_flags_contracted_port() -> None: + clock = Clock() + m, _ = make(clock, out_min_hz={"cmd": 10.0}) + for _ in range(3): + m.on_output("cmd") # 3 Hz < 10 Hz contract + # 'alerts' emits nothing at all — sparse by design, 
uncontracted, no violation + h = report(m, clock) + assert h.state == DEGRADED + assert h.violations == ("output 'cmd' at 3.0 Hz < contract 10 Hz",) + + for _ in range(12): + m.on_output("cmd") + assert report(m, clock).state == OK + + +def test_per_input_missing_ratio_overrides_global() -> None: + clock = Clock() + m, _ = make(clock, max_missing_ratio=0.5, missing_ratio_by_input={"gps": 0.9}) + for _ in range(10): + m.on_resolved() + for _ in range(8): + m.on_missing(["gps"]) # 80% — over global 0.5, under gps's declared 0.9 + assert report(m, clock).state == OK + + # -- contract messages (log lines) --------------------------------------------------- diff --git a/dimos/memory2/test_puremodule.py b/dimos/memory2/test_puremodule.py index 9259e63503..db41594d8c 100644 --- a/dimos/memory2/test_puremodule.py +++ b/dimos/memory2/test_puremodule.py @@ -24,7 +24,15 @@ import pytest from dimos.core.stream import In, Out -from dimos.memory2.puremodule import Outputs, PureModule, interpolate, latest, tick, window +from dimos.memory2.puremodule import ( + Outputs, + PureModule, + contract, + interpolate, + latest, + tick, + window, +) from dimos.memory2.store.memory import MemoryStore from dimos.memory2.tick import Interpolate, Latest, TickMachine, Window from dimos.memory2.type.observation import Observation @@ -540,6 +548,16 @@ def _collect(msg: float) -> None: module.stop() +def _reset_rx_pool() -> None: + """Replace the shared RxPY thread pool so the conftest leak check passes.""" + from reactivex.scheduler import ThreadPoolScheduler + + import dimos.utils.threadpool as tp + + tp.scheduler.executor.shutdown(wait=True) + tp.scheduler = ThreadPoolScheduler(max_workers=tp.get_max_workers()) + + def _await(condition: Callable[[], bool], timeout: float = 5.0) -> bool: """Bounded wait on a cheap condition — no fixed sleeps in assertions.""" deadline = time.monotonic() + timeout @@ -617,6 +635,102 @@ def step(self, frame: int) -> int: module.stop() +@pytest.mark.tool +def 
test_input_sources_feed_live_runner_from_store(store: MemoryStore) -> None: + """The live↔stored switch: all inputs sourced -> no transports anywhere.""" + from dimos.memory2.buffer import Unbounded + + gates = _GatedSteps() + + class TsEcho(PureModule): + frame: In[int] = tick() + out: Out[float] + backpressure = Unbounded() + + def step(self, frame: int, ts: float) -> float: + gates.gate() + return ts + + recorded = fill(store.stream("frames", int), [(10.0, 1), (10.5, 2), (11.0, 3)]) + + module = TsEcho() + module.input_sources = {"frame": recorded} # a stored stream is a valid source + outs: list[float] = [] + unsub = module.out.subscribe(outs.append) + + module.start() + try: + for _ in range(3): + assert gates.step_once() + assert _await(lambda: len(outs) == 3) + assert outs == [10.0, 10.5, 11.0] # recorded timestamps, not wall clock + finally: + gates.unblock() + unsub() + module.stop() + _reset_rx_pool() # source.observable() schedules on the shared rx pool + + +class Contracted(PureModule): + frame: In[int] = tick(expect_hz=30) + gps: In[str] = latest(max_age=0.5, expect_hz=1, max_missing_ratio=0.9) + + cmd: Out[str] = contract(min_hz=10) + alerts: Out[str] # sparse by design — no contract + + def step(self, frame: int, gps: str | None) -> dict[str, str]: + return {"cmd": f"go {frame}"} + + +def test_class_level_contracts_collected_in_plan() -> None: + plan = Contracted._plan() + assert plan.expect_hz == {"frame": 30, "gps": 1} + assert plan.missing_ratio == {"gps": 0.9} + assert plan.out_min_hz == {"cmd": 10} + + +def test_contracted_out_port_still_declared_in_blueprint() -> None: + bp = Contracted.blueprint() + (atom,) = bp.blueprints + assert {"frame", "gps", "cmd", "alerts"} <= {s.name for s in atom.streams} + + +@pytest.mark.tool +def test_class_contracts_reach_the_monitor(store: MemoryStore) -> None: + module = Contracted(contracts={"max_drop_ratio": 0.8}) # module-wide config still applies + module.input_sources = { # empty sourced inputs: no 
transports needed + "frame": store.stream("frame", int), + "gps": store.stream("gps", str), + } + try: + module.start() + assert module.health_monitor.expected_hz == {"frame": 30, "gps": 1} + assert module.health_monitor.out_min_hz == {"cmd": 10} + assert module.health_monitor.missing_ratio_by_input == {"gps": 0.9} + assert module.health_monitor.max_drop_ratio == 0.8 + finally: + module.stop() + _reset_rx_pool() + + +@pytest.mark.tool +def test_input_sources_unknown_name_raises() -> None: + class Echo(PureModule): + frame: In[int] = tick() + out: Out[int] + + def step(self, frame: int) -> int: + return frame + + module = Echo() + module.input_sources = {"typo": object()} + try: + with pytest.raises(TypeError, match="typo"): + module.start() + finally: + module.stop() + + @pytest.mark.tool def test_live_backpressure_unbounded_processes_everything() -> None: from dimos.core.transport import pLCMTransport diff --git a/dimos/memory2/tick.py b/dimos/memory2/tick.py index 00fd21b0d5..47125e2994 100644 --- a/dimos/memory2/tick.py +++ b/dimos/memory2/tick.py @@ -168,7 +168,24 @@ def interp_data(a: Any, b: Any, alpha: float, t: float) -> Any: class Sampler(ABC): - """How to read one input's value at a tick time *t* from its buffer.""" + """How to read one input's value at a tick time *t* from its buffer. + + Samplers also carry the input's *declared health contracts* + (``expect_hz``, ``max_missing_ratio``) — class-level defaults that + deployment config may override per key. Alignment ignores them; the + health monitor reads them off the plan. 
+ """ + + expect_hz: float | None = None + max_missing_ratio: float | None = None + + def _init_contracts(self, expect_hz: float | None, max_missing_ratio: float | None) -> None: + if expect_hz is not None and expect_hz <= 0: + raise ValueError(f"expect_hz must be > 0, got {expect_hz}") + if max_missing_ratio is not None and not (0.0 < max_missing_ratio <= 1.0): + raise ValueError(f"max_missing_ratio must be in (0, 1], got {max_missing_ratio}") + self.expect_hz = expect_hz + self.max_missing_ratio = max_missing_ratio @abstractmethod def sample(self, buf: list[Observation[Any]], t: float, exhausted: bool) -> Any: @@ -195,6 +212,9 @@ def _ts_list(buf: list[Observation[Any]]) -> list[float]: class Tick(Sampler): """Marks the input whose observations fire the ticks.""" + def __init__(self, expect_hz: float | None = None) -> None: + self._init_contracts(expect_hz, None) + def sample(self, buf: list[Observation[Any]], t: float, exhausted: bool) -> Any: raise RuntimeError("Tick sampler is never sampled — it drives the clock") @@ -202,10 +222,16 @@ def sample(self, buf: list[Observation[Any]], t: float, exhausted: bool) -> Any: class Latest(Sampler): """Newest observation with ``ts <= t``; MISSING if none (or older than max_age).""" - def __init__(self, max_age: float | None = None) -> None: + def __init__( + self, + max_age: float | None = None, + expect_hz: float | None = None, + max_missing_ratio: float | None = None, + ) -> None: if max_age is not None and max_age <= 0: raise ValueError(f"latest(max_age) requires max_age > 0, got {max_age}") self.max_age = max_age + self._init_contracts(expect_hz, max_missing_ratio) def sample(self, buf: list[Observation[Any]], t: float, exhausted: bool) -> Any: i = bisect.bisect_right(_ts_list(buf), t) - 1 @@ -229,10 +255,16 @@ class Interpolate(Sampler): nearest observation within ``tolerance`` seconds is used as-is. 
""" - def __init__(self, tolerance: float = 0.5) -> None: + def __init__( + self, + tolerance: float = 0.5, + expect_hz: float | None = None, + max_missing_ratio: float | None = None, + ) -> None: if tolerance <= 0: raise ValueError(f"interpolate(tolerance) requires tolerance > 0, got {tolerance}") self.tolerance = tolerance + self._init_contracts(expect_hz, max_missing_ratio) def sample(self, buf: list[Observation[Any]], t: float, exhausted: bool) -> Any: ts = _ts_list(buf) @@ -265,10 +297,15 @@ def __repr__(self) -> str: class Window(Sampler): """All observations with ``t - seconds < ts <= t``, as a list (may be empty).""" - def __init__(self, seconds: float) -> None: + def __init__( + self, + seconds: float, + expect_hz: float | None = None, + ) -> None: if seconds <= 0: raise ValueError(f"window(seconds) requires seconds > 0, got {seconds}") self.seconds = seconds + self._init_contracts(expect_hz, None) def sample(self, buf: list[Observation[Any]], t: float, exhausted: bool) -> Any: ts = _ts_list(buf) @@ -283,29 +320,43 @@ def __repr__(self) -> str: return f"Window(seconds={self.seconds})" -def tick() -> Any: +def tick(expect_hz: float | None = None) -> Any: """This input fires the ticks — the module steps once per observation. Declared as a port default: ``image: In[Image] = tick()``. Exactly one - input must be the tick. (Typed ``Any`` so it can sit on an ``In[X]`` - annotation, pydantic-``Field()`` style.) + input must be the tick. ``expect_hz`` declares the input's expected + arrival rate as a class-level health contract (deployment config + overrides). (Typed ``Any`` so it can sit on an ``In[X]`` annotation, + pydantic-``Field()`` style.) """ - return Tick() + return Tick(expect_hz=expect_hz) + +def latest( + max_age: float | None = None, + expect_hz: float | None = None, + max_missing_ratio: float | None = None, +) -> Any: + """Sample this input as the newest observation at the tick time (hold). 
-def latest(max_age: float | None = None) -> Any: - """Sample this input as the newest observation at the tick time (hold).""" - return Latest(max_age) + ``expect_hz``/``max_missing_ratio`` declare class-level health + contracts for this input (deployment config overrides per key). + """ + return Latest(max_age, expect_hz=expect_hz, max_missing_ratio=max_missing_ratio) -def interpolate(tolerance: float = 0.5) -> Any: +def interpolate( + tolerance: float = 0.5, + expect_hz: float | None = None, + max_missing_ratio: float | None = None, +) -> Any: """Sample this input by interpolating to the tick time (lerp/slerp).""" - return Interpolate(tolerance) + return Interpolate(tolerance, expect_hz=expect_hz, max_missing_ratio=max_missing_ratio) -def window(seconds: float) -> Any: +def window(seconds: float, expect_hz: float | None = None) -> Any: """Sample this input as the list of observations in the trailing window.""" - return Window(seconds) + return Window(seconds, expect_hz=expect_hz) # -- The machine ------------------------------------------------------------- diff --git a/docs/usage/pure_modules.md b/docs/usage/pure_modules.md index 29b59cadb5..a8eea6ed06 100644 --- a/docs/usage/pure_modules.md +++ b/docs/usage/pure_modules.md @@ -336,7 +336,7 @@ class Follower(PureModule): def step(self, image: Image, pose: PoseStamped) -> Twist: return chase(image, pose) -blueprint.add(Follower, expected_hz={"image": 30, "pose": 50}, min_output_hz=10) +blueprint.add(Follower, contracts={"min_output_hz": 10}) ``` Two deployment choices matter: @@ -351,6 +351,56 @@ Two deployment choices matter: step, dropping two thirds of ticks is the system working as designed. `Unbounded()` gives recorder semantics: never drop. +Multi-output modules need nothing extra live — each output publishes on +its own port, and partial emission just means a port stays quiet that +tick. 
Consumers subscribe independently: + +```python skip +m = Navigator() # the multi-output module from above +m.cmd.subscribe(controller.on_cmd) # fires every tick +m.alerts.subscribe(notifier.send) # fires only when step assigned it +m.start() +``` + +## Switching inputs: live ↔ recorded + +The same *deployed* module — live runner, backpressure, health contracts +and all — can be fed from a recording instead of its ports. Set +`input_sources` before `start()`; inputs not listed keep their port, and +a module with every input sourced needs **no transports at all**: + +```python skip +m = Navigator() +m.input_sources = {"pose": db.replay(speed=2.0).streams.pose} # paced, 2x +# or: m.input_sources = {"pose": db.streams.pose} # fast-as-possible +m.start() +``` + +Captured from a real run of both modes over the same five poses: + +``` +== live: ports, one consumer per output == +cmd port got 5: last = 'forward x=2.5' +alerts port got 2: ['boundary at x=2.0 (t=1781258336.6)', 'boundary at x=2.5 (t=1781258336.7)'] +== same class, fed from a recording (no transports) == +cmd got 5: last = 'forward x=2.5' +alerts got 2: ['boundary at x=2.0 (t=101.5)', 'boundary at x=2.5 (t=102.0)'] +``` + +Note the timestamps: sourced inputs keep their **recorded** time, so +alignment, `ts`, and `max_age` behave exactly as they did on the robot. +This differs from `over()` on purpose — `over()` is the pull-based exact +path for development; `input_sources` exercises the *real live machinery* +(threads, backpressure policy, health contracts) against recorded data. 
+The full matrix: + +| Mode | Inputs | Pacing | Use it for | +|----------------------------------------|----------------|--------------------|---------------------------------------------------------| +| `over(...)` | stored streams | none (pull) | development, exact deterministic replay | +| ports | pub/sub | sensor-driven | the robot | +| `input_sources` + stored stream | recording | fast-as-possible | integration-testing the live path | +| `input_sources` + `db.replay(speed=…)` | recording | wall-clock × speed | rehearsing contracts & backpressure against a recording | + ## Contracts, not log spam Your module contains **zero health code** — health is judged against @@ -361,12 +411,23 @@ unacceptable". The other contracts are rates, and they're two numbers in the module config — deployment-side, because the robot, sim, and replay legitimately differ: +Per-port rates are declared on the class, where the port is:: + +```python skip +class Follower(PureModule): + frame: In[Image] = tick(expect_hz=30) # "the camera arrives at 30 Hz" + cmd: Out[Twist] = contract(min_hz=10) # "this port emits at >= 10 Hz" + ... +``` + +while module-wide contracts and mechanics come from deployment config: + ```python skip module = Follower( - expected_hz={"frame": 30}, # "the camera should arrive at 30 Hz" - min_output_hz=10.0, # "I must emit commands at >= 10 Hz" - max_drop_ratio=0.5, # "skip at most half my frames" (scale-free) - max_tick_latency_s=0.1, # "commands come from <= 100ms-old frames" + contracts={ + "max_drop_ratio": 0.5, # "skip at most half my frames" (scale-free) + "max_tick_latency_s": 0.1, # "commands come from <= 100ms-old frames" + }, ) module.start() ``` @@ -504,11 +565,11 @@ deployment choice, not module code. 
Backpressure: live, resolved ticks flow through a `BackpressureBuffer` between the alignment thread and the step thread — -| Policy | Semantics | -|---|---| -| `KeepLast()` (default) | controller: always step the freshest tick, count the skipped | -| `Unbounded()` | recorder/indexer: never drop, memory-bounded only by consumption | -| `Bounded(n)` / `DropNew(n)` | bounded queue dropping oldest / rejecting newest | +| Policy | Semantics | +|-----------------------------|------------------------------------------------------------------| +| `KeepLast()` (default) | controller: always step the freshest tick, count the skipped | +| `Unbounded()` | recorder/indexer: never drop, memory-bounded only by consumption | +| `Bounded(n)` / `DropNew(n)` | bounded queue dropping oldest / rejecting newest | Every queue in the path is bounded: the tick buffer by policy, alignment buffers by pruning, and ticks waiting for interpolation brackets by @@ -527,13 +588,41 @@ ticks resolving (interpolate input dead?)". 
The contracts: -| Config | Contract | Kind | -|---|---|---| -| `expected_hz={"pose": 50}` | input arrives at its declared rate | absolute (liveness) | -| `min_output_hz=10` | ticks emit outputs at this rate | absolute (liveness) | -| `max_drop_ratio=0.8` | step keeps up: ≤ this fraction of viable ticks skipped by backpressure | ratio (scale-free) | -| `max_missing_ratio=0.5` | per input: ≤ this fraction of ticks with the input missing | ratio (scale-free) | -| `max_tick_latency_s=0.2` | p99 trigger-arrival → outputs-published; covers queue growth under any policy | latency | +Per-port contracts are class declarations; module-wide contracts and +mechanics are deployment config: + +| Declared | Contract | Kind | +|-------------------------------------------|-------------------------------------------------------------------------------|---------------------| +| `tick(expect_hz=30)` / `latest(expect_hz=…)` | input arrives at its declared rate | absolute (liveness) | +| `latest(max_missing_ratio=0.3)` | per input: ≤ this fraction of ticks with the input missing | ratio (scale-free) | +| `cmd: Out[T] = contract(min_hz=10)` | this output port emits at its rate | absolute, per port | +| `contracts={"min_output_hz": 10}` | ticks emit *any* output at this rate | absolute (liveness) | +| `contracts={"max_drop_ratio": 0.8}` | step keeps up: ≤ this fraction of viable ticks skipped by backpressure | ratio (scale-free) | +| `contracts={"max_tick_latency_s": 0.2}` | p99 trigger-arrival → outputs-published; covers queue growth under any policy | latency | +| `health={"warmup_s": 5, "interval_s": 1}` | not contracts — reporting mechanics (warmup, throttle, stall window, samples) | — | + +Per-input and per-output contracts are declared **on the class +itself**, right where the port is declared — samplers take `expect_hz` / +`max_missing_ratio`, and an `Out` port takes a `contract()` default: + +```python skip +class Follower(PureModule): + image: In[Image] = 
tick(expect_hz=30) + pose: In[PoseStamped] = interpolate(expect_hz=50) + gps: In[str] = latest(max_age=2.0, max_missing_ratio=0.9) # flaky is fine + + cmd_vel: Out[Twist] = contract(min_hz=10) # the robot's heartbeat + alerts: Out[str] # sparse by design — no contract +``` + +These are the rates the module was built for, declared once where the +port is. (Per-port *deployment* overrides were tried and removed — three +ways to say one thing with no consumer; module-wide `contracts=` and +`health=` remain the deployment knobs, and per-port overrides can return +if a real deployment needs them.) Per-output contracts fix the +multi-output gap: `contracts.min_output_hz` counts ticks that emitted +*anything*, which a deliberately sparse `alerts` port would drag down — +`contract(min_hz=...)` checks each port on its own. Ratio and latency contracts only evaluate on windows with at least `ratio_min_samples` samples — tiny windows make ratios noise, and at zero From 9b88e5243f11b2c3dcc719baa6ac0fafc6f313b2 Mon Sep 17 00:00:00 2001 From: Ivan Nikolic Date: Mon, 15 Jun 2026 17:06:53 +0800 Subject: [PATCH 10/10] pure module API iteration --- dimos/memory2/health.py | 7 +++ dimos/memory2/puremodule.py | 77 ++++++++++++++++++++++++++++++-- dimos/memory2/test_puremodule.py | 72 +++++++++++++++++++++++++++++ docs/usage/pure_modules.md | 60 ++++++++++++------------- 4 files changed, 182 insertions(+), 34 deletions(-) diff --git a/dimos/memory2/health.py b/dimos/memory2/health.py index b86898c037..d6054b7080 100644 --- a/dimos/memory2/health.py +++ b/dimos/memory2/health.py @@ -189,6 +189,7 @@ def __init__( self._warmup_done = False self._state = OK self._state_since = self._t0 + self._latest: Health | None = None self._win = _Window() self._total_inputs: dict[str, int] = {} self._step_ms: deque[float] = deque(maxlen=256) @@ -281,6 +282,7 @@ def _report(self, win: _Window, dt: float, now: float) -> Health: self._transition(state, violations, now) health = Health(ts=now, state=state, 
violations=tuple(violations), metrics=metrics) + self._latest = health if self.sink is not None: try: self.sink(health) @@ -400,6 +402,11 @@ def _log_warmup(self, now: float) -> None: def state(self) -> str: return self._state + @property + def latest(self) -> Health | None: + """The most recent snapshot, or ``None`` before the first report.""" + return self._latest + def counters(self) -> dict[str, Any]: """Current window counters — for tests and debugging.""" with self._lock: diff --git a/dimos/memory2/puremodule.py b/dimos/memory2/puremodule.py index e23403f809..cff9a6f262 100644 --- a/dimos/memory2/puremodule.py +++ b/dimos/memory2/puremodule.py @@ -285,6 +285,51 @@ class PureModuleConfig(ModuleConfig): ``drops_blocked``. Offline ``over()`` is uncapped (exact).""" +class HealthView: + """Read-only health surface for a running module — ``module.health``. + + Wraps the internal :class:`HealthMonitor` and the ``_health`` stream so + callers never touch either directly. ``state``/``latest`` are in-process + (a supervisor, readiness probe, or test assertion); ``stream`` and + ``subscribe`` ride the module's store — discarded on a ``NullStore``, + recorded next to the data on a ``SqliteStore``. + """ + + __slots__ = ("_monitor", "_stream") + + def __init__(self, monitor: HealthMonitor, stream: Any | None) -> None: + self._monitor = monitor + self._stream = stream + + @property + def state(self) -> str: + """Current state: ``'OK'`` | ``'DEGRADED'`` | ``'STALLED'``.""" + return self._monitor.state + + @property + def latest(self) -> Health | None: + """The most recent snapshot, or ``None`` before the first report.""" + return self._monitor.latest + + @property + def stream(self) -> Any | None: + """The ``_health`` memory2 stream (``None`` if ``health.stream`` is off). + + An ordinary stream: ``.live()`` to tail, ``.before(t).to_list()`` to + query a recording. ``None`` when health snapshots are disabled. 
+ """ + return self._stream + + def subscribe(self, on_health: Any) -> Any: + """Subscribe to live snapshots. Raises if ``health.stream`` is off.""" + if self._stream is None: + raise RuntimeError( + "health snapshots are disabled (config.health.stream=False) — " + "nothing to subscribe to" + ) + return self._stream.live().subscribe(on_health) + + class PureModule(Module): """Base class for modules implementing a pure ``step`` over aligned inputs. @@ -324,6 +369,21 @@ class PureModule(Module): def step(self, *args: Any, **kwargs: Any) -> Any: # pragma: no cover - overridden raise NotImplementedError(f"{type(self).__name__} must define step()") + @property + def health(self) -> HealthView: + """Read-only health surface (see :class:`HealthView`). + + Available only after ``start()`` — a pure ``over()`` run has no + runtime to monitor. + """ + view: HealthView | None = getattr(self, "_health_view", None) + if view is None: + raise RuntimeError( + f"{type(self).__name__}.health is available only after start() " + f"(no health monitor on an offline over() run)" + ) + return view + # -- plan ----------------------------------------------------------------- @classmethod @@ -360,9 +420,10 @@ def _build_plan(cls) -> _Plan: continue origin = get_origin(ann) if origin is In: - if name in ("ts", "state", "out"): + if name in ("ts", "state", "out", "health"): raise TypeError( - f"{cls.__name__}.{name}: 'ts', 'state' and 'out' are reserved names" + f"{cls.__name__}.{name}: 'ts', 'state', 'out' and 'health' " + f"are reserved names" ) ins[name] = (get_args(ann) or (object,))[0] sampler = inspect.getattr_static(cls, name, None) @@ -384,6 +445,11 @@ def _build_plan(cls) -> _Plan: if declared.max_missing_ratio is not None: missing_ratio[name] = declared.max_missing_ratio elif origin is Out: + if name == "health": + raise TypeError( + f"{cls.__name__}.{name}: 'health' is a reserved name " + f"(shadows module.health)" + ) outs[name] = (get_args(ann) or (object,))[0] marker = 
inspect.getattr_static(cls, name, None) if isinstance(marker, OutContract) and marker.min_hz is not None: @@ -629,14 +695,16 @@ def start(self) -> None: plan = self._plan() cfg = self.config - store = self.register_disposable(self.make_store()) + store = self._store = self.register_disposable(self.make_store()) store.start() self._streams = {name: store.stream(name, port.type) for name, port in self.inputs.items()} self._out_streams = { name: store.stream(name, port.type) for name, port in self.outputs.items() } - health_stream = store.stream("_health", dict) if cfg.health.stream else None + health_stream = self._health_stream = ( + store.stream("_health", dict) if cfg.health.stream else None + ) def _sink(h: Health) -> None: assert health_stream is not None @@ -655,6 +723,7 @@ def _sink(h: Health) -> None: sink=_sink if health_stream is not None else None, ) self.health_monitor = monitor + self._health_view = HealthView(monitor, health_stream) q: queue.SimpleQueue[Any] = queue.SimpleQueue() self._queue = q diff --git a/dimos/memory2/test_puremodule.py b/dimos/memory2/test_puremodule.py index db41594d8c..acbc180368 100644 --- a/dimos/memory2/test_puremodule.py +++ b/dimos/memory2/test_puremodule.py @@ -713,6 +713,78 @@ def test_class_contracts_reach_the_monitor(store: MemoryStore) -> None: _reset_rx_pool() +def test_health_reserved_as_in_port_name() -> None: + class BadIn(PureModule): + health: In[int] = tick() # type: ignore[misc] + + def step(self, health: int) -> int: + return health + + with pytest.raises(TypeError, match="reserved"): + BadIn._plan() + + +def test_health_reserved_as_out_port_name() -> None: + class BadOut(PureModule): + frame: In[int] = tick() + health: Out[int] # type: ignore[misc] + + def step(self, frame: int) -> dict[str, int]: + return {"health": frame} + + with pytest.raises(TypeError, match="reserved"): + BadOut._plan() + + +@pytest.mark.tool +def test_health_facade_unavailable_before_start() -> None: + module = Contracted() + try: + 
with pytest.raises(RuntimeError, match="only after start"): + _ = module.health + finally: + module.stop() + _reset_rx_pool() + + +@pytest.mark.tool +def test_health_facade_exposes_state_stream_and_subscribe(store: MemoryStore) -> None: + module = Contracted() + module.input_sources = { # empty sourced inputs: no transports needed + "frame": store.stream("frame", int), + "gps": store.stream("gps", str), + } + try: + module.start() + assert module.health.state == "OK" # fresh module is healthy + assert module.health.latest is None # no report emitted yet + assert module.health.stream is module._health_stream # the real _health stream + seen: list[Any] = [] + disp = module.health.subscribe(seen.append) # live subscribe wires up + disp.dispose() + finally: + module.stop() + _reset_rx_pool() + + +@pytest.mark.tool +def test_health_subscribe_raises_when_stream_disabled(store: MemoryStore) -> None: + module = Contracted(health={"stream": False}) + module.input_sources = { + "frame": store.stream("frame", int), + "gps": store.stream("gps", str), + } + try: + module.start() + assert module.health.stream is None + with pytest.raises(RuntimeError, match="disabled"): + module.health.subscribe(lambda _: None) + assert module.health.state == "OK" # state still readable + finally: + module.stop() + _reset_rx_pool() + + @pytest.mark.tool def test_input_sources_unknown_name_raises() -> None: class Echo(PureModule): diff --git a/docs/usage/pure_modules.md b/docs/usage/pure_modules.md index a8eea6ed06..db924e55b1 100644 --- a/docs/usage/pure_modules.md +++ b/docs/usage/pure_modules.md @@ -13,32 +13,32 @@ A `PureModule` splits the same job into three declarations: `interpolate()`, `window()`; - **what it computes** — `step()`, a pure function of the aligned inputs. -```diagon mode=GraphDAG -camera -> align -pose -> align -imu -> align -align -> tick -tick -> step -step -> outputs +
+diagram source + +```pikchr fold output=assets/pure_modules_dag.svg +color = white +fill = none +down + +Align: box "align" rad 5px fit wid 200% ht 170% +arrow +Step: box "step" rad 5px fit wid 170% ht 170% +arrow +Out: box "outputs" rad 5px fit wid 170% ht 170% + +Cam: box "camera" rad 5px fit wid 170% ht 170% with .s at (Align.n.x - 1.1in, Align.n.y + 0.6in) +Pose: box "pose" rad 5px fit wid 170% ht 170% with .s at (Align.n.x, Align.n.y + 0.6in) +Imu: box "imu" rad 5px fit wid 170% ht 170% with .s at (Align.n.x + 1.1in, Align.n.y + 0.6in) + +arrow from Cam.s to Align.nw +arrow from Pose.s to Align.n +arrow from Imu.s to Align.ne ``` -```results -┌──────┐┌────┐┌───┐ -│camera││pose││imu│ -└┬─────┘└┬───┘└┬──┘ -┌▽───────▽─────▽┐ -│align │ -└┬──────────────┘ -┌▽───┐ -│tick│ -└┬───┘ -┌▽───┐ -│step│ -└┬───┘ -┌▽──────┐ -│outputs│ -└───────┘ -``` +
+ +![output](assets/pure_modules_dag.svg) Because `step` never touches ports, threads, or `self`, the same class runs **live** on pub/sub ports and **offline** over stored @@ -467,12 +467,12 @@ livable: a healthy module skipping two thirds of its frames under `KeepLast` backpressure logs **nothing** — expected drops are counters, not warnings. -When logs aren't enough, the same information is queryable: -`module.health_monitor.state` gives the current `OK`/`DEGRADED`/`STALLED` -in process, and a `_health` stream in the module store receives an -aggregated metrics snapshot every second (drop rates by reason, step -p50/p99, input staleness, observed Hz) — subscribe to it live, or deploy -with a `SqliteStore` and the health history is recorded *next to the data +When logs aren't enough, the same information is queryable through +`module.health`: `module.health.state` gives the current +`OK`/`DEGRADED`/`STALLED` in process, and `module.health.subscribe(cb)` +tails a `_health` stream that receives an aggregated metrics snapshot every +second (drop rates by reason, step p50/p99, input staleness, observed Hz). +Deploy with a `SqliteStore` and that stream is recorded *next to the data it explains*, so a post-incident notebook can plot the drop ratio against the very frames that were dropped.