diff --git a/mcp-server/src/__tests__/migration-071-swarm-storage.test.ts b/mcp-server/src/__tests__/migration-071-swarm-storage.test.ts new file mode 100644 index 0000000..3d944b2 --- /dev/null +++ b/mcp-server/src/__tests__/migration-071-swarm-storage.test.ts @@ -0,0 +1,254 @@ +import { test } from "node:test"; +import assert from "node:assert/strict"; +import { readFileSync } from "node:fs"; +import { fileURLToPath } from "node:url"; +import { dirname, resolve } from "node:path"; + +// --------------------------------------------------------------------------- +// Migration 071 — swarm storage contract pin (Swarm Phase 3d, issue #88) +// +// Why this guard exists: this migration creates the on-disk floor for every +// later /swarm/lessons and /swarm/hubs endpoint. Three contracts MUST hold +// before the Phase 3/4 endpoints write or read against it: +// +// 1. The two new tables (swarm_lessons, swarm_hub_anchors) exist with +// every column docs/SWARM_SPEC.md §3.1 / §3.2 declares, in the right +// types, with the right NOT NULLs. +// 2. Every column-level CHECK that maps to a §5 rejection rule is +// present at the DB level — content size cap (rule 12), embedding +// dimension (768, rule 4), synthesized_from_cluster_size >= 2 (rule +// 11), signed_at >= created_at (rule 9), hub_score 0..1, etc. +// 3. No `trust_edges`/TrustEdge table exists. SWARM_SPEC §3.4 forbids +// putting trust on the wire; introducing such a table would create +// the temptation to JOIN it into a wire response. +// +// Same defensive pattern as migration-070-node-identity.test.ts — read the +// SQL, normalise it, regex-pin the structural contract. We can't run the +// migration here (the autonomy loop is forbidden from executing migrations, +// Reed runs them by hand after merge), but we CAN make sure a future edit +// can never silently weaken the contract these endpoints depend on. 
+// ---------------------------------------------------------------------------
+
+const __dirname = dirname(fileURLToPath(import.meta.url));
+// Test runs from mcp-server/dist/__tests__/, so the repo root is three
+// levels up (dist/__tests__ -> dist -> mcp-server -> repo-root).
+const MIGRATION_PATH = resolve(
+  __dirname,
+  "../../..",
+  "supabase/migrations/071_swarm_storage.sql",
+);
+const SQL_RAW = readFileSync(MIGRATION_PATH, "utf8");
+// Strip line and block comments BEFORE keyword checks so a phrase like
+// "no DROP, no DELETE" inside a comment cannot accidentally satisfy or
+// violate a structural assertion.
+const SQL_NO_COMMENTS = SQL_RAW.replace(/--[^\n]*/g, "").replace(
+  /\/\*[\s\S]*?\*\//g,
+  "",
+);
+// Lowercase + collapsed whitespace makes the assertions robust to indent
+// or keyword-case drift without weakening the structural contracts.
+const SQL = SQL_NO_COMMENTS.toLowerCase().replace(/\s+/g, " ");
+// Column-list text of `create table if not exists <name> ( ... );`, or ""
+// when that statement is absent. Column assertions run against this slice
+// so a column that both swarm tables declare (embedding, signed_at, ...)
+// cannot be satisfied by the sibling table's definition.
+function tableBody(name: string): string {
+  const stmt = new RegExp(`create table if not exists ${name}\\s*\\((.*?)\\)\\s*;`);
+  return stmt.exec(SQL)?.[1] ?? "";
+}
+
+// ---------------------------------------------------------------------------
+// (1) nodes — peer + local-trust columns (extends migration 070)
+// ---------------------------------------------------------------------------
+
+test("migration 071 extends `nodes` with peer + local-trust columns", () => {
+  // pubkey_b64url: §3.3 wire encoding, mirrored from raw `pubkey` BYTEA.
+  // Nullable because legacy rows from migration 070 don't have it yet.
+  assert.match(SQL, /alter table nodes\s+add column if not exists pubkey_b64url\s+text\b/);
+  // endpoint_url: §3.3 https-url base for /swarm/* endpoints. Nullable —
+  // peers we haven't fully advertised yet still get a row.
+  assert.match(SQL, /alter table nodes\s+add column if not exists endpoint_url\s+text\b/);
+  // last_seen_at: nullable — we may know about a peer we haven't reached.
+  assert.match(SQL, /alter table nodes\s+add column if not exists last_seen_at\s+timestamptz\b/);
+  // trust_weight: §3.4 TrustEdge.weight, with safe DEFAULT 0.5 so the
+  // existing is_self bootstrap row gets a neutral value without backfill.
+  assert.match(
+    SQL,
+    /alter table nodes\s+add column if not exists trust_weight\s+real\s+not null\s+default\s+0\.5\b/,
+  );
+  // trust_reason: §3.4 free-form audit note. Nullable.
+  assert.match(SQL, /alter table nodes\s+add column if not exists trust_reason\s+text\b/);
+});
+
+// ---------------------------------------------------------------------------
+// (2) swarm_lessons — §3.1 mirror, with §5 rejection-rule CHECKs
+// ---------------------------------------------------------------------------
+
+test("swarm_lessons table exists with every required §3.1 column", () => {
+  // Renaming the table would orphan every later endpoint. Pin the name.
+  assert.match(SQL, /create table if not exists swarm_lessons\b/);
+  // Column checks run against this table's own column list so they cannot
+  // be satisfied by the identically named columns of swarm_hub_anchors.
+  const lessons = tableBody("swarm_lessons");
+  assert.match(lessons, /\bid\s+uuid\s+primary key\b/);
+  // content: §3.1 ≤ 8 KiB (8192 octets) → §5 rule 12 enforced at column.
+  assert.match(
+    lessons,
+    /content\s+text\s+not null\s+check\s*\(\s*octet_length\s*\(\s*content\s*\)\s*<=\s*8192\s*\)/,
+  );
+  // embedding: §3.6 fixes 768-d nomic-embed-text. Wrong dim = §5 rule 4.
+  assert.match(lessons, /embedding\s+vector\s*\(\s*768\s*\)\s+not null\b/);
+  // synthesized_from_cluster_size: §3.1 says ≥ 1, §5 rule 11 says wire
+  // floor is 2. Enforce the wire floor at the DB so a misbehaving local
+  // synthesiser can't insert a single-source "lesson" that would later
+  // be re-served as wire-illegal.
+  assert.match(
+    lessons,
+    /synthesized_from_cluster_size\s+int\s+not null\s+check\s*\(\s*synthesized_from_cluster_size\s*>=\s*2\s*\)/,
+  );
+  // origin_node_id: provenance handle. FK to nodes(node_id) prevents
+  // dangling lesson rows when a peer is purged; pinning the FK target
+  // protects against a future rename of the parent column.
+  assert.match(lessons, /origin_node_id\s+text\s+not null\s+references nodes\s*\(\s*node_id\s*\)/);
+  assert.match(lessons, /signed_at\s+timestamptz\s+not null\b/);
+  assert.match(lessons, /created_at\s+timestamptz\s+not null\b/);
+  assert.match(lessons, /signature\s+text\s+not null\b/);
+  // tags: §3.1 optional, hence no NOT NULL.
+  assert.match(lessons, /tags\s+text\[\]/);
+  assert.match(lessons, /spec_version\s+text\s+not null\b/);
+  // received_at: local bookkeeping, server-set so a malicious peer can't
+  // forge it. NOT NULL DEFAULT NOW() means INSERT can omit it.
+  assert.match(lessons, /received_at\s+timestamptz\s+not null\s+default\s+now\s*\(\s*\)/);
+  // signature_verified_at: nullable. Set by the wire-validator (PR #89)
+  // after Ed25519 verify succeeds; NULL means "not yet verified".
+  assert.match(lessons, /signature_verified_at\s+timestamptz\b/);
+});
+
+test("swarm_lessons enforces signed_at >= created_at (§5 rule 9, defense in depth)", () => {
+  // §5 rule 9 says wire records with signed_at < created_at must be
+  // dropped. The wire-validator already checks this; we re-enforce at the
+  // DB so a bug in the validator can't silently land a backdated row.
+  assert.match(tableBody("swarm_lessons"), /check\s*\(\s*signed_at\s*>=\s*created_at\s*\)/);
+});
+
+test("swarm_lessons has the indexes /swarm/lessons §4.3 needs", () => {
+  // (signed_at) — supports the ?since= query parameter.
+  assert.match(
+    SQL,
+    /create index if not exists swarm_lessons_signed_at_idx\s+on swarm_lessons\s*\(\s*signed_at\s*\)/,
+  );
+  // (origin_node_id, signed_at) — supports duplicate-detection (§5 rule 10:
+  // duplicate id with different signature for same origin + signed_at).
+  assert.match(
+    SQL,
+    /create index if not exists swarm_lessons_origin_signed_at_idx\s+on swarm_lessons\s*\(\s*origin_node_id\s*,\s*signed_at\s*\)/,
+  );
+  // HNSW vector_cosine_ops — supports the ?topic= query parameter and
+  // matches every other embedding index in the codebase (memories,
+  // experiences, lessons, soul_traits — migrations 002, 004, 015, 017).
+  assert.match(
+    SQL,
+    /create index if not exists swarm_lessons_embedding_hnsw\s+on swarm_lessons using hnsw\s*\(\s*embedding\s+vector_cosine_ops\s*\)/,
+  );
+});
+
+// ---------------------------------------------------------------------------
+// (3) swarm_hub_anchors — §3.2 mirror, with §5 range CHECKs
+// ---------------------------------------------------------------------------
+
+test("swarm_hub_anchors table exists with every required §3.2 column", () => {
+  assert.match(SQL, /create table if not exists swarm_hub_anchors\b/);
+  // Scope to this table: seven of its columns also exist in swarm_lessons.
+  const hub = tableBody("swarm_hub_anchors");
+  // §3.2 has no `id` field — it's a pointer, not data. Use a local
+  // BIGSERIAL for storage identity; do NOT expose this on the wire.
+  assert.match(hub, /\bid\s+bigserial\s+primary key\b/);
+  assert.match(hub, /embedding\s+vector\s*\(\s*768\s*\)\s+not null\b/);
+  // hub_score: §3.2 says 0..1. Range check is the analogue of the
+  // wire-validator's domain check.
+  assert.match(
+    hub,
+    /hub_score\s+real\s+not null\s+check\s*\(\s*hub_score\s*>=\s*0\s+and\s+hub_score\s*<=\s*1\s*\)/,
+  );
+  // local_memory_count: §3.2 says ≥ 1.
+  assert.match(
+    hub,
+    /local_memory_count\s+int\s+not null\s+check\s*\(\s*local_memory_count\s*>=\s*1\s*\)/,
+  );
+  // topic_label: §3.2 ≤ 256 chars, optional → §5 rule 12 (size cap).
+  assert.match(
+    hub,
+    /topic_label\s+text\s+check\s*\(\s*topic_label\s+is\s+null\s+or\s+octet_length\s*\(\s*topic_label\s*\)\s*<=\s*256\s*\)/,
+  );
+  assert.match(hub, /origin_node_id\s+text\s+not null\s+references nodes\s*\(\s*node_id\s*\)/);
+  assert.match(hub, /signed_at\s+timestamptz\s+not null\b/);
+  assert.match(hub, /signature\s+text\s+not null\b/);
+  assert.match(hub, /spec_version\s+text\s+not null\b/);
+  assert.match(hub, /received_at\s+timestamptz\s+not null\s+default\s+now\s*\(\s*\)/);
+  assert.match(hub, /signature_verified_at\s+timestamptz\b/);
+});
+
+test("swarm_hub_anchors has an HNSW embedding index", () => {
+  assert.match(
+    SQL,
+    /create index if not exists swarm_hub_anchors_embedding_hnsw\s+on swarm_hub_anchors using hnsw\s*\(\s*embedding\s+vector_cosine_ops\s*\)/,
+  );
+});
+
+// ---------------------------------------------------------------------------
+// (4) Acceptance-criteria probes (issue #88)
+// ---------------------------------------------------------------------------
+
+test("acceptance: cluster_size = 1 fails the CHECK (§5 rule 11 floor)", () => {
+  // Issue #88 acceptance: "Inserting a lesson with
+  // synthesized_from_cluster_size = 1 fails the check constraint."
+  // We can't run the INSERT, but we CAN pin the constraint that produces
+  // the runtime behaviour: a CHECK that mandates >= 2 on the column.
+  // CHECK ( ... < 2 ... ) would weaken the floor to "anything below"; we
+  // must see ">=" with a "2" RHS specifically.
+  assert.match(tableBody("swarm_lessons"), /check\s*\(\s*synthesized_from_cluster_size\s*>=\s*2\s*\)/);
+  assert.doesNotMatch(SQL, /synthesized_from_cluster_size\s*>=\s*1\s*\)/);
+});
+
+test("acceptance: signed_at < created_at fails the CHECK (§5 rule 9)", () => {
+  // Issue #88 acceptance: "Inserting a lesson with signed_at < created_at
+  // fails the check constraint." The single table-level CHECK enforces
+  // both halves of the inequality at once.
+  assert.match(tableBody("swarm_lessons"), /check\s*\(\s*signed_at\s*>=\s*created_at\s*\)/);
+});
+
+// ---------------------------------------------------------------------------
+// (5) Hard-rule guards
+// ---------------------------------------------------------------------------
+
+test("comment on table references docs/SWARM_SPEC.md for both new tables", () => {
+  // The `comment on table` clause is the in-DB pointer back to the spec.
+  // Drift between SQL and spec is the failure mode this catches.
+  // Each table's own comment literal must name the spec file: a file-wide
+  // reference count would still pass if one comment named it twice and the
+  // other not at all. `(?:[^']|'')*` keeps the match inside the literal.
+  assert.match(SQL, /comment on table swarm_lessons\s+is\s+'(?:[^']|'')*docs\/swarm_spec\.md/);
+  assert.match(
+    SQL,
+    /comment on table swarm_hub_anchors\s+is\s+'(?:[^']|'')*docs\/swarm_spec\.md/,
+  );
+});
+
+test("migration is create-only (no DROP / DELETE statements)", () => {
+  // Hard rule from issue #88: only CREATE / ALTER ADD. A stray DROP or
+  // DELETE would either pre-empt later schema work or wipe a swarm
+  // ingestion log. Comments are stripped above so prose like "no DROP" in
+  // the file header cannot trigger this guard.
+  assert.doesNotMatch(SQL, /\bdrop\s+(table|index|column|constraint|schema)\b/);
+  assert.doesNotMatch(SQL, /\bdelete\s+from\b/);
+});
+
+test("no TrustEdge / trust_edges table — §3.4 forbids trust on the wire", () => {
+  // SWARM_SPEC §3.4: "There is intentionally no HTTP endpoint that returns
+  // TrustEdge records." A separate trust_edges table would create the
+  // temptation to JOIN it into a wire response. Trust lives in the flat
+  // nodes.trust_weight / nodes.trust_reason columns instead.
+  // SQL is lowercased, so a table spelled `TrustEdge` shows up as
+  // `trustedge`; `trust_?edges?` covers that and the snake_case spellings.
+  assert.doesNotMatch(SQL, /create table[^;]+\btrust_?edges?\b/);
+});
diff --git a/supabase/migrations/071_swarm_storage.sql b/supabase/migrations/071_swarm_storage.sql
new file mode 100644
index 0000000..d72901e
--- /dev/null
+++ b/supabase/migrations/071_swarm_storage.sql
@@ -0,0 +1,117 @@
+-- 071_swarm_storage.sql — Swarm Phase 3d (issue #88)
+--
+-- Storage floor for the GET /swarm/lessons and GET /swarm/hubs endpoints
+-- (later Phase 3 / 4 issues), plus peer + local-trust columns on the
+-- existing `nodes` table from migration 070.
+--
+-- Schemas mechanically derived from docs/SWARM_SPEC.md:
+--   * swarm_lessons       ← §3.1 Lesson
+--   * swarm_hub_anchors   ← §3.2 HubAnchor
+--   * nodes (extended)    ← §3.3 NodeAdvertisement bookkeeping
+--                         + §3.4 TrustEdge (LOCAL ONLY — see below)
+--
+-- Trust intentionally lives in two new `nodes` columns (`trust_weight`,
+-- `trust_reason`) rather than a separate `trust_edges` table. SWARM_SPEC
+-- §3.4 is explicit: "A node MAY expose its trust list to its operator
+-- but MUST NOT expose it across the wire. There is intentionally no HTTP
+-- endpoint that returns TrustEdge records." A one-row-per-peer view in
+-- `nodes` satisfies the local-only contract and removes the temptation to
+-- ever JOIN it into a /swarm/* response.
+--
+-- Verfassung pillar 6 (Cyber security): every record in swarm_lessons and
+-- swarm_hub_anchors carries the producer's `signature` field. The wire-
+-- validator (PR #89) verifies on ingest; the DB enforces the size/range
+-- subset of the §5 rejection rules at the column level (defense in depth).
+--
+-- Index choice: HNSW on `vector_cosine_ops` to match every other embedding
+-- index in this repo (memories, experiences, lessons, soul_traits,
+-- intentions, people, stimuli — see migrations 002, 004, 015, 017, 022).
+-- The issue body referred to ivfflat as a placeholder; HNSW is the
+-- established pattern and keeps query plans uniform across vector tables.
+--
+-- This migration ONLY creates structure. No DROP, no DELETE, no data
+-- backfill. The endpoints that READ these tables ship in later Phase 3/4
+-- issues. Reed runs the migration manually after merge — the autonomy
+-- loop is forbidden from executing it.
+
+-- ---------------------------------------------------------------------------
+-- (1) Extend `nodes` (from migration 070) with peer + local-trust columns.
+-- All new columns are nullable or carry safe defaults so the bootstrap
+-- is_self row from phase 1b survives without backfill. `pubkey_b64url`
+-- mirrors the raw `pubkey` (BYTEA) column in the §3.3 wire encoding to
+-- avoid encoding it on every read; `endpoint_url` is the §3.3 https-url
+-- where the peer's /swarm/* endpoints live; `last_seen_at` records the
+-- last successful contact (set by the Phase 4 polling job, not here).
+-- `trust_weight` and `trust_reason` are the §3.4 TrustEdge as flat
+-- columns — never JOINed into a wire response.
+-- ---------------------------------------------------------------------------
+ALTER TABLE nodes
+  ADD COLUMN IF NOT EXISTS pubkey_b64url TEXT;  -- nullable; §3.3 wire-encoded mirror of raw `pubkey`
+ALTER TABLE nodes
+  ADD COLUMN IF NOT EXISTS endpoint_url TEXT;  -- nullable; where the peer's /swarm/* endpoints live
+ALTER TABLE nodes
+  ADD COLUMN IF NOT EXISTS last_seen_at TIMESTAMPTZ;  -- nullable; last successful contact with the peer
+ALTER TABLE nodes
+  ADD COLUMN IF NOT EXISTS trust_weight REAL NOT NULL DEFAULT 0.5;  -- existing rows get 0.5; NOTE(review): no range CHECK here — confirm whether §3.4 bounds the weight
+ALTER TABLE nodes
+  ADD COLUMN IF NOT EXISTS trust_reason TEXT;  -- nullable; free-form note, local only
+
+-- ---------------------------------------------------------------------------
+-- (2) swarm_lessons — incoming + locally re-served Lesson records (§3.1).
+-- All checks map to §5 rejection rules: rule 9 (signed_at >= created_at),
+-- rule 11 (synthesized_from_cluster_size >= 2 floor), rule 12 (content
+-- size cap, 8 KiB == 8192 octets).
+-- ---------------------------------------------------------------------------
+CREATE TABLE IF NOT EXISTS swarm_lessons (
+  id                             UUID PRIMARY KEY,  -- no DEFAULT: the id must be supplied on insert
+  content                        TEXT NOT NULL CHECK (octet_length(content) <= 8192),  -- §5 rule 12: at most 8192 octets (bytes)
+  embedding                      VECTOR(768) NOT NULL,  -- fixed at 768 dimensions
+  synthesized_from_cluster_size  INT NOT NULL CHECK (synthesized_from_cluster_size >= 2),  -- §5 rule 11 floor
+  origin_node_id                 TEXT NOT NULL REFERENCES nodes(node_id),  -- FK: the origin must already have a `nodes` row
+  signed_at                      TIMESTAMPTZ NOT NULL,  -- compared with created_at by the table-level CHECK
+  created_at                     TIMESTAMPTZ NOT NULL,  -- no DEFAULT: supplied on insert
+  signature                      TEXT NOT NULL,  -- producer's signature field
+  tags                           TEXT[],  -- nullable (optional)
+  spec_version                   TEXT NOT NULL,
+  received_at                    TIMESTAMPTZ NOT NULL DEFAULT NOW(),  -- defaults to insert time when omitted
+  signature_verified_at          TIMESTAMPTZ,  -- nullable; NULL means "not yet verified"
+  CHECK (signed_at >= created_at)  -- §5 rule 9
+);
+
+CREATE INDEX IF NOT EXISTS swarm_lessons_signed_at_idx
+  ON swarm_lessons (signed_at);  -- for filtering by signed_at (the ?since= parameter)
+CREATE INDEX IF NOT EXISTS swarm_lessons_origin_signed_at_idx
+  ON swarm_lessons (origin_node_id, signed_at);  -- per-origin lookups ordered by signed_at
+CREATE INDEX IF NOT EXISTS swarm_lessons_embedding_hnsw
+  ON swarm_lessons USING hnsw (embedding vector_cosine_ops);  -- cosine-distance similarity search
+
+COMMENT ON TABLE swarm_lessons IS
+  'Mycelium swarm: signed Lesson records received from peers and/or locally re-served. Schema mirrors docs/SWARM_SPEC.md §3.1. Column-level CHECKs enforce the size/range subset of the §5 rejection rules (defense in depth behind the wire-validator).';
+
+-- ---------------------------------------------------------------------------
+-- (3) swarm_hub_anchors — incoming + locally re-served HubAnchor records
+-- (§3.2). The wire spec defines no `id` field for HubAnchor (it's a pointer,
+-- not data); we use a local BIGSERIAL primary key for storage identity.
+-- A unique (origin_node_id, embedding) constraint is intentionally NOT
+-- added — comparing 768-dim vectors for equality is expensive and the
+-- producer is allowed to re-publish a refined centroid for the same hub.
+-- ---------------------------------------------------------------------------
+CREATE TABLE IF NOT EXISTS swarm_hub_anchors (
+  id                     BIGSERIAL PRIMARY KEY,  -- local storage key; not part of the wire record
+  embedding              VECTOR(768) NOT NULL,  -- fixed at 768 dimensions
+  hub_score              REAL NOT NULL CHECK (hub_score >= 0 AND hub_score <= 1),  -- closed range 0..1
+  local_memory_count     INT NOT NULL CHECK (local_memory_count >= 1),  -- at least one memory
+  topic_label            TEXT CHECK (topic_label IS NULL OR octet_length(topic_label) <= 256),  -- NOTE(review): octet_length counts bytes — confirm the §3.2 limit is bytes, not characters
+  origin_node_id         TEXT NOT NULL REFERENCES nodes(node_id),  -- FK: the origin must already have a `nodes` row
+  signed_at              TIMESTAMPTZ NOT NULL,
+  signature              TEXT NOT NULL,  -- producer's signature field
+  spec_version           TEXT NOT NULL,
+  received_at            TIMESTAMPTZ NOT NULL DEFAULT NOW(),  -- defaults to insert time when omitted
+  signature_verified_at  TIMESTAMPTZ  -- nullable; NULL means "not yet verified"
+);
+
+CREATE INDEX IF NOT EXISTS swarm_hub_anchors_embedding_hnsw
+  ON swarm_hub_anchors USING hnsw (embedding vector_cosine_ops);
+
+COMMENT ON TABLE swarm_hub_anchors IS
+  'Mycelium swarm: signed HubAnchor records (centroid + counts) received from peers. Schema mirrors docs/SWARM_SPEC.md §3.2. HubAnchors are pointers, not episode data — no content column.';