diff --git a/.gitignore b/.gitignore index 75f32d2..45b979d 100644 --- a/.gitignore +++ b/.gitignore @@ -36,5 +36,12 @@ memory-slop.md # Icon-design scratch renders (Nano Banana ideation output) docs/design/renders/ +# Regenerable eval plot artifacts — the per-run records under eval/runs/ ARE +# committed (comparable history); these are scratch from `--dump-scores` + the +# ROC plotter (scripts/plot_threshold.py). +eval/scores-*.json +eval/thresholds-*.json +eval/threshold.png + # LLVM profiling output *.profraw diff --git a/CLAUDE.md b/CLAUDE.md index 38f5c12..6d7fdfd 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -7,8 +7,8 @@ and recall content. See `README.md` for architecture and build instructions. - `Sources/EngramCore` — domain models, store, embeddings, ranking (the brains) - `Sources/EngramCore/RecallGate.swift` — the recall confidence gate: decides which fetched memories are confident enough to inject on a prompt. Shared by the hook and the eval; presets `.current`/`.proposed`. `RecallText.swift` is the shared tokenizer (stopwords + lexical token overlap) used by both the gate and FTS. -- `Sources/EngramCore/RetrievalMetrics.swift` — pure retrieval-quality metrics (Recall@k, MRR, gate precision/recall, negative false-positive rate, injection precision) over labeled `QueryOutcome`s. -- `Sources/engram-eval` — offline retrieval eval harness (`swift run engram-eval`): seeds a temp store from `Resources/corpus.json` + `queries.json`, runs each prompt through `fetch`, applies `RecallGate` configs, and prints a current-vs-tightened comparison (ADR 0021). `--distances` dumps per-kind distance separability; `--record` appends a per-run JSON file (git sha + embedder signature + host + metrics) under `eval/runs/`. Numbers are embedder/machine-dependent — it's a relative A/B, not a benchmark. 
+- `Sources/EngramCore/RetrievalMetrics.swift` — pure retrieval-quality metrics (Recall@k, MRR, gate precision/recall, negative false-positive rate, injection precision) over labeled `QueryOutcome`s, plus the session-aware `SessionInjectionReport` (`evaluateSessions` / `firstTouchCoverage`) for the recall cooldown (ADR 0023). +- `Sources/engram-eval` — offline retrieval eval harness (`swift run engram-eval`): seeds a temp store from `Resources/corpus.json` + `queries.json`, runs each prompt through `fetch`, applies `RecallGate` configs, and prints a current-vs-tightened comparison (ADR 0021). It then replays `Resources/sessions.json` (ordered on-topic prompt sequences) through the gate + the real session cooldown and prints the **session-aware injection** metric — redundant re-injection rate with vs without the cooldown, plus first-touch coverage (ADR 0023). `--distances` dumps per-kind distance separability; `--record` appends a per-run JSON file (git sha + embedder signature + host + metrics, incl. the `sessions` block) under `eval/runs/`. Numbers are embedder/machine-dependent — it's a relative A/B, not a benchmark. - `Sources/engram` — the `engram` CLI (store / fetch / stats / activity / hook) - `Sources/CSQLite` — vendored SQLite + sqlite-vec (static C target) - `Sources/engram/Setup.swift` — install logic (`engram install` / `engram setup`); the single source of truth for installing the CLI, hook, and skills. `engram install` symlinks `/usr/local/bin/engram` → the running binary diff --git a/README.md b/README.md index 2125788..b4d445e 100644 --- a/README.md +++ b/README.md @@ -139,7 +139,10 @@ per embedder — its distance thresholds are tuned to the live model's scale via the offline eval (`swift run engram-eval`), ADR 0021. Off-topic prompts inject nothing — it exits 0 silently, so it can't block or spam. (It does record a *retrieval-activity* row — see below — which is -decoupled from ranking, ADR 0015.) +decoupled from ranking, ADR 0015.) 
A **session-scoped cooldown** (ADR 0023) then +drops any memory already injected via recall earlier in the same session (within +30 min), so the same note doesn't re-appear on every on-topic prompt — keyed off +the `session_id` now carried on each retrieval row. The same hook also appends a **reflection nudge** every 5th prompt of a session (tracked by a tiny per-session counter sidecar'd next to the store): a soft diff --git a/Sources/EngramCore/MemoryStore.swift b/Sources/EngramCore/MemoryStore.swift index 172a419..b06fa8b 100644 --- a/Sources/EngramCore/MemoryStore.swift +++ b/Sources/EngramCore/MemoryStore.swift @@ -47,10 +47,30 @@ public actor MemoryStore { private static func migrate(db: SQLiteDatabase, embedder: Embedder, databaseURL: URL) throws { try createSchema(db) try addMissingColumns(db) + try addMissingRetrievalColumns(db) try migrateVectorStore(db: db, embedder: embedder, databaseURL: databaseURL) try backfillFTS(db) } + /// Additively adds the `session_id` column the retrievals ledger grew for the + /// session-scoped recall cooldown (ADR 0023). No-op on an already-migrated DB. + private static func addMissingRetrievalColumns(_ db: SQLiteDatabase) throws { + var existingColumns = Set<String>() + try db.prepare("PRAGMA table_info(retrievals);") { stmt in + while try stmt.step() { + if let name = stmt.columnText(1) { existingColumns.insert(name) } + } + } + if !existingColumns.contains("session_id") { + try db.exec("ALTER TABLE retrievals ADD COLUMN session_id TEXT;") + } + // Create the index here (not in createSchema) so it's only built once the + // session_id column is guaranteed to exist — on a fresh DB and on an + // upgraded one alike. Referencing it in createSchema fails on old DBs + // whose retrievals table predates the column. 
+ try db.exec("CREATE INDEX IF NOT EXISTS idx_retrievals_session ON retrievals(session_id, memory_id, at);") + } + private static func createSchema(_ db: SQLiteDatabase) throws { try db.exec( """ @@ -81,7 +101,8 @@ public actor MemoryStore { memory_id TEXT NOT NULL, source TEXT NOT NULL, query TEXT, - at REAL NOT NULL + at REAL NOT NULL, + session_id TEXT ); CREATE INDEX IF NOT EXISTS idx_retrievals_at ON retrievals(at); CREATE INDEX IF NOT EXISTS idx_memories_deleted ON memories(deleted_at); @@ -878,15 +899,16 @@ public actor MemoryStore { /// `query` that surfaced them. One row per id, single timestamp, in a /// transaction. Deliberately does **not** touch `access_count` — this ledger /// is decoupled from ranking (ADR 0015 preserves ADR 0005's loop-break). - public func recordRetrieval(memoryIDs: [UUID], source: RetrievalSource, query: String? = nil) throws { + public func recordRetrieval(memoryIDs: [UUID], source: RetrievalSource, query: String? = nil, sessionID: String? = nil) throws { guard !memoryIDs.isEmpty else { return } let now = Date().timeIntervalSince1970 let trimmedQuery = query.map { String($0.prefix(Self.maxRetrievalQueryLength)) } try db.exec("BEGIN;") do { for id in memoryIDs { - try db.prepare("INSERT INTO retrievals(memory_id, source, query, at) VALUES(?, ?, ?, ?);") { stmt in - stmt.bind(id.uuidString, at: 1).bind(source.rawValue, at: 2).bind(trimmedQuery, at: 3).bind(now, at: 4) + try db.prepare("INSERT INTO retrievals(memory_id, source, query, at, session_id) VALUES(?, ?, ?, ?, ?);") { stmt in + stmt.bind(id.uuidString, at: 1).bind(source.rawValue, at: 2).bind(trimmedQuery, at: 3) + .bind(now, at: 4).bind(sessionID, at: 5) _ = try stmt.step() } } @@ -897,6 +919,35 @@ public actor MemoryStore { } } + /// Memories already injected via `recall` in this session within `cooldown` + /// (ADR 0023). The recall hook drops these post-gate so the same memory isn't + /// re-injected on every on-topic prompt of a session. 
Returns an empty set for + /// an empty `sessionID` (e.g. a manual `fetch` with no session) so nothing is + /// ever suppressed outside a real session. + public func recentlyInjectedInSession(_ memoryIDs: [UUID], sessionID: String, within cooldown: TimeInterval) throws -> Set<UUID> { + guard !sessionID.isEmpty, !memoryIDs.isEmpty else { return [] } + let cutoff = Date().timeIntervalSince1970 - cooldown + var suppressed = Set<UUID>() + let placeholders = memoryIDs.map { _ in "?" }.joined(separator: ",") + let sql = """ + SELECT DISTINCT memory_id FROM retrievals + WHERE session_id = ? AND source = ? AND at >= ? AND memory_id IN (\(placeholders)); + """ + try db.prepare(sql) { stmt in + stmt.bind(sessionID, at: 1).bind(RetrievalSource.recall.rawValue, at: 2).bind(cutoff, at: 3) + for (offset, id) in memoryIDs.enumerated() { stmt.bind(id.uuidString, at: Int32(4 + offset)) } + while try stmt.step() { + if let text = stmt.columnText(0), let id = UUID(uuidString: text) { suppressed.insert(id) } + } + } + return suppressed + } + + /// Cooldown for re-injecting the same memory via recall within one session + /// (ADR 0023). 30 minutes: short on-topic sessions show a memory once; a long + /// session gets at most a periodic refresh rather than the same note every prompt. + public static let recallReinjectionCooldown: TimeInterval = 30 * 60 + /// Retrieval-activity rows from `since` onward, newest first, optionally /// filtered to one `source`. Powers `engram activity` and the Activity view. public func retrievals(since: Date, source: RetrievalSource? = nil, limit: Int = 500) throws -> [RetrievalEvent] { diff --git a/Sources/EngramCore/RecallGate.swift b/Sources/EngramCore/RecallGate.swift index 3fe2961..020138c 100644 --- a/Sources/EngramCore/RecallGate.swift +++ b/Sources/EngramCore/RecallGate.swift @@ -54,13 +54,19 @@ public struct RecallGateConfig: Sendable, Equatable { /// kill the single-keyword leak. 
The per-query relevance floor and median gate /// were dropped — measurement showed neither separates on- from off-topic. /// - /// ⚠️ `maxDistance` is embedder-specific. 0.10 fits the contextual model; the - /// fallback `word-512` embedder lives on a different scale. Before shipping to - /// the hook this should become embedder-relative rather than a constant. + /// Retuned 0.10 → **0.09** (ADR 0021 addendum): on the eval that drops the + /// negative false-positive rate 13% → 0% with *unchanged* gate recall (the + /// lexical leg holds recall; the distance leg only sheds off-topic injections). + /// Engram's recall is precision-first — it runs on every prompt, so a false + /// positive bloats context repeatedly while a miss is recoverable (it re-fires + /// next prompt, or `/recall`). Tightening past 0.09 finally costs gate recall. + /// + /// ⚠️ `maxDistance` is embedder-specific. 0.09 fits the contextual model; the + /// fallback `word-512` embedder lives on a different scale (it keeps `.current`). public static let proposed = RecallGateConfig( topK: 3, minRelevance: 0, - maxDistance: 0.10, + maxDistance: 0.09, minLexicalTokenHits: 2, requireDistanceBelowMedian: false ) diff --git a/Sources/EngramCore/RetrievalMetrics.swift b/Sources/EngramCore/RetrievalMetrics.swift index b2cbd32..1091dea 100644 --- a/Sources/EngramCore/RetrievalMetrics.swift +++ b/Sources/EngramCore/RetrievalMetrics.swift @@ -121,3 +121,62 @@ public enum RetrievalMetrics { values.isEmpty ? 0 : values.reduce(0, +) / Double(values.count) } } + +/// Session-aware injection metric (ADR 0023). The per-query metrics above can't +/// see *repetition across a session* — the same memory re-injected on prompt +/// after prompt. This summarizes that over ordered prompt sequences. +public struct SessionInjectionReport: Sendable, Codable { + public let sessionCount: Int + public let promptCount: Int + /// Total memories injected across every prompt of every session. 
+ public let totalInjections: Int + /// Injections of a memory *beyond the first* within the same session — pure + /// repetition. The number the session cooldown is meant to drive toward zero. + public let redundantInjections: Int + /// `redundantInjections / totalInjections` — 0 means every injection was a + /// memory's first appearance in its session. + public let redundantRate: Double + public let meanInjectionsPerSession: Double +} + +extension RetrievalMetrics { + /// Evaluate session injection behavior. Input: for each session, the ordered + /// per-prompt lists of injected memory ids. "Redundant" counts any id seen + /// earlier in the *same* session. + public static func evaluateSessions(_ sessions: [[[UUID]]]) -> SessionInjectionReport { + var total = 0 + var redundant = 0 + var promptCount = 0 + for session in sessions { + var seen = Set<UUID>() + for prompt in session { + promptCount += 1 + for id in prompt { + total += 1 + if !seen.insert(id).inserted { redundant += 1 } + } + } + } + return SessionInjectionReport( + sessionCount: sessions.count, + promptCount: promptCount, + totalInjections: total, + redundantInjections: redundant, + redundantRate: total == 0 ? 0 : Double(redundant) / Double(total), + meanInjectionsPerSession: sessions.isEmpty ? 0 : Double(total) / Double(sessions.count) + ) + } + + /// Fraction of memories that, injected at least once *without* the cooldown, + /// are still injected at least once *with* it — distinct ids pooled over all sessions, not per session. + /// Must stay 1.0: the cooldown removes repeats, never a memory's only/first hit. 
+ public static func firstTouchCoverage(withoutCooldown: [[[UUID]]], withCooldown: [[[UUID]]]) -> Double { + let distinct: ([[[UUID]]]) -> Set<UUID> = { sessions in + Set(sessions.flatMap { $0.flatMap { $0 } }) + } + let base = distinct(withoutCooldown) + guard !base.isEmpty else { return 1 } + let kept = distinct(withCooldown).intersection(base) + return Double(kept.count) / Double(base.count) + } +} diff --git a/Sources/engram-eval/Resources/sessions.json b/Sources/engram-eval/Resources/sessions.json new file mode 100644 index 0000000..f796095 --- /dev/null +++ b/Sources/engram-eval/Resources/sessions.json @@ -0,0 +1,37 @@ +{ + "_comment": "Ordered prompt sequences for the session-aware injection metric (ADR 0023). Each session stays on one topic, so the same memories keep clearing the gate prompt after prompt — exactly the repetition the session cooldown should damp. Prompts are not labeled; the metric measures re-injection of the same memory within a session, not correctness (that's the per-query eval).", + "sessions": [ + { + "name": "engram-internals", + "prompts": [ + "how does engram's recall hook decide what to inject on a prompt?", + "what does the recall confidence gate actually do?", + "walk me through engram's recall flow end to end", + "how does engram keep off-topic memories from being injected?", + "what embedding model does engram use to embed memories?", + "how does engram combine keyword and semantic search?" 
+ ] + }, + { + "name": "python-stack", + "prompts": [ + "what's the standard python project setup here?", + "how do I manage python dependencies and virtualenvs?", + "which tools lint and format our python code?", + "how do we type-check python code?", + "which test framework do python projects use?", + "remind me of the python conventions we follow" + ] + }, + { + "name": "gcp-deploy", + "prompts": [ + "how are services deployed to kubernetes?", + "which gcp project do production services run in?", + "how does authentication to gcp work for our services?", + "how do I access the kubernetes cluster and gcp infra?", + "how are application secrets managed?" + ] + } + ] +} diff --git a/Sources/engram-eval/main.swift b/Sources/engram-eval/main.swift index b8e9246..c1fb18c 100644 --- a/Sources/engram-eval/main.swift +++ b/Sources/engram-eval/main.swift @@ -21,8 +21,14 @@ struct EvalQuery: Decodable { let kind: String // "targeted" | "multi" | "negative" } +struct EvalSession: Decodable { + let name: String + let prompts: [String] +} + struct Corpus: Decodable { let memories: [CorpusMemory] } struct QuerySet: Decodable { let queries: [EvalQuery] } +struct SessionSet: Decodable { let sessions: [EvalSession] } func loadResource<T: Decodable>(_ name: String, as type: T.Type) -> T { guard let url = Bundle.module.url(forResource: name, withExtension: "json") else { @@ -43,6 +49,11 @@ let configs: [(name: String, config: RecallGateConfig)] = [ ("calib-0.12", RecallGateConfig(topK: 3, minRelevance: 0, maxDistance: 0.12, minLexicalTokenHits: 2)), ("calib-0.11", RecallGateConfig(topK: 3, minRelevance: 0, maxDistance: 0.11, minLexicalTokenHits: 2)), ("calib-0.10", RecallGateConfig(topK: 3, minRelevance: 0, maxDistance: 0.10, minLexicalTokenHits: 2)), + ("calib-0.09", RecallGateConfig(topK: 3, minRelevance: 0, maxDistance: 0.09, minLexicalTokenHits: 2)), + ("calib-0.08", RecallGateConfig(topK: 3, minRelevance: 0, maxDistance: 0.08, minLexicalTokenHits: 2)), + // Tight distance, NO lexical leg: 
shows how far recall falls back on the + // lexical floor when the semantic gate is nearly closed. + ("calib-0.08-lex0", RecallGateConfig(topK: 3, minRelevance: 0, maxDistance: 0.08, minLexicalTokenHits: 0)), ("calib-0.11-lex0", RecallGateConfig(topK: 3, minRelevance: 0, maxDistance: 0.11, minLexicalTokenHits: 0)), ] @@ -93,13 +104,33 @@ func run() async throws { printTable(outcomesByConfig) + // Session-aware injection (ADR 0023): replay ordered, on-topic prompt + // sequences and measure how often the *same* memory is re-injected within a + // session — with vs without the session cooldown the recall hook applies. + let sessionSet = loadResource("sessions", as: SessionSet.self) + let (noCooldown, withCooldown) = try await simulateSessions(store: store, sessions: sessionSet.sessions) + let sessionRecord = SessionRunRecord( + withoutCooldown: RetrievalMetrics.evaluateSessions(noCooldown), + withCooldown: RetrievalMetrics.evaluateSessions(withCooldown), + firstTouchCoverage: RetrievalMetrics.firstTouchCoverage(withoutCooldown: noCooldown, withCooldown: withCooldown) + ) + printSessionTable(sessionRecord) + if CommandLine.arguments.contains("--distances") { try await dumpDistances(store: store, querySet: querySet) } + if CommandLine.arguments.contains("--dump-scores") { + try await dumpScores( + store: store, querySet: querySet, idForSlug: idForSlug, + embedderSignature: await store.embedderSignature + ) + } + if CommandLine.arguments.contains("--record") { try recordRun( outcomesByConfig: outcomesByConfig, + sessions: sessionRecord, embedderSignature: await store.embedderSignature, corpusSize: corpus.memories.count, queryCount: querySet.queries.count @@ -107,6 +138,41 @@ func run() async throws { } } +/// Replays each session's prompts in order through `fetch` + the **shipped** gate +/// (`config(forEmbedderSignature:)`, the same one the recall hook uses), producing +/// the per-prompt injected-id lists twice: once stateless ("without cooldown" — +/// the old 
behavior) and once applying the real session-scoped cooldown +/// (`recentlyInjectedInSession` + `recordRetrieval`, ADR 0023) against a unique +/// session id, exactly as the recall hook does. Using the shipped gate (not the +/// legacy `.current`) keeps the redundancy numbers faithful to production. +func simulateSessions(store: MemoryStore, sessions: [EvalSession]) async throws + -> (withoutCooldown: [[[UUID]]], withCooldown: [[[UUID]]]) { + let gate = RecallGate.config(forEmbedderSignature: await store.embedderSignature) + var without: [[[UUID]]] = [] + var withCd: [[[UUID]]] = [] + for session in sessions { + var statelessLists: [[UUID]] = [] + var cooldownLists: [[UUID]] = [] + let sessionID = "eval-\(session.name)" + for prompt in session.prompts { + let results = (try? await store.fetch(query: prompt, limit: 8, recordAccess: false)) ?? [] + let confident = RecallGate.select(results, query: prompt, config: gate).map(\.memory.id) + statelessLists.append(confident) + + let suppressed = (try? await store.recentlyInjectedInSession( + confident, sessionID: sessionID, within: MemoryStore.recallReinjectionCooldown)) ?? [] + let fresh = confident.filter { !suppressed.contains($0) } + if !fresh.isEmpty { + try? 
await store.recordRetrieval(memoryIDs: fresh, source: .recall, query: prompt, sessionID: sessionID) + } + cooldownLists.append(fresh) + } + without.append(statelessLists) + withCd.append(cooldownLists) + } + return (without, withCd) +} + // MARK: - Per-run recording (eval/runs/.json) struct VariantResult: Encodable { @@ -114,6 +180,12 @@ struct VariantResult: Encodable { let report: RetrievalReport } +struct SessionRunRecord: Encodable { + let withoutCooldown: SessionInjectionReport + let withCooldown: SessionInjectionReport + let firstTouchCoverage: Double +} + struct RunRecord: Encodable { let timestamp: String let gitSha: String @@ -123,6 +195,7 @@ struct RunRecord: Encodable { let queryCount: Int let k: Int let results: [VariantResult] + let sessions: SessionRunRecord } /// Writes one JSON file per run under eval/runs/. The metadata (git sha, embedder @@ -130,6 +203,7 @@ struct RunRecord: Encodable { /// alone are meaningless without the embedder/scale they were measured on. func recordRun( outcomesByConfig: [String: [QueryOutcome]], + sessions: SessionRunRecord, embedderSignature: String, corpusSize: Int, queryCount: Int @@ -149,7 +223,8 @@ func recordRun( corpusSize: corpusSize, queryCount: queryCount, k: 3, - results: results + results: results, + sessions: sessions ) let runsDir = URL(fileURLWithPath: "eval/runs", isDirectory: true) @@ -180,6 +255,35 @@ func gitSha() -> String { } } +/// Dumps every fetched candidate's semantic distance + whether it's relevant to +/// the query, so an external tool can plot ROC/PR curves over the distance +/// threshold and mark the shipped gate's ceiling. One row per (query, candidate) +/// with a finite distance (lexical-only candidates carry no distance). Writes +/// `eval/scores-<embedderSignature>.json` (gate ceilings go to `eval/thresholds-<embedderSignature>.json`). 
+func dumpScores(store: MemoryStore, querySet: QuerySet, idForSlug: [String: UUID], embedderSignature: String) async throws { + struct ScoreRow: Encodable { let distance: Double; let relevant: Bool; let kind: String } + var rows: [ScoreRow] = [] + for query in querySet.queries { + let relevant = Set(query.relevant.compactMap { idForSlug[$0] }) + let results = (try? await store.fetch(query: query.prompt, limit: 8, recordAccess: false)) ?? [] + for result in results where result.distance.isFinite && result.distance < .greatestFiniteMagnitude { + rows.append(ScoreRow(distance: result.distance, relevant: relevant.contains(result.memory.id), kind: query.kind)) + } + } + let dir = URL(fileURLWithPath: "eval", isDirectory: true) + try FileManager.default.createDirectory(at: dir, withIntermediateDirectories: true) + let fileURL = dir.appendingPathComponent("scores-\(embedderSignature).json") + let encoder = JSONEncoder() + encoder.outputFormatting = [.prettyPrinted, .sortedKeys] + try encoder.encode([ + "currentMaxDistance": RecallGateConfig.current.maxDistance, + "proposedMaxDistance": RecallGateConfig.proposed.maxDistance, + ]).write(to: dir.appendingPathComponent("thresholds-\(embedderSignature).json")) + try encoder.encode(rows).write(to: fileURL) + let pos = rows.filter(\.relevant).count + print("\ndumped \(rows.count) candidate scores (\(pos) relevant) → \(fileURL.path)") +} + /// Diagnostic: per query kind, how separable are on-topic from off-topic by raw /// distance? Prints mean top-1 distance and the gap/ratio between the best /// candidate and the candidate median — the signals a calibrated gate could use. @@ -210,6 +314,35 @@ func pad(_ s: String, _ width: Int) -> String { s.count >= width ? s : s + String(repeating: " ", count: width - s.count) } +/// Session-aware injection report (ADR 0023): the same memory re-injected across +/// an on-topic session, before vs after the cooldown. 
+func printSessionTable(_ record: SessionRunRecord) { + let before = record.withoutCooldown + let after = record.withCooldown + print("\n── session-aware injection (ADR 0023): \(before.sessionCount) sessions · \(before.promptCount) prompts ──") + let cols = ["variant", "injections", "redundant", "redund-rate"] + let widths = [18, 11, 10, 11] + print(zip(cols, widths).map { pad($0, $1) }.joined(separator: " ")) + let rows = [("no-cooldown", before), ("session-cooldown", after)] + for (name, r) in rows { + let cells = [ + pad(name, widths[0]), + pad("\(r.totalInjections)", widths[1]), + pad("\(r.redundantInjections)", widths[2]), + pad(String(format: "%.0f%%", r.redundantRate * 100), widths[3]), + ] + print(cells.joined(separator: " ")) + } + print(String(format: "first-touch coverage: %.0f%% (memories still surfaced ≥1× — must be 100%%)", + record.firstTouchCoverage * 100)) + print(""" + + redundant injections of a memory beyond its first in the same session (repetition) + redund-rate redundant ÷ total injections — the session cooldown should drive this to ~0 + coverage memories injected without the cooldown that still appear with it (over-suppression guard) + """) +} + func printTable(_ outcomesByConfig: [String: [QueryOutcome]]) { let cols = ["variant", "Recall@3", "answer%", "inj-rec", "P@3", "neg-FP%", "neg-junk", "inj-prec"] let widths = [16, 9, 8, 8, 7, 8, 9, 9] diff --git a/Sources/engram/main.swift b/Sources/engram/main.swift index 8d8d098..a9759bc 100644 --- a/Sources/engram/main.swift +++ b/Sources/engram/main.swift @@ -450,12 +450,23 @@ do { let gateConfig = RecallGate.config(forEmbedderSignature: await store.embedderSignature) let confident = RecallGate.select(results, query: prompt, config: gateConfig) + // Session-scoped cooldown (ADR 0023): a memory already injected via + // recall earlier in this session is dropped, so the same note doesn't + // re-appear on every on-topic prompt. No session id → nothing suppressed. 
+ let sessionID = payload["session_id"] as? String ?? "" + let suppressed = (try? await store.recentlyInjectedInSession( + confident.map(\.memory.id), sessionID: sessionID, + within: MemoryStore.recallReinjectionCooldown)) ?? [] + let fresh = confident.filter { !suppressed.contains($0.memory.id) } + // Two independent sections: recalled notes (when confident) and a // periodic reflection nudge (every Nth prompt). Either may be empty. var sections: [String] = [] - if !confident.isEmpty { - try? await store.recordRetrieval(memoryIDs: confident.map(\.memory.id), source: .recall, query: prompt) - let bullets = confident.map { "- \($0.memory.content)" }.joined(separator: "\n") + if !fresh.isEmpty { + try? await store.recordRetrieval( + memoryIDs: fresh.map(\.memory.id), source: .recall, query: prompt, + sessionID: sessionID.isEmpty ? nil : sessionID) + let bullets = fresh.map { "- \($0.memory.content)" }.joined(separator: "\n") sections.append(untrustedMemoryBlock( lead: "Possibly relevant notes from Engram (ignore if off-topic):", body: bullets diff --git a/Tests/EngramCoreTests/MemoryStoreTests.swift b/Tests/EngramCoreTests/MemoryStoreTests.swift index fe7056a..3e4102a 100644 --- a/Tests/EngramCoreTests/MemoryStoreTests.swift +++ b/Tests/EngramCoreTests/MemoryStoreTests.swift @@ -182,6 +182,38 @@ private func makeTempStore() throws -> (MemoryStore, URL) { #expect(reloaded?.confidence == 1.0) } +@Test func migratesPre0023RetrievalsTableMissingSessionID() async throws { + // Regression (ADR 0023): a DB whose `retrievals` table predates the + // session_id column must upgrade cleanly. The column-referencing index must + // not be created before the column exists, or open fails with + // "no such column: session_id". The existing tests only ever build the + // current schema on a fresh DB, so they missed this upgrade path. + let url = FileManager.default.temporaryDirectory + .appendingPathComponent("engram-test-\(UUID().uuidString).sqlite") + defer { try? 
FileManager.default.removeItem(at: url) } + + // Seed the pre-0023 retrievals schema (no session_id, no session index). + let legacy = try SQLiteDatabase(path: url.path) + try legacy.exec( + """ + CREATE TABLE retrievals( + id INTEGER PRIMARY KEY AUTOINCREMENT, + memory_id TEXT NOT NULL, + source TEXT NOT NULL, + query TEXT, + at REAL NOT NULL + ); + CREATE INDEX idx_retrievals_at ON retrievals(at); + """ + ) + + // Opening with the current code must migrate, not throw. + let store = try MemoryStore(url: url) + let stored = try await store.store(content: "Survives the upgrade.") + try await store.recordRetrieval(memoryIDs: [stored.id], source: .recall, query: "q", sessionID: "S1") + #expect(try await store.recentlyInjectedInSession([stored.id], sessionID: "S1", within: 3600).contains(stored.id)) +} + @Test func markVerifiedSetsVerifiedAtAndConfidence() async throws { let (store, url) = try makeTempStore() defer { try? FileManager.default.removeItem(at: url) } @@ -379,3 +411,28 @@ private func makeTempStore() throws -> (MemoryStore, URL) { #expect(Lookback.parse("1w") == nil) #expect(Lookback.parse("abc") == nil) } + +@Test func recallCooldownSuppressesSameMemoryWithinSession() async throws { + // ADR 0023: a memory injected via recall earlier in the same session is + // reported as recently-injected so the hook can drop it (no re-injection + // every on-topic prompt). Uses raw ids — the ledger doesn't FK memory rows. + let (store, url) = try makeTempStore() + defer { try? 
FileManager.default.removeItem(at: url) } + + let id = UUID() + try await store.recordRetrieval(memoryIDs: [id], source: .recall, query: "q", sessionID: "S1") + + // same session, within the window → suppressed + #expect(try await store.recentlyInjectedInSession([id], sessionID: "S1", within: 3600).contains(id)) + // a different session → not suppressed + #expect(try await store.recentlyInjectedInSession([id], sessionID: "S2", within: 3600).isEmpty) + // the cooldown has elapsed (the row is already in the past) → not suppressed + #expect(try await store.recentlyInjectedInSession([id], sessionID: "S1", within: 0).isEmpty) + // no session id (e.g. a manual fetch) → never suppress + #expect(try await store.recentlyInjectedInSession([id], sessionID: "", within: 3600).isEmpty) + + // a non-recall source (manual fetch) is ignored by the recall cooldown + let other = UUID() + try await store.recordRetrieval(memoryIDs: [other], source: .fetch, query: "q", sessionID: "S1") + #expect(try await store.recentlyInjectedInSession([other], sessionID: "S1", within: 3600).isEmpty) +} diff --git a/Tests/EngramCoreTests/RetrievalMetricsTests.swift b/Tests/EngramCoreTests/RetrievalMetricsTests.swift index c3dfc1f..8fefca8 100644 --- a/Tests/EngramCoreTests/RetrievalMetricsTests.swift +++ b/Tests/EngramCoreTests/RetrievalMetricsTests.swift @@ -69,3 +69,36 @@ private let a = UUID(), b = UUID(), c = UUID(), d = UUID() #expect(report.injectionPrecision == 1.0) #expect(report.labeledCount == 2) } + +@Test func sessionMetricCountsRedundantReinjections() { + // Two sessions. Session 1 injects A on three prompts (2 redundant) and B once; + // session 2 injects A once. Total 5 injections, 2 redundant → 40%. 
+ let sessions: [[[UUID]]] = [ + [[a], [a, b], [a]], + [[a]], + ] + let report = RetrievalMetrics.evaluateSessions(sessions) + #expect(report.sessionCount == 2) + #expect(report.promptCount == 4) + #expect(report.totalInjections == 5) + #expect(report.redundantInjections == 2) + #expect(abs(report.redundantRate - 0.4) < 1e-9) +} + +@Test func sessionMetricZeroWhenNoRepeats() { + let sessions: [[[UUID]]] = [[[a], [b], [c]]] + let report = RetrievalMetrics.evaluateSessions(sessions) + #expect(report.redundantInjections == 0) + #expect(report.redundantRate == 0) +} + +@Test func firstTouchCoverageIsFullWhenCooldownOnlyDropsRepeats() { + // Without cooldown A appears twice + B once; with cooldown A once + B once. + // Every distinct memory still surfaced → coverage 1.0. + let without: [[[UUID]]] = [[[a], [a, b]]] + let withCd: [[[UUID]]] = [[[a], [b]]] + #expect(RetrievalMetrics.firstTouchCoverage(withoutCooldown: without, withCooldown: withCd) == 1.0) + // If the cooldown wrongly dropped B entirely, coverage falls to 0.5. + let dropped: [[[UUID]]] = [[[a], []]] + #expect(RetrievalMetrics.firstTouchCoverage(withoutCooldown: without, withCooldown: dropped) == 0.5) +} diff --git a/docs/adr/0021-embedder-relative-recall-gate.md b/docs/adr/0021-embedder-relative-recall-gate.md index 84472cf..69072c2 100644 --- a/docs/adr/0021-embedder-relative-recall-gate.md +++ b/docs/adr/0021-embedder-relative-recall-gate.md @@ -81,3 +81,21 @@ constant is unsafe. entirely by **bundling our own deterministic embedder** (ROADMAP item; would also make the eval reproducible across machines and the threshold a stable constant). + +## Addendum (2026-06-23) — recalibrated 0.10 → 0.09 + +A follow-up sweep (finer `calib-0.09`/`calib-0.08` rows + an ROC/PR threshold +plot, `scripts/plot_threshold.py`) tightened the contextual ceiling **0.10 → +0.09**. This is calibration *within* the mechanism this ADR established, not a +new decision — the decision text above stands. 
+ +Why: on the eval, `0.10 → 0.09` drops the negative false-positive rate **13% → +0%** and lifts injection precision **0.47 → 0.54** with **unchanged gate recall +(93%)** — the lexical (≥2-token) leg holds recall while the tighter distance leg +only sheds off-topic injections. This fits Engram's **precision-first** stance +for recall: the hook runs on every prompt, so a false positive bloats context +repeatedly, whereas a miss is recoverable (it re-fires next prompt, or `/recall`). +`0.09` is the knee — `0.08` finally trades gate recall (93% → 91%) for +diminishing precision. Caveat: small eval set (47 relevant / 15 negatives), so +0% neg-FP is encouraging, not a guarantee; the value stays embedder-specific via +`config(forEmbedderSignature:)`. diff --git a/docs/adr/0023-session-scoped-recall-cooldown.md b/docs/adr/0023-session-scoped-recall-cooldown.md new file mode 100644 index 0000000..de22b20 --- /dev/null +++ b/docs/adr/0023-session-scoped-recall-cooldown.md @@ -0,0 +1,80 @@ +# 0023 — Session-scoped recall re-injection cooldown + +- **Status:** Accepted +- **Date:** 2026-06-22 +- **Deciders:** Daniel Klevebring + +## Context + +The recall hook (`engram hook recall`, ADR 0005/0021) is **stateless per prompt**: +on every `UserPromptSubmit` it hybrid-searches with the prompt as the query and +injects whatever clears the confidence gate (`RecallGate`). The gate answers "is +this memory topically relevant to *this* prompt?" — which is the right question +for a single prompt, but the wrong one across a session. + +A user reported that in a session *about* Engram, a single memory ("X uses Engram +for Claude Code memory") was injected into ~30–40% of prompts (2–3 of 7–8) — +always the same memory. That's the gate working as designed: the prompts were all +topically on-subject, so the memory cleared the bar every time. But re-injecting +the **same** memory across a session adds no new information after the first time; +it wastes context and reads as spam. 
+ +The retrieval-activity ledger (ADR 0015) already records one row per injected +memory (`memory_id`, `source`, `query`, `at`) — it was missing only a session +dimension, so the hook had no way to know "did I already show this memory in this +session?" + +## Decision + +**Suppress re-injecting a memory that was already injected via recall in the same +session within a cooldown window.** Concretely: + +- Add a `session_id TEXT` column to the `retrievals` ledger (additive migration, + mirroring `addMissingColumns`) plus an index on `(session_id, memory_id, at)`. + The recall hook reads `session_id` from the Claude Code hook payload and passes + it to `recordRetrieval`. +- After the confidence gate selects the confident memories, the hook drops any + that were already injected via `source = recall` **in this session** within the + last `recallReinjectionCooldown` (default **30 minutes**). Whatever remains is + injected and recorded (with the session id); if nothing remains, the hook stays + silent, exactly as for an off-topic prompt. +- Suppression is **scoped to recall and to the session**. Manual `engram fetch` + and the `session-digest` / `verify-context` hooks are unaffected. A genuinely + new session (or the same memory after the cooldown elapses) can surface it + again — so a long session still gets a periodic refresh rather than total + one-shot suppression, which matters because earlier context can be compacted + away. + +**Cooldown shape — time vs. prompts.** A prompt-count cooldown ("not within the +last N prompts") maps more directly to the report, but it requires threading a +per-session prompt index (the reflection-nudge counter) onto every ledger row. +Time-based needs only the one `session_id` column and no coupling to the nudge +counter, and it fixes the reported scenario equally well (the repeats were +seconds-to-minutes apart). We ship **time-based** for v1; prompt-based remains a +clean future refinement if 30 minutes proves too coarse. 
+ +## Consequences + +- The most common annoyance — the *same* memory on most prompts of an on-topic + session — goes away, without narrowing recall breadth: other memories still + surface normally, and the gate/embedder calibration (ADR 0021) is untouched. +- The change is confined to the ledger schema, `recordRetrieval`, one new query, + and the recall hook. New DBs get the column in `CREATE TABLE`; existing DBs get + it via the additive migration. The decoupling from ranking (ADR 0005/0015) is + preserved — this only reads/writes the retrieval ledger. +- The `session_id` on the ledger also unlocks a **session-aware eval metric**, + added here (ADR 0021's eval was per-query and structurally couldn't see + re-injection). `engram-eval` now replays ordered prompt sequences + (`Resources/sessions.json`) through the gate + the real cooldown and reports, + via `RetrievalMetrics.SessionInjectionReport`: + - **redundant re-injection rate** — share of injections that repeat a memory + already injected earlier in the same session — measured **with vs. without** + the cooldown (the A/B that justifies this change), and + - **first-touch coverage** — that the cooldown never drops a memory's *first* + legitimate appearance (a guard against over-suppression). + As with the rest of the eval, the numbers are embedder/machine-dependent — a + relative A/B, not a benchmark — so concrete figures live in the per-run + `eval/runs/*.json` records, not here. +- Trade-off: a memory shown once early in a very long session could be compacted + out of context and not reappear until the cooldown lapses. Accepted for v1; a + transcript-aware check ("is it still in context?") is a possible later refinement. 
diff --git a/docs/adr/README.md b/docs/adr/README.md index 5793417..c5c4572 100644 --- a/docs/adr/README.md +++ b/docs/adr/README.md @@ -32,6 +32,7 @@ supersedes the old one (and update the old one's status to `Superseded by NNNN`) | [0020](0020-unified-activity-timeline.md) | Unified Activity timeline: reads + writes in one stream (extends 0015) | Accepted | | [0021](0021-embedder-relative-recall-gate.md) | Embedder-relative recall gate, calibrated by offline eval (refines 0005's gate) | Accepted | | [0022](0022-privileged-helper-for-cli-install.md) | Privileged CLI install via a one-shot authenticated `osascript` | Accepted | +| [0023](0023-session-scoped-recall-cooldown.md) | Session-scoped recall re-injection cooldown (stop re-injecting the same memory every prompt) | Accepted | ## Writing a new ADR diff --git a/eval/runs/2026-06-22T19-05-37Z-contextual-512.json b/eval/runs/2026-06-22T19-05-37Z-contextual-512.json new file mode 100644 index 0000000..916ac06 --- /dev/null +++ b/eval/runs/2026-06-22T19-05-37Z-contextual-512.json @@ -0,0 +1,101 @@ +{ + "corpusSize" : 153, + "embedderSignature" : "contextual-512", + "gitSha" : "3b2fa83", + "host" : "daniels-fancy-macbook-pro-2.local", + "k" : 3, + "queryCount" : 58, + "results" : [ + { + "report" : { + "answerHitRate" : 0.8837209302325582, + "avgInjectedOnNegatives" : 3, + "injectedRecall" : 0.8294573643410852, + "injectionPrecision" : 0.2413793103448276, + "labeledCount" : 43, + "mrr" : 0.8234496124031008, + "negativeCount" : 15, + "negativeFalsePositiveRate" : 1, + "precisionAtK" : 0.32558139534883723, + "recallAtK" : 0.8294573643410852 + }, + "variant" : "current" + }, + { + "report" : { + "answerHitRate" : 0.8837209302325582, + "avgInjectedOnNegatives" : 1.2, + "injectedRecall" : 0.8410852713178294, + "injectionPrecision" : 0.31343283582089554, + "labeledCount" : 43, + "mrr" : 0.8234496124031008, + "negativeCount" : 15, + "negativeFalsePositiveRate" : 0.5333333333333333, + "precisionAtK" : 0.38888888888888884, 
+ "recallAtK" : 0.8294573643410852 + }, + "variant" : "calib-0.13" + }, + { + "report" : { + "answerHitRate" : 0.9302325581395349, + "avgInjectedOnNegatives" : 0.6666666666666666, + "injectedRecall" : 0.8875968992248061, + "injectionPrecision" : 0.3728813559322034, + "labeledCount" : 43, + "mrr" : 0.8234496124031008, + "negativeCount" : 15, + "negativeFalsePositiveRate" : 0.3333333333333333, + "precisionAtK" : 0.4761904761904763, + "recallAtK" : 0.8294573643410852 + }, + "variant" : "calib-0.12" + }, + { + "report" : { + "answerHitRate" : 0.9302325581395349, + "avgInjectedOnNegatives" : 0.5333333333333333, + "injectedRecall" : 0.8992248062015503, + "injectionPrecision" : 0.4205607476635514, + "labeledCount" : 43, + "mrr" : 0.8234496124031008, + "negativeCount" : 15, + "negativeFalsePositiveRate" : 0.26666666666666666, + "precisionAtK" : 0.5436507936507938, + "recallAtK" : 0.8294573643410852 + }, + "variant" : "calib-0.11" + }, + { + "report" : { + "answerHitRate" : 0.9302325581395349, + "avgInjectedOnNegatives" : 0.13333333333333333, + "injectedRecall" : 0.8992248062015503, + "injectionPrecision" : 0.46875, + "labeledCount" : 43, + "mrr" : 0.8234496124031008, + "negativeCount" : 15, + "negativeFalsePositiveRate" : 0.13333333333333333, + "precisionAtK" : 0.5714285714285715, + "recallAtK" : 0.8294573643410852 + }, + "variant" : "calib-0.10" + }, + { + "report" : { + "answerHitRate" : 0.5116279069767442, + "avgInjectedOnNegatives" : 0.5333333333333333, + "injectedRecall" : 0.4922480620155039, + "injectionPrecision" : 0.32051282051282054, + "labeledCount" : 43, + "mrr" : 0.8234496124031008, + "negativeCount" : 15, + "negativeFalsePositiveRate" : 0.26666666666666666, + "precisionAtK" : 0.3928571428571428, + "recallAtK" : 0.8294573643410852 + }, + "variant" : "calib-0.11-lex0" + } + ], + "timestamp" : "2026-06-22T19:05:37Z" +} \ No newline at end of file diff --git a/eval/runs/2026-06-23T05-05-50Z-contextual-512.json b/eval/runs/2026-06-23T05-05-50Z-contextual-512.json 
new file mode 100644 index 0000000..aca4741 --- /dev/null +++ b/eval/runs/2026-06-23T05-05-50Z-contextual-512.json @@ -0,0 +1,120 @@ +{ + "corpusSize" : 153, + "embedderSignature" : "contextual-512", + "gitSha" : "3a6d12e", + "host" : "daniels-fancy-macbook-pro-2.local", + "k" : 3, + "queryCount" : 58, + "results" : [ + { + "report" : { + "answerHitRate" : 0.8837209302325582, + "avgInjectedOnNegatives" : 3, + "injectedRecall" : 0.8294573643410852, + "injectionPrecision" : 0.2413793103448276, + "labeledCount" : 43, + "mrr" : 0.8234496124031008, + "negativeCount" : 15, + "negativeFalsePositiveRate" : 1, + "precisionAtK" : 0.32558139534883723, + "recallAtK" : 0.8294573643410852 + }, + "variant" : "current" + }, + { + "report" : { + "answerHitRate" : 0.8837209302325582, + "avgInjectedOnNegatives" : 1.2, + "injectedRecall" : 0.8410852713178294, + "injectionPrecision" : 0.31343283582089554, + "labeledCount" : 43, + "mrr" : 0.8234496124031008, + "negativeCount" : 15, + "negativeFalsePositiveRate" : 0.5333333333333333, + "precisionAtK" : 0.38888888888888884, + "recallAtK" : 0.8294573643410852 + }, + "variant" : "calib-0.13" + }, + { + "report" : { + "answerHitRate" : 0.9302325581395349, + "avgInjectedOnNegatives" : 0.6666666666666666, + "injectedRecall" : 0.8875968992248061, + "injectionPrecision" : 0.3728813559322034, + "labeledCount" : 43, + "mrr" : 0.8234496124031008, + "negativeCount" : 15, + "negativeFalsePositiveRate" : 0.3333333333333333, + "precisionAtK" : 0.4761904761904763, + "recallAtK" : 0.8294573643410852 + }, + "variant" : "calib-0.12" + }, + { + "report" : { + "answerHitRate" : 0.9302325581395349, + "avgInjectedOnNegatives" : 0.5333333333333333, + "injectedRecall" : 0.8992248062015503, + "injectionPrecision" : 0.4205607476635514, + "labeledCount" : 43, + "mrr" : 0.8234496124031008, + "negativeCount" : 15, + "negativeFalsePositiveRate" : 0.26666666666666666, + "precisionAtK" : 0.5436507936507938, + "recallAtK" : 0.8294573643410852 + }, + "variant" : 
"calib-0.11" + }, + { + "report" : { + "answerHitRate" : 0.9302325581395349, + "avgInjectedOnNegatives" : 0.13333333333333333, + "injectedRecall" : 0.8992248062015503, + "injectionPrecision" : 0.46875, + "labeledCount" : 43, + "mrr" : 0.8234496124031008, + "negativeCount" : 15, + "negativeFalsePositiveRate" : 0.13333333333333333, + "precisionAtK" : 0.5714285714285715, + "recallAtK" : 0.8294573643410852 + }, + "variant" : "calib-0.10" + }, + { + "report" : { + "answerHitRate" : 0.5116279069767442, + "avgInjectedOnNegatives" : 0.5333333333333333, + "injectedRecall" : 0.4922480620155039, + "injectionPrecision" : 0.32051282051282054, + "labeledCount" : 43, + "mrr" : 0.8234496124031008, + "negativeCount" : 15, + "negativeFalsePositiveRate" : 0.26666666666666666, + "precisionAtK" : 0.3928571428571428, + "recallAtK" : 0.8294573643410852 + }, + "variant" : "calib-0.11-lex0" + } + ], + "sessions" : { + "firstTouchCoverage" : 1, + "withCooldown" : { + "meanInjectionsPerSession" : 11.333333333333334, + "promptCount" : 17, + "redundantInjections" : 0, + "redundantRate" : 0, + "sessionCount" : 3, + "totalInjections" : 34 + }, + "withoutCooldown" : { + "meanInjectionsPerSession" : 17, + "promptCount" : 17, + "redundantInjections" : 17, + "redundantRate" : 0.3333333333333333, + "sessionCount" : 3, + "totalInjections" : 51 + } + }, + "timestamp" : "2026-06-23T05:05:50Z" +} \ No newline at end of file diff --git a/eval/runs/2026-06-23T20-03-04Z-contextual-512.json b/eval/runs/2026-06-23T20-03-04Z-contextual-512.json new file mode 100644 index 0000000..057e9ab --- /dev/null +++ b/eval/runs/2026-06-23T20-03-04Z-contextual-512.json @@ -0,0 +1,165 @@ +{ + "corpusSize" : 153, + "embedderSignature" : "contextual-512", + "gitSha" : "aa9854e", + "host" : "daniels-fancy-macbook-pro-2.local", + "k" : 3, + "queryCount" : 58, + "results" : [ + { + "report" : { + "answerHitRate" : 0.8837209302325582, + "avgInjectedOnNegatives" : 3, + "injectedRecall" : 0.8294573643410852, + "injectionPrecision" 
: 0.2413793103448276, + "labeledCount" : 43, + "mrr" : 0.8234496124031008, + "negativeCount" : 15, + "negativeFalsePositiveRate" : 1, + "precisionAtK" : 0.32558139534883723, + "recallAtK" : 0.8294573643410852 + }, + "variant" : "current" + }, + { + "report" : { + "answerHitRate" : 0.8837209302325582, + "avgInjectedOnNegatives" : 1.2, + "injectedRecall" : 0.8410852713178294, + "injectionPrecision" : 0.31343283582089554, + "labeledCount" : 43, + "mrr" : 0.8234496124031008, + "negativeCount" : 15, + "negativeFalsePositiveRate" : 0.5333333333333333, + "precisionAtK" : 0.38888888888888884, + "recallAtK" : 0.8294573643410852 + }, + "variant" : "calib-0.13" + }, + { + "report" : { + "answerHitRate" : 0.9302325581395349, + "avgInjectedOnNegatives" : 0.6666666666666666, + "injectedRecall" : 0.8875968992248061, + "injectionPrecision" : 0.3728813559322034, + "labeledCount" : 43, + "mrr" : 0.8234496124031008, + "negativeCount" : 15, + "negativeFalsePositiveRate" : 0.3333333333333333, + "precisionAtK" : 0.4761904761904763, + "recallAtK" : 0.8294573643410852 + }, + "variant" : "calib-0.12" + }, + { + "report" : { + "answerHitRate" : 0.9302325581395349, + "avgInjectedOnNegatives" : 0.5333333333333333, + "injectedRecall" : 0.8992248062015503, + "injectionPrecision" : 0.4205607476635514, + "labeledCount" : 43, + "mrr" : 0.8234496124031008, + "negativeCount" : 15, + "negativeFalsePositiveRate" : 0.26666666666666666, + "precisionAtK" : 0.5436507936507938, + "recallAtK" : 0.8294573643410852 + }, + "variant" : "calib-0.11" + }, + { + "report" : { + "answerHitRate" : 0.9302325581395349, + "avgInjectedOnNegatives" : 0.13333333333333333, + "injectedRecall" : 0.8992248062015503, + "injectionPrecision" : 0.46875, + "labeledCount" : 43, + "mrr" : 0.8234496124031008, + "negativeCount" : 15, + "negativeFalsePositiveRate" : 0.13333333333333333, + "precisionAtK" : 0.5714285714285715, + "recallAtK" : 0.8294573643410852 + }, + "variant" : "calib-0.10" + }, + { + "report" : { + "answerHitRate" : 
0.9302325581395349, + "avgInjectedOnNegatives" : 0, + "injectedRecall" : 0.8992248062015503, + "injectionPrecision" : 0.5357142857142857, + "labeledCount" : 43, + "mrr" : 0.8234496124031008, + "negativeCount" : 15, + "negativeFalsePositiveRate" : 0, + "precisionAtK" : 0.6150793650793651, + "recallAtK" : 0.8294573643410852 + }, + "variant" : "calib-0.09" + }, + { + "report" : { + "answerHitRate" : 0.9069767441860465, + "avgInjectedOnNegatives" : 0, + "injectedRecall" : 0.8759689922480619, + "injectionPrecision" : 0.5714285714285714, + "labeledCount" : 43, + "mrr" : 0.8234496124031008, + "negativeCount" : 15, + "negativeFalsePositiveRate" : 0, + "precisionAtK" : 0.6626016260162603, + "recallAtK" : 0.8294573643410852 + }, + "variant" : "calib-0.08" + }, + { + "report" : { + "answerHitRate" : 0.20930232558139536, + "avgInjectedOnNegatives" : 0, + "injectedRecall" : 0.1937984496124031, + "injectionPrecision" : 0.5625, + "labeledCount" : 43, + "mrr" : 0.8234496124031008, + "negativeCount" : 15, + "negativeFalsePositiveRate" : 0, + "precisionAtK" : 0.7, + "recallAtK" : 0.8294573643410852 + }, + "variant" : "calib-0.08-lex0" + }, + { + "report" : { + "answerHitRate" : 0.5116279069767442, + "avgInjectedOnNegatives" : 0.5333333333333333, + "injectedRecall" : 0.4922480620155039, + "injectionPrecision" : 0.32051282051282054, + "labeledCount" : 43, + "mrr" : 0.8234496124031008, + "negativeCount" : 15, + "negativeFalsePositiveRate" : 0.26666666666666666, + "precisionAtK" : 0.3928571428571428, + "recallAtK" : 0.8294573643410852 + }, + "variant" : "calib-0.11-lex0" + } + ], + "sessions" : { + "firstTouchCoverage" : 1, + "withCooldown" : { + "meanInjectionsPerSession" : 11.333333333333334, + "promptCount" : 17, + "redundantInjections" : 0, + "redundantRate" : 0, + "sessionCount" : 3, + "totalInjections" : 34 + }, + "withoutCooldown" : { + "meanInjectionsPerSession" : 17, + "promptCount" : 17, + "redundantInjections" : 17, + "redundantRate" : 0.3333333333333333, + "sessionCount" : 
3, + "totalInjections" : 51 + } + }, + "timestamp" : "2026-06-23T20:03:04Z" +} \ No newline at end of file diff --git a/eval/runs/2026-06-23T20-13-52Z-contextual-512.json b/eval/runs/2026-06-23T20-13-52Z-contextual-512.json new file mode 100644 index 0000000..0222234 --- /dev/null +++ b/eval/runs/2026-06-23T20-13-52Z-contextual-512.json @@ -0,0 +1,165 @@ +{ + "corpusSize" : 153, + "embedderSignature" : "contextual-512", + "gitSha" : "56bc8cd", + "host" : "daniels-fancy-macbook-pro-2.local", + "k" : 3, + "queryCount" : 58, + "results" : [ + { + "report" : { + "answerHitRate" : 0.8837209302325582, + "avgInjectedOnNegatives" : 3, + "injectedRecall" : 0.8294573643410852, + "injectionPrecision" : 0.2413793103448276, + "labeledCount" : 43, + "mrr" : 0.8234496124031008, + "negativeCount" : 15, + "negativeFalsePositiveRate" : 1, + "precisionAtK" : 0.32558139534883723, + "recallAtK" : 0.8294573643410852 + }, + "variant" : "current" + }, + { + "report" : { + "answerHitRate" : 0.8837209302325582, + "avgInjectedOnNegatives" : 1.2, + "injectedRecall" : 0.8410852713178294, + "injectionPrecision" : 0.31343283582089554, + "labeledCount" : 43, + "mrr" : 0.8234496124031008, + "negativeCount" : 15, + "negativeFalsePositiveRate" : 0.5333333333333333, + "precisionAtK" : 0.38888888888888884, + "recallAtK" : 0.8294573643410852 + }, + "variant" : "calib-0.13" + }, + { + "report" : { + "answerHitRate" : 0.9302325581395349, + "avgInjectedOnNegatives" : 0.6666666666666666, + "injectedRecall" : 0.8875968992248061, + "injectionPrecision" : 0.3728813559322034, + "labeledCount" : 43, + "mrr" : 0.8234496124031008, + "negativeCount" : 15, + "negativeFalsePositiveRate" : 0.3333333333333333, + "precisionAtK" : 0.4761904761904763, + "recallAtK" : 0.8294573643410852 + }, + "variant" : "calib-0.12" + }, + { + "report" : { + "answerHitRate" : 0.9302325581395349, + "avgInjectedOnNegatives" : 0.5333333333333333, + "injectedRecall" : 0.8992248062015503, + "injectionPrecision" : 0.4205607476635514, + 
"labeledCount" : 43, + "mrr" : 0.8234496124031008, + "negativeCount" : 15, + "negativeFalsePositiveRate" : 0.26666666666666666, + "precisionAtK" : 0.5436507936507938, + "recallAtK" : 0.8294573643410852 + }, + "variant" : "calib-0.11" + }, + { + "report" : { + "answerHitRate" : 0.9302325581395349, + "avgInjectedOnNegatives" : 0.13333333333333333, + "injectedRecall" : 0.8992248062015503, + "injectionPrecision" : 0.46875, + "labeledCount" : 43, + "mrr" : 0.8234496124031008, + "negativeCount" : 15, + "negativeFalsePositiveRate" : 0.13333333333333333, + "precisionAtK" : 0.5714285714285715, + "recallAtK" : 0.8294573643410852 + }, + "variant" : "calib-0.10" + }, + { + "report" : { + "answerHitRate" : 0.9302325581395349, + "avgInjectedOnNegatives" : 0, + "injectedRecall" : 0.8992248062015503, + "injectionPrecision" : 0.5357142857142857, + "labeledCount" : 43, + "mrr" : 0.8234496124031008, + "negativeCount" : 15, + "negativeFalsePositiveRate" : 0, + "precisionAtK" : 0.6150793650793651, + "recallAtK" : 0.8294573643410852 + }, + "variant" : "calib-0.09" + }, + { + "report" : { + "answerHitRate" : 0.9069767441860465, + "avgInjectedOnNegatives" : 0, + "injectedRecall" : 0.8759689922480619, + "injectionPrecision" : 0.5714285714285714, + "labeledCount" : 43, + "mrr" : 0.8234496124031008, + "negativeCount" : 15, + "negativeFalsePositiveRate" : 0, + "precisionAtK" : 0.6626016260162603, + "recallAtK" : 0.8294573643410852 + }, + "variant" : "calib-0.08" + }, + { + "report" : { + "answerHitRate" : 0.20930232558139536, + "avgInjectedOnNegatives" : 0, + "injectedRecall" : 0.1937984496124031, + "injectionPrecision" : 0.5625, + "labeledCount" : 43, + "mrr" : 0.8234496124031008, + "negativeCount" : 15, + "negativeFalsePositiveRate" : 0, + "precisionAtK" : 0.7, + "recallAtK" : 0.8294573643410852 + }, + "variant" : "calib-0.08-lex0" + }, + { + "report" : { + "answerHitRate" : 0.5116279069767442, + "avgInjectedOnNegatives" : 0.5333333333333333, + "injectedRecall" : 0.4922480620155039, + 
"injectionPrecision" : 0.32051282051282054, + "labeledCount" : 43, + "mrr" : 0.8234496124031008, + "negativeCount" : 15, + "negativeFalsePositiveRate" : 0.26666666666666666, + "precisionAtK" : 0.3928571428571428, + "recallAtK" : 0.8294573643410852 + }, + "variant" : "calib-0.11-lex0" + } + ], + "sessions" : { + "firstTouchCoverage" : 1, + "withCooldown" : { + "meanInjectionsPerSession" : 7, + "promptCount" : 17, + "redundantInjections" : 0, + "redundantRate" : 0, + "sessionCount" : 3, + "totalInjections" : 21 + }, + "withoutCooldown" : { + "meanInjectionsPerSession" : 11, + "promptCount" : 17, + "redundantInjections" : 12, + "redundantRate" : 0.36363636363636365, + "sessionCount" : 3, + "totalInjections" : 33 + } + }, + "timestamp" : "2026-06-23T20:13:52Z" +} \ No newline at end of file diff --git a/scripts/plot_threshold.py b/scripts/plot_threshold.py new file mode 100644 index 0000000..c4904e3 --- /dev/null +++ b/scripts/plot_threshold.py @@ -0,0 +1,140 @@ +#!/usr/bin/env python3 +"""Plot ROC + precision/recall vs. the recall gate's distance threshold. + +Reads the per-candidate scores dumped by `engram-eval --dump-scores` +(`eval/scores-.json`: rows of {distance, relevant, kind}) and the +marked thresholds (`eval/thresholds-.json`), then renders an ROC curve +(with AUC) and a precision/recall-vs-threshold curve, marking the shipped +`proposed` gate and the legacy `current` ceiling. Writes `eval/threshold.png`. + +The gate also has a lexical leg; this models the *semantic distance* knob only — +the dominant control and the thing an AUC actually characterizes. 
+ +Run: uv run --with matplotlib --with numpy scripts/plot_threshold.py [embedder] +""" +from __future__ import annotations + +import glob +import json +import sys +from pathlib import Path + +import matplotlib + +matplotlib.use("Agg") +import matplotlib.pyplot as plt +import numpy as np + +ROOT = Path(__file__).resolve().parent.parent +EVAL = ROOT / "eval" + + +def load() -> tuple[np.ndarray, np.ndarray, dict, str]: + arg = sys.argv[1] if len(sys.argv) > 1 else None + scores = sorted(EVAL.glob(f"scores-{arg}.json" if arg else "scores-*.json")) + if not scores: + sys.exit("no eval/scores-*.json — run: swift run engram-eval --dump-scores") + path = scores[-1] + embedder = path.stem.replace("scores-", "") + rows = json.loads(path.read_text()) + dist = np.array([r["distance"] for r in rows], dtype=float) + rel = np.array([bool(r["relevant"]) for r in rows], dtype=bool) + tpath = EVAL / f"thresholds-{embedder}.json" + marks = json.loads(tpath.read_text()) if tpath.exists() else {"currentMaxDistance": 0.45, "proposedMaxDistance": 0.10} + return dist, rel, marks, embedder + + +def curve(dist: np.ndarray, rel: np.ndarray, taus: np.ndarray): + """A candidate is injected when distance < tau. Sweep tau → TPR/FPR/P/R.""" + P = int(rel.sum()) + N = int((~rel).sum()) + tpr, fpr, prec, rec = [], [], [], [] + for tau in taus: + pred = dist < tau + tp = int((pred & rel).sum()) + fp = int((pred & ~rel).sum()) + tpr.append(tp / P if P else 0.0) + fpr.append(fp / N if N else 0.0) + # Precision is undefined when nothing is injected — leave it NaN so the + # plot doesn't draw a misleading "P=1.0" shelf over the inject-nothing band. 
+ prec.append(tp / (tp + fp) if (tp + fp) else float("nan")) + rec.append(tp / P if P else 0.0) + return np.array(tpr), np.array(fpr), np.array(prec), np.array(rec) + + +def at_threshold(dist, rel, tau): + pred = dist < tau + tp = int((pred & rel).sum()) + fp = int((pred & ~rel).sum()) + P = int(rel.sum()) + prec = tp / (tp + fp) if (tp + fp) else 1.0 + rec = tp / P if P else 0.0 + f1 = 2 * prec * rec / (prec + rec) if (prec + rec) else 0.0 + return prec, rec, f1 + + +def main() -> None: + dist, rel, marks, embedder = load() + cur = float(marks["currentMaxDistance"]) + prop = float(marks["proposedMaxDistance"]) + + taus = np.linspace(0.0, max(0.5, cur + 0.02), 400) + tpr, fpr, prec, rec = curve(dist, rel, taus) + + # ROC AUC over the swept range (sort by FPR for a monotone integral). + trapz = getattr(np, "trapezoid", None) or np.trapz # numpy 2.x renamed trapz + order = np.argsort(fpr) + auc = float(trapz(tpr[order], fpr[order])) + + # Best-F1 threshold (a reasonable "optimal" operating point). 
+ f1s = np.where((prec + rec) > 0, 2 * prec * rec / (prec + rec + 1e-12), 0.0) + best = int(np.argmax(f1s)) + best_tau = float(taus[best]) + + fig, (ax_roc, ax_pr) = plt.subplots(1, 2, figsize=(13, 5.2)) + + # ── ROC ── + ax_roc.plot(fpr, tpr, color="#2E3A59", lw=2, label=f"ROC (AUC={auc:.3f})") + ax_roc.plot([0, 1], [0, 1], ls=":", color="#aaa", lw=1) + for tau, name, color in [(cur, f"current {cur:.2f}", "#C0504D"), (prop, f"proposed {prop:.2f} (shipped)", "#4F8A4F")]: + p, r, _ = at_threshold(dist, rel, tau) + # locate the curve point nearest this tau + i = int(np.argmin(np.abs(taus - tau))) + ax_roc.scatter([fpr[i]], [tpr[i]], color=color, zorder=5, s=70) + ax_roc.annotate(f"{name}\nP={p:.2f} R={r:.2f}", (fpr[i], tpr[i]), + textcoords="offset points", xytext=(8, -4 if "current" in name else 10), + fontsize=9, color=color) + ax_roc.set_xlabel("false-positive rate (off-topic injected)") + ax_roc.set_ylabel("true-positive rate (relevant injected)") + ax_roc.set_title(f"ROC — distance gate ({embedder})") + ax_roc.legend(loc="lower right") + ax_roc.grid(alpha=0.2) + + # ── precision / recall vs threshold ── + ax_pr.plot(taus, prec, color="#2E3A59", lw=2, label="precision") + ax_pr.plot(taus, rec, color="#E08A4C", lw=2, label="recall") + for tau, name, color in [(cur, f"current {cur:.2f}", "#C0504D"), (prop, f"proposed {prop:.2f}", "#4F8A4F"), (best_tau, f"best-F1 {best_tau:.2f}", "#3b4a82")]: + ax_pr.axvline(tau, color=color, ls="--", lw=1.3, label=name) + ax_pr.set_xlabel("maxDistance threshold τ (inject when distance < τ)") + ax_pr.set_ylabel("precision / recall") + ax_pr.set_title("precision & recall vs. 
threshold") + ax_pr.legend(loc="center right", fontsize=8) + ax_pr.grid(alpha=0.2) + + fig.tight_layout() + out = EVAL / "threshold.png" + fig.savefig(out, dpi=140) + + # text summary + pc, rc, fc = at_threshold(dist, rel, cur) + pp, rp, fp_ = at_threshold(dist, rel, prop) + pb, rb, fb = at_threshold(dist, rel, best_tau) + print(f"embedder: {embedder} candidates: {len(dist)} relevant: {int(rel.sum())} ROC-AUC: {auc:.3f}") + print(f" current τ={cur:.2f} : precision {pc:.3f} recall {rc:.3f} F1 {fc:.3f}") + print(f" proposed τ={prop:.2f} : precision {pp:.3f} recall {rp:.3f} F1 {fp_:.3f} ← shipped") + print(f" best-F1 τ={best_tau:.2f} : precision {pb:.3f} recall {rb:.3f} F1 {fb:.3f}") + print(f"wrote {out}") + + +if __name__ == "__main__": + main()