From be6df8d4ecfd1c0f13a71df5b778b567e58fca98 Mon Sep 17 00:00:00 2001
From: Drew Cain
Date: Fri, 12 Jun 2026 13:53:45 -0500
Subject: [PATCH] feat: filesystem-grep and full-context baseline providers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Published memory-system comparisons are only credible when read against two
cheap floors:

- baseline-grep: deterministic TF-with-log-damping term matching over the
  raw corpus markdown — no index, no embeddings, no LLM. Squared coverage
  factor so a doc matching every distinct query term outranks a doc
  spamming one term. A memory system that can't beat this isn't adding
  retrieval value.

- baseline-fullcontext: no retrieval at all; the whole corpus is returned
  as a single hit (capped at 600K chars ≈ 150K tokens) for the QA stage to
  answer over. Published results repeatedly show full-context beating
  dedicated memory systems on corpora that fit a context window; QA
  accuracy and token cost should be read against this provider's. Makes no
  doc-id claims, so retrieval metrics are meaningless for it by design.

Both registered in the provider factory; 9 new tests; smoke-verified on the
synthetic corpus (grep recall@5 0.75; full-context recall 0 / content-hit
0.75 as expected).

Co-Authored-By: Claude Fable 5
Signed-off-by: Drew Cain
---
Review amendments (this section is ignored by git am):
- baseline_grep: query terms are deduplicated before scoring and a
  non-positive limit returns no hits; two existing tests extended to pin
  both behaviours.
- Line structure was rebuilt from a whitespace-collapsed copy, so the blob
  ids on the `index` lines may not match the content below.

 .../providers/__init__.py                     |  15 ++-
 .../providers/baseline_fullcontext.py         |  74 +++++++++++
 .../providers/baseline_grep.py                | 114 +++++++++++++++++
 tests/providers/test_baseline_providers.py    | 121 ++++++++++++++++++
 4 files changed, 323 insertions(+), 1 deletion(-)
 create mode 100644 src/basic_memory_benchmarks/providers/baseline_fullcontext.py
 create mode 100644 src/basic_memory_benchmarks/providers/baseline_grep.py
 create mode 100644 tests/providers/test_baseline_providers.py

diff --git a/src/basic_memory_benchmarks/providers/__init__.py b/src/basic_memory_benchmarks/providers/__init__.py
index 8592857..595eba6 100644
--- a/src/basic_memory_benchmarks/providers/__init__.py
+++ b/src/basic_memory_benchmarks/providers/__init__.py
@@ -3,6 +3,8 @@
 from __future__ import annotations
 
 from basic_memory_benchmarks.providers.base import BenchmarkProvider
+from basic_memory_benchmarks.providers.baseline_fullcontext import FullContextProvider
+from basic_memory_benchmarks.providers.baseline_grep import FilesystemGrepProvider
 from basic_memory_benchmarks.providers.bm_cloud import BasicMemoryCloudProvider
 from basic_memory_benchmarks.providers.bm_local import BasicMemoryLocalProvider
 from basic_memory_benchmarks.providers.mem0_local import Mem0LocalProvider
@@ -19,8 +21,19 @@ def create_provider(name: str) -> BenchmarkProvider:
         return Mem0LocalProvider()
     if normalized == "zep-reference":
         return ZepReferenceProvider()
+    if normalized == "baseline-grep":
+        return FilesystemGrepProvider()
+    if normalized == "baseline-fullcontext":
+        return FullContextProvider()
     raise ValueError(f"Unknown provider: {name}")
 
 
 def provider_names() -> list[str]:
-    return ["bm-local", "bm-cloud", "mem0-local", "zep-reference"]
+    return [
+        "bm-local",
+        "bm-cloud",
+        "mem0-local",
+        "zep-reference",
+        "baseline-grep",
+        "baseline-fullcontext",
+    ]
diff --git a/src/basic_memory_benchmarks/providers/baseline_fullcontext.py b/src/basic_memory_benchmarks/providers/baseline_fullcontext.py
new file mode 100644
index 0000000..e7794ed
--- /dev/null
+++ b/src/basic_memory_benchmarks/providers/baseline_fullcontext.py
@@ -0,0 +1,74 @@
+"""Full-context baseline provider.
+
+The other honesty floor: skip retrieval entirely and hand the answering model
+the whole corpus. On corpora that fit a modern context window (LoCoMo
+conversations, LongMemEval-S groups), published results repeatedly show
+full-context beating dedicated memory systems — so a memory system's QA
+accuracy should be read against this number, and its token cost against this
+provider's token cost.
+
+Retrieval metrics (recall/MRR) are meaningless for this provider by design:
+it returns the corpus as a single hit with no doc-id claims. Use it for the
+QA stage only.
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import frontmatter
+
+from basic_memory_benchmarks.models import RunConfig, SearchHit
+from basic_memory_benchmarks.providers.base import BenchmarkProvider
+
+# Cap the assembled context so a pathological corpus cannot blow past the
+# answering model's window. ~600K chars ≈ 150K tokens, inside a 200K window
+# with room for prompt scaffolding. LongMemEval-S groups (~115K tokens) fit.
+MAX_CONTEXT_CHARS = 600_000
+
+
+class FullContextProvider(BenchmarkProvider):
+    """No-retrieval baseline: the whole corpus is the context."""
+
+    name = "baseline-fullcontext"
+
+    def __init__(self) -> None:
+        self._context: str = ""
+        self._truncated: bool = False
+
+    def ingest(self, corpus_path: Path, run_config: RunConfig) -> None:
+        _ = run_config
+        sections: list[str] = []
+        for note_path in sorted(corpus_path.rglob("*.md")):
+            with note_path.open("r", encoding="utf-8") as handle:
+                parsed = frontmatter.load(handle)
+            sections.append(parsed.content.strip())
+        context = "\n\n---\n\n".join(sections)
+        self._truncated = len(context) > MAX_CONTEXT_CHARS
+        self._context = context[:MAX_CONTEXT_CHARS]
+
+    def search(self, query: str, limit: int, run_config: RunConfig) -> list[SearchHit]:
+        _ = (query, limit, run_config)
+        if not self._context:
+            return []
+        return [
+            SearchHit(
+                id="full-context",
+                source_doc_id=None,
+                source_path=None,
+                text=self._context,
+                score=1.0,
+                metadata={"provider": self.name, "truncated": self._truncated},
+            )
+        ]
+
+    def cleanup(self, run_config: RunConfig) -> None:
+        _ = run_config
+        self._context = ""
+        self._truncated = False
+
+    def version_info(self) -> dict[str, str]:
+        return {
+            "baseline": "full-context",
+            "max_context_chars": str(MAX_CONTEXT_CHARS),
+        }
diff --git a/src/basic_memory_benchmarks/providers/baseline_grep.py b/src/basic_memory_benchmarks/providers/baseline_grep.py
new file mode 100644
index 0000000..6c36177
--- /dev/null
+++ b/src/basic_memory_benchmarks/providers/baseline_grep.py
@@ -0,0 +1,114 @@
+"""Filesystem grep baseline provider.
+
+The honesty floor for retrieval: case-insensitive term matching over the raw
+corpus markdown with TF-style scoring and no index, no embeddings, no LLM.
+A memory system that can't beat this isn't adding retrieval value. (Letta's
+"filesystem + grep agent" scored 74% on LoCoMo; this is the non-agentic
+deterministic analogue.)
+"""
+
+from __future__ import annotations
+
+import math
+import re
+from pathlib import Path
+
+import frontmatter
+
+from basic_memory_benchmarks.models import RunConfig, SearchHit
+from basic_memory_benchmarks.providers.base import BenchmarkProvider
+
+_TOKEN_PATTERN = re.compile(r"[a-z0-9][a-z0-9'\-]*")
+
+# Minimal English stopword set so query scoring keys on content-bearing terms.
+_STOPWORDS = frozenset(
+    "a an and are as at be but by did do does for from had has have how i in is it of on or "
+    "that the their they this to was we were what when where which who whom whose why will "
+    "with you your".split()
+)
+
+
+def _tokenize(text: str) -> list[str]:
+    """Lowercase ``text`` and return its tokens in order of appearance."""
+    return _TOKEN_PATTERN.findall(text.lower())
+
+
+class FilesystemGrepProvider(BenchmarkProvider):
+    """Deterministic grep-style ranking over corpus files."""
+
+    name = "baseline-grep"
+
+    def __init__(self) -> None:
+        self._docs: list[tuple[str, str, str]] = []  # (doc_id, rel_path, body)
+
+    def ingest(self, corpus_path: Path, run_config: RunConfig) -> None:
+        """Load every ``*.md`` note under ``corpus_path``, replacing prior state."""
+        _ = run_config
+        self._docs = []
+        for note_path in sorted(corpus_path.rglob("*.md")):
+            with note_path.open("r", encoding="utf-8") as handle:
+                parsed = frontmatter.load(handle)
+            # Prefer the frontmatter doc id; fall back to the file stem.
+            doc_id = str(parsed.get("source_doc_id") or note_path.stem)
+            rel_path = note_path.relative_to(corpus_path).as_posix()
+            self._docs.append((doc_id, rel_path, parsed.content))
+
+    def search(self, query: str, limit: int, run_config: RunConfig) -> list[SearchHit]:
+        """Rank ingested docs against the content-bearing terms of ``query``.
+
+        Each distinct non-stopword query term adds ``1 + ln(count)``, where
+        ``count`` is its number of non-overlapping substring occurrences in
+        the lowercased body; the sum is then scaled by the squared fraction
+        of distinct terms matched. Docs matching no term are dropped, and
+        ties keep ingest order because the sort is stable.
+
+        Returns at most ``limit`` hits, best first, or ``[]`` when the query
+        has no content terms or ``limit`` is not positive.
+        """
+        _ = run_config
+        # Dedupe in first-seen order: a term repeated in the query must not
+        # count twice in the TF sum or in the coverage ratio.
+        terms = list(dict.fromkeys(t for t in _tokenize(query) if t not in _STOPWORDS))
+        # A negative limit would otherwise slice the tail off the ranking.
+        if not terms or limit <= 0:
+            return []
+
+        scored: list[tuple[float, str, str, str]] = []
+        for doc_id, rel_path, body in self._docs:
+            body_lower = body.lower()
+            # TF with log damping per term; coverage bonus rewards docs
+            # matching more distinct query terms over many hits of one term.
+            matched = 0
+            score = 0.0
+            for term in terms:
+                count = body_lower.count(term)
+                if count:
+                    matched += 1
+                    score += 1.0 + math.log(count)
+            if matched:
+                # Squared coverage: a doc matching every distinct query term
+                # must outrank a doc spamming one term many times.
+                score *= (matched / len(terms)) ** 2
+                scored.append((score, doc_id, rel_path, body))
+
+        scored.sort(key=lambda item: item[0], reverse=True)
+        hits: list[SearchHit] = []
+        for score, doc_id, rel_path, body in scored[:limit]:
+            hits.append(
+                SearchHit(
+                    id=doc_id,
+                    source_doc_id=doc_id,
+                    source_path=rel_path,
+                    text=body[:2000],
+                    score=score,
+                    metadata={"provider": self.name},
+                )
+            )
+        return hits
+
+    def cleanup(self, run_config: RunConfig) -> None:
+        _ = run_config
+        self._docs = []
+
+    def version_info(self) -> dict[str, str]:
+        return {"baseline": "filesystem-grep", "index": "none"}
diff --git a/tests/providers/test_baseline_providers.py b/tests/providers/test_baseline_providers.py
new file mode 100644
index 0000000..4cd8e38
--- /dev/null
+++ b/tests/providers/test_baseline_providers.py
@@ -0,0 +1,121 @@
+"""Tests for the grep and full-context baseline providers."""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+from basic_memory_benchmarks.models import RunConfig
+from basic_memory_benchmarks.providers import create_provider
+from basic_memory_benchmarks.providers.baseline_fullcontext import (
+    MAX_CONTEXT_CHARS,
+    FullContextProvider,
+)
+from basic_memory_benchmarks.providers.baseline_grep import FilesystemGrepProvider
+
+
+def _write_doc(corpus: Path, doc_id: str, body: str) -> None:
+    corpus.mkdir(parents=True, exist_ok=True)
+    (corpus / f"{doc_id}.md").write_text(
+        f"---\ntitle: {doc_id}\nsource_doc_id: {doc_id}\n---\n\n{body}\n",
+        encoding="utf-8",
+    )
+
+
+def _config(tmp_path: Path) -> RunConfig:
+    return RunConfig(
+        run_id="t",
+        dataset_id="t",
+        dataset_path="t",
+        corpus_dir=str(tmp_path),
+        queries_path="t",
+    )
+
+
+class TestFilesystemGrep:
+    def test_ranks_matching_doc_first(self, tmp_path):
+        corpus = tmp_path / "docs"
+        _write_doc(corpus, "doc-a", "Joanna moved to Austin and loves the food scene in Austin.")
+        _write_doc(corpus, "doc-b", "Anthony trains for marathons every morning.")
+        provider = FilesystemGrepProvider()
+        provider.ingest(corpus, _config(tmp_path))
+
+        hits = provider.search("Where does Joanna live in Austin?", 5, _config(tmp_path))
+
+        assert hits[0].source_doc_id == "doc-a"
+        assert hits[0].score > 0
+        assert "Joanna" in (hits[0].text or "")
+
+    def test_coverage_beats_repetition(self, tmp_path):
+        corpus = tmp_path / "docs"
+        # doc-spam repeats one term; doc-both matches both distinct terms.
+        _write_doc(corpus, "doc-spam", "dentist " * 30)
+        _write_doc(corpus, "doc-both", "I visited the dentist in November.")
+        provider = FilesystemGrepProvider()
+        provider.ingest(corpus, _config(tmp_path))
+
+        hits = provider.search("dentist November", 5, _config(tmp_path))
+
+        assert hits[0].source_doc_id == "doc-both"
+
+        # Repeating a query term must not inflate the spam doc's coverage.
+        repeated = provider.search("dentist dentist November", 5, _config(tmp_path))
+
+        assert repeated[0].source_doc_id == "doc-both"
+
+    def test_no_content_terms_returns_empty(self, tmp_path):
+        corpus = tmp_path / "docs"
+        _write_doc(corpus, "doc-a", "anything")
+        provider = FilesystemGrepProvider()
+        provider.ingest(corpus, _config(tmp_path))
+        assert provider.search("what did they do", 5, _config(tmp_path)) == []
+
+    def test_limit_respected(self, tmp_path):
+        corpus = tmp_path / "docs"
+        for i in range(10):
+            _write_doc(corpus, f"doc-{i}", "Austin is great.")
+        provider = FilesystemGrepProvider()
+        provider.ingest(corpus, _config(tmp_path))
+        assert len(provider.search("Austin", 3, _config(tmp_path))) == 3
+        # A non-positive limit yields no hits rather than a tail-trimmed slice.
+        assert provider.search("Austin", -1, _config(tmp_path)) == []
+
+    def test_factory_registration(self):
+        assert isinstance(create_provider("baseline-grep"), FilesystemGrepProvider)
+
+
+class TestFullContext:
+    def test_returns_whole_corpus_as_single_hit(self, tmp_path):
+        corpus = tmp_path / "docs"
+        _write_doc(corpus, "doc-a", "Joanna moved to Austin.")
+        _write_doc(corpus, "doc-b", "Anthony runs marathons.")
+        provider = FullContextProvider()
+        provider.ingest(corpus, _config(tmp_path))
+
+        hits = provider.search("anything", 10, _config(tmp_path))
+
+        assert len(hits) == 1
+        assert "Joanna moved to Austin." in (hits[0].text or "")
+        assert "Anthony runs marathons." in (hits[0].text or "")
+        assert hits[0].source_doc_id is None  # no retrieval claims
+
+    def test_truncation_capped_and_flagged(self, tmp_path):
+        corpus = tmp_path / "docs"
+        _write_doc(corpus, "doc-big", "x" * (MAX_CONTEXT_CHARS + 1000))
+        provider = FullContextProvider()
+        provider.ingest(corpus, _config(tmp_path))
+
+        hits = provider.search("anything", 10, _config(tmp_path))
+
+        assert len(hits[0].text or "") <= MAX_CONTEXT_CHARS
+        assert hits[0].metadata["truncated"] is True
+
+    def test_cleanup_clears_context(self, tmp_path):
+        corpus = tmp_path / "docs"
+        _write_doc(corpus, "doc-a", "content")
+        provider = FullContextProvider()
+        provider.ingest(corpus, _config(tmp_path))
+        provider.cleanup(_config(tmp_path))
+        assert provider.search("anything", 10, _config(tmp_path)) == []
+
+    def test_factory_registration(self):
+        assert isinstance(create_provider("baseline-fullcontext"), FullContextProvider)