From be6df8d4ecfd1c0f13a71df5b778b567e58fca98 Mon Sep 17 00:00:00 2001
From: Drew Cain <groksrc@gmail.com>
Date: Fri, 12 Jun 2026 13:53:45 -0500
Subject: [PATCH] feat: filesystem-grep and full-context baseline providers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Published memory-system comparisons are only credible read against two
cheap floors:

- baseline-grep: deterministic TF-with-log-damping term matching over
  the raw corpus markdown — no index, no embeddings, no LLM. Squared
  coverage factor so a doc matching every distinct query term outranks
  a doc spamming one term. A memory system that can't beat this isn't
  adding retrieval value.
- baseline-fullcontext: no retrieval at all; the whole corpus is
  returned as a single hit (capped at 600K chars ≈ 150K tokens) for the
  QA stage to answer over. Published results repeatedly show
  full-context beating dedicated memory systems on corpora that fit a
  context window; QA accuracy and token cost should be read against
  this provider's. Makes no doc-id claims, so retrieval metrics are
  meaningless for it by design.

Both registered in the provider factory; 9 new tests; smoke-verified on
the synthetic corpus (grep recall@5 0.75; full-context recall 0 /
content-hit 0.75 as expected).

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
Signed-off-by: Drew Cain <groksrc@gmail.com>
---
 .../providers/__init__.py                     |  15 ++-
 .../providers/baseline_fullcontext.py         |  74 ++++++++++++
 .../providers/baseline_grep.py                |  97 +++++++++++++++
 tests/providers/test_baseline_providers.py    | 114 ++++++++++++++++++
 4 files changed, 299 insertions(+), 1 deletion(-)
 create mode 100644 src/basic_memory_benchmarks/providers/baseline_fullcontext.py
 create mode 100644 src/basic_memory_benchmarks/providers/baseline_grep.py
 create mode 100644 tests/providers/test_baseline_providers.py

diff --git a/src/basic_memory_benchmarks/providers/__init__.py b/src/basic_memory_benchmarks/providers/__init__.py
index 8592857..595eba6 100644
--- a/src/basic_memory_benchmarks/providers/__init__.py
+++ b/src/basic_memory_benchmarks/providers/__init__.py
@@ -3,6 +3,8 @@
 from __future__ import annotations
 
 from basic_memory_benchmarks.providers.base import BenchmarkProvider
+from basic_memory_benchmarks.providers.baseline_fullcontext import FullContextProvider
+from basic_memory_benchmarks.providers.baseline_grep import FilesystemGrepProvider
 from basic_memory_benchmarks.providers.bm_cloud import BasicMemoryCloudProvider
 from basic_memory_benchmarks.providers.bm_local import BasicMemoryLocalProvider
 from basic_memory_benchmarks.providers.mem0_local import Mem0LocalProvider
@@ -19,8 +21,19 @@ def create_provider(name: str) -> BenchmarkProvider:
         return Mem0LocalProvider()
     if normalized == "zep-reference":
         return ZepReferenceProvider()
+    if normalized == "baseline-grep":
+        return FilesystemGrepProvider()
+    if normalized == "baseline-fullcontext":
+        return FullContextProvider()
     raise ValueError(f"Unknown provider: {name}")
 
 
 def provider_names() -> list[str]:
-    return ["bm-local", "bm-cloud", "mem0-local", "zep-reference"]
+    return [
+        "bm-local",
+        "bm-cloud",
+        "mem0-local",
+        "zep-reference",
+        "baseline-grep",
+        "baseline-fullcontext",
+    ]
diff --git a/src/basic_memory_benchmarks/providers/baseline_fullcontext.py b/src/basic_memory_benchmarks/providers/baseline_fullcontext.py
new file mode 100644
index 0000000..e7794ed
--- /dev/null
+++ b/src/basic_memory_benchmarks/providers/baseline_fullcontext.py
@@ -0,0 +1,74 @@
+"""Full-context baseline provider.
+
+The other honesty floor: skip retrieval entirely and hand the answering model
+the whole corpus. On corpora that fit a modern context window (LoCoMo
+conversations, LongMemEval-S groups), published results repeatedly show
+full-context beating dedicated memory systems — so a memory system's QA
+accuracy should be read against this number, and its token cost against this
+provider's token cost.
+
+Retrieval metrics (recall/MRR) are meaningless for this provider by design:
+it returns the corpus as a single hit with no doc-id claims. Use it for the
+QA stage only.
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import frontmatter
+
+from basic_memory_benchmarks.models import RunConfig, SearchHit
+from basic_memory_benchmarks.providers.base import BenchmarkProvider
+
+# Cap the assembled context so a pathological corpus cannot blow past the
+# answering model's window. ~600K chars ≈ 150K tokens, inside a 200K window
+# with room for prompt scaffolding. LongMemEval-S groups (~115K tokens) fit.
+MAX_CONTEXT_CHARS = 600_000
+
+
+class FullContextProvider(BenchmarkProvider):
+    """Baseline that skips retrieval and returns the entire corpus as one hit."""
+
+    name = "baseline-fullcontext"
+
+    def __init__(self) -> None:
+        self._context: str = ""
+        self._truncated: bool = False
+
+    def ingest(self, corpus_path: Path, run_config: RunConfig) -> None:
+        """Join the stripped body of each ``*.md`` file, capped at MAX_CONTEXT_CHARS."""
+        _ = run_config
+        bodies: list[str] = []
+        # sorted() keeps the concatenation order deterministic across runs.
+        for md_file in sorted(corpus_path.rglob("*.md")):
+            with md_file.open("r", encoding="utf-8") as fh:
+                bodies.append(frontmatter.load(fh).content.strip())
+        joined = "\n\n---\n\n".join(bodies)
+        self._context = joined[:MAX_CONTEXT_CHARS]
+        self._truncated = len(joined) > MAX_CONTEXT_CHARS
+
+    def search(self, query: str, limit: int, run_config: RunConfig) -> list[SearchHit]:
+        """Return the corpus as one hit, or [] if it is empty; query/limit are ignored."""
+        _ = (query, limit, run_config)
+        if not self._context:
+            return []
+        # No doc id or path is attached, so this hit makes no retrieval claims.
+        whole_corpus = SearchHit(
+            id="full-context",
+            source_doc_id=None,
+            source_path=None,
+            text=self._context,
+            score=1.0,
+            metadata={"provider": self.name, "truncated": self._truncated},
+        )
+        return [whole_corpus]
+
+    def cleanup(self, run_config: RunConfig) -> None:
+        """Drop the assembled context and reset the truncation flag."""
+        _ = run_config
+        self._context, self._truncated = "", False
+
+    def version_info(self) -> dict[str, str]:
+        """Report the baseline name and the character cap as strings."""
+        return {"baseline": "full-context", "max_context_chars": str(MAX_CONTEXT_CHARS)}
diff --git a/src/basic_memory_benchmarks/providers/baseline_grep.py b/src/basic_memory_benchmarks/providers/baseline_grep.py
new file mode 100644
index 0000000..6c36177
--- /dev/null
+++ b/src/basic_memory_benchmarks/providers/baseline_grep.py
@@ -0,0 +1,97 @@
+"""Filesystem grep baseline provider.
+
+The honesty floor for retrieval: case-insensitive substring matching of query
+terms over the corpus markdown bodies; TF-style scoring, no index/embeddings/LLM.
+A memory system that can't beat this isn't adding retrieval value. (Letta's
+"filesystem + grep agent" scored 74% on LoCoMo; this is the non-agentic
+deterministic analogue.)
+"""
+
+from __future__ import annotations
+
+import math
+import re
+from pathlib import Path
+
+import frontmatter
+
+from basic_memory_benchmarks.models import RunConfig, SearchHit
+from basic_memory_benchmarks.providers.base import BenchmarkProvider
+
+_TOKEN_PATTERN = re.compile(r"[a-z0-9][a-z0-9'\-]*")
+
+# Minimal English stopword set so query scoring keys on content-bearing terms.
+_STOPWORDS = frozenset(
+    "a an and are as at be but by did do does for from had has have how i in is it of on or "
+    "that the their they this to was we were what when where which who whom whose why will "
+    "with you your".split()
+)
+
+
+def _tokenize(text: str) -> list[str]:
+    return [found.group() for found in _TOKEN_PATTERN.finditer(text.lower())]
+
+
+class FilesystemGrepProvider(BenchmarkProvider):
+    """Deterministic grep-style ranking over corpus files."""
+
+    name = "baseline-grep"
+
+    def __init__(self) -> None:
+        self._docs: list[tuple[str, str, str]] = []  # (doc_id, rel_path, body)
+
+    def ingest(self, corpus_path: Path, run_config: RunConfig) -> None:
+        """Load every ``*.md`` note under ``corpus_path`` into memory, in sorted order."""
+        _ = run_config
+        self._docs = []
+        for note_path in sorted(corpus_path.rglob("*.md")):
+            with note_path.open("r", encoding="utf-8") as handle:
+                parsed = frontmatter.load(handle)
+            doc_id = str(parsed.get("source_doc_id") or note_path.stem)
+            rel_path = note_path.relative_to(corpus_path).as_posix()
+            self._docs.append((doc_id, rel_path, parsed.content))
+
+    def search(self, query: str, limit: int, run_config: RunConfig) -> list[SearchHit]:
+        """Rank docs by log-damped substring counts of the distinct query terms.
+
+        Returns at most ``limit`` hits (none for a non-positive limit or a query
+        with no content terms); ties keep ingest order because the sort is stable.
+        """
+        _ = run_config
+        # Dedupe (order-preserving) so a repeated query word cannot inflate the TF
+        # sum or the coverage denominator: both are defined over distinct terms.
+        terms = list(dict.fromkeys(t for t in _tokenize(query) if t not in _STOPWORDS))
+        if not terms:
+            return []
+
+        scored: list[tuple[float, str, str, str]] = []
+        for doc_id, rel_path, body in self._docs:
+            body_lower = body.lower()
+            counts = [n for n in map(body_lower.count, terms) if n]
+            if counts:
+                # Squared coverage: a doc matching every distinct query term
+                # must outrank a doc spamming one term many times.
+                tf = sum(1.0 + math.log(n) for n in counts)
+                scored.append((tf * (len(counts) / len(terms)) ** 2, doc_id, rel_path, body))
+
+        scored.sort(key=lambda item: item[0], reverse=True)
+        return [
+            SearchHit(
+                id=doc_id,
+                source_doc_id=doc_id,
+                source_path=rel_path,
+                text=body[:2000],
+                score=score,
+                metadata={"provider": self.name},
+            )
+            for score, doc_id, rel_path, body in scored[: max(limit, 0)]
+        ]
+
+    def cleanup(self, run_config: RunConfig) -> None:
+        """Release the in-memory corpus."""
+        _ = run_config
+        self._docs = []
+
+    def version_info(self) -> dict[str, str]:
+        """Return static identifiers describing this baseline."""
+        return {"baseline": "filesystem-grep", "index": "none"}
diff --git a/tests/providers/test_baseline_providers.py b/tests/providers/test_baseline_providers.py
new file mode 100644
index 0000000..4cd8e38
--- /dev/null
+++ b/tests/providers/test_baseline_providers.py
@@ -0,0 +1,114 @@
+"""Tests for the grep and full-context baseline providers."""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+from basic_memory_benchmarks.models import RunConfig
+from basic_memory_benchmarks.providers import create_provider
+from basic_memory_benchmarks.providers.baseline_fullcontext import (
+    MAX_CONTEXT_CHARS,
+    FullContextProvider,
+)
+from basic_memory_benchmarks.providers.baseline_grep import FilesystemGrepProvider
+
+
+def _write_doc(corpus: Path, doc_id: str, body: str) -> None:
+    """Write ``<doc_id>.md`` with title/source_doc_id frontmatter into ``corpus``."""
+    corpus.mkdir(parents=True, exist_ok=True)
+    note_text = f"---\ntitle: {doc_id}\nsource_doc_id: {doc_id}\n---\n\n{body}\n"
+    target = corpus / f"{doc_id}.md"
+    target.write_text(note_text, encoding="utf-8")
+
+
+def _config(tmp_path: Path) -> RunConfig:
+    """Return a RunConfig pointing ``corpus_dir`` at ``tmp_path``; other passed fields are "t"."""
+    # The four remaining keyword arguments all share one placeholder value.
+    kwargs = {
+        name: "t" for name in ("run_id", "dataset_id", "dataset_path", "queries_path")
+    }
+    kwargs["corpus_dir"] = str(tmp_path)
+    return RunConfig(**kwargs)
+
+
+class TestFilesystemGrep:
+    def test_ranks_matching_doc_first(self, tmp_path):
+        cfg, corpus = _config(tmp_path), tmp_path / "docs"
+        _write_doc(corpus, "doc-a", "Joanna moved to Austin and loves the food scene in Austin.")
+        _write_doc(corpus, "doc-b", "Anthony trains for marathons every morning.")
+        grep = FilesystemGrepProvider()
+        grep.ingest(corpus, cfg)
+
+        top = grep.search("Where does Joanna live in Austin?", 5, cfg)[0]
+
+        assert top.source_doc_id == "doc-a"
+        assert top.score > 0
+        assert "Joanna" in (top.text or "")
+
+    def test_coverage_beats_repetition(self, tmp_path):
+        cfg, corpus = _config(tmp_path), tmp_path / "docs"
+        # One doc repeats a single query term; the other contains both terms once.
+        _write_doc(corpus, "doc-spam", "dentist " * 30)
+        _write_doc(corpus, "doc-both", "I visited the dentist in November.")
+        grep = FilesystemGrepProvider()
+        grep.ingest(corpus, cfg)
+
+        ranked_ids = [hit.source_doc_id for hit in grep.search("dentist November", 5, cfg)]
+        assert ranked_ids[0] == "doc-both"
+
+    def test_no_content_terms_returns_empty(self, tmp_path):
+        cfg, corpus = _config(tmp_path), tmp_path / "docs"
+        _write_doc(corpus, "doc-a", "anything")
+        grep = FilesystemGrepProvider()
+        grep.ingest(corpus, cfg)
+        assert grep.search("what did they do", 5, cfg) == []
+
+    def test_limit_respected(self, tmp_path):
+        cfg, corpus = _config(tmp_path), tmp_path / "docs"
+        # Ten docs match, so only the limit can explain getting exactly three hits.
+        for n in range(10):
+            _write_doc(corpus, f"doc-{n}", "Austin is great.")
+        grep = FilesystemGrepProvider()
+        grep.ingest(corpus, cfg)
+        assert len(grep.search("Austin", 3, cfg)) == 3
+
+    def test_factory_registration(self):
+        assert isinstance(create_provider("baseline-grep"), FilesystemGrepProvider)
+
+
+class TestFullContext:
+    def test_returns_whole_corpus_as_single_hit(self, tmp_path):
+        cfg, corpus = _config(tmp_path), tmp_path / "docs"
+        _write_doc(corpus, "doc-a", "Joanna moved to Austin.")
+        _write_doc(corpus, "doc-b", "Anthony runs marathons.")
+        full = FullContextProvider()
+        full.ingest(corpus, cfg)
+
+        hits = full.search("anything", 10, cfg)
+
+        assert len(hits) == 1
+        text = hits[0].text or ""
+        assert "Joanna moved to Austin." in text
+        assert "Anthony runs marathons." in text
+        assert hits[0].source_doc_id is None  # no retrieval claims
+
+    def test_truncation_capped_and_flagged(self, tmp_path):
+        cfg, corpus = _config(tmp_path), tmp_path / "docs"
+        _write_doc(corpus, "doc-big", "x" * (MAX_CONTEXT_CHARS + 1000))
+        full = FullContextProvider()
+        full.ingest(corpus, cfg)
+
+        hit = full.search("anything", 10, cfg)[0]
+        assert len(hit.text or "") <= MAX_CONTEXT_CHARS
+        assert hit.metadata["truncated"] is True
+
+    def test_cleanup_clears_context(self, tmp_path):
+        cfg, corpus = _config(tmp_path), tmp_path / "docs"
+        _write_doc(corpus, "doc-a", "content")
+        full = FullContextProvider()
+        full.ingest(corpus, cfg)
+        full.cleanup(cfg)
+        assert full.search("anything", 10, cfg) == []
+
+    def test_factory_registration(self):
+        assert isinstance(create_provider("baseline-fullcontext"), FullContextProvider)