Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 14 additions & 1 deletion src/basic_memory_benchmarks/providers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
from __future__ import annotations

from basic_memory_benchmarks.providers.base import BenchmarkProvider
from basic_memory_benchmarks.providers.baseline_fullcontext import FullContextProvider
from basic_memory_benchmarks.providers.baseline_grep import FilesystemGrepProvider
from basic_memory_benchmarks.providers.bm_cloud import BasicMemoryCloudProvider
from basic_memory_benchmarks.providers.bm_local import BasicMemoryLocalProvider
from basic_memory_benchmarks.providers.mem0_local import Mem0LocalProvider
Expand All @@ -19,8 +21,19 @@ def create_provider(name: str) -> BenchmarkProvider:
return Mem0LocalProvider()
if normalized == "zep-reference":
return ZepReferenceProvider()
if normalized == "baseline-grep":
return FilesystemGrepProvider()
if normalized == "baseline-fullcontext":
return FullContextProvider()
raise ValueError(f"Unknown provider: {name}")


def provider_names() -> list[str]:
    """Return every provider name accepted by ``create_provider``.

    The list must stay in sync with the factory: the two baseline providers
    are included so callers that enumerate providers (CLI choices, "run all"
    loops) can reach them. A stale single-line ``return`` holding only the
    four original names previously shadowed this list.
    """
    return [
        "bm-local",
        "bm-cloud",
        "mem0-local",
        "zep-reference",
        "baseline-grep",
        "baseline-fullcontext",
    ]
74 changes: 74 additions & 0 deletions src/basic_memory_benchmarks/providers/baseline_fullcontext.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
"""Full-context baseline provider.

The other honesty floor: skip retrieval entirely and hand the answering model
the whole corpus. On corpora that fit a modern context window (LoCoMo
conversations, LongMemEval-S groups), published results repeatedly show
full-context beating dedicated memory systems — so a memory system's QA
accuracy should be read against this number, and its token cost against this
provider's token cost.

Retrieval metrics (recall/MRR) are meaningless for this provider by design:
it returns the corpus as a single hit with no doc-id claims. Use it for the
QA stage only.
"""

from __future__ import annotations

from pathlib import Path

import frontmatter

from basic_memory_benchmarks.models import RunConfig, SearchHit
from basic_memory_benchmarks.providers.base import BenchmarkProvider

# Cap the assembled context so a pathological corpus cannot blow past the
# answering model's window. ~600K chars ≈ 150K tokens, inside a 200K window
# with room for prompt scaffolding. LongMemEval-S groups (~115K tokens) fit.
MAX_CONTEXT_CHARS = 600_000


class FullContextProvider(BenchmarkProvider):
    """Baseline that performs no retrieval and answers from the entire corpus.

    ``ingest`` concatenates every markdown note body into one string (capped
    at ``MAX_CONTEXT_CHARS``); ``search`` ignores the query and returns that
    string as a single hit carrying no document-id claim.
    """

    name = "baseline-fullcontext"

    def __init__(self) -> None:
        self._context: str = ""
        self._truncated: bool = False

    def ingest(self, corpus_path: Path, run_config: RunConfig) -> None:
        """Read all ``*.md`` files under *corpus_path* and assemble the context.

        Files are visited in sorted path order so the assembled text is
        deterministic. Frontmatter is dropped; only the stripped body of each
        note is kept. *run_config* is unused.
        """
        del run_config
        bodies: list[str] = []
        for md_file in sorted(corpus_path.rglob("*.md")):
            with md_file.open("r", encoding="utf-8") as fh:
                bodies.append(frontmatter.load(fh).content.strip())
        corpus_text = "\n\n---\n\n".join(bodies)
        over_budget = len(corpus_text) > MAX_CONTEXT_CHARS
        # Remember whether the cap cut anything so search() can flag it.
        self._context = corpus_text[:MAX_CONTEXT_CHARS] if over_budget else corpus_text
        self._truncated = over_budget

    def search(self, query: str, limit: int, run_config: RunConfig) -> list[SearchHit]:
        """Return the whole ingested context as one hit, or ``[]`` if empty.

        All three arguments are ignored: this provider does not rank.
        """
        del query, limit, run_config
        if self._context:
            whole_corpus = SearchHit(
                id="full-context",
                source_doc_id=None,
                source_path=None,
                text=self._context,
                score=1.0,
                metadata={"provider": self.name, "truncated": self._truncated},
            )
            return [whole_corpus]
        return []

    def cleanup(self, run_config: RunConfig) -> None:
        """Drop the ingested context and reset the truncation flag."""
        del run_config
        self._truncated = False
        self._context = ""

    def version_info(self) -> dict[str, str]:
        """Describe this baseline and the character cap it applies."""
        info: dict[str, str] = {}
        info["baseline"] = "full-context"
        info["max_context_chars"] = str(MAX_CONTEXT_CHARS)
        return info
97 changes: 97 additions & 0 deletions src/basic_memory_benchmarks/providers/baseline_grep.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
"""Filesystem grep baseline provider.

The honesty floor for retrieval: case-insensitive term matching over the raw
corpus markdown with TF-style scoring and no index, no embeddings, no LLM.
A memory system that can't beat this isn't adding retrieval value. (Letta's
"filesystem + grep agent" scored 74% on LoCoMo; this is the non-agentic
deterministic analogue.)
"""

from __future__ import annotations

import math
import re
from pathlib import Path

import frontmatter

from basic_memory_benchmarks.models import RunConfig, SearchHit
from basic_memory_benchmarks.providers.base import BenchmarkProvider

# A token starts with an ASCII letter or digit and may continue with letters,
# digits, apostrophes, or hyphens (input is lower-cased before matching).
_TOKEN_PATTERN = re.compile(r"[a-z0-9][a-z0-9'\-]*")

# Small English stopword list: these tokens are dropped from queries so that
# scoring is driven by the content-bearing terms only.
_STOPWORDS = frozenset(
    "a an and are as at be but by did do does for from had has have how i in is it of on or "
    "that the their they this to was we were what when where which who whom whose why will "
    "with you your".split()
)


def _tokenize(text: str) -> list[str]:
    """Lower-case *text* and return its tokens in order of appearance."""
    lowered = text.lower()
    return [found.group(0) for found in _TOKEN_PATTERN.finditer(lowered)]


class FilesystemGrepProvider(BenchmarkProvider):
    """Deterministic grep-style ranking over corpus files.

    Documents are held in memory as raw markdown bodies; each query is scored
    by case-insensitive substring counts of its non-stopword terms. There is
    no index, no embedding, and no model call.
    """

    name = "baseline-grep"

    def __init__(self) -> None:
        self._docs: list[tuple[str, str, str]] = []  # (doc_id, rel_path, body)

    def ingest(self, corpus_path: Path, run_config: RunConfig) -> None:
        """Load every ``*.md`` note under *corpus_path*, replacing prior state.

        The document id is the ``source_doc_id`` frontmatter value when it is
        present and truthy, otherwise the file stem. Files are visited in
        sorted path order, which also fixes the tie-break order in ``search``.
        *run_config* is unused.
        """
        _ = run_config
        self._docs = []
        for note_path in sorted(corpus_path.rglob("*.md")):
            with note_path.open("r", encoding="utf-8") as handle:
                parsed = frontmatter.load(handle)
            doc_id = str(parsed.get("source_doc_id") or note_path.stem)
            rel_path = note_path.relative_to(corpus_path).as_posix()
            self._docs.append((doc_id, rel_path, parsed.content))

    def search(self, query: str, limit: int, run_config: RunConfig) -> list[SearchHit]:
        """Return up to *limit* documents ranked by term-match score.

        Args:
            query: Free-text question; tokenized, stopwords removed, and
                repeated terms collapsed so each distinct term counts once.
            limit: Maximum number of hits. Zero or negative yields ``[]``.
            run_config: Unused.

        Returns:
            Hits in descending score order; documents matching no term are
            omitted. Each hit's text is the first 2000 characters of the body.
        """
        _ = run_config
        if limit <= 0:
            # A negative limit would otherwise slice from the end of the list.
            return []

        # dict.fromkeys de-duplicates while preserving first-seen order, so
        # "distinct query terms" below is literally true. Without this, a term
        # repeated in the query inflates both the TF sum and the coverage
        # ratio, letting a one-term spam doc outrank a full-coverage doc.
        terms = list(dict.fromkeys(t for t in _tokenize(query) if t not in _STOPWORDS))
        if not terms:
            return []

        scored: list[tuple[float, str, str, str]] = []
        for doc_id, rel_path, body in self._docs:
            body_lower = body.lower()
            # TF with log damping per term; coverage bonus rewards docs
            # matching more distinct query terms over many hits of one term.
            matched = 0
            score = 0.0
            for term in terms:
                count = body_lower.count(term)
                if count:
                    matched += 1
                    score += 1.0 + math.log(count)
            if matched:
                # Squared coverage: a doc matching every distinct query term
                # should outrank a doc spamming one term many times.
                score *= (matched / len(terms)) ** 2
                scored.append((score, doc_id, rel_path, body))

        # Stable sort: equal scores keep ingest (sorted-path) order.
        scored.sort(key=lambda item: item[0], reverse=True)
        return [
            SearchHit(
                id=doc_id,
                source_doc_id=doc_id,
                source_path=rel_path,
                text=body[:2000],
                score=score,
                metadata={"provider": self.name},
            )
            for score, doc_id, rel_path, body in scored[:limit]
        ]

    def cleanup(self, run_config: RunConfig) -> None:
        """Discard all ingested documents."""
        _ = run_config
        self._docs = []

    def version_info(self) -> dict[str, str]:
        """Describe this baseline; it keeps no index."""
        return {"baseline": "filesystem-grep", "index": "none"}
114 changes: 114 additions & 0 deletions tests/providers/test_baseline_providers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
"""Tests for the grep and full-context baseline providers."""

from __future__ import annotations

from pathlib import Path

from basic_memory_benchmarks.models import RunConfig
from basic_memory_benchmarks.providers import create_provider
from basic_memory_benchmarks.providers.baseline_fullcontext import (
MAX_CONTEXT_CHARS,
FullContextProvider,
)
from basic_memory_benchmarks.providers.baseline_grep import FilesystemGrepProvider


def _write_doc(corpus: Path, doc_id: str, body: str) -> None:
    """Create ``<corpus>/<doc_id>.md`` with title/source_doc_id frontmatter.

    The corpus directory (and any missing parents) is created on demand.
    """
    corpus.mkdir(parents=True, exist_ok=True)
    note_text = f"---\ntitle: {doc_id}\nsource_doc_id: {doc_id}\n---\n\n{body}\n"
    target = corpus / f"{doc_id}.md"
    target.write_text(note_text, encoding="utf-8")


def _config(tmp_path: Path) -> RunConfig:
    """Build a throwaway RunConfig whose corpus_dir is *tmp_path*.

    Every other field is the placeholder "t".
    """
    placeholder = "t"
    settings = {
        "run_id": placeholder,
        "dataset_id": placeholder,
        "dataset_path": placeholder,
        "corpus_dir": str(tmp_path),
        "queries_path": placeholder,
    }
    return RunConfig(**settings)


class TestFilesystemGrep:
    """Ranking, limit, and registration checks for the grep baseline."""

    def test_ranks_matching_doc_first(self, tmp_path):
        docs_dir = tmp_path / "docs"
        notes = {
            "doc-a": "Joanna moved to Austin and loves the food scene in Austin.",
            "doc-b": "Anthony trains for marathons every morning.",
        }
        for note_id, note_body in notes.items():
            _write_doc(docs_dir, note_id, note_body)
        grep = FilesystemGrepProvider()
        grep.ingest(docs_dir, _config(tmp_path))

        results = grep.search("Where does Joanna live in Austin?", 5, _config(tmp_path))

        best = results[0]
        assert best.source_doc_id == "doc-a"
        assert best.score > 0
        assert "Joanna" in (best.text or "")

    def test_coverage_beats_repetition(self, tmp_path):
        docs_dir = tmp_path / "docs"
        # One note repeats a single query term; the other contains both terms.
        _write_doc(docs_dir, "doc-spam", "dentist " * 30)
        _write_doc(docs_dir, "doc-both", "I visited the dentist in November.")
        grep = FilesystemGrepProvider()
        grep.ingest(docs_dir, _config(tmp_path))

        results = grep.search("dentist November", 5, _config(tmp_path))

        assert results[0].source_doc_id == "doc-both"

    def test_no_content_terms_returns_empty(self, tmp_path):
        docs_dir = tmp_path / "docs"
        _write_doc(docs_dir, "doc-a", "anything")
        grep = FilesystemGrepProvider()
        grep.ingest(docs_dir, _config(tmp_path))

        # The query consists solely of stopwords.
        results = grep.search("what did they do", 5, _config(tmp_path))

        assert results == []

    def test_limit_respected(self, tmp_path):
        docs_dir = tmp_path / "docs"
        for index in range(10):
            _write_doc(docs_dir, f"doc-{index}", "Austin is great.")
        grep = FilesystemGrepProvider()
        grep.ingest(docs_dir, _config(tmp_path))

        results = grep.search("Austin", 3, _config(tmp_path))

        assert len(results) == 3

    def test_factory_registration(self):
        built = create_provider("baseline-grep")
        assert isinstance(built, FilesystemGrepProvider)


class TestFullContext:
    """Single-hit, truncation, cleanup, and registration checks."""

    def test_returns_whole_corpus_as_single_hit(self, tmp_path):
        docs_dir = tmp_path / "docs"
        sentences = {
            "doc-a": "Joanna moved to Austin.",
            "doc-b": "Anthony runs marathons.",
        }
        for note_id, sentence in sentences.items():
            _write_doc(docs_dir, note_id, sentence)
        full = FullContextProvider()
        full.ingest(docs_dir, _config(tmp_path))

        results = full.search("anything", 10, _config(tmp_path))

        assert len(results) == 1
        only_hit = results[0]
        assert "Joanna moved to Austin." in (only_hit.text or "")
        assert "Anthony runs marathons." in (only_hit.text or "")
        assert only_hit.source_doc_id is None  # no retrieval claims

    def test_truncation_capped_and_flagged(self, tmp_path):
        docs_dir = tmp_path / "docs"
        oversized_body = "x" * (MAX_CONTEXT_CHARS + 1000)
        _write_doc(docs_dir, "doc-big", oversized_body)
        full = FullContextProvider()
        full.ingest(docs_dir, _config(tmp_path))

        results = full.search("anything", 10, _config(tmp_path))

        first = results[0]
        assert len(first.text or "") <= MAX_CONTEXT_CHARS
        assert first.metadata["truncated"] is True

    def test_cleanup_clears_context(self, tmp_path):
        docs_dir = tmp_path / "docs"
        _write_doc(docs_dir, "doc-a", "content")
        full = FullContextProvider()
        full.ingest(docs_dir, _config(tmp_path))
        full.cleanup(_config(tmp_path))

        results = full.search("anything", 10, _config(tmp_path))

        assert results == []

    def test_factory_registration(self):
        built = create_provider("baseline-fullcontext")
        assert isinstance(built, FullContextProvider)
Loading