Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -20,3 +20,4 @@ benchmarks/datasets/longmemeval/longmemeval_s.json
benchmarks/datasets/longmemeval/longmemeval_s.provenance.json
benchmarks/datasets/locomo-audit/
benchmarks/.mem0-qdrant/
benchmarks/datasets/convomem/
25 changes: 25 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,31 @@ session-id prefix and per-turn `has_answer` flags. The converter remaps all
session ids to neutral positional ids (`<qid>-s012`) and drops turn flags, so
ingested corpora carry no evidence markers.

## ConvoMem (sampled)

ConvoMem (Salesforce, Apache-2.0) ships ~75K QA pairs as pre-mixed test cases
— each a self-contained haystack of conversations plus questions — which map
1:1 onto grouped mode. The full dataset is multi-GB, so fetching is selective:
batch files are indexed with cheap HTTP Range tail-probes and only files
matching the requested context sizes are downloaded. The probe index
(`index.json`) records every file including the ones not downloaded, so the
selection is auditable.

```bash
uv run bm-bench datasets fetch --dataset convomem --context-sizes 10,30
uv run bm-bench convert convomem --sample-per-stratum 25 --seed 42
```

Sampling is stratified by (category, contextSize) with a fixed seed;
`sampling.json` records the seed, per-stratum population, and sample counts,
so a published number can state exactly which slice of ConvoMem it covers. Note
that `--sample-per-stratum` counts *cases* (haystacks); larger-context cases
carry multiple questions each, all sharing one ingested group corpus.

Anti-leakage: raw conversations carry `containsEvidence`/`model_name` fields;
rendered docs include neither and conversation ids are remapped to neutral
positional ids.

## Basic Memory source policy

By default this project tracks Basic Memory from `main`.
Expand Down
8 changes: 8 additions & 0 deletions justfile
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,14 @@ bench-convert-longmemeval-dev:

bench-prepare-longmemeval: bench-fetch-longmemeval bench-convert-longmemeval

bench-fetch-convomem:
uv run bm-bench datasets fetch --dataset convomem --context-sizes 10,30

bench-convert-convomem:
uv run bm-bench convert convomem --sample-per-stratum 25 --seed 42

bench-prepare-convomem: bench-fetch-convomem bench-convert-convomem

bench-fetch-locomo-audit:
uv run bm-bench datasets fetch --dataset locomo-audit

Expand Down
34 changes: 33 additions & 1 deletion src/basic_memory_benchmarks/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
from basic_memory_benchmarks.converters.locomo_to_corpus import convert_locomo_to_corpus
from basic_memory_benchmarks.converters.longmemeval_to_corpus import convert_longmemeval_to_corpus
from basic_memory_benchmarks.datasets.locomo import LOCOMO_URL, fetch_locomo_dataset
from basic_memory_benchmarks.converters.convomem_to_corpus import convert_convomem_to_corpus
from basic_memory_benchmarks.datasets.convomem import fetch_convomem_batches
from basic_memory_benchmarks.datasets.locomo_audit import fetch_locomo_audit_corrections
from basic_memory_benchmarks.datasets.longmemeval import (
LONGMEMEVAL_S_URL,
Expand Down Expand Up @@ -42,6 +44,9 @@ def datasets_fetch(
dataset: str = typer.Option("locomo", "--dataset"),
output: Path | None = typer.Option(None, "--output"),
url: str | None = typer.Option(None, "--url"),
context_sizes: str = typer.Option(
"10,30", "--context-sizes", help="convomem only: batch context sizes to download"
),
) -> None:
if dataset == "locomo":
resolved_output = output or Path("benchmarks/datasets/locomo/locomo10.json")
Expand All @@ -54,8 +59,14 @@ def datasets_fetch(
elif dataset == "locomo-audit":
resolved_output = output or Path("benchmarks/datasets/locomo-audit/corrections.json")
provenance = fetch_locomo_audit_corrections(output_path=resolved_output)
elif dataset == "convomem":
resolved_output = output or Path("benchmarks/datasets/convomem")
sizes = tuple(int(s.strip()) for s in context_sizes.split(",") if s.strip())
provenance = fetch_convomem_batches(output_dir=resolved_output, context_sizes=sizes)
else:
raise typer.BadParameter("Supported datasets: locomo, longmemeval-s, locomo-audit")
raise typer.BadParameter(
"Supported datasets: locomo, longmemeval-s, locomo-audit, convomem"
)

console.print(f"Downloaded {dataset} to [cyan]{resolved_output}[/cyan]")
console.print(f"SHA256: [green]{provenance.checksum_sha256}[/green]")
Expand Down Expand Up @@ -101,6 +112,27 @@ def convert_longmemeval(
console.print(f"Queries: [cyan]{queries_path}[/cyan] ({query_count})")


@convert_app.command("convomem")
def convert_convomem(
    batches_dir: Path = typer.Option(Path("benchmarks/datasets/convomem"), "--batches-dir"),
    output_dir: Path = typer.Option(Path("benchmarks/generated/convomem"), "--output-dir"),
    sample_per_stratum: int = typer.Option(25, "--sample-per-stratum"),
    seed: int = typer.Option(42, "--seed"),
    context_sizes: str = typer.Option("10,30", "--context-sizes"),
) -> None:
    """Convert downloaded ConvoMem batches into a sampled, grouped corpus.

    Parses ``--context-sizes`` as a comma-separated list of integers, runs
    ``convert_convomem_to_corpus`` and prints where the groups, the queries
    file and the sampling manifest were written.

    Raises:
        typer.BadParameter: if ``--context-sizes`` contains a non-integer
            token or no sizes at all.
    """
    # Report malformed input as a usage error rather than letting the bare
    # ValueError from int() escape as a traceback.
    try:
        sizes = tuple(
            int(token) for token in map(str.strip, context_sizes.split(",")) if token
        )
    except ValueError as err:
        raise typer.BadParameter(
            f"expected comma-separated integers, got {context_sizes!r}",
            param_hint="--context-sizes",
        ) from err
    # An empty tuple would make the converter filter out every case and fail
    # with a less helpful "no cases matched" error, so reject it up front.
    if not sizes:
        raise typer.BadParameter(
            "at least one context size is required",
            param_hint="--context-sizes",
        )

    groups_dir, queries_path, doc_count, query_count = convert_convomem_to_corpus(
        batches_dir=batches_dir,
        output_dir=output_dir,
        sample_per_stratum=sample_per_stratum,
        seed=seed,
        context_sizes=sizes,
    )
    console.print(f"Groups: [cyan]{groups_dir}[/cyan] ({doc_count} docs)")
    console.print(f"Queries: [cyan]{queries_path}[/cyan] ({query_count})")
    console.print(f"Sampling manifest: [cyan]{output_dir / 'sampling.json'}[/cyan]")


@run_app.command("retrieval")
def run_retrieval_command(
providers: str = typer.Option("bm-local,mem0-local", "--providers"),
Expand Down
160 changes: 160 additions & 0 deletions src/basic_memory_benchmarks/converters/convomem_to_corpus.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
"""Convert sampled ConvoMem pre-mixed test cases into grouped benchmark corpora.

Sampling is stratified by (category, contextSize) with a fixed seed, and the
exact sample composition is written to ``sampling.json`` so a published number
can state precisely which slice of ConvoMem it covers.

Anti-leakage: conversations carry ``containsEvidence`` and ``model_name``
fields in the raw data; rendered docs include neither, and conversation ids
are remapped to neutral positional ids.
"""

from __future__ import annotations

import json
import random
from pathlib import Path

from basic_memory_benchmarks.datasets.convomem import load_convomem_batches

DATASET_ID = "convomem"

# Directory names -> benchmark category labels used in reports.
CATEGORY_LABELS: dict[str, str] = {
"user_evidence": "user_facts",
"assistant_facts_evidence": "assistant_facts",
"changing_evidence": "knowledge_update",
"abstention_evidence": "abstention",
"preference_evidence": "preference",
"implicit_connection_evidence": "implicit_connection",
}


def _render_conversation_doc(doc_id: str, messages: list[dict]) -> str:
lines: list[str] = [
"---",
f"title: {doc_id}",
"type: note",
f"source_doc_id: {doc_id}",
f"dataset_id: {DATASET_ID}",
"---",
"",
f"# {doc_id}",
"",
"## Conversation",
]
for message in messages:
speaker = str(message.get("speaker", "unknown")).capitalize()
text = str(message.get("text", "")).strip()
if not text:
continue
lines.append(f"- **{speaker}:** {' '.join(text.split())}")
return "\n".join(lines).rstrip() + "\n"


def _as_text(value: object) -> str:
    """Return ``str(value)``, mapping ``None`` (JSON null) to an empty string."""
    return "" if value is None else str(value)


def convert_convomem_to_corpus(
    batches_dir: Path,
    output_dir: Path,
    sample_per_stratum: int = 25,
    seed: int = 42,
    context_sizes: tuple[int, ...] | None = None,
) -> tuple[Path, Path, int, int]:
    """Sample cases per (category, contextSize) stratum and emit grouped corpora.

    Each sampled case becomes one group directory under ``output_dir/groups``
    holding one rendered doc per conversation; each of the case's evidence
    items becomes one query. ``queries.json`` and ``sampling.json`` are
    written to ``output_dir``.

    Args:
        batches_dir: Directory passed to ``load_convomem_batches``.
        output_dir: Root directory for the generated corpus.
        sample_per_stratum: Maximum number of cases drawn from each stratum;
            a stratum with fewer cases is taken whole.
        seed: Seed for the sampling RNG.
        context_sizes: If given, only cases whose ``contextSize`` is in this
            tuple are considered; ``None`` keeps every case.

    Returns:
        groups_dir, queries_path, doc_count, query_count

    Raises:
        ValueError: if ``sample_per_stratum`` is negative, or if no case
            matches ``context_sizes``.
    """
    # Fail before touching the filesystem; random.sample would otherwise
    # raise a less specific ValueError after directories were created.
    if sample_per_stratum < 0:
        raise ValueError(f"sample_per_stratum must be >= 0, got {sample_per_stratum}")

    strata: dict[tuple[str, int], list[tuple[str, int, dict]]] = {}
    for category, file_name, cases in load_convomem_batches(batches_dir):
        for case_index, case in enumerate(cases):
            context_size = int(case.get("contextSize") or 0)
            if context_sizes is not None and context_size not in context_sizes:
                continue
            strata.setdefault((category, context_size), []).append((file_name, case_index, case))

    if not strata:
        raise ValueError(
            f"No ConvoMem cases matched context sizes {context_sizes} in {batches_dir}"
        )

    # NOTE(review): existing contents of groups_dir are left in place, so
    # groups from an earlier run with a different seed/sizes are not removed.
    groups_dir = output_dir / "groups"
    groups_dir.mkdir(parents=True, exist_ok=True)

    rng = random.Random(seed)
    all_queries: list[dict] = []
    sampling_manifest: dict[str, dict] = {}
    doc_count = 0

    for (category, context_size), members in sorted(strata.items()):
        sample_size = min(sample_per_stratum, len(members))
        # Sort first so sampling is deterministic regardless of load order.
        members.sort(key=lambda member: (member[0], member[1]))
        sampled = rng.sample(members, sample_size)
        label = CATEGORY_LABELS.get(category, category)
        sampling_manifest[f"{label}/cs{context_size}"] = {
            "population": len(members),
            "sampled": sample_size,
        }

        for file_name, case_index, case in sorted(sampled, key=lambda m: (m[0], m[1])):
            batch_tag = file_name.rsplit("__", 1)[-1].removesuffix(".json")
            group_id = f"{label}-cs{context_size}-{batch_tag}-{case_index:04d}"
            docs_dir = groups_dir / group_id / "docs"
            docs_dir.mkdir(parents=True, exist_ok=True)

            # Raw conversation ids are replaced by neutral positional doc ids.
            doc_id_by_conversation_id: dict[str, str] = {}
            for conv_index, conversation in enumerate(case.get("conversations") or []):
                doc_id = f"{group_id}-c{conv_index:03d}"
                raw_id = str(conversation.get("id") or f"conv-{conv_index}")
                doc_id_by_conversation_id[raw_id] = doc_id
                (docs_dir / f"{doc_id}.md").write_text(
                    _render_conversation_doc(doc_id, conversation.get("messages") or []),
                    encoding="utf-8",
                )
                doc_count += 1

            for query_index, evidence in enumerate(case.get("evidenceItems") or []):
                # A set, so an evidence conversation referenced more than once
                # contributes a single ground-truth doc id.
                ground_truth: set[str] = set()
                for evidence_conversation in evidence.get("conversations") or []:
                    raw_id = str(evidence_conversation.get("id") or "")
                    mapped = doc_id_by_conversation_id.get(raw_id)
                    # Abstention evidence references conversations that are
                    # intentionally absent from the haystack; skip those.
                    if mapped is not None:
                        ground_truth.add(mapped)

                all_queries.append(
                    {
                        "id": f"{group_id}-q{query_index}",
                        "query": _as_text(evidence.get("question")).strip(),
                        "category": label,
                        "group": group_id,
                        "ground_truth": sorted(ground_truth),
                        # A null or blank answer is recorded as None, never
                        # as the string "None".
                        "expected_answer": _as_text(evidence.get("answer")).strip() or None,
                        "metadata": {
                            "dataset_id": DATASET_ID,
                            "context_size": context_size,
                            "abstention": label == "abstention",
                            "domain": _as_text(evidence.get("category")),
                        },
                    }
                )

    queries_path = output_dir / "queries.json"
    queries_path.write_text(json.dumps(all_queries, indent=2), encoding="utf-8")

    sampling_path = output_dir / "sampling.json"
    sampling_path.write_text(
        json.dumps(
            {
                "seed": seed,
                "sample_per_stratum": sample_per_stratum,
                "context_sizes": sorted(context_sizes) if context_sizes else "all",
                "strata": sampling_manifest,
            },
            indent=2,
        ),
        encoding="utf-8",
    )

    return groups_dir, queries_path, doc_count, len(all_queries)
Loading
Loading