Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 35 additions & 3 deletions src/basic_memory/repository/postgres_search_repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from basic_memory.repository.search_repository_base import (
SearchRepositoryBase,
VectorChunkState,
relaxed_query_words,
)
from basic_memory.repository.metadata_filters import parse_metadata_filters
from basic_memory.repository.semantic_errors import SemanticDependenciesMissingError
Expand Down Expand Up @@ -176,6 +177,14 @@ def _prepare_search_term(self, term: str, is_prefix: bool = True) -> str:
# For non-Boolean queries, prepare single term
return self._prepare_single_term(term, is_prefix)

@staticmethod
def _relaxed_tsquery_text(search_text: Optional[str]) -> Optional[str]:
    """Build the OR-joined prefix tsquery used to retry a strict miss.

    Every word returned by ``relaxed_query_words`` becomes a ``word:*`` prefix
    term, and the terms are joined with ``|``.

    Returns:
        The relaxed tsquery text, or None when ``relaxed_query_words`` yields
        no words (in which case no relaxed retry should be attempted).
    """
    content_words = relaxed_query_words(search_text)
    if content_words:
        return " | ".join([f"{term}:*" for term in content_words])
    return None

def _prepare_boolean_query(self, query: str) -> str:
"""Convert Boolean query to tsquery format.

Expand Down Expand Up @@ -236,7 +245,12 @@ def _prepare_single_term(self, term: str, is_prefix: bool = True) -> str:

# Handle multi-word queries
if " " in cleaned_term:
words = [w for w in cleaned_term.split() if w.strip()]
# Strip sentence punctuation from word edges so question-form
# queries produce clean lexemes (parity with SQLite FTS5 prep).
# The tsquery tokenizer ignores this punctuation anyway; leaving it
# in only risks tsquery syntax errors. Interior characters are kept.
words = [w.strip("?!.,;") for w in cleaned_term.split()]
words = [w for w in words if w]
if not words:
# All characters were special chars, search won't match anything
# Return a safe search term that won't cause syntax errors
Expand All @@ -249,8 +263,11 @@ def _prepare_single_term(self, term: str, is_prefix: bool = True) -> str:
# Join with AND operator
return " & ".join(prepared_words)

# Single word
cleaned_term = cleaned_term.strip()
# Single word: strip edge punctuation; guard the now-empty case so a
# bare ":*"/"" never reaches tsquery.
cleaned_term = cleaned_term.strip().strip("?!.,;")
if not cleaned_term:
return "NOSPECIALCHARS:*"
if is_prefix:
return f"{cleaned_term}:*"
else:
Expand Down Expand Up @@ -908,6 +925,7 @@ async def search(
min_similarity: Optional[float] = None,
limit: int = 10,
offset: int = 0,
allow_relaxed: bool = False,
) -> List[SearchIndexRow]:
"""Search across all indexed content using PostgreSQL tsvector."""
# --- Dispatch vector / hybrid modes (shared logic) ---
Expand Down Expand Up @@ -982,6 +1000,20 @@ async def search(
async with db.scoped_session(self.session_maker) as session:
result = await session.execute(text(sql), params)
rows = result.fetchall()
# Trigger: multi-word natural-language query matched nothing
# under the default all-terms-AND tsquery semantics.
# Why: questions rarely have every word in one document;
# without relaxation the FTS half of hybrid search contributes
# zero candidates (parity with the SQLite path).
# Outcome: one retry with OR-joined prefix lexemes; ts_rank
# still ranks multi-term matches first.
relaxed = (
self._relaxed_tsquery_text(search_text) if allow_relaxed and not rows else None
)
if relaxed and params.get("text"):
params["text"] = relaxed
result = await session.execute(text(sql), params)
rows = result.fetchall()
except Exception as e:
if self._is_tsquery_syntax_error(e):
logger.warning(f"tsquery syntax error for search term: {search_text}, error: {e}")
Expand Down
35 changes: 35 additions & 0 deletions src/basic_memory/repository/search_repository_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,36 @@
OVERSIZED_ENTITY_VECTOR_SHARD_SIZE = 256
_SQLITE_MAX_PREPARE_WINDOW = 8

# Interrogative/function words contribute lexical noise when a strict
# full-text query is relaxed: "when OR did OR a" matches loud wrong documents
# that displace genuine results from the ranking window.
RELAXATION_STOPWORDS = frozenset(
    "a an and are as at be but by did do does for from had has have how i in is it of on "
    "or that the their they this to was we were what when where which who whom whose why "
    "will with you your".split()
)


def relaxed_query_words(search_text: Optional[str]) -> Optional[list[str]]:
    """Return the content-bearing words for OR-relaxing a strict full-text query.

    The text is split on whitespace and each token has the sentence
    punctuation ``?!.,;:`` stripped from its edges. A token is kept only when
    it is purely alphanumeric, is not in ``RELAXATION_STOPWORDS`` (compared
    case-insensitively), and has not already been seen (also compared
    case-insensitively). The first occurrence keeps its original casing and
    its position, so the result preserves query order.

    Args:
        search_text: The raw user query, or None.

    Returns:
        The surviving words in query order, or None when relaxation must not
        apply: empty input, a query containing a double quote (quoted phrase),
        a query containing an upper-case ``AND`` / ``OR`` / ``NOT`` token
        (explicit boolean intent is not second-guessed), or a query in which
        no content-bearing word survives the filtering.
    """
    if not search_text:
        return None
    stripped = search_text.strip()
    # Pad with spaces so an operator at the very start or end is detected too.
    if '"' in stripped or any(op in f" {stripped} " for op in (" AND ", " OR ", " NOT ")):
        return None

    words: list[str] = []
    seen: set[str] = set()
    for token in stripped.split():
        word = token.strip("?!.,;:")
        key = word.lower()
        # isalnum() is False for "", so fully-punctuation tokens drop out here.
        if not word.isalnum() or key in RELAXATION_STOPWORDS:
            continue
        # A repeated word would only add a redundant OR branch to the query.
        if key in seen:
            continue
        seen.add(key)
        words.append(word)
    return words or None


# Entity, observation, and relation rows in search_index carry ids from independent
# auto-increment sequences, so a bare id is ambiguous across row types. Every map in
# the vector/hybrid retrieval path must key rows by (type, id) to avoid collisions.
Expand Down Expand Up @@ -229,6 +259,7 @@ async def search(
min_similarity: Optional[float] = None,
limit: int = 10,
offset: int = 0,
allow_relaxed: bool = False,
) -> List[SearchIndexRow]:
"""Search across all indexed content.

Expand Down Expand Up @@ -2174,6 +2205,9 @@ async def _search_hybrid(
query_start = time.perf_counter()
candidate_limit = max(self._semantic_vector_k, (limit + offset) * 10)
fts_start = time.perf_counter()
# allow_relaxed: question-form queries rarely AND-match, and a dead FTS
# branch silently degrades hybrid to vector-only ranking. Fusion plus
# bm25 keep relaxed lexical candidates from dominating precision.
fts_results = await self.search(
search_text=search_text,
permalink=permalink,
Expand All @@ -2187,6 +2221,7 @@ async def _search_hybrid(
retrieval_mode=SearchRetrievalMode.FTS,
limit=candidate_limit,
offset=0,
allow_relaxed=True,

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Gate relaxed hybrid FTS with existing eligibility

When a HYBRID query has a strict FTS miss, this now enables OR-relaxation for every query shape, including short titles and numeric identifiers such as `SPEC 16` or `root note 1`. The service-level relaxed FTS path explicitly rejects those cases in `SearchService._is_relaxed_fts_fallback_eligible` because OR-relaxing them over-broadens results; in hybrid, the relaxed FTS-only rows are then normalized up to 1.0 and can outrank the vector result the user actually needed. Please apply the same eligibility constraints before opting the hybrid FTS branch into relaxation.

Useful? React with 👍 / 👎.

)
fts_ms = (time.perf_counter() - fts_start) * 1000
vector_start = time.perf_counter()
Expand Down
50 changes: 48 additions & 2 deletions src/basic_memory/repository/sqlite_search_repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,10 @@
from basic_memory.repository.embedding_provider import EmbeddingProvider
from basic_memory.repository.embedding_provider_factory import create_embedding_provider
from basic_memory.repository.search_index_row import SearchIndexRow
from basic_memory.repository.search_repository_base import SearchRepositoryBase
from basic_memory.repository.search_repository_base import (
SearchRepositoryBase,
relaxed_query_words,
)
from basic_memory.repository.metadata_filters import parse_metadata_filters, build_sqlite_json_path
from basic_memory.repository.semantic_errors import SemanticDependenciesMissingError
from basic_memory.schemas.search import SearchItemType, SearchRetrievalMode
Expand Down Expand Up @@ -255,6 +258,19 @@ def _prepare_single_term(self, term: str, is_prefix: bool = True) -> str:
if "*" in term and all(c.isalnum() or c in "*_-" for c in term):
return term

# Natural-language queries arrive with sentence punctuation that FTS5
# treats as syntax ("When did Melanie paint a sunrise?"). The tokenizer
# ignores this punctuation in the INDEX, so stripping it from word
# edges loses nothing — but leaving it forces the whole question into
# an exact-phrase match that returns zero rows, silently disabling the
# FTS half of hybrid search. Interior characters (hyphens, slashes —
# permalinks and paths) are untouched.
if " " in term:
words = [word.strip("?!.,;:") for word in term.split()]
term = " ".join(word for word in words if word)
if not term:
return ""

# Characters that can cause FTS5 syntax errors when used as operators
# We're more conservative here - only quote when we detect problematic patterns
problematic_chars = [
Expand Down Expand Up @@ -351,6 +367,14 @@ def _prepare_search_term(self, term: str, is_prefix: bool = True) -> str:
# For non-Boolean queries, use the single term preparation logic
return self._prepare_single_term(term, is_prefix)

@staticmethod
def _relaxed_fts_text(search_text: Optional[str]) -> Optional[str]:
    """Build the OR-joined prefix FTS5 expression used to retry a strict miss.

    Every word returned by ``relaxed_query_words`` becomes a ``word*`` prefix
    term, and the terms are joined with ``OR``.

    Returns:
        The relaxed FTS5 match text, or None when ``relaxed_query_words``
        yields no words (in which case no relaxed retry should be attempted).
    """
    content_words = relaxed_query_words(search_text)
    if content_words:
        return " OR ".join([f"{term}*" for term in content_words])
    return None

# ------------------------------------------------------------------
# sqlite-vec extension loading (SQLite-specific)
# ------------------------------------------------------------------
Expand Down Expand Up @@ -953,8 +977,15 @@ async def search(
min_similarity: Optional[float] = None,
limit: int = 10,
offset: int = 0,
allow_relaxed: bool = False,
) -> List[SearchIndexRow]:
"""Search across all indexed content using SQLite FTS5."""
"""Search across all indexed content using SQLite FTS5.

``allow_relaxed=True`` retries a zero-result strict multi-word query
with OR-joined content terms. Only the hybrid path opts in: its FTS
branch otherwise contributes nothing for question-form queries.
Service-level FTS searches keep their own conservative fallback.
"""
# --- Dispatch vector / hybrid modes (shared logic) ---
dispatched = await self._dispatch_retrieval_mode(
search_text=search_text,
Expand Down Expand Up @@ -1021,6 +1052,21 @@ async def search(
async with db.scoped_session(self.session_maker) as session:
result = await session.execute(text(sql), params)
rows = result.fetchall()
# Trigger: multi-word natural-language query matched nothing
# under the default all-terms-AND semantics.
# Why: questions ("when did X do Y") rarely have every word in
# one document; without relaxation the FTS half of hybrid
# search contributes zero candidates and ranking degrades to
# vector-only.
# Outcome: one retry with OR-joined prefix terms; bm25 still
# ranks multi-term matches first.
relaxed = (
self._relaxed_fts_text(search_text) if allow_relaxed and not rows else None
)
if relaxed and params.get("text"):
params["text"] = relaxed
result = await session.execute(text(sql), params)
rows = result.fetchall()
except Exception as e:
# Handle FTS5 syntax errors and provide user-friendly feedback
if self._is_fts5_syntax_error(e): # pragma: no cover
Expand Down
1 change: 1 addition & 0 deletions tests/repository/test_hybrid_fusion.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ async def search(
min_similarity: Optional[float] = None,
limit: int = 10,
offset: int = 0,
allow_relaxed: bool = False,
) -> list[SearchIndexRow]:
return [] # pragma: no cover

Expand Down
56 changes: 56 additions & 0 deletions tests/repository/test_postgres_search_repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -1001,3 +1001,59 @@ async def test_postgres_search_categories_exact_match(session_maker, test_projec
# Multiple categories union.
multi = await repo.search(categories=["requirement", "decision"])
assert {r.id for r in multi} == {70101, 70102}


@pytest.mark.asyncio
async def test_postgres_question_punctuation_and_relaxation(session_maker, test_project):
    """Question-form queries yield clean lexemes and a usable relaxed query.

    Parity with SQLite: sentence punctuation used to reach the tsquery terms,
    and a strict all-AND miss had no relaxed retry, which silently disabled
    the FTS half of hybrid search for natural-language questions.
    """
    question = "When did Melanie paint a sunrise?"
    repo = PostgresSearchRepository(session_maker, project_id=test_project.id)

    # Edge punctuation is removed before the lexemes are formatted.
    prepared = repo._prepare_search_term(question)
    assert "?" not in prepared
    assert "sunrise:*" in prepared

    # Relaxation discards stopwords and OR-joins the remaining content terms.
    assert repo._relaxed_tsquery_text(question) == "Melanie:* | paint:* | sunrise:*"

    # Explicit boolean, quoted, and missing queries are never relaxed.
    for untouched in ("alpha AND beta", '"exact phrase"', None):
        assert repo._relaxed_tsquery_text(untouched) is None


@pytest.mark.asyncio
async def test_postgres_multiword_query_relaxes_on_strict_miss(session_maker, test_project):
    """A strict miss stays empty by default and is rescued when relaxation is on."""
    repo = PostgresSearchRepository(session_maker, project_id=test_project.id)
    timestamp = datetime.now(timezone.utc)
    indexed_row = SearchIndexRow(
        project_id=test_project.id,
        id=77,
        title="Trip plans",
        content_stems="melanie painted a sunrise over the lake last year",
        content_snippet="Melanie painted a sunrise over the lake last year.",
        permalink="docs/trip-plans",
        file_path="docs/trip-plans.md",
        type="entity",
        metadata={"note_type": "note"},
        created_at=timestamp,
        updated_at=timestamp,
    )
    await repo.index_item(indexed_row)

    # "hiking" never appears in the document, so the strict all-terms-AND query
    # misses even after Postgres drops stopwords. Without such a word,
    # to_tsquery('english', ...) would strip "when/did/a" and simply match.
    question = "Did Melanie go hiking at sunrise?"
    strict_hits = await repo.search(search_text=question)
    assert strict_hits == []

    # The hybrid FTS branch opts in; OR-relaxation surfaces the partial match.
    relaxed_hits = await repo.search(search_text=question, allow_relaxed=True)
    assert any(hit.id == 77 for hit in relaxed_hits)
77 changes: 77 additions & 0 deletions tests/repository/test_search_repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -1124,3 +1124,80 @@ async def test_search_categories_exact_match(search_repository, search_entity):
# Multiple categories union: both observations come back.
multi = await search_repository.search(categories=["requirement", "decision"])
assert {r.id for r in multi} == {70001, 70002}


@pytest.mark.asyncio
async def test_question_punctuation_does_not_phrase_quote(search_repository):
    """Sentence punctuation must not force exact-phrase matching (#hybrid-fts).

    'When did Melanie paint a sunrise?' used to be turned into the FTS5 phrase
    '"When did Melanie paint a sunrise?"*', which matches zero rows in any
    corpus and so silently disabled the FTS half of hybrid search for
    question-form queries.
    """
    prepared = search_repository._prepare_single_term("When did Melanie paint a sunrise?")
    assert '"' not in prepared

    # Prefix syntax differs by backend: FTS5 uses '*', tsquery uses ':*'.
    expected_term = "sunrise:*" if is_postgres_backend(search_repository) else "sunrise*"
    assert expected_term in prepared


@pytest.mark.asyncio
async def test_relaxed_query_drops_stopwords(search_repository):
    """Relaxation keeps only content-bearing terms, in each backend's syntax."""
    question = "When did Melanie paint a sunrise?"
    if not is_postgres_backend(search_repository):
        relaxed = search_repository._relaxed_fts_text(question)
        assert relaxed == "Melanie* OR paint* OR sunrise*"
        return

    relaxed = search_repository._relaxed_tsquery_text(question)
    assert relaxed == "Melanie:* | paint:* | sunrise:*"


@pytest.mark.asyncio
async def test_relaxed_query_respects_user_intent(search_repository):
    """Explicit boolean and quoted queries are not second-guessed (both backends)."""
    if is_postgres_backend(search_repository):
        relaxer, single_word_expr = search_repository._relaxed_tsquery_text, "single:*"
    else:
        relaxer, single_word_expr = search_repository._relaxed_fts_text, "single*"

    assert relaxer("alpha AND beta") is None
    assert relaxer('"exact phrase"') is None
    # A lone content word still relaxes to a single prefix term.
    assert relaxer("single") == single_word_expr
    assert relaxer(None) is None


@pytest.mark.asyncio
async def test_multiword_query_relaxes_to_or_when_strict_misses(search_repository, search_entity):
    """A question that shares only SOME words with a doc still surfaces it."""
    from basic_memory.repository.search_index_row import SearchIndexRow
    from basic_memory.schemas.search import SearchItemType

    indexed_row = SearchIndexRow(
        project_id=search_repository.project_id,
        id=search_entity.id,
        type=SearchItemType.ENTITY.value,
        title="Trip plans",
        content_snippet="Melanie painted a sunrise over the lake last year.",
        content_stems="melanie painted a sunrise over the lake last year",
        permalink=search_entity.permalink,
        file_path=search_entity.file_path,
        entity_id=search_entity.id,
        metadata={"note_type": search_entity.note_type},
        created_at=search_entity.created_at,
        updated_at=search_entity.updated_at,
    )
    await search_repository.index_item(indexed_row)

    question = "Did Melanie go hiking at sunrise?"

    # "hiking" is absent from the doc, so strict all-terms-AND misses on both
    # backends (Postgres's stopword stripping can't rescue it either).
    strict_hits = await search_repository.search(search_text=question)
    assert strict_hits == []

    # The hybrid FTS branch opts in; OR-relaxation surfaces the partial match.
    relaxed_hits = await search_repository.search(search_text=question, allow_relaxed=True)
    assert any(hit.entity_id == search_entity.id for hit in relaxed_hits)
1 change: 1 addition & 0 deletions tests/repository/test_semantic_search_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@ async def search(
min_similarity: float | None = None,
limit: int = 10,
offset: int = 0,
allow_relaxed: bool = False,
) -> list[SearchIndexRow]:
return []

Expand Down
1 change: 1 addition & 0 deletions tests/repository/test_vector_pagination.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ async def search(
min_similarity: float | None = None,
limit: int = 10,
offset: int = 0,
allow_relaxed: bool = False,
) -> list[SearchIndexRow]:
return [] # pragma: no cover

Expand Down
Loading
Loading