diff --git a/src/basic_memory/repository/postgres_search_repository.py b/src/basic_memory/repository/postgres_search_repository.py index 85e01a10..92d201ff 100644 --- a/src/basic_memory/repository/postgres_search_repository.py +++ b/src/basic_memory/repository/postgres_search_repository.py @@ -18,6 +18,7 @@ from basic_memory.repository.search_repository_base import ( SearchRepositoryBase, VectorChunkState, + relaxed_query_words, ) from basic_memory.repository.metadata_filters import parse_metadata_filters from basic_memory.repository.semantic_errors import SemanticDependenciesMissingError @@ -176,6 +177,14 @@ def _prepare_search_term(self, term: str, is_prefix: bool = True) -> str: # For non-Boolean queries, prepare single term return self._prepare_single_term(term, is_prefix) + @staticmethod + def _relaxed_tsquery_text(search_text: Optional[str]) -> Optional[str]: + """OR-relaxed tsquery expression for a failed strict query, or None.""" + words = relaxed_query_words(search_text) + if not words: + return None + return " | ".join(f"{word}:*" for word in words) + def _prepare_boolean_query(self, query: str) -> str: """Convert Boolean query to tsquery format. @@ -236,7 +245,12 @@ def _prepare_single_term(self, term: str, is_prefix: bool = True) -> str: # Handle multi-word queries if " " in cleaned_term: - words = [w for w in cleaned_term.split() if w.strip()] + # Strip sentence punctuation from word edges so question-form + # queries produce clean lexemes (parity with SQLite FTS5 prep). + # The tsquery tokenizer ignores this punctuation anyway; leaving it + # in only risks tsquery syntax errors. Interior characters are kept. 
+ words = [w.strip("?!.,;") for w in cleaned_term.split()] + words = [w for w in words if w] if not words: # All characters were special chars, search won't match anything # Return a safe search term that won't cause syntax errors @@ -249,8 +263,11 @@ def _prepare_single_term(self, term: str, is_prefix: bool = True) -> str: # Join with AND operator return " & ".join(prepared_words) - # Single word - cleaned_term = cleaned_term.strip() + # Single word: strip edge punctuation; guard the now-empty case so a + # bare ":*"/"" never reaches tsquery. + cleaned_term = cleaned_term.strip().strip("?!.,;") + if not cleaned_term: + return "NOSPECIALCHARS:*" if is_prefix: return f"{cleaned_term}:*" else: @@ -908,6 +925,7 @@ async def search( min_similarity: Optional[float] = None, limit: int = 10, offset: int = 0, + allow_relaxed: bool = False, ) -> List[SearchIndexRow]: """Search across all indexed content using PostgreSQL tsvector.""" # --- Dispatch vector / hybrid modes (shared logic) --- @@ -982,6 +1000,20 @@ async def search( async with db.scoped_session(self.session_maker) as session: result = await session.execute(text(sql), params) rows = result.fetchall() + # Trigger: the caller opted in (allow_relaxed) and the strict query + # matched nothing under the default all-terms-AND tsquery semantics. + # Why: questions rarely have every word in one document; + # without relaxation the FTS half of hybrid search contributes + # zero candidates (parity with the SQLite path). + # Outcome: one retry with OR-joined prefix lexemes; ts_rank + # still ranks multi-term matches first. 
+ relaxed = ( + self._relaxed_tsquery_text(search_text) if allow_relaxed and not rows else None + ) + if relaxed and params.get("text"): + params["text"] = relaxed + result = await session.execute(text(sql), params) + rows = result.fetchall() except Exception as e: if self._is_tsquery_syntax_error(e): logger.warning(f"tsquery syntax error for search term: {search_text}, error: {e}") diff --git a/src/basic_memory/repository/search_repository_base.py b/src/basic_memory/repository/search_repository_base.py index 2e4e687a..52e2d0a8 100644 --- a/src/basic_memory/repository/search_repository_base.py +++ b/src/basic_memory/repository/search_repository_base.py @@ -40,6 +40,36 @@ OVERSIZED_ENTITY_VECTOR_SHARD_SIZE = 256 _SQLITE_MAX_PREPARE_WINDOW = 8 +# Interrogative/function words contribute lexical noise when a strict +# full-text query is relaxed: "when OR did OR a" matches loud wrong documents +# that displace genuine results from the ranking window. +RELAXATION_STOPWORDS = frozenset( + "a an and are as at be but by did do does for from had has have how i in is it of on " + "or that the their they this to was we were what when where which who whom whose why " + "will with you your".split() +) + + +def relaxed_query_words(search_text: Optional[str]) -> Optional[list[str]]: + """Content-bearing words for OR-relaxing a strict full-text query. + + Returns None when relaxation must not apply: empty input, quoted phrases, + explicit boolean queries (user intent is not second-guessed), or no content words. 
+ """ + if not search_text: + return None + stripped = search_text.strip() + if '"' in stripped or any(op in f" {stripped} " for op in (" AND ", " OR ", " NOT ")): + return None + words = [word.strip("?!.,;:") for word in stripped.split()] + words = [ + word + for word in words + if word and word.isalnum() and word.lower() not in RELAXATION_STOPWORDS + ] + return words or None + + +# Entity, observation, and relation rows in search_index carry ids from independent # auto-increment sequences, so a bare id is ambiguous across row types. Every map in # the vector/hybrid retrieval path must key rows by (type, id) to avoid collisions. @@ -229,6 +259,7 @@ async def search( min_similarity: Optional[float] = None, limit: int = 10, offset: int = 0, + allow_relaxed: bool = False, ) -> List[SearchIndexRow]: """Search across all indexed content. @@ -2174,6 +2205,9 @@ async def _search_hybrid( query_start = time.perf_counter() candidate_limit = max(self._semantic_vector_k, (limit + offset) * 10) fts_start = time.perf_counter() + # allow_relaxed: question-form queries rarely AND-match, and a dead FTS + # branch silently degrades hybrid to vector-only ranking. Fusion plus each + # backend's rank (bm25 / ts_rank) keeps relaxed candidates from dominating. 
fts_results = await self.search( search_text=search_text, permalink=permalink, @@ -2187,6 +2221,7 @@ async def _search_hybrid( retrieval_mode=SearchRetrievalMode.FTS, limit=candidate_limit, offset=0, + allow_relaxed=True, ) fts_ms = (time.perf_counter() - fts_start) * 1000 vector_start = time.perf_counter() diff --git a/src/basic_memory/repository/sqlite_search_repository.py b/src/basic_memory/repository/sqlite_search_repository.py index c467a9a2..8b82a79b 100644 --- a/src/basic_memory/repository/sqlite_search_repository.py +++ b/src/basic_memory/repository/sqlite_search_repository.py @@ -23,7 +23,10 @@ from basic_memory.repository.embedding_provider import EmbeddingProvider from basic_memory.repository.embedding_provider_factory import create_embedding_provider from basic_memory.repository.search_index_row import SearchIndexRow -from basic_memory.repository.search_repository_base import SearchRepositoryBase +from basic_memory.repository.search_repository_base import ( + SearchRepositoryBase, + relaxed_query_words, +) from basic_memory.repository.metadata_filters import parse_metadata_filters, build_sqlite_json_path from basic_memory.repository.semantic_errors import SemanticDependenciesMissingError from basic_memory.schemas.search import SearchItemType, SearchRetrievalMode @@ -255,6 +258,19 @@ def _prepare_single_term(self, term: str, is_prefix: bool = True) -> str: if "*" in term and all(c.isalnum() or c in "*_-" for c in term): return term + # Natural-language queries arrive with sentence punctuation that FTS5 + # treats as syntax ("When did Melanie paint a sunrise?"). The tokenizer + # ignores this punctuation in the INDEX, so stripping it from word + # edges loses nothing — but leaving it forces the whole question into + # an exact-phrase match that returns zero rows, silently disabling the + # FTS half of hybrid search. Interior characters (hyphens, slashes — + # permalinks and paths) are untouched. 
+ if " " in term: + words = [word.strip("?!.,;:") for word in term.split()] + term = " ".join(word for word in words if word) + if not term: + return "" + # Characters that can cause FTS5 syntax errors when used as operators # We're more conservative here - only quote when we detect problematic patterns problematic_chars = [ @@ -351,6 +367,14 @@ def _prepare_search_term(self, term: str, is_prefix: bool = True) -> str: # For non-Boolean queries, use the single term preparation logic return self._prepare_single_term(term, is_prefix) + @staticmethod + def _relaxed_fts_text(search_text: Optional[str]) -> Optional[str]: + """OR-relaxed FTS5 expression for a failed strict query, or None.""" + words = relaxed_query_words(search_text) + if not words: + return None + return " OR ".join(f"{word}*" for word in words) + # ------------------------------------------------------------------ # sqlite-vec extension loading (SQLite-specific) # ------------------------------------------------------------------ @@ -953,8 +977,15 @@ async def search( min_similarity: Optional[float] = None, limit: int = 10, offset: int = 0, + allow_relaxed: bool = False, ) -> List[SearchIndexRow]: - """Search across all indexed content using SQLite FTS5.""" + """Search across all indexed content using SQLite FTS5. + + ``allow_relaxed=True`` retries a zero-result strict query (typically a + multi-word question) with OR-joined content terms. Only the hybrid path + opts in: its FTS branch otherwise contributes nothing for question-form + queries. Service-level FTS searches keep their own conservative fallback. 
+ """ # --- Dispatch vector / hybrid modes (shared logic) --- dispatched = await self._dispatch_retrieval_mode( search_text=search_text, @@ -1021,6 +1052,21 @@ async def search( async with db.scoped_session(self.session_maker) as session: result = await session.execute(text(sql), params) rows = result.fetchall() + # Trigger: the caller opted in (allow_relaxed) and the strict query + # matched nothing under the default all-terms-AND semantics. 
+ # Why: questions ("when did X do Y") rarely have every word in + # one document; without relaxation the FTS half of hybrid + # search contributes zero candidates and ranking degrades to + # vector-only. + # Outcome: one retry with OR-joined prefix terms; bm25 still + # ranks multi-term matches first. + relaxed = ( + self._relaxed_fts_text(search_text) if allow_relaxed and not rows else None + ) + if relaxed and params.get("text"): + params["text"] = relaxed + result = await session.execute(text(sql), params) + rows = result.fetchall() except Exception as e: # Handle FTS5 syntax errors and provide user-friendly feedback if self._is_fts5_syntax_error(e): # pragma: no cover diff --git a/tests/repository/test_hybrid_fusion.py b/tests/repository/test_hybrid_fusion.py index 0a4a2c29..e61d02e5 100644 --- a/tests/repository/test_hybrid_fusion.py +++ b/tests/repository/test_hybrid_fusion.py @@ -77,6 +77,7 @@ async def search( min_similarity: Optional[float] = None, limit: int = 10, offset: int = 0, + allow_relaxed: bool = False, ) -> list[SearchIndexRow]: return [] # pragma: no cover diff --git a/tests/repository/test_postgres_search_repository.py b/tests/repository/test_postgres_search_repository.py index 7c022014..f3c497e6 100644 --- a/tests/repository/test_postgres_search_repository.py +++ b/tests/repository/test_postgres_search_repository.py @@ -1001,3 +1001,59 @@ async def test_postgres_search_categories_exact_match(session_maker, test_projec # Multiple categories union. multi = await repo.search(categories=["requirement", "decision"]) assert {r.id for r in multi} == {70101, 70102} + + +@pytest.mark.asyncio +async def test_postgres_question_punctuation_and_relaxation(session_maker, test_project): + """Question-form queries must produce clean lexemes and a usable relaxation. 
+ + Parity with SQLite: sentence punctuation previously reached tsquery terms, + and a strict all-AND miss had no relaxed retry, silently disabling the FTS + half of hybrid search for natural-language questions. + """ + repo = PostgresSearchRepository(session_maker, project_id=test_project.id) + + # Edge punctuation stripped before lexeme formatting. + prepared = repo._prepare_search_term("When did Melanie paint a sunrise?") + assert "?" not in prepared + assert "sunrise:*" in prepared + + # Relaxation drops stopwords and OR-joins content terms. + relaxed = repo._relaxed_tsquery_text("When did Melanie paint a sunrise?") + assert relaxed == "Melanie:* | paint:* | sunrise:*" + + # User intent is not second-guessed. + assert repo._relaxed_tsquery_text("alpha AND beta") is None + assert repo._relaxed_tsquery_text('"exact phrase"') is None + assert repo._relaxed_tsquery_text(None) is None + + +@pytest.mark.asyncio +async def test_postgres_multiword_query_relaxes_on_strict_miss(session_maker, test_project): + repo = PostgresSearchRepository(session_maker, project_id=test_project.id) + now = datetime.now(timezone.utc) + await repo.index_item( + SearchIndexRow( + project_id=test_project.id, + id=77, + title="Trip plans", + content_stems="melanie painted a sunrise over the lake last year", + content_snippet="Melanie painted a sunrise over the lake last year.", + permalink="docs/trip-plans", + file_path="docs/trip-plans.md", + type="entity", + metadata={"note_type": "note"}, + created_at=now, + updated_at=now, + ) + ) + + # A content word absent from the doc ("hiking") makes the strict + # all-terms-AND query miss even after Postgres drops stopwords — without + # it, to_tsquery('english', ...) already strips "when/did/a" and matches. + strict = await repo.search(search_text="Did Melanie go hiking at sunrise?") + assert strict == [] + + # The hybrid FTS branch opts in; OR-relaxation surfaces the partial match. 
+ results = await repo.search(search_text="Did Melanie go hiking at sunrise?", allow_relaxed=True) + assert any(r.id == 77 for r in results) diff --git a/tests/repository/test_search_repository.py b/tests/repository/test_search_repository.py index 0fb6d630..0068b3f1 100644 --- a/tests/repository/test_search_repository.py +++ b/tests/repository/test_search_repository.py @@ -1124,3 +1124,80 @@ async def test_search_categories_exact_match(search_repository, search_entity): # Multiple categories union: both observations come back. multi = await search_repository.search(categories=["requirement", "decision"]) assert {r.id for r in multi} == {70001, 70002} + + +@pytest.mark.asyncio +async def test_question_punctuation_does_not_phrase_quote(search_repository): + """Sentence punctuation must not force exact-phrase matching (#hybrid-fts). + + 'When did Melanie paint a sunrise?' previously became the FTS5 phrase + '"When did Melanie paint a sunrise?"*' — zero rows for any corpus — which + silently disabled the FTS half of hybrid search for question queries. + """ + prepared = search_repository._prepare_single_term("When did Melanie paint a sunrise?") + assert '"' not in prepared + # Prefix syntax differs by backend: FTS5 uses '*', tsquery uses ':*'. 
+ if is_postgres_backend(search_repository): + assert "sunrise:*" in prepared + else: + assert "sunrise*" in prepared + + +@pytest.mark.asyncio +async def test_relaxed_query_drops_stopwords(search_repository): + """Relaxation keys on content-bearing terms in each backend's syntax.""" + if is_postgres_backend(search_repository): + relaxed = search_repository._relaxed_tsquery_text("When did Melanie paint a sunrise?") + assert relaxed == "Melanie:* | paint:* | sunrise:*" + else: + relaxed = search_repository._relaxed_fts_text("When did Melanie paint a sunrise?") + assert relaxed == "Melanie* OR paint* OR sunrise*" + + +@pytest.mark.asyncio +async def test_relaxed_query_respects_user_intent(search_repository): + # Explicit boolean and quoted queries are not second-guessed (both backends). + if is_postgres_backend(search_repository): + relaxer = search_repository._relaxed_tsquery_text + single = "single:*" + else: + relaxer = search_repository._relaxed_fts_text + single = "single*" + assert relaxer("alpha AND beta") is None + assert relaxer('"exact phrase"') is None + assert relaxer("single") == single + assert relaxer(None) is None + + +@pytest.mark.asyncio +async def test_multiword_query_relaxes_to_or_when_strict_misses(search_repository, search_entity): + """A question sharing only SOME words with a doc still surfaces it.""" + from basic_memory.repository.search_index_row import SearchIndexRow + from basic_memory.schemas.search import SearchItemType + + row = SearchIndexRow( + project_id=search_repository.project_id, + id=search_entity.id, + type=SearchItemType.ENTITY.value, + title="Trip plans", + content_snippet="Melanie painted a sunrise over the lake last year.", + content_stems="melanie painted a sunrise over the lake last year", + permalink=search_entity.permalink, + file_path=search_entity.file_path, + entity_id=search_entity.id, + metadata={"note_type": search_entity.note_type}, + created_at=search_entity.created_at, + updated_at=search_entity.updated_at, + ) 
+ await search_repository.index_item(row) + + # "hiking" is absent from the doc, so strict all-terms-AND misses on both + # backends (Postgres's stopword stripping can't rescue it either). + strict = await search_repository.search(search_text="Did Melanie go hiking at sunrise?") + assert strict == [] + + # The hybrid FTS branch opts in; OR-relaxation surfaces the partial match. + results = await search_repository.search( + search_text="Did Melanie go hiking at sunrise?", allow_relaxed=True + ) + assert any(r.entity_id == search_entity.id for r in results) diff --git a/tests/repository/test_semantic_search_base.py b/tests/repository/test_semantic_search_base.py index b75f8871..373f2ea4 100644 --- a/tests/repository/test_semantic_search_base.py +++ b/tests/repository/test_semantic_search_base.py @@ -89,6 +89,7 @@ async def search( min_similarity: float | None = None, limit: int = 10, offset: int = 0, + allow_relaxed: bool = False, ) -> list[SearchIndexRow]: return [] diff --git a/tests/repository/test_vector_pagination.py b/tests/repository/test_vector_pagination.py index 09a001e3..f08a98ba 100644 --- a/tests/repository/test_vector_pagination.py +++ b/tests/repository/test_vector_pagination.py @@ -62,6 +62,7 @@ async def search( min_similarity: float | None = None, limit: int = 10, offset: int = 0, + allow_relaxed: bool = False, ) -> list[SearchIndexRow]: return [] # pragma: no cover diff --git a/tests/repository/test_vector_threshold.py b/tests/repository/test_vector_threshold.py index 34a66d7d..91598518 100644 --- a/tests/repository/test_vector_threshold.py +++ b/tests/repository/test_vector_threshold.py @@ -66,6 +66,7 @@ async def search( min_similarity: Optional[float] = None, limit: int = 10, offset: int = 0, + allow_relaxed: bool = False, ) -> list[SearchIndexRow]: return [] # pragma: no cover