From 2462844beb4be486659d342451e6e55c5810c499 Mon Sep 17 00:00:00 2001 From: Drew Cain Date: Fri, 12 Jun 2026 21:56:57 -0500 Subject: [PATCH 1/2] fix(core): repair FTS half of hybrid search for natural-language queries MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Hybrid search was silently running vector-only on natural-language queries — the FTS branch contributed zero candidates. Two causes in the SQLite (and parallel Postgres) FTS query preparation: 1. Sentence punctuation forced phrase matching. A question like "When did Melanie paint a sunrise?" reached FTS5 as the exact phrase '"When did Melanie paint a sunrise?"*', which matches no document. The FTS5 tokenizer ignores this punctuation in the index, so stripping it from word edges loses nothing — but leaving it disabled the entire FTS contribution. _prepare_single_term now strips ?!.,;: from word edges of multi-word queries (interior characters — hyphens, slashes in permalinks/paths — untouched). 2. No relaxation when strict all-terms-AND matched nothing. Questions rarely have every word in one document, so even after (1) the strict AND returned zero rows. The hybrid path now retries once with an OR-joined, stopword-filtered, content-term query when the strict query is empty. bm25/ts_rank still rank multi-term matches first, and fusion with the vector branch keeps relaxed lexical candidates from dominating precision. The relaxation is gated behind a new allow_relaxed=False parameter on SearchRepositoryBase.search; only _search_hybrid opts in. Strict FTS behavior (search_type=text, title, permalink, link resolution) is unchanged — the service layer keeps its own conservative fallback. No config flag, default-safe. Discovered via the benchmark harness: two different fusion algorithms produced byte-identical rankings across 1,986 queries (impossible with two live sources), and instrumentation confirmed fts=0 on 40/40 sampled LoCoMo queries. Benchmark impact (corrected LoCoMo, 1,986 queries, same index, retrieval metrics — every category improves, no regression): recall@5 0.745 -> 0.823 (+7.9) MRR 0.618 -> 0.718 (+10.0) headline r5 0.734 -> 0.801, MRR 0.621 -> 0.706 Largest gains on open_domain (+0.10 r5) and adversarial (+0.12 r5); smallest on temporal (+0.003 r5 / +0.02 MRR). Tests: punctuation no longer phrase-quotes; relaxation builds the expected OR query and respects boolean/quoted/short-query intent; the hybrid opt-in surfaces a partial-overlap document while the default strict path still returns empty. Parallel coverage for Postgres. Full SQLite unit suite green (2968 passed); ty + ruff clean. Signed-off-by: Drew Cain --- .../repository/postgres_search_repository.py | 32 ++++++++++ .../repository/search_repository_base.py | 35 +++++++++++ .../repository/sqlite_search_repository.py | 50 ++++++++++++++- tests/repository/test_hybrid_fusion.py | 1 + .../test_postgres_search_repository.py | 54 ++++++++++++++++ tests/repository/test_search_repository.py | 62 +++++++++++++++++++ tests/repository/test_semantic_search_base.py | 1 + tests/repository/test_vector_pagination.py | 1 + tests/repository/test_vector_threshold.py | 1 + 9 files changed, 235 insertions(+), 2 deletions(-) diff --git a/src/basic_memory/repository/postgres_search_repository.py b/src/basic_memory/repository/postgres_search_repository.py index 85e01a10..1e2c60b6 100644 --- a/src/basic_memory/repository/postgres_search_repository.py +++ b/src/basic_memory/repository/postgres_search_repository.py @@ -18,6 +18,7 @@ from basic_memory.repository.search_repository_base import ( SearchRepositoryBase, VectorChunkState, + relaxed_query_words, ) from basic_memory.repository.metadata_filters import parse_metadata_filters from basic_memory.repository.semantic_errors import SemanticDependenciesMissingError @@ -176,6 +177,14 @@ def _prepare_search_term(self, term: str, is_prefix: bool = True) -> str: # For non-Boolean queries, prepare single term return self._prepare_single_term(term, is_prefix) + @staticmethod + def _relaxed_tsquery_text(search_text: Optional[str]) -> Optional[str]: + """OR-relaxed tsquery expression for a failed strict query, or None.""" + words = relaxed_query_words(search_text) + if not words: + return None + return " | ".join(f"{word}:*" for word in words) + def _prepare_boolean_query(self, query: str) -> str: """Convert Boolean query to tsquery format. @@ -234,6 +243,14 @@ def _prepare_single_term(self, term: str, is_prefix: bool = True) -> str: for char in special_chars: cleaned_term = cleaned_term.replace(char, " ") + # Sentence punctuation carries no lexical signal in tsquery either; + # strip it from word edges so question-form queries produce clean + # lexemes (parity with the SQLite FTS5 preparation). + if " " in cleaned_term: + cleaned_term = " ".join( + word.strip("?!.,;") for word in cleaned_term.split() if word.strip("?!.,;") + ) + # Handle multi-word queries if " " in cleaned_term: words = [w for w in cleaned_term.split() if w.strip()] @@ -908,6 +925,7 @@ async def search( min_similarity: Optional[float] = None, limit: int = 10, offset: int = 0, + allow_relaxed: bool = False, ) -> List[SearchIndexRow]: """Search across all indexed content using PostgreSQL tsvector.""" # --- Dispatch vector / hybrid modes (shared logic) --- @@ -982,6 +1000,20 @@ async def search( async with db.scoped_session(self.session_maker) as session: result = await session.execute(text(sql), params) rows = result.fetchall() + # Trigger: multi-word natural-language query matched nothing + # under the default all-terms-AND tsquery semantics. + # Why: questions rarely have every word in one document; + # without relaxation the FTS half of hybrid search contributes + # zero candidates (parity with the SQLite path). + # Outcome: one retry with OR-joined prefix lexemes; ts_rank + # still ranks multi-term matches first. + relaxed = ( + self._relaxed_tsquery_text(search_text) if allow_relaxed and not rows else None + ) + if relaxed and params.get("text"): + params["text"] = relaxed + result = await session.execute(text(sql), params) + rows = result.fetchall() except Exception as e: if self._is_tsquery_syntax_error(e): logger.warning(f"tsquery syntax error for search term: {search_text}, error: {e}") diff --git a/src/basic_memory/repository/search_repository_base.py b/src/basic_memory/repository/search_repository_base.py index 2e4e687a..52e2d0a8 100644 --- a/src/basic_memory/repository/search_repository_base.py +++ b/src/basic_memory/repository/search_repository_base.py @@ -40,6 +40,36 @@ OVERSIZED_ENTITY_VECTOR_SHARD_SIZE = 256 _SQLITE_MAX_PREPARE_WINDOW = 8 +# Interrogative/function words contribute lexical noise when a strict +# full-text query is relaxed: "when OR did OR a" matches loud wrong documents +# that displace genuine results from the ranking window. +RELAXATION_STOPWORDS = frozenset( + "a an and are as at be but by did do does for from had has have how i in is it of on " + "or that the their they this to was we were what when where which who whom whose why " + "will with you your".split() +) + + +def relaxed_query_words(search_text: Optional[str]) -> Optional[list[str]]: + """Content-bearing words for OR-relaxing a strict full-text query. + + Returns None when relaxation must not apply: empty input, quoted phrases, + or explicit boolean queries (user intent is not second-guessed). + """ + if not search_text: + return None + stripped = search_text.strip() + if '"' in stripped or any(op in f" {stripped} " for op in (" AND ", " OR ", " NOT ")): + return None + words = [word.strip("?!.,;:") for word in stripped.split()] + words = [ + word + for word in words + if word and word.isalnum() and word.lower() not in RELAXATION_STOPWORDS + ] + return words or None + + # Entity, observation, and relation rows in search_index carry ids from independent # auto-increment sequences, so a bare id is ambiguous across row types. Every map in # the vector/hybrid retrieval path must key rows by (type, id) to avoid collisions. @@ -229,6 +259,7 @@ async def search( min_similarity: Optional[float] = None, limit: int = 10, offset: int = 0, + allow_relaxed: bool = False, ) -> List[SearchIndexRow]: """Search across all indexed content. @@ -2174,6 +2205,9 @@ async def _search_hybrid( query_start = time.perf_counter() candidate_limit = max(self._semantic_vector_k, (limit + offset) * 10) fts_start = time.perf_counter() + # allow_relaxed: question-form queries rarely AND-match, and a dead FTS + # branch silently degrades hybrid to vector-only ranking. Fusion plus + # bm25 keep relaxed lexical candidates from dominating precision. fts_results = await self.search( search_text=search_text, permalink=permalink, @@ -2187,6 +2221,7 @@ async def _search_hybrid( retrieval_mode=SearchRetrievalMode.FTS, limit=candidate_limit, offset=0, + allow_relaxed=True, ) fts_ms = (time.perf_counter() - fts_start) * 1000 vector_start = time.perf_counter() diff --git a/src/basic_memory/repository/sqlite_search_repository.py b/src/basic_memory/repository/sqlite_search_repository.py index c467a9a2..8b82a79b 100644 --- a/src/basic_memory/repository/sqlite_search_repository.py +++ b/src/basic_memory/repository/sqlite_search_repository.py @@ -23,7 +23,10 @@ from basic_memory.repository.embedding_provider import EmbeddingProvider from basic_memory.repository.embedding_provider_factory import create_embedding_provider from basic_memory.repository.search_index_row import SearchIndexRow -from basic_memory.repository.search_repository_base import SearchRepositoryBase +from basic_memory.repository.search_repository_base import ( + SearchRepositoryBase, + relaxed_query_words, +) from basic_memory.repository.metadata_filters import parse_metadata_filters, build_sqlite_json_path from basic_memory.repository.semantic_errors import SemanticDependenciesMissingError from basic_memory.schemas.search import SearchItemType, SearchRetrievalMode @@ -255,6 +258,19 @@ def _prepare_single_term(self, term: str, is_prefix: bool = True) -> str: if "*" in term and all(c.isalnum() or c in "*_-" for c in term): return term + # Natural-language queries arrive with sentence punctuation that FTS5 + # treats as syntax ("When did Melanie paint a sunrise?"). The tokenizer + # ignores this punctuation in the INDEX, so stripping it from word + # edges loses nothing — but leaving it forces the whole question into + # an exact-phrase match that returns zero rows, silently disabling the + # FTS half of hybrid search. Interior characters (hyphens, slashes — + # permalinks and paths) are untouched. + if " " in term: + words = [word.strip("?!.,;:") for word in term.split()] + term = " ".join(word for word in words if word) + if not term: + return "" + # Characters that can cause FTS5 syntax errors when used as operators # We're more conservative here - only quote when we detect problematic patterns problematic_chars = [ @@ -351,6 +367,14 @@ def _prepare_search_term(self, term: str, is_prefix: bool = True) -> str: # For non-Boolean queries, use the single term preparation logic return self._prepare_single_term(term, is_prefix) + @staticmethod + def _relaxed_fts_text(search_text: Optional[str]) -> Optional[str]: + """OR-relaxed FTS5 expression for a failed strict query, or None.""" + words = relaxed_query_words(search_text) + if not words: + return None + return " OR ".join(f"{word}*" for word in words) + # ------------------------------------------------------------------ # sqlite-vec extension loading (SQLite-specific) # ------------------------------------------------------------------ @@ -953,8 +977,15 @@ async def search( min_similarity: Optional[float] = None, limit: int = 10, offset: int = 0, + allow_relaxed: bool = False, ) -> List[SearchIndexRow]: - """Search across all indexed content using SQLite FTS5.""" + """Search across all indexed content using SQLite FTS5. + + ``allow_relaxed=True`` retries a zero-result strict multi-word query + with OR-joined content terms. Only the hybrid path opts in: its FTS + branch otherwise contributes nothing for question-form queries. + Service-level FTS searches keep their own conservative fallback. + """ # --- Dispatch vector / hybrid modes (shared logic) --- dispatched = await self._dispatch_retrieval_mode( search_text=search_text, @@ -1021,6 +1052,21 @@ async def search( async with db.scoped_session(self.session_maker) as session: result = await session.execute(text(sql), params) rows = result.fetchall() + # Trigger: multi-word natural-language query matched nothing + # under the default all-terms-AND semantics. + # Why: questions ("when did X do Y") rarely have every word in + # one document; without relaxation the FTS half of hybrid + # search contributes zero candidates and ranking degrades to + # vector-only. + # Outcome: one retry with OR-joined prefix terms; bm25 still + # ranks multi-term matches first. + relaxed = ( + self._relaxed_fts_text(search_text) if allow_relaxed and not rows else None + ) + if relaxed and params.get("text"): + params["text"] = relaxed + result = await session.execute(text(sql), params) + rows = result.fetchall() except Exception as e: # Handle FTS5 syntax errors and provide user-friendly feedback if self._is_fts5_syntax_error(e): # pragma: no cover diff --git a/tests/repository/test_hybrid_fusion.py b/tests/repository/test_hybrid_fusion.py index 0a4a2c29..e61d02e5 100644 --- a/tests/repository/test_hybrid_fusion.py +++ b/tests/repository/test_hybrid_fusion.py @@ -77,6 +77,7 @@ async def search( min_similarity: Optional[float] = None, limit: int = 10, offset: int = 0, + allow_relaxed: bool = False, ) -> list[SearchIndexRow]: return [] # pragma: no cover diff --git a/tests/repository/test_postgres_search_repository.py b/tests/repository/test_postgres_search_repository.py index 7c022014..953f9542 100644 --- a/tests/repository/test_postgres_search_repository.py +++ b/tests/repository/test_postgres_search_repository.py @@ -1001,3 +1001,57 @@ async def test_postgres_search_categories_exact_match(session_maker, test_projec # Multiple categories union. multi = await repo.search(categories=["requirement", "decision"]) assert {r.id for r in multi} == {70101, 70102} + + +@pytest.mark.asyncio +async def test_postgres_question_punctuation_and_relaxation(session_maker, test_project): + """Question-form queries must produce clean lexemes and a usable relaxation. + + Parity with SQLite: sentence punctuation previously reached tsquery terms, + and a strict all-AND miss had no relaxed retry, silently disabling the FTS + half of hybrid search for natural-language questions. + """ + repo = PostgresSearchRepository(session_maker, project_id=test_project.id) + + # Edge punctuation stripped before lexeme formatting. + prepared = repo._prepare_search_term("When did Melanie paint a sunrise?") + assert "?" not in prepared + assert "sunrise:*" in prepared + + # Relaxation drops stopwords and OR-joins content terms. + relaxed = repo._relaxed_tsquery_text("When did Melanie paint a sunrise?") + assert relaxed == "Melanie:* | paint:* | sunrise:*" + + # User intent is not second-guessed. + assert repo._relaxed_tsquery_text("alpha AND beta") is None + assert repo._relaxed_tsquery_text('"exact phrase"') is None + assert repo._relaxed_tsquery_text(None) is None + + +@pytest.mark.asyncio +async def test_postgres_multiword_query_relaxes_on_strict_miss(session_maker, test_project): + repo = PostgresSearchRepository(session_maker, project_id=test_project.id) + now = datetime.now(timezone.utc) + await repo.index_item( + SearchIndexRow( + project_id=test_project.id, + id=77, + title="Trip plans", + content_stems="melanie painted a sunrise over the lake last year", + content_snippet="Melanie painted a sunrise over the lake last year.", + permalink="docs/trip-plans", + file_path="docs/trip-plans.md", + type="entity", + metadata={"note_type": "note"}, + created_at=now, + updated_at=now, + ) + ) + + # Default path stays strict: zero results, exactly as before. + strict = await repo.search(search_text="When did Melanie paint a sunrise?") + assert strict == [] + + # The hybrid FTS branch opts in; relaxation surfaces the doc. + results = await repo.search(search_text="When did Melanie paint a sunrise?", allow_relaxed=True) + assert any(r.id == 77 for r in results) diff --git a/tests/repository/test_search_repository.py b/tests/repository/test_search_repository.py index 0fb6d630..088077f2 100644 --- a/tests/repository/test_search_repository.py +++ b/tests/repository/test_search_repository.py @@ -1124,3 +1124,65 @@ async def test_search_categories_exact_match(search_repository, search_entity): # Multiple categories union: both observations come back. multi = await search_repository.search(categories=["requirement", "decision"]) assert {r.id for r in multi} == {70001, 70002} + + +@pytest.mark.asyncio +async def test_question_punctuation_does_not_phrase_quote(search_repository): + """Sentence punctuation must not force exact-phrase matching (#hybrid-fts). + + 'When did Melanie paint a sunrise?' previously became the FTS5 phrase + '"When did Melanie paint a sunrise?"*' — zero rows for any corpus — which + silently disabled the FTS half of hybrid search for question queries. + """ + prepared = search_repository._prepare_single_term("When did Melanie paint a sunrise?") + assert '"' not in prepared + assert "sunrise*" in prepared + + +@pytest.mark.asyncio +async def test_relaxed_fts_text_builds_or_query(search_repository): + relaxed = search_repository._relaxed_fts_text("When did Melanie paint a sunrise?") + # Stopwords dropped: relaxation keys on content-bearing terms only. + assert relaxed == "Melanie* OR paint* OR sunrise*" + + +@pytest.mark.asyncio +async def test_relaxed_fts_text_respects_user_intent(search_repository): + # Explicit boolean and quoted queries are not second-guessed. + assert search_repository._relaxed_fts_text("alpha AND beta") is None + assert search_repository._relaxed_fts_text('"exact phrase"') is None + assert search_repository._relaxed_fts_text("single") == "single*" + assert search_repository._relaxed_fts_text(None) is None + + +@pytest.mark.asyncio +async def test_multiword_query_relaxes_to_or_when_strict_misses(search_repository, search_entity): + """A question sharing only SOME words with a doc still surfaces it.""" + from basic_memory.repository.search_index_row import SearchIndexRow + from basic_memory.schemas.search import SearchItemType + + row = SearchIndexRow( + project_id=search_repository.project_id, + id=search_entity.id, + type=SearchItemType.ENTITY.value, + title="Trip plans", + content_snippet="Melanie painted a sunrise over the lake last year.", + content_stems="melanie painted a sunrise over the lake last year", + permalink=search_entity.permalink, + file_path=search_entity.file_path, + entity_id=search_entity.id, + metadata={"note_type": search_entity.note_type}, + created_at=search_entity.created_at, + updated_at=search_entity.updated_at, + ) + await search_repository.index_item(row) + + # Default path stays strict: zero results, exactly as before. + strict = await search_repository.search(search_text="When did Melanie paint a sunrise?") + assert strict == [] + + # The hybrid FTS branch opts in; relaxation surfaces the doc. + results = await search_repository.search( + search_text="When did Melanie paint a sunrise?", allow_relaxed=True + ) + assert any(r.entity_id == search_entity.id for r in results) diff --git a/tests/repository/test_semantic_search_base.py b/tests/repository/test_semantic_search_base.py index b75f8871..373f2ea4 100644 --- a/tests/repository/test_semantic_search_base.py +++ b/tests/repository/test_semantic_search_base.py @@ -89,6 +89,7 @@ async def search( min_similarity: float | None = None, limit: int = 10, offset: int = 0, + allow_relaxed: bool = False, ) -> list[SearchIndexRow]: return [] diff --git a/tests/repository/test_vector_pagination.py b/tests/repository/test_vector_pagination.py index 09a001e3..f08a98ba 100644 --- a/tests/repository/test_vector_pagination.py +++ b/tests/repository/test_vector_pagination.py @@ -62,6 +62,7 @@ async def search( min_similarity: float | None = None, limit: int = 10, offset: int = 0, + allow_relaxed: bool = False, ) -> list[SearchIndexRow]: return [] # pragma: no cover diff --git a/tests/repository/test_vector_threshold.py b/tests/repository/test_vector_threshold.py index 34a66d7d..91598518 100644 --- a/tests/repository/test_vector_threshold.py +++ b/tests/repository/test_vector_threshold.py @@ -66,6 +66,7 @@ async def search( min_similarity: Optional[float] = None, limit: int = 10, offset: int = 0, + allow_relaxed: bool = False, ) -> list[SearchIndexRow]: return [] # pragma: no cover From 8d4d1f19d5834cd15286689e3e86053a44c28457 Mon Sep 17 00:00:00 2001 From: Drew Cain Date: Fri, 12 Jun 2026 23:06:30 -0500 Subject: [PATCH 2/2] fix(core): Postgres parity for FTS punctuation/relaxation (CI green) CI Postgres shard caught two issues invisible to the local SQLite suite: 1. Postgres _prepare_single_term regression: the new edge-punctuation strip ran after special-character cleaning, so an all-special-char term ("()&!:") collapsed to empty and skipped the existing NOSPECIALCHARS:* guard, emitting a malformed ":*". Folded the strip into the word handlers so every guard survives, and added a single-word empty guard. 2. Backend-specific test assumptions. Four tests in test_search_repository.py (run under both backends via the search_repository fixture) asserted SQLite FTS5 syntax and SQLite-only strict-miss behavior. Postgres to_tsquery('english', ...) auto-strips stopwords, so "When did Melanie paint a sunrise?" already matches under strict AND. Made the four tests backend-aware via the existing is_postgres_backend() helper, and switched the relaxation integration test to a query with a word absent from the doc ("hiking") so the strict miss holds on both backends. Reproduced and fixed against real Postgres (testcontainers): full search test surface green on both backends (53 passed Postgres, 2968 SQLite), ruff + ty clean. Signed-off-by: Drew Cain --- .../repository/postgres_search_repository.py | 22 ++++----- .../test_postgres_search_repository.py | 10 +++-- tests/repository/test_search_repository.py | 45 ++++++++++++------- 3 files changed, 47 insertions(+), 30 deletions(-) diff --git a/src/basic_memory/repository/postgres_search_repository.py b/src/basic_memory/repository/postgres_search_repository.py index 1e2c60b6..92d201ff 100644 --- a/src/basic_memory/repository/postgres_search_repository.py +++ b/src/basic_memory/repository/postgres_search_repository.py @@ -243,17 +243,14 @@ def _prepare_single_term(self, term: str, is_prefix: bool = True) -> str: for char in special_chars: cleaned_term = cleaned_term.replace(char, " ") - # Sentence punctuation carries no lexical signal in tsquery either; - # strip it from word edges so question-form queries produce clean - # lexemes (parity with the SQLite FTS5 preparation). - if " " in cleaned_term: - cleaned_term = " ".join( - word.strip("?!.,;") for word in cleaned_term.split() if word.strip("?!.,;") - ) - # Handle multi-word queries if " " in cleaned_term: - words = [w for w in cleaned_term.split() if w.strip()] + # Strip sentence punctuation from word edges so question-form + # queries produce clean lexemes (parity with SQLite FTS5 prep). + # The tsquery tokenizer ignores this punctuation anyway; leaving it + # in only risks tsquery syntax errors. Interior characters are kept. + words = [w.strip("?!.,;") for w in cleaned_term.split()] + words = [w for w in words if w] if not words: # All characters were special chars, search won't match anything # Return a safe search term that won't cause syntax errors @@ -266,8 +263,11 @@ def _prepare_single_term(self, term: str, is_prefix: bool = True) -> str: # Join with AND operator return " & ".join(prepared_words) - # Single word - cleaned_term = cleaned_term.strip() + # Single word: strip edge punctuation; guard the now-empty case so a + # bare ":*"/"" never reaches tsquery. + cleaned_term = cleaned_term.strip().strip("?!.,;") + if not cleaned_term: + return "NOSPECIALCHARS:*" if is_prefix: return f"{cleaned_term}:*" else: diff --git a/tests/repository/test_postgres_search_repository.py b/tests/repository/test_postgres_search_repository.py index 953f9542..f3c497e6 100644 --- a/tests/repository/test_postgres_search_repository.py +++ b/tests/repository/test_postgres_search_repository.py @@ -1048,10 +1048,12 @@ async def test_postgres_multiword_query_relaxes_on_strict_miss(session_maker, te ) ) - # Default path stays strict: zero results, exactly as before. - strict = await repo.search(search_text="When did Melanie paint a sunrise?") + # A content word absent from the doc ("hiking") makes the strict + # all-terms-AND query miss even after Postgres drops stopwords — without + # it, to_tsquery('english', ...) already strips "when/did/a" and matches. + strict = await repo.search(search_text="Did Melanie go hiking at sunrise?") assert strict == [] - # The hybrid FTS branch opts in; relaxation surfaces the doc. - results = await repo.search(search_text="When did Melanie paint a sunrise?", allow_relaxed=True) + # The hybrid FTS branch opts in; OR-relaxation surfaces the partial match. + results = await repo.search(search_text="Did Melanie go hiking at sunrise?", allow_relaxed=True) assert any(r.id == 77 for r in results) diff --git a/tests/repository/test_search_repository.py b/tests/repository/test_search_repository.py index 088077f2..0068b3f1 100644 --- a/tests/repository/test_search_repository.py +++ b/tests/repository/test_search_repository.py @@ -1136,23 +1136,37 @@ async def test_question_punctuation_does_not_phrase_quote(search_repository): """ prepared = search_repository._prepare_single_term("When did Melanie paint a sunrise?") assert '"' not in prepared - assert "sunrise*" in prepared + # Prefix syntax differs by backend: FTS5 uses '*', tsquery uses ':*'. + if is_postgres_backend(search_repository): + assert "sunrise:*" in prepared + else: + assert "sunrise*" in prepared @pytest.mark.asyncio -async def test_relaxed_fts_text_builds_or_query(search_repository): - relaxed = search_repository._relaxed_fts_text("When did Melanie paint a sunrise?") - # Stopwords dropped: relaxation keys on content-bearing terms only. - assert relaxed == "Melanie* OR paint* OR sunrise*" +async def test_relaxed_query_drops_stopwords(search_repository): + """Relaxation keys on content-bearing terms in each backend's syntax.""" + if is_postgres_backend(search_repository): + relaxed = search_repository._relaxed_tsquery_text("When did Melanie paint a sunrise?") + assert relaxed == "Melanie:* | paint:* | sunrise:*" + else: + relaxed = search_repository._relaxed_fts_text("When did Melanie paint a sunrise?") + assert relaxed == "Melanie* OR paint* OR sunrise*" @pytest.mark.asyncio -async def test_relaxed_fts_text_respects_user_intent(search_repository): - # Explicit boolean and quoted queries are not second-guessed. - assert search_repository._relaxed_fts_text("alpha AND beta") is None - assert search_repository._relaxed_fts_text('"exact phrase"') is None - assert search_repository._relaxed_fts_text("single") == "single*" - assert search_repository._relaxed_fts_text(None) is None +async def test_relaxed_query_respects_user_intent(search_repository): + # Explicit boolean and quoted queries are not second-guessed (both backends). + if is_postgres_backend(search_repository): + relaxer = search_repository._relaxed_tsquery_text + single = "single:*" + else: + relaxer = search_repository._relaxed_fts_text + single = "single*" + assert relaxer("alpha AND beta") is None + assert relaxer('"exact phrase"') is None + assert relaxer("single") == single + assert relaxer(None) is None @pytest.mark.asyncio @@ -1177,12 +1191,13 @@ async def test_multiword_query_relaxes_to_or_when_strict_misses(search_repositor ) await search_repository.index_item(row) - # Default path stays strict: zero results, exactly as before. - strict = await search_repository.search(search_text="When did Melanie paint a sunrise?") + # "hiking" is absent from the doc, so strict all-terms-AND misses on both + # backends (Postgres's stopword stripping can't rescue it either). + strict = await search_repository.search(search_text="Did Melanie go hiking at sunrise?") assert strict == [] - # The hybrid FTS branch opts in; relaxation surfaces the doc. + # The hybrid FTS branch opts in; OR-relaxation surfaces the partial match. results = await search_repository.search( - search_text="When did Melanie paint a sunrise?", allow_relaxed=True + search_text="Did Melanie go hiking at sunrise?", allow_relaxed=True ) assert any(r.entity_id == search_entity.id for r in results)