From 1760025e32f4b55886d2296b781fcf8b24362ad7 Mon Sep 17 00:00:00 2001
From: yug <>
Date: Thu, 14 May 2026 22:41:00 -0400
Subject: [PATCH 1/5] hybrid search

---
 .env.sample      |   9 ++-
 main.py          | 166 ++++++++++++++++++++++++++++++++++++++++++++++-
 requirements.txt |   2 +-
 3 files changed, 174 insertions(+), 3 deletions(-)
diff --git a/.env.sample b/.env.sample
index 1062fee..f36dd48 100644
--- a/.env.sample
+++ b/.env.sample
@@ -11,10 +11,17 @@ AWS_SECRET=secret
 
 ELASTIC_PASSWORD=123
 ES_PORT=9200
-ES_STACK_VERSION=8.9.0
+ES_STACK_VERSION=8.16.0
 
 INDEXING_PASSWORD=index123
 
+# Semantic search (optional). When SEMANTIC_SEARCH_ENABLED is truthy,
+# /index creates a Google AI Studio text-embedding-005 inference endpoint and
+# indexes a semantic_text field, and /search honors `?semantic=true` to
+# RRF-fuse semantic + lexical hits. Get an API key at https://aistudio.google.com/apikey
+SEMANTIC_SEARCH_ENABLED=false
+GOOGLE_AI_STUDIO_API_KEY=your-api-key-here
+
 # Grafana Cloud — logs (Loki)
 # Get from: grafana.com → your stack → Loki card → "Send Logs"
 # Token: My Account → Access Policies → token with logs:write scope
diff --git a/main.py b/main.py
index d3c1823..e72a9ed 100644
--- a/main.py
+++ b/main.py
@@ -67,6 +67,15 @@ def _emit_access_log(response):
 
 INDEX_NAME = "english"
 
+SEMANTIC_ENABLED = os.environ.get("SEMANTIC_SEARCH_ENABLED", "").lower() in ("1", "true", "yes")
+SEMANTIC_FIELD = "hadithTextSemantic"
+INFERENCE_ENDPOINT = "googleai-text-embedding"
+# RRF constants. k=60 is the value from the original Cormack et al. paper and
+# the ES default. RRF_WINDOW is the depth fetched from each retriever before
+# fusion — bigger window = better recall at the tail, more cost per query.
+RRF_K = 60
+RRF_WINDOW = 100
+
 # Tiebreaker boosts added on top of the text-similarity score so canonical
 # collections rise when relevance is otherwise comparable. Sized to swing
 # rankings when BM25 scores are within a few points (e.g. the same hadith
@@ -106,6 +115,31 @@ def home():
     return "<h1>Welcome to sunnah.com search api.</h1>"
 
 
+def create_inference_endpoint():
+    """Create the Google AI Studio text-embedding inference endpoint used by
+    the semantic_text field. Deletes any existing endpoint with the same id
+    so re-indexing picks up dimension/model changes."""
+    try:
+        es_client.inference.delete(
+            task_type="text_embedding",
+            inference_id=INFERENCE_ENDPOINT,
+            force=True,
+        )
+    except NotFoundError:
+        pass
+    es_client.options(request_timeout=60).inference.put(
+        task_type="text_embedding",
+        inference_id=INFERENCE_ENDPOINT,
+        inference_config={
+            "service": "googleaistudio",
+            "service_settings": {
+                "api_key": os.environ.get("GOOGLE_AI_STUDIO_API_KEY"),
+                "model_id": "text-embedding-005",
+            },
+        },
+    )
+
+
 def create_and_update_index(index_name, documents, fields_to_not_index):
     settings = {
         "index": {
@@ -193,10 +227,20 @@ def create_and_update_index(index_name, documents, fields_to_not_index):
         }
         | {"arabicText": {"type": "text", "analyzer": "custom_arabic"}}
     }
+    if SEMANTIC_ENABLED:
+        mappings["properties"][SEMANTIC_FIELD] = {
+            "type": "semantic_text",
+            "inference_id": INFERENCE_ENDPOINT,
+        }
     if es_client.indices.exists(index=index_name):
         es_client.indices.delete(index=index_name)
     es_client.indices.create(index=index_name, mappings=mappings, settings=settings)
-    successCount, errors = helpers.bulk(es_client, documents, index=index_name)
+    # When semantic_text is in the mapping each doc triggers an embedding API
+    # call during indexing, so the bulk request needs a longer timeout.
+    bulk_timeout = 300 if SEMANTIC_ENABLED else 60
+    successCount, errors = helpers.bulk(
+        es_client, documents, index=index_name, request_timeout=bulk_timeout
+    )
     return successCount, errors
 
 def get_suggest_query(suggest_field):
@@ -225,6 +269,9 @@ def index():
     if request.args.get("password") != os.environ.get("INDEXING_PASSWORD"):
         return "Must provide valid password to index", 401
 
+    if SEMANTIC_ENABLED:
+        create_inference_endpoint()
+
     connection = pymysql.connect(
         host=os.environ.get("MYSQL_HOST"),
         user=os.environ.get("MYSQL_USER"),
@@ -257,6 +304,8 @@ def index():
 
     # Add arabic text and hadithNumber to english hadith
     for englishHadith in englishHadiths:
+        if SEMANTIC_ENABLED:
+            englishHadith[SEMANTIC_FIELD] = englishHadith["hadithText"]
         if englishHadith["urn"] not in matchingArabicHadiths:
            continue
         matchingArabic = matchingArabicHadiths[englishHadith["urn"]]
@@ -303,6 +352,41 @@ def index_status():
     }
 
 
+def _rrf_merge(lexical_resp, semantic_resp, k, from_, size):
+    """Fuse a lexical and a semantic ES response by reciprocal rank fusion: a
+    doc scores sum(1/(k+rank)) over the legs returning it (rank is 1-indexed).
+    Returns a response-shaped dict with the [from_, from_+size) fused slice."""
+    lexical_hits = lexical_resp.get("hits", {})
+    semantic_hits = semantic_resp.get("hits", {})
+    fused, chosen = {}, {}
+    # The lexical leg always supplies the kept hit for its docs (it is the only
+    # leg that carries a highlight); semantic hits only fill in the gaps.
+    for leg, is_lexical in ((lexical_hits, True), (semantic_hits, False)):
+        for position, hit in enumerate(leg.get("hits", []), start=1):
+            key = hit["_id"]
+            fused[key] = fused.get(key, 0.0) + 1.0 / (k + position)
+            if is_lexical or key not in chosen:
+                chosen[key] = hit
+
+    # sorted() is stable, so equal scores keep first-seen (lexical-first) order.
+    ranking = sorted(fused, key=fused.get, reverse=True)
+    page = [
+        {**chosen[key], "_score": fused[key]} for key in ranking[from_ : from_ + size]
+    ]
+    # Report the lexical keyword-match total so it stays comparable to the
+    # non-semantic path; fall back to the fused doc count when it is absent.
+    fallback_total = {"value": len(ranking), "relation": "eq"}
+    return {
+        "took": lexical_resp.get("took", 0) + semantic_resp.get("took", 0),
+        "hits": {
+            "total": lexical_hits.get("total", fallback_total),
+            "max_score": fused[ranking[0]] if ranking else None,
+            "hits": page,
+        },
+        "suggest": lexical_resp.get("suggest"),
+    }
+
+
 def get_filter_from_args(args):
     filters = []
     collection = args.getlist("collection")
@@ -318,6 +402,9 @@ def get_filter_from_args(args):
 def search(language):
     query = request.args.get("q")
     filter = get_filter_from_args(request.args)
+    use_semantic = SEMANTIC_ENABLED and request.args.get(
+        "semantic", ""
+    ).lower() in ("1", "true", "yes")
 
     fields = ["hadithNumber^2", "hadithText", "arabicText", "collection^2"]
 
@@ -345,6 +432,9 @@ def build_query(query_type):
             }
         }
 
+    if use_semantic:
+        return _semantic_rrf_search(language, query, filter, build_query)
+
     search_kwargs = {
         "index": language,
         "from_": request.args.get("from", 0),
@@ -381,5 +471,79 @@ def build_query(query_type):
     return jsonify(result.body)
 
 
+def _semantic_rrf_search(language, query, filter_clauses, build_lexical_query):
+    """Run lexical + semantic searches in parallel via msearch and fuse with RRF.
+    Lexical query keeps the function_score collection boosts; semantic uses the
+    inference-backed semantic_text field. Fusion happens in Python to avoid the
+    Enterprise-licensed RRF retriever."""
+    from_ = int(request.args.get("from", 0))
+    size = int(request.args.get("size", 10))
+    window = max(RRF_WINDOW, from_ + size)
+
+    semantic_query = {
+        "bool": {
+            "filter": filter_clauses,
+            "must": [{"semantic": {"field": SEMANTIC_FIELD, "query": query}}],
+        }
+    }
+    # The semantic_text field stores chunked embeddings + a copy of the input
+    # text; excluding it keeps responses lean.
+    common_body = {
+        "from": 0,
+        "size": window,
+        "_source": {"excludes": [SEMANTIC_FIELD]},
+    }
+    # Highlight + suggest run on the lexical leg only — _rrf_merge keeps the
+    # lexical hit for any doc that appears in both legs, so a semantic-leg
+    # highlight would be computed and then discarded.
+    lexical_body = {
+        **common_body,
+        "highlight": {"number_of_fragments": 0, "fields": {"*": {}}},
+        "suggest": {
+            "text": query,
+            "english": {"phrase": get_suggest_query("hadithText.trigram")},
+            "arabic": {"phrase": get_suggest_query("arabicText")},
+        },
+    }
+
+    def _run(query_type):
+        searches = [
+            {"index": language},
+            {**lexical_body, "query": build_lexical_query(query_type)},
+            {"index": language},
+            {**common_body, "query": semantic_query},
+        ]
+        return es_client.options(request_timeout=130).msearch(searches=searches)
+
+    try:
+        result = _run("query_string")
+        if result["responses"][0].get("error"):
+            # Only the lexical leg can hit query_string strictness; the
+            # semantic leg uses a `semantic` clause that doesn't parse the
+            # user's query as a syntax.
+            result = _run("simple_query_string")
+    except BadRequestError as e:
+        access_log.warning(
+            "malformed_query",
+            extra={"request_id": getattr(g, "request_id", None), "detail": str(e)},
+        )
+        return jsonify({"error": "malformed query"}), 400
+
+    lex_resp, sem_resp = result["responses"][0], result["responses"][1]
+    for resp, label in ((lex_resp, "lexical"), (sem_resp, "semantic")):
+        if resp.get("error"):
+            access_log.warning(
+                "rrf_subquery_failed",
+                extra={
+                    "request_id": getattr(g, "request_id", None),
+                    "leg": label,
+                    "error": resp["error"],
+                },
+            )
+            return jsonify({"error": "malformed query"}), 400
+
+    return jsonify(_rrf_merge(lex_resp, sem_resp, RRF_K, from_, size))
+
+
 if __name__ == "__main__":
     app.run(host="0.0.0.0")
diff --git a/requirements.txt b/requirements.txt
index bf410ce..69d88a5 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,7 +4,7 @@ Jinja2==2.11.3
 python-dotenv==0.13.0
 virtualenv==20.0.25
 Werkzeug==1.0.1
-elasticsearch==8.9.0
+elasticsearch==8.16.0
 MarkupSafe==1.1.1
 itsdangerous==1.1.0
 python-json-logger==2.0.7

From babe5e2775d2e50f9249783cd4c6f845a8462ef9 Mon Sep 17 00:00:00 2001
From: yug <>
Date: Thu, 14 May 2026 23:31:08 -0400
Subject: [PATCH 2/5] Back to openai

---
 .env.sample |  6 +++---
 main.py     | 59 +++++++++++++++++++++++++++++++++++++++++++++--------
 2 files changed, 53 insertions(+), 12 deletions(-)

diff --git a/.env.sample b/.env.sample
index f36dd48..588d017 100644
--- a/.env.sample
+++ b/.env.sample
@@ -16,11 +16,11 @@ ES_STACK_VERSION=8.16.0
 INDEXING_PASSWORD=index123
 
 # Semantic search (optional). When SEMANTIC_SEARCH_ENABLED is truthy,
-# /index creates a Google AI Studio text-embedding-005 inference endpoint and
+# /index creates an OpenAI text-embedding-3-small inference endpoint and
 # indexes a semantic_text field, and /search honors `?semantic=true` to
-# RRF-fuse semantic + lexical hits. Get an API key at https://aistudio.google.com/apikey
+# RRF-fuse semantic + lexical hits. Get an API key at https://platform.openai.com/api-keys
 SEMANTIC_SEARCH_ENABLED=false
-GOOGLE_AI_STUDIO_API_KEY=your-api-key-here
+OPENAI_API_KEY=your-api-key-here
 
 # Grafana Cloud — logs (Loki)
 # Get from: grafana.com → your stack → Loki card → "Send Logs"
diff --git a/main.py b/main.py
index e72a9ed..8c3d955 100644
--- a/main.py
+++ b/main.py
@@ -69,7 +69,7 @@ def _emit_access_log(response):
 
 SEMANTIC_ENABLED = os.environ.get("SEMANTIC_SEARCH_ENABLED", "").lower() in ("1", "true", "yes")
 SEMANTIC_FIELD = "hadithTextSemantic"
-INFERENCE_ENDPOINT = "googleai-text-embedding"
+INFERENCE_ENDPOINT = "openai-text-embedding"
 # RRF constants. k=60 is the value from the original Cormack et al. paper and
 # the ES default. RRF_WINDOW is the depth fetched from each retriever before
 # fusion — bigger window = better recall at the tail, more cost per query.
@@ -116,9 +116,9 @@ def home():
 
 
 def create_inference_endpoint():
-    """Create the Google AI Studio text-embedding inference endpoint used by
-    the semantic_text field. Deletes any existing endpoint with the same id
-    so re-indexing picks up dimension/model changes."""
+    """Create the OpenAI text-embedding inference endpoint used by the
+    semantic_text field. Deletes any existing endpoint with the same id so
+    re-indexing picks up dimension/model changes."""
     try:
         es_client.inference.delete(
             task_type="text_embedding",
@@ -131,10 +131,14 @@ def create_inference_endpoint():
         task_type="text_embedding",
         inference_id=INFERENCE_ENDPOINT,
         inference_config={
-            "service": "googleaistudio",
+            "service": "openai",
             "service_settings": {
-                "api_key": os.environ.get("GOOGLE_AI_STUDIO_API_KEY"),
-                "model_id": "text-embedding-005",
+                "api_key": os.environ.get("OPENAI_API_KEY"),
+                "model_id": "text-embedding-3-small",
+                # OpenAI vectors are mathematically unit-length but drift past
+                # ES's strict epsilon for a small fraction of inputs, breaking
+                # the default dot_product similarity. See elastic/elasticsearch#122878.
+                "similarity": "cosine",
             },
         },
     )
@@ -239,7 +243,12 @@ def create_and_update_index(index_name, documents, fields_to_not_index):
     # call during indexing, so the bulk request needs a longer timeout.
     bulk_timeout = 300 if SEMANTIC_ENABLED else 60
     successCount, errors = helpers.bulk(
-        es_client, documents, index=index_name, request_timeout=bulk_timeout
+        es_client,
+        documents,
+        index=index_name,
+        request_timeout=bulk_timeout,
+        raise_on_error=False,
+        raise_on_exception=False,
     )
     return successCount, errors
 
@@ -433,12 +442,22 @@ def build_query(query_type):
         }
 
     if use_semantic:
+        access_log.info(
+            "semantic_search_enabled",
+            extra={
+                "request_id": getattr(g, "request_id", None),
+                "language": language,
+                "query": query,
+                "filters": filter,
+            },
+        )
         return _semantic_rrf_search(language, query, filter, build_query)
 
     search_kwargs = {
         "index": language,
         "from_": request.args.get("from", 0),
         "size": request.args.get("size", 10),
+        "_source": {"excludes": [SEMANTIC_FIELD]},
         "highlight": {"number_of_fragments": 0, "fields": {"*": {}}},
         "suggest": {
             "text": query,
@@ -530,6 +549,17 @@ def _run(query_type):
         return jsonify({"error": "malformed query"}), 400
 
     lex_resp, sem_resp = result["responses"][0], result["responses"][1]
+    access_log.info(
+        "semantic_rrf_legs",
+        extra={
+            "request_id": getattr(g, "request_id", None),
+            "lexical_hits": len(lex_resp.get("hits", {}).get("hits", [])),
+            "semantic_hits": len(sem_resp.get("hits", {}).get("hits", [])),
+            "lexical_took_ms": lex_resp.get("took"),
+            "semantic_took_ms": sem_resp.get("took"),
+            "window": window,
+        },
+    )
     for resp, label in ((lex_resp, "lexical"), (sem_resp, "semantic")):
         if resp.get("error"):
             access_log.warning(
@@ -542,7 +572,18 @@ def _run(query_type):
             )
             return jsonify({"error": "malformed query"}), 400
 
-    return jsonify(_rrf_merge(lex_resp, sem_resp, RRF_K, from_, size))
+    merged = _rrf_merge(lex_resp, sem_resp, RRF_K, from_, size)
+    access_log.info(
+        "semantic_rrf_merged",
+        extra={
+            "request_id": getattr(g, "request_id", None),
+            "returned_hits": len(merged["hits"]["hits"]),
+            "max_score": merged["hits"]["max_score"],
+            "from": from_,
+            "size": size,
+        },
+    )
+    return jsonify(merged)
 
 
 if __name__ == "__main__":

From 3a3630c63acc70efc47a0c1bd125f78b2c887dfa Mon Sep 17 00:00:00 2001
From: yug <>
Date: Fri, 15 May 2026 09:33:56 -0400
Subject: [PATCH 3/5] Semantic only mode

---
 .env.sample |   4 --
 main.py     | 116 ++++++++++++++++++++++++++++++++++------------------
 2 files changed, 76 insertions(+), 44 deletions(-)

diff --git a/.env.sample b/.env.sample
index 588d017..96603a6 100644
--- a/.env.sample
+++ b/.env.sample
@@ -15,10 +15,6 @@ ES_STACK_VERSION=8.16.0
 
 INDEXING_PASSWORD=index123
 
-# Semantic search (optional). When SEMANTIC_SEARCH_ENABLED is truthy,
-# /index creates an OpenAI text-embedding-3-small inference endpoint and
-# indexes a semantic_text field, and /search honors `?semantic=true` to
-# RRF-fuse semantic + lexical hits. Get an API key at https://platform.openai.com/api-keys
 SEMANTIC_SEARCH_ENABLED=false
 OPENAI_API_KEY=your-api-key-here
 
diff --git a/main.py b/main.py
index 8c3d955..4718814 100644
--- a/main.py
+++ b/main.py
@@ -76,6 +76,12 @@ def _emit_access_log(response):
 RRF_K = 60
 RRF_WINDOW = 100
 
+# Search modes. SEMANTIC_MODES are the ones needing the inference endpoint —
+# kept as one tuple so the "needs the semantic backend" rule has a single
+# source of truth across mode resolution and request dispatch.
+SEARCH_MODES = ("lexical", "hybrid", "semantic")
+SEMANTIC_MODES = ("hybrid", "semantic")
+
 # Tiebreaker boosts added on top of the text-similarity score so canonical
 # collections rise when relevance is otherwise comparable. Sized to swing
 # rankings when BM25 scores are within a few points (e.g. the same hadith
@@ -272,6 +278,35 @@ def get_suggest_query(suggest_field):
         },
     }
 
+
+def get_suggest_block(query):
+    """Suggest ("did you mean") body with English and Arabic phrase suggesters."""
+    block = {"text": query}
+    # One named phrase suggester per text field.
+    for name, field in (("english", "hadithText.trigram"), ("arabic", "arabicText")):
+        block[name] = {"phrase": get_suggest_query(field)}
+    return block
+
+
+def build_semantic_query(query, filter_clauses):
+    """Return a bool query that matches `query` against the semantic_text field
+    (SEMANTIC_FIELD) and applies `filter_clauses` as the bool filter."""
+    # The `semantic` clause sits in `must`, so it is the part that is scored;
+    # the filter clauses only restrict which docs are eligible.
+    semantic_clause = {"semantic": {"field": SEMANTIC_FIELD, "query": query}}
+    bool_body = {"filter": filter_clauses, "must": [semantic_clause]}
+    return {"bool": bool_body}
+
+
+def malformed_query_response(exc):
+    """Log `exc` as a "malformed_query" warning and return a generic 400, so ES
+    internals (field paths, index names) reach the log but not the client."""
+    # g may carry no request_id; getattr logs None then instead of raising.
+    request_id = getattr(g, "request_id", None)
+    log_fields = {"request_id": request_id, "detail": str(exc)}
+    access_log.warning("malformed_query", extra=log_fields)
+    return jsonify({"error": "malformed query"}), 400
+
 @app.route("/index", methods=["GET"])
 def index():
     start = time.time()
@@ -407,13 +442,22 @@ def get_filter_from_args(args):
         filters.append({"terms": {"grade": grade}})
     return filters
 
+def _resolve_search_mode(args):
+    """Pick the search mode from ?mode= (case-insensitive). Unknown values, and
+    semantic-backed modes requested while SEMANTIC_SEARCH_ENABLED is off, both
+    resolve to "lexical" so a deploy with no inference endpoint still answers."""
+    requested = args.get("mode", "lexical").lower()
+    # A mode is honoured only if it is known and its backend is available.
+    is_known = requested in SEARCH_MODES
+    needs_backend = requested in SEMANTIC_MODES
+    usable = is_known and (SEMANTIC_ENABLED or not needs_backend)
+    return requested if usable else "lexical"
+
 @app.route("/<language>/search", methods=["GET"])
 def search(language):
     query = request.args.get("q")
     filter = get_filter_from_args(request.args)
-    use_semantic = SEMANTIC_ENABLED and request.args.get(
-        "semantic", ""
-    ).lower() in ("1", "true", "yes")
+    mode = _resolve_search_mode(request.args)
 
     fields = ["hadithNumber^2", "hadithText", "arabicText", "collection^2"]
 
@@ -441,17 +485,20 @@ def build_query(query_type):
             }
         }
 
-    if use_semantic:
+    if mode in SEMANTIC_MODES:
         access_log.info(
-            "semantic_search_enabled",
+            "semantic_search_mode",
             extra={
                 "request_id": getattr(g, "request_id", None),
                 "language": language,
                 "query": query,
                 "filters": filter,
+                "mode": mode,
             },
         )
-        return _semantic_rrf_search(language, query, filter, build_query)
+        if mode == "hybrid":
+            return _semantic_rrf_search(language, query, filter, build_query)
+        return _semantic_only_search(language, query, filter)
 
     search_kwargs = {
         "index": language,
@@ -459,15 +506,7 @@ def build_query(query_type):
         "size": request.args.get("size", 10),
         "_source": {"excludes": [SEMANTIC_FIELD]},
         "highlight": {"number_of_fragments": 0, "fields": {"*": {}}},
-        "suggest": {
-            "text": query,
-            "english": {
-                "phrase": get_suggest_query("hadithText.trigram"),
-            },
-            "arabic": {
-                "phrase": get_suggest_query("arabicText"),
-            },
-        },
+        "suggest": get_suggest_block(query),
     }
 
     try:
@@ -477,15 +516,7 @@ def build_query(query_type):
             # query_string syntax is strict; retry once with simple_query_string, which tolerates malformed input
             result = es_client.search(query=build_query("simple_query_string"), **search_kwargs)
     except BadRequestError as e:
-        # Don't leak ES internals (field paths, index names) to client.
-        access_log.warning(
-            "malformed_query",
-            extra={
-                "request_id": getattr(g, "request_id", None),
-                "detail": str(e),
-            },
-        )
-        return jsonify({"error": "malformed query"}), 400
+        return malformed_query_response(e)
 
     return jsonify(result.body)
 
@@ -499,12 +530,7 @@ def _semantic_rrf_search(language, query, filter_clauses, build_lexical_query):
     size = int(request.args.get("size", 10))
     window = max(RRF_WINDOW, from_ + size)
 
-    semantic_query = {
-        "bool": {
-            "filter": filter_clauses,
-            "must": [{"semantic": {"field": SEMANTIC_FIELD, "query": query}}],
-        }
-    }
+    semantic_query = build_semantic_query(query, filter_clauses)
     # The semantic_text field stores chunked embeddings + a copy of the input
     # text; excluding it keeps responses lean.
     common_body = {
@@ -518,11 +544,7 @@ def _semantic_rrf_search(language, query, filter_clauses, build_lexical_query):
     lexical_body = {
         **common_body,
         "highlight": {"number_of_fragments": 0, "fields": {"*": {}}},
-        "suggest": {
-            "text": query,
-            "english": {"phrase": get_suggest_query("hadithText.trigram")},
-            "arabic": {"phrase": get_suggest_query("arabicText")},
-        },
+        "suggest": get_suggest_block(query),
     }
 
     def _run(query_type):
@@ -542,11 +564,7 @@ def _run(query_type):
             # user's query as a syntax.
             result = _run("simple_query_string")
     except BadRequestError as e:
-        access_log.warning(
-            "malformed_query",
-            extra={"request_id": getattr(g, "request_id", None), "detail": str(e)},
-        )
-        return jsonify({"error": "malformed query"}), 400
+        return malformed_query_response(e)
 
     lex_resp, sem_resp = result["responses"][0], result["responses"][1]
     access_log.info(
@@ -586,5 +604,23 @@ def _run(query_type):
     return jsonify(merged)
 
 
+def _semantic_only_search(language, query, filter_clauses):
+    """Single semantic query on the semantic_text field: no lexical leg/RRF, so
+    no collection boosts (function_score) and no highlight (semantic_text can't
+    be highlighted). Non-integer ?from=/?size= gets a 400, not an unhandled 500."""
+    try:
+        result = es_client.options(request_timeout=130).search(
+            index=language,
+            from_=int(request.args.get("from", 0)),
+            size=int(request.args.get("size", 10)),
+            query=build_semantic_query(query, filter_clauses),
+            _source={"excludes": [SEMANTIC_FIELD]},
+            suggest=get_suggest_block(query),
+        )
+    except (BadRequestError, ValueError) as e:
+        return malformed_query_response(e)
+    return jsonify(result.body)
+
+
 if __name__ == "__main__":
     app.run(host="0.0.0.0")

From 9891a04e4816296a9c2c57b91aa0ce8e158737aa Mon Sep 17 00:00:00 2001
From: yug <>
Date: Fri, 15 May 2026 10:15:25 -0400
Subject: [PATCH 4/5] incremental indexing

---
 main.py | 208 +++++++++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 182 insertions(+), 26 deletions(-)

diff --git a/main.py b/main.py
index 4718814..4a45b2d 100644
--- a/main.py
+++ b/main.py
@@ -1,3 +1,4 @@
+import hashlib
 import logging
 import sys
 import time
@@ -69,6 +70,9 @@ def _emit_access_log(response):
 
 SEMANTIC_ENABLED = os.environ.get("SEMANTIC_SEARCH_ENABLED", "").lower() in ("1", "true", "yes")
 SEMANTIC_FIELD = "hadithTextSemantic"
+# Per-doc content hash stored alongside each document. An incremental reindex
+# diffs incoming docs against this to skip re-embedding unchanged hadiths.
+CONTENT_HASH_FIELD = "contentHash"
 INFERENCE_ENDPOINT = "openai-text-embedding"
 # RRF constants. k=60 is the value from the original Cormack et al. paper and
 # the ES default. RRF_WINDOW is the depth fetched from each retriever before
@@ -123,14 +127,18 @@ def home():
 
 def create_inference_endpoint():
     """Create the OpenAI text-embedding inference endpoint used by the
-    semantic_text field. Deletes any existing endpoint with the same id so
-    re-indexing picks up dimension/model changes."""
+    semantic_text field, only if it doesn't already exist.
+
+    Kept stable across re-indexes: the alias swap builds a new index while the
+    old one keeps serving traffic, and both reference this endpoint by id —
+    force-deleting it mid-reindex would break the live index's semantic field.
+    To change the model or dimensions, delete the endpoint manually so the
+    next reindex recreates it."""
     try:
-        es_client.inference.delete(
-            task_type="text_embedding",
-            inference_id=INFERENCE_ENDPOINT,
-            force=True,
+        es_client.inference.get(
+            task_type="text_embedding", inference_id=INFERENCE_ENDPOINT
         )
+        return
     except NotFoundError:
         pass
     es_client.options(request_timeout=60).inference.put(
@@ -150,7 +158,46 @@ def create_inference_endpoint():
     )
 
 
-def create_and_update_index(index_name, documents, fields_to_not_index):
+def _content_hash(doc):
+    """Return a SHA-256 hex digest of a document's content fields.
+
+    Skips "_id", the stored hash itself and the semantic field (a verbatim
+    copy of hadithText, which is hashed). Keys are sorted so the digest does
+    not depend on dict order; non-JSON values are serialized through str()."""
+    skipped = {"_id", CONTENT_HASH_FIELD, SEMANTIC_FIELD}
+    content = {name: value for name, value in doc.items() if name not in skipped}
+    serialized = json.dumps(
+        content, sort_keys=True, default=str, ensure_ascii=False
+    )
+    return hashlib.sha256(serialized.encode("utf-8")).hexdigest()
+
+
+def _prepare_documents(documents):
+    """Give each doc, in place, a deterministic "_id" of "<lang>:<urn>" plus its
+    content hash; returns `documents`. The language prefix is needed because
+    English and Arabic URNs overlap; a stable id lets a reindex match docs."""
+    for entry in documents:
+        entry["_id"] = "{}:{}".format(entry["lang"], entry["urn"])
+        entry[CONTENT_HASH_FIELD] = _content_hash(entry)
+    return documents
+
+
+def _index_supports_incremental():
+    """Report whether every index behind INDEX_NAME has the content-hash field
+    in its mapping, i.e. was built by this indexer. False when the index is
+    missing or the mapping response is empty, so there is nothing to diff."""
+    try:
+        mapping = es_client.indices.get_mapping(index=INDEX_NAME)
+    except NotFoundError:
+        return False
+    for index_def in mapping.values():
+        properties = index_def.get("mappings", {}).get("properties", {})
+        if CONTENT_HASH_FIELD not in properties:
+            return False
+    return bool(mapping)
+
+
+def create_and_update_index(documents, fields_to_not_index):
     settings = {
         "index": {
             "number_of_shards": 1,
@@ -242,22 +289,116 @@ def create_and_update_index(index_name, documents, fields_to_not_index):
             "type": "semantic_text",
             "inference_id": INFERENCE_ENDPOINT,
         }
-    if es_client.indices.exists(index=index_name):
-        es_client.indices.delete(index=index_name)
-    es_client.indices.create(index=index_name, mappings=mappings, settings=settings)
+    mappings["properties"][CONTENT_HASH_FIELD] = {"type": "keyword", "index": False}
+    # Zero-downtime reindex: build into a fresh concrete index, then atomically
+    # repoint the INDEX_NAME alias at it. Searches keep hitting the old index
+    # until the swap, so there's no NotFoundError window. The previous
+    # delete-then-recreate caused ~2-3 min of downtime.
+    new_index = f"{INDEX_NAME}-{int(time.time())}"
+    es_client.indices.create(index=new_index, mappings=mappings, settings=settings)
+
+    _prepare_documents(documents)
+
     # When semantic_text is in the mapping each doc triggers an embedding API
-    # call during indexing, so the bulk request needs a longer timeout.
+    # call during indexing, so the bulk request needs a longer timeout. Indexing
+    # stays single-stream: OpenAI's tokens-per-minute quota is the ceiling for
+    # the embedding calls, so fanning out just trips 429s without going faster.
     bulk_timeout = 300 if SEMANTIC_ENABLED else 60
     successCount, errors = helpers.bulk(
         es_client,
         documents,
-        index=index_name,
+        index=new_index,
         request_timeout=bulk_timeout,
         raise_on_error=False,
         raise_on_exception=False,
     )
+
+    # Don't swap an empty/failed build over a working index.
+    if successCount == 0:
+        es_client.indices.delete(index=new_index, ignore_unavailable=True)
+        return successCount, errors
+
+    # Find whatever currently serves the alias so we can retire it after the swap.
+    old_indices = []
+    if es_client.indices.exists_alias(name=INDEX_NAME):
+        old_indices = list(es_client.indices.get_alias(name=INDEX_NAME).keys())
+    elif es_client.indices.exists(index=INDEX_NAME):
+        # Legacy concrete index occupying the alias name (pre-alias deploys).
+        # It must go before an alias of the same name can exist — one-time,
+        # brief gap on the first reindex after this change ships.
+        es_client.indices.delete(index=INDEX_NAME)
+
+    # Atomic alias swap: add new + remove old in a single cluster action.
+    actions = [{"add": {"index": new_index, "alias": INDEX_NAME}}]
+    for old in old_indices:
+        actions.append({"remove": {"index": old, "alias": INDEX_NAME}})
+    es_client.indices.update_aliases(actions=actions)
+
+    for old in old_indices:
+        es_client.indices.delete(index=old, ignore_unavailable=True)
+
     return successCount, errors
 
+
+def incremental_update_index(documents):
+    """Reindex by diffing against the live index instead of rebuilding it.
+
+    Docs are matched by _id and compared by content hash; changes apply in
+    place, with no alias swap:
+      - new / changed docs are re-indexed (and, for the semantic field,
+        re-embedded — the only OpenAI calls made)
+      - docs no longer in the source are deleted; unchanged ones are untouched
+    An empty `documents` deletes nothing and is reported in "errors", as the
+    full rebuild likewise refuses to swap an empty build over a working index.
+    A mapping/analysis change still needs a full rebuild (use ?rebuild=true).
+    Returns planned "indexed"/"deleted"/"unchanged" counts plus the
+    "success_count"/"errors" reported by helpers.bulk."""
+    _prepare_documents(documents)
+    incoming = {doc["_id"]: doc for doc in documents}
+    if not incoming:
+        # Otherwise every indexed doc would count as "removed from the source"
+        # and a single empty source read would wipe the live index.
+        refusal = "no source documents; refusing to empty the live index"
+        return dict(indexed=0, deleted=0, unchanged=0, success_count=0, errors=[refusal])
+
+    # Pull just {_id: contentHash} for every indexed doc — no _source bodies,
+    # so this stays cheap even for the full corpus.
+    existing_hashes = {}
+    for hit in helpers.scan(
+        es_client,
+        index=INDEX_NAME,
+        query={"_source": [CONTENT_HASH_FIELD]},
+        size=2000,
+    ):
+        existing_hashes[hit["_id"]] = hit["_source"].get(CONTENT_HASH_FIELD)
+
+    to_index = [
+        doc for doc_id, doc in incoming.items()
+        if existing_hashes.get(doc_id) != doc[CONTENT_HASH_FIELD]
+    ]
+    to_delete = [doc_id for doc_id in existing_hashes if doc_id not in incoming]
+    actions = list(to_index) + [
+        {"_op_type": "delete", "_id": doc_id} for doc_id in to_delete
+    ]
+    success_count, errors = 0, []
+    if actions:
+        success_count, errors = helpers.bulk(
+            es_client,
+            actions,
+            index=INDEX_NAME,
+            request_timeout=300 if SEMANTIC_ENABLED else 60,
+            raise_on_error=False,
+            raise_on_exception=False,
+        )
+    return {
+        "indexed": len(to_index),
+        "deleted": len(to_delete),
+        "unchanged": len(incoming) - len(to_index),
+        "success_count": success_count,
+        "errors": errors,
+    }
+
+
 def get_suggest_query(suggest_field):
     return {
         "field": suggest_field,
@@ -357,21 +498,36 @@ def index():
         englishHadith["arabicGrade"] = matchingArabic["grade"]
         englishHadith["hadithNumber"] = matchingArabic["hadithNumber"]
         
-    indexingSuccessCount, indexingErrors = create_and_update_index(
-        INDEX_NAME, englishHadiths + arabicOnlyHadiths, ["urn", "matchingArabicURN", "lang"]
-    )
-
     connection.close()
-    return {
-        "all_hadith_index_results": {
-            "success_count": indexingSuccessCount,
-            "failed": json.dumps(indexingErrors),
-        },
-       "arabic_only": {
-            "count": len(arabicOnlyHadiths),
-        },
-        "timeInSeconds": time.time() - start
-    }
+    documents = englishHadiths + arabicOnlyHadiths
+
+    # Full rebuild when explicitly asked (?rebuild=true — needed after a
+    # mapping/analysis change) or when there's no current-format index to diff
+    # against. Otherwise diff against the live index and touch only what changed.
+    rebuild = request.args.get("rebuild", "").lower() in ("1", "true", "yes")
+    if rebuild or not _index_supports_incremental():
+        successCount, errors = create_and_update_index(
+            documents, ["urn", "matchingArabicURN", "lang"]
+        )
+        result = {
+            "mode": "rebuild",
+            "success_count": successCount,
+            "failed": json.dumps(errors),
+        }
+    else:
+        stats = incremental_update_index(documents)
+        result = {
+            "mode": "incremental",
+            "indexed": stats["indexed"],
+            "deleted": stats["deleted"],
+            "unchanged": stats["unchanged"],
+            "success_count": stats["success_count"],
+            "failed": json.dumps(stats["errors"]),
+        }
+
+    result["arabic_only"] = {"count": len(arabicOnlyHadiths)}
+    result["timeInSeconds"] = time.time() - start
+    return result
 
 
 @app.route("/index/status", methods=["GET"])

From 37a17b5395a6203dd8bbbba0fd78534a7cce122d Mon Sep 17 00:00:00 2001
From: yug <>
Date: Fri, 15 May 2026 10:23:38 -0400
Subject: [PATCH 5/5] cleanup

---
 main.py | 90 +++++++++++++++++++++++++--------------------------------
 1 file changed, 40 insertions(+), 50 deletions(-)

diff --git a/main.py b/main.py
index 4a45b2d..16164d3 100644
--- a/main.py
+++ b/main.py
@@ -66,14 +66,23 @@ def _emit_access_log(response):
     request_timeout=10,
 )
 
+def _is_truthy(value):
+    """Parse a flag (env var / query param; None ok): 1/true/yes, any case or padding."""
+    return (value or "").strip().lower() in ("1", "true", "yes")
+
+
 INDEX_NAME = "english"
 
-SEMANTIC_ENABLED = os.environ.get("SEMANTIC_SEARCH_ENABLED", "").lower() in ("1", "true", "yes")
+SEMANTIC_ENABLED = _is_truthy(os.environ.get("SEMANTIC_SEARCH_ENABLED"))
 SEMANTIC_FIELD = "hadithTextSemantic"
-# Per-doc content hash stored alongside each document. An incremental reindex
-# diffs incoming docs against this to skip re-embedding unchanged hadiths.
+# Per-doc content hash; an incremental reindex diffs against it. See _content_hash.
 CONTENT_HASH_FIELD = "contentHash"
 INFERENCE_ENDPOINT = "openai-text-embedding"
+# helpers.bulk timeout. With semantic_text in the mapping each doc triggers an
+# OpenAI embedding call during indexing, so the request needs far longer than a
+# plain bulk. Indexing stays single-stream on purpose: OpenAI's tokens-per-minute
+# quota is the ceiling, so fanning out across threads just trips 429s.
+BULK_REQUEST_TIMEOUT = 300 if SEMANTIC_ENABLED else 60
 # RRF constants. k=60 is the value from the original Cormack et al. paper and
 # the ES default. RRF_WINDOW is the depth fetched from each retriever before
 # fusion — bigger window = better recall at the tail, more cost per query.
@@ -179,7 +188,20 @@ def _prepare_documents(documents):
     for doc in documents:
         doc["_id"] = f"{doc['lang']}:{doc['urn']}"
         doc[CONTENT_HASH_FIELD] = _content_hash(doc)
-    return documents
+
+
+def _bulk_index(actions, index):
+    """Bulk-send `actions` to `index` and return what helpers.bulk returns.
+
+    Applies BULK_REQUEST_TIMEOUT and turns off both raise_on_* flags, so
+    failures are handed back to the caller instead of raised mid-run."""
+    bulk_options = {
+        "index": index,
+        "request_timeout": BULK_REQUEST_TIMEOUT,
+        "raise_on_error": False,
+        "raise_on_exception": False,
+    }
+    return helpers.bulk(es_client, actions, **bulk_options)
 
 
 def _index_supports_incremental():
@@ -191,10 +213,12 @@ def _index_supports_incremental():
         mapping = es_client.indices.get_mapping(index=INDEX_NAME)
     except NotFoundError:
         return False
+    if not mapping:
+        return False
     return all(
         CONTENT_HASH_FIELD in index_def.get("mappings", {}).get("properties", {})
         for index_def in mapping.values()
-    ) and bool(mapping)
+    )
 
 
 def create_and_update_index(documents, fields_to_not_index):
@@ -298,25 +322,13 @@ def create_and_update_index(documents, fields_to_not_index):
     es_client.indices.create(index=new_index, mappings=mappings, settings=settings)
 
     _prepare_documents(documents)
-
-    # When semantic_text is in the mapping each doc triggers an embedding API
-    # call during indexing, so the bulk request needs a longer timeout. Indexing
-    # stays single-stream: OpenAI's tokens-per-minute quota is the ceiling for
-    # the embedding calls, so fanning out just trips 429s without going faster.
-    bulk_timeout = 300 if SEMANTIC_ENABLED else 60
-    successCount, errors = helpers.bulk(
-        es_client,
-        documents,
-        index=new_index,
-        request_timeout=bulk_timeout,
-        raise_on_error=False,
-        raise_on_exception=False,
-    )
+    successCount, errors = _bulk_index(documents, new_index)
+    result = {"mode": "rebuild", "success_count": successCount, "errors": errors}
 
     # Don't swap an empty/failed build over a working index.
     if successCount == 0:
         es_client.indices.delete(index=new_index, ignore_unavailable=True)
-        return successCount, errors
+        return result
 
     # Find whatever currently serves the alias so we can retire it after the swap.
     old_indices = []
@@ -337,7 +349,7 @@ def create_and_update_index(documents, fields_to_not_index):
     for old in old_indices:
         es_client.indices.delete(index=old, ignore_unavailable=True)
 
-    return successCount, errors
+    return result
 
 
 def incremental_update_index(documents):
@@ -376,21 +388,14 @@ def incremental_update_index(documents):
     ]
     to_delete = [doc_id for doc_id in existing_hashes if doc_id not in incoming]
 
-    actions = list(to_index) + [
+    actions = to_index + [
         {"_op_type": "delete", "_id": doc_id} for doc_id in to_delete
     ]
     success_count, errors = 0, []
     if actions:
-        bulk_timeout = 300 if SEMANTIC_ENABLED else 60
-        success_count, errors = helpers.bulk(
-            es_client,
-            actions,
-            index=INDEX_NAME,
-            request_timeout=bulk_timeout,
-            raise_on_error=False,
-            raise_on_exception=False,
-        )
+        success_count, errors = _bulk_index(actions, INDEX_NAME)
     return {
+        "mode": "incremental",
         "indexed": len(to_index),
         "deleted": len(to_delete),
         "unchanged": len(incoming) - len(to_index),
@@ -504,27 +509,12 @@ def index():
     # Full rebuild when explicitly asked (?rebuild=true — needed after a
     # mapping/analysis change) or when there's no current-format index to diff
     # against. Otherwise diff against the live index and touch only what changed.
-    rebuild = request.args.get("rebuild", "").lower() in ("1", "true", "yes")
-    if rebuild or not _index_supports_incremental():
-        successCount, errors = create_and_update_index(
-            documents, ["urn", "matchingArabicURN", "lang"]
-        )
-        result = {
-            "mode": "rebuild",
-            "success_count": successCount,
-            "failed": json.dumps(errors),
-        }
+    if _is_truthy(request.args.get("rebuild")) or not _index_supports_incremental():
+        result = create_and_update_index(documents, ["urn", "matchingArabicURN", "lang"])
     else:
-        stats = incremental_update_index(documents)
-        result = {
-            "mode": "incremental",
-            "indexed": stats["indexed"],
-            "deleted": stats["deleted"],
-            "unchanged": stats["unchanged"],
-            "success_count": stats["success_count"],
-            "failed": json.dumps(stats["errors"]),
-        }
+        result = incremental_update_index(documents)
 
+    result["failed"] = json.dumps(result.pop("errors"))
     result["arabic_only"] = {"count": len(arabicOnlyHadiths)}
     result["timeInSeconds"] = time.time() - start
     return result