microsoft · gvanrossum · May 11, 2026 · Apr 9, 2026 · Apr 9, 2026 · Apr 11, 2026
diff --git a/AGENTS.md b/AGENTS.md
@@ -4,35 +4,18 @@
 
 Never run git commands that make any changes. (`git status` and `git diff` are fine)
 Exceptions: `git push`, `git worktree`, `git branch` (for tracking setup), as instructed below.
+Exceptions: `git push`, `git worktree`, `git branch` (for tracking setup), as instructed below.
 
 **NEVER COMMIT CODE.** Do not run `git commit` or any other git commands
 that make changes to the repository. Exception: Worktrees/Branches below.
 `git add` is fine.
 
 When moving, copying or deleting files, use the git commands: `git mv`, `git cp`, `git rm`
 
-## Worktrees and Branches
-
-- Each session uses its own worktree with a feature branch
-- Create worktrees with: `git worktree add ../<repo>-<branch-name> -b <branch-name>`
-- Push the branch to the `me` remote: `git push me <branch-name>`
-- Set upstream to `me/<branch-name>`: `git branch --set-upstream-to me/<branch-name>`
-- **Never** upstream to `me/main` — that must stay identical to `origin/main`
-- The worktree directory name should be `<repo>-<branch-name>` (sibling of the main checkout)
-- **Work in the worktree directory**, not the main checkout — edit files there, run tests there
-- VS Code may show buffers from the main checkout; ignore those when working in a worktree.
-  When in doubt, verify edits landed on disk with `cat` or `grep` in the terminal.
-
-## Debugging discipline
-
-- When a bug seems impossible, suspect stale files or wrong working directory — not exotic causes.
-- If you're tempted to blame installed package versions, `__pycache__`, or similar,
-  **stop and ask the user** before investigating further. You're probably on the wrong track.
-
-**Whenever the user tells you how to do something, states a preference, or corrects you,
-extract a general rule and add it to AGENTS.md** (unless it's already covered -- maybe
-reformulate since it apparently didn't work). This applies even without being asked.
-In all cases show what you added to AGENTS.md.
+When I ask you to update AGENTS.md (even if I only say "maybe"), extract a general rule
+from what I said before and add it to AGENTS.md (unless it's already in there -- then
+consider reformulating it, since it apparently didn't work). Also, when it looks like I
+am stating a general rule, add it to AGENTS.md. In all cases show what you added to AGENTS.md.
 
 - Don't use '!' on the command line, it's some bash magic (even inside single quotes)
 - When running 'make' commands, do not use the venv (the Makefile uses 'uv run')

diff --git a/pyproject.toml b/pyproject.toml
@@ -68,6 +68,7 @@ asyncio_default_fixture_loop_scope = "function"
 testpaths = ["tests"]
 
 [tool.pyright]
+extraPaths = ["src", "tools"]
 reportUnusedVariable = true
 reportUnusedImport = true
 reportDuplicateImport = true

diff --git a/src/typeagent/aitools/vectorbase.py b/src/typeagent/aitools/vectorbase.py
@@ -13,15 +13,52 @@
 )
 from .model_adapters import create_embedding_model
 
+DEFAULT_MIN_SCORE = 0.85
+
+# Empirical defaults for built-in OpenAI embedding models.
+# These values come from the Adrian Tchaikovsky Episode 53
+# search benchmark in `tools/repeat_embedding_benchmarks.py`, using an
+# exhaustive 0.00..1.00 min_score sweep. The benchmark recomputes corpus and
+# query embeddings for each model and ignores the fixture's serialized
+# embedding sidecar. Scores are normalized from cosine similarity to the public
+# 0..1 min_score scale.
+# These are repository defaults for known models, not universal truths.
+# Unknown models keep the long-standing fallback score of 0.85. Callers can
+# always override `min_score` explicitly for their own use cases or models. We
+# intentionally leave `max_matches` out of this table: the benchmark still
+# reports a best `max_hits` row, but the library default remains `None` unless
+# a caller opts into a specific limit.
+MODEL_DEFAULT_MIN_SCORES: dict[str, float] = {
+    "text-embedding-3-large": 0.74,
+    "text-embedding-3-small": 0.73,
+    "text-embedding-ada-002": 0.93,
+}
+
+
+def get_default_min_score(model_name: str) -> float:
+    """Return the repository default score cutoff for a known model name."""
+
+    return MODEL_DEFAULT_MIN_SCORES.get(model_name, DEFAULT_MIN_SCORE)
+
+
+def cosine_to_score(cosine_similarity: np.ndarray) -> np.ndarray:
+    """Map cosine similarity from -1..1 to the public 0..1 score scale."""
+
+    return np.clip((cosine_similarity + 1.0) / 2.0, 0.0, 1.0)
+
 
 @dataclass
 class ScoredInt:
+    """Associate an integer ordinal with its similarity score."""
+
     item: int
     score: float
 
 
 @dataclass
 class TextEmbeddingIndexSettings:
+    """Runtime settings for embedding-backed fuzzy lookup."""
+
     embedding_model: IEmbeddingModel
     min_score: float  # Between 0.0 and 1.0
     max_matches: int | None  # >= 1; None means no limit
@@ -34,10 +71,12 @@ def __init__(
         max_matches: int | None = None,
         batch_size: int | None = None,
     ):
-        self.min_score = min_score if min_score is not None else 0.85
+        self.embedding_model = embedding_model or create_embedding_model()
+        model_name = getattr(self.embedding_model, "model_name", "")
+        default_min_score = get_default_min_score(model_name)
+        self.min_score = min_score if min_score is not None else default_min_score
         self.max_matches = max_matches if max_matches and max_matches >= 1 else None
         self.batch_size = batch_size if batch_size and batch_size >= 1 else 8
-        self.embedding_model = embedding_model or create_embedding_model()
 
 
 class VectorBase:
@@ -76,20 +115,19 @@ def __bool__(self) -> bool:
     def add_embedding(
         self, key: str | None, embedding: NormalizedEmbedding | list[float]
     ) -> None:
-        if isinstance(embedding, list):
-            embedding = np.array(embedding, dtype=np.float32)
+        embedding_array = np.asarray(embedding, dtype=np.float32)
         if self._embedding_size == 0:
-            self._set_embedding_size(len(embedding))
+            self._set_embedding_size(len(embedding_array))
             self._vectors.shape = (0, self._embedding_size)
-        if len(embedding) != self._embedding_size:
+        if len(embedding_array) != self._embedding_size:
             raise ValueError(
                 f"Embedding size mismatch: expected {self._embedding_size}, "
-                f"got {len(embedding)}"
+                f"got {len(embedding_array)}"
             )
-        embeddings = embedding.reshape(1, -1)  # Make it 2D: 1xN
+        embeddings = embedding_array.reshape(1, -1)  # Make it 2D: 1xN
         self._vectors = np.append(self._vectors, embeddings, axis=0)
         if key is not None:
-            self._model.add_embedding(key, embedding)
+            self._model.add_embedding(key, embedding_array)
 
     def add_embeddings(
         self, keys: None | list[str], embeddings: NormalizedEmbeddings
@@ -135,7 +173,7 @@ def fuzzy_lookup_embedding(
             min_score = 0.0
         if len(self._vectors) == 0:
             return []
-        scores = np.dot(self._vectors, embedding)
+        scores = cosine_to_score(np.dot(self._vectors, embedding))
         if predicate is None:
             # Stay in numpy: filter by score, then top-k via argpartition.
             indices = np.flatnonzero(scores >= min_score)
@@ -177,7 +215,7 @@ def fuzzy_lookup_embedding_in_subset(
             return []
         # Compute dot products only for the subset instead of all vectors.
         subset = np.asarray(ordinals_of_subset)
-        scores = np.dot(self._vectors[subset], embedding)
+        scores = cosine_to_score(np.dot(self._vectors[subset], embedding))
         indices = np.flatnonzero(scores >= min_score)
         if len(indices) == 0:
             return []
@@ -238,7 +276,7 @@ def deserialize(self, data: NormalizedEmbeddings | None) -> None:
             return
         if self._embedding_size == 0:
             if data.ndim < 2 or data.shape[0] == 0:
-                # Empty data — can't determine size; just clear.
+                # Empty data: can't determine the embedding size; just clear.
                 self.clear()
                 return
             self._set_embedding_size(data.shape[1])

diff --git a/src/typeagent/knowpro/convsettings.py b/src/typeagent/knowpro/convsettings.py
@@ -12,6 +12,9 @@
 from ..aitools.vectorbase import TextEmbeddingIndexSettings
 from .interfaces import IKnowledgeExtractor, IStorageProvider
 
+DEFAULT_RELATED_TERM_MIN_SCORE = 0.85
+DEFAULT_MESSAGE_TEXT_MIN_SCORE = 0.7
+
 
 @dataclass
 class MessageTextIndexSettings:
@@ -54,13 +57,13 @@ def __init__(
         # All settings share the same model, so they share the embedding cache.
         model = model or create_embedding_model(retrier=embed_retrier)
         self.embedding_model = model
-        min_score = 0.85
+        min_score = DEFAULT_RELATED_TERM_MIN_SCORE
         self.related_term_index_settings = RelatedTermIndexSettings(
             TextEmbeddingIndexSettings(model, min_score=min_score, max_matches=50)
         )
         self.thread_settings = TextEmbeddingIndexSettings(model, min_score=min_score)
         self.message_text_index_settings = MessageTextIndexSettings(
-            TextEmbeddingIndexSettings(model, min_score=0.7)
+            TextEmbeddingIndexSettings(model, min_score=DEFAULT_MESSAGE_TEXT_MIN_SCORE)
         )
         self.semantic_ref_index_settings = SemanticRefIndexSettings(
             concurrency=4,