From dda79af2a2f4f36455a841e239ab48f643c05bea Mon Sep 17 00:00:00 2001
From: Dennis <48984220+dejoma@users.noreply.github.com>
Date: Wed, 9 Jul 2025 16:26:03 +0100
Subject: [PATCH] improvement: remove redundant lowercase and punctuation
 checks

The _stem method repeats work that is already done upstream:
- Punctuation check: remove_non_alphanumeric() already removes all punctuation
- Lowercase conversion: SimpleTokenizer.tokenize() already converts to lowercase

These redundant operations can be safely removed without affecting functionality.
---
 fastembed/sparse/bm25.py | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/fastembed/sparse/bm25.py b/fastembed/sparse/bm25.py
index b6ac59fda..42d5f0787 100644
--- a/fastembed/sparse/bm25.py
+++ b/fastembed/sparse/bm25.py
@@ -237,12 +237,7 @@ def embed(
 
     def _stem(self, tokens: list[str]) -> list[str]:
         stemmed_tokens: list[str] = []
-        for token in tokens:
-            lower_token = token.lower()
-
-            if token in self.punctuation:
-                continue
-
+        for lower_token in tokens:
             if lower_token in self.stopwords:
                 continue