From dda79af2a2f4f36455a841e239ab48f643c05bea Mon Sep 17 00:00:00 2001
From: Dennis <48984220+dejoma@users.noreply.github.com>
Date: Wed, 9 Jul 2025 16:26:03 +0100
Subject: [PATCH] improvement: remove redundant lowercase and punctuation checks

The _stem method performs unnecessary checks that are already handled upstream:
- Punctuation check: remove_non_alphanumeric() already removes all punctuation
- Lowercase conversion: SimpleTokenizer.tokenize() already converts to lowercase

These redundant operations can be safely removed without affecting functionality.
---
 fastembed/sparse/bm25.py | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/fastembed/sparse/bm25.py b/fastembed/sparse/bm25.py
index b6ac59fda..42d5f0787 100644
--- a/fastembed/sparse/bm25.py
+++ b/fastembed/sparse/bm25.py
@@ -237,12 +237,7 @@ def embed(
 
     def _stem(self, tokens: list[str]) -> list[str]:
         stemmed_tokens: list[str] = []
-        for token in tokens:
-            lower_token = token.lower()
-
-            if token in self.punctuation:
-                continue
-
+        for lower_token in tokens:
             if lower_token in self.stopwords:
                 continue
 