diff --git a/api/src/services/text_processing/normalizer.py b/api/src/services/text_processing/normalizer.py index 2163cbc0..f439dfaa 100644 --- a/api/src/services/text_processing/normalizer.py +++ b/api/src/services/text_processing/normalizer.py @@ -441,22 +441,6 @@ def normalize_text(text: str, normalization_options: NormalizationOptions) -> st text = text.replace('\n', ' ') text = text.replace('\r', ' ') - # Handle other problematic symbols - text = text.replace('~', '') # Remove tilde - text = text.replace('@', ' at ') # At symbol - text = text.replace('#', ' number ') # Hash/pound - text = text.replace('$', ' dollar ') # Dollar sign (if not handled by money pattern) - text = text.replace('%', ' percent ') # Percent sign - text = text.replace('^', '') # Caret - text = text.replace('&', ' and ') # Ampersand - text = text.replace('*', '') # Asterisk - text = text.replace('_', ' ') # Underscore to space - text = text.replace('|', ' ') # Pipe to space - text = text.replace('\\', ' ') # Backslash to space - text = text.replace('/', ' slash ') # Forward slash to space (unless in URLs) - text = text.replace('=', ' equals ') # Equals sign - text = text.replace('+', ' plus ') # Plus sign - # Handle titles and abbreviations text = re.sub(r"\bD[Rr]\.(?= [A-Z])", "Doctor", text) text = re.sub(r"\b(?:Mr\.|MR\.(?= [A-Z]))", "Mister", text) @@ -467,7 +451,7 @@ def normalize_text(text: str, normalization_options: NormalizationOptions) -> st # Handle common words text = re.sub(r"(?i)\b(y)eah?\b", r"\1e'a", text) - # Handle numbers and money + # Handle numbers and money BEFORE replacing special characters text = re.sub(r"(?<=\d),(?=\d)", "", text) text = MONEY_PATTERN.sub( @@ -479,6 +463,22 @@ def normalize_text(text: str, normalization_options: NormalizationOptions) -> st text = re.sub(r"\d*\.\d+", handle_decimal, text) + # Handle other problematic symbols AFTER money/number processing + text = text.replace('~', '') # Remove tilde + text = text.replace('@', ' at ') # At symbol 
+ text = text.replace('#', ' number ') # Hash/pound + text = text.replace('$', ' dollar ') # Dollar sign (if not handled by money pattern) + text = text.replace('%', ' percent ') # Percent sign + text = text.replace('^', '') # Caret + text = text.replace('&', ' and ') # Ampersand + text = text.replace('*', '') # Asterisk + text = text.replace('_', ' ') # Underscore to space + text = text.replace('|', ' ') # Pipe to space + text = text.replace('\\', ' ') # Backslash to space + text = text.replace('/', ' slash ') # Forward slash spoken as the word "slash" (unless in URLs) + text = text.replace('=', ' equals ') # Equals sign + text = text.replace('+', ' plus ') # Plus sign + # Handle various formatting text = re.sub(r"(?<=\d)-(?=\d)", " to ", text) text = re.sub(r"(?<=\d)S", " S", text)