From 4540176ff896a2d637e8a6cb1c7f6c3a49c704bd Mon Sep 17 00:00:00 2001 From: Lasse Borgholt Date: Sun, 29 Mar 2026 14:30:36 +0200 Subject: [PATCH 1/4] Added a translate call to remove non-spacing unicode characters after lower-casing --- src/error_align/utils.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/error_align/utils.py b/src/error_align/utils.py index 4917481..edeee6a 100644 --- a/src/error_align/utils.py +++ b/src/error_align/utils.py @@ -1,3 +1,4 @@ +import unicodedata from dataclasses import dataclass from enum import IntEnum from itertools import chain, combinations @@ -5,6 +6,9 @@ import regex as re from unidecode import unidecode +# Build a translation table that maps all Mn (non-spacing mark) code points to None +_MN_TABLE = str.maketrans({cp: None for cp in range(0x110000) if unicodedata.category(chr(cp)) == "Mn"}) + class OpType(IntEnum): MATCH = 0 @@ -152,7 +156,7 @@ def basic_normalizer(text: str) -> str: str: The normalized text. """ - return text.lower() + return text.lower().translate(_MN_TABLE) def ensure_length_preservation(normalizer: callable) -> callable: From 5a8fe716a437140375cb278cad4f4e7e46aadd93 Mon Sep 17 00:00:00 2001 From: Lasse Borgholt Date: Mon, 30 Mar 2026 11:11:21 +0200 Subject: [PATCH 2/4] Added fallback to False for characters that transliterate to the empty string in character classification utils functions --- src/error_align/utils.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/error_align/utils.py b/src/error_align/utils.py index edeee6a..12ed1a3 100644 --- a/src/error_align/utils.py +++ b/src/error_align/utils.py @@ -97,7 +97,10 @@ def is_vowel(c: str) -> bool: """ assert len(c) == 1, "Input must be a single character." - return unidecode(c)[0] in "aeiouy" + decode_char = unidecode(c) + if len(decode_char) == 0: + return False + return decode_char[0] in "aeiouy" def is_consonant(c: str) -> bool: @@ -111,7 +114,10 @@ def is_consonant(c: str) -> bool: """ assert len(c) == 1, "Input must be a single character." - return unidecode(c)[0] in "bcdfghjklmnpqrstvwxyz" + decode_char = unidecode(c) + if len(decode_char) == 0: + return False + return decode_char[0] in "bcdfghjklmnpqrstvwxyz" def categorize_char(c: str) -> int: From d572efaf2f8aa2b8596e620759aeb14a592e8aef Mon Sep 17 00:00:00 2001 From: Lasse Borgholt Date: Mon, 30 Mar 2026 11:46:09 +0200 Subject: [PATCH 3/4] Simplified fix for non-spacing lower-casing characters --- src/error_align/utils.py | 12 ++++++------ tests/test_default.py | 9 ++++++++- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/src/error_align/utils.py b/src/error_align/utils.py index 12ed1a3..5b8a4ad 100644 --- a/src/error_align/utils.py +++ b/src/error_align/utils.py @@ -1,4 +1,3 @@ -import unicodedata from dataclasses import dataclass from enum import IntEnum from itertools import chain, combinations @@ -6,9 +5,6 @@ import regex as re from unidecode import unidecode -# Build a translation table that maps all Mn (non-spacing mark) code points to None -_MN_TABLE = str.maketrans({cp: None for cp in range(0x110000) if unicodedata.category(chr(cp)) == "Mn"}) - class OpType(IntEnum): MATCH = 0 @@ -153,7 +149,11 @@ def basic_tokenizer(text: str) -> list: def basic_normalizer(text: str) -> str: - """Default normalizer that only converts text to lowercase. + """Default normalizer that converts text to lowercase. + + U+0130 (İ, Latin capital letter I with dot above) is replaced with a plain + 'I' before lowercasing to prevent the length-expanding decomposition that + Python's str.lower() would otherwise produce ('i' + combining dot above). Args: text (str): The input text to normalize. @@ -162,7 +162,7 @@ def basic_normalizer(text: str) -> str: str: The normalized text. """ - return text.lower().translate(_MN_TABLE) + return text.replace("\u0130", "I").lower() def ensure_length_preservation(normalizer: callable) -> callable: diff --git a/tests/test_default.py b/tests/test_default.py index 19e8239..e246fc6 100644 --- a/tests/test_default.py +++ b/tests/test_default.py @@ -8,7 +8,7 @@ from error_align.edit_distance import compute_error_align_distance_matrix, compute_levenshtein_distance_matrix from error_align.error_align import prepare_graph_metadata from error_align.graph_metadata import SubgraphMetadata -from error_align.utils import Alignment, OpType, categorize_char, ensure_length_preservation +from error_align.utils import Alignment, OpType, basic_normalizer, categorize_char, ensure_length_preservation def test_error_align() -> None: @@ -232,3 +232,10 @@ def bad_normalizer(text: str) -> str: raise AssertionError("Expected ValueError for length mismatch.") except ValueError: pass + + +def test_basic_normalizer_dotted_capital_i() -> None: + """Regression test: U+0130 (İ) must not expand length when lowercased.""" + result = basic_normalizer("İstanbul") + assert result == "istanbul" + assert len(result) == len("İstanbul") From 63b5d3d586d17347519dbd5a4c0d26d0d56fb8bb Mon Sep 17 00:00:00 2001 From: Lasse Borgholt Date: Mon, 30 Mar 2026 11:57:38 +0200 Subject: [PATCH 4/4] Added tests for empty unidecode chars --- tests/test_default.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/tests/test_default.py b/tests/test_default.py index e246fc6..d33952c 100644 --- a/tests/test_default.py +++ b/tests/test_default.py @@ -8,7 +8,15 @@ from error_align.edit_distance import compute_error_align_distance_matrix, compute_levenshtein_distance_matrix from error_align.error_align import prepare_graph_metadata from error_align.graph_metadata import SubgraphMetadata -from error_align.utils import Alignment, OpType, basic_normalizer, categorize_char, ensure_length_preservation +from error_align.utils import ( + Alignment, + OpType, + basic_normalizer, + categorize_char, + ensure_length_preservation, + is_consonant, + is_vowel, +) def test_error_align() -> None: @@ -234,6 +242,13 @@ def bad_normalizer(text: str) -> str: pass +def test_is_vowel_and_is_consonant_with_empty_unidecode() -> None: + """Regression test: characters that unidecode to '' must return False instead of crashing.""" + # U+0300 (combining grave accent) unidecodes to an empty string + assert is_vowel("\u0300") is False + assert is_consonant("\u0300") is False + + def test_basic_normalizer_dotted_capital_i() -> None: """Regression test: U+0130 (İ) must not expand length when lowercased.""" result = basic_normalizer("İstanbul")