From 4540176ff896a2d637e8a6cb1c7f6c3a49c704bd Mon Sep 17 00:00:00 2001
From: Lasse Borgholt <lb@corti.ai>
Date: Sun, 29 Mar 2026 14:30:36 +0200
Subject: [PATCH 1/4] Added a translate call to remove non-spacing Unicode
 characters after lower-casing

---
 src/error_align/utils.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/error_align/utils.py b/src/error_align/utils.py
index 4917481..edeee6a 100644
--- a/src/error_align/utils.py
+++ b/src/error_align/utils.py
@@ -1,3 +1,4 @@
+import unicodedata
 from dataclasses import dataclass
 from enum import IntEnum
 from itertools import chain, combinations
@@ -5,6 +6,9 @@
 import regex as re
 from unidecode import unidecode
 
+# Build a translation table that maps all Mn (non-spacing mark) code points to None
+_MN_TABLE = str.maketrans({cp: None for cp in range(0x110000) if unicodedata.category(chr(cp)) == "Mn"})
+
 
 class OpType(IntEnum):
     MATCH = 0
@@ -152,7 +156,7 @@ def basic_normalizer(text: str) -> str:
         str: The normalized text.
 
     """
-    return text.lower()
+    return text.lower().translate(_MN_TABLE)
 
 
 def ensure_length_preservation(normalizer: callable) -> callable:

From 5a8fe716a437140375cb278cad4f4e7e46aadd93 Mon Sep 17 00:00:00 2001
From: Lasse Borgholt <lb@corti.ai>
Date: Mon, 30 Mar 2026 11:11:21 +0200
Subject: [PATCH 2/4] Added fallback to False for characters that transliterate
 to the empty string in the character-classification utility functions

---
 src/error_align/utils.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/src/error_align/utils.py b/src/error_align/utils.py
index edeee6a..12ed1a3 100644
--- a/src/error_align/utils.py
+++ b/src/error_align/utils.py
@@ -97,7 +97,10 @@ def is_vowel(c: str) -> bool:
 
     """
     assert len(c) == 1, "Input must be a single character."
-    return unidecode(c)[0] in "aeiouy"
+    decode_char = unidecode(c)
+    if len(decode_char) == 0:
+        return False
+    return decode_char[0] in "aeiouy"
 
 
 def is_consonant(c: str) -> bool:
@@ -111,7 +114,10 @@ def is_consonant(c: str) -> bool:
 
     """
     assert len(c) == 1, "Input must be a single character."
-    return unidecode(c)[0] in "bcdfghjklmnpqrstvwxyz"
+    decode_char = unidecode(c)
+    if len(decode_char) == 0:
+        return False
+    return decode_char[0] in "bcdfghjklmnpqrstvwxyz"
 
 
 def categorize_char(c: str) -> int:

From d572efaf2f8aa2b8596e620759aeb14a592e8aef Mon Sep 17 00:00:00 2001
From: Lasse Borgholt <lb@corti.ai>
Date: Mon, 30 Mar 2026 11:46:09 +0200
Subject: [PATCH 3/4] Simplified fix for length-expanding lower-casing of U+0130

---
 src/error_align/utils.py | 12 ++++++------
 tests/test_default.py    |  9 ++++++++-
 2 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/src/error_align/utils.py b/src/error_align/utils.py
index 12ed1a3..5b8a4ad 100644
--- a/src/error_align/utils.py
+++ b/src/error_align/utils.py
@@ -1,4 +1,3 @@
-import unicodedata
 from dataclasses import dataclass
 from enum import IntEnum
 from itertools import chain, combinations
@@ -6,9 +5,6 @@
 import regex as re
 from unidecode import unidecode
 
-# Build a translation table that maps all Mn (non-spacing mark) code points to None
-_MN_TABLE = str.maketrans({cp: None for cp in range(0x110000) if unicodedata.category(chr(cp)) == "Mn"})
-
 
 class OpType(IntEnum):
     MATCH = 0
@@ -153,7 +149,11 @@ def basic_tokenizer(text: str) -> list:
 
 
 def basic_normalizer(text: str) -> str:
-    """Default normalizer that only converts text to lowercase.
+    """Default normalizer that converts text to lowercase.
+
+    U+0130 (İ, Latin capital letter I with dot above) is replaced with a plain
+    'I' before lowercasing to prevent the length-expanding decomposition that
+    Python's str.lower() would otherwise produce ('i' + combining dot above).
 
     Args:
         text (str): The input text to normalize.
@@ -162,7 +162,7 @@ def basic_normalizer(text: str) -> str:
         str: The normalized text.
 
     """
-    return text.lower().translate(_MN_TABLE)
+    return text.replace("\u0130", "I").lower()
 
 
 def ensure_length_preservation(normalizer: callable) -> callable:
diff --git a/tests/test_default.py b/tests/test_default.py
index 19e8239..e246fc6 100644
--- a/tests/test_default.py
+++ b/tests/test_default.py
@@ -8,7 +8,7 @@
 from error_align.edit_distance import compute_error_align_distance_matrix, compute_levenshtein_distance_matrix
 from error_align.error_align import prepare_graph_metadata
 from error_align.graph_metadata import SubgraphMetadata
-from error_align.utils import Alignment, OpType, categorize_char, ensure_length_preservation
+from error_align.utils import Alignment, OpType, basic_normalizer, categorize_char, ensure_length_preservation
 
 
 def test_error_align() -> None:
@@ -232,3 +232,10 @@ def bad_normalizer(text: str) -> str:
         raise AssertionError("Expected ValueError for length mismatch.")
     except ValueError:
         pass
+
+
+def test_basic_normalizer_dotted_capital_i() -> None:
+    """Regression test: U+0130 (İ) must not expand length when lowercased."""
+    result = basic_normalizer("İstanbul")
+    assert result == "istanbul"
+    assert len(result) == len("İstanbul")

From 63b5d3d586d17347519dbd5a4c0d26d0d56fb8bb Mon Sep 17 00:00:00 2001
From: Lasse Borgholt <lb@corti.ai>
Date: Mon, 30 Mar 2026 11:57:38 +0200
Subject: [PATCH 4/4] Added tests for characters that unidecode to the empty string

---
 tests/test_default.py | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/tests/test_default.py b/tests/test_default.py
index e246fc6..d33952c 100644
--- a/tests/test_default.py
+++ b/tests/test_default.py
@@ -8,7 +8,15 @@
 from error_align.edit_distance import compute_error_align_distance_matrix, compute_levenshtein_distance_matrix
 from error_align.error_align import prepare_graph_metadata
 from error_align.graph_metadata import SubgraphMetadata
-from error_align.utils import Alignment, OpType, basic_normalizer, categorize_char, ensure_length_preservation
+from error_align.utils import (
+    Alignment,
+    OpType,
+    basic_normalizer,
+    categorize_char,
+    ensure_length_preservation,
+    is_consonant,
+    is_vowel,
+)
 
 
 def test_error_align() -> None:
@@ -234,6 +242,13 @@ def bad_normalizer(text: str) -> str:
         pass
 
 
+def test_is_vowel_and_is_consonant_with_empty_unidecode() -> None:
+    """Regression test: characters that unidecode to '' must return False instead of crashing."""
+    # U+0300 (combining grave accent) unidecodes to an empty string
+    assert is_vowel("\u0300") is False
+    assert is_consonant("\u0300") is False
+
+
 def test_basic_normalizer_dotted_capital_i() -> None:
     """Regression test: U+0130 (İ) must not expand length when lowercased."""
     result = basic_normalizer("İstanbul")