From 6158c9abb780923f0840f2e88867b8af91c4dc85 Mon Sep 17 00:00:00 2001
From: Bo Lopker <lopkerk@gmail.com>
Date: Mon, 18 May 2026 15:11:56 -0700
Subject: [PATCH] Use unicase for full Unicode case folding of custom words
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

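Replace per-lookup to_lowercase() with unicase::eq / UniCase<String>
for the custom word allowlist, flag words, and TextDictionary. This
collapses pairs that Rust's default case mapping leaves distinct
(ß <-> SS, Greek Σ/σ/ς), and lets stored words keep their original
casing instead of being silently lowercased on load or insert.

Dictionary IDs are still normalized, but now with to_ascii_lowercase()
since they are language codes. Note that TextDictionary::word_set()
now returns &HashSet<UniCase<String>> instead of &HashSet<String>.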
---
 Cargo.lock                                    |  8 ++++
 Cargo.toml                                    |  1 +
 crates/codebook-config/Cargo.toml             |  1 +
 crates/codebook-config/src/settings.rs        | 37 ++++++++++---------
 crates/codebook/Cargo.toml                    |  1 +
 .../codebook/src/dictionaries/dictionary.rs   | 35 +++++++++++++++---
 6 files changed, 60 insertions(+), 23 deletions(-)
diff --git a/Cargo.lock b/Cargo.lock
index 7c4ce4c..26a2fd8 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -436,6 +436,7 @@ dependencies = [
  "tree-sitter-vhdl",
  "tree-sitter-yaml",
  "tree-sitter-zig",
+ "unicase",
  "unicode-script",
  "unicode-segmentation",
 ]
@@ -494,6 +495,7 @@ dependencies = [
  "serde",
  "tempfile",
  "toml",
+ "unicase",
 ]
 
 [[package]]
@@ -3305,6 +3307,12 @@ version = "1.20.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "40ce102ab67701b8526c123c1bab5cbe42d7040ccfd0f64af1a385808d2f43de"
 
+[[package]]
+name = "unicase"
+version = "2.9.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dbc4bc3a9f746d862c45cb89d705aa10f187bb96c76001afab07a0d35ce60142"
+
 [[package]]
 name = "unicode-ident"
 version = "1.0.24"
diff --git a/Cargo.toml b/Cargo.toml
index 3b9df08..17d4d6d 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -89,6 +89,7 @@ tree-sitter-yaml = "0.7.2"
 tree-sitter-zig = "<2"
 codebook-tree-sitter-latex = "<0.7.0"
 codebook-tree-sitter-typst = "<0.13.0"
+unicase = "2.9"
 unicode-script = "0.5.8"
 unicode-segmentation = "1.12.0"
 url = "2.4.0"
diff --git a/crates/codebook-config/Cargo.toml b/crates/codebook-config/Cargo.toml
index eca3418..1843762 100644
--- a/crates/codebook-config/Cargo.toml
+++ b/crates/codebook-config/Cargo.toml
@@ -19,6 +19,7 @@ log.workspace = true
 regex.workspace = true
 serde.workspace = true
 toml.workspace = true
+unicase.workspace = true
 
 
 [dev-dependencies]
diff --git a/crates/codebook-config/src/settings.rs b/crates/codebook-config/src/settings.rs
index fea468d..38d3953 100644
--- a/crates/codebook-config/src/settings.rs
+++ b/crates/codebook-config/src/settings.rs
@@ -88,9 +88,6 @@ impl<'de> Deserialize<'de> for ConfigSettings {
     where
         D: serde::Deserializer<'de>,
     {
-        fn to_lowercase_vec(v: Vec<String>) -> Vec<String> {
-            v.into_iter().map(|s| s.to_lowercase()).collect()
-        }
         #[derive(Deserialize)]
         struct Helper {
             #[serde(default)]
@@ -115,11 +112,18 @@ impl<'de> Deserialize<'de> for ConfigSettings {
             exclude_tags: Vec<String>,
         }
 
+        // Dictionary IDs are language codes (e.g. "en_US") — normalize to lowercase
+        // so lookups are case-insensitive. Word lists keep their original casing and
+        // are compared via unicase::eq.
         let helper = Helper::deserialize(deserializer)?;
         Ok(ConfigSettings {
-            dictionaries: to_lowercase_vec(helper.dictionaries),
-            words: to_lowercase_vec(helper.words),
-            flag_words: to_lowercase_vec(helper.flag_words),
+            dictionaries: helper
+                .dictionaries
+                .into_iter()
+                .map(|s| s.to_ascii_lowercase())
+                .collect(),
+            words: helper.words,
+            flag_words: helper.flag_words,
             include_paths: helper.include_paths,
             ignore_paths: helper.ignore_paths,
             ignore_patterns: helper.ignore_patterns,
@@ -199,14 +203,13 @@ impl ConfigSettings {
     }
 
     /// Insert a word into the allowlist, returning true when it was newly added.
+    /// Existing entries differing only in case are treated as duplicates.
     pub fn insert_word(&mut self, word: &str) -> bool {
-        let word = word.to_lowercase();
-        if self.words.contains(&word) {
+        if self.words.iter().any(|w| unicase::eq(w.as_str(), word)) {
             return false;
         }
-        self.words.push(word);
+        self.words.push(word.to_string());
         self.words.sort();
-        self.words.dedup();
         true
     }
 
@@ -260,14 +263,12 @@ impl ConfigSettings {
 
     /// Check if a word is explicitly allowed.
     pub fn is_allowed_word(&self, word: &str) -> bool {
-        let word = word.to_lowercase();
-        self.words.iter().any(|w| w == &word)
+        self.words.iter().any(|w| unicase::eq(w.as_str(), word))
     }
 
     /// Check if a word should be flagged.
     pub fn should_flag_word(&self, word: &str) -> bool {
-        let word = word.to_lowercase();
-        self.flag_words.iter().any(|w| w == &word)
+        self.flag_words.iter().any(|w| unicase::eq(w.as_str(), word))
     }
 
     /// Retrieve the configured minimum word length.
@@ -322,8 +323,8 @@ mod tests {
         let config: ConfigSettings = toml::from_str(toml_str).unwrap();
 
         assert_eq!(config.dictionaries, vec!["en_us", "en_gb"]);
-        assert_eq!(config.words, vec!["codebook", "rust", "апгрейдить"]);
-        assert_eq!(config.flag_words, vec!["todo", "fixme", "ошибка"]);
+        assert_eq!(config.words, vec!["CodeBook", "Rust", "Апгрейдить"]);
+        assert_eq!(config.flag_words, vec!["TODO", "FIXME", "Ошибка"]);
         assert_eq!(config.include_paths, vec!["src/**/*.rs", "lib/"]);
         assert_eq!(config.ignore_paths, vec!["**/*.md", "target/"]);
 
@@ -515,7 +516,7 @@ mod tests {
 
         assert!(config.insert_word("Апгрейдить"));
         assert!(!config.insert_word("апгрейдить"));
-        assert_eq!(config.words, vec!["апгрейдить"]);
+        assert_eq!(config.words, vec!["Апгрейдить"]);
         assert!(config.is_allowed_word("АПГРЕЙДИТЬ"));
         assert!(config.is_allowed_word("апгрейдить"));
 
@@ -665,7 +666,7 @@ mod tests {
         let config: ConfigSettings = toml::from_str(toml_str).unwrap();
 
         assert_eq!(config.dictionaries, vec!["en_us"]);
-        assert_eq!(config.words, vec!["codebook"]);
+        assert_eq!(config.words, vec!["CodeBook"]);
         assert_eq!(config.flag_words, Vec::<String>::new());
         assert_eq!(config.ignore_paths, Vec::<String>::new());
         assert_eq!(config.ignore_patterns, Vec::<String>::new());
diff --git a/crates/codebook/Cargo.toml b/crates/codebook/Cargo.toml
index bc0fefb..dc47a39 100644
--- a/crates/codebook/Cargo.toml
+++ b/crates/codebook/Cargo.toml
@@ -60,6 +60,7 @@ tree-sitter-yaml.workspace = true
 tree-sitter-zig.workspace = true
 tree-sitter-c-sharp.workspace = true
 tree-sitter.workspace = true
+unicase.workspace = true
 unicode-script.workspace = true
 unicode-segmentation.workspace = true
 
diff --git a/crates/codebook/src/dictionaries/dictionary.rs b/crates/codebook/src/dictionaries/dictionary.rs
index 1f31b9d..d603e89 100644
--- a/crates/codebook/src/dictionaries/dictionary.rs
+++ b/crates/codebook/src/dictionaries/dictionary.rs
@@ -1,4 +1,5 @@
 use lru::LruCache;
+use unicase::UniCase;
 
 use std::{
     collections::HashSet,
@@ -132,13 +133,12 @@ impl Dictionary for HunspellDictionary {
 
 #[derive(Debug)]
 pub struct TextDictionary {
-    words: HashSet<String>,
+    words: HashSet<UniCase<String>>,
 }
 
 impl Dictionary for TextDictionary {
     fn check(&self, word: &str) -> bool {
-        let lower = word.to_lowercase();
-        self.words.contains(&lower)
+        self.words.contains(&UniCase::new(word.to_string()))
     }
     fn suggest(&self, _word: &str) -> Vec<String> {
         vec![]
@@ -150,7 +150,7 @@ impl TextDictionary {
         let words = word_list
             .lines()
             .filter(|s| !s.is_empty() && !s.starts_with('#'))
-            .map(|s| s.to_lowercase())
+            .map(|s| UniCase::new(s.to_string()))
             .collect();
         Self { words }
     }
@@ -161,7 +161,7 @@ impl TextDictionary {
     }
 
     /// Get a reference to the internal HashSet for batch operations
-    pub fn word_set(&self) -> &HashSet<String> {
+    pub fn word_set(&self) -> &HashSet<UniCase<String>> {
         &self.words
     }
 }
@@ -200,4 +200,29 @@ mod dictionary_tests {
         assert!(dict.check("ии"));
         assert!(dict.check("ИИ"));
     }
+
+    #[test]
+    fn test_text_dictionary_preserves_original_case() {
+        let dict = TextDictionary::new("Straße\n");
+
+        assert!(dict.check("straße"));
+        assert!(dict.check("STRAßE"));
+        // Inspect the raw stored string: a UniCase `contains` would match even if lowercased.
+        assert!(dict.word_set().iter().any(|w| w.as_str() == "Straße"));
+    }
+
+    #[test]
+    fn test_text_dictionary_full_case_folding() {
+        // unicase handles cases that to_lowercase() cannot collapse.
+        let dict = TextDictionary::new("Straße\nΟδός\n");
+
+        // ß <-> SS: full case folding
+        assert!(dict.check("STRASSE"));
+        assert!(dict.check("strasse"));
+        assert!(dict.check("Straße"));
+
+        // Greek sigma: final ς and medial σ both fold together with Σ
+        assert!(dict.check("ΟΔΌΣ"));
+        assert!(dict.check("οδόσ"));
+    }
 }