blopker · blopker · May 18, 2026 · May 18, 2026
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -89,6 +89,7 @@ tree-sitter-yaml = "0.7.2"
 tree-sitter-zig = "<2"
 codebook-tree-sitter-latex = "<0.7.0"
 codebook-tree-sitter-typst = "<0.13.0"
+unicase = "2.9"
 unicode-script = "0.5.8"
 unicode-segmentation = "1.12.0"
 url = "2.4.0"

diff --git a/crates/codebook-config/Cargo.toml b/crates/codebook-config/Cargo.toml
@@ -19,6 +19,7 @@ log.workspace = true
 regex.workspace = true
 serde.workspace = true
 toml.workspace = true
+unicase.workspace = true
 
 
 [dev-dependencies]

diff --git a/crates/codebook-config/src/settings.rs b/crates/codebook-config/src/settings.rs
@@ -88,9 +88,6 @@ impl<'de> Deserialize<'de> for ConfigSettings {
     where
         D: serde::Deserializer<'de>,
     {
-        fn to_lowercase_vec(v: Vec<String>) -> Vec<String> {
-            v.into_iter().map(|s| s.to_lowercase()).collect()
-        }
         #[derive(Deserialize)]
         struct Helper {
             #[serde(default)]
@@ -115,11 +112,18 @@ impl<'de> Deserialize<'de> for ConfigSettings {
             exclude_tags: Vec<String>,
         }
 
+        // Dictionary IDs are language codes (e.g. "en_US"); normalize them to ASCII
+        // lowercase so lookups are case-insensitive. Word lists keep their original
+        // casing and are compared case-insensitively via unicase::eq.
         let helper = Helper::deserialize(deserializer)?;
         Ok(ConfigSettings {
-            dictionaries: to_lowercase_vec(helper.dictionaries),
-            words: to_lowercase_vec(helper.words),
-            flag_words: to_lowercase_vec(helper.flag_words),
+            dictionaries: helper
+                .dictionaries
+                .into_iter()
+                .map(|s| s.to_ascii_lowercase())
+                .collect(),
+            words: helper.words,
+            flag_words: helper.flag_words,
             include_paths: helper.include_paths,
             ignore_paths: helper.ignore_paths,
             ignore_patterns: helper.ignore_patterns,
@@ -199,14 +203,13 @@ impl ConfigSettings {
     }
 
     /// Insert a word into the allowlist, returning true when it was newly added.
+    /// Case-insensitive duplicates are rejected; a new word keeps its original casing.
     pub fn insert_word(&mut self, word: &str) -> bool {
-        let word = word.to_lowercase();
-        if self.words.contains(&word) {
+        if self.words.iter().any(|w| unicase::eq(w.as_str(), word)) {
             return false;
         }
-        self.words.push(word);
+        self.words.push(word.to_string());
         self.words.sort();
-        self.words.dedup();
         true
     }
 
@@ -260,14 +263,12 @@ impl ConfigSettings {
 
     /// Check if a word is explicitly allowed.
     pub fn is_allowed_word(&self, word: &str) -> bool {
-        let word = word.to_lowercase();
-        self.words.iter().any(|w| w == &word)
+        self.words.iter().any(|w| unicase::eq(w.as_str(), word))
     }
 
     /// Check if a word should be flagged.
     pub fn should_flag_word(&self, word: &str) -> bool {
-        let word = word.to_lowercase();
-        self.flag_words.iter().any(|w| w == &word)
+        self.flag_words.iter().any(|w| unicase::eq(w.as_str(), word))
     }
 
     /// Retrieve the configured minimum word length.
@@ -322,8 +323,8 @@ mod tests {
         let config: ConfigSettings = toml::from_str(toml_str).unwrap();
 
         assert_eq!(config.dictionaries, vec!["en_us", "en_gb"]);
-        assert_eq!(config.words, vec!["codebook", "rust", "апгрейдить"]);
-        assert_eq!(config.flag_words, vec!["todo", "fixme", "ошибка"]);
+        assert_eq!(config.words, vec!["CodeBook", "Rust", "Апгрейдить"]);
+        assert_eq!(config.flag_words, vec!["TODO", "FIXME", "Ошибка"]);
         assert_eq!(config.include_paths, vec!["src/**/*.rs", "lib/"]);
         assert_eq!(config.ignore_paths, vec!["**/*.md", "target/"]);
 
@@ -515,7 +516,7 @@ mod tests {
 
         assert!(config.insert_word("Апгрейдить"));
         assert!(!config.insert_word("апгрейдить"));
-        assert_eq!(config.words, vec!["апгрейдить"]);
+        assert_eq!(config.words, vec!["Апгрейдить"]);
         assert!(config.is_allowed_word("АПГРЕЙДИТЬ"));
         assert!(config.is_allowed_word("апгрейдить"));
 
@@ -665,7 +666,7 @@ mod tests {
         let config: ConfigSettings = toml::from_str(toml_str).unwrap();
 
         assert_eq!(config.dictionaries, vec!["en_us"]);
-        assert_eq!(config.words, vec!["codebook"]);
+        assert_eq!(config.words, vec!["CodeBook"]);
         assert_eq!(config.flag_words, Vec::<String>::new());
         assert_eq!(config.ignore_paths, Vec::<String>::new());
         assert_eq!(config.ignore_patterns, Vec::<String>::new());

diff --git a/crates/codebook/Cargo.toml b/crates/codebook/Cargo.toml
@@ -60,6 +60,7 @@ tree-sitter-yaml.workspace = true
 tree-sitter-zig.workspace = true
 tree-sitter-c-sharp.workspace = true
 tree-sitter.workspace = true
+unicase.workspace = true
 unicode-script.workspace = true
 unicode-segmentation.workspace = true
 

diff --git a/crates/codebook/src/dictionaries/dictionary.rs b/crates/codebook/src/dictionaries/dictionary.rs
@@ -1,4 +1,5 @@
 use lru::LruCache;
+use unicase::UniCase;
 
 use std::{
     collections::HashSet,
@@ -132,13 +133,12 @@ impl Dictionary for HunspellDictionary {
 
 #[derive(Debug)]
 pub struct TextDictionary {
-    words: HashSet<String>,
+    words: HashSet<UniCase<String>>,
 }
 
 impl Dictionary for TextDictionary {
     fn check(&self, word: &str) -> bool {
-        let lower = word.to_lowercase();
-        self.words.contains(&lower)
+        self.words.contains(&UniCase::new(word.to_string()))
     }
     fn suggest(&self, _word: &str) -> Vec<String> {
         vec![]
@@ -150,7 +150,7 @@ impl TextDictionary {
         let words = word_list
             .lines()
             .filter(|s| !s.is_empty() && !s.starts_with('#'))
-            .map(|s| s.to_lowercase())
+            .map(|s| UniCase::new(s.to_string()))
             .collect();
         Self { words }
     }
@@ -161,7 +161,7 @@ impl TextDictionary {
     }
 
     /// Get a reference to the internal HashSet for batch operations
-    pub fn word_set(&self) -> &HashSet<String> {
+    pub fn word_set(&self) -> &HashSet<UniCase<String>> {
         &self.words
     }
 }
@@ -200,4 +200,29 @@ mod dictionary_tests {
         assert!(dict.check("ии"));
         assert!(dict.check("ИИ"));
     }
+
+    #[test]
+    fn test_text_dictionary_preserves_original_case() {
+        let dict = TextDictionary::new("Straße\n");
+
+        assert!(dict.check("straße"));
+        assert!(dict.check("STRAßE"));
+        // UniCase equality ignores case, so compare the stored text byte-for-byte.
+        assert!(dict.word_set().iter().any(|w| w.as_ref() == "Straße"));
+    }
+
+    #[test]
+    fn test_text_dictionary_full_case_folding() {
+        // Case folding equates spellings that to_lowercase() keeps distinct (ß vs. ss).
+        let dict = TextDictionary::new("Straße\nΣίγμα\n");
+
+        // ß <-> SS: full case folding
+        assert!(dict.check("STRASSE"));
+        assert!(dict.check("strasse"));
+        assert!(dict.check("Straße"));
+
+        // Capital Σ and small σ fold together (final-form ς is not exercised here)
+        assert!(dict.check("ΣΊΓΜΑ"));
+        assert!(dict.check("σίγμα"));
+    }
 }