From 6158c9abb780923f0840f2e88867b8af91c4dc85 Mon Sep 17 00:00:00 2001 From: Bo Lopker Date: Mon, 18 May 2026 15:11:56 -0700 Subject: [PATCH] Use unicase for full Unicode case folding of custom words MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace per-lookup to_lowercase() with unicase::eq / UniCase for the custom word allowlist, flag words, and TextDictionary. This collapses pairs that Rust's default case mapping leaves distinct (ß <-> SS, Greek Σ/σ/ς), and lets stored words keep their original casing instead of being silently lowercased on insert. --- Cargo.lock | 8 ++++ Cargo.toml | 1 + crates/codebook-config/Cargo.toml | 1 + crates/codebook-config/src/settings.rs | 37 ++++++++++--------- crates/codebook/Cargo.toml | 1 + .../codebook/src/dictionaries/dictionary.rs | 35 +++++++++++++++--- 6 files changed, 60 insertions(+), 23 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 7c4ce4c..26a2fd8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -436,6 +436,7 @@ dependencies = [ "tree-sitter-vhdl", "tree-sitter-yaml", "tree-sitter-zig", + "unicase", "unicode-script", "unicode-segmentation", ] @@ -494,6 +495,7 @@ dependencies = [ "serde", "tempfile", "toml", + "unicase", ] [[package]] @@ -3305,6 +3307,12 @@ version = "1.20.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "40ce102ab67701b8526c123c1bab5cbe42d7040ccfd0f64af1a385808d2f43de" +[[package]] +name = "unicase" +version = "2.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dbc4bc3a9f746d862c45cb89d705aa10f187bb96c76001afab07a0d35ce60142" + [[package]] name = "unicode-ident" version = "1.0.24" diff --git a/Cargo.toml b/Cargo.toml index 3b9df08..17d4d6d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -89,6 +89,7 @@ tree-sitter-yaml = "0.7.2" tree-sitter-zig = "<2" codebook-tree-sitter-latex = "<0.7.0" codebook-tree-sitter-typst = "<0.13.0" +unicase = "2.9" unicode-script = "0.5.8" unicode-segmentation = 
"1.12.0" url = "2.4.0" diff --git a/crates/codebook-config/Cargo.toml b/crates/codebook-config/Cargo.toml index eca3418..1843762 100644 --- a/crates/codebook-config/Cargo.toml +++ b/crates/codebook-config/Cargo.toml @@ -19,6 +19,7 @@ log.workspace = true regex.workspace = true serde.workspace = true toml.workspace = true +unicase.workspace = true [dev-dependencies] diff --git a/crates/codebook-config/src/settings.rs b/crates/codebook-config/src/settings.rs index fea468d..38d3953 100644 --- a/crates/codebook-config/src/settings.rs +++ b/crates/codebook-config/src/settings.rs @@ -88,9 +88,6 @@ impl<'de> Deserialize<'de> for ConfigSettings { where D: serde::Deserializer<'de>, { - fn to_lowercase_vec(v: Vec<String>) -> Vec<String> { - v.into_iter().map(|s| s.to_lowercase()).collect() - } #[derive(Deserialize)] struct Helper { #[serde(default)] @@ -115,11 +112,18 @@ impl<'de> Deserialize<'de> for ConfigSettings { exclude_tags: Vec<String>, } + // Dictionary IDs are language codes (e.g. "en_US") — normalize to lowercase + // so lookups are case-insensitive. Word lists keep their original casing and + // are compared via unicase::eq. let helper = Helper::deserialize(deserializer)?; Ok(ConfigSettings { - dictionaries: to_lowercase_vec(helper.dictionaries), - words: to_lowercase_vec(helper.words), - flag_words: to_lowercase_vec(helper.flag_words), + dictionaries: helper + .dictionaries + .into_iter() + .map(|s| s.to_ascii_lowercase()) + .collect(), + words: helper.words, + flag_words: helper.flag_words, include_paths: helper.include_paths, ignore_paths: helper.ignore_paths, ignore_patterns: helper.ignore_patterns, @@ -199,14 +203,13 @@ impl ConfigSettings { } /// Insert a word into the allowlist, returning true when it was newly added. + /// Existing entries differing only in case are treated as duplicates. 
pub fn insert_word(&mut self, word: &str) -> bool { - let word = word.to_lowercase(); - if self.words.contains(&word) { + if self.words.iter().any(|w| unicase::eq(w.as_str(), word)) { return false; } - self.words.push(word); + self.words.push(word.to_string()); self.words.sort(); - self.words.dedup(); true } @@ -260,14 +263,12 @@ impl ConfigSettings { /// Check if a word is explicitly allowed. pub fn is_allowed_word(&self, word: &str) -> bool { - let word = word.to_lowercase(); - self.words.iter().any(|w| w == &word) + self.words.iter().any(|w| unicase::eq(w.as_str(), word)) } /// Check if a word should be flagged. pub fn should_flag_word(&self, word: &str) -> bool { - let word = word.to_lowercase(); - self.flag_words.iter().any(|w| w == &word) + self.flag_words.iter().any(|w| unicase::eq(w.as_str(), word)) } /// Retrieve the configured minimum word length. @@ -322,8 +323,8 @@ mod tests { let config: ConfigSettings = toml::from_str(toml_str).unwrap(); assert_eq!(config.dictionaries, vec!["en_us", "en_gb"]); - assert_eq!(config.words, vec!["codebook", "rust", "апгрейдить"]); - assert_eq!(config.flag_words, vec!["todo", "fixme", "ошибка"]); + assert_eq!(config.words, vec!["CodeBook", "Rust", "Апгрейдить"]); + assert_eq!(config.flag_words, vec!["TODO", "FIXME", "Ошибка"]); assert_eq!(config.include_paths, vec!["src/**/*.rs", "lib/"]); assert_eq!(config.ignore_paths, vec!["**/*.md", "target/"]); @@ -515,7 +516,7 @@ mod tests { assert!(config.insert_word("Апгрейдить")); assert!(!config.insert_word("апгрейдить")); - assert_eq!(config.words, vec!["апгрейдить"]); + assert_eq!(config.words, vec!["Апгрейдить"]); assert!(config.is_allowed_word("АПГРЕЙДИТЬ")); assert!(config.is_allowed_word("апгрейдить")); @@ -665,7 +666,7 @@ mod tests { let config: ConfigSettings = toml::from_str(toml_str).unwrap(); assert_eq!(config.dictionaries, vec!["en_us"]); - assert_eq!(config.words, vec!["codebook"]); + assert_eq!(config.words, vec!["CodeBook"]); assert_eq!(config.flag_words, 
Vec::<String>::new()); assert_eq!(config.ignore_paths, Vec::<String>::new()); assert_eq!(config.ignore_patterns, Vec::<String>::new()); diff --git a/crates/codebook/Cargo.toml b/crates/codebook/Cargo.toml index bc0fefb..dc47a39 100644 --- a/crates/codebook/Cargo.toml +++ b/crates/codebook/Cargo.toml @@ -60,6 +60,7 @@ tree-sitter-yaml.workspace = true tree-sitter-zig.workspace = true tree-sitter-c-sharp.workspace = true tree-sitter.workspace = true +unicase.workspace = true unicode-script.workspace = true unicode-segmentation.workspace = true diff --git a/crates/codebook/src/dictionaries/dictionary.rs b/crates/codebook/src/dictionaries/dictionary.rs index 1f31b9d..d603e89 100644 --- a/crates/codebook/src/dictionaries/dictionary.rs +++ b/crates/codebook/src/dictionaries/dictionary.rs @@ -1,4 +1,5 @@ use lru::LruCache; +use unicase::UniCase; use std::{ collections::HashSet, @@ -132,13 +133,12 @@ impl Dictionary for HunspellDictionary { #[derive(Debug)] pub struct TextDictionary { - words: HashSet<String>, + words: HashSet<UniCase<String>>, } impl Dictionary for TextDictionary { fn check(&self, word: &str) -> bool { - let lower = word.to_lowercase(); - self.words.contains(&lower) + self.words.contains(&UniCase::new(word.to_string())) } fn suggest(&self, _word: &str) -> Vec<String> { vec![] @@ -150,7 +150,7 @@ impl TextDictionary { let words = word_list .lines() .filter(|s| !s.is_empty() && !s.starts_with('#')) - .map(|s| s.to_lowercase()) + .map(|s| UniCase::new(s.to_string())) .collect(); Self { words } } @@ -161,7 +161,7 @@ impl TextDictionary { } /// Get a reference to the internal HashSet for batch operations - pub fn word_set(&self) -> &HashSet<String> { + pub fn word_set(&self) -> &HashSet<UniCase<String>> { &self.words } } @@ -200,4 +200,29 @@ mod dictionary_tests { assert!(dict.check("ии")); assert!(dict.check("ИИ")); } + + #[test] + fn test_text_dictionary_preserves_original_case() { + let dict = TextDictionary::new("Straße\n"); + + assert!(dict.check("straße")); + assert!(dict.check("STRAßE")); + // Stored entry keeps its original casing 
rather than being lowercased. + assert!(dict.word_set().contains(&UniCase::new("Straße".to_string()))); + } + + #[test] + fn test_text_dictionary_full_case_folding() { + // unicase handles cases that to_lowercase() cannot collapse. + let dict = TextDictionary::new("Straße\nΣίγμα\n"); + + // ß <-> SS: full case folding + assert!(dict.check("STRASSE")); + assert!(dict.check("strasse")); + assert!(dict.check("Straße")); + + // Greek sigma (Σ/σ/ς) all fold together + assert!(dict.check("ΣΊΓΜΑ")); + assert!(dict.check("σίγμα")); + } }