Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@ tree-sitter-yaml = "0.7.2"
tree-sitter-zig = "<2"
codebook-tree-sitter-latex = "<0.7.0"
codebook-tree-sitter-typst = "<0.13.0"
unicase = "2.9"
unicode-script = "0.5.8"
unicode-segmentation = "1.12.0"
url = "2.4.0"
Expand Down
1 change: 1 addition & 0 deletions crates/codebook-config/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ log.workspace = true
regex.workspace = true
serde.workspace = true
toml.workspace = true
unicase.workspace = true


[dev-dependencies]
Expand Down
37 changes: 19 additions & 18 deletions crates/codebook-config/src/settings.rs
Original file line number Diff line number Diff line change
Expand Up @@ -88,9 +88,6 @@ impl<'de> Deserialize<'de> for ConfigSettings {
where
D: serde::Deserializer<'de>,
{
fn to_lowercase_vec(v: Vec<String>) -> Vec<String> {
v.into_iter().map(|s| s.to_lowercase()).collect()
}
#[derive(Deserialize)]
struct Helper {
#[serde(default)]
Expand All @@ -115,11 +112,18 @@ impl<'de> Deserialize<'de> for ConfigSettings {
exclude_tags: Vec<String>,
}

// Dictionary IDs are language codes (e.g. "en_US") — normalize to lowercase
// so lookups are case-insensitive. Word lists keep their original casing and
// are compared via unicase::eq.
let helper = Helper::deserialize(deserializer)?;
Ok(ConfigSettings {
dictionaries: to_lowercase_vec(helper.dictionaries),
words: to_lowercase_vec(helper.words),
flag_words: to_lowercase_vec(helper.flag_words),
dictionaries: helper
.dictionaries
.into_iter()
.map(|s| s.to_ascii_lowercase())
.collect(),
words: helper.words,
flag_words: helper.flag_words,
include_paths: helper.include_paths,
ignore_paths: helper.ignore_paths,
ignore_patterns: helper.ignore_patterns,
Expand Down Expand Up @@ -199,14 +203,13 @@ impl ConfigSettings {
}

/// Insert a word into the allowlist, returning true when it was newly added.
/// Existing entries differing only in case are treated as duplicates.
pub fn insert_word(&mut self, word: &str) -> bool {
let word = word.to_lowercase();
if self.words.contains(&word) {
if self.words.iter().any(|w| unicase::eq(w.as_str(), word)) {
return false;
}
self.words.push(word);
self.words.push(word.to_string());
self.words.sort();
self.words.dedup();
true
}

Expand Down Expand Up @@ -260,14 +263,12 @@ impl ConfigSettings {

/// Check if a word is explicitly allowed.
pub fn is_allowed_word(&self, word: &str) -> bool {
let word = word.to_lowercase();
self.words.iter().any(|w| w == &word)
self.words.iter().any(|w| unicase::eq(w.as_str(), word))
}

/// Check if a word should be flagged.
pub fn should_flag_word(&self, word: &str) -> bool {
let word = word.to_lowercase();
self.flag_words.iter().any(|w| w == &word)
self.flag_words.iter().any(|w| unicase::eq(w.as_str(), word))
}

/// Retrieve the configured minimum word length.
Expand Down Expand Up @@ -322,8 +323,8 @@ mod tests {
let config: ConfigSettings = toml::from_str(toml_str).unwrap();

assert_eq!(config.dictionaries, vec!["en_us", "en_gb"]);
assert_eq!(config.words, vec!["codebook", "rust", "апгрейдить"]);
assert_eq!(config.flag_words, vec!["todo", "fixme", "ошибка"]);
assert_eq!(config.words, vec!["CodeBook", "Rust", "Апгрейдить"]);
assert_eq!(config.flag_words, vec!["TODO", "FIXME", "Ошибка"]);
assert_eq!(config.include_paths, vec!["src/**/*.rs", "lib/"]);
assert_eq!(config.ignore_paths, vec!["**/*.md", "target/"]);

Expand Down Expand Up @@ -515,7 +516,7 @@ mod tests {

assert!(config.insert_word("Апгрейдить"));
assert!(!config.insert_word("апгрейдить"));
assert_eq!(config.words, vec!["апгрейдить"]);
assert_eq!(config.words, vec!["Апгрейдить"]);
assert!(config.is_allowed_word("АПГРЕЙДИТЬ"));
assert!(config.is_allowed_word("апгрейдить"));

Expand Down Expand Up @@ -665,7 +666,7 @@ mod tests {
let config: ConfigSettings = toml::from_str(toml_str).unwrap();

assert_eq!(config.dictionaries, vec!["en_us"]);
assert_eq!(config.words, vec!["codebook"]);
assert_eq!(config.words, vec!["CodeBook"]);
assert_eq!(config.flag_words, Vec::<String>::new());
assert_eq!(config.ignore_paths, Vec::<String>::new());
assert_eq!(config.ignore_patterns, Vec::<String>::new());
Expand Down
1 change: 1 addition & 0 deletions crates/codebook/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ tree-sitter-yaml.workspace = true
tree-sitter-zig.workspace = true
tree-sitter-c-sharp.workspace = true
tree-sitter.workspace = true
unicase.workspace = true
unicode-script.workspace = true
unicode-segmentation.workspace = true

Expand Down
35 changes: 30 additions & 5 deletions crates/codebook/src/dictionaries/dictionary.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
use lru::LruCache;
use unicase::UniCase;

use std::{
collections::HashSet,
Expand Down Expand Up @@ -132,13 +133,12 @@ impl Dictionary for HunspellDictionary {

#[derive(Debug)]
pub struct TextDictionary {
words: HashSet<String>,
words: HashSet<UniCase<String>>,
}

impl Dictionary for TextDictionary {
fn check(&self, word: &str) -> bool {
let lower = word.to_lowercase();
self.words.contains(&lower)
self.words.contains(&UniCase::new(word.to_string()))
}
fn suggest(&self, _word: &str) -> Vec<String> {
vec![]
Expand All @@ -150,7 +150,7 @@ impl TextDictionary {
let words = word_list
.lines()
.filter(|s| !s.is_empty() && !s.starts_with('#'))
.map(|s| s.to_lowercase())
.map(|s| UniCase::new(s.to_string()))
.collect();
Self { words }
}
Expand All @@ -161,7 +161,7 @@ impl TextDictionary {
}

/// Get a reference to the internal HashSet for batch operations
pub fn word_set(&self) -> &HashSet<String> {
pub fn word_set(&self) -> &HashSet<UniCase<String>> {
&self.words
}
}
Expand Down Expand Up @@ -200,4 +200,29 @@ mod dictionary_tests {
assert!(dict.check("ии"));
assert!(dict.check("ИИ"));
}

/// Verifies that `TextDictionary` matches words case-insensitively while
/// keeping each stored entry in the casing it was given.
#[test]
fn test_text_dictionary_preserves_original_case() {
    let dict = TextDictionary::new("Straße\n");

    // Lookups succeed regardless of the casing of the query.
    assert!(dict.check("straße"));
    assert!(dict.check("STRAßE"));

    // `UniCase` equality ignores case, so a plain `contains` would succeed even
    // if the entry had been lowercased on insert. Fetch the stored key and
    // compare its text exactly to prove the original casing was kept.
    let stored = dict
        .word_set()
        .get(&UniCase::new("Straße".to_string()))
        .expect("entry should be present in the word set");
    let original: &str = stored.as_ref();
    assert_eq!(original, "Straße");
}

/// Exercises lookups that rely on full Unicode case folding, which a simple
/// `to_lowercase()` comparison cannot collapse.
#[test]
fn test_text_dictionary_full_case_folding() {
    let dict = TextDictionary::new("Straße\nΣίγμα\n");

    // Sharp s: the "SS"/"ss" spellings must match the stored "ß" entry,
    // alongside the entry's own spelling.
    for spelling in ["STRASSE", "strasse", "Straße"] {
        assert!(dict.check(spelling));
    }

    // Greek: all-uppercase and all-lowercase spellings of the stored
    // mixed-case entry must both be found.
    for spelling in ["ΣΊΓΜΑ", "σίγμα"] {
        assert!(dict.check(spelling));
    }
}
}
Loading