From 8f3f2eebcb5537b7d3550107fc6fc60774d91bbc Mon Sep 17 00:00:00 2001 From: choiharam Date: Thu, 28 May 2026 00:55:50 +0200 Subject: [PATCH] fix: replace substring pattern matching with globset in FileWalker matches_patterns() used path_str.contains(pattern), treating every pattern as a literal substring. Patterns like "**/vendor/**" never matched any path because the string "**/vendor/**" is not a substring of real file paths, making exclude_patterns effectively broken for glob-style inputs. globset is already a declared dependency. This change: - Adds build_glob_set() to compile patterns into a GlobSet once on construction rather than on every call to matches_patterns(). - Rewrites matches_patterns() to match against the path relative to the walk root, so patterns behave correctly regardless of where the root directory sits in the filesystem. - Updates tests to use glob-style patterns (e.g. **/*.rs, **/target/**) that reflect actual usage. --- src/indexer/file_walker/mod.rs | 63 +++++++++++++++++++++++--------- src/indexer/file_walker/tests.rs | 34 +++++++---------- 2 files changed, 60 insertions(+), 37 deletions(-) diff --git a/src/indexer/file_walker/mod.rs b/src/indexer/file_walker/mod.rs index 5f9bdd9..808df50 100644 --- a/src/indexer/file_walker/mod.rs +++ b/src/indexer/file_walker/mod.rs @@ -4,6 +4,7 @@ use super::file_info::FileInfo; use super::language::detect_language; use super::pdf_extractor::extract_pdf_to_markdown; use anyhow::{Context, Result}; +use globset::{Glob, GlobSet, GlobSetBuilder}; use ignore::WalkBuilder; use sha2::{Digest, Sha256}; use std::fs; @@ -17,10 +18,37 @@ pub struct FileWalker { pub(crate) max_file_size: usize, pub(crate) include_patterns: Vec, pub(crate) exclude_patterns: Vec, + /// Compiled glob sets, built once from the pattern strings. + include_set: Option, + exclude_set: Option, /// Optional cancellation flag - if set to true, walk() will exit early cancelled: Option>, } +/// Compile a list of glob patterns into a GlobSet. Patterns that fail to compile are +/// skipped with a warning rather than aborting the whole walk. Returns None when empty. +fn build_glob_set(patterns: &[String]) -> Option { + if patterns.is_empty() { + return None; + } + let mut builder = GlobSetBuilder::new(); + for pattern in patterns { + match Glob::new(pattern) { + Ok(glob) => { + builder.add(glob); + } + Err(e) => tracing::warn!("Ignoring invalid glob pattern '{}': {}", pattern, e), + } + } + match builder.build() { + Ok(set) => Some(set), + Err(e) => { + tracing::warn!("Failed to build glob set: {}", e); + None + } + } +} + impl FileWalker { pub fn new(root: impl AsRef, max_file_size: usize) -> Self { Self { @@ -29,6 +57,8 @@ impl FileWalker { max_file_size, include_patterns: vec![], exclude_patterns: vec![], + include_set: None, + exclude_set: None, cancelled: None, } } @@ -57,6 +87,8 @@ impl FileWalker { include_patterns: Vec, exclude_patterns: Vec, ) -> Self { + self.include_set = build_glob_set(&include_patterns); + self.exclude_set = build_glob_set(&exclude_patterns); self.include_patterns = include_patterns; self.exclude_patterns = exclude_patterns; self @@ -195,26 +227,23 @@ impl FileWalker { Ok((non_printable as f64 / content.len() as f64) < 0.3) } - /// Check if file matches include/exclude patterns + /// Check if a file matches the include/exclude glob patterns. + /// + /// Globs are matched against the path relative to the walk root, so patterns like + /// `**/vendor/**` or `src/**/*.rs` behave as expected regardless of where the root sits. pub(crate) fn matches_patterns(&self, path: &Path) -> bool { - let path_str = path.to_string_lossy(); - - // If include patterns are specified, file must match at least one - if !self.include_patterns.is_empty() { - let matches_include = self - .include_patterns - .iter() - .any(|pattern| path_str.contains(pattern)); - if !matches_include { - return false; - } + let rel = path.strip_prefix(&self.root).unwrap_or(path); + + // If include patterns are specified, file must match at least one. + if let Some(include_set) = &self.include_set + && !include_set.is_match(rel) + { + return false; } - // File must not match any exclude pattern - if self - .exclude_patterns - .iter() - .any(|pattern| path_str.contains(pattern)) + // File must not match any exclude pattern. + if let Some(exclude_set) = &self.exclude_set + && exclude_set.is_match(rel) { return false; } diff --git a/src/indexer/file_walker/tests.rs b/src/indexer/file_walker/tests.rs index 542e4c7..2cd046c 100644 --- a/src/indexer/file_walker/tests.rs +++ b/src/indexer/file_walker/tests.rs @@ -132,19 +132,13 @@ fn test_walk_with_include_patterns() { fs::write(temp_dir.path().join("test.toml"), "toml").unwrap(); let walker = - FileWalker::new(temp_dir.path(), 1024).with_patterns(vec![".rs".to_string()], vec![]); + FileWalker::new(temp_dir.path(), 1024).with_patterns(vec!["**/*.rs".to_string()], vec![]); let files = walker.walk().unwrap(); - // Debug: print what we found - eprintln!("Found {} files", files.len()); - for f in &files { - eprintln!(" - {:?}", f.path); - } - - assert_eq!(files.len(), 1, "Expected 1 file matching .rs pattern"); + assert_eq!(files.len(), 1, "Expected 1 file matching **/*.rs pattern"); assert!( - files[0].relative_path.contains(".rs"), - "Expected file to contain .rs" + files[0].relative_path.ends_with(".rs"), + "Expected file to end with .rs" ); } @@ -155,12 +149,12 @@ fn test_walk_with_exclude_patterns() { fs::write(temp_dir.path().join("exclude.txt"), "exclude").unwrap(); let walker = - FileWalker::new(temp_dir.path(), 1024).with_patterns(vec![], vec![".txt".to_string()]); + FileWalker::new(temp_dir.path(), 1024).with_patterns(vec![], vec!["**/*.txt".to_string()]); let files = walker.walk().unwrap(); - assert_eq!(files.len(), 1, "Expected 1 file after excluding .txt"); + assert_eq!(files.len(), 1, "Expected 1 file after excluding **/*.txt"); assert!( - files[0].relative_path.contains(".rs"), - "Expected file to contain .rs" + files[0].relative_path.ends_with(".rs"), + "Expected file to end with .rs" ); } @@ -172,7 +166,7 @@ fn test_walk_with_include_and_exclude_patterns() { fs::write(temp_dir.path().join("other.txt"), "other").unwrap(); let walker = FileWalker::new(temp_dir.path(), 1024) - .with_patterns(vec![".rs".to_string()], vec!["test".to_string()]); + .with_patterns(vec!["**/*.rs".to_string()], vec!["**/test*".to_string()]); let files = walker.walk().unwrap(); assert_eq!(files.len(), 1); assert!(files[0].path.ends_with("src.rs")); @@ -262,7 +256,7 @@ fn test_matches_patterns_no_patterns() { #[test] fn test_matches_patterns_include_match() { - let walker = FileWalker::new("/tmp", 1024).with_patterns(vec![".rs".to_string()], vec![]); + let walker = FileWalker::new("/tmp", 1024).with_patterns(vec!["**/*.rs".to_string()], vec![]); assert!(walker.matches_patterns(Path::new("/tmp/test.rs"))); assert!(!walker.matches_patterns(Path::new("/tmp/test.txt"))); } @@ -270,7 +264,7 @@ fn test_matches_patterns_include_match() { #[test] fn test_matches_patterns_include_multiple() { let walker = FileWalker::new("/tmp", 1024) - .with_patterns(vec![".rs".to_string(), ".toml".to_string()], vec![]); + .with_patterns(vec!["**/*.rs".to_string(), "**/*.toml".to_string()], vec![]); assert!(walker.matches_patterns(Path::new("/tmp/test.rs"))); assert!(walker.matches_patterns(Path::new("/tmp/Cargo.toml"))); assert!(!walker.matches_patterns(Path::new("/tmp/test.txt"))); @@ -279,7 +273,7 @@ fn test_matches_patterns_include_multiple() { #[test] fn test_matches_patterns_exclude_match() { let walker = - FileWalker::new("/tmp", 1024).with_patterns(vec![], vec!["target".to_string()]); + FileWalker::new("/tmp", 1024).with_patterns(vec![], vec!["**/target/**".to_string()]); assert!(walker.matches_patterns(Path::new("/tmp/src/main.rs"))); assert!(!walker.matches_patterns(Path::new("/tmp/target/debug/main"))); } @@ -288,7 +282,7 @@ fn test_matches_patterns_exclude_match() { fn test_matches_patterns_exclude_multiple() { let walker = FileWalker::new("/tmp", 1024).with_patterns( vec![], - vec!["target".to_string(), "node_modules".to_string()], + vec!["**/target/**".to_string(), "**/node_modules/**".to_string()], ); assert!(walker.matches_patterns(Path::new("/tmp/src/main.rs"))); assert!(!walker.matches_patterns(Path::new("/tmp/target/debug/main"))); @@ -298,7 +292,7 @@ fn test_matches_patterns_exclude_multiple() { #[test] fn test_matches_patterns_include_and_exclude() { let walker = FileWalker::new("/tmp", 1024) - .with_patterns(vec![".rs".to_string()], vec!["test".to_string()]); + .with_patterns(vec!["**/*.rs".to_string()], vec!["**/test*".to_string()]); assert!(walker.matches_patterns(Path::new("/tmp/src/main.rs"))); assert!(!walker.matches_patterns(Path::new("/tmp/src/test.rs"))); assert!(!walker.matches_patterns(Path::new("/tmp/src/main.txt")));