Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 46 additions & 17 deletions src/indexer/file_walker/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ use super::file_info::FileInfo;
use super::language::detect_language;
use super::pdf_extractor::extract_pdf_to_markdown;
use anyhow::{Context, Result};
use globset::{Glob, GlobSet, GlobSetBuilder};
use ignore::WalkBuilder;
use sha2::{Digest, Sha256};
use std::fs;
Expand All @@ -17,10 +18,37 @@ pub struct FileWalker {
pub(crate) max_file_size: usize,
pub(crate) include_patterns: Vec<String>,
pub(crate) exclude_patterns: Vec<String>,
/// Compiled glob sets, built once from the pattern strings.
include_set: Option<GlobSet>,
exclude_set: Option<GlobSet>,
/// Optional cancellation flag - if set to true, walk() will exit early
cancelled: Option<Arc<AtomicBool>>,
}

/// Compile a list of glob patterns into a GlobSet. Patterns that fail to compile are
/// skipped with a warning rather than aborting the whole walk. Returns None when empty.
fn build_glob_set(patterns: &[String]) -> Option<GlobSet> {
if patterns.is_empty() {
return None;
}
let mut builder = GlobSetBuilder::new();
for pattern in patterns {
match Glob::new(pattern) {
Ok(glob) => {
builder.add(glob);
}
Err(e) => tracing::warn!("Ignoring invalid glob pattern '{}': {}", pattern, e),
}
}
match builder.build() {
Ok(set) => Some(set),
Err(e) => {
tracing::warn!("Failed to build glob set: {}", e);
None
}
}
}

impl FileWalker {
pub fn new(root: impl AsRef<Path>, max_file_size: usize) -> Self {
Self {
Expand All @@ -29,6 +57,8 @@ impl FileWalker {
max_file_size,
include_patterns: vec![],
exclude_patterns: vec![],
include_set: None,
exclude_set: None,
cancelled: None,
}
}
Expand Down Expand Up @@ -57,6 +87,8 @@ impl FileWalker {
include_patterns: Vec<String>,
exclude_patterns: Vec<String>,
) -> Self {
self.include_set = build_glob_set(&include_patterns);
self.exclude_set = build_glob_set(&exclude_patterns);
self.include_patterns = include_patterns;
self.exclude_patterns = exclude_patterns;
self
Expand Down Expand Up @@ -195,26 +227,23 @@ impl FileWalker {
Ok((non_printable as f64 / content.len() as f64) < 0.3)
}

/// Check if file matches include/exclude patterns
/// Check if a file matches the include/exclude glob patterns.
///
/// Globs are matched against the path relative to the walk root, so patterns like
/// `**/vendor/**` or `src/**/*.rs` behave as expected regardless of where the root sits.
pub(crate) fn matches_patterns(&self, path: &Path) -> bool {
let path_str = path.to_string_lossy();

// If include patterns are specified, file must match at least one
if !self.include_patterns.is_empty() {
let matches_include = self
.include_patterns
.iter()
.any(|pattern| path_str.contains(pattern));
if !matches_include {
return false;
}
let rel = path.strip_prefix(&self.root).unwrap_or(path);

// If include patterns are specified, file must match at least one.
if let Some(include_set) = &self.include_set
&& !include_set.is_match(rel)
{
return false;
}

// File must not match any exclude pattern
if self
.exclude_patterns
.iter()
.any(|pattern| path_str.contains(pattern))
// File must not match any exclude pattern.
if let Some(exclude_set) = &self.exclude_set
&& exclude_set.is_match(rel)
{
return false;
}
Expand Down
34 changes: 14 additions & 20 deletions src/indexer/file_walker/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -132,19 +132,13 @@ fn test_walk_with_include_patterns() {
fs::write(temp_dir.path().join("test.toml"), "toml").unwrap();

let walker =
FileWalker::new(temp_dir.path(), 1024).with_patterns(vec![".rs".to_string()], vec![]);
FileWalker::new(temp_dir.path(), 1024).with_patterns(vec!["**/*.rs".to_string()], vec![]);
let files = walker.walk().unwrap();

// Debug: print what we found
eprintln!("Found {} files", files.len());
for f in &files {
eprintln!(" - {:?}", f.path);
}

assert_eq!(files.len(), 1, "Expected 1 file matching .rs pattern");
assert_eq!(files.len(), 1, "Expected 1 file matching **/*.rs pattern");
assert!(
files[0].relative_path.contains(".rs"),
"Expected file to contain .rs"
files[0].relative_path.ends_with(".rs"),
"Expected file to end with .rs"
);
}

Expand All @@ -155,12 +149,12 @@ fn test_walk_with_exclude_patterns() {
fs::write(temp_dir.path().join("exclude.txt"), "exclude").unwrap();

let walker =
FileWalker::new(temp_dir.path(), 1024).with_patterns(vec![], vec![".txt".to_string()]);
FileWalker::new(temp_dir.path(), 1024).with_patterns(vec![], vec!["**/*.txt".to_string()]);
let files = walker.walk().unwrap();
assert_eq!(files.len(), 1, "Expected 1 file after excluding .txt");
assert_eq!(files.len(), 1, "Expected 1 file after excluding **/*.txt");
assert!(
files[0].relative_path.contains(".rs"),
"Expected file to contain .rs"
files[0].relative_path.ends_with(".rs"),
"Expected file to end with .rs"
);
}

Expand All @@ -172,7 +166,7 @@ fn test_walk_with_include_and_exclude_patterns() {
fs::write(temp_dir.path().join("other.txt"), "other").unwrap();

let walker = FileWalker::new(temp_dir.path(), 1024)
.with_patterns(vec![".rs".to_string()], vec!["test".to_string()]);
.with_patterns(vec!["**/*.rs".to_string()], vec!["**/test*".to_string()]);
let files = walker.walk().unwrap();
assert_eq!(files.len(), 1);
assert!(files[0].path.ends_with("src.rs"));
Expand Down Expand Up @@ -262,15 +256,15 @@ fn test_matches_patterns_no_patterns() {

#[test]
fn test_matches_patterns_include_match() {
let walker = FileWalker::new("/tmp", 1024).with_patterns(vec![".rs".to_string()], vec![]);
let walker = FileWalker::new("/tmp", 1024).with_patterns(vec!["**/*.rs".to_string()], vec![]);
assert!(walker.matches_patterns(Path::new("/tmp/test.rs")));
assert!(!walker.matches_patterns(Path::new("/tmp/test.txt")));
}

#[test]
fn test_matches_patterns_include_multiple() {
let walker = FileWalker::new("/tmp", 1024)
.with_patterns(vec![".rs".to_string(), ".toml".to_string()], vec![]);
.with_patterns(vec!["**/*.rs".to_string(), "**/*.toml".to_string()], vec![]);
assert!(walker.matches_patterns(Path::new("/tmp/test.rs")));
assert!(walker.matches_patterns(Path::new("/tmp/Cargo.toml")));
assert!(!walker.matches_patterns(Path::new("/tmp/test.txt")));
Expand All @@ -279,7 +273,7 @@ fn test_matches_patterns_include_multiple() {
#[test]
fn test_matches_patterns_exclude_match() {
let walker =
FileWalker::new("/tmp", 1024).with_patterns(vec![], vec!["target".to_string()]);
FileWalker::new("/tmp", 1024).with_patterns(vec![], vec!["**/target/**".to_string()]);
assert!(walker.matches_patterns(Path::new("/tmp/src/main.rs")));
assert!(!walker.matches_patterns(Path::new("/tmp/target/debug/main")));
}
Expand All @@ -288,7 +282,7 @@ fn test_matches_patterns_exclude_match() {
fn test_matches_patterns_exclude_multiple() {
let walker = FileWalker::new("/tmp", 1024).with_patterns(
vec![],
vec!["target".to_string(), "node_modules".to_string()],
vec!["**/target/**".to_string(), "**/node_modules/**".to_string()],
);
assert!(walker.matches_patterns(Path::new("/tmp/src/main.rs")));
assert!(!walker.matches_patterns(Path::new("/tmp/target/debug/main")));
Expand All @@ -298,7 +292,7 @@ fn test_matches_patterns_exclude_multiple() {
#[test]
fn test_matches_patterns_include_and_exclude() {
let walker = FileWalker::new("/tmp", 1024)
.with_patterns(vec![".rs".to_string()], vec!["test".to_string()]);
.with_patterns(vec!["**/*.rs".to_string()], vec!["**/test*".to_string()]);
assert!(walker.matches_patterns(Path::new("/tmp/src/main.rs")));
assert!(!walker.matches_patterns(Path::new("/tmp/src/test.rs")));
assert!(!walker.matches_patterns(Path::new("/tmp/src/main.txt")));
Expand Down