Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
88 changes: 87 additions & 1 deletion crates/codebook/src/parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ use std::collections::{HashMap, HashSet};
use std::str::FromStr;
use std::sync::{LazyLock, Mutex};
use streaming_iterator::StreamingIterator;
use tree_sitter::{Parser, Query, QueryCursor};
use tree_sitter::{Node, Parser, Query, QueryCursor, QueryPredicateArg};
use unicode_script::{Script, UnicodeScript};
use unicode_segmentation::UnicodeSegmentation;

Expand All @@ -16,10 +16,21 @@ use unicode_segmentation::UnicodeSegmentation;
static PARSER_CACHE: LazyLock<Mutex<HashMap<LanguageType, Parser>>> =
LazyLock::new(|| Mutex::new(HashMap::new()));

/// One `(#not-has-ancestor? @capture "kind" ...)` rule, pre-parsed so the
/// hot path does no string scanning. Drops a match's capture when any
/// ancestor of the captured node has a kind in `kinds`.
struct NotHasAncestorRule {
/// Index of the capture this rule applies to; the rule is only consulted
/// for captures in a match whose index equals this value.
capture_index: u32,
/// Tree-sitter node kinds that disqualify the capture when one of them is
/// the kind of any ancestor. `parse_not_has_ancestor` rejects predicates
/// that list no kinds, so rules it builds always carry at least one.
kinds: Vec<String>,
}

/// Pre-compiled query for a language, with its capture names.
struct CompiledQuery {
/// The compiled tree-sitter query that matches are run against.
query: Query,
/// Capture names stored as owned strings alongside the query.
capture_names: Vec<String>,
/// `#not-has-ancestor?` rules, one inner vec per pattern of `query`,
/// indexed by `pattern_index`. Empty inner vec = no filtering for that
/// pattern, which is the common case across all .scm files.
not_has_ancestor: Vec<Vec<NotHasAncestorRule>>,
}

/// All tree-sitter queries compiled eagerly at startup. Since queries come
Expand All @@ -42,17 +53,78 @@ static COMPILED_QUERIES: LazyLock<HashMap<LanguageType, CompiledQuery>> = LazyLo
.iter()
.map(|s| s.to_string())
.collect();
let not_has_ancestor = (0..query.pattern_count())
.map(|i| parse_not_has_ancestor(&query, i, setting.type_))
.collect();
map.insert(
setting.type_,
CompiledQuery {
query,
capture_names,
not_has_ancestor,
},
);
}
map
});

/// Collects the `(#not-has-ancestor? @cap "kind" ...)` predicates attached to
/// the pattern at `pattern_index` into pre-parsed [`NotHasAncestorRule`]s.
///
/// General predicates with any other operator are ignored here and left for
/// whoever else handles them. `language` is only used to label panic messages.
///
/// # Panics
///
/// Panics when a `#not-has-ancestor?` predicate is malformed, so .scm authors
/// get an immediate error rather than a silent no-op at runtime:
/// - its first argument is missing or is not a capture,
/// - any later argument is a capture instead of a string node kind,
/// - no node kinds are listed after the capture.
fn parse_not_has_ancestor(
    query: &Query,
    pattern_index: usize,
    language: LanguageType,
) -> Vec<NotHasAncestorRule> {
    query
        .general_predicates(pattern_index)
        .iter()
        .filter(|predicate| &*predicate.operator == "not-has-ancestor?")
        .map(|predicate| {
            // The leading argument names the capture being filtered; every
            // argument after it must be a node-kind string.
            let Some((QueryPredicateArg::Capture(capture_index), kind_args)) =
                predicate.args.split_first()
            else {
                panic!(
                    "{:?}: #not-has-ancestor? must take a capture as its first argument",
                    language
                );
            };

            let kinds: Vec<String> = kind_args
                .iter()
                .map(|arg| match arg {
                    QueryPredicateArg::String(kind) => kind.to_string(),
                    QueryPredicateArg::Capture(_) => panic!(
                        "{:?}: #not-has-ancestor? takes string node kinds after the capture",
                        language
                    ),
                })
                .collect();

            // A rule with no kinds could never match anything; treat it as an
            // authoring mistake.
            if kinds.is_empty() {
                panic!(
                    "{:?}: #not-has-ancestor? needs at least one node kind",
                    language
                );
            }

            NotHasAncestorRule {
                capture_index: *capture_index,
                kinds,
            }
        })
        .collect()
}

/// Returns true if any ancestor of `node` has a kind in `kinds`.
fn has_ancestor_kind(node: Node, kinds: &[String]) -> bool {
let mut cur = node.parent();
while let Some(parent) = cur {
let kind = parent.kind();
if kinds.iter().any(|k| k == kind) {
return true;
}
cur = parent.parent();
}
false
}

#[derive(Debug, Clone, Copy, PartialEq, Ord, Eq, PartialOrd, Hash)]
pub struct TextRange {
/// Start position in utf-8 byte offset
Expand Down Expand Up @@ -211,6 +283,11 @@ fn extract_recursive<'a>(
let mut matches_query = cursor.matches(&compiled.query, root_node, provider);

while let Some(match_) = matches_query.next() {
// Per-pattern `#not-has-ancestor?` rules. Cheap: the inner vec is
// empty for every pattern in every .scm except the rare ones that
// declare a rule, so we pay one `is_empty` check on the hot path.
let ancestor_rules = &compiled.not_has_ancestor[match_.pattern_index];

// First pass: look for dynamic injection pairs in this match
let mut injection_content: Option<tree_sitter::Node> = None;
let mut injection_language_text: Option<&str> = None;
Expand Down Expand Up @@ -266,6 +343,15 @@ fn extract_recursive<'a>(
continue;
}

// Apply any `#not-has-ancestor?` rules that target this capture.
if !ancestor_rules.is_empty()
&& ancestor_rules.iter().any(|rule| {
rule.capture_index == capture.index && has_ancestor_kind(node, &rule.kinds)
})
{
continue;
}

if let Some(lang_name) = tag.strip_prefix("injection.") {
// Static injection: @injection.html, @injection.javascript, etc.
if let Ok(child_lang) = LanguageType::from_str(lang_name)
Expand Down
15 changes: 15 additions & 0 deletions crates/codebook/src/queries/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,21 @@ Every capture name is a **tag** that categorizes the matched text. Tags use a do

Not every language needs every tag. HTML, for example, only uses `@comment` and `@string`. You can get a feel for which tags are available for a specific language by looking at the `scm` file for that language in this directory.

### Custom Predicates

In addition to tree-sitter's built-in predicates (`#eq?`, `#match?`, `#any-of?`, …), codebook evaluates one extra predicate:

| Predicate | Effect |
| --- | --- |
| `(#not-has-ancestor? @capture "kind" ["kind" …])` | Drop the capture if any ancestor of the captured node has one of the listed tree-sitter node kinds. |

Use this to narrow a broad capture instead of enumerating every positive context. For example, Python's `(string_content)` matches every string in the file, but strings inside type annotations (forward references, generic arguments) are type names that a type checker validates, so codebook should not spell-check them:

```scheme
((string_content) @string
(#not-has-ancestor? @string "type"))
```

### Injection Tags (Multi-Language Support)

Injection tags tell codebook to re-parse a region of the file using a different language's grammar. This is how Markdown code blocks, HTML `<script>` tags, and similar multi-language files are handled.
Expand Down
5 changes: 4 additions & 1 deletion crates/codebook/src/queries/python.scm
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
(comment) @comment

(string_content) @string
; String content inside type annotations (forward references, generic args)
; belongs to the type checker, not the spell checker — skip it.
((string_content) @string
(#not-has-ancestor? @string "type"))

(function_definition
name: (identifier) @identifier.function)
Expand Down
46 changes: 46 additions & 0 deletions crates/codebook/tests/languages/test_python.rs
Original file line number Diff line number Diff line change
Expand Up @@ -344,6 +344,52 @@ fn test_python_import_statements() {
assert_eq!(misspelled, expected);
}

/// Checks that nothing written inside a Python type annotation is reported
/// as a misspelling, whether it is a bare identifier or a quoted forward
/// reference.
#[test]
fn test_python_type_annotations() {
super::utils::init_logging();
let processor = super::utils::get_processor();

// Variable annotations, parameter annotations, and return types — both
// bare identifiers and string forward references — should be ignored.
// Regression test for https://github.com/blopker/codebook/issues/187.
let sample = r#"
from typing import Union

a: no_typpoa = ...
b: 'no_typpob' = ...
c: "no_typpoc" = ...
d: """no_typpod""" = ...
e: str | no_typpoe | "no_typpof" = ...
f: Union[str, no_typpog, "no_typpoh"] = ...
g: list["no_typpoi"] = ...

def func(
param_a: no_typpoj,
param_b: 'no_typpok',
param_d: no_typpom = ...,
):
pass

def func2() -> str | no_typpon | "no_typpoo":
pass
"#;

// Spell-check the sample as Python and keep just the reported words.
let misspelled = processor
.spell_check(sample, Some(LanguageType::Python), None)
.to_vec();
let words: Vec<&str> = misspelled.iter().map(|r| r.word.as_str()).collect();
println!("Misspelled words: {words:?}");
// Each forbidden entry is the `typpo*` part of one `no_typpo*` name in the
// sample (presumably the checker splits identifiers on `_` — confirm), so
// every annotation position above is covered by exactly one entry.
// This only asserts absence: the test does not require any particular
// word to be reported.
for forbidden in [
"typpoa", "typpob", "typpoc", "typpod", "typpoe", "typpof", "typpog", "typpoh", "typpoi",
"typpoj", "typpok", "typpom", "typpon", "typpoo",
] {
assert!(
!words.contains(&forbidden),
"did not expect {forbidden:?} in type annotation, got {words:?}"
);
}
}

#[test]
fn test_python_functions() {
super::utils::init_logging();
Expand Down
Loading