diff --git a/docs/src/format/index/scalar/fts.md b/docs/src/format/index/scalar/fts.md index c051f8d4bd5..792702aca08 100644 --- a/docs/src/format/index/scalar/fts.md +++ b/docs/src/format/index/scalar/fts.md @@ -56,7 +56,7 @@ The metadata file contains JSON-serialized configuration and partition informati | Field | Type | Default | Description | |---------------------|---------|-----------|----------------------------------------------------------------| -| `base_tokenizer` | String | "icu" | Base tokenizer type (see Tokenizers section) | +| `base_tokenizer` | String | "simple" | Base tokenizer type (see Tokenizers section) | | `language` | String | "English" | Language for stemming and stop words | | `with_position` | Boolean | false | Store term positions for phrase queries (increases index size) | | `max_token_length` | UInt32? | None | Maximum token length (tokens longer than this are removed) | @@ -76,17 +76,17 @@ The full text search index supports multiple tokenizer types for different text | Tokenizer | Description | Use Case | |----------------|---------------------------------------------------------------------------|------------------------| -| **icu** | ICU dictionary-based Unicode word segmentation | Mixed-language text (default) | -| **simple** | Splits on whitespace and punctuation, removes non-alphanumeric characters | General ASCII-oriented text | +| **simple** | Splits on whitespace and punctuation, removes non-alphanumeric characters | General text (default) | | **whitespace** | Splits only on whitespace characters | Preserve punctuation | | **raw** | No tokenization, treats entire text as single token | Exact matching | | **ngram** | Breaks text into overlapping character sequences | Substring/fuzzy search | +| **icu** | ICU dictionary-based Unicode word segmentation | Mixed-language text | | **jieba/*** | Chinese text tokenizer with word segmentation | Chinese text | | **lindera/*** | Japanese text tokenizer with morphological analysis | 
Japanese text | #### ICU Tokenizer (Mixed-language text) -The ICU tokenizer uses Unicode word boundary rules and dictionary-based segmentation for complex scripts. It is the default tokenizer because it handles mixed-language text where the `simple` tokenizer would keep an unspaced CJK span as one large token. +The ICU tokenizer uses Unicode word boundary rules and dictionary-based segmentation for complex scripts. It is useful for mixed-language text where the default `simple` tokenizer would keep an unspaced CJK span as one large token. - **Models**: Uses compiled ICU4X segmenter data bundled with Lance - **Usage**: Specify as `icu` diff --git a/docs/src/guide/json.md b/docs/src/guide/json.md index 60d63172676..7246c8fe08a 100644 --- a/docs/src/guide/json.md +++ b/docs/src/guide/json.md @@ -292,7 +292,7 @@ on a single path, create an `INVERTED` index on the JSON column. dataset.create_scalar_index( "data", index_type="INVERTED", - base_tokenizer="icu", + base_tokenizer="simple", lower_case=True, stem=True, remove_stop_words=True, diff --git a/docs/src/quickstart/full-text-search.md b/docs/src/quickstart/full-text-search.md index e008bf8f76f..17327e40bc5 100644 --- a/docs/src/quickstart/full-text-search.md +++ b/docs/src/quickstart/full-text-search.md @@ -90,7 +90,7 @@ ds.create_scalar_index( index_type="INVERTED", name="text_idx", # Optional index name (if omitted, default is "text_idx") with_position=False, # Set True to enable phrase queries (stores token positions) - base_tokenizer="icu", # Tokenizer: "icu" (default), "simple" (whitespace+punct), "whitespace", or "raw" + base_tokenizer="simple", # Tokenizer: "simple" (whitespace+punct), "icu", "whitespace", or "raw" (no tokenization) language="English", # Language used for stemming + stop words (only used if `stem` or `remove_stop_words` is True) max_token_length=40, # Drop tokens longer than this length lower_case=True, # Lowercase text before tokenization @@ -103,13 +103,13 @@ ds.create_scalar_index( ### 
Tokenizer Options -- **icu**: Unicode word segmentation with built-in ICU dictionaries (default) - **simple**: Splits tokens on whitespace and punctuation - **whitespace**: Splits tokens only on whitespace - **raw**: No tokenization (useful for exact matching) Lance also supports multilingual tokenization: +- **icu**: Unicode word segmentation with built-in ICU dictionaries - **jieba/default**: Chinese text tokenization using Jieba - **lindera/ipadic**: Japanese text tokenization using Lindera with IPAdic dictionary - **lindera/ko-dic**: Korean text tokenization using Lindera with Ko-dic dictionary diff --git a/python/python/lance/dataset.py b/python/python/lance/dataset.py index 4cbd39fdebb..70e6867d8dd 100644 --- a/python/python/lance/dataset.py +++ b/python/python/lance/dataset.py @@ -3107,11 +3107,9 @@ def create_scalar_index( ``[1, num_compute_cpus]``. If unset, Lance uses ``num_compute_cpus`` workers unless ``LANCE_FTS_NUM_SHARDS`` is set. This parameter is only used for the current build and is not persisted with the index. - base_tokenizer: str, default "icu" + base_tokenizer: str, default "simple" This is for the ``INVERTED`` index. The base tokenizer to use. The value can be: - * "icu": Unicode word segmentation with dictionary support for CJK and - other scripts. * "simple": splits tokens on whitespace and punctuation. * "whitespace": splits tokens on whitespace. * "raw": no tokenization. 
diff --git a/python/python/tests/compat/test_scalar_indices.py b/python/python/tests/compat/test_scalar_indices.py index a01615abf53..35022df3b12 100644 --- a/python/python/tests/compat/test_scalar_indices.py +++ b/python/python/tests/compat/test_scalar_indices.py @@ -320,9 +320,7 @@ def create(self): max_rows_per_file=100, data_storage_version=safe_data_storage_version(self.compat_version), ) - dataset.create_scalar_index( - "text", "INVERTED", with_position=True, base_tokenizer="simple" - ) + dataset.create_scalar_index("text", "INVERTED", with_position=True) def check_read(self): """Verify FTS index can be queried.""" diff --git a/python/python/tests/test_scalar_index.py b/python/python/tests/test_scalar_index.py index 73b6680cfff..7b4dede319b 100644 --- a/python/python/tests/test_scalar_index.py +++ b/python/python/tests/test_scalar_index.py @@ -694,27 +694,6 @@ def test_unindexed_full_text_search_on_empty_index(tmp_path): assert results.num_rows == 1 -def test_default_fts_tokenizer_handles_unspaced_multilingual_text(tmp_path): - data = pa.table( - { - "id": [0, 1], - "text": ["Hello, こんにちは世界!", "Hello, こんにちは!"], - } - ) - ds = lance.write_dataset(data, tmp_path) - ds.create_scalar_index( - "text", - index_type="INVERTED", - stem=False, - remove_stop_words=False, - ascii_folding=False, - ) - - results = ds.to_table(full_text_query="世界") - - assert results["id"].to_pylist() == [0] - - def test_full_text_search_without_index(dataset): row = dataset.take(indices=[0], columns=["doc"]) query_text = row.column(0)[0].as_py() @@ -999,7 +978,7 @@ def test_fts_stats(dataset): params = stats["params"] assert params["with_position"] is False - assert params["base_tokenizer"] == "icu" + assert params["base_tokenizer"] == "simple" assert params["language"] == "English" assert params["max_token_length"] == 40 assert params["lower_case"] is True @@ -1469,7 +1448,7 @@ def test_fts_deleted_rows_with_stable_row_ids(tmp_path): # Regression test: stable-row-id prefiltering must not 
leak deleted rows. data = pa.table( { - "text": [f"dup {i}" for i in range(200)], + "text": [f"dup_{i}" for i in range(200)], "category": [["A", "B", "C", "D", "E"][i % 5] for i in range(200)], } ) @@ -4691,7 +4670,7 @@ def test_describe_indices(tmp_path, monkeypatch, fts_format_version): details = indices[0].details assert details is not None and len(details) > 0 assert details["lance_tokenizer"] is None - assert details["base_tokenizer"] == "icu" + assert details["base_tokenizer"] == "simple" assert details["language"] == "English" assert not details["with_position"] assert details["max_token_length"] == 40 diff --git a/rust/lance-index/src/scalar/inverted/tokenizer.rs b/rust/lance-index/src/scalar/inverted/tokenizer.rs index 58a5dd317f1..6024747025b 100644 --- a/rust/lance-index/src/scalar/inverted/tokenizer.rs +++ b/rust/lance-index/src/scalar/inverted/tokenizer.rs @@ -45,7 +45,7 @@ pub struct InvertedIndexParams { /// - `lindera/*`: Lindera tokenizer /// - `jieba/*`: Jieba tokenizer /// - /// `icu` is recommended for most cases and is the default value + /// `simple` is recommended for most cases and is the default value pub(crate) base_tokenizer: String, /// language for stemming and stop words @@ -184,7 +184,7 @@ fn default_max_ngram_length() -> u32 { impl Default for InvertedIndexParams { fn default() -> Self { - Self::new("icu".to_owned(), Language::English) + Self::new("simple".to_owned(), Language::English) } } @@ -192,11 +192,11 @@ impl InvertedIndexParams { /// Create a new `InvertedIndexParams` with the given base tokenizer and language. 
/// /// The `base_tokenizer` can be one of the following: - /// - `icu`: ICU dictionary-based word segmentation, default - /// - `simple`: splits tokens on whitespace and punctuation + /// - `simple`: splits tokens on whitespace and punctuation, default /// - `whitespace`: splits tokens on whitespace /// - `raw`: no tokenization /// - `ngram`: N-Gram tokenizer + /// - `icu`: ICU dictionary-based word segmentation /// - `lindera/*`: Lindera tokenizer /// - `jieba/*`: Jieba tokenizer /// @@ -442,22 +442,6 @@ mod tests { use super::InvertedIndexParams; use lance_tokenizer::TokenStream; - #[test] - fn test_default_uses_icu_tokenizer() { - assert_eq!(InvertedIndexParams::default().base_tokenizer, "icu"); - } - - #[test] - fn test_missing_details_base_tokenizer_uses_legacy_simple_default() { - let mut details = - crate::pbold::InvertedIndexDetails::try_from(&InvertedIndexParams::default()).unwrap(); - details.base_tokenizer = None; - - let params = InvertedIndexParams::try_from(&details).unwrap(); - - assert_eq!(params.base_tokenizer, "simple"); - } - #[test] fn test_build_only_fields_are_not_serialized() { let params = InvertedIndexParams::default() @@ -509,6 +493,7 @@ mod tests { #[test] fn test_build_icu_tokenizer() { let mut tokenizer = InvertedIndexParams::default() + .base_tokenizer("icu".to_string()) .stem(false) .remove_stop_words(false) .build() diff --git a/rust/lance/src/dataset/tests/dataset_index.rs b/rust/lance/src/dataset/tests/dataset_index.rs index a45de821f65..e785de7bee4 100644 --- a/rust/lance/src/dataset/tests/dataset_index.rs +++ b/rust/lance/src/dataset/tests/dataset_index.rs @@ -959,7 +959,7 @@ async fn test_fts_unindexed_data_with_stop_words() { .unwrap(); // Append unindexed rows with a term not in the index - let unindexed: Vec = (0..10).map(|i| format!("hello {i}")).collect(); + let unindexed: Vec = (0..10).map(|i| format!("hello_{i}")).collect(); let text_col = StringArray::from(unindexed); let batch = RecordBatch::try_new( 
arrow_schema::Schema::new(vec![Field::new("text", DataType::Utf8, false)]).into(), @@ -3176,7 +3176,6 @@ async fn test_sql_contains_tokens() { IndexType::Inverted, None, &InvertedIndexParams::default() - .base_tokenizer("simple".to_string()) .max_token_length(None) .stem(false), true, diff --git a/rust/lance/src/io/exec/fts.rs b/rust/lance/src/io/exec/fts.rs index 96129ea09fa..c55a357fa0f 100644 --- a/rust/lance/src/io/exec/fts.rs +++ b/rust/lance/src/io/exec/fts.rs @@ -38,7 +38,6 @@ use super::utils::{IndexMetrics, InstrumentedChildInputStream, build_prefilter}; use crate::index::scalar::inverted::{load_segment_details, load_segments}; use crate::{Dataset, index::DatasetIndexInternalExt}; use lance_index::metrics::MetricsCollector; -use lance_index::scalar::InvertedIndexParams; use lance_index::scalar::inverted::builder::ScoredDoc; use lance_index::scalar::inverted::builder::document_input; use lance_index::scalar::inverted::document_tokenizer::{DocType, JsonTokenizer, LanceTokenizer}; @@ -149,16 +148,11 @@ async fn search_segments( .unzip()) } -/// Fall back to the default base tokenizer when no on-disk FTS segment exists. +/// Fall back to the default simple tokenizer when no on-disk FTS segment exists. fn default_text_tokenizer() -> Box { - InvertedIndexParams::default() - .max_token_length(None) - .lower_case(false) - .stem(false) - .remove_stop_words(false) - .ascii_folding(false) - .build() - .expect("default FTS tokenizer should build") + Box::new(TextTokenizer::new( + TextAnalyzer::builder(SimpleTokenizer::default()).build(), + )) } pub struct FtsIndexMetrics {