diff --git a/docs/src/format/index/scalar/fts.md b/docs/src/format/index/scalar/fts.md
index c051f8d4bd5..792702aca08 100644
--- a/docs/src/format/index/scalar/fts.md
+++ b/docs/src/format/index/scalar/fts.md
@@ -56,7 +56,7 @@ The metadata file contains JSON-serialized configuration and partition informati
 
 | Field               | Type    | Default   | Description                                                    |
 |---------------------|---------|-----------|----------------------------------------------------------------|
-| `base_tokenizer`    | String  | "icu"     | Base tokenizer type (see Tokenizers section)                   |
+| `base_tokenizer`    | String  | "simple"  | Base tokenizer type (see Tokenizers section)                   |
 | `language`          | String  | "English" | Language for stemming and stop words                           |
 | `with_position`     | Boolean | false     | Store term positions for phrase queries (increases index size) |
 | `max_token_length`  | UInt32? | None      | Maximum token length (tokens longer than this are removed)     |
@@ -76,17 +76,17 @@ The full text search index supports multiple tokenizer types for different text
 
 | Tokenizer      | Description                                                               | Use Case               |
 |----------------|---------------------------------------------------------------------------|------------------------|
-| **icu**        | ICU dictionary-based Unicode word segmentation                            | Mixed-language text (default) |
-| **simple**     | Splits on whitespace and punctuation, removes non-alphanumeric characters | General ASCII-oriented text |
+| **simple**     | Splits on whitespace and punctuation, removes non-alphanumeric characters | General text (default) |
 | **whitespace** | Splits only on whitespace characters                                      | Preserve punctuation   |
 | **raw**        | No tokenization, treats entire text as single token                       | Exact matching         |
 | **ngram**      | Breaks text into overlapping character sequences                          | Substring/fuzzy search |
+| **icu**        | ICU dictionary-based Unicode word segmentation                            | Mixed-language text    |
 | **jieba/***    | Chinese text tokenizer with word segmentation                             | Chinese text           |
 | **lindera/***  | Japanese text tokenizer with morphological analysis                       | Japanese text          |
 
 #### ICU Tokenizer (Mixed-language text)
 
-The ICU tokenizer uses Unicode word boundary rules and dictionary-based segmentation for complex scripts. It is the default tokenizer because it handles mixed-language text where the `simple` tokenizer would keep an unspaced CJK span as one large token.
+The ICU tokenizer uses Unicode word boundary rules and dictionary-based segmentation for complex scripts. It is useful for mixed-language text where the default `simple` tokenizer would keep an unspaced CJK span as one large token.
 
 - **Models**: Uses compiled ICU4X segmenter data bundled with Lance
 - **Usage**: Specify as `icu`
diff --git a/docs/src/guide/json.md b/docs/src/guide/json.md
index 60d63172676..7246c8fe08a 100644
--- a/docs/src/guide/json.md
+++ b/docs/src/guide/json.md
@@ -292,7 +292,7 @@ on a single path, create an `INVERTED` index on the JSON column.
 dataset.create_scalar_index(
     "data",
     index_type="INVERTED",
-    base_tokenizer="icu",
+    base_tokenizer="simple",
     lower_case=True,
     stem=True,
     remove_stop_words=True,
diff --git a/docs/src/quickstart/full-text-search.md b/docs/src/quickstart/full-text-search.md
index e008bf8f76f..17327e40bc5 100644
--- a/docs/src/quickstart/full-text-search.md
+++ b/docs/src/quickstart/full-text-search.md
@@ -90,7 +90,7 @@ ds.create_scalar_index(
     index_type="INVERTED",
     name="text_idx",              # Optional index name (if omitted, default is "text_idx")
     with_position=False,          # Set True to enable phrase queries (stores token positions)
-    base_tokenizer="icu",         # Tokenizer: "icu" (default), "simple" (whitespace+punct), "whitespace", or "raw"
+    base_tokenizer="simple",      # Tokenizer: "simple" (default; whitespace+punct), "icu", "whitespace", or "raw" (no tokenization)
     language="English",           # Language used for stemming + stop words (only used if `stem` or `remove_stop_words` is True)
     max_token_length=40,          # Drop tokens longer than this length
     lower_case=True,              # Lowercase text before tokenization
@@ -103,13 +103,13 @@ ds.create_scalar_index(
 
 ### Tokenizer Options
 
-- **icu**: Unicode word segmentation with built-in ICU dictionaries (default)
-- **simple**: Splits tokens on whitespace and punctuation
+- **simple**: Splits tokens on whitespace and punctuation (default)
 - **whitespace**: Splits tokens only on whitespace
 - **raw**: No tokenization (useful for exact matching)
 
 Lance also supports multilingual tokenization:
 
+- **icu**: Unicode word segmentation with built-in ICU dictionaries
 - **jieba/default**: Chinese text tokenization using Jieba
 - **lindera/ipadic**: Japanese text tokenization using Lindera with IPAdic dictionary
 - **lindera/ko-dic**: Korean text tokenization using Lindera with Ko-dic dictionary
diff --git a/python/python/lance/dataset.py b/python/python/lance/dataset.py
index 4cbd39fdebb..70e6867d8dd 100644
--- a/python/python/lance/dataset.py
+++ b/python/python/lance/dataset.py
@@ -3107,11 +3107,11 @@ def create_scalar_index(
             ``[1, num_compute_cpus]``. If unset, Lance uses ``num_compute_cpus``
             workers unless ``LANCE_FTS_NUM_SHARDS`` is set. This parameter is
             only used for the current build and is not persisted with the index.
-        base_tokenizer: str, default "icu"
+        base_tokenizer: str, default "simple"
             This is for the ``INVERTED`` index. The base tokenizer to use. The
             value can be:
             * "icu": Unicode word segmentation with dictionary support for CJK and
               other scripts.
             * "simple": splits tokens on whitespace and punctuation.
             * "whitespace": splits tokens on whitespace.
             * "raw": no tokenization.
diff --git a/python/python/tests/compat/test_scalar_indices.py b/python/python/tests/compat/test_scalar_indices.py
index a01615abf53..35022df3b12 100644
--- a/python/python/tests/compat/test_scalar_indices.py
+++ b/python/python/tests/compat/test_scalar_indices.py
@@ -320,9 +320,7 @@ def create(self):
             max_rows_per_file=100,
             data_storage_version=safe_data_storage_version(self.compat_version),
         )
-        dataset.create_scalar_index(
-            "text", "INVERTED", with_position=True, base_tokenizer="simple"
-        )
+        dataset.create_scalar_index("text", "INVERTED", with_position=True)
 
     def check_read(self):
         """Verify FTS index can be queried."""
diff --git a/python/python/tests/test_scalar_index.py b/python/python/tests/test_scalar_index.py
index 73b6680cfff..7b4dede319b 100644
--- a/python/python/tests/test_scalar_index.py
+++ b/python/python/tests/test_scalar_index.py
@@ -694,27 +694,6 @@ def test_unindexed_full_text_search_on_empty_index(tmp_path):
     assert results.num_rows == 1
 
 
-def test_default_fts_tokenizer_handles_unspaced_multilingual_text(tmp_path):
-    data = pa.table(
-        {
-            "id": [0, 1],
-            "text": ["Hello, こんにちは世界!", "Hello, こんにちは!"],
-        }
-    )
-    ds = lance.write_dataset(data, tmp_path)
-    ds.create_scalar_index(
-        "text",
-        index_type="INVERTED",
-        stem=False,
-        remove_stop_words=False,
-        ascii_folding=False,
-    )
-
-    results = ds.to_table(full_text_query="世界")
-
-    assert results["id"].to_pylist() == [0]
-
-
 def test_full_text_search_without_index(dataset):
     row = dataset.take(indices=[0], columns=["doc"])
     query_text = row.column(0)[0].as_py()
@@ -999,7 +978,7 @@ def test_fts_stats(dataset):
     params = stats["params"]
 
     assert params["with_position"] is False
-    assert params["base_tokenizer"] == "icu"
+    assert params["base_tokenizer"] == "simple"
     assert params["language"] == "English"
     assert params["max_token_length"] == 40
     assert params["lower_case"] is True
@@ -1469,7 +1448,7 @@ def test_fts_deleted_rows_with_stable_row_ids(tmp_path):
     # Regression test: stable-row-id prefiltering must not leak deleted rows.
     data = pa.table(
         {
-            "text": [f"dup {i}" for i in range(200)],
+            "text": [f"dup_{i}" for i in range(200)],
             "category": [["A", "B", "C", "D", "E"][i % 5] for i in range(200)],
         }
     )
@@ -4691,7 +4670,7 @@ def test_describe_indices(tmp_path, monkeypatch, fts_format_version):
     details = indices[0].details
     assert details is not None and len(details) > 0
     assert details["lance_tokenizer"] is None
-    assert details["base_tokenizer"] == "icu"
+    assert details["base_tokenizer"] == "simple"
     assert details["language"] == "English"
     assert not details["with_position"]
     assert details["max_token_length"] == 40
diff --git a/rust/lance-index/src/scalar/inverted/tokenizer.rs b/rust/lance-index/src/scalar/inverted/tokenizer.rs
index 58a5dd317f1..6024747025b 100644
--- a/rust/lance-index/src/scalar/inverted/tokenizer.rs
+++ b/rust/lance-index/src/scalar/inverted/tokenizer.rs
@@ -45,7 +45,7 @@ pub struct InvertedIndexParams {
     /// - `lindera/*`: Lindera tokenizer
     /// - `jieba/*`: Jieba tokenizer
     ///
-    /// `icu` is recommended for most cases and is the default value
+    /// `simple` is recommended for most cases and is the default value
     pub(crate) base_tokenizer: String,
 
     /// language for stemming and stop words
@@ -184,7 +184,7 @@ fn default_max_ngram_length() -> u32 {
 
 impl Default for InvertedIndexParams {
     fn default() -> Self {
-        Self::new("icu".to_owned(), Language::English)
+        Self::new("simple".to_owned(), Language::English)
     }
 }
 
@@ -192,11 +192,11 @@ impl InvertedIndexParams {
     /// Create a new `InvertedIndexParams` with the given base tokenizer and language.
     ///
     /// The `base_tokenizer` can be one of the following:
-    /// - `icu`: ICU dictionary-based word segmentation, default
-    /// - `simple`: splits tokens on whitespace and punctuation
+    /// - `simple`: splits tokens on whitespace and punctuation, default
     /// - `whitespace`: splits tokens on whitespace
     /// - `raw`: no tokenization
     /// - `ngram`: N-Gram tokenizer
+    /// - `icu`: ICU dictionary-based word segmentation
     /// - `lindera/*`: Lindera tokenizer
     /// - `jieba/*`: Jieba tokenizer
     ///
@@ -442,22 +442,6 @@ mod tests {
     use super::InvertedIndexParams;
     use lance_tokenizer::TokenStream;
 
-    #[test]
-    fn test_default_uses_icu_tokenizer() {
-        assert_eq!(InvertedIndexParams::default().base_tokenizer, "icu");
-    }
-
-    #[test]
-    fn test_missing_details_base_tokenizer_uses_legacy_simple_default() {
-        let mut details =
-            crate::pbold::InvertedIndexDetails::try_from(&InvertedIndexParams::default()).unwrap();
-        details.base_tokenizer = None;
-
-        let params = InvertedIndexParams::try_from(&details).unwrap();
-
-        assert_eq!(params.base_tokenizer, "simple");
-    }
-
     #[test]
     fn test_build_only_fields_are_not_serialized() {
         let params = InvertedIndexParams::default()
@@ -509,6 +493,7 @@ mod tests {
     #[test]
     fn test_build_icu_tokenizer() {
         let mut tokenizer = InvertedIndexParams::default()
+            .base_tokenizer("icu".to_string())
             .stem(false)
             .remove_stop_words(false)
             .build()
diff --git a/rust/lance/src/dataset/tests/dataset_index.rs b/rust/lance/src/dataset/tests/dataset_index.rs
index a45de821f65..e785de7bee4 100644
--- a/rust/lance/src/dataset/tests/dataset_index.rs
+++ b/rust/lance/src/dataset/tests/dataset_index.rs
@@ -959,7 +959,7 @@ async fn test_fts_unindexed_data_with_stop_words() {
         .unwrap();
 
     // Append unindexed rows with a term not in the index
-    let unindexed: Vec<String> = (0..10).map(|i| format!("hello {i}")).collect();
+    let unindexed: Vec<String> = (0..10).map(|i| format!("hello_{i}")).collect();
     let text_col = StringArray::from(unindexed);
     let batch = RecordBatch::try_new(
         arrow_schema::Schema::new(vec![Field::new("text", DataType::Utf8, false)]).into(),
@@ -3176,7 +3176,6 @@ async fn test_sql_contains_tokens() {
             IndexType::Inverted,
             None,
             &InvertedIndexParams::default()
-                .base_tokenizer("simple".to_string())
                 .max_token_length(None)
                 .stem(false),
             true,
diff --git a/rust/lance/src/io/exec/fts.rs b/rust/lance/src/io/exec/fts.rs
index 96129ea09fa..c55a357fa0f 100644
--- a/rust/lance/src/io/exec/fts.rs
+++ b/rust/lance/src/io/exec/fts.rs
@@ -38,7 +38,6 @@ use super::utils::{IndexMetrics, InstrumentedChildInputStream, build_prefilter};
 use crate::index::scalar::inverted::{load_segment_details, load_segments};
 use crate::{Dataset, index::DatasetIndexInternalExt};
 use lance_index::metrics::MetricsCollector;
-use lance_index::scalar::InvertedIndexParams;
 use lance_index::scalar::inverted::builder::ScoredDoc;
 use lance_index::scalar::inverted::builder::document_input;
 use lance_index::scalar::inverted::document_tokenizer::{DocType, JsonTokenizer, LanceTokenizer};
@@ -149,16 +148,11 @@ async fn search_segments(
         .unzip())
 }
 
-/// Fall back to the default base tokenizer when no on-disk FTS segment exists.
+/// Fall back to the default simple tokenizer when no on-disk FTS segment exists.
 fn default_text_tokenizer() -> Box<dyn LanceTokenizer> {
-    InvertedIndexParams::default()
-        .max_token_length(None)
-        .lower_case(false)
-        .stem(false)
-        .remove_stop_words(false)
-        .ascii_folding(false)
-        .build()
-        .expect("default FTS tokenizer should build")
+    Box::new(TextTokenizer::new(
+        TextAnalyzer::builder(SimpleTokenizer::default()).build(),
+    ))
 }
 
 pub struct FtsIndexMetrics {