From a791beb4bb1c36909205551e7c20c60c25dc4aed Mon Sep 17 00:00:00 2001 From: Rudi Floren Date: Fri, 29 May 2026 17:06:17 +0200 Subject: [PATCH] fix(datafusion): coerce filter literals for dictionary-encoded columns SQL string filters on a dictionary-encoded column (e.g. `Dictionary(Int16, Utf8)`) failed to plan with "could not convert to literal of type 'Dictionary(...)'" because `safe_coerce_scalar` had no arm for a dictionary target and no `ScalarValue::Dictionary` input arm. This also silently lost scalar-index pushdown for `=`/`IN` predicates on dictionary columns. Add two generic guard clauses to `safe_coerce_scalar`: unwrap a dictionary literal and coerce its inner value, and coerce a value to a dictionary target by recursing on the value type and re-wrapping. Nulls keep their untyped form, matching the existing behavior for all targets. Enabling pushdown exposed a `todo!()` in `OrderableScalarValue::cmp` for `Dictionary` vs `Dictionary` (scalar indexes store dictionary columns as `Dictionary` scalar keys in a BTreeMap), which would have turned a previously-working full-scan query into a panic. Implement the comparison by delegating to the inner values. Co-Authored-By: Claude Opus 4.8 (1M context) --- rust/lance-datafusion/src/expr.rs | 82 ++++++++++++++++++++++++++ rust/lance-index/src/scalar/btree.rs | 33 ++++++++++- rust/lance/src/dataset/scanner.rs | 87 ++++++++++++++++++++++++++++ 3 files changed, 201 insertions(+), 1 deletion(-) diff --git a/rust/lance-datafusion/src/expr.rs b/rust/lance-datafusion/src/expr.rs index 79650f6775e..9a5db74693c 100644 --- a/rust/lance-datafusion/src/expr.rs +++ b/rust/lance-datafusion/src/expr.rs @@ -17,6 +17,16 @@ const MS_PER_DAY: i64 = 86400000; // will always yield "x = 7_u64" regardless of the type of the column "x". As a result, we // need to do that literal coercion ourselves. pub fn safe_coerce_scalar(value: &ScalarValue, ty: &DataType) -> Option { + // A dictionary target coerces the value to the dictionary's value type and + // re-wraps it as a dictionary literal. Nulls keep their untyped form, + // matching the existing `ScalarValue::Null` behavior for all targets. + if let DataType::Dictionary(key_type, value_type) = ty { + if value.is_null() { + return Some(value.clone()); + } + let inner = safe_coerce_scalar(value, value_type)?; + return Some(ScalarValue::Dictionary(key_type.clone(), Box::new(inner))); + } match value { ScalarValue::Int8(val) => match ty { DataType::Int8 => Some(value.clone()), @@ -436,6 +446,9 @@ pub fn safe_coerce_scalar(value: &ScalarValue, ty: &DataType) -> Option Some(value.clone()), _ => None, }, + // A dictionary-encoded literal (e.g. produced by DataFusion's dictionary + // cast in the scalar-index path) coerces by unwrapping its underlying value. + ScalarValue::Dictionary(_, inner) => safe_coerce_scalar(inner, ty), _ => None, } } @@ -773,6 +786,75 @@ mod tests { &DataType::BinaryView ), Some(ScalarValue::BinaryView(Some(vec![1, 2, 3]))) + ); + } + + #[test] + fn test_dictionary_coerce() { + let dict_ty = DataType::Dictionary(Box::new(DataType::Int16), Box::new(DataType::Utf8)); + + // A string literal coerces to a dictionary target by wrapping the + // coerced value in a dictionary scalar. + assert_eq!( + safe_coerce_scalar(&ScalarValue::Utf8(Some("com".to_string())), &dict_ty), + Some(ScalarValue::Dictionary( + Box::new(DataType::Int16), + Box::new(ScalarValue::Utf8(Some("com".to_string()))), + )) + ); + + // The inner value is coerced through to the dictionary value type, so a + // LargeUtf8 literal lands as a Utf8 value inside the dictionary. + assert_eq!( + safe_coerce_scalar(&ScalarValue::LargeUtf8(Some("com".to_string())), &dict_ty), + Some(ScalarValue::Dictionary( + Box::new(DataType::Int16), + Box::new(ScalarValue::Utf8(Some("com".to_string()))), + )) + ); + + // A dictionary literal round-trips back to its value type. + assert_eq!( + safe_coerce_scalar( + &ScalarValue::Dictionary( + Box::new(DataType::Int16), + Box::new(ScalarValue::Utf8(Some("com".to_string()))), + ), + &DataType::Utf8, + ), + Some(ScalarValue::Utf8(Some("com".to_string()))) + ); + + // A dictionary literal coerces to a dictionary target, adopting the + // target's key type. + assert_eq!( + safe_coerce_scalar( + &ScalarValue::Dictionary( + Box::new(DataType::Int32), + Box::new(ScalarValue::Utf8(Some("com".to_string()))), + ), + &dict_ty, + ), + Some(ScalarValue::Dictionary( + Box::new(DataType::Int16), + Box::new(ScalarValue::Utf8(Some("com".to_string()))), + )) + ); + + // Null literals keep their untyped form, matching the behavior for all + // other target types. + assert_eq!( + safe_coerce_scalar(&ScalarValue::Utf8(None), &dict_ty), + Some(ScalarValue::Utf8(None)) + ); + + // A value that cannot be coerced to the dictionary value type fails. + assert_eq!( + safe_coerce_scalar( + &ScalarValue::Utf8(Some("com".to_string())), + &DataType::Dictionary(Box::new(DataType::Int16), Box::new(DataType::Int32)), + ), + None ); } } diff --git a/rust/lance-index/src/scalar/btree.rs b/rust/lance-index/src/scalar/btree.rs index 6de490b9572..f8e0b2f1c75 100644 --- a/rust/lance-index/src/scalar/btree.rs +++ b/rust/lance-index/src/scalar/btree.rs @@ -581,7 +581,7 @@ impl Ord for OrderableScalarValue { } } (Struct(_arr), _) => panic!("Attempt to compare Struct with non-Struct"), - (Dictionary(_k1, _v1), Dictionary(_k2, _v2)) => todo!(), + (Dictionary(_k1, v1), Dictionary(_k2, v2)) => Self(*v1.clone()).cmp(&Self(*v2.clone())), (Dictionary(_, v1), Null) => Self(*v1.clone()).cmp(&Self(ScalarValue::Null)), (Dictionary(_, _), _) => panic!("Attempt to compare Dictionary with non-Dictionary"), // What would a btree of unions even look like? May not be possible. @@ -3036,6 +3036,37 @@ mod tests { assert!(size_of_many_i32 > 128 * 4); } + #[test] + fn test_orderable_dictionary_cmp() { + use arrow_schema::DataType; + use std::cmp::Ordering; + + let dict = |s: &str, key: DataType| { + OrderableScalarValue(ScalarValue::Dictionary( + Box::new(key), + Box::new(ScalarValue::Utf8(Some(s.to_string()))), + )) + }; + + // Dictionary scalars are ordered by their underlying value, regardless + // of the key type. This is exercised when loading a scalar index built + // on a dictionary-encoded column into a BTreeMap. + assert_eq!( + dict("a", DataType::Int16).cmp(&dict("b", DataType::Int16)), + Ordering::Less + ); + assert_eq!( + dict("b", DataType::Int32).cmp(&dict("b", DataType::Int16)), + Ordering::Equal + ); + + // A non-null dictionary value sorts after null. + assert_eq!( + dict("a", DataType::Int16).cmp(&OrderableScalarValue(ScalarValue::Null)), + Ordering::Greater + ); + } + #[tokio::test] async fn test_null_ids() { let tmpdir = TempObjDir::default(); diff --git a/rust/lance/src/dataset/scanner.rs b/rust/lance/src/dataset/scanner.rs index d0f5983b3ce..99c3a3669c0 100644 --- a/rust/lance/src/dataset/scanner.rs +++ b/rust/lance/src/dataset/scanner.rs @@ -8771,6 +8771,93 @@ full_filter=name LIKE Utf8(\"test%2\"), refine_filter=name LIKE Utf8(\"test%2\") ); } + /// Build an in-memory dataset with a single `Dictionary(Int16, Utf8)` column. + /// The dictionary cycles through "a", "b", "c" so each value appears in a + /// predictable, repeated pattern. + async fn dictionary_string_dataset() -> Dataset { + use arrow_array::{Int16Array, Int16DictionaryArray}; + + let schema = Arc::new(ArrowSchema::new(vec![ArrowField::new( + "etld", + DataType::Dictionary(Box::new(DataType::Int16), Box::new(DataType::Utf8)), + false, + )])); + + let dictionary = Arc::new(StringArray::from(vec!["a", "b", "c"])); + let indices = Int16Array::from((0..30).map(|i| i % 3).collect::>()); + let dict_array = Int16DictionaryArray::try_new(indices, dictionary).unwrap(); + + let batch = RecordBatch::try_new(schema.clone(), vec![Arc::new(dict_array)]).unwrap(); + let reader = RecordBatchIterator::new(vec![Ok(batch)], schema.clone()); + Dataset::write(reader, "memory://test_dict_filter", None) + .await + .unwrap() + } + + /// Regression test for filtering a dictionary-encoded string column via the + /// SQL string path (`Scanner::filter`). This used to fail to plan with + /// "could not convert to literal of type 'Dictionary(Int16, Utf8)'". + #[tokio::test] + async fn test_filter_on_dictionary_string_column() { + let dataset = dictionary_string_dataset().await; + + // Equality predicate. + let count = dataset + .scan() + .filter("etld = 'a'") + .unwrap() + .try_into_batch() + .await + .unwrap() + .num_rows(); + assert_eq!(count, 10); + + // IN-list predicate. + let count = dataset + .scan() + .filter("etld IN ('a', 'b')") + .unwrap() + .try_into_batch() + .await + .unwrap() + .num_rows(); + assert_eq!(count, 20); + } + + /// An `IN`/`=` predicate on a dictionary column with a scalar index should be + /// pushed down to the index rather than falling back to a full scan. + #[tokio::test] + async fn test_dictionary_string_column_uses_scalar_index() { + use lance_index::scalar::BuiltinIndexType; + + let mut dataset = dictionary_string_dataset().await; + let params = ScalarIndexParams::for_builtin(BuiltinIndexType::Bitmap); + dataset + .create_index(&["etld"], IndexType::Scalar, None, ¶ms, true) + .await + .unwrap(); + + let mut scanner = dataset.scan(); + scanner.filter("etld IN ('a', 'b')").unwrap(); + let plan = scanner.create_plan().await.unwrap(); + let plan_str = format!("{:?}", plan); + assert!( + plan_str.contains("ScalarIndexExec") || plan_str.contains("MaterializeIndex"), + "IN on a dictionary column should use the scalar index, but got: {}", + plan_str + ); + + let count = dataset + .scan() + .filter("etld IN ('a', 'b')") + .unwrap() + .try_into_batch() + .await + .unwrap() + .num_rows(); + assert_eq!(count, 20); + } + #[tokio::test] async fn test_like_prefix_with_segmented_zone_map() { use lance_index::scalar::BuiltinIndexType;