From eb7d399cb6f7228908592b0154a31ce9e434c773 Mon Sep 17 00:00:00 2001 From: Andrei Dragomir Date: Wed, 29 Apr 2026 16:34:55 +0300 Subject: [PATCH 1/5] [HSTACK] Deep projection physical --- Cargo.lock | 1 + datafusion/common/src/deep.rs | 854 ++++++ datafusion/common/src/lib.rs | 1 + datafusion/core/tests/core_integration.rs | 1 + .../deep_projections/billing/billing.parquet | Bin 0 -> 2677 bytes .../data/deep_projections/billing/billing.sql | 47 + .../delta_with_deep_struct_in_list/table.sql | 58 + .../values.parquet | Bin 0 -> 1773 bytes .../data/deep_projections/first-duckdb.sql | 66 + .../tests/data/deep_projections/first.parquet | Bin 0 -> 880 bytes .../deep_projections/genstudio/generate.sql | 140 + .../meta_asset_featurization.parquet | Bin 0 -> 753 bytes .../meta_asset_summary_metrics.parquet | Bin 0 -> 990 bytes ..._summary_metrics_by_age_and_gender.parquet | Bin 0 -> 1017 bytes .../gs_summary_metrics/data.sql | 2600 ++++++++++++++++ .../gs_summary_metrics/generate.py | 35 + .../gs_summary_metrics/generate.sql | 54 + .../gs_summary_metrics/generate2.sql | 2628 +++++++++++++++++ .../gs_summary_metrics.parquet | Bin 0 -> 7014 bytes .../deep_projections/identity_map/raw.parquet | Bin 0 -> 1391 bytes .../deep_projections/identity_map/raw.sql | 117 + .../list_struct_map/table.parquet | Bin 0 -> 3629 bytes .../list_struct_map/table.sql | 101 + .../mid_values_2/mid_values_2.sql | 210 ++ .../mid_values_2/midvalues.parquet | Bin 0 -> 4336 bytes .../triplea/extra_user_data.parquet | Bin 0 -> 691 bytes .../triplea/midvalues.parquet | Bin 0 -> 1056 bytes .../deep_projections/triplea/midvalues.sql | 79 + .../core/tests/optimizer_deep_indices/mod.rs | 1640 ++++++++++ datafusion/datasource-parquet/Cargo.toml | 1 + datafusion/datasource-parquet/src/leaves.rs | 498 ++++ datafusion/datasource-parquet/src/mod.rs | 2 + datafusion/datasource-parquet/src/opener.rs | 139 +- .../src/push_all_projection_hints.rs | 816 +++++ datafusion/datasource-parquet/src/source.rs | 6 + 35 
files changed, 10090 insertions(+), 4 deletions(-) create mode 100644 datafusion/common/src/deep.rs create mode 100644 datafusion/core/tests/data/deep_projections/billing/billing.parquet create mode 100644 datafusion/core/tests/data/deep_projections/billing/billing.sql create mode 100644 datafusion/core/tests/data/deep_projections/delta_with_deep_struct_in_list/table.sql create mode 100644 datafusion/core/tests/data/deep_projections/delta_with_deep_struct_in_list/values.parquet create mode 100644 datafusion/core/tests/data/deep_projections/first-duckdb.sql create mode 100644 datafusion/core/tests/data/deep_projections/first.parquet create mode 100644 datafusion/core/tests/data/deep_projections/genstudio/generate.sql create mode 100644 datafusion/core/tests/data/deep_projections/genstudio/meta_asset_featurization.parquet create mode 100644 datafusion/core/tests/data/deep_projections/genstudio/meta_asset_summary_metrics.parquet create mode 100644 datafusion/core/tests/data/deep_projections/genstudio/meta_asset_summary_metrics_by_age_and_gender.parquet create mode 100644 datafusion/core/tests/data/deep_projections/gs_summary_metrics/data.sql create mode 100644 datafusion/core/tests/data/deep_projections/gs_summary_metrics/generate.py create mode 100644 datafusion/core/tests/data/deep_projections/gs_summary_metrics/generate.sql create mode 100644 datafusion/core/tests/data/deep_projections/gs_summary_metrics/generate2.sql create mode 100644 datafusion/core/tests/data/deep_projections/gs_summary_metrics/gs_summary_metrics.parquet create mode 100644 datafusion/core/tests/data/deep_projections/identity_map/raw.parquet create mode 100644 datafusion/core/tests/data/deep_projections/identity_map/raw.sql create mode 100644 datafusion/core/tests/data/deep_projections/list_struct_map/table.parquet create mode 100644 datafusion/core/tests/data/deep_projections/list_struct_map/table.sql create mode 100644 datafusion/core/tests/data/deep_projections/mid_values_2/mid_values_2.sql 
create mode 100644 datafusion/core/tests/data/deep_projections/mid_values_2/midvalues.parquet create mode 100644 datafusion/core/tests/data/deep_projections/triplea/extra_user_data.parquet create mode 100644 datafusion/core/tests/data/deep_projections/triplea/midvalues.parquet create mode 100644 datafusion/core/tests/data/deep_projections/triplea/midvalues.sql create mode 100644 datafusion/core/tests/optimizer_deep_indices/mod.rs create mode 100644 datafusion/datasource-parquet/src/leaves.rs create mode 100644 datafusion/datasource-parquet/src/push_all_projection_hints.rs diff --git a/Cargo.lock b/Cargo.lock index 5f9b686d51689..7a48dd65e59b3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2086,6 +2086,7 @@ dependencies = [ "datafusion-physical-expr", "datafusion-physical-expr-adapter", "datafusion-physical-expr-common", + "datafusion-physical-optimizer", "datafusion-physical-plan", "datafusion-pruning", "datafusion-session", diff --git a/datafusion/common/src/deep.rs b/datafusion/common/src/deep.rs new file mode 100644 index 0000000000000..84310d3e2901b --- /dev/null +++ b/datafusion/common/src/deep.rs @@ -0,0 +1,854 @@ +use crate::{project_schema, Result}; +use arrow::array::{ new_null_array, Array, ArrayRef, AsArray, FixedSizeListArray, GenericListArray, MapArray, OffsetSizeTrait, RecordBatch, RecordBatchOptions, StructArray}; +use arrow::compute::{can_cast_types, cast, cast_with_options, CastOptions}; +use arrow::datatypes::{DataType, Field, FieldRef, Fields, Schema, SchemaRef}; +use log::{error, trace}; +use std::collections::HashMap; +use std::fs::OpenOptions; +use std::io::Write; +use std::sync::Arc; +use arrow::error::ArrowError; + +/// Check whether an Arrow [DataType] is recursive in the sense that we need to +/// look inside and continue unpacking the data +/// This is used when creating a schema based on a deep projection +pub fn data_type_recurs(dt: &DataType) -> bool { + match dt { + // scalars + DataType::Null + | DataType::Boolean + | DataType::Int8 + 
| DataType::Int16 + | DataType::Int32 + | DataType::Int64 + | DataType::UInt8 + | DataType::UInt16 + | DataType::UInt32 + | DataType::UInt64 + | DataType::Float16 + | DataType::Float32 + | DataType::Float64 + | DataType::Timestamp(_, _) + | DataType::Date32 + | DataType::Date64 + | DataType::Time32(_) + | DataType::Time64(_) + | DataType::Duration(_) + | DataType::Interval(_) + | DataType::Binary + | DataType::FixedSizeBinary(_) + | DataType::LargeBinary + | DataType::BinaryView + | DataType::Utf8 + | DataType::LargeUtf8 + | DataType::Utf8View + | DataType::Decimal32(_, _) + | DataType::Decimal64(_, _) + | DataType::Decimal128(_, _) + | DataType::Decimal256(_, _) + | DataType::Dictionary(_, _) => false, + // containers + DataType::RunEndEncoded(_, val) => data_type_recurs(val.data_type()), + DataType::Union(_, _) => true, + DataType::List(f) => data_type_recurs(f.data_type()), + DataType::ListView(f) => data_type_recurs(f.data_type()), + DataType::FixedSizeList(f, _) => data_type_recurs(f.data_type()), + DataType::LargeList(f) => data_type_recurs(f.data_type()), + DataType::LargeListView(f) => data_type_recurs(f.data_type()), + // list of struct + DataType::Map(_, _) => true, + DataType::Struct(_) => true, + } +} + +/// Mutually recursive with [can_rewrite_field] +/// checks whether we can rewrite a source [Fields] object to a destination one +/// the missing fields in the source behavior can be changed with the [`fill_missing_source_field`] +/// parameter. 
+pub fn can_rewrite_fields( + dst_fields: &Fields, + src_fields: &Fields, + fill_missing_source_fields: bool, +) -> bool { + let mut out = true; + for i in 0..dst_fields.len() { + let dst_field = Arc::clone(&dst_fields[i]); + let dst_name = dst_field.name(); + + let src_field_opt = src_fields + .iter() + .enumerate() + .find(|(_idx, b)| b.name() == dst_name); + + // if the field exists in the source + if let Some((_src_idx, src_field)) = src_field_opt { + let src_field = Arc::clone(src_field); + let can_cast = + can_rewrite_field(&dst_field, &src_field, fill_missing_source_fields); + out = out && can_cast; + } else { + out = out && fill_missing_source_fields; + } + } + out +} + +/// Mutually recursive with [can_rewrite_fielda] +/// checks whether we can rewrite a source [FieldRef] object to a destination one +/// the missing fields in the source behavior can be changed with the [`fill_missing_source_field`] +/// parameter. +pub fn can_rewrite_field( + dst_field: &FieldRef, + src_field: &FieldRef, + fill_missing_source_fields: bool, +) -> bool { + let can_cast_by_arrow = !data_type_recurs(dst_field.data_type()) + && !data_type_recurs(src_field.data_type()); + if can_cast_by_arrow { + return can_cast_types(src_field.data_type(), dst_field.data_type()); + } + match (src_field.data_type(), dst_field.data_type()) { + (DataType::List(src_inner), DataType::List(dst_inner)) + | (DataType::List(src_inner), DataType::LargeList(dst_inner)) + | (DataType::LargeList(src_inner), DataType::LargeList(dst_inner)) => { + if data_type_recurs(src_inner.data_type()) + && data_type_recurs(dst_inner.data_type()) + { + can_rewrite_field( + dst_inner, + src_inner, + fill_missing_source_fields, + ) + } else { + can_cast_types(src_inner.data_type(), dst_inner.data_type()) + } + } + ( + DataType::FixedSizeList(src_inner, src_sz), + DataType::FixedSizeList(dst_inner, dst_sz), + ) => { + if src_sz != dst_sz { + return false; + } + if data_type_recurs(src_inner.data_type()) + && 
data_type_recurs(dst_inner.data_type()) + { + can_rewrite_field( + dst_inner, + src_inner, + fill_missing_source_fields, + ) + } else { + can_cast_types(src_inner.data_type(), dst_inner.data_type()) + } + } + (DataType::Map(src_inner, _), DataType::Map(dst_inner, _)) => { + match (src_inner.data_type(), dst_inner.data_type()) { + (DataType::Struct(src_inner_f), DataType::Struct(dst_inner_f)) => { + can_rewrite_field( + &dst_inner_f[1], + &src_inner_f[1], + fill_missing_source_fields, + ) + } + _ => false, + } + } + (DataType::Struct(src_inner), DataType::Struct(dst_inner)) => { + can_rewrite_fields(dst_inner, src_inner, fill_missing_source_fields) + } + (DataType::Union(src_fields, src_mode), DataType::Union(dst_fields, dst_mode)) => { + src_fields == dst_fields && src_mode == dst_mode + } + (_src, _dest) => { + error!( + target: "deepschema", + " can_rewrite_field: Unhandled src dest field: src {}={:?}, dst {}={:?}", + src_field.name(), + src_field.data_type(), + dst_field.name(), + dst_field.data_type() + ); + false + } + } +} + +pub fn can_cast_datatype_deep(from: &DataType, to: &DataType, fill_missing_source_field: bool) -> bool { + let ffrom = Field::new("f1", from.clone(), true); + let fto = Field::new("f1", to.clone(), true); + can_rewrite_field(&Arc::new(fto), &Arc::new(ffrom), fill_missing_source_field) +} + +/// Deep projections are represented using a HashMap> +/// for backwards compatibility (current projections are represented using a [`Vec`] +/// Currently, deep projections are represented (even if there's some duplicated information as a +/// [`HashMap>`] +/// the key is the source field id of the top-level field +/// the value is a list of "paths" inside the top-level field +/// Examples: +/// Scalar fields - no representations of paths inside the field possible +/// List fields - same thing +/// List> - possible paths may be "*.id", "*.name", "*.address" +/// List>> +/// - possible paths may be "*.*", "*.*.*.id", "*.*.*.name", "*.*.*.address" +pub 
fn has_deep_projection(possible: &HashMap>) -> bool { + !(possible.is_empty() || possible.iter().all(|(_k, v)| v.is_empty())) +} + +/// Combines the current projection (numeric indices of top-level columns) with +/// the deep projection - "paths" inside a top-level column +pub fn splat_columns( + src: &SchemaRef, + projection: &[usize], + projection_deep: &HashMap>, +) -> Vec { + let mut out: Vec = vec![]; + for pi in projection.iter() { + let f = src.field(*pi); + match projection_deep.get(pi) { + None => { + out.push(f.name().to_owned()); + } + Some(rests) => { + if !rests.is_empty() { + for rest in rests.iter() { + out.push(format!("{}.{}", f.name(), rest)) + } + } else { + out.push(f.name().to_owned()); + } + } + } + } + out +} + +pub fn try_rewrite_schema_opt( + src: SchemaRef, + projection_opt: Option<&Vec>, + projection_deep_opt: Option<&HashMap>>, +) -> Result { + match projection_opt { + None => Ok(src), + Some(projection) => match projection_deep_opt { + None => project_schema(&src, projection_opt), + Some(projection_deep) => Ok(rewrite_schema(&src, projection, projection_deep)), + }, + } +} + +pub fn rewrite_field_projection( + src: &SchemaRef, + projected_field_idx: usize, + projection_deep: &HashMap>, +) -> FieldRef { + let original_field = Arc::new(src.field(projected_field_idx).clone()); + let single_field_schema = Arc::new(Schema::new(vec![original_field])); + // rewrite projection, deep projection to use 0 + let projected_vec = vec![0]; + let mut projected_deep_vec = HashMap::new(); + let empty_vec: Vec = vec![]; + projected_deep_vec.insert( + 0usize, + projection_deep + .get(&projected_field_idx) + .unwrap_or(&empty_vec) + .clone(), + ); + + let rewritten_single_field_schema = + rewrite_schema(&single_field_schema, &projected_vec, &projected_deep_vec); + Arc::new(rewritten_single_field_schema.field(0).clone()) +} + +fn make_path(parent: &str, name: &str) -> String { + if parent.is_empty() { + name.to_owned() + } else { + format!("{parent}.{name}") 
+ } +} + +fn path_prefix_exists(filters: &[String], path: &String) -> bool { + filters.iter().any(|f| { + let tmp = f.find(path); + tmp.is_some() && tmp.unwrap() == 0 + }) +} + +fn path_included(filters: &[String], path: &str) -> bool { + filters.iter().any(|f| { + let tmp = path.find(f); + tmp.is_some() && tmp.unwrap() == 0 + }) +} + +pub fn rewrite_schema( + src: &SchemaRef, + projection: &Vec, + projection_deep: &HashMap>, +) -> SchemaRef { + trace!(target: "deepschema", "rewrite_schema: projection={projection:?}, projection_deep={projection_deep:?}, input schema={src:#?}"); + + const FLAG_PARENT_IS_LIST: u8 = 0x2; + const FLAG_PARENT_IS_MAP: u8 = 0x4; + + fn rewrite_schema_fields( + parent: &str, + parent_flags: u8, + src_fields: &Fields, + filters: &Vec, + ) -> Vec { + let mut out_fields: Vec = vec![]; + for i in 0..src_fields.len() { + let src_field = Arc::clone(&src_fields[i]); + let src_field_path = make_path(parent, src_field.name()); + // trace!(target:"deep", "rewrite_schema_fields: parent={}, src_field_path={}, filters={:?}, field={:#?}", parent, src_field_path, filters, src_field); + trace!(target:"deepschema", "rewrite_schema_fields: parent={parent}, src_field_path={src_field_path}, filters={filters:?}, parent_flags: {parent_flags}"); + + let field_path_included = path_included(filters, &src_field_path); //filters.contains(&src_field_path); + let mut field_path_with_star_included = false; + if parent_flags & FLAG_PARENT_IS_LIST > 0 { + let mut src_field_path_with_star = src_field_path.clone(); + src_field_path_with_star.push_str(".*"); + field_path_with_star_included = + path_included(filters, &src_field_path_with_star); + } + + if field_path_included || field_path_with_star_included { + out_fields.push(Arc::clone(&src_field)); + } else if data_type_recurs(src_field.data_type()) + && path_prefix_exists(filters, &src_field_path) + { + match rewrite_schema_field(parent, 0, &src_field, filters) { + None => {} + Some(f) => out_fields.push(f), + } + } + } 
+ out_fields + } + + fn rewrite_schema_field( + parent: &str, + parent_flags: u8, + src_field: &FieldRef, + filters: &Vec, + ) -> Option { + let src_field_name = src_field.name(); + // FIXME: @HStack + // if we already navigated to this field and the accessor is "*" + // that means we don't care about the field name + // RETEST THIS for lists + let comes_from_list_and_last_is_set = !parent.is_empty() + && (parent.ends_with('*') + && (parent_flags & FLAG_PARENT_IS_LIST) > 0); + let comes_from_map_and_last_is_set = !parent.is_empty() + && (parent.ends_with('*') + && (parent_flags & FLAG_PARENT_IS_MAP) > 0); + + let src_field_path = + if comes_from_list_and_last_is_set || comes_from_map_and_last_is_set { + parent.to_string() + } else { + make_path(parent, src_field_name) + }; + // trace!(target:"deep", "rewrite_schema_field: src_field_name={}, src_field_path={}, filters={:?}, field={:#?}", src_field_name, src_field_path, filters, src_field); + trace!(target:"deepschema", "rewrite_schema_field: src_field_name={src_field_name}, src_field_path={src_field_path}, filters={filters:?}, parent_flags={parent_flags}"); + trace!(target:"deepschema", "rewrite_schema_field: src_field_type={:?}", &src_field.data_type()); + let field_path_included = path_included(filters, &src_field_path); //filters.contains(&src_field_path); + if field_path_included { + trace!(target:"deepschema", " rewrite_schema_field return {src_field_path} directly "); + Some(Arc::clone(src_field)) + } else if data_type_recurs(src_field.data_type()) + && path_prefix_exists(filters, &src_field_path) + { + let out = match src_field.data_type() { + DataType::List(src_inner) => { + rewrite_schema_field( + make_path(src_field_path.as_str(), "*").as_str(), + FLAG_PARENT_IS_LIST, + src_inner, + filters, + ) + .map(|inner| { + trace!(target:"deepschema", "return new list {} = {:#?}", src_field_name.clone(), Arc::clone(&inner)); + Arc::new(Field::new_list( + src_field.name(), + inner, + src_field.is_nullable(), + )) + 
}) + } + DataType::FixedSizeList(src_inner, src_sz) => rewrite_schema_field( + make_path(src_field_path.as_str(), "*").as_str(), + FLAG_PARENT_IS_LIST, + src_inner, + filters, + ) + .map(|inner| { + Arc::new(Field::new_fixed_size_list( + src_field.name(), + inner, + *src_sz, + src_field.is_nullable(), + )) + }), + DataType::LargeList(src_inner) => rewrite_schema_field( + make_path(&src_field_path, "*").as_str(), + FLAG_PARENT_IS_LIST, + src_inner, + filters, + ) + .map(|inner| { + Arc::new(Field::new_large_list( + src_field.name(), + inner, + src_field.is_nullable(), + )) + }), + + DataType::Map(map_entry, map_sorted) => { + #[allow(clippy::get_first)] + if let DataType::Struct(map_entry_fields) = map_entry.data_type() + { + let map_key_field = map_entry_fields.get(0).unwrap(); + let map_value_field = map_entry_fields.get(1).unwrap(); + rewrite_schema_field( + make_path(&src_field_path, "*").as_str(), + FLAG_PARENT_IS_MAP, + map_value_field, + filters, + ) + .map(|inner| { + Arc::new(Field::new_map( + src_field_name, + map_entry.name().clone(), + Arc::clone(map_key_field), + inner, + *map_sorted, + src_field.is_nullable(), + )) + }) + } else { + panic!("Invalid internal field map: expected struct, but got {}", map_entry.data_type()); + } + } + + DataType::Struct(src_inner) => { + let dst_fields = + rewrite_schema_fields(src_field_path.as_str(), parent_flags, src_inner, filters); + trace!(target:"deepschema", "for struct: {} {} = {:#?}", src_field_name, src_field_path.clone(), dst_fields); + if !dst_fields.is_empty() { + Some(Arc::new(Field::new_struct( + src_field.name(), + dst_fields, + src_field.is_nullable(), + ))) + } else { + None + } + } + x => { + panic!("Unhandled data type: {x:#?}"); + } + }; + out + } else { + None + } + } + + let actual_projection = if projection.is_empty() { + (0..src.fields().len()).collect() + } else { + projection.clone() + }; + let splatted = splat_columns(src, &actual_projection, projection_deep); + + // trace!(target:"deep", 
"rewrite_schema source: {:#?}", src); + trace!(target:"deepschema", "rewrite_schema splatted: {:?} {:?} = {:?}", &actual_projection, &projection_deep, splatted); + let mut dst_fields: Vec = vec![]; + for pi in actual_projection.iter() { + let src_field = src.field(*pi); + trace!(target:"deepschema", "rewrite_schema at field {}, splatted={:?}", src_field.name(), &splatted); + let foutopt = rewrite_schema_field( + "", + 0, + &Arc::new(src_field.clone()), + &splatted, + ); + match foutopt { + None => {} + Some(fout) => { + dst_fields.push(Arc::clone(&fout)); + } + } + } + + // let dst_fields = rewrite_fields("".to_string(), src.clone().fields(), &splatted); + trace!(target:"deepschema", "rewrite_schema dst_fields: {dst_fields:#?}"); + + if !dst_fields.is_empty() { + Arc::new(Schema::new_with_metadata(dst_fields, src.metadata.clone())) + } else { + Arc::clone(src) + } +} + +pub fn debug_to_file(name: &str, contents: &str) { + let mut file = OpenOptions::new() + .write(true) + .truncate(true) + .create(true) + .open(name) + .unwrap(); + file.write_all(contents.as_bytes()).unwrap(); +} + +fn cast_struct( + struct_array: &StructArray, + fields: &Fields, + cast_options: &CastOptions, + add_missing: bool, +) -> std::result::Result { + let num_rows = struct_array.len(); + StructArray::try_new_with_length( + fields.to_owned(), + fields + .iter() + .map(|field| { + let col_or_not = struct_array.column_by_name(field.name()); + match col_or_not { + None => { + if add_missing && field.is_nullable() { + Ok(new_null_array(field.data_type(), struct_array.len())) + } else { + Err(ArrowError::SchemaError(format!( + "Could not find column {}", + field.name() + ))) + } + } + Some(col) => cast_field(col, field, cast_options, add_missing), + } + }) + .collect::, _>>()?, + struct_array.nulls().map(ToOwned::to_owned), + num_rows, + ) +} + +fn cast_list( + array: &GenericListArray, + field: &FieldRef, + cast_options: &CastOptions, + add_missing: bool, +) -> std::result::Result, ArrowError> { 
+ let values = cast_field(array.values(), field, cast_options, add_missing)?; + GenericListArray::::try_new( + Arc::clone(field), + array.offsets().clone(), + values, + array.nulls().cloned(), + ) +} + +fn cast_map( + array: &MapArray, + entries_field: &FieldRef, + sorted: bool, + cast_options: &CastOptions, + add_missing: bool, +) -> std::result::Result { + match entries_field.data_type() { + DataType::Struct(entry_fields) => { + let entries = cast_struct(array.entries(), entry_fields, cast_options, add_missing)?; + MapArray::try_new( + Arc::clone(entries_field), + array.offsets().to_owned(), + entries, + array.nulls().cloned(), + sorted, + ) + } + _ => Err(ArrowError::CastError( + "Map entries must be a struct".to_string(), + )), + } +} + +fn cast_field( + col: &ArrayRef, + field: &FieldRef, + cast_options: &CastOptions, + add_missing: bool, +) -> std::result::Result { + let (col_type, field_type) = (col.data_type(), field.data_type()); + + match (col_type, field_type) { + (DataType::Struct(_), DataType::Struct(child_fields)) => { + let child_struct = StructArray::from(col.into_data()); + Ok(Arc::new(cast_struct( + &child_struct, + child_fields, + cast_options, + add_missing, + )?) 
as ArrayRef) + } + (DataType::FixedSizeList(_, _), DataType::FixedSizeList(child_fields, _)) => { + let to_type = + DataType::new_list(child_fields.data_type().clone(), child_fields.is_nullable()); + let col = cast( + col.as_any() + .downcast_ref::() + .ok_or_else(|| { + ArrowError::CastError(format!( + "Failed to convert a FixedSizeList into a new list {} ({col_type})", + field.name() + )) + })?, + &to_type, + )?; + // Once the FixedSizeList has been converted to a regular list, go through the usual + // list casting code + cast_field(&col, field, cast_options, add_missing) + } + (DataType::List(_), DataType::List(child_fields)) => Ok(Arc::new(cast_list( + col.as_any() + .downcast_ref::>() + .ok_or_else(|| { + ArrowError::CastError(format!( + "Expected a list for {} but got {col_type}", + field.name(), + )) + })?, + child_fields, + cast_options, + add_missing, + )?) as ArrayRef), + (DataType::LargeList(_), DataType::LargeList(child_fields)) => Ok(Arc::new(cast_list( + col.as_any() + .downcast_ref::>() + .ok_or_else(|| { + ArrowError::CastError(format!( + "Expected a list for {} but got {col_type}", + field.name(), + )) + })?, + child_fields, + cast_options, + add_missing, + )?) as ArrayRef), + // TODO: add list view cast + (DataType::Map(_, _), DataType::Map(child_fields, sorted)) => Ok(Arc::new(cast_map( + col.as_map_opt().ok_or_else(|| { + ArrowError::CastError(format!( + "Expected a map for {} but got {col_type}", + field.name(), + )) + })?, + child_fields, + *sorted, + cast_options, + add_missing, + )?) 
as ArrayRef), + _ if is_cast_required(col_type, field_type) => { + cast_with_options(col, field_type, cast_options).map_err(|err| { + if let ArrowError::CastError(err) = err { + ArrowError::CastError(format!( + "Failed to cast {} from {field_type} to {col_type}: {err}", + field.name(), + )) + } else { + err + } + }) + } + _ => Ok(Arc::clone(col)), + } +} + +fn is_cast_required(a: &DataType, b: &DataType) -> bool { + match (a, b) { + (DataType::List(a_item), DataType::List(b_item)) => { + // If list item name is not the default('item') the list must be casted + !a.equals_datatype(b) || a_item.name() != b_item.name() + } + (_, _) => !a.equals_datatype(b), + } +} + +/// Cast recordbatch to a new target_schema, by casting each column array +pub fn cast_record_batch( + batch: &RecordBatch, + target_schema: SchemaRef, + safe: bool, + add_missing: bool, +) -> Result { + let cast_options = CastOptions { + safe, + ..Default::default() + }; + + let mut s = StructArray::try_new_with_length( + batch.schema().as_ref().to_owned().fields, + batch.columns().to_owned(), + None, + batch.num_rows(), + )?; + s = cast_struct(&s, target_schema.fields(), &cast_options, add_missing)?; + + Ok(RecordBatch::try_new_with_options( + target_schema, + s.columns().to_vec(), + &RecordBatchOptions::new().with_row_count(Some(batch.num_rows())), + )?) 
+} + + +#[cfg(test)] +mod tests { + use std::collections::HashMap; + use crate::deep::{can_rewrite_fields, rewrite_schema}; + use arrow::datatypes::{DataType, Field, Fields, Schema, TimeUnit}; + use std::sync::Arc; + + #[test] + fn test_schema_rewrite_with_similar_column_names() { + let schema = Arc::new(Schema::new(vec![ + Field::new("car", DataType::Utf8, true), + Field::new("cars", DataType::Utf8, true), + Field::new("amount", DataType::Utf8, true), + Field::new("amounts", DataType::Utf8, true) + ])); + let projection = vec![1]; + let mut projection_deep = HashMap::new(); + projection_deep.insert(1, vec![]); + let rewritten_schema = rewrite_schema(&schema, &projection, &projection_deep); + assert_eq!(rewritten_schema.fields().len(), 1); + assert_eq!(rewritten_schema.field(0).name(), "cars"); + + let projection = vec![0]; + let mut projection_deep = HashMap::new(); + projection_deep.insert(0, vec![]); + let rewritten_schema = rewrite_schema(&schema, &projection, &projection_deep); + assert_eq!(rewritten_schema.fields().len(), 1); + assert_eq!(rewritten_schema.field(0).name(), "car"); + + } + + #[test] + fn test_cast() -> crate::error::Result<()> { + // source, destination, is_fill_dependent + let cases = [( + Arc::new(Schema::new(vec![Field::new("i1", DataType::Int32, true)])), + Arc::new(Schema::new(vec![Field::new("i1", DataType::Int8, true)])), + false, + true, + ), + ( + Arc::new(Schema::new(vec![Field::new("i1", DataType::Int32, true)])), + Arc::new(Schema::new(vec![Field::new( + "i1", + DataType::Struct(Fields::from(vec![Field::new( + "s1", + DataType::Utf8, + true, + )])), + true, + )])), + false, + false, + ), + ( + Arc::new(Schema::new(vec![Field::new( + "l1", + DataType::List(Arc::new(Field::new( + "s1", + DataType::Struct(Fields::from(vec![ + Field::new("s1extra1", DataType::Utf8, true), + Field::new("s1extra2", DataType::Utf8, true), + Field::new("s1i2", DataType::Int32, true), + Field::new("s1s1", DataType::Utf8, true), + Field::new( + "s1m1", + 
DataType::Map( + Arc::new(Field::new( + "entries", + DataType::Struct(Fields::from(vec![ + Field::new("key", DataType::Utf8, false), + Field::new("value", DataType::Utf8, false), + ])), + true, + )), + false, + ), + true, + ), + Field::new( + "s1l1", + DataType::List(Arc::new(Field::new( + "s1l1i1", + DataType::Int32, + true, + ))), + true, + ), + ])), + true, + ))), + true, + )])), + Arc::new(Schema::new(vec![Field::new( + "l1", + DataType::List(Arc::new(Field::new( + "s1", + DataType::Struct(Fields::from(vec![ + Field::new("s1s1", DataType::Utf8, true), + Field::new("s1i2", DataType::Int32, true), + Field::new( + "s1m1", + DataType::Map( + Arc::new(Field::new( + "entries", + DataType::Struct(Fields::from(vec![ + Field::new("key", DataType::Utf8, false), + Field::new("value", DataType::Utf8, false), + ])), + true, + )), + false, + ), + true, + ), + Field::new( + "s1l1", + DataType::List(Arc::new(Field::new( + "s1l1i1", + DataType::Date32, + true, + ))), + true, + ), + // extra field + Field::new("s1ts1", DataType::Time32(TimeUnit::Second), true), + ])), + true, + ))), + true, + )])), + true, + true, + )]; + for (from, to, can_fill, res) in cases.iter() { + assert_eq!( + can_rewrite_fields(from.fields(), to.fields(), *can_fill), + *res, + "Wrong result" + ); + } + Ok(()) + } +} diff --git a/datafusion/common/src/lib.rs b/datafusion/common/src/lib.rs index fdd04f752455e..b0867d8ce2e12 100644 --- a/datafusion/common/src/lib.rs +++ b/datafusion/common/src/lib.rs @@ -61,6 +61,7 @@ pub mod test_util; pub mod tree_node; pub mod types; pub mod utils; +pub mod deep; /// Reexport arrow crate pub use arrow; pub use column::Column; diff --git a/datafusion/core/tests/core_integration.rs b/datafusion/core/tests/core_integration.rs index bdbe72245323d..28e120f5fd741 100644 --- a/datafusion/core/tests/core_integration.rs +++ b/datafusion/core/tests/core_integration.rs @@ -60,6 +60,7 @@ mod catalog_listing; /// Run all tests that are found in the `tracing` directory mod tracing; 
+mod optimizer_deep_indices; #[cfg(test)] #[ctor::ctor] fn init() { diff --git a/datafusion/core/tests/data/deep_projections/billing/billing.parquet b/datafusion/core/tests/data/deep_projections/billing/billing.parquet new file mode 100644 index 0000000000000000000000000000000000000000..a421c523f6ee679dc9d746c5f60c1423a658ca0b GIT binary patch literal 2677 zcmb_ezi%T&6n-ABosDsPOYp4L(iSexBO@IV4wus*rATrc(lx1xXWVx&z z*_%MR6o>**Tt$ga6jxGEQBY9k51>F4CqzL_jZ*OD&3e7NVK3r@<=y%9zVCZ)-<#Rp z-+iyg4Q}z~L(Vt=m@W*SZa3ek0tdd%?OQl)@hYzZthN~%uu_QuLYmx(A+?*h2tz*_ z9rlVqOo4#cxupm}6IPp=@I?y^zMev?ID~jft^##~*LWk2rDByZmclT~I(8(A?R5y@ zb%6ZClAlL2|2hzc9;>W4W%Mc-7a|IBB2VQcx(;OWdMsnq&)$XqH}B-wj_U#}E)3)x zuN~c5*kDTA*x=7pYdmkh1F;pMR3B85lPnTuUe#mW*V=^v7a|5I`lw!K^JkO ztwp1pIdRYpSrtONtXlvhXHJyofVS#qf+G$O!NXRdjWtabU?7C39;lhNbx(3_7AW$5z;ZuSmyR-+)4^T-d6JI#k*ue-NM ze2Y0v&mSFo^Rsy*Lobw(KlUR+)e-e7txZp6Be|!}hX>;hW=yTDjizB3M5$S&QVS-Z zpvnNXIybQrm2fcc%Oe@du?ALV)6X;w0dVw4&gQ{%q5z7yhFfd&7>URVJzVpA-=9op z(V_I`5odsNt-;Jcl}CHisj|GU_DLUo502(|(h*GqK^{_}RdjYN7k*o6I+Wp1&L$`7 zzF4Txo?eW~UXX-Z^JpIqViu-?@^KNz0H8?ginRN$$-9~~QXJZ!il-$p zah<9@>lZD_<>o*b*a3V}MJ&1&6YalJ+L*K=RBc(a3G}=2((M-rrfNHZ{*r}St-INR zFD~_5i4)`d_DAA-LwsqtPN|1~N-p(4eDO*urT;G#S^dggw7O=qb0B`MX99H8E+(Q6?Xh@X6+cv6r;M**x)DV( z0`cuy7M$Ali8B`~;+qxa4852aZR|{D4#Z``$MgY^oEb#-$UYFyOmU7!fT3R7`c71- zF^Jz9S+_C=P2$*-Rq-5|a=TbNmfh_je!QU=?cYtC2@|^AKE5ydd+k$w47X2v8=D)w ec6WFZ9F5y|<<_Pg-G1xst=D0Q---M9!TAr}lFlXo literal 0 HcmV?d00001 diff --git a/datafusion/core/tests/data/deep_projections/billing/billing.sql b/datafusion/core/tests/data/deep_projections/billing/billing.sql new file mode 100644 index 0000000000000..09fec434ae664 --- /dev/null +++ b/datafusion/core/tests/data/deep_projections/billing/billing.sql @@ -0,0 +1,47 @@ +CREATE OR REPLACE TABLE billing ( + timestamp TIMESTAMP_S, + _acp_system_metadata STRUCT( + acp_sourceBatchId VARCHAR, + commitBatchId VARCHAR, + ingestTime INT8, + isDeleted BOOL, + rowId VARCHAR, + rowVersion INT8, + trackingId VARCHAR + ), + 
_aaanortheast STRUCT( + TravelBookingEventDetails STRUCT( + aaa_traveltype VARCHAR, + aaa_membernumber VARCHAR, + extra VARCHAR + ) + ), + _id VARCHAR, + _eventType VARCHAR, + + _ACP_DATE DATE, + _ACP_BATCHID VARCHAR +); + +INSERT INTO billing VALUES( + '2025-02-01 13:00:00', + { + acp_sourceBatchId: 'batch1', + isDeleted: false + }, + { + TravelBookingEventDetails: { + aaa_traveltype: '1', + aaa_membernumber: 'm1', + extra: 'extra1' + } + }, + 'id1', + 'event1', + '2025-01-01', + 'batch1' +); + +COPY billing TO 'billing.parquet' (FORMAT PARQUET); + + diff --git a/datafusion/core/tests/data/deep_projections/delta_with_deep_struct_in_list/table.sql b/datafusion/core/tests/data/deep_projections/delta_with_deep_struct_in_list/table.sql new file mode 100644 index 0000000000000..380b5fa0d998b --- /dev/null +++ b/datafusion/core/tests/data/deep_projections/delta_with_deep_struct_in_list/table.sql @@ -0,0 +1,58 @@ +CREATE OR REPLACE TABLE values( + _acp_system_metadata STRUCT( + acp_sourceBatchId VARCHAR, + commitBatchId VARCHAR + ), + _id varchar, + productListItems STRUCT( + SKU VARCHAR, + quantity INT4, + priceTotal FLOAT, + _experience STRUCT( + analytics STRUCT( + customDimensions STRUCT( + eVars STRUCT( + evar1 VARCHAR, + evar2 VARCHAR + ) + ), + events STRUCT( + event1 STRUCT( + value FLOAT + ), + event2 STRUCT( + value FLOAT + ) + ) + ) + ) + )[] +); + +INSERT INTO values VALUES +( + { + acp_sourceBatchId: 'b1', + commitBatchId: 'b1', + }, + 'id1', + [ + { + SKU: 'sku1', quantity: 1, priceTotal: 100, + _experience: { + analytics: { + customDimensions: { + eVars: { evar1: 'ev1', evar2: 'ev2' } + }, + events: { + event1: { value: 1.5 }, + event2: { value: 3.2 } + } + } + } + } + ] +); + +COPY values TO 'values.parquet' (FORMAT PARQUET); + diff --git a/datafusion/core/tests/data/deep_projections/delta_with_deep_struct_in_list/values.parquet b/datafusion/core/tests/data/deep_projections/delta_with_deep_struct_in_list/values.parquet new file mode 100644 index 
0000000000000000000000000000000000000000..56521122591a2d4db9095d14af34fbef701d67d4 GIT binary patch literal 1773 zcmcIlPfrs;6rWDFxGE$NnaS1=(-_%QLkO*stfmoc2mx&j@ekK#x;usq-L~$|HulcJ zGatZD;Ms#G4jw${-Ou1>;NW{RWnI8fV)W3Lw==)_{ocGk)14#> zWO|7bLMTPsxy1HOb}M&!W<{Sa6ybCj;PEP3%I(j$^ghwnvH+FrrwFUCJ2}{FBM&P8 zYyyOkuT6ls>|T!L##WBS#*pL?wI-wL7t7g%9IZ`&eE-&3XS96GSBWzdr2c-;X(#@k zj09wPl{)in;Sbu$C{Z$MM^cGED8VekrWdI*r}z44?8~N5{@!-52r0mw8NVMzp*pi# zY=?*d+Sx(e2Q}3S6SWQ02~-yJEXV4?3Zsa*BD*q@J;iLpS$z5IHS$y7x8J6sr$RMa zyiJ%53)cp5=*w4qCAxU_dRx95$T*ZeUmCWnq9?i|75WK8j}iE3qWaNB2og!y??D8( zT#~yYMu@&fNe)F^JGqqps+$W;$svR`<}x2OV9>FYhN7EFWHTG}GaHqM!=arT{y3?Qy}3E)n+cSsi;~f8PNZ|Na2ns>>+V6J;S^zN zw1EgR|8xvn=W48Lds0=brlx$#pSxdx(!vol*30=BDaBk(AkErBGdKCBdq{y9C(~Fj z=V#3JC75&lm2v+-ZX9W2y_}zM>k7ZWI2HM3{oSVL8q~C=X-OT!7H_#9ick!AFSPM8 z??3B9M?){7{!gP{NBuhL8!yw>x&5pAk0)$ATV|%R)PJ&c>W|hi^+#)%`V;?OOV`gx zAEPuro5p%MUrY1D5~r@sKbJ_gx&bfhMzb=MF}^XCVQu-r^8HG+lZM@(64V0U^WHpM OUAapRN`&mdAKfo#R*Lrk literal 0 HcmV?d00001 diff --git a/datafusion/core/tests/data/deep_projections/first-duckdb.sql b/datafusion/core/tests/data/deep_projections/first-duckdb.sql new file mode 100644 index 0000000000000..ec699c32d4069 --- /dev/null +++ b/datafusion/core/tests/data/deep_projections/first-duckdb.sql @@ -0,0 +1,66 @@ +CREATE OR REPLACE TABLE cross_industry_demo_data ( + _ACP_DATE DATE, + timestamp TIMESTAMP_S, + endUserIDs STRUCT( + aaid_id VARCHAR, + extra INT4 + ), + _experience STRUCT( + eVar56 VARCHAR, + extra VARCHAR + ) +); + +INSERT INTO cross_industry_demo_data VALUES +( + '2025-01-03', + '2025-01-03 02:00:30', + ROW( + 'd1', + 1, + ), + ROW( + 'u1', + 'extra1', + ), +); +INSERT INTO cross_industry_demo_data VALUES +( + '2025-01-03', + '2025-01-03 02:00:30', + ROW( + 'd1', + 1, + ), + ROW( + 'u2', + 'extra1', + ), +); +INSERT INTO cross_industry_demo_data VALUES +( + '2025-01-03', + '2025-01-03 02:00:30', + ROW( + 'd2', + 1, + ), + ROW( + 'u1', + 'extra1', + ), +); +INSERT INTO cross_industry_demo_data VALUES +( + 
'2025-01-03', + '2025-01-03 02:00:30', + ROW( + 'd2', + 1, + ), + ROW( + 'u2', + 'extra1', + ), +); +COPY cross_industry_demo_data TO 'output.parquet' (FORMAT PARQUET); diff --git a/datafusion/core/tests/data/deep_projections/first.parquet b/datafusion/core/tests/data/deep_projections/first.parquet new file mode 100644 index 0000000000000000000000000000000000000000..9cb5a18df587521e6b275deebbde61fbdd5b2966 GIT binary patch literal 880 zcmZ`&-D(p-6h0ZZ>sTX*bcPK?3BoRG8$u^d>JNJpji3-pDAfxoUACDKmo{nH&4z*? zeE=W9iy~h77Wza!foFC$?I`WV%$}L~zHh!W=j?c}D;Tgg+iWw6fy-O~zDEFn);{k5 z*aOc;EU%bhnGf$@~pC{8(e{J;jL9>lo0Da=Es+oM3tL*JCXAZDBVs9r&D&ti8 zVANL1OZ~?hem7QUrz%U;Y@(KUSG`ZN{l``dF$keZFPUDL#nuAnR7DB6uFw&_4lOqT~fuA;R z3jUJP*ajUvfRKzwroqfCDnaP8CbB54u<8*3QYtcsB?*|;r_q$-oRP0}L<%C+mlBl3 z#grpYOgJ8Wsol5n1hkLXbpNeP7fCpa2n+7-ZoYa7wr9GE!DNis{4!H|aPw9P|KQj&bmUFXbO|{RmW<%{hL(`Owt%c6>*%MH`vww)~r|fXS^c-s~9kv$(j|0dYtTEE^e6KJ& zqYaOJpH#`~4=3KBKOJ7%|EZ#SsofswlN}8j+3Fk1Zl~MnHQKX;1%9J@aejJo;`ScT Mj?lY<&_o~mCxzI(t^fc4 literal 0 HcmV?d00001 diff --git a/datafusion/core/tests/data/deep_projections/genstudio/meta_asset_summary_metrics.parquet b/datafusion/core/tests/data/deep_projections/genstudio/meta_asset_summary_metrics.parquet new file mode 100644 index 0000000000000000000000000000000000000000..4a6c9de16938657dfd2e09d27fda00773f1e734d GIT binary patch literal 990 zcmah|O=}x55Ebs)jyFz9N)#OE!G{`LC;^i=X^;LWtsw`~8uz#;(qvcID=m`NrvIV8 zuD_>;j&@UGs7iIy%$s>nGf&IOml17T{<<5J-iavrMm!{zXOJ@*6B-Q~y^fPPSjX~W zaDM;e)$4b}pK+sio-ke{_PceECpkDKm}B&m&@NV;RXJT64pZiMwh#kSXHK}zbLkGE zo+XwlMV1wE4c-t+Jh8NM*n$I57dDOF*KMi-ysg1lsg*DAoUt>EZ?I8?LWLIzGhk7B z3Olxys1MOx)5yy|OF?~grI31|tV}a!X}9jvVVvtyxyAgC<_;5|j4q4kNW{eRrOm8` z19;4FT1=*@f*@BL$73Y7wB?y&P$^2vK{*_<4=^F@O3#>av{6sJiGoW%3QV8Y>2K0z z_tFL-QGpHXcO7;QG;40<25g8XE;8-(TEaeJ-z6pX~a;OBb1d`1o9o{% zDL{$8Dl4P56}Y`N8*i_McX~3JEtC0V^>OfI!EP0qPFAzem4`0KWp_B7bgICDCJi4M<3qb-rS6@AMf6hpDzem;?MpK9ZBw@ literal 0 HcmV?d00001 diff --git 
a/datafusion/core/tests/data/deep_projections/genstudio/meta_asset_summary_metrics_by_age_and_gender.parquet b/datafusion/core/tests/data/deep_projections/genstudio/meta_asset_summary_metrics_by_age_and_gender.parquet new file mode 100644 index 0000000000000000000000000000000000000000..07774b4f0c6ffbb13dfe3cbe31a0658bc8f37291 GIT binary patch literal 1017 zcmah|O=}x55EYI;;*FCgB?=Do;6n{Alz>UxHb;My){ui~jeA@aX|gNql@>{B)Bn)F z+JDePN4pI+RHeFU=Dm4OGo!`$%aGQtf8CEr_f!;oEgoabQ^**N2(9;P-L{h%SjV!w zfBEp^<*T>EpHZ!Q88e>8_PceE#Thszm}B&e&pvn@p`mYHkURc^0Y`}r2hz+B6Rhg22w-p#Gweke789BrF8Vi*uba)jr0~Wm} zuw@&G{t(SIt$X_y38=5G5K_;Tl}YL>J+JDt8RxoCZa#bV+-BmF(M7(G)S1{{j!@tk z+G5ti0X5>-IVKaehc{7KfI$Y@Xkq7!M{`&wE!Y=ic8Aof?KpOa# z!gmSNXI1)}wCRJiK}bEvg4L4`s|SiTHgXL%BopPSc6udY7fjc4joPK(V?SMVAH@Af zziohVKcXPHVc9n9u4z+0=aBJ}9L$q88B-kSvF})uZzNv_!7sR0+iInze*U3?2bh%~ zLkM#sv)u39zh9E$0{q1+cetg%?X+3@b*lMK5Ra#ecs5>s?02f|?39tocsc!CuGAH| v?hFPq{JOJA?~nc7c6c+qx$O-e3%L=!;r)kqcXy-Pr}uBj&m%$>`00NGin{Z7 literal 0 HcmV?d00001 diff --git a/datafusion/core/tests/data/deep_projections/gs_summary_metrics/data.sql b/datafusion/core/tests/data/deep_projections/gs_summary_metrics/data.sql new file mode 100644 index 0000000000000..8430adb152aa9 --- /dev/null +++ b/datafusion/core/tests/data/deep_projections/gs_summary_metrics/data.sql @@ -0,0 +1,2600 @@ + +INSERT INTO gs_summary_metrics VALUES( + '2025-01-02 13:00:00', + { + acp_sourceBatchId: 'b0', + commitBatchId: 'b0', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-01-02', + 'b0' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-01-03 13:00:00', + { + acp_sourceBatchId: 'b1', + commitBatchId: 'b1', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + 
account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-01-03', + 'b1' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-01-04 13:00:00', + { + acp_sourceBatchId: 'b2', + commitBatchId: 'b2', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-01-04', + 'b2' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-01-05 13:00:00', + { + acp_sourceBatchId: 'b3', + commitBatchId: 'b3', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-01-05', + 'b3' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-01-06 13:00:00', + { + acp_sourceBatchId: 'b4', + commitBatchId: 'b4', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-01-06', + 'b4' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-01-07 13:00:00', + { + acp_sourceBatchId: 'b5', + commitBatchId: 'b5', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-01-07', + 'b5' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-01-08 13:00:00', + { + acp_sourceBatchId: 'b6', + commitBatchId: 'b6', + ingestTime: 1, + isDeleted: false, + rowId: '1', 
+ rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-01-08', + 'b6' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-01-09 13:00:00', + { + acp_sourceBatchId: 'b7', + commitBatchId: 'b7', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-01-09', + 'b7' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-01-10 13:00:00', + { + acp_sourceBatchId: 'b8', + commitBatchId: 'b8', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-01-10', + 'b8' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-01-11 13:00:00', + { + acp_sourceBatchId: 'b9', + commitBatchId: 'b9', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-01-11', + 'b9' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-01-12 13:00:00', + { + acp_sourceBatchId: 'b10', + commitBatchId: 'b10', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-01-12', + 'b10' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-01-13 13:00:00', + { + 
acp_sourceBatchId: 'b11', + commitBatchId: 'b11', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-01-13', + 'b11' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-01-14 13:00:00', + { + acp_sourceBatchId: 'b12', + commitBatchId: 'b12', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-01-14', + 'b12' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-01-15 13:00:00', + { + acp_sourceBatchId: 'b13', + commitBatchId: 'b13', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-01-15', + 'b13' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-01-16 13:00:00', + { + acp_sourceBatchId: 'b14', + commitBatchId: 'b14', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-01-16', + 'b14' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-01-17 13:00:00', + { + acp_sourceBatchId: 'b15', + commitBatchId: 'b15', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + 
'2025-01-17', + 'b15' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-01-18 13:00:00', + { + acp_sourceBatchId: 'b16', + commitBatchId: 'b16', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-01-18', + 'b16' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-01-19 13:00:00', + { + acp_sourceBatchId: 'b17', + commitBatchId: 'b17', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-01-19', + 'b17' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-01-20 13:00:00', + { + acp_sourceBatchId: 'b18', + commitBatchId: 'b18', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-01-20', + 'b18' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-01-21 13:00:00', + { + acp_sourceBatchId: 'b19', + commitBatchId: 'b19', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-01-21', + 'b19' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-01-22 13:00:00', + { + acp_sourceBatchId: 'b20', + commitBatchId: 'b20', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + 
accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-01-22', + 'b20' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-01-23 13:00:00', + { + acp_sourceBatchId: 'b21', + commitBatchId: 'b21', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-01-23', + 'b21' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-01-24 13:00:00', + { + acp_sourceBatchId: 'b22', + commitBatchId: 'b22', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-01-24', + 'b22' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-01-25 13:00:00', + { + acp_sourceBatchId: 'b23', + commitBatchId: 'b23', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-01-25', + 'b23' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-01-26 13:00:00', + { + acp_sourceBatchId: 'b24', + commitBatchId: 'b24', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-01-26', + 'b24' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-01-27 13:00:00', + { + acp_sourceBatchId: 'b25', + commitBatchId: 'b25', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, 
+ trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-01-27', + 'b25' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-01-28 13:00:00', + { + acp_sourceBatchId: 'b26', + commitBatchId: 'b26', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-01-28', + 'b26' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-01-29 13:00:00', + { + acp_sourceBatchId: 'b27', + commitBatchId: 'b27', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-01-29', + 'b27' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-01-30 13:00:00', + { + acp_sourceBatchId: 'b28', + commitBatchId: 'b28', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-01-30', + 'b28' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-01-31 13:00:00', + { + acp_sourceBatchId: 'b29', + commitBatchId: 'b29', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-01-31', + 'b29' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-02-01 13:00:00', + { + 
acp_sourceBatchId: 'b30', + commitBatchId: 'b30', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-02-01', + 'b30' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-02-02 13:00:00', + { + acp_sourceBatchId: 'b31', + commitBatchId: 'b31', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-02-02', + 'b31' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-02-03 13:00:00', + { + acp_sourceBatchId: 'b32', + commitBatchId: 'b32', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-02-03', + 'b32' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-02-04 13:00:00', + { + acp_sourceBatchId: 'b33', + commitBatchId: 'b33', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-02-04', + 'b33' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-02-05 13:00:00', + { + acp_sourceBatchId: 'b34', + commitBatchId: 'b34', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + 
'2025-02-05', + 'b34' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-02-06 13:00:00', + { + acp_sourceBatchId: 'b35', + commitBatchId: 'b35', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-02-06', + 'b35' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-02-07 13:00:00', + { + acp_sourceBatchId: 'b36', + commitBatchId: 'b36', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-02-07', + 'b36' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-02-08 13:00:00', + { + acp_sourceBatchId: 'b37', + commitBatchId: 'b37', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-02-08', + 'b37' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-02-09 13:00:00', + { + acp_sourceBatchId: 'b38', + commitBatchId: 'b38', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-02-09', + 'b38' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-02-10 13:00:00', + { + acp_sourceBatchId: 'b39', + commitBatchId: 'b39', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + 
accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-02-10', + 'b39' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-02-11 13:00:00', + { + acp_sourceBatchId: 'b40', + commitBatchId: 'b40', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-02-11', + 'b40' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-02-12 13:00:00', + { + acp_sourceBatchId: 'b41', + commitBatchId: 'b41', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-02-12', + 'b41' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-02-13 13:00:00', + { + acp_sourceBatchId: 'b42', + commitBatchId: 'b42', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-02-13', + 'b42' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-02-14 13:00:00', + { + acp_sourceBatchId: 'b43', + commitBatchId: 'b43', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-02-14', + 'b43' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-02-15 13:00:00', + { + acp_sourceBatchId: 'b44', + commitBatchId: 'b44', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, 
+ trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-02-15', + 'b44' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-02-16 13:00:00', + { + acp_sourceBatchId: 'b45', + commitBatchId: 'b45', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-02-16', + 'b45' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-02-17 13:00:00', + { + acp_sourceBatchId: 'b46', + commitBatchId: 'b46', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-02-17', + 'b46' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-02-18 13:00:00', + { + acp_sourceBatchId: 'b47', + commitBatchId: 'b47', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-02-18', + 'b47' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-02-19 13:00:00', + { + acp_sourceBatchId: 'b48', + commitBatchId: 'b48', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-02-19', + 'b48' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-02-20 13:00:00', + { + 
acp_sourceBatchId: 'b49', + commitBatchId: 'b49', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-02-20', + 'b49' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-02-21 13:00:00', + { + acp_sourceBatchId: 'b50', + commitBatchId: 'b50', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-02-21', + 'b50' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-02-22 13:00:00', + { + acp_sourceBatchId: 'b51', + commitBatchId: 'b51', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-02-22', + 'b51' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-02-23 13:00:00', + { + acp_sourceBatchId: 'b52', + commitBatchId: 'b52', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-02-23', + 'b52' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-02-24 13:00:00', + { + acp_sourceBatchId: 'b53', + commitBatchId: 'b53', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + 
'2025-02-24', + 'b53' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-02-25 13:00:00', + { + acp_sourceBatchId: 'b54', + commitBatchId: 'b54', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-02-25', + 'b54' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-02-26 13:00:00', + { + acp_sourceBatchId: 'b55', + commitBatchId: 'b55', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-02-26', + 'b55' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-02-27 13:00:00', + { + acp_sourceBatchId: 'b56', + commitBatchId: 'b56', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-02-27', + 'b56' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-02-28 13:00:00', + { + acp_sourceBatchId: 'b57', + commitBatchId: 'b57', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-02-28', + 'b57' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-03-01 13:00:00', + { + acp_sourceBatchId: 'b58', + commitBatchId: 'b58', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + 
accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-03-01', + 'b58' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-03-02 13:00:00', + { + acp_sourceBatchId: 'b59', + commitBatchId: 'b59', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-03-02', + 'b59' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-03-03 13:00:00', + { + acp_sourceBatchId: 'b60', + commitBatchId: 'b60', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-03-03', + 'b60' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-03-04 13:00:00', + { + acp_sourceBatchId: 'b61', + commitBatchId: 'b61', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-03-04', + 'b61' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-03-05 13:00:00', + { + acp_sourceBatchId: 'b62', + commitBatchId: 'b62', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-03-05', + 'b62' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-03-06 13:00:00', + { + acp_sourceBatchId: 'b63', + commitBatchId: 'b63', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, 
+ trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-03-06', + 'b63' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-03-07 13:00:00', + { + acp_sourceBatchId: 'b64', + commitBatchId: 'b64', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-03-07', + 'b64' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-03-08 13:00:00', + { + acp_sourceBatchId: 'b65', + commitBatchId: 'b65', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-03-08', + 'b65' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-03-09 13:00:00', + { + acp_sourceBatchId: 'b66', + commitBatchId: 'b66', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-03-09', + 'b66' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-03-10 13:00:00', + { + acp_sourceBatchId: 'b67', + commitBatchId: 'b67', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-03-10', + 'b67' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-03-11 13:00:00', + { + 
acp_sourceBatchId: 'b68', + commitBatchId: 'b68', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-03-11', + 'b68' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-03-12 13:00:00', + { + acp_sourceBatchId: 'b69', + commitBatchId: 'b69', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-03-12', + 'b69' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-03-13 13:00:00', + { + acp_sourceBatchId: 'b70', + commitBatchId: 'b70', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-03-13', + 'b70' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-03-14 13:00:00', + { + acp_sourceBatchId: 'b71', + commitBatchId: 'b71', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-03-14', + 'b71' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-03-15 13:00:00', + { + acp_sourceBatchId: 'b72', + commitBatchId: 'b72', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + 
'2025-03-15', + 'b72' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-03-16 13:00:00', + { + acp_sourceBatchId: 'b73', + commitBatchId: 'b73', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-03-16', + 'b73' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-03-17 13:00:00', + { + acp_sourceBatchId: 'b74', + commitBatchId: 'b74', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-03-17', + 'b74' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-03-18 13:00:00', + { + acp_sourceBatchId: 'b75', + commitBatchId: 'b75', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-03-18', + 'b75' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-03-19 13:00:00', + { + acp_sourceBatchId: 'b76', + commitBatchId: 'b76', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-03-19', + 'b76' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-03-20 13:00:00', + { + acp_sourceBatchId: 'b77', + commitBatchId: 'b77', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + 
accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-03-20', + 'b77' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-03-21 13:00:00', + { + acp_sourceBatchId: 'b78', + commitBatchId: 'b78', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-03-21', + 'b78' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-03-22 13:00:00', + { + acp_sourceBatchId: 'b79', + commitBatchId: 'b79', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-03-22', + 'b79' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-03-23 13:00:00', + { + acp_sourceBatchId: 'b80', + commitBatchId: 'b80', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-03-23', + 'b80' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-03-24 13:00:00', + { + acp_sourceBatchId: 'b81', + commitBatchId: 'b81', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-03-24', + 'b81' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-03-25 13:00:00', + { + acp_sourceBatchId: 'b82', + commitBatchId: 'b82', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, 
+ trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-03-25', + 'b82' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-03-26 13:00:00', + { + acp_sourceBatchId: 'b83', + commitBatchId: 'b83', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-03-26', + 'b83' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-03-27 13:00:00', + { + acp_sourceBatchId: 'b84', + commitBatchId: 'b84', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-03-27', + 'b84' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-03-28 13:00:00', + { + acp_sourceBatchId: 'b85', + commitBatchId: 'b85', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-03-28', + 'b85' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-03-29 13:00:00', + { + acp_sourceBatchId: 'b86', + commitBatchId: 'b86', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-03-29', + 'b86' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-03-30 13:00:00', + { + 
acp_sourceBatchId: 'b87', + commitBatchId: 'b87', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-03-30', + 'b87' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-03-31 13:00:00', + { + acp_sourceBatchId: 'b88', + commitBatchId: 'b88', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-03-31', + 'b88' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-04-01 13:00:00', + { + acp_sourceBatchId: 'b89', + commitBatchId: 'b89', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-04-01', + 'b89' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-04-02 13:00:00', + { + acp_sourceBatchId: 'b90', + commitBatchId: 'b90', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-04-02', + 'b90' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-04-03 13:00:00', + { + acp_sourceBatchId: 'b91', + commitBatchId: 'b91', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + 
'2025-04-03', + 'b91' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-04-04 13:00:00', + { + acp_sourceBatchId: 'b92', + commitBatchId: 'b92', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-04-04', + 'b92' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-04-05 13:00:00', + { + acp_sourceBatchId: 'b93', + commitBatchId: 'b93', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-04-05', + 'b93' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-04-06 13:00:00', + { + acp_sourceBatchId: 'b94', + commitBatchId: 'b94', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-04-06', + 'b94' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-04-07 13:00:00', + { + acp_sourceBatchId: 'b95', + commitBatchId: 'b95', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-04-07', + 'b95' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-04-08 13:00:00', + { + acp_sourceBatchId: 'b96', + commitBatchId: 'b96', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + 
accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-04-08', + 'b96' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-04-09 13:00:00', + { + acp_sourceBatchId: 'b97', + commitBatchId: 'b97', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-04-09', + 'b97' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-04-10 13:00:00', + { + acp_sourceBatchId: 'b98', + commitBatchId: 'b98', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-04-10', + 'b98' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-04-11 13:00:00', + { + acp_sourceBatchId: 'b99', + commitBatchId: 'b99', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-04-11', + 'b99' + ); diff --git a/datafusion/core/tests/data/deep_projections/gs_summary_metrics/generate.py b/datafusion/core/tests/data/deep_projections/gs_summary_metrics/generate.py new file mode 100644 index 0000000000000..d7fb4793c3109 --- /dev/null +++ b/datafusion/core/tests/data/deep_projections/gs_summary_metrics/generate.py @@ -0,0 +1,35 @@ +import datetime + +date = datetime.datetime(2025, 1, 1) +for i in range(0, 100): + date = date + datetime.timedelta(days=1) + print() + date_fmt = date.strftime("%Y-%m-%d") + sql = f""" + INSERT INTO gs_summary_metrics VALUES( + '{date_fmt} 13:00:00', + {{ + 
acp_sourceBatchId: 'b{i}', + commitBatchId: 'b{i}', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }}, + {{ + genStudioInsights: {{ + entityIDs: {{ + account: {{ + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + }} + }}, + breakdownType: 'AccountCampaignAdGroupAd' + }} + }}, + '{date_fmt}', + 'b{i}' + ); + """ + print(sql.strip()) diff --git a/datafusion/core/tests/data/deep_projections/gs_summary_metrics/generate.sql b/datafusion/core/tests/data/deep_projections/gs_summary_metrics/generate.sql new file mode 100644 index 0000000000000..6817610246829 --- /dev/null +++ b/datafusion/core/tests/data/deep_projections/gs_summary_metrics/generate.sql @@ -0,0 +1,54 @@ +CREATE OR REPLACE TABLE gs_summary_metrics( + timestamp TIMESTAMP, -- 0 + _acp_system_metadata STRUCT( -- 1 + acp_sourceBatchId VARCHAR, -- 2 + commitBatchId VARCHAR, --3 + ingestTime INT8, -- 4 + isDeleted BOOL, -- 5 + rowId VARCHAR, + rowVersion INT8, + trackingId VARCHAR + ), + _wfadoberm STRUCT( + genStudioInsights STRUCT( + entityIDs STRUCT( + account STRUCT( + accountID VARCHAR, + accountGUID VARCHAR + ) + ), + breakdownType VARCHAR + ), + extra VARCHAR + ), + _ACP_DATE DATE, + _ACP_BATCHID VARCHAR +); + +INSERT INTO gs_summary_metrics VALUES( + '2025-01-01 13:00:00', + { + acp_sourceBatchId: 'b1', + commitBatchId: 'b1', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-01-01', + 'batch1' +); +COPY gs_summary_metrics TO 'gs_summary_metrics.parquet' (FORMAT PARQUET); + diff --git a/datafusion/core/tests/data/deep_projections/gs_summary_metrics/generate2.sql b/datafusion/core/tests/data/deep_projections/gs_summary_metrics/generate2.sql new file mode 100644 index 0000000000000..9f99ba959bf57 --- /dev/null +++ 
b/datafusion/core/tests/data/deep_projections/gs_summary_metrics/generate2.sql @@ -0,0 +1,2628 @@ +CREATE OR REPLACE TABLE gs_summary_metrics( + timestamp TIMESTAMP, -- 0 + _acp_system_metadata STRUCT( -- 1 + acp_sourceBatchId VARCHAR, -- 2 + commitBatchId VARCHAR, --3 + ingestTime INT8, -- 4 + isDeleted BOOL, -- 5 + rowId VARCHAR, + rowVersion INT8, + trackingId VARCHAR + ), + _wfadoberm STRUCT( + genStudioInsights STRUCT( + entityIDs STRUCT( + account STRUCT( + accountID VARCHAR, + accountGUID VARCHAR + ) + ), + breakdownType VARCHAR + ), + extra VARCHAR + ), + _ACP_DATE DATE, + _ACP_BATCHID VARCHAR +); + +INSERT INTO gs_summary_metrics VALUES( + '2025-01-02 13:00:00', + { + acp_sourceBatchId: 'b0', + commitBatchId: 'b0', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-01-02', + 'b0' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-01-03 13:00:00', + { + acp_sourceBatchId: 'b1', + commitBatchId: 'b1', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-01-03', + 'b1' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-01-04 13:00:00', + { + acp_sourceBatchId: 'b2', + commitBatchId: 'b2', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-01-04', + 'b2' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-01-05 13:00:00', + { + acp_sourceBatchId: 'b3', + commitBatchId: 'b3', + ingestTime: 
1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-01-05', + 'b3' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-01-06 13:00:00', + { + acp_sourceBatchId: 'b4', + commitBatchId: 'b4', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-01-06', + 'b4' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-01-07 13:00:00', + { + acp_sourceBatchId: 'b5', + commitBatchId: 'b5', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-01-07', + 'b5' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-01-08 13:00:00', + { + acp_sourceBatchId: 'b6', + commitBatchId: 'b6', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-01-08', + 'b6' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-01-09 13:00:00', + { + acp_sourceBatchId: 'b7', + commitBatchId: 'b7', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-01-09', + 'b7' + ); + +INSERT INTO gs_summary_metrics VALUES( + 
'2025-01-10 13:00:00', + { + acp_sourceBatchId: 'b8', + commitBatchId: 'b8', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-01-10', + 'b8' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-01-11 13:00:00', + { + acp_sourceBatchId: 'b9', + commitBatchId: 'b9', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-01-11', + 'b9' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-01-12 13:00:00', + { + acp_sourceBatchId: 'b10', + commitBatchId: 'b10', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-01-12', + 'b10' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-01-13 13:00:00', + { + acp_sourceBatchId: 'b11', + commitBatchId: 'b11', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-01-13', + 'b11' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-01-14 13:00:00', + { + acp_sourceBatchId: 'b12', + commitBatchId: 'b12', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 
'AccountCampaignAdGroupAd' + } + }, + '2025-01-14', + 'b12' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-01-15 13:00:00', + { + acp_sourceBatchId: 'b13', + commitBatchId: 'b13', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-01-15', + 'b13' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-01-16 13:00:00', + { + acp_sourceBatchId: 'b14', + commitBatchId: 'b14', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-01-16', + 'b14' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-01-17 13:00:00', + { + acp_sourceBatchId: 'b15', + commitBatchId: 'b15', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-01-17', + 'b15' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-01-18 13:00:00', + { + acp_sourceBatchId: 'b16', + commitBatchId: 'b16', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-01-18', + 'b16' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-01-19 13:00:00', + { + acp_sourceBatchId: 'b17', + commitBatchId: 'b17', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: 
{ + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-01-19', + 'b17' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-01-20 13:00:00', + { + acp_sourceBatchId: 'b18', + commitBatchId: 'b18', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-01-20', + 'b18' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-01-21 13:00:00', + { + acp_sourceBatchId: 'b19', + commitBatchId: 'b19', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-01-21', + 'b19' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-01-22 13:00:00', + { + acp_sourceBatchId: 'b20', + commitBatchId: 'b20', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-01-22', + 'b20' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-01-23 13:00:00', + { + acp_sourceBatchId: 'b21', + commitBatchId: 'b21', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-01-23', + 'b21' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-01-24 13:00:00', + { + acp_sourceBatchId: 'b22', + commitBatchId: 'b22', + ingestTime: 1, + isDeleted: 
false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-01-24', + 'b22' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-01-25 13:00:00', + { + acp_sourceBatchId: 'b23', + commitBatchId: 'b23', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-01-25', + 'b23' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-01-26 13:00:00', + { + acp_sourceBatchId: 'b24', + commitBatchId: 'b24', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-01-26', + 'b24' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-01-27 13:00:00', + { + acp_sourceBatchId: 'b25', + commitBatchId: 'b25', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-01-27', + 'b25' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-01-28 13:00:00', + { + acp_sourceBatchId: 'b26', + commitBatchId: 'b26', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-01-28', + 'b26' + ); + +INSERT INTO gs_summary_metrics VALUES( + 
'2025-01-29 13:00:00', + { + acp_sourceBatchId: 'b27', + commitBatchId: 'b27', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-01-29', + 'b27' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-01-30 13:00:00', + { + acp_sourceBatchId: 'b28', + commitBatchId: 'b28', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-01-30', + 'b28' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-01-31 13:00:00', + { + acp_sourceBatchId: 'b29', + commitBatchId: 'b29', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-01-31', + 'b29' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-02-01 13:00:00', + { + acp_sourceBatchId: 'b30', + commitBatchId: 'b30', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-02-01', + 'b30' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-02-02 13:00:00', + { + acp_sourceBatchId: 'b31', + commitBatchId: 'b31', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 
'AccountCampaignAdGroupAd' + } + }, + '2025-02-02', + 'b31' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-02-03 13:00:00', + { + acp_sourceBatchId: 'b32', + commitBatchId: 'b32', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-02-03', + 'b32' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-02-04 13:00:00', + { + acp_sourceBatchId: 'b33', + commitBatchId: 'b33', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-02-04', + 'b33' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-02-05 13:00:00', + { + acp_sourceBatchId: 'b34', + commitBatchId: 'b34', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-02-05', + 'b34' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-02-06 13:00:00', + { + acp_sourceBatchId: 'b35', + commitBatchId: 'b35', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-02-06', + 'b35' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-02-07 13:00:00', + { + acp_sourceBatchId: 'b36', + commitBatchId: 'b36', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: 
{ + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-02-07', + 'b36' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-02-08 13:00:00', + { + acp_sourceBatchId: 'b37', + commitBatchId: 'b37', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-02-08', + 'b37' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-02-09 13:00:00', + { + acp_sourceBatchId: 'b38', + commitBatchId: 'b38', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-02-09', + 'b38' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-02-10 13:00:00', + { + acp_sourceBatchId: 'b39', + commitBatchId: 'b39', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-02-10', + 'b39' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-02-11 13:00:00', + { + acp_sourceBatchId: 'b40', + commitBatchId: 'b40', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-02-11', + 'b40' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-02-12 13:00:00', + { + acp_sourceBatchId: 'b41', + commitBatchId: 'b41', + ingestTime: 1, + isDeleted: 
false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-02-12', + 'b41' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-02-13 13:00:00', + { + acp_sourceBatchId: 'b42', + commitBatchId: 'b42', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-02-13', + 'b42' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-02-14 13:00:00', + { + acp_sourceBatchId: 'b43', + commitBatchId: 'b43', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-02-14', + 'b43' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-02-15 13:00:00', + { + acp_sourceBatchId: 'b44', + commitBatchId: 'b44', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-02-15', + 'b44' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-02-16 13:00:00', + { + acp_sourceBatchId: 'b45', + commitBatchId: 'b45', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-02-16', + 'b45' + ); + +INSERT INTO gs_summary_metrics VALUES( + 
'2025-02-17 13:00:00', + { + acp_sourceBatchId: 'b46', + commitBatchId: 'b46', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-02-17', + 'b46' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-02-18 13:00:00', + { + acp_sourceBatchId: 'b47', + commitBatchId: 'b47', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-02-18', + 'b47' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-02-19 13:00:00', + { + acp_sourceBatchId: 'b48', + commitBatchId: 'b48', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-02-19', + 'b48' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-02-20 13:00:00', + { + acp_sourceBatchId: 'b49', + commitBatchId: 'b49', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-02-20', + 'b49' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-02-21 13:00:00', + { + acp_sourceBatchId: 'b50', + commitBatchId: 'b50', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 
'AccountCampaignAdGroupAd' + } + }, + '2025-02-21', + 'b50' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-02-22 13:00:00', + { + acp_sourceBatchId: 'b51', + commitBatchId: 'b51', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-02-22', + 'b51' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-02-23 13:00:00', + { + acp_sourceBatchId: 'b52', + commitBatchId: 'b52', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-02-23', + 'b52' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-02-24 13:00:00', + { + acp_sourceBatchId: 'b53', + commitBatchId: 'b53', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-02-24', + 'b53' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-02-25 13:00:00', + { + acp_sourceBatchId: 'b54', + commitBatchId: 'b54', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-02-25', + 'b54' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-02-26 13:00:00', + { + acp_sourceBatchId: 'b55', + commitBatchId: 'b55', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: 
{ + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-02-26', + 'b55' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-02-27 13:00:00', + { + acp_sourceBatchId: 'b56', + commitBatchId: 'b56', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-02-27', + 'b56' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-02-28 13:00:00', + { + acp_sourceBatchId: 'b57', + commitBatchId: 'b57', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-02-28', + 'b57' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-03-01 13:00:00', + { + acp_sourceBatchId: 'b58', + commitBatchId: 'b58', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-03-01', + 'b58' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-03-02 13:00:00', + { + acp_sourceBatchId: 'b59', + commitBatchId: 'b59', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-03-02', + 'b59' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-03-03 13:00:00', + { + acp_sourceBatchId: 'b60', + commitBatchId: 'b60', + ingestTime: 1, + isDeleted: 
false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-03-03', + 'b60' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-03-04 13:00:00', + { + acp_sourceBatchId: 'b61', + commitBatchId: 'b61', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-03-04', + 'b61' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-03-05 13:00:00', + { + acp_sourceBatchId: 'b62', + commitBatchId: 'b62', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-03-05', + 'b62' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-03-06 13:00:00', + { + acp_sourceBatchId: 'b63', + commitBatchId: 'b63', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-03-06', + 'b63' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-03-07 13:00:00', + { + acp_sourceBatchId: 'b64', + commitBatchId: 'b64', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-03-07', + 'b64' + ); + +INSERT INTO gs_summary_metrics VALUES( + 
'2025-03-08 13:00:00', + { + acp_sourceBatchId: 'b65', + commitBatchId: 'b65', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-03-08', + 'b65' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-03-09 13:00:00', + { + acp_sourceBatchId: 'b66', + commitBatchId: 'b66', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-03-09', + 'b66' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-03-10 13:00:00', + { + acp_sourceBatchId: 'b67', + commitBatchId: 'b67', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-03-10', + 'b67' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-03-11 13:00:00', + { + acp_sourceBatchId: 'b68', + commitBatchId: 'b68', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-03-11', + 'b68' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-03-12 13:00:00', + { + acp_sourceBatchId: 'b69', + commitBatchId: 'b69', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 
'AccountCampaignAdGroupAd' + } + }, + '2025-03-12', + 'b69' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-03-13 13:00:00', + { + acp_sourceBatchId: 'b70', + commitBatchId: 'b70', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-03-13', + 'b70' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-03-14 13:00:00', + { + acp_sourceBatchId: 'b71', + commitBatchId: 'b71', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-03-14', + 'b71' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-03-15 13:00:00', + { + acp_sourceBatchId: 'b72', + commitBatchId: 'b72', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-03-15', + 'b72' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-03-16 13:00:00', + { + acp_sourceBatchId: 'b73', + commitBatchId: 'b73', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-03-16', + 'b73' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-03-17 13:00:00', + { + acp_sourceBatchId: 'b74', + commitBatchId: 'b74', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: 
{ + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-03-17', + 'b74' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-03-18 13:00:00', + { + acp_sourceBatchId: 'b75', + commitBatchId: 'b75', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-03-18', + 'b75' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-03-19 13:00:00', + { + acp_sourceBatchId: 'b76', + commitBatchId: 'b76', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-03-19', + 'b76' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-03-20 13:00:00', + { + acp_sourceBatchId: 'b77', + commitBatchId: 'b77', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-03-20', + 'b77' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-03-21 13:00:00', + { + acp_sourceBatchId: 'b78', + commitBatchId: 'b78', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-03-21', + 'b78' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-03-22 13:00:00', + { + acp_sourceBatchId: 'b79', + commitBatchId: 'b79', + ingestTime: 1, + isDeleted: 
false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-03-22', + 'b79' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-03-23 13:00:00', + { + acp_sourceBatchId: 'b80', + commitBatchId: 'b80', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-03-23', + 'b80' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-03-24 13:00:00', + { + acp_sourceBatchId: 'b81', + commitBatchId: 'b81', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-03-24', + 'b81' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-03-25 13:00:00', + { + acp_sourceBatchId: 'b82', + commitBatchId: 'b82', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-03-25', + 'b82' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-03-26 13:00:00', + { + acp_sourceBatchId: 'b83', + commitBatchId: 'b83', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-03-26', + 'b83' + ); + +INSERT INTO gs_summary_metrics VALUES( + 
'2025-03-27 13:00:00', + { + acp_sourceBatchId: 'b84', + commitBatchId: 'b84', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-03-27', + 'b84' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-03-28 13:00:00', + { + acp_sourceBatchId: 'b85', + commitBatchId: 'b85', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-03-28', + 'b85' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-03-29 13:00:00', + { + acp_sourceBatchId: 'b86', + commitBatchId: 'b86', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-03-29', + 'b86' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-03-30 13:00:00', + { + acp_sourceBatchId: 'b87', + commitBatchId: 'b87', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-03-30', + 'b87' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-03-31 13:00:00', + { + acp_sourceBatchId: 'b88', + commitBatchId: 'b88', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 
'AccountCampaignAdGroupAd' + } + }, + '2025-03-31', + 'b88' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-04-01 13:00:00', + { + acp_sourceBatchId: 'b89', + commitBatchId: 'b89', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-04-01', + 'b89' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-04-02 13:00:00', + { + acp_sourceBatchId: 'b90', + commitBatchId: 'b90', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-04-02', + 'b90' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-04-03 13:00:00', + { + acp_sourceBatchId: 'b91', + commitBatchId: 'b91', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-04-03', + 'b91' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-04-04 13:00:00', + { + acp_sourceBatchId: 'b92', + commitBatchId: 'b92', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-04-04', + 'b92' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-04-05 13:00:00', + { + acp_sourceBatchId: 'b93', + commitBatchId: 'b93', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: 
{ + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-04-05', + 'b93' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-04-06 13:00:00', + { + acp_sourceBatchId: 'b94', + commitBatchId: 'b94', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-04-06', + 'b94' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-04-07 13:00:00', + { + acp_sourceBatchId: 'b95', + commitBatchId: 'b95', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-04-07', + 'b95' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-04-08 13:00:00', + { + acp_sourceBatchId: 'b96', + commitBatchId: 'b96', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-04-08', + 'b96' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-04-09 13:00:00', + { + acp_sourceBatchId: 'b97', + commitBatchId: 'b97', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-04-09', + 'b97' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-04-10 13:00:00', + { + acp_sourceBatchId: 'b98', + commitBatchId: 'b98', + ingestTime: 1, + isDeleted: 
false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-04-10', + 'b98' + ); + +INSERT INTO gs_summary_metrics VALUES( + '2025-04-11 13:00:00', + { + acp_sourceBatchId: 'b99', + commitBatchId: 'b99', + ingestTime: 1, + isDeleted: false, + rowId: '1', + rowVersion: 1, + trackingId: 't1' + }, + { + genStudioInsights: { + entityIDs: { + account: { + accountID: 'a1', + accountGUID: 'Meta_2974530739347344', + } + }, + breakdownType: 'AccountCampaignAdGroupAd' + } + }, + '2025-04-11', + 'b99' + ); +COPY gs_summary_metrics TO 'gs_summary_metrics.parquet' (FORMAT PARQUET); + diff --git a/datafusion/core/tests/data/deep_projections/gs_summary_metrics/gs_summary_metrics.parquet b/datafusion/core/tests/data/deep_projections/gs_summary_metrics/gs_summary_metrics.parquet new file mode 100644 index 0000000000000000000000000000000000000000..e7ae74b70210dcc94b346cb192544bb418cf0e24 GIT binary patch literal 7014 zcmeI13s_WT8pqFh8E(VH(dNt!FsY>+0SURxa7j{7lnA_pV_KHuFpP=<<1z;^EyXTs zhL_w~HOeHn6kQ9U$!nUesTi1Aqh{u6yPIugnL24@oBhu@AR`Ll?(^*Kv+F$k&i|b6 zd%ySn&UZWCISfzFjpG#D&Opu)ki*rp9K$eGKCz72vVC)u55ov|A8%y+0$rVdI0eyv zq4|#Wi+1eP`}ansds*)7NHyA~#6g^c7Y>4`yRLr_B7C>62;yq|d^SY?iuY28V@8j4 z5O;3+4u~k2S>xVYwnRdk*YVj6CsYoF4+)wP^<}J{A`FF z;(TlJT!`B7%nJ}<@|qfmYryVZ5Pj%hk3t;hm@{zduJ6j+1*gWUR5enaWe-I_)WfEw zLIi&4aER+4Z{$PthYwAIIM$!7fViIwdKqFI)B8<`Gd^n{M6H}~0wR3+)H(QeZC>Ra z1K;|^J42A_nDFPjAntxYr9+IG&<7ySuahQ0)VoK{ga|L%mq1+h=T<}X*)>}sjyw0i z4{`tSSd5z9&+RnWvK>&aR3f5cS17 z%n)IJ|5AwSl}Da|=pUQ00^%6)d@aNswP`oRNQaL>oX5_84X5f_6~)1+FfT@fRF^3y z3ZhS%)DPkao|g-8fA;Elh;jR_=@6&m(}fUq;gwYoVc_kXAg<894G{hL(I+8}eN!*M zx4X*W8xP;cX?0;pbq+h&2cquv%|M9obHHfUFWGfCvQV_qzm{1JvmE7Du*< z^Wl(nkp1?67e;wK%lZ;SED?E;g1pIxe94deNl5_|NI?`#ok&HUsSDjkYT`&kzacG! 
zP$-3wj&7%})Q!5+9dsx4pq>;?JoTaoiliv&P0@50#ZVuLr8tVG1k#g%5-Ew2DTVH) zzLZM$P(SKVX_QU_Xdq?KAR0`WbT4JmeUwc(G=zrIFd9zxQ!b4lBaNg7XcUd62k9Xi zLl4sj?9!t7R=zkDY>ma%`>>pTTQsdyjX7^CJw}d1fci!Q5cBr$t2<;2_z#S z#f#M{;$r=MnK%#+5`Z2UKq5#2$^O3ngOS@Y5@6R6^sws)2H16kMA&tNB-nL?WY~3t z6xg*vYD$tg-11VAWl3p&P+}0?8kDyNWqX6OBLh;jX=6Z&>Hp7zQpKrrI1)EHemWLM z@rOC7;X*wh69EzOLxR|PR?LpzqNQv`eu}7bnzH5cO-HQ5N-ns`ySGa89`|dm6?ib+ ze1#NTAg5{yE>yPHeOxR>5qFPf5}n5@M4QQWEq@ z33`3oH`(kanM-YQqbt8o7cnPcoRp4c?3j6`Q0$v5H!5UBv z)`E55k6=A`4ZIF&zy?qY>cAVo3El)7!6vX7Yyn%rHt-hM4t9XI!A|fF_!HO#-UYkC z9aDl_%2sjEp0>{9|;5hgMoB-(FTRSpryl(Bt z@PP8%skQgO^UVAAKm7iOUK7emr(>9Am~M;c?6?a@E7Ql!KO6J$J(b-?oUu&X5t+TV zrs45T87DS|tA-5@TeWBZmfVR;LszgLK0Mb(0{2F>Zk;>ZwQc8|>7ze!eeY|0`=nw+ z@xhe`qQ4wfP*a?|WdH1jwKG=lU2^gB-G+BUFIY_H8t$Jpa`lz4msVbAygW3$%uu%f zi|vEAjo@0<=SbzEx=e_r+dmBjzL`lK~?m8jy+ z4~TDeKomv+{L;iT>@~RS%oXzfM#V@b3`8Z`ie8ws>&;ADmq@YUU@CN)I+H7*y9QQ*$^wywB_lrPag66c;X=m3n#0<>cPB zvB$48WY3JhGN7_x%cT<|RLYMwXXh<&26a>Ug-4#I1?L^LOO;u%xp!oLjReR}bLX2)wa?wVO^TwfWczgnGr>E!v}uI$vhKL7E7Pa!|44U9G@ztl20KhI>h zOtco6IVHm>!b$5d6ckzQ0(NTQ>YR!$x@fd4!x%`bHkmC`O!i0Zg0;w0gnePYSuk_K z3>Sz}ouxcmX^F);z${oMX61*Yo5h?Ui>;`rK=4QvqbLiCCm@OuvBG;|pCASaQ()@LF-qvZTmw1J*L|T7S`oQ6)jCA9m@Icvjpp-Ho-8k@G$(5#_ z4bg?1;TavL2;o#A9vy(MQ1z^)TBUia6aQ8K@2AuHHGk{ES<^>b4FjVaWib55{kbY) z7}0g0=sMztU3h)h$#ENlHS3i8X^yYxq|6k2yQvUJy6S4%NfIO9n6iD+lvX| zm^Rw0Wt_^vax}yF#-6EKmcgHfiI5Zlf2G;w;Vruus~OL~8_vHNuG6w{_>*cVoiY6K z%;q|^*8(lGqooBl@ca|}O}4SCy?eKvBUSpJ zD97;Mcvc+~$fN4=x+qsLUX#e5ib_qy#};{>ki7C;T`Xm@c}h*Cl0Oy4FYT?GmMHXp>Dy5oeBtj2SBUGqRJa}x~bVs{3iJ?hTdXXMH ziJ!oOpP={&iU&V{N6#L`lOMo?@6BZ0)U;q==e?c%{eJJ?Og8R6tx6%wa$#L^2>`{Q z`Q`J6)td#7Am^o8#_h5!$O1qqzF1|E-W%!tF+XRs^yJkTe2! 
zfQGX)0McFQbB)s|CYwP3kUq$?vCC-b!B8O@;y{s}DR>Cbbl>pv7!H`Y-G(VRDOt6n zBlMgL4M^Z;CD0xQuwm$PZTG);%Za9(06q) z@QUdj$vlPR>@D*^{}OT&fJA3Wv_XyerzN%(HpFwaqdzj;3%tC&P-A1&PwjyInoo6Y zvTJAPW_?Y2GOpq+*WVb{5!-12WS*;IdLQ(0A&uwim>f$s4jO~4aMVvp3N1W=@IjoGt3Ix06g~h_k~mLXFfL{XM5Yb1tPF~(tVt>htO~nU)Ix+J2vvcDwehaW5`ST@ z4dK9nQw0bi#3AB>gg9`l5aJKuQ035550&76IHR69pdOf+=d9nR`EaSg%D%6C^ZU)r zn>VxBD!=Y<#&djWh3ha#@f2fElyt_JuEUV+xM6*F!YzwDr=ZNNNfZrG@CWEKd`{Ug zWyuZr=44R=#Cn|%kpAQi3bw@;73}F1iH&WD@hX&o&_|^(HVMIQtOOg91GLIr=MoSrO zCz)C;;I!>fi;_QUd_$3ZN0Ho?l4zUyL_(3vM?$0_2}!5#`TOK_=#lAnjlZohbrhx| zF{P8BTDRBfI<|zCWe_HW- zb`|+_4gD8*r``Zfxg`3+gZw(PvfiM+=k3(`^n4w z4xa}|q_m`h#+(fc&FR1(5ozQJE zjtUuIzP~)b`T5-93iy{HVh+J;okp!6_J9{h@s$a*NHqe#={cYX0Ln?>EHOhWpY9KK zRLL2bBe&GVnE=6PyKop_Kbo5)b5RA8P+M*nAawL#8sfYL@iy?m%iuU>C16GBo0RKk z_iw2Y^STNMi!KG0rh+cA>0tlus~^0Hz@h zG~wBI0MvlGT%gt|D&q)R$3vO~I!a4~e^V1T>{|xlV-2?q4345Qj_}P%@s%t$X1JLI z2CR(O=E_B795Kk_96N`yBRLjB_YXiYk3J8^BRsu_S0hYv_|`BZU;uZV5 z6ddUgj&$f0M|~Xs!+(Qch;fcnFZt}ACK~pA;7tsj0ya)D8pXMPczo_-X+EYGC3H~9 z6n|{MjsBR71ai;wMZG9|3Goo+#PPBAw7 zA2g!sOA7t1iI41q3*a97o|I*|sEp&vSXik+P8TKdvi;RGL_Bv0ouWZoxl_{{SuA9}fTk literal 0 HcmV?d00001 diff --git a/datafusion/core/tests/data/deep_projections/list_struct_map/table.sql b/datafusion/core/tests/data/deep_projections/list_struct_map/table.sql new file mode 100644 index 0000000000000..4c51a4ac8588e --- /dev/null +++ b/datafusion/core/tests/data/deep_projections/list_struct_map/table.sql @@ -0,0 +1,101 @@ +CREATE OR REPLACE TABLE table1 ( + _acp_system_metadata STRUCT( + acp_sourceBatchId VARCHAR, + commitBatchId VARCHAR, + ingestTime INT8, + isDeleted BOOL, + rowId VARCHAR, + rowVersion INT8, + trackingId VARCHAR + ), + orderData STRUCT( + productList STRUCT( + SKU VARCHAR, + quantity INT4, + priceTotal FLOAT + )[], + date TIMESTAMP + ), + list_of_struct_of_list_of_struct STRUCT( + field1 VARCHAR, + field2 STRUCT( + subfield1 VARCHAR, + subliststruct1 STRUCT( + name VARCHAR, + price FLOAT + )[] + )[], + fieldmap MAP( + VARCHAR, + 
STRUCT( + prop1 VARCHAR, + val1 VARCHAR + )[] + ) + )[], + _id VARCHAR, + _ACP_BATCHID VARCHAR +); + +INSERT INTO table1 VALUES( + { + acp_sourceBatchId : 'batch1', + commitBatchId : 'batch1', + ingestTime : 1, + isDeleted : false, + rowId : 'row1', + rowVersion : 1, + trackingId : 't1' + }, + { + productList : [ + { + SKU : 'sku1', + quantity : 1, + priceTotal : 10.0 + }, + { + SKU : 'sku2', + quantity : 10, + priceTotal : 100.0 + } + ], + date: '2025-01-01 13:00:00' + }, + [ + { + field1 : 'v1', + field2 : [ + { + subfield1 : 'v11', + subliststruct1 : [ + {name : 'n1', price : 100.0}, + {name : 'n2', price : 100.0}, + ] + }, + { + subfield1 : 'v12', + subliststruct1 : [ + {name : 'n1', price : 100.0}, + {name : 'n2', price : 100.0}, + ] + } + ], + fieldmap: MAP { + 'key1': [ + { prop1: 'prop1', val1: 'val1'}, + { prop1: 'prop2', val1: 'val2'} + ], + 'key2': [ + { prop1: 'prop1', val1: 'val1'}, + { prop1: 'prop2', val1: 'val2'} + ], + } + } + ], + 'id1', + 'batch1' +); + +COPY table1 TO 'table.parquet' (FORMAT PARQUET); + diff --git a/datafusion/core/tests/data/deep_projections/mid_values_2/mid_values_2.sql b/datafusion/core/tests/data/deep_projections/mid_values_2/mid_values_2.sql new file mode 100644 index 0000000000000..194619ba72173 --- /dev/null +++ b/datafusion/core/tests/data/deep_projections/mid_values_2/mid_values_2.sql @@ -0,0 +1,210 @@ +CREATE OR REPLACE TABLE midvalues ( + -- _acp_system_metadata STRUCT( + -- acp_sourceBatchId VARCHAR, + -- commitBatchId VARCHAR, + -- ingestTime INT8, + -- isDeleted BOOL, + -- rowId VARCHAR, + -- rowVersion INT8, + -- trackingId VARCHAR + -- ), + -- channel STRUCT( + -- _id VARCHAR, + -- typeAtSource VARCHAR + -- ), + timestamp TIMESTAMP_S, + _id VARCHAR, + productListItems STRUCT( + SKU VARCHAR, + quantity INT4, + priceTotal FLOAT, + _experience STRUCT( + event1 VARCHAR, + event2 VARCHAR + ) + )[], + -- commerce STRUCT( + -- order STRUCT( + -- purchaseID VARCHAR, + -- currencyCode VARCHAR + -- ), + -- purchases 
STRUCT(value FLOAT), + -- productViews STRUCT(value FLOAT) + -- ), + -- dataSource STRUCT( + -- _id VARCHAR + -- ), + -- device STRUCT( + -- typeIDService VARCHAR + -- ), + -- search STRUCT( + -- searchEngine VARCHAR, + -- keywords VARCHAR + -- ), + -- receivedTimestamp TIMESTAMP_S, + endUserIDs STRUCT( + _experience STRUCT( + aaid STRUCT( + id VARCHAR, + namespace VARCHAR + ), + mcid STRUCT( + id VARCHAR, + namespace VARCHAR + ) + ) + ), + -- identityMap MAP( + -- VARCHAR, + -- STRUCT( + -- id VARCHAR, + -- primary BOOLEAN + -- )[] + -- ), + web STRUCT( + webPageDetails STRUCT( + name VARCHAR, + pageViews STRUCT(value INT8) + ) + ), + -- placeContext + -- marketing + -- userActivityRegion + -- environment + _experience STRUCT( + analytics STRUCT( + customDimensions STRUCT( + eVars STRUCT( + eVar40 VARCHAR, + eVar1 VARCHAR + ) + ), + environment STRUCT( + browserID INT4 + ) + ) + -- target + -- decisioning + ), + _ACP_DATE DATE, + _ACP_BATCHID VARCHAR +); + +INSERT INTO midvalues VALUES( + '2025-01-01 13:00:00', + '1', + [ + { + SKU : 'sku1', + quantity : 1, + priceTotal : 10.0, + _experience: { + event1: 'event11', + event2: 'event12' + } + }, + { + SKU : 'sku2', + quantity : 10, + priceTotal : 100.0, + _experience: { + event1: 'event21', + event2: 'event22' + } + } + ], + { + _experience: { + aaid: { + id: 'aaid1', + namespace: 'nsaaid1' + }, + mcid: { + id: 'mcid1', + namespace: 'nsmcid1' + } + } + }, + { + webPageDetails: { + name: 'page1', + pageViews: {value: 100} + } + }, + { + analytics: { + customDimensions: { + eVars: { + eVar40: 'entity1', + eVar1: 'xxx1' + } + }, + environment: { + browserID: 1 + } + } + }, + '2025-01-01', + 'batch1' +); + +INSERT INTO midvalues VALUES( + '2025-02-01 13:00:00', + '1', + [ + { + SKU : 'sku1', + quantity : 1, + priceTotal : 10.0, + _experience: { + event1: 'event11', + event2: 'event12' + } + }, + { + SKU : 'sku2', + quantity : 10, + priceTotal : 100.0, + _experience: { + event1: 'event21', + event2: 'event22' + } + } + 
], + { + _experience: { + aaid: { + id: 'aaid1', + namespace: 'nsaaid1' + }, + mcid: { + id: 'mcid1', + namespace: 'nsmcid1' + } + } + }, + { + webPageDetails: { + name: 'page1', + pageViews: {value: 100} + } + }, + { + analytics: { + customDimensions: { + eVars: { + eVar40: 'entity1', + eVar1: 'xxx1' + } + }, + environment: { + browserID: 1 + } + } + }, + '2025-02-01', + 'batch2' +); + +COPY midvalues TO 'midvalues.parquet' (FORMAT PARQUET); + diff --git a/datafusion/core/tests/data/deep_projections/mid_values_2/midvalues.parquet b/datafusion/core/tests/data/deep_projections/mid_values_2/midvalues.parquet new file mode 100644 index 0000000000000000000000000000000000000000..439123e4ac02a63ac1a06696c020ef584054b979 GIT binary patch literal 4336 zcmbtYO>7%Q6n?uIJL|+vw;{VYt&z6Si~fot=5} z-uKO$dGltRnfyl?OIVI|WEvkLGEdOGufDzgMx1DN2Ygf%rXM9Lumsl z%#;x0M21kTKC@{J`NKBS%-o0f`4NbmJkf)b&xSB&jr(J;XM4f#uDI1kCex~Bc{+CX zDHisrJ6^VpxBDDz!NW@UtCMhI7+;}KE^WI)!doG%w{O!w%EZo=+-PRFoR&5`5wZE7LVlx0wbkY`Jk|TcTJe5r_5a>gsMLhL!Z^TO>J- zcNteW?5})C^bKB^?7T}ytua4Nho-TKX}-`XUd*&=99263ajG#xvVa;)b#Fcm29I3l7hJj^)V zM^<7>9Uq5PX$=HUM4gaEh-UCMO<*@+WTJ3L@M^75M7@bVCMS&yjjJln(liYDrKG@c)}=e`I!B*N=P zsSw!;f2bP zDwIb>L|2t4XP8PGN;!d%p~0FQ9f%K0AKrZJ7^U`=mW!84^KT&gX?x-ng@H%>R)g*r~)I2{uyQGvwL zS*O>nxC=;Hmf>4}{J|MmTRM=8&zB1-osGOomH|QEQmtHcXBQiV3fuP2xvNWVt?X8d zu7zWp+2h{X#t`jh)*DZsTP>ZbyS3Ak7)ciZKngxY0xVrQqHq$`0{mSn6kQKW@DZ>F zy`IbNyvH?w+6>NsNw-lbSL%VSU|q)s=E>v8t8>Bj6=S z%-0q#t106(P?EX)shPRS{Oo%pNpFsmN^&wkdur=C;6*sohM?@qyptIQ*h_(3-WtSu4(C%mcbt&bgxG8 z?tv{vQymehA~A@;9>g9X4g0!g-_YcSE*}F_;O>`IWkw=&+bxpU1_OAV$FvK4E1VU_ znr5$S@^+8>R|f~*3RCL*VUBi- zvwLpmi3)E#f5R!ZTbjM0$pYgumg%Rl?zhq`F{AJ31lgyf@C;D^diQ(b`D znb^y~isQWfiQMd$5Brh0O6s`7mTlD(Q>)dMll*Bo;PbXtIJVj;yd!ygY1>cNt)O_4 zl=qSx;oddBRq5&rrBLpzW zd+j#KAJYMhcdBeT%M{X8drgx!O!;tsIu%zxNV5KyPZ|-Gk^K4GT21f7#R31m>3usm{?eWWNvb13Vs!NiMgrxRS;)}ktmbo z98(4f-jveh?3ASV;^d6f+(c0(22&;puGGAg(Bjl0PnTkd0C#+9ML}v&W@=t?swfLc z2@A*;AXAh_l}UmrGewmF!ejxt56A&2Nle76M2tyNM$$)(K}M8EQj|f`1Iboo`@l8< 
zZ4hM=V-j-^b5Y~4kzfV;5lV?MXrS-~1%UxcoUvfn!VUHT8Vn9zC`IuT%j~8*F2>qW zxv<4Ru05qtpEY|TwJBd&5G{W0iq5e)p)c0Nuq`^aXjy_NlA_2I0Fotefo-0G&?F3d zv*%zbNsRqKqF766O#>x~q7kI|<{_A3dO*uYw5UekH*SU~N`FyJBw+ffT1AHNY5dvX zE_Ro-PGhdyyS(2tJzPS8Bc@aT0UW*bnjr%}R4SA<$as%}b>p$pDa`v=M zS}~qMJ2E+UQvmvNwO{s-JLmWQfd9w4)cx%J1+?dUe(Rmy1t~V7^f8FiX%jvQv#j?; l$F*-;dOw%VXY+cx-m{xr-{+iJg`M3-V+$Px5vsz!;SDdr-Z20G literal 0 HcmV?d00001 diff --git a/datafusion/core/tests/data/deep_projections/triplea/midvalues.sql b/datafusion/core/tests/data/deep_projections/triplea/midvalues.sql new file mode 100644 index 0000000000000..2469a348d4921 --- /dev/null +++ b/datafusion/core/tests/data/deep_projections/triplea/midvalues.sql @@ -0,0 +1,79 @@ +CREATE OR REPLACE TABLE midvalues( + timestamp TIMESTAMP_S, -- 0 + web STRUCT( + webPageDetails STRUCT( + pageViews STRUCT(value INT8) + ) + ), + endUserIDs STRUCT( + _experience STRUCT( + mcid STRUCT( + id VARCHAR, + extra1 VARCHAR + ), + aaid STRUCT( + id VARCHAR, + extra1 VARCHAR + ) + ) + ) +); + +INSERT INTO midvalues VALUES +( + '2025-01-15 00:00:01', + { + webPageDetails : { + pageViews : { + value : 100 + } + } + }, + { + _experience : { + mcid : { + id : 'mcid1', + extra1 : 'extram1' + }, + aaid : { + id : 'mcid1', + extra1 : 'extram1' + } + } + } +); + +CREATE OR REPLACE TABLE extra_user_data( + endUserIDs STRUCT( + _experience STRUCT( + mcid STRUCT( + id VARCHAR, + name VARCHAR + ), + aaid STRUCT( + id VARCHAR, + name VARCHAR + ) + ) + ) +); + +INSERT INTO extra_user_data VALUES +( + { + _experience : { + mcid : { + id : 'mcid1', + name : 'name1' + }, + aaid : { + id : 'mcid1', + name : 'name2' + } + } + } +); + +COPY midvalues TO 'midvalues.parquet' (FORMAT PARQUET); +COPY extra_user_data TO 'extra_user_data.parquet' (FORMAT PARQUET); + diff --git a/datafusion/core/tests/optimizer_deep_indices/mod.rs b/datafusion/core/tests/optimizer_deep_indices/mod.rs new file mode 100644 index 0000000000000..a62f0ff988e93 --- /dev/null +++ 
b/datafusion/core/tests/optimizer_deep_indices/mod.rs @@ -0,0 +1,1640 @@ +use arrow::util::display::FormatOptions; +use arrow::util::pretty; +use datafusion::execution::SessionStateBuilder; +#[allow(dead_code, clippy::let_unit_value)] +use datafusion::prelude::{ParquetReadOptions, SessionContext}; +use datafusion::test::object_store::local_unpartitioned_file; +use datafusion_common::Result; +use datafusion_common::tree_node::{TreeNode, TreeNodeRecursion}; +use datafusion_datasource::file::FileSource; +use datafusion_datasource::source::DataSourceExec; +use datafusion_datasource_parquet::leaves::projection_specifier; +use datafusion_datasource_parquet::metadata::DFParquetMetadata; +use datafusion_datasource_parquet::source::ParquetSource; +use datafusion_execution::cache::cache_manager::FileMetadataCache; +use datafusion_execution::config::SessionConfig; +use datafusion_execution::runtime_env::RuntimeEnv; +use datafusion_functions::core::getfield::HANDLE_STRUCT_IN_LIST; +use datafusion_physical_plan::{collect, displayable}; +use log::info; +use object_store::local::LocalFileSystem; +use object_store::{ObjectMeta, ObjectStore}; +use parquet::encryption::decrypt::FileDecryptionProperties; +use parquet::file::metadata::ParquetMetaData; +use parquet::schema::types::SchemaDescriptor; +use std::collections::HashMap; +use std::ops::Deref; +use std::sync::Arc; +use datafusion_datasource_parquet::push_all_projection_hints::PushAllProjectionHints; + +#[cfg(test)] +#[ctor::ctor] +fn init() { + let _ = env_logger::try_init(); +} + +pub async fn fetch_parquet_metadata_test( + store: &dyn ObjectStore, + object_meta: &ObjectMeta, + size_hint: Option, + #[allow(unused)] decryption_properties: Option>, + file_metadata_cache: Option>, +) -> Result> { + DFParquetMetadata::new(store, object_meta) + .with_metadata_size_hint(size_hint) + .with_decryption_properties(decryption_properties) + .with_file_metadata_cache(file_metadata_cache) + .fetch_metadata() + .await +} + +fn 
build_context() -> SessionContext { + let config = SessionConfig::new() + .set_bool("datafusion.sql_parser.enable_ident_normalization", false) + .set_usize("datafusion.optimizer.max_passes", 2); + + let builder = SessionStateBuilder::new() + .with_config(config) + .with_runtime_env(Arc::new(RuntimeEnv::default())) + .with_default_features() + .with_physical_optimizer_rule(Arc::new(PushAllProjectionHints {})); + + SessionContext::new_with_state(builder.build()) +} + +pub type DeepColumnIndexMap = HashMap>; + +async fn run_deep_projection_optimize_test( + ctx: &SessionContext, + query: &str, + tests: Vec, +) -> Result<()> { + let plan = ctx + .state() + .create_logical_plan(query) + .await + .expect("Error creating logical plan"); + info!("Logical plan: {}", plan.display_indent()); + let optimized_plan = ctx.state().optimize(&plan).expect("Error optimizing plan"); + info!("Optimized plan: {}", optimized_plan.display_indent()); + let state = ctx.state(); + let query_planner = state.query_planner().clone(); + let physical_plan = query_planner + .create_physical_plan(&optimized_plan, &state) + .await + .expect("Error creating physical plan"); + info!( + "Physical plan: {}", + displayable(physical_plan.deref()).indent(true) + ); + let mut deep_projections_computed_with_physical_rule: Vec = + vec![]; + let _ = physical_plan.apply(|pp| { + if let Some(dse) = pp.as_any().downcast_ref::() { + if let Some((_file_scan_conf, parquet_source)) = + dse.downcast_to_file_source::() + { + deep_projections_computed_with_physical_rule.push(projection_specifier( + parquet_source.table_schema().file_schema().clone(), + // SAFETY - the projection function just makes an option, but the field exists + parquet_source.projection().unwrap(), + Some(&parquet_source.projection_hints), + &parquet_source.projection_hints_indices, + )); + } + } + Ok(TreeNodeRecursion::Continue) + }); + info!( + "Checking if plan has these deep projections: {:?} xxxxxxxxxxx {:?}", + 
&deep_projections_computed_with_physical_rule, tests + ); + assert_eq!( + deep_projections_computed_with_physical_rule.len(), + tests.len() + ); + + for i in 0..deep_projections_computed_with_physical_rule.len() { + let l = deep_projections_computed_with_physical_rule[i].clone(); + let r = tests[i].clone(); + + assert_eq!(l.len(), r.len(), "key count differs"); + for (k, lv) in l { + let rv = r.get(&k).unwrap_or_else(|| panic!("missing key {k}")); + let mut l = lv.clone(); + let mut r = rv.clone(); + l.sort(); + r.sort(); + assert_eq!(l, r, "values differ for key {k}"); + } + } + info!( + "COMPUTED: {:?}", + deep_projections_computed_with_physical_rule + ); + info!("EXPECTED: {:?}", tests); + let results = ctx + .execute_logical_plan(optimized_plan) + .await? + .collect() + .await?; + info!("Results: {}", results.len()); + Ok(()) +} + +#[tokio::test] +async fn test_deep_projections_1() -> Result<()> { + let parquet_path = format!( + "{}/tests/data/deep_projections/first.parquet", + env!("CARGO_MANIFEST_DIR") + ); + + let ctx = build_context(); + ctx.register_parquet( + "cross_industry_demo_data", + parquet_path, + ParquetReadOptions::default(), + ) + .await?; + + let _ = run_deep_projection_optimize_test( + &ctx, + r#" + WITH events AS ( + SELECT + endUserIDs.aaid_id as DeviceId, + _experience.eVar56 as UserId, + timestamp + FROM + cross_industry_demo_data + WHERE _ACP_DATE='2025-01-03' + ) + SELECT + events.*, + LAG(UserId, 1) OVER (PARTITION BY DeviceId ORDER BY events.timestamp) AS PreviousUserColName, + cross_industry_demo_data._experience.eVar56 + FROM events + INNER JOIN cross_industry_demo_data on events.DeviceId = cross_industry_demo_data.endUserIDs.aaid_id + LIMIT 100 + "#, + vec![ + HashMap::from([(0, vec![]), (1, vec![]), (2, vec!["aaid_id".to_string()]), (3, vec!["eVar56".to_string()])]), + HashMap::from([(2, vec!["aaid_id".to_string()]), (3, vec!["eVar56".to_string()])]) + ], + ).await?; + + let _ = run_deep_projection_optimize_test( + &ctx, + r#" + 
SELECT + count(*) as count_events + FROM cross_industry_demo_data + WHERE + (_ACP_DATE BETWEEN '2023-01-01' AND '2025-02-02') + AND _experience.eVar56 is not null + LIMIT 100 + "#, + vec![HashMap::from([ + (0, vec![]), + (3, vec!["eVar56".to_string()]), + ])], + ) + .await?; + + let _ = run_deep_projection_optimize_test( + &ctx, + r#" + SELECT + endUserIDs + FROM cross_industry_demo_data + WHERE + (_ACP_DATE BETWEEN '2025-01-01' AND '2025-01-02') + AND _experience.eVar56 is not null + LIMIT 10 + "#, + vec![HashMap::from([ + (0, vec![]), + (2, vec![]), + (3, vec!["eVar56".to_string()]), + ])], + ) + .await?; + + let _ = run_deep_projection_optimize_test( + &ctx, + r#" + SELECT + * + FROM cross_industry_demo_data + WHERE + (_ACP_DATE BETWEEN '2023-01-01' AND '2025-02-02') + AND _experience.eVar56 is not null + LIMIT 100 + "#, + vec![HashMap::from([ + (0, vec![]), + (1, vec![]), + (2, vec![]), + (3, vec![]), + ])], + ) + .await?; + + Ok(()) +} + +#[tokio::test] +#[allow(clippy::let_unit_value)] +async fn test_deep_projections_genstudio() -> Result<()> { + let ctx = build_context(); + let _ = ctx.register_parquet( + "meta_asset_summary_metrics", + format!("{}/tests/data/deep_projections/genstudio/meta_asset_summary_metrics.parquet", env!("CARGO_MANIFEST_DIR")), + ParquetReadOptions::default(), + ).await?; + let _ = ctx.register_parquet( + "meta_asset_summary_metrics_by_age_and_gender", + format!("{}/tests/data/deep_projections/genstudio/meta_asset_summary_metrics_by_age_and_gender.parquet", env!("CARGO_MANIFEST_DIR")), + ParquetReadOptions::default(), + ).await?; + let _ = ctx.register_parquet( + "meta_asset_featurization", + format!("{}/tests/data/deep_projections/genstudio/meta_asset_featurization.parquet", env!("CARGO_MANIFEST_DIR")), + ParquetReadOptions::default(), + ).await?; + + // Stats: Asset summary metrics + let _ = run_deep_projection_optimize_test( + &ctx, + r#" + SELECT + count(*) AS cnt + FROM + meta_asset_summary_metrics + WHERE + _ACP_DATE = 
'2024-12-01' + "#, + vec![HashMap::from([(3, vec![])])], + ) + .await?; + + // Preview: Asset summary metrics + let _ = run_deep_projection_optimize_test( + &ctx, + r#" + SELECT + * + FROM + meta_asset_summary_metrics + LIMIT 100 + "#, + vec![HashMap::from([(0, vec![]), (1, vec![]), (2, vec![]), (3, vec![]), (4, vec![])])], + ) + .await?; + + // Agg: Count assets by age + let _ = run_deep_projection_optimize_test( + &ctx, + r#" + SELECT + count (*) AS cnt, + _aresstagevalidationco['genStudioInsights']['age'] AS age + FROM + meta_asset_summary_metrics_by_age_and_gender + WHERE + _ACP_DATE = '2024-12-01' + GROUP BY + age + ORDER BY + cnt DESC + LIMIT + 10 + "#, + vec![HashMap::from([ + (2, vec!["genStudioInsights.age".to_string()]), + (3, vec![]), + ])], + ) + .await?; + + // Agg: clicks by url + let _ = run_deep_projection_optimize_test( + &ctx, + r#" + SELECT + AVG( + asset_metrics._aresstagevalidationco.genStudioInsights.metrics.performance.clicks.value + ) AS clicks, + asset_meta._aresstagevalidationco.contentAssets.assetThumbnailURL AS asset_url + FROM + (meta_asset_featurization AS asset_meta + INNER JOIN meta_asset_summary_metrics AS asset_metrics ON ( + asset_meta._aresstagevalidationco['contentAssets']['assetID'] = asset_metrics._aresstagevalidationco['genStudioInsights']['assetID'] + )) + WHERE + _ACP_DATE = '2024-12-01' + GROUP BY + asset_url + ORDER BY + clicks DESC + "#, + vec![ + HashMap::from([ + (1, vec!["contentAssets.assetThumbnailURL".to_string(), "contentAssets.assetID".to_string()]), + ]), + HashMap::from([ + (2, vec!["genStudioInsights.metrics.performance.clicks.value".to_string(), "genStudioInsights.assetID".to_string()]), + (3, vec![]) + ]), + ], + ) + .await?; + + // Agg: clicks by url + let _ = run_deep_projection_optimize_test( + &ctx, + r#" + SELECT + talias.tmetrics_aresstagevalidationco.genStudioInsights.metrics.performance.clicks.value as clicks, + talias.tfeatures_aresstagevalidationco.contentAssets.assetThumbnailURL AS asset_url + 
FROM ( + SELECT + asset_metrics._aresstagevalidationco AS tmetrics_aresstagevalidationco, + asset_meta._aresstagevalidationco AS tfeatures_aresstagevalidationco + FROM + meta_asset_featurization AS asset_meta + INNER JOIN meta_asset_summary_metrics AS asset_metrics ON ( + asset_meta._aresstagevalidationco.contentAssets.assetID = asset_metrics._aresstagevalidationco.genStudioInsights.assetID + ) + WHERE + _ACP_DATE = '2024-12-01' + ) AS talias + ORDER BY + clicks DESC + "#, + vec![ + HashMap::from([ + (1, vec!["contentAssets.assetThumbnailURL".to_string(), "contentAssets.assetID".to_string()]), + ]), + HashMap::from([ + (2, vec!["genStudioInsights.metrics.performance.clicks.value".to_string(), "genStudioInsights.assetID".to_string()]), + (3, vec![]) + ]), + ], + ) + .await?; + + // SQL Editor + let _ = run_deep_projection_optimize_test( + &ctx, + r#" + SELECT + _ACP_DATE DAY, + _aresstagevalidationco.genStudioInsights.campaignID campaign_id, + SUM( + _aresstagevalidationco.genStudioInsights.metrics.spend.value + ) total_spend + FROM + meta_asset_summary_metrics + WHERE + _ACP_DATE BETWEEN '2024-12-01' AND '2024-12-15' + GROUP BY + DAY, + campaign_id + ORDER BY + DAY, + total_spend DESC, + campaign_id + "#, + vec![ + HashMap::from([ + ( + 2, + vec![ + "genStudioInsights.campaignID".to_string(), + "genStudioInsights.metrics.spend.value".to_string(), + ], + ), + (3, vec![]), + ]) + ], + ) + .await?; + + Ok(()) +} + +#[tokio::test] +async fn test_very_complicated_plan() -> Result<()> { + let ctx = build_context(); + let _ = ctx + .sql( + r#" + CREATE OR REPLACE TABLE fact_profile_overlap_of_namespace ( + merge_policy_id INT8, + date_key DATE, + overlap_id INT8, + count_of_profiles INT8 + ); + "#, + ) + .await?; + + let _ = ctx + .sql( + r#" + CREATE OR REPLACE TABLE dim_overlap_namespaces ( + overlap_id INT8, + merge_policy_id INT8, + overlap_namespaces VARCHAR + ); + "#, + ) + .await?; + + let _ = ctx + .sql( + r#" + CREATE OR REPLACE TABLE 
fact_profile_by_namespace_trendlines ( + namespace_id INT8, + merge_policy_id INT8, + date_key DATE, + count_of_profiles INT8 + ); + "#, + ) + .await?; + + let _ = ctx + .sql( + r#" + CREATE OR REPLACE TABLE dim_namespaces ( + namespace_id INT8, + namespace_description VARCHAR, + merge_policy_id INT8 + ); + "#, + ) + .await?; + let query = r#" +SELECT + sum(overlap_col1) overlap_col1, + sum(overlap_col2) overlap_col2, + coalesce(Sum(overlap_count), 0) overlap_count +FROM + ( + SELECT + 0 overlap_col1, + 0 overlap_col2, + Sum(count_of_profiles) overlap_count + FROM + fact_profile_overlap_of_namespace + WHERE + fact_profile_overlap_of_namespace.merge_policy_id = -115008144 + AND fact_profile_overlap_of_namespace.date_key = '2024-11-06' + AND fact_profile_overlap_of_namespace.overlap_id IN ( + SELECT + a.overlap_id + FROM + ( + SELECT + dim_overlap_namespaces.overlap_id overlap_id, + count(*) cnt_num + FROM + dim_overlap_namespaces + WHERE + dim_overlap_namespaces.merge_policy_id = -115008144 + AND dim_overlap_namespaces.overlap_namespaces IN ( + 'aaid', + 'ecid' + ) + GROUP BY + dim_overlap_namespaces.overlap_id + ) a + WHERE + a.cnt_num > 1 + ) + UNION + ALL + SELECT + count_of_profiles overlap_col1, + 0 overlap_col2, + 0 overlap_count + FROM + fact_profile_by_namespace_trendlines + JOIN dim_namespaces ON fact_profile_by_namespace_trendlines.namespace_id = dim_namespaces.namespace_id + AND fact_profile_by_namespace_trendlines.merge_policy_id = dim_namespaces.merge_policy_id + WHERE + fact_profile_by_namespace_trendlines.merge_policy_id = -115008144 + AND fact_profile_by_namespace_trendlines.date_key = '2024-11-06' + AND dim_namespaces.namespace_description = 'aaid' + UNION + ALL + SELECT + 0 overlap_col1, + count_of_profiles overlap_col2, + 0 overlap_count + FROM + fact_profile_by_namespace_trendlines + JOIN dim_namespaces ON fact_profile_by_namespace_trendlines.namespace_id = dim_namespaces.namespace_id + AND fact_profile_by_namespace_trendlines.merge_policy_id = 
dim_namespaces.merge_policy_id + WHERE + fact_profile_by_namespace_trendlines.merge_policy_id = -115008144 + AND fact_profile_by_namespace_trendlines.date_key = '2024-11-06' + AND dim_namespaces.namespace_description = 'ecid' + ) a; + "#; + let _ = run_deep_projection_optimize_test(&ctx, query, vec![]).await?; + let plan = ctx.state().create_logical_plan(query).await?; + info!("plan: {}", &plan); + let optimized_plan = ctx.state().optimize(&plan)?; + info!("optimized: {}", &optimized_plan.display_indent()); + let result = ctx.execute_logical_plan(optimized_plan).await?; + // let result = ctx.sql(query).await?; + result.show().await?; + + Ok(()) +} + +#[tokio::test] +#[allow(clippy::let_unit_value)] +async fn test_mid_values_window() -> Result<()> { + let ctx = build_context(); + let _ = ctx + .register_parquet( + "midvalues", + format!( + "{}/tests/data/deep_projections/triplea/midvalues.parquet", + env!("CARGO_MANIFEST_DIR") + ), + ParquetReadOptions::default(), + ) + .await?; + + let _ = ctx + .register_parquet( + "extra_user_data", + format!( + "{}/tests/data/deep_projections/triplea/extra_user_data.parquet", + env!("CARGO_MANIFEST_DIR") + ), + ParquetReadOptions::default(), + ) + .await?; + + let _ = run_deep_projection_optimize_test( + &ctx, + r#" + SELECT + timestamp, + web.webPageDetails.pageViews.value AS pageview, + endUserIDs._experience.mcid.id AS mcid, + endUserIDs._experience.aaid.id AS aaid, + COALESCE( + endUserIDs._experience.mcid.id, + endUserIDs._experience.aaid.id + ) AS partitionCol, + LAG(timestamp) OVER( + PARTITION BY COALESCE( + endUserIDs._experience.mcid.id, + endUserIDs._experience.aaid.id + ) + ORDER BY timestamp + ) AS last_event + FROM + midvalues + WHERE + timestamp >= TO_TIMESTAMP('2025-01-15') + AND timestamp < TO_TIMESTAMP('2025-01-16') + + "#, + vec![ + HashMap::from([ + (0, vec![]), + (1, vec!["webPageDetails.pageViews.value".to_string()]), + ( + 2, + vec![ + "_experience.mcid.id".to_string(), + "_experience.aaid.id".to_string(), 
+ ], + ), + ]) + ], + ) + .await?; + + let _ = run_deep_projection_optimize_test( + &ctx, + r#" + SELECT + extra_user_data.endUserIDs['_experience']['mcid']['name'], + midvalues.timestamp, + midvalues.web['webPageDetails']['pageViews']['value'] AS pageview, + midvalues.endUserIDs['_experience']['mcid']['id'] AS mcid, + midvalues.endUserIDs['_experience']['aaid']['id'] AS aaid, + COALESCE( + midvalues.endUserIDs['_experience']['mcid']['id'], + midvalues.endUserIDs['_experience']['aaid']['id'] + ) AS partitionCol, + LAG(timestamp) OVER( + PARTITION BY COALESCE( + midvalues.endUserIDs['_experience']['mcid']['id'], + midvalues.endUserIDs['_experience']['aaid']['id'] + ) + ORDER BY midvalues.timestamp + ) AS last_event + FROM + midvalues + INNER JOIN + extra_user_data + ON midvalues.endUserIDs['_experience']['mcid']['id'] = extra_user_data.endUserIDs['_experience']['mcid']['id'] + WHERE + midvalues.timestamp >= TO_TIMESTAMP('2025-01-15') + AND midvalues.timestamp < TO_TIMESTAMP('2025-01-16') + + "#, + vec![ + HashMap::from([ + (0, vec![]), + (1, vec!["webPageDetails.pageViews.value".to_string()]), + ( + 2, + vec![ + "_experience.mcid.id".to_string(), + "_experience.aaid.id".to_string(), + ], + ), + ]), + HashMap::from([ + ( + 0, + vec![ + "_experience.mcid.id".to_string(), + "_experience.mcid.name".to_string(), + ], + ), + ]), + ], + ) + .await?; + Ok(()) +} + +#[tokio::test] +#[allow(dead_code, unused_variables)] +async fn test_mid_values_window_execution() -> Result<()> { + let ctx = build_context(); + ctx.register_parquet( + "midvalues", + format!( + "{}/tests/data/deep_projections/triplea/midvalues.parquet", + env!("CARGO_MANIFEST_DIR") + ), + ParquetReadOptions::default(), + ) + .await?; + let query = r#" + SELECT + timestamp, + web.webPageDetails.pageViews.value AS pageview, + endUserIDs._experience.mcid.id AS mcid, + endUserIDs._experience.aaid.id AS aaid, + COALESCE( + endUserIDs._experience.mcid.id, + endUserIDs._experience.aaid.id + ) AS partitionCol, + 
LAG(timestamp) OVER( + PARTITION BY COALESCE( + endUserIDs._experience.mcid.id, + endUserIDs._experience.aaid.id + ) + ORDER BY timestamp + ) AS last_event + FROM + midvalues + WHERE + timestamp >= TO_TIMESTAMP('2025-01-15') + AND timestamp < TO_TIMESTAMP('2025-01-16') + + "#; + + let _ = run_deep_projection_optimize_test( + &ctx, + query, + vec![ + HashMap::from([ + (0, vec![]), + ( + 1, + vec![ + "webPageDetails.pageViews.value".to_string() + ] + ), + ( + 2, + vec![ + "_experience.aaid.id".to_string(), + "_experience.mcid.id".to_string() + ] + ) + ]), + ] + ).await?; + + Ok(()) +} + +async fn dump_parquet_schema(filename: &str) -> Result { + let meta = local_unpartitioned_file(filename); + let store = Arc::new(LocalFileSystem::new()); + let metadata = + fetch_parquet_metadata_test(store.deref(), &meta, None, None, None).await?; + let file_metadata = metadata.file_metadata(); + let parquet_schema = file_metadata.schema_descr(); + // parquet_schema + // .columns() + // .iter() + // .enumerate() + // .for_each(|(i, c)| info!("parquet schema: {} = {} {}", i, c.name(), c.path())); + Ok(parquet_schema.clone()) +} + +#[tokio::test] +#[allow(clippy::let_unit_value, dead_code, unused_variables)] +async fn test_plain_map() -> Result<()> { + let filename = format!( + "{}/tests/data/parquet_map.parquet", + env!("CARGO_MANIFEST_DIR") + ); + + let _ = dump_parquet_schema(filename.as_str()).await?; + + let ctx = build_context(); + let _ = ctx + .register_parquet("table1", filename.clone(), ParquetReadOptions::default()) + .await?; + // let mut query = r#"SELECT orderData.productList.SKU from table1"#; + + let _ = run_deep_projection_optimize_test( + &ctx, + r#"SELECT * from table1"#, + vec![HashMap::from([(0, vec![]), (1, vec![]), (2, vec![])])], + ) + .await?; + + let _ = run_deep_projection_optimize_test( + &ctx, + r#"SELECT strings from table1 WHERE strings['method'] == 'GET'"#, + vec![HashMap::from([(1, vec![])])], + ) + .await?; + + let df = ctx + .sql(r#"SELECT 
ints['bytes'] from table1 WHERE strings['method'] == 'GET'"#) + .await? + .collect() + .await?; + + Ok(()) +} + +#[tokio::test] +async fn test_list_struct_map() -> Result<()> { + if !HANDLE_STRUCT_IN_LIST { + info!("Test disabled !"); + return Ok(()); + } + + let filename = format!( + "{}/tests/data/deep_projections/list_struct_map/table.parquet", + env!("CARGO_MANIFEST_DIR") + ); + + let _ = dump_parquet_schema(filename.as_str()).await?; + + let ctx = build_context(); + ctx.register_parquet("table1", filename.clone(), ParquetReadOptions::default()) + .await?; + + let query = r#" + SELECT + list_of_struct_of_list_of_struct[1]['fieldmap']['key1']['prop1'] + from + table1"#; + + let _ = run_deep_projection_optimize_test( + &ctx, + query, + vec![ + HashMap::from( + [(2, vec!["*.fieldmap.*.*.prop1".to_string()])] + ) + ] + ).await?; + + let plan = ctx.state().create_logical_plan(query).await?; + info!("plan: {}", &plan); + let optimized_plan = ctx.state().optimize(&plan)?; + info!("optimized: {}", &optimized_plan.display_indent()); + + let query_planner = ctx.state().query_planner().clone(); + let physical_plan = query_planner + .create_physical_plan(&optimized_plan, &ctx.state()) + .await + .expect("Error creating physical plan"); + info!( + "physical: {}", + displayable(physical_plan.as_ref()) + .set_show_schema(true) + .indent(true) + ); + let results = collect(physical_plan, ctx.state().task_ctx()).await?; + println!( + "{}", + pretty::pretty_format_batches_with_options(&results, &FormatOptions::default())? 
+ .to_string() + ); + + Ok(()) +} + +#[tokio::test] +async fn test_list_struct_map_with_array_element() -> Result<()> { + if !HANDLE_STRUCT_IN_LIST { + info!("Test disabled !"); + return Ok(()); + } + + let filename = format!( + "{}/tests/data/deep_projections/list_struct_map/table.parquet", + env!("CARGO_MANIFEST_DIR") + ); + + let _ = dump_parquet_schema(filename.as_str()).await?; + + let ctx = build_context(); + ctx.register_parquet("table1", filename.clone(), ParquetReadOptions::default()) + .await?; + let query = r#"SELECT orderData.productList['SKU'] from table1"#; + ctx.sql(query).await?.show().await?; + + let _ = run_deep_projection_optimize_test( + &ctx, + r#"SELECT orderData.productList.SKU from table1"#, + vec![HashMap::from([(1, vec!["productList.*.SKU".to_string()])])], + ) + .await?; + + let _ = run_deep_projection_optimize_test( + &ctx, + r#"SELECT list_of_struct_of_list_of_struct[0]['field2'][0]['subliststruct1']['name'] from table1"#, + vec![HashMap::from([( + 2, + vec!["*.field2.*.subliststruct1.*.name".to_string()], + )])], + ).await?; + + let _ = run_deep_projection_optimize_test( + &ctx, + r#"SELECT list_of_struct_of_list_of_struct[0]['fieldmap']['key1']['prop1'] from table1"#, + vec![HashMap::from([( + 2, + vec!["*.fieldmap.*.*.prop1".to_string()], + )])], + ).await?; + + Ok(()) +} + +#[tokio::test] +#[allow(clippy::let_unit_value)] +async fn test_xxx() -> Result<()> { + let ctx = build_context(); + + let filename = format!( + "{}/tests/data/deep_projections/mid_values_2/midvalues.parquet", + env!("CARGO_MANIFEST_DIR") + ); + + let _ = ctx + .register_parquet( + "mid_values", + filename.clone(), + ParquetReadOptions::default(), + ) + .await?; + + let query = r#" + SELECT + _experience.analytics.customDimensions.eVars.eVar40 as EntityID, + 'MembershipNumber' as EntityType, + web.webPageDetails, + -- productListItems, + productListItems.SKU + FROM mid_values + where + _experience.analytics.customDimensions.eVars.eVar40 is not null + AND 
(_ACP_DATE BETWEEN '2025-01-01' AND '2025-01-02') + LIMIT 20 + "#; + + + let _ = run_deep_projection_optimize_test( + &ctx, + query, + vec![ + HashMap::from([ + (2, vec!["*.SKU".to_string()]), + (4, vec!["webPageDetails".to_string()]), + (5, vec!["analytics.customDimensions.eVars.eVar40".to_string()]), + (6, vec![]) + ]) + ] + ).await?; + + let plan = ctx.state().create_logical_plan(query).await?; + info!("plan: {}", &plan); + let optimized_plan = ctx.state().optimize(&plan)?; + info!("optimized: {}", &optimized_plan.display_indent_schema()); + let query_planner = ctx.state().query_planner().clone(); + let physical_plan = query_planner + .create_physical_plan(&optimized_plan, &ctx.state()) + .await + .expect("Error creating physical plan"); + info!( + "physical: {}", + displayable(physical_plan.as_ref()) + .set_show_schema(true) + .indent(true) + ); + let results = collect(physical_plan, ctx.state().task_ctx()).await?; + println!( + "{}", + pretty::pretty_format_batches_with_options(&results, &FormatOptions::default())? 
+ .to_string() + ); + Ok(()) +} + +#[tokio::test] +#[allow(clippy::let_unit_value)] +async fn test_mid_values_2() -> Result<()> { + if !HANDLE_STRUCT_IN_LIST { + info!("Test disabled !"); + return Ok(()); + } + let filename = format!( + "{}/tests/data/deep_projections/mid_values_2/midvalues.parquet", + env!("CARGO_MANIFEST_DIR") + ); + + let _ = dump_parquet_schema(filename.as_str()).await?; + + let ctx = build_context(); + let _ = ctx + .register_parquet( + "mid_values", + filename.clone(), + ParquetReadOptions::default(), + ) + .await?; + + let sql_create_view = r#" + SELECT * + FROM mid_values + WHERE + timestamp between cast('2025-01-01' as timestamp) and cast('2025-01-02' as timestamp) + AND + ( + -- Not in data (commerce.productListViews.value IS NOT NULL) OR + -- Not in data (commerce.checkouts.value IS NOT NULL) OR + -- (commerce.purchases.value IS NOT NULL) OR + -- Not in data (commerce.productListAdds.value IS NOT NULL) OR + -- Not in data (commerce.productListRemovals.value IS NOT NULL) OR + -- (commerce.order.purchaseID IS NOT NULL) OR + -- Not in data (commerce.productListOpens.value IS NOT NULL) OR + -- (commerce.productViews.value IS NOT NULL) OR + -- Not in data (commerce.cartAbandons.value IS NOT NULL) OR + (web.webPageDetails.name IS NOT NULL) --OR + -- (web.webInteraction.linkClicks.value != 0) OR + -- Not in data (application.applicationCloses.value IS NOT NULL) OR + -- Not in data (application.crashes.value IS NOT NULL) OR + -- Not in data (application.featureUsages.value IS NOT NULL) OR + -- Not in data (application.firstLaunches.value IS NOT NULL) OR + -- Not in data (application.installs.value IS NOT NULL) OR + -- Not in data (application.launches.value IS NOT NULL) OR + -- Not in data (application.upgrades.value IS NOT NULL) OR + -- (search.keywords IS NOT NULL) OR + -- (array_contains(productListItems.SKU, 'MVP Lead')) OR + -- (marketing.trackingCode LIKE '%txx%') + ) + -- user_filters + -- AND NOT (array_contains(productListItems.SKU, 
'Renewal')) + "#; + let t = ctx.sql(sql_create_view).await?.into_view(); + ctx.register_table("mid_values_2", t)?; + + let _ = run_deep_projection_optimize_test( + &ctx, + r#" + SELECT + _experience.analytics.customDimensions.eVars.eVar40 as EntityID, + 'MembershipNumber' as EntityType, + web.webPageDetails, + -- productListItems, + productListItems.SKU + FROM mid_values_2 + where + _experience.analytics.customDimensions.eVars.eVar40 is not null + AND (_ACP_DATE BETWEEN '2025-01-01' AND '2025-01-02') + LIMIT 20 + "#, + vec![HashMap::from([ + (2, vec!["*.SKU".to_string()]), + (4, vec!["webPageDetails".to_string()]), + ( + 5, + vec!["analytics.customDimensions.eVars.eVar40".to_string()], + ), + (6, vec![]), + (0, vec![]), + ])], + ) + .await?; + + let _ = run_deep_projection_optimize_test( + &ctx, + r#" + SELECT + _experience.analytics.customDimensions.eVars.eVar40 as EntityID, + 'MembershipNumber' as EntityType, + timestamp as RecordTime, + timestamp, + -- productListItems as prds_workaround, + unnest( + make_array( + CASE WHEN (web.webPageDetails.name IS NOT NULL) THEN named_struct('TargetType', 'WebPage', 'EventType', 'webVisit', 'Target_array', make_array(web.webPageDetails.name), 'lastQualificationTime', timestamp, 'EventProperty', cast(null as string)) END + ) + ) as col_data + FROM mid_values_2 where _experience.analytics.customDimensions.eVars.eVar40 is not null + LIMIT 10 + "#, + vec![HashMap::from([ + (0, vec![]), + (4, vec!["webPageDetails.name".to_string()]), + ( + 5, + vec!["analytics.customDimensions.eVars.eVar40".to_string()], + ), + ])], + ) + .await?; + Ok(()) +} + +#[tokio::test] +async fn test_mid_values_3() -> Result<()> { + if !HANDLE_STRUCT_IN_LIST { + info!("Test disabled !"); + return Ok(()); + } + + let filename = format!( + "{}/tests/data/deep_projections/mid_values_2/midvalues.parquet", + env!("CARGO_MANIFEST_DIR") + ); + let _ = dump_parquet_schema(filename.as_str()).await?; + + let ctx = build_context(); + ctx.register_parquet( + 
"mid_values", + filename.clone(), + ParquetReadOptions::default(), + ) + .await?; + + let sql_create_view = r#" + SELECT * + FROM mid_values + WHERE + timestamp between cast('2025-01-01' as timestamp) and cast('2025-01-02' as timestamp) + AND + ( + -- Not in data (commerce.productListViews.value IS NOT NULL) OR + -- Not in data (commerce.checkouts.value IS NOT NULL) OR + -- (commerce.purchases.value IS NOT NULL) OR + -- Not in data (commerce.productListAdds.value IS NOT NULL) OR + -- Not in data (commerce.productListRemovals.value IS NOT NULL) OR + -- (commerce.order.purchaseID IS NOT NULL) OR + -- Not in data (commerce.productListOpens.value IS NOT NULL) OR + -- (commerce.productViews.value IS NOT NULL) OR + -- Not in data (commerce.cartAbandons.value IS NOT NULL) OR + (web.webPageDetails.name IS NOT NULL) --OR + -- (web.webInteraction.linkClicks.value != 0) OR + -- Not in data (application.applicationCloses.value IS NOT NULL) OR + -- Not in data (application.crashes.value IS NOT NULL) OR + -- Not in data (application.featureUsages.value IS NOT NULL) OR + -- Not in data (application.firstLaunches.value IS NOT NULL) OR + -- Not in data (application.installs.value IS NOT NULL) OR + -- Not in data (application.launches.value IS NOT NULL) OR + -- Not in data (application.upgrades.value IS NOT NULL) OR + -- (search.keywords IS NOT NULL) OR + -- (array_contains(productListItems.SKU, 'MVP Lead')) OR + -- (marketing.trackingCode LIKE '%txx%') + ) + -- user_filters + -- AND NOT (array_contains(productListItems.SKU, 'Renewal')) + "#; + let t = ctx.sql(sql_create_view).await?.into_view(); + ctx.register_table("adc_step1", t)?; + + let sql_create_adc_step_2 = r#" + SELECT + _experience.analytics.customDimensions.eVars.eVar40 as EntityID, + 'MembershipNumber' as EntityType, + timestamp as RecordTime, + timestamp, + -- productListItems as prds_workaround, + unnest( + make_array( + CASE WHEN (web.webPageDetails.name IS NOT NULL) THEN named_struct('TargetType', 'WebPage', 
'EventType', 'webVisit', 'Target_array', make_array(web.webPageDetails.name), 'lastQualificationTime', timestamp, 'EventProperty', cast(null as string)) END + ) + ) as col_data + FROM adc_step1 where _experience.analytics.customDimensions.eVars.eVar40 is not null + "#; + let t2 = ctx.sql(sql_create_adc_step_2).await?.into_view(); + ctx.register_table("adc_step_2", t2)?; + + let _ = run_deep_projection_optimize_test( + &ctx, + r#" + SELECT + EntityID, + EntityType, + RecordTime, + col_data.EventType as EventType, + -- lower(cast(trans_target.col as string)) as Target, + unnest(col_data.Target_array) as Target, + col_data.TargetType as TargetType, + coalesce(col_data.lastQualificationTime, timestamp) EventTime, + col_data.EventProperty as EventProperty, + 1 as EventCount + FROM adc_step_2 + where col_data is not null + "#, + vec![HashMap::from([ + (0, vec![]), + (4, vec!["webPageDetails.name".to_string()]), + ( + 5, + vec!["analytics.customDimensions.eVars.eVar40".to_string()], + ), + ])], + ) + .await?; + + Ok(()) +} + +#[tokio::test] +async fn test_mid_values_4() -> Result<()> { + if !HANDLE_STRUCT_IN_LIST { + info!("Test disabled !"); + return Ok(()); + } + let filename = format!( + "{}/tests/data/deep_projections/mid_values_2/midvalues.parquet", + env!("CARGO_MANIFEST_DIR") + ); + + let _ = dump_parquet_schema(filename.as_str()).await?; + + let ctx = build_context(); + ctx.register_parquet( + "mid_values", + filename.clone(), + ParquetReadOptions::default(), + ) + .await?; + + // let tmp = ctx.sql(r#" + // + // "#).await.unwrap().show().await.unwrap(); + let _ = run_deep_projection_optimize_test( + &ctx, + r#" + SELECT + EntityID, + timestamp as RecordTime, + signals.EventType AS EventType, + unnest(signals.Target_array) AS Target, + signals.TargetType AS TargetType, + coalesce(signals.lastQualificationTime, timestamp) EventTime, + 1 AS EventCount + FROM + ( + SELECT + _experience.analytics.customDimensions.eVars.eVar40 AS EntityID, + timestamp, + unnest( + 
make_array( + CASE + WHEN (web.webPageDetails.name IS NOT NULL) THEN named_struct( + 'TargetType', + 'WebPage', + 'EventType', + 'webVisit', + 'Target_array', + make_array(web.webPageDetails.name), + 'lastQualificationTime', + timestamp + ) + END + ) + ) AS signals, + 0 AS is_conversion + FROM + mid_values + WHERE + timestamp BETWEEN cast('2024-12-31' AS timestamp) AND cast('2025-02-01' AS timestamp) + AND _experience.analytics.customDimensions.eVars.eVar40 IS NOT NULL + AND web.webPageDetails.name IS NOT NULL + ) AS t + WHERE + t.signals IS NOT NULL + "#, + vec![HashMap::from([ + (0, vec![]), + (4, vec!["webPageDetails.name".to_string()]), + ( + 5, + vec!["analytics.customDimensions.eVars.eVar40".to_string()], + ), + ])], + ) + .await?; + + let _ = run_deep_projection_optimize_test( + &ctx, + r#" + SELECT + EntityID, + RecordTime, + signals.EventType AS EventType, + unnest(signals.Target_array) AS Target, + signals.TargetType AS TargetType, + coalesce(signals.lastQualificationTime, RecordTime) EventTime, + 1 AS EventCount + FROM + ( + SELECT + _experience.analytics.customDimensions.eVars.eVar40 AS EntityID, + timestamp as RecordTime, + unnest( + make_array( + CASE + WHEN (web.webPageDetails.name IS NOT NULL) THEN named_struct( + 'TargetType', + 'WebPage', + 'EventType', + 'webVisit', + 'Target_array', + make_array(web.webPageDetails.name), + 'lastQualificationTime', + timestamp + ) + END + ) + ) AS signals, + 0 AS is_conversion + FROM + mid_values + WHERE + timestamp BETWEEN cast('2024-12-31' AS timestamp) AND cast('2025-02-01' AS timestamp) + AND _experience.analytics.customDimensions.eVars.eVar40 IS NOT NULL + AND web.webPageDetails.name IS NOT NULL + ) AS t + WHERE + t.signals IS NOT NULL + "#, + vec![HashMap::from([ + (0, vec![]), + (4, vec!["webPageDetails.name".to_string()]), + ( + 5, + vec!["analytics.customDimensions.eVars.eVar40".to_string()], + ), + ])], + ) + .await?; + + Ok(()) +} + +#[tokio::test] +async fn test_mid_values_5() -> Result<()> { + if 
!HANDLE_STRUCT_IN_LIST { + info!("Test disabled !"); + return Ok(()); + } + let filename = format!( + "{}/tests/data/deep_projections/mid_values_2/midvalues.parquet", + env!("CARGO_MANIFEST_DIR") + ); + + let _ = dump_parquet_schema(filename.as_str()).await?; + + let ctx = build_context(); + ctx.register_parquet( + "mid_values", + filename.clone(), + ParquetReadOptions::default(), + ) + .await?; + + ctx.sql(r#" + SELECT + COUNT(*) as total_events, + SUM(CASE WHEN _experience.analytics.customDimensions.eVars.eVar40 IS NOT NULL THEN 1 ELSE 0 END) as events_with_users, + SUM(CASE WHEN endUserIDs._experience.mcid.id IS NOT NULL THEN 1 ELSE 0 END) as events_with_devices + FROM mid_values + WHERE _ACP_DATE = '2025-01-01' + "#).await?.show().await?; + let _ = run_deep_projection_optimize_test( + &ctx, + r#" + SELECT + COUNT(*) as total_events, + SUM(CASE WHEN _experience.analytics.customDimensions.eVars.eVar40 IS NOT NULL THEN 1 ELSE 0 END) as events_with_users, + SUM(CASE WHEN endUserIDs._experience.mcid.id IS NOT NULL THEN 1 ELSE 0 END) as events_with_devices + FROM mid_values + WHERE _ACP_DATE = '2025-04-11' + "#, + vec![HashMap::from([ + (3, vec!["_experience.mcid.id".to_string()]), + ( + 5, + vec!["analytics.customDimensions.eVars.eVar40".to_string()], + ), + (6, vec![]), + ])], + ) + .await?; + Ok(()) +} + +#[tokio::test] +#[allow(clippy::let_unit_value)] +async fn test_billing() -> Result<()> { + let filename = format!( + "{}/tests/data/deep_projections/billing/billing.parquet", + env!("CARGO_MANIFEST_DIR") + ); + + let _ = dump_parquet_schema(filename.as_str()).await?; + + let ctx = build_context(); + + let _ = ctx + .register_parquet( + "billing_step0", + filename.clone(), + ParquetReadOptions::default(), + ) + .await?; + + let sql_create_view = r#" + SELECT * + FROM billing_step0 + WHERE + timestamp between cast('2025-02-01' as timestamp) and cast('2025-03-01' as timestamp) + AND ( + (_aaanortheast.TravelBookingEventDetails.aaa_traveltype = '1') + OR 
(_aaanortheast.TravelBookingEventDetails.aaa_traveltype = '2') + OR (_aaanortheast.TravelBookingEventDetails.aaa_traveltype = '3') + OR (_aaanortheast.TravelBookingEventDetails.aaa_traveltype = '4') + ) + "#; + let t = ctx.sql(sql_create_view).await?.into_view(); + ctx.register_table("billing_step1", t)?; + + let sql_create_billing_step_2 = r#" + SELECT + _aaanortheast.TravelBookingEventDetails.aaa_membernumber as EntityID, + 'MembershipNumber' as EntityType, + timestamp as RecordTime, + timestamp, + CASE + WHEN (_aaanortheast.TravelBookingEventDetails.aaa_traveltype = '1') THEN named_struct('TargetType', '', 'EventType', 'Car Booking', 'Target_array', make_array(cast(null as string)), 'lastQualificationTime', timestamp, 'EventProperty', cast(null as string)) + WHEN (_aaanortheast.TravelBookingEventDetails.aaa_traveltype = '2') THEN named_struct('TargetType', '', 'EventType', 'Flight Booking', 'Target_array', make_array(cast(null as string)), 'lastQualificationTime', timestamp, 'EventProperty', cast(null as string)) + WHEN (_aaanortheast.TravelBookingEventDetails.aaa_traveltype = '3') THEN named_struct('TargetType', '', 'EventType', 'Hotel Booking', 'Target_array', make_array(cast(null as string)), 'lastQualificationTime', timestamp, 'EventProperty', cast(null as string)) + WHEN (_aaanortheast.TravelBookingEventDetails.aaa_traveltype = '4') THEN named_struct('TargetType', '', 'EventType', 'prediction_goal_event', 'Target_array', make_array(cast(null as string)), 'lastQualificationTime', timestamp, 'EventProperty', cast(null as string)) + ELSE null + END as col_data + FROM billing_step1 + "#; + let t2 = ctx.sql(sql_create_billing_step_2).await?.into_view(); + ctx.register_table("billing_step2", t2)?; + + let _ = run_deep_projection_optimize_test( + &ctx, + r#" + SELECT + EntityID, + EntityType, + RecordTime, + col_data.EventType as EventType, + cast(null as string) as Target, + col_data.TargetType as TargetType, + coalesce(col_data.lastQualificationTime, timestamp) 
EventTime, + col_data.EventProperty as EventProperty, + cast(1 as bigint) as EventCount + FROM billing_step2 + --where col_data is not null + limit 10 + "#, + vec![HashMap::from([ + (0, vec![]), + ( + 2, + vec![ + "TravelBookingEventDetails.aaa_membernumber".to_string(), + "TravelBookingEventDetails.aaa_traveltype".to_string(), + ], + ), + ])], + ) + .await?; + + Ok(()) +} + +#[tokio::test] +async fn test_identity_map() -> Result<()> { + let filename = format!( + "{}/tests/data/deep_projections/identity_map/raw.parquet", + env!("CARGO_MANIFEST_DIR") + ); + + let _ = dump_parquet_schema(filename.as_str()).await?; + + let ctx = build_context(); + ctx.register_parquet("raw", filename.clone(), ParquetReadOptions::default()) + .await?; + + // let sql = r#" + // SELECT + // identityMap['ECID'][0]['id'] + // FROM + // raw + // "#; + // ctx.sql(sql).await.unwrap().show().await.unwrap(); + + let _ = run_deep_projection_optimize_test( + &ctx, + r#" + SELECT + identityMap['ECID'][0]['id'] AS EntityID, + 'ECID' AS EntityType, + timestamp AS RecordTime, + unnest( + make_array( + CASE + WHEN (web.webPageDetails.name IS NOT NULL) THEN named_struct( + 'TargetType', + 'WebPage', + 'EventType', + 'webVisit', + 'Target_array', + make_array(web.webPageDetails.name), + 'lastQualificationTime', + timestamp, + 'EventProperty', + cast(NULL AS STRING) + ) + END + ) + ) AS signals + FROM + raw + WHERE + timestamp BETWEEN cast('2025-02-28' AS timestamp) + AND cast('2025-03-28' AS timestamp) + AND identityMap['ECID'][0]['id'] IS NOT NULL + "#, + vec![HashMap::from([ + (0, vec![]), + (1, vec!["webPageDetails.name".to_string()]), + (2, vec!["*.*.id".to_string()]), + ])], + ) + .await?; + + Ok(()) +} + +#[tokio::test] +#[allow(dead_code, unused_variables)] +async fn test_gs_summary_metrics() -> Result<()> { + let filename = format!( + "{}/tests/data/deep_projections/gs_summary_metrics/gs_summary_metrics.parquet", + env!("CARGO_MANIFEST_DIR") + ); + + let ctx = build_context(); + 
ctx.register_parquet( + "gs_summary_metrics", + filename.clone(), + ParquetReadOptions::default(), + ) + .await?; + + let sql1 = r#" + SELECT + COUNT(*) + FROM + gs_summary_metrics + WHERE + _wfadoberm.genStudioInsights.entityIDs.account.accountGUID = 'Meta_2974530739347344' + AND _wfadoberm.genStudioInsights.breakdownType = 'AccountCampaignAdGroupAd' + AND _ACP_DATE >= '2024-10-09' AND _ACP_DATE <= '2024-10-10' + "#; + let _ = run_deep_projection_optimize_test( + &ctx, + sql1, + vec![ + HashMap::from([ + (2, vec!["genStudioInsights.breakdownType".to_string(), "genStudioInsights.entityIDs.account.accountGUID".to_string()]), + (3, vec![]) + ]) + ] + ).await?; + + let sql2 = r#" + SELECT + count(*) + FROM + gs_summary_metrics + WHERE + _wfadoberm.genStudioInsights.entityIDs.account.accountGUID = 'Meta_2974530739347344' + AND _wfadoberm.genStudioInsights.breakdownType = 'AccountCampaignAdGroupAd' + AND timestamp >= '2024-10-09 00:00:00' + "#; + let _ = run_deep_projection_optimize_test( + &ctx, + sql2, + vec![ + HashMap::from([ + (2, vec!["genStudioInsights.breakdownType".to_string(), "genStudioInsights.entityIDs.account.accountGUID".to_string()]), + (0, vec![]) + ]) + ] + ).await?; + + Ok(()) +} + +#[tokio::test] +#[allow(dead_code, unused_variables)] +async fn test_delta_with_deep_struct_in_list() -> Result<()> { + let filename = format!( + "{}/tests/data/deep_projections/delta_with_deep_struct_in_list/values.parquet", + env!("CARGO_MANIFEST_DIR") + ); + + let ctx = build_context(); + + ctx.register_parquet("values", filename.clone(), ParquetReadOptions::default()) + .await?; + + let query = r#" + SELECT + productListItems['_experience']['analytics']['events'] + FROM + values; + "#; + + let _ = run_deep_projection_optimize_test( + &ctx, + query, + vec![HashMap::from([ + (2, vec!["*._experience.analytics.events".to_string()]) + ])], + ) + .await?; + + let plan = ctx.state().create_logical_plan(query).await?; + info!("plan: {}", &plan); + let optimized_plan = 
ctx.state().optimize(&plan)?; + info!("optimized: {}", &optimized_plan.display_indent()); + + let query_planner = ctx.state().query_planner().clone(); + let physical_plan = query_planner + .create_physical_plan(&optimized_plan, &ctx.state()) + .await + .expect("Error creating physical plan"); + info!("physical: {}", &optimized_plan.display_indent()); + + Ok(()) +} diff --git a/datafusion/datasource-parquet/Cargo.toml b/datafusion/datasource-parquet/Cargo.toml index b865422366f46..45a4c33edd01c 100644 --- a/datafusion/datasource-parquet/Cargo.toml +++ b/datafusion/datasource-parquet/Cargo.toml @@ -44,6 +44,7 @@ datafusion-physical-expr = { workspace = true } datafusion-physical-expr-adapter = { workspace = true } datafusion-physical-expr-common = { workspace = true } datafusion-physical-plan = { workspace = true } +datafusion-physical-optimizer = { workspace = true } datafusion-pruning = { workspace = true } datafusion-session = { workspace = true } futures = { workspace = true } diff --git a/datafusion/datasource-parquet/src/leaves.rs b/datafusion/datasource-parquet/src/leaves.rs new file mode 100644 index 0000000000000..7912f7195dd9d --- /dev/null +++ b/datafusion/datasource-parquet/src/leaves.rs @@ -0,0 +1,498 @@ +use arrow::datatypes::{DataType, Field, SchemaRef}; +use datafusion_common::tree_node::{TreeNode, TreeNodeRecursion}; +use datafusion_common::{ScalarValue, internal_err, DataFusionError}; +use datafusion_physical_expr::ScalarFunctionExpr; +use datafusion_physical_expr::expressions::{CastExpr, Column, Literal}; +use datafusion_physical_expr::projection::ProjectionExprs; +use datafusion_physical_expr_common::physical_expr::PhysicalExpr; +use log::{error, info, trace, warn}; +use parquet::schema::types::SchemaDescriptor; +use std::cmp::min; +use std::collections::{HashMap, HashSet, VecDeque}; +use std::sync::Arc; + +pub fn remap_top_level_field_indices(left: &SchemaRef, right: &SchemaRef) -> HashMap> { + let mut out: HashMap> = HashMap::new(); + for 
(left_index, left_field) in left.fields.iter().enumerate() { + if let Some((right_index, right_field)) = + right.fields.find(left_field.name().as_str()) { + out.insert(left_index, Some(right_index)); + } else { + out.insert(left_index, None); + } + } + out +} + +/// Combines the current projection (numeric indices of top-level columns) with +/// the deep projection - "paths" inside a top-level column +pub fn splat_columns( + src: &SchemaRef, + projection: &[usize], + projection_deep: &HashMap>, +) -> Vec { + projection + .iter() + .map(|pi| { + let f = src.field(*pi); + match projection_deep.get(pi) { + None => { + vec![f.name().to_owned()] + } + Some(rests) => { + if !rests.is_empty() { + rests + .iter() + .map(|rest| format!("{}.{}", f.name(), rest)) + .collect::>() + } else { + vec![f.name().to_owned()] + } + } + } + }) + .flatten() + .collect::>() +} + +// FIXME: ACTUALLY look at the arrow schema and handle map types correctly +// Right now, we are matching "map-like" parquet leaves like "key_value.key" etc +// But, we neeed to walk through both the arrow schema (which KNOWS about the map type) +// and the parquet leaves to do this correctly. 
+fn equivalent_projection_paths_from_parquet_schema( + _arrow_schema: SchemaRef, + parquet_schema: &SchemaDescriptor, +) -> Vec<(usize, (String, String))> { + let mut output: Vec<(usize, (String, String))> = vec![]; + for (i, col) in parquet_schema.columns().iter().enumerate() { + let original_path = col.path().string(); + let converted_path = { + // we convert the path in the parquet schema to ignore stuff related to maps, entries, lists + let parquet_path = original_path.as_str(); + if parquet_path.contains(".key_value.key") + || parquet_path.contains(".key_value.value") + || parquet_path.contains(".entries.keys") + || parquet_path.contains(".entries.values") + || parquet_path.contains(".list.element") + { + parquet_path + .replace("key_value.key", "*") + .replace("key_value.value", "*") + .replace("entries.keys", "*") + .replace("entries.values", "*") + .replace("list.element", "*") + } else { + parquet_path.to_string() + } + }; + output.push((i, (original_path.clone(), converted_path))); + } + output +} + +#[allow(clippy::ptr_arg)] +pub fn parquet_leaf_paths( + arrow_schema: SchemaRef, + parquet_schema: &SchemaDescriptor, + projection: &Vec, + projection_deep: &HashMap>, +) -> Vec { + trace!(target: "deep", "parquet_leaf_paths {:?} {:?}", projection, projection_deep); + let actual_projection = if projection.is_empty() { + (0..arrow_schema.fields().len()).collect() + } else { + projection.clone() + }; + let splatted = splat_columns(&arrow_schema, &actual_projection, projection_deep); + + let mut out: Vec = vec![]; + for (i, (original, converted)) in + equivalent_projection_paths_from_parquet_schema(arrow_schema, parquet_schema) + { + // FIXME + // for map fields, the actual parquet paths look like x.y.z.key_value.key, x.y.z.key_value.value + // since we are ignoring these names in the paths, we need to actually collapse this access to a * + // so we can filter for them + // also, we need BOTH the key and the value for maps otherwise we run into an arrow-rs error 
+ // "partial projection of MapArray is not supported" + trace!(target: "deep", " generate_leaf_paths looking at index {} {} = {}", i, &original, &converted); + + let mut found = false; + for filter in splatted.iter() { + // check if this filter matches this leaf path + let filter_pieces = filter.split(".").collect::>(); + let col_pieces = converted.split(".").collect::>(); + let mut filter_found = true; + for i in 0..min(filter_pieces.len(), col_pieces.len()) { + if i >= filter_pieces.len() { + // we are at the end of the filter, and we matched until now, so we break, we match ! + break; + } + if i >= col_pieces.len() { + // we have a longer filter, we matched until now, we match ! + break; + } + // we can actually check + if !(col_pieces[i] == filter_pieces[i] || filter_pieces[i] == "*") { + filter_found = false; + break; + } + } + if filter_found { + found = true; + break; + } + } + if found { + out.push(i); + } + } + out +} + +pub fn has_simplified_parquet_columns(possible: &HashMap>) -> bool { + !(possible.is_empty() || possible.iter().all(|(_k, v)| v.is_empty())) +} + +pub fn projection_specifier( + logical_file_schema: SchemaRef, + _projection: &ProjectionExprs, + projection_hints: Option<&ProjectionExprs>, + _projection_hints_indices: &[usize], +) -> HashMap> { + // info!(target: "deep", "leaves::projection_specifier >>>>>>>>>>>>>>>>>>>>>>>>>>>"); + let mut deep_column_map: HashMap> = HashMap::new(); + + let source_schema = logical_file_schema.clone(); + // for expr in projection + // .iter() + // .map(|pe| extract_expressions_containing_column(&pe.expr)) + // .flatten() + // { + // info!("leaves::projection_specifier XXX expr: {:?}, indices: {:?}", expr.to_string(), projection_hints_indices); + // let col_arg = find_column_in_expr(&expr).unwrap(); + // let col_index = source_schema + // .index_of(col_arg.name()) + // .expect("Col in table"); + // if !projection_hints_indices.contains(&col_index) { + // // let marker = format!("xx: {} {}", col_index, 
expr_to_deep_projection(&pexpr)); + // let marker = simplified_parquet_column_path(&expr); + // let entry = deep_column_map.entry(col_index).or_default(); + // if entry.contains("") { + // // already full column !!!!!!! + // } else { + // entry.insert(marker); + // } + // } + // } + // info!("leaves::projection_specifier deep column map after handling projection: {:?}", &deep_column_map); + + for pexpr in projection_hints.into_iter().flatten() { + let expr = pexpr.clone().expr; + trace!(target: "deep", "leaves::projection_specifier projection hint: {}", expr.to_string()); + if let Some(col_arg) = get_first_column_from_expr(&expr) { + if let Ok(col_index) = source_schema.index_of(col_arg.name()) { + let marker = simplified_parquet_column_path(&expr); + trace!(target: "deep", " > marker: {}", marker.as_str()); + let entry = deep_column_map.entry(col_index).or_default(); + if entry.contains("") { + // already full column !!!!!!! + } else { + entry.insert(marker); + } + } + } + } + trace!(target: "deep", "leaves::projection_specifier deep column map after handling projection hints: {:?}", &deep_column_map); + + let final_map: HashMap> = deep_column_map + .iter() + .map(|(k, v)| { + let k = k.clone(); + let mut newv = v + .into_iter() + // clone + .map(|s| s.clone()) + // // remove empty specifiers + // .filter(|v| v != "") + // fix fake field names which are actually map names + .map(|v| { + trace!(target: "deep", "fix_deep for {}", v.as_str()); + fix_simplified_column_path(v.as_str(), source_schema.field(k)) + // if let Ok(cp) = fix_simplified_column_path(v.as_str(), source_schema.field(k)) { + // Ok(cp) + // } else { + // "".to_string() + // } + // let pieces = v.split(".").collect::>(); + }) + .filter_map(Result::ok) + .collect::>(); + // trace!(target: "deep", "newv = {:?}", &newv) + newv.sort_by_key(|s| (s.split('.').count(), s.len())); + // newv.dedup(); + // newv.sort_by_key(|s| (s.split('.').count(), s.len())); + trace!(target: "deep", "newv = {:?}", &newv); + 
let mut kept: Vec = Vec::new(); + newv.retain(|s| { + let is_extension = kept.iter().any(|k| { + k.is_empty() || s == k || s.starts_with(&format!("{k}.")) + }); + if !is_extension { + kept.push(s.clone()); + true + } else { + false + } + }); + let newv = newv.into_iter().filter(|v| v != "").collect::>(); + (k, newv) + }) + .into_iter() + .collect(); + trace!(target: "deep", "leaves::projection_specifier final map: {:?}", &final_map); + // info!(target: "deep", "leaves::projection_specifier <<<<<<<<<<<<<<<<<<<<<<<<<<"); + final_map +} + +/// returns true if the expr is a combination of get_field, array_elements, CAST, or column reference +pub fn expr_is_only_get_field_or_array_or_cast_and_contains_column( + input: &Arc, +) -> bool { + let mut has_invalid_expr_type = false; + let mut has_column = false; + let _ = input.apply(|pe| { + if let Some(sfe) = pe.as_any().downcast_ref::() + && sfe.name() == "get_field" + { + return Ok(TreeNodeRecursion::Continue); + }; + if let Some(sfe) = pe.as_any().downcast_ref::() + && sfe.name() == "array_element" + { + return Ok(TreeNodeRecursion::Continue); + }; + if let Some(_) = pe.as_any().downcast_ref::() { + return Ok(TreeNodeRecursion::Continue); + }; + if let Some(_) = pe.as_any().downcast_ref::() { + has_column = true; + return Ok(TreeNodeRecursion::Stop); + }; + has_invalid_expr_type = true; + return Ok(TreeNodeRecursion::Stop); + }); + !has_invalid_expr_type && has_column +} + +pub fn extract_expressions_for_deep_projection(input: &Arc) -> Vec> { + let mut out: Vec> = vec![]; + let _ = input.apply(|pe| { + if expr_is_only_get_field_or_array_or_cast_and_contains_column(pe) { + out.push(pe.clone()); + return Ok(TreeNodeRecursion::Jump); + } + Ok(TreeNodeRecursion::Continue) + }); + out +} + +/// returns true if the expr is a combination of get_field, array_elements, CAST, or column reference +pub fn expr_has_get_field_or_array_element( + input: &Arc, +) -> bool { + let mut has_invalid_expr_type = false; + let _ = 
input.apply(|pe| { + if let Some(sfe) = pe.as_any().downcast_ref::() + && sfe.name() == "get_field" + { + has_invalid_expr_type = true; + return Ok(TreeNodeRecursion::Stop); + }; + if let Some(sfe) = pe.as_any().downcast_ref::() + && sfe.name() == "array_element" + { + has_invalid_expr_type = true; + return Ok(TreeNodeRecursion::Stop); + }; + return Ok(TreeNodeRecursion::Continue); + }); + has_invalid_expr_type +} + + +/// extracts a list of expression that contain a small set of operations from a larger column +/// This drills down in expressions until it reaches branches of the tree that only contain get_field, array_element, CAST, or column +/// linearizes that and return them +pub fn get_expressions_amenable_to_deep_projection( + input: &Arc, +) -> Vec> { + let mut out = vec![]; + let _ = input.apply(|pe| { + if expr_is_only_get_field_or_array_or_cast_and_contains_column(pe) { + out.push(pe.clone()); + return Ok(TreeNodeRecursion::Jump); + } + Ok(TreeNodeRecursion::Continue) + }); + out +} + +/// finds a column inside a larger expression that may contain columns +pub fn get_first_column_from_expr(expr: &Arc) -> Option { + let mut out: Option = None; + let _ = expr.apply(|expr| { + if let Some(column) = expr.as_any().downcast_ref::() { + out = Some(column.clone()); + return Ok(TreeNodeRecursion::Stop); + } + Ok(TreeNodeRecursion::Continue) + }); + out +} + +pub fn simplified_parquet_column_path(expr: &Arc) -> String { + let mut accum: VecDeque = VecDeque::new(); + let mut out: Vec = vec![]; + let mut in_other_literal_call: bool = false; + let _ = expr.apply(|e| { + if let Some(_) = e.as_any().downcast_ref::() { + // This used to have a .rev at the end, because we were getting nested get_field expressions + // now, get_field adds all the parameters, so we can + // out.push(column.name().to_string()); + for f in accum.iter().rev() { + out.push(f.to_owned()); + } + return Ok(TreeNodeRecursion::Stop); + } + if let Some(sfe) = e.as_any().downcast_ref::() + && 
sfe.name() == "get_field" + { + if sfe.args().len() == 2 { + if let Some(literal) = sfe.args()[1].as_any().downcast_ref::() { + match literal.value() { + ScalarValue::Utf8(Some(str)) => { + accum.push_back(str.clone()); + } + _ => { + error!("Can't handle expression 1 {:?}", sfe.args()[1]); + in_other_literal_call = true; + } + } + } else { + error!("Can't handle expression 2 {:?}", sfe.args()[1]); + } + } else { + sfe.args() + .iter() + .skip(1) + .enumerate() + .map(|(_i, arg)| { + if let Some(literal) = arg.as_any().downcast_ref::() { + match literal.value() { + ScalarValue::Utf8(Some(str)) => Ok(str.clone()), + _ => { + error!("Can't handle expression {:?}", sfe.args()[1]); + in_other_literal_call = true; + internal_err!("field argument") + } + } + } else { + error!("Can't handle expression {:?}", sfe.args()[1]); + internal_err!("field argument") + } + }) + .map_while(datafusion_common::Result::ok) + .collect::>() + .into_iter() + .rev() + .for_each(|arg| { + accum.push_back(arg); + }); + } + } + if let Some(sfe) = e.as_any().downcast_ref::() + && sfe.name() == "array_element" + { + accum.push_back("*".to_string()); + } + Ok(TreeNodeRecursion::Continue) + }); + out.join(".") +} + +pub fn fix_simplified_column_path( + specifier: &str, + field: &Field, +) -> datafusion_common::Result { + if specifier == "" { + return Ok("".to_string()); + } + let pieces = specifier.split(".").collect::>(); + let mut idx = 0; + let mut out_pieces: Vec = vec![]; + let mut current = field; + 'outer: loop { + if idx > pieces.len() - 1 { + break; + } + // not at the end + match current.data_type() { + DataType::Map(map_struct, _) => { + out_pieces.push("*".to_string()); + idx += 1; + if let DataType::Struct(map_struct_fields) = map_struct.data_type() { + let (_idx, field) = map_struct_fields + .find("value") + .expect("map struct should have field with name 'value'"); + current = field; + continue 'outer; + } + } + DataType::List(inner) + | DataType::ListView(inner) + | 
DataType::FixedSizeList(inner, _) + | DataType::LargeList(inner) + | DataType::LargeListView(inner) => { + if pieces[idx] != "*" { + // this was a get_field call, we need to insert an access + out_pieces.push("*".to_string()); + current = inner; + continue 'outer; + } else { + // this was an array_element call + out_pieces.push("*".to_string()); + idx += 1; + current = inner; + continue 'outer; + } + } + DataType::Struct(inner) => { + // FIXME @HStack - this used to crash, but we have a problem with + // aep datasets and observable schemas, which are only at the logical level !!!! + // Making this return whatever it can, and not crash + if let Some((_idx, field)) = inner.find(pieces[idx]) { + out_pieces.push(pieces[idx].to_string()); + current = field; + idx += 1; + continue 'outer; + } else { + warn!("struct should have field with name {}", pieces[idx]); + return internal_err!("struct should have field with name {}", pieces[idx]); + } + } + _ => { + error!( + "fix_deep_column_specifier_for_field at index {} for specifier {} - cannot handle {:?}", + idx, + specifier, + current.data_type() + ); + return internal_err!( + "fix_deep_column_specifier_for_field not at end, but non-nested field" + ); + } + } + } + Ok(out_pieces.join(".")) +} diff --git a/datafusion/datasource-parquet/src/mod.rs b/datafusion/datasource-parquet/src/mod.rs index 0e137a706fad7..9ab7c487f3c78 100644 --- a/datafusion/datasource-parquet/src/mod.rs +++ b/datafusion/datasource-parquet/src/mod.rs @@ -33,6 +33,8 @@ mod sort; pub mod source; mod supported_predicates; mod writer; +pub mod leaves; +pub mod push_all_projection_hints; pub use access_plan::{ParquetAccessPlan, RowGroupAccess}; pub use file_format::*; diff --git a/datafusion/datasource-parquet/src/opener.rs b/datafusion/datasource-parquet/src/opener.rs index f657b709fe099..27f8f0fabc0de 100644 --- a/datafusion/datasource-parquet/src/opener.rs +++ b/datafusion/datasource-parquet/src/opener.rs @@ -34,7 +34,7 @@ use std::pin::Pin; use 
std::sync::Arc; use std::task::{Context, Poll}; -use arrow::datatypes::{SchemaRef, TimeUnit}; +use arrow::datatypes::{Schema, SchemaRef, TimeUnit}; use datafusion_common::encryption::FileDecryptionProperties; use datafusion_common::stats::Precision; use datafusion_common::{ @@ -57,7 +57,8 @@ use datafusion_common::config::EncryptionFactoryOptions; #[cfg(feature = "parquet_encryption")] use datafusion_execution::parquet_encryption::EncryptionFactory; use futures::{Stream, StreamExt, TryStreamExt, ready}; -use log::debug; +use itertools::Itertools; +use log::{debug, info, trace}; use parquet::arrow::arrow_reader::metrics::ArrowReaderMetrics; use parquet::arrow::arrow_reader::{ ArrowReaderMetadata, ArrowReaderOptions, RowSelectionPolicy, @@ -65,6 +66,8 @@ use parquet::arrow::arrow_reader::{ use parquet::arrow::async_reader::AsyncFileReader; use parquet::arrow::{ParquetRecordBatchStreamBuilder, ProjectionMask}; use parquet::file::metadata::{PageIndexPolicy, ParquetMetaDataReader, RowGroupMetaData}; +use datafusion_common::deep::{cast_record_batch, has_deep_projection, rewrite_schema}; +use crate::leaves::{parquet_leaf_paths, projection_specifier, remap_top_level_field_indices}; /// Implements [`FileOpener`] for a parquet file pub(super) struct ParquetOpener { @@ -72,6 +75,8 @@ pub(super) struct ParquetOpener { pub(crate) partition_index: usize, /// Projection to apply on top of the table schema (i.e. can reference partition columns). 
pub projection: ProjectionExprs, + pub projection_hints: Option, + pub projection_hints_indices: Vec, /// Target number of rows in each output RecordBatch pub batch_size: usize, /// Optional limit on the number of rows to read @@ -209,6 +214,8 @@ impl FileOpener for ParquetOpener { // Calculate the output schema from the original projection (before literal replacement) // so we get correct field names from column references let logical_file_schema = Arc::clone(self.table_schema.file_schema()); + trace!(target: "deep", "ParquetOpener::open logical_file_schema: {:#?}", &logical_file_schema); + let output_schema = Arc::new( self.projection .project_schema(self.table_schema.table_schema())?, @@ -281,6 +288,10 @@ impl FileOpener for ParquetOpener { let reverse_row_groups = self.reverse_row_groups; let preserve_order = self.preserve_order; + let projection_hints = self.projection_hints.clone(); + let projection_hints_indices = self.projection_hints_indices.clone(); + + Ok(Box::pin(async move { #[cfg(feature = "parquet_encryption")] let file_decryption_properties = encryption_context @@ -372,6 +383,7 @@ impl FileOpener for ParquetOpener { // - The physical file schema: this is the schema that the arrow-rs // parquet reader will actually produce. 
let mut physical_file_schema = Arc::clone(reader_metadata.schema()); + trace!(target: "deep", "ParquetOpener::open physical_file_schema 1: {:#?}", &physical_file_schema); // The schema loaded from the file may not be the same as the // desired schema (for example if we want to instruct the parquet @@ -387,6 +399,7 @@ impl FileOpener for ParquetOpener { options.clone(), )?; } + trace!(target: "deep", "ParquetOpener::open physical_file_schema 2: {:#?}", &physical_file_schema); if let Some(ref coerce) = coerce_int96 && let Some(merged) = coerce_int96_to_resolution( @@ -402,6 +415,7 @@ impl FileOpener for ParquetOpener { options.clone(), )?; } + trace!(target: "deep", "ParquetOpener::open physical_file_schema 3: {:#?}", &physical_file_schema); // Adapt the projection & filter predicate to the physical file schema. // This evaluates missing columns and inserts any necessary casts. @@ -612,8 +626,95 @@ impl FileOpener for ParquetOpener { // metrics from the arrow reader itself let arrow_reader_metrics = ArrowReaderMetrics::enabled(); + // @HStack deep projections + let mut simplified_parquet_columns_in_logical_file_schema: HashMap> = projection_specifier( + logical_file_schema.clone(), + &projection, + projection_hints.as_ref(), + &projection_hints_indices + ); + trace!(target: "deep", "ParquetOpener::open simplified_parquet_columns_in_logical_file_schema: {:?}", + &simplified_parquet_columns_in_logical_file_schema); + // we have to remap the top-level indices from simplified_parquet_columns_in_logical_file_schema to the physical file schema + let simplified_parquet_columns_in_physical_file_schema = { + let indices_map = remap_top_level_field_indices(&logical_file_schema, &physical_file_schema); + simplified_parquet_columns_in_logical_file_schema + .iter() + .filter_map(|(li, v)| { + // SAFETY - we ALWAYS fill the left fields + if let Some(pi) = indices_map.get(li).unwrap() { + Some((pi.clone(), v.clone())) + } else { + None + } + }) + .collect::>() + }; + trace!(target: 
"deep", "ParquetOpener::open simplified_parquet_columns_in_physical_file_schema: {:?}", + &simplified_parquet_columns_in_physical_file_schema); + // simplified_parquet_columns_in_logical_file_schema.retain(|key, _val| *key < physical_file_schema.fields().len()); + let has_deep_projection = has_deep_projection(&simplified_parquet_columns_in_physical_file_schema); + // let indices = simplified_parquet_columns.keys().cloned().collect(); ?????????????? let indices = projection.column_indices(); - let mask = ProjectionMask::roots(builder.parquet_schema(), indices); + // we need the logical file schema, but only top level projections + // we will use this + // FIXME: ACTUALLY PHYSICAL + let top_level_projection_logical_file_schema = Arc::new(if has_deep_projection { + Some(rewrite_schema( + &physical_file_schema, + &indices.clone(), + &indices.clone() + .iter() + .map(|idx| (idx.clone(), vec![])) + .collect::>>() + )) + } else { + None + }); + trace!(target: "deep", "ParquetOpener.open: top_level_projection_logical_file_schema: {:?}", &top_level_projection_logical_file_schema); + + let mask = if has_deep_projection { + { + let pes = projection_hints.clone().unwrap().clone(); + trace!( + target: "deep", + "ParquetOpener::open projection_hints: {}", + pes + .iter() + .map(|pe| pe.expr.to_string()) + .join(", ") + ) + } + trace!(target: "deep", "ParquetOpener::open physical file schema: {:#?}", &physical_file_schema); + + // @DeepProjections + trace!(target: "deep", "ParquetOpener::open deep projections: {:?}", simplified_parquet_columns_in_physical_file_schema); + let leaves = parquet_leaf_paths( + Arc::clone(&physical_file_schema), + builder.parquet_schema(), + &indices, + &simplified_parquet_columns_in_physical_file_schema, + ); + debug!( + target: "deep", + "ParquetOpener::open, using deep projection parquet leaves: {:?}", + leaves.clone() + ); + ProjectionMask::leaves(builder.parquet_schema(), leaves) + } else { + let indices = projection.column_indices(); + debug!( + 
target: "deep", + "ParquetOpener::open, using root projections: {:?}", + &indices + ); + + ProjectionMask::roots( + builder.parquet_schema(), + indices + ) + }; + info!("ParquetOpener::open actual final mask: {:?}", &mask); let stream = builder .with_projection(mask) @@ -627,7 +728,21 @@ impl FileOpener for ParquetOpener { file_metrics.predicate_cache_inner_records.clone(); let predicate_cache_records = file_metrics.predicate_cache_records.clone(); - let stream_schema = Arc::clone(stream.schema()); + // @DeepProjections + // this schema does NOT contain the entire schema, because it was built from leaves + let stream_schema = if has_deep_projection { + // the intermediate output schema + // this is the actual output schema - WITHOUT deep projections + top_level_projection_logical_file_schema + .clone() + .as_ref() + .clone() + .unwrap() + .clone() + } else { + Arc::clone(stream.schema()) + }; + // Check if we need to replace the schema to handle things like differing nullability or metadata. // See note below about file vs. output schema. let replace_schema = !stream_schema.eq(&output_schema); @@ -647,6 +762,20 @@ impl FileOpener for ParquetOpener { &predicate_cache_inner_records, &predicate_cache_records, ); + if has_deep_projection { + let dest_schema = top_level_projection_logical_file_schema + .clone() + .as_ref() + .clone() + .unwrap(); + let new_b = cast_record_batch( + &b, + dest_schema, + false, + true + ).unwrap(); + b = new_b; + } b = projector.project_batch(&b)?; if replace_schema { // Ensure the output batch has the expected schema. 
@@ -1185,6 +1314,8 @@ mod test { ParquetOpener { partition_index: self.partition_index, projection, + projection_hints: None, + projection_hints_indices: vec![], batch_size: self.batch_size, limit: self.limit, predicate: self.predicate, diff --git a/datafusion/datasource-parquet/src/push_all_projection_hints.rs b/datafusion/datasource-parquet/src/push_all_projection_hints.rs new file mode 100644 index 0000000000000..200624825e7bc --- /dev/null +++ b/datafusion/datasource-parquet/src/push_all_projection_hints.rs @@ -0,0 +1,816 @@ +use datafusion_common::Result; +use datafusion_common::config::ConfigOptions; +use datafusion_common::tree_node::{Transformed, TreeNode, TreeNodeRecursion}; +use datafusion_common::{JoinSide, JoinType}; +use datafusion_datasource::file::FileSource; +use datafusion_datasource::file_scan_config::FileScanConfigBuilder; +use datafusion_datasource::source::DataSourceExec; +use crate::leaves::{ + expr_has_get_field_or_array_element, expr_is_only_get_field_or_array_or_cast_and_contains_column, + get_expressions_amenable_to_deep_projection, get_first_column_from_expr, +}; +use crate::source::ParquetSource; +use datafusion_physical_expr::expressions::Column; +use datafusion_physical_expr::projection::{ + ProjectionExpr, ProjectionExprs, ProjectionRef, +}; +use datafusion_physical_expr_common::physical_expr::PhysicalExpr; +use datafusion_physical_optimizer::PhysicalOptimizerRule; +use datafusion_physical_plan::aggregates::AggregateExec; +use datafusion_physical_plan::filter::FilterExec; +use datafusion_physical_plan::joins::utils::build_join_schema; +use datafusion_physical_plan::joins::{ + HashJoinExec, NestedLoopJoinExec, PiecewiseMergeJoinExec, SortMergeJoinExec, + SymmetricHashJoinExec, +}; +use datafusion_physical_plan::projection::ProjectionExec; +use datafusion_physical_plan::sorts::sort::SortExec; +use datafusion_physical_plan::sorts::sort_preserving_merge::SortPreservingMergeExec; +use datafusion_physical_plan::union::UnionExec; +use 
datafusion_physical_plan::windows::{BoundedWindowAggExec, WindowAggExec}; +use datafusion_physical_plan::{ExecutionPlan, displayable}; +use log::{error, trace, warn}; +use std::collections::{BTreeSet, HashMap}; +use std::sync::Arc; + +#[derive(Debug)] +pub struct PushAllProjectionHints {} + +impl PushAllProjectionHints {} + +/// This optimizer rule +/// - searches recursively in the execution plans and tries to resolve expressions referenced +/// in the plan down to DataSourceExec +/// - some type of plans have projections specified . For those plans, we try to go down through +/// the projections, so we always solve down to Data sources +/// - for array_element / get_field expressions we collect and change the expressions +/// so that we reach the minimal projection needed to solve them +/// - the result - is saved to the DataSourceExec +impl PhysicalOptimizerRule for PushAllProjectionHints { + fn optimize( + &self, + plan: Arc, + _config: &ConfigOptions, + ) -> Result> { + trace!( + target: "deep", + "PushAllProjectionHints input: {}", + displayable(plan.as_ref()).indent(true) + ); + // find DataSourceExec + let mut data_sources: HashMap> = HashMap::new(); + let _ = plan.apply(|p| { + if let Some(ds_exec) = p.as_any().downcast_ref::() { + if let Some((_file_scan_conf, _parquet_source)) = + ds_exec.downcast_to_file_source::() + { + // fix this, but how ? 
+ let plan_key = displayable(p.as_ref()).indent(true).to_string(); + data_sources.insert(plan_key, p); + } + return Ok(TreeNodeRecursion::Jump); + } + Ok(TreeNodeRecursion::Continue) + }); + + // find extra projections + let mut accum: HashMap, usize)>> = + HashMap::new(); + + let mut visited_expressions: HashMap> = HashMap::new(); + let _ = plan.apply(|physical_plan| { + if let Some(_ds_exec) = + physical_plan.as_any().downcast_ref::() + { + let key = get_key_from_plan(physical_plan); + // if we do not have expression for this plan, accumulate the expressions in this plan + let already_used_col_indices_for_this_datasource_exec = accum.get(key.as_str()) + .into_iter() + .flatten() + .map(|(_expr, index)| { + *index + }) + .collect::>(); + let exprs = extract_expressions_containing_column_from_plan(&physical_plan); + for (expr_index, expr) in exprs.iter().enumerate() { + trace!(target:"deep", + "PushAllProjectionHints::optimize accum expr for DataSourceExec index={}: {}", + expr_index, + &expr.to_string() + ); + if let Some(ve) = visited_expressions.get(get_key_from_plan(&physical_plan).as_str()) + && ve.contains(&expr_index) { + trace!(target:"deep", + " PushAllProjectionHints::optimize expr at index {} is used, skip", + expr_index, + ); + continue + } + if !already_used_col_indices_for_this_datasource_exec.contains(&expr_index) { + accum + .entry(key.clone()) + .or_default() + .push((expr.clone(), expr_index)); + } + } + Ok(TreeNodeRecursion::Jump) + } else { + let exprs = extract_expressions_containing_column_from_plan(&physical_plan); + for (expr_index, expr) in exprs.iter().enumerate() { + trace!(target:"deep", + "PushAllProjectionHints::optimize accum expr for plan type={} index={}: {}", + &physical_plan.name(), + expr_index, + &expr.to_string() + ); + // if (&physical_plan).name() == "FilterExec" + // && let Some(ve) = visited_expressions.get(get_key_from_plan(&physical_plan).as_str()) + // && ve.contains(&expr_index) { + // continue + // } + if let Some(ve) 
= visited_expressions.get(get_key_from_plan(&physical_plan).as_str()) + && ve.contains(&expr_index) { + trace!(target:"deep", + " PushAllProjectionHints::optimize expr at index {} is used, skip", + expr_index, + ); + continue + } + let expression_sources = find_sources_for_column_expr(&expr, &physical_plan); + visited_expressions = combine_visited_expressions( + visited_expressions.clone(), + expression_sources + .iter() + .map(|es| es.visited_exprs.clone()) + .collect::>>() + ); + for ExpressionSource{plan:source, expr: source_expr, col_index: source_col_index, .. } in expression_sources + { + trace!(target:"deep", " PushAllProjectionHints::optimize FOUND modified EXPR: {}", &source_expr.to_string()); + accum + .entry(get_key_from_plan(&source)) + .or_default() + .push((source_expr, source_col_index)); + } + } + Ok(TreeNodeRecursion::Continue) + } + }); + + // recreate the plan with extra projections + if accum.len() == 0 { + return Ok(plan); + } + + let new_plan = plan.transform_down(|p| { + if let Some(ds_exec) = p.as_any().downcast_ref::() { + if let Some((file_scan_conf, parquet_source)) = + ds_exec.downcast_to_file_source::() + { + let key = get_key_from_plan(&p); + if let Some(exprs) = accum.get(&key) { + let (exprs, indices): (Vec<_>, Vec<_>) = + exprs.into_iter().map(|(e, i)| (e.clone(), i)).unzip(); + + let pexprs = exprs + .iter() + .map(|e| ProjectionExpr::new(e.clone(), "".to_string())); + + let mut new_parquet_source = parquet_source.clone(); + new_parquet_source.projection_hints = + ProjectionExprs::new(pexprs); + new_parquet_source.projection_hints_indices = indices; + + let new_file_scan_config = + FileScanConfigBuilder::from(file_scan_conf.clone()) + .with_source(Arc::new(new_parquet_source)) + .build(); + + let execution_plan = Arc::new( + ds_exec + .clone() + .with_data_source(Arc::new(new_file_scan_config)), + ); + return Ok(Transformed::yes(execution_plan)); + } + } + } + Ok(Transformed::no(p)) + })?; + + Ok(new_plan.data) + } + + fn 
name(&self) -> &str { + "LeafProjectionOptimizerRule" + } + + fn schema_check(&self) -> bool { + true + } +} + +/// For an expression, we might get a list of expressions in the data source +/// like we have Column(a) + Column(b) - this will get resolved down to the DataSource, but there +/// we will recurse two times, and we need to mark both columns as being used. +fn combine_visited_expressions( + initial: HashMap>, + maps: Vec>, +) -> HashMap> { + let mut acc: HashMap> = initial + .into_iter() + .map(|(k, vs)| (k, vs.into_iter().collect())) + .collect(); + for m in maps { + for (k, v) in m { + acc.entry(k).or_default().insert(v); + } + } + acc.into_iter() + .map(|(k, set)| (k, set.into_iter().collect())) + .collect() +} + +/// builds a key for an execution plan +/// we need this so we can keep, for each plan, the list of "visited" expressions. +/// +/// there is a *small* possibility that this will break if we have two plans inside a +pub fn get_key_from_plan(plan: &Arc) -> String { + // fix this, but how ? + let plan_key = displayable(plan.as_ref()).indent(false).to_string(); + return plan_key; +} + +pub fn extract_expressions_containing_column_from_plan( + input: &Arc, +) -> Vec> { + let mut out = vec![]; + get_expressions_to_check(input) + .iter() + .for_each(|e| out.extend(get_expressions_amenable_to_deep_projection(e))); + out +} + +/// extract lists of expression from a physical plan +/// TODO: add here for missing plan types. 
+pub fn get_expressions_to_check( + plan: &Arc, +) -> Vec> { + if let Some(ds_exec) = plan.as_any().downcast_ref::() { + if let Some((_file_scan_conf, parquet_source)) = + ds_exec.downcast_to_file_source::() + { + return parquet_source + .projection() + // SAFETY - parquet source always has a projection + .unwrap() + .iter() + .map(|pe| pe.expr.clone()) + .collect::>(); + } else { + return vec![]; + } + } + + if let Some(plan) = plan.as_any().downcast_ref::() { + return plan + .expr() + .iter() + .map(|pe| pe.expr.clone()) + .collect::>(); + } + if let Some(plan) = plan.as_any().downcast_ref::() { + let mut out = vec![]; + if let Some(filter_projection) = plan.projection() { + let input = plan.children()[0]; + let filter_projection_columns = filter_projection + .iter() + .map(|index| { + let name = + input.schema().fields().get(*index).unwrap().name().clone(); + let col = Column::new(name.as_str(), *index); + Arc::new(col) as Arc + }) + .collect::>>(); + out.extend(filter_projection_columns); + } + out.push(plan.predicate().clone()); + return out; + } + if let Some(plan) = plan.as_any().downcast_ref::() { + return plan + .expr() + .iter() + .map(|le| le.expr.clone()) + .collect::>(); + } + if let Some(plan) = plan.as_any().downcast_ref::() { + let mut out: Vec> = vec![]; + plan.group_expr() + .expr() + .iter() + .for_each(|ge| out.push(ge.0.clone())); + plan.aggr_expr().iter().for_each(|ae| { + ae.expressions().iter().for_each(|e| out.push(e.clone())); + }); + plan.filter_expr().iter().for_each(|fe| { + if let Some(fe) = fe { + out.push(fe.clone()) + } + }); + return out; + } + if let Some(plan) = plan.as_any().downcast_ref::() { + let mut out: Vec> = vec![]; + plan.window_expr() + .iter() + .for_each(|we| we.expressions().iter().for_each(|e| out.push(e.clone()))); + return out; + } + if let Some(plan) = plan.as_any().downcast_ref::() { + let mut out: Vec> = vec![]; + plan.window_expr() + .iter() + .for_each(|we| we.expressions().iter().for_each(|e| 
out.push(e.clone()))); + return out; + } + if let Some(plan) = plan.as_any().downcast_ref::() { + let mut out: Vec> = vec![]; + plan.on.iter().for_each(|oe| { + out.push(oe.0.clone()); + out.push(oe.1.clone()); + }); + return out; + } + // if let Some(plan) = plan.as_any().downcast_ref::() {} + // if let Some(plan) = plan.as_any().downcast_ref::() {} + if let Some(plan) = plan.as_any().downcast_ref::() { + return vec![plan.on.0.clone(), plan.on.1.clone()]; + } + if let Some(plan) = plan.as_any().downcast_ref::() { + let mut out: Vec> = vec![]; + plan.on.iter().for_each(|oe| { + out.push(oe.0.clone()); + out.push(oe.1.clone()); + }); + return out; + } + if let Some(plan) = plan.as_any().downcast_ref::() { + let mut out: Vec> = vec![]; + plan.expr().iter().for_each(|se| out.push(se.expr.clone())); + return out; + } + if let Some(plan) = plan.as_any().downcast_ref::() { + let mut out: Vec> = vec![]; + plan.on().iter().for_each(|oe| { + out.push(oe.0.clone()); + out.push(oe.1.clone()); + }); + return out; + } + vec![] +} + +// if we call this, we have to make sure that expr contains a column +pub fn replace_column_in_expr( + expr: &Arc, + new_col: &Arc, +) -> Arc { + let mut executed = false; + let new_expr = expr + .clone() + .transform_down(|e| { + if let Some(_col) = e.as_any().downcast_ref::() + && !executed + { + executed = true; + return Ok(Transformed::yes(new_col.clone())); + } + Ok(Transformed::no(e)) + }) + .unwrap(); + new_expr.data +} + +pub fn merge_column_expressions( + left: &Arc, + right: &Arc, +) -> Arc { + let _left_column = get_first_column_from_expr(left).unwrap(); + let right_column = get_first_column_from_expr(right).unwrap(); + let left_has_deep_projection = expr_has_get_field_or_array_element(left); + let right_has_deep_projection = expr_has_get_field_or_array_element(right); + if !left_has_deep_projection { + // left is a column, so we return right. 
+ right.clone() + } else { + // left HAS deep projection + if !right_has_deep_projection { + replace_column_in_expr( + &left, + &(Arc::new(right_column.clone()) as Arc), + ) + } else { + // both left AND right have deep projection !!!!!!! + trace!( + target: "deep", + "dpp::merge_column_expressions: complicated merge !!!!!! left={}, right={}", + &left.to_string(), &right.to_string() + ); + let out = replace_column_in_expr(&left, &right); + // let out = right.clone(); + trace!(target: "deep", " dpp::merge_column_expressions forced merged: {}", &out.to_string()); + out + } + } +} + +/// temporary struct that is kept until we resolve all expressions down to their data sources +#[derive(Debug, Clone)] +pub struct ExpressionSource { + pub plan: Arc, + pub expr: Arc, + pub col_index: usize, + pub visited_exprs: HashMap, +} + +// expands an complex expression to a list of get_field / array_element / column expressions +pub fn find_sources_for_column_expr( + expr: &Arc, + plan: &Arc, +) -> Vec { + let mut out: Vec = vec![]; + let expanded_exprs = get_expressions_amenable_to_deep_projection(expr); + // info!("find_sources_for_column_expr EXPRESSIONS LEN: {}", expanded_exprs.len()); + + for expr in expanded_exprs { + // info!("find_sources_for_column_expr START: {}", expr.to_string()); + out.extend(find_sources_for_single_column_expr(&expr, plan)); + } + out +} + +pub fn find_sources_for_single_column_expr( + expr: &Arc, + plan: &Arc, +) -> Vec { + trace!(target: "deep", "dpp::find_sources_for_single_column_expr expr={}", &expr.to_string()); + let mut expr = expr.clone(); + let mut expr_is_from_current_plan = true; + if let Some(col) = get_first_column_from_expr(&expr) { + let mut col_index = col.index(); + // let mut col_expr = expr.clone(); + let mut visited_exprs: HashMap = HashMap::new(); + + trace!(target:"deep", "dpp::find_sources_for_single_column_expr for expr={} index={}", expr.to_string(), col_index); + let mut current = plan; + + if let Some(plan) = 
current.as_any().downcast_ref::() { + let mut found = false; + for (l, r) in plan.on().iter() { + if &expr == l { + current = plan.left(); + found = true; + } + if &expr == r { + current = plan.right(); + found = true; + } + } + if !found { + warn!("find_sources_for_single_column_expr Could not find expr {:?} in plan HashJoinExec", expr); + return vec![]; + } + } + + 'outer: loop { + trace!(target:"deep", + " dpp::find_sources_for_single_column_expr start at plan {}, col_index={}, expr={}", + current.name(), + col_index, + expr.to_string() + ); + if !expr_is_from_current_plan { + trace!(target:"deep", + " dpp::find_sources_for_single_column_expr SAVE used column in plan={}, column={}, expr={}", + current.name(), + col_index, + expr.to_string() + ); + visited_exprs.insert(get_key_from_plan(current), col_index); + } else { + trace!(target:"deep", + " dpp::find_sources_for_single_column_expr DON'T SAVE used column in plan={}, column={}, expr={}", + current.name(), + col_index, + expr.to_string() + ); + } + + if let Some(plan) = current.as_any().downcast_ref::() { + // info!(" at DataSourceExec, col_index = {}", col_index); + if let Some((_file_scan_conf, parquet_source)) = + plan.downcast_to_file_source::() + { + // SAFETY = parquet source always has a projection + let projection = parquet_source + .projection() + .expect("ParquetSource projection"); + let projection_vec = projection.iter().collect::>(); + if let Some(pexpr) = projection_vec.get(col_index) { + let actual_expr = pexpr.expr.clone(); + let new_scalar_func_expr = + merge_column_expressions(&expr, &actual_expr); + return vec![ExpressionSource { + plan: current.clone(), + expr: new_scalar_func_expr, + col_index, + visited_exprs, + }]; + } else { + return vec![]; + } + } + + return vec![]; + } + // col_index refers to the index of the column in current plan's input + if current.children().len() == 0 { + error!("unreachable code in find_source_for_column_expr: 0 children for plan {}", current.name()); + 
return vec![]; + } + // some plans have projection, like projection, filter, hash join, nested loop join + if let Some(proj) = current.as_any().downcast_ref::() { + if Arc::ptr_eq(current, plan) { + // the expression starts here + current = current.children()[0]; + expr_is_from_current_plan = false; + } else { + if let Some(expr_in_projection) = + proj.projection_expr().as_ref().get(col_index) + { + trace!( + target: "deep", + " dpp::find_sources_for_single_column_expr EXPR IN PROJECTION IS: {}", + expr_in_projection.to_string() + ); + + if expr_is_only_get_field_or_array_or_cast_and_contains_column( + &expr_in_projection.expr, + ) { + let column_in_projection = + get_first_column_from_expr(&expr_in_projection.expr).unwrap(); + expr = + merge_column_expressions(&expr, &expr_in_projection.expr); + col_index = column_in_projection.index(); + current = proj.children()[0]; + expr_is_from_current_plan = false; + continue 'outer; + } else { + trace!( + target: "deep", + " dpp::find_sources_for_single_column_expr can't handle this expression: {}", + expr.to_string() + ); + // we DON't do anything, don't know how to handle other things in projection + let out = find_sources_for_column_expr(&expr_in_projection.expr, current); + trace!( + target: "deep", + " dpp::find_sources_for_single_column_expr EXPANDED: {}", + out.iter().map(|es| es.expr.to_string()).collect::>().join(", ") + ); + return out; + } + } else { + return vec![]; + } + } + } + + if let Some(filter_child) = current.as_any().downcast_ref::() { + // filter has a projection, but we apply this ONLY if the expression comes from above this plan + // if the expression is in the actual FilterExec plan, we just go down + let new_col_index = if Arc::ptr_eq(plan, current) { + Some(&col_index) + } else { + if let Some(projection) = &filter_child.projection() { + projection.get(col_index) + } else { + Some(&col_index) + } + }; + if let Some(new_col_index) = new_col_index { + col_index = *new_col_index; + current = 
filter_child.input(); + expr_is_from_current_plan = false; + continue 'outer; + } else { + return vec![]; + } + } + if let Some(join) = current.as_any().downcast_ref::() { + if Arc::ptr_eq(current, plan) { + // if the expression originates in this plan + let mut found = false; + for (l, r) in join.on().iter() { + if &expr == l { + current = join.left(); + expr_is_from_current_plan = false; + found = true; + } + if &expr == r { + current = join.right(); + expr_is_from_current_plan = false; + found = true; + } + } + if !found { + warn!("Could not find expr {:?} in plan HashJoinExec", expr); + return vec![]; + } + continue 'outer; + } else { + if let Some((tmp_col_index, tmp_plan)) = + col_index_and_plan_for_join_like_plan( + join.left(), + join.right(), + &join.join_type, + join.projection.clone(), + col_index, + ) + { + col_index = tmp_col_index; + current = tmp_plan; + expr_is_from_current_plan = false; + continue 'outer; + } else { + return vec![]; + } + } + } + if current.children().len() == 1 + && let Some(join) = current.children()[0] + .as_any() + .downcast_ref::() + { + if let Some((tmp_col_index, tmp_plan)) = + col_index_and_plan_for_join_like_plan( + join.left(), + join.right(), + join.join_type(), + join.projection().clone(), + col_index, + ) + { + col_index = tmp_col_index; + current = tmp_plan; + expr_is_from_current_plan = false; + continue 'outer; + } else { + return vec![]; + } + } + if current.children().len() == 1 + && let Some(join) = current.children()[0] + .as_any() + .downcast_ref::() + { + if Arc::ptr_eq(plan, current) { + let mut found = false; + for (l, r) in join.on().iter() { + if &expr == l { + current = join.left(); + expr_is_from_current_plan = false; + found = true; + } + if &expr == r { + current = join.right(); + expr_is_from_current_plan = false; + found = true; + } + } + if !found { + warn!("Could not find expr {:?} in plan HashJoinExec", expr); + return vec![]; + } + continue 'outer; + } else { + if let Some((tmp_col_index, 
tmp_plan)) = + col_index_and_plan_for_join_like_plan( + join.left(), + join.right(), + join.join_type(), + None, + col_index, + ) + { + col_index = tmp_col_index; + current = tmp_plan; + expr_is_from_current_plan = false; + continue 'outer; + } else { + return vec![]; + } + } + } + if current.children().len() == 1 + && let Some(join) = current.children()[0] + .as_any() + .downcast_ref::() + { + if Arc::ptr_eq(current, plan) { + let mut found = false; + for (l, r) in join.on().iter() { + if &expr == l { + current = join.left(); + expr_is_from_current_plan = false; + found = true; + } + if &expr == r { + current = join.right(); + expr_is_from_current_plan = false; + found = true; + } + } + if !found { + warn!("Could not find expr {:?} in plan HashJoinExec", expr); + return vec![]; + } + continue 'outer; + } else { + if let Some((tmp_col_index, tmp_plan)) = + col_index_and_plan_for_join_like_plan( + join.left(), + join.right(), + &join.join_type(), + None, + col_index, + ) + { + col_index = tmp_col_index; + current = tmp_plan; + expr_is_from_current_plan = false; + continue 'outer; + } else { + return vec![]; + } + } + } + if current.children().len() == 1 + && let Some(union) = + current.children()[0].as_any().downcast_ref::() + { + let mut out = vec![]; + for union_child in union.children().iter() { + out.extend(find_sources_for_column_expr(&expr, union_child)) + } + return out; + } + + if current.children().len() == 1 { + current = current.children()[0]; + expr_is_from_current_plan = false; + continue 'outer; + } + + warn!( + "Should NOT reach here, number of children: {} for plan name {}", + current.children().len(), current.name() + ); + break 'outer; + } + } + vec![] +} + +pub fn col_index_and_plan_for_join_like_plan<'a>( + left: &'a Arc, + right: &'a Arc, + join_type: &JoinType, + projection_ref: Option, + col_index: usize, +) -> Option<(usize, &'a Arc)> { + let (_join_schema, column_indices) = + build_join_schema(&left.schema(), &right.schema(), &join_type); + 
let col_index_to_search_in_indices = if let Some(projection_ref) = projection_ref { + projection_ref.get(col_index).cloned() + } else { + Some(col_index) + }; + if col_index_to_search_in_indices.is_none() { + return None; + } + let col_index_to_search_in_indices = col_index_to_search_in_indices.unwrap(); + let column_index = column_indices.get(col_index_to_search_in_indices); + + if column_index.is_none() { + return None; + } + let column_index = column_index.unwrap(); + + match column_index.side { + JoinSide::Left => Some((column_index.index, left)), + JoinSide::Right => Some((column_index.index, right)), + JoinSide::None => None, + } +} diff --git a/datafusion/datasource-parquet/src/source.rs b/datafusion/datasource-parquet/src/source.rs index 75d87a4cd16fc..6a16152bb5838 100644 --- a/datafusion/datasource-parquet/src/source.rs +++ b/datafusion/datasource-parquet/src/source.rs @@ -286,6 +286,8 @@ pub struct ParquetSource { pub(crate) metadata_size_hint: Option, /// Projection to apply to the output. pub(crate) projection: ProjectionExprs, + pub projection_hints: ProjectionExprs, + pub projection_hints_indices: Vec, #[cfg(feature = "parquet_encryption")] pub(crate) encryption_factory: Option>, /// If true, read files in reverse order and reverse row groups within files. 
@@ -318,6 +320,8 @@ impl ParquetSource { #[cfg(feature = "parquet_encryption")] encryption_factory: None, reverse_row_groups: false, + projection_hints: ProjectionExprs::new(vec![]), + projection_hints_indices: vec![], } } @@ -544,6 +548,8 @@ impl FileSource for ParquetSource { let opener = Arc::new(ParquetOpener { partition_index: partition, projection: self.projection.clone(), + projection_hints: Some(self.projection_hints.clone()), + projection_hints_indices: self.projection_hints_indices.clone(), batch_size: self .batch_size .expect("Batch size must set before creating ParquetOpener"), From 9073807739ec982ca5009eef9f22358f5681cf62 Mon Sep 17 00:00:00 2001 From: Andrei Dragomir Date: Wed, 6 May 2026 14:19:20 +0300 Subject: [PATCH 2/5] [HSTACK] Deep projection physical serde --- datafusion/proto/proto/datafusion.proto | 3 ++ datafusion/proto/src/generated/pbjson.rs | 39 +++++++++++++++ datafusion/proto/src/generated/prost.rs | 4 ++ datafusion/proto/src/physical_plan/mod.rs | 59 ++++++++++++++++++++++- 4 files changed, 103 insertions(+), 2 deletions(-) diff --git a/datafusion/proto/proto/datafusion.proto b/datafusion/proto/proto/datafusion.proto index 37b31a84deab1..343d366bc482b 100644 --- a/datafusion/proto/proto/datafusion.proto +++ b/datafusion/proto/proto/datafusion.proto @@ -1083,6 +1083,9 @@ message ParquetScanExecNode { PhysicalExprNode predicate = 3; datafusion_common.TableParquetOptions parquet_options = 4; + + optional ProjectionExprs projection_hints = 5; + repeated uint64 projection_hints_indices = 6; } message CsvScanExecNode { diff --git a/datafusion/proto/src/generated/pbjson.rs b/datafusion/proto/src/generated/pbjson.rs index 419105c40c792..76afc7820c22f 100644 --- a/datafusion/proto/src/generated/pbjson.rs +++ b/datafusion/proto/src/generated/pbjson.rs @@ -14117,6 +14117,12 @@ impl serde::Serialize for ParquetScanExecNode { if self.parquet_options.is_some() { len += 1; } + if self.projection_hints.is_some() { + len += 1; + } + if 
!self.projection_hints_indices.is_empty() { + len += 1; + } let mut struct_ser = serializer.serialize_struct("datafusion.ParquetScanExecNode", len)?; if let Some(v) = self.base_conf.as_ref() { struct_ser.serialize_field("baseConf", v)?; @@ -14127,6 +14133,12 @@ impl serde::Serialize for ParquetScanExecNode { if let Some(v) = self.parquet_options.as_ref() { struct_ser.serialize_field("parquetOptions", v)?; } + if let Some(v) = self.projection_hints.as_ref() { + struct_ser.serialize_field("projectionHints", v)?; + } + if !self.projection_hints_indices.is_empty() { + struct_ser.serialize_field("projectionHintsIndices", &self.projection_hints_indices.iter().map(ToString::to_string).collect::>())?; + } struct_ser.end() } } @@ -14142,6 +14154,10 @@ impl<'de> serde::Deserialize<'de> for ParquetScanExecNode { "predicate", "parquet_options", "parquetOptions", + "projection_hints", + "projectionHints", + "projection_hints_indices", + "projectionHintsIndices", ]; #[allow(clippy::enum_variant_names)] @@ -14149,6 +14165,8 @@ impl<'de> serde::Deserialize<'de> for ParquetScanExecNode { BaseConf, Predicate, ParquetOptions, + ProjectionHints, + ProjectionHintsIndices, } impl<'de> serde::Deserialize<'de> for GeneratedField { fn deserialize(deserializer: D) -> std::result::Result @@ -14173,6 +14191,8 @@ impl<'de> serde::Deserialize<'de> for ParquetScanExecNode { "baseConf" | "base_conf" => Ok(GeneratedField::BaseConf), "predicate" => Ok(GeneratedField::Predicate), "parquetOptions" | "parquet_options" => Ok(GeneratedField::ParquetOptions), + "projectionHints" | "projection_hints" => Ok(GeneratedField::ProjectionHints), + "projectionHintsIndices" | "projection_hints_indices" => Ok(GeneratedField::ProjectionHintsIndices), _ => Err(serde::de::Error::unknown_field(value, FIELDS)), } } @@ -14195,6 +14215,8 @@ impl<'de> serde::Deserialize<'de> for ParquetScanExecNode { let mut base_conf__ = None; let mut predicate__ = None; let mut parquet_options__ = None; + let mut projection_hints__ = 
None; + let mut projection_hints_indices__ = None; while let Some(k) = map_.next_key()? { match k { GeneratedField::BaseConf => { @@ -14215,12 +14237,29 @@ impl<'de> serde::Deserialize<'de> for ParquetScanExecNode { } parquet_options__ = map_.next_value()?; } + GeneratedField::ProjectionHints => { + if projection_hints__.is_some() { + return Err(serde::de::Error::duplicate_field("projectionHints")); + } + projection_hints__ = map_.next_value()?; + } + GeneratedField::ProjectionHintsIndices => { + if projection_hints_indices__.is_some() { + return Err(serde::de::Error::duplicate_field("projectionHintsIndices")); + } + projection_hints_indices__ = + Some(map_.next_value::>>()? + .into_iter().map(|x| x.0).collect()) + ; + } } } Ok(ParquetScanExecNode { base_conf: base_conf__, predicate: predicate__, parquet_options: parquet_options__, + projection_hints: projection_hints__, + projection_hints_indices: projection_hints_indices__.unwrap_or_default(), }) } } diff --git a/datafusion/proto/src/generated/prost.rs b/datafusion/proto/src/generated/prost.rs index a0d4ef9e973c4..0fbe52faeac88 100644 --- a/datafusion/proto/src/generated/prost.rs +++ b/datafusion/proto/src/generated/prost.rs @@ -1629,6 +1629,10 @@ pub struct ParquetScanExecNode { pub parquet_options: ::core::option::Option< super::datafusion_common::TableParquetOptions, >, + #[prost(message, optional, tag = "5")] + pub projection_hints: ::core::option::Option, + #[prost(uint64, repeated, tag = "6")] + pub projection_hints_indices: ::prost::alloc::vec::Vec, } #[derive(Clone, PartialEq, ::prost::Message)] pub struct CsvScanExecNode { diff --git a/datafusion/proto/src/physical_plan/mod.rs b/datafusion/proto/src/physical_plan/mod.rs index 85406e31da614..ecd65acb89059 100644 --- a/datafusion/proto/src/physical_plan/mod.rs +++ b/datafusion/proto/src/physical_plan/mod.rs @@ -90,7 +90,7 @@ use datafusion_physical_plan::windows::{BoundedWindowAggExec, WindowAggExec}; use datafusion_physical_plan::{ExecutionPlan, 
InputOrderMode, PhysicalExpr, WindowExpr}; use prost::Message; use prost::bytes::BufMut; - +use datafusion_physical_expr::projection::ProjectionExprs; use self::from_proto::parse_protobuf_partitioning; use self::to_proto::serialize_partitioning; use crate::common::{byte_to_string, str_to_byte}; @@ -829,7 +829,7 @@ impl protobuf::PhysicalPlanNode { .collect(); Arc::new(Schema::new(projected_fields)) } else { - schema + schema.clone() }; let predicate = scan @@ -869,6 +869,35 @@ impl protobuf::PhysicalPlanNode { if let Some(predicate) = predicate { source = source.with_predicate(predicate); } + + if let Some(proto_projection_hints) = &scan.projection_hints { + let projection_hints: Vec = proto_projection_hints + .projections + .iter() + .map(|proto_expr| { + let expr = proto_converter.proto_to_physical_expr( + proto_expr.expr.as_ref().ok_or_else(|| { + internal_datafusion_err!("ProjectionExpr missing expr field") + })?, + ctx, + &schema.clone(), + codec, + )?; + Ok(ProjectionExpr::new(expr, proto_expr.alias.clone())) + }) + .collect::>>()?; + + let projection_hints = ProjectionExprs::new(projection_hints); + source.projection_hints = projection_hints; + + let projection_hints_indices = scan + .projection_hints_indices + .iter() + .map(|x| x.clone() as usize) + .collect::>(); + source.projection_hints_indices = projection_hints_indices; + } + let base_config = parse_protobuf_file_scan_config( base_conf, ctx, @@ -2933,6 +2962,30 @@ impl protobuf::PhysicalPlanNode { .filter() .map(|pred| proto_converter.physical_expr_to_proto(&pred, codec)) .transpose()?; + + let projection_hints = protobuf::ProjectionExprs { + projections: conf + .projection_hints + .as_ref() + .iter() + .map(|expr| { + Ok(protobuf::ProjectionExpr { + alias: expr.alias.to_string(), + expr: Some( + proto_converter + .physical_expr_to_proto(&expr.expr, codec)?, + ), + }) + }) + .collect::>>()? 
+ }; + + let projection_hints_indices = conf + .projection_hints_indices + .iter() + .map(|x| x.clone() as u64) + .collect::>(); + return Ok(Some(protobuf::PhysicalPlanNode { physical_plan_type: Some(PhysicalPlanType::ParquetScan( protobuf::ParquetScanExecNode { @@ -2943,6 +2996,8 @@ impl protobuf::PhysicalPlanNode { )?), predicate, parquet_options: Some(conf.table_parquet_options().try_into()?), + projection_hints: Some(projection_hints), + projection_hints_indices, }, )), })); From 4582d1288144fc4c000e9e883b9399508871de77 Mon Sep 17 00:00:00 2001 From: Adrian Tanase Date: Thu, 5 Feb 2026 21:40:26 +0200 Subject: [PATCH 3/5] [HSTACK] Add TreeNode trait implementation for substrait Rel --- Cargo.lock | 2 + datafusion/common/Cargo.toml | 3 + datafusion/common/src/lib.rs | 2 + datafusion/common/src/substrait_tree.rs | 638 ++++++++++++++++++ datafusion/substrait/tests/cases/mod.rs | 1 + datafusion/substrait/tests/cases/tree_node.rs | 86 +++ 6 files changed, 732 insertions(+) create mode 100644 datafusion/common/src/substrait_tree.rs create mode 100644 datafusion/substrait/tests/cases/tree_node.rs diff --git a/Cargo.lock b/Cargo.lock index 7a48dd65e59b3..52cae34249943 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1931,9 +1931,11 @@ dependencies = [ "object_store", "parquet", "paste", + "prost", "rand 0.9.2", "recursive", "sqlparser", + "substrait", "tokio", "web-time", ] diff --git a/datafusion/common/Cargo.toml b/datafusion/common/Cargo.toml index e4ba71e45c661..ff1495a53ea86 100644 --- a/datafusion/common/Cargo.toml +++ b/datafusion/common/Cargo.toml @@ -86,6 +86,9 @@ recursive = { workspace = true, optional = true } sqlparser = { workspace = true, optional = true } tokio = { workspace = true } +prost = { workspace = true } +substrait = { version = "0.62", features = ["serde"] } + [target.'cfg(target_family = "wasm")'.dependencies] web-time = "1.1.0" diff --git a/datafusion/common/src/lib.rs b/datafusion/common/src/lib.rs index b0867d8ce2e12..a27d0a7891e4c 100644 --- 
a/datafusion/common/src/lib.rs +++ b/datafusion/common/src/lib.rs @@ -62,6 +62,8 @@ pub mod tree_node; pub mod types; pub mod utils; pub mod deep; +pub mod substrait_tree; + /// Reexport arrow crate pub use arrow; pub use column::Column; diff --git a/datafusion/common/src/substrait_tree.rs b/datafusion/common/src/substrait_tree.rs new file mode 100644 index 0000000000000..80cc743e507d7 --- /dev/null +++ b/datafusion/common/src/substrait_tree.rs @@ -0,0 +1,168 @@
+//! [`TreeNode`] support for Substrait [`Rel`] plans.
+//!
+//! The children of a relation are its input relations; the expressions a
+//! relation carries are not visited.
+
+use crate::tree_node::{Transformed, TreeNode, TreeNodeIterator, TreeNodeRecursion};
+use crate::{DataFusionError, Result};
+use substrait::proto::{rel::RelType, Rel};
+
+/// Returns the input of a single-input relation, if it is set.
+fn single(input: &Option<Box<Rel>>) -> Vec<&Rel> {
+    input.as_deref().into_iter().collect()
+}
+
+/// Returns the sides of a two-input relation that are set, left before right.
+fn pair<'a>(
+    left: &'a Option<Box<Rel>>,
+    right: &'a Option<Box<Rel>>,
+) -> Vec<&'a Rel> {
+    left.as_deref().into_iter().chain(right.as_deref()).collect()
+}
+
+/// Returns the input relations of `rel`, in the order they are visited.
+///
+/// Leaf relations, and a `Rel` whose `rel_type` is unset, have no inputs.
+fn inputs(rel: &Rel) -> Vec<&Rel> {
+    let Some(rel_type) = &rel.rel_type else {
+        return vec![];
+    };
+    match rel_type {
+        RelType::Read(_)
+        | RelType::ExtensionLeaf(_)
+        | RelType::Reference(_)
+        | RelType::Update(_) => vec![],
+        RelType::Project(r) => single(&r.input),
+        RelType::Filter(r) => single(&r.input),
+        RelType::Fetch(r) => single(&r.input),
+        RelType::Aggregate(r) => single(&r.input),
+        RelType::Sort(r) => single(&r.input),
+        RelType::ExtensionSingle(r) => single(&r.input),
+        RelType::Exchange(r) => single(&r.input),
+        RelType::Write(r) => single(&r.input),
+        RelType::Window(r) => single(&r.input),
+        RelType::Expand(r) => single(&r.input),
+        // The only child of a DDL relation is its optional view definition.
+        RelType::Ddl(r) => single(&r.view_definition),
+        RelType::Set(r) => r.inputs.iter().collect(),
+        RelType::ExtensionMulti(r) => r.inputs.iter().collect(),
+        RelType::Join(r) => pair(&r.left, &r.right),
+        RelType::Cross(r) => pair(&r.left, &r.right),
+        RelType::HashJoin(r) => pair(&r.left, &r.right),
+        RelType::MergeJoin(r) => pair(&r.left, &r.right),
+        RelType::NestedLoopJoin(r) => pair(&r.left, &r.right),
+    }
+}
+
+/// Applies `f` to the boxed relation and re-boxes the result.
+fn transform_box<F: FnMut(Rel) -> Result<Transformed<Rel>>>(
+    br: Box<Rel>,
+    f: &mut F,
+) -> Result<Transformed<Box<Rel>>> {
+    Ok(f(*br)?.update_data(Box::new))
+}
+
+/// Applies `f` to the relation if there is one; `None` is returned untransformed.
+fn transform_option_box<F: FnMut(Rel) -> Result<Transformed<Rel>>>(
+    obr: Option<Box<Rel>>,
+    f: &mut F,
+) -> Result<Transformed<Option<Box<Rel>>>> {
+    obr.map_or(Ok(Transformed::no(None)), |be| {
+        Ok(transform_box(be, f)?.update_data(Some))
+    })
+}
+
+/// Rewrites an optional input in place and reports whether it changed and how
+/// the traversal should continue.
+fn rewrite_slot<F: FnMut(Rel) -> Result<Transformed<Rel>>>(
+    slot: &mut Option<Box<Rel>>,
+    f: &mut F,
+) -> Result<Transformed<()>> {
+    let rewritten = transform_option_box(slot.take(), f)?;
+    *slot = rewritten.data;
+    Ok(Transformed::new((), rewritten.transformed, rewritten.tnr))
+}
+
+/// Rewrites both sides of a two-input relation in place, left before right.
+///
+/// The right side is left untouched when rewriting the left side returned
+/// [`TreeNodeRecursion::Stop`].
+fn rewrite_pair<F: FnMut(Rel) -> Result<Transformed<Rel>>>(
+    left: &mut Option<Box<Rel>>,
+    right: &mut Option<Box<Rel>>,
+    f: &mut F,
+) -> Result<Transformed<()>> {
+    let lhs = rewrite_slot(left, f)?;
+    if lhs.tnr == TreeNodeRecursion::Stop {
+        return Ok(lhs);
+    }
+    let rhs = rewrite_slot(right, f)?;
+    Ok(Transformed::new((), lhs.transformed || rhs.transformed, rhs.tnr))
+}
+
+/// Rewrites every input of a multi-input relation in place.
+///
+/// Errors returned by `f` are propagated, and inputs after a
+/// [`TreeNodeRecursion::Stop`] are kept as they are.
+fn rewrite_all<F: FnMut(Rel) -> Result<Transformed<Rel>>>(
+    inputs: &mut Vec<Rel>,
+    f: &mut F,
+) -> Result<Transformed<()>> {
+    let rewritten = std::mem::take(inputs)
+        .into_iter()
+        .map_until_stop_and_collect(&mut *f)?;
+    *inputs = rewritten.data;
+    Ok(Transformed::new((), rewritten.transformed, rewritten.tnr))
+}
+
+impl TreeNode for Rel {
+    fn apply_children<'n, F: FnMut(&'n Self) -> Result<TreeNodeRecursion>>(
+        &'n self,
+        f: F,
+    ) -> Result<TreeNodeRecursion> {
+        inputs(self).into_iter().apply_until_stop(f)
+    }
+
+    fn map_children<F: FnMut(Self) -> Result<Transformed<Self>>>(
+        self,
+        mut f: F,
+    ) -> Result<Transformed<Self>> {
+        // NOTE(review): `apply_children` treats an unset `rel_type` as a leaf
+        // while this method rejects it; the error is kept for compatibility.
+        let Some(mut rel_type) = self.rel_type else {
+            return Err(DataFusionError::Plan("RelType is None".into()));
+        };
+        let f = &mut f;
+        // Inputs are rewritten in place, so the remaining fields of each
+        // relation are carried over without being listed here.
+        let outcome = match &mut rel_type {
+            RelType::Read(_)
+            | RelType::ExtensionLeaf(_)
+            | RelType::Reference(_)
+            | RelType::Update(_) => Transformed::no(()),
+            RelType::Project(r) => rewrite_slot(&mut r.input, f)?,
+            RelType::Filter(r) => rewrite_slot(&mut r.input, f)?,
+            RelType::Fetch(r) => rewrite_slot(&mut r.input, f)?,
+            RelType::Aggregate(r) => rewrite_slot(&mut r.input, f)?,
+            RelType::Sort(r) => rewrite_slot(&mut r.input, f)?,
+            RelType::ExtensionSingle(r) => rewrite_slot(&mut r.input, f)?,
+            RelType::Exchange(r) => rewrite_slot(&mut r.input, f)?,
+            RelType::Write(r) => rewrite_slot(&mut r.input, f)?,
+            RelType::Window(r) => rewrite_slot(&mut r.input, f)?,
+            RelType::Expand(r) => rewrite_slot(&mut r.input, f)?,
+            RelType::Ddl(r) => rewrite_slot(&mut r.view_definition, f)?,
+            RelType::Set(r) => rewrite_all(&mut r.inputs, f)?,
+            RelType::ExtensionMulti(r) => rewrite_all(&mut r.inputs, f)?,
+            RelType::Join(r) => rewrite_pair(&mut r.left, &mut r.right, f)?,
+            RelType::Cross(r) => rewrite_pair(&mut r.left, &mut r.right, f)?,
+            RelType::HashJoin(r) => rewrite_pair(&mut r.left, &mut r.right, f)?,
+            RelType::MergeJoin(r) => rewrite_pair(&mut r.left, &mut r.right, f)?,
+            RelType::NestedLoopJoin(r) => {
+                rewrite_pair(&mut r.left, &mut r.right, f)?
+            }
+        };
+        Ok(outcome.update_data(|()| Rel {
+            rel_type: Some(rel_type),
+        }))
+    }
+}
diff --git a/datafusion/substrait/tests/cases/mod.rs b/datafusion/substrait/tests/cases/mod.rs index 0870c56cd3ba2..1601b47f4ac36 100644 --- a/datafusion/substrait/tests/cases/mod.rs +++ b/datafusion/substrait/tests/cases/mod.rs @@ -26,3 +26,4
@@ mod roundtrip_logical_plan; mod roundtrip_physical_plan; mod serialize; mod substrait_validations; +mod tree_node; diff --git a/datafusion/substrait/tests/cases/tree_node.rs b/datafusion/substrait/tests/cases/tree_node.rs new file mode 100644 index 0000000000000..8606699558582 --- /dev/null +++ b/datafusion/substrait/tests/cases/tree_node.rs @@ -0,0 +1,86 @@
+//! Tests for TreeNode Compatibility
+
+#[cfg(test)]
+mod tests {
+    use datafusion::common::tree_node::{Transformed, TreeNode, TreeNodeRecursion};
+    use datafusion::common::Result;
+    use std::fs::File;
+    use std::io::BufReader;
+    use substrait::proto::plan_rel::RelType;
+    use substrait::proto::rel::RelType::Project;
+    use substrait::proto::{Plan, Rel};
+
+    /// Fixture plan used by every test in this module.
+    const PLAN_PATH: &str = "tests/testdata/contains_plan.substrait.json";
+
+    /// Loads the fixture and returns the top relation of each plan relation.
+    fn load_rels() -> Vec<Rel> {
+        let file = File::open(PLAN_PATH).expect("file not found");
+        let plan: Plan =
+            serde_json::from_reader(BufReader::new(file)).expect("failed to parse json");
+        plan.relations
+            .into_iter()
+            .map(|r| match r.rel_type.expect("plan relation has no rel_type") {
+                RelType::Rel(rel) => rel,
+                RelType::Root(root) => root.input.expect("root relation has no input"),
+            })
+            .collect()
+    }
+
+    /// Counts the nodes of `rel` (itself included) that satisfy `pred`.
+    fn count_nodes(rel: &Rel, pred: impl Fn(&Rel) -> bool) -> Result<usize> {
+        let mut count = 0;
+        rel.apply(|r| {
+            if pred(r) {
+                count += 1;
+            }
+            Ok(TreeNodeRecursion::Continue)
+        })?;
+        Ok(count)
+    }
+
+    /// True for a `ProjectRel` node.
+    fn is_project(r: &Rel) -> bool {
+        matches!(&r.rel_type, Some(Project(_)))
+    }
+
+    /// True for a `ProjectRel` node whose `common` field is still set.
+    fn is_project_with_common(r: &Rel) -> bool {
+        matches!(&r.rel_type, Some(Project(p)) if p.common.is_some())
+    }
+
+    #[test]
+    fn tree_visit() -> Result<()> {
+        for rel in load_rels() {
+            // The traversal starts at `rel`, so at least that node is visited.
+            assert!(count_nodes(&rel, |_| true)? >= 1);
+        }
+
+        Ok(())
+    }
+
+    #[test]
+    fn tree_map() -> Result<()> {
+        for rel in load_rels() {
+            let projects = count_nodes(&rel, is_project)?;
+
+            // rewrite ProjectRel node - remove common field
+            let rewritten = rel.transform(|mut r| {
+                if let Some(Project(p)) = &mut r.rel_type {
+                    p.common = None;
+                    Ok(Transformed::yes(r))
+                } else {
+                    Ok(Transformed::no(r))
+                }
+            })?;
+
+            // The rewrite is reported exactly when there was something to rewrite.
+            assert_eq!(rewritten.transformed, projects > 0);
+            // No node is added or dropped, and no ProjectRel keeps `common`.
+            assert_eq!(count_nodes(&rewritten.data, is_project)?, projects);
+            assert_eq!(count_nodes(&rewritten.data, is_project_with_common)?, 0);
+        }
+
+        Ok(())
+    }
+}
From 2056b4243ccd4cc26b5783530831f051a1d0ab27 Mon Sep 17 00:00:00 2001 From: Andrei Dragomir Date: Fri, 8 May 2026 23:08:49 +0300 Subject: [PATCH 4/5] [HSTACK] fix: don't crash if an object_store is not available on deserialize - We need this so that our code that fills up object stores AFTER deserialization doesn't crash --- datafusion/proto/src/physical_plan/mod.rs | 33 ++++++++++++++++------- 1 file changed, 24 insertions(+), 9 deletions(-) diff --git a/datafusion/proto/src/physical_plan/mod.rs b/datafusion/proto/src/physical_plan/mod.rs index ecd65acb89059..392a8e2b971c5 100644 --- a/datafusion/proto/src/physical_plan/mod.rs +++ b/datafusion/proto/src/physical_plan/mod.rs @@ -856,15 +856,30 @@ impl protobuf::PhysicalPlanNode { false => ObjectStoreUrl::parse(&base_conf.object_store_url)?, true => ObjectStoreUrl::local_filesystem(), }; - let store = ctx.runtime_env().object_store(object_store_url)?; - let metadata_cache = - ctx.runtime_env().cache_manager.get_file_metadata_cache(); - let reader_factory = - Arc::new(CachedParquetFileReaderFactory::new(store, metadata_cache)); - - let mut source = ParquetSource::new(table_schema) - .with_parquet_file_reader_factory(reader_factory) - .with_table_parquet_options(options); + // let store = ctx.runtime_env().object_store(object_store_url)?; + // let metadata_cache = + // ctx.runtime_env().cache_manager.get_file_metadata_cache(); + // let reader_factory = + // Arc::new(CachedParquetFileReaderFactory::new(store, metadata_cache)); + // + // let mut source = ParquetSource::new(table_schema) +
// .with_parquet_file_reader_factory(reader_factory) + // .with_table_parquet_options(options); + + // FIXME: @HSTack - we re-register delta object stores AFTER deserialization + let mut source = if let Ok(store) = ctx.runtime_env().object_store(object_store_url) { + let metadata_cache = + ctx.runtime_env().cache_manager.get_file_metadata_cache(); + let reader_factory = + Arc::new(CachedParquetFileReaderFactory::new(store, metadata_cache)); + + ParquetSource::new(table_schema) + .with_parquet_file_reader_factory(reader_factory) + .with_table_parquet_options(options) + } else { + ParquetSource::new(table_schema) + .with_table_parquet_options(options) + }; if let Some(predicate) = predicate { source = source.with_predicate(predicate); From 1d569b2b4861eb7bb9d9102a65d3204d4c6a5050 Mon Sep 17 00:00:00 2001 From: Catalin Dobre Date: Tue, 26 May 2026 16:44:49 +0300 Subject: [PATCH 5/5] fixup: parquet opener info -> trace --- datafusion/datasource-parquet/src/opener.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datafusion/datasource-parquet/src/opener.rs b/datafusion/datasource-parquet/src/opener.rs index 27f8f0fabc0de..c1a2eb6fb486f 100644 --- a/datafusion/datasource-parquet/src/opener.rs +++ b/datafusion/datasource-parquet/src/opener.rs @@ -58,7 +58,7 @@ use datafusion_common::config::EncryptionFactoryOptions; use datafusion_execution::parquet_encryption::EncryptionFactory; use futures::{Stream, StreamExt, TryStreamExt, ready}; use itertools::Itertools; -use log::{debug, info, trace}; +use log::{debug, trace}; use parquet::arrow::arrow_reader::metrics::ArrowReaderMetrics; use parquet::arrow::arrow_reader::{ ArrowReaderMetadata, ArrowReaderOptions, RowSelectionPolicy, @@ -714,7 +714,7 @@ impl FileOpener for ParquetOpener { indices ) }; - info!("ParquetOpener::open actual final mask: {:?}", &mask); + trace!(target:"deep", "ParquetOpener::open actual final mask: {:?}", &mask); let stream = builder .with_projection(mask)