Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
114 changes: 114 additions & 0 deletions arrow-schema/src/datatype.rs
Original file line number Diff line number Diff line change
Expand Up @@ -722,6 +722,13 @@ impl DataType {
})
})
}
// Timestamps with matching unit whose timezones are both recognized UTC
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems to me that this cast / comparsion should be done at the application layer (aka the thing calling the parquet writer). My rationale is that there are many other places in the codbase that compare arrays based on exact equality of DataType, and don't do this timezone normalization.

Perhaps in the iceberg code you could add a call to cast for all columns before you pass it to the parquet writer . The cast is a noop for arrays of the already correct type

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah I started drafting that but when I realised DataFusion has to do something similar I thought it might be worth pushing it lower in the stack.

Is there a case where we wouldn't want UTC to equal +00:00?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Gave it a go in iceberg-rust here apache/iceberg-rust#2477 to see what it would look like

// aliases are considered equivalent.
(DataType::Timestamp(a_unit, Some(a_tz)), DataType::Timestamp(b_unit, Some(b_tz)))
if a_unit == b_unit && a_tz != b_tz =>
{
is_utc_alias(a_tz) && is_utc_alias(b_tz)
}
_ => self == other,
}
}
Expand Down Expand Up @@ -874,6 +881,16 @@ impl DataType {
}
}

/// Returns true when `tz` is one of the recognized UTC timezone aliases.
///
/// Producers of arrow batches use a variety of strings to denote UTC: `"UTC"`,
/// `"+00:00"` and `"Z"` are all common and semantically
/// identical. Treating these as interchangeable lets writers accept batches from
/// upstream systems that disagree on the canonical spelling.
pub fn is_utc_alias(tz: &str) -> bool {
matches!(tz, "UTC" | "+00:00" | "Z")
}

/// The maximum precision for [DataType::Decimal32] values
pub const DECIMAL32_MAX_PRECISION: u8 = 9;

Expand Down Expand Up @@ -1290,4 +1307,101 @@ mod tests {
)
");
}

#[test]
fn test_is_utc_alias() {
assert!(is_utc_alias("UTC"));
assert!(is_utc_alias("+00:00"));
assert!(is_utc_alias("Z"));
assert!(!is_utc_alias("America/New_York"));
assert!(!is_utc_alias("+05:30"));
assert!(!is_utc_alias(""));
assert!(!is_utc_alias("utc"));
assert!(!is_utc_alias("+0000"));
assert!(!is_utc_alias("+00"));
}

#[test]
fn test_equals_datatype_utc_aliases_flat() {
use crate::TimeUnit;
let aliases = ["UTC", "+00:00", "Z"];
for a in &aliases {
for b in &aliases {
let ta = DataType::Timestamp(TimeUnit::Microsecond, Some((*a).into()));
let tb = DataType::Timestamp(TimeUnit::Microsecond, Some((*b).into()));
assert!(ta.equals_datatype(&tb), "{a} should be equivalent to {b}",);
}
}
}

#[test]
fn test_equals_datatype_utc_aliases_nested() {
use crate::TimeUnit;
let leaf_a = DataType::Timestamp(TimeUnit::Microsecond, Some("+00:00".into()));
let leaf_b = DataType::Timestamp(TimeUnit::Microsecond, Some("UTC".into()));

// List
let list_a = DataType::List(Arc::new(Field::new("item", leaf_a.clone(), true)));
let list_b = DataType::List(Arc::new(Field::new("item", leaf_b.clone(), true)));
assert!(list_a.equals_datatype(&list_b));

// LargeList
let ll_a = DataType::LargeList(Arc::new(Field::new("item", leaf_a.clone(), true)));
let ll_b = DataType::LargeList(Arc::new(Field::new("item", leaf_b.clone(), true)));
assert!(ll_a.equals_datatype(&ll_b));

// FixedSizeList
let fsl_a = DataType::FixedSizeList(Arc::new(Field::new("item", leaf_a.clone(), true)), 3);
let fsl_b = DataType::FixedSizeList(Arc::new(Field::new("item", leaf_b.clone(), true)), 3);
assert!(fsl_a.equals_datatype(&fsl_b));

// Struct
let struct_a = DataType::Struct(vec![Field::new("ts", leaf_a.clone(), true)].into());
let struct_b = DataType::Struct(vec![Field::new("ts", leaf_b.clone(), true)].into());
assert!(struct_a.equals_datatype(&struct_b));

// Map
let map_a = DataType::Map(
Arc::new(Field::new(
"entries",
DataType::Struct(
vec![
Field::new("keys", DataType::Utf8, false),
Field::new("values", leaf_a.clone(), true),
]
.into(),
),
false,
)),
false,
);
let map_b = DataType::Map(
Arc::new(Field::new(
"entries",
DataType::Struct(
vec![
Field::new("keys", DataType::Utf8, false),
Field::new("values", leaf_b.clone(), true),
]
.into(),
),
false,
)),
false,
);
assert!(map_a.equals_datatype(&map_b));
}

#[test]
fn test_equals_datatype_non_utc_timezones_differ() {
use crate::TimeUnit;
let utc = DataType::Timestamp(TimeUnit::Microsecond, Some("+00:00".into()));
let est = DataType::Timestamp(TimeUnit::Microsecond, Some("America/New_York".into()));
let offset = DataType::Timestamp(TimeUnit::Microsecond, Some("+05:30".into()));
let diff_unit = DataType::Timestamp(TimeUnit::Nanosecond, Some("UTC".into()));

assert!(!utc.equals_datatype(&est));
assert!(!utc.equals_datatype(&offset));
assert!(!utc.equals_datatype(&diff_unit));
}
}
18 changes: 17 additions & 1 deletion parquet/src/arrow/arrow_writer/levels.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1029,7 +1029,7 @@ mod tests {
use arrow_buffer::{Buffer, ToByteSlice};
use arrow_cast::display::array_value_to_string;
use arrow_data::{ArrayData, ArrayDataBuilder};
use arrow_schema::{Fields, Schema};
use arrow_schema::{Fields, Schema, TimeUnit};

#[test]
fn test_calculate_array_levels_twitter_example() {
Expand Down Expand Up @@ -2297,6 +2297,22 @@ mod tests {
assert_eq!(levels[0], expected_level);
}

#[test]
fn timestamp_utc_aliases_accepted_by_writer() {
// Verifies that LevelInfoBuilder::try_new (the writer's entry point)
// accepts arrays whose timezone is a UTC alias different from the field's.
// Detailed equivalence tests live in arrow_schema::datatype::tests.
let field = Field::new(
"ts",
DataType::Timestamp(TimeUnit::Microsecond, Some("+00:00".into())),
true,
);
let array = Arc::new(TimestampMicrosecondArray::from(vec![0_i64, 1]).with_timezone("UTC"))
as ArrayRef;
LevelInfoBuilder::try_new(&field, Default::default(), &array)
.expect("UTC and +00:00 should be treated as compatible");
}

#[test]
fn mismatched_types() {
let array = Arc::new(Int32Array::from_iter(0..10)) as ArrayRef;
Expand Down
Loading