-
Notifications
You must be signed in to change notification settings - Fork 2.5k
fix(schema): permit opt-in timestamp-precision evolution #19028
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -52,29 +52,39 @@ public class SchemaChangeUtils { | |
| * @param dst new column type. | ||
| * @return whether to allow the column type to be updated. | ||
| */ | ||
| public static boolean isTypeUpdateAllow(Type src, Type dst) { | ||
| /** | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🤖 The PR description notes "Pre-existing overloads kept as delegates," which is followed for - AI-generated; verify before applying. React 👍/👎 to flag quality. |
||
| * Variant that allows opting in to timestamp-millis <-> timestamp-micros (and the local-timestamp | ||
| * variants) precision evolution, which is rejected by default. | ||
| */ | ||
| public static boolean isTypeUpdateAllow(Type src, Type dst, boolean allowTimestampPrecisionEvolution) { | ||
| if (src.isNestedType() || dst.isNestedType()) { | ||
| throw new IllegalArgumentException("only support update primitive type"); | ||
| } | ||
| if (src.equals(dst)) { | ||
| return true; | ||
| } | ||
| return isTypeUpdateAllowInternal(src, dst); | ||
| return isTypeUpdateAllowInternal(src, dst, allowTimestampPrecisionEvolution); | ||
| } | ||
|
|
||
| public static boolean shouldPromoteType(Type src, Type dst) { | ||
| if (src.equals(dst) || src.isNestedType() || dst.isNestedType()) { | ||
| return false; | ||
| } | ||
| return isTypeUpdateAllowInternal(src, dst); | ||
| return isTypeUpdateAllowInternal(src, dst, false); | ||
| } | ||
|
|
||
| private static boolean isTypeUpdateAllowInternal(Type src, Type dst) { | ||
| private static boolean isTypeUpdateAllowInternal(Type src, Type dst, boolean allowTimestampPrecisionEvolution) { | ||
| switch (src.typeId()) { | ||
| case INT: | ||
| return dst == Types.LongType.get() || dst == Types.FloatType.get() | ||
| || dst == Types.DoubleType.get() || dst == Types.StringType.get() || dst.typeId() == Type.TypeID.DECIMAL || dst.typeId() == Type.TypeID.DECIMAL_FIXED; | ||
| case LONG: | ||
| if (allowTimestampPrecisionEvolution | ||
| && (dst.typeId() == Type.TypeID.LOCAL_TIMESTAMP_MILLIS || dst.typeId() == Type.TypeID.LOCAL_TIMESTAMP_MICROS)) { | ||
| // Forward-fix path: 0.x stored local-timestamp columns as bare long because its converter | ||
| // did not recognize the logical type. Allow attaching the logical type when the gate is open. | ||
| return true; | ||
| } | ||
| return dst == Types.FloatType.get() || dst == Types.DoubleType.get() || dst == Types.StringType.get() || dst.typeId() == Type.TypeID.DECIMAL || dst.typeId() == Type.TypeID.DECIMAL_FIXED; | ||
| case FLOAT: | ||
| return dst == Types.DoubleType.get() || dst == Types.StringType.get() || dst.typeId() == Type.TypeID.DECIMAL || dst.typeId() == Type.TypeID.DECIMAL_FIXED; | ||
|
|
@@ -90,6 +100,18 @@ private static boolean isTypeUpdateAllowInternal(Type src, Type dst) { | |
| return isDecimalFixedUpdateAllowInternal(src, dst); | ||
| case STRING: | ||
| return dst == Types.DateType.get() || dst.typeId() == Type.TypeID.DECIMAL || dst.typeId() == Type.TypeID.DECIMAL_FIXED || dst == Types.BinaryType.get(); | ||
| case TIMESTAMP: | ||
| case TIMESTAMP_MILLIS: | ||
| if (!allowTimestampPrecisionEvolution) { | ||
| return false; | ||
| } | ||
| return dst.typeId() == Type.TypeID.TIMESTAMP || dst.typeId() == Type.TypeID.TIMESTAMP_MILLIS; | ||
| case LOCAL_TIMESTAMP_MILLIS: | ||
| case LOCAL_TIMESTAMP_MICROS: | ||
| if (!allowTimestampPrecisionEvolution) { | ||
| return false; | ||
| } | ||
| return dst.typeId() == Type.TypeID.LOCAL_TIMESTAMP_MILLIS || dst.typeId() == Type.TypeID.LOCAL_TIMESTAMP_MICROS; | ||
| default: | ||
| return false; | ||
| } | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -169,11 +169,15 @@ object HoodieSchemaUtils { | |
| HoodieWriteConfig.SCHEMA_ALLOW_AUTO_EVOLUTION_COLUMN_DROP.defaultValue).toBoolean | ||
| val setNullForMissingColumns = opts.getOrElse(DataSourceWriteOptions.SET_NULL_FOR_MISSING_COLUMNS.key(), | ||
| DataSourceWriteOptions.SET_NULL_FOR_MISSING_COLUMNS.defaultValue).toBoolean | ||
| val allowTimestampPrecisionEvolution = opts.getOrElse(HoodieCommonConfig.ALLOW_TIMESTAMP_PRECISION_EVOLUTION.key, | ||
| HoodieCommonConfig.ALLOW_TIMESTAMP_PRECISION_EVOLUTION.defaultValue.toString).toBoolean | ||
|
|
||
| if (!mergeIntoWrites && !shouldValidateSchemasCompatibility && !allowAutoEvolutionColumnDrop) { | ||
| // Default behaviour | ||
| val reconciledSchema = if (setNullForMissingColumns) { | ||
| HoodieSchema.fromAvroSchema(AvroSchemaEvolutionUtils.reconcileSchema(canonicalizedSourceSchema.toAvroSchema(), latestTableSchema.toAvroSchema(), setNullForMissingColumns)) | ||
| HoodieSchema.fromAvroSchema(AvroSchemaEvolutionUtils.reconcileSchema( | ||
| canonicalizedSourceSchema.toAvroSchema(), latestTableSchema.toAvroSchema(), | ||
| setNullForMissingColumns, allowTimestampPrecisionEvolution)) | ||
| } else { | ||
| canonicalizedSourceSchema | ||
| } | ||
|
|
@@ -205,7 +209,10 @@ object HoodieSchemaUtils { | |
| // Apply schema evolution, by auto-merging write schema and read schema | ||
| val setNullForMissingColumns = opts.getOrElse(HoodieCommonConfig.SET_NULL_FOR_MISSING_COLUMNS.key(), | ||
| HoodieCommonConfig.SET_NULL_FOR_MISSING_COLUMNS.defaultValue()).toBoolean | ||
| val mergedInternalSchema = AvroSchemaEvolutionUtils.reconcileSchema(canonicalizedSourceSchema.toAvroSchema(), internalSchema, setNullForMissingColumns) | ||
| val allowTimestampPrecisionEvolution = opts.getOrElse(HoodieCommonConfig.ALLOW_TIMESTAMP_PRECISION_EVOLUTION.key, | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🤖 nit: - AI-generated; verify before applying. React 👍/👎 to flag quality. |
||
| HoodieCommonConfig.ALLOW_TIMESTAMP_PRECISION_EVOLUTION.defaultValue.toString).toBoolean | ||
| val mergedInternalSchema = AvroSchemaEvolutionUtils.reconcileSchema(canonicalizedSourceSchema.toAvroSchema(), internalSchema, | ||
| setNullForMissingColumns, allowTimestampPrecisionEvolution) | ||
| val evolvedSchema = InternalSchemaConverter.convert(mergedInternalSchema, latestTableSchema.getFullName) | ||
| val shouldRemoveMetaDataFromInternalSchema = sourceSchema.getFields.asScala.filter(f => f.name().equalsIgnoreCase(HoodieRecord.RECORD_KEY_METADATA_FIELD)).isEmpty | ||
| if (shouldRemoveMetaDataFromInternalSchema) HoodieCommonSchemaUtils.removeMetadataFields(evolvedSchema) else evolvedSchema | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
🤖 Should the docs make it explicit that this only relabels the logical type — it does not rewrite stored values? Since old base files keep their original encoding, enabling the gate when actual values don't match the new precision can lead to misinterpretation on read. A one-liner like "Stored values are not rewritten; only enable when the actual values already match the target precision" would steer users away from misuse.
- AI-generated; verify before applying. React 👍/👎 to flag quality.