-
Notifications
You must be signed in to change notification settings - Fork 2.5k
feat(hive-sync): parallelize DROP partitions in HiveQL sync mode #19033
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
71946a6
49a7005
4bb34bc
acefd03
7022e7d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -37,6 +37,8 @@ | |
| import org.apache.hudi.hive.ddl.HiveSyncMode; | ||
| import org.apache.hudi.hive.ddl.JDBCBasedMetadataOperator; | ||
| import org.apache.hudi.hive.ddl.JDBCExecutor; | ||
| import org.apache.hudi.hive.util.HiveDriverPool; | ||
| import org.apache.hudi.hive.util.IMetaStoreClientPool; | ||
| import org.apache.hudi.hive.util.IMetaStoreClientUtil; | ||
| import org.apache.hudi.hive.util.PartitionFilterGenerator; | ||
| import org.apache.hudi.sync.common.HoodieSyncClient; | ||
|
|
@@ -66,6 +68,8 @@ | |
| import static org.apache.hudi.hadoop.utils.HoodieInputFormatUtils.getInputFormatClassName; | ||
| import static org.apache.hudi.hadoop.utils.HoodieInputFormatUtils.getOutputFormatClassName; | ||
| import static org.apache.hudi.hadoop.utils.HoodieInputFormatUtils.getSerDeClassName; | ||
| import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_SYNC_BATCHING_ENABLED; | ||
| import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_SYNC_BATCHING_THREADS; | ||
| import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_SYNC_MODE; | ||
| import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_SYNC_USE_SPARK_CATALOG; | ||
| import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_USE_JDBC; | ||
|
|
@@ -86,6 +90,16 @@ public class HoodieHiveSyncClient extends HoodieSyncClient { | |
| private final Map<String, Table> initialTableByName = new HashMap<>(); | ||
| DDLExecutor ddlExecutor; | ||
| private IMetaStoreClient client; | ||
| // Non-null only when HIVE_SYNC_BATCHING_ENABLED and sync mode is HMS or HIVEQL. | ||
| // Owned by this class; closed in close() before Hive.closeCurrent(). Hands clients | ||
| // to the active DDL executor for partition-row work only — see IMetaStoreClientPool | ||
| // javadoc. HMSDDLExecutor uses it for add/alter/drop; HiveQueryDDLExecutor uses it | ||
| // only for DROP (Hive Thrift, not Hive Driver). | ||
| private IMetaStoreClientPool partitionClientPool; | ||
| // Present only when HIVE_SYNC_BATCHING_ENABLED and sync mode is HIVEQL (explicit | ||
| // or legacy default). Owned by HiveQueryDDLExecutor; this field is kept for | ||
| // reference only — close() is delegated through ddlExecutor.close(). | ||
| private Option<HiveDriverPool> partitionDriverPool = Option.empty(); | ||
|
|
||
| /** | ||
| * JDBC-based metadata operator, lazily initialized on first Thrift | ||
|
|
@@ -121,10 +135,14 @@ public HoodieHiveSyncClient(HiveSyncConfig config, HoodieTableMetaClient metaCli | |
| HiveSyncMode syncMode = HiveSyncMode.of(config.getString(HIVE_SYNC_MODE)); | ||
| switch (syncMode) { | ||
| case HMS: | ||
| ddlExecutor = new HMSDDLExecutor(config, this.client); | ||
| this.partitionClientPool = maybeBuildPartitionClientPool(config); | ||
| ddlExecutor = new HMSDDLExecutor(config, this.client, this.partitionClientPool); | ||
| break; | ||
| case HIVEQL: | ||
| ddlExecutor = new HiveQueryDDLExecutor(config, this.client); | ||
| this.partitionDriverPool = maybeBuildHiveDriverPool(config); | ||
| this.partitionClientPool = maybeBuildPartitionClientPool(config); | ||
| ddlExecutor = new HiveQueryDDLExecutor(config, this.client, this.partitionDriverPool, | ||
| Option.ofNullable(this.partitionClientPool)); | ||
| break; | ||
| case JDBC: | ||
| JDBCExecutor jdbcExecutor = new JDBCExecutor(config); | ||
|
|
@@ -142,7 +160,10 @@ public HoodieHiveSyncClient(HiveSyncConfig config, HoodieTableMetaClient metaCli | |
| jdbcMetadataOperator = new JDBCBasedMetadataOperator( | ||
| jdbcExecutor.getConnection(), databaseName); | ||
| } else { | ||
| ddlExecutor = new HiveQueryDDLExecutor(config, this.client); | ||
| this.partitionDriverPool = maybeBuildHiveDriverPool(config); | ||
| this.partitionClientPool = maybeBuildPartitionClientPool(config); | ||
| ddlExecutor = new HiveQueryDDLExecutor(config, this.client, this.partitionDriverPool, | ||
| Option.ofNullable(this.partitionClientPool)); | ||
| } | ||
| } | ||
| } catch (Exception e) { | ||
|
|
@@ -201,6 +222,30 @@ private IMetaStoreClient createMetaStoreClient(HiveSyncConfig config) { | |
| } | ||
| } | ||
|
|
||
| private IMetaStoreClientPool maybeBuildPartitionClientPool(HiveSyncConfig config) { | ||
| if (!config.getBooleanOrDefault(HIVE_SYNC_BATCHING_ENABLED)) { | ||
| return null; | ||
| } | ||
| if (config.getBooleanOrDefault(HIVE_SYNC_USE_SPARK_CATALOG)) { | ||
| // The Spark catalog client is constructed via reflection against a Spark-side | ||
| // class and isn't compatible with the direct RetryingMetaStoreClient pool path. | ||
| // Fall back to single-client sequential behavior rather than failing the sync. | ||
| log.warn("hive_sync.batching.enabled=true is not supported with use_spark_catalog=true; " | ||
| + "falling back to sequential partition sync."); | ||
| return null; | ||
| } | ||
| int size = config.getIntOrDefault(HIVE_SYNC_BATCHING_THREADS); | ||
| return new IMetaStoreClientPool(config, size); | ||
| } | ||
|
|
||
| private Option<HiveDriverPool> maybeBuildHiveDriverPool(HiveSyncConfig config) { | ||
| if (!config.getBooleanOrDefault(HIVE_SYNC_BATCHING_ENABLED)) { | ||
| return Option.empty(); | ||
| } | ||
| int size = config.getIntOrDefault(HIVE_SYNC_BATCHING_THREADS); | ||
| return Option.of(new HiveDriverPool(config, size)); | ||
| } | ||
|
|
||
| private Table getInitialTable(String table) { | ||
| return initialTableByName.computeIfAbsent(table, t -> { | ||
| try { | ||
|
|
@@ -597,6 +642,17 @@ public void deleteLastReplicatedTimeStamp(String tableName) { | |
| public void close() { | ||
| try { | ||
| ddlExecutor.close(); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🤖 If - AI-generated; verify before applying. React 👍/👎 to flag quality. |
||
| // Close the partition client pool before Hive.closeCurrent() so the | ||
| // RetryingMetaStoreClient instances held by the pool release their Thrift | ||
| // sockets without racing the ThreadLocal Hive cleanup. | ||
| if (partitionClientPool != null) { | ||
| try { | ||
| partitionClientPool.close(); | ||
| } catch (Exception e) { | ||
| log.warn("Error closing IMetaStoreClient pool", e); | ||
| } | ||
| partitionClientPool = null; | ||
| } | ||
| if (client != null) { | ||
| Hive.closeCurrent(); | ||
| client = null; | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -27,6 +27,7 @@ | |
| import org.apache.hudi.hive.HoodieHiveSyncException; | ||
| import org.apache.hudi.hive.util.HivePartitionUtil; | ||
| import org.apache.hudi.hive.util.HiveSchemaUtil; | ||
| import org.apache.hudi.hive.util.IMetaStoreClientPool; | ||
| import org.apache.hudi.storage.StorageSchemes; | ||
| import org.apache.hudi.sync.common.model.PartitionValueExtractor; | ||
|
|
||
|
|
@@ -46,13 +47,13 @@ | |
| import org.apache.hadoop.hive.ql.metadata.Hive; | ||
| import org.apache.hadoop.hive.ql.metadata.HiveException; | ||
| import org.apache.hadoop.security.UserGroupInformation; | ||
| import org.apache.thrift.TException; | ||
|
|
||
| import java.util.ArrayList; | ||
| import java.util.HashMap; | ||
| import java.util.LinkedHashMap; | ||
| import java.util.List; | ||
| import java.util.Map; | ||
| import java.util.concurrent.Future; | ||
| import java.util.stream.Collectors; | ||
|
|
||
| import static org.apache.hudi.hive.HiveSyncConfigHolder.HIVE_BATCH_SYNC_PARTITION_NUM; | ||
|
|
@@ -73,11 +74,21 @@ public class HMSDDLExecutor implements DDLExecutor { | |
| private final String databaseName; | ||
| private final IMetaStoreClient client; | ||
| private final PartitionValueExtractor partitionValueExtractor; | ||
| // Optional. When non-null, partition-row operations fan out across this pool; | ||
| // table-row operations always use the session `client` field above. See | ||
| // IMetaStoreClientPool javadoc for the usage contract. | ||
| private final IMetaStoreClientPool partitionClientPool; | ||
|
|
||
| public HMSDDLExecutor(HiveSyncConfig syncConfig, IMetaStoreClient metaStoreClient) throws HiveException, MetaException { | ||
| this(syncConfig, metaStoreClient, null); | ||
| } | ||
|
|
||
| public HMSDDLExecutor(HiveSyncConfig syncConfig, IMetaStoreClient metaStoreClient, | ||
| IMetaStoreClientPool partitionClientPool) throws HiveException, MetaException { | ||
| this.syncConfig = syncConfig; | ||
| this.databaseName = syncConfig.getStringOrDefault(META_SYNC_DATABASE_NAME); | ||
| this.client = metaStoreClient; | ||
| this.partitionClientPool = partitionClientPool; | ||
| try { | ||
| this.partitionValueExtractor = | ||
| (PartitionValueExtractor) Class.forName(syncConfig.getStringOrDefault(META_SYNC_PARTITION_EXTRACTOR_CLASS)).newInstance(); | ||
|
|
@@ -193,11 +204,14 @@ public void addPartitionsToTable(String tableName, List<String> partitionsToAdd) | |
| } | ||
| log.info("Adding partitions {} to table {}", partitionsToAdd.size(), tableName); | ||
| try { | ||
| // Fetch the table StorageDescriptor once on the session client. Each worker | ||
| // copies fields off it; the Thrift POJO is not mutated after this read. | ||
| StorageDescriptor sd = client.getTable(databaseName, tableName).getSd(); | ||
| int batchSyncPartitionNum = syncConfig.getIntOrDefault(HIVE_BATCH_SYNC_PARTITION_NUM); | ||
| for (List<String> batch : CollectionUtils.batches(partitionsToAdd, batchSyncPartitionNum)) { | ||
| List<Partition> partitionList = new ArrayList<>(); | ||
| batch.forEach(x -> { | ||
| List<List<String>> batches = CollectionUtils.batches(partitionsToAdd, batchSyncPartitionNum); | ||
| runBatches("add", tableName, batches, (poolClient, batch) -> { | ||
| List<Partition> partitionList = new ArrayList<>(batch.size()); | ||
| for (String x : batch) { | ||
| StorageDescriptor partitionSd = new StorageDescriptor(); | ||
| partitionSd.setCols(sd.getCols()); | ||
| partitionSd.setInputFormat(sd.getInputFormat()); | ||
|
|
@@ -208,11 +222,11 @@ public void addPartitionsToTable(String tableName, List<String> partitionsToAdd) | |
| List<String> partitionValues = partitionValueExtractor.extractPartitionValuesInPath(x); | ||
| partitionSd.setLocation(fullPartitionPath); | ||
| partitionList.add(new Partition(partitionValues, databaseName, tableName, 0, 0, partitionSd, null)); | ||
| }); | ||
| client.add_partitions(partitionList, true, false); | ||
| } | ||
| poolClient.add_partitions(partitionList, true, false); | ||
| log.info("HMSDDLExecutor add a batch partitions done: {}", partitionList.size()); | ||
| } | ||
| } catch (TException e) { | ||
| }); | ||
| } catch (Exception e) { | ||
| log.error("{}.{} add partition failed", databaseName, tableName, e); | ||
| throw new HoodieHiveSyncException(databaseName + "." + tableName + " add partition failed", e); | ||
| } | ||
|
|
@@ -247,15 +261,22 @@ public void dropPartitionsToTable(String tableName, List<String> partitionsToDro | |
|
|
||
| log.info("Drop partitions {} on {}", partitionsToDrop.size(), tableName); | ||
| try { | ||
| for (String dropPartition : partitionsToDrop) { | ||
| if (HivePartitionUtil.partitionExists(client, tableName, dropPartition, partitionValueExtractor, syncConfig)) { | ||
| String partitionClause = | ||
| HivePartitionUtil.getPartitionClauseForDrop(dropPartition, partitionValueExtractor, syncConfig); | ||
| client.dropPartition(databaseName, tableName, partitionClause, false); | ||
| // HMS has no batch-drop primitive that matches dropPartition's semantics, so each | ||
| // worker still iterates its chunk one partition at a time. The win is fanning | ||
| // chunks across pooled clients. | ||
| int batchSyncPartitionNum = syncConfig.getIntOrDefault(HIVE_BATCH_SYNC_PARTITION_NUM); | ||
| List<List<String>> batches = CollectionUtils.batches(partitionsToDrop, batchSyncPartitionNum); | ||
| runBatches("drop", tableName, batches, (poolClient, batch) -> { | ||
| for (String dropPartition : batch) { | ||
| if (HivePartitionUtil.partitionExists(poolClient, tableName, dropPartition, partitionValueExtractor, syncConfig)) { | ||
| String partitionClause = | ||
| HivePartitionUtil.getPartitionClauseForDrop(dropPartition, partitionValueExtractor, syncConfig); | ||
| poolClient.dropPartition(databaseName, tableName, partitionClause, false); | ||
| } | ||
| log.info("Drop partition {} on {}", dropPartition, tableName); | ||
| } | ||
| log.info("Drop partition {} on {}", dropPartition, tableName); | ||
| } | ||
| } catch (TException e) { | ||
| }); | ||
| } catch (Exception e) { | ||
| log.error("{}.{} drop partition failed", databaseName, tableName, e); | ||
| throw new HoodieHiveSyncException(databaseName + "." + tableName + " drop partition failed", e); | ||
| } | ||
|
|
@@ -291,21 +312,71 @@ public void close() { | |
|
|
||
| private void registerAlterPartitionEvent(String tableName, List<String> alteredPartitions) { | ||
| try { | ||
| // Read the StorageDescriptor once on the session client; each worker deep-copies | ||
| // it per partition (alter_partitions semantics today). | ||
| StorageDescriptor sd = client.getTable(databaseName, tableName).getSd(); | ||
| List<Partition> partitionList = alteredPartitions.stream().map(partition -> { | ||
| Path partitionPath = HadoopFSUtils.constructAbsolutePathInHadoopPath(syncConfig.getString(META_SYNC_BASE_PATH), partition); | ||
| String partitionScheme = partitionPath.toUri().getScheme(); | ||
| String fullPartitionPath = StorageSchemes.HDFS.getScheme().equals(partitionScheme) | ||
| ? HadoopFSUtils.getDFSFullPartitionPath(syncConfig.getHadoopFileSystem(), partitionPath) : partitionPath.toString(); | ||
| List<String> partitionValues = partitionValueExtractor.extractPartitionValuesInPath(partition); | ||
| StorageDescriptor partitionSd = sd.deepCopy(); | ||
| partitionSd.setLocation(fullPartitionPath); | ||
| return new Partition(partitionValues, databaseName, tableName, 0, 0, partitionSd, null); | ||
| }).collect(Collectors.toList()); | ||
| client.alter_partitions(databaseName, tableName, partitionList, null); | ||
| } catch (TException e) { | ||
| int batchSyncPartitionNum = syncConfig.getIntOrDefault(HIVE_BATCH_SYNC_PARTITION_NUM); | ||
| List<List<String>> batches = CollectionUtils.batches(alteredPartitions, batchSyncPartitionNum); | ||
| runBatches("alter", tableName, batches, (poolClient, batch) -> { | ||
| List<Partition> partitionList = batch.stream().map(partition -> { | ||
| Path partitionPath = HadoopFSUtils.constructAbsolutePathInHadoopPath(syncConfig.getString(META_SYNC_BASE_PATH), partition); | ||
| String partitionScheme = partitionPath.toUri().getScheme(); | ||
| String fullPartitionPath = StorageSchemes.HDFS.getScheme().equals(partitionScheme) | ||
| ? HadoopFSUtils.getDFSFullPartitionPath(syncConfig.getHadoopFileSystem(), partitionPath) : partitionPath.toString(); | ||
| List<String> partitionValues = partitionValueExtractor.extractPartitionValuesInPath(partition); | ||
| StorageDescriptor partitionSd = sd.deepCopy(); | ||
| partitionSd.setLocation(fullPartitionPath); | ||
| return new Partition(partitionValues, databaseName, tableName, 0, 0, partitionSd, null); | ||
| }).collect(Collectors.toList()); | ||
| poolClient.alter_partitions(databaseName, tableName, partitionList, null); | ||
| }); | ||
| } catch (Exception e) { | ||
| log.error("{}.{} update partition failed", databaseName, tableName, e); | ||
| throw new HoodieHiveSyncException(databaseName + "." + tableName + " update partition failed", e); | ||
| } | ||
| } | ||
|
|
||
| /** | ||
| * Dispatches partition batches either in parallel against the client pool (if | ||
| * configured) or sequentially against the session client. The sequential path | ||
| * preserves the exact behavior we had before the pool existed, including failure | ||
| * semantics where batch N+1 never runs if batch N throws. | ||
| */ | ||
| @FunctionalInterface | ||
| private interface BatchAction { | ||
| void apply(IMetaStoreClient client, List<String> batch) throws Exception; | ||
| } | ||
|
|
||
| private void runBatches(String opName, String tableName, List<List<String>> batches, BatchAction action) throws Exception { | ||
| if (partitionClientPool == null) { | ||
| for (List<String> batch : batches) { | ||
| action.apply(client, batch); | ||
| } | ||
| return; | ||
| } | ||
| List<Future<Void>> futures = new ArrayList<>(batches.size()); | ||
| for (List<String> batch : batches) { | ||
| futures.add(partitionClientPool.executor().submit(() -> | ||
| partitionClientPool.run(poolClient -> { | ||
| action.apply(poolClient, batch); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🤖 When a Future fails, - AI-generated; verify before applying. React 👍/👎 to flag quality. |
||
| return null; | ||
| }) | ||
| )); | ||
| } | ||
| Exception firstError = null; | ||
| for (Future<Void> f : futures) { | ||
| try { | ||
| f.get(); | ||
| } catch (Exception e) { | ||
| if (firstError == null) { | ||
| firstError = e; | ||
| } else { | ||
| log.warn("Additional {} batch failed on {} (suppressed in favor of first error)", opName, tableName, e); | ||
| } | ||
| } | ||
| } | ||
| if (firstError != null) { | ||
| throw firstError; | ||
| } | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
🤖 nit: the comment says this field is 'kept for reference only — close() is delegated through ddlExecutor.close()', which means it's never read after being passed into
HiveQueryDDLExecutor. Could you make it a local variable in the constructor instead? A field that isn't read after construction will make future readers wonder what lifecycle role it plays.- AI-generated; verify before applying. React 👍/👎 to flag quality.