-
Notifications
You must be signed in to change notification settings - Fork 2.5k
feat:(DNM) add a lsm-tree based FG reader #18987
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -34,7 +34,6 @@ | |
| import org.apache.hudi.exception.HoodieException; | ||
| import org.apache.hudi.exception.HoodieIOException; | ||
| import org.apache.hudi.exception.HoodieValidationException; | ||
| import org.apache.hudi.exception.InvalidHoodieFileNameException; | ||
| import org.apache.hudi.exception.InvalidHoodiePathException; | ||
| import org.apache.hudi.metadata.HoodieTableMetadata; | ||
| import org.apache.hudi.storage.HoodieStorage; | ||
|
|
@@ -79,6 +78,8 @@ public class FSUtils { | |
| public static final String PATH_SEPARATOR = "/"; | ||
| public static final Pattern LOG_FILE_PATTERN = | ||
| Pattern.compile("^\\.([^._]+)_([^.]*)\\.(log|archive)\\.(\\d+)(_((\\d+)-(\\d+)-(\\d+))(\\.cdc)?)?$"); | ||
| public static final Pattern NATIVE_LOG_FILE_PATTERN = | ||
| Pattern.compile("^([^_]+)_((\\d+)-(\\d+)-(\\d+))_([^_]+)_(\\d+)(\\.delete)?\\.(parquet)$"); | ||
| public static final Pattern PREFIX_BY_FILE_ID_PATTERN = Pattern.compile("^(.+)-(\\d+)"); | ||
| private static final Pattern BASE_FILE_PATTERN = Pattern.compile("[a-zA-Z0-9-]+_[a-zA-Z0-9-]+_[0-9]+\\.[a-zA-Z0-9]+"); | ||
|
|
||
|
|
@@ -131,6 +132,10 @@ public static String maskWithoutFileId(String instantTime, int taskPartitionId) | |
|
|
||
| public static String getCommitTime(String fullFileName) { | ||
| try { | ||
| Option<Matcher> nativeLogMatcher = matchNativeLogFile(fullFileName); | ||
| if (nativeLogMatcher.isPresent()) { | ||
| return nativeLogMatcher.get().group(6); | ||
| } | ||
| if (isLogFile(fullFileName)) { | ||
| return fullFileName.split("_")[1].split("\\.", 2)[0]; | ||
| } | ||
|
|
@@ -328,6 +333,10 @@ public static StoragePath getAbsoluteFilePath(StoragePath basePath, String parti | |
| * Get the file extension from the log file. | ||
| */ | ||
| public static String getFileExtensionFromLog(StoragePath logPath) { | ||
| Option<Matcher> nativeLogMatcher = matchNativeLogFile(logPath.getName()); | ||
| if (nativeLogMatcher.isPresent()) { | ||
| return nativeLogMatcher.get().group(9); | ||
| } | ||
| Matcher matcher = LOG_FILE_PATTERN.matcher(logPath.getName()); | ||
| if (!matcher.matches()) { | ||
| throw new InvalidHoodiePathException(logPath.toString(), "LogFile"); | ||
|
|
@@ -336,22 +345,19 @@ public static String getFileExtensionFromLog(StoragePath logPath) { | |
| } | ||
|
|
||
| public static String getFileIdFromFileName(String fileName) { | ||
| if (FSUtils.isLogFile(fileName)) { | ||
| Matcher matcher = LOG_FILE_PATTERN.matcher(fileName); | ||
| if (!matcher.matches()) { | ||
| throw new InvalidHoodieFileNameException(fileName, "LogFile"); | ||
| } | ||
| return matcher.group(1); | ||
| Option<Matcher> logFileMatcher = matchLogFile(fileName); | ||
| if (logFileMatcher.isPresent()) { | ||
| return logFileMatcher.get().group(1); | ||
| } | ||
| return FSUtils.getFileId(fileName); | ||
| } | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 🤖 Switching to - AI-generated; verify before applying. React 👍/👎 to flag quality. |
||
|
|
||
| public static String getFileIdFromLogPath(StoragePath path) { | ||
| Matcher matcher = LOG_FILE_PATTERN.matcher(path.getName()); | ||
| if (!matcher.matches()) { | ||
| Option<Matcher> logFileMatcher = matchLogFile(path.getName()); | ||
| if (!logFileMatcher.isPresent()) { | ||
| throw new InvalidHoodiePathException(path, "LogFile"); | ||
| } | ||
| return matcher.group(1); | ||
| return logFileMatcher.get().group(1); | ||
| } | ||
|
|
||
| public static String getFileIdFromFilePath(StoragePath filePath) { | ||
|
|
@@ -365,6 +371,10 @@ public static String getFileIdFromFilePath(StoragePath filePath) { | |
| * Get the second part of the file name in the log file. That will be the delta commit time. | ||
| */ | ||
| public static String getDeltaCommitTimeFromLogPath(StoragePath path) { | ||
| Option<Matcher> nativeLogMatcher = matchNativeLogFile(path.getName()); | ||
| if (nativeLogMatcher.isPresent()) { | ||
| return nativeLogMatcher.get().group(6); | ||
| } | ||
| Matcher matcher = LOG_FILE_PATTERN.matcher(path.getName()); | ||
| if (!matcher.matches()) { | ||
| throw new InvalidHoodiePathException(path.toString(), "LogFile"); | ||
|
|
@@ -376,6 +386,10 @@ public static String getDeltaCommitTimeFromLogPath(StoragePath path) { | |
| * Get TaskPartitionId used in log-path. | ||
| */ | ||
| public static Integer getTaskPartitionIdFromLogPath(StoragePath path) { | ||
| Option<Matcher> nativeLogMatcher = matchNativeLogFile(path.getName()); | ||
| if (nativeLogMatcher.isPresent()) { | ||
| return Integer.parseInt(nativeLogMatcher.get().group(3)); | ||
| } | ||
| Matcher matcher = LOG_FILE_PATTERN.matcher(path.getName()); | ||
| if (!matcher.matches()) { | ||
| throw new InvalidHoodiePathException(path.toString(), "LogFile"); | ||
|
|
@@ -388,6 +402,10 @@ public static Integer getTaskPartitionIdFromLogPath(StoragePath path) { | |
| * Get Write-Token used in log-path. | ||
| */ | ||
| public static String getWriteTokenFromLogPath(StoragePath path) { | ||
| Option<Matcher> nativeLogMatcher = matchNativeLogFile(path.getName()); | ||
| if (nativeLogMatcher.isPresent()) { | ||
| return nativeLogMatcher.get().group(2); | ||
| } | ||
| Matcher matcher = LOG_FILE_PATTERN.matcher(path.getName()); | ||
| if (!matcher.matches()) { | ||
| throw new InvalidHoodiePathException(path.toString(), "LogFile"); | ||
|
|
@@ -399,6 +417,10 @@ public static String getWriteTokenFromLogPath(StoragePath path) { | |
| * Get StageId used in log-path. | ||
| */ | ||
| public static Integer getStageIdFromLogPath(StoragePath path) { | ||
| Option<Matcher> nativeLogMatcher = matchNativeLogFile(path.getName()); | ||
| if (nativeLogMatcher.isPresent()) { | ||
| return Integer.parseInt(nativeLogMatcher.get().group(4)); | ||
| } | ||
| Matcher matcher = LOG_FILE_PATTERN.matcher(path.getName()); | ||
| if (!matcher.matches()) { | ||
| throw new InvalidHoodiePathException(path.toString(), "LogFile"); | ||
|
|
@@ -411,6 +433,10 @@ public static Integer getStageIdFromLogPath(StoragePath path) { | |
| * Get Task Attempt Id used in log-path. | ||
| */ | ||
| public static Integer getTaskAttemptIdFromLogPath(StoragePath path) { | ||
| Option<Matcher> nativeLogMatcher = matchNativeLogFile(path.getName()); | ||
| if (nativeLogMatcher.isPresent()) { | ||
| return Integer.parseInt(nativeLogMatcher.get().group(5)); | ||
| } | ||
| Matcher matcher = LOG_FILE_PATTERN.matcher(path.getName()); | ||
| if (!matcher.matches()) { | ||
| throw new InvalidHoodiePathException(path.toString(), "LogFile"); | ||
|
|
@@ -427,6 +453,10 @@ public static int getFileVersionFromLog(StoragePath logPath) { | |
| } | ||
|
|
||
| public static int getFileVersionFromLog(String logFileName) { | ||
| Option<Matcher> nativeLogMatcher = matchNativeLogFile(logFileName); | ||
| if (nativeLogMatcher.isPresent()) { | ||
| return Integer.parseInt(nativeLogMatcher.get().group(7)); | ||
| } | ||
| Matcher matcher = LOG_FILE_PATTERN.matcher(logFileName); | ||
| if (!matcher.matches()) { | ||
| throw new HoodieIOException("Invalid log file name: " + logFileName); | ||
|
|
@@ -443,6 +473,9 @@ public static String makeLogFileName(String fileId, String logFileExtension, Str | |
| } | ||
|
|
||
| public static boolean isBaseFile(StoragePath path) { | ||
| if (matchNativeLogFile(path.getName()).isPresent()) { | ||
| return false; | ||
| } | ||
| String extension = getFileExtension(path.getName()); | ||
| if (HoodieFileFormat.BASE_FILE_EXTENSIONS.contains(extension)) { | ||
| return BASE_FILE_PATTERN.matcher(path.getName()).matches(); | ||
|
|
@@ -466,13 +499,42 @@ public static boolean isLogFile(StoragePath logPath) { | |
| } | ||
|
|
||
| /** | ||
| * Tells whether {@code fileName} is a log file name. | ||
| * | ||
| * <p>A name qualifies if {@link #matchNativeLogFile(String)} recognizes it, or if it starts with | ||
| * {@code LOG_FILE_START_WITH_CHARACTER} and fully matches {@code LOG_FILE_PATTERN} with its third | ||
| * capture group (the {@code log|archive} alternative) equal to {@code LOG_FILE_EXTENSION}. | ||
| * | ||
| * @param fileName file name to test | ||
| * @return {@code true} if either check above succeeds, {@code false} otherwise | ||
| */ | ||
| public static boolean isLogFile(String fileName) { | ||
| // Native log file names are accepted first and skip the prefix check below. | ||
| if (matchNativeLogFile(fileName).isPresent()) { | ||
| return true; | ||
| } | ||
| if (fileName.startsWith(LOG_FILE_START_WITH_CHARACTER)) { | ||
| Matcher matcher = LOG_FILE_PATTERN.matcher(fileName); | ||
| // Group 3 is the (log|archive) alternative; only names whose group 3 equals LOG_FILE_EXTENSION count. | ||
| return matcher.matches() && matcher.group(3).equals(LOG_FILE_EXTENSION); | ||
| } | ||
| return false; | ||
| } | ||
|
|
||
| /** | ||
| * Matches a file name against {@code NATIVE_LOG_FILE_PATTERN}. | ||
| * | ||
| * <p>If {@code fileName} contains {@code StoragePath.SEPARATOR}, only the part after the last | ||
| * separator is matched. The returned matcher has already been matched against the whole name, so | ||
| * its groups can be read directly. As used by the callers in this file: group 1 is the file id, | ||
| * group 2 the write token (groups 3-5 are its task partition id, stage id and task attempt id), | ||
| * group 6 the delta commit time, group 7 the file version, group 8 the optional {@code .delete} | ||
| * marker, and group 9 the file extension. | ||
| * | ||
| * @param fileName file name or path to match | ||
| * @return the matched {@link Matcher}, or {@code Option.empty()} when | ||
| *         {@code StringUtils.isNullOrEmpty(fileName)} is true or the name does not match | ||
| */ | ||
| public static Option<Matcher> matchNativeLogFile(String fileName) { | ||
| if (StringUtils.isNullOrEmpty(fileName)) { | ||
| return Option.empty(); | ||
| } | ||
| // Drop any leading directories so the pattern only sees the last path segment. | ||
| String actualFileName = fileName.contains(StoragePath.SEPARATOR) | ||
| ? fileName.substring(fileName.lastIndexOf(StoragePath.SEPARATOR) + 1) | ||
| : fileName; | ||
| Matcher matcher = NATIVE_LOG_FILE_PATTERN.matcher(actualFileName); | ||
| return matcher.matches() ? Option.of(matcher) : Option.empty(); | ||
| } | ||
|
|
||
| /** | ||
| * Tells whether {@code fileName} is a native log file name carrying the {@code .delete} marker. | ||
| * | ||
| * @param fileName file name or path to test | ||
| * @return {@code true} if {@link #matchNativeLogFile(String)} matches and the optional | ||
| *         {@code .delete} group (group 8) is present; {@code false} otherwise | ||
| */ | ||
| public static boolean isNativeDeleteLogFile(String fileName) { | ||
| return matchNativeLogFile(fileName).map(matcher -> matcher.group(8) != null).orElse(false); | ||
| } | ||
|
|
||
| /** | ||
| * Matches a file name as a log file, trying {@code NATIVE_LOG_FILE_PATTERN} first and then | ||
| * {@code LOG_FILE_PATTERN}. | ||
| * | ||
| * <p>The two patterns number their groups differently; the callers visible in this file only | ||
| * read group 1 from the result. Unlike {@link #matchNativeLogFile(String)}, the | ||
| * {@code LOG_FILE_PATTERN} attempt is applied to {@code fileName} exactly as given, without | ||
| * stripping any path prefix. | ||
| * | ||
| * @param fileName file name to match | ||
| * @return the matched {@link Matcher} from whichever pattern succeeded, or {@code Option.empty()} | ||
| */ | ||
| private static Option<Matcher> matchLogFile(String fileName) { | ||
| Option<Matcher> nativeLogMatcher = matchNativeLogFile(fileName); | ||
| if (nativeLogMatcher.isPresent()) { | ||
| return nativeLogMatcher; | ||
| } | ||
| Matcher matcher = LOG_FILE_PATTERN.matcher(fileName); | ||
| // Group 3 is the (log|archive) alternative; only names whose group 3 equals LOG_FILE_EXTENSION are accepted. | ||
| return matcher.matches() && matcher.group(3).equals(LOG_FILE_EXTENSION) | ||
| ? Option.of(matcher) | ||
| : Option.empty(); | ||
| } | ||
|
|
||
| /** | ||
| * Tells whether {@code path} is a data file, i.e. either a base file or a log file. | ||
| * | ||
| * @param path path to test | ||
| * @return {@code true} if {@code isBaseFile(path)} or {@code isLogFile(path)} holds | ||
| */ | ||
| public static boolean isDataFile(StoragePath path) { | ||
| return isBaseFile(path) || isLogFile(path); | ||
| } | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
🤖 The pattern uses `[^_]+` for the file id, but Hudi file ids elsewhere in `BASE_FILE_PATTERN` allow `[a-zA-Z0-9-]+` and `LOG_FILE_PATTERN` allows `[^._]+`. Are we sure native log file ids will never contain `.` or other special characters? Tightening this to `[a-zA-Z0-9-]+` (or `[^._]+` for consistency with the existing log pattern) would avoid accidental matches against unrelated `.parquet` paths. - AI-generated; verify before applying. React 👍/👎 to flag quality.