From 972af5b78d289d2699146073c0d1dc76200c801f Mon Sep 17 00:00:00 2001 From: Vova Kolmakov Date: Tue, 16 Jun 2026 16:29:56 +0700 Subject: [PATCH 1/3] perf(io): Avoid per-entry KeyValue allocation in HFileDataBlock.seekTo HFileDataBlock.seekTo materialized a KeyValue (and its Key) for every entry it scanned, only to compare the entry key and compute the stride to the next entry. On the metadata-table read path (record-level index, bloom filter and column-stats point lookups) this is the hottest inner loop, allocating two short-lived objects per scanned entry. This compares the entry key directly against the backing block buffer and computes the stride from the on-disk length fields, materializing a KeyValue only on an exact match. The "in range" and end-of-block cases point the cursor at the previous offset and defer the read, which getKeyValue() already performs lazily. The lookup key is a UTF8StringKey, so its polymorphic content accessors are used for the comparison. No on-disk format or public API change. 
JMH microbenchmark over an uncompressed HFile fixture (5000 entries, 625 sorted point lookups), forks(0), gc.alloc.rate.norm and throughput: point lookups: 677,729 -> 363,721 B/op (-46%), 5.25 -> 6.16 ops/ms (+17%) full scan (seekTo is not on that path): 643,705 -> 643,681 B/op (unchanged) --- .../apache/hudi/io/hfile/HFileDataBlock.java | 37 ++++++++++++++----- 1 file changed, 28 insertions(+), 9 deletions(-) diff --git a/hudi-io/src/main/java/org/apache/hudi/io/hfile/HFileDataBlock.java b/hudi-io/src/main/java/org/apache/hudi/io/hfile/HFileDataBlock.java index cbf3e719f6a02..6d3cb3e397691 100644 --- a/hudi-io/src/main/java/org/apache/hudi/io/hfile/HFileDataBlock.java +++ b/hudi-io/src/main/java/org/apache/hudi/io/hfile/HFileDataBlock.java @@ -20,6 +20,7 @@ package org.apache.hudi.io.hfile; import org.apache.hudi.common.util.Option; +import org.apache.hudi.io.util.IOUtils; import java.io.ByteArrayOutputStream; import java.io.DataOutputStream; @@ -30,6 +31,7 @@ import static org.apache.hudi.io.hfile.DataSize.SIZEOF_BYTE; import static org.apache.hudi.io.hfile.DataSize.SIZEOF_INT16; +import static org.apache.hudi.io.hfile.DataSize.SIZEOF_INT32; import static org.apache.hudi.io.hfile.DataSize.SIZEOF_INT64; import static org.apache.hudi.io.hfile.HFileReader.SEEK_TO_BEFORE_BLOCK_FIRST_KEY; import static org.apache.hudi.io.hfile.HFileReader.SEEK_TO_FOUND; @@ -104,15 +106,30 @@ static HFileDataBlock createDataBlockToWrite(HFileContext context, int seekTo(HFileCursor cursor, Key key, int blockStartOffsetInFile) { int relativeOffset = cursor.getOffset() - blockStartOffsetInFile; int lastRelativeOffset = relativeOffset; + // The key-value cached at the starting position, if any. 
It is only consulted to re-cache it + // in the cursor when the lookup lands "in range" on the very first comparison; entries scanned + // past are compared directly against the backing buffer below (no per-entry KeyValue/Key + // allocation), so this is emptied after the first iteration and the cursor falls back to a + // deferred read. Option lastKeyValue = cursor.getKeyValue(); + // The lookup key content is fixed across the scan; hoist it out of the loop. Note the lookup + // key may be a UTF8StringKey, so use the polymorphic content accessors (no 2-byte prefix). + byte[] lookupBytes = key.getBytes(); + int lookupContentOffset = key.getContentOffset(); + int lookupContentLength = key.getContentLength(); while (relativeOffset < uncompressedContentEndRelativeOffset) { - // Full length is not known yet until parsing - KeyValue kv = readKeyValue(relativeOffset); - int comp = kv.getKey().compareTo(key); + // Compare the entry key against the lookup key directly on the buffer, without materializing + // a KeyValue/Key for every scanned entry. Layout at `relativeOffset`: [int keyLength] + // [int valueLength][short keyContentLength][key content]...; see KeyValue and Key. + int keyContentLength = IOUtils.readShort(byteBuff, relativeOffset + KEY_OFFSET); + int keyContentOffset = relativeOffset + KEY_OFFSET + KEY_LENGTH_LENGTH; + int comp = IOUtils.compareTo( + byteBuff, keyContentOffset, keyContentLength, + lookupBytes, lookupContentOffset, lookupContentLength); if (comp == 0) { // The lookup key equals the key `relativeOffset` points to; the key is found. - // Set the cursor to the current offset that points to the exact match - cursor.set(relativeOffset + blockStartOffsetInFile, kv); + // Materialize the KeyValue once and set the cursor to the exact match. 
+ cursor.set(relativeOffset + blockStartOffsetInFile, readKeyValue(relativeOffset)); return SEEK_TO_FOUND; } else if (comp > 0) { // There is no matched key (otherwise, the method should already stop there and return 0) @@ -120,10 +137,10 @@ int seekTo(HFileCursor cursor, Key key, int blockStartOffsetInFile) { // So set the cursor to the previous offset, pointing the greatest key in the file that is // less than the lookup key. if (lastKeyValue.isPresent()) { - // If the key-value pair is already, cache it + // The previous key-value was already cached (first iteration); reuse it. cursor.set(lastRelativeOffset + blockStartOffsetInFile, lastKeyValue.get()); } else { - // Otherwise, defer the read till it's needed + // Otherwise, defer the read till it's needed; getKeyValue() materializes it lazily. cursor.setOffset(lastRelativeOffset + blockStartOffsetInFile); } // If the lookup key is lexicographically smaller than the first key pointed to by @@ -132,11 +149,13 @@ int seekTo(HFileCursor cursor, Key key, int blockStartOffsetInFile) { return isAtFirstKey(relativeOffset) ? SEEK_TO_BEFORE_BLOCK_FIRST_KEY : SEEK_TO_IN_RANGE; } long increment = - (long) KEY_OFFSET + (long) kv.getKeyLength() + (long) kv.getValueLength() + (long) KEY_OFFSET + (long) IOUtils.readInt(byteBuff, relativeOffset) + + (long) IOUtils.readInt(byteBuff, relativeOffset + SIZEOF_INT32) + ZERO_TS_VERSION_BYTE_LENGTH; lastRelativeOffset = relativeOffset; relativeOffset += increment; - lastKeyValue = Option.of(kv); + // Past entries are not materialized; clear the cache so the "in range" branch above defers. + lastKeyValue = Option.empty(); } // We reach the end of the block. Set the cursor to the offset of last key. // In this case, the lookup key is greater than the last key. 
From 35c14418f1132d9e51e0339715fe90f5f46c9d77 Mon Sep 17 00:00:00 2001 From: Vova Kolmakov Date: Wed, 17 Jun 2026 08:42:48 +0700 Subject: [PATCH 2/3] addressed review comments: extract entry key/value lengths into named locals in HFileDataBlock.seekTo --- .../main/java/org/apache/hudi/io/hfile/HFileDataBlock.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/hudi-io/src/main/java/org/apache/hudi/io/hfile/HFileDataBlock.java b/hudi-io/src/main/java/org/apache/hudi/io/hfile/HFileDataBlock.java index 6d3cb3e397691..1e26fe9f0a417 100644 --- a/hudi-io/src/main/java/org/apache/hudi/io/hfile/HFileDataBlock.java +++ b/hudi-io/src/main/java/org/apache/hudi/io/hfile/HFileDataBlock.java @@ -148,9 +148,10 @@ int seekTo(HFileCursor cursor, Key key, int blockStartOffsetInFile) { // know that the cursor is ahead of the lookup key in this case. return isAtFirstKey(relativeOffset) ? SEEK_TO_BEFORE_BLOCK_FIRST_KEY : SEEK_TO_IN_RANGE; } + int entryKeyLength = IOUtils.readInt(byteBuff, relativeOffset); + int entryValueLength = IOUtils.readInt(byteBuff, relativeOffset + SIZEOF_INT32); long increment = - (long) KEY_OFFSET + (long) IOUtils.readInt(byteBuff, relativeOffset) - + (long) IOUtils.readInt(byteBuff, relativeOffset + SIZEOF_INT32) + (long) KEY_OFFSET + (long) entryKeyLength + (long) entryValueLength + ZERO_TS_VERSION_BYTE_LENGTH; lastRelativeOffset = relativeOffset; relativeOffset += increment; From 82abf7d4bafa68750fe46ddd0e09e982224260a0 Mon Sep 17 00:00:00 2001 From: Vova Kolmakov Date: Wed, 17 Jun 2026 10:59:12 +0700 Subject: [PATCH 3/3] addressed review comments: add unit test for HFileDataBlock.seekTo buffer-direct scan --- .../apache/hudi/io/hfile/TestHFileReader.java | 57 +++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/hudi-io/src/test/java/org/apache/hudi/io/hfile/TestHFileReader.java b/hudi-io/src/test/java/org/apache/hudi/io/hfile/TestHFileReader.java index 96c24a147ee69..1341fb4e47de6 100644 --- 
a/hudi-io/src/test/java/org/apache/hudi/io/hfile/TestHFileReader.java +++ b/hudi-io/src/test/java/org/apache/hudi/io/hfile/TestHFileReader.java @@ -31,6 +31,8 @@ import org.junit.jupiter.params.provider.MethodSource; import org.junit.jupiter.params.provider.ValueSource; +import java.io.ByteArrayOutputStream; +import java.io.DataOutputStream; import java.io.IOException; import java.nio.ByteBuffer; import java.nio.charset.StandardCharsets; @@ -843,6 +845,61 @@ public void testReadHFileCompatibility(String hfilePrefix) throws IOException { verifyHFileReadCompatibility(bootstrapIndexFile, 4, Option.empty()); } + /** + * Validates {@link HFileDataBlock#seekTo} when a data block holds many entries, which is the + * case the optimization targets: the scan compares each entry key directly against the lookup + * key on the backing buffer instead of materializing a {@link KeyValue}/{@link Key} per entry. + * + *

<p>The HFile is written with a small block size so that several entries land in each data + * block (and the file spans multiple blocks). Keys/values are zero-padded to a constant width, + * so the packing is deterministic (8 entries per block at this block size); the lookups below are + * chosen to land in the middle of a block so the scan must iterate past earlier entries before + * the comparison resolves. This exercises both the buffer-direct comparison / {@code readInt} + * based increment and the deferred-cursor path (the previous key-value is no longer cached for + * scanned-past entries, so an in-range result resolves the key lazily via {@code getKeyValue}). + */ + @Test + public void testSeekToScanWithinMultiEntryBlocks() throws IOException { + int numEntries = 64; + // 59 bytes per entry (18-byte key + 20-byte value + 21 extra bytes), so a 512-byte block + // holds 8 entries and the 64 entries span 8 data blocks. + HFileContext context = HFileContext.builder().blockSize(512).build(); + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + try (DataOutputStream outputStream = new DataOutputStream(baos); + HFileWriter writer = new HFileWriterImpl(context, outputStream)) { + for (int i = 0; i < numEntries; i++) { + writer.append(KEY_CREATOR.apply(i), VALUE_CREATOR.apply(i).getBytes(StandardCharsets.UTF_8)); + } + } + byte[] content = baos.toByteArray(); + + List keyLookUpInfoList = Arrays.asList( + // Lookup smaller than the first key: cursor sits before the file's first key. + new KeyLookUpInfo("", SEEK_TO_BEFORE_FILE_FIRST_KEY, KEY_CREATOR.apply(0), VALUE_CREATOR.apply(0)), + // Exact match on the first entry of the first block (match on the first comparison). + new KeyLookUpInfo(KEY_CREATOR.apply(0), SEEK_TO_FOUND, KEY_CREATOR.apply(0), VALUE_CREATOR.apply(0)), + // Exact match on the last entry of a block: the scan walks past the earlier 7 entries. 
+ new KeyLookUpInfo(KEY_CREATOR.apply(7), SEEK_TO_FOUND, KEY_CREATOR.apply(7), VALUE_CREATOR.apply(7)), + // Exact match mid-block. + new KeyLookUpInfo(KEY_CREATOR.apply(25), SEEK_TO_FOUND, KEY_CREATOR.apply(25), VALUE_CREATOR.apply(25)), + // Lookup strictly between two adjacent mid-block keys: in range, cursor resolves to the + // lower key via the deferred read (the scanned-past key-value is not cached). + new KeyLookUpInfo(KEY_CREATOR.apply(30) + "a", SEEK_TO_IN_RANGE, KEY_CREATOR.apply(30), VALUE_CREATOR.apply(30)), + // Exact match in a later block. + new KeyLookUpInfo(KEY_CREATOR.apply(50), SEEK_TO_FOUND, KEY_CREATOR.apply(50), VALUE_CREATOR.apply(50)), + // Exact match on the very last entry of the file. + new KeyLookUpInfo(KEY_CREATOR.apply(numEntries - 1), SEEK_TO_FOUND, + KEY_CREATOR.apply(numEntries - 1), VALUE_CREATOR.apply(numEntries - 1)), + // Lookup greater than the last key: end of file. + new KeyLookUpInfo(KEY_CREATOR.apply(numEntries - 1) + "a", SEEK_TO_EOF, "", "")); + + try (HFileReader reader = new HFileReaderImpl( + new ByteArraySeekableDataInputStream(new ByteBufferBackedInputStream(content)), content.length)) { + reader.initializeMetadata(); + verifyHFileSeekToReads(reader, keyLookUpInfoList); + } + } + public static byte[] readHFileFromResources(String filename) throws IOException { long size = TestHFileReader.class .getResource(filename).openConnection().getContentLength();