From 972af5b78d289d2699146073c0d1dc76200c801f Mon Sep 17 00:00:00 2001
From: Vova Kolmakov <wombatukun@apache.org>
Date: Tue, 16 Jun 2026 16:29:56 +0700
Subject: [PATCH 1/3] perf(io): Avoid per-entry KeyValue allocation in
 HFileDataBlock.seekTo

HFileDataBlock.seekTo materialized a KeyValue (and its Key) for every entry it scanned, only to compare the entry key and compute the stride to the next entry. On the metadata-table read path (record-level index, bloom filter and column-stats point lookups) this is the hottest inner loop, allocating two short-lived objects per scanned entry.

This change compares the entry key directly against the backing block buffer and computes the stride from the on-disk length fields, materializing a KeyValue only on an exact match. The "in range" and end-of-block cases point the cursor at the previous offset and defer the read, which getKeyValue() already performs lazily; the one exception is an "in range" result on the very first comparison, where the KeyValue already cached in the cursor is reused. The lookup key may be a UTF8StringKey (no 2-byte prefix), so its polymorphic content accessors are used for the comparison. No on-disk format or public API change.

JMH microbenchmark over an uncompressed HFile fixture (5000 entries, 625 sorted point lookups), run with forks(0), reporting gc.alloc.rate.norm (B/op) and throughput (ops/ms):

point lookups: 677,729 -> 363,721 B/op (-46%), 5.25 -> 6.16 ops/ms (+17%)
full scan (seekTo is not on that path): 643,705 -> 643,681 B/op (unchanged)
---
 .../apache/hudi/io/hfile/HFileDataBlock.java  | 37 ++++++++++++++-----
 1 file changed, 28 insertions(+), 9 deletions(-)
diff --git a/hudi-io/src/main/java/org/apache/hudi/io/hfile/HFileDataBlock.java b/hudi-io/src/main/java/org/apache/hudi/io/hfile/HFileDataBlock.java
index cbf3e719f6a02..6d3cb3e397691 100644
--- a/hudi-io/src/main/java/org/apache/hudi/io/hfile/HFileDataBlock.java
+++ b/hudi-io/src/main/java/org/apache/hudi/io/hfile/HFileDataBlock.java
@@ -20,6 +20,7 @@
 package org.apache.hudi.io.hfile;
 
 import org.apache.hudi.common.util.Option;
+import org.apache.hudi.io.util.IOUtils;
 
 import java.io.ByteArrayOutputStream;
 import java.io.DataOutputStream;
@@ -30,6 +31,7 @@
 
 import static org.apache.hudi.io.hfile.DataSize.SIZEOF_BYTE;
 import static org.apache.hudi.io.hfile.DataSize.SIZEOF_INT16;
+import static org.apache.hudi.io.hfile.DataSize.SIZEOF_INT32;
 import static org.apache.hudi.io.hfile.DataSize.SIZEOF_INT64;
 import static org.apache.hudi.io.hfile.HFileReader.SEEK_TO_BEFORE_BLOCK_FIRST_KEY;
 import static org.apache.hudi.io.hfile.HFileReader.SEEK_TO_FOUND;
@@ -104,15 +106,30 @@ static HFileDataBlock createDataBlockToWrite(HFileContext context,
   int seekTo(HFileCursor cursor, Key key, int blockStartOffsetInFile) {
     int relativeOffset = cursor.getOffset() - blockStartOffsetInFile;
     int lastRelativeOffset = relativeOffset;
+    // The key-value cached at the starting position, if any. It is only consulted to re-cache it
+    // in the cursor when the lookup lands "in range" on the very first comparison; entries scanned
+    // past are compared directly against the backing buffer below (no per-entry KeyValue/Key
+    // allocation), so this is emptied after the first iteration and the cursor falls back to a
+    // deferred read.
     Option<KeyValue> lastKeyValue = cursor.getKeyValue();
+    // The lookup key content is fixed across the scan; hoist it out of the loop. Note the lookup
+    // key may be a UTF8StringKey, so use the polymorphic content accessors (no 2-byte prefix).
+    byte[] lookupBytes = key.getBytes();
+    int lookupContentOffset = key.getContentOffset();
+    int lookupContentLength = key.getContentLength();
     while (relativeOffset < uncompressedContentEndRelativeOffset) {
-      // Full length is not known yet until parsing
-      KeyValue kv = readKeyValue(relativeOffset);
-      int comp = kv.getKey().compareTo(key);
+      // Compare the entry key against the lookup key directly on the buffer, without materializing
+      // a KeyValue/Key for every scanned entry. Layout at `relativeOffset`: [int keyLength]
+      // [int valueLength][short keyContentLength][key content]...; see KeyValue and Key.
+      int keyContentLength = IOUtils.readShort(byteBuff, relativeOffset + KEY_OFFSET);
+      int keyContentOffset = relativeOffset + KEY_OFFSET + KEY_LENGTH_LENGTH;
+      int comp = IOUtils.compareTo(
+          byteBuff, keyContentOffset, keyContentLength,
+          lookupBytes, lookupContentOffset, lookupContentLength);
       if (comp == 0) {
         // The lookup key equals the key `relativeOffset` points to; the key is found.
-        // Set the cursor to the current offset that points to the exact match
-        cursor.set(relativeOffset + blockStartOffsetInFile, kv);
+        // Materialize the KeyValue once and set the cursor to the exact match.
+        cursor.set(relativeOffset + blockStartOffsetInFile, readKeyValue(relativeOffset));
         return SEEK_TO_FOUND;
       } else if (comp > 0) {
         // There is no matched key (otherwise, the method should already stop there and return 0)
@@ -120,10 +137,10 @@ int seekTo(HFileCursor cursor, Key key, int blockStartOffsetInFile) {
         // So set the cursor to the previous offset, pointing the greatest key in the file that is
         // less than the lookup key.
         if (lastKeyValue.isPresent()) {
-          // If the key-value pair is already, cache it
+          // The previous key-value was already cached (first iteration); reuse it.
           cursor.set(lastRelativeOffset + blockStartOffsetInFile, lastKeyValue.get());
         } else {
-          // Otherwise, defer the read till it's needed
+          // Otherwise, defer the read till it's needed; getKeyValue() materializes it lazily.
           cursor.setOffset(lastRelativeOffset + blockStartOffsetInFile);
         }
         // If the lookup key is lexicographically smaller than the first key pointed to by
@@ -132,11 +149,13 @@ int seekTo(HFileCursor cursor, Key key, int blockStartOffsetInFile) {
         return isAtFirstKey(relativeOffset) ? SEEK_TO_BEFORE_BLOCK_FIRST_KEY : SEEK_TO_IN_RANGE;
       }
       long increment =
-          (long) KEY_OFFSET + (long) kv.getKeyLength() + (long) kv.getValueLength()
+          (long) KEY_OFFSET + (long) IOUtils.readInt(byteBuff, relativeOffset)
+              + (long) IOUtils.readInt(byteBuff, relativeOffset + SIZEOF_INT32)
               + ZERO_TS_VERSION_BYTE_LENGTH;
       lastRelativeOffset = relativeOffset;
       relativeOffset += increment;
-      lastKeyValue = Option.of(kv);
+      // Past entries are not materialized; clear the cache so the "in range" branch above defers.
+      lastKeyValue = Option.empty();
     }
     // We reach the end of the block. Set the cursor to the offset of last key.
     // In this case, the lookup key is greater than the last key.

From 35c14418f1132d9e51e0339715fe90f5f46c9d77 Mon Sep 17 00:00:00 2001
From: Vova Kolmakov <wombatukun@apache.org>
Date: Wed, 17 Jun 2026 08:42:48 +0700
Subject: [PATCH 2/3] addressed review comments: extract entry key/value
 lengths into named locals in HFileDataBlock.seekTo

---
 .../main/java/org/apache/hudi/io/hfile/HFileDataBlock.java   | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/hudi-io/src/main/java/org/apache/hudi/io/hfile/HFileDataBlock.java b/hudi-io/src/main/java/org/apache/hudi/io/hfile/HFileDataBlock.java
index 6d3cb3e397691..1e26fe9f0a417 100644
--- a/hudi-io/src/main/java/org/apache/hudi/io/hfile/HFileDataBlock.java
+++ b/hudi-io/src/main/java/org/apache/hudi/io/hfile/HFileDataBlock.java
@@ -148,9 +148,10 @@ int seekTo(HFileCursor cursor, Key key, int blockStartOffsetInFile) {
         // know that the cursor is ahead of the lookup key in this case.
         return isAtFirstKey(relativeOffset) ? SEEK_TO_BEFORE_BLOCK_FIRST_KEY : SEEK_TO_IN_RANGE;
       }
+      int entryKeyLength = IOUtils.readInt(byteBuff, relativeOffset);
+      int entryValueLength = IOUtils.readInt(byteBuff, relativeOffset + SIZEOF_INT32);
       long increment =
-          (long) KEY_OFFSET + (long) IOUtils.readInt(byteBuff, relativeOffset)
-              + (long) IOUtils.readInt(byteBuff, relativeOffset + SIZEOF_INT32)
+          (long) KEY_OFFSET + (long) entryKeyLength + (long) entryValueLength
               + ZERO_TS_VERSION_BYTE_LENGTH;
       lastRelativeOffset = relativeOffset;
       relativeOffset += increment;

From 82abf7d4bafa68750fe46ddd0e09e982224260a0 Mon Sep 17 00:00:00 2001
From: Vova Kolmakov <wombatukun@apache.org>
Date: Wed, 17 Jun 2026 10:59:12 +0700
Subject: [PATCH 3/3] addressed review comments: add unit test for
 HFileDataBlock.seekTo buffer-direct scan

---
 .../apache/hudi/io/hfile/TestHFileReader.java | 57 +++++++++++++++++++
 1 file changed, 57 insertions(+)

diff --git a/hudi-io/src/test/java/org/apache/hudi/io/hfile/TestHFileReader.java b/hudi-io/src/test/java/org/apache/hudi/io/hfile/TestHFileReader.java
index 96c24a147ee69..1341fb4e47de6 100644
--- a/hudi-io/src/test/java/org/apache/hudi/io/hfile/TestHFileReader.java
+++ b/hudi-io/src/test/java/org/apache/hudi/io/hfile/TestHFileReader.java
@@ -31,6 +31,8 @@
 import org.junit.jupiter.params.provider.MethodSource;
 import org.junit.jupiter.params.provider.ValueSource;
 
+import java.io.ByteArrayOutputStream;
+import java.io.DataOutputStream;
 import java.io.IOException;
 import java.nio.ByteBuffer;
 import java.nio.charset.StandardCharsets;
@@ -843,6 +845,61 @@ public void testReadHFileCompatibility(String hfilePrefix) throws IOException {
     verifyHFileReadCompatibility(bootstrapIndexFile, 4, Option.empty());
   }
 
+  /**
+   * Validates {@link HFileDataBlock#seekTo} when a data block holds many entries, which is the
+   * case the optimization targets: the scan compares each entry key directly against the lookup
+   * key on the backing buffer instead of materializing a {@link KeyValue}/{@link Key} per entry.
+   *
+   * <p>The HFile is written with a small block size so that several entries land in each data
+   * block (and the file spans multiple blocks). Keys/values are zero-padded to a constant width,
+   * so the packing is deterministic (8 entries per block at this block size); the lookups below are
+   * chosen to land in the middle of a block so the scan must iterate past earlier entries before
+   * the comparison resolves. This exercises both the buffer-direct comparison / {@code readInt}
+   * based increment and the deferred-cursor path (the previous key-value is no longer cached for
+   * scanned-past entries, so an in-range result resolves the key lazily via {@code getKeyValue}).
+   */
+  @Test
+  public void testSeekToScanWithinMultiEntryBlocks() throws IOException {
+    int numEntries = 64;
+    // 59 bytes per entry (18-byte key + 20-byte value + 21 extra bytes), so a 512-byte block
+    // holds 8 entries and the 64 entries span 8 data blocks.
+    HFileContext context = HFileContext.builder().blockSize(512).build();
+    ByteArrayOutputStream baos = new ByteArrayOutputStream();
+    try (DataOutputStream outputStream = new DataOutputStream(baos);
+         HFileWriter writer = new HFileWriterImpl(context, outputStream)) {
+      for (int i = 0; i < numEntries; i++) {
+        writer.append(KEY_CREATOR.apply(i), VALUE_CREATOR.apply(i).getBytes(StandardCharsets.UTF_8));
+      }
+    }
+    byte[] content = baos.toByteArray();
+
+    List<KeyLookUpInfo> keyLookUpInfoList = Arrays.asList(
+        // Lookup smaller than the first key: cursor sits before the file's first key.
+        new KeyLookUpInfo("", SEEK_TO_BEFORE_FILE_FIRST_KEY, KEY_CREATOR.apply(0), VALUE_CREATOR.apply(0)),
+        // Exact match on the first entry of the first block (match on the first comparison).
+        new KeyLookUpInfo(KEY_CREATOR.apply(0), SEEK_TO_FOUND, KEY_CREATOR.apply(0), VALUE_CREATOR.apply(0)),
+        // Exact match on the last entry of a block: the scan walks past the earlier 7 entries.
+        new KeyLookUpInfo(KEY_CREATOR.apply(7), SEEK_TO_FOUND, KEY_CREATOR.apply(7), VALUE_CREATOR.apply(7)),
+        // Exact match mid-block.
+        new KeyLookUpInfo(KEY_CREATOR.apply(25), SEEK_TO_FOUND, KEY_CREATOR.apply(25), VALUE_CREATOR.apply(25)),
+        // Lookup strictly between two adjacent mid-block keys: in range, cursor resolves to the
+        // lower key via the deferred read (the scanned-past key-value is not cached).
+        new KeyLookUpInfo(KEY_CREATOR.apply(30) + "a", SEEK_TO_IN_RANGE, KEY_CREATOR.apply(30), VALUE_CREATOR.apply(30)),
+        // Exact match in a later block.
+        new KeyLookUpInfo(KEY_CREATOR.apply(50), SEEK_TO_FOUND, KEY_CREATOR.apply(50), VALUE_CREATOR.apply(50)),
+        // Exact match on the very last entry of the file.
+        new KeyLookUpInfo(KEY_CREATOR.apply(numEntries - 1), SEEK_TO_FOUND,
+            KEY_CREATOR.apply(numEntries - 1), VALUE_CREATOR.apply(numEntries - 1)),
+        // Lookup greater than the last key: end of file.
+        new KeyLookUpInfo(KEY_CREATOR.apply(numEntries - 1) + "a", SEEK_TO_EOF, "", ""));
+
+    try (HFileReader reader = new HFileReaderImpl(
+        new ByteArraySeekableDataInputStream(new ByteBufferBackedInputStream(content)), content.length)) {
+      reader.initializeMetadata();
+      verifyHFileSeekToReads(reader, keyLookUpInfoList);
+    }
+  }
+
   public static byte[] readHFileFromResources(String filename) throws IOException {
     long size = TestHFileReader.class
         .getResource(filename).openConnection().getContentLength();