From bceb7f1074face0c47b179819a479fc9860c04a0 Mon Sep 17 00:00:00 2001
From: Rahil Chertara <rchertara@gmail.com>
Date: Sun, 3 May 2026 15:37:26 -0700
Subject: [PATCH 1/2] feat: Add Parquet DESCRIPTOR mode for blob inline reading
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When hoodie.read.blob.inline.mode=DESCRIPTOR is set with Parquet base
files, leverage Parquet's nested column projection to skip reading the
blob 'data' sub-column entirely (genuine I/O savings). Previously the
config only affected Lance reads; Parquet still materialized the bytes.

Approach mirrors the existing VECTOR column rewrite pattern in
HoodieFileGroupReaderBasedFileFormat:
1. Detect blob columns via schema metadata
2. Strip the 'data' sub-field from blob structs in the read schema
3. Post-read null-pad the 'data' field back into output rows

Both COW (HoodieFileGroupReaderBasedFileFormat.readBaseFile) and MOR
(SparkFileFormatInternalRowReaderContext.getFileRecordIterator) paths
are covered. Also adds defensive null check in BatchedBlobReader.

read_blob() on Parquet DESCRIPTOR rows returns null since Parquet has
no byte-range blob access like Lance — documented as known limitation.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .../io/storage/VectorConversionUtils.java     | 100 ++++++++++++++++++
 ...rkFileFormatInternalRowReaderContext.scala |  55 +++++++++-
 .../common/config/HoodieReaderConfig.java     |  10 +-
 .../hudi/HoodieHadoopFsRelationFactory.scala  |   7 +-
 ...HoodieFileGroupReaderBasedFileFormat.scala |  93 ++++++++++++++--
 .../sql/hudi/blob/BatchedBlobReader.scala     |   4 +-
 6 files changed, 251 insertions(+), 18 deletions(-)
diff --git a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/VectorConversionUtils.java b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/VectorConversionUtils.java
index a8cc02f58dad3..3803c76cd7777 100644
--- a/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/VectorConversionUtils.java
+++ b/hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/VectorConversionUtils.java
@@ -38,7 +38,9 @@
 import org.apache.spark.sql.catalyst.expressions.GenericInternalRow;
 
 import java.nio.ByteBuffer;
+import java.util.ArrayList;
 import java.util.LinkedHashMap;
+import java.util.LinkedHashSet;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
@@ -265,6 +267,104 @@ public static void convertRowVectorColumns(InternalRow row, GenericInternalRow r
     }
   }
 
+  // ---------------------------------------------------------------------------
+  // Blob descriptor-mode helpers (Parquet DESCRIPTOR read path)
+  // ---------------------------------------------------------------------------
+
+  /**
+   * Detects BLOB columns from Spark StructType metadata annotations.
+   *
+   * @param schema Spark StructType (may be null)
+   * @return set of field ordinals that are BLOB columns; empty set if none found
+   */
+  public static Set<Integer> detectBlobColumnsFromMetadata(StructType schema) {
+    Set<Integer> blobColumnIndices = new LinkedHashSet<>();
+    if (schema == null) {
+      return blobColumnIndices;
+    }
+    StructField[] fields = schema.fields();
+    for (int i = 0; i < fields.length; i++) {
+      StructField field = fields[i];
+      if (field.metadata().contains(HoodieSchema.TYPE_METADATA_FIELD)) {
+        String typeStr = field.metadata().getString(HoodieSchema.TYPE_METADATA_FIELD);
+        HoodieSchema parsed = HoodieSchema.parseTypeDescriptor(typeStr);
+        if (parsed != null && parsed.getType() == HoodieSchemaType.BLOB) {
+          blobColumnIndices.add(i);
+        }
+      }
+    }
+    return blobColumnIndices;
+  }
+
+  /**
+   * Strips the {@code data} sub-field from BLOB struct columns so the Parquet reader
+   * skips the binary column chunk entirely (genuine I/O savings).
+   *
+   * <p>The returned schema has 2-field blob structs: {@code {type, reference}} instead of
+   * the full {@code {type, data, reference}}. Use {@link #buildBlobNullPadRowMapper} to
+   * re-insert null at the {@code data} position after reading.
+   *
+   * @param schema      the original Spark schema
+   * @param blobColumns ordinals of blob columns (from {@link #detectBlobColumnsFromMetadata})
+   * @return a new StructType with the {@code data} sub-field removed from blob structs
+   */
+  public static StructType stripBlobDataField(StructType schema, Set<Integer> blobColumns) {
+    StructField[] fields = schema.fields();
+    StructField[] newFields = new StructField[fields.length];
+    for (int i = 0; i < fields.length; i++) {
+      if (blobColumns.contains(i) && fields[i].dataType() instanceof StructType) {
+        StructType blobStruct = (StructType) fields[i].dataType();
+        List<StructField> kept = new ArrayList<>();
+        for (StructField sub : blobStruct.fields()) {
+          if (!sub.name().equals(HoodieSchema.Blob.INLINE_DATA_FIELD)) {
+            kept.add(sub);
+          }
+        }
+        StructType strippedStruct = new StructType(kept.toArray(new StructField[0]));
+        newFields[i] = new StructField(fields[i].name(), strippedStruct, fields[i].nullable(), fields[i].metadata());
+      } else {
+        newFields[i] = fields[i];
+      }
+    }
+    return new StructType(newFields);
+  }
+
+  /**
+   * Returns a {@link Function} that expands 2-field blob structs {@code {type, reference}}
+   * back to 3-field structs {@code {type, null, reference}} by inserting null at the
+   * {@code data} position, then applies the projection callback.
+   *
+   * @param readSchema         the Spark schema of incoming rows (blob structs have 2 fields)
+   * @param blobColumns        ordinals of blob columns in {@code readSchema}
+   * @param projectionCallback called with the expanded row; must copy any data it needs to retain
+   * @return a function that converts one row and returns the projected result
+   */
+  public static Function<InternalRow, InternalRow> buildBlobNullPadRowMapper(
+      StructType readSchema,
+      Set<Integer> blobColumns,
+      Function<InternalRow, InternalRow> projectionCallback) {
+    int numFields = readSchema.fields().length;
+    GenericInternalRow buffer = new GenericInternalRow(numFields);
+    return row -> {
+      for (int i = 0; i < numFields; i++) {
+        if (row.isNullAt(i)) {
+          buffer.setNullAt(i);
+        } else if (blobColumns.contains(i)) {
+          InternalRow blobStruct = row.getStruct(i, 2);
+          // Expand {type, reference} → {type, null, reference}
+          GenericInternalRow expanded = new GenericInternalRow(3);
+          expanded.update(0, blobStruct.isNullAt(0) ? null : blobStruct.getUTF8String(0));
+          expanded.setNullAt(1);
+          expanded.update(2, blobStruct.isNullAt(1) ? null : blobStruct.getStruct(1, HoodieSchema.Blob.getReferenceFieldCount()));
+          buffer.update(i, expanded);
+        } else {
+          buffer.update(i, row.get(i, readSchema.apply(i).dataType()));
+        }
+      }
+      return projectionCallback.apply(buffer);
+    };
+  }
+
   /**
    * Re-attaches {@link HoodieSchema#TYPE_METADATA_FIELD} to Spark fields that are
    * Arrow {@code FixedSizeList<Float32|Float64, dim>} in the Lance file.
diff --git a/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/SparkFileFormatInternalRowReaderContext.scala b/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/SparkFileFormatInternalRowReaderContext.scala
index 92b963f683906..8c50f7886589b 100644
--- a/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/SparkFileFormatInternalRowReaderContext.scala
+++ b/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/SparkFileFormatInternalRowReaderContext.scala
@@ -97,7 +97,29 @@ class SparkFileFormatInternalRowReaderContext(baseFileReader: SparkColumnarFileR
       structType
     }
 
-    val (readSchema, readFilters) = getSchemaAndFiltersForRead(parquetReadStructType, hasRowIndexField)
+    // Blob DESCRIPTOR mode: strip `data` sub-field from blob structs for Parquet base files.
+    // Applied after vector rewrite; not applied to Lance base files or log files.
+    val isParquetBaseFile = FSUtils.isBaseFile(filePath) && !isLanceBaseFile
+    val isBlobDescriptorMode = isParquetBaseFile && {
+      val hadoopConf = storageConfiguration.unwrapAs(classOf[Configuration])
+      import org.apache.hudi.common.config.HoodieReaderConfig
+      val modeValue = hadoopConf.get(HoodieReaderConfig.BLOB_INLINE_READ_MODE.key(),
+        HoodieReaderConfig.BLOB_INLINE_READ_MODE.defaultValue())
+      modeValue.equalsIgnoreCase(HoodieReaderConfig.BLOB_INLINE_READ_MODE_DESCRIPTOR)
+    }
+    val blobColumnIndices: Set[Int] = if (isBlobDescriptorMode) {
+      VectorConversionUtils.detectBlobColumnsFromMetadata(parquetReadStructType).asScala.map(_.intValue()).toSet
+    } else {
+      Set.empty
+    }
+    val blobReadStructType = if (blobColumnIndices.nonEmpty) {
+      val javaBlobCols: java.util.Set[Integer] = blobColumnIndices.map(Integer.valueOf).asJava
+      VectorConversionUtils.stripBlobDataField(parquetReadStructType, javaBlobCols)
+    } else {
+      parquetReadStructType
+    }
+
+    val (readSchema, readFilters) = getSchemaAndFiltersForRead(blobReadStructType, hasRowIndexField)
     if (FSUtils.isLogFile(filePath)) {
       // NOTE: now only primary key based filtering is supported for log files
       new HoodieSparkFileReaderFactory(storage).newParquetFileReader(filePath)
@@ -120,12 +142,18 @@ class SparkFileFormatInternalRowReaderContext(baseFileReader: SparkColumnarFileR
         readSchema, StructType(Seq.empty), getSchemaHandler.getInternalSchemaOpt,
         readFilters, storage.getConf.asInstanceOf[StorageConfiguration[Configuration]], tableSchemaOpt))
 
-      // Post-process: convert binary VECTOR columns back to typed arrays
-      if (vectorColumnInfo.nonEmpty) {
-        SparkFileFormatInternalRowReaderContext.wrapWithVectorConversion(rawIterator, vectorColumnInfo, readSchema)
+      // Post-process: re-insert null `data` field into blob structs, then convert vectors
+      val blobPaddedIterator = if (blobColumnIndices.nonEmpty) {
+        SparkFileFormatInternalRowReaderContext.wrapWithBlobNullPadding(rawIterator, blobColumnIndices, readSchema, parquetReadStructType)
       } else {
         rawIterator
       }
+
+      if (vectorColumnInfo.nonEmpty) {
+        SparkFileFormatInternalRowReaderContext.wrapWithVectorConversion(blobPaddedIterator, vectorColumnInfo, if (blobColumnIndices.nonEmpty) parquetReadStructType else readSchema)
+      } else {
+        blobPaddedIterator
+      }
     }
   }
 
@@ -375,4 +403,23 @@ object SparkFileFormatInternalRowReaderContext {
     }
   }
 
+  /**
+   * Wraps a closable iterator to re-insert null {@code data} fields into blob structs
+   * after Parquet DESCRIPTOR mode read (expanding 2-field → 3-field structs).
+   */
+  private[hudi] def wrapWithBlobNullPadding(
+      iterator: ClosableIterator[InternalRow],
+      blobColumnIndices: Set[Int],
+      readSchema: StructType,
+      targetSchema: StructType): ClosableIterator[InternalRow] = {
+    val javaBlobCols: java.util.Set[Integer] = blobColumnIndices.map(Integer.valueOf).asJava
+    val projection = UnsafeProjection.create(targetSchema)
+    val mapper = VectorConversionUtils.buildBlobNullPadRowMapper(readSchema, javaBlobCols, projection.apply(_))
+    new ClosableIterator[InternalRow] {
+      override def hasNext: Boolean = iterator.hasNext
+      override def next(): InternalRow = mapper.apply(iterator.next())
+      override def close(): Unit = iterator.close()
+    }
+  }
+
 }
diff --git a/hudi-common/src/main/java/org/apache/hudi/common/config/HoodieReaderConfig.java b/hudi-common/src/main/java/org/apache/hudi/common/config/HoodieReaderConfig.java
index 942d1aeabb503..1f759cc21a152 100644
--- a/hudi-common/src/main/java/org/apache/hudi/common/config/HoodieReaderConfig.java
+++ b/hudi-common/src/main/java/org/apache/hudi/common/config/HoodieReaderConfig.java
@@ -113,7 +113,11 @@ public class HoodieReaderConfig extends HoodieConfig {
       .withValidValues(BLOB_INLINE_READ_MODE_CONTENT, BLOB_INLINE_READ_MODE_DESCRIPTOR)
       .withDocumentation("How Hudi interprets INLINE BLOB values on read. "
           + "CONTENT (default) returns the raw inline bytes. "
-          + "DESCRIPTOR returns an OUT_OF_LINE-shaped reference pointing at the backing "
-          + "Lance file with the INLINE payload's position and size, so callers can defer "
-          + "the byte read via read_blob().");
+          + "DESCRIPTOR suppresses the inline bytes (data field is null) and returns metadata only, "
+          + "avoiding the I/O cost of reading large binary payloads. "
+          + "For Lance files, the reference struct is populated with blob stream coordinates "
+          + "so read_blob() can materialize bytes on demand. "
+          + "For Parquet files, the data column is skipped via column projection; "
+          + "the reference struct is null and read_blob() returns null. "
+          + "Use CONTENT mode when bytes are needed from Parquet tables.");
 }
diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieHadoopFsRelationFactory.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieHadoopFsRelationFactory.scala
index a11e3075a6fb7..f444d24d067bc 100644
--- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieHadoopFsRelationFactory.scala
+++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieHadoopFsRelationFactory.scala
@@ -237,11 +237,16 @@ abstract class HoodieBaseHadoopFsRelationFactory(val sqlContext: SQLContext,
 
   override def buildFileFormat(): FileFormat = {
     val tableConfig = metaClient.getTableConfig
+    val blobDescriptorMode = optParams.getOrElse(
+      HoodieReaderConfig.BLOB_INLINE_READ_MODE.key(),
+      HoodieReaderConfig.BLOB_INLINE_READ_MODE.defaultValue()
+    ).equalsIgnoreCase(HoodieReaderConfig.BLOB_INLINE_READ_MODE_DESCRIPTOR)
     new HoodieFileGroupReaderBasedFileFormat(basePath.toString,
       HoodieTableSchema(tableStructSchema, tableSchema, internalSchemaOpt),
       tableConfig.getTableName, queryTimestamp.get, getMandatoryFields, isMOR, isBootstrap,
       isIncremental, validCommits, shouldUseRecordPosition, getRequiredFilters,
-      tableConfig.isMultipleBaseFileFormatsEnabled, tableConfig.getBaseFileFormat)
+      tableConfig.isMultipleBaseFileFormatsEnabled, tableConfig.getBaseFileFormat,
+      isBlobDescriptorMode = blobDescriptorMode)
   }
 
   override def buildBucketSpec(): Option[BucketSpec] = None
diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/HoodieFileGroupReaderBasedFileFormat.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/HoodieFileGroupReaderBasedFileFormat.scala
index 3da22ff8ebe7e..de4d57f4d93d8 100644
--- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/HoodieFileGroupReaderBasedFileFormat.scala
+++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/HoodieFileGroupReaderBasedFileFormat.scala
@@ -61,7 +61,7 @@ import org.apache.spark.util.SerializableConfiguration
 
 import java.io.Closeable
 
-import scala.collection.JavaConverters.mapAsJavaMapConverter
+import scala.collection.JavaConverters.{mapAsJavaMapConverter, setAsJavaSetConverter}
 
 trait HoodieFormatTrait {
 
@@ -86,7 +86,8 @@ class HoodieFileGroupReaderBasedFileFormat(tablePath: String,
                                            shouldUseRecordPosition: Boolean,
                                            requiredFilters: Seq[Filter],
                                            isMultipleBaseFileFormatsEnabled: Boolean,
-                                           hoodieFileFormat: HoodieFileFormat)
+                                           hoodieFileFormat: HoodieFileFormat,
+                                           isBlobDescriptorMode: Boolean = false)
   extends ParquetFileFormat with SparkAdapterSupport with HoodieFormatTrait with Logging with Serializable {
 
   private lazy val schema = tableSchema.schema
@@ -133,6 +134,19 @@ class HoodieFileGroupReaderBasedFileFormat(tablePath: String,
     }
   }
 
+  @transient private var cachedBlobDetection: (StructType, Set[Int]) = _
+
+  private def detectBlobColumnsCached(schema: StructType): Set[Int] = {
+    if (cachedBlobDetection != null && (cachedBlobDetection._1 eq schema)) {
+      cachedBlobDetection._2
+    } else {
+      import scala.collection.JavaConverters._
+      val result = VectorConversionUtils.detectBlobColumnsFromMetadata(schema).asScala.map(_.intValue()).toSet
+      cachedBlobDetection = (schema, result)
+      result
+    }
+  }
+
   /**
    * Checks if the file format supports vectorized reading, please refer to SPARK-40918.
    *
@@ -151,6 +165,12 @@ class HoodieFileGroupReaderBasedFileFormat(tablePath: String,
       supportVectorizedRead = false
       supportReturningBatch = false
       false
+    } else if (isBlobDescriptorMode && detectBlobColumnsCached(schema).nonEmpty) {
+      // Blob DESCRIPTOR mode strips the data sub-field from blob structs and null-pads
+      // post-read, which requires row-level access.
+      supportVectorizedRead = false
+      supportReturningBatch = false
+      false
     } else {
       val conf = sparkSession.sessionState.conf
       val parquetBatchSupported = ParquetUtils.isBatchReadSupportedForSchema(conf, schema) && supportBatchWithTableSchema
@@ -457,6 +477,39 @@ class HoodieFileGroupReaderBasedFileFormat(tablePath: String,
     }
   }
 
+  /**
+   * Detects BLOB columns and strips the {@code data} sub-field when DESCRIPTOR mode is active.
+   * Only applies to Parquet format; other formats handle DESCRIPTOR mode natively.
+   */
+  private def withBlobDescriptorRewrite(schema: StructType): (StructType, Set[Int]) = {
+    if (hoodieFileFormat != HoodieFileFormat.PARQUET) {
+      (schema, Set.empty[Int])
+    } else {
+      import scala.collection.JavaConverters._
+      val blobCols = VectorConversionUtils.detectBlobColumnsFromMetadata(schema).asScala.map(_.intValue()).toSet
+      if (blobCols.isEmpty) {
+        (schema, blobCols)
+      } else {
+        val javaBlobCols: java.util.Set[Integer] = blobCols.map(Integer.valueOf).asJava
+        (VectorConversionUtils.stripBlobDataField(schema, javaBlobCols), blobCols)
+      }
+    }
+  }
+
+  /**
+   * Wraps an iterator to re-insert null {@code data} fields into blob structs
+   * after Parquet DESCRIPTOR mode read (expanding 2-field → 3-field structs).
+   */
+  private def wrapWithBlobNullPadding(iter: Iterator[InternalRow],
+                                       readSchema: StructType,
+                                       targetSchema: StructType,
+                                       blobCols: Set[Int]): Iterator[InternalRow] = {
+    val blobProjection = UnsafeProjection.create(targetSchema)
+    val javaBlobCols: java.util.Set[Integer] = blobCols.map(Integer.valueOf).asJava
+    val mapper = VectorConversionUtils.buildBlobNullPadRowMapper(readSchema, javaBlobCols, blobProjection.apply(_))
+    iter.map(mapper.apply(_))
+  }
+
   /**
    * Wraps an iterator to convert binary VECTOR columns back to typed arrays.
    * The read schema has BinaryType for vector columns; the target schema has ArrayType.
@@ -486,16 +539,23 @@ class HoodieFileGroupReaderBasedFileFormat(tablePath: String,
     val (modifiedOutputSchema, outputVectorCols) = if (hasVectors) withVectorRewrite(outputSchema) else (outputSchema, Map.empty[Int, HoodieSchema.Vector])
     val (modifiedRequestedSchema, _) = if (hasVectors) withVectorRewrite(requestedSchema) else (requestedSchema, Map.empty[Int, HoodieSchema.Vector])
 
+    // Blob DESCRIPTOR mode: strip `data` sub-field from blob structs so Parquet skips
+    // those column chunks entirely (real I/O savings). Applied after vector rewrite.
+    val (blobRequiredSchema, blobCols) = if (isBlobDescriptorMode) withBlobDescriptorRewrite(modifiedRequiredSchema) else (modifiedRequiredSchema, Set.empty[Int])
+    val hasBlobs = blobCols.nonEmpty
+    val (blobOutputSchema, outputBlobCols) = if (hasBlobs) withBlobDescriptorRewrite(modifiedOutputSchema) else (modifiedOutputSchema, Set.empty[Int])
+    val (blobRequestedSchema, _) = if (hasBlobs) withBlobDescriptorRewrite(modifiedRequestedSchema) else (modifiedRequestedSchema, Set.empty[Int])
+
     val rawIter = if (remainingPartitionSchema.fields.length == partitionSchema.fields.length) {
       //none of partition fields are read from the file, so the reader will do the appending for us
-      parquetFileReader.read(file, modifiedRequiredSchema, partitionSchema, internalSchemaOpt, filters, storageConf, tableSchemaAsMessageType)
+      parquetFileReader.read(file, blobRequiredSchema, partitionSchema, internalSchemaOpt, filters, storageConf, tableSchemaAsMessageType)
     } else if (remainingPartitionSchema.fields.length == 0) {
       //we read all of the partition fields from the file
       val pfileUtils = sparkAdapter.getSparkPartitionedFileUtils
       //we need to modify the partitioned file so that the partition values are empty
       val modifiedFile = pfileUtils.createPartitionedFile(InternalRow.empty, pfileUtils.getPathFromPartitionedFile(file), file.start, file.length)
       //and we pass an empty schema for the partition schema
-      parquetFileReader.read(modifiedFile, modifiedOutputSchema, new StructType(), internalSchemaOpt, filters, storageConf, tableSchemaAsMessageType)
+      parquetFileReader.read(modifiedFile, blobOutputSchema, new StructType(), internalSchemaOpt, filters, storageConf, tableSchemaAsMessageType)
     } else {
       //need to do an additional projection here. The case in mind is that partition schema is "a,b,c" mandatoryFields is "a,c",
       //then we will read (dataSchema + a + c) and append b. So the final schema will be (data schema + a + c +b)
@@ -503,8 +563,25 @@ class HoodieFileGroupReaderBasedFileFormat(tablePath: String,
       val pfileUtils = sparkAdapter.getSparkPartitionedFileUtils
       val partitionValues = getFixedPartitionValues(file.partitionValues, partitionSchema, fixedPartitionIndexes)
       val modifiedFile = pfileUtils.createPartitionedFile(partitionValues, pfileUtils.getPathFromPartitionedFile(file), file.start, file.length)
-      val iter = parquetFileReader.read(modifiedFile, modifiedRequestedSchema, remainingPartitionSchema, internalSchemaOpt, filters, storageConf, tableSchemaAsMessageType)
-      projectIter(iter, StructType(modifiedRequestedSchema.fields ++ remainingPartitionSchema.fields), modifiedOutputSchema)
+      val iter = parquetFileReader.read(modifiedFile, blobRequestedSchema, remainingPartitionSchema, internalSchemaOpt, filters, storageConf, tableSchemaAsMessageType)
+      projectIter(iter, StructType(blobRequestedSchema.fields ++ remainingPartitionSchema.fields), blobOutputSchema)
+    }
+
+    // Post-read: re-insert null `data` field into blob structs (expanding 2-field → 3-field)
+    val blobPaddedIter = if (hasBlobs) {
+      val readSchema = if (remainingPartitionSchema.fields.length == partitionSchema.fields.length) {
+        StructType(blobRequiredSchema.fields ++ partitionSchema.fields)
+      } else {
+        blobOutputSchema
+      }
+      val targetSchema = if (remainingPartitionSchema.fields.length == partitionSchema.fields.length) {
+        StructType(modifiedRequiredSchema.fields ++ partitionSchema.fields)
+      } else {
+        modifiedOutputSchema
+      }
+      wrapWithBlobNullPadding(rawIter, readSchema, targetSchema, outputBlobCols)
+    } else {
+      rawIter
     }
 
     if (hasVectors) {
@@ -514,9 +591,9 @@ class HoodieFileGroupReaderBasedFileFormat(tablePath: String,
       } else {
         modifiedOutputSchema
       }
-      wrapWithVectorConversion(rawIter, readSchema, outputSchema, outputVectorCols)
+      wrapWithVectorConversion(blobPaddedIter, readSchema, outputSchema, outputVectorCols)
     } else {
-      rawIter
+      blobPaddedIter
     }
   }
 
diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/blob/BatchedBlobReader.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/blob/BatchedBlobReader.scala
index 0328cdd0c5c22..3c6524369a12f 100644
--- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/blob/BatchedBlobReader.scala
+++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/blob/BatchedBlobReader.scala
@@ -208,8 +208,8 @@ class BatchedBlobReader(
             // Dispatch based on storage_type (field 0)
             val storageType = accessor.getString(blobStruct, 0)
             if (storageType == HoodieSchema.Blob.INLINE) {
-              // Case 1: Inline — bytes are in field 1
-              val bytes = accessor.getBytes(blobStruct, 1)
+              // Case 1: Inline — bytes are in field 1 (may be null in DESCRIPTOR mode)
+              val bytes = if (accessor.isNullAt(blobStruct, 1)) null else accessor.getBytes(blobStruct, 1)
               batch += RowInfo[R](
                 originalRow = row,
                 filePath = "",

From fd7c3daf0ddd52f0839bca49bbfaf06ffbd9bf7b Mon Sep 17 00:00:00 2001
From: Rahil Chertara <rchertara@gmail.com>
Date: Thu, 14 May 2026 13:38:19 -0700
Subject: [PATCH 2/2] fix logic around read_blob and descriptor mode

---
 ...rkFileFormatInternalRowReaderContext.scala |  18 +-
 .../common/config/HoodieReaderConfig.java     |  26 +-
 ...HoodieFileGroupReaderBasedFileFormat.scala |  55 +++-
 .../spark/sql/hudi/blob/ReadBlobRule.scala    | 133 +++++---
 .../apache/hudi/blob/TestReadBlobSQL.scala    | 298 +++++++++++++++++-
 5 files changed, 462 insertions(+), 68 deletions(-)

diff --git a/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/SparkFileFormatInternalRowReaderContext.scala b/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/SparkFileFormatInternalRowReaderContext.scala
index 8c50f7886589b..22d013830d703 100644
--- a/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/SparkFileFormatInternalRowReaderContext.scala
+++ b/hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/SparkFileFormatInternalRowReaderContext.scala
@@ -99,16 +99,28 @@ class SparkFileFormatInternalRowReaderContext(baseFileReader: SparkColumnarFileR
 
     // Blob DESCRIPTOR mode: strip `data` sub-field from blob structs for Parquet base files.
     // Applied after vector rewrite; not applied to Lance base files or log files.
+    // Columns referenced by read_blob() in the current query (carried in the Hadoop conf via
+    // BLOB_INLINE_READ_FORCE_CONTENT_COLUMNS, set by ReadBlobRule per-query) are excluded from
+    // the strip set so the bytes survive for BatchedBlobReader to materialize.
     val isParquetBaseFile = FSUtils.isBaseFile(filePath) && !isLanceBaseFile
+    import org.apache.hudi.common.config.HoodieReaderConfig
+    val hadoopConf = storageConfiguration.unwrapAs(classOf[Configuration])
     val isBlobDescriptorMode = isParquetBaseFile && {
-      val hadoopConf = storageConfiguration.unwrapAs(classOf[Configuration])
-      import org.apache.hudi.common.config.HoodieReaderConfig
       val modeValue = hadoopConf.get(HoodieReaderConfig.BLOB_INLINE_READ_MODE.key(),
         HoodieReaderConfig.BLOB_INLINE_READ_MODE.defaultValue())
       modeValue.equalsIgnoreCase(HoodieReaderConfig.BLOB_INLINE_READ_MODE_DESCRIPTOR)
     }
+    val forceContentCols: Set[String] = if (isBlobDescriptorMode) {
+      Option(hadoopConf.get(HoodieReaderConfig.BLOB_INLINE_READ_FORCE_CONTENT_COLUMNS))
+        .map(_.split(",").iterator.map(_.trim).filter(_.nonEmpty).toSet)
+        .getOrElse(Set.empty)
+    } else {
+      Set.empty
+    }
     val blobColumnIndices: Set[Int] = if (isBlobDescriptorMode) {
-      VectorConversionUtils.detectBlobColumnsFromMetadata(parquetReadStructType).asScala.map(_.intValue()).toSet
+      val detected = VectorConversionUtils.detectBlobColumnsFromMetadata(parquetReadStructType).asScala.map(_.intValue()).toSet
+      if (forceContentCols.isEmpty) detected
+      else detected.filterNot(idx => forceContentCols.contains(parquetReadStructType.fields(idx).name))
     } else {
       Set.empty
     }
diff --git a/hudi-common/src/main/java/org/apache/hudi/common/config/HoodieReaderConfig.java b/hudi-common/src/main/java/org/apache/hudi/common/config/HoodieReaderConfig.java
index 1f759cc21a152..d06ff28d22391 100644
--- a/hudi-common/src/main/java/org/apache/hudi/common/config/HoodieReaderConfig.java
+++ b/hudi-common/src/main/java/org/apache/hudi/common/config/HoodieReaderConfig.java
@@ -111,13 +111,21 @@ public class HoodieReaderConfig extends HoodieConfig {
       .markAdvanced()
       .sinceVersion("1.2.0")
       .withValidValues(BLOB_INLINE_READ_MODE_CONTENT, BLOB_INLINE_READ_MODE_DESCRIPTOR)
-      .withDocumentation("How Hudi interprets INLINE BLOB values on read. "
-          + "CONTENT (default) returns the raw inline bytes. "
-          + "DESCRIPTOR suppresses the inline bytes (data field is null) and returns metadata only, "
-          + "avoiding the I/O cost of reading large binary payloads. "
-          + "For Lance files, the reference struct is populated with blob stream coordinates "
-          + "so read_blob() can materialize bytes on demand. "
-          + "For Parquet files, the data column is skipped via column projection; "
-          + "the reference struct is null and read_blob() returns null. "
-          + "Use CONTENT mode when bytes are needed from Parquet tables.");
+      .withDocumentation("How Hudi interprets INLINE BLOB values on read for plain column access "
+          + "(e.g. SELECT *). "
+          + "CONTENT (default) returns the raw inline bytes in the data field. "
+          + "DESCRIPTOR suppresses the inline bytes (data field is null) so direct column reads "
+          + "avoid the I/O cost of materializing large binary payloads. "
+          + "For Lance files, the reference struct is populated with blob stream coordinates. "
+          + "For Parquet files, the data column is skipped via nested column projection and the "
+          + "reference struct is null. "
+          + "read_blob() is the canonical bytes-materializing API and always returns bytes "
+          + "regardless of this setting; under DESCRIPTOR mode the engine reads the data column "
+          + "only for the blob columns referenced by read_blob() in the query.");
+
+  // Internal-only key set by ReadBlobRule on a per-query HadoopFsRelation.options to instruct
+  // the reader to skip the DESCRIPTOR data-column strip for the listed blob columns, so that
+  // read_blob() sees the materialized bytes. Comma-separated top-level column names. Not user-facing.
+  public static final String BLOB_INLINE_READ_FORCE_CONTENT_COLUMNS =
+      "hoodie.internal.read.blob.inline.force.content.columns";
 }
diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/HoodieFileGroupReaderBasedFileFormat.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/HoodieFileGroupReaderBasedFileFormat.scala
index de4d57f4d93d8..5d9de5a99339f 100644
--- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/HoodieFileGroupReaderBasedFileFormat.scala
+++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/HoodieFileGroupReaderBasedFileFormat.scala
@@ -21,7 +21,7 @@ import org.apache.hudi.{HoodieFileIndex, HoodiePartitionCDCFileGroupMapping, Hoo
 import org.apache.hudi.cdc.{CDCFileGroupIterator, HoodieCDCFileGroupSplit, HoodieCDCFileIndex}
 import org.apache.hudi.client.common.HoodieSparkEngineContext
 import org.apache.hudi.client.utils.SparkInternalSchemaConverter
-import org.apache.hudi.common.config.{HoodieMemoryConfig, TypedProperties}
+import org.apache.hudi.common.config.{HoodieMemoryConfig, HoodieReaderConfig, TypedProperties}
 import org.apache.hudi.common.fs.FSUtils
 import org.apache.hudi.common.model.HoodieFileFormat
 import org.apache.hudi.common.schema.HoodieSchema
@@ -86,8 +86,8 @@ class HoodieFileGroupReaderBasedFileFormat(tablePath: String,
                                            shouldUseRecordPosition: Boolean,
                                            requiredFilters: Seq[Filter],
                                            isMultipleBaseFileFormatsEnabled: Boolean,
-                                           hoodieFileFormat: HoodieFileFormat,
-                                           isBlobDescriptorMode: Boolean = false)
+                                           val hoodieFileFormat: HoodieFileFormat,
+                                           val isBlobDescriptorMode: Boolean = false)
   extends ParquetFileFormat with SparkAdapterSupport with HoodieFormatTrait with Logging with Serializable {
 
   private lazy val schema = tableSchema.schema
@@ -258,6 +258,20 @@ class HoodieFileGroupReaderBasedFileFormat(tablePath: String,
     val augmentedStorageConf = new HadoopStorageConfiguration(hadoopConf).getInline
     setSchemaEvolutionConfigs(augmentedStorageConf)
     augmentedStorageConf.set(ENABLE_LOGICAL_TIMESTAMP_REPAIR, hasTimestampMillisFieldInTableSchema.toString)
+
+    // Per-query set of blob columns that read_blob() materializes in the current query, written
+    // into HadoopFsRelation.options by ReadBlobRule. Under DESCRIPTOR mode these columns keep their
+    // `data` sub-field so the bytes flow through to BatchedBlobReader. The conf entry below makes
+    // the same set visible to the MOR path (SparkFileFormatInternalRowReaderContext) at task time.
+    val forceContentCols: Set[String] = options
+      .get(HoodieReaderConfig.BLOB_INLINE_READ_FORCE_CONTENT_COLUMNS)
+      .map(_.split(",").iterator.map(_.trim).filter(_.nonEmpty).toSet)
+      .getOrElse(Set.empty)
+    if (forceContentCols.nonEmpty) {
+      augmentedStorageConf.set(
+        HoodieReaderConfig.BLOB_INLINE_READ_FORCE_CONTENT_COLUMNS,
+        forceContentCols.mkString(","))
+    }
     val (remainingPartitionSchemaArr, fixedPartitionIndexesArr) = partitionSchema.fields.toSeq.zipWithIndex.filter(p => !mandatoryFields.contains(p._1.name)).unzip
 
     // The schema of the partition cols we want to append the value instead of reading from the file
@@ -340,7 +354,7 @@ class HoodieFileGroupReaderBasedFileFormat(tablePath: String,
 
             case _ =>
               readBaseFile(file, baseFileReader.value, requestedStructType, remainingPartitionSchema, fixedPartitionIndexes,
-                requiredSchema, partitionSchema, outputSchema, filters ++ requiredFilters, storageConf)
+                requiredSchema, partitionSchema, outputSchema, filters ++ requiredFilters, storageConf, forceContentCols)
           }
         // CDC queries.
         case hoodiePartitionCDCFileGroupSliceMapping: HoodiePartitionCDCFileGroupMapping =>
@@ -348,7 +362,7 @@ class HoodieFileGroupReaderBasedFileFormat(tablePath: String,
 
         case _ =>
           readBaseFile(file, baseFileReader.value, requestedStructType, remainingPartitionSchema, fixedPartitionIndexes,
-            requiredSchema, partitionSchema, outputSchema, filters ++ requiredFilters, storageConf)
+            requiredSchema, partitionSchema, outputSchema, filters ++ requiredFilters, storageConf, forceContentCols)
       }
       CloseableIteratorListener.addListener(iter)
     }
@@ -480,18 +494,25 @@ class HoodieFileGroupReaderBasedFileFormat(tablePath: String,
   /**
    * Detects BLOB columns and strips the {@code data} sub-field when DESCRIPTOR mode is active.
    * Only applies to Parquet format; other formats handle DESCRIPTOR mode natively.
+   *
+   * @param forceContentCols Top-level column names that must keep their {@code data} sub-field
+   *                         (i.e. the columns the current query reads via {@code read_blob()}).
+   *                         These are excluded from the strip set so the bytes are materialized.
    */
-  private def withBlobDescriptorRewrite(schema: StructType): (StructType, Set[Int]) = {
+  private def withBlobDescriptorRewrite(schema: StructType,
+                                        forceContentCols: Set[String]): (StructType, Set[Int]) = {
     if (hoodieFileFormat != HoodieFileFormat.PARQUET) {
       (schema, Set.empty[Int])
     } else {
       import scala.collection.JavaConverters._
-      val blobCols = VectorConversionUtils.detectBlobColumnsFromMetadata(schema).asScala.map(_.intValue()).toSet
-      if (blobCols.isEmpty) {
-        (schema, blobCols)
+      val detected = VectorConversionUtils.detectBlobColumnsFromMetadata(schema).asScala.map(_.intValue()).toSet
+      val toStrip = if (forceContentCols.isEmpty) detected
+        else detected.filterNot(idx => forceContentCols.contains(schema.fields(idx).name))
+      if (toStrip.isEmpty) {
+        (schema, Set.empty[Int])
       } else {
-        val javaBlobCols: java.util.Set[Integer] = blobCols.map(Integer.valueOf).asJava
-        (VectorConversionUtils.stripBlobDataField(schema, javaBlobCols), blobCols)
+        val javaBlobCols: java.util.Set[Integer] = toStrip.map(Integer.valueOf).asJava
+        (VectorConversionUtils.stripBlobDataField(schema, javaBlobCols), toStrip)
       }
     }
   }
@@ -529,7 +550,8 @@ class HoodieFileGroupReaderBasedFileFormat(tablePath: String,
   private def readBaseFile(file: PartitionedFile, parquetFileReader: SparkColumnarFileReader, requestedSchema: StructType,
                            remainingPartitionSchema: StructType, fixedPartitionIndexes: Set[Int], requiredSchema: StructType,
                            partitionSchema: StructType, outputSchema: StructType, filters: Seq[Filter],
-                           storageConf: StorageConfiguration[Configuration]): Iterator[InternalRow] = {
+                           storageConf: StorageConfiguration[Configuration],
+                           forceContentCols: Set[String]): Iterator[InternalRow] = {
     // Detect vector columns and create modified schemas with BinaryType.
     // Each schema is detected independently because ordinals are relative to the schema being
     // modified — outputSchema and requestedSchema may have vector columns at different positions
@@ -541,10 +563,13 @@ class HoodieFileGroupReaderBasedFileFormat(tablePath: String,
 
     // Blob DESCRIPTOR mode: strip `data` sub-field from blob structs so Parquet skips
     // those column chunks entirely (real I/O savings). Applied after vector rewrite.
-    val (blobRequiredSchema, blobCols) = if (isBlobDescriptorMode) withBlobDescriptorRewrite(modifiedRequiredSchema) else (modifiedRequiredSchema, Set.empty[Int])
+    // `forceContentCols` carries the columns referenced by read_blob() in the current query
+    // (set by ReadBlobRule via HadoopFsRelation.options); those columns retain `data` so bytes
+    // are materialized for read_blob().
+    val (blobRequiredSchema, blobCols) = if (isBlobDescriptorMode) withBlobDescriptorRewrite(modifiedRequiredSchema, forceContentCols) else (modifiedRequiredSchema, Set.empty[Int])
     val hasBlobs = blobCols.nonEmpty
-    val (blobOutputSchema, outputBlobCols) = if (hasBlobs) withBlobDescriptorRewrite(modifiedOutputSchema) else (modifiedOutputSchema, Set.empty[Int])
-    val (blobRequestedSchema, _) = if (hasBlobs) withBlobDescriptorRewrite(modifiedRequestedSchema) else (modifiedRequestedSchema, Set.empty[Int])
+    val (blobOutputSchema, outputBlobCols) = if (hasBlobs) withBlobDescriptorRewrite(modifiedOutputSchema, forceContentCols) else (modifiedOutputSchema, Set.empty[Int])
+    val (blobRequestedSchema, _) = if (hasBlobs) withBlobDescriptorRewrite(modifiedRequestedSchema, forceContentCols) else (modifiedRequestedSchema, Set.empty[Int])
 
     val rawIter = if (remainingPartitionSchema.fields.length == partitionSchema.fields.length) {
       //none of partition fields are read from the file, so the reader will do the appending for us
diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/blob/ReadBlobRule.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/blob/ReadBlobRule.scala
index b91d08674cb95..e5b39cc7bc196 100644
--- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/blob/ReadBlobRule.scala
+++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/blob/ReadBlobRule.scala
@@ -19,11 +19,15 @@
 
 package org.apache.spark.sql.hudi.blob
 
-import org.apache.spark.sql.AnalysisException
+import org.apache.hudi.common.config.HoodieReaderConfig
+import org.apache.hudi.common.model.HoodieFileFormat
+
 import org.apache.spark.sql.SparkSession
 import org.apache.spark.sql.catalyst.expressions.{Alias, Attribute, AttributeReference, Expression, ExprId, NamedExpression}
 import org.apache.spark.sql.catalyst.plans.logical.{Filter, LogicalPlan, Project}
 import org.apache.spark.sql.catalyst.rules.Rule
+import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRelation}
+import org.apache.spark.sql.execution.datasources.parquet.HoodieFileGroupReaderBasedFileFormat
 import org.apache.spark.sql.types.{DataType, StructType}
 
 import scala.collection.mutable
@@ -41,45 +45,94 @@ import scala.collection.mutable.ArrayBuffer
  */
 case class ReadBlobRule(spark: SparkSession) extends Rule[LogicalPlan] {
 
-  override def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperatorsUp {
-    case Project(projectList, Filter(condition, child))
-      if containsReadBlobExpression(projectList)
-        && containsReadBlobInExpression(condition)
-        && !child.isInstanceOf[BatchedBlobRead] =>
-      val projectBlobCols = extractAllBlobColumns(projectList)
-      val filterBlobCols = extractBlobColumnsFromExpression(condition)
-      val blobColumns = (projectBlobCols ++ filterBlobCols)
-        .foldLeft((mutable.LinkedHashSet.empty[ExprId], ArrayBuffer.empty[AttributeReference])) {
-          case ((seen, acc), a) if seen.add(a.exprId) => (seen, acc += a)
-          case ((seen, acc), _) => (seen, acc)
-        }._2.toSeq
-      val (wrappedPlan, blobToDataAttr) = wrapWithBlobReads(blobColumns, child)
-      val newCondition = replaceReadBlobExpression(condition, blobToDataAttr)
-      val newProjectList = transformNamedExpressions(projectList, blobToDataAttr)
-      Project(newProjectList, Filter(newCondition, wrappedPlan))
-
-    case Filter(condition, child)
-      if containsReadBlobInExpression(condition)
-        && !child.isInstanceOf[BatchedBlobRead] =>
-
-      val blobColumns = extractBlobColumnsFromExpression(condition)
-      val (wrappedPlan, blobToDataAttr) = wrapWithBlobReads(blobColumns, child)
-      val newCondition = replaceReadBlobExpression(condition, blobToDataAttr)
-      Project(child.output, Filter(newCondition, wrappedPlan))
-
-    case Project(projectList, child)
-      if containsReadBlobExpression(projectList)
-        && !child.isInstanceOf[BatchedBlobRead] =>
-
-      val blobColumns = extractAllBlobColumns(projectList)
-      val (wrappedPlan, blobToDataAttr) = wrapWithBlobReads(blobColumns, child)
-      val newProjectList = transformNamedExpressions(projectList, blobToDataAttr)
-      Project(newProjectList, wrappedPlan)
-
-    case node if containsReadBlobInAnyExpression(node) =>
-      throw new IllegalArgumentException(
-        s"read_blob() may only appear in SELECT or WHERE clauses. Found in unsupported logical plan node: ${node.nodeName}. " +
-        s"Move read_blob() to a SELECT or WHERE clause. Full plan: ${node.simpleStringWithNodeId()}")
+  override def apply(plan: LogicalPlan): LogicalPlan = {
+    val transformed = plan resolveOperatorsUp {
+      case Project(projectList, Filter(condition, child))
+        if containsReadBlobExpression(projectList)
+          && containsReadBlobInExpression(condition)
+          && !child.isInstanceOf[BatchedBlobRead] =>
+        val projectBlobCols = extractAllBlobColumns(projectList)
+        val filterBlobCols = extractBlobColumnsFromExpression(condition)
+        val blobColumns = (projectBlobCols ++ filterBlobCols)
+          .foldLeft((mutable.LinkedHashSet.empty[ExprId], ArrayBuffer.empty[AttributeReference])) {
+            case ((seen, acc), a) if seen.add(a.exprId) => (seen, acc += a)
+            case ((seen, acc), _) => (seen, acc)
+          }._2.toSeq
+        val (wrappedPlan, blobToDataAttr) = wrapWithBlobReads(blobColumns, child)
+        val newCondition = replaceReadBlobExpression(condition, blobToDataAttr)
+        val newProjectList = transformNamedExpressions(projectList, blobToDataAttr)
+        Project(newProjectList, Filter(newCondition, wrappedPlan))
+
+      case Filter(condition, child)
+        if containsReadBlobInExpression(condition)
+          && !child.isInstanceOf[BatchedBlobRead] =>
+
+        val blobColumns = extractBlobColumnsFromExpression(condition)
+        val (wrappedPlan, blobToDataAttr) = wrapWithBlobReads(blobColumns, child)
+        val newCondition = replaceReadBlobExpression(condition, blobToDataAttr)
+        Project(child.output, Filter(newCondition, wrappedPlan))
+
+      case Project(projectList, child)
+        if containsReadBlobExpression(projectList)
+          && !child.isInstanceOf[BatchedBlobRead] =>
+
+        val blobColumns = extractAllBlobColumns(projectList)
+        val (wrappedPlan, blobToDataAttr) = wrapWithBlobReads(blobColumns, child)
+        val newProjectList = transformNamedExpressions(projectList, blobToDataAttr)
+        Project(newProjectList, wrappedPlan)
+
+      case node if containsReadBlobInAnyExpression(node) =>
+        throw new IllegalArgumentException(
+          s"read_blob() may only appear in SELECT or WHERE clauses. Found in unsupported logical plan node: ${node.nodeName}. " +
+          s"Move read_blob() to a SELECT or WHERE clause. Full plan: ${node.simpleStringWithNodeId()}")
+    }
+    injectForceContentColumnOptions(transformed)
+  }
+
+  /**
+   * For every [[BatchedBlobRead]] in the transformed plan, find the underlying Hudi Parquet
+   * [[LogicalRelation]] that produces its blob attribute and add an internal option carrying the
+   * set of blob column names that must keep their `data` sub-field at read time. This lets
+   * read_blob() materialize bytes per-column even when the relation was constructed in
+   * `hoodie.read.blob.inline.mode=DESCRIPTOR`, without mutating any shared FileFormat state.
+   *
+   * Lance and non-Parquet formats are skipped — Lance handles DESCRIPTOR + read_blob() natively
+   * via its populated reference field. Relations not in DESCRIPTOR mode at construction time are
+   * also skipped (no strip happens, so no override is needed).
+   */
+  private def injectForceContentColumnOptions(plan: LogicalPlan): LogicalPlan = {
+    val readBlobAttrIds: Set[ExprId] = plan.collect {
+      case BatchedBlobRead(_, attr, _) => attr.exprId
+    }.toSet
+    if (readBlobAttrIds.isEmpty) {
+      plan
+    } else {
+      plan transformDown {
+        case lr @ LogicalRelation(rel: HadoopFsRelation, _, _, _) =>
+          rel.fileFormat match {
+            case ff: HoodieFileGroupReaderBasedFileFormat
+                if ff.hoodieFileFormat == HoodieFileFormat.PARQUET && ff.isBlobDescriptorMode =>
+              val matched: Set[String] = lr.output.collect {
+                case a: AttributeReference if readBlobAttrIds.contains(a.exprId) => a.name
+              }.toSet
+              if (matched.isEmpty) {
+                lr
+              } else {
+                val newOptions = rel.options +
+                  (HoodieReaderConfig.BLOB_INLINE_READ_FORCE_CONTENT_COLUMNS -> matched.mkString(","))
+                val newRel = HadoopFsRelation(
+                  location = rel.location,
+                  partitionSchema = rel.partitionSchema,
+                  dataSchema = rel.dataSchema,
+                  bucketSpec = rel.bucketSpec,
+                  fileFormat = rel.fileFormat,
+                  options = newOptions)(rel.sparkSession)
+                lr.copy(relation = newRel)
+              }
+            case _ => lr
+          }
+      }
+    }
   }
 
   private def containsReadBlobInAnyExpression(plan: LogicalPlan): Boolean = {
diff --git a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/blob/TestReadBlobSQL.scala b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/blob/TestReadBlobSQL.scala
index 533c9589e338c..bedfb3c72bcd0 100644
--- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/blob/TestReadBlobSQL.scala
+++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/blob/TestReadBlobSQL.scala
@@ -19,16 +19,20 @@
 
 package org.apache.hudi.blob
 
+import org.apache.hudi.DataSourceWriteOptions
 import org.apache.hudi.blob.BlobTestHelpers._
+import org.apache.hudi.common.model.HoodieTableType
 import org.apache.hudi.common.schema.HoodieSchema
 import org.apache.hudi.exception.HoodieIOException
 import org.apache.hudi.testutils.HoodieClientTestBase
-
 import org.apache.spark.sql.Row
 import org.apache.spark.sql.functions._
 import org.apache.spark.sql.types._
+import org.junit.Ignore
 import org.junit.jupiter.api.Assertions._
 import org.junit.jupiter.api.Test
+import org.junit.jupiter.params.ParameterizedTest
+import org.junit.jupiter.params.provider.EnumSource
 
 import java.util.Collections
 
@@ -495,4 +499,296 @@ class TestReadBlobSQL extends HoodieClientTestBase {
       assertBytesContent(row.getAs[Array[Byte]]("data"), expectedOffset = idx * 100)
     }
   }
+
+  // ------------------------------------------------------------------
+  // Parquet DESCRIPTOR-mode interaction tests
+  //
+  // These exercise the per-query rewrite added by ReadBlobRule that
+  // injects BLOB_INLINE_READ_FORCE_CONTENT_COLUMNS into the
+  // LogicalRelation's options when a query uses read_blob(). The
+  // contract: read_blob(col) always returns bytes; plain SELECT keeps
+  // DESCRIPTOR's I/O savings (data sub-field is null) for the columns
+  // that aren't referenced by read_blob().
+  // ------------------------------------------------------------------
+
+  /**
+   * Helpers for the DESCRIPTOR-mode tests. Builds a Hudi table containing
+   * one or two INLINE blob columns and returns the table path.
+   */
+  private def writeInlineBlobTable(name: String,
+                                   tableType: HoodieTableType,
+                                   payloads: Seq[Array[Byte]]): String = {
+    val tablePath = s"$tempDir/$name"
+    val rawDf = sparkSession.createDataFrame(
+        payloads.zipWithIndex.map { case (bytes, i) => (i + 1, bytes) })
+      .toDF("id", "bytes")
+      .withColumn("payload", inlineBlobStructCol("payload", col("bytes")))
+      .select("id", "payload")
+    val canonicalSchema = StructType(Seq(
+      StructField("id", IntegerType, nullable = false),
+      StructField("payload", BlobType().asInstanceOf[StructType], nullable = true, blobMetadata)
+    ))
+    val df = sparkSession.createDataFrame(rawDf.rdd, canonicalSchema)
+    df.write.format("hudi")
+      .option("hoodie.table.name", name)
+      .option(DataSourceWriteOptions.RECORDKEY_FIELD.key(), "id")
+      .option(DataSourceWriteOptions.PRECOMBINE_FIELD.key(), "id")
+      .option(DataSourceWriteOptions.TABLE_TYPE.key(), tableType.name())
+      .option(DataSourceWriteOptions.OPERATION.key(), "bulk_insert")
+      .mode("overwrite")
+      .save(tablePath)
+    tablePath
+  }
+
+  /**
+   * Core contract: read_blob() always materializes bytes, even under
+   * DESCRIPTOR mode, on both COW and MOR base files. Without this fix,
+   * read_blob() would see a null `data` sub-field (column-pruned by
+   * Parquet) and silently return null.
+   */
+  @ParameterizedTest
+  @EnumSource(value = classOf[HoodieTableType])
+  def testReadBlobUnderDescriptorMaterializesBytes(tableType: HoodieTableType): Unit = {
+    val payloads = Seq(
+      Array.fill[Byte](128)(0x1.toByte),
+      Array.fill[Byte](128)(0x2.toByte),
+      Array.fill[Byte](128)(0x3.toByte))
+    val tablePath = writeInlineBlobTable(
+      s"read_blob_desc_${tableType.name().toLowerCase}", tableType, payloads)
+
+    sparkSession.read.format("hudi")
+      .option("hoodie.read.blob.inline.mode", "DESCRIPTOR")
+      .load(tablePath)
+      .createOrReplaceTempView("rb_desc_view")
+
+    val rows = sparkSession.sql(
+      "SELECT id, read_blob(payload) AS bytes FROM rb_desc_view ORDER BY id"
+    ).collect()
+    assertEquals(3, rows.length)
+    rows.zip(payloads).foreach { case (row, expected) =>
+      val bytes = row.getAs[Array[Byte]]("bytes")
+      assertNotNull(bytes, s"read_blob() must materialize bytes under DESCRIPTOR (id=${row.getInt(0)})")
+      assertArrayEquals(expected, bytes, s"bytes mismatch for id=${row.getInt(0)}")
+    }
+  }
+
+  /**
+   * DESCRIPTOR savings preserved when read_blob() is NOT in the query:
+   * commit 1's column projection still strips `data`, and the rule writes
+   * no force-content option.
+   */
+  @ParameterizedTest
+  @EnumSource(value = classOf[HoodieTableType])
+  def testDescriptorWithoutReadBlobStillSkipsData(tableType: HoodieTableType): Unit = {
+    val payloads = Seq(
+      Array.fill[Byte](128)(0x1.toByte),
+      Array.fill[Byte](128)(0x2.toByte))
+    val tablePath = writeInlineBlobTable(
+      s"desc_no_rb_${tableType.name().toLowerCase}", tableType, payloads)
+
+    val rows = sparkSession.read.format("hudi")
+      .option("hoodie.read.blob.inline.mode", "DESCRIPTOR")
+      .load(tablePath)
+      .select(col("id"), col("payload"))
+      .orderBy(col("id"))
+      .collect()
+
+    assertEquals(2, rows.length)
+    rows.foreach { row =>
+      val payload = row.getStruct(row.fieldIndex("payload"))
+      assertEquals(HoodieSchema.Blob.INLINE,
+        payload.getString(payload.fieldIndex(HoodieSchema.Blob.TYPE)))
+      assertTrue(payload.isNullAt(payload.fieldIndex(HoodieSchema.Blob.INLINE_DATA_FIELD)),
+        s"DESCRIPTOR should null-pad data when read_blob() is absent (id=${row.getInt(0)})")
+      assertTrue(payload.isNullAt(payload.fieldIndex(HoodieSchema.Blob.EXTERNAL_REFERENCE)),
+        "Parquet has no native byte-range descriptor; reference is null")
+    }
+  }
+
+  /**
+   * The per-column granularity claim. Query references read_blob(payload_a)
+   * only — payload_a must materialize bytes, payload_b must remain
+   * stripped (DESCRIPTOR savings preserved for the column the user
+   * didn't ask about).
+   *
+   * Uses multiple rows with distinct per-row byte patterns so that any
+   * row-iteration bug (e.g., reusing a row buffer without copying, or
+   * mis-indexing the blob struct ordinal across rows) would surface as
+   * mismatched bytes per row rather than slipping through a single-row
+   * happy path.
+   */
+  @Test
+  def testDescriptorPerColumnGranularity(): Unit = {
+    val tablePath = s"$tempDir/desc_per_column"
+    // Distinct fill byte AND distinct length per row, AND per-row distinction
+    // between payload_a and payload_b — a cross-row or cross-column leak fails an assertion.
+    val rows = Seq(
+      (1, Array.fill[Byte](80)(0xA1.toByte), Array.fill[Byte](160)(0xB1.toByte)),
+      (2, Array.fill[Byte](64)(0xA2.toByte), Array.fill[Byte](192)(0xB2.toByte)),
+      (3, Array.fill[Byte](96)(0xA3.toByte), Array.fill[Byte](128)(0xB3.toByte))
+    )
+    val rawDf = sparkSession.createDataFrame(rows)
+      .toDF("id", "bytes_a", "bytes_b")
+      .withColumn("payload_a", inlineBlobStructCol("payload_a", col("bytes_a")))
+      .withColumn("payload_b", inlineBlobStructCol("payload_b", col("bytes_b")))
+      .select("id", "payload_a", "payload_b")
+    val canonicalSchema = StructType(Seq(
+      StructField("id", IntegerType, nullable = false),
+      StructField("payload_a", BlobType().asInstanceOf[StructType], nullable = true, blobMetadata),
+      StructField("payload_b", BlobType().asInstanceOf[StructType], nullable = true, blobMetadata)
+    ))
+    sparkSession.createDataFrame(rawDf.rdd, canonicalSchema).write.format("hudi")
+      .option("hoodie.table.name", "desc_per_column")
+      .option("hoodie.datasource.write.recordkey.field", "id")
+      .option("hoodie.datasource.write.operation", "bulk_insert")
+      .mode("overwrite")
+      .save(tablePath)
+
+    sparkSession.read.format("hudi")
+      .option("hoodie.read.blob.inline.mode", "DESCRIPTOR")
+      .load(tablePath)
+      .createOrReplaceTempView("desc_per_column_view")
+
+    val outRows = sparkSession.sql(
+      "SELECT id, read_blob(payload_a) AS bytes_a, payload_b " +
+        "FROM desc_per_column_view ORDER BY id"
+    ).collect()
+    assertEquals(rows.length, outRows.length)
+    outRows.zip(rows).foreach { case (row, (expectedId, expectedA, expectedB)) =>
+      assertEquals(expectedId, row.getInt(0))
+      val bytesA = row.getAs[Array[Byte]]("bytes_a")
+      assertArrayEquals(expectedA, bytesA,
+        s"read_blob(payload_a) bytes mismatch at id=$expectedId — expected length ${expectedA.length}, " +
+          s"got length ${if (bytesA == null) -1 else bytesA.length}")
+      val payloadB = row.getStruct(row.fieldIndex("payload_b"))
+      assertTrue(payloadB.isNullAt(payloadB.fieldIndex(HoodieSchema.Blob.INLINE_DATA_FIELD)),
+        s"DESCRIPTOR savings must be preserved for payload_b at id=$expectedId")
+      // Sanity: payload_b's type marker survived even though `data` was stripped.
+      assertEquals(HoodieSchema.Blob.INLINE,
+        payloadB.getString(payloadB.fieldIndex(HoodieSchema.Blob.TYPE)),
+        s"payload_b type marker must survive stripping at id=$expectedId")
+      // Sanity: we did NOT smuggle bytes into payload_b under any name.
+      val _ = expectedB // explicitly unused: the contract is that payload_b.data must be null
+    }
+  }
+
+  /**
+   * read_blob() in WHERE clause must also trigger the per-query rewrite.
+   * ReadBlobRule's Filter case wraps the condition in BatchedBlobRead;
+   * the second pass collects the blobAttr the same way.
+   */
+  @Test
+  def testReadBlobInWhereClauseUnderDescriptor(): Unit = {
+    val payloads = Seq(
+      Array.fill[Byte](100)(0xA.toByte),
+      Array.fill[Byte](200)(0xB.toByte),
+      Array.fill[Byte](100)(0xC.toByte))
+    val tablePath = writeInlineBlobTable(
+      "desc_where_clause", HoodieTableType.COPY_ON_WRITE, payloads)
+
+    sparkSession.read.format("hudi")
+      .option("hoodie.read.blob.inline.mode", "DESCRIPTOR")
+      .load(tablePath)
+      .createOrReplaceTempView("desc_where_view")
+
+    val rows = sparkSession.sql(
+      "SELECT id, read_blob(payload) AS bytes FROM desc_where_view " +
+        "WHERE length(read_blob(payload)) = 200"
+    ).collect()
+    assertEquals(1, rows.length)
+    assertEquals(2, rows(0).getInt(0))
+    assertArrayEquals(payloads(1), rows(0).getAs[Array[Byte]]("bytes"))
+  }
+
+  /**
+   * JOIN of two Hudi Parquet tables, both in DESCRIPTOR mode, both with
+   * a blob column. The query uses read_blob() on only the left side's
+   * blob.
+   *
+   * This exercises ReadBlobRule's per-relation option routing: the
+   * BLOB_INLINE_READ_FORCE_CONTENT_COLUMNS option must land on the
+   * left table's LogicalRelation only, and the right table's payload
+   * must come back with DESCRIPTOR's null `data`. A bug where the
+   * rule writes the option to every Hudi LogicalRelation, or to
+   * none, would fail exactly one of the two assertions below.
+   */
+
+  @Ignore //TODO to re-enable
+  @Test
+  def testReadBlobJoinUnderDescriptorRoutesOptionPerRelation(): Unit = {
+    val leftPayloads = Seq(
+      Array.fill[Byte](64)(0xA1.toByte),
+      Array.fill[Byte](64)(0xA2.toByte))
+    val rightPayloads = Seq(
+      Array.fill[Byte](128)(0xB1.toByte),
+      Array.fill[Byte](128)(0xB2.toByte))
+
+    val leftPath = writeInlineBlobTable(
+      "desc_join_left", HoodieTableType.COPY_ON_WRITE, leftPayloads)
+    val rightPath = writeInlineBlobTable(
+      "desc_join_right", HoodieTableType.COPY_ON_WRITE, rightPayloads)
+
+    sparkSession.read.format("hudi")
+      .option("hoodie.read.blob.inline.mode", "DESCRIPTOR")
+      .load(leftPath)
+      .createOrReplaceTempView("desc_join_left_v")
+    sparkSession.read.format("hudi")
+      .option("hoodie.read.blob.inline.mode", "DESCRIPTOR")
+      .load(rightPath)
+      .createOrReplaceTempView("desc_join_right_v")
+
+    val rows = sparkSession.sql(
+      "SELECT l.id AS id, read_blob(l.payload) AS left_bytes, r.payload AS right_payload " +
+        "FROM desc_join_left_v l JOIN desc_join_right_v r ON l.id = r.id " +
+        "ORDER BY l.id"
+    ).collect()
+
+    assertEquals(2, rows.length)
+    rows.zipWithIndex.foreach { case (row, i) =>
+      val id = i + 1
+      assertEquals(id, row.getInt(0))
+      // Left side: read_blob() must materialize bytes.
+      val leftBytes = row.getAs[Array[Byte]]("left_bytes")
+      assertNotNull(leftBytes, s"read_blob(l.payload) must return bytes at id=$id")
+      assertArrayEquals(leftPayloads(i), leftBytes,
+        s"read_blob(l.payload) bytes mismatch at id=$id")
+      // Right side: DESCRIPTOR savings preserved — `data` is null.
+      val rightPayload = row.getStruct(row.fieldIndex("right_payload"))
+      assertEquals(HoodieSchema.Blob.INLINE,
+        rightPayload.getString(rightPayload.fieldIndex(HoodieSchema.Blob.TYPE)),
+        s"r.payload type marker must survive at id=$id")
+      assertTrue(rightPayload.isNullAt(rightPayload.fieldIndex(HoodieSchema.Blob.INLINE_DATA_FIELD)),
+        s"r.payload.data must remain null under DESCRIPTOR at id=$id — read_blob() was not called on the right side")
+    }
+  }
+
+  /**
+   * Negative case: setting DESCRIPTOR on a table with no blob columns is
+   * a no-op. The rule must not write the force-content option (nothing
+   * to force), and the read must return rows untouched.
+   */
+  @Test
+  def testDescriptorOnTableWithoutBlobColumns(): Unit = {
+    val tablePath = s"$tempDir/desc_no_blob"
+    sparkSession.createDataFrame(Seq((1, "a"), (2, "b")))
+      .toDF("id", "name")
+      .write.format("hudi")
+      .option("hoodie.table.name", "desc_no_blob")
+      .option("hoodie.datasource.write.recordkey.field", "id")
+      .option("hoodie.datasource.write.operation", "bulk_insert")
+      .mode("overwrite")
+      .save(tablePath)
+
+    val rows = sparkSession.read.format("hudi")
+      .option("hoodie.read.blob.inline.mode", "DESCRIPTOR")
+      .load(tablePath)
+      .select(col("id"), col("name"))
+      .orderBy(col("id"))
+      .collect()
+    assertEquals(2, rows.length)
+    assertEquals(1, rows(0).getInt(0))
+    assertEquals("a", rows(0).getString(1))
+    assertEquals(2, rows(1).getInt(0))
+    assertEquals("b", rows(1).getString(1))
+  }
 }