From 537d2cb45deb3df6802a2fe00c770e0bae380da9 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 22 Jun 2026 10:42:36 -0700 Subject: [PATCH 1/6] block: introduce bio_endio_errno helper No functional change; purely introducing a convenience function. Signed-off-by: Keith Busch --- block/blk-merge.c | 4 ++-- include/linux/blkdev.h | 5 +++++ 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/block/blk-merge.c b/block/blk-merge.c index ab1161ca69f1..c93170f34097 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -122,7 +122,7 @@ struct bio *bio_submit_split_bioset(struct bio *bio, unsigned int split_sectors, struct bio *split = bio_split(bio, split_sectors, GFP_NOIO, bs); if (IS_ERR(split)) { - bio_endio_status(bio, errno_to_blk_status(PTR_ERR(split))); + bio_endio_errno(bio, PTR_ERR(split)); return NULL; } @@ -142,7 +142,7 @@ EXPORT_SYMBOL_GPL(bio_submit_split_bioset); static struct bio *bio_submit_split(struct bio *bio, int split_sectors) { if (unlikely(split_sectors < 0)) { - bio_endio_status(bio, errno_to_blk_status(split_sectors)); + bio_endio_errno(bio, split_sectors); return NULL; } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 5070851cf924..10da43d6e7c0 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1047,6 +1047,11 @@ extern const char *blk_op_str(enum req_op op); int blk_status_to_errno(blk_status_t status); blk_status_t errno_to_blk_status(int errno); +static inline void bio_endio_errno(struct bio *bio, int errno) +{ + bio_endio_status(bio, errno_to_blk_status(errno)); +} + /* only poll the hardware once, don't continue until a completion was found */ #define BLK_POLL_ONESHOT (1 << 0) int bio_poll(struct bio *bio, struct io_comp_batch *iob, unsigned int flags); From 0fd959d4516683056f505082d7dfa4b1a3aac87a Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 22 Jun 2026 10:42:37 -0700 Subject: [PATCH 2/6] block: report the actual status Rather than assume EIO, set the actual reported status for 
user space informational purposes. Signed-off-by: Keith Busch --- block/fops.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/fops.c b/block/fops.c index 15783a6180de..f237d6cab897 100644 --- a/block/fops.c +++ b/block/fops.c @@ -218,7 +218,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, ret = blkdev_iov_iter_get_pages(bio, iter, bdev); if (unlikely(ret)) { - bio_endio_status(bio, BLK_STS_IOERR); + bio_endio_errno(bio, ret); break; } if (iocb->ki_flags & IOCB_NOWAIT) { From 8ec16923272c9619bc8263595a82f6fdfd1b8b84 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 22 Jun 2026 10:42:38 -0700 Subject: [PATCH 3/6] block: fix dio leak on metadata mapping error A failed integrity mapping holds a dio reference, so we need to go through the full bio ending in case there were previously submitted bios in the sequence. Fixes: 2729a60bbfb92 ("block: don't silently ignore metadata for sync read/write") Signed-off-by: Keith Busch Reviewed-by: Christoph Hellwig --- block/fops.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/block/fops.c b/block/fops.c index f237d6cab897..b5c320da2812 100644 --- a/block/fops.c +++ b/block/fops.c @@ -238,8 +238,10 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, } if (iocb->ki_flags & IOCB_HAS_METADATA) { ret = bio_integrity_map_iter(bio, iocb->private); - if (unlikely(ret)) - goto fail; + if (unlikely(ret)) { + bio_endio_errno(bio, ret); + break; + } } if (is_read) { From 34d3d5133cd2a480e9bdb90259b05f9f066a24e6 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 22 Jun 2026 10:42:39 -0700 Subject: [PATCH 4/6] loop: set dma_alignment from the backing file for direct I/O Direct I/O user pages are forwarded to the backing file unchanged, so the backing's DMA alignment requirement applies to them. 
Track the backing's dio_mem_align and advertise it as the loop device's dma_alignment so we advertise proper limits and misaligned I/O is rejected here instead of being dispatched to the backend. Signed-off-by: Keith Busch --- drivers/block/loop.c | 50 +++++++++++++++++++++++++++++++++++++------- 1 file changed, 42 insertions(+), 8 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 310de0463beb..7114f80ab162 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -54,6 +54,7 @@ struct loop_device { struct file *lo_backing_file; unsigned int lo_min_dio_size; + unsigned int lo_dio_mem_align; struct block_device *lo_device; gfp_t old_gfp_mask; @@ -447,26 +448,37 @@ static void loop_reread_partitions(struct loop_device *lo) __func__, lo->lo_number, lo->lo_file_name, rc); } -static unsigned int loop_query_min_dio_size(struct loop_device *lo) +static void loop_update_dio_alignment(struct loop_device *lo) { struct file *file = lo->lo_backing_file; struct block_device *sb_bdev = file->f_mapping->host->i_sb->s_bdev; struct kstat st; /* - * Use the minimal dio alignment of the file system if provided. + * Use the dio alignment of the file system if provided. dio_offset_align + * is the minimum dio size and offset; dio_mem_align is the buffer memory + * alignment, kept as a mask to become the loop device's dma_alignment in + * direct I/O mode where the buffer is handed to the backing file unchanged. */ if (!vfs_getattr(&file->f_path, &st, STATX_DIOALIGN, 0) && - (st.result_mask & STATX_DIOALIGN)) - return st.dio_offset_align; + (st.result_mask & STATX_DIOALIGN)) { + lo->lo_min_dio_size = st.dio_offset_align; + lo->lo_dio_mem_align = st.dio_mem_align - 1; + return; + } /* * In a perfect world this wouldn't be needed, but as of Linux 6.13 only * a handful of file systems support the STATX_DIOALIGN flag. 
*/ - if (sb_bdev) - return bdev_logical_block_size(sb_bdev); - return SECTOR_SIZE; + if (sb_bdev) { + lo->lo_min_dio_size = bdev_logical_block_size(sb_bdev); + lo->lo_dio_mem_align = bdev_dma_alignment(sb_bdev); + return; + } + + lo->lo_min_dio_size = SECTOR_SIZE; + lo->lo_dio_mem_align = SECTOR_SIZE - 1; } static inline int is_loop_device(struct file *file) @@ -509,7 +521,7 @@ static void loop_assign_backing_file(struct loop_device *lo, struct file *file) lo->old_gfp_mask & ~(__GFP_IO | __GFP_FS)); if (lo->lo_backing_file->f_flags & O_DIRECT) lo->lo_flags |= LO_FLAGS_DIRECT_IO; - lo->lo_min_dio_size = loop_query_min_dio_size(lo); + loop_update_dio_alignment(lo); } static int loop_check_backing_file(struct file *file) @@ -961,6 +973,17 @@ static void loop_update_limits(struct loop_device *lo, struct queue_limits *lim, lim->logical_block_size = bsize; lim->physical_block_size = bsize; lim->io_min = bsize; + /* + * In direct I/O the user pages are handed to the backing file as-is, so + * the backing's DMA alignment requirement applies to them. Advertise it + * so misaligned I/O is rejected at this device's entry instead of being + * dispatched to the backend. Buffered I/O copies through the page cache + * and imposes no such requirement. 
+ */ + if (lo->lo_flags & LO_FLAGS_DIRECT_IO) + lim->dma_alignment = lo->lo_dio_mem_align; + else + lim->dma_alignment = SECTOR_SIZE - 1; lim->features &= ~(BLK_FEAT_WRITE_CACHE | BLK_FEAT_ROTATIONAL); if (file->f_op->fsync && !(lo->lo_flags & LO_FLAGS_READ_ONLY)) lim->features |= BLK_FEAT_WRITE_CACHE; @@ -1416,6 +1439,7 @@ static int loop_set_dio(struct loop_device *lo, unsigned long arg) { bool use_dio = !!arg; unsigned int memflags; + struct queue_limits lim; if (lo->lo_state != Lo_bound) return -ENXIO; @@ -1434,6 +1458,16 @@ static int loop_set_dio(struct loop_device *lo, unsigned long arg) lo->lo_flags |= LO_FLAGS_DIRECT_IO; else lo->lo_flags &= ~LO_FLAGS_DIRECT_IO; + /* + * Direct I/O forwards the user pages to the backing file unchanged, so + * track the backing's DMA alignment requirement as the mode is toggled. + */ + lim = queue_limits_start_update(lo->lo_queue); + if (lo->lo_flags & LO_FLAGS_DIRECT_IO) + lim.dma_alignment = lo->lo_dio_mem_align; + else + lim.dma_alignment = SECTOR_SIZE - 1; + queue_limits_commit_update(lo->lo_queue, &lim); blk_mq_unfreeze_queue(lo->lo_queue, memflags); return 0; } From ffed88fa07fe90153570dbd2596d35b773dc7ec8 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 22 Jun 2026 10:42:40 -0700 Subject: [PATCH 5/6] zloop: set dma_alignment from the backing files for direct I/O Direct I/O requests use pages handed to the backing files unchanged, so the backing's DMA alignment requirement applies. Track dio_mem_align and advertise it as the device's dma_alignment so we communicate proper limits and misaligned I/O is rejected here instead of reaching the backend. 
Signed-off-by: Keith Busch --- drivers/block/zloop.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/drivers/block/zloop.c b/drivers/block/zloop.c index 55eeb6aac0ea..1149b817b5bc 100644 --- a/drivers/block/zloop.c +++ b/drivers/block/zloop.c @@ -144,6 +144,7 @@ struct zloop_device { unsigned int nr_conv_zones; unsigned int max_open_zones; unsigned int block_size; + unsigned int dio_mem_align; spinlock_t open_zones_lock; struct list_head open_zones_lru_list; @@ -1035,6 +1036,9 @@ static int zloop_get_block_size(struct zloop_device *zlo, { struct block_device *sb_bdev = zone->file->f_mapping->host->i_sb->s_bdev; struct kstat st; + bool have_dioalign = !vfs_getattr(&zone->file->f_path, &st, + STATX_DIOALIGN, 0) && + (st.result_mask & STATX_DIOALIGN); /* * If the FS block size is lower than or equal to 4K, use that as the @@ -1044,14 +1048,25 @@ static int zloop_get_block_size(struct zloop_device *zlo, */ if (file_inode(zone->file)->i_sb->s_blocksize <= SZ_4K) zlo->block_size = file_inode(zone->file)->i_sb->s_blocksize; - else if (!vfs_getattr(&zone->file->f_path, &st, STATX_DIOALIGN, 0) && - (st.result_mask & STATX_DIOALIGN)) + else if (have_dioalign) zlo->block_size = st.dio_offset_align; else if (sb_bdev) zlo->block_size = bdev_physical_block_size(sb_bdev); else zlo->block_size = SECTOR_SIZE; + /* + * In direct I/O the request's pages are handed to the backing files + * unchanged, so track their required memory alignment as a mask for + * dma_alignment. 
+ */ + if (have_dioalign) + zlo->dio_mem_align = st.dio_mem_align - 1; + else if (sb_bdev) + zlo->dio_mem_align = bdev_dma_alignment(sb_bdev); + else + zlo->dio_mem_align = SECTOR_SIZE - 1; + if (zlo->zone_capacity & ((zlo->block_size >> SECTOR_SHIFT) - 1)) { pr_err("Zone capacity is not aligned to block size %u\n", zlo->block_size); @@ -1279,6 +1294,9 @@ static int zloop_ctl_add(struct zloop_options *opts) lim.physical_block_size = zlo->block_size; lim.logical_block_size = zlo->block_size; + /* Direct I/O hands the request's pages to the backing files unchanged. */ + if (!opts->buffered_io) + lim.dma_alignment = zlo->dio_mem_align; if (zlo->zone_append) lim.max_hw_zone_append_sectors = lim.max_hw_sectors; lim.max_open_zones = zlo->max_open_zones; From f6aac906f1baf35440d95eb26e7f7214e290da87 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 22 Jun 2026 10:42:41 -0700 Subject: [PATCH 6/6] block: validate user space vectors during extraction The bio-based drivers don't necessarily check the alignment split, and stacking block drivers don't always handle a misalignment detected after submitting the bio. Validate user vectors against the device's dma_alignment as the bio is built from the iov_iter, rejecting misaligned vectors early with -EINVAL. 
Cc: stable@vger.kernel.org Fixes: 5ff3f74e145a ("block: simplify direct io validity check") Fixes: 7eac33186957 ("iomap: simplify direct io validity check") Signed-off-by: Keith Busch --- block/bio.c | 50 +++++++++++++++++++++++++++++++++++++++++--- block/blk-map.c | 2 +- block/fops.c | 1 + fs/iomap/direct-io.c | 1 + include/linux/bio.h | 2 +- include/linux/uio.h | 3 ++- lib/iov_iter.c | 9 +++++++- 7 files changed, 61 insertions(+), 7 deletions(-) diff --git a/block/bio.c b/block/bio.c index 811a96796202..96fb523ee61a 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1220,10 +1220,39 @@ static int bio_iov_iter_align_down(struct bio *bio, struct iov_iter *iter, return 0; } +#ifdef CONFIG_DEBUG_KERNEL +static inline bool bio_iov_bvec_aligned(const struct bio *bio, + unsigned mem_align_mask) +{ + struct bvec_iter iter; + struct bio_vec bv; + + for_each_mp_bvec(bv, bio->bi_io_vec, iter, bio->bi_iter) + if ((bv.bv_offset | bv.bv_len) & mem_align_mask) + return false; + return true; +} +#else +static inline bool bio_iov_bvec_aligned(const struct bio *bio, + unsigned mem_align_mask) +{ + /* + * The vectors are owned and laid out by the caller; we only forward + * them. Most callers are already aligned, but io_uring can place a + * user chosen offset through a registered buffer, where only the first + * vector may be unaligned. + */ + return !(mp_bvec_iter_offset(bio->bi_io_vec, bio->bi_iter) & + mem_align_mask); +} +#endif + /** * bio_iov_iter_get_pages - add user or kernel pages to a bio * @bio: bio to add pages to * @iter: iov iterator describing the region to be added + * @mem_align_mask: the mask the source address and length must be aligned to, + * 0 for no requirement * @len_align_mask: the mask to align the total size to, 0 for any length * * This takes either an iterator pointing to user memory, or one pointing to @@ -1242,7 +1271,7 @@ static int bio_iov_iter_align_down(struct bio *bio, struct iov_iter *iter, * is returned only if 0 pages could be pinned. 
*/ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter, - unsigned len_align_mask) + unsigned mem_align_mask, unsigned len_align_mask) { iov_iter_extraction_t flags = 0; @@ -1251,6 +1280,10 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter, if (iov_iter_is_bvec(iter)) { bio_iov_bvec_set(bio, iter); + + if (!bio_iov_bvec_aligned(bio, mem_align_mask)) + return -EINVAL; + iov_iter_advance(iter, bio->bi_iter.bi_size); return 0; } @@ -1265,8 +1298,19 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter, ret = iov_iter_extract_bvecs(iter, bio->bi_io_vec, BIO_MAX_SIZE - bio->bi_iter.bi_size, - &bio->bi_vcnt, bio->bi_max_vecs, flags); + &bio->bi_vcnt, bio->bi_max_vecs, + mem_align_mask, flags); if (ret <= 0) { + /* + * A misaligned vector fails the whole I/O. Release any + * pages pinned by earlier iterations before returning + * since this bio won't be submitted to release them. + */ + if (ret == -EINVAL) { + bio_release_pages(bio, false); + bio_clear_flag(bio, BIO_PAGE_PINNED); + bio->bi_vcnt = 0; + } if (!bio->bi_vcnt) return ret; break; @@ -1362,7 +1406,7 @@ static int bio_iov_iter_bounce_read(struct bio *bio, struct iov_iter *iter, ssize_t ret; ret = iov_iter_extract_bvecs(iter, bio->bi_io_vec + 1, len, - &bio->bi_vcnt, bio->bi_max_vecs - 1, 0); + &bio->bi_vcnt, bio->bi_max_vecs - 1, 0, 0); if (ret <= 0) { if (!bio->bi_vcnt) { folio_put(folio); diff --git a/block/blk-map.c b/block/blk-map.c index 768549f19f97..c9535efe1a91 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -274,7 +274,7 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter, * No alignment requirements on our part to support arbitrary * passthrough commands. 
*/ - ret = bio_iov_iter_get_pages(bio, iter, 0); + ret = bio_iov_iter_get_pages(bio, iter, 0, 0); if (ret) goto out_put; ret = blk_rq_append_bio(rq, bio); diff --git a/block/fops.c b/block/fops.c index b5c320da2812..84eeabd97e1f 100644 --- a/block/fops.c +++ b/block/fops.c @@ -47,6 +47,7 @@ static inline int blkdev_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter, struct block_device *bdev) { return bio_iov_iter_get_pages(bio, iter, + bdev_dma_alignment(bdev), bdev_logical_block_size(bdev) - 1); } diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index b485e3b191da..ff458aa12ae2 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -358,6 +358,7 @@ static ssize_t iomap_dio_bio_iter_one(struct iomap_iter *iter, iomap_max_bio_size(&iter->iomap), alignment); else ret = bio_iov_iter_get_pages(bio, dio->submit.iter, + bdev_dma_alignment(bio->bi_bdev), alignment - 1); if (unlikely(ret)) goto out_put_bio; diff --git a/include/linux/bio.h b/include/linux/bio.h index 8f33f717b14f..ce34ea49ef35 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -477,7 +477,7 @@ int bdev_rw_virt(struct block_device *bdev, sector_t sector, void *data, size_t len, enum req_op op); int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter, - unsigned len_align_mask); + unsigned mem_align_mask, unsigned len_align_mask); void bio_iov_bvec_set(struct bio *bio, const struct iov_iter *iter); void __bio_release_pages(struct bio *bio, bool mark_dirty); diff --git a/include/linux/uio.h b/include/linux/uio.h index a9bc5b3067e3..653dee76c0b3 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -391,7 +391,8 @@ ssize_t iov_iter_extract_pages(struct iov_iter *i, struct page ***pages, size_t *offset0); ssize_t iov_iter_extract_bvecs(struct iov_iter *iter, struct bio_vec *bv, size_t max_size, unsigned short *nr_vecs, - unsigned short max_vecs, iov_iter_extraction_t extraction_flags); + unsigned short max_vecs, unsigned mem_align_mask, + 
iov_iter_extraction_t extraction_flags); /** * iov_iter_extract_will_pin - Indicate how pages from the iterator will be retained diff --git a/lib/iov_iter.c b/lib/iov_iter.c index 273919b16161..8d5ca3e38522 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -1886,6 +1886,8 @@ static unsigned int get_contig_folio_len(struct page **pages, * @max_size: maximum size to extract from @iter * @nr_vecs: number of vectors in @bv (on in and output) * @max_vecs: maximum vectors in @bv, including those filled before calling + * @mem_align_mask: reject with -EINVAL if the source address or length is not + * aligned to this mask * @extraction_flags: flags to qualify request * * Like iov_iter_extract_pages(), but returns physically contiguous ranges @@ -1897,14 +1899,19 @@ static unsigned int get_contig_folio_len(struct page **pages, */ ssize_t iov_iter_extract_bvecs(struct iov_iter *iter, struct bio_vec *bv, size_t max_size, unsigned short *nr_vecs, - unsigned short max_vecs, iov_iter_extraction_t extraction_flags) + unsigned short max_vecs, unsigned mem_align_mask, + iov_iter_extraction_t extraction_flags) { + unsigned long start = (unsigned long)iter_iov_addr(iter); unsigned short entries_left = max_vecs - *nr_vecs; unsigned short nr_pages, i = 0; size_t left, offset, len; struct page **pages; ssize_t size; + if ((start | iter_iov_len(iter)) & mem_align_mask) + return -EINVAL; + /* * Move page array up in the allocated memory for the bio vecs as far as * possible so that we can start filling biovecs from the beginning