From e3850c98b5bdc848623d69be2b1469e3f59d7a8e Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 16 Jun 2026 08:05:53 -0700 Subject: [PATCH 1/2] dm-io: clone the source bio instead of copying its biovec For DM_IO_BIO requests, do_region() built each destination bio by walking the source bio's biovec and re-adding the pages one at a time, tracking the remaining transfer in sectors. The vector lengths are byte granular and need not be sector aligned (e.g. a misaligned O_DIRECT buffer split across pages), so the sector-based accounting could lose a sub-sector fragment: to_sector() truncated the remainder and the outer loop spun forever submitting empty bios, hanging the I/O. There is no need to rebuild the biovec at all. The destination reads into (or writes from) exactly the same pages as the source bio, so the bio can simply clone the source's biovec with bio_alloc_clone() and remap it to the target device. The clone inherits the source's iterator and alignment, and the block layer splits it to the target's limits on submission, so the whole region maps to a single cloned bio with no manual page copying or sector accounting. This removes the per-page copy path (and its open-coded bvec dpages helpers) for bio-backed I/O and fixes the hang on misaligned direct I/O to a dm-mirror device. Page-list, vma and kmem sources keep the existing copy path. Fixes: 7eac33186957 ("iomap: simplify direct io validity check") Fixes: 5ff3f74e145a ("block: simplify direct io validity check") Reported-by: Dr. David Alan Gilbert Reported-by: Vjaceslavs Klimovs Signed-off-by: Keith Busch --- drivers/md/dm-io.c | 67 +++++++++++++++++----------------------------- 1 file changed, 24 insertions(+), 43 deletions(-) diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index 1db565b376200..28adfeb58f240 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c @@ -170,12 +170,11 @@ struct dpages { struct page **p, unsigned long *len, unsigned int *offset); void (*next_page)(struct dpages *dp); - union { - unsigned int context_u; - struct bvec_iter context_bi; - }; + unsigned int context_u; void *context_ptr; + struct bio *orig_bio; + void *vma_invalidate_address; unsigned long vma_invalidate_size; }; @@ -210,44 +209,6 @@ static void list_dp_init(struct dpages *dp, struct page_list *pl, unsigned int o dp->context_ptr = pl; } -/* - * Functions for getting the pages from a bvec. - */ -static void bio_get_page(struct dpages *dp, struct page **p, - unsigned long *len, unsigned int *offset) -{ - struct bio_vec bvec = bvec_iter_bvec((struct bio_vec *)dp->context_ptr, - dp->context_bi); - - *p = bvec.bv_page; - *len = bvec.bv_len; - *offset = bvec.bv_offset; - - /* avoid figuring it out again in bio_next_page() */ - dp->context_bi.bi_sector = (sector_t)bvec.bv_len; -} - -static void bio_next_page(struct dpages *dp) -{ - unsigned int len = (unsigned int)dp->context_bi.bi_sector; - - bvec_iter_advance((struct bio_vec *)dp->context_ptr, - &dp->context_bi, len); -} - -static void bio_dp_init(struct dpages *dp, struct bio *bio) -{ - dp->get_page = bio_get_page; - dp->next_page = bio_next_page; - - /* - * We just use bvec iterator to retrieve pages, so it is ok to - * access the bvec table directly here - */ - dp->context_ptr = bio->bi_io_vec; - dp->context_bi = bio->bi_iter; -} - /* * Functions for getting the pages from a VMA. */ @@ -332,6 +293,21 @@ static void do_region(const blk_opf_t opf, unsigned int region, return; } + if (dp->orig_bio) { + bio = bio_alloc_clone(where->bdev, dp->orig_bio, GFP_NOIO, + &io->client->bios); + bio->bi_iter.bi_sector = where->sector; + bio->bi_iter.bi_size = where->count << SECTOR_SHIFT; + bio->bi_opf = opf; + bio->bi_end_io = endio; + bio->bi_ioprio = ioprio; + store_io_and_region_in_bio(bio, io, region); + + atomic_inc(&io->count); + submit_bio(bio); + return; + } + /* * where->count may be zero if op holds a flush and we need to * send a zero-sized flush. @@ -468,6 +444,7 @@ static int dp_init(struct dm_io_request *io_req, struct dpages *dp, dp->vma_invalidate_address = NULL; dp->vma_invalidate_size = 0; + dp->orig_bio = NULL; switch (io_req->mem.type) { case DM_IO_PAGE_LIST: @@ -475,7 +452,11 @@ static int dp_init(struct dm_io_request *io_req, struct dpages *dp, break; case DM_IO_BIO: - bio_dp_init(dp, io_req->mem.ptr.bio); + /* + * The destination bios clone this bio's biovec directly, so + * there are no per-page accessors to set up here. + */ + dp->orig_bio = io_req->mem.ptr.bio; break; case DM_IO_VMA: From 5c978fc075e3f37258303469d7f77e1696978b31 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 16 Jun 2026 09:58:50 -0600 Subject: [PATCH 2/2] dm-raid1: don't fail the mirror for invalid I/O errors BLK_STS_INVAL indicates the I/O request itself was invalid (for example a misaligned direct I/O), not that the device has failed. dm-raid1 treated any read or write completion error as a device failure: it failed the mirror leg, retried on the alternatives - which fail identically - and eventually returned EIO while spuriously degrading the array. Since commit 5ff3f74e145a ("block: simplify direct io validity check") the direct I/O path no longer rejects misaligned buffers up front, so an invalid bio now reaches the lower block layers, which fail it with BLK_STS_INVAL. dm-io collapses the block status into a per-region error bit before invoking the completion callback, so record BLK_STS_INVAL on the originating bio and have the dm-raid1 read, write and end_io paths propagate it instead of failing the device. This mirrors the raid1/raid10 fix in commit f7b24c7b41f23 ("md/raid1,raid10: don't fail devices for invalid IO errors") for the device-mapper mirror target. Fixes: 7eac33186957 ("iomap: simplify direct io validity check") Fixes: 5ff3f74e145a ("block: simplify direct io validity check") Reported-by: Dr. David Alan Gilbert Reported-by: Vjaceslavs Klimovs Signed-off-by: Keith Busch --- drivers/md/dm-io.c | 14 +++++++++++++- drivers/md/dm-raid1.c | 28 +++++++++++++++++++++++++++- 2 files changed, 40 insertions(+), 2 deletions(-) diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index 28adfeb58f240..f382e9f9be059 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c @@ -37,6 +37,7 @@ struct io { struct dm_io_client *client; io_notify_fn callback; void *context; + struct bio *orig_bio; void *vma_invalidate_address; unsigned long vma_invalidate_size; } __aligned(DM_IO_MAX_REGIONS); @@ -132,8 +133,18 @@ static void complete_io(struct io *io) static void dec_count(struct io *io, unsigned int region, blk_status_t error) { - if (error) + if (error) { set_bit(region, &io->error_bits); + /* + * BLK_STS_INVAL means the bio was not valid for the underlying + * device (e.g. a misaligned direct I/O), which is a caller error + * rather than a device failure. Record it on the original bio so + * bio-based targets can propagate it instead of treating it as a + * media error and failing the device. + */ + if (error == BLK_STS_INVAL && io->orig_bio) + io->orig_bio->bi_status = error; + } if (atomic_dec_and_test(&io->count)) complete_io(io); @@ -398,6 +409,7 @@ static void async_io(struct dm_io_client *client, unsigned int num_regions, io->client = client; io->callback = fn; io->context = context; + io->orig_bio = dp->orig_bio; io->vma_invalidate_address = dp->vma_invalidate_address; io->vma_invalidate_size = dp->vma_invalidate_size; diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index de5c00704e69c..022ad791c2957 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -524,6 +524,17 @@ static void read_callback(unsigned long error, void *context) return; } + /* + * BLK_STS_INVAL means the bio was not valid for the underlying device, + * e.g. a misaligned direct I/O. That is a caller error, not a device + * failure, so propagate it rather than failing the mirror and retrying + * on the other legs, which would fail the same way. + */ + if (bio->bi_status == BLK_STS_INVAL) { + bio_endio(bio); + return; + } + fail_mirror(m, DM_RAID1_READ_ERROR); if (likely(default_ok(m)) || mirror_available(m->ms, bio)) { @@ -622,6 +633,16 @@ static void write_callback(unsigned long error, void *context) return; } + /* + * BLK_STS_INVAL means the bio was not valid for the underlying device, + * e.g. a misaligned direct I/O. Propagate the error without degrading + * the array. + */ + if (bio->bi_status == BLK_STS_INVAL) { + bio_endio(bio); + return; + } + /* * If the bio is discard, return an error, but do not * degrade the array. @@ -1262,7 +1283,12 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, return DM_ENDIO_DONE; } - if (*error == BLK_STS_NOTSUPP) + /* + * BLK_STS_INVAL means the bio was not valid for the underlying device, + * e.g. a misaligned direct I/O. Propagate it rather than failing the + * mirror and retrying, which would fail the same way on every leg. + */ + if (*error == BLK_STS_NOTSUPP || *error == BLK_STS_INVAL) goto out; if (bio->bi_opf & REQ_RAHEAD)