Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 47 additions & 3 deletions block/bio.c
Original file line number Diff line number Diff line change
Expand Up @@ -1220,10 +1220,39 @@ static int bio_iov_iter_align_down(struct bio *bio, struct iov_iter *iter,
return 0;
}

/*
 * bio_iov_bvec_aligned - check caller-provided bvecs against @mem_align_mask
 * @bio: bio whose bi_io_vec / bi_iter describe the vectors to check
 * @mem_align_mask: bits that must be clear for a vector to count as aligned
 *
 * Returns true when no checked value has a bit set in @mem_align_mask.
 *
 * With CONFIG_DEBUG_KERNEL every multi-page segment is visited and both its
 * offset and its length are tested. Without it only the offset of the first
 * vector is tested.
 *
 * NOTE(review): the non-debug build does not look at lengths or at later
 * vectors, so the two builds can return different answers for the same bio.
 */
#ifdef CONFIG_DEBUG_KERNEL
static inline bool bio_iov_bvec_aligned(const struct bio *bio,
		unsigned mem_align_mask)
{
	struct bvec_iter iter;
	struct bio_vec bv;

	for_each_mp_bvec(bv, bio->bi_io_vec, iter, bio->bi_iter) {
		/* A stray bit in either the offset or the length is fatal. */
		if (bv.bv_offset & mem_align_mask)
			return false;
		if (bv.bv_len & mem_align_mask)
			return false;
	}

	return true;
}
#else
static inline bool bio_iov_bvec_aligned(const struct bio *bio,
		unsigned mem_align_mask)
{
	/*
	 * The caller owns the vectors and decided their layout; they are only
	 * passed along here. Callers are normally aligned already. The case
	 * to catch is io_uring applying a user chosen offset into a
	 * registered buffer, which can only misalign the first vector.
	 */
	if (mp_bvec_iter_offset(bio->bi_io_vec, bio->bi_iter) & mem_align_mask)
		return false;

	return true;
}
#endif

/**
* bio_iov_iter_get_pages - add user or kernel pages to a bio
* @bio: bio to add pages to
* @iter: iov iterator describing the region to be added
* @mem_align_mask: the mask the source address and length must be aligned to,
* 0 for no requirement
* @len_align_mask: the mask to align the total size to, 0 for any length
*
* This takes either an iterator pointing to user memory, or one pointing to
Expand All @@ -1242,7 +1271,7 @@ static int bio_iov_iter_align_down(struct bio *bio, struct iov_iter *iter,
* is returned only if 0 pages could be pinned.
*/
int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter,
unsigned len_align_mask)
unsigned mem_align_mask, unsigned len_align_mask)
{
iov_iter_extraction_t flags = 0;

Expand All @@ -1251,6 +1280,10 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter,

if (iov_iter_is_bvec(iter)) {
bio_iov_bvec_set(bio, iter);

if (!bio_iov_bvec_aligned(bio, mem_align_mask))
return -EINVAL;

iov_iter_advance(iter, bio->bi_iter.bi_size);
return 0;
}
Expand All @@ -1265,8 +1298,19 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter,

ret = iov_iter_extract_bvecs(iter, bio->bi_io_vec,
BIO_MAX_SIZE - bio->bi_iter.bi_size,
&bio->bi_vcnt, bio->bi_max_vecs, flags);
&bio->bi_vcnt, bio->bi_max_vecs,
mem_align_mask, flags);
if (ret <= 0) {
/*
* A misaligned vector fails the whole I/O. Release any
* pages pinned by earlier iterations before returning
* since this bio won't be submitted to release them.
*/
if (ret == -EINVAL) {
bio_release_pages(bio, false);
bio_clear_flag(bio, BIO_PAGE_PINNED);
bio->bi_vcnt = 0;
}
if (!bio->bi_vcnt)
return ret;
break;
Expand Down Expand Up @@ -1362,7 +1406,7 @@ static int bio_iov_iter_bounce_read(struct bio *bio, struct iov_iter *iter,
ssize_t ret;

ret = iov_iter_extract_bvecs(iter, bio->bi_io_vec + 1, len,
&bio->bi_vcnt, bio->bi_max_vecs - 1, 0);
&bio->bi_vcnt, bio->bi_max_vecs - 1, 0, 0);
if (ret <= 0) {
if (!bio->bi_vcnt) {
folio_put(folio);
Expand Down
2 changes: 1 addition & 1 deletion block/blk-map.c
Original file line number Diff line number Diff line change
Expand Up @@ -274,7 +274,7 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter,
* No alignment requirements on our part to support arbitrary
* passthrough commands.
*/
ret = bio_iov_iter_get_pages(bio, iter, 0);
ret = bio_iov_iter_get_pages(bio, iter, 0, 0);
if (ret)
goto out_put;
ret = blk_rq_append_bio(rq, bio);
Expand Down
4 changes: 2 additions & 2 deletions block/blk-merge.c
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ struct bio *bio_submit_split_bioset(struct bio *bio, unsigned int split_sectors,
struct bio *split = bio_split(bio, split_sectors, GFP_NOIO, bs);

if (IS_ERR(split)) {
bio_endio_status(bio, errno_to_blk_status(PTR_ERR(split)));
bio_endio_errno(bio, PTR_ERR(split));
return NULL;
}

Expand All @@ -142,7 +142,7 @@ EXPORT_SYMBOL_GPL(bio_submit_split_bioset);
static struct bio *bio_submit_split(struct bio *bio, int split_sectors)
{
if (unlikely(split_sectors < 0)) {
bio_endio_status(bio, errno_to_blk_status(split_sectors));
bio_endio_errno(bio, split_sectors);
return NULL;
}

Expand Down
9 changes: 6 additions & 3 deletions block/fops.c
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ static inline int blkdev_iov_iter_get_pages(struct bio *bio,
struct iov_iter *iter, struct block_device *bdev)
{
return bio_iov_iter_get_pages(bio, iter,
bdev_dma_alignment(bdev),
bdev_logical_block_size(bdev) - 1);
}

Expand Down Expand Up @@ -218,7 +219,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,

ret = blkdev_iov_iter_get_pages(bio, iter, bdev);
if (unlikely(ret)) {
bio_endio_status(bio, BLK_STS_IOERR);
bio_endio_errno(bio, ret);
break;
}
if (iocb->ki_flags & IOCB_NOWAIT) {
Expand All @@ -238,8 +239,10 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
}
if (iocb->ki_flags & IOCB_HAS_METADATA) {
ret = bio_integrity_map_iter(bio, iocb->private);
if (unlikely(ret))
goto fail;
if (unlikely(ret)) {
bio_endio_errno(bio, ret);
break;
}
}

if (is_read) {
Expand Down
50 changes: 42 additions & 8 deletions drivers/block/loop.c
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ struct loop_device {

struct file *lo_backing_file;
unsigned int lo_min_dio_size;
unsigned int lo_dio_mem_align;
struct block_device *lo_device;

gfp_t old_gfp_mask;
Expand Down Expand Up @@ -447,26 +448,37 @@ static void loop_reread_partitions(struct loop_device *lo)
__func__, lo->lo_number, lo->lo_file_name, rc);
}

static unsigned int loop_query_min_dio_size(struct loop_device *lo)
static void loop_update_dio_alignment(struct loop_device *lo)
{
struct file *file = lo->lo_backing_file;
struct block_device *sb_bdev = file->f_mapping->host->i_sb->s_bdev;
struct kstat st;

/*
* Use the minimal dio alignment of the file system if provided.
* Use the dio alignment of the file system if provided. dio_offset_align
* is the minimum dio size and offset; dio_mem_align is the buffer memory
* alignment, kept as a mask to become the loop device's dma_alignment in
* direct I/O mode where the buffer is handed to the backing file unchanged.
*/
if (!vfs_getattr(&file->f_path, &st, STATX_DIOALIGN, 0) &&
(st.result_mask & STATX_DIOALIGN))
return st.dio_offset_align;
(st.result_mask & STATX_DIOALIGN)) {
lo->lo_min_dio_size = st.dio_offset_align;
lo->lo_dio_mem_align = st.dio_mem_align - 1;
return;
}

/*
* In a perfect world this wouldn't be needed, but as of Linux 6.13 only
* a handful of file systems support the STATX_DIOALIGN flag.
*/
if (sb_bdev)
return bdev_logical_block_size(sb_bdev);
return SECTOR_SIZE;
if (sb_bdev) {
lo->lo_min_dio_size = bdev_logical_block_size(sb_bdev);
lo->lo_dio_mem_align = bdev_dma_alignment(sb_bdev);
return;
}

lo->lo_min_dio_size = SECTOR_SIZE;
lo->lo_dio_mem_align = SECTOR_SIZE - 1;
}

static inline int is_loop_device(struct file *file)
Expand Down Expand Up @@ -509,7 +521,7 @@ static void loop_assign_backing_file(struct loop_device *lo, struct file *file)
lo->old_gfp_mask & ~(__GFP_IO | __GFP_FS));
if (lo->lo_backing_file->f_flags & O_DIRECT)
lo->lo_flags |= LO_FLAGS_DIRECT_IO;
lo->lo_min_dio_size = loop_query_min_dio_size(lo);
loop_update_dio_alignment(lo);
}

static int loop_check_backing_file(struct file *file)
Expand Down Expand Up @@ -961,6 +973,17 @@ static void loop_update_limits(struct loop_device *lo, struct queue_limits *lim,
lim->logical_block_size = bsize;
lim->physical_block_size = bsize;
lim->io_min = bsize;
/*
* In direct I/O the user pages are handed to the backing file as-is, so
* the backing's DMA alignment requirement applies to them. Advertise it
* so misaligned I/O is rejected at this device's entry instead of being
* dispatched to the backend. Buffered I/O copies through the page cache
* and imposes no such requirement.
*/
if (lo->lo_flags & LO_FLAGS_DIRECT_IO)
lim->dma_alignment = lo->lo_dio_mem_align;
else
lim->dma_alignment = SECTOR_SIZE - 1;
lim->features &= ~(BLK_FEAT_WRITE_CACHE | BLK_FEAT_ROTATIONAL);
if (file->f_op->fsync && !(lo->lo_flags & LO_FLAGS_READ_ONLY))
lim->features |= BLK_FEAT_WRITE_CACHE;
Expand Down Expand Up @@ -1416,6 +1439,7 @@ static int loop_set_dio(struct loop_device *lo, unsigned long arg)
{
bool use_dio = !!arg;
unsigned int memflags;
struct queue_limits lim;

if (lo->lo_state != Lo_bound)
return -ENXIO;
Expand All @@ -1434,6 +1458,16 @@ static int loop_set_dio(struct loop_device *lo, unsigned long arg)
lo->lo_flags |= LO_FLAGS_DIRECT_IO;
else
lo->lo_flags &= ~LO_FLAGS_DIRECT_IO;
/*
* Direct I/O forwards the user pages to the backing file unchanged, so
* track the backing's DMA alignment requirement as the mode is toggled.
*/
lim = queue_limits_start_update(lo->lo_queue);
if (lo->lo_flags & LO_FLAGS_DIRECT_IO)
lim.dma_alignment = lo->lo_dio_mem_align;
else
lim.dma_alignment = SECTOR_SIZE - 1;
queue_limits_commit_update(lo->lo_queue, &lim);
blk_mq_unfreeze_queue(lo->lo_queue, memflags);
return 0;
}
Expand Down
22 changes: 20 additions & 2 deletions drivers/block/zloop.c
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,7 @@ struct zloop_device {
unsigned int nr_conv_zones;
unsigned int max_open_zones;
unsigned int block_size;
unsigned int dio_mem_align;

spinlock_t open_zones_lock;
struct list_head open_zones_lru_list;
Expand Down Expand Up @@ -1035,6 +1036,9 @@ static int zloop_get_block_size(struct zloop_device *zlo,
{
struct block_device *sb_bdev = zone->file->f_mapping->host->i_sb->s_bdev;
struct kstat st;
bool have_dioalign = !vfs_getattr(&zone->file->f_path, &st,
STATX_DIOALIGN, 0) &&
(st.result_mask & STATX_DIOALIGN);

/*
* If the FS block size is lower than or equal to 4K, use that as the
Expand All @@ -1044,14 +1048,25 @@ static int zloop_get_block_size(struct zloop_device *zlo,
*/
if (file_inode(zone->file)->i_sb->s_blocksize <= SZ_4K)
zlo->block_size = file_inode(zone->file)->i_sb->s_blocksize;
else if (!vfs_getattr(&zone->file->f_path, &st, STATX_DIOALIGN, 0) &&
(st.result_mask & STATX_DIOALIGN))
else if (have_dioalign)
zlo->block_size = st.dio_offset_align;
else if (sb_bdev)
zlo->block_size = bdev_physical_block_size(sb_bdev);
else
zlo->block_size = SECTOR_SIZE;

/*
* In direct I/O the request's pages are handed to the backing files
* unchanged, so track their required memory alignment as a mask for
* dma_alignment.
*/
if (have_dioalign)
zlo->dio_mem_align = st.dio_mem_align - 1;
else if (sb_bdev)
zlo->dio_mem_align = bdev_dma_alignment(sb_bdev);
else
zlo->dio_mem_align = SECTOR_SIZE - 1;

if (zlo->zone_capacity & ((zlo->block_size >> SECTOR_SHIFT) - 1)) {
pr_err("Zone capacity is not aligned to block size %u\n",
zlo->block_size);
Expand Down Expand Up @@ -1279,6 +1294,9 @@ static int zloop_ctl_add(struct zloop_options *opts)

lim.physical_block_size = zlo->block_size;
lim.logical_block_size = zlo->block_size;
/* Direct I/O hands the request's pages to the backing files unchanged. */
if (!opts->buffered_io)
lim.dma_alignment = zlo->dio_mem_align;
if (zlo->zone_append)
lim.max_hw_zone_append_sectors = lim.max_hw_sectors;
lim.max_open_zones = zlo->max_open_zones;
Expand Down
1 change: 1 addition & 0 deletions fs/iomap/direct-io.c
Original file line number Diff line number Diff line change
Expand Up @@ -358,6 +358,7 @@ static ssize_t iomap_dio_bio_iter_one(struct iomap_iter *iter,
iomap_max_bio_size(&iter->iomap), alignment);
else
ret = bio_iov_iter_get_pages(bio, dio->submit.iter,
bdev_dma_alignment(bio->bi_bdev),
alignment - 1);
if (unlikely(ret))
goto out_put_bio;
Expand Down
2 changes: 1 addition & 1 deletion include/linux/bio.h
Original file line number Diff line number Diff line change
Expand Up @@ -477,7 +477,7 @@ int bdev_rw_virt(struct block_device *bdev, sector_t sector, void *data,
size_t len, enum req_op op);

int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter,
unsigned len_align_mask);
unsigned mem_align_mask, unsigned len_align_mask);

void bio_iov_bvec_set(struct bio *bio, const struct iov_iter *iter);
void __bio_release_pages(struct bio *bio, bool mark_dirty);
Expand Down
5 changes: 5 additions & 0 deletions include/linux/blkdev.h
Original file line number Diff line number Diff line change
Expand Up @@ -1047,6 +1047,11 @@ extern const char *blk_op_str(enum req_op op);
int blk_status_to_errno(blk_status_t status);
blk_status_t errno_to_blk_status(int errno);

/**
 * bio_endio_errno - pass a bio to bio_endio_status() with an errno-derived status
 * @bio: bio handed to bio_endio_status()
 * @errno: error number, translated with errno_to_blk_status()
 *
 * Shorthand for bio_endio_status(bio, errno_to_blk_status(errno)), so callers
 * holding an errno value do not have to spell out the conversion themselves.
 */
static inline void bio_endio_errno(struct bio *bio, int errno)
{
bio_endio_status(bio, errno_to_blk_status(errno));
}

/* only poll the hardware once, don't continue until a completion was found */
#define BLK_POLL_ONESHOT (1 << 0)
int bio_poll(struct bio *bio, struct io_comp_batch *iob, unsigned int flags);
Expand Down
3 changes: 2 additions & 1 deletion include/linux/uio.h
Original file line number Diff line number Diff line change
Expand Up @@ -391,7 +391,8 @@ ssize_t iov_iter_extract_pages(struct iov_iter *i, struct page ***pages,
size_t *offset0);
ssize_t iov_iter_extract_bvecs(struct iov_iter *iter, struct bio_vec *bv,
size_t max_size, unsigned short *nr_vecs,
unsigned short max_vecs, iov_iter_extraction_t extraction_flags);
unsigned short max_vecs, unsigned mem_align_mask,
iov_iter_extraction_t extraction_flags);

/**
* iov_iter_extract_will_pin - Indicate how pages from the iterator will be retained
Expand Down
9 changes: 8 additions & 1 deletion lib/iov_iter.c
Original file line number Diff line number Diff line change
Expand Up @@ -1886,6 +1886,8 @@ static unsigned int get_contig_folio_len(struct page **pages,
* @max_size: maximum size to extract from @iter
* @nr_vecs: number of vectors in @bv (on input and output)
* @max_vecs: maximum vectors in @bv, including those filled before calling
* @mem_align_mask: reject with -EINVAL if the source address or length is not
* aligned to this mask
* @extraction_flags: flags to qualify request
*
* Like iov_iter_extract_pages(), but returns physically contiguous ranges
Expand All @@ -1897,14 +1899,19 @@ static unsigned int get_contig_folio_len(struct page **pages,
*/
ssize_t iov_iter_extract_bvecs(struct iov_iter *iter, struct bio_vec *bv,
size_t max_size, unsigned short *nr_vecs,
unsigned short max_vecs, iov_iter_extraction_t extraction_flags)
unsigned short max_vecs, unsigned mem_align_mask,
iov_iter_extraction_t extraction_flags)
{
unsigned long start = (unsigned long)iter_iov_addr(iter);
unsigned short entries_left = max_vecs - *nr_vecs;
unsigned short nr_pages, i = 0;
size_t left, offset, len;
struct page **pages;
ssize_t size;

if ((start | iter_iov_len(iter)) & mem_align_mask)
return -EINVAL;

/*
* Move page array up in the allocated memory for the bio vecs as far as
* possible so that we can start filling biovecs from the beginning
Expand Down
Loading