diff mbox series

[4/8] iomap: split bios to zone append limits in the submission handlers

Message ID 20241211085420.1380396-5-hch@lst.de (mailing list archive)
State New
Headers show
Series [1/8] iomap: allow the file system to submit the writeback bios | expand

Commit Message

Christoph Hellwig Dec. 11, 2024, 8:53 a.m. UTC
Provide helpers for file systems to split bios in the direct I/O and
writeback I/O submission handlers.

This follows btrfs' lead and doesn't try to build bios to hardware limits
for zone append commands, but instead builds them as normal unconstrained
bios and splits them to the hardware limits in the I/O submission handler.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/iomap/Makefile      |  1 +
 fs/iomap/buffered-io.c | 43 ++++++++++++++-----------
 fs/iomap/ioend.c       | 73 ++++++++++++++++++++++++++++++++++++++++++
 include/linux/iomap.h  |  9 ++++++
 4 files changed, 108 insertions(+), 18 deletions(-)
 create mode 100644 fs/iomap/ioend.c

Comments

Brian Foster Dec. 12, 2024, 1:28 p.m. UTC | #1
On Wed, Dec 11, 2024 at 09:53:44AM +0100, Christoph Hellwig wrote:
> Provide helpers for file systems to split bios in the direct I/O and
> writeback I/O submission handlers.
> 
> This Follows btrfs' lead and don't try to build bios to hardware limits
> for zone append commands, but instead build them as normal unconstrained
> bios and split them to the hardware limits in the I/O submission handler.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  fs/iomap/Makefile      |  1 +
>  fs/iomap/buffered-io.c | 43 ++++++++++++++-----------
>  fs/iomap/ioend.c       | 73 ++++++++++++++++++++++++++++++++++++++++++
>  include/linux/iomap.h  |  9 ++++++
>  4 files changed, 108 insertions(+), 18 deletions(-)
>  create mode 100644 fs/iomap/ioend.c
> 
...
> diff --git a/fs/iomap/ioend.c b/fs/iomap/ioend.c
> new file mode 100644
> index 000000000000..f3d98121c593
> --- /dev/null
> +++ b/fs/iomap/ioend.c
...

It might be useful to add a small comment here that points out that this
splits from the front of the ioend (i.e. akin to bio_split()), documents
the params, and maybe mentions the ioend relationship requirements (i.e.
according to bio_split(), the split ioend bio refers to the vectors in
the original ioend bio).

Brian

> +struct iomap_ioend *iomap_split_ioend(struct iomap_ioend *ioend, bool is_append,
> +		unsigned int *alloc_len)
> +{
> +	struct bio *bio = &ioend->io_bio;
> +	struct iomap_ioend *split_ioend;
> +	struct bio *split;
> +	int sector_offset;
> +	unsigned int nr_segs;
> +
> +	if (is_append) {
> +		struct queue_limits *lim = bdev_limits(bio->bi_bdev);
> +
> +		sector_offset = bio_split_rw_at(bio, lim, &nr_segs,
> +			min(lim->max_zone_append_sectors << SECTOR_SHIFT,
> +			    *alloc_len));
> +		if (!sector_offset)
> +			return NULL;
> +	} else {
> +		if (bio->bi_iter.bi_size <= *alloc_len)
> +			return NULL;
> +		sector_offset = *alloc_len >> SECTOR_SHIFT;
> +	}
> +
> +	/* ensure the split ioend is still block size aligned */
> +	sector_offset = ALIGN_DOWN(sector_offset << SECTOR_SHIFT,
> +			i_blocksize(ioend->io_inode)) >> SECTOR_SHIFT;
> +
> +	split = bio_split(bio, sector_offset, GFP_NOFS, &iomap_ioend_bioset);
> +	if (!split)
> +		return NULL;
> +	split->bi_private = bio->bi_private;
> +	split->bi_end_io = bio->bi_end_io;
> +
> +	split_ioend = iomap_init_ioend(ioend->io_inode, split, ioend->io_offset,
> +			ioend->io_flags);
> +	split_ioend->io_parent = ioend;
> +
> +	atomic_inc(&ioend->io_remaining);
> +	ioend->io_offset += split_ioend->io_size;
> +	ioend->io_size -= split_ioend->io_size;
> +
> +	split_ioend->io_sector = ioend->io_sector;
> +	if (!is_append)
> +		ioend->io_sector += (split_ioend->io_size >> SECTOR_SHIFT);
> +
> +	*alloc_len -= split->bi_iter.bi_size;
> +	return split_ioend;
> +}
> +EXPORT_SYMBOL_GPL(iomap_split_ioend);
> diff --git a/include/linux/iomap.h b/include/linux/iomap.h
> index 173d490c20ba..eaa8cb9083eb 100644
> --- a/include/linux/iomap.h
> +++ b/include/linux/iomap.h
> @@ -354,6 +354,9 @@ struct iomap_ioend {
>  	struct list_head	io_list;	/* next ioend in chain */
>  	u16			io_flags;	/* IOMAP_IOEND_* */
>  	struct inode		*io_inode;	/* file being written to */
> +	atomic_t		io_remaining;	/* completetion defer count */
> +	int			io_error;	/* stashed away status */
> +	struct iomap_ioend	*io_parent;	/* parent for completions */
>  	size_t			io_size;	/* size of the extent */
>  	loff_t			io_offset;	/* offset in the file */
>  	sector_t		io_sector;	/* start sector of ioend */
> @@ -404,6 +407,10 @@ struct iomap_writepage_ctx {
>  	u32			nr_folios;	/* folios added to the ioend */
>  };
>  
> +struct iomap_ioend *iomap_init_ioend(struct inode *inode, struct bio *bio,
> +		loff_t file_offset, u16 flags);
> +struct iomap_ioend *iomap_split_ioend(struct iomap_ioend *ioend, bool is_append,
> +		unsigned int *alloc_len);
>  void iomap_finish_ioends(struct iomap_ioend *ioend, int error);
>  void iomap_ioend_try_merge(struct iomap_ioend *ioend,
>  		struct list_head *more_ioends);
> @@ -475,4 +482,6 @@ int iomap_swapfile_activate(struct swap_info_struct *sis,
>  # define iomap_swapfile_activate(sis, swapfile, pagespan, ops)	(-EIO)
>  #endif /* CONFIG_SWAP */
>  
> +extern struct bio_set iomap_ioend_bioset;
> +
>  #endif /* LINUX_IOMAP_H */
> -- 
> 2.45.2
> 
>
John Garry Dec. 12, 2024, 2:21 p.m. UTC | #2
On 11/12/2024 08:53, Christoph Hellwig wrote:
> +	if (is_append) {
> +		struct queue_limits *lim = bdev_limits(bio->bi_bdev);
> +
> +		sector_offset = bio_split_rw_at(bio, lim, &nr_segs,
> +			min(lim->max_zone_append_sectors << SECTOR_SHIFT,
> +			    *alloc_len));
> +		if (!sector_offset)

Should this be:

		if (sector_offset <= 0)

> +			return NULL;
> +	} else {
> +		if (bio->bi_iter.bi_size <= *alloc_len)
> +			return NULL;
> +		sector_offset = *alloc_len >> SECTOR_SHIFT;
> +	}
> +
> +	/* ensure the split ioend is still block size aligned */
> +	sector_offset = ALIGN_DOWN(sector_offset << SECTOR_SHIFT,
> +			i_blocksize(ioend->io_inode)) >> SECTOR_SHIFT;
Christoph Hellwig Dec. 12, 2024, 3:05 p.m. UTC | #3
On Thu, Dec 12, 2024 at 08:28:17AM -0500, Brian Foster wrote:
> It might be useful to add a small comment here to point out this splits
> from the front of the ioend (i.e. akin to bio_split()), documents the
> params, and maybe mentions the ioend relationship requirements (i.e.
> according to bio_split(), the split ioend bio refers to the vectors in
> the original ioend bio).

Sure, I can add that.
Christoph Hellwig Dec. 12, 2024, 3:07 p.m. UTC | #4
On Thu, Dec 12, 2024 at 02:21:32PM +0000, John Garry wrote:
> On 11/12/2024 08:53, Christoph Hellwig wrote:
>> +	if (is_append) {
>> +		struct queue_limits *lim = bdev_limits(bio->bi_bdev);
>> +
>> +		sector_offset = bio_split_rw_at(bio, lim, &nr_segs,
>> +			min(lim->max_zone_append_sectors << SECTOR_SHIFT,
>> +			    *alloc_len));
>> +		if (!sector_offset)
>
> Should this be:
>
> 		if (sector_offset <= 0)

There is no support for REQ_ATOMIC and REQ_NOWAIT in this path right now,
but we might as well future-proof it by checking for a negative
error value.  But we'll then need to propagate the error as well.
I'll see what I can do.
Darrick J. Wong Dec. 12, 2024, 7:51 p.m. UTC | #5
On Wed, Dec 11, 2024 at 09:53:44AM +0100, Christoph Hellwig wrote:
> Provide helpers for file systems to split bios in the direct I/O and
> writeback I/O submission handlers.
> 
> This Follows btrfs' lead and don't try to build bios to hardware limits
> for zone append commands, but instead build them as normal unconstrained
> bios and split them to the hardware limits in the I/O submission handler.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  fs/iomap/Makefile      |  1 +
>  fs/iomap/buffered-io.c | 43 ++++++++++++++-----------
>  fs/iomap/ioend.c       | 73 ++++++++++++++++++++++++++++++++++++++++++
>  include/linux/iomap.h  |  9 ++++++
>  4 files changed, 108 insertions(+), 18 deletions(-)
>  create mode 100644 fs/iomap/ioend.c
> 
> diff --git a/fs/iomap/Makefile b/fs/iomap/Makefile
> index 381d76c5c232..69e8ebb41302 100644
> --- a/fs/iomap/Makefile
> +++ b/fs/iomap/Makefile
> @@ -12,6 +12,7 @@ iomap-y				+= trace.o \
>  				   iter.o
>  iomap-$(CONFIG_BLOCK)		+= buffered-io.o \
>  				   direct-io.o \
> +				   ioend.o \
>  				   fiemap.o \
>  				   seek.o
>  iomap-$(CONFIG_SWAP)		+= swapfile.o
> diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
> index 129cd96c6c96..8125f758a99d 100644
> --- a/fs/iomap/buffered-io.c
> +++ b/fs/iomap/buffered-io.c
> @@ -40,7 +40,8 @@ struct iomap_folio_state {
>  	unsigned long		state[];
>  };
>  
> -static struct bio_set iomap_ioend_bioset;
> +struct bio_set iomap_ioend_bioset;
> +EXPORT_SYMBOL_GPL(iomap_ioend_bioset);
>  
>  static inline bool ifs_is_fully_uptodate(struct folio *folio,
>  		struct iomap_folio_state *ifs)
> @@ -1539,15 +1540,15 @@ static void iomap_finish_folio_write(struct inode *inode, struct folio *folio,
>   * ioend after this.
>   */
>  static u32
> -iomap_finish_ioend(struct iomap_ioend *ioend, int error)
> +iomap_finish_ioend_buffered(struct iomap_ioend *ioend)
>  {
>  	struct inode *inode = ioend->io_inode;
>  	struct bio *bio = &ioend->io_bio;
>  	struct folio_iter fi;
>  	u32 folio_count = 0;
>  
> -	if (error) {
> -		mapping_set_error(inode->i_mapping, error);
> +	if (ioend->io_error) {
> +		mapping_set_error(inode->i_mapping, ioend->io_error);
>  		if (!bio_flagged(bio, BIO_QUIET)) {
>  			pr_err_ratelimited(
>  "%s: writeback error on inode %lu, offset %lld, sector %llu",
> @@ -1566,6 +1567,24 @@ iomap_finish_ioend(struct iomap_ioend *ioend, int error)
>  	return folio_count;
>  }
>  
> +static u32
> +iomap_finish_ioend(struct iomap_ioend *ioend, int error)
> +{
> +	if (ioend->io_parent) {
> +		struct bio *bio = &ioend->io_bio;
> +
> +		ioend = ioend->io_parent;
> +		bio_put(bio);
> +	}
> +
> +	if (error)
> +		cmpxchg(&ioend->io_error, 0, error);
> +
> +	if (!atomic_dec_and_test(&ioend->io_remaining))
> +		return 0;
> +	return iomap_finish_ioend_buffered(ioend);
> +}
> +
>  /*
>   * Ioend completion routine for merged bios. This can only be called from task
>   * contexts as merged ioends can be of unbound length. Hence we have to break up
> @@ -1709,7 +1728,6 @@ static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc,
>  		struct writeback_control *wbc, struct inode *inode, loff_t pos,
>  		u16 ioend_flags)
>  {
> -	struct iomap_ioend *ioend;
>  	struct bio *bio;
>  
>  	bio = bio_alloc_bioset(wpc->iomap.bdev, BIO_MAX_VECS,
> @@ -1717,21 +1735,10 @@ static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc,
>  			       GFP_NOFS, &iomap_ioend_bioset);
>  	bio->bi_iter.bi_sector = iomap_sector(&wpc->iomap, pos);
>  	bio->bi_end_io = iomap_writepage_end_bio;
> -	wbc_init_bio(wbc, bio);
>  	bio->bi_write_hint = inode->i_write_hint;
> -
> -	ioend = iomap_ioend_from_bio(bio);
> -	INIT_LIST_HEAD(&ioend->io_list);
> -	ioend->io_flags = ioend_flags;
> -	if (pos > wpc->iomap.offset)
> -		wpc->iomap.flags &= ~IOMAP_F_BOUNDARY;
> -	ioend->io_inode = inode;
> -	ioend->io_size = 0;
> -	ioend->io_offset = pos;
> -	ioend->io_sector = bio->bi_iter.bi_sector;
> -
> +	wbc_init_bio(wbc, bio);
>  	wpc->nr_folios = 0;
> -	return ioend;
> +	return iomap_init_ioend(inode, bio, pos, ioend_flags);
>  }
>  
>  static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos,
> diff --git a/fs/iomap/ioend.c b/fs/iomap/ioend.c
> new file mode 100644
> index 000000000000..f3d98121c593
> --- /dev/null
> +++ b/fs/iomap/ioend.c
> @@ -0,0 +1,73 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Copyright (c) 2024 Christoph Hellwig.
> + */
> +#include <linux/iomap.h>
> +
> +struct iomap_ioend *iomap_init_ioend(struct inode *inode,
> +		struct bio *bio, loff_t file_offset, u16 flags)
> +{

Nit: s/flags/ioend_flags/ to be consistent with the previous few
patches.

> +	struct iomap_ioend *ioend = iomap_ioend_from_bio(bio);
> +
> +	atomic_set(&ioend->io_remaining, 1);
> +	ioend->io_error = 0;
> +	ioend->io_parent = NULL;
> +	INIT_LIST_HEAD(&ioend->io_list);
> +	ioend->io_flags = flags;
> +	ioend->io_inode = inode;
> +	ioend->io_offset = file_offset;
> +	ioend->io_size = bio->bi_iter.bi_size;
> +	ioend->io_sector = bio->bi_iter.bi_sector;
> +	return ioend;
> +}
> +EXPORT_SYMBOL_GPL(iomap_init_ioend);
> +
> +struct iomap_ioend *iomap_split_ioend(struct iomap_ioend *ioend, bool is_append,

Can you determine is_append from (ioend->io_flags & ZONE_APPEND)?

Also it's not clear to me what the initial and output state of
*alloc_len is supposed to be?  I guess you set it to the number of bytes
the @ioend covers?  And this function either returns NULL and leaves
alloc_len untouched; or it returns a new ioend and the number of bytes
remaining in the passed-in ioend?

(or, as bfoster said, please improve the comments)

> +		unsigned int *alloc_len)
> +{
> +	struct bio *bio = &ioend->io_bio;
> +	struct iomap_ioend *split_ioend;
> +	struct bio *split;
> +	int sector_offset;
> +	unsigned int nr_segs;
> +
> +	if (is_append) {
> +		struct queue_limits *lim = bdev_limits(bio->bi_bdev);
> +
> +		sector_offset = bio_split_rw_at(bio, lim, &nr_segs,
> +			min(lim->max_zone_append_sectors << SECTOR_SHIFT,
> +			    *alloc_len));
> +		if (!sector_offset)
> +			return NULL;
> +	} else {
> +		if (bio->bi_iter.bi_size <= *alloc_len)
> +			return NULL;
> +		sector_offset = *alloc_len >> SECTOR_SHIFT;
> +	}
> +
> +	/* ensure the split ioend is still block size aligned */
> +	sector_offset = ALIGN_DOWN(sector_offset << SECTOR_SHIFT,
> +			i_blocksize(ioend->io_inode)) >> SECTOR_SHIFT;
> +
> +	split = bio_split(bio, sector_offset, GFP_NOFS, &iomap_ioend_bioset);
> +	if (!split)
> +		return NULL;
> +	split->bi_private = bio->bi_private;
> +	split->bi_end_io = bio->bi_end_io;
> +
> +	split_ioend = iomap_init_ioend(ioend->io_inode, split, ioend->io_offset,
> +			ioend->io_flags);
> +	split_ioend->io_parent = ioend;
> +
> +	atomic_inc(&ioend->io_remaining);
> +	ioend->io_offset += split_ioend->io_size;
> +	ioend->io_size -= split_ioend->io_size;
> +
> +	split_ioend->io_sector = ioend->io_sector;
> +	if (!is_append)
> +		ioend->io_sector += (split_ioend->io_size >> SECTOR_SHIFT);
> +
> +	*alloc_len -= split->bi_iter.bi_size;
> +	return split_ioend;
> +}
> +EXPORT_SYMBOL_GPL(iomap_split_ioend);
> diff --git a/include/linux/iomap.h b/include/linux/iomap.h
> index 173d490c20ba..eaa8cb9083eb 100644
> --- a/include/linux/iomap.h
> +++ b/include/linux/iomap.h
> @@ -354,6 +354,9 @@ struct iomap_ioend {
>  	struct list_head	io_list;	/* next ioend in chain */
>  	u16			io_flags;	/* IOMAP_IOEND_* */
>  	struct inode		*io_inode;	/* file being written to */
> +	atomic_t		io_remaining;	/* completetion defer count */
> +	int			io_error;	/* stashed away status */
> +	struct iomap_ioend	*io_parent;	/* parent for completions */

I guess this means ioends can chain together, sort of like how bios can
when you split them?

--D

>  	size_t			io_size;	/* size of the extent */
>  	loff_t			io_offset;	/* offset in the file */
>  	sector_t		io_sector;	/* start sector of ioend */
> @@ -404,6 +407,10 @@ struct iomap_writepage_ctx {
>  	u32			nr_folios;	/* folios added to the ioend */
>  };
>  
> +struct iomap_ioend *iomap_init_ioend(struct inode *inode, struct bio *bio,
> +		loff_t file_offset, u16 flags);
> +struct iomap_ioend *iomap_split_ioend(struct iomap_ioend *ioend, bool is_append,
> +		unsigned int *alloc_len);
>  void iomap_finish_ioends(struct iomap_ioend *ioend, int error);
>  void iomap_ioend_try_merge(struct iomap_ioend *ioend,
>  		struct list_head *more_ioends);
> @@ -475,4 +482,6 @@ int iomap_swapfile_activate(struct swap_info_struct *sis,
>  # define iomap_swapfile_activate(sis, swapfile, pagespan, ops)	(-EIO)
>  #endif /* CONFIG_SWAP */
>  
> +extern struct bio_set iomap_ioend_bioset;
> +
>  #endif /* LINUX_IOMAP_H */
> -- 
> 2.45.2
> 
>
Christoph Hellwig Dec. 13, 2024, 4:50 a.m. UTC | #6
On Thu, Dec 12, 2024 at 11:51:49AM -0800, Darrick J. Wong wrote:
> > +struct iomap_ioend *iomap_split_ioend(struct iomap_ioend *ioend, bool is_append,
> 
> Can you determine is_append from (ioend->io_flags & ZONE_APPEND)?

That would require us to add that flag first :)  As we don't really
need that as persistent per-iomap state, it's probably not worth it.

> Also it's not clear to me what the initial and output state of
> *alloc_len is supposed to be?  I guess you set it to the number of bytes
> the @ioend covers?

It gets set to the number of blocks that the allocator could find,
and iomap_split_ioend decrements it by the amount used for the
returned ioend, which is min(*alloc_len, max_zone_append_sectors) for
sequential zones, or *alloc_len for conventional zones.

> > +++ b/include/linux/iomap.h
> > @@ -354,6 +354,9 @@ struct iomap_ioend {
> >  	struct list_head	io_list;	/* next ioend in chain */
> >  	u16			io_flags;	/* IOMAP_IOEND_* */
> >  	struct inode		*io_inode;	/* file being written to */
> > +	atomic_t		io_remaining;	/* completetion defer count */
> > +	int			io_error;	/* stashed away status */
> > +	struct iomap_ioend	*io_parent;	/* parent for completions */
> 
> I guess this means ioends can chain together, sort of like how bios can
> when you split them?

Exactly.
diff mbox series

Patch

diff --git a/fs/iomap/Makefile b/fs/iomap/Makefile
index 381d76c5c232..69e8ebb41302 100644
--- a/fs/iomap/Makefile
+++ b/fs/iomap/Makefile
@@ -12,6 +12,7 @@  iomap-y				+= trace.o \
 				   iter.o
 iomap-$(CONFIG_BLOCK)		+= buffered-io.o \
 				   direct-io.o \
+				   ioend.o \
 				   fiemap.o \
 				   seek.o
 iomap-$(CONFIG_SWAP)		+= swapfile.o
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 129cd96c6c96..8125f758a99d 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -40,7 +40,8 @@  struct iomap_folio_state {
 	unsigned long		state[];
 };
 
-static struct bio_set iomap_ioend_bioset;
+struct bio_set iomap_ioend_bioset;
+EXPORT_SYMBOL_GPL(iomap_ioend_bioset);
 
 static inline bool ifs_is_fully_uptodate(struct folio *folio,
 		struct iomap_folio_state *ifs)
@@ -1539,15 +1540,15 @@  static void iomap_finish_folio_write(struct inode *inode, struct folio *folio,
  * ioend after this.
  */
 static u32
-iomap_finish_ioend(struct iomap_ioend *ioend, int error)
+iomap_finish_ioend_buffered(struct iomap_ioend *ioend)
 {
 	struct inode *inode = ioend->io_inode;
 	struct bio *bio = &ioend->io_bio;
 	struct folio_iter fi;
 	u32 folio_count = 0;
 
-	if (error) {
-		mapping_set_error(inode->i_mapping, error);
+	if (ioend->io_error) {
+		mapping_set_error(inode->i_mapping, ioend->io_error);
 		if (!bio_flagged(bio, BIO_QUIET)) {
 			pr_err_ratelimited(
 "%s: writeback error on inode %lu, offset %lld, sector %llu",
@@ -1566,6 +1567,24 @@  iomap_finish_ioend(struct iomap_ioend *ioend, int error)
 	return folio_count;
 }
 
+static u32
+iomap_finish_ioend(struct iomap_ioend *ioend, int error)
+{
+	if (ioend->io_parent) {
+		struct bio *bio = &ioend->io_bio;
+
+		ioend = ioend->io_parent;
+		bio_put(bio);
+	}
+
+	if (error)
+		cmpxchg(&ioend->io_error, 0, error);
+
+	if (!atomic_dec_and_test(&ioend->io_remaining))
+		return 0;
+	return iomap_finish_ioend_buffered(ioend);
+}
+
 /*
  * Ioend completion routine for merged bios. This can only be called from task
  * contexts as merged ioends can be of unbound length. Hence we have to break up
@@ -1709,7 +1728,6 @@  static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc,
 		struct writeback_control *wbc, struct inode *inode, loff_t pos,
 		u16 ioend_flags)
 {
-	struct iomap_ioend *ioend;
 	struct bio *bio;
 
 	bio = bio_alloc_bioset(wpc->iomap.bdev, BIO_MAX_VECS,
@@ -1717,21 +1735,10 @@  static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc,
 			       GFP_NOFS, &iomap_ioend_bioset);
 	bio->bi_iter.bi_sector = iomap_sector(&wpc->iomap, pos);
 	bio->bi_end_io = iomap_writepage_end_bio;
-	wbc_init_bio(wbc, bio);
 	bio->bi_write_hint = inode->i_write_hint;
-
-	ioend = iomap_ioend_from_bio(bio);
-	INIT_LIST_HEAD(&ioend->io_list);
-	ioend->io_flags = ioend_flags;
-	if (pos > wpc->iomap.offset)
-		wpc->iomap.flags &= ~IOMAP_F_BOUNDARY;
-	ioend->io_inode = inode;
-	ioend->io_size = 0;
-	ioend->io_offset = pos;
-	ioend->io_sector = bio->bi_iter.bi_sector;
-
+	wbc_init_bio(wbc, bio);
 	wpc->nr_folios = 0;
-	return ioend;
+	return iomap_init_ioend(inode, bio, pos, ioend_flags);
 }
 
 static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos,
diff --git a/fs/iomap/ioend.c b/fs/iomap/ioend.c
new file mode 100644
index 000000000000..f3d98121c593
--- /dev/null
+++ b/fs/iomap/ioend.c
@@ -0,0 +1,73 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2024 Christoph Hellwig.
+ */
+#include <linux/iomap.h>
+
+struct iomap_ioend *iomap_init_ioend(struct inode *inode,
+		struct bio *bio, loff_t file_offset, u16 flags)
+{
+	struct iomap_ioend *ioend = iomap_ioend_from_bio(bio);
+
+	atomic_set(&ioend->io_remaining, 1);
+	ioend->io_error = 0;
+	ioend->io_parent = NULL;
+	INIT_LIST_HEAD(&ioend->io_list);
+	ioend->io_flags = flags;
+	ioend->io_inode = inode;
+	ioend->io_offset = file_offset;
+	ioend->io_size = bio->bi_iter.bi_size;
+	ioend->io_sector = bio->bi_iter.bi_sector;
+	return ioend;
+}
+EXPORT_SYMBOL_GPL(iomap_init_ioend);
+
+struct iomap_ioend *iomap_split_ioend(struct iomap_ioend *ioend, bool is_append,
+		unsigned int *alloc_len)
+{
+	struct bio *bio = &ioend->io_bio;
+	struct iomap_ioend *split_ioend;
+	struct bio *split;
+	int sector_offset;
+	unsigned int nr_segs;
+
+	if (is_append) {
+		struct queue_limits *lim = bdev_limits(bio->bi_bdev);
+
+		sector_offset = bio_split_rw_at(bio, lim, &nr_segs,
+			min(lim->max_zone_append_sectors << SECTOR_SHIFT,
+			    *alloc_len));
+		if (!sector_offset)
+			return NULL;
+	} else {
+		if (bio->bi_iter.bi_size <= *alloc_len)
+			return NULL;
+		sector_offset = *alloc_len >> SECTOR_SHIFT;
+	}
+
+	/* ensure the split ioend is still block size aligned */
+	sector_offset = ALIGN_DOWN(sector_offset << SECTOR_SHIFT,
+			i_blocksize(ioend->io_inode)) >> SECTOR_SHIFT;
+
+	split = bio_split(bio, sector_offset, GFP_NOFS, &iomap_ioend_bioset);
+	if (!split)
+		return NULL;
+	split->bi_private = bio->bi_private;
+	split->bi_end_io = bio->bi_end_io;
+
+	split_ioend = iomap_init_ioend(ioend->io_inode, split, ioend->io_offset,
+			ioend->io_flags);
+	split_ioend->io_parent = ioend;
+
+	atomic_inc(&ioend->io_remaining);
+	ioend->io_offset += split_ioend->io_size;
+	ioend->io_size -= split_ioend->io_size;
+
+	split_ioend->io_sector = ioend->io_sector;
+	if (!is_append)
+		ioend->io_sector += (split_ioend->io_size >> SECTOR_SHIFT);
+
+	*alloc_len -= split->bi_iter.bi_size;
+	return split_ioend;
+}
+EXPORT_SYMBOL_GPL(iomap_split_ioend);
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 173d490c20ba..eaa8cb9083eb 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -354,6 +354,9 @@  struct iomap_ioend {
 	struct list_head	io_list;	/* next ioend in chain */
 	u16			io_flags;	/* IOMAP_IOEND_* */
 	struct inode		*io_inode;	/* file being written to */
+	atomic_t		io_remaining;	/* completetion defer count */
+	int			io_error;	/* stashed away status */
+	struct iomap_ioend	*io_parent;	/* parent for completions */
 	size_t			io_size;	/* size of the extent */
 	loff_t			io_offset;	/* offset in the file */
 	sector_t		io_sector;	/* start sector of ioend */
@@ -404,6 +407,10 @@  struct iomap_writepage_ctx {
 	u32			nr_folios;	/* folios added to the ioend */
 };
 
+struct iomap_ioend *iomap_init_ioend(struct inode *inode, struct bio *bio,
+		loff_t file_offset, u16 flags);
+struct iomap_ioend *iomap_split_ioend(struct iomap_ioend *ioend, bool is_append,
+		unsigned int *alloc_len);
 void iomap_finish_ioends(struct iomap_ioend *ioend, int error);
 void iomap_ioend_try_merge(struct iomap_ioend *ioend,
 		struct list_head *more_ioends);
@@ -475,4 +482,6 @@  int iomap_swapfile_activate(struct swap_info_struct *sis,
 # define iomap_swapfile_activate(sis, swapfile, pagespan, ops)	(-EIO)
 #endif /* CONFIG_SWAP */
 
+extern struct bio_set iomap_ioend_bioset;
+
 #endif /* LINUX_IOMAP_H */