diff mbox series

[PATCHv6,5/5] iomap: Add per-block dirty state tracking to improve performance

Message ID c38a4081e762e38b8fc4c0a54d848741d28d7455.1685900733.git.ritesh.list@gmail.com (mailing list archive)
State Under Review
Headers show
Series iomap: Add support for per-block dirty state to improve write performance | expand

Commit Message

Ritesh Harjani (IBM) June 5, 2023, 1:31 a.m. UTC
When filesystem blocksize is less than folio size (either with
mapping_large_folio_support() or with blocksize < pagesize) and when the
folio is uptodate in pagecache, then even a byte write can cause
an entire folio to be written to disk during writeback. This happens
because we currently don't have a mechanism to track per-block dirty
state within struct iomap_page. We currently only track uptodate state.

This patch implements support for tracking per-block dirty state in
iomap_page->state bitmap. This should help improve the filesystem write
performance and help reduce write amplification.

Performance testing of below fio workload reveals ~16x performance
improvement using nvme with XFS (4k blocksize) on Power (64K pagesize)
FIO reported write bw scores improved from around ~28 MBps to ~452 MBps.

1. <test_randwrite.fio>
[global]
	ioengine=psync
	rw=randwrite
	overwrite=1
	pre_read=1
	direct=0
	bs=4k
	size=1G
	dir=./
	numjobs=8
	fdatasync=1
	runtime=60
	iodepth=64
	group_reporting=1

[fio-run]

2. Also our internal performance team reported that this patch improves
   their database workload performance by around ~83% (with XFS on Power)

Reported-by: Aravinda Herle <araherle@in.ibm.com>
Reported-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Ritesh Harjani (IBM) <ritesh.list@gmail.com>
---
 fs/gfs2/aops.c         |   2 +-
 fs/iomap/buffered-io.c | 172 +++++++++++++++++++++++++++++++++++------
 fs/xfs/xfs_aops.c      |   2 +-
 fs/zonefs/file.c       |   2 +-
 include/linux/iomap.h  |   1 +
 5 files changed, 152 insertions(+), 27 deletions(-)

Comments

Matthew Wilcox June 5, 2023, 4:03 a.m. UTC | #1
On Mon, Jun 05, 2023 at 07:01:52AM +0530, Ritesh Harjani (IBM) wrote:
> +static void iop_set_range_dirty(struct inode *inode, struct folio *folio,
> +				size_t off, size_t len)
> +{
> +	struct iomap_page *iop = to_iomap_page(folio);
> +	unsigned int blks_per_folio = i_blocks_per_folio(inode, folio);
> +	unsigned int first_blk = off >> inode->i_blkbits;
> +	unsigned int last_blk = (off + len - 1) >> inode->i_blkbits;
> +	unsigned int nr_blks = last_blk - first_blk + 1;
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&iop->state_lock, flags);
> +	bitmap_set(iop->state, first_blk + blks_per_folio, nr_blks);
> +	spin_unlock_irqrestore(&iop->state_lock, flags);
> +}
> +
> +static void iomap_iop_set_range_dirty(struct inode *inode, struct folio *folio,
> +				size_t off, size_t len)
> +{
> +	struct iomap_page *iop = to_iomap_page(folio);
> +
> +	if (iop)
> +		iop_set_range_dirty(inode, folio, off, len);
> +}

Why are these separate functions?  It'd be much better written as:

static void iomap_iop_set_range_dirty(struct inode *inode, struct folio *folio,
		size_t off, size_t len)
{
	struct iomap_page *iop = to_iomap_page(folio);
	unsigned int start, first, last;
	unsigned long flags;

	if (!iop)
		return;

	start = i_blocks_per_folio(inode, folio);
	first = off >> inode->i_blkbits;
	last = (off + len - 1) >> inode->i_blkbits;

	spin_lock_irqsave(&iop->state_lock, flags);
	bitmap_set(iop->state, start + first, last - first + 1);
	spin_unlock_irqrestore(&iop->state_lock, flags);
}

> +static void iop_clear_range_dirty(struct inode *inode, struct folio *folio,
> +				  size_t off, size_t len)
> +{
> +	struct iomap_page *iop = to_iomap_page(folio);
> +	unsigned int blks_per_folio = i_blocks_per_folio(inode, folio);
> +	unsigned int first_blk = off >> inode->i_blkbits;
> +	unsigned int last_blk = (off + len - 1) >> inode->i_blkbits;
> +	unsigned int nr_blks = last_blk - first_blk + 1;
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&iop->state_lock, flags);
> +	bitmap_clear(iop->state, first_blk + blks_per_folio, nr_blks);
> +	spin_unlock_irqrestore(&iop->state_lock, flags);
> +}
> +
> +static void iomap_iop_clear_range_dirty(struct inode *inode,
> +				struct folio *folio, size_t off, size_t len)
> +{
> +	struct iomap_page *iop = to_iomap_page(folio);
> +
> +	if (iop)
> +		iop_clear_range_dirty(inode, folio, off, len);
> +}

Similarly

> +bool iomap_dirty_folio(struct address_space *mapping, struct folio *folio)
> +{
> +	struct iomap_page __maybe_unused *iop;
> +	struct inode *inode = mapping->host;
> +	size_t len = folio_size(folio);
> +
> +	iop = iomap_iop_alloc(inode, folio, 0);

Why do you keep doing this?  Just throw away the return value from
iomap_iop_alloc().  Don't clutter the source with the unnecessary
variable declaration and annotation that it's not used!

> +static int iomap_write_delalloc_punch(struct inode *inode, struct folio *folio,
> +		loff_t *punch_start_byte, loff_t start_byte, loff_t end_byte,
> +		int (*punch)(struct inode *inode, loff_t offset, loff_t length))
> +{
> +	struct iomap_page *iop;
> +	unsigned int first_blk, last_blk, i;
> +	loff_t last_byte;
> +	u8 blkbits = inode->i_blkbits;
> +	int ret = 0;
> +
> +	if (start_byte > *punch_start_byte) {
> +		ret = punch(inode, *punch_start_byte,
> +				start_byte - *punch_start_byte);
> +		if (ret)
> +			goto out_err;
> +	}
> +	/*
> +	 * When we have per-block dirty tracking, there can be
> +	 * blocks within a folio which are marked uptodate
> +	 * but not dirty. In that case it is necessary to punch
> +	 * out such blocks to avoid leaking any delalloc blocks.
> +	 */
> +	iop = to_iomap_page(folio);
> +	if (!iop)
> +		goto skip_iop_punch;
> +
> +	last_byte = min_t(loff_t, end_byte - 1,
> +		(folio_next_index(folio) << PAGE_SHIFT) - 1);
> +	first_blk = offset_in_folio(folio, start_byte) >> blkbits;
> +	last_blk = offset_in_folio(folio, last_byte) >> blkbits;
> +	for (i = first_blk; i <= last_blk; i++) {
> +		if (!iop_test_block_dirty(folio, i)) {
> +			ret = punch(inode, i << blkbits, 1 << blkbits);
> +			if (ret)
> +				goto out_err;
> +		}
> +	}
> +
> +skip_iop_punch:
> +	/*
> +	 * Make sure the next punch start is correctly bound to
> +	 * the end of this data range, not the end of the folio.
> +	 */
> +	*punch_start_byte = min_t(loff_t, end_byte,
> +			folio_next_index(folio) << PAGE_SHIFT);
> +
> +	return ret;
> +
> +out_err:
> +	folio_unlock(folio);
> +	folio_put(folio);
> +	return ret;
> +
> +}
> +
>  /*
>   * Scan the data range passed to us for dirty page cache folios. If we find a
>   * dirty folio, punch out the preceeding range and update the offset from which
> @@ -940,26 +1074,9 @@ static int iomap_write_delalloc_scan(struct inode *inode,
>  		}
>  
>  		/* if dirty, punch up to offset */
> -		if (folio_test_dirty(folio)) {
> -			if (start_byte > *punch_start_byte) {
> -				int	error;
> -
> -				error = punch(inode, *punch_start_byte,
> -						start_byte - *punch_start_byte);
> -				if (error) {
> -					folio_unlock(folio);
> -					folio_put(folio);
> -					return error;
> -				}
> -			}
> -
> -			/*
> -			 * Make sure the next punch start is correctly bound to
> -			 * the end of this data range, not the end of the folio.
> -			 */
> -			*punch_start_byte = min_t(loff_t, end_byte,
> -					folio_next_index(folio) << PAGE_SHIFT);
> -		}
> +		if (folio_test_dirty(folio))
> +			iomap_write_delalloc_punch(inode, folio, punch_start_byte,
> +					   start_byte, end_byte, punch);
>  
>  		/* move offset to start of next folio in range */
>  		start_byte = folio_next_index(folio) << PAGE_SHIFT;

I'm having trouble following this refactoring + modification.  Perhaps
I'm just tired.
Ritesh Harjani (IBM) June 5, 2023, 5:20 a.m. UTC | #2
On Mon, Jun 5, 2023 at 9:33 AM Matthew Wilcox <willy@infradead.org> wrote:
>
> On Mon, Jun 05, 2023 at 07:01:52AM +0530, Ritesh Harjani (IBM) wrote:
> > +static void iop_set_range_dirty(struct inode *inode, struct folio *folio,
> > +                             size_t off, size_t len)
> > +{
> > +     struct iomap_page *iop = to_iomap_page(folio);
> > +     unsigned int blks_per_folio = i_blocks_per_folio(inode, folio);
> > +     unsigned int first_blk = off >> inode->i_blkbits;
> > +     unsigned int last_blk = (off + len - 1) >> inode->i_blkbits;
> > +     unsigned int nr_blks = last_blk - first_blk + 1;
> > +     unsigned long flags;
> > +
> > +     spin_lock_irqsave(&iop->state_lock, flags);
> > +     bitmap_set(iop->state, first_blk + blks_per_folio, nr_blks);
> > +     spin_unlock_irqrestore(&iop->state_lock, flags);
> > +}
> > +
> > +static void iomap_iop_set_range_dirty(struct inode *inode, struct folio *folio,
> > +                             size_t off, size_t len)
> > +{
> > +     struct iomap_page *iop = to_iomap_page(folio);
> > +
> > +     if (iop)
> > +             iop_set_range_dirty(inode, folio, off, len);
> > +}
>
> Why are these separate functions?  It'd be much better written as:

It got discussed here [1] on the preference would be to have it in a
separate helper.
[1]: https://lore.kernel.org/linux-xfs/ZGYnzcoGuzWKa7lh@infradead.org/

>
> static void iomap_iop_set_range_dirty(struct inode *inode, struct folio *folio,
>                 size_t off, size_t len)
> {
>         struct iomap_page *iop = to_iomap_page(folio);
>         unsigned int start, first, last;
>         unsigned long flags;
>
>         if (!iop)
>                 return;
>
>         start = i_blocks_per_folio(inode, folio);
>         first = off >> inode->i_blkbits;
>         last = (off + len - 1) >> inode->i_blkbits;
>
>         spin_lock_irqsave(&iop->state_lock, flags);
>         bitmap_set(iop->state, start + first, last - first + 1);
>         spin_unlock_irqrestore(&iop->state_lock, flags);
> }
>
> > +static void iop_clear_range_dirty(struct inode *inode, struct folio *folio,
> > +                               size_t off, size_t len)
> > +{
> > +     struct iomap_page *iop = to_iomap_page(folio);
> > +     unsigned int blks_per_folio = i_blocks_per_folio(inode, folio);
> > +     unsigned int first_blk = off >> inode->i_blkbits;
> > +     unsigned int last_blk = (off + len - 1) >> inode->i_blkbits;
> > +     unsigned int nr_blks = last_blk - first_blk + 1;
> > +     unsigned long flags;
> > +
> > +     spin_lock_irqsave(&iop->state_lock, flags);
> > +     bitmap_clear(iop->state, first_blk + blks_per_folio, nr_blks);
> > +     spin_unlock_irqrestore(&iop->state_lock, flags);
> > +}
> > +
> > +static void iomap_iop_clear_range_dirty(struct inode *inode,
> > +                             struct folio *folio, size_t off, size_t len)
> > +{
> > +     struct iomap_page *iop = to_iomap_page(folio);
> > +
> > +     if (iop)
> > +             iop_clear_range_dirty(inode, folio, off, len);
> > +}
>
> Similarly
>
> > +bool iomap_dirty_folio(struct address_space *mapping, struct folio *folio)
> > +{
> > +     struct iomap_page __maybe_unused *iop;
> > +     struct inode *inode = mapping->host;
> > +     size_t len = folio_size(folio);
> > +
> > +     iop = iomap_iop_alloc(inode, folio, 0);
>
> Why do you keep doing this?  Just throw away the return value from
> iomap_iop_alloc().  Don't clutter the source with the unnecessary
> variable declaration and annotation that it's not used!
>

Sorry, it got leftover. I will quickly fix this.

> > +static int iomap_write_delalloc_punch(struct inode *inode, struct folio *folio,
> > +             loff_t *punch_start_byte, loff_t start_byte, loff_t end_byte,
> > +             int (*punch)(struct inode *inode, loff_t offset, loff_t length))
> > +{
> > +     struct iomap_page *iop;
> > +     unsigned int first_blk, last_blk, i;
> > +     loff_t last_byte;
> > +     u8 blkbits = inode->i_blkbits;
> > +     int ret = 0;
> > +
> > +     if (start_byte > *punch_start_byte) {
> > +             ret = punch(inode, *punch_start_byte,
> > +                             start_byte - *punch_start_byte);
> > +             if (ret)
> > +                     goto out_err;
> > +     }
> > +     /*
> > +      * When we have per-block dirty tracking, there can be
> > +      * blocks within a folio which are marked uptodate
> > +      * but not dirty. In that case it is necessary to punch
> > +      * out such blocks to avoid leaking any delalloc blocks.
> > +      */
> > +     iop = to_iomap_page(folio);
> > +     if (!iop)
> > +             goto skip_iop_punch;
> > +
> > +     last_byte = min_t(loff_t, end_byte - 1,
> > +             (folio_next_index(folio) << PAGE_SHIFT) - 1);
> > +     first_blk = offset_in_folio(folio, start_byte) >> blkbits;
> > +     last_blk = offset_in_folio(folio, last_byte) >> blkbits;
> > +     for (i = first_blk; i <= last_blk; i++) {
> > +             if (!iop_test_block_dirty(folio, i)) {
> > +                     ret = punch(inode, i << blkbits, 1 << blkbits);
> > +                     if (ret)
> > +                             goto out_err;
> > +             }
> > +     }
> > +
> > +skip_iop_punch:
> > +     /*
> > +      * Make sure the next punch start is correctly bound to
> > +      * the end of this data range, not the end of the folio.
> > +      */
> > +     *punch_start_byte = min_t(loff_t, end_byte,
> > +                     folio_next_index(folio) << PAGE_SHIFT);
> > +
> > +     return ret;
> > +
> > +out_err:
> > +     folio_unlock(folio);
> > +     folio_put(folio);
> > +     return ret;
> > +
> > +}
> > +
> >  /*
> >   * Scan the data range passed to us for dirty page cache folios. If we find a
> >   * dirty folio, punch out the preceeding range and update the offset from which
> > @@ -940,26 +1074,9 @@ static int iomap_write_delalloc_scan(struct inode *inode,
> >               }
> >
> >               /* if dirty, punch up to offset */
> > -             if (folio_test_dirty(folio)) {
> > -                     if (start_byte > *punch_start_byte) {
> > -                             int     error;
> > -
> > -                             error = punch(inode, *punch_start_byte,
> > -                                             start_byte - *punch_start_byte);
> > -                             if (error) {
> > -                                     folio_unlock(folio);
> > -                                     folio_put(folio);
> > -                                     return error;
> > -                             }
> > -                     }
> > -
> > -                     /*
> > -                      * Make sure the next punch start is correctly bound to
> > -                      * the end of this data range, not the end of the folio.
> > -                      */
> > -                     *punch_start_byte = min_t(loff_t, end_byte,
> > -                                     folio_next_index(folio) << PAGE_SHIFT);
> > -             }
> > +             if (folio_test_dirty(folio))
> > +                     iomap_write_delalloc_punch(inode, folio, punch_start_byte,
> > +                                        start_byte, end_byte, punch);
> >
> >               /* move offset to start of next folio in range */
> >               start_byte = folio_next_index(folio) << PAGE_SHIFT;
>
> I'm having trouble following this refactoring + modification.  Perhaps
> I'm just tired.
>

Let me refactor this part out in the next revision.

-ritesh
diff mbox series

Patch

diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index a5f4be6b9213..75efec3c3b71 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -746,7 +746,7 @@  static const struct address_space_operations gfs2_aops = {
 	.writepages = gfs2_writepages,
 	.read_folio = gfs2_read_folio,
 	.readahead = gfs2_readahead,
-	.dirty_folio = filemap_dirty_folio,
+	.dirty_folio = iomap_dirty_folio,
 	.release_folio = iomap_release_folio,
 	.invalidate_folio = iomap_invalidate_folio,
 	.bmap = gfs2_bmap,
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index a70242cb32b1..dee81d16804e 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -85,6 +85,63 @@  static void iomap_iop_set_range_uptodate(struct inode *inode,
 		folio_mark_uptodate(folio);
 }
 
+static bool iop_test_block_dirty(struct folio *folio, int block)
+{
+	struct iomap_page *iop = to_iomap_page(folio);
+	struct inode *inode = folio->mapping->host;
+	unsigned int blks_per_folio = i_blocks_per_folio(inode, folio);
+
+	return test_bit(block + blks_per_folio, iop->state);
+}
+
+static void iop_set_range_dirty(struct inode *inode, struct folio *folio,
+				size_t off, size_t len)
+{
+	struct iomap_page *iop = to_iomap_page(folio);
+	unsigned int blks_per_folio = i_blocks_per_folio(inode, folio);
+	unsigned int first_blk = off >> inode->i_blkbits;
+	unsigned int last_blk = (off + len - 1) >> inode->i_blkbits;
+	unsigned int nr_blks = last_blk - first_blk + 1;
+	unsigned long flags;
+
+	spin_lock_irqsave(&iop->state_lock, flags);
+	bitmap_set(iop->state, first_blk + blks_per_folio, nr_blks);
+	spin_unlock_irqrestore(&iop->state_lock, flags);
+}
+
+static void iomap_iop_set_range_dirty(struct inode *inode, struct folio *folio,
+				size_t off, size_t len)
+{
+	struct iomap_page *iop = to_iomap_page(folio);
+
+	if (iop)
+		iop_set_range_dirty(inode, folio, off, len);
+}
+
+static void iop_clear_range_dirty(struct inode *inode, struct folio *folio,
+				  size_t off, size_t len)
+{
+	struct iomap_page *iop = to_iomap_page(folio);
+	unsigned int blks_per_folio = i_blocks_per_folio(inode, folio);
+	unsigned int first_blk = off >> inode->i_blkbits;
+	unsigned int last_blk = (off + len - 1) >> inode->i_blkbits;
+	unsigned int nr_blks = last_blk - first_blk + 1;
+	unsigned long flags;
+
+	spin_lock_irqsave(&iop->state_lock, flags);
+	bitmap_clear(iop->state, first_blk + blks_per_folio, nr_blks);
+	spin_unlock_irqrestore(&iop->state_lock, flags);
+}
+
+static void iomap_iop_clear_range_dirty(struct inode *inode,
+				struct folio *folio, size_t off, size_t len)
+{
+	struct iomap_page *iop = to_iomap_page(folio);
+
+	if (iop)
+		iop_clear_range_dirty(inode, folio, off, len);
+}
+
 static struct iomap_page *iomap_iop_alloc(struct inode *inode,
 				struct folio *folio, unsigned int flags)
 {
@@ -100,12 +157,20 @@  static struct iomap_page *iomap_iop_alloc(struct inode *inode,
 	else
 		gfp = GFP_NOFS | __GFP_NOFAIL;
 
-	iop = kzalloc(struct_size(iop, state, BITS_TO_LONGS(nr_blocks)),
+	/*
+	 * iop->state tracks two sets of state flags when the
+	 * filesystem block size is smaller than the folio size.
+	 * The first state tracks per-block uptodate and the
+	 * second tracks per-block dirty state.
+	 */
+	iop = kzalloc(struct_size(iop, state, BITS_TO_LONGS(2 * nr_blocks)),
 		      gfp);
 	if (iop) {
 		spin_lock_init(&iop->state_lock);
 		if (folio_test_uptodate(folio))
-			bitmap_fill(iop->state, nr_blocks);
+			bitmap_set(iop->state, 0, nr_blocks);
+		if (folio_test_dirty(folio))
+			bitmap_set(iop->state, nr_blocks, nr_blocks);
 		folio_attach_private(folio, iop);
 	}
 	return iop;
@@ -536,6 +601,18 @@  void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len)
 }
 EXPORT_SYMBOL_GPL(iomap_invalidate_folio);
 
+bool iomap_dirty_folio(struct address_space *mapping, struct folio *folio)
+{
+	struct iomap_page __maybe_unused *iop;
+	struct inode *inode = mapping->host;
+	size_t len = folio_size(folio);
+
+	iop = iomap_iop_alloc(inode, folio, 0);
+	iomap_iop_set_range_dirty(inode, folio, 0, len);
+	return filemap_dirty_folio(mapping, folio);
+}
+EXPORT_SYMBOL_GPL(iomap_dirty_folio);
+
 static void
 iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
 {
@@ -742,6 +819,8 @@  static size_t __iomap_write_end(struct inode *inode, loff_t pos, size_t len,
 		return 0;
 	iomap_iop_set_range_uptodate(inode, folio, offset_in_folio(folio, pos),
 				     len);
+	iomap_iop_set_range_dirty(inode, folio, offset_in_folio(folio, pos),
+				  copied);
 	filemap_dirty_folio(inode->i_mapping, folio);
 	return copied;
 }
@@ -906,6 +985,61 @@  iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i,
 }
 EXPORT_SYMBOL_GPL(iomap_file_buffered_write);
 
+static int iomap_write_delalloc_punch(struct inode *inode, struct folio *folio,
+		loff_t *punch_start_byte, loff_t start_byte, loff_t end_byte,
+		int (*punch)(struct inode *inode, loff_t offset, loff_t length))
+{
+	struct iomap_page *iop;
+	unsigned int first_blk, last_blk, i;
+	loff_t last_byte;
+	u8 blkbits = inode->i_blkbits;
+	int ret = 0;
+
+	if (start_byte > *punch_start_byte) {
+		ret = punch(inode, *punch_start_byte,
+				start_byte - *punch_start_byte);
+		if (ret)
+			goto out_err;
+	}
+	/*
+	 * When we have per-block dirty tracking, there can be
+	 * blocks within a folio which are marked uptodate
+	 * but not dirty. In that case it is necessary to punch
+	 * out such blocks to avoid leaking any delalloc blocks.
+	 */
+	iop = to_iomap_page(folio);
+	if (!iop)
+		goto skip_iop_punch;
+
+	last_byte = min_t(loff_t, end_byte - 1,
+		(folio_next_index(folio) << PAGE_SHIFT) - 1);
+	first_blk = offset_in_folio(folio, start_byte) >> blkbits;
+	last_blk = offset_in_folio(folio, last_byte) >> blkbits;
+	for (i = first_blk; i <= last_blk; i++) {
+		if (!iop_test_block_dirty(folio, i)) {
+			ret = punch(inode, i << blkbits, 1 << blkbits);
+			if (ret)
+				goto out_err;
+		}
+	}
+
+skip_iop_punch:
+	/*
+	 * Make sure the next punch start is correctly bound to
+	 * the end of this data range, not the end of the folio.
+	 */
+	*punch_start_byte = min_t(loff_t, end_byte,
+			folio_next_index(folio) << PAGE_SHIFT);
+
+	return ret;
+
+out_err:
+	folio_unlock(folio);
+	folio_put(folio);
+	return ret;
+
+}
+
 /*
  * Scan the data range passed to us for dirty page cache folios. If we find a
  * dirty folio, punch out the preceeding range and update the offset from which
@@ -940,26 +1074,9 @@  static int iomap_write_delalloc_scan(struct inode *inode,
 		}
 
 		/* if dirty, punch up to offset */
-		if (folio_test_dirty(folio)) {
-			if (start_byte > *punch_start_byte) {
-				int	error;
-
-				error = punch(inode, *punch_start_byte,
-						start_byte - *punch_start_byte);
-				if (error) {
-					folio_unlock(folio);
-					folio_put(folio);
-					return error;
-				}
-			}
-
-			/*
-			 * Make sure the next punch start is correctly bound to
-			 * the end of this data range, not the end of the folio.
-			 */
-			*punch_start_byte = min_t(loff_t, end_byte,
-					folio_next_index(folio) << PAGE_SHIFT);
-		}
+		if (folio_test_dirty(folio))
+			iomap_write_delalloc_punch(inode, folio, punch_start_byte,
+					   start_byte, end_byte, punch);
 
 		/* move offset to start of next folio in range */
 		start_byte = folio_next_index(folio) << PAGE_SHIFT;
@@ -1641,7 +1758,7 @@  iomap_writepage_map(struct iomap_writepage_ctx *wpc,
 		struct writeback_control *wbc, struct inode *inode,
 		struct folio *folio, u64 end_pos)
 {
-	struct iomap_page *iop = iomap_iop_alloc(inode, folio, 0);
+	struct iomap_page *iop = to_iomap_page(folio);
 	struct iomap_ioend *ioend, *next;
 	unsigned len = i_blocksize(inode);
 	unsigned nblocks = i_blocks_per_folio(inode, folio);
@@ -1649,6 +1766,11 @@  iomap_writepage_map(struct iomap_writepage_ctx *wpc,
 	int error = 0, count = 0, i;
 	LIST_HEAD(submit_list);
 
+	if (!iop && nblocks > 1) {
+		iop = iomap_iop_alloc(inode, folio, 0);
+		iomap_iop_set_range_dirty(inode, folio, 0, folio_size(folio));
+	}
+
 	WARN_ON_ONCE(iop && atomic_read(&iop->write_bytes_pending) != 0);
 
 	/*
@@ -1657,7 +1779,7 @@  iomap_writepage_map(struct iomap_writepage_ctx *wpc,
 	 * invalid, grab a new one.
 	 */
 	for (i = 0; i < nblocks && pos < end_pos; i++, pos += len) {
-		if (iop && !iop_test_block_uptodate(folio, i))
+		if (iop && !iop_test_block_dirty(folio, i))
 			continue;
 
 		error = wpc->ops->map_blocks(wpc, inode, pos);
@@ -1701,6 +1823,8 @@  iomap_writepage_map(struct iomap_writepage_ctx *wpc,
 		}
 	}
 
+	iomap_iop_clear_range_dirty(inode, folio, 0,
+				    end_pos - folio_pos(folio));
 	folio_start_writeback(folio);
 	folio_unlock(folio);
 
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 2ef78aa1d3f6..77c7332ae197 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -578,7 +578,7 @@  const struct address_space_operations xfs_address_space_operations = {
 	.read_folio		= xfs_vm_read_folio,
 	.readahead		= xfs_vm_readahead,
 	.writepages		= xfs_vm_writepages,
-	.dirty_folio		= filemap_dirty_folio,
+	.dirty_folio		= iomap_dirty_folio,
 	.release_folio		= iomap_release_folio,
 	.invalidate_folio	= iomap_invalidate_folio,
 	.bmap			= xfs_vm_bmap,
diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c
index 132f01d3461f..e508c8e97372 100644
--- a/fs/zonefs/file.c
+++ b/fs/zonefs/file.c
@@ -175,7 +175,7 @@  const struct address_space_operations zonefs_file_aops = {
 	.read_folio		= zonefs_read_folio,
 	.readahead		= zonefs_readahead,
 	.writepages		= zonefs_writepages,
-	.dirty_folio		= filemap_dirty_folio,
+	.dirty_folio		= iomap_dirty_folio,
 	.release_folio		= iomap_release_folio,
 	.invalidate_folio	= iomap_invalidate_folio,
 	.migrate_folio		= filemap_migrate_folio,
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index e2b836c2e119..eb9335c46bf3 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -264,6 +264,7 @@  bool iomap_is_partially_uptodate(struct folio *, size_t from, size_t count);
 struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos);
 bool iomap_release_folio(struct folio *folio, gfp_t gfp_flags);
 void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len);
+bool iomap_dirty_folio(struct address_space *mapping, struct folio *folio);
 int iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len,
 		const struct iomap_ops *ops);
 int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len,