diff mbox series

[RFCv3,3/3] iomap: Support subpage size dirty tracking to improve write performance

Message ID 9650ef88e09c6227b99bb5793eef2b8e47994c7d.1677428795.git.ritesh.list@gmail.com (mailing list archive)
State New, archived
Headers show
Series iomap: Add support for subpage dirty state tracking to improve write performance | expand

Commit Message

Ritesh Harjani (IBM) Feb. 26, 2023, 7:43 p.m. UTC
On a 64k pagesize platforms (specially Power and/or aarch64) with 4k
filesystem blocksize, this patch should improve the performance by doing
only the subpage dirty data write.

This should also reduce the write amplification since we can now track
subpage dirty status within state bitmaps. Earlier we had to
write the entire 64k page even if only a part of it (e.g. 4k) was
updated.

Performance testing of below fio workload reveals ~16x performance
improvement on nvme with XFS (4k blocksize) on Power (64K pagesize)
FIO reported write bw scores improved from around ~28 MBps to ~452 MBps.

1. <test_randwrite.fio>
[global]
	ioengine=psync
	rw=randwrite
	overwrite=1
	pre_read=1
	direct=0
	bs=4k
	size=1G
	dir=./
	numjobs=8
	fdatasync=1
	runtime=60
	iodepth=64
	group_reporting=1

[fio-run]

2. Also our internal performance team reported that this patch improves there
   database workload performance by around ~83% (with XFS on Power)

Reported-by: Aravinda Herle <araherle@in.ibm.com>
Signed-off-by: Ritesh Harjani (IBM) <ritesh.list@gmail.com>
---
 fs/gfs2/aops.c         |   2 +-
 fs/iomap/buffered-io.c | 104 +++++++++++++++++++++++++++++++++++++----
 fs/xfs/xfs_aops.c      |   2 +-
 fs/zonefs/super.c      |   2 +-
 include/linux/iomap.h  |   1 +
 5 files changed, 99 insertions(+), 12 deletions(-)

--
2.39.2

Comments

Dave Chinner Feb. 26, 2023, 11:48 p.m. UTC | #1
On Mon, Feb 27, 2023 at 01:13:32AM +0530, Ritesh Harjani (IBM) wrote:
> On a 64k pagesize platforms (specially Power and/or aarch64) with 4k
> filesystem blocksize, this patch should improve the performance by doing
> only the subpage dirty data write.
> 
> This should also reduce the write amplification since we can now track
> subpage dirty status within state bitmaps. Earlier we had to
> write the entire 64k page even if only a part of it (e.g. 4k) was
> updated.
> 
> Performance testing of below fio workload reveals ~16x performance
> improvement on nvme with XFS (4k blocksize) on Power (64K pagesize)
> FIO reported write bw scores improved from around ~28 MBps to ~452 MBps.
> 
> 1. <test_randwrite.fio>
> [global]
> 	ioengine=psync
> 	rw=randwrite
> 	overwrite=1
> 	pre_read=1
> 	direct=0
> 	bs=4k
> 	size=1G
> 	dir=./
> 	numjobs=8
> 	fdatasync=1
> 	runtime=60
> 	iodepth=64
> 	group_reporting=1
> 
> [fio-run]
> 
> 2. Also our internal performance team reported that this patch improves there
>    database workload performance by around ~83% (with XFS on Power)
> 
> Reported-by: Aravinda Herle <araherle@in.ibm.com>
> Signed-off-by: Ritesh Harjani (IBM) <ritesh.list@gmail.com>
> ---
>  fs/gfs2/aops.c         |   2 +-
>  fs/iomap/buffered-io.c | 104 +++++++++++++++++++++++++++++++++++++----
>  fs/xfs/xfs_aops.c      |   2 +-
>  fs/zonefs/super.c      |   2 +-
>  include/linux/iomap.h  |   1 +
>  5 files changed, 99 insertions(+), 12 deletions(-)
> 
> diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
> index e782b4f1d104..b9c35288a5eb 100644
> --- a/fs/gfs2/aops.c
> +++ b/fs/gfs2/aops.c
> @@ -741,7 +741,7 @@ static const struct address_space_operations gfs2_aops = {
>  	.writepages = gfs2_writepages,
>  	.read_folio = gfs2_read_folio,
>  	.readahead = gfs2_readahead,
> -	.dirty_folio = filemap_dirty_folio,
> +	.dirty_folio = iomap_dirty_folio,
>  	.release_folio = iomap_release_folio,
>  	.invalidate_folio = iomap_invalidate_folio,
>  	.bmap = gfs2_bmap,
> diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
> index e0b0be16278e..fb55183c547f 100644
> --- a/fs/iomap/buffered-io.c
> +++ b/fs/iomap/buffered-io.c
> @@ -44,8 +44,8 @@ static inline struct iomap_page *to_iomap_page(struct folio *folio)
>  static struct bio_set iomap_ioend_bioset;
> 
>  /*
> - * Accessor functions for setting/clearing/checking uptodate bits in
> - * iop->state bitmap.
> + * Accessor functions for setting/clearing/checking uptodate and
> + * dirty bits in iop->state bitmap.
>   * nrblocks is i_blocks_per_folio() which is passed in every
>   * function as the last argument for API consistency.
>   */
> @@ -75,8 +75,29 @@ static inline bool iop_full_uptodate(struct iomap_page *iop,
>  	return bitmap_full(iop->state, nrblocks);
>  }
> 
> +static inline void iop_set_range_dirty(struct iomap_page *iop,
> +				unsigned int start, unsigned int len,
> +				unsigned int nrblocks)
> +{
> +	bitmap_set(iop->state, start + nrblocks, len);
> +}
> +
> +static inline void iop_clear_range_dirty(struct iomap_page *iop,
> +				unsigned int start, unsigned int len,
> +				unsigned int nrblocks)
> +{
> +	bitmap_clear(iop->state, start + nrblocks, len);
> +}
> +
> +static inline bool iop_test_dirty(struct iomap_page *iop, unsigned int pos,
> +			     unsigned int nrblocks)
> +{
> +	return test_bit(pos + nrblocks, iop->state);
> +}
> +
>  static struct iomap_page *
> -iomap_page_create(struct inode *inode, struct folio *folio, unsigned int flags)
> +iomap_page_create(struct inode *inode, struct folio *folio, unsigned int flags,
> +		  bool is_dirty)
>  {
>  	struct iomap_page *iop = to_iomap_page(folio);
>  	unsigned int nr_blocks = i_blocks_per_folio(inode, folio);
> @@ -90,12 +111,18 @@ iomap_page_create(struct inode *inode, struct folio *folio, unsigned int flags)
>  	else
>  		gfp = GFP_NOFS | __GFP_NOFAIL;
> 
> -	iop = kzalloc(struct_size(iop, state, BITS_TO_LONGS(nr_blocks)),
> +	/*
> +	 * iop->state tracks 2 types of bitmaps i.e. uptodate & dirty
> +	 * for bs < ps.
> +	 */

PLease write comments out in full. Also "bs < ps" is actually
wrong because we have large folios in the page cache and they will
need to use sub-folio state tracking if they are dirtied.

	/*
	 * iop->state tracks two sets of state flags when the
	 * filesystem block size is smaller than the folio size.
	 * state. The first tracks per-filesystem block uptodate
	 * state, the second tracks per-filesystem block dirty
	 * state.
	 */

> +	iop = kzalloc(struct_size(iop, state, BITS_TO_LONGS(2 * nr_blocks)),
>  		      gfp);
>  	if (iop) {
>  		spin_lock_init(&iop->state_lock);
>  		if (folio_test_uptodate(folio))
>  			iop_set_range_uptodate(iop, 0, nr_blocks, nr_blocks);
> +		if (is_dirty)
> +			iop_set_range_dirty(iop, 0, nr_blocks, nr_blocks);
>  		folio_attach_private(folio, iop);
>  	}
>  	return iop;
> @@ -202,6 +229,48 @@ static void iomap_set_range_uptodate(struct folio *folio,
>  		folio_mark_uptodate(folio);
>  }
> 
> +static void iomap_iop_set_range_dirty(struct folio *folio,
> +		struct iomap_page *iop, size_t off, size_t len)
> +{
> +	struct inode *inode = folio->mapping->host;
> +	unsigned int nr_blocks = i_blocks_per_folio(inode, folio);
> +	unsigned first = (off >> inode->i_blkbits);
> +	unsigned last = ((off + len - 1) >> inode->i_blkbits);

first_bit, last_bit if we are leaving this code unchanged.


> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&iop->state_lock, flags);
> +	iop_set_range_dirty(iop, first, last - first + 1, nr_blocks);
                                        ^^^^^^^^^^^^^^^^ nr_bits

I dislike all the magic "- 1" and "+ 1" sprinkles that end up in
this code because of the closed ranges nomenclature infecting this
code. If we use closed start/open ended ranges like we do all
through XFS, we have:

offset_to_start_bit()
{
	return off >> bits;			// round_down
}

offset_to_end_bit()
{
	return (off + (1 << bits) - 1) >> bits; // round_up
}

	unsigned start_bit = offset_to_start_bit(off, inode->i_blkbits);
	unsigned end_bit = offset_to_end_bit(off + len, inode->i_blkbits);
	unsigned nr_bits = end_bit - start_bit;

	iop_set_range_dirty(iop, start_bit, nr_bits, nr_blocks);

And we don't have to add magic "handle the off by one"
sprinkles everywhere that maths on closed ranges requires to get
correct...


> +	spin_unlock_irqrestore(&iop->state_lock, flags);
> +}
> +
> +static void iomap_set_range_dirty(struct folio *folio,
> +		struct iomap_page *iop, size_t off, size_t len)
> +{
> +	if (iop)
> +		iomap_iop_set_range_dirty(folio, iop, off, len);
> +}
> +
> +static void iomap_iop_clear_range_dirty(struct folio *folio,
> +		struct iomap_page *iop, size_t off, size_t len)
> +{
> +	struct inode *inode = folio->mapping->host;
> +	unsigned int nr_blocks = i_blocks_per_folio(inode, folio);
> +	unsigned first = (off >> inode->i_blkbits);
> +	unsigned last = ((off + len - 1) >> inode->i_blkbits);
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&iop->state_lock, flags);
> +	iop_clear_range_dirty(iop, first, last - first + 1, nr_blocks);
> +	spin_unlock_irqrestore(&iop->state_lock, flags);

Same here with the magic off by one sprinkles.

FWIW, this is exactly the same code as
iomap_iop_clear_range_uptodate(), is it not? The only difference is
the offset into the state bitmap and that is done by
iop_clear_range_dirty() vs iop_clear_range_uptodate()? Seems like a
layer of wrappers could be taken out of here simply by placing the
start offset correctly here for a common iop_clear_range() helper...

> +static void iomap_clear_range_dirty(struct folio *folio,
> +		struct iomap_page *iop, size_t off, size_t len)
> +{
> +	if (iop)
> +		iomap_iop_clear_range_dirty(folio, iop, off, len);
> +}

Or even here at this layer, and we squash out two layers of
very similar wrappers.

-Dave.
Matthew Wilcox Feb. 26, 2023, 11:56 p.m. UTC | #2
On Mon, Feb 27, 2023 at 10:48:14AM +1100, Dave Chinner wrote:
> > +static void iomap_iop_set_range_dirty(struct folio *folio,
> > +		struct iomap_page *iop, size_t off, size_t len)
> > +{
> > +	struct inode *inode = folio->mapping->host;
> > +	unsigned int nr_blocks = i_blocks_per_folio(inode, folio);
> > +	unsigned first = (off >> inode->i_blkbits);
> > +	unsigned last = ((off + len - 1) >> inode->i_blkbits);
> 
> first_bit, last_bit if we are leaving this code unchanged.

first_blk, last_blk, surely?
Dave Chinner Feb. 27, 2023, 12:10 a.m. UTC | #3
On Sun, Feb 26, 2023 at 11:56:05PM +0000, Matthew Wilcox wrote:
> On Mon, Feb 27, 2023 at 10:48:14AM +1100, Dave Chinner wrote:
> > > +static void iomap_iop_set_range_dirty(struct folio *folio,
> > > +		struct iomap_page *iop, size_t off, size_t len)
> > > +{
> > > +	struct inode *inode = folio->mapping->host;
> > > +	unsigned int nr_blocks = i_blocks_per_folio(inode, folio);
> > > +	unsigned first = (off >> inode->i_blkbits);
> > > +	unsigned last = ((off + len - 1) >> inode->i_blkbits);
> > 
> > first_bit, last_bit if we are leaving this code unchanged.
> 
> first_blk, last_blk, surely?

Probably.

And that's entirely my point - the variable names need to
describe what they contain...

-Dave.
Brian Foster March 2, 2023, 2:02 p.m. UTC | #4
On Mon, Feb 27, 2023 at 01:13:32AM +0530, Ritesh Harjani (IBM) wrote:
> On a 64k pagesize platforms (specially Power and/or aarch64) with 4k
> filesystem blocksize, this patch should improve the performance by doing
> only the subpage dirty data write.
> 
> This should also reduce the write amplification since we can now track
> subpage dirty status within state bitmaps. Earlier we had to
> write the entire 64k page even if only a part of it (e.g. 4k) was
> updated.
> 
> Performance testing of below fio workload reveals ~16x performance
> improvement on nvme with XFS (4k blocksize) on Power (64K pagesize)
> FIO reported write bw scores improved from around ~28 MBps to ~452 MBps.
> 
> 1. <test_randwrite.fio>
> [global]
> 	ioengine=psync
> 	rw=randwrite
> 	overwrite=1
> 	pre_read=1
> 	direct=0
> 	bs=4k
> 	size=1G
> 	dir=./
> 	numjobs=8
> 	fdatasync=1
> 	runtime=60
> 	iodepth=64
> 	group_reporting=1
> 
> [fio-run]
> 
> 2. Also our internal performance team reported that this patch improves there
>    database workload performance by around ~83% (with XFS on Power)
> 
> Reported-by: Aravinda Herle <araherle@in.ibm.com>
> Signed-off-by: Ritesh Harjani (IBM) <ritesh.list@gmail.com>
> ---
>  fs/gfs2/aops.c         |   2 +-
>  fs/iomap/buffered-io.c | 104 +++++++++++++++++++++++++++++++++++++----
>  fs/xfs/xfs_aops.c      |   2 +-
>  fs/zonefs/super.c      |   2 +-
>  include/linux/iomap.h  |   1 +
>  5 files changed, 99 insertions(+), 12 deletions(-)
> 
...
> diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
> index e0b0be16278e..fb55183c547f 100644
> --- a/fs/iomap/buffered-io.c
> +++ b/fs/iomap/buffered-io.c
...
> @@ -1630,7 +1715,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
>  		struct writeback_control *wbc, struct inode *inode,
>  		struct folio *folio, u64 end_pos)
>  {
> -	struct iomap_page *iop = iomap_page_create(inode, folio, 0);
> +	struct iomap_page *iop = iomap_page_create(inode, folio, 0, true);
>  	struct iomap_ioend *ioend, *next;
>  	unsigned len = i_blocksize(inode);
>  	unsigned nblocks = i_blocks_per_folio(inode, folio);
> @@ -1646,7 +1731,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
>  	 * invalid, grab a new one.
>  	 */
>  	for (i = 0; i < nblocks && pos < end_pos; i++, pos += len) {
> -		if (iop && !iop_test_uptodate(iop, i, nblocks))
> +		if (iop && !iop_test_dirty(iop, i, nblocks))
>  			continue;
> 
>  		error = wpc->ops->map_blocks(wpc, inode, pos);

Hi Ritesh,

I'm not sure if you followed any of the discussion on the imap
revalidation series that landed in the last cycle or so, but the
associated delalloc punch error handling code has a subtle dependency on
current writeback behavior and thus left a bit of a landmine for this
work. For reference, more detailed discussion starts around here [1].
The context is basically that the use of mapping seek hole/data relies
on uptodate status, which means in certain error cases the filesystem
might allocate a delalloc block for a write, but not punch it out of the
associated write happens to fail and the underlying portion of the folio
was uptodate.

This doesn't cause a problem in current mainline because writeback maps
every uptodate block in a dirty folio, and so the delalloc block will
convert at writeback time even though it wasn't written. This no longer
occurs with the change above, which means there's a vector for a stale
delalloc block to be left around in the inode. This is a free space
accounting corruption issue on XFS. Here's a quick example [2] on a 1k
FSB XFS filesystem to show exactly what I mean:

# xfs_io -fc "truncate 4k" -c "mmap 0 4k" -c "mread 0 4k" -c "pwrite 0 1" -c "pwrite -f 2k 1" -c fsync /mnt/file
# xfs_io -c "fiemap -v" /mnt/file 
/mnt/file:
 EXT: FILE-OFFSET      BLOCK-RANGE      TOTAL FLAGS
...
   2: [4..5]:          0..1                 2   0x7
...
(the above shows delalloc after an fsync)
# umount /mnt
  kernel:XFS: Assertion failed: xfs_is_shutdown(mp) || percpu_counter_sum(&mp->m_delalloc_blks) == 0, file: fs/xfs/xfs_super.c, line: 1068
# xfs_repair -n /dev/vdb2 
Phase 1 - find and verify superblock...
Phase 2 - using internal log
...
sb_fdblocks 20960187, counted 20960195
...
#

I suspect this means either the error handling code needs to be updated
to consider dirty state (i.e. punch delalloc if the block is !dirty), or
otherwise this needs to depend on a broader change in XFS to reclaim
delalloc blocks before inode eviction (along the lines of Dave's recent
proposal to do something like that for corrupted inodes). Of course the
caveat with the latter is that doesn't help for any other filesystems
(?) that might have similar expectations for delayed allocation and want
to use iomap.

Brian

[1] https://lore.kernel.org/linux-fsdevel/Y3TsPzd0XzXXIzQv@bfoster/

[2] This test case depends on a local xfs_io hack to co-opt the -f flag
into inducing a write failure. A POC patch for that is available here,
if you wanted to replicate:

https://lore.kernel.org/linux-xfs/20221123181322.3710820-1-bfoster@redhat.com/
Ritesh Harjani (IBM) April 26, 2023, 9:57 a.m. UTC | #5
Brian Foster <bfoster@redhat.com> writes:

> On Mon, Feb 27, 2023 at 01:13:32AM +0530, Ritesh Harjani (IBM) wrote:
>> On a 64k pagesize platforms (specially Power and/or aarch64) with 4k
>> filesystem blocksize, this patch should improve the performance by doing
>> only the subpage dirty data write.
>>
>> This should also reduce the write amplification since we can now track
>> subpage dirty status within state bitmaps. Earlier we had to
>> write the entire 64k page even if only a part of it (e.g. 4k) was
>> updated.
>>
>> Performance testing of below fio workload reveals ~16x performance
>> improvement on nvme with XFS (4k blocksize) on Power (64K pagesize)
>> FIO reported write bw scores improved from around ~28 MBps to ~452 MBps.
>>
>> 1. <test_randwrite.fio>
>> [global]
>> 	ioengine=psync
>> 	rw=randwrite
>> 	overwrite=1
>> 	pre_read=1
>> 	direct=0
>> 	bs=4k
>> 	size=1G
>> 	dir=./
>> 	numjobs=8
>> 	fdatasync=1
>> 	runtime=60
>> 	iodepth=64
>> 	group_reporting=1
>>
>> [fio-run]
>>
>> 2. Also our internal performance team reported that this patch improves there
>>    database workload performance by around ~83% (with XFS on Power)
>>
>> Reported-by: Aravinda Herle <araherle@in.ibm.com>
>> Signed-off-by: Ritesh Harjani (IBM) <ritesh.list@gmail.com>
>> ---
>>  fs/gfs2/aops.c         |   2 +-
>>  fs/iomap/buffered-io.c | 104 +++++++++++++++++++++++++++++++++++++----
>>  fs/xfs/xfs_aops.c      |   2 +-
>>  fs/zonefs/super.c      |   2 +-
>>  include/linux/iomap.h  |   1 +
>>  5 files changed, 99 insertions(+), 12 deletions(-)
>>
> ...
>> diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
>> index e0b0be16278e..fb55183c547f 100644
>> --- a/fs/iomap/buffered-io.c
>> +++ b/fs/iomap/buffered-io.c
> ...
>> @@ -1630,7 +1715,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
>>  		struct writeback_control *wbc, struct inode *inode,
>>  		struct folio *folio, u64 end_pos)
>>  {
>> -	struct iomap_page *iop = iomap_page_create(inode, folio, 0);
>> +	struct iomap_page *iop = iomap_page_create(inode, folio, 0, true);
>>  	struct iomap_ioend *ioend, *next;
>>  	unsigned len = i_blocksize(inode);
>>  	unsigned nblocks = i_blocks_per_folio(inode, folio);
>> @@ -1646,7 +1731,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
>>  	 * invalid, grab a new one.
>>  	 */
>>  	for (i = 0; i < nblocks && pos < end_pos; i++, pos += len) {
>> -		if (iop && !iop_test_uptodate(iop, i, nblocks))
>> +		if (iop && !iop_test_dirty(iop, i, nblocks))
>>  			continue;
>>
>>  		error = wpc->ops->map_blocks(wpc, inode, pos);
>
> Hi Ritesh,
>
> I'm not sure if you followed any of the discussion on the imap
> revalidation series that landed in the last cycle or so, but the
> associated delalloc punch error handling code has a subtle dependency on
> current writeback behavior and thus left a bit of a landmine for this
> work. For reference, more detailed discussion starts around here [1].
> The context is basically that the use of mapping seek hole/data relies
> on uptodate status, which means in certain error cases the filesystem
> might allocate a delalloc block for a write, but not punch it out of the
> associated write happens to fail and the underlying portion of the folio
> was uptodate.
>
> This doesn't cause a problem in current mainline because writeback maps
> every uptodate block in a dirty folio, and so the delalloc block will
> convert at writeback time even though it wasn't written. This no longer
> occurs with the change above, which means there's a vector for a stale
> delalloc block to be left around in the inode. This is a free space
> accounting corruption issue on XFS. Here's a quick example [2] on a 1k
> FSB XFS filesystem to show exactly what I mean:
>
> # xfs_io -fc "truncate 4k" -c "mmap 0 4k" -c "mread 0 4k" -c "pwrite 0 1" -c "pwrite -f 2k 1" -c fsync /mnt/file
> # xfs_io -c "fiemap -v" /mnt/file
> /mnt/file:
>  EXT: FILE-OFFSET      BLOCK-RANGE      TOTAL FLAGS
> ...
>    2: [4..5]:          0..1                 2   0x7
> ...
> (the above shows delalloc after an fsync)
> # umount /mnt
>   kernel:XFS: Assertion failed: xfs_is_shutdown(mp) || percpu_counter_sum(&mp->m_delalloc_blks) == 0, file: fs/xfs/xfs_super.c, line: 1068
> # xfs_repair -n /dev/vdb2
> Phase 1 - find and verify superblock...
> Phase 2 - using internal log
> ...
> sb_fdblocks 20960187, counted 20960195
> ...
> #
>
> I suspect this means either the error handling code needs to be updated
> to consider dirty state (i.e. punch delalloc if the block is !dirty), or
> otherwise this needs to depend on a broader change in XFS to reclaim
> delalloc blocks before inode eviction (along the lines of Dave's recent
> proposal to do something like that for corrupted inodes). Of course the
> caveat with the latter is that doesn't help for any other filesystems
> (?) that might have similar expectations for delayed allocation and want
> to use iomap.
>
> Brian
>
> [1] https://lore.kernel.org/linux-fsdevel/Y3TsPzd0XzXXIzQv@bfoster/
>
> [2] This test case depends on a local xfs_io hack to co-opt the -f flag
> into inducing a write failure. A POC patch for that is available here,
> if you wanted to replicate:
>
> https://lore.kernel.org/linux-xfs/20221123181322.3710820-1-bfoster@redhat.com/

Thanks a lot Brian for such detailed info along with a test case.
Really appreciate your insights on this! :)
And apologies for not getting to it earlier. I picked up iomap DIO conversion for
ext2 in between. Now that it is settled, I will be working on addressing
this problem.

Yes, I did follow that patch series which you pointed, but I guess
I mostly thought it was the stale iomap problem which we were solving.
But you are right the approach we take in that patch series to punch out
the delalloc blocks in case of short writes does open up a potential bug
when we add subpage size dirty tracking in iomap, which was also
discussed during the time and I had marked it as well. But I guess I had
completely forgotten about it. So thanks for bringing it up again.

I looked into the Dave's series and what we are trying to do there is
identifying uptodate folios within the [start, end) range and when if
there is a dirty folio within that range, then we punch out all of the
data blocks from *punch_start_byte uptil before this dirty folio byte.
(because these dirty folios anyways contain uptodate dirty data which
can be written back at the time of writeback. Any uptodate non-dirty
folios can be punched out to ensure we don't leak any delalloc blocks
due to short writes).
The only problem here is...when we have subpage size blocks within a
dirty folio, it can still have subblocks within the folio which are
only marked uptodate but not dirty.
Given that we never punch out these uptodate non-dirty subblocks
from within this folio, this can leave a potential delalloc blocks leak
because at the time of writeback we only considers writing subblocks
of a folio which are marked dirty (in this patch series which adds
subpage size dirty tracking). This can then cause bug on problems as you
pointed out during unmount.

I have a short fix for this problem i.e. similar to how we do writeback i.e.
to consider the dirty state of every subblock within a folio, during
punch out operation in iomap_write_delalloc_release() ->
iomap_write_delalloc_scan().
So when we identify a dirty folio in above delalloc release path, we
should iterate over each subblock of that folio as well to punch out non-dirty
blocks. This should make sure that we don't have any uptodate non-dirty
subblocks in the entire given range from [start, end).

I have also addressed many of the other smaller review comments from
others in v3. I will include this change as well will send a new
revision for review (after some cleanups and some initial testing).

Thanks!
-ritesh
diff mbox series

Patch

diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index e782b4f1d104..b9c35288a5eb 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -741,7 +741,7 @@  static const struct address_space_operations gfs2_aops = {
 	.writepages = gfs2_writepages,
 	.read_folio = gfs2_read_folio,
 	.readahead = gfs2_readahead,
-	.dirty_folio = filemap_dirty_folio,
+	.dirty_folio = iomap_dirty_folio,
 	.release_folio = iomap_release_folio,
 	.invalidate_folio = iomap_invalidate_folio,
 	.bmap = gfs2_bmap,
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index e0b0be16278e..fb55183c547f 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -44,8 +44,8 @@  static inline struct iomap_page *to_iomap_page(struct folio *folio)
 static struct bio_set iomap_ioend_bioset;

 /*
- * Accessor functions for setting/clearing/checking uptodate bits in
- * iop->state bitmap.
+ * Accessor functions for setting/clearing/checking uptodate and
+ * dirty bits in iop->state bitmap.
  * nrblocks is i_blocks_per_folio() which is passed in every
  * function as the last argument for API consistency.
  */
@@ -75,8 +75,29 @@  static inline bool iop_full_uptodate(struct iomap_page *iop,
 	return bitmap_full(iop->state, nrblocks);
 }

+static inline void iop_set_range_dirty(struct iomap_page *iop,
+				unsigned int start, unsigned int len,
+				unsigned int nrblocks)
+{
+	bitmap_set(iop->state, start + nrblocks, len);
+}
+
+static inline void iop_clear_range_dirty(struct iomap_page *iop,
+				unsigned int start, unsigned int len,
+				unsigned int nrblocks)
+{
+	bitmap_clear(iop->state, start + nrblocks, len);
+}
+
+static inline bool iop_test_dirty(struct iomap_page *iop, unsigned int pos,
+			     unsigned int nrblocks)
+{
+	return test_bit(pos + nrblocks, iop->state);
+}
+
 static struct iomap_page *
-iomap_page_create(struct inode *inode, struct folio *folio, unsigned int flags)
+iomap_page_create(struct inode *inode, struct folio *folio, unsigned int flags,
+		  bool is_dirty)
 {
 	struct iomap_page *iop = to_iomap_page(folio);
 	unsigned int nr_blocks = i_blocks_per_folio(inode, folio);
@@ -90,12 +111,18 @@  iomap_page_create(struct inode *inode, struct folio *folio, unsigned int flags)
 	else
 		gfp = GFP_NOFS | __GFP_NOFAIL;

-	iop = kzalloc(struct_size(iop, state, BITS_TO_LONGS(nr_blocks)),
+	/*
+	 * iop->state tracks 2 types of bitmaps i.e. uptodate & dirty
+	 * for bs < ps.
+	 */
+	iop = kzalloc(struct_size(iop, state, BITS_TO_LONGS(2 * nr_blocks)),
 		      gfp);
 	if (iop) {
 		spin_lock_init(&iop->state_lock);
 		if (folio_test_uptodate(folio))
 			iop_set_range_uptodate(iop, 0, nr_blocks, nr_blocks);
+		if (is_dirty)
+			iop_set_range_dirty(iop, 0, nr_blocks, nr_blocks);
 		folio_attach_private(folio, iop);
 	}
 	return iop;
@@ -202,6 +229,48 @@  static void iomap_set_range_uptodate(struct folio *folio,
 		folio_mark_uptodate(folio);
 }

+static void iomap_iop_set_range_dirty(struct folio *folio,
+		struct iomap_page *iop, size_t off, size_t len)
+{
+	struct inode *inode = folio->mapping->host;
+	unsigned int nr_blocks = i_blocks_per_folio(inode, folio);
+	unsigned first = (off >> inode->i_blkbits);
+	unsigned last = ((off + len - 1) >> inode->i_blkbits);
+	unsigned long flags;
+
+	spin_lock_irqsave(&iop->state_lock, flags);
+	iop_set_range_dirty(iop, first, last - first + 1, nr_blocks);
+	spin_unlock_irqrestore(&iop->state_lock, flags);
+}
+
+static void iomap_set_range_dirty(struct folio *folio,
+		struct iomap_page *iop, size_t off, size_t len)
+{
+	if (iop)
+		iomap_iop_set_range_dirty(folio, iop, off, len);
+}
+
+static void iomap_iop_clear_range_dirty(struct folio *folio,
+		struct iomap_page *iop, size_t off, size_t len)
+{
+	struct inode *inode = folio->mapping->host;
+	unsigned int nr_blocks = i_blocks_per_folio(inode, folio);
+	unsigned first = (off >> inode->i_blkbits);
+	unsigned last = ((off + len - 1) >> inode->i_blkbits);
+	unsigned long flags;
+
+	spin_lock_irqsave(&iop->state_lock, flags);
+	iop_clear_range_dirty(iop, first, last - first + 1, nr_blocks);
+	spin_unlock_irqrestore(&iop->state_lock, flags);
+}
+
+static void iomap_clear_range_dirty(struct folio *folio,
+		struct iomap_page *iop, size_t off, size_t len)
+{
+	if (iop)
+		iomap_iop_clear_range_dirty(folio, iop, off, len);
+}
+
 static void iomap_finish_folio_read(struct folio *folio, size_t offset,
 		size_t len, int error)
 {
@@ -265,7 +334,8 @@  static int iomap_read_inline_data(const struct iomap_iter *iter,
 	if (WARN_ON_ONCE(size > iomap->length))
 		return -EIO;
 	if (offset > 0)
-		iop = iomap_page_create(iter->inode, folio, iter->flags);
+		iop = iomap_page_create(iter->inode, folio, iter->flags,
+					folio_test_dirty(folio));
 	else
 		iop = to_iomap_page(folio);

@@ -303,7 +373,8 @@  static loff_t iomap_readpage_iter(const struct iomap_iter *iter,
 		return iomap_read_inline_data(iter, folio);

 	/* zero post-eof blocks as the page may be mapped */
-	iop = iomap_page_create(iter->inode, folio, iter->flags);
+	iop = iomap_page_create(iter->inode, folio, iter->flags,
+				folio_test_dirty(folio));
 	iomap_adjust_read_range(iter->inode, folio, &pos, length, &poff, &plen);
 	if (plen == 0)
 		goto done;
@@ -532,6 +603,18 @@  void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len)
 }
 EXPORT_SYMBOL_GPL(iomap_invalidate_folio);

+bool iomap_dirty_folio(struct address_space *mapping, struct folio *folio)
+{
+	unsigned int nr_blocks = i_blocks_per_folio(mapping->host, folio);
+	struct iomap_page *iop;
+
+	iop = iomap_page_create(mapping->host, folio, 0, false);
+	iomap_set_range_dirty(folio, iop, 0,
+			      nr_blocks << mapping->host->i_blkbits);
+	return filemap_dirty_folio(mapping, folio);
+}
+EXPORT_SYMBOL_GPL(iomap_dirty_folio);
+
 static void
 iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
 {
@@ -574,7 +657,8 @@  static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
 	    pos + len >= folio_pos(folio) + folio_size(folio))
 		return 0;

-	iop = iomap_page_create(iter->inode, folio, iter->flags);
+	iop = iomap_page_create(iter->inode, folio, iter->flags,
+				folio_test_dirty(folio));

 	if (folio_test_uptodate(folio))
 		return 0;
@@ -726,6 +810,7 @@  static size_t __iomap_write_end(struct inode *inode, loff_t pos, size_t len,
 	if (unlikely(copied < len && !folio_test_uptodate(folio)))
 		return 0;
 	iomap_set_range_uptodate(folio, iop, offset_in_folio(folio, pos), len);
+	iomap_set_range_dirty(folio, iop, offset_in_folio(folio, pos), len);
 	filemap_dirty_folio(inode->i_mapping, folio);
 	return copied;
 }
@@ -1630,7 +1715,7 @@  iomap_writepage_map(struct iomap_writepage_ctx *wpc,
 		struct writeback_control *wbc, struct inode *inode,
 		struct folio *folio, u64 end_pos)
 {
-	struct iomap_page *iop = iomap_page_create(inode, folio, 0);
+	struct iomap_page *iop = iomap_page_create(inode, folio, 0, true);
 	struct iomap_ioend *ioend, *next;
 	unsigned len = i_blocksize(inode);
 	unsigned nblocks = i_blocks_per_folio(inode, folio);
@@ -1646,7 +1731,7 @@  iomap_writepage_map(struct iomap_writepage_ctx *wpc,
 	 * invalid, grab a new one.
 	 */
 	for (i = 0; i < nblocks && pos < end_pos; i++, pos += len) {
-		if (iop && !iop_test_uptodate(iop, i, nblocks))
+		if (iop && !iop_test_dirty(iop, i, nblocks))
 			continue;

 		error = wpc->ops->map_blocks(wpc, inode, pos);
@@ -1690,6 +1775,7 @@  iomap_writepage_map(struct iomap_writepage_ctx *wpc,
 		}
 	}

+	iomap_clear_range_dirty(folio, iop, 0, end_pos - folio_pos(folio));
 	folio_start_writeback(folio);
 	folio_unlock(folio);

diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 41734202796f..7e6c54955b4f 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -571,7 +571,7 @@  const struct address_space_operations xfs_address_space_operations = {
 	.read_folio		= xfs_vm_read_folio,
 	.readahead		= xfs_vm_readahead,
 	.writepages		= xfs_vm_writepages,
-	.dirty_folio		= filemap_dirty_folio,
+	.dirty_folio		= iomap_dirty_folio,
 	.release_folio		= iomap_release_folio,
 	.invalidate_folio	= iomap_invalidate_folio,
 	.bmap			= xfs_vm_bmap,
diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index a9c5c3f720ad..4cefc2af87f3 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -267,7 +267,7 @@  static const struct address_space_operations zonefs_file_aops = {
 	.read_folio		= zonefs_read_folio,
 	.readahead		= zonefs_readahead,
 	.writepages		= zonefs_writepages,
-	.dirty_folio		= filemap_dirty_folio,
+	.dirty_folio		= iomap_dirty_folio,
 	.release_folio		= iomap_release_folio,
 	.invalidate_folio	= iomap_invalidate_folio,
 	.migrate_folio		= filemap_migrate_folio,
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 0983dfc9a203..b60562a0b893 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -262,6 +262,7 @@  void iomap_readahead(struct readahead_control *, const struct iomap_ops *ops);
 bool iomap_is_partially_uptodate(struct folio *, size_t from, size_t count);
 bool iomap_release_folio(struct folio *folio, gfp_t gfp_flags);
 void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len);
+bool iomap_dirty_folio(struct address_space *mapping, struct folio *folio);
 int iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len,
 		const struct iomap_ops *ops);
 int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len,