[PATCHv8,5/5] iomap: Add per-block dirty state tracking to improve performance

Message ID 0c3108f6ea45e18c7aae87c2fb3d8fa3311c13a4.1686050333.git.ritesh.list@gmail.com (mailing list archive)
State New, archived
Series iomap: Add support for per-block dirty state to improve write performance

Commit Message

Ritesh Harjani (IBM) June 6, 2023, 11:43 a.m. UTC
When the filesystem blocksize is less than the folio size (either with
mapping_large_folio_support() or with blocksize < pagesize) and the
folio is uptodate in the pagecache, even a single-byte write can cause
the entire folio to be written to disk during writeback. This happens
because we currently have no mechanism to track per-block dirty state
within struct iomap_folio; we only track uptodate state.

This patch implements support for tracking per-block dirty state in
the iomap_folio->state bitmap. This should improve filesystem write
performance and reduce write amplification.

Performance testing of the fio workload below shows a ~16x performance
improvement with nvme and XFS (4k blocksize) on Power (64K pagesize):
FIO-reported write bandwidth improved from ~28 MBps to ~452 MBps.

1. <test_randwrite.fio>
[global]
	ioengine=psync
	rw=randwrite
	overwrite=1
	pre_read=1
	direct=0
	bs=4k
	size=1G
	dir=./
	numjobs=8
	fdatasync=1
	runtime=60
	iodepth=64
	group_reporting=1

[fio-run]
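
(The job file above can be run as, e.g., "fio test_randwrite.fio" from
a directory on the filesystem under test.)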

2. Our internal performance team also reported that this patch improves
   their database workload performance by ~83% (with XFS on Power).

Reported-by: Aravinda Herle <araherle@in.ibm.com>
Reported-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Ritesh Harjani (IBM) <ritesh.list@gmail.com>
---
 fs/gfs2/aops.c         |   2 +-
 fs/iomap/buffered-io.c | 126 +++++++++++++++++++++++++++++++++++++++--
 fs/xfs/xfs_aops.c      |   2 +-
 fs/zonefs/file.c       |   2 +-
 include/linux/iomap.h  |   1 +
 5 files changed, 126 insertions(+), 7 deletions(-)

Comments

Darrick J. Wong June 6, 2023, 3:06 p.m. UTC | #1
On Tue, Jun 06, 2023 at 05:13:52PM +0530, Ritesh Harjani (IBM) wrote:
> When the filesystem blocksize is less than the folio size (either with
> mapping_large_folio_support() or with blocksize < pagesize) and the
> folio is uptodate in the pagecache, even a single-byte write can cause
> the entire folio to be written to disk during writeback. This happens
> because we currently have no mechanism to track per-block dirty state
> within struct iomap_folio; we only track uptodate state.
> 
> This patch implements support for tracking per-block dirty state in
> the iomap_folio->state bitmap. This should improve filesystem write
> performance and reduce write amplification.
> 
> Performance testing of the fio workload below shows a ~16x performance
> improvement with nvme and XFS (4k blocksize) on Power (64K pagesize):
> FIO-reported write bandwidth improved from ~28 MBps to ~452 MBps.
> 
> 1. <test_randwrite.fio>
> [global]
> 	ioengine=psync
> 	rw=randwrite
> 	overwrite=1
> 	pre_read=1
> 	direct=0
> 	bs=4k
> 	size=1G
> 	dir=./
> 	numjobs=8
> 	fdatasync=1
> 	runtime=60
> 	iodepth=64
> 	group_reporting=1
> 
> [fio-run]
> 
> 2. Our internal performance team also reported that this patch improves
>    their database workload performance by ~83% (with XFS on Power).
> 
> Reported-by: Aravinda Herle <araherle@in.ibm.com>
> Reported-by: Brian Foster <bfoster@redhat.com>
> Signed-off-by: Ritesh Harjani (IBM) <ritesh.list@gmail.com>
> ---
>  fs/gfs2/aops.c         |   2 +-
>  fs/iomap/buffered-io.c | 126 +++++++++++++++++++++++++++++++++++++++--
>  fs/xfs/xfs_aops.c      |   2 +-
>  fs/zonefs/file.c       |   2 +-
>  include/linux/iomap.h  |   1 +
>  5 files changed, 126 insertions(+), 7 deletions(-)
> 
> diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
> index a5f4be6b9213..75efec3c3b71 100644
> --- a/fs/gfs2/aops.c
> +++ b/fs/gfs2/aops.c
> @@ -746,7 +746,7 @@ static const struct address_space_operations gfs2_aops = {
>  	.writepages = gfs2_writepages,
>  	.read_folio = gfs2_read_folio,
>  	.readahead = gfs2_readahead,
> -	.dirty_folio = filemap_dirty_folio,
> +	.dirty_folio = iomap_dirty_folio,
>  	.release_folio = iomap_release_folio,
>  	.invalidate_folio = iomap_invalidate_folio,
>  	.bmap = gfs2_bmap,
> diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
> index 2b72ca3ba37a..b99d611f1cd5 100644
> --- a/fs/iomap/buffered-io.c
> +++ b/fs/iomap/buffered-io.c
> @@ -84,6 +84,70 @@ static void iomap_set_range_uptodate(struct folio *folio, size_t off,
>  		folio_mark_uptodate(folio);
>  }
>  
> +static inline bool iomap_iof_is_block_dirty(struct folio *folio,
> +			struct iomap_folio *iof, int block)
> +{
> +	struct inode *inode = folio->mapping->host;
> +	unsigned int blks_per_folio = i_blocks_per_folio(inode, folio);
> +
> +	return test_bit(block + blks_per_folio, iof->state);
> +}
> +
> +static void iomap_iof_clear_range_dirty(struct folio *folio,
> +			struct iomap_folio *iof, size_t off, size_t len)
> +{
> +	struct inode *inode = folio->mapping->host;
> +	unsigned int blks_per_folio = i_blocks_per_folio(inode, folio);
> +	unsigned int first_blk = off >> inode->i_blkbits;
> +	unsigned int last_blk = (off + len - 1) >> inode->i_blkbits;
> +	unsigned int nr_blks = last_blk - first_blk + 1;
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&iof->state_lock, flags);
> +	bitmap_clear(iof->state, first_blk + blks_per_folio, nr_blks);
> +	spin_unlock_irqrestore(&iof->state_lock, flags);
> +}
> +
> +static void iomap_clear_range_dirty(struct folio *folio, size_t off, size_t len)
> +{
> +	struct iomap_folio *iof = iomap_get_iof(folio);
> +
> +	if (!iof)
> +		return;
> +	iomap_iof_clear_range_dirty(folio, iof, off, len);
> +}
> +
> +static void iomap_iof_set_range_dirty(struct inode *inode, struct folio *folio,
> +			struct iomap_folio *iof, size_t off, size_t len)
> +{
> +	unsigned int blks_per_folio = i_blocks_per_folio(inode, folio);
> +	unsigned int first_blk = off >> inode->i_blkbits;
> +	unsigned int last_blk = (off + len - 1) >> inode->i_blkbits;
> +	unsigned int nr_blks = last_blk - first_blk + 1;
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&iof->state_lock, flags);
> +	bitmap_set(iof->state, first_blk + blks_per_folio, nr_blks);
> +	spin_unlock_irqrestore(&iof->state_lock, flags);
> +}
> +
> +/*
> + * The reason we pass inode explicitly here is to avoid any race with truncate
> + * when iomap_set_range_dirty() gets called from iomap_dirty_folio().
> + * Check filemap_dirty_folio() & __folio_mark_dirty() for more details.
> + * Hence we explicitly pass mapping->host to iomap_set_range_dirty() from
> + * iomap_dirty_folio().
> + */
> +static void iomap_set_range_dirty(struct inode *inode, struct folio *folio,
> +				  size_t off, size_t len)
> +{
> +	struct iomap_folio *iof = iomap_get_iof(folio);
> +
> +	if (!iof)
> +		return;
> +	iomap_iof_set_range_dirty(inode, folio, iof, off, len);
> +}
> +
>  static struct iomap_folio *iomap_iof_alloc(struct inode *inode,
>  				struct folio *folio, unsigned int flags)
>  {
> @@ -99,12 +163,20 @@ static struct iomap_folio *iomap_iof_alloc(struct inode *inode,
>  	else
>  		gfp = GFP_NOFS | __GFP_NOFAIL;
>  
> -	iof = kzalloc(struct_size(iof, state, BITS_TO_LONGS(nr_blocks)),
> +	/*
> +	 * iof->state tracks two sets of state flags when the
> +	 * filesystem block size is smaller than the folio size.
> +	 * The first state tracks per-block uptodate and the
> +	 * second tracks per-block dirty state.
> +	 */
> +	iof = kzalloc(struct_size(iof, state, BITS_TO_LONGS(2 * nr_blocks)),
>  		      gfp);
>  	if (iof) {
>  		spin_lock_init(&iof->state_lock);
>  		if (folio_test_uptodate(folio))
> -			bitmap_fill(iof->state, nr_blocks);
> +			bitmap_set(iof->state, 0, nr_blocks);
> +		if (folio_test_dirty(folio))
> +			bitmap_set(iof->state, nr_blocks, nr_blocks);
>  		folio_attach_private(folio, iof);
>  	}
>  	return iof;
> @@ -529,6 +601,17 @@ void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len)
>  }
>  EXPORT_SYMBOL_GPL(iomap_invalidate_folio);
>  
> +bool iomap_dirty_folio(struct address_space *mapping, struct folio *folio)
> +{
> +	struct inode *inode = mapping->host;
> +	size_t len = folio_size(folio);
> +
> +	iomap_iof_alloc(inode, folio, 0);
> +	iomap_set_range_dirty(inode, folio, 0, len);
> +	return filemap_dirty_folio(mapping, folio);
> +}
> +EXPORT_SYMBOL_GPL(iomap_dirty_folio);
> +
>  static void
>  iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
>  {
> @@ -733,6 +816,8 @@ static size_t __iomap_write_end(struct inode *inode, loff_t pos, size_t len,
>  	if (unlikely(copied < len && !folio_test_uptodate(folio)))
>  		return 0;
>  	iomap_set_range_uptodate(folio, offset_in_folio(folio, pos), len);
> +	iomap_set_range_dirty(inode, folio, offset_in_folio(folio, pos),
> +			      copied);
>  	filemap_dirty_folio(inode->i_mapping, folio);
>  	return copied;
>  }
> @@ -902,6 +987,10 @@ static int iomap_write_delalloc_punch(struct inode *inode, struct folio *folio,
>  		int (*punch)(struct inode *inode, loff_t offset, loff_t length))
>  {
>  	int ret = 0;
> +	struct iomap_folio *iof;
> +	unsigned int first_blk, last_blk, i;
> +	loff_t last_byte;
> +	u8 blkbits = inode->i_blkbits;
>  
>  	if (!folio_test_dirty(folio))
>  		return ret;
> @@ -913,6 +1002,29 @@ static int iomap_write_delalloc_punch(struct inode *inode, struct folio *folio,
>  		if (ret)
>  			goto out;
>  	}
> +	/*
> +	 * When we have per-block dirty tracking, there can be
> +	 * blocks within a folio which are marked uptodate
> +	 * but not dirty. In that case it is necessary to punch
> +	 * out such blocks to avoid leaking any delalloc blocks.
> +	 */
> +	iof = iomap_get_iof(folio);
> +	if (!iof)
> +		goto skip_iof_punch;
> +
> +	last_byte = min_t(loff_t, end_byte - 1,
> +		(folio_next_index(folio) << PAGE_SHIFT) - 1);
> +	first_blk = offset_in_folio(folio, start_byte) >> blkbits;
> +	last_blk = offset_in_folio(folio, last_byte) >> blkbits;
> +	for (i = first_blk; i <= last_blk; i++) {
> +		if (!iomap_iof_is_block_dirty(folio, iof, i)) {
> +			ret = punch(inode, i << blkbits, 1 << blkbits);

Isn't the second argument to punch() supposed to be the offset within
the file, not the offset within the folio?

I /almost/ wonder if this chunk should be its own static inline
iomap_iof_delalloc_punch function, but ... eh.  My enthusiasm for
slicing and dicing is weak today.

--D

> +			if (ret)
> +				goto out;
> +		}
> +	}
> +
> +skip_iof_punch:
>  	/*
>  	 * Make sure the next punch start is correctly bound to
>  	 * the end of this data range, not the end of the folio.
> @@ -1646,7 +1758,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
>  		struct writeback_control *wbc, struct inode *inode,
>  		struct folio *folio, u64 end_pos)
>  {
> -	struct iomap_folio *iof = iomap_iof_alloc(inode, folio, 0);
> +	struct iomap_folio *iof = iomap_get_iof(folio);
>  	struct iomap_ioend *ioend, *next;
>  	unsigned len = i_blocksize(inode);
>  	unsigned nblocks = i_blocks_per_folio(inode, folio);
> @@ -1654,6 +1766,11 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
>  	int error = 0, count = 0, i;
>  	LIST_HEAD(submit_list);
>  
> +	if (!iof && nblocks > 1) {
> +		iof = iomap_iof_alloc(inode, folio, 0);
> +		iomap_set_range_dirty(inode, folio, 0, folio_size(folio));
> +	}
> +
>  	WARN_ON_ONCE(iof && atomic_read(&iof->write_bytes_pending) != 0);
>  
>  	/*
> @@ -1662,7 +1779,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
>  	 * invalid, grab a new one.
>  	 */
>  	for (i = 0; i < nblocks && pos < end_pos; i++, pos += len) {
> -		if (iof && !iomap_iof_is_block_uptodate(iof, i))
> +		if (iof && !iomap_iof_is_block_dirty(folio, iof, i))
>  			continue;
>  
>  		error = wpc->ops->map_blocks(wpc, inode, pos);
> @@ -1706,6 +1823,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
>  		}
>  	}
>  
> +	iomap_clear_range_dirty(folio, 0, end_pos - folio_pos(folio));
>  	folio_start_writeback(folio);
>  	folio_unlock(folio);
>  
> diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
> index 2ef78aa1d3f6..77c7332ae197 100644
> --- a/fs/xfs/xfs_aops.c
> +++ b/fs/xfs/xfs_aops.c
> @@ -578,7 +578,7 @@ const struct address_space_operations xfs_address_space_operations = {
>  	.read_folio		= xfs_vm_read_folio,
>  	.readahead		= xfs_vm_readahead,
>  	.writepages		= xfs_vm_writepages,
> -	.dirty_folio		= filemap_dirty_folio,
> +	.dirty_folio		= iomap_dirty_folio,
>  	.release_folio		= iomap_release_folio,
>  	.invalidate_folio	= iomap_invalidate_folio,
>  	.bmap			= xfs_vm_bmap,
> diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c
> index 132f01d3461f..e508c8e97372 100644
> --- a/fs/zonefs/file.c
> +++ b/fs/zonefs/file.c
> @@ -175,7 +175,7 @@ const struct address_space_operations zonefs_file_aops = {
>  	.read_folio		= zonefs_read_folio,
>  	.readahead		= zonefs_readahead,
>  	.writepages		= zonefs_writepages,
> -	.dirty_folio		= filemap_dirty_folio,
> +	.dirty_folio		= iomap_dirty_folio,
>  	.release_folio		= iomap_release_folio,
>  	.invalidate_folio	= iomap_invalidate_folio,
>  	.migrate_folio		= filemap_migrate_folio,
> diff --git a/include/linux/iomap.h b/include/linux/iomap.h
> index e2b836c2e119..eb9335c46bf3 100644
> --- a/include/linux/iomap.h
> +++ b/include/linux/iomap.h
> @@ -264,6 +264,7 @@ bool iomap_is_partially_uptodate(struct folio *, size_t from, size_t count);
>  struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos);
>  bool iomap_release_folio(struct folio *folio, gfp_t gfp_flags);
>  void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len);
> +bool iomap_dirty_folio(struct address_space *mapping, struct folio *folio);
>  int iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len,
>  		const struct iomap_ops *ops);
>  int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len,
> -- 
> 2.40.1
>
Christoph Hellwig June 7, 2023, 7:04 a.m. UTC | #2
> +static inline bool iomap_iof_is_block_dirty(struct folio *folio,
> +			struct iomap_folio *iof, int block)

Two-tab indents here please, and for various other functions.

> +{
> +	struct inode *inode = folio->mapping->host;
> +	unsigned int blks_per_folio = i_blocks_per_folio(inode, folio);
> +	unsigned int first_blk = off >> inode->i_blkbits;
> +	unsigned int last_blk = (off + len - 1) >> inode->i_blkbits;
> +	unsigned int nr_blks = last_blk - first_blk + 1;

Given how many places we open-code this logic I wonder if a helper
would be useful, even if the calling conventions are a bit odd.

To make this nicer it would also be good to take a neat trick from
the btrfs subpage support and use an enum for the bits, e.g.:

enum iomap_block_state {
	IOMAP_ST_UPTODATE,
	IOMAP_ST_DIRTY,

	IOMAP_ST_MAX,
};


static void iomap_ibs_calc_range(struct folio *folio, size_t off, size_t len,
		enum iomap_block_state state, unsigned int *first_blkp,
		unsigned int *nr_blksp)
{
	struct inode *inode = folio->mapping->host;
	unsigned int blks_per_folio = i_blocks_per_folio(inode, folio);
	unsigned int first_blk = off >> inode->i_blkbits;
	unsigned int last_blk = (off + len - 1) >> inode->i_blkbits;

	*first_blkp = state * blks_per_folio + first_blk;
	*nr_blksp = last_blk - first_blk + 1;
}
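
A helper like that would let the set/clear paths drop the open-coded
math, e.g. (a sketch, using the v8 names):

static void iomap_iof_clear_range_dirty(struct folio *folio,
		struct iomap_folio *iof, size_t off, size_t len)
{
	unsigned int first_blk, nr_blks;
	unsigned long flags;

	/* compute the bit range for the dirty half of iof->state */
	iomap_ibs_calc_range(folio, off, len, IOMAP_ST_DIRTY, &first_blk,
			&nr_blks);
	spin_lock_irqsave(&iof->state_lock, flags);
	bitmap_clear(iof->state, first_blk, nr_blks);
	spin_unlock_irqrestore(&iof->state_lock, flags);
}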

> +	/*
> +	 * iof->state tracks two sets of state flags when the
> +	 * filesystem block size is smaller than the folio size.
> +	 * The first state tracks per-block uptodate and the
> +	 * second tracks per-block dirty state.
> +	 */
> +	iof = kzalloc(struct_size(iof, state, BITS_TO_LONGS(2 * nr_blocks)),
>  		      gfp);

With the above this can use IOMAP_ST_MAX and make the whole thing a
little more robust.

>
>  	if (iof) {

Now that this adds a lot more initialization I'd do a

	if (!iof)
		return NULL;

here and unindent the rest.
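
i.e. roughly (a sketch, folding in the IOMAP_ST_MAX suggestion above):

	iof = kzalloc(struct_size(iof, state,
			BITS_TO_LONGS(IOMAP_ST_MAX * nr_blocks)), gfp);
	if (!iof)
		return NULL;

	spin_lock_init(&iof->state_lock);
	if (folio_test_uptodate(folio))
		bitmap_set(iof->state, 0, nr_blocks);
	if (folio_test_dirty(folio))
		bitmap_set(iof->state, nr_blocks, nr_blocks);
	folio_attach_private(folio, iof);
	return iof;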
Ritesh Harjani (IBM) June 7, 2023, 10:27 a.m. UTC | #3
"Darrick J. Wong" <djwong@kernel.org> writes:

> On Tue, Jun 06, 2023 at 05:13:52PM +0530, Ritesh Harjani (IBM) wrote:
>> [...]
>> +	/*
>> +	 * When we have per-block dirty tracking, there can be
>> +	 * blocks within a folio which are marked uptodate
>> +	 * but not dirty. In that case it is necessary to punch
>> +	 * out such blocks to avoid leaking any delalloc blocks.
>> +	 */
>> +	iof = iomap_get_iof(folio);
>> +	if (!iof)
>> +		goto skip_iof_punch;
>> +
>> +	last_byte = min_t(loff_t, end_byte - 1,
>> +		(folio_next_index(folio) << PAGE_SHIFT) - 1);
>> +	first_blk = offset_in_folio(folio, start_byte) >> blkbits;
>> +	last_blk = offset_in_folio(folio, last_byte) >> blkbits;
>> +	for (i = first_blk; i <= last_blk; i++) {
>> +		if (!iomap_iof_is_block_dirty(folio, iof, i)) {
>> +			ret = punch(inode, i << blkbits, 1 << blkbits);
>
> Isn't the second argument to punch() supposed to be the offset within
> the file, not the offset within the folio?
>

Thanks for spotting this. Somehow this got missed.
Yes, it should be the byte position within the file. Will fix in the next rev.

    punch(inode, folio_pos(folio) + (i << blkbits), 1 << blkbits);
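
i.e. the loop in context then becomes (a sketch of the planned fix):

	for (i = first_blk; i <= last_blk; i++) {
		if (!iomap_iof_is_block_dirty(folio, iof, i)) {
			/* punch takes the byte position within the file */
			ret = punch(inode, folio_pos(folio) + (i << blkbits),
				    1 << blkbits);
			if (ret)
				goto out;
		}
	}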

> I /almost/ wonder if this chunk should be its own static inline
> iomap_iof_delalloc_punch function, but ... eh.  My enthusiasm for
> slicing and dicing is weak today.
>

umm, I felt having all the punch logic in one function would be better.
Hence iomap_write_delalloc_punch().

-ritesh
Ritesh Harjani (IBM) June 7, 2023, 3:37 p.m. UTC | #4
Christoph Hellwig <hch@infradead.org> writes:

>> +static inline bool iomap_iof_is_block_dirty(struct folio *folio,
>> +			struct iomap_folio *iof, int block)
>
> Two-tab indents here please, and for various other functions.
>

Sure. 

>> +{
>> +	struct inode *inode = folio->mapping->host;
>> +	unsigned int blks_per_folio = i_blocks_per_folio(inode, folio);
>> +	unsigned int first_blk = off >> inode->i_blkbits;
>> +	unsigned int last_blk = (off + len - 1) >> inode->i_blkbits;
>> +	unsigned int nr_blks = last_blk - first_blk + 1;
>
> Given how many places we open-code this logic I wonder if a helper
> would be useful, even if the calling conventions are a bit odd.

I agree that moving it to a common helper would be useful, as it is
open-coded in three places.

>
> To make this nicer it would also be good to take a neat trick from
> the btrfs subpage support and use an enum for the bits, e.g.:
>
> enum iomap_block_state {
> 	IOMAP_ST_UPTODATE,
> 	IOMAP_ST_DIRTY,
>
> 	IOMAP_ST_MAX,
> };
>

I think the only remaining piece is the naming of this enum and struct
iomap_page.

Ok, so here is what I think my preference would be -

This enum perfectly qualifies as "iomap_block_state", as it holds
per-block state.
Then struct iomap_page (iop) should be renamed to struct
"iomap_folio_state" (ifs), because it holds the state information for
all the blocks within a folio.
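
For concreteness, something like this (a sketch; only the fields this
patch touches are shown):

struct iomap_folio_state {
	spinlock_t	state_lock;
	atomic_t	write_bytes_pending;
	/* per-block uptodate bits, followed by per-block dirty bits */
	unsigned long	state[];
};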

Does this sound ok to others too?

>
> static void iomap_ibs_calc_range(struct folio *folio, size_t off, size_t len,
> 		enum iomap_block_state state, unsigned int *first_blkp,
> 		unsigned int *nr_blksp)
> {
> 	struct inode *inode = folio->mapping->host;
> 	unsigned int blks_per_folio = i_blocks_per_folio(inode, folio);
> 	unsigned int first_blk = off >> inode->i_blkbits;
> 	unsigned int last_blk = (off + len - 1) >> inode->i_blkbits;
>
> 	*first_blkp = state * blks_per_folio + first_blk;
> 	*nr_blksp = last_blk - first_blk + 1;
> }
>

Sure, like the idea of the state enum. Will make the changes.

>> +	/*
>> +	 * iof->state tracks two sets of state flags when the
>> +	 * filesystem block size is smaller than the folio size.
>> +	 * The first state tracks per-block uptodate and the
>> +	 * second tracks per-block dirty state.
>> +	 */
>> +	iof = kzalloc(struct_size(iof, state, BITS_TO_LONGS(2 * nr_blocks)),
>>  		      gfp);
>
> With the above this can use IOMAP_ST_MAX and make the whole thing a
> little more robust.
>

yes.

>>
>>  	if (iof) {
>
> Now that this adds a lot more initialization I'd do a
>
> 	if (!iof)
> 		return NULL;
>
> here and unindent the rest.

Sure. Is it ok to fold this change into the same patch?
Or does it qualify for a separate patch?

-ritesh
Christoph Hellwig June 8, 2023, 5:33 a.m. UTC | #5
On Wed, Jun 07, 2023 at 09:07:19PM +0530, Ritesh Harjani wrote:
> I think the only remaining piece is the naming of this enum and struct
> iomap_page.
> 
> Ok, so here is what I think my preference would be -
> 
> This enum perfectly qualifies as "iomap_block_state", as it holds
> per-block state.
> Then struct iomap_page (iop) should be renamed to struct
> "iomap_folio_state" (ifs), because it holds the state information for
> all the blocks within a folio.

Fine with me.

> > 	if (!iof)
> > 		return NULL;
> >
> > here and unindent the rest.
> 
> Sure. Is it ok to fold this change into the same patch?
> Or does it qualify for a separate patch?

Either way is fine with me.
Darrick J. Wong June 8, 2023, 2:45 p.m. UTC | #6
On Wed, Jun 07, 2023 at 10:33:52PM -0700, Christoph Hellwig wrote:
> On Wed, Jun 07, 2023 at 09:07:19PM +0530, Ritesh Harjani wrote:
> > I think the only remaining piece is the naming of this enum and struct
> > iomap_page.
> > 
> > Ok, so here is what I think my preference would be -
> > 
> > This enum perfectly qualifies as "iomap_block_state", as it holds
> > per-block state.
> > Then struct iomap_page (iop) should be renamed to struct
> > "iomap_folio_state" (ifs), because it holds the state information for
> > all the blocks within a folio.
> 
> Fine with me.

Yeah, fine with me too.

> > > 	if (!iof)
> > > 		return NULL;
> > >
> > > here and unindent the rest.
> > 
> > Sure. Is it ok to fold this change into the same patch?
> > Or does it qualify for a separate patch?
> 
> Either way is fine with me.

Same patch is ok.

--D

Patch

diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index a5f4be6b9213..75efec3c3b71 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -746,7 +746,7 @@  static const struct address_space_operations gfs2_aops = {
 	.writepages = gfs2_writepages,
 	.read_folio = gfs2_read_folio,
 	.readahead = gfs2_readahead,
-	.dirty_folio = filemap_dirty_folio,
+	.dirty_folio = iomap_dirty_folio,
 	.release_folio = iomap_release_folio,
 	.invalidate_folio = iomap_invalidate_folio,
 	.bmap = gfs2_bmap,
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 2b72ca3ba37a..b99d611f1cd5 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -84,6 +84,70 @@  static void iomap_set_range_uptodate(struct folio *folio, size_t off,
 		folio_mark_uptodate(folio);
 }
 
+static inline bool iomap_iof_is_block_dirty(struct folio *folio,
+			struct iomap_folio *iof, int block)
+{
+	struct inode *inode = folio->mapping->host;
+	unsigned int blks_per_folio = i_blocks_per_folio(inode, folio);
+
+	return test_bit(block + blks_per_folio, iof->state);
+}
+
+static void iomap_iof_clear_range_dirty(struct folio *folio,
+			struct iomap_folio *iof, size_t off, size_t len)
+{
+	struct inode *inode = folio->mapping->host;
+	unsigned int blks_per_folio = i_blocks_per_folio(inode, folio);
+	unsigned int first_blk = off >> inode->i_blkbits;
+	unsigned int last_blk = (off + len - 1) >> inode->i_blkbits;
+	unsigned int nr_blks = last_blk - first_blk + 1;
+	unsigned long flags;
+
+	spin_lock_irqsave(&iof->state_lock, flags);
+	bitmap_clear(iof->state, first_blk + blks_per_folio, nr_blks);
+	spin_unlock_irqrestore(&iof->state_lock, flags);
+}
+
+static void iomap_clear_range_dirty(struct folio *folio, size_t off, size_t len)
+{
+	struct iomap_folio *iof = iomap_get_iof(folio);
+
+	if (!iof)
+		return;
+	iomap_iof_clear_range_dirty(folio, iof, off, len);
+}
+
+static void iomap_iof_set_range_dirty(struct inode *inode, struct folio *folio,
+			struct iomap_folio *iof, size_t off, size_t len)
+{
+	unsigned int blks_per_folio = i_blocks_per_folio(inode, folio);
+	unsigned int first_blk = off >> inode->i_blkbits;
+	unsigned int last_blk = (off + len - 1) >> inode->i_blkbits;
+	unsigned int nr_blks = last_blk - first_blk + 1;
+	unsigned long flags;
+
+	spin_lock_irqsave(&iof->state_lock, flags);
+	bitmap_set(iof->state, first_blk + blks_per_folio, nr_blks);
+	spin_unlock_irqrestore(&iof->state_lock, flags);
+}
+
+/*
+ * The reason we pass inode explicitly here is to avoid any race with truncate
+ * when iomap_set_range_dirty() gets called from iomap_dirty_folio().
+ * Check filemap_dirty_folio() & __folio_mark_dirty() for more details.
+ * Hence we explicitly pass mapping->host to iomap_set_range_dirty() from
+ * iomap_dirty_folio().
+ */
+static void iomap_set_range_dirty(struct inode *inode, struct folio *folio,
+				  size_t off, size_t len)
+{
+	struct iomap_folio *iof = iomap_get_iof(folio);
+
+	if (!iof)
+		return;
+	iomap_iof_set_range_dirty(inode, folio, iof, off, len);
+}
+
 static struct iomap_folio *iomap_iof_alloc(struct inode *inode,
 				struct folio *folio, unsigned int flags)
 {
@@ -99,12 +163,20 @@  static struct iomap_folio *iomap_iof_alloc(struct inode *inode,
 	else
 		gfp = GFP_NOFS | __GFP_NOFAIL;
 
-	iof = kzalloc(struct_size(iof, state, BITS_TO_LONGS(nr_blocks)),
+	/*
+	 * iof->state tracks two sets of state flags when the
+	 * filesystem block size is smaller than the folio size.
+	 * The first state tracks per-block uptodate and the
+	 * second tracks per-block dirty state.
+	 */
+	iof = kzalloc(struct_size(iof, state, BITS_TO_LONGS(2 * nr_blocks)),
 		      gfp);
 	if (iof) {
 		spin_lock_init(&iof->state_lock);
 		if (folio_test_uptodate(folio))
-			bitmap_fill(iof->state, nr_blocks);
+			bitmap_set(iof->state, 0, nr_blocks);
+		if (folio_test_dirty(folio))
+			bitmap_set(iof->state, nr_blocks, nr_blocks);
 		folio_attach_private(folio, iof);
 	}
 	return iof;
@@ -529,6 +601,17 @@  void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len)
 }
 EXPORT_SYMBOL_GPL(iomap_invalidate_folio);
 
+bool iomap_dirty_folio(struct address_space *mapping, struct folio *folio)
+{
+	struct inode *inode = mapping->host;
+	size_t len = folio_size(folio);
+
+	iomap_iof_alloc(inode, folio, 0);
+	iomap_set_range_dirty(inode, folio, 0, len);
+	return filemap_dirty_folio(mapping, folio);
+}
+EXPORT_SYMBOL_GPL(iomap_dirty_folio);
+
 static void
 iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
 {
@@ -733,6 +816,8 @@  static size_t __iomap_write_end(struct inode *inode, loff_t pos, size_t len,
 	if (unlikely(copied < len && !folio_test_uptodate(folio)))
 		return 0;
 	iomap_set_range_uptodate(folio, offset_in_folio(folio, pos), len);
+	iomap_set_range_dirty(inode, folio, offset_in_folio(folio, pos),
+			      copied);
 	filemap_dirty_folio(inode->i_mapping, folio);
 	return copied;
 }
@@ -902,6 +987,10 @@  static int iomap_write_delalloc_punch(struct inode *inode, struct folio *folio,
 		int (*punch)(struct inode *inode, loff_t offset, loff_t length))
 {
 	int ret = 0;
+	struct iomap_folio *iof;
+	unsigned int first_blk, last_blk, i;
+	loff_t last_byte;
+	u8 blkbits = inode->i_blkbits;
 
 	if (!folio_test_dirty(folio))
 		return ret;
@@ -913,6 +1002,29 @@  static int iomap_write_delalloc_punch(struct inode *inode, struct folio *folio,
 		if (ret)
 			goto out;
 	}
+	/*
+	 * When we have per-block dirty tracking, there can be
+	 * blocks within a folio which are marked uptodate
+	 * but not dirty. In that case it is necessary to punch
+	 * out such blocks to avoid leaking any delalloc blocks.
+	 */
+	iof = iomap_get_iof(folio);
+	if (!iof)
+		goto skip_iof_punch;
+
+	last_byte = min_t(loff_t, end_byte - 1,
+		(folio_next_index(folio) << PAGE_SHIFT) - 1);
+	first_blk = offset_in_folio(folio, start_byte) >> blkbits;
+	last_blk = offset_in_folio(folio, last_byte) >> blkbits;
+	for (i = first_blk; i <= last_blk; i++) {
+		if (!iomap_iof_is_block_dirty(folio, iof, i)) {
+			ret = punch(inode, i << blkbits, 1 << blkbits);
+			if (ret)
+				goto out;
+		}
+	}
+
+skip_iof_punch:
 	/*
 	 * Make sure the next punch start is correctly bound to
 	 * the end of this data range, not the end of the folio.
@@ -1646,7 +1758,7 @@  iomap_writepage_map(struct iomap_writepage_ctx *wpc,
 		struct writeback_control *wbc, struct inode *inode,
 		struct folio *folio, u64 end_pos)
 {
-	struct iomap_folio *iof = iomap_iof_alloc(inode, folio, 0);
+	struct iomap_folio *iof = iomap_get_iof(folio);
 	struct iomap_ioend *ioend, *next;
 	unsigned len = i_blocksize(inode);
 	unsigned nblocks = i_blocks_per_folio(inode, folio);
@@ -1654,6 +1766,11 @@  iomap_writepage_map(struct iomap_writepage_ctx *wpc,
 	int error = 0, count = 0, i;
 	LIST_HEAD(submit_list);
 
+	if (!iof && nblocks > 1) {
+		iof = iomap_iof_alloc(inode, folio, 0);
+		iomap_set_range_dirty(inode, folio, 0, folio_size(folio));
+	}
+
 	WARN_ON_ONCE(iof && atomic_read(&iof->write_bytes_pending) != 0);
 
 	/*
@@ -1662,7 +1779,7 @@  iomap_writepage_map(struct iomap_writepage_ctx *wpc,
 	 * invalid, grab a new one.
 	 */
 	for (i = 0; i < nblocks && pos < end_pos; i++, pos += len) {
-		if (iof && !iomap_iof_is_block_uptodate(iof, i))
+		if (iof && !iomap_iof_is_block_dirty(folio, iof, i))
 			continue;
 
 		error = wpc->ops->map_blocks(wpc, inode, pos);
@@ -1706,6 +1823,7 @@  iomap_writepage_map(struct iomap_writepage_ctx *wpc,
 		}
 	}
 
+	iomap_clear_range_dirty(folio, 0, end_pos - folio_pos(folio));
 	folio_start_writeback(folio);
 	folio_unlock(folio);
 
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 2ef78aa1d3f6..77c7332ae197 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -578,7 +578,7 @@  const struct address_space_operations xfs_address_space_operations = {
 	.read_folio		= xfs_vm_read_folio,
 	.readahead		= xfs_vm_readahead,
 	.writepages		= xfs_vm_writepages,
-	.dirty_folio		= filemap_dirty_folio,
+	.dirty_folio		= iomap_dirty_folio,
 	.release_folio		= iomap_release_folio,
 	.invalidate_folio	= iomap_invalidate_folio,
 	.bmap			= xfs_vm_bmap,
diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c
index 132f01d3461f..e508c8e97372 100644
--- a/fs/zonefs/file.c
+++ b/fs/zonefs/file.c
@@ -175,7 +175,7 @@  const struct address_space_operations zonefs_file_aops = {
 	.read_folio		= zonefs_read_folio,
 	.readahead		= zonefs_readahead,
 	.writepages		= zonefs_writepages,
-	.dirty_folio		= filemap_dirty_folio,
+	.dirty_folio		= iomap_dirty_folio,
 	.release_folio		= iomap_release_folio,
 	.invalidate_folio	= iomap_invalidate_folio,
 	.migrate_folio		= filemap_migrate_folio,
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index e2b836c2e119..eb9335c46bf3 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -264,6 +264,7 @@  bool iomap_is_partially_uptodate(struct folio *, size_t from, size_t count);
 struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos);
 bool iomap_release_folio(struct folio *folio, gfp_t gfp_flags);
 void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len);
+bool iomap_dirty_folio(struct address_space *mapping, struct folio *folio);
 int iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len,
 		const struct iomap_ops *ops);
 int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len,