diff mbox series

[RFC,v4,24/34] ext4: implement buffered write iomap path

Message ID 20240410142948.2817554-25-yi.zhang@huaweicloud.com (mailing list archive)
State New
Headers show
Series ext4: use iomap for regular file's buffered IO path and enable large folio | expand

Commit Message

Zhang Yi April 10, 2024, 2:29 p.m. UTC
From: Zhang Yi <yi.zhang@huawei.com>

Implement buffered write iomap path, use ext4_da_map_blocks() to map
delalloc extents and add ext4_iomap_get_blocks() to allocate blocks if
delalloc is disabled or free space is about to run out.

Note that we always allocate unwritten extents for new blocks in the
iomap write path, this means that the allocation type is no longer
controlled by the dioread_nolock mount option. After that, we could
postpone the i_disksize updating to the writeback path, and drop journal
handle in the buffered dealloc write path completely.

Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
---
 fs/ext4/ext4.h  |   3 +
 fs/ext4/file.c  |  19 +++++-
 fs/ext4/inode.c | 168 ++++++++++++++++++++++++++++++++++++++++++++++--
 3 files changed, 183 insertions(+), 7 deletions(-)

Comments

Dave Chinner May 1, 2024, 8:11 a.m. UTC | #1
On Wed, Apr 10, 2024 at 10:29:38PM +0800, Zhang Yi wrote:
> From: Zhang Yi <yi.zhang@huawei.com>
> 
> Implement buffered write iomap path, use ext4_da_map_blocks() to map
> delalloc extents and add ext4_iomap_get_blocks() to allocate blocks if
> delalloc is disabled or free space is about to run out.
> 
> Note that we always allocate unwritten extents for new blocks in the
> iomap write path, this means that the allocation type is no longer
> controlled by the dioread_nolock mount option. After that, we could
> postpone the i_disksize updating to the writeback path, and drop journal
> handle in the buffered dealloc write path completely.
> 
> Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
> ---
>  fs/ext4/ext4.h  |   3 +
>  fs/ext4/file.c  |  19 +++++-
>  fs/ext4/inode.c | 168 ++++++++++++++++++++++++++++++++++++++++++++++--
>  3 files changed, 183 insertions(+), 7 deletions(-)
> 
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 05949a8136ae..2bd543c43341 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -2970,6 +2970,7 @@ int ext4_walk_page_buffers(handle_t *handle,
>  				     struct buffer_head *bh));
>  int do_journal_get_write_access(handle_t *handle, struct inode *inode,
>  				struct buffer_head *bh);
> +int ext4_nonda_switch(struct super_block *sb);
>  #define FALL_BACK_TO_NONDELALLOC 1
>  #define CONVERT_INLINE_DATA	 2
>  
> @@ -3827,6 +3828,8 @@ static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end)
>  extern const struct iomap_ops ext4_iomap_ops;
>  extern const struct iomap_ops ext4_iomap_overwrite_ops;
>  extern const struct iomap_ops ext4_iomap_report_ops;
> +extern const struct iomap_ops ext4_iomap_buffered_write_ops;
> +extern const struct iomap_ops ext4_iomap_buffered_da_write_ops;
>  
>  static inline int ext4_buffer_uptodate(struct buffer_head *bh)
>  {
> diff --git a/fs/ext4/file.c b/fs/ext4/file.c
> index 54d6ff22585c..52f37c49572a 100644
> --- a/fs/ext4/file.c
> +++ b/fs/ext4/file.c
> @@ -282,6 +282,20 @@ static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
>  	return count;
>  }
>  
> +static ssize_t ext4_iomap_buffered_write(struct kiocb *iocb,
> +					 struct iov_iter *from)
> +{
> +	struct inode *inode = file_inode(iocb->ki_filp);
> +	const struct iomap_ops *iomap_ops;
> +
> +	if (test_opt(inode->i_sb, DELALLOC) && !ext4_nonda_switch(inode->i_sb))
> +		iomap_ops = &ext4_iomap_buffered_da_write_ops;
> +	else
> +		iomap_ops = &ext4_iomap_buffered_write_ops;
> +
> +	return iomap_file_buffered_write(iocb, from, iomap_ops);
> +}
> +
>  static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,
>  					struct iov_iter *from)
>  {
> @@ -296,7 +310,10 @@ static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,
>  	if (ret <= 0)
>  		goto out;
>  
> -	ret = generic_perform_write(iocb, from);
> +	if (ext4_test_inode_state(inode, EXT4_STATE_BUFFERED_IOMAP))
> +		ret = ext4_iomap_buffered_write(iocb, from);
> +	else
> +		ret = generic_perform_write(iocb, from);
>  
>  out:
>  	inode_unlock(inode);
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 20eb772f4f62..e825ed16fd60 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -2857,7 +2857,7 @@ static int ext4_dax_writepages(struct address_space *mapping,
>  	return ret;
>  }
>  
> -static int ext4_nonda_switch(struct super_block *sb)
> +int ext4_nonda_switch(struct super_block *sb)
>  {
>  	s64 free_clusters, dirty_clusters;
>  	struct ext4_sb_info *sbi = EXT4_SB(sb);
> @@ -3254,6 +3254,15 @@ static bool ext4_inode_datasync_dirty(struct inode *inode)
>  	return inode->i_state & I_DIRTY_DATASYNC;
>  }
>  
> +static bool ext4_iomap_valid(struct inode *inode, const struct iomap *iomap)
> +{
> +	return iomap->validity_cookie == READ_ONCE(EXT4_I(inode)->i_es_seq);
> +}
> +
> +static const struct iomap_folio_ops ext4_iomap_folio_ops = {
> +	.iomap_valid = ext4_iomap_valid,
> +};
> +
>  static void ext4_set_iomap(struct inode *inode, struct iomap *iomap,
>  			   struct ext4_map_blocks *map, loff_t offset,
>  			   loff_t length, unsigned int flags)
> @@ -3284,6 +3293,9 @@ static void ext4_set_iomap(struct inode *inode, struct iomap *iomap,
>  	    !ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
>  		iomap->flags |= IOMAP_F_MERGED;
>  
> +	iomap->validity_cookie = READ_ONCE(EXT4_I(inode)->i_es_seq);
> +	iomap->folio_ops = &ext4_iomap_folio_ops;
> +
>  	/*
>  	 * Flags passed to ext4_map_blocks() for direct I/O writes can result
>  	 * in m_flags having both EXT4_MAP_MAPPED and EXT4_MAP_UNWRITTEN bits
> @@ -3523,11 +3535,42 @@ const struct iomap_ops ext4_iomap_report_ops = {
>  	.iomap_begin = ext4_iomap_begin_report,
>  };
>  
> -static int ext4_iomap_buffered_io_begin(struct inode *inode, loff_t offset,
> +static int ext4_iomap_get_blocks(struct inode *inode,
> +				 struct ext4_map_blocks *map)
> +{
> +	handle_t *handle;
> +	int ret, needed_blocks;
> +
> +	/*
> +	 * Reserve one block more for addition to orphan list in case
> +	 * we allocate blocks but write fails for some reason.
> +	 */
> +	needed_blocks = ext4_writepage_trans_blocks(inode) + 1;
> +	handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks);
> +	if (IS_ERR(handle))
> +		return PTR_ERR(handle);
> +
> +	ret = ext4_map_blocks(handle, inode, map,
> +			      EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT);
> +	/*
> +	 * Have to stop journal here since there is a potential deadlock
> +	 * caused by later balance_dirty_pages(), it might wait on the
> +	 * ditry pages to be written back, which might start another
> +	 * handle and wait this handle stop.
> +	 */
> +	ext4_journal_stop(handle);
> +
> +	return ret;
> +}
> +
> +#define IOMAP_F_EXT4_DELALLOC		IOMAP_F_PRIVATE
> +
> +static int __ext4_iomap_buffered_io_begin(struct inode *inode, loff_t offset,
>  				loff_t length, unsigned int iomap_flags,
> -				struct iomap *iomap, struct iomap *srcmap)
> +				struct iomap *iomap, struct iomap *srcmap,
> +				bool delalloc)
>  {
> -	int ret;
> +	int ret, retries = 0;
>  	struct ext4_map_blocks map;
>  	u8 blkbits = inode->i_blkbits;
>  
> @@ -3537,20 +3580,133 @@ static int ext4_iomap_buffered_io_begin(struct inode *inode, loff_t offset,
>  		return -EINVAL;
>  	if (WARN_ON_ONCE(ext4_has_inline_data(inode)))
>  		return -ERANGE;
> -
> +retry:
>  	/* Calculate the first and last logical blocks respectively. */
>  	map.m_lblk = offset >> blkbits;
>  	map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits,
>  			  EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1;
> +	if (iomap_flags & IOMAP_WRITE) {
> +		if (delalloc)
> +			ret = ext4_da_map_blocks(inode, &map);
> +		else
> +			ret = ext4_iomap_get_blocks(inode, &map);
>  
> -	ret = ext4_map_blocks(NULL, inode, &map, 0);
> +		if (ret == -ENOSPC &&
> +		    ext4_should_retry_alloc(inode->i_sb, &retries))
> +			goto retry;
> +	} else {
> +		ret = ext4_map_blocks(NULL, inode, &map, 0);
> +	}
>  	if (ret < 0)
>  		return ret;
>  
>  	ext4_set_iomap(inode, iomap, &map, offset, length, iomap_flags);
> +	if (delalloc)
> +		iomap->flags |= IOMAP_F_EXT4_DELALLOC;
> +
> +	return 0;
> +}

Why are you implementing both read and write mapping paths in
the one function? The whole point of having separate ops vectors for
read and write is that it allows a clean separation of the read and
write mapping operations. i.e. there is no need to use "if (write)
else {do read}" code constructs at all.

You can even have a different delalloc mapping function so you don't
need "if (delalloc) else {do nonda}" branches everiywhere...

> +
> +static inline int ext4_iomap_buffered_io_begin(struct inode *inode,
> +			loff_t offset, loff_t length, unsigned int flags,
> +			struct iomap *iomap, struct iomap *srcmap)
> +{
> +	return __ext4_iomap_buffered_io_begin(inode, offset, length, flags,
> +					      iomap, srcmap, false);
> +}
> +
> +static inline int ext4_iomap_buffered_da_write_begin(struct inode *inode,
> +			loff_t offset, loff_t length, unsigned int flags,
> +			struct iomap *iomap, struct iomap *srcmap)
> +{
> +	return __ext4_iomap_buffered_io_begin(inode, offset, length, flags,
> +					      iomap, srcmap, true);
> +}
> +
> +/*
> + * Drop the staled delayed allocation range from the write failure,
> + * including both start and end blocks. If not, we could leave a range
> + * of delayed extents covered by a clean folio, it could lead to
> + * inaccurate space reservation.
> + */
> +static int ext4_iomap_punch_delalloc(struct inode *inode, loff_t offset,
> +				     loff_t length)
> +{
> +	ext4_es_remove_extent(inode, offset >> inode->i_blkbits,
> +			DIV_ROUND_UP_ULL(length, EXT4_BLOCK_SIZE(inode->i_sb)));
>  	return 0;
>  }
>  
> +static int ext4_iomap_buffered_write_end(struct inode *inode, loff_t offset,
> +					 loff_t length, ssize_t written,
> +					 unsigned int flags,
> +					 struct iomap *iomap)
> +{
> +	handle_t *handle;
> +	loff_t end;
> +	int ret = 0, ret2;
> +
> +	/* delalloc */
> +	if (iomap->flags & IOMAP_F_EXT4_DELALLOC) {
> +		ret = iomap_file_buffered_write_punch_delalloc(inode, iomap,
> +			offset, length, written, ext4_iomap_punch_delalloc);
> +		if (ret)
> +			ext4_warning(inode->i_sb,
> +			     "Failed to clean up delalloc for inode %lu, %d",
> +			     inode->i_ino, ret);
> +		return ret;
> +	}

Why are you creating a delalloc extent for the write operation and
then immediately deleting it from the extent tree once the write
operation is done? Delayed allocation is a state that persists for
that file range until the data is written back, right, but this
appears to return the range to a hole in the extent state tree?

What happens when we do a second write to this same range? WIll it
do another delayed allocation because there are no extents over this
range?

Also, why do you need IOMAP_F_EXT4_DELALLOC? Isn't a delalloc iomap
set up with iomap->type = IOMAP_DELALLOC? Why can't that be used?

-Dave.
Dave Chinner May 1, 2024, 8:33 a.m. UTC | #2
On Wed, May 01, 2024 at 06:11:13PM +1000, Dave Chinner wrote:
> On Wed, Apr 10, 2024 at 10:29:38PM +0800, Zhang Yi wrote:
> > From: Zhang Yi <yi.zhang@huawei.com>
> > 
> > Implement buffered write iomap path, use ext4_da_map_blocks() to map
> > delalloc extents and add ext4_iomap_get_blocks() to allocate blocks if
> > delalloc is disabled or free space is about to run out.
> > 
> > Note that we always allocate unwritten extents for new blocks in the
> > iomap write path, this means that the allocation type is no longer
> > controlled by the dioread_nolock mount option. After that, we could
> > postpone the i_disksize updating to the writeback path, and drop journal
> > handle in the buffered dealloc write path completely.
.....
> > +/*
> > + * Drop the staled delayed allocation range from the write failure,
> > + * including both start and end blocks. If not, we could leave a range
> > + * of delayed extents covered by a clean folio, it could lead to
> > + * inaccurate space reservation.
> > + */
> > +static int ext4_iomap_punch_delalloc(struct inode *inode, loff_t offset,
> > +				     loff_t length)
> > +{
> > +	ext4_es_remove_extent(inode, offset >> inode->i_blkbits,
> > +			DIV_ROUND_UP_ULL(length, EXT4_BLOCK_SIZE(inode->i_sb)));
> >  	return 0;
> >  }
> >  
> > +static int ext4_iomap_buffered_write_end(struct inode *inode, loff_t offset,
> > +					 loff_t length, ssize_t written,
> > +					 unsigned int flags,
> > +					 struct iomap *iomap)
> > +{
> > +	handle_t *handle;
> > +	loff_t end;
> > +	int ret = 0, ret2;
> > +
> > +	/* delalloc */
> > +	if (iomap->flags & IOMAP_F_EXT4_DELALLOC) {
> > +		ret = iomap_file_buffered_write_punch_delalloc(inode, iomap,
> > +			offset, length, written, ext4_iomap_punch_delalloc);
> > +		if (ret)
> > +			ext4_warning(inode->i_sb,
> > +			     "Failed to clean up delalloc for inode %lu, %d",
> > +			     inode->i_ino, ret);
> > +		return ret;
> > +	}
> 
> Why are you creating a delalloc extent for the write operation and
> then immediately deleting it from the extent tree once the write
> operation is done?

Ignore this, I mixed up the ext4_iomap_punch_delalloc() code
directly above with iomap_file_buffered_write_punch_delalloc().

In hindsight, iomap_file_buffered_write_punch_delalloc() is poorly
named, as it is handling a short write situation which requires
newly allocated delalloc blocks to be punched.
iomap_file_buffered_write_finish() would probably be a better name
for it....

> Also, why do you need IOMAP_F_EXT4_DELALLOC? Isn't a delalloc iomap
> set up with iomap->type = IOMAP_DELALLOC? Why can't that be used?

But this still stands - the first thing
iomap_file_buffered_write_punch_delalloc() is:

	if (iomap->type != IOMAP_DELALLOC)
                return 0;

Cheers,

Dave.
Zhang Yi May 6, 2024, 11:21 a.m. UTC | #3
On 2024/5/1 16:11, Dave Chinner wrote:
> On Wed, Apr 10, 2024 at 10:29:38PM +0800, Zhang Yi wrote:
>> From: Zhang Yi <yi.zhang@huawei.com>
>>
>> Implement buffered write iomap path, use ext4_da_map_blocks() to map
>> delalloc extents and add ext4_iomap_get_blocks() to allocate blocks if
>> delalloc is disabled or free space is about to run out.
>>
>> Note that we always allocate unwritten extents for new blocks in the
>> iomap write path, this means that the allocation type is no longer
>> controlled by the dioread_nolock mount option. After that, we could
>> postpone the i_disksize updating to the writeback path, and drop journal
>> handle in the buffered dealloc write path completely.
>>
>> Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
>> ---
>>  fs/ext4/ext4.h  |   3 +
>>  fs/ext4/file.c  |  19 +++++-
>>  fs/ext4/inode.c | 168 ++++++++++++++++++++++++++++++++++++++++++++++--
>>  3 files changed, 183 insertions(+), 7 deletions(-)
>>
[...]
>> +#define IOMAP_F_EXT4_DELALLOC		IOMAP_F_PRIVATE
>> +
>> +static int __ext4_iomap_buffered_io_begin(struct inode *inode, loff_t offset,
>>  				loff_t length, unsigned int iomap_flags,
>> -				struct iomap *iomap, struct iomap *srcmap)
>> +				struct iomap *iomap, struct iomap *srcmap,
>> +				bool delalloc)
>>  {
>> -	int ret;
>> +	int ret, retries = 0;
>>  	struct ext4_map_blocks map;
>>  	u8 blkbits = inode->i_blkbits;
>>  
>> @@ -3537,20 +3580,133 @@ static int ext4_iomap_buffered_io_begin(struct inode *inode, loff_t offset,
>>  		return -EINVAL;
>>  	if (WARN_ON_ONCE(ext4_has_inline_data(inode)))
>>  		return -ERANGE;
>> -
>> +retry:
>>  	/* Calculate the first and last logical blocks respectively. */
>>  	map.m_lblk = offset >> blkbits;
>>  	map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits,
>>  			  EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1;
>> +	if (iomap_flags & IOMAP_WRITE) {
>> +		if (delalloc)
>> +			ret = ext4_da_map_blocks(inode, &map);
>> +		else
>> +			ret = ext4_iomap_get_blocks(inode, &map);
>>  
>> -	ret = ext4_map_blocks(NULL, inode, &map, 0);
>> +		if (ret == -ENOSPC &&
>> +		    ext4_should_retry_alloc(inode->i_sb, &retries))
>> +			goto retry;
>> +	} else {
>> +		ret = ext4_map_blocks(NULL, inode, &map, 0);
>> +	}
>>  	if (ret < 0)
>>  		return ret;
>>  
>>  	ext4_set_iomap(inode, iomap, &map, offset, length, iomap_flags);
>> +	if (delalloc)
>> +		iomap->flags |= IOMAP_F_EXT4_DELALLOC;
>> +
>> +	return 0;
>> +}
> 
> Why are you implementing both read and write mapping paths in
> the one function? The whole point of having separate ops vectors for
> read and write is that it allows a clean separation of the read and
> write mapping operations. i.e. there is no need to use "if (write)
> else {do read}" code constructs at all.
> 
> You can even have a different delalloc mapping function so you don't
> need "if (delalloc) else {do nonda}" branches everiywhere...
> 

Because current ->iomap_begin() for ext4 buffered IO path
(i.e. __ext4_iomap_buffered_io_begin()) is simple, almost only the map
blocks handlers are different for read, da write and no da write paths,
the rest of the function parameter check and inode status check are
the same, and I noticed that the ->iomap_begin() for direct IO path
(i.e. ext4_iomap_begin()) also implemented in one function. So I'd
like to save some code now, and it looks like implement them in one
function doesn't make this function too complicated, I guess we could
split them if things change in the future.

But think about it again, split them now could make things more clear,
it's also fine to me.

Thanks,
Yi.
Zhang Yi May 6, 2024, 11:44 a.m. UTC | #4
On 2024/5/1 16:33, Dave Chinner wrote:
> On Wed, May 01, 2024 at 06:11:13PM +1000, Dave Chinner wrote:
>> On Wed, Apr 10, 2024 at 10:29:38PM +0800, Zhang Yi wrote:
>>> From: Zhang Yi <yi.zhang@huawei.com>
>>>
>>> Implement buffered write iomap path, use ext4_da_map_blocks() to map
>>> delalloc extents and add ext4_iomap_get_blocks() to allocate blocks if
>>> delalloc is disabled or free space is about to run out.
>>>
>>> Note that we always allocate unwritten extents for new blocks in the
>>> iomap write path, this means that the allocation type is no longer
>>> controlled by the dioread_nolock mount option. After that, we could
>>> postpone the i_disksize updating to the writeback path, and drop journal
>>> handle in the buffered dealloc write path completely.
> .....
>>> +/*
>>> + * Drop the staled delayed allocation range from the write failure,
>>> + * including both start and end blocks. If not, we could leave a range
>>> + * of delayed extents covered by a clean folio, it could lead to
>>> + * inaccurate space reservation.
>>> + */
>>> +static int ext4_iomap_punch_delalloc(struct inode *inode, loff_t offset,
>>> +				     loff_t length)
>>> +{
>>> +	ext4_es_remove_extent(inode, offset >> inode->i_blkbits,
>>> +			DIV_ROUND_UP_ULL(length, EXT4_BLOCK_SIZE(inode->i_sb)));
>>>  	return 0;
>>>  }
>>>  
>>> +static int ext4_iomap_buffered_write_end(struct inode *inode, loff_t offset,
>>> +					 loff_t length, ssize_t written,
>>> +					 unsigned int flags,
>>> +					 struct iomap *iomap)
>>> +{
>>> +	handle_t *handle;
>>> +	loff_t end;
>>> +	int ret = 0, ret2;
>>> +
>>> +	/* delalloc */
>>> +	if (iomap->flags & IOMAP_F_EXT4_DELALLOC) {
>>> +		ret = iomap_file_buffered_write_punch_delalloc(inode, iomap,
>>> +			offset, length, written, ext4_iomap_punch_delalloc);
>>> +		if (ret)
>>> +			ext4_warning(inode->i_sb,
>>> +			     "Failed to clean up delalloc for inode %lu, %d",
>>> +			     inode->i_ino, ret);
>>> +		return ret;
>>> +	}
>>
>> Why are you creating a delalloc extent for the write operation and
>> then immediately deleting it from the extent tree once the write
>> operation is done?
> 
> Ignore this, I mixed up the ext4_iomap_punch_delalloc() code
> directly above with iomap_file_buffered_write_punch_delalloc().
> 
> In hindsight, iomap_file_buffered_write_punch_delalloc() is poorly
> named, as it is handling a short write situation which requires
> newly allocated delalloc blocks to be punched.
> iomap_file_buffered_write_finish() would probably be a better name
> for it....
> 
>> Also, why do you need IOMAP_F_EXT4_DELALLOC? Isn't a delalloc iomap
>> set up with iomap->type = IOMAP_DELALLOC? Why can't that be used?
> 
> But this still stands - the first thing
> iomap_file_buffered_write_punch_delalloc() is:
> 
> 	if (iomap->type != IOMAP_DELALLOC)
>                 return 0;
> 

Thanks for the suggestion, the delalloc and non-delalloc write paths
share the same ->iomap_end() now (i.e. ext4_iomap_buffered_write_end()),
I use the IOMAP_F_EXT4_DELALLOC to identify the write path. For
non-delalloc path, If we have allocated more blocks and copied less, we
should truncate extra blocks that newly allocated by ->iomap_begin().
If we use IOMAP_DELALLOC, we can't tell if the blocks are pre-existing
or newly allocated, we can't truncate the pre-existing blocks, so I have
to introduce IOMAP_F_EXT4_DELALLOC. But if we split the delalloc and
non-delalloc handler, we could drop IOMAP_F_EXT4_DELALLOC.

I also checked xfs, IIUC, xfs doesn't free the extra blocks beyond EOF
in xfs_buffered_write_iomap_end() for non-delalloc case since they will
be freed by xfs_free_eofblocks in some other inactive paths, like
xfs_release()/xfs_inactive()/..., is that right?

Thanks,
Yi.
Dave Chinner May 6, 2024, 11:19 p.m. UTC | #5
On Mon, May 06, 2024 at 07:44:44PM +0800, Zhang Yi wrote:
> On 2024/5/1 16:33, Dave Chinner wrote:
> > On Wed, May 01, 2024 at 06:11:13PM +1000, Dave Chinner wrote:
> >> On Wed, Apr 10, 2024 at 10:29:38PM +0800, Zhang Yi wrote:
> >>> From: Zhang Yi <yi.zhang@huawei.com>
> >>>
> >>> Implement buffered write iomap path, use ext4_da_map_blocks() to map
> >>> delalloc extents and add ext4_iomap_get_blocks() to allocate blocks if
> >>> delalloc is disabled or free space is about to run out.
> >>>
> >>> Note that we always allocate unwritten extents for new blocks in the
> >>> iomap write path, this means that the allocation type is no longer
> >>> controlled by the dioread_nolock mount option. After that, we could
> >>> postpone the i_disksize updating to the writeback path, and drop journal
> >>> handle in the buffered dealloc write path completely.
> > .....
> >>> +/*
> >>> + * Drop the staled delayed allocation range from the write failure,
> >>> + * including both start and end blocks. If not, we could leave a range
> >>> + * of delayed extents covered by a clean folio, it could lead to
> >>> + * inaccurate space reservation.
> >>> + */
> >>> +static int ext4_iomap_punch_delalloc(struct inode *inode, loff_t offset,
> >>> +				     loff_t length)
> >>> +{
> >>> +	ext4_es_remove_extent(inode, offset >> inode->i_blkbits,
> >>> +			DIV_ROUND_UP_ULL(length, EXT4_BLOCK_SIZE(inode->i_sb)));
> >>>  	return 0;
> >>>  }
> >>>  
> >>> +static int ext4_iomap_buffered_write_end(struct inode *inode, loff_t offset,
> >>> +					 loff_t length, ssize_t written,
> >>> +					 unsigned int flags,
> >>> +					 struct iomap *iomap)
> >>> +{
> >>> +	handle_t *handle;
> >>> +	loff_t end;
> >>> +	int ret = 0, ret2;
> >>> +
> >>> +	/* delalloc */
> >>> +	if (iomap->flags & IOMAP_F_EXT4_DELALLOC) {
> >>> +		ret = iomap_file_buffered_write_punch_delalloc(inode, iomap,
> >>> +			offset, length, written, ext4_iomap_punch_delalloc);
> >>> +		if (ret)
> >>> +			ext4_warning(inode->i_sb,
> >>> +			     "Failed to clean up delalloc for inode %lu, %d",
> >>> +			     inode->i_ino, ret);
> >>> +		return ret;
> >>> +	}
> >>
> >> Why are you creating a delalloc extent for the write operation and
> >> then immediately deleting it from the extent tree once the write
> >> operation is done?
> > 
> > Ignore this, I mixed up the ext4_iomap_punch_delalloc() code
> > directly above with iomap_file_buffered_write_punch_delalloc().
> > 
> > In hindsight, iomap_file_buffered_write_punch_delalloc() is poorly
> > named, as it is handling a short write situation which requires
> > newly allocated delalloc blocks to be punched.
> > iomap_file_buffered_write_finish() would probably be a better name
> > for it....
> > 
> >> Also, why do you need IOMAP_F_EXT4_DELALLOC? Isn't a delalloc iomap
> >> set up with iomap->type = IOMAP_DELALLOC? Why can't that be used?
> > 
> > But this still stands - the first thing
> > iomap_file_buffered_write_punch_delalloc() is:
> > 
> > 	if (iomap->type != IOMAP_DELALLOC)
> >                 return 0;
> > 
> 
> Thanks for the suggestion, the delalloc and non-delalloc write paths
> share the same ->iomap_end() now (i.e. ext4_iomap_buffered_write_end()),
> I use the IOMAP_F_EXT4_DELALLOC to identify the write path.

Again, you don't need that. iomap tracks newly allocated
IOMAP_DELALLOC extents via the IOMAP_F_NEW flag that should be
getting set in the ->iomap_begin() call when it creates a new
delalloc extent.

Please look at the second check in
iomap_file_buffered_write_punch_delalloc():

	if (iomap->type != IOMAP_DELALLOC)
                return 0;

        /* If we didn't reserve the blocks, we're not allowed to punch them. */
        if (!(iomap->flags & IOMAP_F_NEW))
                return 0;

> For
> non-delalloc path, If we have allocated more blocks and copied less, we
> should truncate extra blocks that newly allocated by ->iomap_begin().

Why? If they were allocated as unwritten, then you can just leave
them there as unwritten extents, same as XFS. Keep in mind that if
we get a short write, it is extremely likely the application is
going to rewrite the remaining data immediately, so if we allocated
blocks they are likely to still be needed, anyway....

> If we use IOMAP_DELALLOC, we can't tell if the blocks are
> pre-existing or newly allocated, we can't truncate the
> pre-existing blocks, so I have to introduce IOMAP_F_EXT4_DELALLOC.
> But if we split the delalloc and non-delalloc handler, we could
> drop IOMAP_F_EXT4_DELALLOC.

As per above: IOMAP_F_NEW tells us -exactly- this.

IOMAP_F_NEW should be set on any newly allocated block - delalloc or
real - because that's the flag that tells the iomap infrastructure
whether zero-around is needed for partial block writes. If ext4 is
not setting this flag on delalloc regions allocated by
->iomap_begin(), then that's a serious bug.

> I also checked xfs, IIUC, xfs doesn't free the extra blocks beyond EOF
> in xfs_buffered_write_iomap_end() for non-delalloc case since they will
> be freed by xfs_free_eofblocks in some other inactive paths, like
> xfs_release()/xfs_inactive()/..., is that right?

XFS doesn't care about real blocks beyond EOF existing -
xfs_free_eofblocks() is an optimistic operation that does not
guarantee that it will remove blocks beyond EOF. Similarly, we don't
care about real blocks within EOF because we alway allocate data
extents as unwritten, so we don't have any stale data exposure
issues to worry about on short writes leaving allocated blocks
behind.

OTOH, delalloc extents without dirty page cache pages over them
cannot be allowed to exist. Without dirty pages, there is no trigger
to convert those to real extents (i.e. nothing to write back). Hence
the only sane thing that can be done with them on a write error or
short write is remove them in the context where they were created.

This is the only reason that the
iomap_file_buffered_write_punch_delalloc() exists - it abstracts
this nasty corner case away from filesystems that support delalloc
so they don't have to worry about getting this right. That's whole
point of having delalloc aware infrastructure - individual
filesysetms don't need to handle all these weird corner cases
themselves because the infrastructure takes care of them...

-Dave.
Zhang Yi May 7, 2024, 5:10 a.m. UTC | #6
On 2024/5/7 7:19, Dave Chinner wrote:
> On Mon, May 06, 2024 at 07:44:44PM +0800, Zhang Yi wrote:
>> On 2024/5/1 16:33, Dave Chinner wrote:
>>> On Wed, May 01, 2024 at 06:11:13PM +1000, Dave Chinner wrote:
>>>> On Wed, Apr 10, 2024 at 10:29:38PM +0800, Zhang Yi wrote:
>>>>> From: Zhang Yi <yi.zhang@huawei.com>
>>>>>
>>>>> Implement buffered write iomap path, use ext4_da_map_blocks() to map
>>>>> delalloc extents and add ext4_iomap_get_blocks() to allocate blocks if
>>>>> delalloc is disabled or free space is about to run out.
>>>>>
>>>>> Note that we always allocate unwritten extents for new blocks in the
>>>>> iomap write path, this means that the allocation type is no longer
>>>>> controlled by the dioread_nolock mount option. After that, we could
>>>>> postpone the i_disksize updating to the writeback path, and drop journal
>>>>> handle in the buffered dealloc write path completely.
>>> .....
>>>>> +/*
>>>>> + * Drop the staled delayed allocation range from the write failure,
>>>>> + * including both start and end blocks. If not, we could leave a range
>>>>> + * of delayed extents covered by a clean folio, it could lead to
>>>>> + * inaccurate space reservation.
>>>>> + */
>>>>> +static int ext4_iomap_punch_delalloc(struct inode *inode, loff_t offset,
>>>>> +				     loff_t length)
>>>>> +{
>>>>> +	ext4_es_remove_extent(inode, offset >> inode->i_blkbits,
>>>>> +			DIV_ROUND_UP_ULL(length, EXT4_BLOCK_SIZE(inode->i_sb)));
>>>>>  	return 0;
>>>>>  }
>>>>>  
>>>>> +static int ext4_iomap_buffered_write_end(struct inode *inode, loff_t offset,
>>>>> +					 loff_t length, ssize_t written,
>>>>> +					 unsigned int flags,
>>>>> +					 struct iomap *iomap)
>>>>> +{
>>>>> +	handle_t *handle;
>>>>> +	loff_t end;
>>>>> +	int ret = 0, ret2;
>>>>> +
>>>>> +	/* delalloc */
>>>>> +	if (iomap->flags & IOMAP_F_EXT4_DELALLOC) {
>>>>> +		ret = iomap_file_buffered_write_punch_delalloc(inode, iomap,
>>>>> +			offset, length, written, ext4_iomap_punch_delalloc);
>>>>> +		if (ret)
>>>>> +			ext4_warning(inode->i_sb,
>>>>> +			     "Failed to clean up delalloc for inode %lu, %d",
>>>>> +			     inode->i_ino, ret);
>>>>> +		return ret;
>>>>> +	}
>>>>
>>>> Why are you creating a delalloc extent for the write operation and
>>>> then immediately deleting it from the extent tree once the write
>>>> operation is done?
>>>
>>> Ignore this, I mixed up the ext4_iomap_punch_delalloc() code
>>> directly above with iomap_file_buffered_write_punch_delalloc().
>>>
>>> In hindsight, iomap_file_buffered_write_punch_delalloc() is poorly
>>> named, as it is handling a short write situation which requires
>>> newly allocated delalloc blocks to be punched.
>>> iomap_file_buffered_write_finish() would probably be a better name
>>> for it....
>>>
>>>> Also, why do you need IOMAP_F_EXT4_DELALLOC? Isn't a delalloc iomap
>>>> set up with iomap->type = IOMAP_DELALLOC? Why can't that be used?
>>>
>>> But this still stands - the first thing
>>> iomap_file_buffered_write_punch_delalloc() is:
>>>
>>> 	if (iomap->type != IOMAP_DELALLOC)
>>>                 return 0;
>>>
>>
>> Thanks for the suggestion, the delalloc and non-delalloc write paths
>> share the same ->iomap_end() now (i.e. ext4_iomap_buffered_write_end()),
>> I use the IOMAP_F_EXT4_DELALLOC to identify the write path.
> 
> Again, you don't need that. iomap tracks newly allocated
> IOMAP_DELALLOC extents via the IOMAP_F_NEW flag that should be
> getting set in the ->iomap_begin() call when it creates a new
> delalloc extent.
> 
> Please look at the second check in
> iomap_file_buffered_write_punch_delalloc():
> 
> 	if (iomap->type != IOMAP_DELALLOC)
>                 return 0;
> 
>         /* If we didn't reserve the blocks, we're not allowed to punch them. */
>         if (!(iomap->flags & IOMAP_F_NEW))
>                 return 0;
> 
>> For
>> non-delalloc path, If we have allocated more blocks and copied less, we
>> should truncate extra blocks that newly allocated by ->iomap_begin().
> 
> Why? If they were allocated as unwritten, then you can just leave
> them there as unwritten extents, same as XFS. Keep in mind that if
> we get a short write, it is extremely likely the application is
> going to rewrite the remaining data immediately, so if we allocated
> blocks they are likely to still be needed, anyway....
> 

Make sense, we don't need to free the extra blocks beyond EOF since they
are unwritten, we can drop this handle for non-delalloc path on ext4 now.

>> If we use IOMAP_DELALLOC, we can't tell if the blocks are
>> pre-existing or newly allocated, we can't truncate the
>> pre-existing blocks, so I have to introduce IOMAP_F_EXT4_DELALLOC.
>> But if we split the delalloc and non-delalloc handler, we could
>> drop IOMAP_F_EXT4_DELALLOC.
> 
> As per above: IOMAP_F_NEW tells us -exactly- this.
> 
> IOMAP_F_NEW should be set on any newly allocated block - delalloc or
> real - because that's the flag that tells the iomap infrastructure
> whether zero-around is needed for partial block writes. If ext4 is
> not setting this flag on delalloc regions allocated by
> ->iomap_begin(), then that's a serious bug.
> 
>> I also checked xfs, IIUC, xfs doesn't free the extra blocks beyond EOF
>> in xfs_buffered_write_iomap_end() for non-delalloc case since they will
>> be freed by xfs_free_eofblocks in some other inactive paths, like
>> xfs_release()/xfs_inactive()/..., is that right?
> 
> XFS doesn't care about real blocks beyond EOF existing -
> xfs_free_eofblocks() is an optimistic operation that does not
> guarantee that it will remove blocks beyond EOF. Similarly, we don't
> care about real blocks within EOF because we alway allocate data
> extents as unwritten, so we don't have any stale data exposure
> issues to worry about on short writes leaving allocated blocks
> behind.
> 
> OTOH, delalloc extents without dirty page cache pages over them
> cannot be allowed to exist. Without dirty pages, there is no trigger
> to convert those to real extents (i.e. nothing to write back). Hence
> the only sane thing that can be done with them on a write error or
> short write is remove them in the context where they were created.
> 
> This is the only reason that the
> iomap_file_buffered_write_punch_delalloc() exists - it abstracts
> this nasty corner case away from filesystems that support delalloc
> so they don't have to worry about getting this right. That's whole
> point of having delalloc aware infrastructure - individual
> filesysetms don't need to handle all these weird corner cases
> themselves because the infrastructure takes care of them...
> 

Yeah, thanks for the explanation. The iomap_file_buffered_write_punch_delalloc()
is very useful, it find pages that have dirty data still pending in the page
cache, punch out all the delalloc blocks beside those blocks. I realized that
it is used to fix a race condition between either writeback or mmap page
faults that xfs encountered [1].

We will meet the same problem for ext3 and ext2 which are not extent based.
Their new allocated blocks were written, we need to free them if we get a short
write, but we can't simply do it through ext2_write_failed() and
ext4_truncate_failed_write(), we still need to use
iomap_file_buffered_write_punch_delalloc().

[1] https://lore.kernel.org/all/20221123055812.747923-6-david@fromorbit.com/

Thanks,
Yi.
diff mbox series

Patch

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 05949a8136ae..2bd543c43341 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2970,6 +2970,7 @@  int ext4_walk_page_buffers(handle_t *handle,
 				     struct buffer_head *bh));
 int do_journal_get_write_access(handle_t *handle, struct inode *inode,
 				struct buffer_head *bh);
+int ext4_nonda_switch(struct super_block *sb);
 #define FALL_BACK_TO_NONDELALLOC 1
 #define CONVERT_INLINE_DATA	 2
 
@@ -3827,6 +3828,8 @@  static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end)
 extern const struct iomap_ops ext4_iomap_ops;
 extern const struct iomap_ops ext4_iomap_overwrite_ops;
 extern const struct iomap_ops ext4_iomap_report_ops;
+extern const struct iomap_ops ext4_iomap_buffered_write_ops;
+extern const struct iomap_ops ext4_iomap_buffered_da_write_ops;
 
 static inline int ext4_buffer_uptodate(struct buffer_head *bh)
 {
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 54d6ff22585c..52f37c49572a 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -282,6 +282,20 @@  static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
 	return count;
 }
 
+static ssize_t ext4_iomap_buffered_write(struct kiocb *iocb,
+					 struct iov_iter *from)
+{
+	struct inode *inode = file_inode(iocb->ki_filp);
+	const struct iomap_ops *iomap_ops;
+
+	if (test_opt(inode->i_sb, DELALLOC) && !ext4_nonda_switch(inode->i_sb))
+		iomap_ops = &ext4_iomap_buffered_da_write_ops;
+	else
+		iomap_ops = &ext4_iomap_buffered_write_ops;
+
+	return iomap_file_buffered_write(iocb, from, iomap_ops);
+}
+
 static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,
 					struct iov_iter *from)
 {
@@ -296,7 +310,10 @@  static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,
 	if (ret <= 0)
 		goto out;
 
-	ret = generic_perform_write(iocb, from);
+	if (ext4_test_inode_state(inode, EXT4_STATE_BUFFERED_IOMAP))
+		ret = ext4_iomap_buffered_write(iocb, from);
+	else
+		ret = generic_perform_write(iocb, from);
 
 out:
 	inode_unlock(inode);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 20eb772f4f62..e825ed16fd60 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2857,7 +2857,7 @@  static int ext4_dax_writepages(struct address_space *mapping,
 	return ret;
 }
 
-static int ext4_nonda_switch(struct super_block *sb)
+int ext4_nonda_switch(struct super_block *sb)
 {
 	s64 free_clusters, dirty_clusters;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -3254,6 +3254,15 @@  static bool ext4_inode_datasync_dirty(struct inode *inode)
 	return inode->i_state & I_DIRTY_DATASYNC;
 }
 
+static bool ext4_iomap_valid(struct inode *inode, const struct iomap *iomap)
+{
+	return iomap->validity_cookie == READ_ONCE(EXT4_I(inode)->i_es_seq);
+}
+
+static const struct iomap_folio_ops ext4_iomap_folio_ops = {
+	.iomap_valid = ext4_iomap_valid,
+};
+
 static void ext4_set_iomap(struct inode *inode, struct iomap *iomap,
 			   struct ext4_map_blocks *map, loff_t offset,
 			   loff_t length, unsigned int flags)
@@ -3284,6 +3293,9 @@  static void ext4_set_iomap(struct inode *inode, struct iomap *iomap,
 	    !ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
 		iomap->flags |= IOMAP_F_MERGED;
 
+	iomap->validity_cookie = READ_ONCE(EXT4_I(inode)->i_es_seq);
+	iomap->folio_ops = &ext4_iomap_folio_ops;
+
 	/*
 	 * Flags passed to ext4_map_blocks() for direct I/O writes can result
 	 * in m_flags having both EXT4_MAP_MAPPED and EXT4_MAP_UNWRITTEN bits
@@ -3523,11 +3535,42 @@  const struct iomap_ops ext4_iomap_report_ops = {
 	.iomap_begin = ext4_iomap_begin_report,
 };
 
-static int ext4_iomap_buffered_io_begin(struct inode *inode, loff_t offset,
+static int ext4_iomap_get_blocks(struct inode *inode,
+				 struct ext4_map_blocks *map)
+{
+	handle_t *handle;
+	int ret, needed_blocks;
+
+	/*
+	 * Reserve one block more for addition to orphan list in case
+	 * we allocate blocks but write fails for some reason.
+	 */
+	needed_blocks = ext4_writepage_trans_blocks(inode) + 1;
+	handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks);
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+
+	ret = ext4_map_blocks(handle, inode, map,
+			      EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT);
+	/*
+	 * Have to stop journal here since there is a potential deadlock
+	 * caused by later balance_dirty_pages(), it might wait on the
+	 * ditry pages to be written back, which might start another
+	 * handle and wait this handle stop.
+	 */
+	ext4_journal_stop(handle);
+
+	return ret;
+}
+
+#define IOMAP_F_EXT4_DELALLOC		IOMAP_F_PRIVATE
+
+static int __ext4_iomap_buffered_io_begin(struct inode *inode, loff_t offset,
 				loff_t length, unsigned int iomap_flags,
-				struct iomap *iomap, struct iomap *srcmap)
+				struct iomap *iomap, struct iomap *srcmap,
+				bool delalloc)
 {
-	int ret;
+	int ret, retries = 0;
 	struct ext4_map_blocks map;
 	u8 blkbits = inode->i_blkbits;
 
@@ -3537,20 +3580,133 @@  static int ext4_iomap_buffered_io_begin(struct inode *inode, loff_t offset,
 		return -EINVAL;
 	if (WARN_ON_ONCE(ext4_has_inline_data(inode)))
 		return -ERANGE;
-
+retry:
 	/* Calculate the first and last logical blocks respectively. */
 	map.m_lblk = offset >> blkbits;
 	map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits,
 			  EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1;
+	if (iomap_flags & IOMAP_WRITE) {
+		if (delalloc)
+			ret = ext4_da_map_blocks(inode, &map);
+		else
+			ret = ext4_iomap_get_blocks(inode, &map);
 
-	ret = ext4_map_blocks(NULL, inode, &map, 0);
+		if (ret == -ENOSPC &&
+		    ext4_should_retry_alloc(inode->i_sb, &retries))
+			goto retry;
+	} else {
+		ret = ext4_map_blocks(NULL, inode, &map, 0);
+	}
 	if (ret < 0)
 		return ret;
 
 	ext4_set_iomap(inode, iomap, &map, offset, length, iomap_flags);
+	if (delalloc)
+		iomap->flags |= IOMAP_F_EXT4_DELALLOC;
+
+	return 0;
+}
+
+static inline int ext4_iomap_buffered_io_begin(struct inode *inode,
+			loff_t offset, loff_t length, unsigned int flags,
+			struct iomap *iomap, struct iomap *srcmap)
+{
+	return __ext4_iomap_buffered_io_begin(inode, offset, length, flags,
+					      iomap, srcmap, false);
+}
+
+static inline int ext4_iomap_buffered_da_write_begin(struct inode *inode,
+			loff_t offset, loff_t length, unsigned int flags,
+			struct iomap *iomap, struct iomap *srcmap)
+{
+	return __ext4_iomap_buffered_io_begin(inode, offset, length, flags,
+					      iomap, srcmap, true);
+}
+
+/*
+ * Drop the staled delayed allocation range from the write failure,
+ * including both start and end blocks. If not, we could leave a range
+ * of delayed extents covered by a clean folio, it could lead to
+ * inaccurate space reservation.
+ */
+static int ext4_iomap_punch_delalloc(struct inode *inode, loff_t offset,
+				     loff_t length)
+{
+	ext4_es_remove_extent(inode, offset >> inode->i_blkbits,
+			DIV_ROUND_UP_ULL(length, EXT4_BLOCK_SIZE(inode->i_sb)));
 	return 0;
 }
 
+static int ext4_iomap_buffered_write_end(struct inode *inode, loff_t offset,
+					 loff_t length, ssize_t written,
+					 unsigned int flags,
+					 struct iomap *iomap)
+{
+	handle_t *handle;
+	loff_t end;
+	int ret = 0, ret2;
+
+	/* delalloc */
+	if (iomap->flags & IOMAP_F_EXT4_DELALLOC) {
+		ret = iomap_file_buffered_write_punch_delalloc(inode, iomap,
+			offset, length, written, ext4_iomap_punch_delalloc);
+		if (ret)
+			ext4_warning(inode->i_sb,
+			     "Failed to clean up delalloc for inode %lu, %d",
+			     inode->i_ino, ret);
+		return ret;
+	}
+
+	/* nodelalloc */
+	end = offset + length;
+	if (!(iomap->flags & IOMAP_F_SIZE_CHANGED) && end <= inode->i_size)
+		return 0;
+
+	handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+
+	if (iomap->flags & IOMAP_F_SIZE_CHANGED) {
+		ext4_update_i_disksize(inode, inode->i_size);
+		ret = ext4_mark_inode_dirty(handle, inode);
+	}
+
+	/*
+	 * If we have allocated more blocks and copied less.
+	 * We will have blocks allocated outside inode->i_size,
+	 * so truncate them.
+	 */
+	if (end > inode->i_size)
+		ext4_orphan_add(handle, inode);
+
+	ret2 = ext4_journal_stop(handle);
+	ret = ret ? : ret2;
+
+	if (end > inode->i_size) {
+		ext4_truncate_failed_write(inode);
+		/*
+		 * If truncate failed early the inode might still be
+		 * on the orphan list; we need to make sure the inode
+		 * is removed from the orphan list in that case.
+		 */
+		if (inode->i_nlink)
+			ext4_orphan_del(NULL, inode);
+	}
+
+	return ret;
+}
+
+
+const struct iomap_ops ext4_iomap_buffered_write_ops = {
+	.iomap_begin = ext4_iomap_buffered_io_begin,
+	.iomap_end = ext4_iomap_buffered_write_end,
+};
+
+const struct iomap_ops ext4_iomap_buffered_da_write_ops = {
+	.iomap_begin = ext4_iomap_buffered_da_write_begin,
+	.iomap_end = ext4_iomap_buffered_write_end,
+};
+
 const struct iomap_ops ext4_iomap_buffered_read_ops = {
 	.iomap_begin = ext4_iomap_buffered_io_begin,
 };