diff mbox series

[RFC,v3,07/26] iomap: don't increase i_size if it's not a write operation

Message ID 20240127015825.1608160-8-yi.zhang@huaweicloud.com (mailing list archive)
State New
Headers show
Series ext4: use iomap for regular file's buffered IO path and enable large foilo | expand

Commit Message

Zhang Yi Jan. 27, 2024, 1:58 a.m. UTC
From: Zhang Yi <yi.zhang@huawei.com>

Increase i_size in iomap_zero_range() and iomap_unshare_iter() is not
needed, the caller should handle it. Especially, when truncate partial
block, we could not increase i_size beyond the new EOF here. It dosn't
affect xfs and gfs2 now because they reset the new file size after zero
out, it doesn't matter that a brief increase in i_size, but it will
affect ext4 because it set file size before truncate. At the same time,
iomap_write_failed() is also not needed for above two cases too, so
let's introduce a new helper iomap_write_end_simple() to replace the
common iomap_write_end() helper which designed for buffer write, and
also move out iomap_write_failed() from iomap_write_begin() to
iomap_write_iter().

Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
---
 fs/iomap/buffered-io.c | 28 ++++++++++++++++++++++++----
 1 file changed, 24 insertions(+), 4 deletions(-)

Comments

Christoph Hellwig Feb. 13, 2024, 5:46 a.m. UTC | #1
Wouldn't it make more sense to just move the size manipulation to the
write-only code?  An untested version of that is below.  With this
the naming of the status variable becomes even more confusing than
it already is, maybe we need to do a cleanup of the *_write_end
calling conventions as it always returns the passed in copied value
or 0.

diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 3dab060aed6d7b..8401a9ca702fc0 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -876,34 +876,13 @@ static size_t iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len,
 		size_t copied, struct folio *folio)
 {
 	const struct iomap *srcmap = iomap_iter_srcmap(iter);
-	loff_t old_size = iter->inode->i_size;
-	size_t ret;
-
-	if (srcmap->type == IOMAP_INLINE) {
-		ret = iomap_write_end_inline(iter, folio, pos, copied);
-	} else if (srcmap->flags & IOMAP_F_BUFFER_HEAD) {
-		ret = block_write_end(NULL, iter->inode->i_mapping, pos, len,
-				copied, &folio->page, NULL);
-	} else {
-		ret = __iomap_write_end(iter->inode, pos, len, copied, folio);
-	}
-
-	/*
-	 * Update the in-memory inode size after copying the data into the page
-	 * cache.  It's up to the file system to write the updated size to disk,
-	 * preferably after I/O completion so that no stale data is exposed.
-	 */
-	if (pos + ret > old_size) {
-		i_size_write(iter->inode, pos + ret);
-		iter->iomap.flags |= IOMAP_F_SIZE_CHANGED;
-	}
-	__iomap_put_folio(iter, pos, ret, folio);
 
-	if (old_size < pos)
-		pagecache_isize_extended(iter->inode, old_size, pos);
-	if (ret < len)
-		iomap_write_failed(iter->inode, pos + ret, len - ret);
-	return ret;
+	if (srcmap->type == IOMAP_INLINE)
+		return iomap_write_end_inline(iter, folio, pos, copied);
+	if (srcmap->flags & IOMAP_F_BUFFER_HEAD)
+		return block_write_end(NULL, iter->inode->i_mapping, pos, len,
+					copied, &folio->page, NULL);
+	return __iomap_write_end(iter->inode, pos, len, copied, folio);
 }
 
 static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
@@ -918,6 +897,7 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
 
 	do {
 		struct folio *folio;
+		loff_t old_size;
 		size_t offset;		/* Offset into folio */
 		size_t bytes;		/* Bytes to write to folio */
 		size_t copied;		/* Bytes copied from user */
@@ -964,7 +944,24 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
 
 		copied = copy_folio_from_iter_atomic(folio, offset, bytes, i);
 		status = iomap_write_end(iter, pos, bytes, copied, folio);
+		/*
+		 * Update the in-memory inode size after copying the data into
+		 * the page cache.  It's up to the file system to write the
+		 * updated size to disk, preferably after I/O completion so that
+		 * no stale data is exposed.
+		 */
+		old_size = iter->inode->i_size;
+		if (pos + status > old_size) {
+			i_size_write(iter->inode, pos + status);
+			iter->iomap.flags |= IOMAP_F_SIZE_CHANGED;
+		}
+		__iomap_put_folio(iter, pos, status, folio);
 
+		if (old_size < pos)
+			pagecache_isize_extended(iter->inode, old_size, pos);
+		if (status < bytes)
+			iomap_write_failed(iter->inode, pos + status,
+						bytes - status);
 		if (unlikely(copied != status))
 			iov_iter_revert(i, copied - status);
 
@@ -1334,6 +1331,7 @@ static loff_t iomap_unshare_iter(struct iomap_iter *iter)
 			bytes = folio_size(folio) - offset;
 
 		bytes = iomap_write_end(iter, pos, bytes, bytes, folio);
+		__iomap_put_folio(iter, pos, bytes, folio);
 		if (WARN_ON_ONCE(bytes == 0))
 			return -EIO;
 
@@ -1398,6 +1396,7 @@ static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero)
 		folio_mark_accessed(folio);
 
 		bytes = iomap_write_end(iter, pos, bytes, bytes, folio);
+		__iomap_put_folio(iter, pos, bytes, folio);
 		if (WARN_ON_ONCE(bytes == 0))
 			return -EIO;
Zhang Yi Feb. 17, 2024, 8:55 a.m. UTC | #2
On 2024/2/13 13:46, Christoph Hellwig wrote:
> Wouldn't it make more sense to just move the size manipulation to the
> write-only code?  An untested version of that is below.  With this

Sorry for the late reply and thanks for your suggestion, The reason why
I introduced this new helper iomap_write_end_simple() is I don't want to
open code __iomap_put_folio() in each caller since corresponding to
iomap_write_begin(), it's the responsibility for iomap_write_end_*() to
put and unlock folio, so I'd like to keep it in iomap_write_end_*().
But I don't feel strongly about it, it's also fine by me to just move
the size manipulation to the write-only code if you think it's better.

> the naming of the status variable becomes even more confusing than
> it already is, maybe we need to do a cleanup of the *_write_end
> calling conventions as it always returns the passed in copied value
> or 0.

Indeed, it becomes more confused and deserve a cleanup.

Thanks,
Yi.

> 
> diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
> index 3dab060aed6d7b..8401a9ca702fc0 100644
> --- a/fs/iomap/buffered-io.c
> +++ b/fs/iomap/buffered-io.c
> @@ -876,34 +876,13 @@ static size_t iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len,
>  		size_t copied, struct folio *folio)
>  {
>  	const struct iomap *srcmap = iomap_iter_srcmap(iter);
> -	loff_t old_size = iter->inode->i_size;
> -	size_t ret;
> -
> -	if (srcmap->type == IOMAP_INLINE) {
> -		ret = iomap_write_end_inline(iter, folio, pos, copied);
> -	} else if (srcmap->flags & IOMAP_F_BUFFER_HEAD) {
> -		ret = block_write_end(NULL, iter->inode->i_mapping, pos, len,
> -				copied, &folio->page, NULL);
> -	} else {
> -		ret = __iomap_write_end(iter->inode, pos, len, copied, folio);
> -	}
> -
> -	/*
> -	 * Update the in-memory inode size after copying the data into the page
> -	 * cache.  It's up to the file system to write the updated size to disk,
> -	 * preferably after I/O completion so that no stale data is exposed.
> -	 */
> -	if (pos + ret > old_size) {
> -		i_size_write(iter->inode, pos + ret);
> -		iter->iomap.flags |= IOMAP_F_SIZE_CHANGED;
> -	}
> -	__iomap_put_folio(iter, pos, ret, folio);
>  
> -	if (old_size < pos)
> -		pagecache_isize_extended(iter->inode, old_size, pos);
> -	if (ret < len)
> -		iomap_write_failed(iter->inode, pos + ret, len - ret);
> -	return ret;
> +	if (srcmap->type == IOMAP_INLINE)
> +		return iomap_write_end_inline(iter, folio, pos, copied);
> +	if (srcmap->flags & IOMAP_F_BUFFER_HEAD)
> +		return block_write_end(NULL, iter->inode->i_mapping, pos, len,
> +					copied, &folio->page, NULL);
> +	return __iomap_write_end(iter->inode, pos, len, copied, folio);
>  }
>  
>  static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
> @@ -918,6 +897,7 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
>  
>  	do {
>  		struct folio *folio;
> +		loff_t old_size;
>  		size_t offset;		/* Offset into folio */
>  		size_t bytes;		/* Bytes to write to folio */
>  		size_t copied;		/* Bytes copied from user */
> @@ -964,7 +944,24 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
>  
>  		copied = copy_folio_from_iter_atomic(folio, offset, bytes, i);
>  		status = iomap_write_end(iter, pos, bytes, copied, folio);
> +		/*
> +		 * Update the in-memory inode size after copying the data into
> +		 * the page cache.  It's up to the file system to write the
> +		 * updated size to disk, preferably after I/O completion so that
> +		 * no stale data is exposed.
> +		 */
> +		old_size = iter->inode->i_size;
> +		if (pos + status > old_size) {
> +			i_size_write(iter->inode, pos + status);
> +			iter->iomap.flags |= IOMAP_F_SIZE_CHANGED;
> +		}
> +		__iomap_put_folio(iter, pos, status, folio);
>  
> +		if (old_size < pos)
> +			pagecache_isize_extended(iter->inode, old_size, pos);
> +		if (status < bytes)
> +			iomap_write_failed(iter->inode, pos + status,
> +						bytes - status);
>  		if (unlikely(copied != status))
>  			iov_iter_revert(i, copied - status);
>  
> @@ -1334,6 +1331,7 @@ static loff_t iomap_unshare_iter(struct iomap_iter *iter)
>  			bytes = folio_size(folio) - offset;
>  
>  		bytes = iomap_write_end(iter, pos, bytes, bytes, folio);
> +		__iomap_put_folio(iter, pos, bytes, folio);
>  		if (WARN_ON_ONCE(bytes == 0))
>  			return -EIO;
>  
> @@ -1398,6 +1396,7 @@ static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero)
>  		folio_mark_accessed(folio);
>  
>  		bytes = iomap_write_end(iter, pos, bytes, bytes, folio);
> +		__iomap_put_folio(iter, pos, bytes, folio);
>  		if (WARN_ON_ONCE(bytes == 0))
>  			return -EIO;
>  
>
Dave Chinner Feb. 18, 2024, 11:30 p.m. UTC | #3
On Sat, Feb 17, 2024 at 04:55:51PM +0800, Zhang Yi wrote:
> On 2024/2/13 13:46, Christoph Hellwig wrote:
> > Wouldn't it make more sense to just move the size manipulation to the
> > write-only code?  An untested version of that is below.  With this
> 
> Sorry for the late reply and thanks for your suggestion, The reason why
> I introduced this new helper iomap_write_end_simple() is I don't want to
> open code __iomap_put_folio() in each caller since corresponding to
> iomap_write_begin(), it's the responsibility for iomap_write_end_*() to
> put and unlock folio, so I'd like to keep it in iomap_write_end_*().

Just because we currently put the folio in iomap_write_end_*(), it
doesn't mean we must always do it that way.

> But I don't feel strongly about it, it's also fine by me to just move
> the size manipulation to the write-only code if you think it's better.

I agree with Christoph that it's better to move the i_size update
into iomap_write_iter() than it is to implement a separate write_end
function that does not update the i_size. The iter functions already
do work directly on the folio that iomap_write_begin() returns, so
having them drop the folio when everything is done isn't a huge
deal...

Cheers,

Dave.
Zhang Yi Feb. 19, 2024, 1:14 a.m. UTC | #4
On 2024/2/19 7:30, Dave Chinner wrote:
> On Sat, Feb 17, 2024 at 04:55:51PM +0800, Zhang Yi wrote:
>> On 2024/2/13 13:46, Christoph Hellwig wrote:
>>> Wouldn't it make more sense to just move the size manipulation to the
>>> write-only code?  An untested version of that is below.  With this
>>
>> Sorry for the late reply and thanks for your suggestion, The reason why
>> I introduced this new helper iomap_write_end_simple() is I don't want to
>> open code __iomap_put_folio() in each caller since corresponding to
>> iomap_write_begin(), it's the responsibility for iomap_write_end_*() to
>> put and unlock folio, so I'd like to keep it in iomap_write_end_*().
> 
> Just because we currently put the folio in iomap_write_end_*(), it
> doesn't mean we must always do it that way.
> 
>> But I don't feel strongly about it, it's also fine by me to just move
>> the size manipulation to the write-only code if you think it's better.
> 
> I agree with Christoph that it's better to move the i_size update
> into iomap_write_iter() than it is to implement a separate write_end
> function that does not update the i_size. The iter functions already
> do work directly on the folio that iomap_write_begin() returns, so
> having them drop the folio when everything is done isn't a huge
> deal...
> 

Sure, I will revise it as you suggested in my next iteration.

Thanks,
Yi.
Zhang Yi Feb. 28, 2024, 8:53 a.m. UTC | #5
On 2024/2/13 13:46, Christoph Hellwig wrote:
> Wouldn't it make more sense to just move the size manipulation to the
> write-only code?  An untested version of that is below.  With this
> the naming of the status variable becomes even more confusing than
> it already is, maybe we need to do a cleanup of the *_write_end
> calling conventions as it always returns the passed in copied value
> or 0.
> 
> diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
> index 3dab060aed6d7b..8401a9ca702fc0 100644
> --- a/fs/iomap/buffered-io.c
> +++ b/fs/iomap/buffered-io.c
> @@ -876,34 +876,13 @@ static size_t iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len,
>  		size_t copied, struct folio *folio)
>  {
>  	const struct iomap *srcmap = iomap_iter_srcmap(iter);
> -	loff_t old_size = iter->inode->i_size;
> -	size_t ret;
> -
> -	if (srcmap->type == IOMAP_INLINE) {
> -		ret = iomap_write_end_inline(iter, folio, pos, copied);
> -	} else if (srcmap->flags & IOMAP_F_BUFFER_HEAD) {
> -		ret = block_write_end(NULL, iter->inode->i_mapping, pos, len,
> -				copied, &folio->page, NULL);
> -	} else {
> -		ret = __iomap_write_end(iter->inode, pos, len, copied, folio);
> -	}
> -
> -	/*
> -	 * Update the in-memory inode size after copying the data into the page
> -	 * cache.  It's up to the file system to write the updated size to disk,
> -	 * preferably after I/O completion so that no stale data is exposed.
> -	 */
> -	if (pos + ret > old_size) {
> -		i_size_write(iter->inode, pos + ret);
> -		iter->iomap.flags |= IOMAP_F_SIZE_CHANGED;
> -	}

I've recently discovered that if we don't increase i_size in
iomap_zero_iter(), it would break fstests generic/476 on xfs. xfs
depends on iomap_zero_iter() to increase i_size in some cases.

 generic/476 75s ... _check_xfs_filesystem: filesystem on /dev/pmem2 is inconsistent (r)
 (see /home/zhangyi/xfstests-dev/results//xfs/generic/476.full for details)

 _check_xfs_filesystem: filesystem on /dev/pmem2 is inconsistent (r)
 *** xfs_repair -n output ***
 Phase 1 - find and verify superblock...
 Phase 2 - using internal log
         - zero log...
         - scan filesystem freespace and inode maps...
 sb_fdblocks 10916, counted 10923
         - found root inode chunk
 ...

After debugging and analysis, I found the root cause of the problem is
related to the pre-allocations of xfs. xfs pre-allocates some blocks to
reduce fragmentation during buffer append writing, then if we write new
data or do file copy(reflink) after the end of the pre-allocating range,
xfs would zero-out and write back the pre-allocate space(e.g.
xfs_file_write_checks() -> xfs_zero_range()), so we have to update
i_size before writing back in iomap_zero_iter(), otherwise, it will
result in stale delayed extent.

For more details, let's think about this case,
1. Buffered write from range [A, B) of an empty file foo, and
   xfs_buffered_write_iomap_begin() prealloc blocks for it, then create
   a delayed extent from [A, D).
2. Write back process map blocks but only convert above delayed extent
   from [A, C) since the lack of a contiguous physical blocks, now we
   have a left over delayed extent from [C, D), and the file size is B.
3. Copy range from another file to range [E, F), then
   xfs_reflink_zero_posteof() would zero-out post eof range [B, E), it
   writes zero, dirty and write back [C, E).
4. Since we don't update i_size in iomap_zero_iter(),the writeback
   doesn't write anything back, also doesn't convert the delayed extent.
   After copy range, the file size will update to F.
5. Finally, the delayed extent becomes stale, and the free blocks count
   becomes incorrect.

So, we have to handle above case for xfs. I suppose we could keep
increasing i_size if the zeroed folio is entirely outside of i_size,
make sure we could write back and allocate blocks for the
zeroed & delayed extent, something like below, any suggestions ?


@@ -1390,6 +1390,7 @@ static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero)

 	do {
 		struct folio *folio;
+		loff_t old_size;
 		int status;
 		size_t offset;
 		size_t bytes = min_t(u64, SIZE_MAX, length);
@@ -1408,6 +1409,28 @@ static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero)
 		folio_mark_accessed(folio);

 		bytes = iomap_write_end(iter, pos, bytes, bytes, folio);
+
+		/*
+		 * If folio is entirely outside of i_size, update the
+		 * in-memory inode size after zeroing the data in the page
+		 * cache to prevent the write-back process from not writing
+		 * back zeroed pages.
+		 */
+		old_size = iter->inode->i_size;
+		if (pos + bytes > old_size) {
+			size_t offset = offset_in_folio(folio, old_size);
+			pgoff_t end_index = old_size >> PAGE_SHIFT;
+
+			if (folio->index > end_index ||
+			    (folio->index == end_index && offset == 0)) {
+				i_size_write(iter->inode, pos + bytes);
+				iter->iomap.flags |= IOMAP_F_SIZE_CHANGED;
+			}
+		}
+		__iomap_put_folio(iter, pos, bytes, folio);
+		if (old_size < pos)
+			pagecache_isize_extended(iter->inode, old_size, pos);
+
 		if (WARN_ON_ONCE(bytes == 0))
 			return -EIO;

Thansk,
Yi.
Christoph Hellwig Feb. 28, 2024, 10:13 p.m. UTC | #6
On Wed, Feb 28, 2024 at 04:53:32PM +0800, Zhang Yi wrote:
> So, we have to handle above case for xfs. I suppose we could keep
> increasing i_size if the zeroed folio is entirely outside of i_size,
> make sure we could write back and allocate blocks for the
> zeroed & delayed extent, something like below, any suggestions ?

Sorry for being dumb, but what was the problem solved by not updating
the size for ext4 again?  (for unshare I can't see any reason to
ever update the inode size)
Dave Chinner Feb. 28, 2024, 10:25 p.m. UTC | #7
On Wed, Feb 28, 2024 at 04:53:32PM +0800, Zhang Yi wrote:
> On 2024/2/13 13:46, Christoph Hellwig wrote:
> > Wouldn't it make more sense to just move the size manipulation to the
> > write-only code?  An untested version of that is below.  With this
> > the naming of the status variable becomes even more confusing than
> > it already is, maybe we need to do a cleanup of the *_write_end
> > calling conventions as it always returns the passed in copied value
> > or 0.
> > 
> > diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
> > index 3dab060aed6d7b..8401a9ca702fc0 100644
> > --- a/fs/iomap/buffered-io.c
> > +++ b/fs/iomap/buffered-io.c
> > @@ -876,34 +876,13 @@ static size_t iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len,
> >  		size_t copied, struct folio *folio)
> >  {
> >  	const struct iomap *srcmap = iomap_iter_srcmap(iter);
> > -	loff_t old_size = iter->inode->i_size;
> > -	size_t ret;
> > -
> > -	if (srcmap->type == IOMAP_INLINE) {
> > -		ret = iomap_write_end_inline(iter, folio, pos, copied);
> > -	} else if (srcmap->flags & IOMAP_F_BUFFER_HEAD) {
> > -		ret = block_write_end(NULL, iter->inode->i_mapping, pos, len,
> > -				copied, &folio->page, NULL);
> > -	} else {
> > -		ret = __iomap_write_end(iter->inode, pos, len, copied, folio);
> > -	}
> > -
> > -	/*
> > -	 * Update the in-memory inode size after copying the data into the page
> > -	 * cache.  It's up to the file system to write the updated size to disk,
> > -	 * preferably after I/O completion so that no stale data is exposed.
> > -	 */
> > -	if (pos + ret > old_size) {
> > -		i_size_write(iter->inode, pos + ret);
> > -		iter->iomap.flags |= IOMAP_F_SIZE_CHANGED;
> > -	}
> 
> I've recently discovered that if we don't increase i_size in
> iomap_zero_iter(), it would break fstests generic/476 on xfs. xfs
> depends on iomap_zero_iter() to increase i_size in some cases.
> 
>  generic/476 75s ... _check_xfs_filesystem: filesystem on /dev/pmem2 is inconsistent (r)
>  (see /home/zhangyi/xfstests-dev/results//xfs/generic/476.full for details)
> 
>  _check_xfs_filesystem: filesystem on /dev/pmem2 is inconsistent (r)
>  *** xfs_repair -n output ***
>  Phase 1 - find and verify superblock...
>  Phase 2 - using internal log
>          - zero log...
>          - scan filesystem freespace and inode maps...
>  sb_fdblocks 10916, counted 10923
>          - found root inode chunk
>  ...
> 
> After debugging and analysis, I found the root cause of the problem is
> related to the pre-allocations of xfs. xfs pre-allocates some blocks to
> reduce fragmentation during buffer append writing, then if we write new
> data or do file copy(reflink) after the end of the pre-allocating range,
> xfs would zero-out and write back the pre-allocate space(e.g.
> xfs_file_write_checks() -> xfs_zero_range()), so we have to update
> i_size before writing back in iomap_zero_iter(), otherwise, it will
> result in stale delayed extent.

Ok, so this is long because the example is lacking in clear details
so to try to understand it I've laid it out in detail to make sure
I've understood it correctly.

> 
> For more details, let's think about this case,
> 1. Buffered write from range [A, B) of an empty file foo, and
>    xfs_buffered_write_iomap_begin() prealloc blocks for it, then create
>    a delayed extent from [A, D).

So we have a delayed allocation extent  and the file size is now B
like so:

	A                      B                    D
	+DDDDDDDDDDDDDDDDDDDDDD+dddddddddddddddddddd+
	                      EOF
			  (in memory)

where 'd' is a delalloc block with no data and 'D' is a delalloc
block with dirty folios over it.

> 2. Write back process map blocks but only convert above delayed extent
>    from [A, C) since the lack of a contiguous physical blocks, now we
>    have a left over delayed extent from [C, D), and the file size is B.

So this produces:

	A          C           B                    D
	+wwwwwwwwww+DDDDDDDDDDD+dddddddddddddddddddd+
	          EOF         EOF
               (on disk)  (in memory)

where 'w' contains allocated written data blocks.

> 3. Copy range from another file to range [E, F), then
>    xfs_reflink_zero_posteof() would zero-out post eof range [B, E), it
>    writes zero, dirty and write back [C, E).

I'm going to assume that [E,F) is located like this because you
are talking about post-eof zeroing from B to E:

	A          C           B     E       F      D
	+wwwwwwwwww+DDDDDDDDDDD+ddddd+rrrrrrr+dddddd+
	          EOF         EOF
               (on disk)  (in memory)

where 'r' is the clone destination over dellaloc blocks.

Did I get that right?

And so reflink wants to zero [B,E] before it updates the file size,
just like a truncate(E) would. iomap_zero_iter() will see a delalloc
extent (IOMAP_DELALLOC) for [B,E], so it will write zeros into cache
for it. We then have:

	A          C           B     E       F      D
	+wwwwwwwwww+DDDDDDDDDDD+ZZZZZ+rrrrrrr+dddddd+
	          EOF         EOF
               (on disk)  (in memory)

where 'Z' is delalloc blocks with zeroes in cache.

Because the destination is post EOF, xfs_reflink_remap_prep() then
does:

        /*
         * If pos_out > EOF, we may have dirtied blocks between EOF and
         * pos_out. In that case, we need to extend the flush and unmap to cover
         * from EOF to the end of the copy length.
         */
        if (pos_out > XFS_ISIZE(dest)) {
                loff_t  flen = *len + (pos_out - XFS_ISIZE(dest));
                ret = xfs_flush_unmap_range(dest, XFS_ISIZE(dest), flen);
	} ....

Which attempts to flush from the current in memory EOF up to the end
of the clone destination range. This should result in:

	A          C           B     E       F      D
	+wwwwwwwwww+DDDDDDDDDDD+zzzzz+rrrrrrr+dddddd+
	          EOF         EOF
               (on disk)  (in memory)

Where 'z' is zeroes on disk.

Have I understood this correctly?

However, if this did actually write zeroes to disk, this would end
up with:

	A          C           B     E       F      D
	+wwwwwwwwww+DDDDDDDDDDD+zzzzz+rrrrrrr+dddddd+
	                      EOF   EOF
                      (in memory)   (on disk)

Which is wrong - the file extension and zeros should not get exposed
to the user until the entire reflink completes. This would expose
zeros at the EOF and a file size that the user never asked for after
a crash. Experience tells me that they would report this as
"filesystem corrupting data on crash".

If we move where i_size gets updated by iomap_zero_iter(), we get:

	A          C           B     E       F      D
	+wwwwwwwwww+DDDDDDDDDDD+zzzzz+rrrrrrr+dddddd+
	                            EOF
                                (in memory)
		                 (on disk)

Which is also wrong, because now the user can see the size change
and read zeros in the middle of the clone operation, which is also
wrong.

IOWs, we do not want to move the in-memory or on-disk EOF as a
result of zeroing delalloc extents beyond EOF as it opens up
transient, non-atomic on-disk states in the event of a crash.

So, catch-22: we need to move the in-memory EOF to write back zeroes
beyond EOF, but that would move the on-disk EOF to E before the
clone operation starts. i.e. it makes clone non-atomic.

What should acutally result from the iomap_zero_range() call from
xfs_reflink_remap_prep() is a state like this:

	A          C           B     E       F      D
	+wwwwwwwwww+DDDDDDDDDDD+uuuuu+rrrrrrr+dddddd+
	          EOF         EOF
               (on disk)  (in memory)

where 'u' are unwritten extent blocks.

i.e. instead of writing zeroes through the page cache for
IOMAP_DELALLOC ranges beyond EOF, we should be converting those
ranges to unwritten and invalidating any cached data over that range
beyond EOF.

IOWs, it looks to me like the problem is that
xfs_buffered_write_iomap_begin() is doing the wrong thing for
IOMAP_ZERO operations for post-EOF regions spanned by speculative
delalloc. It should be converting the region to unwritten so it has
zeroes on disk, not relying on the page cache to be able to do
writeback beyond the current EOF....

> 4. Since we don't update i_size in iomap_zero_iter(),the writeback
>    doesn't write anything back, also doesn't convert the delayed extent.
>    After copy range, the file size will update to F.

Yup, this is all, individually, correct behaviour. But when put
together, the wrong thing happens. I suspect xfs_zero_range() needs
to provide a custom set of iomap_begin/end callbacks rather than
overloading the normal buffered write mechanisms.

-Dave.
Zhang Yi Feb. 29, 2024, 8:59 a.m. UTC | #8
Hello, Dave!

On 2024/2/29 6:25, Dave Chinner wrote:
> On Wed, Feb 28, 2024 at 04:53:32PM +0800, Zhang Yi wrote:
>> On 2024/2/13 13:46, Christoph Hellwig wrote:
>>> Wouldn't it make more sense to just move the size manipulation to the
>>> write-only code?  An untested version of that is below.  With this
>>> the naming of the status variable becomes even more confusing than
>>> it already is, maybe we need to do a cleanup of the *_write_end
>>> calling conventions as it always returns the passed in copied value
>>> or 0.
>>>
>>> diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
>>> index 3dab060aed6d7b..8401a9ca702fc0 100644
>>> --- a/fs/iomap/buffered-io.c
>>> +++ b/fs/iomap/buffered-io.c
>>> @@ -876,34 +876,13 @@ static size_t iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len,
>>>  		size_t copied, struct folio *folio)
>>>  {
>>>  	const struct iomap *srcmap = iomap_iter_srcmap(iter);
>>> -	loff_t old_size = iter->inode->i_size;
>>> -	size_t ret;
>>> -
>>> -	if (srcmap->type == IOMAP_INLINE) {
>>> -		ret = iomap_write_end_inline(iter, folio, pos, copied);
>>> -	} else if (srcmap->flags & IOMAP_F_BUFFER_HEAD) {
>>> -		ret = block_write_end(NULL, iter->inode->i_mapping, pos, len,
>>> -				copied, &folio->page, NULL);
>>> -	} else {
>>> -		ret = __iomap_write_end(iter->inode, pos, len, copied, folio);
>>> -	}
>>> -
>>> -	/*
>>> -	 * Update the in-memory inode size after copying the data into the page
>>> -	 * cache.  It's up to the file system to write the updated size to disk,
>>> -	 * preferably after I/O completion so that no stale data is exposed.
>>> -	 */
>>> -	if (pos + ret > old_size) {
>>> -		i_size_write(iter->inode, pos + ret);
>>> -		iter->iomap.flags |= IOMAP_F_SIZE_CHANGED;
>>> -	}
>>
>> I've recently discovered that if we don't increase i_size in
>> iomap_zero_iter(), it would break fstests generic/476 on xfs. xfs
>> depends on iomap_zero_iter() to increase i_size in some cases.
>>
>>  generic/476 75s ... _check_xfs_filesystem: filesystem on /dev/pmem2 is inconsistent (r)
>>  (see /home/zhangyi/xfstests-dev/results//xfs/generic/476.full for details)
>>
>>  _check_xfs_filesystem: filesystem on /dev/pmem2 is inconsistent (r)
>>  *** xfs_repair -n output ***
>>  Phase 1 - find and verify superblock...
>>  Phase 2 - using internal log
>>          - zero log...
>>          - scan filesystem freespace and inode maps...
>>  sb_fdblocks 10916, counted 10923
>>          - found root inode chunk
>>  ...
>>
>> After debugging and analysis, I found the root cause of the problem is
>> related to the pre-allocations of xfs. xfs pre-allocates some blocks to
>> reduce fragmentation during buffer append writing, then if we write new
>> data or do file copy(reflink) after the end of the pre-allocating range,
>> xfs would zero-out and write back the pre-allocate space(e.g.
>> xfs_file_write_checks() -> xfs_zero_range()), so we have to update
>> i_size before writing back in iomap_zero_iter(), otherwise, it will
>> result in stale delayed extent.
> 
> Ok, so this is long because the example is lacking in clear details
> so to try to understand it I've laid it out in detail to make sure
> I've understood it correctly.
> 

Thanks for the graph, the added detail makes things clear and easy to
understand. To be honest, it's not exactly the same as the results I
captured and described (the position A\B\C\D\E\F I described is
increased one by one), but the root cause of the problem is the same,
so it doesn't affect our understanding of the problem.

>>
>> For more details, let's think about this case,
>> 1. Buffered write from range [A, B) of an empty file foo, and
>>    xfs_buffered_write_iomap_begin() prealloc blocks for it, then create
>>    a delayed extent from [A, D).
> 
> So we have a delayed allocation extent  and the file size is now B
> like so:
> 
> 	A                      B                    D
> 	+DDDDDDDDDDDDDDDDDDDDDD+dddddddddddddddddddd+
> 	                      EOF
> 			  (in memory)
> 
> where 'd' is a delalloc block with no data and 'D' is a delalloc
> block with dirty folios over it.
> 

Yes

>> 2. Write back process map blocks but only convert above delayed extent
>>    from [A, C) since the lack of a contiguous physical blocks, now we
>>    have a left over delayed extent from [C, D), and the file size is B.
> 
> So this produces:
> 
> 	A          C           B                    D
> 	+wwwwwwwwww+DDDDDDDDDDD+dddddddddddddddddddd+
> 	          EOF         EOF
>                (on disk)  (in memory)
> 
> where 'w' contains allocated written data blocks.
> 

The results I captured is:

 	A                      B         C          D
 	+wwwwwwwwwwwwwwwwwwwwww+uuuuuuuuu+ddddddddddd+
 	                      EOF
                          (in memory)
                           (on disk)

>> 3. Copy range from another file to range [E, F), then
>>    xfs_reflink_zero_posteof() would zero-out post eof range [B, E), it
>>    writes zero, dirty and write back [C, E).
> 
> I'm going to assume that [E,F) is located like this because you
> are talking about post-eof zeroing from B to E:
> 
> 	A          C           B     E       F      D
> 	+wwwwwwwwww+DDDDDDDDDDD+ddddd+rrrrrrr+dddddd+
> 	          EOF         EOF
>                (on disk)  (in memory)
> 
> where 'r' is the clone destination over dellaloc blocks.
> 
> Did I get that right?
> 

The results I captured is:

 	A                      B         C          D      E       F
 	+wwwwwwwwwwwwwwwwwwwwww+uuuuuuuuu+dddddddddd+hhhhhh+rrrrrrr+
 	                      EOF
                          (in memory)
                           (on disk)

where 'h' contains a hole.

> And so reflink wants to zero [B,E] before it updates the file size,
> just like a truncate(E) would. iomap_zero_iter() will see a delalloc
> extent (IOMAP_DELALLOC) for [B,E], so it will write zeros into cache
> for it. We then have:
> 
> 	A          C           B     E       F      D
> 	+wwwwwwwwww+DDDDDDDDDDD+ZZZZZ+rrrrrrr+dddddd+
> 	          EOF         EOF
>                (on disk)  (in memory)
> 
> where 'Z' is delalloc blocks with zeroes in cache.
> 

The results I captured is:

 	A                      B         C          D      E       F
 	+wwwwwwwwwwwwwwwwwwwwww+uuuuuuuuu+ZZZZZZZZZZ+hhhhhh+rrrrrrr+
 	                      EOF
                          (in memory)
                           (on disk)

> Because the destination is post EOF, xfs_reflink_remap_prep() then
> does:
> 
>         /*
>          * If pos_out > EOF, we may have dirtied blocks between EOF and
>          * pos_out. In that case, we need to extend the flush and unmap to cover
>          * from EOF to the end of the copy length.
>          */
>         if (pos_out > XFS_ISIZE(dest)) {
>                 loff_t  flen = *len + (pos_out - XFS_ISIZE(dest));
>                 ret = xfs_flush_unmap_range(dest, XFS_ISIZE(dest), flen);
> 	} ....
> 
> Which attempts to flush from the current in memory EOF up to the end
> of the clone destination range. This should result in:
> 
> 	A          C           B     E       F      D
> 	+wwwwwwwwww+DDDDDDDDDDD+zzzzz+rrrrrrr+dddddd+
> 	          EOF         EOF
>                (on disk)  (in memory)
> 
> Where 'z' is zeroes on disk.
> 
> Have I understood this correctly?
> 

The results I captured is:

 	A                      B         C          D      E       F
 	+wwwwwwwwwwwwwwwwwwwwww+uuuuuuuuu+zzzzzzzzzz+hhhhhh+rrrrrrr+
 	                      EOF
                          (in memory)
                           (on disk)

Since we don't update i_size in iomap_zero_iter(), the zeroed C to D
in cache would never write back to disk (iomap_writepage_handle_eof()
would skip them since it's entirely ouside of i_size) and the
'i_size & i_disksize' is still at B, after reflink, the i_size would
be update to F, so the delayed C to D cannot be freed by
xfs_free_eofblocks().

 	A                      B         C          D      E       F
 	+wwwwwwwwwwwwwwwwwwwwww+uuuuuuuuu+dddddddddd+hhhhhh+rrrrrrr+
 	                                                          EOF
                                                              (in memory)
                                                               (on disk)

Although the result is not exactly the same as your understanding,
the situation you describe still triggers the problem.

> However, if this did actually write zeroes to disk, this would end
> up with:
> 
> 	A          C           B     E       F      D
> 	+wwwwwwwwww+DDDDDDDDDDD+zzzzz+rrrrrrr+dddddd+
> 	                      EOF   EOF
>                       (in memory)   (on disk)
> 
> Which is wrong - the file extension and zeros should not get exposed
> to the user until the entire reflink completes. This would expose
> zeros at the EOF and a file size that the user never asked for after
> a crash. Experience tells me that they would report this as
> "filesystem corrupting data on crash".
> 
> If we move where i_size gets updated by iomap_zero_iter(), we get:
> 
> 	A          C           B     E       F      D
> 	+wwwwwwwwww+DDDDDDDDDDD+zzzzz+rrrrrrr+dddddd+
> 	                            EOF
>                                 (in memory)
> 		                 (on disk)
> 
> Which is also wrong, because now the user can see the size change
> and read zeros in the middle of the clone operation, which is also
> wrong.
> 
> IOWs, we do not want to move the in-memory or on-disk EOF as a
> result of zeroing delalloc extents beyond EOF as it opens up
> transient, non-atomic on-disk states in the event of a crash.
> 
> So, catch-22: we need to move the in-memory EOF to write back zeroes
> beyond EOF, but that would move the on-disk EOF to E before the
> clone operation starts. i.e. it makes clone non-atomic.

Make sense. IIUC, I also notice that xfs_file_write_checks() zero
out EOF blocks if the later write offset is beyond the size of the
file. Think about if we replace the reflink operation to a buffer
write E to F, although it doesn't call xfs_flush_unmap_range()
directly, but if it could be raced by another background write
back, and trigger the same problem (I've not try to reproduce it,
so please correct me if I understand wrong).

> 
> What should acutally result from the iomap_zero_range() call from
> xfs_reflink_remap_prep() is a state like this:
> 
> 	A          C           B     E       F      D
> 	+wwwwwwwwww+DDDDDDDDDDD+uuuuu+rrrrrrr+dddddd+
> 	          EOF         EOF
>                (on disk)  (in memory)
> 
> where 'u' are unwritten extent blocks.
> 

Yeah, this is a good solution.

In xfs_file_write_checks(), I don't fully understand why we need
the xfs_zero_range(). Theoretically, iomap have already handled
partial block zeroing for both buffered IO and DIO, so I guess
the only reason we still need it is to handle pre-allocated blocks
(no?). If so,would it be better to call xfs_free_eofblocks() to
release all the preallocated extents in range? If not, maybe we
could only zero out mapped partial blocks and also release
preallocated extents?

In xfs_reflink_remap_prep(), I read the commit 410fdc72b05a ("xfs:
zero posteof blocks when cloning above eof"), xfs used to release
preallocations, the change log said it didn't work because of the
PREALLOC flag, but the 'force' parameter is 'true' when calling
xfs_can_free_eofblocks(), so I don't get the problem met. Could we
fall back to use xfs_free_eofblocks() and make a state like this?

 	A          C           B     E       F      D
 	+wwwwwwwwww+DDDDDDDDDDD+hhhhh+rrrrrrr+dddddd+
 	          EOF         EOF
                (on disk)  (in memory)


Thanks,
Yi.

> i.e. instead of writing zeroes through the page cache for
> IOMAP_DELALLOC ranges beyond EOF, we should be converting those
> ranges to unwritten and invalidating any cached data over that range
> beyond EOF.
> 
> IOWs, it looks to me like the problem is that
> xfs_buffered_write_iomap_begin() is doing the wrong thing for
> IOMAP_ZERO operations for post-EOF regions spanned by speculative
> delalloc. It should be converting the region to unwritten so it has
> zeroes on disk, not relying on the page cache to be able to do
> writeback beyond the current EOF....
> 
>> 4. Since we don't update i_size in iomap_zero_iter(),the writeback
>>    doesn't write anything back, also doesn't convert the delayed extent.
>>    After copy range, the file size will update to F.
> 
> Yup, this is all, individually, correct behaviour. But when put
> together, the wrong thing happens. I suspect xfs_zero_range() needs
> to provide a custom set of iomap_begin/end callbacks rather than
> overloading the normal buffered write mechanisms.
> 
> -Dave.
>
Zhang Yi Feb. 29, 2024, 9:20 a.m. UTC | #9
Hello Christoph!

On 2024/2/29 6:13, Christoph Hellwig wrote:
> On Wed, Feb 28, 2024 at 04:53:32PM +0800, Zhang Yi wrote:
>> So, we have to handle above case for xfs. I suppose we could keep
>> increasing i_size if the zeroed folio is entirely outside of i_size,
>> make sure we could write back and allocate blocks for the
>> zeroed & delayed extent, something like below, any suggestions ?
> 
> Sorry for being dumb, but what was the problem solved by not updating
> the size for ext4 again?  (for unshare I can't see any reason to
> ever update the inode size)
> 

The problem I want to slove by not updating the size for ext4 is
truncate. Now ext4 use iomap_zero_range() for the case of zero
partial blocks, and ext4's truncate is different from xfs.

Let's think about a simple case, we have a reg file with size 3K,
then truncate it to 1K. ext4 first set i_size to 1K and then call
ext4_block_truncate_page() to zero out data after 1K(EOF) through
iomap_zero_range(). But now it will update i_size in
iomap_write_end(), so the size of the file will increase to 4K,
this is wrong. xfs first zero out data through iomap_truncate_page()
and then set file size to 1K, so the file size is 3K->4K->1K.
Although the result is correct, but the increasing in
iomap_zero_range() is also not necessary, so so I'm just gonna
delete the i_size updating in iomap_zero_range(). It's not for
unhare.

Thanks,
Yi.
Dave Chinner Feb. 29, 2024, 11:19 p.m. UTC | #10
On Thu, Feb 29, 2024 at 04:59:34PM +0800, Zhang Yi wrote:
> Hello, Dave!
> 
> On 2024/2/29 6:25, Dave Chinner wrote:
> > On Wed, Feb 28, 2024 at 04:53:32PM +0800, Zhang Yi wrote:
> >> On 2024/2/13 13:46, Christoph Hellwig wrote:
> >>> Wouldn't it make more sense to just move the size manipulation to the
> >>> write-only code?  An untested version of that is below.  With this
> >>> the naming of the status variable becomes even more confusing than
> >>> it already is, maybe we need to do a cleanup of the *_write_end
> >>> calling conventions as it always returns the passed in copied value
> >>> or 0.
> >>>
> >>> diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
> >>> index 3dab060aed6d7b..8401a9ca702fc0 100644
> >>> --- a/fs/iomap/buffered-io.c
> >>> +++ b/fs/iomap/buffered-io.c
> >>> @@ -876,34 +876,13 @@ static size_t iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len,
> >>>  		size_t copied, struct folio *folio)
> >>>  {
> >>>  	const struct iomap *srcmap = iomap_iter_srcmap(iter);
> >>> -	loff_t old_size = iter->inode->i_size;
> >>> -	size_t ret;
> >>> -
> >>> -	if (srcmap->type == IOMAP_INLINE) {
> >>> -		ret = iomap_write_end_inline(iter, folio, pos, copied);
> >>> -	} else if (srcmap->flags & IOMAP_F_BUFFER_HEAD) {
> >>> -		ret = block_write_end(NULL, iter->inode->i_mapping, pos, len,
> >>> -				copied, &folio->page, NULL);
> >>> -	} else {
> >>> -		ret = __iomap_write_end(iter->inode, pos, len, copied, folio);
> >>> -	}
> >>> -
> >>> -	/*
> >>> -	 * Update the in-memory inode size after copying the data into the page
> >>> -	 * cache.  It's up to the file system to write the updated size to disk,
> >>> -	 * preferably after I/O completion so that no stale data is exposed.
> >>> -	 */
> >>> -	if (pos + ret > old_size) {
> >>> -		i_size_write(iter->inode, pos + ret);
> >>> -		iter->iomap.flags |= IOMAP_F_SIZE_CHANGED;
> >>> -	}
> >>
> >> I've recently discovered that if we don't increase i_size in
> >> iomap_zero_iter(), it would break fstests generic/476 on xfs. xfs
> >> depends on iomap_zero_iter() to increase i_size in some cases.
> >>
> >>  generic/476 75s ... _check_xfs_filesystem: filesystem on /dev/pmem2 is inconsistent (r)
> >>  (see /home/zhangyi/xfstests-dev/results//xfs/generic/476.full for details)
> >>
> >>  _check_xfs_filesystem: filesystem on /dev/pmem2 is inconsistent (r)
> >>  *** xfs_repair -n output ***
> >>  Phase 1 - find and verify superblock...
> >>  Phase 2 - using internal log
> >>          - zero log...
> >>          - scan filesystem freespace and inode maps...
> >>  sb_fdblocks 10916, counted 10923
> >>          - found root inode chunk
> >>  ...
> >>
> >> After debugging and analysis, I found the root cause of the problem is
> >> related to the pre-allocations of xfs. xfs pre-allocates some blocks to
> >> reduce fragmentation during buffer append writing, then if we write new
> >> data or do file copy(reflink) after the end of the pre-allocating range,
> >> xfs would zero-out and write back the pre-allocate space(e.g.
> >> xfs_file_write_checks() -> xfs_zero_range()), so we have to update
> >> i_size before writing back in iomap_zero_iter(), otherwise, it will
> >> result in stale delayed extent.
> > 
> > Ok, so this is long because the example is lacking in clear details
> > so to try to understand it I've laid it out in detail to make sure
> > I've understood it correctly.
> > 
> 
> Thanks for the graph, the added detail makes things clear and easy to
> understand. To be honest, it's not exactly the same as the results I
> captured and described (the position A\B\C\D\E\F I described is
> increased one by one), but the root cause of the problem is the same,
> so it doesn't affect our understanding of the problem.

OK, good :)

.....

> > However, if this did actually write zeroes to disk, this would end
> > up with:
> > 
> > 	A          C           B     E       F      D
> > 	+wwwwwwwwww+DDDDDDDDDDD+zzzzz+rrrrrrr+dddddd+
> > 	                      EOF   EOF
> >                       (in memory)   (on disk)
> > 
> > Which is wrong - the file extension and zeros should not get exposed
> > to the user until the entire reflink completes. This would expose
> > zeros at the EOF and a file size that the user never asked for after
> > a crash. Experience tells me that they would report this as
> > "filesystem corrupting data on crash".
> > 
> > If we move where i_size gets updated by iomap_zero_iter(), we get:
> > 
> > 	A          C           B     E       F      D
> > 	+wwwwwwwwww+DDDDDDDDDDD+zzzzz+rrrrrrr+dddddd+
> > 	                            EOF
> >                                 (in memory)
> > 		                 (on disk)
> > 
> > Which is also wrong, because now the user can see the size change
> > and read zeros in the middle of the clone operation, which is also
> > wrong.
> > 
> > IOWs, we do not want to move the in-memory or on-disk EOF as a
> > result of zeroing delalloc extents beyond EOF as it opens up
> > transient, non-atomic on-disk states in the event of a crash.
> > 
> > So, catch-22: we need to move the in-memory EOF to write back zeroes
> > beyond EOF, but that would move the on-disk EOF to E before the
> > clone operation starts. i.e. it makes clone non-atomic.
> 
> Make sense. IIUC, I also notice that xfs_file_write_checks() zero
> out EOF blocks if the later write offset is beyond the size of the
> file.  Think about if we replace the reflink operation to a buffer
> write E to F, although it doesn't call xfs_flush_unmap_range()
> directly, but if it could be raced by another background write
> back, and trigger the same problem (I've not try to reproduce it,
> so please correct me if I understand wrong).

Correct, but the write is about to extend the file size when it
writes into the cache beyond the zeroed region. There is no cache
invalidate possible in this path, so the write of data moves the
in-memory EOF past the zeroes in cache and everything works just
fine.

If it races with concurrent background writeback, the writeback will
skip the zeroed range beyond EOF until they are exposed by the first
data write beyond the zeroed post-eof region which moves the
in-memory EOF.

truncate(to a larger size) also does this same zeroing - the page
cache is zeroed before we move the EOF in memory, and so the
writeback will only occur once the in-memory EOF is moved. i.e. it
effectively does:

	xfs_zero_range(oldsize to newsize)
	truncate_setsize(newsize)
	filemap_write_and_wait_range(old size to new size)

> > What should acutally result from the iomap_zero_range() call from
> > xfs_reflink_remap_prep() is a state like this:
> > 
> > 	A          C           B     E       F      D
> > 	+wwwwwwwwww+DDDDDDDDDDD+uuuuu+rrrrrrr+dddddd+
> > 	          EOF         EOF
> >                (on disk)  (in memory)
> > 
> > where 'u' are unwritten extent blocks.
> > 
> 
> Yeah, this is a good solution.
> 
> In xfs_file_write_checks(), I don't fully understand why we need
> the xfs_zero_range().

The EOF block may only be partially written. Hence on extension, we
have to guarantee the part of that block beyond the current EOF is
zero if the write leaves a hole between the current EOF and the
start of the new extending write.

> Theoretically, iomap have already handled
> partial block zeroing for both buffered IO and DIO, so I guess
> the only reason we still need it is to handle pre-allocated blocks
> (no?).

Historically speaking, Linux is able to leak data beyond EOF on
writeback of partial EOF blocks (e.g. mmap() can write to the EOF
page beyond EOF without failing. We try to mitigate these cases
where we can, but we have to consider that at any time the data in
the cache beyond EOF can be non-zero thanks to mmap() and so any
file extension *must* zero any region beyond EOF cached in the page
cache.

> If so,would it be better to call xfs_free_eofblocks() to
> release all the preallocated extents in range? If not, maybe we
> could only zero out mapped partial blocks and also release
> preallocated extents?

No, that will cause all sorts of other performance problems,
especially for reflinked files that triggering COW
operations...

> 
> In xfs_reflink_remap_prep(), I read the commit 410fdc72b05a ("xfs:
> zero posteof blocks when cloning above eof"), xfs used to release
> preallocations, the change log said it didn't work because of the
> PREALLOC flag, but the 'force' parameter is 'true' when calling
> xfs_can_free_eofblocks(), so I don't get the problem met. Could we
> fall back to use xfs_free_eofblocks() and make a state like this?
> 
>  	A          C           B     E       F      D
>  	+wwwwwwwwww+DDDDDDDDDDD+hhhhh+rrrrrrr+dddddd+
>  	          EOF         EOF
>                 (on disk)  (in memory)

It could, but that then requires every place that may call
xfs_zero_range() to be aware of this need to trim EOF blocks to do
the right thing in all cases. We don't want to remove speculative
delalloc in the write() path nor in the truncate(up) case, and so it
doesn't fix the general problem of zeroing specualtive delalloc
beyond EOF requiring writeback to push page caceh pages to disk
before the inode size has been updated.

The general solution is to have zeroing of speculative prealloc
extents beyond EOF simply convert the range to unwritten and then
invalidate any cached pages over that range. At this point, we are
guaranteed to have zeroes across that range, all without needing to
do any IO at all...

-Dave.
Darrick J. Wong Feb. 29, 2024, 11:29 p.m. UTC | #11
On Fri, Mar 01, 2024 at 10:19:30AM +1100, Dave Chinner wrote:
> On Thu, Feb 29, 2024 at 04:59:34PM +0800, Zhang Yi wrote:
> > Hello, Dave!
> > 
> > On 2024/2/29 6:25, Dave Chinner wrote:
> > > On Wed, Feb 28, 2024 at 04:53:32PM +0800, Zhang Yi wrote:
> > >> On 2024/2/13 13:46, Christoph Hellwig wrote:
> > >>> Wouldn't it make more sense to just move the size manipulation to the
> > >>> write-only code?  An untested version of that is below.  With this
> > >>> the naming of the status variable becomes even more confusing than
> > >>> it already is, maybe we need to do a cleanup of the *_write_end
> > >>> calling conventions as it always returns the passed in copied value
> > >>> or 0.
> > >>>
> > >>> diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
> > >>> index 3dab060aed6d7b..8401a9ca702fc0 100644
> > >>> --- a/fs/iomap/buffered-io.c
> > >>> +++ b/fs/iomap/buffered-io.c
> > >>> @@ -876,34 +876,13 @@ static size_t iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len,
> > >>>  		size_t copied, struct folio *folio)
> > >>>  {
> > >>>  	const struct iomap *srcmap = iomap_iter_srcmap(iter);
> > >>> -	loff_t old_size = iter->inode->i_size;
> > >>> -	size_t ret;
> > >>> -
> > >>> -	if (srcmap->type == IOMAP_INLINE) {
> > >>> -		ret = iomap_write_end_inline(iter, folio, pos, copied);
> > >>> -	} else if (srcmap->flags & IOMAP_F_BUFFER_HEAD) {
> > >>> -		ret = block_write_end(NULL, iter->inode->i_mapping, pos, len,
> > >>> -				copied, &folio->page, NULL);
> > >>> -	} else {
> > >>> -		ret = __iomap_write_end(iter->inode, pos, len, copied, folio);
> > >>> -	}
> > >>> -
> > >>> -	/*
> > >>> -	 * Update the in-memory inode size after copying the data into the page
> > >>> -	 * cache.  It's up to the file system to write the updated size to disk,
> > >>> -	 * preferably after I/O completion so that no stale data is exposed.
> > >>> -	 */
> > >>> -	if (pos + ret > old_size) {
> > >>> -		i_size_write(iter->inode, pos + ret);
> > >>> -		iter->iomap.flags |= IOMAP_F_SIZE_CHANGED;
> > >>> -	}
> > >>
> > >> I've recently discovered that if we don't increase i_size in
> > >> iomap_zero_iter(), it would break fstests generic/476 on xfs. xfs
> > >> depends on iomap_zero_iter() to increase i_size in some cases.
> > >>
> > >>  generic/476 75s ... _check_xfs_filesystem: filesystem on /dev/pmem2 is inconsistent (r)
> > >>  (see /home/zhangyi/xfstests-dev/results//xfs/generic/476.full for details)
> > >>
> > >>  _check_xfs_filesystem: filesystem on /dev/pmem2 is inconsistent (r)
> > >>  *** xfs_repair -n output ***
> > >>  Phase 1 - find and verify superblock...
> > >>  Phase 2 - using internal log
> > >>          - zero log...
> > >>          - scan filesystem freespace and inode maps...
> > >>  sb_fdblocks 10916, counted 10923
> > >>          - found root inode chunk
> > >>  ...
> > >>
> > >> After debugging and analysis, I found the root cause of the problem is
> > >> related to the pre-allocations of xfs. xfs pre-allocates some blocks to
> > >> reduce fragmentation during buffer append writing, then if we write new
> > >> data or do file copy(reflink) after the end of the pre-allocating range,
> > >> xfs would zero-out and write back the pre-allocate space(e.g.
> > >> xfs_file_write_checks() -> xfs_zero_range()), so we have to update
> > >> i_size before writing back in iomap_zero_iter(), otherwise, it will
> > >> result in stale delayed extent.
> > > 
> > > Ok, so this is long because the example is lacking in clear details
> > > so to try to understand it I've laid it out in detail to make sure
> > > I've understood it correctly.
> > > 
> > 
> > Thanks for the graph, the added detail makes things clear and easy to
> > understand. To be honest, it's not exactly the same as the results I
> > captured and described (the position A\B\C\D\E\F I described is
> > increased one by one), but the root cause of the problem is the same,
> > so it doesn't affect our understanding of the problem.
> 
> OK, good :)
> 
> .....
> 
> > > However, if this did actually write zeroes to disk, this would end
> > > up with:
> > > 
> > > 	A          C           B     E       F      D
> > > 	+wwwwwwwwww+DDDDDDDDDDD+zzzzz+rrrrrrr+dddddd+
> > > 	                      EOF   EOF
> > >                       (in memory)   (on disk)
> > > 
> > > Which is wrong - the file extension and zeros should not get exposed
> > > to the user until the entire reflink completes. This would expose
> > > zeros at the EOF and a file size that the user never asked for after
> > > a crash. Experience tells me that they would report this as
> > > "filesystem corrupting data on crash".
> > > 
> > > If we move where i_size gets updated by iomap_zero_iter(), we get:
> > > 
> > > 	A          C           B     E       F      D
> > > 	+wwwwwwwwww+DDDDDDDDDDD+zzzzz+rrrrrrr+dddddd+
> > > 	                            EOF
> > >                                 (in memory)
> > > 		                 (on disk)
> > > 
> > > Which is also wrong, because now the user can see the size change
> > > and read zeros in the middle of the clone operation, which is also
> > > wrong.
> > > 
> > > IOWs, we do not want to move the in-memory or on-disk EOF as a
> > > result of zeroing delalloc extents beyond EOF as it opens up
> > > transient, non-atomic on-disk states in the event of a crash.
> > > 
> > > So, catch-22: we need to move the in-memory EOF to write back zeroes
> > > beyond EOF, but that would move the on-disk EOF to E before the
> > > clone operation starts. i.e. it makes clone non-atomic.
> > 
> > Make sense. IIUC, I also notice that xfs_file_write_checks() zero
> > out EOF blocks if the later write offset is beyond the size of the
> > file.  Think about if we replace the reflink operation to a buffer
> > write E to F, although it doesn't call xfs_flush_unmap_range()
> > directly, but if it could be raced by another background write
> > back, and trigger the same problem (I've not try to reproduce it,
> > so please correct me if I understand wrong).
> 
> Correct, but the write is about to extend the file size when it
> writes into the cache beyond the zeroed region. There is no cache
> invalidate possible in this path, so the write of data moves the
> in-memory EOF past the zeroes in cache and everything works just
> fine.
> 
> If it races with concurrent background writeback, the writeback will
> skip the zeroed range beyond EOF until they are exposed by the first
> data write beyond the zeroed post-eof region which moves the
> in-memory EOF.
> 
> truncate(to a larger size) also does this same zeroing - the page
> cache is zeroed before we move the EOF in memory, and so the
> writeback will only occur once the in-memory EOF is moved. i.e. it
> effectively does:
> 
> 	xfs_zero_range(oldsize to newsize)
> 	truncate_setsize(newsize)
> 	filemap_write_and_wait_range(old size to new size)
> 
> > > What should acutally result from the iomap_zero_range() call from
> > > xfs_reflink_remap_prep() is a state like this:
> > > 
> > > 	A          C           B     E       F      D
> > > 	+wwwwwwwwww+DDDDDDDDDDD+uuuuu+rrrrrrr+dddddd+
> > > 	          EOF         EOF
> > >                (on disk)  (in memory)
> > > 
> > > where 'u' are unwritten extent blocks.
> > > 
> > 
> > Yeah, this is a good solution.
> > 
> > In xfs_file_write_checks(), I don't fully understand why we need
> > the xfs_zero_range().
> 
> The EOF block may only be partially written. Hence on extension, we
> have to guarantee the part of that block beyond the current EOF is
> zero if the write leaves a hole between the current EOF and the
> start of the new extending write.
> 
> > Theoretically, iomap have already handled
> > partial block zeroing for both buffered IO and DIO, so I guess
> > the only reason we still need it is to handle pre-allocated blocks
> > (no?).
> 
> Historically speaking, Linux is able to leak data beyond EOF on
> writeback of partial EOF blocks (e.g. mmap() can write to the EOF
> page beyond EOF without failing. We try to mitigate these cases
> where we can, but we have to consider that at any time the data in
> the cache beyond EOF can be non-zero thanks to mmap() and so any
> file extension *must* zero any region beyond EOF cached in the page
> cache.
> 
> > If so,would it be better to call xfs_free_eofblocks() to
> > release all the preallocated extents in range? If not, maybe we
> > could only zero out mapped partial blocks and also release
> > preallocated extents?
> 
> No, that will cause all sorts of other performance problems,
> especially for reflinked files that triggering COW
> operations...
> 
> > 
> > In xfs_reflink_remap_prep(), I read the commit 410fdc72b05a ("xfs:
> > zero posteof blocks when cloning above eof"), xfs used to release
> > preallocations, the change log said it didn't work because of the
> > PREALLOC flag, but the 'force' parameter is 'true' when calling
> > xfs_can_free_eofblocks(), so I don't get the problem met. Could we
> > fall back to use xfs_free_eofblocks() and make a state like this?
> > 
> >  	A          C           B     E       F      D
> >  	+wwwwwwwwww+DDDDDDDDDDD+hhhhh+rrrrrrr+dddddd+
> >  	          EOF         EOF
> >                 (on disk)  (in memory)
> 
> It could, but that then requires every place that may call
> xfs_zero_range() to be aware of this need to trim EOF blocks to do
> the right thing in all cases. We don't want to remove speculative
> delalloc in the write() path nor in the truncate(up) case, and so it
> doesn't fix the general problem of zeroing specualtive delalloc
> beyond EOF requiring writeback to push page caceh pages to disk
> before the inode size has been updated.
> 
> The general solution is to have zeroing of speculative prealloc
> extents beyond EOF simply convert the range to unwritten and then
> invalidate any cached pages over that range. At this point, we are
> guaranteed to have zeroes across that range, all without needing to
> do any IO at all...

That (separate iomap ops that do the delalloc -> unwritten allocation)
seems a lot more straightforward to me than whacking preallocations.

--D

> -Dave.
> -- 
> Dave Chinner
> david@fromorbit.com
>
Zhang Yi March 1, 2024, 3:26 a.m. UTC | #12
On 2024/3/1 7:19, Dave Chinner wrote:
> On Thu, Feb 29, 2024 at 04:59:34PM +0800, Zhang Yi wrote:
>> Hello, Dave!
>>
>> On 2024/2/29 6:25, Dave Chinner wrote:
>>> On Wed, Feb 28, 2024 at 04:53:32PM +0800, Zhang Yi wrote:
>>>> On 2024/2/13 13:46, Christoph Hellwig wrote:
...
> 
> The general solution is to have zeroing of speculative prealloc
> extents beyond EOF simply convert the range to unwritten and then
> invalidate any cached pages over that range. At this point, we are
> guaranteed to have zeroes across that range, all without needing to
> do any IO at all...
> 

Sure, thanks for the explanation, I will try to solve this problem
for xfs first.

Thanks,
Yi.
diff mbox series

Patch

diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index e0c9cede82ee..2ae936e5af74 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -834,7 +834,6 @@  static int iomap_write_begin(struct iomap_iter *iter, loff_t pos,
 
 out_unlock:
 	__iomap_put_folio(iter, pos, 0, folio);
-	iomap_write_failed(iter->inode, pos, len);
 
 	return status;
 }
@@ -881,6 +880,25 @@  static size_t iomap_write_end_inline(const struct iomap_iter *iter,
 	return copied;
 }
 
+static size_t iomap_write_end_simple(struct iomap_iter *iter, loff_t pos,
+		size_t len, struct folio *folio)
+{
+	const struct iomap *srcmap = iomap_iter_srcmap(iter);
+	size_t ret;
+
+	if (srcmap->type == IOMAP_INLINE) {
+		ret = iomap_write_end_inline(iter, folio, pos, len);
+	} else if (srcmap->flags & IOMAP_F_BUFFER_HEAD) {
+		ret = block_write_end(NULL, iter->inode->i_mapping, pos, len,
+				len, &folio->page, NULL);
+	} else {
+		ret = __iomap_write_end(iter->inode, pos, len, len, folio);
+	}
+
+	__iomap_put_folio(iter, pos, ret, folio);
+	return ret;
+}
+
 /* Returns the number of bytes copied.  May be 0.  Cannot be an errno. */
 static size_t iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len,
 		size_t copied, struct folio *folio)
@@ -960,8 +978,10 @@  static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
 		}
 
 		status = iomap_write_begin(iter, pos, bytes, &folio);
-		if (unlikely(status))
+		if (unlikely(status)) {
+			iomap_write_failed(iter->inode, pos, bytes);
 			break;
+		}
 		if (iter->iomap.flags & IOMAP_F_STALE)
 			break;
 
@@ -1343,7 +1363,7 @@  static loff_t iomap_unshare_iter(struct iomap_iter *iter)
 		if (bytes > folio_size(folio) - offset)
 			bytes = folio_size(folio) - offset;
 
-		bytes = iomap_write_end(iter, pos, bytes, bytes, folio);
+		bytes = iomap_write_end_simple(iter, pos, bytes, folio);
 		if (WARN_ON_ONCE(bytes == 0))
 			return -EIO;
 
@@ -1407,7 +1427,7 @@  static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero)
 		folio_zero_range(folio, offset, bytes);
 		folio_mark_accessed(folio);
 
-		bytes = iomap_write_end(iter, pos, bytes, bytes, folio);
+		bytes = iomap_write_end_simple(iter, pos, bytes, folio);
 		if (WARN_ON_ONCE(bytes == 0))
 			return -EIO;