diff mbox series

[v4,1/2] iomap: fix zero padding data issue in concurrent append writes

Message ID 20241125023341.2816630-1-leo.lilong@huawei.com (mailing list archive)
State New
Headers show
Series [v4,1/2] iomap: fix zero padding data issue in concurrent append writes | expand

Commit Message

Long Li Nov. 25, 2024, 2:33 a.m. UTC
During concurrent append writes to XFS filesystem, zero padding data
may appear in the file after power failure. This happens due to imprecise
disk size updates when handling write completion.

Consider this scenario with concurrent append writes same file:

  Thread 1:                  Thread 2:
  ------------               -----------
  write [A, A+B]
  update inode size to A+B
  submit I/O [A, A+BS]
                             write [A+B, A+B+C]
                             update inode size to A+B+C
  <I/O completes, updates disk size to min(A+B+C, A+BS)>
  <power failure>

After reboot:
  1) with A+B+C < A+BS, the file has zero padding in range [A+B, A+B+C]

  |<         Block Size (BS)      >|
  |DDDDDDDDDDDDDDDD0000000000000000|
  ^               ^        ^
  A              A+B     A+B+C
                         (EOF)

  2) with A+B+C > A+BS, the file has zero padding in range [A+B, A+BS]

  |<         Block Size (BS)      >|<           Block Size (BS)    >|
  |DDDDDDDDDDDDDDDD0000000000000000|00000000000000000000000000000000|
  ^               ^                ^               ^
  A              A+B              A+BS           A+B+C
                                  (EOF)

  D = Valid Data
  0 = Zero Padding

The issue stems from disk size being set to min(io_offset + io_size,
inode->i_size) at I/O completion. Since io_offset+io_size is block
size granularity, it may exceed the actual valid file data size. In
the case of concurrent append writes, inode->i_size may be larger
than the actual range of valid file data written to disk, leading to
inaccurate disk size updates.

This patch changes the meaning of io_size to represent the size of
valid data within eof in ioend, while the extent size of ioend can
be obtained by rounding up to block size in wrapper function.
This function is specifically used for ioend grow/merge management:
1. In concurrent writes, when one write's io_size is truncated due
   to non-block-aligned file size while another write extends the file
   size, if these two writes are physically and logically contiguous
   at block boundaries, rounding up io_size to block boundaries helps
   grow the first write's ioend and share this ioend between both
   writes.
2. During IO completion, we try to merge physically and logically
   contiguous ioends before completion to minimize the number of
   transactions. Rounding up io_size to block boundaries helps merge
   ioends whose io_size is not block-aligned.

Another benefit is that it makes the xfs_ioend_is_append() check more
accurate, which can reduce unnecessary end bio callbacks of xfs_end_bio()
in certain scenarios, such as repeated writes at the file tail without
extending the file size.

Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Signed-off-by: Long Li <leo.lilong@huawei.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
---
v3->v4:
  1. collect reviewed tag
  2. Modify the comment of io_size and iomap_ioend_size_aligned().
  3. Add explain of iomap_ioend_size_aligned() to commit message.
 fs/iomap/buffered-io.c | 37 +++++++++++++++++++++++++++++++------
 include/linux/iomap.h  |  2 +-
 2 files changed, 32 insertions(+), 7 deletions(-)

Comments

Christoph Hellwig Nov. 25, 2024, 6:40 a.m. UTC | #1
On Mon, Nov 25, 2024 at 10:33:40AM +0800, Long Li wrote:
>   1. collect reviewed tag
>   2. Modify the comment of io_size and iomap_ioend_size_aligned().
>   3. Add explain of iomap_ioend_size_aligned() to commit message.

Just curious, did you look into Brian's suggestions to do away
with the rounding up entirely as there is not much practical benefit
in merging behind EOF?
diff mbox series

Patch

diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index d42f01e0fc1c..e59c08c0e529 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -1593,12 +1593,24 @@  iomap_finish_ioends(struct iomap_ioend *ioend, int error)
 }
 EXPORT_SYMBOL_GPL(iomap_finish_ioends);
 
+/*
+ * Calculate the physical size of an ioend by rounding up to block granularity.
+ * io_size might be unaligned if the last block crossed an unaligned i_size
+ * boundary at creation time.
+ */
+static inline size_t iomap_ioend_size_aligned(struct iomap_ioend *ioend)
+{
+	return round_up(ioend->io_size, i_blocksize(ioend->io_inode));
+}
+
 /*
  * We can merge two adjacent ioends if they have the same set of work to do.
  */
 static bool
 iomap_ioend_can_merge(struct iomap_ioend *ioend, struct iomap_ioend *next)
 {
+	size_t size = iomap_ioend_size_aligned(ioend);
+
 	if (ioend->io_bio.bi_status != next->io_bio.bi_status)
 		return false;
 	if (next->io_flags & IOMAP_F_BOUNDARY)
@@ -1609,7 +1621,7 @@  iomap_ioend_can_merge(struct iomap_ioend *ioend, struct iomap_ioend *next)
 	if ((ioend->io_type == IOMAP_UNWRITTEN) ^
 	    (next->io_type == IOMAP_UNWRITTEN))
 		return false;
-	if (ioend->io_offset + ioend->io_size != next->io_offset)
+	if (ioend->io_offset + size != next->io_offset)
 		return false;
 	/*
 	 * Do not merge physically discontiguous ioends. The filesystem
@@ -1621,7 +1633,7 @@  iomap_ioend_can_merge(struct iomap_ioend *ioend, struct iomap_ioend *next)
 	 * submission so does not point to the start sector of the bio at
 	 * completion.
 	 */
-	if (ioend->io_sector + (ioend->io_size >> 9) != next->io_sector)
+	if (ioend->io_sector + (size >> 9) != next->io_sector)
 		return false;
 	return true;
 }
@@ -1638,7 +1650,8 @@  iomap_ioend_try_merge(struct iomap_ioend *ioend, struct list_head *more_ioends)
 		if (!iomap_ioend_can_merge(ioend, next))
 			break;
 		list_move_tail(&next->io_list, &ioend->io_list);
-		ioend->io_size += next->io_size;
+		ioend->io_size = iomap_ioend_size_aligned(ioend) +
+					next->io_size;
 	}
 }
 EXPORT_SYMBOL_GPL(iomap_ioend_try_merge);
@@ -1742,7 +1755,7 @@  static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos)
 		return false;
 	if (wpc->iomap.type != wpc->ioend->io_type)
 		return false;
-	if (pos != wpc->ioend->io_offset + wpc->ioend->io_size)
+	if (pos != wpc->ioend->io_offset + iomap_ioend_size_aligned(wpc->ioend))
 		return false;
 	if (iomap_sector(&wpc->iomap, pos) !=
 	    bio_end_sector(&wpc->ioend->io_bio))
@@ -1774,6 +1787,8 @@  static int iomap_add_to_ioend(struct iomap_writepage_ctx *wpc,
 {
 	struct iomap_folio_state *ifs = folio->private;
 	size_t poff = offset_in_folio(folio, pos);
+	loff_t isize = i_size_read(inode);
+	struct iomap_ioend *ioend;
 	int error;
 
 	if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, pos)) {
@@ -1784,12 +1799,22 @@  static int iomap_add_to_ioend(struct iomap_writepage_ctx *wpc,
 		wpc->ioend = iomap_alloc_ioend(wpc, wbc, inode, pos);
 	}
 
-	if (!bio_add_folio(&wpc->ioend->io_bio, folio, len, poff))
+	ioend = wpc->ioend;
+	if (!bio_add_folio(&ioend->io_bio, folio, len, poff))
 		goto new_ioend;
 
 	if (ifs)
 		atomic_add(len, &ifs->write_bytes_pending);
-	wpc->ioend->io_size += len;
+
+	/*
+	 * If the ioend spans i_size, trim io_size to the former to provide
+	 * the fs with more accurate size information. This is useful for
+	 * completion time on-disk size updates.
+	 */
+	ioend->io_size = iomap_ioend_size_aligned(ioend) + len;
+	if (ioend->io_offset + ioend->io_size > isize)
+		ioend->io_size = isize - ioend->io_offset;
+
 	wbc_account_cgroup_owner(wbc, folio, len);
 	return 0;
 }
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 5675af6b740c..75bf54e76f3b 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -335,7 +335,7 @@  struct iomap_ioend {
 	u16			io_type;
 	u16			io_flags;	/* IOMAP_F_* */
 	struct inode		*io_inode;	/* file being written to */
-	size_t			io_size;	/* size of the extent */
+	size_t			io_size;	/* size of data within eof */
 	loff_t			io_offset;	/* offset in the file */
 	sector_t		io_sector;	/* start sector of ioend */
 	struct bio		io_bio;		/* MUST BE LAST! */