@@ -895,6 +895,25 @@ const struct iomap_ops xfs_dax_write_iomap_ops = {
.iomap_end = xfs_dax_write_iomap_end,
};
+/*
+ * Determine whether a mapping requires a flush and retry before returning to
+ * iomap. This is restricted to cases where a flush may change extent state
+ * in a way known to change iomap behavior.
+ */
+static inline bool
+imap_needs_flush(
+ unsigned flags,
+ bool range_dirty,
+ struct xfs_bmbt_irec *imap)
+{
+ if (!(flags & IOMAP_ZERO))
+ return false;
+ if (!range_dirty)
+ return false;
+ return imap->br_startblock == HOLESTARTBLOCK ||
+ imap->br_state == XFS_EXT_UNWRITTEN;
+}
+
static int
xfs_buffered_write_iomap_begin(
struct inode *inode,
@@ -908,10 +927,12 @@ xfs_buffered_write_iomap_begin(
struct xfs_mount *mp = ip->i_mount;
xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
xfs_fileoff_t end_fsb = xfs_iomap_end_fsb(mp, offset, count);
+ loff_t endoff = offset + count - 1;
struct xfs_bmbt_irec imap, cmap;
struct xfs_iext_cursor icur, ccur;
xfs_fsblock_t prealloc_blocks = 0;
bool eof = false, cow_eof = false, shared = false;
+ bool range_dirty;
int allocfork = XFS_DATA_FORK;
int error = 0;
unsigned int lockmode = XFS_ILOCK_EXCL;
@@ -926,6 +947,8 @@ xfs_buffered_write_iomap_begin(
ASSERT(!XFS_IS_REALTIME_INODE(ip));
+restart:
+ range_dirty = false;
error = xfs_ilock_for_iomap(ip, flags, &lockmode);
if (error)
return error;
@@ -942,6 +965,24 @@ xfs_buffered_write_iomap_begin(
if (error)
goto out_unlock;
+ /*
+ * Certain operations may not work correctly if the associated file
+ * range is dirty in pagecache. For example, zeroing skips unwritten
+ * extents even if uptodate and dirty folios exist in cache. This can
+ * cause various sorts of stale data exposure problems where zeroing is
+ * used as part of truncate or size extending I/O operations that don't
+ * typically flush and invalidate cache beforehand.
+ *
+ * To prevent this problem, flush and convert any such extent mappings
+ * before returning to iomap. Ideally, iomap should learn to handle
+ * pagecache correctly on its own so this can be removed. Note that this
+ * flush should be relatively uncommon as most heavy zero range users
+ * flush and invalidate the entire target range, so this is a simple,
+ * low overhead step to prevent data corruption issues.
+ */
+ if (filemap_range_needs_writeback(inode->i_mapping, offset, endoff))
+ range_dirty = true;
+
/*
* Search the data fork first to look up our source mapping. We
* always need the data fork map, as we have to return it to the
@@ -952,8 +993,18 @@ xfs_buffered_write_iomap_begin(
if (eof)
imap.br_startoff = end_fsb; /* fake hole until the end */
- /* We never need to allocate blocks for zeroing a hole. */
+ /*
+ * We never need to allocate blocks for zeroing a hole, but we might
+ * need to flush the mapping if dirty. This could mean the range is at
+ * least partially backed by the COW fork. This is generally unexpected,
+ * but writeback handles this situation so let it resolve the conflict
+ * and ensure zeroing does the right thing.
+ */
if ((flags & IOMAP_ZERO) && imap.br_startoff > offset_fsb) {
+ if (imap_needs_flush(flags, range_dirty, &imap)) {
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ goto flush_restart;
+ }
xfs_hole_to_iomap(ip, iomap, offset_fsb, imap.br_startoff);
goto out_unlock;
}
@@ -1100,11 +1151,15 @@ xfs_buffered_write_iomap_begin(
found_imap:
xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ if (imap_needs_flush(flags, range_dirty, &imap))
+ goto flush_restart;
return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0);
found_cow:
xfs_iunlock(ip, XFS_ILOCK_EXCL);
if (imap.br_startoff <= offset_fsb) {
+ if (imap_needs_flush(flags, range_dirty, &imap))
+ goto flush_restart;
error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0);
if (error)
return error;
@@ -1112,12 +1167,20 @@ xfs_buffered_write_iomap_begin(
IOMAP_F_SHARED);
}
+ if (imap_needs_flush(flags, range_dirty, &cmap))
+ goto flush_restart;
xfs_trim_extent(&cmap, offset_fsb, imap.br_startoff - offset_fsb);
return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, 0);
out_unlock:
xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
+
+flush_restart:
+ error = filemap_write_and_wait_range(inode->i_mapping, offset, endoff);
+ if (!error)
+ goto restart;
+ return error;
}
static int
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -840,16 +840,6 @@ xfs_setattr_size(
error = xfs_zero_range(ip, oldsize, newsize - oldsize,
&did_zeroing);
} else {
- /*
- * iomap won't detect a dirty page over an unwritten block (or a
- * cow block over a hole) and subsequently skips zeroing the
- * newly post-EOF portion of the page. Flush the new EOF to
- * convert the block before the pagecache truncate.
- */
- error = filemap_write_and_wait_range(inode->i_mapping, newsize,
- newsize);
- if (error)
- return error;
error = xfs_truncate_page(ip, newsize, &did_zeroing);
}
The truncate code has a tortured history of hacks trying to prevent
various consistency problems. The most recent example is the flush
introduced by commit 869ae85dae64b ("xfs: flush new eof page on
truncate to avoid post-eof corruption") to work around the fact that
iomap zeroing doesn't handle certain extent types properly when backed
by dirty pagecache. Unfortunately, this workaround has been shown to
cause a significant performance impact in certain small file overwrite
and truncate workloads.

The ideal solution here is for iomap to learn to handle such dirty
cache scenarios and perform proper zeroing, but that is not a trivial
endeavor and may depend on additional functionality that is not
currently complete. In lieu of that goal, we can move the unconditional
flush in the truncate path one step closer to iomap by moving it into
->iomap_begin(). This allows selective invocation for only the
scenarios where the mapping type can change in the background, and
therefore avoids the flush completely in the aforementioned overwrite
scenario and provides a significant improvement in latency of such
operations.

For example, from a test program doing a small, fixed size overwrite
and truncate:

On recent master:

path /mnt/file, 100000 iters

% time     seconds  usecs/call     calls    errors syscall
------ ----------- ----------- --------- --------- ----------------
 81.56    1.626925          16    100000           ftruncate
 18.42    0.367488           3    100000           pwrite64
...

v6.1.0-rc3 with this patch:

path /mnt/file, 100000 iters

% time     seconds  usecs/call     calls    errors syscall
------ ----------- ----------- --------- --------- ----------------
 74.76    1.077406          10    100000           ftruncate
 25.19    0.363060           3    100000           pwrite64
...

This also has the side effect of helping to prevent further outstanding
post-eof data zeroing issues caused by the same fundamental iomap
problem. For example, consider the following (somewhat contrived)
example that demonstrates a potential stale data exposure vector. On
recent mainline:

$ xfs_io -fc "falloc 0 1k" -c "pwrite 0 1k" \
	-c "mmap 0 4k" -c "mwrite 3k 1k" \
	-c "pwrite 32k 4k" -c fsync -c "pread -v 3k 32" <file>
...
00000c00:  58 58 58 58 58 58 58 58 58 58 58 58 58 58 58 58  XXXXXXXXXXXXXXXX
00000c10:  58 58 58 58 58 58 58 58 58 58 58 58 58 58 58 58  XXXXXXXXXXXXXXXX
...

while the same command with this patch produces:

00000c00:  00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
00000c10:  00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
...

which also happens to match behavior with the first falloc subcommand
removed.

While this patch does not implement full iomap based handling of dirty
pagecache data, it improves correctness and performance, is simple
enough for stable backports, and doesn't preclude replacement by more
advanced or efficient solutions via iomap in the future.

Signed-off-by: Brian Foster <bfoster@redhat.com>
---

This is a followup to the RFC and discussion here[1]. Towards the end,
Dave describes an approach to implement iomap pagecache zeroing based
on the recently posted iomap stale mapping detection. While that sounds
plausible, it's a notable jump in complexity and test scope and a more
involved dependency chain than is necessary to fix the performance and
stale data exposure problems. Therefore I'm posting this modified
variant as an intermediate step toward that goal.

Since there is apparent overlap between the iomap based RFC and stale
iomap detection, this instead relocates the flush/retry step into the
->iomap_begin() callback such that it can be lifted into iomap sometime
later, when proper support mechanisms to validate pagecache state
against extent state have stabilized. This can potentially enable a
higher level folio state check or more explicit folio zeroing to avoid
the need to flush entirely.

Brian

[1] https://lore.kernel.org/linux-xfs/20221028130411.977076-1-bfoster@redhat.com/

 fs/xfs/xfs_iomap.c | 65 +++++++++++++++++++++++++++++++++++++++++++++-
 fs/xfs/xfs_iops.c  | 10 -------
 2 files changed, 64 insertions(+), 11 deletions(-)
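For reference, below is a minimal userspace sketch of the kind of test
program described in the commit log: a small, fixed size overwrite
followed by a truncate, repeated in a loop. The file path, buffer size,
truncate length and iteration count are illustrative assumptions only,
not necessarily the exact parameters behind the quoted strace numbers.
Running it under "strace -c" against a file on an XFS mount should
produce a per-syscall profile similar to the one shown above.

/*
 * Sketch of a small overwrite + truncate loop. Path, sizes, and
 * iteration count are assumptions for illustration.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(int argc, char **argv)
{
	const char	*path = argc > 1 ? argv[1] : "/mnt/file";
	const int	iters = 100000;
	const off_t	trunclen = 2048;	/* truncate back into the dirty page */
	char		buf[4096];		/* small, fixed size overwrite */
	int		fd, i;

	memset(buf, 'a', sizeof(buf));

	fd = open(path, O_RDWR | O_CREAT, 0644);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	printf("path %s, %d iters\n", path, iters);

	for (i = 0; i < iters; i++) {
		/* dirty pagecache over the block at the new EOF... */
		if (pwrite(fd, buf, sizeof(buf), 0) != (ssize_t)sizeof(buf)) {
			perror("pwrite");
			return 1;
		}
		/* ...then truncate down, which previously forced a flush */
		if (ftruncate(fd, trunclen) < 0) {
			perror("ftruncate");
			return 1;
		}
	}

	close(fd);
	return 0;
}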