From patchwork Fri Dec 13 01:21:43 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Darrick J. Wong" X-Patchwork-Id: 13906311 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 6278317BA1 for ; Fri, 13 Dec 2024 01:21:44 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1734052904; cv=none; b=rkfMovjceYNsCY94FppUv9Yd8FiIE/A5pw7CeuGBZwPGGhJhRRAIJVmsd373l4wjzFDMrLrTUsmfKnAuaTMz5o+5wOaJfIpmAclMzm2J5LuYgcMTwYZ/bP9Lfh7KdkrBJ0U0kwtRa//bFqoLbt2Myr0Tk6k06ZhvBVGxigp4LrI= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1734052904; c=relaxed/simple; bh=v/juFr1JZ3Mw7m4ULZG1dvu6bJZJbU4F1Y2pTICBUMU=; h=Date:Subject:From:To:Cc:Message-ID:In-Reply-To:References: MIME-Version:Content-Type; b=lSId6vDb2gXNNGjRYV3n5CwP2M3ZfxrundZuQTOCmaH3dQN/KBmcQyyZlvXzdG4nk6mcZ7gShB6onzgd7uesASNsYfiKN7jSh/vveKWPnC0XuwPsqyhiHnRtAJd6lLDU4+AuLHdxQTDuDoBSUR5gxpvx/0fOPWAwJdgYrr8Gfxo= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=L85F9kfU; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="L85F9kfU" Received: by smtp.kernel.org (Postfix) with ESMTPSA id D11DAC4CECE; Fri, 13 Dec 2024 01:21:43 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1734052903; bh=v/juFr1JZ3Mw7m4ULZG1dvu6bJZJbU4F1Y2pTICBUMU=; h=Date:Subject:From:To:Cc:In-Reply-To:References:From; b=L85F9kfUlrqLww01xtaGJptmXm6EavzSBPc1zUZh8q/drd+KCMHvkd9NfRwtlwEck 
uNBxmA+hNH6slNf+XUcNgMYbjTskU7bjKOyKl+RUkP3AYoLOxzoGu4IAsa7HO5Iq10 k7wLmizFpnW4bSLzoXwqPBWKrbpQNkTE7aVxD40V57G5+ZcevDxc1N3SrPMikmFpGi SaGkbalv8r8bcKF+iphay1xSD3SwFTv4pUA1e9a9rFPGh4Pz8p44PZ906kmiL1kovl 8l4f8PQe5PpNpL/DsbCMIgKijRfBcKNYlgMR9JftmJFCDZ3vvHUawQ5mA2pWuMafCS phE/jVBf5WuJw== Date: Thu, 12 Dec 2024 17:21:43 -0800 Subject: [PATCH 01/11] vfs: explicitly pass the block size to the remap prep function From: "Darrick J. Wong" To: djwong@kernel.org Cc: hch@lst.de, linux-xfs@vger.kernel.org Message-ID: <173405125759.1184063.6610287530974429945.stgit@frogsfrogsfrogs> In-Reply-To: <173405125712.1184063.11685981006674346615.stgit@frogsfrogsfrogs> References: <173405125712.1184063.11685981006674346615.stgit@frogsfrogsfrogs> Precedence: bulk X-Mailing-List: linux-xfs@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Darrick J. Wong Make it so that filesystems can pass an explicit blocksize to the remap prep function. This enables filesystems whose fundamental allocation units are /not/ the same as the blocksize to ensure that the remapping checks are aligned properly. Signed-off-by: "Darrick J. 
Wong" --- fs/dax.c | 5 ++++- fs/remap_range.c | 30 ++++++++++++++++++------------ include/linux/fs.h | 3 ++- 3 files changed, 24 insertions(+), 14 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index 21b47402b3dca4..c7ea298b4214a5 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -2076,7 +2076,10 @@ int dax_remap_file_range_prep(struct file *file_in, loff_t pos_in, loff_t *len, unsigned int remap_flags, const struct iomap_ops *ops) { + unsigned int blocksize = file_inode(file_out)->i_sb->s_blocksize; + return __generic_remap_file_range_prep(file_in, pos_in, file_out, - pos_out, len, remap_flags, ops); + pos_out, len, remap_flags, ops, + blocksize); } EXPORT_SYMBOL_GPL(dax_remap_file_range_prep); diff --git a/fs/remap_range.c b/fs/remap_range.c index 26afbbbfb10c2e..d3c6c6b05eb191 100644 --- a/fs/remap_range.c +++ b/fs/remap_range.c @@ -30,18 +30,18 @@ */ static int generic_remap_checks(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, - loff_t *req_count, unsigned int remap_flags) + loff_t *req_count, unsigned int remap_flags, + unsigned int blocksize) { struct inode *inode_in = file_in->f_mapping->host; struct inode *inode_out = file_out->f_mapping->host; uint64_t count = *req_count; uint64_t bcount; loff_t size_in, size_out; - loff_t bs = inode_out->i_sb->s_blocksize; int ret; /* The start of both ranges must be aligned to an fs block. */ - if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_out, bs)) + if (!IS_ALIGNED(pos_in, blocksize) || !IS_ALIGNED(pos_out, blocksize)) return -EINVAL; /* Ensure offsets don't wrap. 
*/ @@ -75,10 +75,10 @@ static int generic_remap_checks(struct file *file_in, loff_t pos_in, */ if (pos_in + count == size_in && (!(remap_flags & REMAP_FILE_DEDUP) || pos_out + count == size_out)) { - bcount = ALIGN(size_in, bs) - pos_in; + bcount = ALIGN(size_in, blocksize) - pos_in; } else { - if (!IS_ALIGNED(count, bs)) - count = ALIGN_DOWN(count, bs); + if (!IS_ALIGNED(count, blocksize)) + count = ALIGN_DOWN(count, blocksize); bcount = count; } @@ -134,9 +134,10 @@ static int generic_remap_check_len(struct inode *inode_in, struct inode *inode_out, loff_t pos_out, loff_t *len, - unsigned int remap_flags) + unsigned int remap_flags, + unsigned int blocksize) { - u64 blkmask = i_blocksize(inode_in) - 1; + u64 blkmask = blocksize - 1; loff_t new_len = *len; if ((*len & blkmask) == 0) @@ -277,7 +278,8 @@ int __generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, loff_t *len, unsigned int remap_flags, - const struct iomap_ops *dax_read_ops) + const struct iomap_ops *dax_read_ops, + unsigned int blocksize) { struct inode *inode_in = file_inode(file_in); struct inode *inode_out = file_inode(file_out); @@ -312,7 +314,7 @@ __generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, /* Check that we don't violate system file offset limits. 
*/ ret = generic_remap_checks(file_in, pos_in, file_out, pos_out, len, - remap_flags); + remap_flags, blocksize); if (ret || *len == 0) return ret; @@ -353,7 +355,7 @@ __generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, } ret = generic_remap_check_len(inode_in, inode_out, pos_out, len, - remap_flags); + remap_flags, blocksize); if (ret || *len == 0) return ret; @@ -363,13 +365,17 @@ __generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, return ret; } +EXPORT_SYMBOL(__generic_remap_file_range_prep); int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, loff_t *len, unsigned int remap_flags) { + unsigned int blocksize = file_inode(file_out)->i_sb->s_blocksize; + return __generic_remap_file_range_prep(file_in, pos_in, file_out, - pos_out, len, remap_flags, NULL); + pos_out, len, remap_flags, NULL, + blocksize); } EXPORT_SYMBOL(generic_remap_file_range_prep); diff --git a/include/linux/fs.h b/include/linux/fs.h index 7e29433c5ecce2..b638fb1bcbc96f 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2191,7 +2191,8 @@ int remap_verify_area(struct file *file, loff_t pos, loff_t len, bool write); int __generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, loff_t *len, unsigned int remap_flags, - const struct iomap_ops *dax_read_ops); + const struct iomap_ops *dax_read_ops, + unsigned int block_size); int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, loff_t *count, unsigned int remap_flags); From patchwork Fri Dec 13 01:21:59 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Darrick J. 
Wong" X-Patchwork-Id: 13906312 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id A05722AD25 for ; Fri, 13 Dec 2024 01:21:59 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1734052919; cv=none; b=mu670qsHSFYfcIZiUXr6aLEJlFdQwHh1NU70W9JajHqT9YZ709dLD9JDiB9HgNcQ+dMe8LHEbOSs4b9AAvj04IKGcfdqdf7KFlGkI6sZnnMEFTEumy3qYNGx117t+PEYF6E+pe9noBKElsa6Y1JHTbhPOvDTYRfVc6ahhxanztU= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1734052919; c=relaxed/simple; bh=DnbyVknEHkpEa7j2O2HYhbucDbhzPaQWV7TRZ0C8uOU=; h=Date:Subject:From:To:Cc:Message-ID:In-Reply-To:References: MIME-Version:Content-Type; b=g1CXmpiyDEgHG1z54Rqo4Bnc3MWBmKrJX6nfZWk0M/G8n+wg53IvOmz8vsg94aL8xuG1+2o6xlPQP2Kzip9xIgec9Bqvl7D4GUB2xdq+X0lEww2Brr8XIXIE+wR5hBnzerNC5cNtOXSTi6uJdkU3x01XeFlRKfiZN4z/NWp3yc4= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=KUvsxyJX; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="KUvsxyJX" Received: by smtp.kernel.org (Postfix) with ESMTPSA id 774BFC4CED4; Fri, 13 Dec 2024 01:21:59 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1734052919; bh=DnbyVknEHkpEa7j2O2HYhbucDbhzPaQWV7TRZ0C8uOU=; h=Date:Subject:From:To:Cc:In-Reply-To:References:From; b=KUvsxyJXJa1gA/L3mgBNu507BVpty2IyWVj/o7l8WEEOhmJpYuS9YnWMoh+Ze7Cf3 32j9n23KvlhhjCJWrw+yHKfpjZQKQ8yxupmJl7tp0nLYSlrqcOZLHyh/SFQasVJ1Mg O0tKTfJrgUoxU1L9Makw7pvJdWwZNq03OJSvigW8PAr0m6c4otfAKx7aeGxNHeSnJq 
AEpKiaIMXHlswa+wRZUF53sevz2ZpJr4AIjPP+uN2LarTUIazXBn5RRLowUw/VwuiS B/SLbiGqkOy1TtFzc3dAApjosvvE2j3Q3SK5Q7JgKjGHFSIcpaxqhU/wOPz3iMxFl9 hndeGPDQ6DBNQ== Date: Thu, 12 Dec 2024 17:21:59 -0800 Subject: [PATCH 02/11] iomap: allow zeroing of written extents beyond EOF From: "Darrick J. Wong" To: djwong@kernel.org Cc: hch@lst.de, linux-xfs@vger.kernel.org Message-ID: <173405125776.1184063.5414430767804356851.stgit@frogsfrogsfrogs> In-Reply-To: <173405125712.1184063.11685981006674346615.stgit@frogsfrogsfrogs> References: <173405125712.1184063.11685981006674346615.stgit@frogsfrogsfrogs> Precedence: bulk X-Mailing-List: linux-xfs@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Darrick J. Wong In commit eb65540aa9fc82 ("iomap: warn on zero range of a post-eof folio"), we established that iomap_zero_range cannot dirty folios beyond EOF because writeback will ignore those dirty folios. However, XFS can only handle copy on write of entire file allocation units. For reflink on a realtime volume where the allocation unit size is larger than a single fsblock, if EOF is in the middle of an allocation unit, we must use the pagecache to stage the out of place write, even if that means having (zeroed) dirty pagecache beyond EOF. To support this, the writeback call knows how to extend the writeback range to align with an allocation unit, and it successfully finds the dirty post-EOF folios. Therefore, we need to disable this check for this particular situation. Signed-off-by: "Darrick J. 
Wong" --- fs/gfs2/bmap.c | 2 +- fs/iomap/buffered-io.c | 25 ++++++++++++++++++++----- fs/xfs/xfs_iomap.c | 27 ++++++++++++++++++++++++++- include/linux/iomap.h | 6 +++++- 4 files changed, 52 insertions(+), 8 deletions(-) diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 1795c4e8dbf66a..ce9293c916363e 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -1300,7 +1300,7 @@ static int gfs2_block_zero_range(struct inode *inode, loff_t from, unsigned int length) { BUG_ON(current->journal_info); - return iomap_zero_range(inode, from, length, NULL, &gfs2_iomap_ops); + return iomap_zero_range(inode, from, length, NULL, &gfs2_iomap_ops, 0); } #define GFS2_JTRUNC_REVOKES 8192 diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 955f19e27e47c5..4e851e9c2a1002 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1350,7 +1350,8 @@ static inline int iomap_zero_iter_flush_and_stale(struct iomap_iter *i) return filemap_write_and_wait_range(mapping, i->pos, end); } -static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero) +static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero, + unsigned zeroing_flags) { loff_t pos = iter->pos; loff_t length = iomap_length(iter); @@ -1363,6 +1364,18 @@ static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero) size_t bytes = min_t(u64, SIZE_MAX, length); bool ret; + /* + * If we've gone past EOF and have a written mapping, and the + * filesystem supports written mappings past EOF, skip the rest + * of the range. We can't write that back anyway. 
+ */ + if (pos > iter->inode->i_size && + (zeroing_flags & IOMAP_ZERO_MAPPED_BEYOND_EOF)) { + written += length; + length = 0; + break; + } + status = iomap_write_begin(iter, pos, bytes, &folio); if (status) return status; @@ -1395,7 +1408,7 @@ static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero) int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, - const struct iomap_ops *ops) + const struct iomap_ops *ops, unsigned zeroing_flags) { struct iomap_iter iter = { .inode = inode, @@ -1424,7 +1437,8 @@ iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, filemap_range_needs_writeback(mapping, pos, pos + plen - 1)) { iter.len = plen; while ((ret = iomap_iter(&iter, ops)) > 0) - iter.processed = iomap_zero_iter(&iter, did_zero); + iter.processed = iomap_zero_iter(&iter, did_zero, + zeroing_flags); iter.len = len - (iter.pos - pos); if (ret || !iter.len) @@ -1453,7 +1467,8 @@ iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, continue; } - iter.processed = iomap_zero_iter(&iter, did_zero); + iter.processed = iomap_zero_iter(&iter, did_zero, + zeroing_flags); } return ret; } @@ -1469,7 +1484,7 @@ iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, /* Block boundary? 
Nothing to do */ if (!off) return 0; - return iomap_zero_range(inode, pos, blocksize - off, did_zero, ops); + return iomap_zero_range(inode, pos, blocksize - off, did_zero, ops, 0); } EXPORT_SYMBOL_GPL(iomap_truncate_page); diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 50fa3ef89f6c98..b7d0dfd5fd3117 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -1490,14 +1490,39 @@ xfs_zero_range( bool *did_zero) { struct inode *inode = VFS_I(ip); + unsigned int zeroing_flags = 0; xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL); if (IS_DAX(inode)) return dax_zero_range(inode, pos, len, did_zero, &xfs_dax_write_iomap_ops); + + /* + * Files with allocation units larger than the fsblock size can share + * zeroed written blocks beyond EOF if the EOF is in the middle of an + * allocation unit because it keeps the refcounting code simple. We + * therefore permit zeroing of pagecache for these post-EOF written + * extents so that the blocks in the CoW staging extent beyond EOF are + * all initialized to zero. + * + * Alternate designs could be: (a) don't allow sharing of an allocation + * unit that spans EOF because of the unwritten blocks; (b) rewrite the + * reflink code to allow shared unwritten extents in this one corner + * case; or (c) insert zeroed pages into the pagecache to get around + * the checks in iomap_zero_range. + * + * However, this design (allow zeroing of pagecache beyond EOF) was + * chosen because it most closely resembles what we do for allocation + * unit == 1 fsblock. Note that for these files, we force writeback + * of post-EOF folios to ensure that CoW always happens in units of + * allocation units. 
+ */ + if (xfs_inode_has_bigrtalloc(ip) && xfs_has_reflink(ip->i_mount)) + zeroing_flags |= IOMAP_ZERO_MAPPED_BEYOND_EOF; + return iomap_zero_range(inode, pos, len, did_zero, - &xfs_buffered_write_iomap_ops); + &xfs_buffered_write_iomap_ops, zeroing_flags); } int diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 5675af6b740c27..31a5aa239aab1d 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -306,7 +306,11 @@ bool iomap_dirty_folio(struct address_space *mapping, struct folio *folio); int iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len, const struct iomap_ops *ops); int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, - bool *did_zero, const struct iomap_ops *ops); + bool *did_zero, const struct iomap_ops *ops, + unsigned zeroing_flags); +/* ignore written mappings allowed beyond EOF */ +#define IOMAP_ZERO_MAPPED_BEYOND_EOF (1U << 0) + int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, const struct iomap_ops *ops); vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, From patchwork Fri Dec 13 01:22:14 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Darrick J. 
Wong" X-Patchwork-Id: 13906313 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 4BA37DDDC for ; Fri, 13 Dec 2024 01:22:15 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1734052935; cv=none; b=Y6qeR+89P1Vusi2uNfWNIlKCyQUOKhOTEkNC+ODCesZlamxjsDMugILGuiNfciz4e2ArDV7y5YNU16Buin2sMrCgV+/qp/c5q7wyfufTd8OXQhoPyGo9AJb6+ztCnjeKuyfmuARy7ethurfj+8XEzAp6OgbQlGRHosk8mbezew0= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1734052935; c=relaxed/simple; bh=tYFRfTaYWFLF1LBGiqPXSf7STfiVxXYUre03GvvLNHc=; h=Date:Subject:From:To:Cc:Message-ID:In-Reply-To:References: MIME-Version:Content-Type; b=uCYkpUa3tEjLjzN8fHpbDmNNBRm11EWXrXXHTKtbWGh6buFVNSMsbTV1o3Va+RutGHRXHizCMVe/1WQRHsJviVS6FNOLTWRLWeoRYXPnA1MSgffHC7dhwiqKHm4vl7lDJ1poyQ8DTh9zFvWJDDp7xpXb+wvCrMeYkX0GUzI7a9w= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=U2V4nviK; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="U2V4nviK" Received: by smtp.kernel.org (Postfix) with ESMTPSA id 208B3C4CECE; Fri, 13 Dec 2024 01:22:15 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1734052935; bh=tYFRfTaYWFLF1LBGiqPXSf7STfiVxXYUre03GvvLNHc=; h=Date:Subject:From:To:Cc:In-Reply-To:References:From; b=U2V4nviKKBwElknWBg8hAtlK4qFhC5NFTXptWIoVuSsQJJhMphfFBNjkrUWBqyZJn sxV41w0v3GXSA6ePN0E49bQKuIVKenAQQ4lDRgmPRBKNHbnIwVJnKtN6G3dvCAONWC F7A+3K2Z56nI2BKOgMIu6xhx4D4Z8BQys7balJ36mh+YqjnrWRrjsSbOUYnlhO/8Ue 
HgaYce9uhx1chAcZNc0Cj50w5bPN/jcAxIC4Dg4A6i1IbqQmd4MT2VXqhIXjjkbfyO 0dmM2V6bQO3qLppp02kpg87yGgr06SqBy7huqcEv20r+G4GfiSYJrMMdmOv/PAIX8l vxJZtwbYeIeuA== Date: Thu, 12 Dec 2024 17:22:14 -0800 Subject: [PATCH 03/11] xfs: convert partially written rt file extents to completely written From: "Darrick J. Wong" To: djwong@kernel.org Cc: hch@lst.de, linux-xfs@vger.kernel.org Message-ID: <173405125794.1184063.17925337081966040081.stgit@frogsfrogsfrogs> In-Reply-To: <173405125712.1184063.11685981006674346615.stgit@frogsfrogsfrogs> References: <173405125712.1184063.11685981006674346615.stgit@frogsfrogsfrogs> Precedence: bulk X-Mailing-List: linux-xfs@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Darrick J. Wong Create a utility function to convert the partially written extents of a realtime file to be completely written. In other words, if rextsize==7 and only block 6 is unwritten, these functions will zero out block 6 and convert the mapping to written so that the entire 7-block allocation unit can be remapped in a single operation. This is required for any rt file remapping activities that do not use log items to restart interrupted operations. Signed-off-by: "Darrick J. Wong" --- fs/xfs/libxfs/xfs_rtbitmap.h | 12 +++ fs/xfs/xfs_bmap_util.c | 182 ++++++++++++++++++++++++++++++++++++++++++ fs/xfs/xfs_bmap_util.h | 7 ++ fs/xfs/xfs_trace.h | 11 ++- 4 files changed, 208 insertions(+), 4 deletions(-) diff --git a/fs/xfs/libxfs/xfs_rtbitmap.h b/fs/xfs/libxfs/xfs_rtbitmap.h index 22e5d9cd95f47c..89eb1e42128b38 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.h +++ b/fs/xfs/libxfs/xfs_rtbitmap.h @@ -157,6 +157,18 @@ xfs_rtb_to_rtxoff( return do_div(rtbno, mp->m_sb.sb_rextsize); } +/* Return the offset of a file block offset within an rt extent. 
*/ +static inline xfs_extlen_t +xfs_fileoff_to_rtxoff( + struct xfs_mount *mp, + xfs_fileoff_t off) +{ + if (likely(mp->m_rtxblklog >= 0)) + return off & mp->m_rtxblkmask; + + return do_div(off, mp->m_sb.sb_rextsize); +} + /* Round this file block offset up to the nearest rt extent size. */ static inline xfs_rtblock_t xfs_fileoff_roundup_rtx( diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 0836fea2d6d814..3229b756f33780 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -1726,3 +1726,185 @@ xfs_swap_extents( xfs_trans_cancel(tp); goto out_unlock_ilock; } + +#ifdef CONFIG_XFS_RT +/* + * Decide if this is an unwritten extent that isn't aligned to an allocation + * unit boundary. + * + * If it is, shorten the mapping to the end of the allocation unit so that + * we're ready to convert all the mappings for this allocation unit to a zeroed + * written extent. If not, return false. + */ +static inline bool +xfs_want_convert_rtbigalloc_mapping( + struct xfs_mount *mp, + struct xfs_bmbt_irec *irec) +{ + xfs_fileoff_t rext_next; + xfs_extlen_t modoff, modcnt; + + if (irec->br_state != XFS_EXT_UNWRITTEN) + return false; + + modoff = xfs_fileoff_to_rtxoff(mp, irec->br_startoff); + if (modoff == 0) { + xfs_rtbxlen_t rexts; + + rexts = xfs_blen_to_rtbxlen(mp, irec->br_blockcount); + modcnt = xfs_blen_to_rtxoff(mp, irec->br_blockcount); + if (rexts > 0) { + /* + * Unwritten mapping starts at an rt extent boundary + * and is longer than one rt extent. Round the length + * down to the nearest extent but don't select it for + * conversion. + */ + irec->br_blockcount -= modcnt; + modcnt = 0; + } + + /* Unwritten mapping is perfectly aligned, do not convert. */ + if (modcnt == 0) + return false; + } + + /* + * Unaligned and unwritten; trim to the current rt extent and select it + * for conversion. 
+ */ + rext_next = (irec->br_startoff - modoff) + mp->m_sb.sb_rextsize; + xfs_trim_extent(irec, irec->br_startoff, rext_next - irec->br_startoff); + return true; +} + +/* + * Find an unwritten extent in the given file range, zero it, and convert the + * mapping to written. Adjust the scan cursor on the way out. + */ +STATIC int +xfs_convert_rtbigalloc_mapping( + struct xfs_inode *ip, + xfs_fileoff_t *offp, + xfs_fileoff_t endoff) +{ + struct xfs_bmbt_irec irec; + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + unsigned int resblks; + int nmap; + int error; + + resblks = XFS_DIOSTRAT_SPACE_RES(mp, 1); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp); + if (error) + return error; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); + + /* + * Read the mapping. If we find an unwritten extent that isn't aligned + * to an allocation unit... + */ +retry: + nmap = 1; + error = xfs_bmapi_read(ip, *offp, endoff - *offp, &irec, &nmap, 0); + if (error) + goto out_cancel; + ASSERT(nmap == 1); + ASSERT(irec.br_startoff == *offp); + if (!xfs_want_convert_rtbigalloc_mapping(mp, &irec)) { + *offp = irec.br_startoff + irec.br_blockcount; + if (*offp >= endoff) + goto out_cancel; + goto retry; + } + + /* + * ...then write zeroes to the space and change the mapping state to + * written. This consolidates the mappings for this allocation unit. + */ + nmap = 1; + error = xfs_bmapi_write(tp, ip, irec.br_startoff, irec.br_blockcount, + XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO, 0, &irec, &nmap); + if (error) + goto out_cancel; + error = xfs_trans_commit(tp); + if (error) + goto out_unlock; + + xfs_iunlock(ip, XFS_ILOCK_EXCL); + + /* + * If an unwritten mapping was returned, something is very wrong. + * If no mapping was returned, then bmapi_write thought it performed + * a short allocation, which should be impossible since we previously + * queried the mapping and haven't cycled locks since then. Either + * way, fail the operation. 
+ */ + if (nmap == 0 || irec.br_state != XFS_EXT_NORM) { + ASSERT(nmap != 0); + ASSERT(irec.br_state == XFS_EXT_NORM); + return -EIO; + } + + /* Advance the cursor to the end of the mapping returned. */ + *offp = irec.br_startoff + irec.br_blockcount; + return 0; + +out_cancel: + xfs_trans_cancel(tp); +out_unlock: + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; +} + +/* + * Prepare a file with multi-fsblock allocation units for a remapping. + * + * File allocation units (AU) must be fully mapped to the data fork. If the + * space in an AU has not been fully written, there can be multiple extent + * mappings (e.g. mixed written and unwritten blocks) to the AU. If the log + * does not have a means to ensure that all remappings for a given AU will be + * completed even if the fs goes down, we must maintain the above constraint in + * another way. + * + * Convert the unwritten parts of an AU to written by writing zeroes to the + * storage and flipping the mapping. Once this completes, there will be a + * single mapping for the entire AU, and we can proceed with the remapping + * operation. + * + * Callers must ensure that there are no dirty pages in the given range. 
+ */ +int +xfs_convert_rtbigalloc_file_space( + struct xfs_inode *ip, + loff_t pos, + uint64_t len) +{ + struct xfs_mount *mp = ip->i_mount; + xfs_fileoff_t off; + xfs_fileoff_t endoff; + int error; + + if (!xfs_inode_has_bigrtalloc(ip)) + return 0; + + off = xfs_fileoff_rounddown_rtx(mp, XFS_B_TO_FSBT(mp, pos)); + endoff = xfs_fileoff_roundup_rtx(mp, XFS_B_TO_FSB(mp, pos + len)); + + trace_xfs_convert_rtbigalloc_file_space(ip, pos, len); + + while (off < endoff) { + if (fatal_signal_pending(current)) + return -EINTR; + + error = xfs_convert_rtbigalloc_mapping(ip, &off, endoff); + if (error) + return error; + } + + return 0; +} +#endif /* CONFIG_XFS_RT */ diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h index b29760d36e1ab1..3834962670449f 100644 --- a/fs/xfs/xfs_bmap_util.h +++ b/fs/xfs/xfs_bmap_util.h @@ -79,4 +79,11 @@ int xfs_bmap_count_blocks(struct xfs_trans *tp, struct xfs_inode *ip, int xfs_flush_unmap_range(struct xfs_inode *ip, xfs_off_t offset, xfs_off_t len); +#ifdef CONFIG_XFS_RT +int xfs_convert_rtbigalloc_file_space(struct xfs_inode *ip, loff_t pos, + uint64_t len); +#else +# define xfs_convert_rtbigalloc_file_space(ip, pos, len) (-EOPNOTSUPP) +#endif + #endif /* __XFS_BMAP_UTIL_H__ */ diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 4fe689410eb6ae..8af9c38bea152f 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -1555,7 +1555,7 @@ DEFINE_IMAP_EVENT(xfs_iomap_alloc); DEFINE_IMAP_EVENT(xfs_iomap_found); DECLARE_EVENT_CLASS(xfs_simple_io_class, - TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count), + TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, u64 count), TP_ARGS(ip, offset, count), TP_STRUCT__entry( __field(dev_t, dev) @@ -1563,7 +1563,7 @@ DECLARE_EVENT_CLASS(xfs_simple_io_class, __field(loff_t, isize) __field(loff_t, disize) __field(loff_t, offset) - __field(size_t, count) + __field(u64, count) ), TP_fast_assign( __entry->dev = VFS_I(ip)->i_sb->s_dev; @@ -1574,7 +1574,7 @@ 
DECLARE_EVENT_CLASS(xfs_simple_io_class, __entry->count = count; ), TP_printk("dev %d:%d ino 0x%llx isize 0x%llx disize 0x%llx " - "pos 0x%llx bytecount 0x%zx", + "pos 0x%llx bytecount 0x%llx", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->isize, @@ -1585,7 +1585,7 @@ DECLARE_EVENT_CLASS(xfs_simple_io_class, #define DEFINE_SIMPLE_IO_EVENT(name) \ DEFINE_EVENT(xfs_simple_io_class, name, \ - TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count), \ + TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, u64 count), \ TP_ARGS(ip, offset, count)) DEFINE_SIMPLE_IO_EVENT(xfs_delalloc_enospc); DEFINE_SIMPLE_IO_EVENT(xfs_unwritten_convert); @@ -3971,6 +3971,9 @@ TRACE_EVENT(xfs_ioctl_clone, /* unshare tracepoints */ DEFINE_SIMPLE_IO_EVENT(xfs_reflink_unshare); DEFINE_INODE_ERROR_EVENT(xfs_reflink_unshare_error); +#ifdef CONFIG_XFS_RT +DEFINE_SIMPLE_IO_EVENT(xfs_convert_rtbigalloc_file_space); +#endif /* CONFIG_XFS_RT */ /* copy on write */ DEFINE_INODE_IREC_EVENT(xfs_reflink_trim_around_shared); From patchwork Fri Dec 13 01:22:30 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Darrick J. 
Wong" X-Patchwork-Id: 13906314 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id E9C1E10F7 for ; Fri, 13 Dec 2024 01:22:30 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1734052951; cv=none; b=IRJNAA3OMLtZhYBl+OOWDd/UQLGxfJTJjgi37+w1ksY9WLeREBBTiJmDJFVcjJzA2a5gmMWE29YOmTKDp/TZV+CFxARdx65tVNgoCjVrO1+nIComsq8rGsJciqLhZRwiogfTbzyeperUmCidaJOZtoihWmxSqJcnJa4rIgW5wLU= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1734052951; c=relaxed/simple; bh=lJO24BHcEN1PtFt1v7sR8R8LZfuGAQWLAzaB0FCjUXc=; h=Date:Subject:From:To:Cc:Message-ID:In-Reply-To:References: MIME-Version:Content-Type; b=ilcf8LVAo8r1kqGcyOD1TYyGwD7Kczb2a0LyQc76y2PpjTACpGSsdpRwY9kgLYB7omaMoYdBajp0399uDqPPsLESxJBKtgT5vyZP7doxnPZruM1cMlxeX4nUle1l+PAPbydhD78MRnqYhyBbKHCKR/WY/5jYcuLX1PFs4z/nfAo= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=rtTyHIgM; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="rtTyHIgM" Received: by smtp.kernel.org (Postfix) with ESMTPSA id C522BC4CECE; Fri, 13 Dec 2024 01:22:30 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1734052950; bh=lJO24BHcEN1PtFt1v7sR8R8LZfuGAQWLAzaB0FCjUXc=; h=Date:Subject:From:To:Cc:In-Reply-To:References:From; b=rtTyHIgMjZzuh73a4hTAtPjEoPe5RAcJ/nceVMg/FzjJLq6eQdAGTOk4PnPyZVTNc Icgp0TCB/poJghfH6qTsV43XMYJiX/w4T7m0hKZvvdLApN/TcAp+iluz+DghYW0ZS6 lDraH7geV76Aq+HLgok4XS8Xa8lo8b+Ku0tPjjZayghgKsFz8w3uBd+o+iMyh/OKOT 
QXdTkY54o8IFfWdDZ2RJdmqKEMyjzsjrYuDwljTfp+GhN8fNKNIaEc8w1jHpyYdovk JtVTPboowZOFoWC+nFqpMBtdsQf6jlYzwdKF/i8ZBX9AMvg7kmRkOMfExO1VE+6ptU Hsb8sED7VDn9w== Date: Thu, 12 Dec 2024 17:22:30 -0800 Subject: [PATCH 04/11] xfs: enable CoW when rt extent size is larger than 1 block From: "Darrick J. Wong" To: djwong@kernel.org Cc: hch@lst.de, linux-xfs@vger.kernel.org Message-ID: <173405125811.1184063.1464436221145143124.stgit@frogsfrogsfrogs> In-Reply-To: <173405125712.1184063.11685981006674346615.stgit@frogsfrogsfrogs> References: <173405125712.1184063.11685981006674346615.stgit@frogsfrogsfrogs> Precedence: bulk X-Mailing-List: linux-xfs@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Darrick J. Wong Copy on write encounters a major plot twist when the file being CoW'd lives on the realtime volume and the realtime extent size is larger than a single filesystem block. XFS can only unmap and remap full rt extents, which means that allocations are always done in units of full rt extents, and a request to unmap less than one extent is treated as a request to convert an extent to unwritten status. This behavioral quirk is not compatible with the existing CoW mechanism, so we have to intercept every path through which files can be modified to ensure that we dirty an entire rt extent at once so that we can remap a full rt extent. Use the existing VFS unshare functions to dirty the page cache to set that up. Signed-off-by: "Darrick J. 
Wong" --- fs/xfs/xfs_file.c | 270 +++++++++++++++++++++++++++++++++++++++++++++++++- fs/xfs/xfs_file.h | 3 + fs/xfs/xfs_inode.h | 6 + fs/xfs/xfs_iops.c | 29 +++++ fs/xfs/xfs_reflink.c | 39 +++++++ fs/xfs/xfs_trace.h | 1 6 files changed, 345 insertions(+), 3 deletions(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 9a435b1ff26475..fad768c0b3f328 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -347,6 +347,116 @@ xfs_file_splice_read( return ret; } +/* + * Decide if this file write requires COWing-around at either end of the write + * range. This is only required if the file allocation unit is larger than + * 1FSB and the write range is not aligned with the allocation unit. + */ +static bool +xfs_file_write_needs_cow_around( + struct xfs_inode *ip, + loff_t pos, + long long int count) +{ + /* + * No COWing required if this inode doesn't do COW. + * + * If the allocation unit is 1FSB, we do not need to COW around the + * edges of the operation range. This applies to all files on the data + * device and rt files that have an extent size of 1FSB. + */ + if (!xfs_inode_needs_cow_around(ip)) + return false; + + /* + * Otherwise, check that the operation is aligned to the rt extent + * size. Any unaligned operation /must/ be COWed around since the + * regular reflink code only handles extending writes up to fsblock + * boundaries. + */ + return !xfs_is_falloc_aligned(ip, pos, count); +} + +/* Do we need to COW-around at this offset to handle a truncate up or down? */ +bool +xfs_truncate_needs_cow_around( + struct xfs_inode *ip, + loff_t pos) +{ + return xfs_file_write_needs_cow_around(ip, pos, 0); +} + +/* Does this file write require COWing around? */ +static inline bool +xfs_iocb_needs_cow_around( + struct xfs_inode *ip, + const struct kiocb *iocb, + const struct iov_iter *from) +{ + return xfs_file_write_needs_cow_around(ip, iocb->ki_pos, + iov_iter_count(from)); +} + +/* Unshare the allocation unit mapped to the given file position. 
*/ +inline int +xfs_file_unshare_at( + struct xfs_inode *ip, + loff_t pos) +{ + loff_t isize = i_size_read(VFS_I(ip)); + unsigned int extsize, len; + uint32_t mod; + + len = extsize = xfs_inode_alloc_unitsize(ip); + + /* Open-coded rounddown_64 so that we can skip out if aligned */ + div_u64_rem(pos, extsize, &mod); + if (mod == 0) + return 0; + pos -= mod; + + /* Do not extend the file. */ + if (pos >= isize) + return 0; + if (pos + len > isize) + len = isize - pos; + + trace_xfs_file_cow_around(ip, pos, len); + + if (IS_DAX(VFS_I(ip))) + return dax_file_unshare(VFS_I(ip), pos, len, + &xfs_dax_write_iomap_ops); + return iomap_file_unshare(VFS_I(ip), pos, len, + &xfs_buffered_write_iomap_ops); +} + +/* + * Dirty the pages on either side of a write request as needed to satisfy + * alignment requirements if we're going to perform a copy-write. + * + * This is only needed for realtime files when the rt extent size is larger + * than 1 fs block, because we don't allow a logical rt extent in a file to map + * to multiple physical rt extents. In other words, we can only map and unmap + * full rt extents. Note that page cache doesn't exist above EOF, so be + * careful to stay below EOF. + */ +static int +xfs_file_cow_around( + struct xfs_inode *ip, + loff_t pos, + long long int count) +{ + int error; + + /* Unshare at the start of the extent. */ + error = xfs_file_unshare_at(ip, pos); + if (error) + return error; + + /* Unshare at the end. */ + return xfs_file_unshare_at(ip, pos + count); +} + /* * Take care of zeroing post-EOF blocks when they might exist. * @@ -411,6 +521,17 @@ xfs_file_write_zero_eof( return 1; } + /* + * If we're starting the write past EOF, COW the allocation unit + * containing the current EOF before we start zeroing the range between + * EOF and the start of the write. 
+ */ + if (xfs_truncate_needs_cow_around(ip, isize)) { + error = xfs_file_unshare_at(ip, isize); + if (error) + return error; + } + trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize); xfs_ilock(ip, XFS_MMAPLOCK_EXCL); @@ -456,9 +577,11 @@ xfs_file_write_checks( /* * For changing security info in file_remove_privs() we need i_rwsem - * exclusively. + * exclusively. We also need it to COW around the range being written. */ - if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) { + if (*iolock == XFS_IOLOCK_SHARED && + (!IS_NOSEC(inode) || + xfs_iocb_needs_cow_around(XFS_I(inode), iocb, from))) { xfs_iunlock(XFS_I(inode), *iolock); *iolock = XFS_IOLOCK_EXCL; error = xfs_ilock_iocb(iocb, *iolock); @@ -469,6 +592,22 @@ xfs_file_write_checks( goto restart; } + /* + * The write is not aligned to the file's allocation unit. If either + * of the allocation units at the start or end of the write range are + * shared, unshare them through the page cache. + */ + if (xfs_iocb_needs_cow_around(XFS_I(inode), iocb, from)) { + ASSERT(*iolock == XFS_IOLOCK_EXCL); + + inode_dio_wait(inode); + drained_dio = true; + + error = xfs_file_cow_around(XFS_I(inode), iocb->ki_pos, count); + if (error) + return error; + } + /* * If the offset is beyond the size of the file, we need to zero all * blocks that fall between the existing EOF and the start of this @@ -594,6 +733,16 @@ xfs_file_dio_write_aligned( unsigned int iolock = XFS_IOLOCK_SHARED; ssize_t ret; + /* + * If the range to write is not aligned to an allocation unit, we will + * have to COW the allocation units on both ends of the write. Because + * this runs through the page cache, it requires IOLOCK_EXCL. This + * predicate performs an unlocked access of the rt and reflink inode + * state. 
+ */ + if (xfs_iocb_needs_cow_around(ip, iocb, from)) + iolock = XFS_IOLOCK_EXCL; + ret = xfs_ilock_iocb_for_write(iocb, &iolock); if (ret) return ret; @@ -928,6 +1077,24 @@ xfs_falloc_setsize( &iattr); } +static int +xfs_falloc_punch_range( + struct xfs_inode *ip, + loff_t offset, + loff_t len) +{ + int error; + + /* Unshare around the region to punch, if needed. */ + if (xfs_file_write_needs_cow_around(ip, offset, len)) { + error = xfs_file_cow_around(ip, offset, len); + if (error) + return error; + } + + return xfs_free_file_space(ip, offset, len); +} + static int xfs_falloc_collapse_range( struct file *file, @@ -1017,6 +1184,13 @@ xfs_falloc_zero_range( if (error) return error; + /* Unshare around the region to zero, if needed. */ + if (xfs_file_write_needs_cow_around(XFS_I(inode), offset, len)) { + error = xfs_file_cow_around(XFS_I(inode), offset, len); + if (error) + return error; + } + error = xfs_free_file_space(XFS_I(inode), offset, len); if (error) return error; @@ -1044,6 +1218,23 @@ xfs_falloc_unshare_range( if (error) return error; + /* + * Enlarge the unshare region to align to a full allocation unit. 
+ */ + if (xfs_inode_needs_cow_around(XFS_I(inode))) { + unsigned int rextsize; + uint32_t mod; + + rextsize = xfs_inode_alloc_unitsize(XFS_I(inode)); + div_u64_rem(offset, rextsize, &mod); + offset -= mod; + len += mod; + + div_u64_rem(offset + len, rextsize, &mod); + if (mod) + len += rextsize - mod; + } + error = xfs_reflink_unshare(XFS_I(inode), offset, len); if (error) return error; @@ -1124,7 +1315,7 @@ xfs_file_fallocate( switch (mode & FALLOC_FL_MODE_MASK) { case FALLOC_FL_PUNCH_HOLE: - error = xfs_free_file_space(ip, offset, len); + error = xfs_falloc_punch_range(ip, offset, len); break; case FALLOC_FL_COLLAPSE_RANGE: error = xfs_falloc_collapse_range(file, offset, len); @@ -1458,6 +1649,70 @@ xfs_dax_read_fault( return ret; } +/* dax version of folio_mkwrite_check_truncate since vmf->page == NULL */ +static inline ssize_t +dax_write_fault_check( + struct vm_fault *vmf, + struct inode *inode, + unsigned int order) +{ + loff_t size = i_size_read(inode); + pgoff_t index = size >> PAGE_SHIFT; + size_t len = 1U << (PAGE_SHIFT + order); + size_t offset = size & (len - 1); + + if (!IS_ENABLED(CONFIG_FS_DAX)) { + ASSERT(0); + return -EFAULT; + } + + /* fault is wholly inside EOF */ + if (vmf->pgoff + (1U << order) - 1 < index) + return len; + /* fault is wholly past EOF */ + if (vmf->pgoff > index || !offset) + return -EFAULT; + /* fault is partially inside EOF */ + return offset; +} + +static int +xfs_filemap_fault_around( + struct vm_fault *vmf, + struct inode *inode, + unsigned int order) +{ + struct xfs_inode *ip = XFS_I(inode); + loff_t pos; + ssize_t len; + + if (!xfs_inode_needs_cow_around(ip)) + return 0; + + if (IS_DAX(inode)) { + len = dax_write_fault_check(vmf, inode, order); + if (len < 0) + return len; + pos = vmf->pgoff << PAGE_SHIFT; + } else { + struct folio *folio = page_folio(vmf->page); + + folio_lock(folio); + len = folio_mkwrite_check_truncate(folio, inode); + if (len < 0) { + folio_unlock(folio); + return len; + } + pos = folio_pos(folio); + 
folio_unlock(folio); + } + + if (!xfs_file_write_needs_cow_around(ip, pos, len)) + return 0; + + return xfs_file_cow_around(XFS_I(inode), pos, len); +} + /* * Locking for serialisation of IO during page faults. This results in a lock * ordering of: @@ -1476,6 +1731,7 @@ xfs_write_fault( struct inode *inode = file_inode(vmf->vma->vm_file); struct xfs_inode *ip = XFS_I(inode); unsigned int lock_mode = XFS_MMAPLOCK_SHARED; + int error; vm_fault_t ret; trace_xfs_write_fault(ip, order); @@ -1495,10 +1751,18 @@ xfs_write_fault( lock_mode = XFS_MMAPLOCK_EXCL; } + /* Unshare all the blocks in this rt extent surrounding this page. */ + error = xfs_filemap_fault_around(vmf, inode, order); + if (error) { + ret = vmf_fs_error(error); + goto out_unlock; + } + if (IS_DAX(inode)) ret = xfs_dax_fault_locked(vmf, order, true); else ret = iomap_page_mkwrite(vmf, &xfs_buffered_write_iomap_ops); +out_unlock: xfs_iunlock(ip, lock_mode); sb_end_pagefault(inode->i_sb); diff --git a/fs/xfs/xfs_file.h b/fs/xfs/xfs_file.h index 2ad91f755caf35..24490ea49e16c6 100644 --- a/fs/xfs/xfs_file.h +++ b/fs/xfs/xfs_file.h @@ -12,4 +12,7 @@ extern const struct file_operations xfs_dir_file_operations; bool xfs_is_falloc_aligned(struct xfs_inode *ip, loff_t pos, long long int len); +bool xfs_truncate_needs_cow_around(struct xfs_inode *ip, loff_t pos); +int xfs_file_unshare_at(struct xfs_inode *ip, loff_t pos); + #endif /* __XFS_FILE_H__ */ diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index c08093a65352ec..71ca16db369913 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -349,6 +349,12 @@ static inline bool xfs_inode_has_bigrtalloc(const struct xfs_inode *ip) return XFS_IS_REALTIME_INODE(ip) && ip->i_mount->m_sb.sb_rextsize > 1; } +/* Decide if we need to unshare the blocks around a range that we're writing. 
*/ +static inline bool xfs_inode_needs_cow_around(struct xfs_inode *ip) +{ + return xfs_is_cow_inode(ip) && xfs_inode_has_bigrtalloc(ip); +} + /* * Return the buftarg used for data allocations on a given inode. */ diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 207e0dadffc3c5..114ebddaa7bc0d 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -29,6 +29,7 @@ #include "xfs_xattr.h" #include "xfs_file.h" #include "xfs_bmap.h" +#include "xfs_reflink.h" #include #include @@ -886,10 +887,38 @@ xfs_setattr_size( * truncate. */ if (newsize > oldsize) { + /* + * Extending the file size, so COW around the allocation unit + * containing EOF before we zero the new range of the file. + */ + if (xfs_truncate_needs_cow_around(ip, oldsize)) { + error = xfs_file_unshare_at(ip, oldsize); + if (error) + return error; + } + trace_xfs_zero_eof(ip, oldsize, newsize - oldsize); error = xfs_zero_range(ip, oldsize, newsize - oldsize, &did_zeroing); } else { + /* + * We're reducing the size of the file, so COW around the new + * EOF allocation unit before truncation zeroes the part of the + * EOF block after the new EOF. Flush the dirty pages to disk + * before we start truncating the pagecache because truncation + * zeroing doesn't preflush written mappings. 
+ */ + if (xfs_truncate_needs_cow_around(ip, newsize)) { + error = xfs_file_unshare_at(ip, newsize); + if (error) + return error; + + error = filemap_write_and_wait_range(inode->i_mapping, + newsize, newsize); + if (error) + return error; + } + error = xfs_truncate_page(ip, newsize, &did_zeroing); } diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 59f7fc16eb8093..4f87f7041995c4 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -34,6 +34,7 @@ #include "xfs_rtalloc.h" #include "xfs_rtgroup.h" #include "xfs_metafile.h" +#include "xfs_rtbitmap.h" /* * Copy on Write of Shared Blocks @@ -302,9 +303,26 @@ xfs_reflink_convert_cow_locked( struct xfs_iext_cursor icur; struct xfs_bmbt_irec got; struct xfs_btree_cur *dummy_cur = NULL; + struct xfs_mount *mp = ip->i_mount; int dummy_logflags; int error = 0; + /* + * We can only remap full rt extents, so make sure that we convert the + * entire extent. The caller must ensure that this is either a direct + * write that's aligned to the rt extent size, or a buffered write for + * which we've dirtied extra pages to make this work properly. + */ + if (xfs_inode_needs_cow_around(ip)) { + xfs_fileoff_t new_off; + + new_off = xfs_fileoff_rounddown_rtx(mp, offset_fsb); + count_fsb += offset_fsb - new_off; + offset_fsb = new_off; + + count_fsb = xfs_blen_roundup_rtx(mp, count_fsb); + } + if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got)) return 0; @@ -626,11 +644,21 @@ xfs_reflink_cancel_cow_blocks( bool cancel_real) { struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_COW_FORK); + struct xfs_mount *mp = ip->i_mount; struct xfs_bmbt_irec got, del; struct xfs_iext_cursor icur; bool isrt = XFS_IS_REALTIME_INODE(ip); int error = 0; + /* + * Shrink the range that we're cancelling if they don't align to the + * realtime extent size, since we can only free full extents. 
+ */ + if (xfs_inode_needs_cow_around(ip)) { + offset_fsb = xfs_fileoff_roundup_rtx(mp, offset_fsb); + end_fsb = xfs_fileoff_rounddown_rtx(mp, end_fsb); + } + if (!xfs_inode_has_cow_data(ip)) return 0; if (!xfs_iext_lookup_extent_before(ip, ifp, &end_fsb, &icur, &got)) @@ -923,6 +951,7 @@ xfs_reflink_end_cow( xfs_off_t offset, xfs_off_t count) { + struct xfs_mount *mp = ip->i_mount; xfs_fileoff_t offset_fsb; xfs_fileoff_t end_fsb; int error = 0; @@ -932,6 +961,16 @@ xfs_reflink_end_cow( offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset); end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count); + /* + * Make sure the end is aligned with a rt extent (if desired), since + * the end of the range could be EOF. The _convert_cow function should + * have set us up to swap only full rt extents. + */ + if (xfs_inode_needs_cow_around(ip)) { + offset_fsb = xfs_fileoff_rounddown_rtx(mp, offset_fsb); + end_fsb = xfs_fileoff_roundup_rtx(mp, end_fsb); + } + /* * Walk forwards until we've remapped the I/O range. The loop function * repeatedly cycles the ILOCK to allocate one transaction per remapped diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 8af9c38bea152f..e744f9435ff88d 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -3970,6 +3970,7 @@ TRACE_EVENT(xfs_ioctl_clone, /* unshare tracepoints */ DEFINE_SIMPLE_IO_EVENT(xfs_reflink_unshare); +DEFINE_SIMPLE_IO_EVENT(xfs_file_cow_around); DEFINE_INODE_ERROR_EVENT(xfs_reflink_unshare_error); #ifdef CONFIG_XFS_RT DEFINE_SIMPLE_IO_EVENT(xfs_convert_rtbigalloc_file_space); From patchwork Fri Dec 13 01:22:45 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Darrick J. 
Wong" X-Patchwork-Id: 13906315 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 920CB10F7 for ; Fri, 13 Dec 2024 01:22:46 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1734052966; cv=none; b=cX3FJkOnXROPG6vCGjs4qzmCgZw3DtzbTWmUqCb+9EQWZi1uH1jI63WSPAsDaLev2WvBygduI5vgFSlWrP/D3A+0p4Bs8PMSMw2ohxo80zOKlyUEjtlQj0hh0AdQB8V3PxSUWehUY9JNkyXLOvlf4QGlxxF7okBg+dlDRYKSvZw= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1734052966; c=relaxed/simple; bh=obt8JtmVqGfnEZXDgRPBqcNTF6MXDbdglCegtRtE85o=; h=Date:Subject:From:To:Cc:Message-ID:In-Reply-To:References: MIME-Version:Content-Type; b=p5oUY2KiDCMeQxdpv1fHrMBK4tvbqo3bUm9U9DHH6NBrqjTz30HERKA5N+VroPN3Px2aLGKzTBtAHixEgENFEFxGulyON8G57OBGwAIviMnLS5Rf3jaO489m8ZZxUA92ckGB6c5mxpbh3GtC0xefzs0QOZtABXVbiPY6IAaD4aU= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=j0OJxFLP; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="j0OJxFLP" Received: by smtp.kernel.org (Postfix) with ESMTPSA id 680D3C4CECE; Fri, 13 Dec 2024 01:22:46 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1734052966; bh=obt8JtmVqGfnEZXDgRPBqcNTF6MXDbdglCegtRtE85o=; h=Date:Subject:From:To:Cc:In-Reply-To:References:From; b=j0OJxFLPcT8DdEORVtsl29kC8MS6z4FY26zDAoRV1WfXaJEOsGMh8HCMd6mLCwugW whwKeCJPiM6iILyZeAYwODtFTJB1RvEBSq+1ed4LxuN9ELduRTZwZ6u2c8orcBBMtS o6gbMwf9YrKHgyZxbsXxfzD+YPYaFyUnFZgbGfj1wpANvxuP3y7uw+snHrZZdQ5a/d 
RKd0KmDhXsJi3UIbMbKo0fQ2LioRui8qkpkqLocTwilnOYHglB18TuOhNTC6XkTrUR q4qyUIoPTdnPMTIRqySfI2gXGyzV1Pj5Wn7ZJbNzk3SvtSJoZ8RUDEaImWHtWdasDx FSzn9SG7aP7uQ== Date: Thu, 12 Dec 2024 17:22:45 -0800 Subject: [PATCH 05/11] xfs: forcibly convert unwritten blocks within an rt extent before sharing From: "Darrick J. Wong" To: djwong@kernel.org Cc: hch@lst.de, linux-xfs@vger.kernel.org Message-ID: <173405125829.1184063.12093363819984841679.stgit@frogsfrogsfrogs> In-Reply-To: <173405125712.1184063.11685981006674346615.stgit@frogsfrogsfrogs> References: <173405125712.1184063.11685981006674346615.stgit@frogsfrogsfrogs> Precedence: bulk X-Mailing-List: linux-xfs@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Darrick J. Wong As noted in the previous patch, XFS can only unmap and map full rt extents. This means that we cannot stop mid-extent for any reason, including stepping around unwritten/written extents. Second, the reflink and CoW mechanisms were not designed to handle shared unwritten extents, so we have to do something to get rid of them. If the user asks us to remap two files, we must scan both ranges beforehand to convert any unwritten extents that are not aligned to rt extent boundaries into zeroed written extents before sharing. Signed-off-by: "Darrick J. Wong" --- fs/xfs/xfs_reflink.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 4f87f7041995c4..82ceec8517a020 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -1666,6 +1666,25 @@ xfs_reflink_remap_prep( if (ret) goto out_unlock; + /* + * Now that we've marked both inodes for reflink, make sure that all + * allocation units (AU) mapped into either files' ranges are either + * wholly written, wholly unwritten, or holes. The bmap code requires + * that we align all unmap and remap requests to an AU. 
We've already + * flushed the page cache and finished directio for the range that's + * being remapped, so we can convert the mappings directly. + */ + if (xfs_inode_has_bigrtalloc(src)) { + ret = xfs_convert_rtbigalloc_file_space(src, pos_in, *len); + if (ret) + goto out_unlock; + } + if (xfs_inode_has_bigrtalloc(dest)) { + ret = xfs_convert_rtbigalloc_file_space(dest, pos_out, *len); + if (ret) + goto out_unlock; + } + /* * If pos_out > EOF, we may have dirtied blocks between EOF and * pos_out. In that case, we need to extend the flush and unmap to cover From patchwork Fri Dec 13 01:23:01 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Darrick J. Wong" X-Patchwork-Id: 13906316 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 39DF61078F for ; Fri, 13 Dec 2024 01:23:02 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1734052982; cv=none; b=Q5mae6TIkW4bVaQByTiYpDY4QJ2knTN/YwPB+ZGKL0cg8kw7KcjlIHEC/FkVbyQf0IlF2Dcf1H7t6aEeXDDWLFWUg9TC1iJf98HgqZbD8VDYRpSJnc741Jstk1TI8EPwolbFeWTDd770X/Ga9i4fiGrQOh6ZIFPz6CWDOobMtGQ= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1734052982; c=relaxed/simple; bh=MGXPCPb2rp08r7BvOWJ0lcc715YkjlICeVLVxxChSww=; h=Date:Subject:From:To:Cc:Message-ID:In-Reply-To:References: MIME-Version:Content-Type; b=CvDDHqwAVXk7Nla8z+xOP2Fm9i3Is3TFeumfXtbtp035V/jmEHo/8cWuSq5DYELLmSj7Acwnwb6aOE9AZZH6OCkeKDdYzNNoWVcnotdlWPfCsl31tFw034bk4w3ul7BytJ6oH5cokVvbDGoGE3BICAgQMnbYhtbbi5fd4iniP5U= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=QjYQ0loj; arc=none 
smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="QjYQ0loj" Received: by smtp.kernel.org (Postfix) with ESMTPSA id 14F12C4CECE; Fri, 13 Dec 2024 01:23:02 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1734052982; bh=MGXPCPb2rp08r7BvOWJ0lcc715YkjlICeVLVxxChSww=; h=Date:Subject:From:To:Cc:In-Reply-To:References:From; b=QjYQ0lojW5tOzKcZx2JyBgH83gejwzKSZQNvypdmmeplPfRMwAYw4vNf9MSIL3HZ/ xgSjx14dNXjDJqEcw3KqKzEr4DMiYZiJXN8BMf3iJ2UwUT5oRyDD1+UrdcKvoSKrvA gVIIifo2mAn9YA0g/D330VMwKCijO0uPwWH2WiRbhofK5OHiH0PDPstOOCKlE7e7pu UlYvO/2srTUsoUZNv4+n/6SfwsPaPrAbUaadXqO/MWuyZK3p3q5c8LzxBGn/XGiJw3 j4gWd2vmCwIS++Q/+WMV5f5Sc5sQEPXnr7iEJalade/Jw0fbiKE7MtsSsXknUHsKsQ MzSDPZnhgoQIw== Date: Thu, 12 Dec 2024 17:23:01 -0800 Subject: [PATCH 06/11] xfs: add some tracepoints for writeback From: "Darrick J. Wong" To: djwong@kernel.org Cc: hch@lst.de, linux-xfs@vger.kernel.org Message-ID: <173405125846.1184063.6516078668233318926.stgit@frogsfrogsfrogs> In-Reply-To: <173405125712.1184063.11685981006674346615.stgit@frogsfrogsfrogs> References: <173405125712.1184063.11685981006674346615.stgit@frogsfrogsfrogs> Precedence: bulk X-Mailing-List: linux-xfs@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Darrick J. Wong Add a tracepoint so I can see where writeback is initiated. Signed-off-by: "Darrick J. 
Wong" --- fs/xfs/xfs_aops.c | 19 ++++++++++++------- fs/xfs/xfs_trace.h | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 46 insertions(+), 7 deletions(-) diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 559a3a57709748..f51f2f5f76d0f6 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -468,21 +468,26 @@ static const struct iomap_writeback_ops xfs_writeback_ops = { STATIC int xfs_vm_writepages( - struct address_space *mapping, - struct writeback_control *wbc) + struct address_space *mapping, + struct writeback_control *wbc) { - struct xfs_writepage_ctx wpc = { }; + struct xfs_writepage_ctx wpc = { }; + struct xfs_inode *ip = XFS_I(mapping->host); - xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED); + trace_xfs_vm_writepages(ip, wbc); + + xfs_iflags_clear(ip, XFS_ITRUNCATED); return iomap_writepages(mapping, wbc, &wpc.ctx, &xfs_writeback_ops); } STATIC int xfs_dax_writepages( - struct address_space *mapping, - struct writeback_control *wbc) + struct address_space *mapping, + struct writeback_control *wbc) { - struct xfs_inode *ip = XFS_I(mapping->host); + struct xfs_inode *ip = XFS_I(mapping->host); + + trace_xfs_dax_writepages(ip, wbc); xfs_iflags_clear(ip, XFS_ITRUNCATED); return dax_writeback_mapping_range(mapping, diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index e744f9435ff88d..0234af78cea9a1 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -1554,6 +1554,40 @@ DEFINE_IMAP_EVENT(xfs_map_blocks_alloc); DEFINE_IMAP_EVENT(xfs_iomap_alloc); DEFINE_IMAP_EVENT(xfs_iomap_found); +DECLARE_EVENT_CLASS(xfs_writeback_class, + TP_PROTO(struct xfs_inode *ip, const struct writeback_control *wbc), + TP_ARGS(ip, wbc), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(loff_t, range_start) + __field(loff_t, range_end) + __field(long, nr_to_write) + __field(enum writeback_sync_modes, sync_mode) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->range_start 
= wbc->range_start; + __entry->range_end = wbc->range_end; + __entry->nr_to_write = wbc->nr_to_write; + __entry->sync_mode = wbc->sync_mode; + ), + TP_printk("dev %d:%d ino 0x%llx range_start 0x%llx range_end 0x%llx nr_to_write %ld sync_mode %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->range_start, + __entry->range_end, + __entry->nr_to_write, + __entry->sync_mode) +); +#define DEFINE_WRITEBACK_EVENT(name) \ +DEFINE_EVENT(xfs_writeback_class, name, \ + TP_PROTO(struct xfs_inode *ip, const struct writeback_control *wbc), \ + TP_ARGS(ip, wbc)) +DEFINE_WRITEBACK_EVENT(xfs_vm_writepages); +DEFINE_WRITEBACK_EVENT(xfs_dax_writepages); + DECLARE_EVENT_CLASS(xfs_simple_io_class, TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, u64 count), TP_ARGS(ip, offset, count), From patchwork Fri Dec 13 01:23:17 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Darrick J. Wong" X-Patchwork-Id: 13906317 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id D358617BA1 for ; Fri, 13 Dec 2024 01:23:17 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1734052997; cv=none; b=C3f3XRKsnupjUw2xXLl4AJRtdudDvbU5VJozkJKwR5JmGjJteef+klLxqYUknq75pmk3jqfWUQ3XZ2oaIz4lbDM3/oeO7mBzSmudxHY2OMlRELjqM7ntQ8buina0SyjYRJRa8CXvSRarc/eO9gQp7ypcF5rp1rqgestPFWnkEYE= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1734052997; c=relaxed/simple; bh=kKz9K3F+lPeE+yIdMEqt89LUeSNml2IE66ly0hGswGI=; h=Date:Subject:From:To:Cc:Message-ID:In-Reply-To:References: MIME-Version:Content-Type; 
b=syPpqrO67u8fjH9OtPer1sPpRkRNg6/9XOgs/vXR7ykEqPXV+05ROiTfklgTK7dd0CZ6EidG9FUEYbyKEE23lSvYZ+c6WAfyWMZZlfVrdrGQ8ZMNqmLG1MzuU6JI2zdkZ4OjPbgG6G7zP/Zd3i79faXFlTWq+YfK+4Zfefn6OW4= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=rghH5f8X; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="rghH5f8X" Received: by smtp.kernel.org (Postfix) with ESMTPSA id AB3FEC4CECE; Fri, 13 Dec 2024 01:23:17 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1734052997; bh=kKz9K3F+lPeE+yIdMEqt89LUeSNml2IE66ly0hGswGI=; h=Date:Subject:From:To:Cc:In-Reply-To:References:From; b=rghH5f8XGlNltI8pMyG3CWuva7IRzHkq4MHmbG4J5lrsbgn5Ilbdz/q3ISV9vDL/o 1XkKNrciyR1cPFUkJKBKPpGD52dyXwiVOHl+h5XlgQqFp1HPwT8cEZYSXremH5y7pZ xSZ+vkVTSEhmS4zvY73dZUBFXYBGvImXOn+r2FvO9svM5HpbU2cLb1AXncfyz+PKyu c6QOU1hsmkFCs07K7sMrv6L9V1IgqgBfKX8kOXsZky3/Z+NpxsJ2SEpkJ99iDFgljy AvahV+QQbxII9nwtW+DfRaU02ghn+UwOidMV4qGS4q2UcKPXRhlWIRp7qH4qWVQ41t qpzZgtJdKtEUA== Date: Thu, 12 Dec 2024 17:23:17 -0800 Subject: [PATCH 07/11] xfs: extend writeback requests to handle rt cow correctly From: "Darrick J. Wong" To: djwong@kernel.org Cc: hch@lst.de, linux-xfs@vger.kernel.org Message-ID: <173405125863.1184063.8842755288883819617.stgit@frogsfrogsfrogs> In-Reply-To: <173405125712.1184063.11685981006674346615.stgit@frogsfrogsfrogs> References: <173405125712.1184063.11685981006674346615.stgit@frogsfrogsfrogs> Precedence: bulk X-Mailing-List: linux-xfs@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Darrick J. Wong If we have shared realtime files and the rt extent size is larger than a single fs block, we need to extend writeback requests to be aligned to rt extent size granularity because we cannot share partial rt extents. 
The front end should have set us up for this by dirtying the relevant ranges. Signed-off-by: "Darrick J. Wong" --- fs/xfs/xfs_aops.c | 38 ++++++++++++++++++++++++++++++++++++++ fs/xfs/xfs_trace.h | 1 + 2 files changed, 39 insertions(+) diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index f51f2f5f76d0f6..9bc2d7d92e4c46 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -466,6 +466,38 @@ static const struct iomap_writeback_ops xfs_writeback_ops = { .discard_folio = xfs_discard_folio, }; +/* + * Extend the writeback range to allocation unit granularity and alignment. + * This is a requirement for blocksize > pagesize scenarios such as realtime + * copy on write, since we can only share full rt extents. + */ +static inline void +xfs_vm_writepages_extend( + struct xfs_inode *ip, + struct writeback_control *wbc) +{ + unsigned int bsize = xfs_inode_alloc_unitsize(ip); + long long int pages_to_write; + loff_t next = wbc->range_end + 1; + + wbc->range_start = rounddown_64(wbc->range_start, bsize); + if (wbc->range_end != LLONG_MAX) + wbc->range_end = roundup_64(next, bsize) - 1; + + if (wbc->nr_to_write != LONG_MAX) { + pgoff_t pg_start = wbc->range_start >> PAGE_SHIFT; + pgoff_t pg_next = (wbc->range_end + 1) >> PAGE_SHIFT; + + pages_to_write = pg_next - pg_start; + if (pages_to_write >= LONG_MAX) + pages_to_write = LONG_MAX; + if (wbc->nr_to_write < pages_to_write) + wbc->nr_to_write = pages_to_write; + } + + trace_xfs_vm_writepages_extend(ip, wbc); +} + STATIC int xfs_vm_writepages( struct address_space *mapping, @@ -476,6 +508,9 @@ xfs_vm_writepages( trace_xfs_vm_writepages(ip, wbc); + if (xfs_inode_needs_cow_around(ip)) + xfs_vm_writepages_extend(ip, wbc); + xfs_iflags_clear(ip, XFS_ITRUNCATED); return iomap_writepages(mapping, wbc, &wpc.ctx, &xfs_writeback_ops); } @@ -489,6 +524,9 @@ xfs_dax_writepages( trace_xfs_dax_writepages(ip, wbc); + if (xfs_inode_needs_cow_around(ip)) + xfs_vm_writepages_extend(ip, wbc); + xfs_iflags_clear(ip, XFS_ITRUNCATED); return 
dax_writeback_mapping_range(mapping, xfs_inode_buftarg(ip)->bt_daxdev, wbc); diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 0234af78cea9a1..021ea65909c915 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -1586,6 +1586,7 @@ DEFINE_EVENT(xfs_writeback_class, name, \ TP_PROTO(struct xfs_inode *ip, const struct writeback_control *wbc), \ TP_ARGS(ip, wbc)) DEFINE_WRITEBACK_EVENT(xfs_vm_writepages); +DEFINE_WRITEBACK_EVENT(xfs_vm_writepages_extend); DEFINE_WRITEBACK_EVENT(xfs_dax_writepages); DECLARE_EVENT_CLASS(xfs_simple_io_class, From patchwork Fri Dec 13 01:23:32 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Darrick J. Wong" X-Patchwork-Id: 13906318 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 7158918EA2 for ; Fri, 13 Dec 2024 01:23:33 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1734053013; cv=none; b=cKbpZMMag2lC+qviRynFSs+ZNcpb5HiTJRgbjLUL0m5Cz4/Zumd1p/02/5fK4FXGmZDSCjrBVe3fv1Vhloicc+w/lolcTslvd9VsxCXKdyH+Uh9KSr7ZYFr0/sgRLKl+JNvbWDNyddLcze6Ik9nL2XDsu7T5unUXhYlmpcUhDU4= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1734053013; c=relaxed/simple; bh=tryhAMq0YVG3vdMS3dD/LVsVxxAycb49ibeO25iUKRU=; h=Date:Subject:From:To:Cc:Message-ID:In-Reply-To:References: MIME-Version:Content-Type; b=DauRkGezXzArJSBoB44ayX/sz1afxKG1beBFB0mgqeF8t1xhHAiZ3vTVKpIFYRg1qiTYziNYCR6vfQyeQm6sssS3unAa8s7C5CY1zHhbQtScJFRo+lvSN3a1jfOXUKSaTUh7/gdbXBHr6sosoXtkgOr4A9p+cDayOj9yv/laZg4= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=JCoyzNzR; arc=none 
smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="JCoyzNzR" Received: by smtp.kernel.org (Postfix) with ESMTPSA id 49182C4CECE; Fri, 13 Dec 2024 01:23:33 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1734053013; bh=tryhAMq0YVG3vdMS3dD/LVsVxxAycb49ibeO25iUKRU=; h=Date:Subject:From:To:Cc:In-Reply-To:References:From; b=JCoyzNzRQ/hKpyBu3OHStJFJSE4Ttx4xmp9hc6/5SvPSV0xMRXNKWCQdGQRqEenrm 8wLeFQbnD4eghi6GaX4eFM6ZGETcmj1fgENjPzihq0iJj7TOZLe3NZHvW6y1jaho9S vnN57avg4mKrPkLrEYrP5/EuLlhKNyZSSxavsQ08Jt9qY4DKYD0LRbyu1UoLkaLTpW AVKGi7cu2bDh5V4t/nLapcFSnKyEykt46aaxYuS40XOXXvViTp0yAqLwa/LN6p89SJ K4AOV8JhhUdVxSnlMVebzp+++gQ21C09piuVyXXw3tnNhgkFwkVkv9IiwOY2iDAWN8 yZj7mwRwgAErQ== Date: Thu, 12 Dec 2024 17:23:32 -0800 Subject: [PATCH 08/11] xfs: enable extent size hints for CoW when rtextsize > 1 From: "Darrick J. Wong" To: djwong@kernel.org Cc: hch@lst.de, linux-xfs@vger.kernel.org Message-ID: <173405125880.1184063.8755676628520114568.stgit@frogsfrogsfrogs> In-Reply-To: <173405125712.1184063.11685981006674346615.stgit@frogsfrogsfrogs> References: <173405125712.1184063.11685981006674346615.stgit@frogsfrogsfrogs> Precedence: bulk X-Mailing-List: linux-xfs@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Darrick J. Wong CoW extent size hints are not allowed on filesystems that have large realtime extents because we only want to perform the minimum required amount of write-around (aka write amplification) for shared extents. On filesystems where rtextsize > 1, allocations can only be done in units of full rt extents, which means that we can only map an entire rt extent's worth of blocks into the data fork. Hole punch requests become conversions to unwritten if the request isn't aligned properly. 
Because a copy-write fundamentally requires remapping, this means that we also can only do copy-writes of a full rt extent. This is too expensive for large hint sizes, since it's all or nothing. Signed-off-by: "Darrick J. Wong" --- fs/xfs/libxfs/xfs_bmap.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 40ad22fb808b95..e1aac1711f553f 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -6525,6 +6525,28 @@ xfs_get_cowextsz_hint( if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) a = ip->i_cowextsize; if (XFS_IS_REALTIME_INODE(ip)) { + /* + * For realtime files, the realtime extent is the fundamental + * unit of allocation. This means that data sharing and CoW + * remapping can only be done in those units. For filesystems + * where the extent size is larger than one block, write + * requests that are not aligned to an extent boundary employ + * an unshare-around strategy to ensure that all pages for a + * shared extent are fully dirtied. + * + * Because the remapping alignment requirement applies equally + * to all CoW writes, any regular overwrites that could be + * turned (by a speculative CoW preallocation) into a CoW write + * must either employ this dirty-around strategy, or be smart + * enough to ignore the CoW fork mapping unless the entire + * extent is dirty or becomes shared by writeback time. Doing + * the first would dramatically increase write amplification, + * and the second would require deeper insight into the state + * of the page cache during a writeback request. For now, we + * ignore the hint. + */ + if (ip->i_mount->m_sb.sb_rextsize > 1) + return ip->i_mount->m_sb.sb_rextsize; b = 0; if (ip->i_diflags & XFS_DIFLAG_EXTSIZE) b = ip->i_extsize; From patchwork Fri Dec 13 01:23:48 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Darrick J. 
Wong" X-Patchwork-Id: 13906319 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 1C10421345 for ; Fri, 13 Dec 2024 01:23:49 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1734053029; cv=none; b=KShEssr+bzUx/tmEkCUgtTBvLPnCpuM1ruHajYNuIFIha1raVkf78ejil8GXkISOMwDp2U2cBcggu38PRI8S386nQ2uMcAfPs/CQXs1biiNmLsuBXeZnzKOZG9Jze4p6TY8yBhn3hUg6IS6dyIB54zBlWkR6bkDY2pNmA2Uvm/I= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1734053029; c=relaxed/simple; bh=CA3zQGiztERvQtDTb+QA0vzkbRbztOQJVB13SigRqIA=; h=Date:Subject:From:To:Cc:Message-ID:In-Reply-To:References: MIME-Version:Content-Type; b=b3gjcpMeG0tRGkEILbLvSFBZTuUwHzElkAUNqKtFirxkMk6LNh3PtasGTAyIG8sgyUo3lSylS9V6LMroTiQ06DcZ2F7xru6geFj/96GhGbqa0Q4hWD7owIYaBkjzCeojM2NUXKgn86v4G1OBBz8cA0HvuHa7DW0RWKjNcuVNL9k= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=ZVQxZTnS; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="ZVQxZTnS" Received: by smtp.kernel.org (Postfix) with ESMTPSA id E1F5DC4CECE; Fri, 13 Dec 2024 01:23:48 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1734053028; bh=CA3zQGiztERvQtDTb+QA0vzkbRbztOQJVB13SigRqIA=; h=Date:Subject:From:To:Cc:In-Reply-To:References:From; b=ZVQxZTnSeo2MZIPnCAN7Xv/rN4nKUk+8KLbGlKHmIBOUN+UP4pS1/OAXPOaGQbEp2 /235627snhSmV8wRKXNEQ4VZEsoQrqGN4OZRozpkfEyq6nJeBl8kkLveGWccEhyaZy oRa3Xhd1PWJyDZhCQ7EzP7/ebnpzRsppaVg4fsTHhlFolGcl61UoglCMagaylKloPF 
Uh2un/uDRzmJwwLd55OFfraYzu3bDrjJETzf0UvJ06pyqUAq3k33YocGlFH1TE3h/9 2tblW7kSE0TSQPfk+6qkvC1bOgJldJeV9qv8Se+ZgxYvrVcfe6GMH49uv0Cn4a230+ 7gnMYgVaEh+Yw== Date: Thu, 12 Dec 2024 17:23:48 -0800 Subject: [PATCH 09/11] xfs: allow reflink on the rt volume when extent size is larger than 1 rt block From: "Darrick J. Wong" To: djwong@kernel.org Cc: hch@lst.de, linux-xfs@vger.kernel.org Message-ID: <173405125896.1184063.11119572969501198910.stgit@frogsfrogsfrogs> In-Reply-To: <173405125712.1184063.11685981006674346615.stgit@frogsfrogsfrogs> References: <173405125712.1184063.11685981006674346615.stgit@frogsfrogsfrogs> Precedence: bulk X-Mailing-List: linux-xfs@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Darrick J. Wong Make the necessary tweaks to the reflink remapping code to support remapping on the realtime volume when the rt extent size is larger than a single rt block. We need to check that the remap arguments from userspace are aligned to a rt extent boundary, and that the length is always aligned, even if the kernel tried to round it up to EOF for us. XFS can only map and remap full rt extents, so we have to be a little more strict about the alignment there. Signed-off-by: "Darrick J. Wong" --- fs/xfs/xfs_reflink.c | 91 +++++++++++++++++++++++++++++++++++++++++++++----- fs/xfs/xfs_super.c | 2 + fs/xfs/xfs_trace.h | 3 ++ 3 files changed, 85 insertions(+), 11 deletions(-) diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 82ceec8517a020..0222b78dedd92d 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -1506,6 +1506,13 @@ xfs_reflink_remap_blocks( len = min_t(xfs_filblks_t, XFS_B_TO_FSB(mp, remap_len), XFS_MAX_FILEOFF); + /* + * Make sure the end is aligned with an allocation unit, even if it's + * past EOF. 
+ */ + if (xfs_inode_has_bigrtalloc(dest)) + len = xfs_blen_roundup_rtx(mp, len); + trace_xfs_reflink_remap_blocks(src, srcoff, len, dest, destoff); while (len > 0) { @@ -1580,6 +1587,57 @@ xfs_reflink_zero_posteof( return xfs_zero_range(ip, isize, pos - isize, NULL); } +#ifdef CONFIG_XFS_RT +/* + * Adjust the length of the remap operation to end on an allocation unit (AU) + * boundary. + */ +STATIC int +xfs_reflink_adjust_rtbigalloc_len( + struct xfs_inode *src, + loff_t pos_in, + struct xfs_inode *dest, + loff_t pos_out, + loff_t *len, + unsigned int remap_flags) +{ + unsigned int alloc_unit = xfs_inode_alloc_unitsize(src); + uint32_t mod; + + div_u64_rem(*len, alloc_unit, &mod); + + /* + * We previously checked the AU alignment of both offsets, so we now + * have to check the AU alignment of the length. The VFS remap prep + * function can change the length on us, so we can only make length + * adjustments after that. If the length is aligned to an AU, we're + * good to go. + * + * Otherwise, the length is not aligned to an AU. If the source file's + * range ends at EOF, the VFS ensured that the dest file's range also + * ends at EOF. The actual remap function will round the (byte) length + * up to the nearest AU, so we're ok here too. + */ + if (mod == 0 || pos_in + *len == i_size_read(VFS_I(src))) + return 0; + + /* + * Otherwise, the only thing we can do is round the request length down + * to an AU boundary. If the caller doesn't allow that, we cannot move + * forward. + */ + if (!(remap_flags & REMAP_FILE_CAN_SHORTEN)) + return -EINVAL; + + /* Back off by a single extent. */ + (*len) -= mod; + trace_xfs_reflink_adjust_rtbigalloc_len(src, pos_in, *len, dest, pos_out); + return 0; +} +#else +# define xfs_reflink_adjust_rtbigalloc_len(...) (0) +#endif /* CONFIG_XFS_RT */ + /* * Prepare two files for range cloning. 
Upon a successful return both inodes * will have the iolock and mmaplock held, the page cache of the out file will @@ -1622,6 +1680,7 @@ xfs_reflink_remap_prep( struct xfs_inode *src = XFS_I(inode_in); struct inode *inode_out = file_inode(file_out); struct xfs_inode *dest = XFS_I(inode_out); + const struct iomap_ops *dax_read_ops = NULL; int ret; /* Lock both files against IO */ @@ -1639,15 +1698,25 @@ xfs_reflink_remap_prep( if (IS_DAX(inode_in) != IS_DAX(inode_out)) goto out_unlock; - if (!IS_DAX(inode_in)) - ret = generic_remap_file_range_prep(file_in, pos_in, file_out, - pos_out, len, remap_flags); - else - ret = dax_remap_file_range_prep(file_in, pos_in, file_out, - pos_out, len, remap_flags, &xfs_read_iomap_ops); + ASSERT(is_power_of_2(xfs_inode_alloc_unitsize(dest))); + + if (IS_DAX(inode_in)) + dax_read_ops = &xfs_read_iomap_ops; + + ret = __generic_remap_file_range_prep(file_in, pos_in, file_out, + pos_out, len, remap_flags, dax_read_ops, + xfs_inode_alloc_unitsize(dest)); if (ret || *len == 0) goto out_unlock; + /* Adjust the end to align to an allocation unit. */ + if (xfs_inode_has_bigrtalloc(src)) { + ret = xfs_reflink_adjust_rtbigalloc_len(src, pos_in, dest, + pos_out, len, remap_flags); + if (ret || *len == 0) + goto out_unlock; + } + /* Attach dquots to dest inode before changing block map */ ret = xfs_qm_dqattach(dest); if (ret) @@ -1896,11 +1965,13 @@ xfs_reflink_supports_rextsize( return false; /* - * Reflink doesn't support rt extent size larger than a single fsblock - * because we would have to perform CoW-around for unaligned write - * requests to guarantee that we always remap entire rt extents. + * Reflink doesn't support file allocation units larger than a single + * block and not a power of two because we would have to perform + * CoW-around for unaligned write requests to guarantee that we always + * remap entire allocation units and the reflink code cannot yet handle + * rounding ranges to align to non powers of two. 
*/ - if (rextsize != 1) + if (!is_power_of_2(rextsize)) return false; return true; diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 0fa7b7cc75c146..c91b9467a3eef8 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1757,7 +1757,7 @@ xfs_fs_fill_super( if (xfs_has_realtime(mp) && !xfs_reflink_supports_rextsize(mp, mp->m_sb.sb_rextsize)) { xfs_alert(mp, - "reflink not compatible with realtime extent size %u!", + "reflink not compatible with non-power-of-2 realtime extent size %u!", mp->m_sb.sb_rextsize); error = -EINVAL; goto out_filestream_unmount; diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 021ea65909c915..b218786e734df0 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -3965,6 +3965,9 @@ TRACE_EVENT(xfs_reflink_remap_blocks, __entry->dest_lblk) ); DEFINE_DOUBLE_IO_EVENT(xfs_reflink_remap_range); +#ifdef CONFIG_XFS_RT +DEFINE_DOUBLE_IO_EVENT(xfs_reflink_adjust_rtbigalloc_len); +#endif /* CONFIG_XFS_RT */ DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_range_error); DEFINE_INODE_ERROR_EVENT(xfs_reflink_set_inode_flag_error); DEFINE_INODE_ERROR_EVENT(xfs_reflink_update_inode_size_error); From patchwork Fri Dec 13 01:24:04 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Darrick J. 
Wong" X-Patchwork-Id: 13906325 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id B437118EA2 for ; Fri, 13 Dec 2024 01:24:04 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1734053044; cv=none; b=ZqFIKeRQtmY4rD+TqWxa5CYUvRe1iNCXekiPqyPUFKFoK+wgu5v+YKytojQQRgA5vCDv9chDAZ1FqQ+wE21iV6CFYQ4tGHaRg+0oVnZnag/W79DYbsHRQWIRtHJFGNqngmqFPdfHuz3vDFqQot18duCk4KbF0QAx5eiSbvi3KE0= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1734053044; c=relaxed/simple; bh=IdDYYjRNrpi6gmPsYhhzs8I1wZiq+3261RkKgjFDRSU=; h=Date:Subject:From:To:Cc:Message-ID:In-Reply-To:References: MIME-Version:Content-Type; b=kzwtIra1CmP6AIRELYjDB41SYAEyzbC7N0j9mLjIuf8Og4U70mEc69InFzZLmTxeeQRtb56vsEvyoyrF9a00Nea/GbBR+hxAG7sX1FL5u4W8RgTiFZJD8pmgoXSWtuVknl31xlq3fkqhhRRyYAkzZOjlMeY38QOr1c3u5gHSOyI= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=YwpZDQnX; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="YwpZDQnX" Received: by smtp.kernel.org (Postfix) with ESMTPSA id 8F69DC4CED3; Fri, 13 Dec 2024 01:24:04 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1734053044; bh=IdDYYjRNrpi6gmPsYhhzs8I1wZiq+3261RkKgjFDRSU=; h=Date:Subject:From:To:Cc:In-Reply-To:References:From; b=YwpZDQnXBgF1MP2IqHHfOz+4kZ4hhDHCBRCu2L1ZZYPccZQ81R8PTrK2m1TABVTNz PEo0+dD92myTh3yzHZtkP7s6GvCrdA5JSPMmmrXjF4eVeZtLmrXndqsq3/oIT44V/m s5AP6vApLgUrO+KnXw7X4pKwc9egs9278wRo2dcIaIUnQbGOb7ILTJ+ofgOr8u83fh 
xCVFx9yzs9sUyBcaRKjJe3xUOblcmrgqckuofe2kdo/2o2JI87aUb+mTV3Kwbh8E8T Zzps9JY6FpdNp9EWeJbxss3R14zMWGXIXxGgGUYNCAjhxhISF2/l0ND5ZCm/Ba99xF 3MYvAStfbC4dQ== Date: Thu, 12 Dec 2024 17:24:04 -0800 Subject: [PATCH 10/11] xfs: fix integer overflow when validating extent size hints From: "Darrick J. Wong" To: djwong@kernel.org Cc: hch@lst.de, linux-xfs@vger.kernel.org Message-ID: <173405125912.1184063.5511687476090644949.stgit@frogsfrogsfrogs> In-Reply-To: <173405125712.1184063.11685981006674346615.stgit@frogsfrogsfrogs> References: <173405125712.1184063.11685981006674346615.stgit@frogsfrogsfrogs> Precedence: bulk X-Mailing-List: linux-xfs@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Darrick J. Wong Both file extent size hints are stored as 32-bit quantities, in units of filesystem blocks. As part of validating the hints, we convert these quantities to bytes to ensure that the hint is congruent with the file's allocation size. The maximum possible hint value is 2097151 (aka XFS_MAX_BMBT_EXTLEN). If the file allocation unit is larger than 2048, the unit conversion will exceed 32 bits in size, which overflows the uint32_t used to store the value used in the comparison. This isn't a problem for files on the data device since the hint will always be a multiple of the block size. However, this is a problem for realtime files because the rtextent size can be any integer number of fs blocks, and truncation of upper bits changes the outcome of division. Eliminate the overflow by performing the congruency check in units of blocks, not bytes. Otherwise, we get errors like this: $ truncate -s 500T /tmp/a $ mkfs.xfs -f -N /tmp/a -d extszinherit=2097151,rtinherit=1 -r extsize=28k illegal extent size hint 2097151, must be less than 2097151 and a multiple of 7. Signed-off-by: "Darrick J. 
Wong" --- fs/xfs/libxfs/xfs_inode_buf.c | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index f24fa628fecf1e..3fd1b03b4c78cc 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -819,13 +819,11 @@ xfs_inode_validate_extsize( bool rt_flag; bool hint_flag; bool inherit_flag; - uint32_t extsize_bytes; - uint32_t blocksize_bytes; + uint32_t alloc_unit = 1; rt_flag = (flags & XFS_DIFLAG_REALTIME); hint_flag = (flags & XFS_DIFLAG_EXTSIZE); inherit_flag = (flags & XFS_DIFLAG_EXTSZINHERIT); - extsize_bytes = XFS_FSB_TO_B(mp, extsize); /* * This comment describes a historic gap in this verifier function. @@ -854,9 +852,7 @@ xfs_inode_validate_extsize( */ if (rt_flag) - blocksize_bytes = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize); - else - blocksize_bytes = mp->m_sb.sb_blocksize; + alloc_unit = mp->m_sb.sb_rextsize; if ((hint_flag || inherit_flag) && !(S_ISDIR(mode) || S_ISREG(mode))) return __this_address; @@ -874,7 +870,7 @@ xfs_inode_validate_extsize( if (mode && !(hint_flag || inherit_flag) && extsize != 0) return __this_address; - if (extsize_bytes % blocksize_bytes) + if (extsize % alloc_unit) return __this_address; if (extsize > XFS_MAX_BMBT_EXTLEN) @@ -909,12 +905,10 @@ xfs_inode_validate_cowextsize( { bool rt_flag; bool hint_flag; - uint32_t cowextsize_bytes; - uint32_t blocksize_bytes; + uint32_t alloc_unit = 1; rt_flag = (flags & XFS_DIFLAG_REALTIME); hint_flag = (flags2 & XFS_DIFLAG2_COWEXTSIZE); - cowextsize_bytes = XFS_FSB_TO_B(mp, cowextsize); /* * Similar to extent size hints, a directory can be configured to @@ -929,9 +923,7 @@ xfs_inode_validate_cowextsize( */ if (rt_flag) - blocksize_bytes = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize); - else - blocksize_bytes = mp->m_sb.sb_blocksize; + alloc_unit = mp->m_sb.sb_rextsize; if (hint_flag && !xfs_has_reflink(mp)) return __this_address; @@ -946,7 +938,7 @@ xfs_inode_validate_cowextsize( if 
(mode && !hint_flag && cowextsize != 0) return __this_address; - if (cowextsize_bytes % blocksize_bytes) + if (cowextsize % alloc_unit) return __this_address; if (cowextsize > XFS_MAX_BMBT_EXTLEN) From patchwork Fri Dec 13 01:24:19 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Darrick J. Wong" X-Patchwork-Id: 13906326 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 5F2BA38385 for ; Fri, 13 Dec 2024 01:24:20 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1734053060; cv=none; b=JiRAeiYj/oPk51BxL+2U+bSkO7HEaJUKmY5gM0KMZPDZ4EQlhZsa5lBItVzCNsadN5rEVb5io3LuX6oAUhV/G05YcVO2XLDsXFWeW2K8bTECqxbIsAUUwZa7crZ01pSt9kwCHSJwziro2vRNf0PjDMByPvyjLYxGkoqugNyvGlY= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1734053060; c=relaxed/simple; bh=8A3OvtxShk/d/AttFSxe8ntzKAWvzGQnO9G6Ds5GA8Q=; h=Date:Subject:From:To:Cc:Message-ID:In-Reply-To:References: MIME-Version:Content-Type; b=RKbs+7S448yyiz+f7esxoAJsQYutUmrzMngZ9nfj/ZhWnVepRdm1UmCVKsX6f3Peq2opziZk694KNkSGJXUkduM5yfaHew+XFfhB3a0BtwiW7xMFdL0RqRoJCevsFk+EzeZlHQ7d/IEWKVutWHMnK41CboskRcp9pDJ3NxqMnQo= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=Ybo6el0V; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="Ybo6el0V" Received: by smtp.kernel.org (Postfix) with ESMTPSA id 37A11C4CECE; Fri, 13 Dec 2024 01:24:20 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1734053060; 
bh=8A3OvtxShk/d/AttFSxe8ntzKAWvzGQnO9G6Ds5GA8Q=; h=Date:Subject:From:To:Cc:In-Reply-To:References:From; b=Ybo6el0VDtl33ZNmsXN3laZqw5s0fiahZXuhuVYg/70bBeHDbuYcwOIkW4uBd3t3e osD9xAzHx3EB8NYSzsqdAvlS33ALYjUDbUVfzkxBWSQmjVsKbqUnTONKCrrRHGAYjo zashb4lZc+n60ll8Ac23gmL5qo1QzW7pHUF/55f0OTOM8X1l/+yo271UScbbNrl/8V Ssu9c4lIDRhbqyYqsf2bem0DP3aeSqY0icEPOVVKdS0cwAmKzCvil9fgurZ83BaCbT sygqwlYGoeMXy/fNqBxMeg9Xh/3v2uPbx7LU7hlJ6qpby5uTc/8H3ZGrdiGD7N5UfT hfVqz9PfqKqdQ== Date: Thu, 12 Dec 2024 17:24:19 -0800 Subject: [PATCH 11/11] xfs: support realtime reflink with an extent size that isn't a power of 2 From: "Darrick J. Wong" To: djwong@kernel.org Cc: hch@lst.de, linux-xfs@vger.kernel.org Message-ID: <173405125928.1184063.9203313014441349759.stgit@frogsfrogsfrogs> In-Reply-To: <173405125712.1184063.11685981006674346615.stgit@frogsfrogsfrogs> References: <173405125712.1184063.11685981006674346615.stgit@frogsfrogsfrogs> Precedence: bulk X-Mailing-List: linux-xfs@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Darrick J. Wong Add the necessary alignment checking code to the reflink remap code to ensure that remap requests are aligned to rt extent boundaries if the realtime extent size isn't a power of two. The VFS helpers assume that they can use the usual (blocksize - 1) masking to avoid slow 64-bit division, but since XFS is special we won't make everyone pay that cost for our weird edge case. Signed-off-by: "Darrick J. Wong" --- fs/xfs/xfs_reflink.c | 119 +++++++++++++++++++++++++++++++++++++------------- fs/xfs/xfs_reflink.h | 2 - fs/xfs/xfs_rtalloc.c | 4 -- fs/xfs/xfs_super.c | 9 ---- 4 files changed, 90 insertions(+), 44 deletions(-) diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 0222b78dedd92d..6ceb00565bab24 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -1638,6 +1638,83 @@ xfs_reflink_adjust_rtbigalloc_len( # define xfs_reflink_adjust_rtbigalloc_len(...) 
(0) #endif /* CONFIG_XFS_RT */ +/* + * Check the alignment of a remap request when the allocation unit size isn't a + * power of two. The VFS helpers use (fast) bitmask-based alignment checks, + * but here we have to use slow long division. + */ +static int +xfs_reflink_remap_check_rtalign( + struct xfs_inode *ip_in, + loff_t pos_in, + struct xfs_inode *ip_out, + loff_t pos_out, + loff_t *req_len, + unsigned int remap_flags) +{ + struct xfs_mount *mp = ip_in->i_mount; + uint32_t rextbytes; + loff_t in_size, out_size; + loff_t new_length, length = *req_len; + loff_t blen; + + rextbytes = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize); + in_size = i_size_read(VFS_I(ip_in)); + out_size = i_size_read(VFS_I(ip_out)); + + /* The start of both ranges must be aligned to a rt extent. */ + if (!isaligned_64(pos_in, rextbytes) || + !isaligned_64(pos_out, rextbytes)) + return -EINVAL; + + if (length == 0) + length = in_size - pos_in; + + /* + * If the user wanted us to exchange up to the infile's EOF, round up + * to the next block boundary for this check. + * + * Otherwise, reject the range length if it's not extent aligned. We + * already confirmed the starting offsets' extent alignment. + */ + if (pos_in + length == in_size) + blen = roundup_64(in_size, rextbytes) - pos_in; + else + blen = rounddown_64(length, rextbytes); + + /* Don't allow overlapped remappings within the same file. */ + if (ip_in == ip_out && + pos_out + blen > pos_in && + pos_in + blen > pos_out) + return -EINVAL; + + /* + * Ensure that we don't exchange a partial EOF extent into the middle + * of another file. + */ + if (isaligned_64(length, rextbytes)) + return 0; + + new_length = length; + if (pos_out + length < out_size) + new_length = rounddown_64(new_length, rextbytes); + + if (new_length == length) + return 0; + + /* + * Return the shortened request if the caller permits it. If the + * request was shortened to zero rt extents, we know that the original + * arguments weren't valid in the first place. 
+ */ + if ((remap_flags & REMAP_FILE_CAN_SHORTEN) && new_length > 0) { + *req_len = new_length; + return 0; + } + + return (remap_flags & REMAP_FILE_DEDUP) ? -EBADE : -EINVAL; +} + /* * Prepare two files for range cloning. Upon a successful return both inodes * will have the iolock and mmaplock held, the page cache of the out file will @@ -1681,6 +1758,7 @@ xfs_reflink_remap_prep( struct inode *inode_out = file_inode(file_out); struct xfs_inode *dest = XFS_I(inode_out); const struct iomap_ops *dax_read_ops = NULL; + unsigned int alloc_unit = xfs_inode_alloc_unitsize(dest); int ret; /* Lock both files against IO */ @@ -1698,14 +1776,22 @@ xfs_reflink_remap_prep( if (IS_DAX(inode_in) != IS_DAX(inode_out)) goto out_unlock; - ASSERT(is_power_of_2(xfs_inode_alloc_unitsize(dest))); + /* Check non-power of two alignment issues, if necessary. */ + if (XFS_IS_REALTIME_INODE(dest) && !is_power_of_2(alloc_unit)) { + ret = xfs_reflink_remap_check_rtalign(src, pos_in, dest, + pos_out, len, remap_flags); + if (ret) + goto out_unlock; + + /* Do the VFS checks with the regular block alignment. */ + alloc_unit = src->i_mount->m_sb.sb_blocksize; + } if (IS_DAX(inode_in)) dax_read_ops = &xfs_read_iomap_ops; ret = __generic_remap_file_range_prep(file_in, pos_in, file_out, - pos_out, len, remap_flags, dax_read_ops, - xfs_inode_alloc_unitsize(dest)); + pos_out, len, remap_flags, dax_read_ops, alloc_unit); if (ret || *len == 0) goto out_unlock; @@ -1949,30 +2035,3 @@ xfs_reflink_unshare( trace_xfs_reflink_unshare_error(ip, error, _RET_IP_); return error; } - -/* - * Can we use reflink with this realtime extent size? Note that we don't check - * for rblocks > 0 here because this can be called as part of attaching a new - * rt section. 
- */ -bool -xfs_reflink_supports_rextsize( - struct xfs_mount *mp, - unsigned int rextsize) -{ - /* reflink on the realtime device requires rtgroups */ - if (!xfs_has_rtgroups(mp)) - return false; - - /* - * Reflink doesn't support file allocation units larger than a single - * block and not a power of two because we would have to perform - * CoW-around for unaligned write requests to guarantee that we always - * remap entire allocation units and the reflink code cannot yet handle - * rounding ranges to align to non powers of two. - */ - if (!is_power_of_2(rextsize)) - return false; - - return true; -} diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h index cc4e92278279b6..3bfd7ab9e1148a 100644 --- a/fs/xfs/xfs_reflink.h +++ b/fs/xfs/xfs_reflink.h @@ -62,6 +62,4 @@ extern int xfs_reflink_remap_blocks(struct xfs_inode *src, loff_t pos_in, extern int xfs_reflink_update_dest(struct xfs_inode *dest, xfs_off_t newlen, xfs_extlen_t cowextsize, unsigned int remap_flags); -bool xfs_reflink_supports_rextsize(struct xfs_mount *mp, unsigned int rextsize); - #endif /* __XFS_REFLINK_H */ diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index d8e6d073d64dc9..586da450cc44b4 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -1295,9 +1295,7 @@ xfs_growfs_rt( goto out_unlock; if (xfs_has_reflink(mp)) goto out_unlock; - } else if (xfs_has_reflink(mp) && - !xfs_reflink_supports_rextsize(mp, in->extsize)) - goto out_unlock; + } error = xfs_sb_validate_fsb_count(&mp->m_sb, in->newblocks); if (error) diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index c91b9467a3eef8..8050fea541140a 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1754,15 +1754,6 @@ xfs_fs_fill_super( xfs_warn_experimental(mp, XFS_EXPERIMENTAL_METADIR); if (xfs_has_reflink(mp)) { - if (xfs_has_realtime(mp) && - !xfs_reflink_supports_rextsize(mp, mp->m_sb.sb_rextsize)) { - xfs_alert(mp, - "reflink not compatible with non-power-of-2 realtime extent size %u!", - 
mp->m_sb.sb_rextsize); - error = -EINVAL; - goto out_filestream_unmount; - } - /* * always-cow mode is not supported on filesystems with rt * extent sizes larger than a single block because we'd have