From patchwork Fri Dec 13 01:21:43 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Darrick J. Wong" X-Patchwork-Id: 13906311 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 6278317BA1 for ; Fri, 13 Dec 2024 01:21:44 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1734052904; cv=none; b=rkfMovjceYNsCY94FppUv9Yd8FiIE/A5pw7CeuGBZwPGGhJhRRAIJVmsd373l4wjzFDMrLrTUsmfKnAuaTMz5o+5wOaJfIpmAclMzm2J5LuYgcMTwYZ/bP9Lfh7KdkrBJ0U0kwtRa//bFqoLbt2Myr0Tk6k06ZhvBVGxigp4LrI= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1734052904; c=relaxed/simple; bh=v/juFr1JZ3Mw7m4ULZG1dvu6bJZJbU4F1Y2pTICBUMU=; h=Date:Subject:From:To:Cc:Message-ID:In-Reply-To:References: MIME-Version:Content-Type; b=lSId6vDb2gXNNGjRYV3n5CwP2M3ZfxrundZuQTOCmaH3dQN/KBmcQyyZlvXzdG4nk6mcZ7gShB6onzgd7uesASNsYfiKN7jSh/vveKWPnC0XuwPsqyhiHnRtAJd6lLDU4+AuLHdxQTDuDoBSUR5gxpvx/0fOPWAwJdgYrr8Gfxo= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=L85F9kfU; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="L85F9kfU" Received: by smtp.kernel.org (Postfix) with ESMTPSA id D11DAC4CECE; Fri, 13 Dec 2024 01:21:43 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1734052903; bh=v/juFr1JZ3Mw7m4ULZG1dvu6bJZJbU4F1Y2pTICBUMU=; h=Date:Subject:From:To:Cc:In-Reply-To:References:From; b=L85F9kfUlrqLww01xtaGJptmXm6EavzSBPc1zUZh8q/drd+KCMHvkd9NfRwtlwEck 
uNBxmA+hNH6slNf+XUcNgMYbjTskU7bjKOyKl+RUkP3AYoLOxzoGu4IAsa7HO5Iq10 k7wLmizFpnW4bSLzoXwqPBWKrbpQNkTE7aVxD40V57G5+ZcevDxc1N3SrPMikmFpGi SaGkbalv8r8bcKF+iphay1xSD3SwFTv4pUA1e9a9rFPGh4Pz8p44PZ906kmiL1kovl 8l4f8PQe5PpNpL/DsbCMIgKijRfBcKNYlgMR9JftmJFCDZ3vvHUawQ5mA2pWuMafCS phE/jVBf5WuJw== Date: Thu, 12 Dec 2024 17:21:43 -0800 Subject: [PATCH 01/11] vfs: explicitly pass the block size to the remap prep function From: "Darrick J. Wong" To: djwong@kernel.org Cc: hch@lst.de, linux-xfs@vger.kernel.org Message-ID: <173405125759.1184063.6610287530974429945.stgit@frogsfrogsfrogs> In-Reply-To: <173405125712.1184063.11685981006674346615.stgit@frogsfrogsfrogs> References: <173405125712.1184063.11685981006674346615.stgit@frogsfrogsfrogs> Precedence: bulk X-Mailing-List: linux-xfs@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Darrick J. Wong Make it so that filesystems can pass an explicit blocksize to the remap prep function. This enables filesystems whose fundamental allocation units are /not/ the same as the blocksize to ensure that the remapping checks are aligned properly. Signed-off-by: "Darrick J. 
Wong" --- fs/dax.c | 5 ++++- fs/remap_range.c | 30 ++++++++++++++++++------------ include/linux/fs.h | 3 ++- 3 files changed, 24 insertions(+), 14 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index 21b47402b3dca4..c7ea298b4214a5 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -2076,7 +2076,10 @@ int dax_remap_file_range_prep(struct file *file_in, loff_t pos_in, loff_t *len, unsigned int remap_flags, const struct iomap_ops *ops) { + unsigned int blocksize = file_inode(file_out)->i_sb->s_blocksize; + return __generic_remap_file_range_prep(file_in, pos_in, file_out, - pos_out, len, remap_flags, ops); + pos_out, len, remap_flags, ops, + blocksize); } EXPORT_SYMBOL_GPL(dax_remap_file_range_prep); diff --git a/fs/remap_range.c b/fs/remap_range.c index 26afbbbfb10c2e..d3c6c6b05eb191 100644 --- a/fs/remap_range.c +++ b/fs/remap_range.c @@ -30,18 +30,18 @@ */ static int generic_remap_checks(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, - loff_t *req_count, unsigned int remap_flags) + loff_t *req_count, unsigned int remap_flags, + unsigned int blocksize) { struct inode *inode_in = file_in->f_mapping->host; struct inode *inode_out = file_out->f_mapping->host; uint64_t count = *req_count; uint64_t bcount; loff_t size_in, size_out; - loff_t bs = inode_out->i_sb->s_blocksize; int ret; /* The start of both ranges must be aligned to an fs block. */ - if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_out, bs)) + if (!IS_ALIGNED(pos_in, blocksize) || !IS_ALIGNED(pos_out, blocksize)) return -EINVAL; /* Ensure offsets don't wrap. 
*/ @@ -75,10 +75,10 @@ static int generic_remap_checks(struct file *file_in, loff_t pos_in, */ if (pos_in + count == size_in && (!(remap_flags & REMAP_FILE_DEDUP) || pos_out + count == size_out)) { - bcount = ALIGN(size_in, bs) - pos_in; + bcount = ALIGN(size_in, blocksize) - pos_in; } else { - if (!IS_ALIGNED(count, bs)) - count = ALIGN_DOWN(count, bs); + if (!IS_ALIGNED(count, blocksize)) + count = ALIGN_DOWN(count, blocksize); bcount = count; } @@ -134,9 +134,10 @@ static int generic_remap_check_len(struct inode *inode_in, struct inode *inode_out, loff_t pos_out, loff_t *len, - unsigned int remap_flags) + unsigned int remap_flags, + unsigned int blocksize) { - u64 blkmask = i_blocksize(inode_in) - 1; + u64 blkmask = blocksize - 1; loff_t new_len = *len; if ((*len & blkmask) == 0) @@ -277,7 +278,8 @@ int __generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, loff_t *len, unsigned int remap_flags, - const struct iomap_ops *dax_read_ops) + const struct iomap_ops *dax_read_ops, + unsigned int blocksize) { struct inode *inode_in = file_inode(file_in); struct inode *inode_out = file_inode(file_out); @@ -312,7 +314,7 @@ __generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, /* Check that we don't violate system file offset limits. 
*/ ret = generic_remap_checks(file_in, pos_in, file_out, pos_out, len, - remap_flags); + remap_flags, blocksize); if (ret || *len == 0) return ret; @@ -353,7 +355,7 @@ __generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, } ret = generic_remap_check_len(inode_in, inode_out, pos_out, len, - remap_flags); + remap_flags, blocksize); if (ret || *len == 0) return ret; @@ -363,13 +365,17 @@ __generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, return ret; } +EXPORT_SYMBOL(__generic_remap_file_range_prep); int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, loff_t *len, unsigned int remap_flags) { + unsigned int blocksize = file_inode(file_out)->i_sb->s_blocksize; + return __generic_remap_file_range_prep(file_in, pos_in, file_out, - pos_out, len, remap_flags, NULL); + pos_out, len, remap_flags, NULL, + blocksize); } EXPORT_SYMBOL(generic_remap_file_range_prep); diff --git a/include/linux/fs.h b/include/linux/fs.h index 7e29433c5ecce2..b638fb1bcbc96f 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2191,7 +2191,8 @@ int remap_verify_area(struct file *file, loff_t pos, loff_t len, bool write); int __generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, loff_t *len, unsigned int remap_flags, - const struct iomap_ops *dax_read_ops); + const struct iomap_ops *dax_read_ops, + unsigned int block_size); int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, loff_t *count, unsigned int remap_flags); From patchwork Fri Dec 13 01:21:59 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Darrick J. 
Wong" X-Patchwork-Id: 13906312 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id A05722AD25 for ; Fri, 13 Dec 2024 01:21:59 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1734052919; cv=none; b=mu670qsHSFYfcIZiUXr6aLEJlFdQwHh1NU70W9JajHqT9YZ709dLD9JDiB9HgNcQ+dMe8LHEbOSs4b9AAvj04IKGcfdqdf7KFlGkI6sZnnMEFTEumy3qYNGx117t+PEYF6E+pe9noBKElsa6Y1JHTbhPOvDTYRfVc6ahhxanztU= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1734052919; c=relaxed/simple; bh=DnbyVknEHkpEa7j2O2HYhbucDbhzPaQWV7TRZ0C8uOU=; h=Date:Subject:From:To:Cc:Message-ID:In-Reply-To:References: MIME-Version:Content-Type; b=g1CXmpiyDEgHG1z54Rqo4Bnc3MWBmKrJX6nfZWk0M/G8n+wg53IvOmz8vsg94aL8xuG1+2o6xlPQP2Kzip9xIgec9Bqvl7D4GUB2xdq+X0lEww2Brr8XIXIE+wR5hBnzerNC5cNtOXSTi6uJdkU3x01XeFlRKfiZN4z/NWp3yc4= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=KUvsxyJX; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="KUvsxyJX" Received: by smtp.kernel.org (Postfix) with ESMTPSA id 774BFC4CED4; Fri, 13 Dec 2024 01:21:59 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1734052919; bh=DnbyVknEHkpEa7j2O2HYhbucDbhzPaQWV7TRZ0C8uOU=; h=Date:Subject:From:To:Cc:In-Reply-To:References:From; b=KUvsxyJXJa1gA/L3mgBNu507BVpty2IyWVj/o7l8WEEOhmJpYuS9YnWMoh+Ze7Cf3 32j9n23KvlhhjCJWrw+yHKfpjZQKQ8yxupmJl7tp0nLYSlrqcOZLHyh/SFQasVJ1Mg O0tKTfJrgUoxU1L9Makw7pvJdWwZNq03OJSvigW8PAr0m6c4otfAKx7aeGxNHeSnJq 
AEpKiaIMXHlswa+wRZUF53sevz2ZpJr4AIjPP+uN2LarTUIazXBn5RRLowUw/VwuiS B/SLbiGqkOy1TtFzc3dAApjosvvE2j3Q3SK5Q7JgKjGHFSIcpaxqhU/wOPz3iMxFl9 hndeGPDQ6DBNQ== Date: Thu, 12 Dec 2024 17:21:59 -0800 Subject: [PATCH 02/11] iomap: allow zeroing of written extents beyond EOF From: "Darrick J. Wong" To: djwong@kernel.org Cc: hch@lst.de, linux-xfs@vger.kernel.org Message-ID: <173405125776.1184063.5414430767804356851.stgit@frogsfrogsfrogs> In-Reply-To: <173405125712.1184063.11685981006674346615.stgit@frogsfrogsfrogs> References: <173405125712.1184063.11685981006674346615.stgit@frogsfrogsfrogs> Precedence: bulk X-Mailing-List: linux-xfs@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Darrick J. Wong In commit eb65540aa9fc82 ("iomap: warn on zero range of a post-eof folio"), we established that iomap_zero_range cannot dirty folios beyond EOF because writeback will ignore those dirty folios. However, XFS can only handle copy on write of entire file allocation units. For reflink on a realtime volume where the allocation unit size is larger than a single fsblock, if EOF is in the middle of an allocation unit, we must use the pagecache to stage the out of place write, even if that means having (zeroed) dirty pagecache beyond EOF. To support this, the writeback call knows how to extend the writeback range to align with an allocation unit, and it successfully finds the dirty post-EOF folios. Therefore, we need to disable this check for this particular situation. Signed-off-by: "Darrick J. 
Wong" --- fs/gfs2/bmap.c | 2 +- fs/iomap/buffered-io.c | 25 ++++++++++++++++++++----- fs/xfs/xfs_iomap.c | 27 ++++++++++++++++++++++++++- include/linux/iomap.h | 6 +++++- 4 files changed, 52 insertions(+), 8 deletions(-) diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 1795c4e8dbf66a..ce9293c916363e 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -1300,7 +1300,7 @@ static int gfs2_block_zero_range(struct inode *inode, loff_t from, unsigned int length) { BUG_ON(current->journal_info); - return iomap_zero_range(inode, from, length, NULL, &gfs2_iomap_ops); + return iomap_zero_range(inode, from, length, NULL, &gfs2_iomap_ops, 0); } #define GFS2_JTRUNC_REVOKES 8192 diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 955f19e27e47c5..4e851e9c2a1002 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1350,7 +1350,8 @@ static inline int iomap_zero_iter_flush_and_stale(struct iomap_iter *i) return filemap_write_and_wait_range(mapping, i->pos, end); } -static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero) +static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero, + unsigned zeroing_flags) { loff_t pos = iter->pos; loff_t length = iomap_length(iter); @@ -1363,6 +1364,18 @@ static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero) size_t bytes = min_t(u64, SIZE_MAX, length); bool ret; + /* + * If we've gone past EOF and have a written mapping, and the + * filesystem supports written mappings past EOF, skip the rest + * of the range. We can't write that back anyway. 
+ */ + if (pos > iter->inode->i_size && + (zeroing_flags & IOMAP_ZERO_MAPPED_BEYOND_EOF)) { + written += length; + length = 0; + break; + } + status = iomap_write_begin(iter, pos, bytes, &folio); if (status) return status; @@ -1395,7 +1408,7 @@ static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero) int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, - const struct iomap_ops *ops) + const struct iomap_ops *ops, unsigned zeroing_flags) { struct iomap_iter iter = { .inode = inode, @@ -1424,7 +1437,8 @@ iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, filemap_range_needs_writeback(mapping, pos, pos + plen - 1)) { iter.len = plen; while ((ret = iomap_iter(&iter, ops)) > 0) - iter.processed = iomap_zero_iter(&iter, did_zero); + iter.processed = iomap_zero_iter(&iter, did_zero, + zeroing_flags); iter.len = len - (iter.pos - pos); if (ret || !iter.len) @@ -1453,7 +1467,8 @@ iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, continue; } - iter.processed = iomap_zero_iter(&iter, did_zero); + iter.processed = iomap_zero_iter(&iter, did_zero, + zeroing_flags); } return ret; } @@ -1469,7 +1484,7 @@ iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, /* Block boundary? 
Nothing to do */ if (!off) return 0; - return iomap_zero_range(inode, pos, blocksize - off, did_zero, ops); + return iomap_zero_range(inode, pos, blocksize - off, did_zero, ops, 0); } EXPORT_SYMBOL_GPL(iomap_truncate_page); diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 50fa3ef89f6c98..b7d0dfd5fd3117 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -1490,14 +1490,39 @@ xfs_zero_range( bool *did_zero) { struct inode *inode = VFS_I(ip); + unsigned int zeroing_flags = 0; xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL); if (IS_DAX(inode)) return dax_zero_range(inode, pos, len, did_zero, &xfs_dax_write_iomap_ops); + + /* + * Files with allocation units larger than the fsblock size can share + * zeroed written blocks beyond EOF if the EOF is in the middle of an + * allocation unit because it keeps the refcounting code simple. We + * therefore permit zeroing of pagecache for these post-EOF written + * extents so that the blocks in the CoW staging extent beyond EOF are + * all initialized to zero. + * + * Alternate designs could be: (a) don't allow sharing of an allocation + * unit that spans EOF because of the unwritten blocks; (b) rewrite the + * reflink code to allow shared unwritten extents in this one corner + * case; or (c) insert zeroed pages into the pagecache to get around + * the checks in iomap_zero_range. + * + * However, this design (allow zeroing of pagecache beyond EOF) was + * chosen because it most closely resembles what we do for allocation + * unit == 1 fsblock. Note that for these files, we force writeback + * of post-EOF folios to ensure that CoW always happens in units of + * allocation units. 
+ */ + if (xfs_inode_has_bigrtalloc(ip) && xfs_has_reflink(ip->i_mount)) + zeroing_flags |= IOMAP_ZERO_MAPPED_BEYOND_EOF; + return iomap_zero_range(inode, pos, len, did_zero, - &xfs_buffered_write_iomap_ops); + &xfs_buffered_write_iomap_ops, zeroing_flags); } int diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 5675af6b740c27..31a5aa239aab1d 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -306,7 +306,11 @@ bool iomap_dirty_folio(struct address_space *mapping, struct folio *folio); int iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len, const struct iomap_ops *ops); int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, - bool *did_zero, const struct iomap_ops *ops); + bool *did_zero, const struct iomap_ops *ops, + unsigned zeroing_flags); +/* ignore written mappings allowed beyond EOF */ +#define IOMAP_ZERO_MAPPED_BEYOND_EOF (1U << 0) + int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, const struct iomap_ops *ops); vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, From patchwork Fri Dec 13 01:22:14 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Darrick J. 
Wong" X-Patchwork-Id: 13906313 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 4BA37DDDC for ; Fri, 13 Dec 2024 01:22:15 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1734052935; cv=none; b=Y6qeR+89P1Vusi2uNfWNIlKCyQUOKhOTEkNC+ODCesZlamxjsDMugILGuiNfciz4e2ArDV7y5YNU16Buin2sMrCgV+/qp/c5q7wyfufTd8OXQhoPyGo9AJb6+ztCnjeKuyfmuARy7ethurfj+8XEzAp6OgbQlGRHosk8mbezew0= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1734052935; c=relaxed/simple; bh=tYFRfTaYWFLF1LBGiqPXSf7STfiVxXYUre03GvvLNHc=; h=Date:Subject:From:To:Cc:Message-ID:In-Reply-To:References: MIME-Version:Content-Type; b=uCYkpUa3tEjLjzN8fHpbDmNNBRm11EWXrXXHTKtbWGh6buFVNSMsbTV1o3Va+RutGHRXHizCMVe/1WQRHsJviVS6FNOLTWRLWeoRYXPnA1MSgffHC7dhwiqKHm4vl7lDJ1poyQ8DTh9zFvWJDDp7xpXb+wvCrMeYkX0GUzI7a9w= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=U2V4nviK; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="U2V4nviK" Received: by smtp.kernel.org (Postfix) with ESMTPSA id 208B3C4CECE; Fri, 13 Dec 2024 01:22:15 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1734052935; bh=tYFRfTaYWFLF1LBGiqPXSf7STfiVxXYUre03GvvLNHc=; h=Date:Subject:From:To:Cc:In-Reply-To:References:From; b=U2V4nviKKBwElknWBg8hAtlK4qFhC5NFTXptWIoVuSsQJJhMphfFBNjkrUWBqyZJn sxV41w0v3GXSA6ePN0E49bQKuIVKenAQQ4lDRgmPRBKNHbnIwVJnKtN6G3dvCAONWC F7A+3K2Z56nI2BKOgMIu6xhx4D4Z8BQys7balJ36mh+YqjnrWRrjsSbOUYnlhO/8Ue 
HgaYce9uhx1chAcZNc0Cj50w5bPN/jcAxIC4Dg4A6i1IbqQmd4MT2VXqhIXjjkbfyO 0dmM2V6bQO3qLppp02kpg87yGgr06SqBy7huqcEv20r+G4GfiSYJrMMdmOv/PAIX8l vxJZtwbYeIeuA== Date: Thu, 12 Dec 2024 17:22:14 -0800 Subject: [PATCH 03/11] xfs: convert partially written rt file extents to completely written From: "Darrick J. Wong" To: djwong@kernel.org Cc: hch@lst.de, linux-xfs@vger.kernel.org Message-ID: <173405125794.1184063.17925337081966040081.stgit@frogsfrogsfrogs> In-Reply-To: <173405125712.1184063.11685981006674346615.stgit@frogsfrogsfrogs> References: <173405125712.1184063.11685981006674346615.stgit@frogsfrogsfrogs> Precedence: bulk X-Mailing-List: linux-xfs@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Darrick J. Wong Create a utility function to convert the partially written extents of a realtime file to be completely written. In other words, if rextsize==7 and only block 6 is unwritten, these functions will zero out block 6 and convert the mapping to written so that the entire 7-block allocation unit can be remapped in a single operation. This is required for any rt file remapping activities that do not use log items to restart interrupted operations. Signed-off-by: "Darrick J. Wong" --- fs/xfs/libxfs/xfs_rtbitmap.h | 12 +++ fs/xfs/xfs_bmap_util.c | 182 ++++++++++++++++++++++++++++++++++++++++++ fs/xfs/xfs_bmap_util.h | 7 ++ fs/xfs/xfs_trace.h | 11 ++- 4 files changed, 208 insertions(+), 4 deletions(-) diff --git a/fs/xfs/libxfs/xfs_rtbitmap.h b/fs/xfs/libxfs/xfs_rtbitmap.h index 22e5d9cd95f47c..89eb1e42128b38 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.h +++ b/fs/xfs/libxfs/xfs_rtbitmap.h @@ -157,6 +157,18 @@ xfs_rtb_to_rtxoff( return do_div(rtbno, mp->m_sb.sb_rextsize); } +/* Return the offset of a file block offset within an rt extent. 
*/ +static inline xfs_extlen_t +xfs_fileoff_to_rtxoff( + struct xfs_mount *mp, + xfs_fileoff_t off) +{ + if (likely(mp->m_rtxblklog >= 0)) + return off & mp->m_rtxblkmask; + + return do_div(off, mp->m_sb.sb_rextsize); +} + /* Round this file block offset up to the nearest rt extent size. */ static inline xfs_rtblock_t xfs_fileoff_roundup_rtx( diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 0836fea2d6d814..3229b756f33780 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -1726,3 +1726,185 @@ xfs_swap_extents( xfs_trans_cancel(tp); goto out_unlock_ilock; } + +#ifdef CONFIG_XFS_RT +/* + * Decide if this is an unwritten extent that isn't aligned to an allocation + * unit boundary. + * + * If it is, shorten the mapping to the end of the allocation unit so that + * we're ready to convert all the mappings for this allocation unit to a zeroed + * written extent. If not, return false. + */ +static inline bool +xfs_want_convert_rtbigalloc_mapping( + struct xfs_mount *mp, + struct xfs_bmbt_irec *irec) +{ + xfs_fileoff_t rext_next; + xfs_extlen_t modoff, modcnt; + + if (irec->br_state != XFS_EXT_UNWRITTEN) + return false; + + modoff = xfs_fileoff_to_rtxoff(mp, irec->br_startoff); + if (modoff == 0) { + xfs_rtbxlen_t rexts; + + rexts = xfs_blen_to_rtbxlen(mp, irec->br_blockcount); + modcnt = xfs_blen_to_rtxoff(mp, irec->br_blockcount); + if (rexts > 0) { + /* + * Unwritten mapping starts at an rt extent boundary + * and is longer than one rt extent. Round the length + * down to the nearest extent but don't select it for + * conversion. + */ + irec->br_blockcount -= modcnt; + modcnt = 0; + } + + /* Unwritten mapping is perfectly aligned, do not convert. */ + if (modcnt == 0) + return false; + } + + /* + * Unaligned and unwritten; trim to the current rt extent and select it + * for conversion. 
+ */ + rext_next = (irec->br_startoff - modoff) + mp->m_sb.sb_rextsize; + xfs_trim_extent(irec, irec->br_startoff, rext_next - irec->br_startoff); + return true; +} + +/* + * Find an unwritten extent in the given file range, zero it, and convert the + * mapping to written. Adjust the scan cursor on the way out. + */ +STATIC int +xfs_convert_rtbigalloc_mapping( + struct xfs_inode *ip, + xfs_fileoff_t *offp, + xfs_fileoff_t endoff) +{ + struct xfs_bmbt_irec irec; + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + unsigned int resblks; + int nmap; + int error; + + resblks = XFS_DIOSTRAT_SPACE_RES(mp, 1); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp); + if (error) + return error; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); + + /* + * Read the mapping. If we find an unwritten extent that isn't aligned + * to an allocation unit... + */ +retry: + nmap = 1; + error = xfs_bmapi_read(ip, *offp, endoff - *offp, &irec, &nmap, 0); + if (error) + goto out_cancel; + ASSERT(nmap == 1); + ASSERT(irec.br_startoff == *offp); + if (!xfs_want_convert_rtbigalloc_mapping(mp, &irec)) { + *offp = irec.br_startoff + irec.br_blockcount; + if (*offp >= endoff) + goto out_cancel; + goto retry; + } + + /* + * ...then write zeroes to the space and change the mapping state to + * written. This consolidates the mappings for this allocation unit. + */ + nmap = 1; + error = xfs_bmapi_write(tp, ip, irec.br_startoff, irec.br_blockcount, + XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO, 0, &irec, &nmap); + if (error) + goto out_cancel; + error = xfs_trans_commit(tp); + if (error) + goto out_unlock; + + xfs_iunlock(ip, XFS_ILOCK_EXCL); + + /* + * If an unwritten mapping was returned, something is very wrong. + * If no mapping was returned, then bmapi_write thought it performed + * a short allocation, which should be impossible since we previously + * queried the mapping and haven't cycled locks since then. Either + * way, fail the operation. 
+ */ + if (nmap == 0 || irec.br_state != XFS_EXT_NORM) { + ASSERT(nmap != 0); + ASSERT(irec.br_state == XFS_EXT_NORM); + return -EIO; + } + + /* Advance the cursor to the end of the mapping returned. */ + *offp = irec.br_startoff + irec.br_blockcount; + return 0; + +out_cancel: + xfs_trans_cancel(tp); +out_unlock: + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; +} + +/* + * Prepare a file with multi-fsblock allocation units for a remapping. + * + * File allocation units (AU) must be fully mapped to the data fork. If the + * space in an AU has not been fully written, there can be multiple extent + * mappings (e.g. mixed written and unwritten blocks) to the AU. If the log + * does not have a means to ensure that all remappings for a given AU will be + * completed even if the fs goes down, we must maintain the above constraint in + * another way. + * + * Convert the unwritten parts of an AU to written by writing zeroes to the + * storage and flipping the mapping. Once this completes, there will be a + * single mapping for the entire AU, and we can proceed with the remapping + * operation. + * + * Callers must ensure that there are no dirty pages in the given range. 
+ */ +int +xfs_convert_rtbigalloc_file_space( + struct xfs_inode *ip, + loff_t pos, + uint64_t len) +{ + struct xfs_mount *mp = ip->i_mount; + xfs_fileoff_t off; + xfs_fileoff_t endoff; + int error; + + if (!xfs_inode_has_bigrtalloc(ip)) + return 0; + + off = xfs_fileoff_rounddown_rtx(mp, XFS_B_TO_FSBT(mp, pos)); + endoff = xfs_fileoff_roundup_rtx(mp, XFS_B_TO_FSB(mp, pos + len)); + + trace_xfs_convert_rtbigalloc_file_space(ip, pos, len); + + while (off < endoff) { + if (fatal_signal_pending(current)) + return -EINTR; + + error = xfs_convert_rtbigalloc_mapping(ip, &off, endoff); + if (error) + return error; + } + + return 0; +} +#endif /* CONFIG_XFS_RT */ diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h index b29760d36e1ab1..3834962670449f 100644 --- a/fs/xfs/xfs_bmap_util.h +++ b/fs/xfs/xfs_bmap_util.h @@ -79,4 +79,11 @@ int xfs_bmap_count_blocks(struct xfs_trans *tp, struct xfs_inode *ip, int xfs_flush_unmap_range(struct xfs_inode *ip, xfs_off_t offset, xfs_off_t len); +#ifdef CONFIG_XFS_RT +int xfs_convert_rtbigalloc_file_space(struct xfs_inode *ip, loff_t pos, + uint64_t len); +#else +# define xfs_convert_rtbigalloc_file_space(ip, pos, len) (-EOPNOTSUPP) +#endif + #endif /* __XFS_BMAP_UTIL_H__ */ diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 4fe689410eb6ae..8af9c38bea152f 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -1555,7 +1555,7 @@ DEFINE_IMAP_EVENT(xfs_iomap_alloc); DEFINE_IMAP_EVENT(xfs_iomap_found); DECLARE_EVENT_CLASS(xfs_simple_io_class, - TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count), + TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, u64 count), TP_ARGS(ip, offset, count), TP_STRUCT__entry( __field(dev_t, dev) @@ -1563,7 +1563,7 @@ DECLARE_EVENT_CLASS(xfs_simple_io_class, __field(loff_t, isize) __field(loff_t, disize) __field(loff_t, offset) - __field(size_t, count) + __field(u64, count) ), TP_fast_assign( __entry->dev = VFS_I(ip)->i_sb->s_dev; @@ -1574,7 +1574,7 @@ 
DECLARE_EVENT_CLASS(xfs_simple_io_class, __entry->count = count; ), TP_printk("dev %d:%d ino 0x%llx isize 0x%llx disize 0x%llx " - "pos 0x%llx bytecount 0x%zx", + "pos 0x%llx bytecount 0x%llx", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->isize, @@ -1585,7 +1585,7 @@ DECLARE_EVENT_CLASS(xfs_simple_io_class, #define DEFINE_SIMPLE_IO_EVENT(name) \ DEFINE_EVENT(xfs_simple_io_class, name, \ - TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count), \ + TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, u64 count), \ TP_ARGS(ip, offset, count)) DEFINE_SIMPLE_IO_EVENT(xfs_delalloc_enospc); DEFINE_SIMPLE_IO_EVENT(xfs_unwritten_convert); @@ -3971,6 +3971,9 @@ TRACE_EVENT(xfs_ioctl_clone, /* unshare tracepoints */ DEFINE_SIMPLE_IO_EVENT(xfs_reflink_unshare); DEFINE_INODE_ERROR_EVENT(xfs_reflink_unshare_error); +#ifdef CONFIG_XFS_RT +DEFINE_SIMPLE_IO_EVENT(xfs_convert_rtbigalloc_file_space); +#endif /* CONFIG_XFS_RT */ /* copy on write */ DEFINE_INODE_IREC_EVENT(xfs_reflink_trim_around_shared); From patchwork Fri Dec 13 01:22:30 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Darrick J. 
Wong" X-Patchwork-Id: 13906314 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id E9C1E10F7 for ; Fri, 13 Dec 2024 01:22:30 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1734052951; cv=none; b=IRJNAA3OMLtZhYBl+OOWDd/UQLGxfJTJjgi37+w1ksY9WLeREBBTiJmDJFVcjJzA2a5gmMWE29YOmTKDp/TZV+CFxARdx65tVNgoCjVrO1+nIComsq8rGsJciqLhZRwiogfTbzyeperUmCidaJOZtoihWmxSqJcnJa4rIgW5wLU= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1734052951; c=relaxed/simple; bh=lJO24BHcEN1PtFt1v7sR8R8LZfuGAQWLAzaB0FCjUXc=; h=Date:Subject:From:To:Cc:Message-ID:In-Reply-To:References: MIME-Version:Content-Type; b=ilcf8LVAo8r1kqGcyOD1TYyGwD7Kczb2a0LyQc76y2PpjTACpGSsdpRwY9kgLYB7omaMoYdBajp0399uDqPPsLESxJBKtgT5vyZP7doxnPZruM1cMlxeX4nUle1l+PAPbydhD78MRnqYhyBbKHCKR/WY/5jYcuLX1PFs4z/nfAo= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=rtTyHIgM; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="rtTyHIgM" Received: by smtp.kernel.org (Postfix) with ESMTPSA id C522BC4CECE; Fri, 13 Dec 2024 01:22:30 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1734052950; bh=lJO24BHcEN1PtFt1v7sR8R8LZfuGAQWLAzaB0FCjUXc=; h=Date:Subject:From:To:Cc:In-Reply-To:References:From; b=rtTyHIgMjZzuh73a4hTAtPjEoPe5RAcJ/nceVMg/FzjJLq6eQdAGTOk4PnPyZVTNc Icgp0TCB/poJghfH6qTsV43XMYJiX/w4T7m0hKZvvdLApN/TcAp+iluz+DghYW0ZS6 lDraH7geV76Aq+HLgok4XS8Xa8lo8b+Ku0tPjjZayghgKsFz8w3uBd+o+iMyh/OKOT 
QXdTkY54o8IFfWdDZ2RJdmqKEMyjzsjrYuDwljTfp+GhN8fNKNIaEc8w1jHpyYdovk JtVTPboowZOFoWC+nFqpMBtdsQf6jlYzwdKF/i8ZBX9AMvg7kmRkOMfExO1VE+6ptU Hsb8sED7VDn9w== Date: Thu, 12 Dec 2024 17:22:30 -0800 Subject: [PATCH 04/11] xfs: enable CoW when rt extent size is larger than 1 block From: "Darrick J. Wong" To: djwong@kernel.org Cc: hch@lst.de, linux-xfs@vger.kernel.org Message-ID: <173405125811.1184063.1464436221145143124.stgit@frogsfrogsfrogs> In-Reply-To: <173405125712.1184063.11685981006674346615.stgit@frogsfrogsfrogs> References: <173405125712.1184063.11685981006674346615.stgit@frogsfrogsfrogs> Precedence: bulk X-Mailing-List: linux-xfs@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Darrick J. Wong Copy on write encounters a major plot twist when the file being CoW'd lives on the realtime volume and the realtime extent size is larger than a single filesystem block. XFS can only unmap and remap full rt extents, which means that allocations are always done in units of full rt extents, and a request to unmap less than one extent is treated as a request to convert an extent to unwritten status. This behavioral quirk is not compatible with the existing CoW mechanism, so we have to intercept every path through which files can be modified to ensure that we dirty an entire rt extent at once so that we can remap a full rt extent. Use the existing VFS unshare functions to dirty the page cache to set that up. Signed-off-by: "Darrick J. 
Wong" --- fs/xfs/xfs_file.c | 270 +++++++++++++++++++++++++++++++++++++++++++++++++- fs/xfs/xfs_file.h | 3 + fs/xfs/xfs_inode.h | 6 + fs/xfs/xfs_iops.c | 29 +++++ fs/xfs/xfs_reflink.c | 39 +++++++ fs/xfs/xfs_trace.h | 1 6 files changed, 345 insertions(+), 3 deletions(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 9a435b1ff26475..fad768c0b3f328 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -347,6 +347,116 @@ xfs_file_splice_read( return ret; } +/* + * Decide if this file write requires COWing-around at either end of the write + * range. This is only required if the file allocation unit is larger than + * 1FSB and the write range is not aligned with the allocation unit. + */ +static bool +xfs_file_write_needs_cow_around( + struct xfs_inode *ip, + loff_t pos, + long long int count) +{ + /* + * No COWing required if this inode doesn't do COW. + * + * If the allocation unit is 1FSB, we do not need to COW around the + * edges of the operation range. This applies to all files on the data + * device and rt files that have an extent size of 1FSB. + */ + if (!xfs_inode_needs_cow_around(ip)) + return false; + + /* + * Otherwise, check that the operation is aligned to the rt extent + * size. Any unaligned operation /must/ be COWed around since the + * regular reflink code only handles extending writes up to fsblock + * boundaries. + */ + return !xfs_is_falloc_aligned(ip, pos, count); +} + +/* Do we need to COW-around at this offset to handle a truncate up or down? */ +bool +xfs_truncate_needs_cow_around( + struct xfs_inode *ip, + loff_t pos) +{ + return xfs_file_write_needs_cow_around(ip, pos, 0); +} + +/* Does this file write require COWing around? */ +static inline bool +xfs_iocb_needs_cow_around( + struct xfs_inode *ip, + const struct kiocb *iocb, + const struct iov_iter *from) +{ + return xfs_file_write_needs_cow_around(ip, iocb->ki_pos, + iov_iter_count(from)); +} + +/* Unshare the allocation unit mapped to the given file position. 
*/ +inline int +xfs_file_unshare_at( + struct xfs_inode *ip, + loff_t pos) +{ + loff_t isize = i_size_read(VFS_I(ip)); + unsigned int extsize, len; + uint32_t mod; + + len = extsize = xfs_inode_alloc_unitsize(ip); + + /* Open-coded rounddown_64 so that we can skip out if aligned */ + div_u64_rem(pos, extsize, &mod); + if (mod == 0) + return 0; + pos -= mod; + + /* Do not extend the file. */ + if (pos >= isize) + return 0; + if (pos + len > isize) + len = isize - pos; + + trace_xfs_file_cow_around(ip, pos, len); + + if (IS_DAX(VFS_I(ip))) + return dax_file_unshare(VFS_I(ip), pos, len, + &xfs_dax_write_iomap_ops); + return iomap_file_unshare(VFS_I(ip), pos, len, + &xfs_buffered_write_iomap_ops); +} + +/* + * Dirty the pages on either side of a write request as needed to satisfy + * alignment requirements if we're going to perform a copy-write. + * + * This is only needed for realtime files when the rt extent size is larger + * than 1 fs block, because we don't allow a logical rt extent in a file to map + * to multiple physical rt extents. In other words, we can only map and unmap + * full rt extents. Note that page cache doesn't exist above EOF, so be + * careful to stay below EOF. + */ +static int +xfs_file_cow_around( + struct xfs_inode *ip, + loff_t pos, + long long int count) +{ + int error; + + /* Unshare at the start of the extent. */ + error = xfs_file_unshare_at(ip, pos); + if (error) + return error; + + /* Unshare at the end. */ + return xfs_file_unshare_at(ip, pos + count); +} + /* * Take care of zeroing post-EOF blocks when they might exist. * @@ -411,6 +521,17 @@ xfs_file_write_zero_eof( return 1; } + /* + * If we're starting the write past EOF, COW the allocation unit + * containing the current EOF before we start zeroing the range between + * EOF and the start of the write. 
+ */ + if (xfs_truncate_needs_cow_around(ip, isize)) { + error = xfs_file_unshare_at(ip, isize); + if (error) + return error; + } + trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize); xfs_ilock(ip, XFS_MMAPLOCK_EXCL); @@ -456,9 +577,11 @@ xfs_file_write_checks( /* * For changing security info in file_remove_privs() we need i_rwsem - * exclusively. + * exclusively. We also need it to COW around the range being written. */ - if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) { + if (*iolock == XFS_IOLOCK_SHARED && + (!IS_NOSEC(inode) || + xfs_iocb_needs_cow_around(XFS_I(inode), iocb, from))) { xfs_iunlock(XFS_I(inode), *iolock); *iolock = XFS_IOLOCK_EXCL; error = xfs_ilock_iocb(iocb, *iolock); @@ -469,6 +592,22 @@ xfs_file_write_checks( goto restart; } + /* + * The write is not aligned to the file's allocation unit. If either + * of the allocation units at the start or end of the write range are + * shared, unshare them through the page cache. + */ + if (xfs_iocb_needs_cow_around(XFS_I(inode), iocb, from)) { + ASSERT(*iolock == XFS_IOLOCK_EXCL); + + inode_dio_wait(inode); + drained_dio = true; + + error = xfs_file_cow_around(XFS_I(inode), iocb->ki_pos, count); + if (error) + return error; + } + /* * If the offset is beyond the size of the file, we need to zero all * blocks that fall between the existing EOF and the start of this @@ -594,6 +733,16 @@ xfs_file_dio_write_aligned( unsigned int iolock = XFS_IOLOCK_SHARED; ssize_t ret; + /* + * If the range to write is not aligned to an allocation unit, we will + * have to COW the allocation units on both ends of the write. Because + * this runs through the page cache, it requires IOLOCK_EXCL. This + * predicate performs an unlocked access of the rt and reflink inode + * state. 
+ */ + if (xfs_iocb_needs_cow_around(ip, iocb, from)) + iolock = XFS_IOLOCK_EXCL; + ret = xfs_ilock_iocb_for_write(iocb, &iolock); if (ret) return ret; @@ -928,6 +1077,24 @@ xfs_falloc_setsize( &iattr); } +static int +xfs_falloc_punch_range( + struct xfs_inode *ip, + loff_t offset, + loff_t len) +{ + int error; + + /* Unshare around the region to punch, if needed. */ + if (xfs_file_write_needs_cow_around(ip, offset, len)) { + error = xfs_file_cow_around(ip, offset, len); + if (error) + return error; + } + + return xfs_free_file_space(ip, offset, len); +} + static int xfs_falloc_collapse_range( struct file *file, @@ -1017,6 +1184,13 @@ xfs_falloc_zero_range( if (error) return error; + /* Unshare around the region to zero, if needed. */ + if (xfs_file_write_needs_cow_around(XFS_I(inode), offset, len)) { + error = xfs_file_cow_around(XFS_I(inode), offset, len); + if (error) + return error; + } + error = xfs_free_file_space(XFS_I(inode), offset, len); if (error) return error; @@ -1044,6 +1218,23 @@ xfs_falloc_unshare_range( if (error) return error; + /* + * Enlarge the unshare region to align to a full allocation unit. 
+ */ + if (xfs_inode_needs_cow_around(XFS_I(inode))) { + unsigned int rextsize; + uint32_t mod; + + rextsize = xfs_inode_alloc_unitsize(XFS_I(inode)); + div_u64_rem(offset, rextsize, &mod); + offset -= mod; + len += mod; + + div_u64_rem(offset + len, rextsize, &mod); + if (mod) + len += rextsize - mod; + } + error = xfs_reflink_unshare(XFS_I(inode), offset, len); if (error) return error; @@ -1124,7 +1315,7 @@ xfs_file_fallocate( switch (mode & FALLOC_FL_MODE_MASK) { case FALLOC_FL_PUNCH_HOLE: - error = xfs_free_file_space(ip, offset, len); + error = xfs_falloc_punch_range(ip, offset, len); break; case FALLOC_FL_COLLAPSE_RANGE: error = xfs_falloc_collapse_range(file, offset, len); @@ -1458,6 +1649,70 @@ xfs_dax_read_fault( return ret; } +/* dax version of folio_mkwrite_check_truncate since vmf->page == NULL */ +static inline ssize_t +dax_write_fault_check( + struct vm_fault *vmf, + struct inode *inode, + unsigned int order) +{ + loff_t size = i_size_read(inode); + pgoff_t index = size >> PAGE_SHIFT; + size_t len = 1U << (PAGE_SHIFT + order); + size_t offset = size & (len - 1); + + if (!IS_ENABLED(CONFIG_FS_DAX)) { + ASSERT(0); + return -EFAULT; + } + + /* fault is wholly inside EOF */ + if (vmf->pgoff + (1U << order) - 1 < index) + return len; + /* fault is wholly past EOF */ + if (vmf->pgoff > index || !offset) + return -EFAULT; + /* fault is partially inside EOF */ + return offset; +} + +static int +xfs_filemap_fault_around( + struct vm_fault *vmf, + struct inode *inode, + unsigned int order) +{ + struct xfs_inode *ip = XFS_I(inode); + loff_t pos; + ssize_t len; + + if (!xfs_inode_needs_cow_around(ip)) + return 0; + + if (IS_DAX(inode)) { + len = dax_write_fault_check(vmf, inode, order); + if (len < 0) + return len; + pos = vmf->pgoff << PAGE_SHIFT; + } else { + struct folio *folio = page_folio(vmf->page); + + folio_lock(folio); + len = folio_mkwrite_check_truncate(folio, inode); + if (len < 0) { + folio_unlock(folio); + return len; + } + pos = folio_pos(folio); + 
folio_unlock(folio); + } + + if (!xfs_file_write_needs_cow_around(ip, pos, len)) + return 0; + + return xfs_file_cow_around(XFS_I(inode), pos, len); +} + /* * Locking for serialisation of IO during page faults. This results in a lock * ordering of: @@ -1476,6 +1731,7 @@ xfs_write_fault( struct inode *inode = file_inode(vmf->vma->vm_file); struct xfs_inode *ip = XFS_I(inode); unsigned int lock_mode = XFS_MMAPLOCK_SHARED; + int error; vm_fault_t ret; trace_xfs_write_fault(ip, order); @@ -1495,10 +1751,18 @@ xfs_write_fault( lock_mode = XFS_MMAPLOCK_EXCL; } + /* Unshare all the blocks in this rt extent surrounding this page. */ + error = xfs_filemap_fault_around(vmf, inode, order); + if (error) { + ret = vmf_fs_error(error); + goto out_unlock; + } + if (IS_DAX(inode)) ret = xfs_dax_fault_locked(vmf, order, true); else ret = iomap_page_mkwrite(vmf, &xfs_buffered_write_iomap_ops); +out_unlock: xfs_iunlock(ip, lock_mode); sb_end_pagefault(inode->i_sb); diff --git a/fs/xfs/xfs_file.h b/fs/xfs/xfs_file.h index 2ad91f755caf35..24490ea49e16c6 100644 --- a/fs/xfs/xfs_file.h +++ b/fs/xfs/xfs_file.h @@ -12,4 +12,7 @@ extern const struct file_operations xfs_dir_file_operations; bool xfs_is_falloc_aligned(struct xfs_inode *ip, loff_t pos, long long int len); +bool xfs_truncate_needs_cow_around(struct xfs_inode *ip, loff_t pos); +int xfs_file_unshare_at(struct xfs_inode *ip, loff_t pos); + #endif /* __XFS_FILE_H__ */ diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index c08093a65352ec..71ca16db369913 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -349,6 +349,12 @@ static inline bool xfs_inode_has_bigrtalloc(const struct xfs_inode *ip) return XFS_IS_REALTIME_INODE(ip) && ip->i_mount->m_sb.sb_rextsize > 1; } +/* Decide if we need to unshare the blocks around a range that we're writing. 
*/ +static inline bool xfs_inode_needs_cow_around(struct xfs_inode *ip) +{ + return xfs_is_cow_inode(ip) && xfs_inode_has_bigrtalloc(ip); +} + /* * Return the buftarg used for data allocations on a given inode. */ diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 207e0dadffc3c5..114ebddaa7bc0d 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -29,6 +29,7 @@ #include "xfs_xattr.h" #include "xfs_file.h" #include "xfs_bmap.h" +#include "xfs_reflink.h" #include #include @@ -886,10 +887,38 @@ xfs_setattr_size( * truncate. */ if (newsize > oldsize) { + /* + * Extending the file size, so COW around the allocation unit + * containing EOF before we zero the new range of the file. + */ + if (xfs_truncate_needs_cow_around(ip, oldsize)) { + error = xfs_file_unshare_at(ip, oldsize); + if (error) + return error; + } + trace_xfs_zero_eof(ip, oldsize, newsize - oldsize); error = xfs_zero_range(ip, oldsize, newsize - oldsize, &did_zeroing); } else { + /* + * We're reducing the size of the file, so COW around the new + * EOF allocation unit before truncation zeroes the part of the + * EOF block after the new EOF. Flush the dirty pages to disk + * before we start truncating the pagecache because truncation + * zeroing doesn't preflush written mappings. 
+ */ + if (xfs_truncate_needs_cow_around(ip, newsize)) { + error = xfs_file_unshare_at(ip, newsize); + if (error) + return error; + + error = filemap_write_and_wait_range(inode->i_mapping, + newsize, newsize); + if (error) + return error; + } + error = xfs_truncate_page(ip, newsize, &did_zeroing); } diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 59f7fc16eb8093..4f87f7041995c4 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -34,6 +34,7 @@ #include "xfs_rtalloc.h" #include "xfs_rtgroup.h" #include "xfs_metafile.h" +#include "xfs_rtbitmap.h" /* * Copy on Write of Shared Blocks @@ -302,9 +303,26 @@ xfs_reflink_convert_cow_locked( struct xfs_iext_cursor icur; struct xfs_bmbt_irec got; struct xfs_btree_cur *dummy_cur = NULL; + struct xfs_mount *mp = ip->i_mount; int dummy_logflags; int error = 0; + /* + * We can only remap full rt extents, so make sure that we convert the + * entire extent. The caller must ensure that this is either a direct + * write that's aligned to the rt extent size, or a buffered write for + * which we've dirtied extra pages to make this work properly. + */ + if (xfs_inode_needs_cow_around(ip)) { + xfs_fileoff_t new_off; + + new_off = xfs_fileoff_rounddown_rtx(mp, offset_fsb); + count_fsb += offset_fsb - new_off; + offset_fsb = new_off; + + count_fsb = xfs_blen_roundup_rtx(mp, count_fsb); + } + if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got)) return 0; @@ -626,11 +644,21 @@ xfs_reflink_cancel_cow_blocks( bool cancel_real) { struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_COW_FORK); + struct xfs_mount *mp = ip->i_mount; struct xfs_bmbt_irec got, del; struct xfs_iext_cursor icur; bool isrt = XFS_IS_REALTIME_INODE(ip); int error = 0; + /* + * Shrink the range that we're cancelling if they don't align to the + * realtime extent size, since we can only free full extents. 
+ */ + if (xfs_inode_needs_cow_around(ip)) { + offset_fsb = xfs_fileoff_roundup_rtx(mp, offset_fsb); + end_fsb = xfs_fileoff_rounddown_rtx(mp, end_fsb); + } + if (!xfs_inode_has_cow_data(ip)) return 0; if (!xfs_iext_lookup_extent_before(ip, ifp, &end_fsb, &icur, &got)) @@ -923,6 +951,7 @@ xfs_reflink_end_cow( xfs_off_t offset, xfs_off_t count) { + struct xfs_mount *mp = ip->i_mount; xfs_fileoff_t offset_fsb; xfs_fileoff_t end_fsb; int error = 0; @@ -932,6 +961,16 @@ xfs_reflink_end_cow( offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset); end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count); + /* + * Make sure the end is aligned with a rt extent (if desired), since + * the end of the range could be EOF. The _convert_cow function should + * have set us up to swap only full rt extents. + */ + if (xfs_inode_needs_cow_around(ip)) { + offset_fsb = xfs_fileoff_rounddown_rtx(mp, offset_fsb); + end_fsb = xfs_fileoff_roundup_rtx(mp, end_fsb); + } + /* * Walk forwards until we've remapped the I/O range. The loop function * repeatedly cycles the ILOCK to allocate one transaction per remapped diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 8af9c38bea152f..e744f9435ff88d 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -3970,6 +3970,7 @@ TRACE_EVENT(xfs_ioctl_clone, /* unshare tracepoints */ DEFINE_SIMPLE_IO_EVENT(xfs_reflink_unshare); +DEFINE_SIMPLE_IO_EVENT(xfs_file_cow_around); DEFINE_INODE_ERROR_EVENT(xfs_reflink_unshare_error); #ifdef CONFIG_XFS_RT DEFINE_SIMPLE_IO_EVENT(xfs_convert_rtbigalloc_file_space); From patchwork Fri Dec 13 01:22:45 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Darrick J. 
Wong" X-Patchwork-Id: 13906315 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 920CB10F7 for ; Fri, 13 Dec 2024 01:22:46 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1734052966; cv=none; b=cX3FJkOnXROPG6vCGjs4qzmCgZw3DtzbTWmUqCb+9EQWZi1uH1jI63WSPAsDaLev2WvBygduI5vgFSlWrP/D3A+0p4Bs8PMSMw2ohxo80zOKlyUEjtlQj0hh0AdQB8V3PxSUWehUY9JNkyXLOvlf4QGlxxF7okBg+dlDRYKSvZw= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1734052966; c=relaxed/simple; bh=obt8JtmVqGfnEZXDgRPBqcNTF6MXDbdglCegtRtE85o=; h=Date:Subject:From:To:Cc:Message-ID:In-Reply-To:References: MIME-Version:Content-Type; b=p5oUY2KiDCMeQxdpv1fHrMBK4tvbqo3bUm9U9DHH6NBrqjTz30HERKA5N+VroPN3Px2aLGKzTBtAHixEgENFEFxGulyON8G57OBGwAIviMnLS5Rf3jaO489m8ZZxUA92ckGB6c5mxpbh3GtC0xefzs0QOZtABXVbiPY6IAaD4aU= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=j0OJxFLP; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="j0OJxFLP" Received: by smtp.kernel.org (Postfix) with ESMTPSA id 680D3C4CECE; Fri, 13 Dec 2024 01:22:46 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1734052966; bh=obt8JtmVqGfnEZXDgRPBqcNTF6MXDbdglCegtRtE85o=; h=Date:Subject:From:To:Cc:In-Reply-To:References:From; b=j0OJxFLPcT8DdEORVtsl29kC8MS6z4FY26zDAoRV1WfXaJEOsGMh8HCMd6mLCwugW whwKeCJPiM6iILyZeAYwODtFTJB1RvEBSq+1ed4LxuN9ELduRTZwZ6u2c8orcBBMtS o6gbMwf9YrKHgyZxbsXxfzD+YPYaFyUnFZgbGfj1wpANvxuP3y7uw+snHrZZdQ5a/d 
RKd0KmDhXsJi3UIbMbKo0fQ2LioRui8qkpkqLocTwilnOYHglB18TuOhNTC6XkTrUR q4qyUIoPTdnPMTIRqySfI2gXGyzV1Pj5Wn7ZJbNzk3SvtSJoZ8RUDEaImWHtWdasDx FSzn9SG7aP7uQ== Date: Thu, 12 Dec 2024 17:22:45 -0800 Subject: [PATCH 05/11] xfs: forcibly convert unwritten blocks within an rt extent before sharing From: "Darrick J. Wong" To: djwong@kernel.org Cc: hch@lst.de, linux-xfs@vger.kernel.org Message-ID: <173405125829.1184063.12093363819984841679.stgit@frogsfrogsfrogs> In-Reply-To: <173405125712.1184063.11685981006674346615.stgit@frogsfrogsfrogs> References: <173405125712.1184063.11685981006674346615.stgit@frogsfrogsfrogs> Precedence: bulk X-Mailing-List: linux-xfs@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Darrick J. Wong As noted in the previous patch, XFS can only unmap and map full rt extents. This means that we cannot stop mid-extent for any reason, including stepping around unwritten/written extents. Second, the reflink and CoW mechanisms were not designed to handle shared unwritten extents, so we have to do something to get rid of them. If the user asks us to remap two files, we must scan both ranges beforehand to convert any unwritten extents that are not aligned to rt extent boundaries into zeroed written extents before sharing. Signed-off-by: "Darrick J. Wong" --- fs/xfs/xfs_reflink.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 4f87f7041995c4..82ceec8517a020 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -1666,6 +1666,25 @@ xfs_reflink_remap_prep( if (ret) goto out_unlock; + /* + * Now that we've marked both inodes for reflink, make sure that all + * allocation units (AU) mapped into either files' ranges are either + * wholly written, wholly unwritten, or holes. The bmap code requires + * that we align all unmap and remap requests to an AU. 
We've already + * flushed the page cache and finished directio for the range that's + * being remapped, so we can convert the mappings directly. + */ + if (xfs_inode_has_bigrtalloc(src)) { + ret = xfs_convert_rtbigalloc_file_space(src, pos_in, *len); + if (ret) + goto out_unlock; + } + if (xfs_inode_has_bigrtalloc(dest)) { + ret = xfs_convert_rtbigalloc_file_space(dest, pos_out, *len); + if (ret) + goto out_unlock; + } + /* * If pos_out > EOF, we may have dirtied blocks between EOF and * pos_out. In that case, we need to extend the flush and unmap to cover From patchwork Fri Dec 13 01:23:01 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Darrick J. Wong" X-Patchwork-Id: 13906316 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 39DF61078F for ; Fri, 13 Dec 2024 01:23:02 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1734052982; cv=none; b=Q5mae6TIkW4bVaQByTiYpDY4QJ2knTN/YwPB+ZGKL0cg8kw7KcjlIHEC/FkVbyQf0IlF2Dcf1H7t6aEeXDDWLFWUg9TC1iJf98HgqZbD8VDYRpSJnc741Jstk1TI8EPwolbFeWTDd770X/Ga9i4fiGrQOh6ZIFPz6CWDOobMtGQ= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1734052982; c=relaxed/simple; bh=MGXPCPb2rp08r7BvOWJ0lcc715YkjlICeVLVxxChSww=; h=Date:Subject:From:To:Cc:Message-ID:In-Reply-To:References: MIME-Version:Content-Type; b=CvDDHqwAVXk7Nla8z+xOP2Fm9i3Is3TFeumfXtbtp035V/jmEHo/8cWuSq5DYELLmSj7Acwnwb6aOE9AZZH6OCkeKDdYzNNoWVcnotdlWPfCsl31tFw034bk4w3ul7BytJ6oH5cokVvbDGoGE3BICAgQMnbYhtbbi5fd4iniP5U= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=QjYQ0loj; arc=none 
smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="QjYQ0loj" Received: by smtp.kernel.org (Postfix) with ESMTPSA id 14F12C4CECE; Fri, 13 Dec 2024 01:23:02 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1734052982; bh=MGXPCPb2rp08r7BvOWJ0lcc715YkjlICeVLVxxChSww=; h=Date:Subject:From:To:Cc:In-Reply-To:References:From; b=QjYQ0lojW5tOzKcZx2JyBgH83gejwzKSZQNvypdmmeplPfRMwAYw4vNf9MSIL3HZ/ xgSjx14dNXjDJqEcw3KqKzEr4DMiYZiJXN8BMf3iJ2UwUT5oRyDD1+UrdcKvoSKrvA gVIIifo2mAn9YA0g/D330VMwKCijO0uPwWH2WiRbhofK5OHiH0PDPstOOCKlE7e7pu UlYvO/2srTUsoUZNv4+n/6SfwsPaPrAbUaadXqO/MWuyZK3p3q5c8LzxBGn/XGiJw3 j4gWd2vmCwIS++Q/+WMV5f5Sc5sQEPXnr7iEJalade/Jw0fbiKE7MtsSsXknUHsKsQ MzSDPZnhgoQIw== Date: Thu, 12 Dec 2024 17:23:01 -0800 Subject: [PATCH 06/11] xfs: add some tracepoints for writeback From: "Darrick J. Wong" To: djwong@kernel.org Cc: hch@lst.de, linux-xfs@vger.kernel.org Message-ID: <173405125846.1184063.6516078668233318926.stgit@frogsfrogsfrogs> In-Reply-To: <173405125712.1184063.11685981006674346615.stgit@frogsfrogsfrogs> References: <173405125712.1184063.11685981006674346615.stgit@frogsfrogsfrogs> Precedence: bulk X-Mailing-List: linux-xfs@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Darrick J. Wong Add a tracepoint so I can see where writeback is initiated. Signed-off-by: "Darrick J. 
Wong" --- fs/xfs/xfs_aops.c | 19 ++++++++++++------- fs/xfs/xfs_trace.h | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 46 insertions(+), 7 deletions(-) diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 559a3a57709748..f51f2f5f76d0f6 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -468,21 +468,26 @@ static const struct iomap_writeback_ops xfs_writeback_ops = { STATIC int xfs_vm_writepages( - struct address_space *mapping, - struct writeback_control *wbc) + struct address_space *mapping, + struct writeback_control *wbc) { - struct xfs_writepage_ctx wpc = { }; + struct xfs_writepage_ctx wpc = { }; + struct xfs_inode *ip = XFS_I(mapping->host); - xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED); + trace_xfs_vm_writepages(ip, wbc); + + xfs_iflags_clear(ip, XFS_ITRUNCATED); return iomap_writepages(mapping, wbc, &wpc.ctx, &xfs_writeback_ops); } STATIC int xfs_dax_writepages( - struct address_space *mapping, - struct writeback_control *wbc) + struct address_space *mapping, + struct writeback_control *wbc) { - struct xfs_inode *ip = XFS_I(mapping->host); + struct xfs_inode *ip = XFS_I(mapping->host); + + trace_xfs_dax_writepages(ip, wbc); xfs_iflags_clear(ip, XFS_ITRUNCATED); return dax_writeback_mapping_range(mapping, diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index e744f9435ff88d..0234af78cea9a1 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -1554,6 +1554,40 @@ DEFINE_IMAP_EVENT(xfs_map_blocks_alloc); DEFINE_IMAP_EVENT(xfs_iomap_alloc); DEFINE_IMAP_EVENT(xfs_iomap_found); +DECLARE_EVENT_CLASS(xfs_writeback_class, + TP_PROTO(struct xfs_inode *ip, const struct writeback_control *wbc), + TP_ARGS(ip, wbc), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(loff_t, range_start) + __field(loff_t, range_end) + __field(long, nr_to_write) + __field(enum writeback_sync_modes, sync_mode) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->range_start 
= wbc->range_start; + __entry->range_end = wbc->range_end; + __entry->nr_to_write = wbc->nr_to_write; + __entry->sync_mode = wbc->sync_mode; + ), + TP_printk("dev %d:%d ino 0x%llx range_start 0x%llx range_end 0x%llx nr_to_write %ld sync_mode %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->range_start, + __entry->range_end, + __entry->nr_to_write, + __entry->sync_mode) +); +#define DEFINE_WRITEBACK_EVENT(name) \ +DEFINE_EVENT(xfs_writeback_class, name, \ + TP_PROTO(struct xfs_inode *ip, const struct writeback_control *wbc), \ + TP_ARGS(ip, wbc)) +DEFINE_WRITEBACK_EVENT(xfs_vm_writepages); +DEFINE_WRITEBACK_EVENT(xfs_dax_writepages); + DECLARE_EVENT_CLASS(xfs_simple_io_class, TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, u64 count), TP_ARGS(ip, offset, count), From patchwork Fri Dec 13 01:23:17 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Darrick J. Wong" X-Patchwork-Id: 13906317 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id D358617BA1 for ; Fri, 13 Dec 2024 01:23:17 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1734052997; cv=none; b=C3f3XRKsnupjUw2xXLl4AJRtdudDvbU5VJozkJKwR5JmGjJteef+klLxqYUknq75pmk3jqfWUQ3XZ2oaIz4lbDM3/oeO7mBzSmudxHY2OMlRELjqM7ntQ8buina0SyjYRJRa8CXvSRarc/eO9gQp7ypcF5rp1rqgestPFWnkEYE= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1734052997; c=relaxed/simple; bh=kKz9K3F+lPeE+yIdMEqt89LUeSNml2IE66ly0hGswGI=; h=Date:Subject:From:To:Cc:Message-ID:In-Reply-To:References: MIME-Version:Content-Type; 
b=syPpqrO67u8fjH9OtPer1sPpRkRNg6/9XOgs/vXR7ykEqPXV+05ROiTfklgTK7dd0CZ6EidG9FUEYbyKEE23lSvYZ+c6WAfyWMZZlfVrdrGQ8ZMNqmLG1MzuU6JI2zdkZ4OjPbgG6G7zP/Zd3i79faXFlTWq+YfK+4Zfefn6OW4= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=rghH5f8X; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="rghH5f8X" Received: by smtp.kernel.org (Postfix) with ESMTPSA id AB3FEC4CECE; Fri, 13 Dec 2024 01:23:17 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1734052997; bh=kKz9K3F+lPeE+yIdMEqt89LUeSNml2IE66ly0hGswGI=; h=Date:Subject:From:To:Cc:In-Reply-To:References:From; b=rghH5f8XGlNltI8pMyG3CWuva7IRzHkq4MHmbG4J5lrsbgn5Ilbdz/q3ISV9vDL/o 1XkKNrciyR1cPFUkJKBKPpGD52dyXwiVOHl+h5XlgQqFp1HPwT8cEZYSXremH5y7pZ xSZ+vkVTSEhmS4zvY73dZUBFXYBGvImXOn+r2FvO9svM5HpbU2cLb1AXncfyz+PKyu c6QOU1hsmkFCs07K7sMrv6L9V1IgqgBfKX8kOXsZky3/Z+NpxsJ2SEpkJ99iDFgljy AvahV+QQbxII9nwtW+DfRaU02ghn+UwOidMV4qGS4q2UcKPXRhlWIRp7qH4qWVQ41t qpzZgtJdKtEUA== Date: Thu, 12 Dec 2024 17:23:17 -0800 Subject: [PATCH 07/11] xfs: extend writeback requests to handle rt cow correctly From: "Darrick J. Wong" To: djwong@kernel.org Cc: hch@lst.de, linux-xfs@vger.kernel.org Message-ID: <173405125863.1184063.8842755288883819617.stgit@frogsfrogsfrogs> In-Reply-To: <173405125712.1184063.11685981006674346615.stgit@frogsfrogsfrogs> References: <173405125712.1184063.11685981006674346615.stgit@frogsfrogsfrogs> Precedence: bulk X-Mailing-List: linux-xfs@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Darrick J. Wong If we have shared realtime files and the rt extent size is larger than a single fs block, we need to extend writeback requests to be aligned to rt extent size granularity because we cannot share partial rt extents. 
The front end should have set us up for this by dirtying the relevant ranges. Signed-off-by: "Darrick J. Wong" --- fs/xfs/xfs_aops.c | 38 ++++++++++++++++++++++++++++++++++++++ fs/xfs/xfs_trace.h | 1 + 2 files changed, 39 insertions(+) diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index f51f2f5f76d0f6..9bc2d7d92e4c46 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -466,6 +466,38 @@ static const struct iomap_writeback_ops xfs_writeback_ops = { .discard_folio = xfs_discard_folio, }; +/* + * Extend the writeback range to allocation unit granularity and alignment. + * This is a requirement for blocksize > pagesize scenarios such as realtime + * copy on write, since we can only share full rt extents. + */ +static inline void +xfs_vm_writepages_extend( + struct xfs_inode *ip, + struct writeback_control *wbc) +{ + unsigned int bsize = xfs_inode_alloc_unitsize(ip); + long long int pages_to_write; + loff_t next = wbc->range_end + 1; + + wbc->range_start = rounddown_64(wbc->range_start, bsize); + if (wbc->range_end != LLONG_MAX) + wbc->range_end = roundup_64(next, bsize) - 1; + + if (wbc->nr_to_write != LONG_MAX) { + pgoff_t pg_start = wbc->range_start >> PAGE_SHIFT; + pgoff_t pg_next = (wbc->range_end + 1) >> PAGE_SHIFT; + + pages_to_write = pg_next - pg_start; + if (pages_to_write >= LONG_MAX) + pages_to_write = LONG_MAX; + if (wbc->nr_to_write < pages_to_write) + wbc->nr_to_write = pages_to_write; + } + + trace_xfs_vm_writepages_extend(ip, wbc); +} + STATIC int xfs_vm_writepages( struct address_space *mapping, @@ -476,6 +508,9 @@ xfs_vm_writepages( trace_xfs_vm_writepages(ip, wbc); + if (xfs_inode_needs_cow_around(ip)) + xfs_vm_writepages_extend(ip, wbc); + xfs_iflags_clear(ip, XFS_ITRUNCATED); return iomap_writepages(mapping, wbc, &wpc.ctx, &xfs_writeback_ops); } @@ -489,6 +524,9 @@ xfs_dax_writepages( trace_xfs_dax_writepages(ip, wbc); + if (xfs_inode_needs_cow_around(ip)) + xfs_vm_writepages_extend(ip, wbc); + xfs_iflags_clear(ip, XFS_ITRUNCATED); return 
dax_writeback_mapping_range(mapping, xfs_inode_buftarg(ip)->bt_daxdev, wbc); diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 0234af78cea9a1..021ea65909c915 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -1586,6 +1586,7 @@ DEFINE_EVENT(xfs_writeback_class, name, \ TP_PROTO(struct xfs_inode *ip, const struct writeback_control *wbc), \ TP_ARGS(ip, wbc)) DEFINE_WRITEBACK_EVENT(xfs_vm_writepages); +DEFINE_WRITEBACK_EVENT(xfs_vm_writepages_extend); DEFINE_WRITEBACK_EVENT(xfs_dax_writepages); DECLARE_EVENT_CLASS(xfs_simple_io_class, From patchwork Fri Dec 13 01:23:32 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Darrick J. Wong" X-Patchwork-Id: 13906318 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 7158918EA2 for ; Fri, 13 Dec 2024 01:23:33 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1734053013; cv=none; b=cKbpZMMag2lC+qviRynFSs+ZNcpb5HiTJRgbjLUL0m5Cz4/Zumd1p/02/5fK4FXGmZDSCjrBVe3fv1Vhloicc+w/lolcTslvd9VsxCXKdyH+Uh9KSr7ZYFr0/sgRLKl+JNvbWDNyddLcze6Ik9nL2XDsu7T5unUXhYlmpcUhDU4= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1734053013; c=relaxed/simple; bh=tryhAMq0YVG3vdMS3dD/LVsVxxAycb49ibeO25iUKRU=; h=Date:Subject:From:To:Cc:Message-ID:In-Reply-To:References: MIME-Version:Content-Type; b=DauRkGezXzArJSBoB44ayX/sz1afxKG1beBFB0mgqeF8t1xhHAiZ3vTVKpIFYRg1qiTYziNYCR6vfQyeQm6sssS3unAa8s7C5CY1zHhbQtScJFRo+lvSN3a1jfOXUKSaTUh7/gdbXBHr6sosoXtkgOr4A9p+cDayOj9yv/laZg4= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=JCoyzNzR; arc=none 
smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="JCoyzNzR" Received: by smtp.kernel.org (Postfix) with ESMTPSA id 49182C4CECE; Fri, 13 Dec 2024 01:23:33 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1734053013; bh=tryhAMq0YVG3vdMS3dD/LVsVxxAycb49ibeO25iUKRU=; h=Date:Subject:From:To:Cc:In-Reply-To:References:From; b=JCoyzNzRQ/hKpyBu3OHStJFJSE4Ttx4xmp9hc6/5SvPSV0xMRXNKWCQdGQRqEenrm 8wLeFQbnD4eghi6GaX4eFM6ZGETcmj1fgENjPzihq0iJj7TOZLe3NZHvW6y1jaho9S vnN57avg4mKrPkLrEYrP5/EuLlhKNyZSSxavsQ08Jt9qY4DKYD0LRbyu1UoLkaLTpW AVKGi7cu2bDh5V4t/nLapcFSnKyEykt46aaxYuS40XOXXvViTp0yAqLwa/LN6p89SJ K4AOV8JhhUdVxSnlMVebzp+++gQ21C09piuVyXXw3tnNhgkFwkVkv9IiwOY2iDAWN8 yZj7mwRwgAErQ== Date: Thu, 12 Dec 2024 17:23:32 -0800 Subject: [PATCH 08/11] xfs: enable extent size hints for CoW when rtextsize > 1 From: "Darrick J. Wong" To: djwong@kernel.org Cc: hch@lst.de, linux-xfs@vger.kernel.org Message-ID: <173405125880.1184063.8755676628520114568.stgit@frogsfrogsfrogs> In-Reply-To: <173405125712.1184063.11685981006674346615.stgit@frogsfrogsfrogs> References: <173405125712.1184063.11685981006674346615.stgit@frogsfrogsfrogs> Precedence: bulk X-Mailing-List: linux-xfs@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Darrick J. Wong CoW extent size hints are not allowed on filesystems that have large realtime extents because we only want to perform the minimum required amount of write-around (aka write amplification) for shared extents. On filesystems where rtextsize > 1, allocations can only be done in units of full rt extents, which means that we can only map an entire rt extent's worth of blocks into the data fork. Hole punch requests become conversions to unwritten if the request isn't aligned properly. 
Because a copy-write fundamentally requires remapping, this means that we also can only do copy-writes of a full rt extent. This is too expensive for large hint sizes, since it's all or nothing. Signed-off-by: "Darrick J. Wong" --- fs/xfs/libxfs/xfs_bmap.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 40ad22fb808b95..e1aac1711f553f 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -6525,6 +6525,28 @@ xfs_get_cowextsz_hint( if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) a = ip->i_cowextsize; if (XFS_IS_REALTIME_INODE(ip)) { + /* + * For realtime files, the realtime extent is the fundamental + * unit of allocation. This means that data sharing and CoW + * remapping can only be done in those units. For filesystems + * where the extent size is larger than one block, write + * requests that are not aligned to an extent boundary employ + * an unshare-around strategy to ensure that all pages for a + * shared extent are fully dirtied. + * + * Because the remapping alignment requirement applies equally + * to all CoW writes, any regular overwrites that could be + * turned (by a speculative CoW preallocation) into a CoW write + * must either employ this dirty-around strategy, or be smart + * enough to ignore the CoW fork mapping unless the entire + * extent is dirty or becomes shared by writeback time. Doing + * the first would dramatically increase write amplification, + * and the second would require deeper insight into the state + * of the page cache during a writeback request. For now, we + * ignore the hint. + */ + if (ip->i_mount->m_sb.sb_rextsize > 1) + return ip->i_mount->m_sb.sb_rextsize; b = 0; if (ip->i_diflags & XFS_DIFLAG_EXTSIZE) b = ip->i_extsize; From patchwork Fri Dec 13 01:23:48 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Darrick J. 
Wong" X-Patchwork-Id: 13906319 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 1C10421345 for ; Fri, 13 Dec 2024 01:23:49 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1734053029; cv=none; b=KShEssr+bzUx/tmEkCUgtTBvLPnCpuM1ruHajYNuIFIha1raVkf78ejil8GXkISOMwDp2U2cBcggu38PRI8S386nQ2uMcAfPs/CQXs1biiNmLsuBXeZnzKOZG9Jze4p6TY8yBhn3hUg6IS6dyIB54zBlWkR6bkDY2pNmA2Uvm/I= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1734053029; c=relaxed/simple; bh=CA3zQGiztERvQtDTb+QA0vzkbRbztOQJVB13SigRqIA=; h=Date:Subject:From:To:Cc:Message-ID:In-Reply-To:References: MIME-Version:Content-Type; b=b3gjcpMeG0tRGkEILbLvSFBZTuUwHzElkAUNqKtFirxkMk6LNh3PtasGTAyIG8sgyUo3lSylS9V6LMroTiQ06DcZ2F7xru6geFj/96GhGbqa0Q4hWD7owIYaBkjzCeojM2NUXKgn86v4G1OBBz8cA0HvuHa7DW0RWKjNcuVNL9k= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=ZVQxZTnS; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="ZVQxZTnS" Received: by smtp.kernel.org (Postfix) with ESMTPSA id E1F5DC4CECE; Fri, 13 Dec 2024 01:23:48 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1734053028; bh=CA3zQGiztERvQtDTb+QA0vzkbRbztOQJVB13SigRqIA=; h=Date:Subject:From:To:Cc:In-Reply-To:References:From; b=ZVQxZTnSeo2MZIPnCAN7Xv/rN4nKUk+8KLbGlKHmIBOUN+UP4pS1/OAXPOaGQbEp2 /235627snhSmV8wRKXNEQ4VZEsoQrqGN4OZRozpkfEyq6nJeBl8kkLveGWccEhyaZy oRa3Xhd1PWJyDZhCQ7EzP7/ebnpzRsppaVg4fsTHhlFolGcl61UoglCMagaylKloPF 
Uh2un/uDRzmJwwLd55OFfraYzu3bDrjJETzf0UvJ06pyqUAq3k33YocGlFH1TE3h/9 2tblW7kSE0TSQPfk+6qkvC1bOgJldJeV9qv8Se+ZgxYvrVcfe6GMH49uv0Cn4a230+ 7gnMYgVaEh+Yw== Date: Thu, 12 Dec 2024 17:23:48 -0800 Subject: [PATCH 09/11] xfs: allow reflink on the rt volume when extent size is larger than 1 rt block From: "Darrick J. Wong" To: djwong@kernel.org Cc: hch@lst.de, linux-xfs@vger.kernel.org Message-ID: <173405125896.1184063.11119572969501198910.stgit@frogsfrogsfrogs> In-Reply-To: <173405125712.1184063.11685981006674346615.stgit@frogsfrogsfrogs> References: <173405125712.1184063.11685981006674346615.stgit@frogsfrogsfrogs> Precedence: bulk X-Mailing-List: linux-xfs@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Darrick J. Wong Make the necessary tweaks to the reflink remapping code to support remapping on the realtime volume when the rt extent size is larger than a single rt block. We need to check that the remap arguments from userspace are aligned to a rt extent boundary, and that the length is always aligned, even if the kernel tried to round it up to EOF for us. XFS can only map and remap full rt extents, so we have to be a little more strict about the alignment there. Signed-off-by: "Darrick J. Wong" --- fs/xfs/xfs_reflink.c | 91 +++++++++++++++++++++++++++++++++++++++++++++----- fs/xfs/xfs_super.c | 2 + fs/xfs/xfs_trace.h | 3 ++ 3 files changed, 85 insertions(+), 11 deletions(-) diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 82ceec8517a020..0222b78dedd92d 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -1506,6 +1506,13 @@ xfs_reflink_remap_blocks( len = min_t(xfs_filblks_t, XFS_B_TO_FSB(mp, remap_len), XFS_MAX_FILEOFF); + /* + * Make sure the end is aligned with an allocation unit, even if it's + * past EOF. 
+ */ + if (xfs_inode_has_bigrtalloc(dest)) + len = xfs_blen_roundup_rtx(mp, len); + trace_xfs_reflink_remap_blocks(src, srcoff, len, dest, destoff); while (len > 0) { @@ -1580,6 +1587,57 @@ xfs_reflink_zero_posteof( return xfs_zero_range(ip, isize, pos - isize, NULL); } +#ifdef CONFIG_XFS_RT +/* + * Adjust the length of the remap operation to end on an allocation unit (AU) + * boundary. + */ +STATIC int +xfs_reflink_adjust_rtbigalloc_len( + struct xfs_inode *src, + loff_t pos_in, + struct xfs_inode *dest, + loff_t pos_out, + loff_t *len, + unsigned int remap_flags) +{ + unsigned int alloc_unit = xfs_inode_alloc_unitsize(src); + uint32_t mod; + + div_u64_rem(*len, alloc_unit, &mod); + + /* + * We previously checked the AU alignment of both offsets, so we now + * have to check the AU alignment of the length. The VFS remap prep + * function can change the length on us, so we can only make length + * adjustments after that. If the length is aligned to an AU, we're + * good to go. + * + * Otherwise, the length is not aligned to an AU. If the source file's + * range ends at EOF, the VFS ensured that the dest file's range also + * ends at EOF. The actual remap function will round the (byte) length + * up to the nearest AU, so we're ok here too. + */ + if (mod == 0 || pos_in + *len == i_size_read(VFS_I(src))) + return 0; + + /* + * Otherwise, the only thing we can do is round the request length down + * to an AU boundary. If the caller doesn't allow that, we cannot move + * forward. + */ + if (!(remap_flags & REMAP_FILE_CAN_SHORTEN)) + return -EINVAL; + + /* Back off by a single extent. */ + (*len) -= mod; + trace_xfs_reflink_adjust_rtbigalloc_len(src, pos_in, *len, dest, pos_out); + return 0; +} +#else +# define xfs_reflink_adjust_rtbigalloc_len(...) (0) +#endif /* CONFIG_XFS_RT */ + /* * Prepare two files for range cloning. 
Upon a successful return both inodes * will have the iolock and mmaplock held, the page cache of the out file will @@ -1622,6 +1680,7 @@ xfs_reflink_remap_prep( struct xfs_inode *src = XFS_I(inode_in); struct inode *inode_out = file_inode(file_out); struct xfs_inode *dest = XFS_I(inode_out); + const struct iomap_ops *dax_read_ops = NULL; int ret; /* Lock both files against IO */ @@ -1639,15 +1698,25 @@ xfs_reflink_remap_prep( if (IS_DAX(inode_in) != IS_DAX(inode_out)) goto out_unlock; - if (!IS_DAX(inode_in)) - ret = generic_remap_file_range_prep(file_in, pos_in, file_out, - pos_out, len, remap_flags); - else - ret = dax_remap_file_range_prep(file_in, pos_in, file_out, - pos_out, len, remap_flags, &xfs_read_iomap_ops); + ASSERT(is_power_of_2(xfs_inode_alloc_unitsize(dest))); + + if (IS_DAX(inode_in)) + dax_read_ops = &xfs_read_iomap_ops; + + ret = __generic_remap_file_range_prep(file_in, pos_in, file_out, + pos_out, len, remap_flags, dax_read_ops, + xfs_inode_alloc_unitsize(dest)); if (ret || *len == 0) goto out_unlock; + /* Adjust the end to align to an allocation unit. */ + if (xfs_inode_has_bigrtalloc(src)) { + ret = xfs_reflink_adjust_rtbigalloc_len(src, pos_in, dest, + pos_out, len, remap_flags); + if (ret || *len == 0) + goto out_unlock; + } + /* Attach dquots to dest inode before changing block map */ ret = xfs_qm_dqattach(dest); if (ret) @@ -1896,11 +1965,13 @@ xfs_reflink_supports_rextsize( return false; /* - * Reflink doesn't support rt extent size larger than a single fsblock - * because we would have to perform CoW-around for unaligned write - * requests to guarantee that we always remap entire rt extents. + * Reflink doesn't support file allocation units larger than a single + * block and not a power of two because we would have to perform + * CoW-around for unaligned write requests to guarantee that we always + * remap entire allocation units and the reflink code cannot yet handle + * rounding ranges to align to non powers of two. 
*/ - if (rextsize != 1) + if (!is_power_of_2(rextsize)) return false; return true; diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 0fa7b7cc75c146..c91b9467a3eef8 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1757,7 +1757,7 @@ xfs_fs_fill_super( if (xfs_has_realtime(mp) && !xfs_reflink_supports_rextsize(mp, mp->m_sb.sb_rextsize)) { xfs_alert(mp, - "reflink not compatible with realtime extent size %u!", + "reflink not compatible with non-power-of-2 realtime extent size %u!", mp->m_sb.sb_rextsize); error = -EINVAL; goto out_filestream_unmount; diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 021ea65909c915..b218786e734df0 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -3965,6 +3965,9 @@ TRACE_EVENT(xfs_reflink_remap_blocks, __entry->dest_lblk) ); DEFINE_DOUBLE_IO_EVENT(xfs_reflink_remap_range); +#ifdef CONFIG_XFS_RT +DEFINE_DOUBLE_IO_EVENT(xfs_reflink_adjust_rtbigalloc_len); +#endif /* CONFIG_XFS_RT */ DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_range_error); DEFINE_INODE_ERROR_EVENT(xfs_reflink_set_inode_flag_error); DEFINE_INODE_ERROR_EVENT(xfs_reflink_update_inode_size_error); From patchwork Fri Dec 13 01:24:04 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Darrick J. 
Wong" X-Patchwork-Id: 13906325 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id B437118EA2 for ; Fri, 13 Dec 2024 01:24:04 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1734053044; cv=none; b=ZqFIKeRQtmY4rD+TqWxa5CYUvRe1iNCXekiPqyPUFKFoK+wgu5v+YKytojQQRgA5vCDv9chDAZ1FqQ+wE21iV6CFYQ4tGHaRg+0oVnZnag/W79DYbsHRQWIRtHJFGNqngmqFPdfHuz3vDFqQot18duCk4KbF0QAx5eiSbvi3KE0= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1734053044; c=relaxed/simple; bh=IdDYYjRNrpi6gmPsYhhzs8I1wZiq+3261RkKgjFDRSU=; h=Date:Subject:From:To:Cc:Message-ID:In-Reply-To:References: MIME-Version:Content-Type; b=kzwtIra1CmP6AIRELYjDB41SYAEyzbC7N0j9mLjIuf8Og4U70mEc69InFzZLmTxeeQRtb56vsEvyoyrF9a00Nea/GbBR+hxAG7sX1FL5u4W8RgTiFZJD8pmgoXSWtuVknl31xlq3fkqhhRRyYAkzZOjlMeY38QOr1c3u5gHSOyI= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=YwpZDQnX; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="YwpZDQnX" Received: by smtp.kernel.org (Postfix) with ESMTPSA id 8F69DC4CED3; Fri, 13 Dec 2024 01:24:04 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1734053044; bh=IdDYYjRNrpi6gmPsYhhzs8I1wZiq+3261RkKgjFDRSU=; h=Date:Subject:From:To:Cc:In-Reply-To:References:From; b=YwpZDQnXBgF1MP2IqHHfOz+4kZ4hhDHCBRCu2L1ZZYPccZQ81R8PTrK2m1TABVTNz PEo0+dD92myTh3yzHZtkP7s6GvCrdA5JSPMmmrXjF4eVeZtLmrXndqsq3/oIT44V/m s5AP6vApLgUrO+KnXw7X4pKwc9egs9278wRo2dcIaIUnQbGOb7ILTJ+ofgOr8u83fh 
xCVFx9yzs9sUyBcaRKjJe3xUOblcmrgqckuofe2kdo/2o2JI87aUb+mTV3Kwbh8E8T Zzps9JY6FpdNp9EWeJbxss3R14zMWGXIXxGgGUYNCAjhxhISF2/l0ND5ZCm/Ba99xF 3MYvAStfbC4dQ== Date: Thu, 12 Dec 2024 17:24:04 -0800 Subject: [PATCH 10/11] xfs: fix integer overflow when validating extent size hints From: "Darrick J. Wong" To: djwong@kernel.org Cc: hch@lst.de, linux-xfs@vger.kernel.org Message-ID: <173405125912.1184063.5511687476090644949.stgit@frogsfrogsfrogs> In-Reply-To: <173405125712.1184063.11685981006674346615.stgit@frogsfrogsfrogs> References: <173405125712.1184063.11685981006674346615.stgit@frogsfrogsfrogs> Precedence: bulk X-Mailing-List: linux-xfs@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Darrick J. Wong Both file extent size hints are stored as 32-bit quantities, in units of filesystem blocks. As part of validating the hints, we convert these quantities to bytes to ensure that the hint is congruent with the file's allocation size. The maximum possible hint value is 2097151 (aka XFS_MAX_BMBT_EXTLEN). If the file allocation unit is larger than 2048, the unit conversion will exceed 32 bits in size, which overflows the uint32_t used to store the value used in the comparison. This isn't a problem for files on the data device since the hint will always be a multiple of the block size. However, this is a problem for realtime files because the rtextent size can be any integer number of fs blocks, and truncation of upper bits changes the outcome of division. Eliminate the overflow by performing the congruency check in units of blocks, not bytes. Otherwise, we get errors like this: $ truncate -s 500T /tmp/a $ mkfs.xfs -f -N /tmp/a -d extszinherit=2097151,rtinherit=1 -r extsize=28k illegal extent size hint 2097151, must be less than 2097151 and a multiple of 7. Signed-off-by: "Darrick J. 
Wong" --- fs/xfs/libxfs/xfs_inode_buf.c | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index f24fa628fecf1e..3fd1b03b4c78cc 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -819,13 +819,11 @@ xfs_inode_validate_extsize( bool rt_flag; bool hint_flag; bool inherit_flag; - uint32_t extsize_bytes; - uint32_t blocksize_bytes; + uint32_t alloc_unit = 1; rt_flag = (flags & XFS_DIFLAG_REALTIME); hint_flag = (flags & XFS_DIFLAG_EXTSIZE); inherit_flag = (flags & XFS_DIFLAG_EXTSZINHERIT); - extsize_bytes = XFS_FSB_TO_B(mp, extsize); /* * This comment describes a historic gap in this verifier function. @@ -854,9 +852,7 @@ xfs_inode_validate_extsize( */ if (rt_flag) - blocksize_bytes = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize); - else - blocksize_bytes = mp->m_sb.sb_blocksize; + alloc_unit = mp->m_sb.sb_rextsize; if ((hint_flag || inherit_flag) && !(S_ISDIR(mode) || S_ISREG(mode))) return __this_address; @@ -874,7 +870,7 @@ xfs_inode_validate_extsize( if (mode && !(hint_flag || inherit_flag) && extsize != 0) return __this_address; - if (extsize_bytes % blocksize_bytes) + if (extsize % alloc_unit) return __this_address; if (extsize > XFS_MAX_BMBT_EXTLEN) @@ -909,12 +905,10 @@ xfs_inode_validate_cowextsize( { bool rt_flag; bool hint_flag; - uint32_t cowextsize_bytes; - uint32_t blocksize_bytes; + uint32_t alloc_unit = 1; rt_flag = (flags & XFS_DIFLAG_REALTIME); hint_flag = (flags2 & XFS_DIFLAG2_COWEXTSIZE); - cowextsize_bytes = XFS_FSB_TO_B(mp, cowextsize); /* * Similar to extent size hints, a directory can be configured to @@ -929,9 +923,7 @@ xfs_inode_validate_cowextsize( */ if (rt_flag) - blocksize_bytes = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize); - else - blocksize_bytes = mp->m_sb.sb_blocksize; + alloc_unit = mp->m_sb.sb_rextsize; if (hint_flag && !xfs_has_reflink(mp)) return __this_address; @@ -946,7 +938,7 @@ xfs_inode_validate_cowextsize( if 
(mode && !hint_flag && cowextsize != 0) return __this_address; - if (cowextsize_bytes % blocksize_bytes) + if (cowextsize % alloc_unit) return __this_address; if (cowextsize > XFS_MAX_BMBT_EXTLEN) From patchwork Fri Dec 13 01:24:19 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Darrick J. Wong" X-Patchwork-Id: 13906326 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 5F2BA38385 for ; Fri, 13 Dec 2024 01:24:20 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1734053060; cv=none; b=JiRAeiYj/oPk51BxL+2U+bSkO7HEaJUKmY5gM0KMZPDZ4EQlhZsa5lBItVzCNsadN5rEVb5io3LuX6oAUhV/G05YcVO2XLDsXFWeW2K8bTECqxbIsAUUwZa7crZ01pSt9kwCHSJwziro2vRNf0PjDMByPvyjLYxGkoqugNyvGlY= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1734053060; c=relaxed/simple; bh=8A3OvtxShk/d/AttFSxe8ntzKAWvzGQnO9G6Ds5GA8Q=; h=Date:Subject:From:To:Cc:Message-ID:In-Reply-To:References: MIME-Version:Content-Type; b=RKbs+7S448yyiz+f7esxoAJsQYutUmrzMngZ9nfj/ZhWnVepRdm1UmCVKsX6f3Peq2opziZk694KNkSGJXUkduM5yfaHew+XFfhB3a0BtwiW7xMFdL0RqRoJCevsFk+EzeZlHQ7d/IEWKVutWHMnK41CboskRcp9pDJ3NxqMnQo= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=Ybo6el0V; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="Ybo6el0V" Received: by smtp.kernel.org (Postfix) with ESMTPSA id 37A11C4CECE; Fri, 13 Dec 2024 01:24:20 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1734053060; 
bh=8A3OvtxShk/d/AttFSxe8ntzKAWvzGQnO9G6Ds5GA8Q=; h=Date:Subject:From:To:Cc:In-Reply-To:References:From; b=Ybo6el0VDtl33ZNmsXN3laZqw5s0fiahZXuhuVYg/70bBeHDbuYcwOIkW4uBd3t3e osD9xAzHx3EB8NYSzsqdAvlS33ALYjUDbUVfzkxBWSQmjVsKbqUnTONKCrrRHGAYjo zashb4lZc+n60ll8Ac23gmL5qo1QzW7pHUF/55f0OTOM8X1l/+yo271UScbbNrl/8V Ssu9c4lIDRhbqyYqsf2bem0DP3aeSqY0icEPOVVKdS0cwAmKzCvil9fgurZ83BaCbT sygqwlYGoeMXy/fNqBxMeg9Xh/3v2uPbx7LU7hlJ6qpby5uTc/8H3ZGrdiGD7N5UfT hfVqz9PfqKqdQ== Date: Thu, 12 Dec 2024 17:24:19 -0800 Subject: [PATCH 11/11] xfs: support realtime reflink with an extent size that isn't a power of 2 From: "Darrick J. Wong" To: djwong@kernel.org Cc: hch@lst.de, linux-xfs@vger.kernel.org Message-ID: <173405125928.1184063.9203313014441349759.stgit@frogsfrogsfrogs> In-Reply-To: <173405125712.1184063.11685981006674346615.stgit@frogsfrogsfrogs> References: <173405125712.1184063.11685981006674346615.stgit@frogsfrogsfrogs> Precedence: bulk X-Mailing-List: linux-xfs@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Darrick J. Wong Add the necessary alignment checking code to the reflink remap code to ensure that remap requests are aligned to rt extent boundaries if the realtime extent size isn't a power of two. The VFS helpers assume that they can use the usual (blocksize - 1) masking to avoid slow 64-bit division, but since XFS is special we won't make everyone pay that cost for our weird edge case. Signed-off-by: "Darrick J. Wong" --- fs/xfs/xfs_reflink.c | 119 +++++++++++++++++++++++++++++++++++++------------- fs/xfs/xfs_reflink.h | 2 - fs/xfs/xfs_rtalloc.c | 4 -- fs/xfs/xfs_super.c | 9 ---- 4 files changed, 90 insertions(+), 44 deletions(-) diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 0222b78dedd92d..6ceb00565bab24 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -1638,6 +1638,83 @@ xfs_reflink_adjust_rtbigalloc_len( # define xfs_reflink_adjust_rtbigalloc_len(...) 
(0) #endif /* CONFIG_XFS_RT */ +/* + * Check the alignment of a remap request when the allocation unit size isn't a + * power of two. The VFS helpers use (fast) bitmask-based alignment checks, + * but here we have to use slow long division. + */ +static int +xfs_reflink_remap_check_rtalign( + struct xfs_inode *ip_in, + loff_t pos_in, + struct xfs_inode *ip_out, + loff_t pos_out, + loff_t *req_len, + unsigned int remap_flags) +{ + struct xfs_mount *mp = ip_in->i_mount; + uint32_t rextbytes; + loff_t in_size, out_size; + loff_t new_length, length = *req_len; + loff_t blen; + + rextbytes = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize); + in_size = i_size_read(VFS_I(ip_in)); + out_size = i_size_read(VFS_I(ip_out)); + + /* The start of both ranges must be aligned to a rt extent. */ + if (!isaligned_64(pos_in, rextbytes) || + !isaligned_64(pos_out, rextbytes)) + return -EINVAL; + + if (length == 0) + length = in_size - pos_in; + + /* + * If the user wanted us to exchange up to the infile's EOF, round up + * to the next block boundary for this check. + * + * Otherwise, reject the range length if it's not extent aligned. We + * already confirmed the starting offsets' extent alignment. + */ + if (pos_in + length == in_size) + blen = roundup_64(in_size, rextbytes) - pos_in; + else + blen = rounddown_64(length, rextbytes); + + /* Don't allow overlapped remappings within the same file. */ + if (ip_in == ip_out && + pos_out + blen > pos_in && + pos_in + blen > pos_out) + return -EINVAL; + + /* + * Ensure that we don't exchange a partial EOF extent into the middle + * of another file. + */ + if (isaligned_64(length, rextbytes)) + return 0; + + new_length = length; + if (pos_out + length < out_size) + new_length = rounddown_64(new_length, rextbytes); + + if (new_length == length) + return 0; + + /* + * Return the shortened request if the caller permits it. If the + * request was shortened to zero rt extents, we know that the original + * arguments weren't valid in the first place. 
+ */ + if ((remap_flags & REMAP_FILE_CAN_SHORTEN) && new_length > 0) { + *req_len = new_length; + return 0; + } + + return (remap_flags & REMAP_FILE_DEDUP) ? -EBADE : -EINVAL; +} + /* * Prepare two files for range cloning. Upon a successful return both inodes * will have the iolock and mmaplock held, the page cache of the out file will @@ -1681,6 +1758,7 @@ xfs_reflink_remap_prep( struct inode *inode_out = file_inode(file_out); struct xfs_inode *dest = XFS_I(inode_out); const struct iomap_ops *dax_read_ops = NULL; + unsigned int alloc_unit = xfs_inode_alloc_unitsize(dest); int ret; /* Lock both files against IO */ @@ -1698,14 +1776,22 @@ xfs_reflink_remap_prep( if (IS_DAX(inode_in) != IS_DAX(inode_out)) goto out_unlock; - ASSERT(is_power_of_2(xfs_inode_alloc_unitsize(dest))); + /* Check non-power of two alignment issues, if necessary. */ + if (XFS_IS_REALTIME_INODE(dest) && !is_power_of_2(alloc_unit)) { + ret = xfs_reflink_remap_check_rtalign(src, pos_in, dest, + pos_out, len, remap_flags); + if (ret) + goto out_unlock; + + /* Do the VFS checks with the regular block alignment. */ + alloc_unit = src->i_mount->m_sb.sb_blocksize; + } if (IS_DAX(inode_in)) dax_read_ops = &xfs_read_iomap_ops; ret = __generic_remap_file_range_prep(file_in, pos_in, file_out, - pos_out, len, remap_flags, dax_read_ops, - xfs_inode_alloc_unitsize(dest)); + pos_out, len, remap_flags, dax_read_ops, alloc_unit); if (ret || *len == 0) goto out_unlock; @@ -1949,30 +2035,3 @@ xfs_reflink_unshare( trace_xfs_reflink_unshare_error(ip, error, _RET_IP_); return error; } - -/* - * Can we use reflink with this realtime extent size? Note that we don't check - * for rblocks > 0 here because this can be called as part of attaching a new - * rt section. 
- */ -bool -xfs_reflink_supports_rextsize( - struct xfs_mount *mp, - unsigned int rextsize) -{ - /* reflink on the realtime device requires rtgroups */ - if (!xfs_has_rtgroups(mp)) - return false; - - /* - * Reflink doesn't support file allocation units larger than a single - * block and not a power of two because we would have to perform - * CoW-around for unaligned write requests to guarantee that we always - * remap entire allocation units and the reflink code cannot yet handle - * rounding ranges to align to non powers of two. - */ - if (!is_power_of_2(rextsize)) - return false; - - return true; -} diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h index cc4e92278279b6..3bfd7ab9e1148a 100644 --- a/fs/xfs/xfs_reflink.h +++ b/fs/xfs/xfs_reflink.h @@ -62,6 +62,4 @@ extern int xfs_reflink_remap_blocks(struct xfs_inode *src, loff_t pos_in, extern int xfs_reflink_update_dest(struct xfs_inode *dest, xfs_off_t newlen, xfs_extlen_t cowextsize, unsigned int remap_flags); -bool xfs_reflink_supports_rextsize(struct xfs_mount *mp, unsigned int rextsize); - #endif /* __XFS_REFLINK_H */ diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index d8e6d073d64dc9..586da450cc44b4 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -1295,9 +1295,7 @@ xfs_growfs_rt( goto out_unlock; if (xfs_has_reflink(mp)) goto out_unlock; - } else if (xfs_has_reflink(mp) && - !xfs_reflink_supports_rextsize(mp, in->extsize)) - goto out_unlock; + } error = xfs_sb_validate_fsb_count(&mp->m_sb, in->newblocks); if (error) diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index c91b9467a3eef8..8050fea541140a 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1754,15 +1754,6 @@ xfs_fs_fill_super( xfs_warn_experimental(mp, XFS_EXPERIMENTAL_METADIR); if (xfs_has_reflink(mp)) { - if (xfs_has_realtime(mp) && - !xfs_reflink_supports_rextsize(mp, mp->m_sb.sb_rextsize)) { - xfs_alert(mp, - "reflink not compatible with non-power-of-2 realtime extent size %u!", - 
mp->m_sb.sb_rextsize); - error = -EINVAL; - goto out_filestream_unmount; - } - /* * always-cow mode is not supported on filesystems with rt * extent sizes larger than a single block because we'd have