diff mbox series

[v31.0.1,1/1] xfs: introduce new file range commit ioctls

Message ID 20240824062927.GU865349@frogsfrogsfrogs (mailing list archive)
State New
Headers show
Series [v31.0.1,1/1] xfs: introduce new file range commit ioctls | expand

Commit Message

Darrick J. Wong Aug. 24, 2024, 6:29 a.m. UTC
From: Darrick J. Wong <djwong@kernel.org>

This patch introduces two more new ioctls to manage atomic updates to
file contents -- XFS_IOC_START_COMMIT and XFS_IOC_COMMIT_RANGE.  The
commit mechanism here is exactly the same as what XFS_IOC_EXCHANGE_RANGE
does, but with the additional requirement that file2 cannot have changed
since some sampling point.  The start-commit ioctl performs the sampling
of file attributes.

Note: This patch currently samples i_ctime during START_COMMIT and
checks that it hasn't changed during COMMIT_RANGE.  This isn't entirely
safe in kernels prior to 6.12 because ctime only had coarse grained
granularity and very fast updates could collide with a COMMIT_RANGE.
With the multi-granularity ctime introduced by Jeff Layton, it's now
possible to update ctime such that this does not happen.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 fs/xfs/libxfs/xfs_fs.h |   26 +++++++++
 fs/xfs/xfs_exchrange.c |  143 ++++++++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/xfs_exchrange.h |   16 +++++
 fs/xfs/xfs_ioctl.c     |    4 +
 fs/xfs/xfs_trace.h     |   57 +++++++++++++++++++
 5 files changed, 243 insertions(+), 3 deletions(-)

Comments

Jeff Layton Aug. 24, 2024, 12:11 p.m. UTC | #1
On Fri, 2024-08-23 at 23:29 -0700, Darrick J. Wong wrote:
> From: Darrick J. Wong <djwong@kernel.org>
> 
> This patch introduces two more new ioctls to manage atomic updates to
> file contents -- XFS_IOC_START_COMMIT and XFS_IOC_COMMIT_RANGE.  The
> commit mechanism here is exactly the same as what XFS_IOC_EXCHANGE_RANGE
> does, but with the additional requirement that file2 cannot have changed
> since some sampling point.  The start-commit ioctl performs the sampling
> of file attributes.
> 
> Note: This patch currently samples i_ctime during START_COMMIT and
> checks that it hasn't changed during COMMIT_RANGE.  This isn't entirely
> safe in kernels prior to 6.12 because ctime only had coarse grained
> granularity and very fast updates could collide with a COMMIT_RANGE.
> With the multi-granularity ctime introduced by Jeff Layton, it's now
> possible to update ctime such that this does not happen.
> 
> Signed-off-by: Darrick J. Wong <djwong@kernel.org>
> ---
>  fs/xfs/libxfs/xfs_fs.h |   26 +++++++++
>  fs/xfs/xfs_exchrange.c |  143 ++++++++++++++++++++++++++++++++++++++++++++++++
>  fs/xfs/xfs_exchrange.h |   16 +++++
>  fs/xfs/xfs_ioctl.c     |    4 +
>  fs/xfs/xfs_trace.h     |   57 +++++++++++++++++++
>  5 files changed, 243 insertions(+), 3 deletions(-)
> 
> diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
> index 454b63ef72016..c85c8077fac39 100644
> --- a/fs/xfs/libxfs/xfs_fs.h
> +++ b/fs/xfs/libxfs/xfs_fs.h
> @@ -825,6 +825,30 @@ struct xfs_exchange_range {
>  	__u64		flags;		/* see XFS_EXCHANGE_RANGE_* below */
>  };
>  
> +/*
> + * Using the same definition of file2 as struct xfs_exchange_range, commit the
> + * contents of file1 into file2 if file2 has the same inode number, mtime, and
> + * ctime as the arguments provided to the call.  The old contents of file2 will
> + * be moved to file1.
> + *
> + * Returns -EBUSY if there isn't an exact match for the file2 fields.
> + *
> + * Filesystems must be able to restart and complete the operation even after
> + * the system goes down.
> + */
> +struct xfs_commit_range {
> +	__s32		file1_fd;
> +	__u32		pad;		/* must be zeroes */
> +	__u64		file1_offset;	/* file1 offset, bytes */
> +	__u64		file2_offset;	/* file2 offset, bytes */
> +	__u64		length;		/* bytes to exchange */
> +
> +	__u64		flags;		/* see XFS_EXCHANGE_RANGE_* below */
> +
> +	/* opaque file2 metadata for freshness checks */
> +	__u64		file2_freshness[6];
> +};
> +
>  /*
>   * Exchange file data all the way to the ends of both files, and then exchange
>   * the file sizes.  This flag can be used to replace a file's contents with a
> @@ -997,6 +1021,8 @@ struct xfs_getparents_by_handle {
>  #define XFS_IOC_BULKSTAT	     _IOR ('X', 127, struct xfs_bulkstat_req)
>  #define XFS_IOC_INUMBERS	     _IOR ('X', 128, struct xfs_inumbers_req)
>  #define XFS_IOC_EXCHANGE_RANGE	     _IOW ('X', 129, struct xfs_exchange_range)
> +#define XFS_IOC_START_COMMIT	     _IOR ('X', 130, struct xfs_commit_range)
> +#define XFS_IOC_COMMIT_RANGE	     _IOW ('X', 131, struct xfs_commit_range)
>  /*	XFS_IOC_GETFSUUID ---------- deprecated 140	 */
>  
>  
> diff --git a/fs/xfs/xfs_exchrange.c b/fs/xfs/xfs_exchrange.c
> index c8a655c92c92f..d0889190ab7ff 100644
> --- a/fs/xfs/xfs_exchrange.c
> +++ b/fs/xfs/xfs_exchrange.c
> @@ -72,6 +72,34 @@ xfs_exchrange_estimate(
>  	return error;
>  }
>  
> +/*
> + * Check that file2's metadata agree with the snapshot that we took for the
> + * range commit request.
> + *
> + * This should be called after the filesystem has locked /all/ inode metadata
> + * against modification.
> + */
> +STATIC int
> +xfs_exchrange_check_freshness(
> +	const struct xfs_exchrange	*fxr,
> +	struct xfs_inode		*ip2)
> +{
> +	struct inode			*inode2 = VFS_I(ip2);
> +	struct timespec64		ctime = inode_get_ctime(inode2);
> +	struct timespec64		mtime = inode_get_mtime(inode2);
> +
> +	trace_xfs_exchrange_freshness(fxr, ip2);
> +
> +	/* Check that file2 hasn't otherwise been modified. */
> +	if (fxr->file2_ino != ip2->i_ino ||
> +	    fxr->file2_gen != inode2->i_generation ||
> +	    !timespec64_equal(&fxr->file2_ctime, &ctime) ||
> +	    !timespec64_equal(&fxr->file2_mtime, &mtime))
> +		return -EBUSY;
> +
> +	return 0;
> +}
> +
>  #define QRETRY_IP1	(0x1)
>  #define QRETRY_IP2	(0x2)
>  
> @@ -607,6 +635,12 @@ xfs_exchrange_prep(
>  	if (error || fxr->length == 0)
>  		return error;
>  
> +	if (fxr->flags & __XFS_EXCHANGE_RANGE_CHECK_FRESH2) {
> +		error = xfs_exchrange_check_freshness(fxr, ip2);
> +		if (error)
> +			return error;
> +	}
> +
>  	/* Attach dquots to both inodes before changing block maps. */
>  	error = xfs_qm_dqattach(ip2);
>  	if (error)
> @@ -719,7 +753,8 @@ xfs_exchange_range(
>  	if (fxr->file1->f_path.mnt != fxr->file2->f_path.mnt)
>  		return -EXDEV;
>  
> -	if (fxr->flags & ~XFS_EXCHANGE_RANGE_ALL_FLAGS)
> +	if (fxr->flags & ~(XFS_EXCHANGE_RANGE_ALL_FLAGS |
> +			 __XFS_EXCHANGE_RANGE_CHECK_FRESH2))
>  		return -EINVAL;
>  
>  	/* Userspace requests only honored for regular files. */
> @@ -802,3 +837,109 @@ xfs_ioc_exchange_range(
>  	fdput(file1);
>  	return error;
>  }
> +
> +/* Opaque freshness blob for XFS_IOC_COMMIT_RANGE */
> +struct xfs_commit_range_fresh {
> +	xfs_fsid_t	fsid;		/* m_fixedfsid */
> +	__u64		file2_ino;	/* inode number */
> +	__s64		file2_mtime;	/* modification time */
> +	__s64		file2_ctime;	/* change time */
> +	__s32		file2_mtime_nsec; /* mod time, nsec */
> +	__s32		file2_ctime_nsec; /* change time, nsec */
> +	__u32		file2_gen;	/* inode generation */
> +	__u32		magic;		/* zero */
> +};
> +#define XCR_FRESH_MAGIC	0x444F524B	/* DORK */
> +
> +/* Set up a commitrange operation by sampling file2's write-related attrs */
> +long
> +xfs_ioc_start_commit(
> +	struct file			*file,
> +	struct xfs_commit_range __user	*argp)
> +{
> +	struct xfs_commit_range		args = { };
> +	struct timespec64		ts;
> +	struct xfs_commit_range_fresh	*kern_f;
> +	struct xfs_commit_range_fresh	__user *user_f;
> +	struct inode			*inode2 = file_inode(file);
> +	struct xfs_inode		*ip2 = XFS_I(inode2);
> +	const unsigned int		lockflags = XFS_IOLOCK_SHARED |
> +						    XFS_MMAPLOCK_SHARED |
> +						    XFS_ILOCK_SHARED;
> +
> +	BUILD_BUG_ON(sizeof(struct xfs_commit_range_fresh) !=
> +		     sizeof(args.file2_freshness));
> +
> +	kern_f = (struct xfs_commit_range_fresh *)&args.file2_freshness;
> +
> +	memcpy(&kern_f->fsid, ip2->i_mount->m_fixedfsid, sizeof(xfs_fsid_t));
> +
> +	xfs_ilock(ip2, lockflags);
> +	ts = inode_get_ctime(inode2);
> +	kern_f->file2_ctime		= ts.tv_sec;
> +	kern_f->file2_ctime_nsec	= ts.tv_nsec;
> +	ts = inode_get_mtime(inode2);
> +	kern_f->file2_mtime		= ts.tv_sec;
> +	kern_f->file2_mtime_nsec	= ts.tv_nsec;
> +	kern_f->file2_ino		= ip2->i_ino;
> +	kern_f->file2_gen		= inode2->i_generation;
> +	kern_f->magic			= XCR_FRESH_MAGIC;
> +	xfs_iunlock(ip2, lockflags);
> +
> +	user_f = (struct xfs_commit_range_fresh __user *)&argp->file2_freshness;
> +	if (copy_to_user(user_f, kern_f, sizeof(*kern_f)))
> +		return -EFAULT;
> +
> +	return 0;
> +}
> +
> +/*
> + * Exchange file1 and file2 contents if file2 has not been written since the
> + * start commit operation.
> + */
> +long
> +xfs_ioc_commit_range(
> +	struct file			*file,
> +	struct xfs_commit_range __user	*argp)
> +{
> +	struct xfs_exchrange		fxr = {
> +		.file2			= file,
> +	};
> +	struct xfs_commit_range		args;
> +	struct xfs_commit_range_fresh	*kern_f;
> +	struct xfs_inode		*ip2 = XFS_I(file_inode(file));
> +	struct xfs_mount		*mp = ip2->i_mount;
> +	struct fd			file1;
> +	int				error;
> +
> +	kern_f = (struct xfs_commit_range_fresh *)&args.file2_freshness;
> +
> +	if (copy_from_user(&args, argp, sizeof(args)))
> +		return -EFAULT;
> +	if (args.flags & ~XFS_EXCHANGE_RANGE_ALL_FLAGS)
> +		return -EINVAL;
> +	if (kern_f->magic != XCR_FRESH_MAGIC)
> +		return -EBUSY;
> +	if (memcmp(&kern_f->fsid, mp->m_fixedfsid, sizeof(xfs_fsid_t)))
> +		return -EBUSY;
> +
> +	fxr.file1_offset	= args.file1_offset;
> +	fxr.file2_offset	= args.file2_offset;
> +	fxr.length		= args.length;
> +	fxr.flags		= args.flags | __XFS_EXCHANGE_RANGE_CHECK_FRESH2;
> +	fxr.file2_ino		= kern_f->file2_ino;
> +	fxr.file2_gen		= kern_f->file2_gen;
> +	fxr.file2_mtime.tv_sec	= kern_f->file2_mtime;
> +	fxr.file2_mtime.tv_nsec	= kern_f->file2_mtime_nsec;
> +	fxr.file2_ctime.tv_sec	= kern_f->file2_ctime;
> +	fxr.file2_ctime.tv_nsec	= kern_f->file2_ctime_nsec;
> +
> +	file1 = fdget(args.file1_fd);
> +	if (!file1.file)
> +		return -EBADF;
> +	fxr.file1 = file1.file;
> +
> +	error = xfs_exchange_range(&fxr);
> +	fdput(file1);
> +	return error;
> +}
> diff --git a/fs/xfs/xfs_exchrange.h b/fs/xfs/xfs_exchrange.h
> index 039abcca546e8..bc1298aba806b 100644
> --- a/fs/xfs/xfs_exchrange.h
> +++ b/fs/xfs/xfs_exchrange.h
> @@ -10,8 +10,12 @@
>  #define __XFS_EXCHANGE_RANGE_UPD_CMTIME1	(1ULL << 63)
>  #define __XFS_EXCHANGE_RANGE_UPD_CMTIME2	(1ULL << 62)
>  
> +/* Freshness check required */
> +#define __XFS_EXCHANGE_RANGE_CHECK_FRESH2	(1ULL << 61)
> +
>  #define XFS_EXCHANGE_RANGE_PRIV_FLAGS	(__XFS_EXCHANGE_RANGE_UPD_CMTIME1 | \
> -					 __XFS_EXCHANGE_RANGE_UPD_CMTIME2)
> +					 __XFS_EXCHANGE_RANGE_UPD_CMTIME2 | \
> +					 __XFS_EXCHANGE_RANGE_CHECK_FRESH2)
>  
>  struct xfs_exchrange {
>  	struct file		*file1;
> @@ -22,10 +26,20 @@ struct xfs_exchrange {
>  	u64			length;
>  
>  	u64			flags;	/* XFS_EXCHANGE_RANGE flags */
> +
> +	/* file2 metadata for freshness checks */
> +	u64			file2_ino;
> +	struct timespec64	file2_mtime;
> +	struct timespec64	file2_ctime;
> +	u32			file2_gen;
>  };
>  
>  long xfs_ioc_exchange_range(struct file *file,
>  		struct xfs_exchange_range __user *argp);
> +long xfs_ioc_start_commit(struct file *file,
> +		struct xfs_commit_range __user *argp);
> +long xfs_ioc_commit_range(struct file *file,
> +		struct xfs_commit_range __user	*argp);
>  
>  struct xfs_exchmaps_req;
>  
> diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
> index 6b13666d4e963..90b3ee21e7fe6 100644
> --- a/fs/xfs/xfs_ioctl.c
> +++ b/fs/xfs/xfs_ioctl.c
> @@ -1518,6 +1518,10 @@ xfs_file_ioctl(
>  
>  	case XFS_IOC_EXCHANGE_RANGE:
>  		return xfs_ioc_exchange_range(filp, arg);
> +	case XFS_IOC_START_COMMIT:
> +		return xfs_ioc_start_commit(filp, arg);
> +	case XFS_IOC_COMMIT_RANGE:
> +		return xfs_ioc_commit_range(filp, arg);
>  
>  	default:
>  		return -ENOTTY;
> diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
> index 180ce697305a9..4cf0fa71ba9ce 100644
> --- a/fs/xfs/xfs_trace.h
> +++ b/fs/xfs/xfs_trace.h
> @@ -4926,7 +4926,8 @@ DEFINE_INODE_ERROR_EVENT(xfs_exchrange_error);
>  	{ XFS_EXCHANGE_RANGE_DRY_RUN,		"DRY_RUN" }, \
>  	{ XFS_EXCHANGE_RANGE_FILE1_WRITTEN,	"F1_WRITTEN" }, \
>  	{ __XFS_EXCHANGE_RANGE_UPD_CMTIME1,	"CMTIME1" }, \
> -	{ __XFS_EXCHANGE_RANGE_UPD_CMTIME2,	"CMTIME2" }
> +	{ __XFS_EXCHANGE_RANGE_UPD_CMTIME2,	"CMTIME2" }, \
> +	{ __XFS_EXCHANGE_RANGE_CHECK_FRESH2,	"FRESH2" }
>  
>  /* file exchange-range tracepoint class */
>  DECLARE_EVENT_CLASS(xfs_exchrange_class,
> @@ -4986,6 +4987,60 @@ DEFINE_EXCHRANGE_EVENT(xfs_exchrange_prep);
>  DEFINE_EXCHRANGE_EVENT(xfs_exchrange_flush);
>  DEFINE_EXCHRANGE_EVENT(xfs_exchrange_mappings);
>  
> +TRACE_EVENT(xfs_exchrange_freshness,
> +	TP_PROTO(const struct xfs_exchrange *fxr, struct xfs_inode *ip2),
> +	TP_ARGS(fxr, ip2),
> +	TP_STRUCT__entry(
> +		__field(dev_t, dev)
> +		__field(xfs_ino_t, ip2_ino)
> +		__field(long long, ip2_mtime)
> +		__field(long long, ip2_ctime)
> +		__field(int, ip2_mtime_nsec)
> +		__field(int, ip2_ctime_nsec)
> +
> +		__field(xfs_ino_t, file2_ino)
> +		__field(long long, file2_mtime)
> +		__field(long long, file2_ctime)
> +		__field(int, file2_mtime_nsec)
> +		__field(int, file2_ctime_nsec)
> +	),
> +	TP_fast_assign(
> +		struct timespec64	ts64;
> +		struct inode		*inode2 = VFS_I(ip2);
> +
> +		__entry->dev = inode2->i_sb->s_dev;
> +		__entry->ip2_ino = ip2->i_ino;
> +
> +		ts64 = inode_get_ctime(inode2);
> +		__entry->ip2_ctime = ts64.tv_sec;
> +		__entry->ip2_ctime_nsec = ts64.tv_nsec;
> +
> +		ts64 = inode_get_mtime(inode2);
> +		__entry->ip2_mtime = ts64.tv_sec;
> +		__entry->ip2_mtime_nsec = ts64.tv_nsec;
> +
> +		__entry->file2_ino = fxr->file2_ino;
> +		__entry->file2_mtime = fxr->file2_mtime.tv_sec;
> +		__entry->file2_ctime = fxr->file2_ctime.tv_sec;
> +		__entry->file2_mtime_nsec = fxr->file2_mtime.tv_nsec;
> +		__entry->file2_ctime_nsec = fxr->file2_ctime.tv_nsec;
> +	),
> +	TP_printk("dev %d:%d "
> +		  "ino 0x%llx mtime %lld:%d ctime %lld:%d -> "
> +		  "file 0x%llx mtime %lld:%d ctime %lld:%d",
> +		  MAJOR(__entry->dev), MINOR(__entry->dev),
> +		  __entry->ip2_ino,
> +		  __entry->ip2_mtime,
> +		  __entry->ip2_mtime_nsec,
> +		  __entry->ip2_ctime,
> +		  __entry->ip2_ctime_nsec,
> +		  __entry->file2_ino,
> +		  __entry->file2_mtime,
> +		  __entry->file2_mtime_nsec,
> +		  __entry->file2_ctime,
> +		  __entry->file2_ctime_nsec)
> +);
> +
>  TRACE_EVENT(xfs_exchmaps_overhead,
>  	TP_PROTO(struct xfs_mount *mp, unsigned long long bmbt_blocks,
>  		 unsigned long long rmapbt_blocks),

Acked-by: Jeff Layton <jlayton@kernel.org>
Christoph Hellwig Aug. 25, 2024, 4:52 a.m. UTC | #2
Looks good:

Reviewed-by: Christoph Hellwig <hch@lst.de>
diff mbox series

Patch

diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index 454b63ef72016..c85c8077fac39 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -825,6 +825,30 @@  struct xfs_exchange_range {
 	__u64		flags;		/* see XFS_EXCHANGE_RANGE_* below */
 };
 
+/*
+ * Using the same definition of file2 as struct xfs_exchange_range, commit the
+ * contents of file1 into file2 if file2 has the same inode number, mtime, and
+ * ctime as the arguments provided to the call.  The old contents of file2 will
+ * be moved to file1.
+ *
+ * Returns -EBUSY if there isn't an exact match for the file2 fields.
+ *
+ * Filesystems must be able to restart and complete the operation even after
+ * the system goes down.
+ */
+struct xfs_commit_range {
+	__s32		file1_fd;
+	__u32		pad;		/* must be zeroes */
+	__u64		file1_offset;	/* file1 offset, bytes */
+	__u64		file2_offset;	/* file2 offset, bytes */
+	__u64		length;		/* bytes to exchange */
+
+	__u64		flags;		/* see XFS_EXCHANGE_RANGE_* below */
+
+	/* opaque file2 metadata for freshness checks */
+	__u64		file2_freshness[6];
+};
+
 /*
  * Exchange file data all the way to the ends of both files, and then exchange
  * the file sizes.  This flag can be used to replace a file's contents with a
@@ -997,6 +1021,8 @@  struct xfs_getparents_by_handle {
 #define XFS_IOC_BULKSTAT	     _IOR ('X', 127, struct xfs_bulkstat_req)
 #define XFS_IOC_INUMBERS	     _IOR ('X', 128, struct xfs_inumbers_req)
 #define XFS_IOC_EXCHANGE_RANGE	     _IOW ('X', 129, struct xfs_exchange_range)
+#define XFS_IOC_START_COMMIT	     _IOR ('X', 130, struct xfs_commit_range)
+#define XFS_IOC_COMMIT_RANGE	     _IOW ('X', 131, struct xfs_commit_range)
 /*	XFS_IOC_GETFSUUID ---------- deprecated 140	 */
 
 
diff --git a/fs/xfs/xfs_exchrange.c b/fs/xfs/xfs_exchrange.c
index c8a655c92c92f..d0889190ab7ff 100644
--- a/fs/xfs/xfs_exchrange.c
+++ b/fs/xfs/xfs_exchrange.c
@@ -72,6 +72,34 @@  xfs_exchrange_estimate(
 	return error;
 }
 
+/*
+ * Check that file2's metadata agree with the snapshot that we took for the
+ * range commit request.
+ *
+ * This should be called after the filesystem has locked /all/ inode metadata
+ * against modification.
+ */
+STATIC int
+xfs_exchrange_check_freshness(
+	const struct xfs_exchrange	*fxr,
+	struct xfs_inode		*ip2)
+{
+	struct inode			*inode2 = VFS_I(ip2);
+	struct timespec64		ctime = inode_get_ctime(inode2);
+	struct timespec64		mtime = inode_get_mtime(inode2);
+
+	trace_xfs_exchrange_freshness(fxr, ip2);
+
+	/* Check that file2 hasn't otherwise been modified. */
+	if (fxr->file2_ino != ip2->i_ino ||
+	    fxr->file2_gen != inode2->i_generation ||
+	    !timespec64_equal(&fxr->file2_ctime, &ctime) ||
+	    !timespec64_equal(&fxr->file2_mtime, &mtime))
+		return -EBUSY;
+
+	return 0;
+}
+
 #define QRETRY_IP1	(0x1)
 #define QRETRY_IP2	(0x2)
 
@@ -607,6 +635,12 @@  xfs_exchrange_prep(
 	if (error || fxr->length == 0)
 		return error;
 
+	if (fxr->flags & __XFS_EXCHANGE_RANGE_CHECK_FRESH2) {
+		error = xfs_exchrange_check_freshness(fxr, ip2);
+		if (error)
+			return error;
+	}
+
 	/* Attach dquots to both inodes before changing block maps. */
 	error = xfs_qm_dqattach(ip2);
 	if (error)
@@ -719,7 +753,8 @@  xfs_exchange_range(
 	if (fxr->file1->f_path.mnt != fxr->file2->f_path.mnt)
 		return -EXDEV;
 
-	if (fxr->flags & ~XFS_EXCHANGE_RANGE_ALL_FLAGS)
+	if (fxr->flags & ~(XFS_EXCHANGE_RANGE_ALL_FLAGS |
+			 __XFS_EXCHANGE_RANGE_CHECK_FRESH2))
 		return -EINVAL;
 
 	/* Userspace requests only honored for regular files. */
@@ -802,3 +837,109 @@  xfs_ioc_exchange_range(
 	fdput(file1);
 	return error;
 }
+
+/* Opaque freshness blob for XFS_IOC_COMMIT_RANGE */
+struct xfs_commit_range_fresh {
+	xfs_fsid_t	fsid;		/* m_fixedfsid */
+	__u64		file2_ino;	/* inode number */
+	__s64		file2_mtime;	/* modification time */
+	__s64		file2_ctime;	/* change time */
+	__s32		file2_mtime_nsec; /* mod time, nsec */
+	__s32		file2_ctime_nsec; /* change time, nsec */
+	__u32		file2_gen;	/* inode generation */
+	__u32		magic;		/* zero */
+};
+#define XCR_FRESH_MAGIC	0x444F524B	/* DORK */
+
+/* Set up a commitrange operation by sampling file2's write-related attrs */
+long
+xfs_ioc_start_commit(
+	struct file			*file,
+	struct xfs_commit_range __user	*argp)
+{
+	struct xfs_commit_range		args = { };
+	struct timespec64		ts;
+	struct xfs_commit_range_fresh	*kern_f;
+	struct xfs_commit_range_fresh	__user *user_f;
+	struct inode			*inode2 = file_inode(file);
+	struct xfs_inode		*ip2 = XFS_I(inode2);
+	const unsigned int		lockflags = XFS_IOLOCK_SHARED |
+						    XFS_MMAPLOCK_SHARED |
+						    XFS_ILOCK_SHARED;
+
+	BUILD_BUG_ON(sizeof(struct xfs_commit_range_fresh) !=
+		     sizeof(args.file2_freshness));
+
+	kern_f = (struct xfs_commit_range_fresh *)&args.file2_freshness;
+
+	memcpy(&kern_f->fsid, ip2->i_mount->m_fixedfsid, sizeof(xfs_fsid_t));
+
+	xfs_ilock(ip2, lockflags);
+	ts = inode_get_ctime(inode2);
+	kern_f->file2_ctime		= ts.tv_sec;
+	kern_f->file2_ctime_nsec	= ts.tv_nsec;
+	ts = inode_get_mtime(inode2);
+	kern_f->file2_mtime		= ts.tv_sec;
+	kern_f->file2_mtime_nsec	= ts.tv_nsec;
+	kern_f->file2_ino		= ip2->i_ino;
+	kern_f->file2_gen		= inode2->i_generation;
+	kern_f->magic			= XCR_FRESH_MAGIC;
+	xfs_iunlock(ip2, lockflags);
+
+	user_f = (struct xfs_commit_range_fresh __user *)&argp->file2_freshness;
+	if (copy_to_user(user_f, kern_f, sizeof(*kern_f)))
+		return -EFAULT;
+
+	return 0;
+}
+
+/*
+ * Exchange file1 and file2 contents if file2 has not been written since the
+ * start commit operation.
+ */
+long
+xfs_ioc_commit_range(
+	struct file			*file,
+	struct xfs_commit_range __user	*argp)
+{
+	struct xfs_exchrange		fxr = {
+		.file2			= file,
+	};
+	struct xfs_commit_range		args;
+	struct xfs_commit_range_fresh	*kern_f;
+	struct xfs_inode		*ip2 = XFS_I(file_inode(file));
+	struct xfs_mount		*mp = ip2->i_mount;
+	struct fd			file1;
+	int				error;
+
+	kern_f = (struct xfs_commit_range_fresh *)&args.file2_freshness;
+
+	if (copy_from_user(&args, argp, sizeof(args)))
+		return -EFAULT;
+	if (args.flags & ~XFS_EXCHANGE_RANGE_ALL_FLAGS)
+		return -EINVAL;
+	if (kern_f->magic != XCR_FRESH_MAGIC)
+		return -EBUSY;
+	if (memcmp(&kern_f->fsid, mp->m_fixedfsid, sizeof(xfs_fsid_t)))
+		return -EBUSY;
+
+	fxr.file1_offset	= args.file1_offset;
+	fxr.file2_offset	= args.file2_offset;
+	fxr.length		= args.length;
+	fxr.flags		= args.flags | __XFS_EXCHANGE_RANGE_CHECK_FRESH2;
+	fxr.file2_ino		= kern_f->file2_ino;
+	fxr.file2_gen		= kern_f->file2_gen;
+	fxr.file2_mtime.tv_sec	= kern_f->file2_mtime;
+	fxr.file2_mtime.tv_nsec	= kern_f->file2_mtime_nsec;
+	fxr.file2_ctime.tv_sec	= kern_f->file2_ctime;
+	fxr.file2_ctime.tv_nsec	= kern_f->file2_ctime_nsec;
+
+	file1 = fdget(args.file1_fd);
+	if (!file1.file)
+		return -EBADF;
+	fxr.file1 = file1.file;
+
+	error = xfs_exchange_range(&fxr);
+	fdput(file1);
+	return error;
+}
diff --git a/fs/xfs/xfs_exchrange.h b/fs/xfs/xfs_exchrange.h
index 039abcca546e8..bc1298aba806b 100644
--- a/fs/xfs/xfs_exchrange.h
+++ b/fs/xfs/xfs_exchrange.h
@@ -10,8 +10,12 @@ 
 #define __XFS_EXCHANGE_RANGE_UPD_CMTIME1	(1ULL << 63)
 #define __XFS_EXCHANGE_RANGE_UPD_CMTIME2	(1ULL << 62)
 
+/* Freshness check required */
+#define __XFS_EXCHANGE_RANGE_CHECK_FRESH2	(1ULL << 61)
+
 #define XFS_EXCHANGE_RANGE_PRIV_FLAGS	(__XFS_EXCHANGE_RANGE_UPD_CMTIME1 | \
-					 __XFS_EXCHANGE_RANGE_UPD_CMTIME2)
+					 __XFS_EXCHANGE_RANGE_UPD_CMTIME2 | \
+					 __XFS_EXCHANGE_RANGE_CHECK_FRESH2)
 
 struct xfs_exchrange {
 	struct file		*file1;
@@ -22,10 +26,20 @@  struct xfs_exchrange {
 	u64			length;
 
 	u64			flags;	/* XFS_EXCHANGE_RANGE flags */
+
+	/* file2 metadata for freshness checks */
+	u64			file2_ino;
+	struct timespec64	file2_mtime;
+	struct timespec64	file2_ctime;
+	u32			file2_gen;
 };
 
 long xfs_ioc_exchange_range(struct file *file,
 		struct xfs_exchange_range __user *argp);
+long xfs_ioc_start_commit(struct file *file,
+		struct xfs_commit_range __user *argp);
+long xfs_ioc_commit_range(struct file *file,
+		struct xfs_commit_range __user	*argp);
 
 struct xfs_exchmaps_req;
 
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 6b13666d4e963..90b3ee21e7fe6 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -1518,6 +1518,10 @@  xfs_file_ioctl(
 
 	case XFS_IOC_EXCHANGE_RANGE:
 		return xfs_ioc_exchange_range(filp, arg);
+	case XFS_IOC_START_COMMIT:
+		return xfs_ioc_start_commit(filp, arg);
+	case XFS_IOC_COMMIT_RANGE:
+		return xfs_ioc_commit_range(filp, arg);
 
 	default:
 		return -ENOTTY;
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 180ce697305a9..4cf0fa71ba9ce 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -4926,7 +4926,8 @@  DEFINE_INODE_ERROR_EVENT(xfs_exchrange_error);
 	{ XFS_EXCHANGE_RANGE_DRY_RUN,		"DRY_RUN" }, \
 	{ XFS_EXCHANGE_RANGE_FILE1_WRITTEN,	"F1_WRITTEN" }, \
 	{ __XFS_EXCHANGE_RANGE_UPD_CMTIME1,	"CMTIME1" }, \
-	{ __XFS_EXCHANGE_RANGE_UPD_CMTIME2,	"CMTIME2" }
+	{ __XFS_EXCHANGE_RANGE_UPD_CMTIME2,	"CMTIME2" }, \
+	{ __XFS_EXCHANGE_RANGE_CHECK_FRESH2,	"FRESH2" }
 
 /* file exchange-range tracepoint class */
 DECLARE_EVENT_CLASS(xfs_exchrange_class,
@@ -4986,6 +4987,60 @@  DEFINE_EXCHRANGE_EVENT(xfs_exchrange_prep);
 DEFINE_EXCHRANGE_EVENT(xfs_exchrange_flush);
 DEFINE_EXCHRANGE_EVENT(xfs_exchrange_mappings);
 
+TRACE_EVENT(xfs_exchrange_freshness,
+	TP_PROTO(const struct xfs_exchrange *fxr, struct xfs_inode *ip2),
+	TP_ARGS(fxr, ip2),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ip2_ino)
+		__field(long long, ip2_mtime)
+		__field(long long, ip2_ctime)
+		__field(int, ip2_mtime_nsec)
+		__field(int, ip2_ctime_nsec)
+
+		__field(xfs_ino_t, file2_ino)
+		__field(long long, file2_mtime)
+		__field(long long, file2_ctime)
+		__field(int, file2_mtime_nsec)
+		__field(int, file2_ctime_nsec)
+	),
+	TP_fast_assign(
+		struct timespec64	ts64;
+		struct inode		*inode2 = VFS_I(ip2);
+
+		__entry->dev = inode2->i_sb->s_dev;
+		__entry->ip2_ino = ip2->i_ino;
+
+		ts64 = inode_get_ctime(inode2);
+		__entry->ip2_ctime = ts64.tv_sec;
+		__entry->ip2_ctime_nsec = ts64.tv_nsec;
+
+		ts64 = inode_get_mtime(inode2);
+		__entry->ip2_mtime = ts64.tv_sec;
+		__entry->ip2_mtime_nsec = ts64.tv_nsec;
+
+		__entry->file2_ino = fxr->file2_ino;
+		__entry->file2_mtime = fxr->file2_mtime.tv_sec;
+		__entry->file2_ctime = fxr->file2_ctime.tv_sec;
+		__entry->file2_mtime_nsec = fxr->file2_mtime.tv_nsec;
+		__entry->file2_ctime_nsec = fxr->file2_ctime.tv_nsec;
+	),
+	TP_printk("dev %d:%d "
+		  "ino 0x%llx mtime %lld:%d ctime %lld:%d -> "
+		  "file 0x%llx mtime %lld:%d ctime %lld:%d",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ip2_ino,
+		  __entry->ip2_mtime,
+		  __entry->ip2_mtime_nsec,
+		  __entry->ip2_ctime,
+		  __entry->ip2_ctime_nsec,
+		  __entry->file2_ino,
+		  __entry->file2_mtime,
+		  __entry->file2_mtime_nsec,
+		  __entry->file2_ctime,
+		  __entry->file2_ctime_nsec)
+);
+
 TRACE_EVENT(xfs_exchmaps_overhead,
 	TP_PROTO(struct xfs_mount *mp, unsigned long long bmbt_blocks,
 		 unsigned long long rmapbt_blocks),