tmpfs: add support for an i_version counter

Message ID	20220909130031.15477-1-jlayton@kernel.org (mailing list archive)
State	New, archived
Headers	show Return-Path: <linux-nfs-owner@kernel.org> From: Jeff Layton <jlayton@kernel.org> To: hughd@google.com Cc: viro@zeniv.linux.org.uk, akpm@linux-foundation.org, linux-fsdevel@vger.kernel.org, linux-kernel@vger.kernel.org, linux-mm@kvack.org, linux-nfs@vger.kernel.org, Chuck Lever <chuck.lever@oracle.com> Subject: [PATCH] tmpfs: add support for an i_version counter Date: Fri, 9 Sep 2022 09:00:31 -0400 Message-Id: <20220909130031.15477-1-jlayton@kernel.org> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit Precedence: bulk
Series	tmpfs: add support for an i_version counter \| expand tmpfs: add support for an i_version counter

Message ID

20220909130031.15477-1-jlayton@kernel.org (mailing list archive)

State

New, archived

Headers

From: Jeff Layton <jlayton@kernel.org>
To: hughd@google.com
Cc: viro@zeniv.linux.org.uk, akpm@linux-foundation.org,
        linux-fsdevel@vger.kernel.org, linux-kernel@vger.kernel.org,
        linux-mm@kvack.org, linux-nfs@vger.kernel.org,
        Chuck Lever <chuck.lever@oracle.com>
Subject: [PATCH] tmpfs: add support for an i_version counter
Date: Fri,  9 Sep 2022 09:00:31 -0400
Message-Id: <20220909130031.15477-1-jlayton@kernel.org>
MIME-Version: 1.0
Content-Transfer-Encoding: 8bit
Precedence: bulk

Series

tmpfs: add support for an i_version counter | expand

Commit Message

Jeff Layton Sept. 9, 2022, 1 p.m. UTC

NFSv4 mandates a change attribute to avoid problems with timestamp
granularity, which Linux implements using the i_version counter. This is
particularly important when the underlying filesystem is fast.

Give tmpfs an i_version counter. Since it doesn't have to be persistent,
we can just turn on SB_I_VERSION and sprinkle some inode_inc_iversion
calls in the right places.

Also, while there is no formal spec for xattrs, most implementations
update the ctime on setxattr. Fix shmem_xattr_handler_set to update the
ctime and bump the i_version appropriately.

Cc: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Jeff Layton <jlayton@kernel.org>
---
 fs/posix_acl.c |  3 +++
 mm/shmem.c     | 31 ++++++++++++++++++++++++++++---
 2 files changed, 31 insertions(+), 3 deletions(-)

Comments

Andrew Morton Sept. 9, 2022, 9:03 p.m. UTC | #1

On Fri,  9 Sep 2022 09:00:31 -0400 Jeff Layton <jlayton@kernel.org> wrote:

> NFSv4 mandates a change attribute to avoid problems with timestamp
> granularity, which Linux implements using the i_version counter. This is
> particularly important when the underlying filesystem is fast.
> 
> Give tmpfs an i_version counter. Since it doesn't have to be persistent,
> we can just turn on SB_I_VERSION and sprinkle some inode_inc_iversion
> calls in the right places.
> 
> Also, while there is no formal spec for xattrs, most implementations
> update the ctime on setxattr. Fix shmem_xattr_handler_set to update the
> ctime and bump the i_version appropriately.
> 
> ...
>
> --- a/fs/posix_acl.c
> +++ b/fs/posix_acl.c
> @@ -24,6 +24,7 @@
>  #include <linux/user_namespace.h>
>  #include <linux/namei.h>
>  #include <linux/mnt_idmapping.h>
> +#include <linux/iversion.h>
>  
>  static struct posix_acl **acl_by_type(struct inode *inode, int type)
>  {
> @@ -1073,6 +1074,8 @@ int simple_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
>  	}
>  
>  	inode->i_ctime = current_time(inode);
> +	if (IS_I_VERSION(inode))
> +		inode_inc_iversion(inode);
>  	set_cached_acl(inode, type, acl);
>  	return 0;
>  }

adds a kilobyte of text to shmem.o because the quite large
inode_maybe_inc_iversion() get inlined all over the place.  Why oh why.

Is there any reason not to do the obvious?

--- a/include/linux/iversion.h~a
+++ a/include/linux/iversion.h
@@ -177,56 +177,7 @@ inode_set_iversion_queried(struct inode
 				I_VERSION_QUERIED);
 }
 
-/**
- * inode_maybe_inc_iversion - increments i_version
- * @inode: inode with the i_version that should be updated
- * @force: increment the counter even if it's not necessary?
- *
- * Every time the inode is modified, the i_version field must be seen to have
- * changed by any observer.
- *
- * If "force" is set or the QUERIED flag is set, then ensure that we increment
- * the value, and clear the queried flag.
- *
- * In the common case where neither is set, then we can return "false" without
- * updating i_version.
- *
- * If this function returns false, and no other metadata has changed, then we
- * can avoid logging the metadata.
- */
-static inline bool
-inode_maybe_inc_iversion(struct inode *inode, bool force)
-{
-	u64 cur, old, new;
-
-	/*
-	 * The i_version field is not strictly ordered with any other inode
-	 * information, but the legacy inode_inc_iversion code used a spinlock
-	 * to serialize increments.
-	 *
-	 * Here, we add full memory barriers to ensure that any de-facto
-	 * ordering with other info is preserved.
-	 *
-	 * This barrier pairs with the barrier in inode_query_iversion()
-	 */
-	smp_mb();
-	cur = inode_peek_iversion_raw(inode);
-	for (;;) {
-		/* If flag is clear then we needn't do anything */
-		if (!force && !(cur & I_VERSION_QUERIED))
-			return false;
-
-		/* Since lowest bit is flag, add 2 to avoid it */
-		new = (cur & ~I_VERSION_QUERIED) + I_VERSION_INCREMENT;
-
-		old = atomic64_cmpxchg(&inode->i_version, cur, new);
-		if (likely(old == cur))
-			break;
-		cur = old;
-	}
-	return true;
-}
-
+bool inode_maybe_inc_iversion(struct inode *inode, bool force);
 
 /**
  * inode_inc_iversion - forcibly increment i_version
--- a/fs/libfs.c~a
+++ a/fs/libfs.c
@@ -15,6 +15,7 @@
 #include <linux/mutex.h>
 #include <linux/namei.h>
 #include <linux/exportfs.h>
+#include <linux/iversion.h>
 #include <linux/writeback.h>
 #include <linux/buffer_head.h> /* sync_mapping_buffers */
 #include <linux/fs_context.h>
@@ -1529,3 +1530,53 @@ void generic_set_encrypted_ci_d_ops(stru
 #endif
 }
 EXPORT_SYMBOL(generic_set_encrypted_ci_d_ops);
+
+/**
+ * inode_maybe_inc_iversion - increments i_version
+ * @inode: inode with the i_version that should be updated
+ * @force: increment the counter even if it's not necessary?
+ *
+ * Every time the inode is modified, the i_version field must be seen to have
+ * changed by any observer.
+ *
+ * If "force" is set or the QUERIED flag is set, then ensure that we increment
+ * the value, and clear the queried flag.
+ *
+ * In the common case where neither is set, then we can return "false" without
+ * updating i_version.
+ *
+ * If this function returns false, and no other metadata has changed, then we
+ * can avoid logging the metadata.
+ */
+bool inode_maybe_inc_iversion(struct inode *inode, bool force)
+{
+	u64 cur, old, new;
+
+	/*
+	 * The i_version field is not strictly ordered with any other inode
+	 * information, but the legacy inode_inc_iversion code used a spinlock
+	 * to serialize increments.
+	 *
+	 * Here, we add full memory barriers to ensure that any de-facto
+	 * ordering with other info is preserved.
+	 *
+	 * This barrier pairs with the barrier in inode_query_iversion()
+	 */
+	smp_mb();
+	cur = inode_peek_iversion_raw(inode);
+	for (;;) {
+		/* If flag is clear then we needn't do anything */
+		if (!force && !(cur & I_VERSION_QUERIED))
+			return false;
+
+		/* Since lowest bit is flag, add 2 to avoid it */
+		new = (cur & ~I_VERSION_QUERIED) + I_VERSION_INCREMENT;
+
+		old = atomic64_cmpxchg(&inode->i_version, cur, new);
+		if (likely(old == cur))
+			break;
+		cur = old;
+	}
+	return true;
+}
+EXPORT_SYMBOL(inode_maybe_inc_iversion);

Jeff Layton Sept. 9, 2022, 9:23 p.m. UTC | #2

On Fri, 2022-09-09 at 14:03 -0700, Andrew Morton wrote:
> On Fri,  9 Sep 2022 09:00:31 -0400 Jeff Layton <jlayton@kernel.org> wrote:
> 
> > NFSv4 mandates a change attribute to avoid problems with timestamp
> > granularity, which Linux implements using the i_version counter. This is
> > particularly important when the underlying filesystem is fast.
> > 
> > Give tmpfs an i_version counter. Since it doesn't have to be persistent,
> > we can just turn on SB_I_VERSION and sprinkle some inode_inc_iversion
> > calls in the right places.
> > 
> > Also, while there is no formal spec for xattrs, most implementations
> > update the ctime on setxattr. Fix shmem_xattr_handler_set to update the
> > ctime and bump the i_version appropriately.
> > 
> > ...
> > 
> > --- a/fs/posix_acl.c
> > +++ b/fs/posix_acl.c
> > @@ -24,6 +24,7 @@
> >  #include <linux/user_namespace.h>
> >  #include <linux/namei.h>
> >  #include <linux/mnt_idmapping.h>
> > +#include <linux/iversion.h>
> >  
> >  static struct posix_acl **acl_by_type(struct inode *inode, int type)
> >  {
> > @@ -1073,6 +1074,8 @@ int simple_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
> >  	}
> >  
> >  	inode->i_ctime = current_time(inode);
> > +	if (IS_I_VERSION(inode))
> > +		inode_inc_iversion(inode);
> >  	set_cached_acl(inode, type, acl);
> >  	return 0;
> >  }
> 
> adds a kilobyte of text to shmem.o because the quite large
> inode_maybe_inc_iversion() get inlined all over the place.  Why oh why.
> 
> Is there any reason not to do the obvious?
> 

No reason at all:

Acked-by: Jeff Layton <jlayton@kernel.org>

> --- a/include/linux/iversion.h~a
> +++ a/include/linux/iversion.h
> @@ -177,56 +177,7 @@ inode_set_iversion_queried(struct inode
>  				I_VERSION_QUERIED);
>  }
>  
> -/**
> - * inode_maybe_inc_iversion - increments i_version
> - * @inode: inode with the i_version that should be updated
> - * @force: increment the counter even if it's not necessary?
> - *
> - * Every time the inode is modified, the i_version field must be seen to have
> - * changed by any observer.
> - *
> - * If "force" is set or the QUERIED flag is set, then ensure that we increment
> - * the value, and clear the queried flag.
> - *
> - * In the common case where neither is set, then we can return "false" without
> - * updating i_version.
> - *
> - * If this function returns false, and no other metadata has changed, then we
> - * can avoid logging the metadata.
> - */
> -static inline bool
> -inode_maybe_inc_iversion(struct inode *inode, bool force)
> -{
> -	u64 cur, old, new;
> -
> -	/*
> -	 * The i_version field is not strictly ordered with any other inode
> -	 * information, but the legacy inode_inc_iversion code used a spinlock
> -	 * to serialize increments.
> -	 *
> -	 * Here, we add full memory barriers to ensure that any de-facto
> -	 * ordering with other info is preserved.
> -	 *
> -	 * This barrier pairs with the barrier in inode_query_iversion()
> -	 */
> -	smp_mb();
> -	cur = inode_peek_iversion_raw(inode);
> -	for (;;) {
> -		/* If flag is clear then we needn't do anything */
> -		if (!force && !(cur & I_VERSION_QUERIED))
> -			return false;
> -
> -		/* Since lowest bit is flag, add 2 to avoid it */
> -		new = (cur & ~I_VERSION_QUERIED) + I_VERSION_INCREMENT;
> -
> -		old = atomic64_cmpxchg(&inode->i_version, cur, new);
> -		if (likely(old == cur))
> -			break;
> -		cur = old;
> -	}
> -	return true;
> -}
> -
> +bool inode_maybe_inc_iversion(struct inode *inode, bool force);
>  
>  /**
>   * inode_inc_iversion - forcibly increment i_version
> --- a/fs/libfs.c~a
> +++ a/fs/libfs.c
> @@ -15,6 +15,7 @@
>  #include <linux/mutex.h>
>  #include <linux/namei.h>
>  #include <linux/exportfs.h>
> +#include <linux/iversion.h>
>  #include <linux/writeback.h>
>  #include <linux/buffer_head.h> /* sync_mapping_buffers */
>  #include <linux/fs_context.h>
> @@ -1529,3 +1530,53 @@ void generic_set_encrypted_ci_d_ops(stru
>  #endif
>  }
>  EXPORT_SYMBOL(generic_set_encrypted_ci_d_ops);
> +
> +/**
> + * inode_maybe_inc_iversion - increments i_version
> + * @inode: inode with the i_version that should be updated
> + * @force: increment the counter even if it's not necessary?
> + *
> + * Every time the inode is modified, the i_version field must be seen to have
> + * changed by any observer.
> + *
> + * If "force" is set or the QUERIED flag is set, then ensure that we increment
> + * the value, and clear the queried flag.
> + *
> + * In the common case where neither is set, then we can return "false" without
> + * updating i_version.
> + *
> + * If this function returns false, and no other metadata has changed, then we
> + * can avoid logging the metadata.
> + */
> +bool inode_maybe_inc_iversion(struct inode *inode, bool force)
> +{
> +	u64 cur, old, new;
> +
> +	/*
> +	 * The i_version field is not strictly ordered with any other inode
> +	 * information, but the legacy inode_inc_iversion code used a spinlock
> +	 * to serialize increments.
> +	 *
> +	 * Here, we add full memory barriers to ensure that any de-facto
> +	 * ordering with other info is preserved.
> +	 *
> +	 * This barrier pairs with the barrier in inode_query_iversion()
> +	 */
> +	smp_mb();
> +	cur = inode_peek_iversion_raw(inode);
> +	for (;;) {
> +		/* If flag is clear then we needn't do anything */
> +		if (!force && !(cur & I_VERSION_QUERIED))
> +			return false;
> +
> +		/* Since lowest bit is flag, add 2 to avoid it */
> +		new = (cur & ~I_VERSION_QUERIED) + I_VERSION_INCREMENT;
> +
> +		old = atomic64_cmpxchg(&inode->i_version, cur, new);
> +		if (likely(old == cur))
> +			break;
> +		cur = old;
> +	}
> +	return true;
> +}
> +EXPORT_SYMBOL(inode_maybe_inc_iversion);
> _
> 

--

diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index 5af33800743e..efb88a5e59f9 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -24,6 +24,7 @@ 
 #include <linux/user_namespace.h>
 #include <linux/namei.h>
 #include <linux/mnt_idmapping.h>
+#include <linux/iversion.h>
 
 static struct posix_acl **acl_by_type(struct inode *inode, int type)
 {
@@ -1073,6 +1074,8 @@  int simple_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
 	}
 
 	inode->i_ctime = current_time(inode);
+	if (IS_I_VERSION(inode))
+		inode_inc_iversion(inode);
 	set_cached_acl(inode, type, acl);
 	return 0;
 }
diff --git a/mm/shmem.c b/mm/shmem.c
index 42e5888bf84d..84c1b7bf47ec 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -38,6 +38,7 @@ 
 #include <linux/hugetlb.h>
 #include <linux/fs_parser.h>
 #include <linux/swapfile.h>
+#include <linux/iversion.h>
 #include "swap.h"
 
 static struct vfsmount *shm_mnt;
@@ -1043,6 +1044,7 @@  void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
 {
 	shmem_undo_range(inode, lstart, lend, false);
 	inode->i_ctime = inode->i_mtime = current_time(inode);
+	inode_inc_iversion(inode);
 }
 EXPORT_SYMBOL_GPL(shmem_truncate_range);
 
@@ -1087,6 +1089,8 @@  static int shmem_setattr(struct user_namespace *mnt_userns,
 	struct inode *inode = d_inode(dentry);
 	struct shmem_inode_info *info = SHMEM_I(inode);
 	int error;
+	bool update_mtime = false;
+	bool update_ctime = true;
 
 	error = setattr_prepare(&init_user_ns, dentry, attr);
 	if (error)
@@ -1107,7 +1111,9 @@  static int shmem_setattr(struct user_namespace *mnt_userns,
 			if (error)
 				return error;
 			i_size_write(inode, newsize);
-			inode->i_ctime = inode->i_mtime = current_time(inode);
+			update_mtime = true;
+		} else {
+			update_ctime = false;
 		}
 		if (newsize <= oldsize) {
 			loff_t holebegin = round_up(newsize, PAGE_SIZE);
@@ -1127,6 +1133,12 @@  static int shmem_setattr(struct user_namespace *mnt_userns,
 	setattr_copy(&init_user_ns, inode, attr);
 	if (attr->ia_valid & ATTR_MODE)
 		error = posix_acl_chmod(&init_user_ns, inode, inode->i_mode);
+	if (!error && update_ctime) {
+		inode->i_ctime = current_time(inode);
+		if (update_mtime)
+			inode->i_mtime = inode->i_ctime;
+		inode_inc_iversion(inode);
+	}
 	return error;
 }
 
@@ -2901,6 +2913,7 @@  shmem_mknod(struct user_namespace *mnt_userns, struct inode *dir,
 		error = 0;
 		dir->i_size += BOGO_DIRENT_SIZE;
 		dir->i_ctime = dir->i_mtime = current_time(dir);
+		inode_inc_iversion(dir);
 		d_instantiate(dentry, inode);
 		dget(dentry); /* Extra count - pin the dentry in core */
 	}
@@ -2976,6 +2989,7 @@  static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentr
 
 	dir->i_size += BOGO_DIRENT_SIZE;
 	inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode);
+	inode_inc_iversion(dir);
 	inc_nlink(inode);
 	ihold(inode);	/* New dentry reference */
 	dget(dentry);		/* Extra pinning count for the created dentry */
@@ -2993,6 +3007,7 @@  static int shmem_unlink(struct inode *dir, struct dentry *dentry)
 
 	dir->i_size -= BOGO_DIRENT_SIZE;
 	inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode);
+	inode_inc_iversion(dir);
 	drop_nlink(inode);
 	dput(dentry);	/* Undo the count from "create" - this does all the work */
 	return 0;
@@ -3082,6 +3097,8 @@  static int shmem_rename2(struct user_namespace *mnt_userns,
 	old_dir->i_ctime = old_dir->i_mtime =
 	new_dir->i_ctime = new_dir->i_mtime =
 	inode->i_ctime = current_time(old_dir);
+	inode_inc_iversion(old_dir);
+	inode_inc_iversion(new_dir);
 	return 0;
 }
 
@@ -3134,6 +3151,7 @@  static int shmem_symlink(struct user_namespace *mnt_userns, struct inode *dir,
 	}
 	dir->i_size += BOGO_DIRENT_SIZE;
 	dir->i_ctime = dir->i_mtime = current_time(dir);
+	inode_inc_iversion(dir);
 	d_instantiate(dentry, inode);
 	dget(dentry);
 	return 0;
@@ -3204,6 +3222,7 @@  static int shmem_fileattr_set(struct user_namespace *mnt_userns,
 
 	shmem_set_inode_flags(inode, info->fsflags);
 	inode->i_ctime = current_time(inode);
+	inode_inc_iversion(inode);
 	return 0;
 }
 
@@ -3267,9 +3286,15 @@  static int shmem_xattr_handler_set(const struct xattr_handler *handler,
 				   size_t size, int flags)
 {
 	struct shmem_inode_info *info = SHMEM_I(inode);
+	int err;
 
 	name = xattr_full_name(handler, name);
-	return simple_xattr_set(&info->xattrs, name, value, size, flags, NULL);
+	err = simple_xattr_set(&info->xattrs, name, value, size, flags, NULL);
+	if (!err) {
+		inode->i_ctime = current_time(inode);
+		inode_inc_iversion(inode);
+	}
+	return err;
 }
 
 static const struct xattr_handler shmem_security_xattr_handler = {
@@ -3732,7 +3757,7 @@  static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
 		sb->s_flags |= SB_NOUSER;
 	}
 	sb->s_export_op = &shmem_export_ops;
-	sb->s_flags |= SB_NOSEC;
+	sb->s_flags |= SB_NOSEC | SB_I_VERSION;
 #else
 	sb->s_flags |= SB_NOUSER;
 #endif

tmpfs: add support for an i_version counter

Commit Message

Comments

Patch