Message ID | 20220909130031.15477-1-jlayton@kernel.org (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | tmpfs: add support for an i_version counter | expand |
On Fri, 9 Sep 2022 09:00:31 -0400 Jeff Layton <jlayton@kernel.org> wrote: > NFSv4 mandates a change attribute to avoid problems with timestamp > granularity, which Linux implements using the i_version counter. This is > particularly important when the underlying filesystem is fast. > > Give tmpfs an i_version counter. Since it doesn't have to be persistent, > we can just turn on SB_I_VERSION and sprinkle some inode_inc_iversion > calls in the right places. > > Also, while there is no formal spec for xattrs, most implementations > update the ctime on setxattr. Fix shmem_xattr_handler_set to update the > ctime and bump the i_version appropriately. > > ... > > --- a/fs/posix_acl.c > +++ b/fs/posix_acl.c > @@ -24,6 +24,7 @@ > #include <linux/user_namespace.h> > #include <linux/namei.h> > #include <linux/mnt_idmapping.h> > +#include <linux/iversion.h> > > static struct posix_acl **acl_by_type(struct inode *inode, int type) > { > @@ -1073,6 +1074,8 @@ int simple_set_acl(struct user_namespace *mnt_userns, struct inode *inode, > } > > inode->i_ctime = current_time(inode); > + if (IS_I_VERSION(inode)) > + inode_inc_iversion(inode); > set_cached_acl(inode, type, acl); > return 0; > } adds a kilobyte of text to shmem.o because the quite large inode_maybe_inc_iversion() get inlined all over the place. Why oh why. Is there any reason not to do the obvious? --- a/include/linux/iversion.h~a +++ a/include/linux/iversion.h @@ -177,56 +177,7 @@ inode_set_iversion_queried(struct inode I_VERSION_QUERIED); } -/** - * inode_maybe_inc_iversion - increments i_version - * @inode: inode with the i_version that should be updated - * @force: increment the counter even if it's not necessary? - * - * Every time the inode is modified, the i_version field must be seen to have - * changed by any observer. - * - * If "force" is set or the QUERIED flag is set, then ensure that we increment - * the value, and clear the queried flag. - * - * In the common case where neither is set, then we can return "false" without - * updating i_version. - * - * If this function returns false, and no other metadata has changed, then we - * can avoid logging the metadata. - */ -static inline bool -inode_maybe_inc_iversion(struct inode *inode, bool force) -{ - u64 cur, old, new; - - /* - * The i_version field is not strictly ordered with any other inode - * information, but the legacy inode_inc_iversion code used a spinlock - * to serialize increments. - * - * Here, we add full memory barriers to ensure that any de-facto - * ordering with other info is preserved. - * - * This barrier pairs with the barrier in inode_query_iversion() - */ - smp_mb(); - cur = inode_peek_iversion_raw(inode); - for (;;) { - /* If flag is clear then we needn't do anything */ - if (!force && !(cur & I_VERSION_QUERIED)) - return false; - - /* Since lowest bit is flag, add 2 to avoid it */ - new = (cur & ~I_VERSION_QUERIED) + I_VERSION_INCREMENT; - - old = atomic64_cmpxchg(&inode->i_version, cur, new); - if (likely(old == cur)) - break; - cur = old; - } - return true; -} - +bool inode_maybe_inc_iversion(struct inode *inode, bool force); /** * inode_inc_iversion - forcibly increment i_version --- a/fs/libfs.c~a +++ a/fs/libfs.c @@ -15,6 +15,7 @@ #include <linux/mutex.h> #include <linux/namei.h> #include <linux/exportfs.h> +#include <linux/iversion.h> #include <linux/writeback.h> #include <linux/buffer_head.h> /* sync_mapping_buffers */ #include <linux/fs_context.h> @@ -1529,3 +1530,53 @@ void generic_set_encrypted_ci_d_ops(stru #endif } EXPORT_SYMBOL(generic_set_encrypted_ci_d_ops); + +/** + * inode_maybe_inc_iversion - increments i_version + * @inode: inode with the i_version that should be updated + * @force: increment the counter even if it's not necessary? + * + * Every time the inode is modified, the i_version field must be seen to have + * changed by any observer. + * + * If "force" is set or the QUERIED flag is set, then ensure that we increment + * the value, and clear the queried flag. + * + * In the common case where neither is set, then we can return "false" without + * updating i_version. + * + * If this function returns false, and no other metadata has changed, then we + * can avoid logging the metadata. + */ +bool inode_maybe_inc_iversion(struct inode *inode, bool force) +{ + u64 cur, old, new; + + /* + * The i_version field is not strictly ordered with any other inode + * information, but the legacy inode_inc_iversion code used a spinlock + * to serialize increments. + * + * Here, we add full memory barriers to ensure that any de-facto + * ordering with other info is preserved. + * + * This barrier pairs with the barrier in inode_query_iversion() + */ + smp_mb(); + cur = inode_peek_iversion_raw(inode); + for (;;) { + /* If flag is clear then we needn't do anything */ + if (!force && !(cur & I_VERSION_QUERIED)) + return false; + + /* Since lowest bit is flag, add 2 to avoid it */ + new = (cur & ~I_VERSION_QUERIED) + I_VERSION_INCREMENT; + + old = atomic64_cmpxchg(&inode->i_version, cur, new); + if (likely(old == cur)) + break; + cur = old; + } + return true; +} +EXPORT_SYMBOL(inode_maybe_inc_iversion);
On Fri, 2022-09-09 at 14:03 -0700, Andrew Morton wrote: > On Fri, 9 Sep 2022 09:00:31 -0400 Jeff Layton <jlayton@kernel.org> wrote: > > > NFSv4 mandates a change attribute to avoid problems with timestamp > > granularity, which Linux implements using the i_version counter. This is > > particularly important when the underlying filesystem is fast. > > > > Give tmpfs an i_version counter. Since it doesn't have to be persistent, > > we can just turn on SB_I_VERSION and sprinkle some inode_inc_iversion > > calls in the right places. > > > > Also, while there is no formal spec for xattrs, most implementations > > update the ctime on setxattr. Fix shmem_xattr_handler_set to update the > > ctime and bump the i_version appropriately. > > > > ... > > > > --- a/fs/posix_acl.c > > +++ b/fs/posix_acl.c > > @@ -24,6 +24,7 @@ > > #include <linux/user_namespace.h> > > #include <linux/namei.h> > > #include <linux/mnt_idmapping.h> > > +#include <linux/iversion.h> > > > > static struct posix_acl **acl_by_type(struct inode *inode, int type) > > { > > @@ -1073,6 +1074,8 @@ int simple_set_acl(struct user_namespace *mnt_userns, struct inode *inode, > > } > > > > inode->i_ctime = current_time(inode); > > + if (IS_I_VERSION(inode)) > > + inode_inc_iversion(inode); > > set_cached_acl(inode, type, acl); > > return 0; > > } > > adds a kilobyte of text to shmem.o because the quite large > inode_maybe_inc_iversion() get inlined all over the place. Why oh why. > > Is there any reason not to do the obvious? > No reason at all: Acked-by: Jeff Layton <jlayton@kernel.org> > --- a/include/linux/iversion.h~a > +++ a/include/linux/iversion.h > @@ -177,56 +177,7 @@ inode_set_iversion_queried(struct inode > I_VERSION_QUERIED); > } > > -/** > - * inode_maybe_inc_iversion - increments i_version > - * @inode: inode with the i_version that should be updated > - * @force: increment the counter even if it's not necessary? > - * > - * Every time the inode is modified, the i_version field must be seen to have > - * changed by any observer. > - * > - * If "force" is set or the QUERIED flag is set, then ensure that we increment > - * the value, and clear the queried flag. > - * > - * In the common case where neither is set, then we can return "false" without > - * updating i_version. > - * > - * If this function returns false, and no other metadata has changed, then we > - * can avoid logging the metadata. > - */ > -static inline bool > -inode_maybe_inc_iversion(struct inode *inode, bool force) > -{ > - u64 cur, old, new; > - > - /* > - * The i_version field is not strictly ordered with any other inode > - * information, but the legacy inode_inc_iversion code used a spinlock > - * to serialize increments. > - * > - * Here, we add full memory barriers to ensure that any de-facto > - * ordering with other info is preserved. > - * > - * This barrier pairs with the barrier in inode_query_iversion() > - */ > - smp_mb(); > - cur = inode_peek_iversion_raw(inode); > - for (;;) { > - /* If flag is clear then we needn't do anything */ > - if (!force && !(cur & I_VERSION_QUERIED)) > - return false; > - > - /* Since lowest bit is flag, add 2 to avoid it */ > - new = (cur & ~I_VERSION_QUERIED) + I_VERSION_INCREMENT; > - > - old = atomic64_cmpxchg(&inode->i_version, cur, new); > - if (likely(old == cur)) > - break; > - cur = old; > - } > - return true; > -} > - > +bool inode_maybe_inc_iversion(struct inode *inode, bool force); > > /** > * inode_inc_iversion - forcibly increment i_version > --- a/fs/libfs.c~a > +++ a/fs/libfs.c > @@ -15,6 +15,7 @@ > #include <linux/mutex.h> > #include <linux/namei.h> > #include <linux/exportfs.h> > +#include <linux/iversion.h> > #include <linux/writeback.h> > #include <linux/buffer_head.h> /* sync_mapping_buffers */ > #include <linux/fs_context.h> > @@ -1529,3 +1530,53 @@ void generic_set_encrypted_ci_d_ops(stru > #endif > } > EXPORT_SYMBOL(generic_set_encrypted_ci_d_ops); > + > +/** > + * inode_maybe_inc_iversion - increments i_version > + * @inode: inode with the i_version that should be updated > + * @force: increment the counter even if it's not necessary? > + * > + * Every time the inode is modified, the i_version field must be seen to have > + * changed by any observer. > + * > + * If "force" is set or the QUERIED flag is set, then ensure that we increment > + * the value, and clear the queried flag. > + * > + * In the common case where neither is set, then we can return "false" without > + * updating i_version. > + * > + * If this function returns false, and no other metadata has changed, then we > + * can avoid logging the metadata. > + */ > +bool inode_maybe_inc_iversion(struct inode *inode, bool force) > +{ > + u64 cur, old, new; > + > + /* > + * The i_version field is not strictly ordered with any other inode > + * information, but the legacy inode_inc_iversion code used a spinlock > + * to serialize increments. > + * > + * Here, we add full memory barriers to ensure that any de-facto > + * ordering with other info is preserved. > + * > + * This barrier pairs with the barrier in inode_query_iversion() > + */ > + smp_mb(); > + cur = inode_peek_iversion_raw(inode); > + for (;;) { > + /* If flag is clear then we needn't do anything */ > + if (!force && !(cur & I_VERSION_QUERIED)) > + return false; > + > + /* Since lowest bit is flag, add 2 to avoid it */ > + new = (cur & ~I_VERSION_QUERIED) + I_VERSION_INCREMENT; > + > + old = atomic64_cmpxchg(&inode->i_version, cur, new); > + if (likely(old == cur)) > + break; > + cur = old; > + } > + return true; > +} > +EXPORT_SYMBOL(inode_maybe_inc_iversion); > _ > --
diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 5af33800743e..efb88a5e59f9 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -24,6 +24,7 @@ #include <linux/user_namespace.h> #include <linux/namei.h> #include <linux/mnt_idmapping.h> +#include <linux/iversion.h> static struct posix_acl **acl_by_type(struct inode *inode, int type) { @@ -1073,6 +1074,8 @@ int simple_set_acl(struct user_namespace *mnt_userns, struct inode *inode, } inode->i_ctime = current_time(inode); + if (IS_I_VERSION(inode)) + inode_inc_iversion(inode); set_cached_acl(inode, type, acl); return 0; } diff --git a/mm/shmem.c b/mm/shmem.c index 42e5888bf84d..84c1b7bf47ec 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -38,6 +38,7 @@ #include <linux/hugetlb.h> #include <linux/fs_parser.h> #include <linux/swapfile.h> +#include <linux/iversion.h> #include "swap.h" static struct vfsmount *shm_mnt; @@ -1043,6 +1044,7 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) { shmem_undo_range(inode, lstart, lend, false); inode->i_ctime = inode->i_mtime = current_time(inode); + inode_inc_iversion(inode); } EXPORT_SYMBOL_GPL(shmem_truncate_range); @@ -1087,6 +1089,8 @@ static int shmem_setattr(struct user_namespace *mnt_userns, struct inode *inode = d_inode(dentry); struct shmem_inode_info *info = SHMEM_I(inode); int error; + bool update_mtime = false; + bool update_ctime = true; error = setattr_prepare(&init_user_ns, dentry, attr); if (error) @@ -1107,7 +1111,9 @@ static int shmem_setattr(struct user_namespace *mnt_userns, if (error) return error; i_size_write(inode, newsize); - inode->i_ctime = inode->i_mtime = current_time(inode); + update_mtime = true; + } else { + update_ctime = false; } if (newsize <= oldsize) { loff_t holebegin = round_up(newsize, PAGE_SIZE); @@ -1127,6 +1133,12 @@ static int shmem_setattr(struct user_namespace *mnt_userns, setattr_copy(&init_user_ns, inode, attr); if (attr->ia_valid & ATTR_MODE) error = posix_acl_chmod(&init_user_ns, inode, inode->i_mode); + if (!error && update_ctime) { + inode->i_ctime = current_time(inode); + if (update_mtime) + inode->i_mtime = inode->i_ctime; + inode_inc_iversion(inode); + } return error; } @@ -2901,6 +2913,7 @@ shmem_mknod(struct user_namespace *mnt_userns, struct inode *dir, error = 0; dir->i_size += BOGO_DIRENT_SIZE; dir->i_ctime = dir->i_mtime = current_time(dir); + inode_inc_iversion(dir); d_instantiate(dentry, inode); dget(dentry); /* Extra count - pin the dentry in core */ } @@ -2976,6 +2989,7 @@ static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentr dir->i_size += BOGO_DIRENT_SIZE; inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode); + inode_inc_iversion(dir); inc_nlink(inode); ihold(inode); /* New dentry reference */ dget(dentry); /* Extra pinning count for the created dentry */ @@ -2993,6 +3007,7 @@ static int shmem_unlink(struct inode *dir, struct dentry *dentry) dir->i_size -= BOGO_DIRENT_SIZE; inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode); + inode_inc_iversion(dir); drop_nlink(inode); dput(dentry); /* Undo the count from "create" - this does all the work */ return 0; @@ -3082,6 +3097,8 @@ static int shmem_rename2(struct user_namespace *mnt_userns, old_dir->i_ctime = old_dir->i_mtime = new_dir->i_ctime = new_dir->i_mtime = inode->i_ctime = current_time(old_dir); + inode_inc_iversion(old_dir); + inode_inc_iversion(new_dir); return 0; } @@ -3134,6 +3151,7 @@ static int shmem_symlink(struct user_namespace *mnt_userns, struct inode *dir, } dir->i_size += BOGO_DIRENT_SIZE; dir->i_ctime = dir->i_mtime = current_time(dir); + inode_inc_iversion(dir); d_instantiate(dentry, inode); dget(dentry); return 0; @@ -3204,6 +3222,7 @@ static int shmem_fileattr_set(struct user_namespace *mnt_userns, shmem_set_inode_flags(inode, info->fsflags); inode->i_ctime = current_time(inode); + inode_inc_iversion(inode); return 0; } @@ -3267,9 +3286,15 @@ static int shmem_xattr_handler_set(const struct xattr_handler *handler, size_t size, int flags) { struct shmem_inode_info *info = SHMEM_I(inode); + int err; name = xattr_full_name(handler, name); - return simple_xattr_set(&info->xattrs, name, value, size, flags, NULL); + err = simple_xattr_set(&info->xattrs, name, value, size, flags, NULL); + if (!err) { + inode->i_ctime = current_time(inode); + inode_inc_iversion(inode); + } + return err; } static const struct xattr_handler shmem_security_xattr_handler = { @@ -3732,7 +3757,7 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc) sb->s_flags |= SB_NOUSER; } sb->s_export_op = &shmem_export_ops; - sb->s_flags |= SB_NOSEC; + sb->s_flags |= SB_NOSEC | SB_I_VERSION; #else sb->s_flags |= SB_NOUSER; #endif
NFSv4 mandates a change attribute to avoid problems with timestamp granularity, which Linux implements using the i_version counter. This is particularly important when the underlying filesystem is fast. Give tmpfs an i_version counter. Since it doesn't have to be persistent, we can just turn on SB_I_VERSION and sprinkle some inode_inc_iversion calls in the right places. Also, while there is no formal spec for xattrs, most implementations update the ctime on setxattr. Fix shmem_xattr_handler_set to update the ctime and bump the i_version appropriately. Cc: Chuck Lever <chuck.lever@oracle.com> Signed-off-by: Jeff Layton <jlayton@kernel.org> --- fs/posix_acl.c | 3 +++ mm/shmem.c | 31 ++++++++++++++++++++++++++++--- 2 files changed, 31 insertions(+), 3 deletions(-)