Message ID | 168175931561.2843.16288612382874559384.stgit@manet.1015granger.net (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | [v1] shmem: stable directory cookies | expand |
On Mon, 2023-04-17 at 15:23 -0400, Chuck Lever wrote: > From: Chuck Lever <chuck.lever@oracle.com> > > The current cursor-based directory cookie mechanism doesn't work > when a tmpfs filesystem is exported via NFS. This is because NFS > clients do not open directories: each READDIR operation has to open > the directory on the server, read it, then close it. The cursor > state for that directory, being associated strictly with the opened > struct file, is then discarded. > > Directory cookies are cached not only by NFS clients, but also by > user space libraries on those clients. Essentially there is no way > to invalidate those caches when directory offsets have changed on > an NFS server after the offset-to-dentry mapping changes. > > The solution we've come up with is to make the directory cookie for > each file in a tmpfs filesystem stable for the life of the directory > entry it represents. > > Add a per-directory xarray. shmem_readdir() uses this to map each > directory offset (an loff_t integer) to the memory address of a > struct dentry. 
> > Signed-off-by: Chuck Lever <chuck.lever@oracle.com> > --- > include/linux/shmem_fs.h | 2 > mm/shmem.c | 213 +++++++++++++++++++++++++++++++++++++++++++--- > 2 files changed, 201 insertions(+), 14 deletions(-) > > Changes since RFC: > - Destroy xarray in shmem_destroy_inode() instead of free_in_core_inode() > - A few cosmetic updates > > diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h > index 103d1000a5a2..682ef885aa89 100644 > --- a/include/linux/shmem_fs.h > +++ b/include/linux/shmem_fs.h > @@ -26,6 +26,8 @@ struct shmem_inode_info { > atomic_t stop_eviction; /* hold when working on inode */ > struct timespec64 i_crtime; /* file creation time */ > unsigned int fsflags; /* flags for FS_IOC_[SG]ETFLAGS */ > + struct xarray doff_map; /* dir offset to entry mapping */ > + u32 next_doff; > struct inode vfs_inode; > }; > > diff --git a/mm/shmem.c b/mm/shmem.c > index 448f393d8ab2..ba4176499e5c 100644 > --- a/mm/shmem.c > +++ b/mm/shmem.c > @@ -40,6 +40,8 @@ > #include <linux/fs_parser.h> > #include <linux/swapfile.h> > #include <linux/iversion.h> > +#include <linux/xarray.h> > + > #include "swap.h" > > static struct vfsmount *shm_mnt; > @@ -234,6 +236,7 @@ static const struct super_operations shmem_ops; > const struct address_space_operations shmem_aops; > static const struct file_operations shmem_file_operations; > static const struct inode_operations shmem_inode_operations; > +static const struct file_operations shmem_dir_operations; > static const struct inode_operations shmem_dir_inode_operations; > static const struct inode_operations shmem_special_inode_operations; > static const struct vm_operations_struct shmem_vm_ops; > @@ -2397,7 +2400,9 @@ static struct inode *shmem_get_inode(struct mnt_idmap *idmap, struct super_block > /* Some things misbehave if size == 0 on a directory */ > inode->i_size = 2 * BOGO_DIRENT_SIZE; > inode->i_op = &shmem_dir_inode_operations; > - inode->i_fop = &simple_dir_operations; > + inode->i_fop = 
&shmem_dir_operations; > + xa_init_flags(&info->doff_map, XA_FLAGS_ALLOC1); > + info->next_doff = 0; > break; > case S_IFLNK: > /* > @@ -2917,6 +2922,71 @@ static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) > return 0; > } > > +static struct xarray *shmem_doff_map(struct inode *dir) > +{ > + return &SHMEM_I(dir)->doff_map; > +} > + > +static int shmem_doff_add(struct inode *dir, struct dentry *dentry) > +{ > + struct shmem_inode_info *info = SHMEM_I(dir); > + struct xa_limit limit = XA_LIMIT(2, U32_MAX); > + u32 offset; > + int ret; > + > + if (dentry->d_fsdata) > + return -EBUSY; > + > + offset = 0; > + ret = xa_alloc_cyclic(shmem_doff_map(dir), &offset, dentry, limit, > + &info->next_doff, GFP_KERNEL); > + if (ret < 0) > + return ret; > + > + dentry->d_fsdata = (void *)(unsigned long)offset; > + return 0; > +} > + > +static struct dentry *shmem_doff_find_after(struct dentry *dir, > + unsigned long *offset) > +{ > + struct xarray *xa = shmem_doff_map(d_inode(dir)); > + struct dentry *d, *found = NULL; > + > + spin_lock(&dir->d_lock); > + d = xa_find_after(xa, offset, ULONG_MAX, XA_PRESENT); > + if (d) { > + spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED); > + if (simple_positive(d)) > + found = dget_dlock(d); > + spin_unlock(&d->d_lock); > + } > + spin_unlock(&dir->d_lock); This part is kind of gross, but I think I get it now... You have to take dir->d_lock to ensure that "d" doesn't go away when you don't hold a ref on it, and you need the child's d_lock to ensure that simple_positive result is stable while you take a reference (because doing a dput there could be problematic). If that's right, then that's a bit subtle, and might deserve a nice comment. I do wonder if there is some way to do this with RCU instead, but this seems to work well enough. 
> + return found; > +} > + > +static void shmem_doff_remove(struct inode *dir, struct dentry *dentry) > +{ > + u32 offset = (u32)(unsigned long)dentry->d_fsdata; > + > + if (!offset) > + return; > + > + xa_erase(shmem_doff_map(dir), offset); > + dentry->d_fsdata = NULL; > +} > + > +/* > + * During fs teardown (eg. umount), a directory's doff_map might still > + * contain entries. xa_destroy() cleans out anything that remains. > + */ > +static void shmem_doff_map_destroy(struct inode *inode) > +{ > + struct xarray *xa = shmem_doff_map(inode); > + > + xa_destroy(xa); > +} > + > /* > * File creation. Allocate an inode, and we're done.. > */ > @@ -2938,6 +3008,10 @@ shmem_mknod(struct mnt_idmap *idmap, struct inode *dir, > if (error && error != -EOPNOTSUPP) > goto out_iput; > > + error = shmem_doff_add(dir, dentry); > + if (error) > + goto out_iput; > + > error = 0; > dir->i_size += BOGO_DIRENT_SIZE; > dir->i_ctime = dir->i_mtime = current_time(dir); > @@ -3015,6 +3089,10 @@ static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentr > goto out; > } > > + ret = shmem_doff_add(dir, dentry); > + if (ret) > + goto out; > + > dir->i_size += BOGO_DIRENT_SIZE; > inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode); > inode_inc_iversion(dir); > @@ -3033,6 +3111,8 @@ static int shmem_unlink(struct inode *dir, struct dentry *dentry) > if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode)) > shmem_free_inode(inode->i_sb); > > + shmem_doff_remove(dir, dentry); > + > dir->i_size -= BOGO_DIRENT_SIZE; > inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode); > inode_inc_iversion(dir); > @@ -3091,24 +3171,37 @@ static int shmem_rename2(struct mnt_idmap *idmap, > { > struct inode *inode = d_inode(old_dentry); > int they_are_dirs = S_ISDIR(inode->i_mode); > + int error; > > if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) > return -EINVAL; > > - if (flags & RENAME_EXCHANGE) > + if (flags & RENAME_EXCHANGE) { > + 
shmem_doff_remove(old_dir, old_dentry); > + shmem_doff_remove(new_dir, new_dentry); > + error = shmem_doff_add(new_dir, old_dentry); > + if (error) > + return error; > + error = shmem_doff_add(old_dir, new_dentry); > + if (error) > + return error; > return simple_rename_exchange(old_dir, old_dentry, new_dir, new_dentry); > + } > > if (!simple_empty(new_dentry)) > return -ENOTEMPTY; > > if (flags & RENAME_WHITEOUT) { > - int error; > - > error = shmem_whiteout(idmap, old_dir, old_dentry); > if (error) > return error; > } > > + shmem_doff_remove(old_dir, old_dentry); > + error = shmem_doff_add(new_dir, old_dentry); > + if (error) > + return error; > + > if (d_really_is_positive(new_dentry)) { > (void) shmem_unlink(new_dir, new_dentry); > if (they_are_dirs) { > @@ -3149,26 +3242,22 @@ static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir, > > error = security_inode_init_security(inode, dir, &dentry->d_name, > shmem_initxattrs, NULL); > - if (error && error != -EOPNOTSUPP) { > - iput(inode); > - return error; > - } > + if (error && error != -EOPNOTSUPP) > + goto out_iput; > > inode->i_size = len-1; > if (len <= SHORT_SYMLINK_LEN) { > inode->i_link = kmemdup(symname, len, GFP_KERNEL); > if (!inode->i_link) { > - iput(inode); > - return -ENOMEM; > + error = -ENOMEM; > + goto out_iput; > } > inode->i_op = &shmem_short_symlink_operations; > } else { > inode_nohighmem(inode); > error = shmem_get_folio(inode, 0, &folio, SGP_WRITE); > - if (error) { > - iput(inode); > - return error; > - } > + if (error) > + goto out_iput; > inode->i_mapping->a_ops = &shmem_aops; > inode->i_op = &shmem_symlink_inode_operations; > memcpy(folio_address(folio), symname, len); > @@ -3177,12 +3266,20 @@ static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir, > folio_unlock(folio); > folio_put(folio); > } > + > + error = shmem_doff_add(dir, dentry); > + if (error) > + goto out_iput; > + > dir->i_size += BOGO_DIRENT_SIZE; > dir->i_ctime = dir->i_mtime = current_time(dir); 
> inode_inc_iversion(dir); > d_instantiate(dentry, inode); > dget(dentry); > return 0; > +out_iput: > + iput(inode); > + return error; > } > > static void shmem_put_link(void *arg) > @@ -3224,6 +3321,77 @@ static const char *shmem_get_link(struct dentry *dentry, > return folio_address(folio); > } > > +static loff_t shmem_dir_llseek(struct file *file, loff_t offset, int whence) > +{ > + switch (whence) { > + case SEEK_CUR: > + offset += file->f_pos; > + fallthrough; > + case SEEK_SET: > + if (offset >= 0) > + break; > + fallthrough; > + default: > + return -EINVAL; > + } > + return vfs_setpos(file, offset, U32_MAX); > +} > + > +static bool shmem_dir_emit(struct dir_context *ctx, struct dentry *dentry) > +{ > + struct inode *inode = d_inode(dentry); > + > + return ctx->actor(ctx, dentry->d_name.name, dentry->d_name.len, > + (loff_t)dentry->d_fsdata, inode->i_ino, > + fs_umode_to_dtype(inode->i_mode)); > +} > + > +/** > + * shmem_readdir - Emit entries starting at offset @ctx->pos > + * @file: an open directory to iterate over > + * @ctx: directory iteration context > + * > + * Caller must hold @file's i_rwsem to prevent insertion or removal of > + * entries during this call. > + * > + * On entry, @ctx->pos contains an offset that represents the first entry > + * to be read from the directory. > + * > + * The operation continues until there are no more entries to read, or > + * until the ctx->actor indicates there is no more space in the caller's > + * output buffer. > + * > + * On return, @ctx->pos contains an offset that will read the next entry > + * in this directory when shmem_readdir() is called again with @ctx. > + * > + * Return values: > + * %0 - Complete > + */ > +static int shmem_readdir(struct file *file, struct dir_context *ctx) > +{ > + struct dentry *dentry, *dir = file->f_path.dentry; > + unsigned long offset; > + > + lockdep_assert_held(&d_inode(dir)->i_rwsem); You probably don't need the above. 
This is called via ->iterate_shared so the lock had _better_ be held. > + > + if (!dir_emit_dots(file, ctx)) > + goto out; > + for (offset = ctx->pos - 1; offset < ULONG_MAX - 1;) { > + dentry = shmem_doff_find_after(dir, &offset); > + if (!dentry) > + break; > + if (!shmem_dir_emit(ctx, dentry)) { > + dput(dentry); > + break; > + } > + ctx->pos = offset + 1; > + dput(dentry); > + } > + > +out: > + return 0; > +} > + > #ifdef CONFIG_TMPFS_XATTR > > static int shmem_fileattr_get(struct dentry *dentry, struct fileattr *fa) > @@ -3742,6 +3910,12 @@ static int shmem_show_options(struct seq_file *seq, struct dentry *root) > return 0; > } > > +#else /* CONFIG_TMPFS */ > + > +static inline void shmem_doff_map_destroy(struct inode *dir) > +{ > +} > + > #endif /* CONFIG_TMPFS */ > > static void shmem_put_super(struct super_block *sb) > @@ -3888,6 +4062,8 @@ static void shmem_destroy_inode(struct inode *inode) > { > if (S_ISREG(inode->i_mode)) > mpol_free_shared_policy(&SHMEM_I(inode)->policy); > + if (S_ISDIR(inode->i_mode)) > + shmem_doff_map_destroy(inode); > } > > static void shmem_init_inode(void *foo) > @@ -3955,6 +4131,15 @@ static const struct inode_operations shmem_inode_operations = { > #endif > }; > > +static const struct file_operations shmem_dir_operations = { > +#ifdef CONFIG_TMPFS > + .llseek = shmem_dir_llseek, > + .iterate_shared = shmem_readdir, > +#endif > + .read = generic_read_dir, > + .fsync = noop_fsync, > +}; > + > static const struct inode_operations shmem_dir_inode_operations = { > #ifdef CONFIG_TMPFS > .getattr = shmem_getattr, > > Other than the nits above, this all looks fine to me. I've done some testing with this series too and it all seems to work as expected, and fixes some nasty problems when trying to recursively remove directories via nfsd. Have you done any performance testing? My expectation would be that you'd have roughly similar (or even faster) performance with this set, but at the expense of a bit of memory (for the xarrays). 
One thing we could consider is lifting the bulk of this code into libfs, so other shmem-like filesystems can take advantage of it, but that work could be done later too when we have another proposed consumer.
> On Apr 20, 2023, at 2:52 PM, Jeff Layton <jlayton@kernel.org> wrote: > > On Mon, 2023-04-17 at 15:23 -0400, Chuck Lever wrote: >> From: Chuck Lever <chuck.lever@oracle.com> >> >> The current cursor-based directory cookie mechanism doesn't work >> when a tmpfs filesystem is exported via NFS. This is because NFS >> clients do not open directories: each READDIR operation has to open >> the directory on the server, read it, then close it. The cursor >> state for that directory, being associated strictly with the opened >> struct file, is then discarded. >> >> Directory cookies are cached not only by NFS clients, but also by >> user space libraries on those clients. Essentially there is no way >> to invalidate those caches when directory offsets have changed on >> an NFS server after the offset-to-dentry mapping changes. >> >> The solution we've come up with is to make the directory cookie for >> each file in a tmpfs filesystem stable for the life of the directory >> entry it represents. >> >> Add a per-directory xarray. shmem_readdir() uses this to map each >> directory offset (an loff_t integer) to the memory address of a >> struct dentry. 
>> >> Signed-off-by: Chuck Lever <chuck.lever@oracle.com> >> --- >> include/linux/shmem_fs.h | 2 >> mm/shmem.c | 213 +++++++++++++++++++++++++++++++++++++++++++--- >> 2 files changed, 201 insertions(+), 14 deletions(-) >> >> Changes since RFC: >> - Destroy xarray in shmem_destroy_inode() instead of free_in_core_inode() >> - A few cosmetic updates >> >> diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h >> index 103d1000a5a2..682ef885aa89 100644 >> --- a/include/linux/shmem_fs.h >> +++ b/include/linux/shmem_fs.h >> @@ -26,6 +26,8 @@ struct shmem_inode_info { >> atomic_t stop_eviction; /* hold when working on inode */ >> struct timespec64 i_crtime; /* file creation time */ >> unsigned int fsflags; /* flags for FS_IOC_[SG]ETFLAGS */ >> + struct xarray doff_map; /* dir offset to entry mapping */ >> + u32 next_doff; >> struct inode vfs_inode; >> }; >> >> diff --git a/mm/shmem.c b/mm/shmem.c >> index 448f393d8ab2..ba4176499e5c 100644 >> --- a/mm/shmem.c >> +++ b/mm/shmem.c >> @@ -40,6 +40,8 @@ >> #include <linux/fs_parser.h> >> #include <linux/swapfile.h> >> #include <linux/iversion.h> >> +#include <linux/xarray.h> >> + >> #include "swap.h" >> >> static struct vfsmount *shm_mnt; >> @@ -234,6 +236,7 @@ static const struct super_operations shmem_ops; >> const struct address_space_operations shmem_aops; >> static const struct file_operations shmem_file_operations; >> static const struct inode_operations shmem_inode_operations; >> +static const struct file_operations shmem_dir_operations; >> static const struct inode_operations shmem_dir_inode_operations; >> static const struct inode_operations shmem_special_inode_operations; >> static const struct vm_operations_struct shmem_vm_ops; >> @@ -2397,7 +2400,9 @@ static struct inode *shmem_get_inode(struct mnt_idmap *idmap, struct super_block >> /* Some things misbehave if size == 0 on a directory */ >> inode->i_size = 2 * BOGO_DIRENT_SIZE; >> inode->i_op = &shmem_dir_inode_operations; >> - inode->i_fop = 
&simple_dir_operations; >> + inode->i_fop = &shmem_dir_operations; >> + xa_init_flags(&info->doff_map, XA_FLAGS_ALLOC1); >> + info->next_doff = 0; >> break; >> case S_IFLNK: >> /* >> @@ -2917,6 +2922,71 @@ static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) >> return 0; >> } >> >> +static struct xarray *shmem_doff_map(struct inode *dir) >> +{ >> + return &SHMEM_I(dir)->doff_map; >> +} >> + >> +static int shmem_doff_add(struct inode *dir, struct dentry *dentry) >> +{ >> + struct shmem_inode_info *info = SHMEM_I(dir); >> + struct xa_limit limit = XA_LIMIT(2, U32_MAX); >> + u32 offset; >> + int ret; >> + >> + if (dentry->d_fsdata) >> + return -EBUSY; >> + >> + offset = 0; >> + ret = xa_alloc_cyclic(shmem_doff_map(dir), &offset, dentry, limit, >> + &info->next_doff, GFP_KERNEL); >> + if (ret < 0) >> + return ret; >> + >> + dentry->d_fsdata = (void *)(unsigned long)offset; >> + return 0; >> +} >> + >> +static struct dentry *shmem_doff_find_after(struct dentry *dir, >> + unsigned long *offset) >> +{ >> + struct xarray *xa = shmem_doff_map(d_inode(dir)); >> + struct dentry *d, *found = NULL; >> + >> + spin_lock(&dir->d_lock); >> + d = xa_find_after(xa, offset, ULONG_MAX, XA_PRESENT); >> + if (d) { >> + spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED); >> + if (simple_positive(d)) >> + found = dget_dlock(d); >> + spin_unlock(&d->d_lock); >> + } >> + spin_unlock(&dir->d_lock); > > This part is kind of gross, but I think I get it now... > > You have to take dir->d_lock to ensure that "d" doesn't go away when you > don't hold a ref on it, and you need the child's d_lock to ensure that > simple_positive result is stable while you take a reference (because > doing a dput there could be problematic). If that's right, then that's a > bit subtle, and might deserve a nice comment. > > I do wonder if there is some way to do this with RCU instead, but this > seems to work well enough. I lifted this from fs/libfs.c, fwiw. 
>> + return found; >> +} >> + >> +static void shmem_doff_remove(struct inode *dir, struct dentry *dentry) >> +{ >> + u32 offset = (u32)(unsigned long)dentry->d_fsdata; >> + >> + if (!offset) >> + return; >> + >> + xa_erase(shmem_doff_map(dir), offset); >> + dentry->d_fsdata = NULL; >> +} >> + >> +/* >> + * During fs teardown (eg. umount), a directory's doff_map might still >> + * contain entries. xa_destroy() cleans out anything that remains. >> + */ >> +static void shmem_doff_map_destroy(struct inode *inode) >> +{ >> + struct xarray *xa = shmem_doff_map(inode); >> + >> + xa_destroy(xa); >> +} >> + >> /* >> * File creation. Allocate an inode, and we're done.. >> */ >> @@ -2938,6 +3008,10 @@ shmem_mknod(struct mnt_idmap *idmap, struct inode *dir, >> if (error && error != -EOPNOTSUPP) >> goto out_iput; >> >> + error = shmem_doff_add(dir, dentry); >> + if (error) >> + goto out_iput; >> + >> error = 0; >> dir->i_size += BOGO_DIRENT_SIZE; >> dir->i_ctime = dir->i_mtime = current_time(dir); >> @@ -3015,6 +3089,10 @@ static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentr >> goto out; >> } >> >> + ret = shmem_doff_add(dir, dentry); >> + if (ret) >> + goto out; >> + >> dir->i_size += BOGO_DIRENT_SIZE; >> inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode); >> inode_inc_iversion(dir); >> @@ -3033,6 +3111,8 @@ static int shmem_unlink(struct inode *dir, struct dentry *dentry) >> if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode)) >> shmem_free_inode(inode->i_sb); >> >> + shmem_doff_remove(dir, dentry); >> + >> dir->i_size -= BOGO_DIRENT_SIZE; >> inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode); >> inode_inc_iversion(dir); >> @@ -3091,24 +3171,37 @@ static int shmem_rename2(struct mnt_idmap *idmap, >> { >> struct inode *inode = d_inode(old_dentry); >> int they_are_dirs = S_ISDIR(inode->i_mode); >> + int error; >> >> if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) >> return -EINVAL; >> >> - if (flags 
& RENAME_EXCHANGE) >> + if (flags & RENAME_EXCHANGE) { >> + shmem_doff_remove(old_dir, old_dentry); >> + shmem_doff_remove(new_dir, new_dentry); >> + error = shmem_doff_add(new_dir, old_dentry); >> + if (error) >> + return error; >> + error = shmem_doff_add(old_dir, new_dentry); >> + if (error) >> + return error; >> return simple_rename_exchange(old_dir, old_dentry, new_dir, new_dentry); >> + } >> >> if (!simple_empty(new_dentry)) >> return -ENOTEMPTY; >> >> if (flags & RENAME_WHITEOUT) { >> - int error; >> - >> error = shmem_whiteout(idmap, old_dir, old_dentry); >> if (error) >> return error; >> } >> >> + shmem_doff_remove(old_dir, old_dentry); >> + error = shmem_doff_add(new_dir, old_dentry); >> + if (error) >> + return error; >> + >> if (d_really_is_positive(new_dentry)) { >> (void) shmem_unlink(new_dir, new_dentry); >> if (they_are_dirs) { >> @@ -3149,26 +3242,22 @@ static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir, >> >> error = security_inode_init_security(inode, dir, &dentry->d_name, >> shmem_initxattrs, NULL); >> - if (error && error != -EOPNOTSUPP) { >> - iput(inode); >> - return error; >> - } >> + if (error && error != -EOPNOTSUPP) >> + goto out_iput; >> >> inode->i_size = len-1; >> if (len <= SHORT_SYMLINK_LEN) { >> inode->i_link = kmemdup(symname, len, GFP_KERNEL); >> if (!inode->i_link) { >> - iput(inode); >> - return -ENOMEM; >> + error = -ENOMEM; >> + goto out_iput; >> } >> inode->i_op = &shmem_short_symlink_operations; >> } else { >> inode_nohighmem(inode); >> error = shmem_get_folio(inode, 0, &folio, SGP_WRITE); >> - if (error) { >> - iput(inode); >> - return error; >> - } >> + if (error) >> + goto out_iput; >> inode->i_mapping->a_ops = &shmem_aops; >> inode->i_op = &shmem_symlink_inode_operations; >> memcpy(folio_address(folio), symname, len); >> @@ -3177,12 +3266,20 @@ static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir, >> folio_unlock(folio); >> folio_put(folio); >> } >> + >> + error = shmem_doff_add(dir, 
dentry); >> + if (error) >> + goto out_iput; >> + >> dir->i_size += BOGO_DIRENT_SIZE; >> dir->i_ctime = dir->i_mtime = current_time(dir); >> inode_inc_iversion(dir); >> d_instantiate(dentry, inode); >> dget(dentry); >> return 0; >> +out_iput: >> + iput(inode); >> + return error; >> } >> >> static void shmem_put_link(void *arg) >> @@ -3224,6 +3321,77 @@ static const char *shmem_get_link(struct dentry *dentry, >> return folio_address(folio); >> } >> >> +static loff_t shmem_dir_llseek(struct file *file, loff_t offset, int whence) >> +{ >> + switch (whence) { >> + case SEEK_CUR: >> + offset += file->f_pos; >> + fallthrough; >> + case SEEK_SET: >> + if (offset >= 0) >> + break; >> + fallthrough; >> + default: >> + return -EINVAL; >> + } >> + return vfs_setpos(file, offset, U32_MAX); >> +} >> + >> +static bool shmem_dir_emit(struct dir_context *ctx, struct dentry *dentry) >> +{ >> + struct inode *inode = d_inode(dentry); >> + >> + return ctx->actor(ctx, dentry->d_name.name, dentry->d_name.len, >> + (loff_t)dentry->d_fsdata, inode->i_ino, >> + fs_umode_to_dtype(inode->i_mode)); >> +} >> + >> +/** >> + * shmem_readdir - Emit entries starting at offset @ctx->pos >> + * @file: an open directory to iterate over >> + * @ctx: directory iteration context >> + * >> + * Caller must hold @file's i_rwsem to prevent insertion or removal of >> + * entries during this call. >> + * >> + * On entry, @ctx->pos contains an offset that represents the first entry >> + * to be read from the directory. >> + * >> + * The operation continues until there are no more entries to read, or >> + * until the ctx->actor indicates there is no more space in the caller's >> + * output buffer. >> + * >> + * On return, @ctx->pos contains an offset that will read the next entry >> + * in this directory when shmem_readdir() is called again with @ctx. 
>> + * >> + * Return values: >> + * %0 - Complete >> + */ >> +static int shmem_readdir(struct file *file, struct dir_context *ctx) >> +{ >> + struct dentry *dentry, *dir = file->f_path.dentry; >> + unsigned long offset; >> + >> + lockdep_assert_held(&d_inode(dir)->i_rwsem); > > You probably don't need the above. This is called via ->iterate_shared > so the lock had _better_ be held. True, it's not 100% necessary. I was trying to document the API contract, part of which is "caller needs to hold dir->i_rwsem". This seemed like the most crisp way to do that. >> + >> + if (!dir_emit_dots(file, ctx)) >> + goto out; >> + for (offset = ctx->pos - 1; offset < ULONG_MAX - 1;) { >> + dentry = shmem_doff_find_after(dir, &offset); >> + if (!dentry) >> + break; >> + if (!shmem_dir_emit(ctx, dentry)) { >> + dput(dentry); >> + break; >> + } >> + ctx->pos = offset + 1; >> + dput(dentry); >> + } >> + >> +out: >> + return 0; >> +} >> + >> #ifdef CONFIG_TMPFS_XATTR >> >> static int shmem_fileattr_get(struct dentry *dentry, struct fileattr *fa) >> @@ -3742,6 +3910,12 @@ static int shmem_show_options(struct seq_file *seq, struct dentry *root) >> return 0; >> } >> >> +#else /* CONFIG_TMPFS */ >> + >> +static inline void shmem_doff_map_destroy(struct inode *dir) >> +{ >> +} >> + >> #endif /* CONFIG_TMPFS */ >> >> static void shmem_put_super(struct super_block *sb) >> @@ -3888,6 +4062,8 @@ static void shmem_destroy_inode(struct inode *inode) >> { >> if (S_ISREG(inode->i_mode)) >> mpol_free_shared_policy(&SHMEM_I(inode)->policy); >> + if (S_ISDIR(inode->i_mode)) >> + shmem_doff_map_destroy(inode); >> } >> >> static void shmem_init_inode(void *foo) >> @@ -3955,6 +4131,15 @@ static const struct inode_operations shmem_inode_operations = { >> #endif >> }; >> >> +static const struct file_operations shmem_dir_operations = { >> +#ifdef CONFIG_TMPFS >> + .llseek = shmem_dir_llseek, >> + .iterate_shared = shmem_readdir, >> +#endif >> + .read = generic_read_dir, >> + .fsync = noop_fsync, >> +}; >> + 
>> static const struct inode_operations shmem_dir_inode_operations = { >> #ifdef CONFIG_TMPFS >> .getattr = shmem_getattr, >> >> > > Other than the nits above, this all looks fine to me. I've done some > testing with this series too and it all seems to work as expected, and > fixes some nasty problems when trying to recursively remove directories > via nfsd. Thanks for your review, testing, and suggestions. > Have you done any performance testing? My expectation would be that > you'd have roughly similar (or even faster) performance with this set, > but at the expense of a bit of memory (for the xarrays). I don't have any directory microbenchmarks. I suppose I could do something like timing large software builds. > One thing we could consider is lifting the bulk of this code into libfs, > so other shmem-like filesystems can take advantage of it, but that work > could be done later too when we have another proposed consumer. Eg. autofs. -- Chuck Lever
On Mon, 17 Apr 2023 15:23:10 -0400 Chuck Lever <cel@kernel.org> wrote: > From: Chuck Lever <chuck.lever@oracle.com> > > The current cursor-based directory cookie mechanism doesn't work > when a tmpfs filesystem is exported via NFS. This is because NFS > clients do not open directories: each READDIR operation has to open > the directory on the server, read it, then close it. The cursor > state for that directory, being associated strictly with the opened > struct file, is then discarded. > > Directory cookies are cached not only by NFS clients, but also by > user space libraries on those clients. Essentially there is no way > to invalidate those caches when directory offsets have changed on > an NFS server after the offset-to-dentry mapping changes. > > The solution we've come up with is to make the directory cookie for > each file in a tmpfs filesystem stable for the life of the directory > entry it represents. > > Add a per-directory xarray. shmem_readdir() uses this to map each > directory offset (an loff_t integer) to the memory address of a > struct dentry. > How have people survived for this long with this problem? It's a lot of new code - can we get away with simply disallowing exports of tmpfs? How can we maintain this? Is it possible to come up with a test harness for inclusion in kernel selftests?
> On May 2, 2023, at 8:12 PM, Andrew Morton <akpm@linux-foundation.org> wrote: > > On Mon, 17 Apr 2023 15:23:10 -0400 Chuck Lever <cel@kernel.org> wrote: > >> From: Chuck Lever <chuck.lever@oracle.com> >> >> The current cursor-based directory cookie mechanism doesn't work >> when a tmpfs filesystem is exported via NFS. This is because NFS >> clients do not open directories: each READDIR operation has to open >> the directory on the server, read it, then close it. The cursor >> state for that directory, being associated strictly with the opened >> struct file, is then discarded. >> >> Directory cookies are cached not only by NFS clients, but also by >> user space libraries on those clients. Essentially there is no way >> to invalidate those caches when directory offsets have changed on >> an NFS server after the offset-to-dentry mapping changes. >> >> The solution we've come up with is to make the directory cookie for >> each file in a tmpfs filesystem stable for the life of the directory >> entry it represents. >> >> Add a per-directory xarray. shmem_readdir() uses this to map each >> directory offset (an loff_t integer) to the memory address of a >> struct dentry. >> > > How have people survived for this long with this problem? It's less of a problem without NFS in the picture; local applications can hold the directory open, and that preserves the seek cursor. But you can still trigger it. Also, a plurality of applications are well-behaved in this regard. It's just the more complex and more useful ones (like git) that seem to trigger issues. It became less bearable for NFS because of a recent change on the Linux NFS client to optimize directory read behavior: 85aa8ddc3818 ("NFS: Trigger the "ls -l" readdir heuristic sooner") Trond argued that tmpfs directory cookie behavior has always been problematic (eg broken) therefore this commit does not count as a regression. However, it does make tmpfs exports less usable, breaking some tests that have always worked. 
> It's a lot of new code - I don't feel that this is a lot of new code: include/linux/shmem_fs.h | 2 mm/shmem.c | 213 +++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 201 insertions(+), 14 deletions(-) But I agree it might look a little daunting on first review. I am happy to try to break this single patch up or consider other approaches. We could, for instance, tuck a little more of this into lib/fs. Copying the readdir and directory seeking implementation from simplefs to tmpfs is one reason the insertion count is worrisome. > can we get away with simply disallowing > exports of tmpfs? I think the bottom line is that you /can/ trigger this behavior without NFS, just not as quickly. The threshold is high enough that most use cases aren't bothered by this right now. We'd rather not disallow exporting tmpfs. It's a very good testing platform for us, and disallowing it would be a noticeable regression for some folks. > How can we maintain this? Is it possible to come up with a test > harness for inclusion in kernel selftests? There is very little directory cookie testing that I know of in the obvious place: fstests. That would be where this stuff should be unit tested, IMO. -- Chuck Lever
On Wed, 2023-05-03 at 00:43 +0000, Chuck Lever III wrote: > > > On May 2, 2023, at 8:12 PM, Andrew Morton <akpm@linux-foundation.org> wrote: > > > > On Mon, 17 Apr 2023 15:23:10 -0400 Chuck Lever <cel@kernel.org> wrote: > > > > > From: Chuck Lever <chuck.lever@oracle.com> > > > > > > The current cursor-based directory cookie mechanism doesn't work > > > when a tmpfs filesystem is exported via NFS. This is because NFS > > > clients do not open directories: each READDIR operation has to open > > > the directory on the server, read it, then close it. The cursor > > > state for that directory, being associated strictly with the opened > > > struct file, is then discarded. > > > > > > Directory cookies are cached not only by NFS clients, but also by > > > user space libraries on those clients. Essentially there is no way > > > to invalidate those caches when directory offsets have changed on > > > an NFS server after the offset-to-dentry mapping changes. > > > > > > The solution we've come up with is to make the directory cookie for > > > each file in a tmpfs filesystem stable for the life of the directory > > > entry it represents. > > > > > > Add a per-directory xarray. shmem_readdir() uses this to map each > > > directory offset (an loff_t integer) to the memory address of a > > > struct dentry. > > > > > > > How have people survived for this long with this problem? > > It's less of a problem without NFS in the picture; local > applications can hold the directory open, and that preserves > the seek cursor. But you can still trigger it. > > Also, a plurality of applications are well-behaved in this > regard. It's just the more complex and more useful ones > (like git) that seem to trigger issues. 
> > It became less bearable for NFS because of a recent change > on the Linux NFS client to optimize directory read behavior: > > 85aa8ddc3818 ("NFS: Trigger the "ls -l" readdir heuristic sooner") > > Trond argued that tmpfs directory cookie behavior has always > been problematic (eg broken) therefore this commit does not > count as a regression. However, it does make tmpfs exports > less usable, breaking some tests that have always worked. > > > > It's a lot of new code - > > I don't feel that this is a lot of new code: > > include/linux/shmem_fs.h | 2 > mm/shmem.c | 213 +++++++++++++++++++++++++++++++++++++++++++--- > 2 files changed, 201 insertions(+), 14 deletions(-) > > But I agree it might look a little daunting on first review. > I am happy to try to break this single patch up or consider > other approaches. > I wonder whether you really need an xarray here? dcache_readdir walks the d_subdirs list. We add things to d_subdirs at d_alloc time (and in d_move). If you were to assign its dirindex when the dentry gets added to d_subdirs (maybe in ->d_init?) then you'd have a list already ordered by index, and could deal with missing indexes easily. It's not as efficient as the xarray if you have to seek through a big dir, but if keeping the changes tiny is a goal then that might be another way to do this. > We could, for instance, tuck a little more of this into > lib/fs. Copying the readdir and directory seeking > implementation from simplefs to tmpfs is one reason > the insertion count is worrisome. > > > > can we get away with simply disallowing > > exports of tmpfs? > > I think the bottom line is that you /can/ trigger this > behavior without NFS, just not as quickly. The threshold > is high enough that most use cases aren't bothered by > this right now. > > We'd rather not disallow exporting tmpfs. It's a very > good testing platform for us, and disallowing it would > be a noticeable regression for some folks. > > Yeah, I'd not be in favor of that either. 
We've had an exportable tmpfs for a long time. It's a good way to do testing of the entire NFS server stack, without having to deal with underlying storage. > > How can we maintain this? Is it possible to come up with a test > > harness for inclusion in kernel selftests? > > There is very little directory cookie testing that I know of > in the obvious place: fstests. That would be where this stuff > should be unit tested, IMO. > I'd like to see this too. It's easy for programs to get this wrong. In this case, could we emulate the NFS behavior by doing this in a loop over a large directory? opendir; seekdir (to the result of the last telldir); readdir; unlink; telldir; closedir. At the end of it, check whether there are any entries left over.
On 2 May 2023, at 20:43, Chuck Lever III wrote: >> On May 2, 2023, at 8:12 PM, Andrew Morton <akpm@linux-foundation.org> wrote: >> >> On Mon, 17 Apr 2023 15:23:10 -0400 Chuck Lever <cel@kernel.org> wrote: >> >>> From: Chuck Lever <chuck.lever@oracle.com> >>> >>> The current cursor-based directory cookie mechanism doesn't work >>> when a tmpfs filesystem is exported via NFS. This is because NFS >>> clients do not open directories: each READDIR operation has to open >>> the directory on the server, read it, then close it. The cursor >>> state for that directory, being associated strictly with the opened >>> struct file, is then discarded. >>> >>> Directory cookies are cached not only by NFS clients, but also by >>> user space libraries on those clients. Essentially there is no way >>> to invalidate those caches when directory offsets have changed on >>> an NFS server after the offset-to-dentry mapping changes. >>> >>> The solution we've come up with is to make the directory cookie for >>> each file in a tmpfs filesystem stable for the life of the directory >>> entry it represents. >>> >>> Add a per-directory xarray. shmem_readdir() uses this to map each >>> directory offset (an loff_t integer) to the memory address of a >>> struct dentry. >>> >> >> How have people survived for this long with this problem? They survived this long by not considering their current directory offset to be a stationary position in the stream after removing chunks of that stream, since POSIX leaves the result of readdir() unspecified for entries removed after the directory was opened. However, git does this: opendir; while getdents: unlink(dentries); closedir; assert(directory empty). This pattern isn't guaranteed to always produce an empty directory, and filesystems aren't wrong when it doesn't, but they could probably do better. Libfs, on the other hand, conservatively closes and re-opens the directory after removing some entries in order to ensure none are skipped.
> It's less of a problem without NFS in the picture; local > applications can hold the directory open, and that preserves > the seek cursor. But you can still trigger it. > > Also, a plurality of applications are well-behaved in this > regard. It's just the more complex and more useful ones > (like git) that seem to trigger issues. > > It became less bearable for NFS because of a recent change > on the Linux NFS client to optimize directory read behavior: > > 85aa8ddc3818 ("NFS: Trigger the "ls -l" readdir heuristic sooner") My ears burn again. > Trond argued that tmpfs directory cookie behavior has always > been problematic (eg broken) therefore this commit does not > count as a regression. However, it does make tmpfs exports > less usable, breaking some tests that have always worked. They have always worked only as luck would have it, since on NFS the breakage also depends on the length of the filenames. It's also possible to fix git's remove_dir_recurse(), but making tmpfs have stable directory offsets would be an improvement for everyone, and especially for NFS. >> It's a lot of new code - > > I don't feel that this is a lot of new code: > > include/linux/shmem_fs.h | 2 > mm/shmem.c | 213 +++++++++++++++++++++++++++++++++++++++++++--- > 2 files changed, 201 insertions(+), 14 deletions(-) > > But I agree it might look a little daunting on first review. > I am happy to try to break this single patch up or consider > other approaches. > > We could, for instance, tuck a little more of this into > lib/fs. Copying the readdir and directory seeking > implementation from simplefs to tmpfs is one reason > the insertion count is worrisome. > > >> can we get away with simply disallowing >> exports of tmpfs? > > I think the bottom line is that you /can/ trigger this > behavior without NFS, just not as quickly. The threshold > is high enough that most use cases aren't bothered by > this right now. Yes, you can run into this problem directly on tmpfs. > We'd rather not disallow exporting tmpfs.
It's a very > good testing platform for us, and disallowing it would > be a noticeable regression for some folks. > > >> How can we maintain this? Is it possible to come up with a test >> harness for inclusion in kernel selftests? > > There is very little directory cookie testing that I know of > in the obvious place: fstests. That would be where this stuff > should be unit tested, IMO. Yes, we could write a test, but a test failure shouldn't mean the filesystem is wrong or broken. Ben
Hello, kernel test robot noticed a -18.7% regression of aim9.disk_src.ops_per_sec on: commit: 2976e2b93abcbf19811dc7a444b6df85a520468e ("[PATCH v1] shmem: stable directory cookies") url: https://github.com/intel-lab-lkp/linux/commits/Chuck-Lever/shmem-stable-directory-cookies/20230418-032350 base: https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git 6a8f57ae2eb07ab39a6f0ccad60c760743051026 patch link: https://lore.kernel.org/all/168175931561.2843.16288612382874559384.stgit@manet.1015granger.net/ patch subject: [PATCH v1] shmem: stable directory cookies testcase: aim9 test machine: 48 threads 2 sockets Intel(R) Xeon(R) CPU E5-2697 v2 @ 2.70GHz (Ivy Bridge-EP) with 112G memory parameters: testtime: 300s test: disk_src cpufreq_governor: performance In addition to that, the commit also has significant impact on the following tests: +------------------+----------------------------------------------------------+ | testcase: change | aim9: aim9.disk_src.ops_per_sec -21.1% regression | | test machine | 224 threads 2 sockets (Sapphire Rapids) with 256G memory | | test parameters | cpufreq_governor=performance | | | test=disk_src | | | testtime=300s | +------------------+----------------------------------------------------------+ If you fix the issue, kindly add following tag | Reported-by: kernel test robot <yujie.liu@intel.com> | Link: https://lore.kernel.org/oe-lkp/202305051223.8ef7d7ae-yujie.liu@intel.com Details are as below: ========================================================================================= compiler/cpufreq_governor/kconfig/rootfs/tbox_group/test/testcase/testtime: gcc-11/performance/x86_64-rhel-8.3/debian-11.1-x86_64-20220510.cgz/lkp-ivb-2ep1/disk_src/aim9/300s commit: v6.3-rc7 2976e2b93a ("shmem: stable directory cookies") v6.3-rc7 2976e2b93abcbf19811dc7a444b ---------------- --------------------------- %stddev %change %stddev \ | \ 0.25 ± 7% +0.1 0.36 ± 3% mpstat.cpu.all.soft% 0.61 -0.1 0.52 mpstat.cpu.all.usr% 198823 -18.7% 161675 
aim9.disk_src.ops_per_sec 21274 ± 61% -94.8% 1112 ± 13% aim9.time.involuntary_context_switches 95.00 -4.2% 91.00 aim9.time.percent_of_cpu_this_job_got 72.93 -16.7% 60.78 aim9.time.user_time 23420 +6.0% 24832 proc-vmstat.nr_slab_reclaimable 1374766 ± 27% +400.7% 6883519 proc-vmstat.numa_hit 1331644 ± 28% +413.6% 6839273 proc-vmstat.numa_local 4415141 ± 38% +507.5% 26821043 proc-vmstat.pgalloc_normal 4392173 ± 38% +508.9% 26743904 proc-vmstat.pgfree 10.80 +23.1% 13.29 perf-stat.i.MPKI 2.58 +0.2 2.79 ± 2% perf-stat.i.branch-miss-rate% 19666784 +2.5% 20148875 perf-stat.i.branch-misses 17.41 -2.6 14.86 perf-stat.i.cache-miss-rate% 40153603 +17.7% 47241220 perf-stat.i.cache-references 1.69 +4.7% 1.77 perf-stat.i.cpi 1.073e+09 -7.6% 9.923e+08 ± 9% perf-stat.i.dTLB-loads 0.19 ± 3% -0.0 0.17 ± 4% perf-stat.i.dTLB-store-miss-rate% 1651558 ± 3% -8.7% 1508543 ± 4% perf-stat.i.dTLB-store-misses 8.547e+08 +4.4% 8.927e+08 perf-stat.i.dTLB-stores 0.59 -4.2% 0.57 perf-stat.i.ipc 47.69 -1.4 46.24 perf-stat.i.node-load-miss-rate% 31.75 ± 9% -5.3 26.42 ± 12% perf-stat.i.node-store-miss-rate% 103277 ± 5% +17.5% 121395 ± 5% perf-stat.i.node-stores 10.52 +22.3% 12.86 perf-stat.overall.MPKI 2.68 +0.2 2.88 perf-stat.overall.branch-miss-rate% 17.41 -2.5 14.87 perf-stat.overall.cache-miss-rate% 1.66 +4.3% 1.74 perf-stat.overall.cpi 0.19 ± 3% -0.0 0.17 ± 4% perf-stat.overall.dTLB-store-miss-rate% 0.60 -4.1% 0.58 perf-stat.overall.ipc 45.06 -1.5 43.55 perf-stat.overall.node-load-miss-rate% 40019034 +17.7% 47083587 perf-stat.ps.cache-references 1.07e+09 -7.6% 9.891e+08 ± 9% perf-stat.ps.dTLB-loads 1645993 ± 3% -8.7% 1503507 ± 4% perf-stat.ps.dTLB-store-misses 8.519e+08 +4.4% 8.898e+08 perf-stat.ps.dTLB-stores 102926 ± 5% +17.6% 121068 ± 5% perf-stat.ps.node-stores 0.00 +0.9 0.94 ± 27% perf-profile.calltrace.cycles-pp.kmem_cache_alloc_lru.xas_alloc.xas_create.xas_store.__xa_alloc 0.00 +1.0 1.01 ± 25% perf-profile.calltrace.cycles-pp.xas_alloc.xas_create.xas_store.__xa_alloc.__xa_alloc_cyclic 
0.00 +1.0 1.05 ± 26% perf-profile.calltrace.cycles-pp.kmem_cache_alloc_lru.xas_alloc.xas_expand.xas_create.xas_store 0.00 +1.1 1.12 ± 16% perf-profile.calltrace.cycles-pp.xas_store.__xa_erase.xa_erase.shmem_unlink.vfs_unlink 0.00 +1.1 1.14 ± 24% perf-profile.calltrace.cycles-pp.xas_alloc.xas_expand.xas_create.xas_store.__xa_alloc 0.00 +1.1 1.15 ± 16% perf-profile.calltrace.cycles-pp.__xa_erase.xa_erase.shmem_unlink.vfs_unlink.do_unlinkat 0.00 +1.2 1.21 ± 16% perf-profile.calltrace.cycles-pp.xa_erase.shmem_unlink.vfs_unlink.do_unlinkat.__x64_sys_unlink 1.24 ± 25% +1.2 2.49 ± 15% perf-profile.calltrace.cycles-pp.vfs_unlink.do_unlinkat.__x64_sys_unlink.do_syscall_64.entry_SYSCALL_64_after_hwframe 0.00 +1.3 1.32 ± 21% perf-profile.calltrace.cycles-pp.xas_expand.xas_create.xas_store.__xa_alloc.__xa_alloc_cyclic 0.00 +1.5 1.49 ± 17% perf-profile.calltrace.cycles-pp.shmem_unlink.vfs_unlink.do_unlinkat.__x64_sys_unlink.do_syscall_64 0.00 +2.5 2.49 ± 16% perf-profile.calltrace.cycles-pp.xas_create.xas_store.__xa_alloc.__xa_alloc_cyclic.shmem_doff_add 0.00 +2.6 2.57 ± 16% perf-profile.calltrace.cycles-pp.xas_store.__xa_alloc.__xa_alloc_cyclic.shmem_doff_add.shmem_mknod 0.00 +2.8 2.81 ± 16% perf-profile.calltrace.cycles-pp.__xa_alloc.__xa_alloc_cyclic.shmem_doff_add.shmem_mknod.lookup_open 0.00 +2.8 2.84 ± 15% perf-profile.calltrace.cycles-pp.__xa_alloc_cyclic.shmem_doff_add.shmem_mknod.lookup_open.open_last_lookups 5.77 ± 19% +2.9 8.72 ± 13% perf-profile.calltrace.cycles-pp.open_last_lookups.path_openat.do_filp_open.do_sys_openat2.__x64_sys_creat 0.00 +3.0 2.96 ± 16% perf-profile.calltrace.cycles-pp.shmem_doff_add.shmem_mknod.lookup_open.open_last_lookups.path_openat 2.76 ± 18% +3.0 5.73 ± 14% perf-profile.calltrace.cycles-pp.shmem_mknod.lookup_open.open_last_lookups.path_openat.do_filp_open 5.11 ± 19% +3.0 8.09 ± 13% perf-profile.calltrace.cycles-pp.lookup_open.open_last_lookups.path_openat.do_filp_open.do_sys_openat2 0.05 ± 65% +0.1 0.12 ± 22% 
perf-profile.children.cycles-pp.rcu_nocb_try_bypass 0.02 ±141% +0.1 0.11 ± 32% perf-profile.children.cycles-pp.__unfreeze_partials 0.01 ±223% +0.1 0.11 ± 23% perf-profile.children.cycles-pp.rmqueue 0.09 ± 32% +0.1 0.23 ± 24% perf-profile.children.cycles-pp.rcu_segcblist_enqueue 0.05 ± 77% +0.2 0.20 ± 17% perf-profile.children.cycles-pp.get_page_from_freelist 0.08 ± 33% +0.2 0.26 ± 15% perf-profile.children.cycles-pp.__alloc_pages 0.31 ± 20% +0.5 0.78 ± 21% perf-profile.children.cycles-pp.__slab_free 0.47 ± 22% +0.6 1.08 ± 16% perf-profile.children.cycles-pp.__call_rcu_common 0.00 +0.7 0.67 ± 20% perf-profile.children.cycles-pp.radix_tree_node_rcu_free 0.00 +1.0 1.00 ± 18% perf-profile.children.cycles-pp.radix_tree_node_ctor 0.17 ± 40% +1.1 1.29 ± 15% perf-profile.children.cycles-pp.setup_object 0.00 +1.2 1.15 ± 16% perf-profile.children.cycles-pp.__xa_erase 0.00 +1.2 1.21 ± 16% perf-profile.children.cycles-pp.xa_erase 0.28 ± 41% +1.2 1.50 ± 17% perf-profile.children.cycles-pp.shmem_unlink 0.21 ± 39% +1.2 1.46 ± 15% perf-profile.children.cycles-pp.shuffle_freelist 1.25 ± 25% +1.3 2.50 ± 15% perf-profile.children.cycles-pp.vfs_unlink 0.00 +1.3 1.32 ± 21% perf-profile.children.cycles-pp.xas_expand 0.28 ± 42% +1.5 1.77 ± 14% perf-profile.children.cycles-pp.allocate_slab 0.42 ± 26% +1.6 1.98 ± 13% perf-profile.children.cycles-pp.___slab_alloc 1.23 ± 16% +1.7 2.93 ± 18% perf-profile.children.cycles-pp.rcu_core 2.70 ± 8% +1.7 4.42 ± 9% perf-profile.children.cycles-pp.__do_softirq 1.09 ± 12% +1.7 2.83 ± 19% perf-profile.children.cycles-pp.rcu_do_batch 1.53 ± 20% +2.1 3.60 ± 13% perf-profile.children.cycles-pp.kmem_cache_alloc_lru 2.52 ± 11% +2.1 4.65 ± 8% perf-profile.children.cycles-pp.__irq_exit_rcu 0.00 +2.2 2.16 ± 16% perf-profile.children.cycles-pp.xas_alloc 11.75 ± 10% +2.4 14.15 ± 7% perf-profile.children.cycles-pp.asm_sysvec_apic_timer_interrupt 11.06 ± 10% +2.4 13.48 ± 6% perf-profile.children.cycles-pp.sysvec_apic_timer_interrupt 0.00 +2.5 2.50 ± 16% 
perf-profile.children.cycles-pp.xas_create 0.00 +2.8 2.81 ± 16% perf-profile.children.cycles-pp.__xa_alloc 0.00 +2.8 2.84 ± 15% perf-profile.children.cycles-pp.__xa_alloc_cyclic 5.80 ± 19% +3.0 8.76 ± 13% perf-profile.children.cycles-pp.open_last_lookups 2.78 ± 18% +3.0 5.74 ± 14% perf-profile.children.cycles-pp.shmem_mknod 0.00 +3.0 2.96 ± 16% perf-profile.children.cycles-pp.shmem_doff_add 5.14 ± 19% +3.0 8.12 ± 13% perf-profile.children.cycles-pp.lookup_open 0.00 +3.7 3.70 ± 15% perf-profile.children.cycles-pp.xas_store 0.08 ± 29% +0.1 0.17 ± 22% perf-profile.self.cycles-pp.xas_load 0.03 ±100% +0.1 0.14 ± 17% perf-profile.self.cycles-pp.shuffle_freelist 0.00 +0.1 0.13 ± 19% perf-profile.self.cycles-pp.xas_alloc 0.09 ± 35% +0.1 0.22 ± 23% perf-profile.self.cycles-pp.rcu_segcblist_enqueue 0.00 +0.2 0.15 ± 24% perf-profile.self.cycles-pp.xas_create 0.00 +0.2 0.17 ± 31% perf-profile.self.cycles-pp.xas_expand 0.00 +0.3 0.27 ± 15% perf-profile.self.cycles-pp.xas_store 0.31 ± 18% +0.3 0.62 ± 14% perf-profile.self.cycles-pp.__call_rcu_common 0.17 ± 27% +0.3 0.50 ± 16% perf-profile.self.cycles-pp.kmem_cache_alloc_lru 0.31 ± 19% +0.5 0.76 ± 22% perf-profile.self.cycles-pp.__slab_free 0.00 +0.7 0.66 ± 20% perf-profile.self.cycles-pp.radix_tree_node_rcu_free 0.00 +0.9 0.91 ± 17% perf-profile.self.cycles-pp.radix_tree_node_ctor *************************************************************************************************** lkp-spr-r02: 224 threads 2 sockets (Sapphire Rapids) with 256G memory ========================================================================================= compiler/cpufreq_governor/kconfig/rootfs/tbox_group/test/testcase/testtime: gcc-11/performance/x86_64-rhel-8.3/debian-11.1-x86_64-20220510.cgz/lkp-spr-r02/disk_src/aim9/300s commit: v6.3-rc7 2976e2b93a ("shmem: stable directory cookies") v6.3-rc7 2976e2b93abcbf19811dc7a444b ---------------- --------------------------- %stddev %change %stddev \ | \ 0.54 -8.5% 0.49 turbostat.IPC 2819 +4.4% 2944 
vmstat.system.cs 0.07 +0.0 0.09 mpstat.cpu.all.soft% 0.09 -0.0 0.08 mpstat.cpu.all.usr% 412523 -21.1% 325484 aim9.disk_src.ops_per_sec 816.00 ± 7% +578.0% 5532 ± 4% aim9.time.involuntary_context_switches 92.20 -5.6% 87.00 aim9.time.percent_of_cpu_this_job_got 229.15 -2.7% 222.99 aim9.time.system_time 49.41 -17.9% 40.56 aim9.time.user_time 39515 +5.7% 41749 proc-vmstat.nr_slab_reclaimable 2514403 +186.2% 7196969 proc-vmstat.numa_hit 2311707 +202.6% 6994204 proc-vmstat.numa_local 5423 +7.9% 5849 proc-vmstat.pgactivate 12367005 +303.9% 49944845 proc-vmstat.pgalloc_normal 12324581 +304.3% 49832694 proc-vmstat.pgfree 29.62 ± 31% -29.9% 20.77 ± 15% sched_debug.cfs_rq:/.load_avg.avg 200823 ± 17% -38.0% 124525 ± 9% sched_debug.cfs_rq:/.min_vruntime.max 19592 ± 32% -46.2% 10535 ± 13% sched_debug.cfs_rq:/.min_vruntime.stddev 62.57 ± 12% -17.9% 51.39 ± 9% sched_debug.cfs_rq:/.runnable_avg.avg 102694 ± 30% -63.7% 37282 ± 25% sched_debug.cfs_rq:/.spread0.max 19593 ± 32% -46.2% 10538 ± 13% sched_debug.cfs_rq:/.spread0.stddev 62.54 ± 12% -18.0% 51.27 ± 8% sched_debug.cfs_rq:/.util_avg.avg 1.80 +157.0% 4.63 perf-stat.i.MPKI 1.571e+09 -6.9% 1.462e+09 perf-stat.i.branch-instructions 0.41 +0.0 0.43 perf-stat.i.branch-miss-rate% 6982520 -1.8% 6857934 perf-stat.i.branch-misses 2.64 ± 2% -1.2 1.49 perf-stat.i.cache-miss-rate% 445268 +28.9% 573872 ± 2% perf-stat.i.cache-misses 14884982 +138.0% 35429555 perf-stat.i.cache-references 2696 +5.0% 2830 perf-stat.i.context-switches 0.77 +10.4% 0.85 perf-stat.i.cpi 6.327e+09 +3.6% 6.554e+09 perf-stat.i.cpu-cycles 235.21 +3.5% 243.42 perf-stat.i.cpu-migrations 18266 ± 3% -21.6% 14320 perf-stat.i.cycles-between-cache-misses 2.266e+09 -6.8% 2.113e+09 perf-stat.i.dTLB-loads 1.403e+09 -6.4% 1.313e+09 perf-stat.i.dTLB-stores 8.193e+09 -6.0% 7.702e+09 perf-stat.i.instructions 1.29 -9.4% 1.17 perf-stat.i.ipc 0.03 +3.7% 0.03 perf-stat.i.metric.GHz 66.52 +138.0% 158.31 perf-stat.i.metric.K/sec 23.39 -6.7% 21.82 perf-stat.i.metric.M/sec 153573 ± 2% +40.6% 
215954 ± 2% perf-stat.i.node-load-misses 8731 ± 12% +24.8% 10895 ± 8% perf-stat.i.node-loads 1.82 +153.2% 4.60 perf-stat.overall.MPKI 0.44 +0.0 0.47 perf-stat.overall.branch-miss-rate% 2.99 -1.4 1.62 ± 2% perf-stat.overall.cache-miss-rate% 0.77 +10.2% 0.85 perf-stat.overall.cpi 14209 -19.6% 11428 ± 2% perf-stat.overall.cycles-between-cache-misses 1.30 -9.3% 1.18 perf-stat.overall.ipc 1.565e+09 -6.9% 1.457e+09 perf-stat.ps.branch-instructions 6959348 -1.8% 6835053 perf-stat.ps.branch-misses 443804 +28.8% 571841 ± 2% perf-stat.ps.cache-misses 14834967 +138.0% 35309612 perf-stat.ps.cache-references 2687 +5.0% 2821 perf-stat.ps.context-switches 6.305e+09 +3.6% 6.532e+09 perf-stat.ps.cpu-cycles 234.41 +3.5% 242.58 perf-stat.ps.cpu-migrations 2.259e+09 -6.8% 2.106e+09 perf-stat.ps.dTLB-loads 1.398e+09 -6.4% 1.309e+09 perf-stat.ps.dTLB-stores 8.166e+09 -6.0% 7.676e+09 perf-stat.ps.instructions 153047 ± 2% +40.6% 215202 ± 2% perf-stat.ps.node-load-misses 8703 ± 12% +24.7% 10853 ± 8% perf-stat.ps.node-loads 2.454e+12 -6.0% 2.307e+12 perf-stat.total.instructions 7.47 -1.7 5.77 ± 5% perf-profile.calltrace.cycles-pp.__xstat64 6.81 -1.6 5.26 ± 4% perf-profile.calltrace.cycles-pp.entry_SYSCALL_64_after_hwframe.__xstat64 6.54 -1.5 4.99 ± 5% perf-profile.calltrace.cycles-pp.__do_sys_newstat.do_syscall_64.entry_SYSCALL_64_after_hwframe.__xstat64 6.72 -1.5 5.19 ± 4% perf-profile.calltrace.cycles-pp.do_syscall_64.entry_SYSCALL_64_after_hwframe.__xstat64 5.77 -1.4 4.41 ± 4% perf-profile.calltrace.cycles-pp.vfs_fstatat.__do_sys_newstat.do_syscall_64.entry_SYSCALL_64_after_hwframe.__xstat64 12.55 -1.2 11.32 ± 5% perf-profile.calltrace.cycles-pp.unlink 11.87 -1.0 10.88 ± 5% perf-profile.calltrace.cycles-pp.entry_SYSCALL_64_after_hwframe.unlink 11.66 -1.0 10.70 ± 5% perf-profile.calltrace.cycles-pp.__x64_sys_unlink.do_syscall_64.entry_SYSCALL_64_after_hwframe.unlink 11.80 -1.0 10.84 ± 5% perf-profile.calltrace.cycles-pp.do_syscall_64.entry_SYSCALL_64_after_hwframe.unlink 4.18 -1.0 3.22 ± 
5% perf-profile.calltrace.cycles-pp.vfs_statx.vfs_fstatat.__do_sys_newstat.do_syscall_64.entry_SYSCALL_64_after_hwframe 3.34 -0.9 2.48 ± 6% perf-profile.calltrace.cycles-pp.__close 3.37 -0.8 2.61 ± 5% perf-profile.calltrace.cycles-pp.filename_lookup.vfs_statx.vfs_fstatat.__do_sys_newstat.do_syscall_64 3.34 -0.7 2.60 ± 6% perf-profile.calltrace.cycles-pp.shmem_get_inode.shmem_mknod.lookup_open.open_last_lookups.path_openat 2.50 -0.7 1.78 ± 5% perf-profile.calltrace.cycles-pp.filename_parentat.do_unlinkat.__x64_sys_unlink.do_syscall_64.entry_SYSCALL_64_after_hwframe 3.24 -0.7 2.53 ± 5% perf-profile.calltrace.cycles-pp.path_lookupat.filename_lookup.vfs_statx.vfs_fstatat.__do_sys_newstat 2.40 ± 2% -0.7 1.70 ± 6% perf-profile.calltrace.cycles-pp.path_parentat.filename_parentat.do_unlinkat.__x64_sys_unlink.do_syscall_64 2.71 -0.7 2.06 ± 5% perf-profile.calltrace.cycles-pp.entry_SYSCALL_64_after_hwframe.__close 2.65 -0.6 2.02 ± 5% perf-profile.calltrace.cycles-pp.do_syscall_64.entry_SYSCALL_64_after_hwframe.__close 1.91 ± 2% -0.6 1.34 ± 6% perf-profile.calltrace.cycles-pp.link_path_walk.path_parentat.filename_parentat.do_unlinkat.__x64_sys_unlink 2.59 ± 2% -0.6 2.03 ± 6% perf-profile.calltrace.cycles-pp.new_inode.shmem_get_inode.shmem_mknod.lookup_open.open_last_lookups 2.05 ± 2% -0.5 1.54 ± 7% perf-profile.calltrace.cycles-pp.alloc_inode.new_inode.shmem_get_inode.shmem_mknod.lookup_open 1.96 ± 2% -0.5 1.45 ± 4% perf-profile.calltrace.cycles-pp.syscall_exit_to_user_mode.do_syscall_64.entry_SYSCALL_64_after_hwframe.__close 1.90 -0.5 1.41 ± 4% perf-profile.calltrace.cycles-pp.exit_to_user_mode_loop.exit_to_user_mode_prepare.syscall_exit_to_user_mode.do_syscall_64.entry_SYSCALL_64_after_hwframe 1.92 ± 2% -0.5 1.44 ± 4% perf-profile.calltrace.cycles-pp.exit_to_user_mode_prepare.syscall_exit_to_user_mode.do_syscall_64.entry_SYSCALL_64_after_hwframe.__close 2.26 ± 2% -0.5 1.78 ± 6% 
perf-profile.calltrace.cycles-pp.link_path_walk.path_lookupat.filename_lookup.vfs_statx.vfs_fstatat 1.91 ± 2% -0.5 1.44 ± 8% perf-profile.calltrace.cycles-pp.link_path_walk.path_openat.do_filp_open.do_sys_openat2.__x64_sys_creat 1.78 ± 2% -0.5 1.33 ± 7% perf-profile.calltrace.cycles-pp.shmem_alloc_inode.alloc_inode.new_inode.shmem_get_inode.shmem_mknod 1.77 ± 2% -0.4 1.32 ± 8% perf-profile.calltrace.cycles-pp.kmem_cache_alloc_lru.shmem_alloc_inode.alloc_inode.new_inode.shmem_get_inode 1.70 ± 2% -0.4 1.27 ± 5% perf-profile.calltrace.cycles-pp.task_work_run.exit_to_user_mode_loop.exit_to_user_mode_prepare.syscall_exit_to_user_mode.do_syscall_64 1.81 ± 2% -0.4 1.41 ± 5% perf-profile.calltrace.cycles-pp.d_alloc_parallel.lookup_open.open_last_lookups.path_openat.do_filp_open 1.82 ± 2% -0.4 1.42 ± 6% perf-profile.calltrace.cycles-pp.alloc_empty_file.path_openat.do_filp_open.do_sys_openat2.__x64_sys_creat 0.74 ± 6% -0.4 0.34 ± 81% perf-profile.calltrace.cycles-pp.strncpy_from_user.getname_flags.do_sys_openat2.__x64_sys_creat.do_syscall_64 1.71 -0.4 1.33 ± 7% perf-profile.calltrace.cycles-pp.__alloc_file.alloc_empty_file.path_openat.do_filp_open.do_sys_openat2 0.69 ± 7% -0.4 0.32 ± 82% perf-profile.calltrace.cycles-pp.strncpy_from_user.getname_flags.__x64_sys_unlink.do_syscall_64.entry_SYSCALL_64_after_hwframe 1.63 ± 2% -0.4 1.26 ± 6% perf-profile.calltrace.cycles-pp.dput.do_unlinkat.__x64_sys_unlink.do_syscall_64.entry_SYSCALL_64_after_hwframe 1.64 ± 2% -0.4 1.28 ± 6% perf-profile.calltrace.cycles-pp.evict.do_unlinkat.__x64_sys_unlink.do_syscall_64.entry_SYSCALL_64_after_hwframe 1.33 ± 3% -0.3 1.00 ± 5% perf-profile.calltrace.cycles-pp.getname_flags.vfs_fstatat.__do_sys_newstat.do_syscall_64.entry_SYSCALL_64_after_hwframe 1.50 ± 3% -0.3 1.19 ± 7% perf-profile.calltrace.cycles-pp.d_alloc.d_alloc_parallel.lookup_open.open_last_lookups.path_openat 1.06 ± 6% -0.3 0.77 ± 8% 
perf-profile.calltrace.cycles-pp.getname_flags.do_sys_openat2.__x64_sys_creat.do_syscall_64.entry_SYSCALL_64_after_hwframe 1.18 ± 5% -0.3 0.91 ± 6% perf-profile.calltrace.cycles-pp.do_open.path_openat.do_filp_open.do_sys_openat2.__x64_sys_creat 0.94 -0.3 0.67 ± 8% perf-profile.calltrace.cycles-pp.__fput.task_work_run.exit_to_user_mode_loop.exit_to_user_mode_prepare.syscall_exit_to_user_mode 0.58 ± 3% -0.3 0.32 ± 81% perf-profile.calltrace.cycles-pp.walk_component.link_path_walk.path_lookupat.filename_lookup.vfs_statx 0.70 ± 3% -0.3 0.44 ± 50% perf-profile.calltrace.cycles-pp.cp_new_stat.__do_sys_newstat.do_syscall_64.entry_SYSCALL_64_after_hwframe.__xstat64 1.11 ± 4% -0.2 0.87 ± 6% perf-profile.calltrace.cycles-pp.__d_alloc.d_alloc.d_alloc_parallel.lookup_open.open_last_lookups 1.01 ± 4% -0.2 0.78 ± 6% perf-profile.calltrace.cycles-pp.getname_flags.__x64_sys_unlink.do_syscall_64.entry_SYSCALL_64_after_hwframe.unlink 0.79 ± 6% -0.2 0.56 ± 7% perf-profile.calltrace.cycles-pp.simple_lookup.lookup_open.open_last_lookups.path_openat.do_filp_open 0.93 ± 5% -0.2 0.70 ± 5% perf-profile.calltrace.cycles-pp.strncpy_from_user.getname_flags.vfs_fstatat.__do_sys_newstat.do_syscall_64 1.05 ± 4% -0.2 0.83 ± 9% perf-profile.calltrace.cycles-pp.shmem_evict_inode.evict.do_unlinkat.__x64_sys_unlink.do_syscall_64 1.00 ± 4% -0.2 0.80 ± 6% perf-profile.calltrace.cycles-pp.kmem_cache_alloc_lru.__d_alloc.d_alloc.d_alloc_parallel.lookup_open 0.98 ± 5% -0.2 0.79 ± 4% perf-profile.calltrace.cycles-pp.dentry_kill.dput.do_unlinkat.__x64_sys_unlink.do_syscall_64 0.89 ± 4% -0.2 0.70 ± 10% perf-profile.calltrace.cycles-pp.kmem_cache_alloc.__alloc_file.alloc_empty_file.path_openat.do_filp_open 0.81 ± 4% -0.2 0.63 ± 7% perf-profile.calltrace.cycles-pp.do_dentry_open.do_open.path_openat.do_filp_open.do_sys_openat2 0.82 ± 5% -0.2 0.67 ± 6% perf-profile.calltrace.cycles-pp.__dentry_kill.dentry_kill.dput.do_unlinkat.__x64_sys_unlink 0.56 ± 2% +0.2 0.74 ± 9% 
perf-profile.calltrace.cycles-pp._raw_spin_lock_irq.__hrtimer_run_queues.hrtimer_interrupt.__sysvec_apic_timer_interrupt.sysvec_apic_timer_interrupt 0.87 ± 9% +0.2 1.08 ± 8% perf-profile.calltrace.cycles-pp.rcu_pending.rcu_sched_clock_irq.update_process_times.tick_sched_handle.tick_sched_timer 1.04 ± 6% +0.3 1.30 ± 6% perf-profile.calltrace.cycles-pp.rcu_sched_clock_irq.update_process_times.tick_sched_handle.tick_sched_timer.__hrtimer_run_queues 0.59 ± 7% +0.3 0.86 ± 12% perf-profile.calltrace.cycles-pp.ret_from_fork 0.59 ± 7% +0.3 0.86 ± 12% perf-profile.calltrace.cycles-pp.kthread.ret_from_fork 1.24 ± 9% +0.5 1.70 ± 6% perf-profile.calltrace.cycles-pp._raw_spin_lock.scheduler_tick.update_process_times.tick_sched_handle.tick_sched_timer 0.00 +0.6 0.58 ± 5% perf-profile.calltrace.cycles-pp.rcu_do_batch.rcu_core.__do_softirq.__irq_exit_rcu.sysvec_apic_timer_interrupt 3.79 ± 5% +0.7 4.47 ± 4% perf-profile.calltrace.cycles-pp.scheduler_tick.update_process_times.tick_sched_handle.tick_sched_timer.__hrtimer_run_queues 0.45 ± 50% +0.7 1.19 ± 5% perf-profile.calltrace.cycles-pp.rcu_core.__do_softirq.__irq_exit_rcu.sysvec_apic_timer_interrupt.asm_sysvec_apic_timer_interrupt 1.86 ± 3% +0.8 2.68 ± 4% perf-profile.calltrace.cycles-pp.ct_kernel_exit_state.ct_kernel_enter.ct_idle_exit.cpuidle_enter_state.cpuidle_enter 2.02 ± 3% +0.8 2.87 ± 4% perf-profile.calltrace.cycles-pp.ct_idle_exit.cpuidle_enter_state.cpuidle_enter.cpuidle_idle_call.do_idle 2.00 ± 4% +0.9 2.85 ± 4% perf-profile.calltrace.cycles-pp.ct_kernel_enter.ct_idle_exit.cpuidle_enter_state.cpuidle_enter.cpuidle_idle_call 0.00 +1.0 0.97 ± 16% perf-profile.calltrace.cycles-pp.allocate_slab.___slab_alloc.kmem_cache_alloc_lru.xas_alloc.xas_create 5.77 ± 4% +1.0 6.78 ± 4% perf-profile.calltrace.cycles-pp.update_process_times.tick_sched_handle.tick_sched_timer.__hrtimer_run_queues.hrtimer_interrupt 5.86 ± 4% +1.0 6.88 ± 4% 
perf-profile.calltrace.cycles-pp.tick_sched_handle.tick_sched_timer.__hrtimer_run_queues.hrtimer_interrupt.__sysvec_apic_timer_interrupt 0.00 +1.0 1.04 ± 16% perf-profile.calltrace.cycles-pp.___slab_alloc.kmem_cache_alloc_lru.xas_alloc.xas_create.xas_store 2.49 ± 3% +1.3 3.79 ± 5% perf-profile.calltrace.cycles-pp.vfs_unlink.do_unlinkat.__x64_sys_unlink.do_syscall_64.entry_SYSCALL_64_after_hwframe 0.00 +1.3 1.31 ± 5% perf-profile.calltrace.cycles-pp.__call_rcu_common.xas_store.__xa_erase.xa_erase.shmem_unlink 6.69 ± 6% +1.3 8.01 ± 7% perf-profile.calltrace.cycles-pp.tick_sched_timer.__hrtimer_run_queues.hrtimer_interrupt.__sysvec_apic_timer_interrupt.sysvec_apic_timer_interrupt 0.00 +1.5 1.45 ± 11% perf-profile.calltrace.cycles-pp.kmem_cache_alloc_lru.xas_alloc.xas_create.xas_store.__xa_alloc 0.00 +1.5 1.47 ± 12% perf-profile.calltrace.cycles-pp.allocate_slab.___slab_alloc.kmem_cache_alloc_lru.xas_alloc.xas_expand 9.60 ± 4% +1.6 11.16 ± 6% perf-profile.calltrace.cycles-pp.__hrtimer_run_queues.hrtimer_interrupt.__sysvec_apic_timer_interrupt.sysvec_apic_timer_interrupt.asm_sysvec_apic_timer_interrupt 0.00 +1.6 1.57 ± 10% perf-profile.calltrace.cycles-pp.xas_alloc.xas_create.xas_store.__xa_alloc.__xa_alloc_cyclic 0.00 +1.6 1.59 ± 11% perf-profile.calltrace.cycles-pp.___slab_alloc.kmem_cache_alloc_lru.xas_alloc.xas_expand.xas_create 0.56 ± 2% +1.6 2.19 ± 5% perf-profile.calltrace.cycles-pp.setup_object.shuffle_freelist.allocate_slab.___slab_alloc.kmem_cache_alloc_lru 0.57 +1.8 2.34 ± 6% perf-profile.calltrace.cycles-pp.shmem_unlink.vfs_unlink.do_unlinkat.__x64_sys_unlink.do_syscall_64 0.00 +1.8 1.81 ± 6% perf-profile.calltrace.cycles-pp.xas_store.__xa_erase.xa_erase.shmem_unlink.vfs_unlink 12.59 ± 3% +1.8 14.41 ± 6% perf-profile.calltrace.cycles-pp.hrtimer_interrupt.__sysvec_apic_timer_interrupt.sysvec_apic_timer_interrupt.asm_sysvec_apic_timer_interrupt.cpuidle_enter_state 13.38 ± 3% +1.8 15.23 ± 5% 
perf-profile.calltrace.cycles-pp.__sysvec_apic_timer_interrupt.sysvec_apic_timer_interrupt.asm_sysvec_apic_timer_interrupt.cpuidle_enter_state.cpuidle_enter 0.00 +1.9 1.85 ± 6% perf-profile.calltrace.cycles-pp.__xa_erase.xa_erase.shmem_unlink.vfs_unlink.do_unlinkat 0.00 +1.9 1.92 ± 6% perf-profile.calltrace.cycles-pp.xa_erase.shmem_unlink.vfs_unlink.do_unlinkat.__x64_sys_unlink 0.00 +2.0 1.95 ± 10% perf-profile.calltrace.cycles-pp.kmem_cache_alloc_lru.xas_alloc.xas_expand.xas_create.xas_store 0.00 +2.1 2.08 ± 10% perf-profile.calltrace.cycles-pp.xas_alloc.xas_expand.xas_create.xas_store.__xa_alloc 0.00 +2.1 2.12 ± 5% perf-profile.calltrace.cycles-pp.radix_tree_node_ctor.setup_object.shuffle_freelist.allocate_slab.___slab_alloc 0.00 +2.3 2.29 ± 9% perf-profile.calltrace.cycles-pp.xas_expand.xas_create.xas_store.__xa_alloc.__xa_alloc_cyclic 0.00 +2.3 2.33 ± 4% perf-profile.calltrace.cycles-pp.shuffle_freelist.allocate_slab.___slab_alloc.kmem_cache_alloc_lru.xas_alloc 9.30 +2.4 11.67 ± 5% perf-profile.calltrace.cycles-pp.open_last_lookups.path_openat.do_filp_open.do_sys_openat2.__x64_sys_creat 19.70 +2.4 22.13 ± 4% perf-profile.calltrace.cycles-pp.sysvec_apic_timer_interrupt.asm_sysvec_apic_timer_interrupt.cpuidle_enter_state.cpuidle_enter.cpuidle_idle_call 24.00 +2.4 26.44 ± 4% perf-profile.calltrace.cycles-pp.asm_sysvec_apic_timer_interrupt.cpuidle_enter_state.cpuidle_enter.cpuidle_idle_call.do_idle 7.90 +2.7 10.60 ± 5% perf-profile.calltrace.cycles-pp.lookup_open.open_last_lookups.path_openat.do_filp_open.do_sys_openat2 4.60 +3.5 8.14 ± 5% perf-profile.calltrace.cycles-pp.shmem_mknod.lookup_open.open_last_lookups.path_openat.do_filp_open 50.91 +3.8 54.75 ± 3% perf-profile.calltrace.cycles-pp.secondary_startup_64_no_verify 50.64 +3.9 54.50 ± 4% perf-profile.calltrace.cycles-pp.do_idle.cpu_startup_entry.start_secondary.secondary_startup_64_no_verify 50.66 +3.9 54.52 ± 3% perf-profile.calltrace.cycles-pp.cpu_startup_entry.start_secondary.secondary_startup_64_no_verify 
50.66 +3.9 54.52 ± 3% perf-profile.calltrace.cycles-pp.start_secondary.secondary_startup_64_no_verify 47.04 +3.9 50.90 ± 3% perf-profile.calltrace.cycles-pp.cpuidle_enter.cpuidle_idle_call.do_idle.cpu_startup_entry.start_secondary 50.20 +3.9 54.08 ± 3% perf-profile.calltrace.cycles-pp.cpuidle_idle_call.do_idle.cpu_startup_entry.start_secondary.secondary_startup_64_no_verify 46.16 +3.9 50.04 ± 3% perf-profile.calltrace.cycles-pp.cpuidle_enter_state.cpuidle_enter.cpuidle_idle_call.do_idle.cpu_startup_entry 0.00 +4.1 4.08 ± 5% perf-profile.calltrace.cycles-pp.xas_create.xas_store.__xa_alloc.__xa_alloc_cyclic.shmem_doff_add 0.00 +4.2 4.16 ± 5% perf-profile.calltrace.cycles-pp.xas_store.__xa_alloc.__xa_alloc_cyclic.shmem_doff_add.shmem_mknod 0.00 +4.3 4.31 ± 4% perf-profile.calltrace.cycles-pp.__xa_alloc.__xa_alloc_cyclic.shmem_doff_add.shmem_mknod.lookup_open 0.00 +4.3 4.34 ± 4% perf-profile.calltrace.cycles-pp.__xa_alloc_cyclic.shmem_doff_add.shmem_mknod.lookup_open.open_last_lookups 0.00 +4.5 4.49 ± 5% perf-profile.calltrace.cycles-pp.shmem_doff_add.shmem_mknod.lookup_open.open_last_lookups.path_openat 7.97 -1.8 6.14 ± 5% perf-profile.children.cycles-pp.__xstat64 6.42 -1.6 4.83 ± 6% perf-profile.children.cycles-pp.link_path_walk 6.69 -1.5 5.15 ± 5% perf-profile.children.cycles-pp.__do_sys_newstat 5.93 -1.4 4.57 ± 4% perf-profile.children.cycles-pp.vfs_fstatat 12.81 -1.3 11.52 ± 5% perf-profile.children.cycles-pp.unlink 11.68 -1.0 10.71 ± 5% perf-profile.children.cycles-pp.__x64_sys_unlink 4.32 -1.0 3.37 ± 5% perf-profile.children.cycles-pp.vfs_statx 3.72 ± 2% -0.9 2.79 ± 5% perf-profile.children.cycles-pp.__close 3.56 ± 3% -0.9 2.68 ± 4% perf-profile.children.cycles-pp.getname_flags 3.58 -0.8 2.80 ± 5% perf-profile.children.cycles-pp.filename_lookup 2.96 -0.7 2.22 ± 6% perf-profile.children.cycles-pp.dput 3.49 -0.7 2.74 ± 5% perf-profile.children.cycles-pp.path_lookupat 3.36 -0.7 2.62 ± 6% perf-profile.children.cycles-pp.shmem_get_inode 2.50 -0.7 1.78 ± 5% 
perf-profile.children.cycles-pp.filename_parentat 2.43 ± 2% -0.7 1.72 ± 6% perf-profile.children.cycles-pp.path_parentat 2.44 ± 3% -0.6 1.80 ± 3% perf-profile.children.cycles-pp.strncpy_from_user 2.62 ± 2% -0.6 2.05 ± 6% perf-profile.children.cycles-pp.new_inode 2.31 ± 2% -0.6 1.75 ± 5% perf-profile.children.cycles-pp.syscall_exit_to_user_mode 2.17 ± 2% -0.5 1.65 ± 5% perf-profile.children.cycles-pp.exit_to_user_mode_prepare 2.08 ± 2% -0.5 1.57 ± 7% perf-profile.children.cycles-pp.alloc_inode 2.17 ± 4% -0.5 1.66 ± 5% perf-profile.children.cycles-pp.inode_permission 2.08 ± 2% -0.5 1.59 ± 5% perf-profile.children.cycles-pp.exit_to_user_mode_loop 1.61 ± 2% -0.5 1.14 ± 4% perf-profile.children.cycles-pp.__might_resched 1.79 ± 2% -0.5 1.33 ± 8% perf-profile.children.cycles-pp.shmem_alloc_inode 2.07 ± 2% -0.4 1.62 ± 7% perf-profile.children.cycles-pp.walk_component 2.20 ± 3% -0.4 1.75 ± 5% perf-profile.children.cycles-pp.kmem_cache_alloc 1.88 ± 2% -0.4 1.43 ± 6% perf-profile.children.cycles-pp.task_work_run 1.56 ± 4% -0.4 1.11 ± 8% perf-profile.children.cycles-pp.__entry_text_start 1.86 ± 3% -0.4 1.45 ± 5% perf-profile.children.cycles-pp.d_alloc_parallel 1.88 ± 2% -0.4 1.48 ± 6% perf-profile.children.cycles-pp.alloc_empty_file 1.78 ± 2% -0.4 1.40 ± 7% perf-profile.children.cycles-pp.__alloc_file 1.67 ± 2% -0.4 1.31 ± 6% perf-profile.children.cycles-pp.evict 1.56 -0.3 1.23 ± 8% perf-profile.children.cycles-pp.lookup_fast 1.53 ± 3% -0.3 1.22 ± 7% perf-profile.children.cycles-pp.d_alloc 0.96 ± 4% -0.3 0.67 ± 10% perf-profile.children.cycles-pp.try_to_unlazy 1.25 ± 4% -0.3 0.97 ± 6% perf-profile.children.cycles-pp.do_open 1.11 ± 2% -0.3 0.83 ± 8% perf-profile.children.cycles-pp.__fput 1.16 ± 3% -0.3 0.88 ± 6% perf-profile.children.cycles-pp.generic_permission 1.19 ± 3% -0.3 0.92 ± 7% perf-profile.children.cycles-pp.__d_lookup_rcu 0.85 ± 3% -0.2 0.60 ± 8% perf-profile.children.cycles-pp.__check_object_size 0.73 ± 2% -0.2 0.49 ± 11% 
perf-profile.children.cycles-pp.complete_walk 0.94 ± 5% -0.2 0.71 ± 8% perf-profile.children.cycles-pp.__cond_resched 1.13 ± 4% -0.2 0.91 ± 6% perf-profile.children.cycles-pp.__d_alloc 1.07 ± 3% -0.2 0.85 ± 8% perf-profile.children.cycles-pp.shmem_evict_inode 0.79 ± 6% -0.2 0.57 ± 6% perf-profile.children.cycles-pp.simple_lookup 0.77 ± 3% -0.2 0.55 ± 8% perf-profile.children.cycles-pp.lockref_put_or_lock 0.75 ± 6% -0.2 0.53 ± 11% perf-profile.children.cycles-pp.__legitimize_path 0.74 ± 8% -0.2 0.53 ± 6% perf-profile.children.cycles-pp.__d_add 0.80 ± 9% -0.2 0.59 ± 2% perf-profile.children.cycles-pp.step_into 0.88 ± 4% -0.2 0.68 ± 8% perf-profile.children.cycles-pp.do_dentry_open 1.01 ± 5% -0.2 0.82 ± 4% perf-profile.children.cycles-pp.dentry_kill 0.55 ± 3% -0.2 0.38 ± 13% perf-profile.children.cycles-pp.inode_init_once 0.74 ± 4% -0.2 0.57 ± 8% perf-profile.children.cycles-pp.cp_new_stat 0.69 ± 3% -0.2 0.52 ± 11% perf-profile.children.cycles-pp.entry_SYSRETQ_unsafe_stack 0.65 ± 12% -0.2 0.49 ± 11% perf-profile.children.cycles-pp.security_file_alloc 0.87 ± 5% -0.2 0.71 ± 6% perf-profile.children.cycles-pp.__dentry_kill 0.59 ± 5% -0.1 0.45 ± 3% perf-profile.children.cycles-pp.down_write 0.70 ± 5% -0.1 0.56 ± 12% perf-profile.children.cycles-pp.scramble 0.46 ± 5% -0.1 0.31 ± 8% perf-profile.children.cycles-pp.make_vfsuid 0.66 ± 6% -0.1 0.52 ± 10% perf-profile.children.cycles-pp._IO_fgets 0.49 ± 7% -0.1 0.36 ± 8% perf-profile.children.cycles-pp.rcu_all_qs 0.53 ± 4% -0.1 0.39 ± 8% perf-profile.children.cycles-pp._IO_getline_info 0.51 ± 6% -0.1 0.37 ± 9% perf-profile.children.cycles-pp.fsnotify_destroy_marks 0.54 ± 6% -0.1 0.41 ± 6% perf-profile.children.cycles-pp.fsnotify 0.46 ± 5% -0.1 0.34 ± 9% perf-profile.children.cycles-pp.security_inode_permission 0.63 ± 6% -0.1 0.51 ± 10% perf-profile.children.cycles-pp.shmem_undo_range 0.46 ± 7% -0.1 0.34 ± 9% perf-profile.children.cycles-pp.fsnotify_grab_connector 0.65 ± 2% -0.1 0.53 ± 6% 
perf-profile.children.cycles-pp.__might_sleep 0.45 ± 4% -0.1 0.33 ± 9% perf-profile.children.cycles-pp.dentry_unlink_inode 0.46 ± 12% -0.1 0.35 ± 16% perf-profile.children.cycles-pp.mnt_want_write 0.44 ± 5% -0.1 0.34 ± 10% perf-profile.children.cycles-pp.iput 0.46 ± 9% -0.1 0.36 ± 7% perf-profile.children.cycles-pp.destroy_inode 0.48 ± 8% -0.1 0.38 ± 9% perf-profile.children.cycles-pp._copy_to_user 0.42 ± 4% -0.1 0.31 ± 12% perf-profile.children.cycles-pp.terminate_walk 0.38 ± 10% -0.1 0.27 ± 15% perf-profile.children.cycles-pp.__legitimize_mnt 0.32 ± 6% -0.1 0.23 ± 9% perf-profile.children.cycles-pp.lockref_get_not_dead 0.43 ± 8% -0.1 0.34 ± 15% perf-profile.children.cycles-pp.filp_close 0.41 ± 8% -0.1 0.32 ± 7% perf-profile.children.cycles-pp.__destroy_inode 0.35 ± 2% -0.1 0.26 ± 4% perf-profile.children.cycles-pp.__might_fault 0.35 ± 9% -0.1 0.26 ± 18% perf-profile.children.cycles-pp.__mnt_want_write 0.39 ± 3% -0.1 0.31 ± 14% perf-profile.children.cycles-pp.obj_cgroup_charge 0.31 ± 5% -0.1 0.22 ± 21% perf-profile.children.cycles-pp.memcg_list_lru_alloc 0.36 ± 6% -0.1 0.27 ± 9% perf-profile.children.cycles-pp._atomic_dec_and_lock 0.35 ± 10% -0.1 0.27 ± 6% perf-profile.children.cycles-pp.path_init 0.32 ± 8% -0.1 0.25 ± 10% perf-profile.children.cycles-pp.alloc_fd 0.22 ± 12% -0.1 0.15 ± 21% perf-profile.children.cycles-pp.__virt_addr_valid 0.44 ± 9% -0.1 0.36 ± 13% perf-profile.children.cycles-pp.copy_user_enhanced_fast_string 0.37 ± 8% -0.1 0.29 ± 6% perf-profile.children.cycles-pp.simple_acl_create 0.23 ± 9% -0.1 0.16 ± 7% perf-profile.children.cycles-pp.set_cached_acl 0.23 ± 12% -0.1 0.16 ± 9% perf-profile.children.cycles-pp.shmem_getattr 0.43 ± 6% -0.1 0.36 ± 9% perf-profile.children.cycles-pp.inode_maybe_inc_iversion 0.30 ± 6% -0.1 0.23 ± 6% perf-profile.children.cycles-pp.refill_obj_stock 0.17 ± 12% -0.1 0.10 ± 8% perf-profile.children.cycles-pp.__d_rehash 0.30 ± 9% -0.1 0.23 ± 14% perf-profile.children.cycles-pp.up_write 0.23 ± 10% -0.1 0.16 ± 11% 
perf-profile.children.cycles-pp.d_delete 0.21 ± 7% -0.1 0.15 ± 16% perf-profile.children.cycles-pp.xa_load 0.26 ± 10% -0.1 0.19 ± 13% perf-profile.children.cycles-pp.__filemap_get_folio 0.22 ± 13% -0.1 0.16 ± 9% perf-profile.children.cycles-pp.__srcu_read_unlock 0.18 ± 13% -0.1 0.12 ± 19% perf-profile.children.cycles-pp.map_id_up 0.25 ± 7% -0.1 0.19 ± 15% perf-profile.children.cycles-pp.apparmor_file_alloc_security 0.29 ± 12% -0.1 0.22 ± 6% perf-profile.children.cycles-pp.d_lookup 0.26 ± 7% -0.1 0.20 ± 5% perf-profile.children.cycles-pp.__lookup_hash 0.25 ± 6% -0.1 0.19 ± 19% perf-profile.children.cycles-pp.path_put 0.23 ± 10% -0.1 0.17 ± 13% perf-profile.children.cycles-pp.__d_instantiate 0.63 ± 5% -0.1 0.57 ± 5% perf-profile.children.cycles-pp.dsearch 0.17 ± 6% -0.1 0.12 ± 8% perf-profile.children.cycles-pp._get_random_bytes 0.13 ± 12% -0.1 0.08 ± 9% perf-profile.children.cycles-pp.find_lock_entries 0.23 ± 8% -0.0 0.18 ± 2% perf-profile.children.cycles-pp.lookup_dcache 0.16 ± 8% -0.0 0.11 ± 3% perf-profile.children.cycles-pp.syscall_return_via_sysret 0.19 ± 9% -0.0 0.14 ± 5% perf-profile.children.cycles-pp.shmem_file_read_iter 0.14 ± 16% -0.0 0.09 ± 4% perf-profile.children.cycles-pp.generic_fillattr 0.21 ± 9% -0.0 0.17 ± 8% perf-profile.children.cycles-pp.chdir 0.19 ± 6% -0.0 0.15 ± 18% perf-profile.children.cycles-pp.security_file_open 0.10 ± 14% -0.0 0.06 ± 12% perf-profile.children.cycles-pp.crng_make_state 0.24 ± 9% -0.0 0.20 ± 7% perf-profile.children.cycles-pp.may_delete 0.18 ± 8% -0.0 0.14 ± 16% perf-profile.children.cycles-pp.inode_wait_for_writeback 0.12 ± 9% -0.0 0.08 ± 14% perf-profile.children.cycles-pp.entry_SYSCALL_64_safe_stack 0.19 ± 11% -0.0 0.16 ± 13% perf-profile.children.cycles-pp.__fsnotify_parent 0.17 ± 5% -0.0 0.13 ± 10% perf-profile.children.cycles-pp.ihold 0.14 ± 3% -0.0 0.11 ± 7% perf-profile.children.cycles-pp.chacha_block_generic 0.13 ± 14% -0.0 0.09 ± 12% perf-profile.children.cycles-pp.tsc_verify_tsc_adjust 0.13 ± 3% -0.0 0.09 ± 10% 
perf-profile.children.cycles-pp.chacha_permute 0.09 ± 4% -0.0 0.06 ± 12% perf-profile.children.cycles-pp.putname 0.09 ± 16% -0.0 0.06 ± 14% perf-profile.children.cycles-pp.crng_fast_key_erasure 0.15 ± 11% -0.0 0.12 ± 6% perf-profile.children.cycles-pp.mntput_no_expire 0.08 ± 13% -0.0 0.05 ± 52% perf-profile.children.cycles-pp.getcwd 0.11 ± 11% -0.0 0.08 ± 12% perf-profile.children.cycles-pp.security_path_mknod 0.18 ± 9% -0.0 0.15 ± 4% perf-profile.children.cycles-pp.___d_drop 0.09 ± 10% -0.0 0.07 ± 5% perf-profile.children.cycles-pp.security_file_free 0.10 ± 4% -0.0 0.08 ± 13% perf-profile.children.cycles-pp.security_path_unlink 0.16 ± 4% -0.0 0.14 ± 7% perf-profile.children.cycles-pp._find_next_and_bit 0.09 ± 8% -0.0 0.07 ± 11% perf-profile.children.cycles-pp.drop_nlink 0.08 ± 7% -0.0 0.06 ± 12% perf-profile.children.cycles-pp.apparmor_path_unlink 0.13 ± 8% +0.0 0.15 ± 6% perf-profile.children.cycles-pp.idle_cpu 0.08 ± 13% +0.0 0.10 ± 9% perf-profile.children.cycles-pp.map__process_kallsym_symbol 0.04 ± 51% +0.0 0.07 ± 11% perf-profile.children.cycles-pp.do_read_fault 0.03 ± 82% +0.0 0.07 ± 7% perf-profile.children.cycles-pp.write_cache 0.05 ± 50% +0.0 0.08 ± 13% perf-profile.children.cycles-pp.update_wall_time 0.05 ± 50% +0.0 0.08 ± 13% perf-profile.children.cycles-pp.timekeeping_advance 0.18 ± 11% +0.0 0.22 ± 7% perf-profile.children.cycles-pp.update_curr 0.18 ± 6% +0.0 0.22 ± 8% perf-profile.children.cycles-pp.force_qs_rnp 0.16 ± 15% +0.0 0.20 ± 7% perf-profile.children.cycles-pp._raw_spin_unlock_irqrestore 0.13 ± 17% +0.0 0.18 ± 9% perf-profile.children.cycles-pp.task_tick_fair 0.03 ± 82% +0.1 0.09 ± 9% perf-profile.children.cycles-pp.rcu_segcblist_pend_cbs 0.00 +0.1 0.06 ± 12% perf-profile.children.cycles-pp.rmqueue_bulk 0.00 +0.1 0.08 ± 13% perf-profile.children.cycles-pp.free_pcppages_bulk 0.67 ± 7% +0.1 0.75 ± 4% perf-profile.children.cycles-pp.sched_clock_cpu 0.86 ± 5% +0.1 0.94 perf-profile.children.cycles-pp.native_sched_clock 0.06 ± 12% +0.1 0.14 ± 16% 
perf-profile.children.cycles-pp.__unfreeze_partials 0.00 +0.1 0.08 ± 19% perf-profile.children.cycles-pp.rmqueue 0.06 ± 15% +0.1 0.15 ± 11% perf-profile.children.cycles-pp.free_unref_page 0.00 +0.1 0.09 ± 14% perf-profile.children.cycles-pp.inc_slabs_node 0.00 +0.1 0.09 ± 22% perf-profile.children.cycles-pp.xas_clear_mark 0.01 ±200% +0.1 0.11 ± 17% perf-profile.children.cycles-pp.get_page_from_freelist 0.82 ± 3% +0.1 0.92 ± 5% perf-profile.children.cycles-pp._raw_spin_lock_irq 0.02 ±123% +0.1 0.13 ± 17% perf-profile.children.cycles-pp.__alloc_pages 0.40 ± 5% +0.2 0.56 ± 7% perf-profile.children.cycles-pp.note_gp_changes 0.94 ± 6% +0.2 1.10 ± 6% perf-profile.children.cycles-pp._raw_spin_lock_irqsave 0.15 ± 11% +0.2 0.31 ± 5% perf-profile.children.cycles-pp.rcu_nocb_try_bypass 0.28 ± 11% +0.2 0.48 ± 12% perf-profile.children.cycles-pp.rcu_segcblist_enqueue 0.90 ± 9% +0.2 1.10 ± 7% perf-profile.children.cycles-pp.rcu_pending 0.24 ± 21% +0.2 0.46 ± 22% perf-profile.children.cycles-pp.smpboot_thread_fn 0.01 ±200% +0.2 0.26 ± 36% perf-profile.children.cycles-pp.run_ksoftirqd 1.07 ± 6% +0.3 1.32 ± 6% perf-profile.children.cycles-pp.rcu_sched_clock_irq 0.60 ± 7% +0.3 0.87 ± 12% perf-profile.children.cycles-pp.ret_from_fork 0.59 ± 7% +0.3 0.86 ± 12% perf-profile.children.cycles-pp.kthread 0.00 +0.7 0.66 ± 5% perf-profile.children.cycles-pp.radix_tree_node_rcu_free 4.01 ± 6% +0.7 4.74 ± 4% perf-profile.children.cycles-pp.scheduler_tick 1.93 ± 3% +0.8 2.77 ± 4% perf-profile.children.cycles-pp.ct_kernel_exit_state 2.04 ± 4% +0.8 2.89 ± 4% perf-profile.children.cycles-pp.ct_idle_exit 2.01 ± 4% +0.8 2.86 ± 4% perf-profile.children.cycles-pp.ct_kernel_enter 5.98 ± 4% +1.1 7.05 ± 3% perf-profile.children.cycles-pp.update_process_times 6.05 ± 4% +1.1 7.13 ± 3% perf-profile.children.cycles-pp.tick_sched_handle 1.04 ± 6% +1.2 2.21 ± 3% perf-profile.children.cycles-pp.__call_rcu_common 2.54 ± 3% +1.3 3.83 ± 5% perf-profile.children.cycles-pp.vfs_unlink 6.92 ± 6% +1.4 8.29 ± 7% 
perf-profile.children.cycles-pp.tick_sched_timer 9.90 ± 4% +1.6 11.52 ± 6% perf-profile.children.cycles-pp.__hrtimer_run_queues 1.09 ± 6% +1.7 2.79 ± 2% perf-profile.children.cycles-pp.__slab_free 0.58 ± 2% +1.8 2.35 ± 6% perf-profile.children.cycles-pp.shmem_unlink 0.00 +1.9 1.86 ± 7% perf-profile.children.cycles-pp.__xa_erase 12.88 ± 3% +1.9 14.77 ± 6% perf-profile.children.cycles-pp.hrtimer_interrupt 13.66 ± 3% +1.9 15.57 ± 5% perf-profile.children.cycles-pp.__sysvec_apic_timer_interrupt 0.00 +1.9 1.93 ± 6% perf-profile.children.cycles-pp.xa_erase 0.57 ± 3% +2.0 2.62 ± 5% perf-profile.children.cycles-pp.setup_object 0.00 +2.1 2.14 ± 6% perf-profile.children.cycles-pp.radix_tree_node_ctor 0.61 ± 3% +2.2 2.77 ± 5% perf-profile.children.cycles-pp.shuffle_freelist 0.65 ± 3% +2.3 2.91 ± 5% perf-profile.children.cycles-pp.allocate_slab 0.00 +2.3 2.30 ± 9% perf-profile.children.cycles-pp.xas_expand 9.38 +2.4 11.73 ± 5% perf-profile.children.cycles-pp.open_last_lookups 7.58 ± 5% +2.4 9.98 ± 2% perf-profile.children.cycles-pp.__irq_exit_rcu 0.81 ± 5% +2.4 3.23 ± 4% perf-profile.children.cycles-pp.___slab_alloc 7.16 ± 6% +2.6 9.76 ± 2% perf-profile.children.cycles-pp.__do_softirq 3.16 ± 3% +2.6 5.78 ± 2% perf-profile.children.cycles-pp.rcu_do_batch 7.96 +2.7 10.65 ± 5% perf-profile.children.cycles-pp.lookup_open 3.62 ± 3% +2.8 6.41 perf-profile.children.cycles-pp.rcu_core 2.81 +2.8 5.61 ± 5% perf-profile.children.cycles-pp.kmem_cache_alloc_lru 4.64 +3.5 8.16 ± 5% perf-profile.children.cycles-pp.shmem_mknod 0.00 +3.7 3.68 ± 5% perf-profile.children.cycles-pp.xas_alloc 50.91 +3.8 54.75 ± 3% perf-profile.children.cycles-pp.secondary_startup_64_no_verify 50.91 +3.8 54.75 ± 3% perf-profile.children.cycles-pp.cpu_startup_entry 50.91 +3.8 54.75 ± 3% perf-profile.children.cycles-pp.do_idle 50.66 +3.9 54.52 ± 3% perf-profile.children.cycles-pp.start_secondary 47.29 +3.9 51.14 ± 3% perf-profile.children.cycles-pp.cpuidle_enter 47.24 +3.9 51.10 ± 3% 
perf-profile.children.cycles-pp.cpuidle_enter_state 50.49 +3.9 54.37 ± 3% perf-profile.children.cycles-pp.cpuidle_idle_call 0.00 +4.1 4.10 ± 5% perf-profile.children.cycles-pp.xas_create 0.00 +4.3 4.32 ± 5% perf-profile.children.cycles-pp.__xa_alloc 0.00 +4.4 4.35 ± 4% perf-profile.children.cycles-pp.__xa_alloc_cyclic 23.08 +4.4 27.48 ± 3% perf-profile.children.cycles-pp.sysvec_apic_timer_interrupt 25.36 +4.4 29.77 ± 3% perf-profile.children.cycles-pp.asm_sysvec_apic_timer_interrupt 0.00 +4.5 4.51 ± 5% perf-profile.children.cycles-pp.shmem_doff_add 0.00 +6.0 6.01 ± 5% perf-profile.children.cycles-pp.xas_store 1.77 -0.6 1.22 ± 8% perf-profile.self.cycles-pp.link_path_walk 1.34 ± 2% -0.4 0.90 ± 6% perf-profile.self.cycles-pp.__might_resched 1.29 ± 4% -0.4 0.92 ± 3% perf-profile.self.cycles-pp.strncpy_from_user 0.97 ± 3% -0.3 0.69 ± 6% perf-profile.self.cycles-pp.__d_lookup_rcu 0.62 ± 3% -0.2 0.41 ± 11% perf-profile.self.cycles-pp.lockref_put_or_lock 0.68 ± 8% -0.2 0.47 ± 3% perf-profile.self.cycles-pp.step_into 0.84 ± 5% -0.2 0.64 ± 9% perf-profile.self.cycles-pp.mod_objcg_state 0.75 ± 4% -0.2 0.56 ± 8% perf-profile.self.cycles-pp.kmem_cache_alloc 0.78 ± 4% -0.2 0.58 ± 9% perf-profile.self.cycles-pp.generic_permission 0.40 ± 5% -0.2 0.22 ± 14% perf-profile.self.cycles-pp.inode_init_once 0.79 ± 5% -0.2 0.61 ± 7% perf-profile.self.cycles-pp.inode_permission 0.68 ± 2% -0.2 0.51 ± 10% perf-profile.self.cycles-pp.entry_SYSRETQ_unsafe_stack 0.60 ± 5% -0.1 0.45 ± 12% perf-profile.self.cycles-pp.scramble 0.49 ± 3% -0.1 0.35 ± 6% perf-profile.self.cycles-pp._IO_getline_info 0.50 ± 6% -0.1 0.36 ± 15% perf-profile.self.cycles-pp.walk_component 0.59 ± 5% -0.1 0.45 ± 11% perf-profile.self.cycles-pp._IO_fgets 0.40 ± 5% -0.1 0.27 ± 7% perf-profile.self.cycles-pp.creat64 0.35 ± 9% -0.1 0.22 ± 4% perf-profile.self.cycles-pp.__close 0.49 ± 6% -0.1 0.36 ± 5% perf-profile.self.cycles-pp.fsnotify 0.40 ± 5% -0.1 0.27 ± 10% perf-profile.self.cycles-pp.__xstat64 0.34 ± 10% -0.1 0.22 ± 6% 
perf-profile.self.cycles-pp.make_vfsuid 0.34 ± 9% -0.1 0.23 ± 16% perf-profile.self.cycles-pp.unlink 0.46 ± 5% -0.1 0.35 ± 10% perf-profile.self.cycles-pp.__cond_resched 0.32 ± 8% -0.1 0.21 ± 12% perf-profile.self.cycles-pp.do_dentry_open 0.32 ± 5% -0.1 0.21 ± 9% perf-profile.self.cycles-pp.__check_object_size 0.51 ± 3% -0.1 0.40 ± 5% perf-profile.self.cycles-pp.__might_sleep 0.33 ± 11% -0.1 0.22 ± 19% perf-profile.self.cycles-pp.__legitimize_mnt 0.37 ± 2% -0.1 0.28 ± 10% perf-profile.self.cycles-pp.security_inode_permission 0.30 ± 11% -0.1 0.20 ± 7% perf-profile.self.cycles-pp.path_init 0.39 ± 7% -0.1 0.30 ± 10% perf-profile.self.cycles-pp.inode_maybe_inc_iversion 0.30 ± 5% -0.1 0.21 ± 5% perf-profile.self.cycles-pp.rcu_all_qs 0.29 ± 3% -0.1 0.20 ± 14% perf-profile.self.cycles-pp.entry_SYSCALL_64_after_hwframe 0.31 ± 10% -0.1 0.23 ± 16% perf-profile.self.cycles-pp.__mnt_want_write 0.31 ± 8% -0.1 0.22 ± 7% perf-profile.self.cycles-pp.dput 0.41 ± 10% -0.1 0.33 ± 12% perf-profile.self.cycles-pp.copy_user_enhanced_fast_string 0.27 ± 11% -0.1 0.19 ± 6% perf-profile.self.cycles-pp.d_alloc_parallel 0.26 ± 5% -0.1 0.18 ± 9% perf-profile.self.cycles-pp.lockref_get_not_dead 0.21 ± 6% -0.1 0.14 ± 6% perf-profile.self.cycles-pp.lookup_open 0.21 ± 9% -0.1 0.14 ± 9% perf-profile.self.cycles-pp.set_cached_acl 0.28 ± 8% -0.1 0.21 ± 5% perf-profile.self.cycles-pp.refill_obj_stock 0.26 ± 8% -0.1 0.19 ± 14% perf-profile.self.cycles-pp.up_write 0.20 ± 14% -0.1 0.13 ± 14% perf-profile.self.cycles-pp.shmem_get_inode 0.15 ± 14% -0.1 0.09 ± 11% perf-profile.self.cycles-pp.__d_rehash 0.18 ± 10% -0.1 0.11 ± 17% perf-profile.self.cycles-pp.__virt_addr_valid 0.15 ± 13% -0.1 0.08 ± 14% perf-profile.self.cycles-pp.map_id_up 0.17 ± 10% -0.1 0.11 ± 21% perf-profile.self.cycles-pp.do_syscall_64 0.22 ± 8% -0.1 0.16 ± 10% perf-profile.self.cycles-pp._atomic_dec_and_lock 0.23 ± 12% -0.1 0.17 ± 5% perf-profile.self.cycles-pp._IO_default_xsputn 0.16 ± 10% -0.1 0.10 ± 16% 
perf-profile.self.cycles-pp.shmem_evict_inode 0.16 ± 8% -0.1 0.11 ± 12% perf-profile.self.cycles-pp.__filemap_get_folio 0.19 ± 16% -0.1 0.14 ± 13% perf-profile.self.cycles-pp.__srcu_read_unlock 0.16 ± 13% -0.1 0.11 ± 6% perf-profile.self.cycles-pp.vfs_unlink 0.16 ± 8% -0.1 0.11 ± 4% perf-profile.self.cycles-pp.syscall_return_via_sysret 0.23 ± 10% -0.1 0.18 ± 12% perf-profile.self.cycles-pp.getname_flags 0.17 ± 6% -0.0 0.12 ± 19% perf-profile.self.cycles-pp.apparmor_file_open 0.17 ± 13% -0.0 0.12 ± 10% perf-profile.self.cycles-pp.__srcu_read_lock 0.12 ± 13% -0.0 0.07 ± 25% perf-profile.self.cycles-pp.fput 0.08 ± 7% -0.0 0.03 ± 82% perf-profile.self.cycles-pp.putname 0.20 ± 7% -0.0 0.16 ± 3% perf-profile.self.cycles-pp.__fput 0.15 ± 12% -0.0 0.11 ± 22% perf-profile.self.cycles-pp.apparmor_file_alloc_security 0.18 ± 9% -0.0 0.14 ± 11% perf-profile.self.cycles-pp.__alloc_file 0.17 ± 8% -0.0 0.13 ± 16% perf-profile.self.cycles-pp.lockref_get 0.08 ± 9% -0.0 0.04 ± 82% perf-profile.self.cycles-pp.may_delete 0.13 ± 12% -0.0 0.09 ± 15% perf-profile.self.cycles-pp.cp_new_stat 0.10 ± 13% -0.0 0.06 ± 15% perf-profile.self.cycles-pp.do_sys_openat2 0.12 ± 9% -0.0 0.08 ± 14% perf-profile.self.cycles-pp.entry_SYSCALL_64_safe_stack 0.12 ± 15% -0.0 0.08 ± 19% perf-profile.self.cycles-pp.open_last_lookups 0.17 ± 8% -0.0 0.13 ± 10% perf-profile.self.cycles-pp.__fsnotify_parent 0.16 ± 9% -0.0 0.12 ± 8% perf-profile.self.cycles-pp.___d_drop 0.14 ± 7% -0.0 0.10 ± 17% perf-profile.self.cycles-pp.alloc_fd 0.12 ± 17% -0.0 0.08 ± 9% perf-profile.self.cycles-pp.try_to_unlazy 0.10 ± 5% -0.0 0.06 ± 52% perf-profile.self.cycles-pp.syscall_exit_to_user_mode 0.08 ± 14% -0.0 0.05 ± 52% perf-profile.self.cycles-pp.exit_to_user_mode_prepare 0.10 ± 15% -0.0 0.07 ± 9% perf-profile.self.cycles-pp.__d_add 0.15 ± 7% -0.0 0.12 ± 11% perf-profile.self.cycles-pp.security_inode_init_security 0.13 ± 3% -0.0 0.09 ± 10% perf-profile.self.cycles-pp.chacha_permute 0.12 ± 8% -0.0 0.09 ± 10% 
perf-profile.self.cycles-pp.__d_lookup_unhash 0.09 ± 8% -0.0 0.06 ± 10% perf-profile.self.cycles-pp.generic_fillattr 0.09 ± 14% -0.0 0.06 ± 18% perf-profile.self.cycles-pp.shmem_mknod 0.11 ± 13% -0.0 0.08 ± 9% perf-profile.self.cycles-pp.exit_to_user_mode_loop 0.11 ± 15% -0.0 0.08 ± 7% perf-profile.self.cycles-pp.tsc_verify_tsc_adjust 0.10 ± 9% -0.0 0.07 ± 16% perf-profile.self.cycles-pp.get_obj_cgroup_from_current 0.13 ± 12% -0.0 0.10 ± 12% perf-profile.self.cycles-pp.mntput_no_expire 0.09 ± 4% -0.0 0.06 ± 12% perf-profile.self.cycles-pp.drop_nlink 0.15 ± 3% -0.0 0.13 ± 5% perf-profile.self.cycles-pp._find_next_and_bit 0.08 ± 13% -0.0 0.05 ± 14% perf-profile.self.cycles-pp.path_openat 0.09 ± 8% -0.0 0.07 ± 14% perf-profile.self.cycles-pp.__d_instantiate 0.07 ± 16% -0.0 0.05 ± 9% perf-profile.self.cycles-pp.__might_fault 0.08 ± 9% -0.0 0.06 ± 6% perf-profile.self.cycles-pp.do_open 0.06 +0.0 0.08 ± 10% perf-profile.self.cycles-pp.hrtimer_next_event_without 0.10 ± 14% +0.0 0.13 ± 19% perf-profile.self.cycles-pp.___slab_alloc 0.01 ±200% +0.1 0.07 ± 11% perf-profile.self.cycles-pp.rcu_segcblist_pend_cbs 0.25 ± 4% +0.1 0.32 ± 8% perf-profile.self.cycles-pp.slab_pre_alloc_hook 0.83 ± 5% +0.1 0.90 perf-profile.self.cycles-pp.native_sched_clock 0.00 +0.1 0.07 ± 18% perf-profile.self.cycles-pp.xas_clear_mark 0.00 +0.1 0.08 ± 14% perf-profile.self.cycles-pp.setup_object 0.00 +0.1 0.09 ± 13% perf-profile.self.cycles-pp.inc_slabs_node 0.24 ± 10% +0.1 0.32 ± 10% perf-profile.self.cycles-pp.note_gp_changes 0.08 ± 17% +0.1 0.18 ± 8% perf-profile.self.cycles-pp.rcu_do_batch 0.79 ± 2% +0.1 0.91 ± 5% perf-profile.self.cycles-pp._raw_spin_lock_irq 0.11 ± 14% +0.1 0.23 ± 8% perf-profile.self.cycles-pp.rcu_nocb_try_bypass 0.00 +0.1 0.13 ± 9% perf-profile.self.cycles-pp.shuffle_freelist 0.90 ± 6% +0.2 1.05 ± 6% perf-profile.self.cycles-pp._raw_spin_lock_irqsave 0.26 ± 12% +0.2 0.44 ± 14% perf-profile.self.cycles-pp.rcu_segcblist_enqueue 0.00 +0.2 0.18 ± 5% 
perf-profile.self.cycles-pp.xas_create 0.58 ± 14% +0.2 0.77 ± 11% perf-profile.self.cycles-pp.rcu_pending 0.00 +0.2 0.19 ± 15% perf-profile.self.cycles-pp.xas_alloc 0.00 +0.2 0.19 ± 15% perf-profile.self.cycles-pp.xas_expand 0.00 +0.3 0.35 ± 5% perf-profile.self.cycles-pp.xas_store 0.21 ± 5% +0.5 0.67 ± 9% perf-profile.self.cycles-pp.kmem_cache_alloc_lru 0.56 ± 8% +0.6 1.16 ± 4% perf-profile.self.cycles-pp.__call_rcu_common 0.00 +0.6 0.64 ± 6% perf-profile.self.cycles-pp.radix_tree_node_rcu_free 1.92 ± 3% +0.8 2.76 ± 4% perf-profile.self.cycles-pp.ct_kernel_exit_state 1.08 ± 6% +1.6 2.71 ± 3% perf-profile.self.cycles-pp.__slab_free 0.00 +1.8 1.80 ± 6% perf-profile.self.cycles-pp.radix_tree_node_ctor Disclaimer: Results have been estimated based on internal Intel analysis and are provided for informational purposes only. Any difference in system hardware or software design or configuration may affect actual performance.
diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 103d1000a5a2..682ef885aa89 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -26,6 +26,8 @@ struct shmem_inode_info { atomic_t stop_eviction; /* hold when working on inode */ struct timespec64 i_crtime; /* file creation time */ unsigned int fsflags; /* flags for FS_IOC_[SG]ETFLAGS */ + struct xarray doff_map; /* dir offset to entry mapping */ + u32 next_doff; struct inode vfs_inode; }; diff --git a/mm/shmem.c b/mm/shmem.c index 448f393d8ab2..ba4176499e5c 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -40,6 +40,8 @@ #include <linux/fs_parser.h> #include <linux/swapfile.h> #include <linux/iversion.h> +#include <linux/xarray.h> + #include "swap.h" static struct vfsmount *shm_mnt; @@ -234,6 +236,7 @@ static const struct super_operations shmem_ops; const struct address_space_operations shmem_aops; static const struct file_operations shmem_file_operations; static const struct inode_operations shmem_inode_operations; +static const struct file_operations shmem_dir_operations; static const struct inode_operations shmem_dir_inode_operations; static const struct inode_operations shmem_special_inode_operations; static const struct vm_operations_struct shmem_vm_ops; @@ -2397,7 +2400,9 @@ static struct inode *shmem_get_inode(struct mnt_idmap *idmap, struct super_block /* Some things misbehave if size == 0 on a directory */ inode->i_size = 2 * BOGO_DIRENT_SIZE; inode->i_op = &shmem_dir_inode_operations; - inode->i_fop = &simple_dir_operations; + inode->i_fop = &shmem_dir_operations; + xa_init_flags(&info->doff_map, XA_FLAGS_ALLOC1); + info->next_doff = 0; break; case S_IFLNK: /* @@ -2917,6 +2922,71 @@ static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) return 0; } +static struct xarray *shmem_doff_map(struct inode *dir) +{ + return &SHMEM_I(dir)->doff_map; +} + +static int shmem_doff_add(struct inode *dir, struct dentry *dentry) +{ + struct shmem_inode_info *info = 
SHMEM_I(dir); + struct xa_limit limit = XA_LIMIT(2, U32_MAX); + u32 offset; + int ret; + + if (dentry->d_fsdata) + return -EBUSY; + + offset = 0; + ret = xa_alloc_cyclic(shmem_doff_map(dir), &offset, dentry, limit, + &info->next_doff, GFP_KERNEL); + if (ret < 0) + return ret; + + dentry->d_fsdata = (void *)(unsigned long)offset; + return 0; +} + +static struct dentry *shmem_doff_find_after(struct dentry *dir, + unsigned long *offset) +{ + struct xarray *xa = shmem_doff_map(d_inode(dir)); + struct dentry *d, *found = NULL; + + spin_lock(&dir->d_lock); + d = xa_find_after(xa, offset, ULONG_MAX, XA_PRESENT); + if (d) { + spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED); + if (simple_positive(d)) + found = dget_dlock(d); + spin_unlock(&d->d_lock); + } + spin_unlock(&dir->d_lock); + return found; +} + +static void shmem_doff_remove(struct inode *dir, struct dentry *dentry) +{ + u32 offset = (u32)(unsigned long)dentry->d_fsdata; + + if (!offset) + return; + + xa_erase(shmem_doff_map(dir), offset); + dentry->d_fsdata = NULL; +} + +/* + * During fs teardown (eg. umount), a directory's doff_map might still + * contain entries. xa_destroy() cleans out anything that remains. + */ +static void shmem_doff_map_destroy(struct inode *inode) +{ + struct xarray *xa = shmem_doff_map(inode); + + xa_destroy(xa); +} + /* * File creation. Allocate an inode, and we're done.. 
*/ @@ -2938,6 +3008,10 @@ shmem_mknod(struct mnt_idmap *idmap, struct inode *dir, if (error && error != -EOPNOTSUPP) goto out_iput; + error = shmem_doff_add(dir, dentry); + if (error) + goto out_iput; + error = 0; dir->i_size += BOGO_DIRENT_SIZE; dir->i_ctime = dir->i_mtime = current_time(dir); @@ -3015,6 +3089,10 @@ static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentr goto out; } + ret = shmem_doff_add(dir, dentry); + if (ret) + goto out; + dir->i_size += BOGO_DIRENT_SIZE; inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode); inode_inc_iversion(dir); @@ -3033,6 +3111,8 @@ static int shmem_unlink(struct inode *dir, struct dentry *dentry) if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode)) shmem_free_inode(inode->i_sb); + shmem_doff_remove(dir, dentry); + dir->i_size -= BOGO_DIRENT_SIZE; inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode); inode_inc_iversion(dir); @@ -3091,24 +3171,37 @@ static int shmem_rename2(struct mnt_idmap *idmap, { struct inode *inode = d_inode(old_dentry); int they_are_dirs = S_ISDIR(inode->i_mode); + int error; if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) return -EINVAL; - if (flags & RENAME_EXCHANGE) + if (flags & RENAME_EXCHANGE) { + shmem_doff_remove(old_dir, old_dentry); + shmem_doff_remove(new_dir, new_dentry); + error = shmem_doff_add(new_dir, old_dentry); + if (error) + return error; + error = shmem_doff_add(old_dir, new_dentry); + if (error) + return error; return simple_rename_exchange(old_dir, old_dentry, new_dir, new_dentry); + } if (!simple_empty(new_dentry)) return -ENOTEMPTY; if (flags & RENAME_WHITEOUT) { - int error; - error = shmem_whiteout(idmap, old_dir, old_dentry); if (error) return error; } + shmem_doff_remove(old_dir, old_dentry); + error = shmem_doff_add(new_dir, old_dentry); + if (error) + return error; + if (d_really_is_positive(new_dentry)) { (void) shmem_unlink(new_dir, new_dentry); if (they_are_dirs) { @@ -3149,26 +3242,22 @@ static 
int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir, error = security_inode_init_security(inode, dir, &dentry->d_name, shmem_initxattrs, NULL); - if (error && error != -EOPNOTSUPP) { - iput(inode); - return error; - } + if (error && error != -EOPNOTSUPP) + goto out_iput; inode->i_size = len-1; if (len <= SHORT_SYMLINK_LEN) { inode->i_link = kmemdup(symname, len, GFP_KERNEL); if (!inode->i_link) { - iput(inode); - return -ENOMEM; + error = -ENOMEM; + goto out_iput; } inode->i_op = &shmem_short_symlink_operations; } else { inode_nohighmem(inode); error = shmem_get_folio(inode, 0, &folio, SGP_WRITE); - if (error) { - iput(inode); - return error; - } + if (error) + goto out_iput; inode->i_mapping->a_ops = &shmem_aops; inode->i_op = &shmem_symlink_inode_operations; memcpy(folio_address(folio), symname, len); @@ -3177,12 +3266,20 @@ static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir, folio_unlock(folio); folio_put(folio); } + + error = shmem_doff_add(dir, dentry); + if (error) + goto out_iput; + dir->i_size += BOGO_DIRENT_SIZE; dir->i_ctime = dir->i_mtime = current_time(dir); inode_inc_iversion(dir); d_instantiate(dentry, inode); dget(dentry); return 0; +out_iput: + iput(inode); + return error; } static void shmem_put_link(void *arg) @@ -3224,6 +3321,77 @@ static const char *shmem_get_link(struct dentry *dentry, return folio_address(folio); } +static loff_t shmem_dir_llseek(struct file *file, loff_t offset, int whence) +{ + switch (whence) { + case SEEK_CUR: + offset += file->f_pos; + fallthrough; + case SEEK_SET: + if (offset >= 0) + break; + fallthrough; + default: + return -EINVAL; + } + return vfs_setpos(file, offset, U32_MAX); +} + +static bool shmem_dir_emit(struct dir_context *ctx, struct dentry *dentry) +{ + struct inode *inode = d_inode(dentry); + + return ctx->actor(ctx, dentry->d_name.name, dentry->d_name.len, + (loff_t)dentry->d_fsdata, inode->i_ino, + fs_umode_to_dtype(inode->i_mode)); +} + +/** + * shmem_readdir - Emit entries 
starting at offset @ctx->pos + * @file: an open directory to iterate over + * @ctx: directory iteration context + * + * Caller must hold @file's i_rwsem to prevent insertion or removal of + * entries during this call. + * + * On entry, @ctx->pos contains an offset that represents the first entry + * to be read from the directory. + * + * The operation continues until there are no more entries to read, or + * until the ctx->actor indicates there is no more space in the caller's + * output buffer. + * + * On return, @ctx->pos contains an offset that will read the next entry + * in this directory when shmem_readdir() is called again with @ctx. + * + * Return values: + * %0 - Complete + */ +static int shmem_readdir(struct file *file, struct dir_context *ctx) +{ + struct dentry *dentry, *dir = file->f_path.dentry; + unsigned long offset; + + lockdep_assert_held(&d_inode(dir)->i_rwsem); + + if (!dir_emit_dots(file, ctx)) + goto out; + for (offset = ctx->pos - 1; offset < ULONG_MAX - 1;) { + dentry = shmem_doff_find_after(dir, &offset); + if (!dentry) + break; + if (!shmem_dir_emit(ctx, dentry)) { + dput(dentry); + break; + } + ctx->pos = offset + 1; + dput(dentry); + } + +out: + return 0; +} + #ifdef CONFIG_TMPFS_XATTR static int shmem_fileattr_get(struct dentry *dentry, struct fileattr *fa) @@ -3742,6 +3910,12 @@ static int shmem_show_options(struct seq_file *seq, struct dentry *root) return 0; } +#else /* CONFIG_TMPFS */ + +static inline void shmem_doff_map_destroy(struct inode *dir) +{ +} + #endif /* CONFIG_TMPFS */ static void shmem_put_super(struct super_block *sb) @@ -3888,6 +4062,8 @@ static void shmem_destroy_inode(struct inode *inode) { if (S_ISREG(inode->i_mode)) mpol_free_shared_policy(&SHMEM_I(inode)->policy); + if (S_ISDIR(inode->i_mode)) + shmem_doff_map_destroy(inode); } static void shmem_init_inode(void *foo) @@ -3955,6 +4131,15 @@ static const struct inode_operations shmem_inode_operations = { #endif }; +static const struct file_operations 
shmem_dir_operations = { +#ifdef CONFIG_TMPFS + .llseek = shmem_dir_llseek, + .iterate_shared = shmem_readdir, +#endif + .read = generic_read_dir, + .fsync = noop_fsync, +}; + static const struct inode_operations shmem_dir_inode_operations = { #ifdef CONFIG_TMPFS .getattr = shmem_getattr,