Message ID | 168605707262.32244.4794425063054676856.stgit@manet.1015granger.net (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | shmemfs stable directory offsets | expand |
On 6/6/23 15:11, Chuck Lever wrote: > From: Chuck Lever <chuck.lever@oracle.com> > > The current cursor-based directory offset mechanism doesn't work > when a tmpfs filesystem is exported via NFS. This is because NFS > clients do not open directories. Each server-side READDIR operation > has to open the directory, read it, then close it. The cursor state > for that directory, being associated strictly with the opened > struct file, is thus discarded after each NFS READDIR operation. > > Directory offsets are cached not only by NFS clients, but also by > user space libraries on those clients. Essentially there is no way > to invalidate those caches when directory offsets have changed on > an NFS server after the offset-to-dentry mapping changes. Thus the > whole application stack depends on unchanging directory offsets. > > The solution we've come up with is to make the directory offset for > each file in a tmpfs filesystem stable for the life of the directory > entry it represents. > > shmem_readdir() and shmem_dir_llseek() now use an xarray to map each > directory offset (an loff_t integer) to the memory address of a > struct dentry. > > Signed-off-by: Chuck Lever <chuck.lever@oracle.com> > --- > mm/shmem.c | 39 +++++++++++++++++++++++++++++++++++---- > 1 file changed, 35 insertions(+), 4 deletions(-) > > diff --git a/mm/shmem.c b/mm/shmem.c > index 721f9fd064aa..fd9571056181 100644 > --- a/mm/shmem.c > +++ b/mm/shmem.c > @@ -2410,7 +2410,8 @@ static struct inode *shmem_get_inode(struct mnt_idmap *idmap, struct super_block > /* Some things misbehave if size == 0 on a directory */ > inode->i_size = 2 * BOGO_DIRENT_SIZE; > inode->i_op = &shmem_dir_inode_operations; > - inode->i_fop = &simple_dir_operations; > + inode->i_fop = &stable_dir_operations; > + stable_offset_init(inode); > break; > case S_IFLNK: > /* > @@ -2950,6 +2951,10 @@ shmem_mknod(struct mnt_idmap *idmap, struct inode *dir, > if (error && error != -EOPNOTSUPP) > goto out_iput; > > + error = stable_offset_add(dir, dentry); > + if (error) > + goto out_iput; > + > error = 0; This line can be removed? > dir->i_size += BOGO_DIRENT_SIZE; > dir->i_ctime = dir->i_mtime = current_time(dir); > @@ -3027,6 +3032,10 @@ static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentr > goto out; > } > > + ret = stable_offset_add(dir, dentry); > + if (ret) > + goto out; > + I think this should call shmem_free_inode() before goto out - reverse what shmem_reserve_inode() has done. > dir->i_size += BOGO_DIRENT_SIZE; > inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode); > inode_inc_iversion(dir); > @@ -3045,6 +3054,8 @@ static int shmem_unlink(struct inode *dir, struct dentry *dentry) > if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode)) > shmem_free_inode(inode->i_sb); > > + stable_offset_remove(dir, dentry); > + > dir->i_size -= BOGO_DIRENT_SIZE; > inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode); > inode_inc_iversion(dir); > @@ -3103,24 +3114,37 @@ static int shmem_rename2(struct mnt_idmap *idmap, > { > struct inode *inode = d_inode(old_dentry); > int they_are_dirs = S_ISDIR(inode->i_mode); > + int error; > > if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) > return -EINVAL; > > - if (flags & RENAME_EXCHANGE) > + if (flags & RENAME_EXCHANGE) { > + stable_offset_remove(old_dir, old_dentry); > + stable_offset_remove(new_dir, new_dentry); > + error = stable_offset_add(new_dir, old_dentry); > + if (error) > + return error; > + error = stable_offset_add(old_dir, new_dentry); > + if (error) > + return error; > return simple_rename_exchange(old_dir, old_dentry, new_dir, new_dentry); > + } Hmm, error handling issues? Everything needs to be reversed when any of the operations fails? > > if (!simple_empty(new_dentry)) > return -ENOTEMPTY; > > if (flags & RENAME_WHITEOUT) { > - int error; > - > error = shmem_whiteout(idmap, old_dir, old_dentry); > if (error) > return error; > } > > + stable_offset_remove(old_dir, old_dentry); > + error = stable_offset_add(new_dir, old_dentry); > + if (error) > + return error; > + > if (d_really_is_positive(new_dentry)) { > (void) shmem_unlink(new_dir, new_dentry); > if (they_are_dirs) { > @@ -3185,6 +3209,11 @@ static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir, > folio_unlock(folio); > folio_put(folio); > } > + > + error = stable_offset_add(dir, dentry); > + if (error) > + goto out_iput; > + Error handling, there is a kmemdup() above which needs to be freed? I'm not sure about folio, automatically released with the inode? > dir->i_size += BOGO_DIRENT_SIZE; > dir->i_ctime = dir->i_mtime = current_time(dir); > inode_inc_iversion(dir); > @@ -3920,6 +3949,8 @@ static void shmem_destroy_inode(struct inode *inode) > { > if (S_ISREG(inode->i_mode)) > mpol_free_shared_policy(&SHMEM_I(inode)->policy); > + if (S_ISDIR(inode->i_mode)) > + stable_offset_destroy(inode); > } > > static void shmem_init_inode(void *foo) > > Thanks, Bernd
> On Jun 26, 2023, at 9:18 AM, Bernd Schubert <bernd.schubert@fastmail.fm> wrote: > > > > On 6/6/23 15:11, Chuck Lever wrote: >> From: Chuck Lever <chuck.lever@oracle.com> >> The current cursor-based directory offset mechanism doesn't work >> when a tmpfs filesystem is exported via NFS. This is because NFS >> clients do not open directories. Each server-side READDIR operation >> has to open the directory, read it, then close it. The cursor state >> for that directory, being associated strictly with the opened >> struct file, is thus discarded after each NFS READDIR operation. >> Directory offsets are cached not only by NFS clients, but also by >> user space libraries on those clients. Essentially there is no way >> to invalidate those caches when directory offsets have changed on >> an NFS server after the offset-to-dentry mapping changes. Thus the >> whole application stack depends on unchanging directory offsets. >> The solution we've come up with is to make the directory offset for >> each file in a tmpfs filesystem stable for the life of the directory >> entry it represents. >> shmem_readdir() and shmem_dir_llseek() now use an xarray to map each >> directory offset (an loff_t integer) to the memory address of a >> struct dentry. >> Signed-off-by: Chuck Lever <chuck.lever@oracle.com> >> --- >> mm/shmem.c | 39 +++++++++++++++++++++++++++++++++++---- >> 1 file changed, 35 insertions(+), 4 deletions(-) >> diff --git a/mm/shmem.c b/mm/shmem.c >> index 721f9fd064aa..fd9571056181 100644 >> --- a/mm/shmem.c >> +++ b/mm/shmem.c >> @@ -2410,7 +2410,8 @@ static struct inode *shmem_get_inode(struct mnt_idmap *idmap, struct super_block >> /* Some things misbehave if size == 0 on a directory */ >> inode->i_size = 2 * BOGO_DIRENT_SIZE; >> inode->i_op = &shmem_dir_inode_operations; >> - inode->i_fop = &simple_dir_operations; >> + inode->i_fop = &stable_dir_operations; >> + stable_offset_init(inode); >> break; >> case S_IFLNK: >> /* >> @@ -2950,6 +2951,10 @@ shmem_mknod(struct mnt_idmap *idmap, struct inode *dir, >> if (error && error != -EOPNOTSUPP) >> goto out_iput; >> + error = stable_offset_add(dir, dentry); >> + if (error) >> + goto out_iput; >> + >> error = 0; > > This line can be removed? > >> dir->i_size += BOGO_DIRENT_SIZE; >> dir->i_ctime = dir->i_mtime = current_time(dir); >> @@ -3027,6 +3032,10 @@ static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentr >> goto out; >> } >> + ret = stable_offset_add(dir, dentry); >> + if (ret) >> + goto out; >> + > > I think this should call shmem_free_inode() before goto out - reverse what shmem_reserve_inode() has done. > >> dir->i_size += BOGO_DIRENT_SIZE; >> inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode); >> inode_inc_iversion(dir); >> @@ -3045,6 +3054,8 @@ static int shmem_unlink(struct inode *dir, struct dentry *dentry) >> if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode)) >> shmem_free_inode(inode->i_sb); >> + stable_offset_remove(dir, dentry); >> + >> dir->i_size -= BOGO_DIRENT_SIZE; >> inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode); >> inode_inc_iversion(dir); >> @@ -3103,24 +3114,37 @@ static int shmem_rename2(struct mnt_idmap *idmap, >> { >> struct inode *inode = d_inode(old_dentry); >> int they_are_dirs = S_ISDIR(inode->i_mode); >> + int error; >> if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) >> return -EINVAL; >> - if (flags & RENAME_EXCHANGE) >> + if (flags & RENAME_EXCHANGE) { >> + stable_offset_remove(old_dir, old_dentry); >> + stable_offset_remove(new_dir, new_dentry); >> + error = stable_offset_add(new_dir, old_dentry); >> + if (error) >> + return error; >> + error = stable_offset_add(old_dir, new_dentry); >> + if (error) >> + return error; >> return simple_rename_exchange(old_dir, old_dentry, new_dir, new_dentry); >> + } > > Hmm, error handling issues? Everything needs to be reversed when any of the operations fails? > >> if (!simple_empty(new_dentry)) >> return -ENOTEMPTY; >> if (flags & RENAME_WHITEOUT) { >> - int error; >> - >> error = shmem_whiteout(idmap, old_dir, old_dentry); >> if (error) >> return error; >> } >> + stable_offset_remove(old_dir, old_dentry); >> + error = stable_offset_add(new_dir, old_dentry); >> + if (error) >> + return error; >> + >> if (d_really_is_positive(new_dentry)) { >> (void) shmem_unlink(new_dir, new_dentry); >> if (they_are_dirs) { >> @@ -3185,6 +3209,11 @@ static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir, >> folio_unlock(folio); >> folio_put(folio); >> } >> + >> + error = stable_offset_add(dir, dentry); >> + if (error) >> + goto out_iput; >> + > > Error handling, there is a kmemdup() above which needs to be freed? I'm not sure about folio, automatically released with the inode? > >> dir->i_size += BOGO_DIRENT_SIZE; >> dir->i_ctime = dir->i_mtime = current_time(dir); >> inode_inc_iversion(dir); >> @@ -3920,6 +3949,8 @@ static void shmem_destroy_inode(struct inode *inode) >> { >> if (S_ISREG(inode->i_mode)) >> mpol_free_shared_policy(&SHMEM_I(inode)->policy); >> + if (S_ISDIR(inode->i_mode)) >> + stable_offset_destroy(inode); >> } >> static void shmem_init_inode(void *foo) > > Thanks, > Bernd Thanks for the review. I think I've addressed the issues you've pointed out. Watch for v4 of this series. -- Chuck Lever
diff --git a/mm/shmem.c b/mm/shmem.c index 721f9fd064aa..fd9571056181 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2410,7 +2410,8 @@ static struct inode *shmem_get_inode(struct mnt_idmap *idmap, struct super_block /* Some things misbehave if size == 0 on a directory */ inode->i_size = 2 * BOGO_DIRENT_SIZE; inode->i_op = &shmem_dir_inode_operations; - inode->i_fop = &simple_dir_operations; + inode->i_fop = &stable_dir_operations; + stable_offset_init(inode); break; case S_IFLNK: /* @@ -2950,6 +2951,10 @@ shmem_mknod(struct mnt_idmap *idmap, struct inode *dir, if (error && error != -EOPNOTSUPP) goto out_iput; + error = stable_offset_add(dir, dentry); + if (error) + goto out_iput; + error = 0; dir->i_size += BOGO_DIRENT_SIZE; dir->i_ctime = dir->i_mtime = current_time(dir); @@ -3027,6 +3032,10 @@ static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentr goto out; } + ret = stable_offset_add(dir, dentry); + if (ret) + goto out; + dir->i_size += BOGO_DIRENT_SIZE; inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode); inode_inc_iversion(dir); @@ -3045,6 +3054,8 @@ static int shmem_unlink(struct inode *dir, struct dentry *dentry) if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode)) shmem_free_inode(inode->i_sb); + stable_offset_remove(dir, dentry); + dir->i_size -= BOGO_DIRENT_SIZE; inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode); inode_inc_iversion(dir); @@ -3103,24 +3114,37 @@ static int shmem_rename2(struct mnt_idmap *idmap, { struct inode *inode = d_inode(old_dentry); int they_are_dirs = S_ISDIR(inode->i_mode); + int error; if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) return -EINVAL; - if (flags & RENAME_EXCHANGE) + if (flags & RENAME_EXCHANGE) { + stable_offset_remove(old_dir, old_dentry); + stable_offset_remove(new_dir, new_dentry); + error = stable_offset_add(new_dir, old_dentry); + if (error) + return error; + error = stable_offset_add(old_dir, new_dentry); + if (error) + return error; return simple_rename_exchange(old_dir, old_dentry, new_dir, new_dentry); + } if (!simple_empty(new_dentry)) return -ENOTEMPTY; if (flags & RENAME_WHITEOUT) { - int error; - error = shmem_whiteout(idmap, old_dir, old_dentry); if (error) return error; } + stable_offset_remove(old_dir, old_dentry); + error = stable_offset_add(new_dir, old_dentry); + if (error) + return error; + if (d_really_is_positive(new_dentry)) { (void) shmem_unlink(new_dir, new_dentry); if (they_are_dirs) { @@ -3185,6 +3209,11 @@ static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir, folio_unlock(folio); folio_put(folio); } + + error = stable_offset_add(dir, dentry); + if (error) + goto out_iput; + dir->i_size += BOGO_DIRENT_SIZE; dir->i_ctime = dir->i_mtime = current_time(dir); inode_inc_iversion(dir); @@ -3920,6 +3949,8 @@ static void shmem_destroy_inode(struct inode *inode) { if (S_ISREG(inode->i_mode)) mpol_free_shared_policy(&SHMEM_I(inode)->policy); + if (S_ISDIR(inode->i_mode)) + stable_offset_destroy(inode); } static void shmem_init_inode(void *foo)