Message ID | 165708109258.1940.1095066282860556838.stgit@noble.brown (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | NFSD: clean up locking. | expand |
On Wed, 2022-07-06 at 14:18 +1000, NeilBrown wrote: > nfsd_lookup() takes an exclusive lock on the parent inode, but many > callers don't want the lock and may not need to lock at all if the > result is in the dcache. > > Change nfsd_lookup() to be passed a bool flag. > If false, don't take the lock. > If true, do take an exclusive lock, and return with it held if > successful. > If nfsd_lookup() returns an error, the lock WILL NOT be held. > > Only nfsd4_open() requests the lock to be held, and does so to block > rename until it decides whether to return a delegation. > > NOTE: when nfsd4_open() creates a file, the directory does *NOT* remain > locked and never has. So it is possible (though unlikely) for the > newly created file to be renamed before a delegation is handed out, > and that would be bad. This patch does *NOT* fix that, but *DOES* > take the directory lock immediately after creating the file, which > reduces the size of the window and ensure that the lock is held > consistently. More work is needed to guarantee no rename happens > before the delegation. > Interesting. Maybe after taking the lock, we could re-vet the dentry vs. the info in the OPEN request? That way, we'd presumably know that the above race didn't occur. > NOTE-2: NFSv4 requires directory changeinfo for OPEN even when a create > wasn't requested and no change happened. Now that nfsd_lookup() > doesn't use fh_lock(), we need explicit fh_fill_pre/post_attrs() > in the non-create branch of do_open_lookup(). > > Signed-off-by: NeilBrown <neilb@suse.de> > --- > fs/nfsd/nfs3proc.c | 2 +- > fs/nfsd/nfs4proc.c | 51 ++++++++++++++++++++++++++++------------ > fs/nfsd/nfsproc.c | 2 +- > fs/nfsd/vfs.c | 66 +++++++++++++++++++++++++++++++++++----------------- > fs/nfsd/vfs.h | 8 ++++-- > 5 files changed, 88 insertions(+), 41 deletions(-) > > diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c > index ad7941001106..3a67d0afb885 100644 > --- a/fs/nfsd/nfs3proc.c > +++ b/fs/nfsd/nfs3proc.c > @@ -96,7 +96,7 @@ nfsd3_proc_lookup(struct svc_rqst *rqstp) > > resp->status = nfsd_lookup(rqstp, &resp->dirfh, > argp->name, argp->len, > - &resp->fh); > + &resp->fh, false); > return rpc_success; > } > > diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c > index 4737019738ab..6ec22c69cbec 100644 > --- a/fs/nfsd/nfs4proc.c > +++ b/fs/nfsd/nfs4proc.c > @@ -414,7 +414,8 @@ nfsd4_create_file(struct svc_rqst *rqstp, struct svc_fh *fhp, > } > > static __be32 > -do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_open *open, struct svc_fh **resfh) > +do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, > + struct nfsd4_open *open, struct svc_fh **resfh) > { > struct svc_fh *current_fh = &cstate->current_fh; > int accmode; > @@ -441,11 +442,18 @@ do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, stru > * yes | no | GUARDED4 | GUARDED4 > * yes | yes | GUARDED4 | GUARDED4 > */ > - > current->fs->umask = open->op_umask; > status = nfsd4_create_file(rqstp, current_fh, *resfh, open); > current->fs->umask = 0; > > + if (!status) > + /* We really want to hold the lock from before the > + * create to ensure no rename happens, but that > + * needs more work... > + */ > + inode_lock_nested(current_fh->fh_dentry->d_inode, > + I_MUTEX_PARENT); > + > if (!status && open->op_label.len) > nfsd4_security_inode_setsecctx(*resfh, &open->op_label, open->op_bmval); > > @@ -457,17 +465,25 @@ do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, stru > if (nfsd4_create_is_exclusive(open->op_createmode) && status == 0) > open->op_bmval[1] |= (FATTR4_WORD1_TIME_ACCESS | > FATTR4_WORD1_TIME_MODIFY); > - } else > - /* > - * Note this may exit with the parent still locked. > - * We will hold the lock until nfsd4_open's final > - * lookup, to prevent renames or unlinks until we've had > - * a chance to an acquire a delegation if appropriate. > + } else { > + /* We want to keep the directory locked until we've had a chance > + * to acquire a delegation if appropriate, so request that > + * nfsd_lookup() hold on to the lock. > */ > status = nfsd_lookup(rqstp, current_fh, > - open->op_fname, open->op_fnamelen, *resfh); > + open->op_fname, open->op_fnamelen, *resfh, > + true); > + if (!status) { > + /* NFSv4 protocol requires change attributes even though > + * no change happened. > + */ > + fh_fill_pre_attrs(current_fh); > + fh_fill_post_attrs(current_fh); > + } > + } > if (status) > - goto out; > + return status; > + > status = nfsd_check_obj_isreg(*resfh); > if (status) > goto out; > @@ -483,6 +499,8 @@ do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, stru > status = do_open_permission(rqstp, *resfh, open, accmode); > set_change_info(&open->op_cinfo, current_fh); > out: > + if (status) > + inode_unlock(current_fh->fh_dentry->d_inode); > return status; > } > > @@ -540,6 +558,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, > struct net *net = SVC_NET(rqstp); > struct nfsd_net *nn = net_generic(net, nfsd_net_id); > bool reclaim = false; > + bool locked = false; > > dprintk("NFSD: nfsd4_open filename %.*s op_openowner %p\n", > (int)open->op_fnamelen, open->op_fname, > @@ -604,6 +623,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, > status = do_open_lookup(rqstp, cstate, open, &resfh); > if (status) > goto out; > + locked = true; > break; > case NFS4_OPEN_CLAIM_PREVIOUS: > status = nfs4_check_open_reclaim(cstate->clp); > @@ -639,6 +659,8 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, > fput(open->op_filp); > open->op_filp = NULL; > } > + if (locked) > + inode_unlock(cstate->current_fh.fh_dentry->d_inode); > if (resfh && resfh != &cstate->current_fh) { > fh_dup2(&cstate->current_fh, resfh); > fh_put(resfh); > @@ -933,7 +955,7 @@ static __be32 nfsd4_do_lookupp(struct svc_rqst *rqstp, struct svc_fh *fh) > return nfserr_noent; > } > fh_put(&tmp_fh); > - return nfsd_lookup(rqstp, fh, "..", 2, fh); > + return nfsd_lookup(rqstp, fh, "..", 2, fh, false); > } > > static __be32 > @@ -949,7 +971,7 @@ nfsd4_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, > { > return nfsd_lookup(rqstp, &cstate->current_fh, > u->lookup.lo_name, u->lookup.lo_len, > - &cstate->current_fh); > + &cstate->current_fh, false); > } > > static __be32 > @@ -1089,11 +1111,10 @@ nfsd4_secinfo(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, > if (err) > return err; > err = nfsd_lookup_dentry(rqstp, &cstate->current_fh, > - secinfo->si_name, secinfo->si_namelen, > - &exp, &dentry); > + secinfo->si_name, secinfo->si_namelen, > + &exp, &dentry, false); > if (err) > return err; > - fh_unlock(&cstate->current_fh); > if (d_really_is_negative(dentry)) { > exp_put(exp); > err = nfserr_noent; > diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c > index a25b8e321662..ed24fae09517 100644 > --- a/fs/nfsd/nfsproc.c > +++ b/fs/nfsd/nfsproc.c > @@ -133,7 +133,7 @@ nfsd_proc_lookup(struct svc_rqst *rqstp) > > fh_init(&resp->fh, NFS_FHSIZE); > resp->status = nfsd_lookup(rqstp, &argp->fh, argp->name, argp->len, > - &resp->fh); > + &resp->fh, false); > fh_put(&argp->fh); > if (resp->status != nfs_ok) > goto out; > diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c > index 4916c29af0fa..8e050c6d112a 100644 > --- a/fs/nfsd/vfs.c > +++ b/fs/nfsd/vfs.c > @@ -172,7 +172,8 @@ int nfsd_mountpoint(struct dentry *dentry, struct svc_export *exp) > __be32 > nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp, > const char *name, unsigned int len, > - struct svc_export **exp_ret, struct dentry **dentry_ret) > + struct svc_export **exp_ret, struct dentry **dentry_ret, > + bool locked) > { > struct svc_export *exp; > struct dentry *dparent; > @@ -199,27 +200,31 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp, > goto out_nfserr; > } > } else { > - /* > - * In the nfsd4_open() case, this may be held across > - * subsequent open and delegation acquisition which may > - * need to take the child's i_mutex: > - */ > - fh_lock_nested(fhp, I_MUTEX_PARENT); > - dentry = lookup_one_len(name, dparent, len); > + if (locked) > + dentry = lookup_one_len(name, dparent, len); > + else > + dentry = lookup_one_len_unlocked(name, dparent, len); > host_err = PTR_ERR(dentry); > if (IS_ERR(dentry)) > goto out_nfserr; > if (nfsd_mountpoint(dentry, exp)) { > /* > - * We don't need the i_mutex after all. It's > - * still possible we could open this (regular > - * files can be mountpoints too), but the > - * i_mutex is just there to prevent renames of > - * something that we might be about to delegate, > - * and a mountpoint won't be renamed: > + * nfsd_cross_mnt() may wait for an upcall > + * to userspace, and holding i_sem across that s/i_sem/i_rwsem/ > + * invites the possibility of a deadlock. > + * We don't really need the lock on the parent > + * of a mount point was we only need it to guard > + * against a rename before we get a lease for a > + * delegation. > + * So just drop the i_sem and reclaim it. > */ > - fh_unlock(fhp); > - if ((host_err = nfsd_cross_mnt(rqstp, &dentry, &exp))) { > + if (locked) > + inode_unlock(dparent->d_inode); > + host_err = nfsd_cross_mnt(rqstp, &dentry, &exp); > + if (locked) > + inode_lock_nested(dparent->d_inode, > + I_MUTEX_PARENT); > + if (host_err) { > dput(dentry); > goto out_nfserr; > } > @@ -234,7 +239,17 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp, > return nfserrno(host_err); > } > > -/* > +/** > + * nfsd_lookup - look up a single path component for nfsd > + * > + * @rqstp: the request context > + * @ftp: the file handle of the directory > + * @name: the component name, or %NULL to look up parent > + * @len: length of name to examine > + * @resfh: pointer to pre-initialised filehandle to hold result. > + * @lock: if true, lock directory during lookup and keep it locked > + * if there is no error. > + * > * Look up one component of a pathname. > * N.B. After this call _both_ fhp and resfh need an fh_put > * > @@ -244,11 +259,15 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp, > * returned. Otherwise the covered directory is returned. > * NOTE: this mountpoint crossing is not supported properly by all > * clients and is explicitly disallowed for NFSv3 > - * NeilBrown <neilb@cse.unsw.edu.au> > + * > + * Only nfsd4_open() calls this with @lock set. It does so to block > + * renames/unlinks before it possibly gets a lease to provide a > + * delegation. > */ > __be32 > nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name, > - unsigned int len, struct svc_fh *resfh) > + unsigned int len, struct svc_fh *resfh, > + bool lock) > { > struct svc_export *exp; > struct dentry *dentry; > @@ -257,9 +276,11 @@ nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name, > err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_EXEC); > if (err) > return err; > - err = nfsd_lookup_dentry(rqstp, fhp, name, len, &exp, &dentry); > + if (lock) > + inode_lock_nested(fhp->fh_dentry->d_inode, I_MUTEX_PARENT); > + err = nfsd_lookup_dentry(rqstp, fhp, name, len, &exp, &dentry, lock); > if (err) > - return err; > + goto out_err; > err = check_nfsd_access(exp, rqstp); > if (err) > goto out; > @@ -273,6 +294,9 @@ nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name, > out: > dput(dentry); > exp_put(exp); > +out_err: > + if (err && lock) > + inode_unlock(fhp->fh_dentry->d_inode); > return err; > } > > diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h > index 9f4fd3060200..290788f007d4 100644 > --- a/fs/nfsd/vfs.h > +++ b/fs/nfsd/vfs.h > @@ -45,10 +45,12 @@ typedef int (*nfsd_filldir_t)(void *, const char *, int, loff_t, u64, unsigned); > int nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp, > struct svc_export **expp); > __be32 nfsd_lookup(struct svc_rqst *, struct svc_fh *, > - const char *, unsigned int, struct svc_fh *); > + const char *, unsigned int, struct svc_fh *, > + bool); > __be32 nfsd_lookup_dentry(struct svc_rqst *, struct svc_fh *, > - const char *, unsigned int, > - struct svc_export **, struct dentry **); > + const char *, unsigned int, > + struct svc_export **, struct dentry **, > + bool); > __be32 nfsd_setattr(struct svc_rqst *, struct svc_fh *, > struct iattr *, int, time64_t); > int nfsd_mountpoint(struct dentry *, struct svc_export *); > > Other than minor comment nit... Reviewed-by: Jeff Layton <jlayton@kernel.org>
> On Jul 6, 2022, at 12:18 AM, NeilBrown <neilb@suse.de> wrote: > > nfsd_lookup() takes an exclusive lock on the parent inode, but many > callers don't want the lock and may not need to lock at all if the > result is in the dcache. > > Change nfsd_lookup() to be passed a bool flag. > If false, don't take the lock. > If true, do take an exclusive lock, and return with it held if > successful. > If nfsd_lookup() returns an error, the lock WILL NOT be held. > > Only nfsd4_open() requests the lock to be held, and does so to block > rename until it decides whether to return a delegation. > > NOTE: when nfsd4_open() creates a file, the directory does *NOT* remain > locked and never has. So it is possible (though unlikely) for the > newly created file to be renamed before a delegation is handed out, > and that would be bad. This patch does *NOT* fix that, but *DOES* > take the directory lock immediately after creating the file, which > reduces the size of the window and ensure that the lock is held > consistently. More work is needed to guarantee no rename happens > before the delegation. > > NOTE-2: NFSv4 requires directory changeinfo for OPEN even when a create > wasn't requested and no change happened. Now that nfsd_lookup() > doesn't use fh_lock(), we need explicit fh_fill_pre/post_attrs() > in the non-create branch of do_open_lookup(). > > Signed-off-by: NeilBrown <neilb@suse.de> > --- > fs/nfsd/nfs3proc.c | 2 +- > fs/nfsd/nfs4proc.c | 51 ++++++++++++++++++++++++++++------------ > fs/nfsd/nfsproc.c | 2 +- > fs/nfsd/vfs.c | 66 +++++++++++++++++++++++++++++++++++----------------- > fs/nfsd/vfs.h | 8 ++++-- > 5 files changed, 88 insertions(+), 41 deletions(-) > > diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c > index ad7941001106..3a67d0afb885 100644 > --- a/fs/nfsd/nfs3proc.c > +++ b/fs/nfsd/nfs3proc.c > @@ -96,7 +96,7 @@ nfsd3_proc_lookup(struct svc_rqst *rqstp) > > resp->status = nfsd_lookup(rqstp, &resp->dirfh, > argp->name, argp->len, > - &resp->fh); > + &resp->fh, false); > return rpc_success; > } > > diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c > index 4737019738ab..6ec22c69cbec 100644 > --- a/fs/nfsd/nfs4proc.c > +++ b/fs/nfsd/nfs4proc.c > @@ -414,7 +414,8 @@ nfsd4_create_file(struct svc_rqst *rqstp, struct svc_fh *fhp, > } > > static __be32 > -do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_open *open, struct svc_fh **resfh) > +do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, > + struct nfsd4_open *open, struct svc_fh **resfh) > { > struct svc_fh *current_fh = &cstate->current_fh; > int accmode; > @@ -441,11 +442,18 @@ do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, stru > * yes | no | GUARDED4 | GUARDED4 > * yes | yes | GUARDED4 | GUARDED4 > */ > - > current->fs->umask = open->op_umask; > status = nfsd4_create_file(rqstp, current_fh, *resfh, open); > current->fs->umask = 0; > > + if (!status) > + /* We really want to hold the lock from before the > + * create to ensure no rename happens, but that > + * needs more work... > + */ > + inode_lock_nested(current_fh->fh_dentry->d_inode, > + I_MUTEX_PARENT); > + > if (!status && open->op_label.len) > nfsd4_security_inode_setsecctx(*resfh, &open->op_label, open->op_bmval); > > @@ -457,17 +465,25 @@ do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, stru > if (nfsd4_create_is_exclusive(open->op_createmode) && status == 0) > open->op_bmval[1] |= (FATTR4_WORD1_TIME_ACCESS | > FATTR4_WORD1_TIME_MODIFY); > - } else > - /* > - * Note this may exit with the parent still locked. > - * We will hold the lock until nfsd4_open's final > - * lookup, to prevent renames or unlinks until we've had > - * a chance to an acquire a delegation if appropriate. > + } else { > + /* We want to keep the directory locked until we've had a chance > + * to acquire a delegation if appropriate, so request that > + * nfsd_lookup() hold on to the lock. > */ > status = nfsd_lookup(rqstp, current_fh, > - open->op_fname, open->op_fnamelen, *resfh); > + open->op_fname, open->op_fnamelen, *resfh, > + true); > + if (!status) { > + /* NFSv4 protocol requires change attributes even though > + * no change happened. > + */ > + fh_fill_pre_attrs(current_fh); > + fh_fill_post_attrs(current_fh); If this is really correct, the comment should also state that no concurrent changes to the parent are possible during the lookup, and thus the pre and post attributes are expected to be the same always. Otherwise, this code paragraph looks just a little insane ;-) > + } > + } > if (status) > - goto out; > + return status; > + > status = nfsd_check_obj_isreg(*resfh); > if (status) > goto out; > @@ -483,6 +499,8 @@ do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, stru > status = do_open_permission(rqstp, *resfh, open, accmode); > set_change_info(&open->op_cinfo, current_fh); > out: > + if (status) > + inode_unlock(current_fh->fh_dentry->d_inode); > return status; > } > > @@ -540,6 +558,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, > struct net *net = SVC_NET(rqstp); > struct nfsd_net *nn = net_generic(net, nfsd_net_id); > bool reclaim = false; > + bool locked = false; > > dprintk("NFSD: nfsd4_open filename %.*s op_openowner %p\n", > (int)open->op_fnamelen, open->op_fname, > @@ -604,6 +623,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, > status = do_open_lookup(rqstp, cstate, open, &resfh); > if (status) > goto out; > + locked = true; > break; > case NFS4_OPEN_CLAIM_PREVIOUS: > status = nfs4_check_open_reclaim(cstate->clp); > @@ -639,6 +659,8 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, > fput(open->op_filp); > open->op_filp = NULL; > } > + if (locked) > + inode_unlock(cstate->current_fh.fh_dentry->d_inode); > if (resfh && resfh != &cstate->current_fh) { > fh_dup2(&cstate->current_fh, resfh); > fh_put(resfh); > @@ -933,7 +955,7 @@ static __be32 nfsd4_do_lookupp(struct svc_rqst *rqstp, struct svc_fh *fh) > return nfserr_noent; > } > fh_put(&tmp_fh); > - return nfsd_lookup(rqstp, fh, "..", 2, fh); > + return nfsd_lookup(rqstp, fh, "..", 2, fh, false); > } > > static __be32 > @@ -949,7 +971,7 @@ nfsd4_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, > { > return nfsd_lookup(rqstp, &cstate->current_fh, > u->lookup.lo_name, u->lookup.lo_len, > - &cstate->current_fh); > + &cstate->current_fh, false); > } > > static __be32 > @@ -1089,11 +1111,10 @@ nfsd4_secinfo(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, > if (err) > return err; > err = nfsd_lookup_dentry(rqstp, &cstate->current_fh, > - secinfo->si_name, secinfo->si_namelen, > - &exp, &dentry); > + secinfo->si_name, secinfo->si_namelen, > + &exp, &dentry, false); > if (err) > return err; > - fh_unlock(&cstate->current_fh); > if (d_really_is_negative(dentry)) { > exp_put(exp); > err = nfserr_noent; > diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c > index a25b8e321662..ed24fae09517 100644 > --- a/fs/nfsd/nfsproc.c > +++ b/fs/nfsd/nfsproc.c > @@ -133,7 +133,7 @@ nfsd_proc_lookup(struct svc_rqst *rqstp) > > fh_init(&resp->fh, NFS_FHSIZE); > resp->status = nfsd_lookup(rqstp, &argp->fh, argp->name, argp->len, > - &resp->fh); > + &resp->fh, false); > fh_put(&argp->fh); > if (resp->status != nfs_ok) > goto out; > diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c > index 4916c29af0fa..8e050c6d112a 100644 > --- a/fs/nfsd/vfs.c > +++ b/fs/nfsd/vfs.c > @@ -172,7 +172,8 @@ int nfsd_mountpoint(struct dentry *dentry, struct svc_export *exp) > __be32 > nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp, > const char *name, unsigned int len, > - struct svc_export **exp_ret, struct dentry **dentry_ret) > + struct svc_export **exp_ret, struct dentry **dentry_ret, > + bool locked) > { > struct svc_export *exp; > struct dentry *dparent; > @@ -199,27 +200,31 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp, > goto out_nfserr; > } > } else { > - /* > - * In the nfsd4_open() case, this may be held across > - * subsequent open and delegation acquisition which may > - * need to take the child's i_mutex: > - */ > - fh_lock_nested(fhp, I_MUTEX_PARENT); > - dentry = lookup_one_len(name, dparent, len); > + if (locked) > + dentry = lookup_one_len(name, dparent, len); > + else > + dentry = lookup_one_len_unlocked(name, dparent, len); > host_err = PTR_ERR(dentry); > if (IS_ERR(dentry)) > goto out_nfserr; > if (nfsd_mountpoint(dentry, exp)) { > /* > - * We don't need the i_mutex after all. It's > - * still possible we could open this (regular > - * files can be mountpoints too), but the > - * i_mutex is just there to prevent renames of > - * something that we might be about to delegate, > - * and a mountpoint won't be renamed: > + * nfsd_cross_mnt() may wait for an upcall > + * to userspace, and holding i_sem across that > + * invites the possibility of a deadlock. > + * We don't really need the lock on the parent > + * of a mount point was we only need it to guard > + * against a rename before we get a lease for a > + * delegation. > + * So just drop the i_sem and reclaim it. > */ > - fh_unlock(fhp); > - if ((host_err = nfsd_cross_mnt(rqstp, &dentry, &exp))) { > + if (locked) > + inode_unlock(dparent->d_inode); > + host_err = nfsd_cross_mnt(rqstp, &dentry, &exp); > + if (locked) > + inode_lock_nested(dparent->d_inode, > + I_MUTEX_PARENT); > + if (host_err) { > dput(dentry); > goto out_nfserr; > } > @@ -234,7 +239,17 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp, > return nfserrno(host_err); > } > > -/* > +/** > + * nfsd_lookup - look up a single path component for nfsd > + * > + * @rqstp: the request context > + * @ftp: the file handle of the directory > + * @name: the component name, or %NULL to look up parent > + * @len: length of name to examine > + * @resfh: pointer to pre-initialised filehandle to hold result. > + * @lock: if true, lock directory during lookup and keep it locked > + * if there is no error. > + * > * Look up one component of a pathname. > * N.B. After this call _both_ fhp and resfh need an fh_put > * > @@ -244,11 +259,15 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp, > * returned. Otherwise the covered directory is returned. > * NOTE: this mountpoint crossing is not supported properly by all > * clients and is explicitly disallowed for NFSv3 > - * NeilBrown <neilb@cse.unsw.edu.au> > + * > + * Only nfsd4_open() calls this with @lock set. It does so to block > + * renames/unlinks before it possibly gets a lease to provide a > + * delegation. > */ > __be32 > nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name, > - unsigned int len, struct svc_fh *resfh) > + unsigned int len, struct svc_fh *resfh, > + bool lock) > { > struct svc_export *exp; > struct dentry *dentry; > @@ -257,9 +276,11 @@ nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name, > err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_EXEC); > if (err) > return err; > - err = nfsd_lookup_dentry(rqstp, fhp, name, len, &exp, &dentry); > + if (lock) > + inode_lock_nested(fhp->fh_dentry->d_inode, I_MUTEX_PARENT); > + err = nfsd_lookup_dentry(rqstp, fhp, name, len, &exp, &dentry, lock); > if (err) > - return err; > + goto out_err; > err = check_nfsd_access(exp, rqstp); > if (err) > goto out; > @@ -273,6 +294,9 @@ nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name, > out: > dput(dentry); > exp_put(exp); > +out_err: > + if (err && lock) > + inode_unlock(fhp->fh_dentry->d_inode); > return err; > } > > diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h > index 9f4fd3060200..290788f007d4 100644 > --- a/fs/nfsd/vfs.h > +++ b/fs/nfsd/vfs.h > @@ -45,10 +45,12 @@ typedef int (*nfsd_filldir_t)(void *, const char *, int, loff_t, u64, unsigned); > int nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp, > struct svc_export **expp); > __be32 nfsd_lookup(struct svc_rqst *, struct svc_fh *, > - const char *, unsigned int, struct svc_fh *); > + const char *, unsigned int, struct svc_fh *, > + bool); > __be32 nfsd_lookup_dentry(struct svc_rqst *, struct svc_fh *, > - const char *, unsigned int, > - struct svc_export **, struct dentry **); > + const char *, unsigned int, > + struct svc_export **, struct dentry **, > + bool); > __be32 nfsd_setattr(struct svc_rqst *, struct svc_fh *, > struct iattr *, int, time64_t); > int nfsd_mountpoint(struct dentry *, struct svc_export *); > > -- Chuck Lever
On Wed, 06 Jul 2022, Jeff Layton wrote: > On Wed, 2022-07-06 at 14:18 +1000, NeilBrown wrote: > > nfsd_lookup() takes an exclusive lock on the parent inode, but many > > callers don't want the lock and may not need to lock at all if the > > result is in the dcache. > > > > Change nfsd_lookup() to be passed a bool flag. > > If false, don't take the lock. > > If true, do take an exclusive lock, and return with it held if > > successful. > > If nfsd_lookup() returns an error, the lock WILL NOT be held. > > > > Only nfsd4_open() requests the lock to be held, and does so to block > > rename until it decides whether to return a delegation. > > > > NOTE: when nfsd4_open() creates a file, the directory does *NOT* remain > > locked and never has. So it is possible (though unlikely) for the > > newly created file to be renamed before a delegation is handed out, > > and that would be bad. This patch does *NOT* fix that, but *DOES* > > take the directory lock immediately after creating the file, which > > reduces the size of the window and ensure that the lock is held > > consistently. More work is needed to guarantee no rename happens > > before the delegation. > > > > Interesting. Maybe after taking the lock, we could re-vet the dentry vs. > the info in the OPEN request? That way, we'd presumably know that the > above race didn't occur. I would lean towards revalidating the dentry after getting the lease. However I don't think "revalidate the dentry" is quite as easy as I would like it to be, particularly if you care about bind-mounts of regular files. > > /* > > - * We don't need the i_mutex after all. It's > > - * still possible we could open this (regular > > - * files can be mountpoints too), but the > > - * i_mutex is just there to prevent renames of > > - * something that we might be about to delegate, > > - * and a mountpoint won't be renamed: > > + * nfsd_cross_mnt() may wait for an upcall > > + * to userspace, and holding i_sem across that > > s/i_sem/i_rwsem/ But ... fs/nilfs2/nilfs.h calls it i_sem, as does fs/jffs2/README.Locking And $ git grep -w i_mutex | wc 180 1878 13728 But yes, I should spell it i_rwsem... or maybe just "the inode lock". > > Other than minor comment nit... > > Reviewed-by: Jeff Layton <jlayton@kernel.org> > Thanks, NeilBrown
On Thu, 07 Jul 2022, Chuck Lever III wrote: > > + /* We want to keep the directory locked until we've had a chance > > + * to acquire a delegation if appropriate, so request that > > + * nfsd_lookup() hold on to the lock. > > */ > > status = nfsd_lookup(rqstp, current_fh, > > - open->op_fname, open->op_fnamelen, *resfh); > > + open->op_fname, open->op_fnamelen, *resfh, > > + true); > > + if (!status) { > > + /* NFSv4 protocol requires change attributes even though > > + * no change happened. > > + */ > > + fh_fill_pre_attrs(current_fh); > > + fh_fill_post_attrs(current_fh); > > If this is really correct, the comment should also state that > no concurrent changes to the parent are possible during > the lookup, and thus the pre and post attributes are expected > to be the same always. The earlier comment notes that nfsd_lookup() is called in a way in which it takes and keeps the lock - that should imply that no other changes can happen. But with such insane looking code, it doesn't hurt to be extra explicit. > > Otherwise, this code paragraph looks just a little insane ;-) > :-) Thanks, NeilBrown
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index ad7941001106..3a67d0afb885 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -96,7 +96,7 @@ nfsd3_proc_lookup(struct svc_rqst *rqstp) resp->status = nfsd_lookup(rqstp, &resp->dirfh, argp->name, argp->len, - &resp->fh); + &resp->fh, false); return rpc_success; } diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 4737019738ab..6ec22c69cbec 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -414,7 +414,8 @@ nfsd4_create_file(struct svc_rqst *rqstp, struct svc_fh *fhp, } static __be32 -do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_open *open, struct svc_fh **resfh) +do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_open *open, struct svc_fh **resfh) { struct svc_fh *current_fh = &cstate->current_fh; int accmode; @@ -441,11 +442,18 @@ do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, stru * yes | no | GUARDED4 | GUARDED4 * yes | yes | GUARDED4 | GUARDED4 */ - current->fs->umask = open->op_umask; status = nfsd4_create_file(rqstp, current_fh, *resfh, open); current->fs->umask = 0; + if (!status) + /* We really want to hold the lock from before the + * create to ensure no rename happens, but that + * needs more work... + */ + inode_lock_nested(current_fh->fh_dentry->d_inode, + I_MUTEX_PARENT); + if (!status && open->op_label.len) nfsd4_security_inode_setsecctx(*resfh, &open->op_label, open->op_bmval); @@ -457,17 +465,25 @@ do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, stru if (nfsd4_create_is_exclusive(open->op_createmode) && status == 0) open->op_bmval[1] |= (FATTR4_WORD1_TIME_ACCESS | FATTR4_WORD1_TIME_MODIFY); - } else - /* - * Note this may exit with the parent still locked. - * We will hold the lock until nfsd4_open's final - * lookup, to prevent renames or unlinks until we've had - * a chance to an acquire a delegation if appropriate. + } else { + /* We want to keep the directory locked until we've had a chance + * to acquire a delegation if appropriate, so request that + * nfsd_lookup() hold on to the lock. */ status = nfsd_lookup(rqstp, current_fh, - open->op_fname, open->op_fnamelen, *resfh); + open->op_fname, open->op_fnamelen, *resfh, + true); + if (!status) { + /* NFSv4 protocol requires change attributes even though + * no change happened. + */ + fh_fill_pre_attrs(current_fh); + fh_fill_post_attrs(current_fh); + } + } if (status) - goto out; + return status; + status = nfsd_check_obj_isreg(*resfh); if (status) goto out; @@ -483,6 +499,8 @@ do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, stru status = do_open_permission(rqstp, *resfh, open, accmode); set_change_info(&open->op_cinfo, current_fh); out: + if (status) + inode_unlock(current_fh->fh_dentry->d_inode); return status; } @@ -540,6 +558,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct net *net = SVC_NET(rqstp); struct nfsd_net *nn = net_generic(net, nfsd_net_id); bool reclaim = false; + bool locked = false; dprintk("NFSD: nfsd4_open filename %.*s op_openowner %p\n", (int)open->op_fnamelen, open->op_fname, @@ -604,6 +623,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, status = do_open_lookup(rqstp, cstate, open, &resfh); if (status) goto out; + locked = true; break; case NFS4_OPEN_CLAIM_PREVIOUS: status = nfs4_check_open_reclaim(cstate->clp); @@ -639,6 +659,8 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, fput(open->op_filp); open->op_filp = NULL; } + if (locked) + inode_unlock(cstate->current_fh.fh_dentry->d_inode); if (resfh && resfh != &cstate->current_fh) { fh_dup2(&cstate->current_fh, resfh); fh_put(resfh); @@ -933,7 +955,7 @@ static __be32 nfsd4_do_lookupp(struct svc_rqst *rqstp, struct svc_fh *fh) return nfserr_noent; } fh_put(&tmp_fh); - return nfsd_lookup(rqstp, fh, "..", 2, fh); + return nfsd_lookup(rqstp, fh, "..", 2, fh, false); } static __be32 @@ -949,7 +971,7 @@ nfsd4_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, { return nfsd_lookup(rqstp, &cstate->current_fh, u->lookup.lo_name, u->lookup.lo_len, - &cstate->current_fh); + &cstate->current_fh, false); } static __be32 @@ -1089,11 +1111,10 @@ nfsd4_secinfo(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (err) return err; err = nfsd_lookup_dentry(rqstp, &cstate->current_fh, - secinfo->si_name, secinfo->si_namelen, - &exp, &dentry); + secinfo->si_name, secinfo->si_namelen, + &exp, &dentry, false); if (err) return err; - fh_unlock(&cstate->current_fh); if (d_really_is_negative(dentry)) { exp_put(exp); err = nfserr_noent; diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index a25b8e321662..ed24fae09517 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -133,7 +133,7 @@ nfsd_proc_lookup(struct svc_rqst *rqstp) fh_init(&resp->fh, NFS_FHSIZE); resp->status = nfsd_lookup(rqstp, &argp->fh, argp->name, argp->len, - &resp->fh); + &resp->fh, false); fh_put(&argp->fh); if (resp->status != nfs_ok) goto out; diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 4916c29af0fa..8e050c6d112a 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -172,7 +172,8 @@ int nfsd_mountpoint(struct dentry *dentry, struct svc_export *exp) __be32 nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name, unsigned int len, - struct svc_export **exp_ret, struct dentry **dentry_ret) + struct svc_export **exp_ret, struct dentry **dentry_ret, + bool locked) { struct svc_export *exp; struct dentry *dparent; @@ -199,27 +200,31 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp, goto out_nfserr; } } else { - /* - * In the nfsd4_open() case, this may be held across - * subsequent open and delegation acquisition which may - * need to take the child's i_mutex: - */ - fh_lock_nested(fhp, I_MUTEX_PARENT); - dentry = lookup_one_len(name, dparent, len); + if (locked) + dentry = lookup_one_len(name, dparent, len); + else + dentry = lookup_one_len_unlocked(name, dparent, len); host_err = PTR_ERR(dentry); if (IS_ERR(dentry)) goto out_nfserr; if (nfsd_mountpoint(dentry, exp)) { /* - * We don't need the i_mutex after all. It's - * still possible we could open this (regular - * files can be mountpoints too), but the - * i_mutex is just there to prevent renames of - * something that we might be about to delegate, - * and a mountpoint won't be renamed: + * nfsd_cross_mnt() may wait for an upcall + * to userspace, and holding i_sem across that + * invites the possibility of a deadlock. + * We don't really need the lock on the parent + * of a mount point was we only need it to guard + * against a rename before we get a lease for a + * delegation. + * So just drop the i_sem and reclaim it. */ - fh_unlock(fhp); - if ((host_err = nfsd_cross_mnt(rqstp, &dentry, &exp))) { + if (locked) + inode_unlock(dparent->d_inode); + host_err = nfsd_cross_mnt(rqstp, &dentry, &exp); + if (locked) + inode_lock_nested(dparent->d_inode, + I_MUTEX_PARENT); + if (host_err) { dput(dentry); goto out_nfserr; } @@ -234,7 +239,17 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp, return nfserrno(host_err); } -/* +/** + * nfsd_lookup - look up a single path component for nfsd + * + * @rqstp: the request context + * @ftp: the file handle of the directory + * @name: the component name, or %NULL to look up parent + * @len: length of name to examine + * @resfh: pointer to pre-initialised filehandle to hold result. + * @lock: if true, lock directory during lookup and keep it locked + * if there is no error. + * * Look up one component of a pathname. * N.B. After this call _both_ fhp and resfh need an fh_put * @@ -244,11 +259,15 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp, * returned. Otherwise the covered directory is returned. * NOTE: this mountpoint crossing is not supported properly by all * clients and is explicitly disallowed for NFSv3 - * NeilBrown <neilb@cse.unsw.edu.au> + * + * Only nfsd4_open() calls this with @lock set. It does so to block + * renames/unlinks before it possibly gets a lease to provide a + * delegation. */ __be32 nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name, - unsigned int len, struct svc_fh *resfh) + unsigned int len, struct svc_fh *resfh, + bool lock) { struct svc_export *exp; struct dentry *dentry; @@ -257,9 +276,11 @@ nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name, err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_EXEC); if (err) return err; - err = nfsd_lookup_dentry(rqstp, fhp, name, len, &exp, &dentry); + if (lock) + inode_lock_nested(fhp->fh_dentry->d_inode, I_MUTEX_PARENT); + err = nfsd_lookup_dentry(rqstp, fhp, name, len, &exp, &dentry, lock); if (err) - return err; + goto out_err; err = check_nfsd_access(exp, rqstp); if (err) goto out; @@ -273,6 +294,9 @@ nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name, out: dput(dentry); exp_put(exp); +out_err: + if (err && lock) + inode_unlock(fhp->fh_dentry->d_inode); return err; } diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index 9f4fd3060200..290788f007d4 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -45,10 +45,12 @@ typedef int (*nfsd_filldir_t)(void *, const char *, int, loff_t, u64, unsigned); int nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp, struct svc_export **expp); __be32 nfsd_lookup(struct svc_rqst *, struct svc_fh *, - const char *, unsigned int, struct svc_fh *); + const char *, unsigned int, struct svc_fh *, + bool); __be32 nfsd_lookup_dentry(struct svc_rqst *, struct svc_fh *, - const char *, unsigned int, - struct svc_export **, struct dentry **); + const char *, unsigned int, + struct svc_export **, struct dentry **, + bool); __be32 nfsd_setattr(struct svc_rqst *, struct svc_fh *, struct iattr *, int, time64_t); int nfsd_mountpoint(struct dentry *, struct svc_export *);
nfsd_lookup() takes an exclusive lock on the parent inode, but many callers don't want the lock and may not need to lock at all if the result is in the dcache. Change nfsd_lookup() to be passed a bool flag. If false, don't take the lock. If true, do take an exclusive lock, and return with it held if successful. If nfsd_lookup() returns an error, the lock WILL NOT be held. Only nfsd4_open() requests the lock to be held, and does so to block rename until it decides whether to return a delegation. NOTE: when nfsd4_open() creates a file, the directory does *NOT* remain locked and never has. So it is possible (though unlikely) for the newly created file to be renamed before a delegation is handed out, and that would be bad. This patch does *NOT* fix that, but *DOES* take the directory lock immediately after creating the file, which reduces the size of the window and ensure that the lock is held consistently. More work is needed to guarantee no rename happens before the delegation. NOTE-2: NFSv4 requires directory changeinfo for OPEN even when a create wasn't requested and no change happened. Now that nfsd_lookup() doesn't use fh_lock(), we need explicit fh_fill_pre/post_attrs() in the non-create branch of do_open_lookup(). Signed-off-by: NeilBrown <neilb@suse.de> --- fs/nfsd/nfs3proc.c | 2 +- fs/nfsd/nfs4proc.c | 51 ++++++++++++++++++++++++++++------------ fs/nfsd/nfsproc.c | 2 +- fs/nfsd/vfs.c | 66 +++++++++++++++++++++++++++++++++++----------------- fs/nfsd/vfs.h | 8 ++++-- 5 files changed, 88 insertions(+), 41 deletions(-)