diff mbox series

[5/8] NFSD: reduce locking in nfsd_lookup()

Message ID 165708109258.1940.1095066282860556838.stgit@noble.brown (mailing list archive)
State New, archived
Headers show
Series NFSD: clean up locking. | expand

Commit Message

NeilBrown July 6, 2022, 4:18 a.m. UTC
nfsd_lookup() takes an exclusive lock on the parent inode, but many
callers don't want the lock and may not need to lock at all if the
result is in the dcache.

Change nfsd_lookup() to be passed a bool flag.
If false, don't take the lock.
If true, do take an exclusive lock, and return with it held if
successful.
If nfsd_lookup() returns an error, the lock WILL NOT be held.

Only nfsd4_open() requests the lock to be held, and does so to block
rename until it decides whether to return a delegation.

NOTE: when nfsd4_open() creates a file, the directory does *NOT* remain
  locked and never has.  So it is possible (though unlikely) for the
  newly created file to be renamed before a delegation is handed out,
  and that would be bad.  This patch does *NOT* fix that, but *DOES*
  take the directory lock immediately after creating the file, which
  reduces the size of the window and ensure that the lock is held
  consistently.  More work is needed to guarantee no rename happens
  before the delegation.

NOTE-2: NFSv4 requires directory changeinfo for OPEN even when a create
  wasn't requested and no change happened.  Now that nfsd_lookup()
  doesn't use fh_lock(), we need explicit fh_fill_pre/post_attrs()
  in the non-create branch of do_open_lookup().

Signed-off-by: NeilBrown <neilb@suse.de>
---
 fs/nfsd/nfs3proc.c |    2 +-
 fs/nfsd/nfs4proc.c |   51 ++++++++++++++++++++++++++++------------
 fs/nfsd/nfsproc.c  |    2 +-
 fs/nfsd/vfs.c      |   66 +++++++++++++++++++++++++++++++++++-----------------
 fs/nfsd/vfs.h      |    8 ++++--
 5 files changed, 88 insertions(+), 41 deletions(-)

Comments

Jeff Layton July 6, 2022, 1:47 p.m. UTC | #1
On Wed, 2022-07-06 at 14:18 +1000, NeilBrown wrote:
> nfsd_lookup() takes an exclusive lock on the parent inode, but many
> callers don't want the lock and may not need to lock at all if the
> result is in the dcache.
> 
> Change nfsd_lookup() to be passed a bool flag.
> If false, don't take the lock.
> If true, do take an exclusive lock, and return with it held if
> successful.
> If nfsd_lookup() returns an error, the lock WILL NOT be held.
> 
> Only nfsd4_open() requests the lock to be held, and does so to block
> rename until it decides whether to return a delegation.
> 
> NOTE: when nfsd4_open() creates a file, the directory does *NOT* remain
>   locked and never has.  So it is possible (though unlikely) for the
>   newly created file to be renamed before a delegation is handed out,
>   and that would be bad.  This patch does *NOT* fix that, but *DOES*
>   take the directory lock immediately after creating the file, which
>   reduces the size of the window and ensure that the lock is held
>   consistently.  More work is needed to guarantee no rename happens
>   before the delegation.
> 

Interesting. Maybe after taking the lock, we could re-vet the dentry vs.
the info in the OPEN request? That way, we'd presumably know that the
above race didn't occur.

> NOTE-2: NFSv4 requires directory changeinfo for OPEN even when a create
>   wasn't requested and no change happened.  Now that nfsd_lookup()
>   doesn't use fh_lock(), we need explicit fh_fill_pre/post_attrs()
>   in the non-create branch of do_open_lookup().
> 
> Signed-off-by: NeilBrown <neilb@suse.de>
> ---
>  fs/nfsd/nfs3proc.c |    2 +-
>  fs/nfsd/nfs4proc.c |   51 ++++++++++++++++++++++++++++------------
>  fs/nfsd/nfsproc.c  |    2 +-
>  fs/nfsd/vfs.c      |   66 +++++++++++++++++++++++++++++++++++-----------------
>  fs/nfsd/vfs.h      |    8 ++++--
>  5 files changed, 88 insertions(+), 41 deletions(-)
> 
> diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
> index ad7941001106..3a67d0afb885 100644
> --- a/fs/nfsd/nfs3proc.c
> +++ b/fs/nfsd/nfs3proc.c
> @@ -96,7 +96,7 @@ nfsd3_proc_lookup(struct svc_rqst *rqstp)
>  
>  	resp->status = nfsd_lookup(rqstp, &resp->dirfh,
>  				   argp->name, argp->len,
> -				   &resp->fh);
> +				   &resp->fh, false);
>  	return rpc_success;
>  }
>  
> diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
> index 4737019738ab..6ec22c69cbec 100644
> --- a/fs/nfsd/nfs4proc.c
> +++ b/fs/nfsd/nfs4proc.c
> @@ -414,7 +414,8 @@ nfsd4_create_file(struct svc_rqst *rqstp, struct svc_fh *fhp,
>  }
>  
>  static __be32
> -do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_open *open, struct svc_fh **resfh)
> +do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
> +	       struct nfsd4_open *open, struct svc_fh **resfh)
>  {
>  	struct svc_fh *current_fh = &cstate->current_fh;
>  	int accmode;
> @@ -441,11 +442,18 @@ do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, stru
>  		 * yes          | no     | GUARDED4        | GUARDED4
>  		 * yes          | yes    | GUARDED4        | GUARDED4
>  		 */
> -
>  		current->fs->umask = open->op_umask;
>  		status = nfsd4_create_file(rqstp, current_fh, *resfh, open);
>  		current->fs->umask = 0;
>  
> +		if (!status)
> +			/* We really want to hold the lock from before the
> +			 * create to ensure no rename happens, but that
> +			 * needs more work...
> +			 */
> +			inode_lock_nested(current_fh->fh_dentry->d_inode,
> +					  I_MUTEX_PARENT);
> +
>  		if (!status && open->op_label.len)
>  			nfsd4_security_inode_setsecctx(*resfh, &open->op_label, open->op_bmval);
>  
> @@ -457,17 +465,25 @@ do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, stru
>  		if (nfsd4_create_is_exclusive(open->op_createmode) && status == 0)
>  			open->op_bmval[1] |= (FATTR4_WORD1_TIME_ACCESS |
>  						FATTR4_WORD1_TIME_MODIFY);
> -	} else
> -		/*
> -		 * Note this may exit with the parent still locked.
> -		 * We will hold the lock until nfsd4_open's final
> -		 * lookup, to prevent renames or unlinks until we've had
> -		 * a chance to an acquire a delegation if appropriate.
> +	} else {
> +		/* We want to keep the directory locked until we've had a chance
> +		 * to acquire a delegation if appropriate, so request that
> +		 * nfsd_lookup() hold on to the lock.
>  		 */
>  		status = nfsd_lookup(rqstp, current_fh,
> -				     open->op_fname, open->op_fnamelen, *resfh);
> +				     open->op_fname, open->op_fnamelen, *resfh,
> +				     true);
> +		if (!status) {
> +			/* NFSv4 protocol requires change attributes even though
> +			 * no change happened.
> +			 */
> +			fh_fill_pre_attrs(current_fh);
> +			fh_fill_post_attrs(current_fh);
> +		}
> +	}
>  	if (status)
> -		goto out;
> +		return status;
> +
>  	status = nfsd_check_obj_isreg(*resfh);
>  	if (status)
>  		goto out;
> @@ -483,6 +499,8 @@ do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, stru
>  	status = do_open_permission(rqstp, *resfh, open, accmode);
>  	set_change_info(&open->op_cinfo, current_fh);
>  out:
> +	if (status)
> +		inode_unlock(current_fh->fh_dentry->d_inode);
>  	return status;
>  }
>  
> @@ -540,6 +558,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
>  	struct net *net = SVC_NET(rqstp);
>  	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
>  	bool reclaim = false;
> +	bool locked = false;
>  
>  	dprintk("NFSD: nfsd4_open filename %.*s op_openowner %p\n",
>  		(int)open->op_fnamelen, open->op_fname,
> @@ -604,6 +623,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
>  		status = do_open_lookup(rqstp, cstate, open, &resfh);
>  		if (status)
>  			goto out;
> +		locked = true;
>  		break;
>  	case NFS4_OPEN_CLAIM_PREVIOUS:
>  		status = nfs4_check_open_reclaim(cstate->clp);
> @@ -639,6 +659,8 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
>  		fput(open->op_filp);
>  		open->op_filp = NULL;
>  	}
> +	if (locked)
> +		inode_unlock(cstate->current_fh.fh_dentry->d_inode);
>  	if (resfh && resfh != &cstate->current_fh) {
>  		fh_dup2(&cstate->current_fh, resfh);
>  		fh_put(resfh);
> @@ -933,7 +955,7 @@ static __be32 nfsd4_do_lookupp(struct svc_rqst *rqstp, struct svc_fh *fh)
>  		return nfserr_noent;
>  	}
>  	fh_put(&tmp_fh);
> -	return nfsd_lookup(rqstp, fh, "..", 2, fh);
> +	return nfsd_lookup(rqstp, fh, "..", 2, fh, false);
>  }
>  
>  static __be32
> @@ -949,7 +971,7 @@ nfsd4_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
>  {
>  	return nfsd_lookup(rqstp, &cstate->current_fh,
>  			   u->lookup.lo_name, u->lookup.lo_len,
> -			   &cstate->current_fh);
> +			   &cstate->current_fh, false);
>  }
>  
>  static __be32
> @@ -1089,11 +1111,10 @@ nfsd4_secinfo(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
>  	if (err)
>  		return err;
>  	err = nfsd_lookup_dentry(rqstp, &cstate->current_fh,
> -				    secinfo->si_name, secinfo->si_namelen,
> -				    &exp, &dentry);
> +				 secinfo->si_name, secinfo->si_namelen,
> +				 &exp, &dentry, false);
>  	if (err)
>  		return err;
> -	fh_unlock(&cstate->current_fh);
>  	if (d_really_is_negative(dentry)) {
>  		exp_put(exp);
>  		err = nfserr_noent;
> diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
> index a25b8e321662..ed24fae09517 100644
> --- a/fs/nfsd/nfsproc.c
> +++ b/fs/nfsd/nfsproc.c
> @@ -133,7 +133,7 @@ nfsd_proc_lookup(struct svc_rqst *rqstp)
>  
>  	fh_init(&resp->fh, NFS_FHSIZE);
>  	resp->status = nfsd_lookup(rqstp, &argp->fh, argp->name, argp->len,
> -				   &resp->fh);
> +				   &resp->fh, false);
>  	fh_put(&argp->fh);
>  	if (resp->status != nfs_ok)
>  		goto out;
> diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
> index 4916c29af0fa..8e050c6d112a 100644
> --- a/fs/nfsd/vfs.c
> +++ b/fs/nfsd/vfs.c
> @@ -172,7 +172,8 @@ int nfsd_mountpoint(struct dentry *dentry, struct svc_export *exp)
>  __be32
>  nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
>  		   const char *name, unsigned int len,
> -		   struct svc_export **exp_ret, struct dentry **dentry_ret)
> +		   struct svc_export **exp_ret, struct dentry **dentry_ret,
> +		   bool locked)
>  {
>  	struct svc_export	*exp;
>  	struct dentry		*dparent;
> @@ -199,27 +200,31 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
>  				goto out_nfserr;
>  		}
>  	} else {
> -		/*
> -		 * In the nfsd4_open() case, this may be held across
> -		 * subsequent open and delegation acquisition which may
> -		 * need to take the child's i_mutex:
> -		 */
> -		fh_lock_nested(fhp, I_MUTEX_PARENT);
> -		dentry = lookup_one_len(name, dparent, len);
> +		if (locked)
> +			dentry = lookup_one_len(name, dparent, len);
> +		else
> +			dentry = lookup_one_len_unlocked(name, dparent, len);
>  		host_err = PTR_ERR(dentry);
>  		if (IS_ERR(dentry))
>  			goto out_nfserr;
>  		if (nfsd_mountpoint(dentry, exp)) {
>  			/*
> -			 * We don't need the i_mutex after all.  It's
> -			 * still possible we could open this (regular
> -			 * files can be mountpoints too), but the
> -			 * i_mutex is just there to prevent renames of
> -			 * something that we might be about to delegate,
> -			 * and a mountpoint won't be renamed:
> +			 * nfsd_cross_mnt() may wait for an upcall
> +			 * to userspace, and holding i_sem across that

s/i_sem/i_rwsem/

> +			 * invites the possibility of a deadlock.
> +			 * We don't really need the lock on the parent
> +			 * of a mount point was we only need it to guard
> +			 * against a rename before we get a lease for a
> +			 * delegation.
> +			 * So just drop the i_sem and reclaim it.
>  			 */
> -			fh_unlock(fhp);
> -			if ((host_err = nfsd_cross_mnt(rqstp, &dentry, &exp))) {
> +			if (locked)
> +				inode_unlock(dparent->d_inode);
> +			host_err = nfsd_cross_mnt(rqstp, &dentry, &exp);
> +			if (locked)
> +				inode_lock_nested(dparent->d_inode,
> +						  I_MUTEX_PARENT);
> +			if (host_err) {
>  				dput(dentry);
>  				goto out_nfserr;
>  			}
> @@ -234,7 +239,17 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
>  	return nfserrno(host_err);
>  }
>  
> -/*
> +/**
> + * nfsd_lookup - look up a single path component for nfsd
> + *
> + * @rqstp:   the request context
> + * @ftp:     the file handle of the directory
> + * @name:    the component name, or %NULL to look up parent
> + * @len:     length of name to examine
> + * @resfh:   pointer to pre-initialised filehandle to hold result.
> + * @lock:    if true, lock directory during lookup and keep it locked
> + *           if there is no error.
> + *
>   * Look up one component of a pathname.
>   * N.B. After this call _both_ fhp and resfh need an fh_put
>   *
> @@ -244,11 +259,15 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
>   * returned. Otherwise the covered directory is returned.
>   * NOTE: this mountpoint crossing is not supported properly by all
>   *   clients and is explicitly disallowed for NFSv3
> - *      NeilBrown <neilb@cse.unsw.edu.au>
> + *
> + * Only nfsd4_open() calls this with @lock set.  It does so to block
> + * renames/unlinks before it possibly gets a lease to provide a
> + * delegation.
>   */
>  __be32
>  nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name,
> -				unsigned int len, struct svc_fh *resfh)
> +	    unsigned int len, struct svc_fh *resfh,
> +	    bool lock)
>  {
>  	struct svc_export	*exp;
>  	struct dentry		*dentry;
> @@ -257,9 +276,11 @@ nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name,
>  	err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_EXEC);
>  	if (err)
>  		return err;
> -	err = nfsd_lookup_dentry(rqstp, fhp, name, len, &exp, &dentry);
> +	if (lock)
> +		inode_lock_nested(fhp->fh_dentry->d_inode, I_MUTEX_PARENT);
> +	err = nfsd_lookup_dentry(rqstp, fhp, name, len, &exp, &dentry, lock);
>  	if (err)
> -		return err;
> +		goto out_err;
>  	err = check_nfsd_access(exp, rqstp);
>  	if (err)
>  		goto out;
> @@ -273,6 +294,9 @@ nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name,
>  out:
>  	dput(dentry);
>  	exp_put(exp);
> +out_err:
> +	if (err && lock)
> +		inode_unlock(fhp->fh_dentry->d_inode);
>  	return err;
>  }
>  
> diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
> index 9f4fd3060200..290788f007d4 100644
> --- a/fs/nfsd/vfs.h
> +++ b/fs/nfsd/vfs.h
> @@ -45,10 +45,12 @@ typedef int (*nfsd_filldir_t)(void *, const char *, int, loff_t, u64, unsigned);
>  int		nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp,
>  		                struct svc_export **expp);
>  __be32		nfsd_lookup(struct svc_rqst *, struct svc_fh *,
> -				const char *, unsigned int, struct svc_fh *);
> +			    const char *, unsigned int, struct svc_fh *,
> +			    bool);
>  __be32		 nfsd_lookup_dentry(struct svc_rqst *, struct svc_fh *,
> -				const char *, unsigned int,
> -				struct svc_export **, struct dentry **);
> +				    const char *, unsigned int,
> +				    struct svc_export **, struct dentry **,
> +				    bool);
>  __be32		nfsd_setattr(struct svc_rqst *, struct svc_fh *,
>  				struct iattr *, int, time64_t);
>  int nfsd_mountpoint(struct dentry *, struct svc_export *);
> 
> 

Other than minor comment nit...

Reviewed-by: Jeff Layton <jlayton@kernel.org>
Chuck Lever July 6, 2022, 4:29 p.m. UTC | #2
> On Jul 6, 2022, at 12:18 AM, NeilBrown <neilb@suse.de> wrote:
> 
> nfsd_lookup() takes an exclusive lock on the parent inode, but many
> callers don't want the lock and may not need to lock at all if the
> result is in the dcache.
> 
> Change nfsd_lookup() to be passed a bool flag.
> If false, don't take the lock.
> If true, do take an exclusive lock, and return with it held if
> successful.
> If nfsd_lookup() returns an error, the lock WILL NOT be held.
> 
> Only nfsd4_open() requests the lock to be held, and does so to block
> rename until it decides whether to return a delegation.
> 
> NOTE: when nfsd4_open() creates a file, the directory does *NOT* remain
>  locked and never has.  So it is possible (though unlikely) for the
>  newly created file to be renamed before a delegation is handed out,
>  and that would be bad.  This patch does *NOT* fix that, but *DOES*
>  take the directory lock immediately after creating the file, which
>  reduces the size of the window and ensure that the lock is held
>  consistently.  More work is needed to guarantee no rename happens
>  before the delegation.
> 
> NOTE-2: NFSv4 requires directory changeinfo for OPEN even when a create
>  wasn't requested and no change happened.  Now that nfsd_lookup()
>  doesn't use fh_lock(), we need explicit fh_fill_pre/post_attrs()
>  in the non-create branch of do_open_lookup().
> 
> Signed-off-by: NeilBrown <neilb@suse.de>
> ---
> fs/nfsd/nfs3proc.c |    2 +-
> fs/nfsd/nfs4proc.c |   51 ++++++++++++++++++++++++++++------------
> fs/nfsd/nfsproc.c  |    2 +-
> fs/nfsd/vfs.c      |   66 +++++++++++++++++++++++++++++++++++-----------------
> fs/nfsd/vfs.h      |    8 ++++--
> 5 files changed, 88 insertions(+), 41 deletions(-)
> 
> diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
> index ad7941001106..3a67d0afb885 100644
> --- a/fs/nfsd/nfs3proc.c
> +++ b/fs/nfsd/nfs3proc.c
> @@ -96,7 +96,7 @@ nfsd3_proc_lookup(struct svc_rqst *rqstp)
> 
> 	resp->status = nfsd_lookup(rqstp, &resp->dirfh,
> 				   argp->name, argp->len,
> -				   &resp->fh);
> +				   &resp->fh, false);
> 	return rpc_success;
> }
> 
> diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
> index 4737019738ab..6ec22c69cbec 100644
> --- a/fs/nfsd/nfs4proc.c
> +++ b/fs/nfsd/nfs4proc.c
> @@ -414,7 +414,8 @@ nfsd4_create_file(struct svc_rqst *rqstp, struct svc_fh *fhp,
> }
> 
> static __be32
> -do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_open *open, struct svc_fh **resfh)
> +do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
> +	       struct nfsd4_open *open, struct svc_fh **resfh)
> {
> 	struct svc_fh *current_fh = &cstate->current_fh;
> 	int accmode;
> @@ -441,11 +442,18 @@ do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, stru
> 		 * yes          | no     | GUARDED4        | GUARDED4
> 		 * yes          | yes    | GUARDED4        | GUARDED4
> 		 */
> -
> 		current->fs->umask = open->op_umask;
> 		status = nfsd4_create_file(rqstp, current_fh, *resfh, open);
> 		current->fs->umask = 0;
> 
> +		if (!status)
> +			/* We really want to hold the lock from before the
> +			 * create to ensure no rename happens, but that
> +			 * needs more work...
> +			 */
> +			inode_lock_nested(current_fh->fh_dentry->d_inode,
> +					  I_MUTEX_PARENT);
> +
> 		if (!status && open->op_label.len)
> 			nfsd4_security_inode_setsecctx(*resfh, &open->op_label, open->op_bmval);
> 
> @@ -457,17 +465,25 @@ do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, stru
> 		if (nfsd4_create_is_exclusive(open->op_createmode) && status == 0)
> 			open->op_bmval[1] |= (FATTR4_WORD1_TIME_ACCESS |
> 						FATTR4_WORD1_TIME_MODIFY);
> -	} else
> -		/*
> -		 * Note this may exit with the parent still locked.
> -		 * We will hold the lock until nfsd4_open's final
> -		 * lookup, to prevent renames or unlinks until we've had
> -		 * a chance to an acquire a delegation if appropriate.
> +	} else {
> +		/* We want to keep the directory locked until we've had a chance
> +		 * to acquire a delegation if appropriate, so request that
> +		 * nfsd_lookup() hold on to the lock.
> 		 */
> 		status = nfsd_lookup(rqstp, current_fh,
> -				     open->op_fname, open->op_fnamelen, *resfh);
> +				     open->op_fname, open->op_fnamelen, *resfh,
> +				     true);
> +		if (!status) {
> +			/* NFSv4 protocol requires change attributes even though
> +			 * no change happened.
> +			 */
> +			fh_fill_pre_attrs(current_fh);
> +			fh_fill_post_attrs(current_fh);

If this is really correct, the comment should also state that
no concurrent changes to the parent are possible during
the lookup, and thus the pre and post attributes are expected
to be the same always.

Otherwise, this code paragraph looks just a little insane ;-)


> +		}
> +	}
> 	if (status)
> -		goto out;
> +		return status;
> +
> 	status = nfsd_check_obj_isreg(*resfh);
> 	if (status)
> 		goto out;
> @@ -483,6 +499,8 @@ do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, stru
> 	status = do_open_permission(rqstp, *resfh, open, accmode);
> 	set_change_info(&open->op_cinfo, current_fh);
> out:
> +	if (status)
> +		inode_unlock(current_fh->fh_dentry->d_inode);
> 	return status;
> }
> 
> @@ -540,6 +558,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
> 	struct net *net = SVC_NET(rqstp);
> 	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
> 	bool reclaim = false;
> +	bool locked = false;
> 
> 	dprintk("NFSD: nfsd4_open filename %.*s op_openowner %p\n",
> 		(int)open->op_fnamelen, open->op_fname,
> @@ -604,6 +623,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
> 		status = do_open_lookup(rqstp, cstate, open, &resfh);
> 		if (status)
> 			goto out;
> +		locked = true;
> 		break;
> 	case NFS4_OPEN_CLAIM_PREVIOUS:
> 		status = nfs4_check_open_reclaim(cstate->clp);
> @@ -639,6 +659,8 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
> 		fput(open->op_filp);
> 		open->op_filp = NULL;
> 	}
> +	if (locked)
> +		inode_unlock(cstate->current_fh.fh_dentry->d_inode);
> 	if (resfh && resfh != &cstate->current_fh) {
> 		fh_dup2(&cstate->current_fh, resfh);
> 		fh_put(resfh);
> @@ -933,7 +955,7 @@ static __be32 nfsd4_do_lookupp(struct svc_rqst *rqstp, struct svc_fh *fh)
> 		return nfserr_noent;
> 	}
> 	fh_put(&tmp_fh);
> -	return nfsd_lookup(rqstp, fh, "..", 2, fh);
> +	return nfsd_lookup(rqstp, fh, "..", 2, fh, false);
> }
> 
> static __be32
> @@ -949,7 +971,7 @@ nfsd4_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
> {
> 	return nfsd_lookup(rqstp, &cstate->current_fh,
> 			   u->lookup.lo_name, u->lookup.lo_len,
> -			   &cstate->current_fh);
> +			   &cstate->current_fh, false);
> }
> 
> static __be32
> @@ -1089,11 +1111,10 @@ nfsd4_secinfo(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
> 	if (err)
> 		return err;
> 	err = nfsd_lookup_dentry(rqstp, &cstate->current_fh,
> -				    secinfo->si_name, secinfo->si_namelen,
> -				    &exp, &dentry);
> +				 secinfo->si_name, secinfo->si_namelen,
> +				 &exp, &dentry, false);
> 	if (err)
> 		return err;
> -	fh_unlock(&cstate->current_fh);
> 	if (d_really_is_negative(dentry)) {
> 		exp_put(exp);
> 		err = nfserr_noent;
> diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
> index a25b8e321662..ed24fae09517 100644
> --- a/fs/nfsd/nfsproc.c
> +++ b/fs/nfsd/nfsproc.c
> @@ -133,7 +133,7 @@ nfsd_proc_lookup(struct svc_rqst *rqstp)
> 
> 	fh_init(&resp->fh, NFS_FHSIZE);
> 	resp->status = nfsd_lookup(rqstp, &argp->fh, argp->name, argp->len,
> -				   &resp->fh);
> +				   &resp->fh, false);
> 	fh_put(&argp->fh);
> 	if (resp->status != nfs_ok)
> 		goto out;
> diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
> index 4916c29af0fa..8e050c6d112a 100644
> --- a/fs/nfsd/vfs.c
> +++ b/fs/nfsd/vfs.c
> @@ -172,7 +172,8 @@ int nfsd_mountpoint(struct dentry *dentry, struct svc_export *exp)
> __be32
> nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
> 		   const char *name, unsigned int len,
> -		   struct svc_export **exp_ret, struct dentry **dentry_ret)
> +		   struct svc_export **exp_ret, struct dentry **dentry_ret,
> +		   bool locked)
> {
> 	struct svc_export	*exp;
> 	struct dentry		*dparent;
> @@ -199,27 +200,31 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
> 				goto out_nfserr;
> 		}
> 	} else {
> -		/*
> -		 * In the nfsd4_open() case, this may be held across
> -		 * subsequent open and delegation acquisition which may
> -		 * need to take the child's i_mutex:
> -		 */
> -		fh_lock_nested(fhp, I_MUTEX_PARENT);
> -		dentry = lookup_one_len(name, dparent, len);
> +		if (locked)
> +			dentry = lookup_one_len(name, dparent, len);
> +		else
> +			dentry = lookup_one_len_unlocked(name, dparent, len);
> 		host_err = PTR_ERR(dentry);
> 		if (IS_ERR(dentry))
> 			goto out_nfserr;
> 		if (nfsd_mountpoint(dentry, exp)) {
> 			/*
> -			 * We don't need the i_mutex after all.  It's
> -			 * still possible we could open this (regular
> -			 * files can be mountpoints too), but the
> -			 * i_mutex is just there to prevent renames of
> -			 * something that we might be about to delegate,
> -			 * and a mountpoint won't be renamed:
> +			 * nfsd_cross_mnt() may wait for an upcall
> +			 * to userspace, and holding i_sem across that
> +			 * invites the possibility of a deadlock.
> +			 * We don't really need the lock on the parent
> +			 * of a mount point was we only need it to guard
> +			 * against a rename before we get a lease for a
> +			 * delegation.
> +			 * So just drop the i_sem and reclaim it.
> 			 */
> -			fh_unlock(fhp);
> -			if ((host_err = nfsd_cross_mnt(rqstp, &dentry, &exp))) {
> +			if (locked)
> +				inode_unlock(dparent->d_inode);
> +			host_err = nfsd_cross_mnt(rqstp, &dentry, &exp);
> +			if (locked)
> +				inode_lock_nested(dparent->d_inode,
> +						  I_MUTEX_PARENT);
> +			if (host_err) {
> 				dput(dentry);
> 				goto out_nfserr;
> 			}
> @@ -234,7 +239,17 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
> 	return nfserrno(host_err);
> }
> 
> -/*
> +/**
> + * nfsd_lookup - look up a single path component for nfsd
> + *
> + * @rqstp:   the request context
> + * @ftp:     the file handle of the directory
> + * @name:    the component name, or %NULL to look up parent
> + * @len:     length of name to examine
> + * @resfh:   pointer to pre-initialised filehandle to hold result.
> + * @lock:    if true, lock directory during lookup and keep it locked
> + *           if there is no error.
> + *
>  * Look up one component of a pathname.
>  * N.B. After this call _both_ fhp and resfh need an fh_put
>  *
> @@ -244,11 +259,15 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
>  * returned. Otherwise the covered directory is returned.
>  * NOTE: this mountpoint crossing is not supported properly by all
>  *   clients and is explicitly disallowed for NFSv3
> - *      NeilBrown <neilb@cse.unsw.edu.au>
> + *
> + * Only nfsd4_open() calls this with @lock set.  It does so to block
> + * renames/unlinks before it possibly gets a lease to provide a
> + * delegation.
>  */
> __be32
> nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name,
> -				unsigned int len, struct svc_fh *resfh)
> +	    unsigned int len, struct svc_fh *resfh,
> +	    bool lock)
> {
> 	struct svc_export	*exp;
> 	struct dentry		*dentry;
> @@ -257,9 +276,11 @@ nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name,
> 	err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_EXEC);
> 	if (err)
> 		return err;
> -	err = nfsd_lookup_dentry(rqstp, fhp, name, len, &exp, &dentry);
> +	if (lock)
> +		inode_lock_nested(fhp->fh_dentry->d_inode, I_MUTEX_PARENT);
> +	err = nfsd_lookup_dentry(rqstp, fhp, name, len, &exp, &dentry, lock);
> 	if (err)
> -		return err;
> +		goto out_err;
> 	err = check_nfsd_access(exp, rqstp);
> 	if (err)
> 		goto out;
> @@ -273,6 +294,9 @@ nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name,
> out:
> 	dput(dentry);
> 	exp_put(exp);
> +out_err:
> +	if (err && lock)
> +		inode_unlock(fhp->fh_dentry->d_inode);
> 	return err;
> }
> 
> diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
> index 9f4fd3060200..290788f007d4 100644
> --- a/fs/nfsd/vfs.h
> +++ b/fs/nfsd/vfs.h
> @@ -45,10 +45,12 @@ typedef int (*nfsd_filldir_t)(void *, const char *, int, loff_t, u64, unsigned);
> int		nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp,
> 		                struct svc_export **expp);
> __be32		nfsd_lookup(struct svc_rqst *, struct svc_fh *,
> -				const char *, unsigned int, struct svc_fh *);
> +			    const char *, unsigned int, struct svc_fh *,
> +			    bool);
> __be32		 nfsd_lookup_dentry(struct svc_rqst *, struct svc_fh *,
> -				const char *, unsigned int,
> -				struct svc_export **, struct dentry **);
> +				    const char *, unsigned int,
> +				    struct svc_export **, struct dentry **,
> +				    bool);
> __be32		nfsd_setattr(struct svc_rqst *, struct svc_fh *,
> 				struct iattr *, int, time64_t);
> int nfsd_mountpoint(struct dentry *, struct svc_export *);
> 
> 

--
Chuck Lever
NeilBrown July 7, 2022, 1:26 a.m. UTC | #3
On Wed, 06 Jul 2022, Jeff Layton wrote:
> On Wed, 2022-07-06 at 14:18 +1000, NeilBrown wrote:
> > nfsd_lookup() takes an exclusive lock on the parent inode, but many
> > callers don't want the lock and may not need to lock at all if the
> > result is in the dcache.
> > 
> > Change nfsd_lookup() to be passed a bool flag.
> > If false, don't take the lock.
> > If true, do take an exclusive lock, and return with it held if
> > successful.
> > If nfsd_lookup() returns an error, the lock WILL NOT be held.
> > 
> > Only nfsd4_open() requests the lock to be held, and does so to block
> > rename until it decides whether to return a delegation.
> > 
> > NOTE: when nfsd4_open() creates a file, the directory does *NOT* remain
> >   locked and never has.  So it is possible (though unlikely) for the
> >   newly created file to be renamed before a delegation is handed out,
> >   and that would be bad.  This patch does *NOT* fix that, but *DOES*
> >   take the directory lock immediately after creating the file, which
> >   reduces the size of the window and ensure that the lock is held
> >   consistently.  More work is needed to guarantee no rename happens
> >   before the delegation.
> > 
> 
> Interesting. Maybe after taking the lock, we could re-vet the dentry vs.
> the info in the OPEN request? That way, we'd presumably know that the
> above race didn't occur.

I would lean towards revalidating the dentry after getting the lease.
However I don't think "revalidate the dentry" is quite as easy as I
would like it to be, particularly if you care about bind-mounts of
regular files.

> >  			/*
> > -			 * We don't need the i_mutex after all.  It's
> > -			 * still possible we could open this (regular
> > -			 * files can be mountpoints too), but the
> > -			 * i_mutex is just there to prevent renames of
> > -			 * something that we might be about to delegate,
> > -			 * and a mountpoint won't be renamed:
> > +			 * nfsd_cross_mnt() may wait for an upcall
> > +			 * to userspace, and holding i_sem across that
> 
> s/i_sem/i_rwsem/

But ...  fs/nilfs2/nilfs.h calls it i_sem, as does
fs/jffs2/README.Locking
And 
$ git grep -w i_mutex | wc
    180    1878   13728

But yes, I should spell it i_rwsem... or maybe just "the inode lock".

> 
> Other than minor comment nit...
> 
> Reviewed-by: Jeff Layton <jlayton@kernel.org>
> 

Thanks,
NeilBrown
NeilBrown July 7, 2022, 1:29 a.m. UTC | #4
On Thu, 07 Jul 2022, Chuck Lever III wrote:
> > +		/* We want to keep the directory locked until we've had a chance
> > +		 * to acquire a delegation if appropriate, so request that
> > +		 * nfsd_lookup() hold on to the lock.
> > 		 */
> > 		status = nfsd_lookup(rqstp, current_fh,
> > -				     open->op_fname, open->op_fnamelen, *resfh);
> > +				     open->op_fname, open->op_fnamelen, *resfh,
> > +				     true);
> > +		if (!status) {
> > +			/* NFSv4 protocol requires change attributes even though
> > +			 * no change happened.
> > +			 */
> > +			fh_fill_pre_attrs(current_fh);
> > +			fh_fill_post_attrs(current_fh);
> 
> If this is really correct, the comment should also state that
> no concurrent changes to the parent are possible during
> the lookup, and thus the pre and post attributes are expected
> to be the same always.

The earlier comment notes that nfsd_lookup() is called in a way in which
it takes and keeps the lock - that should imply that no other changes
can happen.  But with such insane looking code, it doesn't hurt to be
extra explicit.

> 
> Otherwise, this code paragraph looks just a little insane ;-)
> 
:-)

Thanks,
NeilBrown
diff mbox series

Patch

diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index ad7941001106..3a67d0afb885 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -96,7 +96,7 @@  nfsd3_proc_lookup(struct svc_rqst *rqstp)
 
 	resp->status = nfsd_lookup(rqstp, &resp->dirfh,
 				   argp->name, argp->len,
-				   &resp->fh);
+				   &resp->fh, false);
 	return rpc_success;
 }
 
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 4737019738ab..6ec22c69cbec 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -414,7 +414,8 @@  nfsd4_create_file(struct svc_rqst *rqstp, struct svc_fh *fhp,
 }
 
 static __be32
-do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_open *open, struct svc_fh **resfh)
+do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+	       struct nfsd4_open *open, struct svc_fh **resfh)
 {
 	struct svc_fh *current_fh = &cstate->current_fh;
 	int accmode;
@@ -441,11 +442,18 @@  do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, stru
 		 * yes          | no     | GUARDED4        | GUARDED4
 		 * yes          | yes    | GUARDED4        | GUARDED4
 		 */
-
 		current->fs->umask = open->op_umask;
 		status = nfsd4_create_file(rqstp, current_fh, *resfh, open);
 		current->fs->umask = 0;
 
+		if (!status)
+			/* We really want to hold the lock from before the
+			 * create to ensure no rename happens, but that
+			 * needs more work...
+			 */
+			inode_lock_nested(current_fh->fh_dentry->d_inode,
+					  I_MUTEX_PARENT);
+
 		if (!status && open->op_label.len)
 			nfsd4_security_inode_setsecctx(*resfh, &open->op_label, open->op_bmval);
 
@@ -457,17 +465,25 @@  do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, stru
 		if (nfsd4_create_is_exclusive(open->op_createmode) && status == 0)
 			open->op_bmval[1] |= (FATTR4_WORD1_TIME_ACCESS |
 						FATTR4_WORD1_TIME_MODIFY);
-	} else
-		/*
-		 * Note this may exit with the parent still locked.
-		 * We will hold the lock until nfsd4_open's final
-		 * lookup, to prevent renames or unlinks until we've had
-		 * a chance to an acquire a delegation if appropriate.
+	} else {
+		/* We want to keep the directory locked until we've had a chance
+		 * to acquire a delegation if appropriate, so request that
+		 * nfsd_lookup() hold on to the lock.
 		 */
 		status = nfsd_lookup(rqstp, current_fh,
-				     open->op_fname, open->op_fnamelen, *resfh);
+				     open->op_fname, open->op_fnamelen, *resfh,
+				     true);
+		if (!status) {
+			/* NFSv4 protocol requires change attributes even though
+			 * no change happened.
+			 */
+			fh_fill_pre_attrs(current_fh);
+			fh_fill_post_attrs(current_fh);
+		}
+	}
 	if (status)
-		goto out;
+		return status;
+
 	status = nfsd_check_obj_isreg(*resfh);
 	if (status)
 		goto out;
@@ -483,6 +499,8 @@  do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, stru
 	status = do_open_permission(rqstp, *resfh, open, accmode);
 	set_change_info(&open->op_cinfo, current_fh);
 out:
+	if (status)
+		inode_unlock(current_fh->fh_dentry->d_inode);
 	return status;
 }
 
@@ -540,6 +558,7 @@  nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	struct net *net = SVC_NET(rqstp);
 	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
 	bool reclaim = false;
+	bool locked = false;
 
 	dprintk("NFSD: nfsd4_open filename %.*s op_openowner %p\n",
 		(int)open->op_fnamelen, open->op_fname,
@@ -604,6 +623,7 @@  nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		status = do_open_lookup(rqstp, cstate, open, &resfh);
 		if (status)
 			goto out;
+		locked = true;
 		break;
 	case NFS4_OPEN_CLAIM_PREVIOUS:
 		status = nfs4_check_open_reclaim(cstate->clp);
@@ -639,6 +659,8 @@  nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		fput(open->op_filp);
 		open->op_filp = NULL;
 	}
+	if (locked)
+		inode_unlock(cstate->current_fh.fh_dentry->d_inode);
 	if (resfh && resfh != &cstate->current_fh) {
 		fh_dup2(&cstate->current_fh, resfh);
 		fh_put(resfh);
@@ -933,7 +955,7 @@  static __be32 nfsd4_do_lookupp(struct svc_rqst *rqstp, struct svc_fh *fh)
 		return nfserr_noent;
 	}
 	fh_put(&tmp_fh);
-	return nfsd_lookup(rqstp, fh, "..", 2, fh);
+	return nfsd_lookup(rqstp, fh, "..", 2, fh, false);
 }
 
 static __be32
@@ -949,7 +971,7 @@  nfsd4_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 {
 	return nfsd_lookup(rqstp, &cstate->current_fh,
 			   u->lookup.lo_name, u->lookup.lo_len,
-			   &cstate->current_fh);
+			   &cstate->current_fh, false);
 }
 
 static __be32
@@ -1089,11 +1111,10 @@  nfsd4_secinfo(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	if (err)
 		return err;
 	err = nfsd_lookup_dentry(rqstp, &cstate->current_fh,
-				    secinfo->si_name, secinfo->si_namelen,
-				    &exp, &dentry);
+				 secinfo->si_name, secinfo->si_namelen,
+				 &exp, &dentry, false);
 	if (err)
 		return err;
-	fh_unlock(&cstate->current_fh);
 	if (d_really_is_negative(dentry)) {
 		exp_put(exp);
 		err = nfserr_noent;
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index a25b8e321662..ed24fae09517 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -133,7 +133,7 @@  nfsd_proc_lookup(struct svc_rqst *rqstp)
 
 	fh_init(&resp->fh, NFS_FHSIZE);
 	resp->status = nfsd_lookup(rqstp, &argp->fh, argp->name, argp->len,
-				   &resp->fh);
+				   &resp->fh, false);
 	fh_put(&argp->fh);
 	if (resp->status != nfs_ok)
 		goto out;
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 4916c29af0fa..8e050c6d112a 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -172,7 +172,8 @@  int nfsd_mountpoint(struct dentry *dentry, struct svc_export *exp)
 __be32
 nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
 		   const char *name, unsigned int len,
-		   struct svc_export **exp_ret, struct dentry **dentry_ret)
+		   struct svc_export **exp_ret, struct dentry **dentry_ret,
+		   bool locked)
 {
 	struct svc_export	*exp;
 	struct dentry		*dparent;
@@ -199,27 +200,31 @@  nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
 				goto out_nfserr;
 		}
 	} else {
-		/*
-		 * In the nfsd4_open() case, this may be held across
-		 * subsequent open and delegation acquisition which may
-		 * need to take the child's i_mutex:
-		 */
-		fh_lock_nested(fhp, I_MUTEX_PARENT);
-		dentry = lookup_one_len(name, dparent, len);
+		if (locked)
+			dentry = lookup_one_len(name, dparent, len);
+		else
+			dentry = lookup_one_len_unlocked(name, dparent, len);
 		host_err = PTR_ERR(dentry);
 		if (IS_ERR(dentry))
 			goto out_nfserr;
 		if (nfsd_mountpoint(dentry, exp)) {
 			/*
-			 * We don't need the i_mutex after all.  It's
-			 * still possible we could open this (regular
-			 * files can be mountpoints too), but the
-			 * i_mutex is just there to prevent renames of
-			 * something that we might be about to delegate,
-			 * and a mountpoint won't be renamed:
+			 * nfsd_cross_mnt() may wait for an upcall
+			 * to userspace, and holding i_sem across that
+			 * invites the possibility of a deadlock.
+			 * We don't really need the lock on the parent
+			 * of a mount point was we only need it to guard
+			 * against a rename before we get a lease for a
+			 * delegation.
+			 * So just drop the i_sem and reclaim it.
 			 */
-			fh_unlock(fhp);
-			if ((host_err = nfsd_cross_mnt(rqstp, &dentry, &exp))) {
+			if (locked)
+				inode_unlock(dparent->d_inode);
+			host_err = nfsd_cross_mnt(rqstp, &dentry, &exp);
+			if (locked)
+				inode_lock_nested(dparent->d_inode,
+						  I_MUTEX_PARENT);
+			if (host_err) {
 				dput(dentry);
 				goto out_nfserr;
 			}
@@ -234,7 +239,17 @@  nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	return nfserrno(host_err);
 }
 
-/*
+/**
+ * nfsd_lookup - look up a single path component for nfsd
+ *
+ * @rqstp:   the request context
+ * @ftp:     the file handle of the directory
+ * @name:    the component name, or %NULL to look up parent
+ * @len:     length of name to examine
+ * @resfh:   pointer to pre-initialised filehandle to hold result.
+ * @lock:    if true, lock directory during lookup and keep it locked
+ *           if there is no error.
+ *
  * Look up one component of a pathname.
  * N.B. After this call _both_ fhp and resfh need an fh_put
  *
@@ -244,11 +259,15 @@  nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
  * returned. Otherwise the covered directory is returned.
  * NOTE: this mountpoint crossing is not supported properly by all
  *   clients and is explicitly disallowed for NFSv3
- *      NeilBrown <neilb@cse.unsw.edu.au>
+ *
+ * Only nfsd4_open() calls this with @lock set.  It does so to block
+ * renames/unlinks before it possibly gets a lease to provide a
+ * delegation.
  */
 __be32
 nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name,
-				unsigned int len, struct svc_fh *resfh)
+	    unsigned int len, struct svc_fh *resfh,
+	    bool lock)
 {
 	struct svc_export	*exp;
 	struct dentry		*dentry;
@@ -257,9 +276,11 @@  nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name,
 	err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_EXEC);
 	if (err)
 		return err;
-	err = nfsd_lookup_dentry(rqstp, fhp, name, len, &exp, &dentry);
+	if (lock)
+		inode_lock_nested(fhp->fh_dentry->d_inode, I_MUTEX_PARENT);
+	err = nfsd_lookup_dentry(rqstp, fhp, name, len, &exp, &dentry, lock);
 	if (err)
-		return err;
+		goto out_err;
 	err = check_nfsd_access(exp, rqstp);
 	if (err)
 		goto out;
@@ -273,6 +294,9 @@  nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name,
 out:
 	dput(dentry);
 	exp_put(exp);
+out_err:
+	if (err && lock)
+		inode_unlock(fhp->fh_dentry->d_inode);
 	return err;
 }
 
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index 9f4fd3060200..290788f007d4 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -45,10 +45,12 @@  typedef int (*nfsd_filldir_t)(void *, const char *, int, loff_t, u64, unsigned);
 int		nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp,
 		                struct svc_export **expp);
 __be32		nfsd_lookup(struct svc_rqst *, struct svc_fh *,
-				const char *, unsigned int, struct svc_fh *);
+			    const char *, unsigned int, struct svc_fh *,
+			    bool);
 __be32		 nfsd_lookup_dentry(struct svc_rqst *, struct svc_fh *,
-				const char *, unsigned int,
-				struct svc_export **, struct dentry **);
+				    const char *, unsigned int,
+				    struct svc_export **, struct dentry **,
+				    bool);
 __be32		nfsd_setattr(struct svc_rqst *, struct svc_fh *,
 				struct iattr *, int, time64_t);
 int nfsd_mountpoint(struct dentry *, struct svc_export *);