
[v2] nfsd: drop st_mutex and rp_mutex before calling move_to_close_lru()

Message ID: 170546328406.23031.11217818844350800811@noble.neil.brown.name (mailing list archive)
State: New, archived
Series: [v2] nfsd: drop st_mutex and rp_mutex before calling move_to_close_lru()

Commit Message

NeilBrown Jan. 17, 2024, 3:48 a.m. UTC
move_to_close_lru() is currently called with ->st_mutex and .rp_mutex held.
This can lead to a deadlock as move_to_close_lru() waits for sc_count to
drop to 2, and some threads holding a reference might be waiting for either
mutex.  These references will never be dropped so sc_count will never
reach 2.
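
In outline, the deadlock looks like this (an illustrative sketch only, not
the exact in-tree code; close_wq is, if memory serves, the wait queue used
for this wait):

	/* Thread A: nfsd4_close() -> nfsd4_close_open_stateid(), holding
	 * ->st_mutex and .rp_mutex, ends up waiting in move_to_close_lru():
	 */
	wait_event(close_wq, refcount_read(&s->st_stid.sc_count) == 2);

	/* Thread B: holds a reference on the same stateid (so sc_count > 2)
	 * and blocks waiting for ->st_mutex or .rp_mutex, so it can never
	 * drop that reference and thread A waits forever.
	 */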

There can be no harm in dropping ->st_mutex before calling
move_to_close_lru() because the only place that takes the mutex is
nfsd4_lock_ol_stateid(), and it quickly aborts if sc_type is
NFS4_CLOSED_STID, which it will be before move_to_close_lru() is called.

Similarly dropping .rp_mutex is safe after the state is closed and so
no longer usable.  Another way to look at this is that nothing
significant happens between when nfsd4_close() now calls
nfsd4_cstate_clear_replay(), and where nfsd4_proc_compound calls
nfsd4_cstate_clear_replay() a little later.

See also
 https://lore.kernel.org/lkml/4dd1fe21e11344e5969bb112e954affb@jd.com/T/
where this problem was raised but not successfully resolved.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 fs/nfsd/nfs4state.c | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

Comments

Jeff Layton Jan. 17, 2024, 6:19 p.m. UTC | #1
On Wed, 2024-01-17 at 14:48 +1100, NeilBrown wrote:
> move_to_close_lru() is currently called with ->st_mutex and .rp_mutex held.
> This can lead to a deadlock as move_to_close_lru() waits for sc_count to
> drop to 2, and some threads holding a reference might be waiting for either
> mutex.  These references will never be dropped so sc_count will never
> reach 2.
> 
> There can be no harm in dropping ->st_mutex before calling
> move_to_close_lru() because the only place that takes the mutex is
> nfsd4_lock_ol_stateid(), and it quickly aborts if sc_type is
> NFS4_CLOSED_STID, which it will be before move_to_close_lru() is called.
> 
> Similarly dropping .rp_mutex is safe after the state is closed and so
> no longer usable.  Another way to look at this is that nothing
> significant happens between when nfsd4_close() now calls
> nfsd4_cstate_clear_replay(), and where nfsd4_proc_compound calls
> nfsd4_cstate_clear_replay() a little later.
> 
> See also
>  https://lore.kernel.org/lkml/4dd1fe21e11344e5969bb112e954affb@jd.com/T/
> where this problem was raised but not successfully resolved.
> 
> Signed-off-by: NeilBrown <neilb@suse.de>
> ---
>  fs/nfsd/nfs4state.c | 18 ++++++++++++++----
>  1 file changed, 14 insertions(+), 4 deletions(-)
> 
> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> index 40415929e2ae..0850191f9920 100644
> --- a/fs/nfsd/nfs4state.c
> +++ b/fs/nfsd/nfs4state.c
> @@ -7055,7 +7055,7 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp,
>  	return status;
>  }
>  
> -static void nfsd4_close_open_stateid(struct nfs4_ol_stateid *s)
> +static bool nfsd4_close_open_stateid(struct nfs4_ol_stateid *s)
>  {
>  	struct nfs4_client *clp = s->st_stid.sc_client;
>  	bool unhashed;
> @@ -7072,11 +7072,11 @@ static void nfsd4_close_open_stateid(struct nfs4_ol_stateid *s)
>  		list_for_each_entry(stp, &reaplist, st_locks)
>  			nfs4_free_cpntf_statelist(clp->net, &stp->st_stid);
>  		free_ol_stateid_reaplist(&reaplist);
> +		return false;
>  	} else {
>  		spin_unlock(&clp->cl_lock);
>  		free_ol_stateid_reaplist(&reaplist);
> -		if (unhashed)
> -			move_to_close_lru(s, clp->net);
> +		return unhashed;
>  	}
>  }
>  
> @@ -7092,6 +7092,7 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
>  	struct nfs4_ol_stateid *stp;
>  	struct net *net = SVC_NET(rqstp);
>  	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
> +	bool need_move_to_close_list;
>  
>  	dprintk("NFSD: nfsd4_close on file %pd\n", 
>  			cstate->current_fh.fh_dentry);
> @@ -7114,8 +7115,17 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
>  	 */
>  	nfs4_inc_and_copy_stateid(&close->cl_stateid, &stp->st_stid);
>  
> -	nfsd4_close_open_stateid(stp);
> +	need_move_to_close_list = nfsd4_close_open_stateid(stp);
>  	mutex_unlock(&stp->st_mutex);
> +	if (need_move_to_close_list) {
> +		/* Drop the replay mutex early as move_to_close_lru()
> +		 * can wait for other threads which hold that mutex.
> +		 * This call is idempotent, so the fact that it will
> +		 * be called twice is harmless.
> +		 */
> +		nfsd4_cstate_clear_replay(cstate);
> +		move_to_close_lru(stp, net);
> +	}
>  
>  	/* v4.1+ suggests that we send a special stateid in here, since the
>  	 * clients should just ignore this anyway. Since this is not useful

I think I get it, though I still struggle with the v4.0 open/close
handling.

Reviewed-by: Jeff Layton <jlayton@kernel.org>

Jeff Layton Feb. 28, 2024, 5:40 p.m. UTC | #2
On Wed, 2024-01-17 at 14:48 +1100, NeilBrown wrote:
> move_to_close_lru() is currently called with ->st_mutex and .rp_mutex held.
> This can lead to a deadlock as move_to_close_lru() waits for sc_count to
> drop to 2, and some threads holding a reference might be waiting for either
> mutex.  These references will never be dropped so sc_count will never
> reach 2.
> 
> There can be no harm in dropping ->st_mutex before calling
> move_to_close_lru() because the only place that takes the mutex is
> nfsd4_lock_ol_stateid(), and it quickly aborts if sc_type is
> NFS4_CLOSED_STID, which it will be before move_to_close_lru() is called.
> 
> Similarly dropping .rp_mutex is safe after the state is closed and so
> no longer usable.  Another way to look at this is that nothing
> significant happens between when nfsd4_close() now calls
> nfsd4_cstate_clear_replay(), and where nfsd4_proc_compound calls
> nfsd4_cstate_clear_replay() a little later.
> 
> See also
>  https://lore.kernel.org/lkml/4dd1fe21e11344e5969bb112e954affb@jd.com/T/
> where this problem was raised but not successfully resolved.
> 
> Signed-off-by: NeilBrown <neilb@suse.de>
> ---
>  fs/nfsd/nfs4state.c | 18 ++++++++++++++----
>  1 file changed, 14 insertions(+), 4 deletions(-)
> 
> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> index 40415929e2ae..0850191f9920 100644
> --- a/fs/nfsd/nfs4state.c
> +++ b/fs/nfsd/nfs4state.c
> @@ -7055,7 +7055,7 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp,
>  	return status;
>  }
>  
> -static void nfsd4_close_open_stateid(struct nfs4_ol_stateid *s)
> +static bool nfsd4_close_open_stateid(struct nfs4_ol_stateid *s)
>  {
>  	struct nfs4_client *clp = s->st_stid.sc_client;
>  	bool unhashed;
> @@ -7072,11 +7072,11 @@ static void nfsd4_close_open_stateid(struct nfs4_ol_stateid *s)
>  		list_for_each_entry(stp, &reaplist, st_locks)
>  			nfs4_free_cpntf_statelist(clp->net, &stp->st_stid);
>  		free_ol_stateid_reaplist(&reaplist);
> +		return false;
>  	} else {
>  		spin_unlock(&clp->cl_lock);
>  		free_ol_stateid_reaplist(&reaplist);
> -		if (unhashed)
> -			move_to_close_lru(s, clp->net);
> +		return unhashed;
>  	}
>  }
>  
> @@ -7092,6 +7092,7 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
>  	struct nfs4_ol_stateid *stp;
>  	struct net *net = SVC_NET(rqstp);
>  	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
> +	bool need_move_to_close_list;
>  
>  	dprintk("NFSD: nfsd4_close on file %pd\n", 
>  			cstate->current_fh.fh_dentry);
> @@ -7114,8 +7115,17 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
>  	 */
>  	nfs4_inc_and_copy_stateid(&close->cl_stateid, &stp->st_stid);
>  
> -	nfsd4_close_open_stateid(stp);
> +	need_move_to_close_list = nfsd4_close_open_stateid(stp);
>  	mutex_unlock(&stp->st_mutex);
> +	if (need_move_to_close_list) {
> +		/* Drop the replay mutex early as move_to_close_lru()
> +		 * can wait for other threads which hold that mutex.
> +		 * This call is idempotent, so the fact that it will
> +		 * be called twice is harmless.
> +		 */
> +		nfsd4_cstate_clear_replay(cstate);
> +		move_to_close_lru(stp, net);
> +	}
>  
>  	/* v4.1+ suggests that we send a special stateid in here, since the
>  	 * clients should just ignore this anyway. Since this is not useful

There is a recent regression in pynfs test CLOSE12 in Chuck's nfsd-next
branch. In the attached capture, there is an extra 40 bytes on the end
of the CLOSE response in frame 112.

A bisect landed on this patch, though I don't see the cause just yet.

Thoughts?

Jeff Layton Feb. 28, 2024, 7:30 p.m. UTC | #3
On Wed, 2024-02-28 at 12:40 -0500, Jeff Layton wrote:
> On Wed, 2024-01-17 at 14:48 +1100, NeilBrown wrote:
> > move_to_close_lru() is currently called with ->st_mutex and .rp_mutex held.
> > This can lead to a deadlock as move_to_close_lru() waits for sc_count to
> > drop to 2, and some threads holding a reference might be waiting for either
> > mutex.  These references will never be dropped so sc_count will never
> > reach 2.
> > 
> > There can be no harm in dropping ->st_mutex before calling
> > move_to_close_lru() because the only place that takes the mutex is
> > nfsd4_lock_ol_stateid(), and it quickly aborts if sc_type is
> > NFS4_CLOSED_STID, which it will be before move_to_close_lru() is called.
> > 
> > Similarly dropping .rp_mutex is safe after the state is closed and so
> > no longer usable.  Another way to look at this is that nothing
> > significant happens between when nfsd4_close() now calls
> > nfsd4_cstate_clear_replay(), and where nfsd4_proc_compound calls
> > nfsd4_cstate_clear_replay() a little later.
> > 
> > See also
> >  https://lore.kernel.org/lkml/4dd1fe21e11344e5969bb112e954affb@jd.com/T/
> > where this problem was raised but not successfully resolved.
> > 
> > Signed-off-by: NeilBrown <neilb@suse.de>
> > ---
> >  fs/nfsd/nfs4state.c | 18 ++++++++++++++----
> >  1 file changed, 14 insertions(+), 4 deletions(-)
> > 
> > diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> > index 40415929e2ae..0850191f9920 100644
> > --- a/fs/nfsd/nfs4state.c
> > +++ b/fs/nfsd/nfs4state.c
> > @@ -7055,7 +7055,7 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp,
> >  	return status;
> >  }
> >  
> > -static void nfsd4_close_open_stateid(struct nfs4_ol_stateid *s)
> > +static bool nfsd4_close_open_stateid(struct nfs4_ol_stateid *s)
> >  {
> >  	struct nfs4_client *clp = s->st_stid.sc_client;
> >  	bool unhashed;
> > @@ -7072,11 +7072,11 @@ static void nfsd4_close_open_stateid(struct nfs4_ol_stateid *s)
> >  		list_for_each_entry(stp, &reaplist, st_locks)
> >  			nfs4_free_cpntf_statelist(clp->net, &stp->st_stid);
> >  		free_ol_stateid_reaplist(&reaplist);
> > +		return false;
> >  	} else {
> >  		spin_unlock(&clp->cl_lock);
> >  		free_ol_stateid_reaplist(&reaplist);
> > -		if (unhashed)
> > -			move_to_close_lru(s, clp->net);
> > +		return unhashed;
> >  	}
> >  }
> >  
> > @@ -7092,6 +7092,7 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
> >  	struct nfs4_ol_stateid *stp;
> >  	struct net *net = SVC_NET(rqstp);
> >  	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
> > +	bool need_move_to_close_list;
> >  
> >  	dprintk("NFSD: nfsd4_close on file %pd\n", 
> >  			cstate->current_fh.fh_dentry);
> > @@ -7114,8 +7115,17 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
> >  	 */
> >  	nfs4_inc_and_copy_stateid(&close->cl_stateid, &stp->st_stid);
> >  
> > -	nfsd4_close_open_stateid(stp);
> > +	need_move_to_close_list = nfsd4_close_open_stateid(stp);
> >  	mutex_unlock(&stp->st_mutex);
> > +	if (need_move_to_close_list) {
> > +		/* Drop the replay mutex early as move_to_close_lru()
> > +		 * can wait for other threads which hold that mutex.
> > +		 * This call is idempotent, so the fact that it will
> > +		 * be called twice is harmless.
> > +		 */
> > +		nfsd4_cstate_clear_replay(cstate);
> > +		move_to_close_lru(stp, net);
> > +	}
> >  
> >  	/* v4.1+ suggests that we send a special stateid in here, since the
> >  	 * clients should just ignore this anyway. Since this is not useful
> 
> There is a recent regression in pynfs test CLOSE12 in Chuck's nfsd-next
> branch. In the attached capture, there is an extra 40 bytes on the end
> of the CLOSE response in frame 112.
> 
> A bisect landed on this patch, though I don't see the cause just yet.
> 
> Thoughts?

To follow up with what I found so far:

The problem seems to be with the rp_buflen in nfsd4_encode_replay. With
this patch applied, rp_buflen is 56 bytes when nfsd4_encode_replay is
called. With this patch reverted, it's 16 bytes. That accounts for the
extra 40 bytes.
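
For context, the replay path then copies that cached buffer straight into
the reply; nfsd4_encode_replay() is roughly this (a sketch from memory, not
the exact code):

	p = xdr_reserve_space(xdr, 8 + rp->rp_buflen);
	if (!p)
		return nfserr_resource;
	*p++ = cpu_to_be32(op->opnum);
	*p++ = rp->rp_status;	/* already in XDR form */
	p = xdr_encode_opaque_fixed(p, rp->rp_buf, rp->rp_buflen);

so whatever stale length is sitting in rp_buflen goes out on the wire
verbatim.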

Still trying to determine why this is occurring though...

Jeff Layton Feb. 29, 2024, 2 p.m. UTC | #4
On Wed, 2024-02-28 at 12:40 -0500, Jeff Layton wrote:
> On Wed, 2024-01-17 at 14:48 +1100, NeilBrown wrote:
> > move_to_close_lru() is currently called with ->st_mutex and .rp_mutex held.
> > This can lead to a deadlock as move_to_close_lru() waits for sc_count to
> > drop to 2, and some threads holding a reference might be waiting for either
> > mutex.  These references will never be dropped so sc_count will never
> > reach 2.
> > 
> > There can be no harm in dropping ->st_mutex before calling
> > move_to_close_lru() because the only place that takes the mutex is
> > nfsd4_lock_ol_stateid(), and it quickly aborts if sc_type is
> > NFS4_CLOSED_STID, which it will be before move_to_close_lru() is called.
> > 
> > Similarly dropping .rp_mutex is safe after the state is closed and so
> > no longer usable.  Another way to look at this is that nothing
> > significant happens between when nfsd4_close() now calls
> > nfsd4_cstate_clear_replay(), and where nfsd4_proc_compound calls
> > nfsd4_cstate_clear_replay() a little later.
> > 
> > See also
> >  https://lore.kernel.org/lkml/4dd1fe21e11344e5969bb112e954affb@jd.com/T/
> > where this problem was raised but not successfully resolved.
> > 
> > Signed-off-by: NeilBrown <neilb@suse.de>
> > ---
> >  fs/nfsd/nfs4state.c | 18 ++++++++++++++----
> >  1 file changed, 14 insertions(+), 4 deletions(-)
> > 
> > diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> > index 40415929e2ae..0850191f9920 100644
> > --- a/fs/nfsd/nfs4state.c
> > +++ b/fs/nfsd/nfs4state.c
> > @@ -7055,7 +7055,7 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp,
> >  	return status;
> >  }
> >  
> > -static void nfsd4_close_open_stateid(struct nfs4_ol_stateid *s)
> > +static bool nfsd4_close_open_stateid(struct nfs4_ol_stateid *s)
> >  {
> >  	struct nfs4_client *clp = s->st_stid.sc_client;
> >  	bool unhashed;
> > @@ -7072,11 +7072,11 @@ static void nfsd4_close_open_stateid(struct nfs4_ol_stateid *s)
> >  		list_for_each_entry(stp, &reaplist, st_locks)
> >  			nfs4_free_cpntf_statelist(clp->net, &stp->st_stid);
> >  		free_ol_stateid_reaplist(&reaplist);
> > +		return false;
> >  	} else {
> >  		spin_unlock(&clp->cl_lock);
> >  		free_ol_stateid_reaplist(&reaplist);
> > -		if (unhashed)
> > -			move_to_close_lru(s, clp->net);
> > +		return unhashed;
> >  	}
> >  }
> >  
> > @@ -7092,6 +7092,7 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
> >  	struct nfs4_ol_stateid *stp;
> >  	struct net *net = SVC_NET(rqstp);
> >  	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
> > +	bool need_move_to_close_list;
> >  
> >  	dprintk("NFSD: nfsd4_close on file %pd\n", 
> >  			cstate->current_fh.fh_dentry);
> > @@ -7114,8 +7115,17 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
> >  	 */
> >  	nfs4_inc_and_copy_stateid(&close->cl_stateid, &stp->st_stid);
> >  
> > -	nfsd4_close_open_stateid(stp);
> > +	need_move_to_close_list = nfsd4_close_open_stateid(stp);
> >  	mutex_unlock(&stp->st_mutex);
> > +	if (need_move_to_close_list) {
> > +		/* Drop the replay mutex early as move_to_close_lru()
> > +		 * can wait for other threads which hold that mutex.
> > +		 * This call is idempotent, so the fact that it will
> > +		 * be called twice is harmless.
> > +		 */
> > +		nfsd4_cstate_clear_replay(cstate);

Ok, I think I figured out the regression. The problem is the above line.

That clears cstate->replay_owner, so nfsd4_encode_operation no longer
updates so_replay.rp_buflen, leaving it set to whatever the _prior_
seqid-morphing operation cached. In this case that was an OPEN reply,
which was 40 bytes longer than the CLOSE reply.

I'm not sure of the best way to fix this, so it may be best to just
revert this patch for now.
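
For reference, the replay caching in nfsd4_encode_operation() looks roughly
like this (a simplified sketch from memory, not the exact code; "data" and
"len" stand in for the real bookkeeping of the just-encoded reply bytes):

	so = resp->cstate.replay_owner;
	if (so) {
		/* Cache this op's status and encoded reply body so that a
		 * retransmission with the same seqid can be replayed.
		 */
		so->so_replay.rp_status = op->status;
		so->so_replay.rp_buflen = len;
		memcpy(so->so_replay.rp_buf, data, len);
	}

With replay_owner already cleared by the time the CLOSE reply is encoded,
none of that runs, so rp_buf/rp_buflen still describe the earlier OPEN
reply, which is where the extra 40 bytes come from.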

Thinking about it more, the rp_mutex has a rather nasty code smell about
it. Maybe we ought to turn the mutex_lock into a trylock and just return
NFS4ERR_DELAY if you can't get it?

In principle, contention for that lock means that the stateowner is
spraying seqid-morphing operations at us. Returning DELAY would seem
like a reasonable thing to do there if we get confused.
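
Something along these lines, perhaps (untested sketch, assuming rp_mutex is
taken in nfsd4_cstate_assign_replay() as I recall; it would need to grow a
return value and its callers would have to propagate it):

	static __be32
	nfsd4_cstate_assign_replay(struct nfsd4_compound_state *cstate,
				   struct nfs4_stateowner *so)
	{
		if (!nfsd4_has_session(cstate)) {
			/* Contention here means the owner is sending us
			 * concurrent seqid-morphing ops; tell it to back off.
			 */
			if (!mutex_trylock(&so->so_replay.rp_mutex))
				return nfserr_jukebox;	/* NFS4ERR_DELAY */
			cstate->replay_owner = nfs4_get_stateowner(so);
		}
		return nfs_ok;
	}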

Chuck, Neil, any thoughts?

> > +		move_to_close_lru(stp, net);
> > +	}
> >  
> >  	/* v4.1+ suggests that we send a special stateid in here, since the
> >  	 * clients should just ignore this anyway. Since this is not useful
> 
> There is a recent regression in pynfs test CLOSE12 in Chuck's nfsd-next
> branch. In the attached capture, there is an extra 40 bytes on the end
> of the CLOSE response in frame 112.
> 
> A bisect landed on this patch, though I don't see the cause just yet.
> 
> Thoughts?

Patch

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 40415929e2ae..0850191f9920 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -7055,7 +7055,7 @@  nfsd4_open_downgrade(struct svc_rqst *rqstp,
 	return status;
 }
 
-static void nfsd4_close_open_stateid(struct nfs4_ol_stateid *s)
+static bool nfsd4_close_open_stateid(struct nfs4_ol_stateid *s)
 {
 	struct nfs4_client *clp = s->st_stid.sc_client;
 	bool unhashed;
@@ -7072,11 +7072,11 @@  static void nfsd4_close_open_stateid(struct nfs4_ol_stateid *s)
 		list_for_each_entry(stp, &reaplist, st_locks)
 			nfs4_free_cpntf_statelist(clp->net, &stp->st_stid);
 		free_ol_stateid_reaplist(&reaplist);
+		return false;
 	} else {
 		spin_unlock(&clp->cl_lock);
 		free_ol_stateid_reaplist(&reaplist);
-		if (unhashed)
-			move_to_close_lru(s, clp->net);
+		return unhashed;
 	}
 }
 
@@ -7092,6 +7092,7 @@  nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	struct nfs4_ol_stateid *stp;
 	struct net *net = SVC_NET(rqstp);
 	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+	bool need_move_to_close_list;
 
 	dprintk("NFSD: nfsd4_close on file %pd\n", 
 			cstate->current_fh.fh_dentry);
@@ -7114,8 +7115,17 @@  nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	 */
 	nfs4_inc_and_copy_stateid(&close->cl_stateid, &stp->st_stid);
 
-	nfsd4_close_open_stateid(stp);
+	need_move_to_close_list = nfsd4_close_open_stateid(stp);
 	mutex_unlock(&stp->st_mutex);
+	if (need_move_to_close_list) {
+		/* Drop the replay mutex early as move_to_close_lru()
+		 * can wait for other threads which hold that mutex.
+		 * This call is idempotent, so the fact that it will
+		 * be called twice is harmless.
+		 */
+		nfsd4_cstate_clear_replay(cstate);
+		move_to_close_lru(stp, net);
+	}
 
 	/* v4.1+ suggests that we send a special stateid in here, since the
 	 * clients should just ignore this anyway. Since this is not useful