
[2/6] SUNRPC: rename and refactor svc_get_next_xprt()

Message ID 20230802073443.17965-3-neilb@suse.de (mailing list archive)
State New, archived
Series: SUNRPC: thread management improvements

Commit Message

NeilBrown Aug. 2, 2023, 7:34 a.m. UTC
svc_get_next_xprt() does a lot more than just get an xprt.  It also
decides if it needs to sleep, depending not only on the availability of
xprts but also on the need to exit or handle external work.

So rename it to svc_rqst_wait_for_work() and only do the testing and
waiting.  Move all the waiting-related code out of svc_recv() into the
new svc_rqst_wait_for_work().

Move the dequeueing code out of svc_get_next_xprt() into svc_recv().

Previously svc_xprt_dequeue() would be called twice, once before waiting
and possibly once after.  Now rqst_should_sleep() is called twice
instead: once to decide if waiting is needed, and once to check again
after setting the task state, to see if we might have missed a wakeup.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 net/sunrpc/svc_xprt.c | 91 ++++++++++++++++++++-----------------------
 1 file changed, 43 insertions(+), 48 deletions(-)
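
The double call to rqst_should_sleep() is the standard guard against a
lost wakeup: the sleeper publishes its task state before re-testing the
condition, so a wakeup that arrives between the first test and
schedule() leaves the task runnable and schedule() returns promptly
instead of sleeping indefinitely.  A minimal sketch of the pattern,
with a hypothetical condition helper and wake source (not the actual
sunrpc code):

	/* Sleeper: set_current_state() implies a full memory
	 * barrier, ordering the state store before the re-test
	 * of the condition.
	 */
	set_current_state(TASK_IDLE);
	if (work_is_pending())			/* hypothetical */
		__set_current_state(TASK_RUNNING);
	else
		schedule();

	/* Waker: make the work visible, then wake the thread;
	 * wake_up_process() provides the pairing barrier.
	 */
	publish_work();				/* hypothetical */
	wake_up_process(worker_task);

In svc_rqst_wait_for_work() the "publish" step on the sleeper side also
includes clearing RQ_BUSY, which is what allows the enqueue path to
select this thread as a wakeup target.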

Comments

Chuck Lever III Aug. 3, 2023, 7 p.m. UTC | #1
On Wed, Aug 02, 2023 at 05:34:39PM +1000, NeilBrown wrote:
> svc_get_next_xprt() does a lot more than just get an xprt.  It also
> decides if it needs to sleep, depending not only on the availability of
> xprts but also on the need to exit or handle external work.
> 
> So rename it to svc_rqst_wait_for_work() and only do the testing and
> waiting.  Move all the waiting-related code out of svc_recv() into the
> new svc_rqst_wait_for_work().
> 
> Move the dequeueing code out of svc_get_next_xprt() into svc_recv().
> 
> Previously svc_xprt_dequeue() would be called twice, once before waiting
> and possibly once after.  Now rqst_should_sleep() is called twice
> instead: once to decide if waiting is needed, and once to check again
> after setting the task state, to see if we might have missed a wakeup.
> 
> Signed-off-by: NeilBrown <neilb@suse.de>

I've tested and applied this one and the previous one to the thread
scheduling branch, with a few minor fix-ups. Apologies for how long
this took; I'm wrestling with a SATA/block bug on the v6.6 test
system that is being very sassy and hard to nail down.

I need to dive into the backchannel patch next. I'm trying to think
of how I want to test that one.


> ---
>  net/sunrpc/svc_xprt.c | 91 ++++++++++++++++++++-----------------------
>  1 file changed, 43 insertions(+), 48 deletions(-)
> 
> diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
> index 60759647fee4..f1d64ded89fb 100644
> --- a/net/sunrpc/svc_xprt.c
> +++ b/net/sunrpc/svc_xprt.c
> @@ -722,51 +722,33 @@ rqst_should_sleep(struct svc_rqst *rqstp)
>  	return true;
>  }
>  
> -static struct svc_xprt *svc_get_next_xprt(struct svc_rqst *rqstp)
> +static void svc_rqst_wait_for_work(struct svc_rqst *rqstp)
>  {
> -	struct svc_pool		*pool = rqstp->rq_pool;
> -
> -	/* rq_xprt should be clear on entry */
> -	WARN_ON_ONCE(rqstp->rq_xprt);
> +	struct svc_pool *pool = rqstp->rq_pool;
>  
> -	rqstp->rq_xprt = svc_xprt_dequeue(pool);
> -	if (rqstp->rq_xprt)
> -		goto out_found;
> -
> -	set_current_state(TASK_IDLE);
> -	smp_mb__before_atomic();
> -	clear_bit(SP_CONGESTED, &pool->sp_flags);
> -	clear_bit(RQ_BUSY, &rqstp->rq_flags);
> -	smp_mb__after_atomic();
> -
> -	if (likely(rqst_should_sleep(rqstp)))
> -		schedule();
> -	else
> -		__set_current_state(TASK_RUNNING);
> +	if (rqst_should_sleep(rqstp)) {
> +		set_current_state(TASK_IDLE);
> +		smp_mb__before_atomic();
> +		clear_bit(SP_CONGESTED, &pool->sp_flags);
> +		clear_bit(RQ_BUSY, &rqstp->rq_flags);
> +		smp_mb__after_atomic();
> +
> +		/* Need to check should_sleep() again after
> +		 * setting task state in case a wakeup happened
> +		 * between testing and setting.
> +		 */
> +		if (rqst_should_sleep(rqstp)) {
> +			schedule();
> +		} else {
> +			__set_current_state(TASK_RUNNING);
> +			cond_resched();
> +		}
>  
> +		set_bit(RQ_BUSY, &rqstp->rq_flags);
> +		smp_mb__after_atomic();
> +	} else
> +		cond_resched();
>  	try_to_freeze();
> -
> -	set_bit(RQ_BUSY, &rqstp->rq_flags);
> -	smp_mb__after_atomic();
> -	clear_bit(SP_TASK_PENDING, &pool->sp_flags);
> -	rqstp->rq_xprt = svc_xprt_dequeue(pool);
> -	if (rqstp->rq_xprt)
> -		goto out_found;
> -
> -	if (kthread_should_stop())
> -		return NULL;
> -	return NULL;
> -out_found:
> -	clear_bit(SP_TASK_PENDING, &pool->sp_flags);
> -	/* Normally we will wait up to 5 seconds for any required
> -	 * cache information to be provided.
> -	 */
> -	if (!test_bit(SP_CONGESTED, &pool->sp_flags))
> -		rqstp->rq_chandle.thread_wait = 5*HZ;
> -	else
> -		rqstp->rq_chandle.thread_wait = 1*HZ;
> -	trace_svc_xprt_dequeue(rqstp);
> -	return rqstp->rq_xprt;
>  }
>  
>  static void svc_add_new_temp_xprt(struct svc_serv *serv, struct svc_xprt *newxpt)
> @@ -858,20 +840,33 @@ static void svc_handle_xprt(struct svc_rqst *rqstp, struct svc_xprt *xprt)
>   */
>  void svc_recv(struct svc_rqst *rqstp)
>  {
> -	struct svc_xprt		*xprt = NULL;
> +	struct svc_pool *pool = rqstp->rq_pool;
>  
>  	if (!svc_alloc_arg(rqstp))
>  		return;
>  
> -	try_to_freeze();
> -	cond_resched();
> +	svc_rqst_wait_for_work(rqstp);
> +
> +	clear_bit(SP_TASK_PENDING, &pool->sp_flags);
> +
>  	if (kthread_should_stop())
> -		goto out;
> +		return;
> +
> +	rqstp->rq_xprt = svc_xprt_dequeue(pool);
> +	if (rqstp->rq_xprt) {
> +		struct svc_xprt *xprt = rqstp->rq_xprt;
> +
> +		/* Normally we will wait up to 5 seconds for any required
> +		 * cache information to be provided.
> +		 */
> +		if (!test_bit(SP_CONGESTED, &pool->sp_flags))
> +			rqstp->rq_chandle.thread_wait = 5 * HZ;
> +		else
> +			rqstp->rq_chandle.thread_wait = 1 * HZ;
>  
> -	xprt = svc_get_next_xprt(rqstp);
> -	if (xprt)
> +		trace_svc_xprt_dequeue(rqstp);
>  		svc_handle_xprt(rqstp, xprt);
> -out:
> +	}
>  }
>  EXPORT_SYMBOL_GPL(svc_recv);
>  
> -- 
> 2.40.1
>
NeilBrown Aug. 3, 2023, 9:12 p.m. UTC | #2
On Fri, 04 Aug 2023, Chuck Lever wrote:
> On Wed, Aug 02, 2023 at 05:34:39PM +1000, NeilBrown wrote:
> > svc_get_next_xprt() does a lot more than just get an xprt.  It also
> > decides if it needs to sleep, depending not only on the availability of
> > xprts but also on the need to exit or handle external work.
> > 
> > So rename it to svc_rqst_wait_for_work() and only do the testing and
> > waiting.  Move all the waiting-related code out of svc_recv() into the
> > new svc_rqst_wait_for_work().
> > 
> > Move the dequeueing code out of svc_get_next_xprt() into svc_recv().
> > 
> > Previously svc_xprt_dequeue() would be called twice, once before waiting
> > and possibly once after.  Now rqst_should_sleep() is called twice
> > instead: once to decide if waiting is needed, and once to check again
> > after setting the task state, to see if we might have missed a wakeup.
> > 
> > Signed-off-by: NeilBrown <neilb@suse.de>
> 
> I've tested and applied this one and the previous one to the thread
> scheduling branch, with a few minor fix-ups. Apologies for how long
> this took; I'm wrestling with a SATA/block bug on the v6.6 test
> system that is being very sassy and hard to nail down.

I'm happy that we are making progress and the series is getting improved
along the way.  Good luck with the block bug.

> 
> I need to dive into the backchannel patch next. I'm trying to think
> of how I want to test that one.
> 

I think lock-grant callbacks should be easiest to trigger.
However I just tried mounting the filesystem twice with nosharecache,
and then locking that same file on both mounts.  I expected one to block
and hoped I would see the callback when the first lock was dropped.
However the second lock didn't block! That's a bug.
I haven't dug very deep yet, but I think the client is getting a
delegation for the first open (O_RDWR) so the server doesn't see the
lock.
Then when the lock arrives on the second mount, there is no conflicting
lock and the write delegation maybe isn't treated as a conflict?

I'll try to look more early next week.

NeilBrown
NeilBrown Aug. 6, 2023, 11:07 p.m. UTC | #3
On Fri, 04 Aug 2023, NeilBrown wrote:
> On Fri, 04 Aug 2023, Chuck Lever wrote:
> > On Wed, Aug 02, 2023 at 05:34:39PM +1000, NeilBrown wrote:
> > > svc_get_next_xprt() does a lot more than just get an xprt.  It also
> > > decides if it needs to sleep, depending not only on the availability of
> > > xprts but also on the need to exit or handle external work.
> > > 
> > > So rename it to svc_rqst_wait_for_work() and only do the testing and
> > > waiting.  Move all the waiting-related code out of svc_recv() into the
> > > new svc_rqst_wait_for_work().
> > > 
> > > Move the dequeueing code out of svc_get_next_xprt() into svc_recv().
> > > 
> > > Previously svc_xprt_dequeue() would be called twice, once before waiting
> > > and possibly once after.  Now rqst_should_sleep() is called twice
> > > instead: once to decide if waiting is needed, and once to check again
> > > after setting the task state, to see if we might have missed a wakeup.
> > > 
> > > Signed-off-by: NeilBrown <neilb@suse.de>
> > 
> > I've tested and applied this one and the previous one to the thread
> > scheduling branch, with a few minor fix-ups. Apologies for how long
> > this took; I'm wrestling with a SATA/block bug on the v6.6 test
> > system that is being very sassy and hard to nail down.
> 
> I'm happy that we are making progress and the series is getting improved
> along the way.  Good luck with the block bug.
> 
> > 
> > I need to dive into the backchannel patch next. I'm trying to think
> > of how I want to test that one.
> > 
> 
> I think lock-grant callbacks should be easiest to trigger.
> However I just tried mounting the filesystem twice with nosharecache,
> and then locking that same file on both mounts.  I expected one to block
> and hoped I would see the callback when the first lock was dropped.
> However the second lock didn't block! That's a bug.
> I haven't dug very deep yet, but I think the client is getting a
> delegation for the first open (O_RDWR) so the server doesn't see the
> lock.
> Then when the lock arrives on the second mount, there is no conflicting
> lock and the write delegation maybe isn't treated as a conflict?
> 
> I'll try to look more early next week.

The bug appears to be client-side.
When I mount the same filesystem twice using nosharecache the client
only creates a single session.  One of the mounts opens the file and
gets a write delegation.  The other mount opens the same file but this
doesn't trigger a delegation recall as the server thinks it is the same
client as it is using the same session.  But the client is caching the
two mounts separately and not sharing the delegation.
So when the second mount locks the file the server allows it, even
though the first mount thinks it holds a lock thanks to the delegation.

I think nosharecache needs to use a separate identity and create a
separate session.

I repeated the test using network namespaces to create genuinely
separate clients so the server now sees two clients opening the same file
and trying to lock it.  I now see both CB_RECALL and CB_NOTIFY_LOCK
callbacks being handled correctly.

NeilBrown
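
A blocking byte-range lock is enough to exercise both callbacks
described above: the second client's open recalls the write delegation,
the second locker blocks in fcntl(F_SETLKW), and when the first locker
exits, the server grants the queued lock and (on NFSv4.1+) notifies the
client via CB_NOTIFY_LOCK.  A minimal sketch of such a test program
(the mount paths in the comment are hypothetical, and as noted above
the two mounts must belong to genuinely separate clients):

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	/* Block until a whole-file write lock is granted, hold it
	 * briefly, then exit (exiting releases the lock).  Run once
	 * against each mount of the same exported file, e.g.:
	 *
	 *	./locktest /mnt/a/f & sleep 1; ./locktest /mnt/b/f
	 */
	int main(int argc, char **argv)
	{
		struct flock fl = {
			.l_type   = F_WRLCK,
			.l_whence = SEEK_SET,
			.l_start  = 0,
			.l_len    = 0,	/* 0 == lock through EOF */
		};
		int fd;

		if (argc < 2)
			return 1;
		fd = open(argv[1], O_RDWR);
		if (fd < 0 || fcntl(fd, F_SETLKW, &fl) < 0) {
			perror(argv[1]);
			return 1;
		}
		printf("lock granted on %s\n", argv[1]);
		sleep(10);	/* hold the lock */
		return 0;
	}
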
Chuck Lever III Aug. 7, 2023, 2:59 p.m. UTC | #4
> On Aug 6, 2023, at 7:07 PM, NeilBrown <neilb@suse.de> wrote:
> 
> On Fri, 04 Aug 2023, NeilBrown wrote:
>> On Fri, 04 Aug 2023, Chuck Lever wrote:
>>> On Wed, Aug 02, 2023 at 05:34:39PM +1000, NeilBrown wrote:
>>>> svc_get_next_xprt() does a lot more than just get an xprt.  It also
>>>> decides if it needs to sleep, depending not only on the availability of
>>>> xprts but also on the need to exit or handle external work.
>>>> 
>>>> So rename it to svc_rqst_wait_for_work() and only do the testing and
>>>> waiting.  Move all the waiting-related code out of svc_recv() into the
>>>> new svc_rqst_wait_for_work().
>>>> 
>>>> Move the dequeueing code out of svc_get_next_xprt() into svc_recv().
>>>> 
>>>> Previously svc_xprt_dequeue() would be called twice, once before waiting
>>>> and possibly once after.  Now rqst_should_sleep() is called twice
>>>> instead: once to decide if waiting is needed, and once to check again
>>>> after setting the task state, to see if we might have missed a wakeup.
>>>> 
>>>> Signed-off-by: NeilBrown <neilb@suse.de>
>>> 
>>> I've tested and applied this one and the previous one to the thread
>>> scheduling branch, with a few minor fix-ups. Apologies for how long
>>> this took; I'm wrestling with a SATA/block bug on the v6.6 test
>>> system that is being very sassy and hard to nail down.
>> 
>> I'm happy that we are making progress and the series is getting improved
>> along the way.  Good luck with the block bug.
>> 
>>> 
>>> I need to dive into the backchannel patch next. I'm trying to think
>>> of how I want to test that one.
>>> 
>> 
>> I think lock-grant callbacks should be easiest to trigger.
>> However I just tried mounting the filesystem twice with nosharecache,
>> and then locking that same file on both mounts.  I expected one to block
>> and hoped I would see the callback when the first lock was dropped.
>> However the second lock didn't block! That's a bug.
>> I haven't dug very deep yet, but I think the client is getting a
>> delegation for the first open (O_RDWR) so the server doesn't see the
>> lock.
>> Then when the lock arrives on the second mount, there is no conflicting
>> lock and the write delegation maybe isn't treated as a conflict?
>> 
>> I'll try to look more early next week.
> 
> The bug appears to be client-side.
> When I mount the same filesystem twice using nosharecache the client
> only creates a single session.  One of the mounts opens the file and
> gets a write delegation.  The other mount opens the same file but this
> doesn't trigger a delegation recall as the server thinks it is the same
> client as it is using the same session.  But the client is caching the
> two mounts separately and not sharing the delegation.
> So when the second mount locks the file the server allows it, even
> though the first mount thinks it holds a lock thanks to the delegation.
> 
> I think nosharecache needs to use a separate identity and create a
> separate session.
> 
> I repeated the test using network namespaces to create genuinely
> separate clients so the server now sees two clients opening the same file
> and trying to lock it.  I now see both CB_RECALL and CB_NOTIFY_LOCK
> callbacks being handled correctly.

Thanks for these results. I'll apply this one to the topic branch,
but I'd like to get client review/ack for it first.

Meanwhile, I've applied the rpc_status patches to nfsd-next, and
pulled in the first half of topic-sunrpc-thread-scheduling for
v6.6.


--
Chuck Lever

Patch

diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index 60759647fee4..f1d64ded89fb 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -722,51 +722,33 @@  rqst_should_sleep(struct svc_rqst *rqstp)
 	return true;
 }
 
-static struct svc_xprt *svc_get_next_xprt(struct svc_rqst *rqstp)
+static void svc_rqst_wait_for_work(struct svc_rqst *rqstp)
 {
-	struct svc_pool		*pool = rqstp->rq_pool;
-
-	/* rq_xprt should be clear on entry */
-	WARN_ON_ONCE(rqstp->rq_xprt);
+	struct svc_pool *pool = rqstp->rq_pool;
 
-	rqstp->rq_xprt = svc_xprt_dequeue(pool);
-	if (rqstp->rq_xprt)
-		goto out_found;
-
-	set_current_state(TASK_IDLE);
-	smp_mb__before_atomic();
-	clear_bit(SP_CONGESTED, &pool->sp_flags);
-	clear_bit(RQ_BUSY, &rqstp->rq_flags);
-	smp_mb__after_atomic();
-
-	if (likely(rqst_should_sleep(rqstp)))
-		schedule();
-	else
-		__set_current_state(TASK_RUNNING);
+	if (rqst_should_sleep(rqstp)) {
+		set_current_state(TASK_IDLE);
+		smp_mb__before_atomic();
+		clear_bit(SP_CONGESTED, &pool->sp_flags);
+		clear_bit(RQ_BUSY, &rqstp->rq_flags);
+		smp_mb__after_atomic();
+
+		/* Need to check should_sleep() again after
+		 * setting task state in case a wakeup happened
+		 * between testing and setting.
+		 */
+		if (rqst_should_sleep(rqstp)) {
+			schedule();
+		} else {
+			__set_current_state(TASK_RUNNING);
+			cond_resched();
+		}
 
+		set_bit(RQ_BUSY, &rqstp->rq_flags);
+		smp_mb__after_atomic();
+	} else
+		cond_resched();
 	try_to_freeze();
-
-	set_bit(RQ_BUSY, &rqstp->rq_flags);
-	smp_mb__after_atomic();
-	clear_bit(SP_TASK_PENDING, &pool->sp_flags);
-	rqstp->rq_xprt = svc_xprt_dequeue(pool);
-	if (rqstp->rq_xprt)
-		goto out_found;
-
-	if (kthread_should_stop())
-		return NULL;
-	return NULL;
-out_found:
-	clear_bit(SP_TASK_PENDING, &pool->sp_flags);
-	/* Normally we will wait up to 5 seconds for any required
-	 * cache information to be provided.
-	 */
-	if (!test_bit(SP_CONGESTED, &pool->sp_flags))
-		rqstp->rq_chandle.thread_wait = 5*HZ;
-	else
-		rqstp->rq_chandle.thread_wait = 1*HZ;
-	trace_svc_xprt_dequeue(rqstp);
-	return rqstp->rq_xprt;
 }
 
 static void svc_add_new_temp_xprt(struct svc_serv *serv, struct svc_xprt *newxpt)
@@ -858,20 +840,33 @@  static void svc_handle_xprt(struct svc_rqst *rqstp, struct svc_xprt *xprt)
  */
 void svc_recv(struct svc_rqst *rqstp)
 {
-	struct svc_xprt		*xprt = NULL;
+	struct svc_pool *pool = rqstp->rq_pool;
 
 	if (!svc_alloc_arg(rqstp))
 		return;
 
-	try_to_freeze();
-	cond_resched();
+	svc_rqst_wait_for_work(rqstp);
+
+	clear_bit(SP_TASK_PENDING, &pool->sp_flags);
+
 	if (kthread_should_stop())
-		goto out;
+		return;
+
+	rqstp->rq_xprt = svc_xprt_dequeue(pool);
+	if (rqstp->rq_xprt) {
+		struct svc_xprt *xprt = rqstp->rq_xprt;
+
+		/* Normally we will wait up to 5 seconds for any required
+		 * cache information to be provided.
+		 */
+		if (!test_bit(SP_CONGESTED, &pool->sp_flags))
+			rqstp->rq_chandle.thread_wait = 5 * HZ;
+		else
+			rqstp->rq_chandle.thread_wait = 1 * HZ;
 
-	xprt = svc_get_next_xprt(rqstp);
-	if (xprt)
+		trace_svc_xprt_dequeue(rqstp);
 		svc_handle_xprt(rqstp, xprt);
-out:
+	}
 }
 EXPORT_SYMBOL_GPL(svc_recv);