[RFC,v19,06/11] NFSD: Update find_clp_in_name_tree() to handle courtesy client

Message ID 1648742529-28551-7-git-send-email-dai.ngo@oracle.com (mailing list archive)
State New, archived
Series NFSD: Initial implementation of NFSv4 Courteous Server

Commit Message

Dai Ngo March 31, 2022, 4:02 p.m. UTC
Update find_clp_in_name_tree to check and expire courtesy client.

Update find_confirmed_client_by_name to discard the courtesy
client by setting CLIENT_EXPIRED.

Update nfsd4_setclientid to expire client with CLIENT_EXPIRED
state to prevent multiple confirmed clients with the same name
on the conf_name_tree.

Signed-off-by: Dai Ngo <dai.ngo@oracle.com>
---
 fs/nfsd/nfs4state.c | 27 ++++++++++++++++++++++++---
 fs/nfsd/state.h     | 22 ++++++++++++++++++++++
 2 files changed, 46 insertions(+), 3 deletions(-)

Comments

J. Bruce Fields April 1, 2022, 3:21 p.m. UTC | #1
On Thu, Mar 31, 2022 at 09:02:04AM -0700, Dai Ngo wrote:
> Update find_clp_in_name_tree to check and expire courtesy client.
> 
> Update find_confirmed_client_by_name to discard the courtesy
> client by setting CLIENT_EXPIRED.
> 
> Update nfsd4_setclientid to expire client with CLIENT_EXPIRED
> state to prevent multiple confirmed clients with the same name
> on the conf_name_tree.
> 
> Signed-off-by: Dai Ngo <dai.ngo@oracle.com>
> ---
>  fs/nfsd/nfs4state.c | 27 ++++++++++++++++++++++++---
>  fs/nfsd/state.h     | 22 ++++++++++++++++++++++
>  2 files changed, 46 insertions(+), 3 deletions(-)
> 
> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> index fe8969ba94b3..eadce5d19473 100644
> --- a/fs/nfsd/nfs4state.c
> +++ b/fs/nfsd/nfs4state.c
> @@ -2893,8 +2893,11 @@ find_clp_in_name_tree(struct xdr_netobj *name, struct rb_root *root)
>  			node = node->rb_left;
>  		else if (cmp < 0)
>  			node = node->rb_right;
> -		else
> +		else {
> +			if (nfsd4_courtesy_clnt_expired(clp))
> +				return NULL;
>  			return clp;
> +		}
>  	}
>  	return NULL;
>  }
> @@ -2973,8 +2976,15 @@ static bool clp_used_exchangeid(struct nfs4_client *clp)
>  static struct nfs4_client *
>  find_confirmed_client_by_name(struct xdr_netobj *name, struct nfsd_net *nn)
>  {
> +	struct nfs4_client *clp;
> +
>  	lockdep_assert_held(&nn->client_lock);
> -	return find_clp_in_name_tree(name, &nn->conf_name_tree);
> +	clp = find_clp_in_name_tree(name, &nn->conf_name_tree);
> +	if (clp && clp->cl_cs_client_state == NFSD4_CLIENT_RECONNECTED) {
> +		nfsd4_discard_courtesy_clnt(clp);
> +		clp = NULL;
> +	}
> +	return clp;
>  }
>  
....
> +static inline void
> +nfsd4_discard_courtesy_clnt(struct nfs4_client *clp)
> +{
> +	spin_lock(&clp->cl_cs_lock);
> +	clp->cl_cs_client_state = NFSD4_CLIENT_EXPIRED;
> +	spin_unlock(&clp->cl_cs_lock);
> +}

This is a red flag to me.... What guarantees that the condition we just
checked (cl_cs_client_state == NFSD4_CLIENT_RECONNECTED) is still true
here?  Why couldn't another thread have raced in and called
reactivate_courtesy_client?

Should we be holding cl_cs_lock across both the cl_cs_client_state and
the assignment?  Or should reactivate_courtesy_client be taking the
client_lock?

I'm still not clear on the need for the CLIENT_RECONNECTED state.

I think analysis would be a bit simpler if the only states were ACTIVE,
COURTESY, and EXPIRED, the only transitions possible were

	ACTIVE->COURTESY->{EXPIRED or ACTIVE}

and the same lock ensured that those were the only possible transitions.

(And to be honest I'd still prefer the original approach where we expire
clients from the posix locking code and then retry.  It handles an
additional case (the one where reboot happens after a long network
partition), and I don't think it requires adding these new client
states....)

--b.
Chuck Lever III April 1, 2022, 3:57 p.m. UTC | #2
> On Apr 1, 2022, at 11:21 AM, J. Bruce Fields <bfields@fieldses.org> wrote:
> 
> On Thu, Mar 31, 2022 at 09:02:04AM -0700, Dai Ngo wrote:
>> Update find_clp_in_name_tree to check and expire courtesy client.
>> 
>> Update find_confirmed_client_by_name to discard the courtesy
>> client by setting CLIENT_EXPIRED.
>> 
>> Update nfsd4_setclientid to expire client with CLIENT_EXPIRED
>> state to prevent multiple confirmed clients with the same name
>> on the conf_name_tree.
>> 
>> Signed-off-by: Dai Ngo <dai.ngo@oracle.com>
>> ---
>> fs/nfsd/nfs4state.c | 27 ++++++++++++++++++++++++---
>> fs/nfsd/state.h     | 22 ++++++++++++++++++++++
>> 2 files changed, 46 insertions(+), 3 deletions(-)
>> 
>> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
>> index fe8969ba94b3..eadce5d19473 100644
>> --- a/fs/nfsd/nfs4state.c
>> +++ b/fs/nfsd/nfs4state.c
>> @@ -2893,8 +2893,11 @@ find_clp_in_name_tree(struct xdr_netobj *name, struct rb_root *root)
>> 			node = node->rb_left;
>> 		else if (cmp < 0)
>> 			node = node->rb_right;
>> -		else
>> +		else {
>> +			if (nfsd4_courtesy_clnt_expired(clp))
>> +				return NULL;
>> 			return clp;
>> +		}
>> 	}
>> 	return NULL;
>> }
>> @@ -2973,8 +2976,15 @@ static bool clp_used_exchangeid(struct nfs4_client *clp)
>> static struct nfs4_client *
>> find_confirmed_client_by_name(struct xdr_netobj *name, struct nfsd_net *nn)
>> {
>> +	struct nfs4_client *clp;
>> +
>> 	lockdep_assert_held(&nn->client_lock);
>> -	return find_clp_in_name_tree(name, &nn->conf_name_tree);
>> +	clp = find_clp_in_name_tree(name, &nn->conf_name_tree);
>> +	if (clp && clp->cl_cs_client_state == NFSD4_CLIENT_RECONNECTED) {
>> +		nfsd4_discard_courtesy_clnt(clp);
>> +		clp = NULL;
>> +	}
>> +	return clp;
>> }
>> 
> ....
>> +static inline void
>> +nfsd4_discard_courtesy_clnt(struct nfs4_client *clp)
>> +{
>> +	spin_lock(&clp->cl_cs_lock);
>> +	clp->cl_cs_client_state = NFSD4_CLIENT_EXPIRED;
>> +	spin_unlock(&clp->cl_cs_lock);
>> +}
> 
> This is a red flag to me.... What guarantees that the condition we just
> checked (cl_cs_client_state == NFSD4_CLIENT_RECONNECTED) is still true
> here?  Why couldn't another thread have raced in and called
> reactivate_courtesy_client?
> 
> Should we be holding cl_cs_lock across both the cl_cs_client_state and
> the assignment?

Holding cl_cs_lock while checking cl_cs_client_state and then
updating it sounds OK to me.


> Or should reactivate_courtesy_client be taking the
> client_lock?
> 
> I'm still not clear on the need for the CLIENT_RECONNECTED state.
> 
> I think analysis would be a bit simpler if the only states were ACTIVE,
> COURTESY, and EXPIRED, the only transitions possible were
> 
> 	ACTIVE->COURTESY->{EXPIRED or ACTIVE}
> 
> and the same lock ensured that those were the only possible transitions.

Yes, that would be easier, but I don't think it's realistic. There
are lock ordering issues between nn->client_lock and the locks on
the individual files and state that make it onerous.


> (And to be honest I'd still prefer the original approach where we expire
> clients from the posix locking code and then retry.  It handles an
> additional case (the one where reboot happens after a long network
> partition), and I don't think it requires adding these new client
> states....)

The locking of the earlier approach was unworkable.

But, I'm happy to consider that again if you can come up with a way
of handling it properly and simply.


--
Chuck Lever
Dai Ngo April 1, 2022, 7:11 p.m. UTC | #3
On 4/1/22 8:57 AM, Chuck Lever III wrote:
>
>> On Apr 1, 2022, at 11:21 AM, J. Bruce Fields <bfields@fieldses.org> wrote:
>>
>> On Thu, Mar 31, 2022 at 09:02:04AM -0700, Dai Ngo wrote:
>>> Update find_clp_in_name_tree to check and expire courtesy client.
>>>
>>> Update find_confirmed_client_by_name to discard the courtesy
>>> client by setting CLIENT_EXPIRED.
>>>
>>> Update nfsd4_setclientid to expire client with CLIENT_EXPIRED
>>> state to prevent multiple confirmed clients with the same name
>>> on the conf_name_tree.
>>>
>>> Signed-off-by: Dai Ngo <dai.ngo@oracle.com>
>>> ---
>>> fs/nfsd/nfs4state.c | 27 ++++++++++++++++++++++++---
>>> fs/nfsd/state.h     | 22 ++++++++++++++++++++++
>>> 2 files changed, 46 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
>>> index fe8969ba94b3..eadce5d19473 100644
>>> --- a/fs/nfsd/nfs4state.c
>>> +++ b/fs/nfsd/nfs4state.c
>>> @@ -2893,8 +2893,11 @@ find_clp_in_name_tree(struct xdr_netobj *name, struct rb_root *root)
>>> 			node = node->rb_left;
>>> 		else if (cmp < 0)
>>> 			node = node->rb_right;
>>> -		else
>>> +		else {
>>> +			if (nfsd4_courtesy_clnt_expired(clp))
>>> +				return NULL;
>>> 			return clp;
>>> +		}
>>> 	}
>>> 	return NULL;
>>> }
>>> @@ -2973,8 +2976,15 @@ static bool clp_used_exchangeid(struct nfs4_client *clp)
>>> static struct nfs4_client *
>>> find_confirmed_client_by_name(struct xdr_netobj *name, struct nfsd_net *nn)
>>> {
>>> +	struct nfs4_client *clp;
>>> +
>>> 	lockdep_assert_held(&nn->client_lock);
>>> -	return find_clp_in_name_tree(name, &nn->conf_name_tree);
>>> +	clp = find_clp_in_name_tree(name, &nn->conf_name_tree);
>>> +	if (clp && clp->cl_cs_client_state == NFSD4_CLIENT_RECONNECTED) {
>>> +		nfsd4_discard_courtesy_clnt(clp);
>>> +		clp = NULL;
>>> +	}
>>> +	return clp;
>>> }
>>>
>> ....
>>> +static inline void
>>> +nfsd4_discard_courtesy_clnt(struct nfs4_client *clp)
>>> +{
>>> +	spin_lock(&clp->cl_cs_lock);
>>> +	clp->cl_cs_client_state = NFSD4_CLIENT_EXPIRED;
>>> +	spin_unlock(&clp->cl_cs_lock);
>>> +}
>> This is a red flag to me.... What guarantees that the condition we just
>> checked (cl_cs_client_state == NFSD4_CLIENT_RECONNECTED) is still true
>> here?  Why couldn't another thread have raced in and called
>> reactivate_courtesy_client?
>>
>> Should we be holding cl_cs_lock across both the cl_cs_client_state and
>> the assignment?
> Holding cl_cs_lock while checking cl_cs_client_state and then
> updating it sounds OK to me.

Thanks Bruce and Chuck!

I replaced nfsd4_discard_courtesy_clnt with nfsd4_discard_reconnect_clnt,
which checks and sets cl_cs_client_state under cl_cs_lock:

static inline bool
nfsd4_discard_reconnect_clnt(struct nfs4_client *clp)
{
         bool ret = false;

         spin_lock(&clp->cl_cs_lock);
         if (clp->cl_cs_client_state == NFSD4_CLIENT_RECONNECTED) {
                 clp->cl_cs_client_state = NFSD4_CLIENT_EXPIRED;
                 ret = true;
         }
         spin_unlock(&clp->cl_cs_lock);
         return ret;
}
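
A rough sketch of how find_confirmed_client_by_name() would then use it
(illustrative only; it follows the caller shape from the patch above):

static struct nfs4_client *
find_confirmed_client_by_name(struct xdr_netobj *name, struct nfsd_net *nn)
{
	struct nfs4_client *clp;

	lockdep_assert_held(&nn->client_lock);
	clp = find_clp_in_name_tree(name, &nn->conf_name_tree);
	/* the check and the state change are now atomic under cl_cs_lock */
	if (clp && nfsd4_discard_reconnect_clnt(clp))
		clp = NULL;
	return clp;
}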

>
>
>> Or should reactivate_courtesy_client be taking the
>> client_lock?
>>
>> I'm still not clear on the need for the CLIENT_RECONNECTED state.
>>
>> I think analysis would be a bit simpler if the only states were ACTIVE,
>> COURTESY, and EXPIRED, the only transitions possible were
>>
>> 	ACTIVE->COURTESY->{EXPIRED or ACTIVE}
>>
>> and the same lock ensured that those were the only possible transitions.
> Yes, that would be easier, but I don't think it's realistic. There
> are lock ordering issues between nn->client_lock and the locks on
> the individual files and state that make it onerous.
>
>
>> (And to be honest I'd still prefer the original approach where we expire
>> clients from the posix locking code and then retry.  It handles an
>> additional case (the one where reboot happens after a long network
>> partition), and I don't think it requires adding these new client
>> states....)
> The locking of the earlier approach was unworkable.
>
> But, I'm happy to consider that again if you can come up with a way
> of handling it properly and simply.

I will wait for feedback from Bruce before sending v20 with the
above change.

-Dai

>
>
> --
> Chuck Lever
>
>
>
J. Bruce Fields April 13, 2022, 12:55 p.m. UTC | #4
On Fri, Apr 01, 2022 at 12:11:34PM -0700, dai.ngo@oracle.com wrote:
> On 4/1/22 8:57 AM, Chuck Lever III wrote:
> >>(And to be honest I'd still prefer the original approach where we expire
> >>clients from the posix locking code and then retry.  It handles an
> >>additional case (the one where reboot happens after a long network
> >>partition), and I don't think it requires adding these new client
> >>states....)
> >The locking of the earlier approach was unworkable.
> >
> >But, I'm happy to consider that again if you can come up with a way
> >of handling it properly and simply.
> 
> I will wait for feedback from Bruce before sending v20 with the
> above change.

OK, I'd like to tweak the design in that direction.

I'd like to handle the case where the network goes down for a while, and
the server gets power-cycled before the network comes back up.  I think
that could easily happen.  There's no reason clients couldn't reclaim
all their state in that case.  We should let them.

To handle that case, we have to delay removing the client's stable
storage record until there's a lock conflict.  That means code that
checks for conflicts must be able to sleep.

In each case (opens, locks, delegations), conflicts are first detected
while holding a spinlock.  So we need to unlock before waiting, and then
retry if necessary.

We decided instead to remove the stable-storage record when first
converting a client to a courtesy client--then we can handle a conflict
by just setting a flag on the client that indicates it should no longer
be used, no need to drop any locks.

That leaves the client in a state where it's still on a bunch of global
data structures, but has to be treated as if it no longer exists.  That
turns out to require more special handling than expected.  You've shown
admirable persistence in handling those cases, but I'm still not
completely convinced this is correct.

We could avoid that complication, and also solve the
server-reboot-during-network-partition problem, if we went back to the
first plan and allowed ourselves to sleep at the time we detect a
conflict.  I don't think it's that complicated.

We end up using a lot of the same logic regardless, so don't throw away
the existing patches.

My basic plan is:

Keep the client state, but with only three values: ACTIVE, COURTESY, and
EXPIRABLE.

ACTIVE is the initial state, which we return to whenever we renew.  The
laundromat sets COURTESY whenever a client isn't renewed for a lease
period.  When we run into a conflict with a lock held by a client, we
call

  static bool try_to_expire_client(struct nfs4_client *clp)
  {
	return COURTESY == cmpxchg(&clp->cl_state, COURTESY, EXPIRABLE);
  }

If it returns true, that tells us the client was a courtesy client.  We
then call queue_work(laundry_wq, &nn->laundromat_work) to tell the
laundromat to actually expire the client.  Then if needed we can drop
locks, wait for the laundromat to do the work with
flush_workqueue(laundry_wq), and retry.

All the EXPIRABLE state does is tell the laundromat to expire this
client.  It does *not* prevent the client from being renewed and
acquiring new locks--if that happens before the laundromat gets to the
client, that's fine, we let it return to ACTIVE state and if someone
retries the conflicting lock they'll just get a denial.

Here's a suggested rough patch ordering.  If you want to go above and
beyond, I also suggest some tests that should pass after each step:


PATCH 1
-------

Implement courtesy behavior *only* for clients that have
delegations, but no actual opens or locks:

Define new cl_state field with values ACTIVE, COURTESY, and EXPIRABLE.
Set to ACTIVE on renewal.  Modify the laundromat so that instead of
expiring any client that's too old, it first checks if a client has
state consisting only of unconflicted delegations, and, if so, it sets
COURTESY.

Define try_to_expire_client as above.  In nfsd_break_deleg_cb, call
try_to_expire_client and queue_work.  (But also continue scheduling the
recall as we do in the current code, there's no harm to that.)

Modify the laundromat to try to expire old clients with EXPIRABLE set.
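
The per-client loop in nfs4_laundromat() would then look roughly like
this (just a sketch: client_has_only_delegations() is a placeholder for
whatever check we end up with, the other names are meant to match the
existing loop, and the expiry path is unchanged):

	list_for_each_entry_safe(clp, t, &nn->client_lru, cl_lru) {
		if (!state_expired(&lt, clp->cl_time))
			break;			/* rest of the list is newer */
		if (clp->cl_state != EXPIRABLE &&
		    client_has_only_delegations(clp)) {	/* placeholder */
			clp->cl_state = COURTESY;	/* keep the client */
			continue;
		}
		/* too old and not eligible for courtesy: expire as before */
		if (mark_client_expired_locked(clp))
			continue;
		list_add(&clp->cl_lru, &reaplist);
	}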

TESTS:
	- Establish a client, open a file, get a delegation, close the
	  file, wait 2 lease periods, verify that you can still use the
	  delegation.
	- Establish a client, open a file, get a delegation, close the
	  file, wait 2 lease periods, establish a second client, request
	  a conflicting open, verify that the open succeeds and that the
	  first client is no longer able to use its delegation.


PATCH 2
-------

Extend courtesy client behavior to clients that have opens or
delegations, but no locks:

Modify the laundromat to set COURTESY on old clients with state
consisting only of opens or unconflicted delegations.

Add in nfs4_resolve_deny_conflicts_locked and friends as in your patch
"Update nfs4_get_vfs_file()...", but in the case of a conflict, call
try_to_expire_client and queue_work(), then modify e.g.
nfs4_get_vfs_file to flush_workqueue() and then retry after unlocking
fi_lock.
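
The conflict path would then look something like this (sketch only;
find_deny_mode_conflict() is a placeholder for the check already done
under fi_lock, and nn comes from the surrounding code):

retry:
	spin_lock(&fp->fi_lock);
	clp = find_deny_mode_conflict(fp, open);	/* placeholder */
	if (clp && try_to_expire_client(clp)) {
		spin_unlock(&fp->fi_lock);
		queue_work(laundry_wq, &nn->laundromat_work);
		flush_workqueue(laundry_wq);	/* laundromat expires clp */
		goto retry;
	}
	/* any conflict that survives is against an active client: deny */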

TESTS:
	- establish a client, open a file, wait 2 lease periods, verify
	  that you can still use the open stateid.
	- establish a client, open a file, wait 2 lease periods,
	  establish a second client, request an open with a share mode
	  conflicting with the first open, verify that the open succeeds
	  and that first client is no longer able to use its open.

PATCH 3
-------

Minor tweak to prevent the laundromat from being freed out from
under a thread processing a conflicting lock:

Create and destroy the laundromat workqueue in init_nfsd/exit_nfsd
instead of where it's done currently.

(That makes the laundromat's lifetime longer than strictly necessary.
We could do better with a little more work; I think this is OK for now.)

TESTS:
	- just rerun any regression tests; this patch shouldn't change
	  behavior.

PATCH 4
-------

Extend courtesy client behavior to any client with state, including
locks:

Modify the laundromat to set COURTESY on any old client with state.

Add two new lock manager callbacks:

	void * (*lm_lock_expirable)(struct file_lock *);
	bool (*lm_expire_lock)(void *);

If lm_lock_expirable() is called and returns non-NULL, posix_lock_inode
should drop flc_lock, call lm_expire_lock() with the value returned from
lm_lock_expirable, and then restart the loop over flc_posix from the
beginning.

For now, nfsd's lm_lock_expirable will basically just be

	if (try_to_expire_client()) {
		queue_work()
		return get_net();
	}
	return NULL;

and lm_expire_lock will:

	flush_workqueue()
	put_net()
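
Fleshed out a little, the nfsd side might look like the following
(still just a sketch of the above; it assumes fl_owner points at the
nfs4_lockowner, as it does for locks nfsd sets):

static void *nfsd4_lm_lock_expirable(struct file_lock *cfl)
{
	struct nfs4_lockowner *lo = (struct nfs4_lockowner *)cfl->fl_owner;
	struct nfs4_client *clp = lo->lo_owner.so_client;
	struct nfsd_net *nn;

	if (!try_to_expire_client(clp))
		return NULL;	/* active client: a real conflict */
	nn = net_generic(clp->net, nfsd_net_id);
	queue_work(laundry_wq, &nn->laundromat_work);
	return get_net(clp->net);	/* pin the namespace for lm_expire_lock */
}

static bool nfsd4_lm_expire_lock(void *data)
{
	struct net *net = data;

	flush_workqueue(laundry_wq);	/* wait for the laundromat */
	put_net(net);
	return true;
}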

One more subtlety: the moment we drop the flc_lock, it's possible
another task could race in and free it.  Worse, the nfsd module could be
removed entirely--so nfsd's lm_expire_lock code could disappear out from
under us.  To prevent this, I think we need to add a struct module
*owner field to struct lock_manager_operations, and use it like:

	owner = fl->fl_lmops->owner;
	__module_get(owner);
	expire_lock = fl->fl_lmops->lm_expire_lock;
	spin_unlock(&ctx->flc_lock);
	expire_lock(...);
	module_put(owner);

Maybe there's some simpler way, but I don't see it.

TESTS:
	- retest courtesy client behavior using file locks this time.

--

That's the basic idea.  I think it should work--though I may have
overlooked something.

This has us flush the laundromat workqueue while holding mutexes in a
couple cases.  We could avoid that with a little more work, I think.
But those mutexes should only be associated with the client requesting a
new open/lock, and such a client shouldn't be touched by the laundromat,
so I think we're OK.

It'd also be helpful to update the info file with courtesy client
information, as you do in your current patches.

Does this make sense?

--b.
Dai Ngo April 13, 2022, 6:28 p.m. UTC | #5
On 4/13/22 5:55 AM, Bruce Fields wrote:
> On Fri, Apr 01, 2022 at 12:11:34PM -0700, dai.ngo@oracle.com wrote:
>> On 4/1/22 8:57 AM, Chuck Lever III wrote:
>>>> (And to be honest I'd still prefer the original approach where we expire
>>>> clients from the posix locking code and then retry.  It handles an
>>>> additional case (the one where reboot happens after a long network
>>>> partition), and I don't think it requires adding these new client
>>>> states....)
>>> The locking of the earlier approach was unworkable.
>>>
>>> But, I'm happy to consider that again if you can come up with a way
>>> of handling it properly and simply.
>> I will wait for feedback from Bruce before sending v20 with the
>> above change.
> OK, I'd like to tweak the design in that direction.
>
> I'd like to handle the case where the network goes down for a while, and
> the server gets power-cycled before the network comes back up.  I think
> that could easily happen.  There's no reason clients couldn't reclaim
> all their state in that case.  We should let them.
>
> To handle that case, we have to delay removing the client's stable
> storage record until there's a lock conflict.  That means code that
> checks for conflicts must be able to sleep.
>
> In each case (opens, locks, delegations), conflicts are first detected
> while holding a spinlock.  So we need to unlock before waiting, and then
> retry if necessary.
>
> We decided instead to remove the stable-storage record when first
> converting a client to a courtesy client--then we can handle a conflict
> by just setting a flag on the client that indicates it should no longer
> be used, no need to drop any locks.
>
> That leaves the client in a state where it's still on a bunch of global
> data structures, but has to be treated as if it no longer exists.  That
> turns out to require more special handling than expected.  You've shown
> admirable persistance in handling those cases, but I'm still not
> completely convinced this is correct.
>
> We could avoid that complication, and also solve the
> server-reboot-during-network-partition problem, if we went back to the
> first plan and allowed ourselves to sleep at the time we detect a
> conflict.  I don't think it's that complicated.
>
> We end up using a lot of the same logic regardless, so don't throw away
> the existing patches.
>
> My basic plan is:
>
> Keep the client state, but with only three values: ACTIVE, COURTESY, and
> EXPIRABLE.
>
> ACTIVE is the initial state, which we return to whenever we renew.  The
> laundromat sets COURTESY whenever a client isn't renewed for a lease
> period.  When we run into a conflict with a lock held by a client, we
> call
>
>    static bool try_to_expire_client(struct nfs4_client *clp)
>    {
> 	return COURTESY == cmpxchg(clp->cl_state, COURTESY, EXPIRABLE);
>    }
>
> If it returns true, that tells us the client was a courtesy client.  We
> then call queue_work(laundry_wq, &nn->laundromat_work) to tell the
> laundromat to actually expire the client.  Then if needed we can drop
> locks, wait for the laundromat to do the work with
> flush_workqueue(laundry_wq), and retry.
>
> All the EXPIRABLE state does is tell the laundromat to expire this
> client.  It does *not* prevent the client from being renewed and
> acquiring new locks--if that happens before the laundromat gets to the
> client, that's fine, we let it return to ACTIVE state and if someone
> retries the conflicing lock they'll just get a denial.
>
> Here's a suggested a rough patch ordering.  If you want to go above and
> beyond, I also suggest some tests that should pass after each step:
>
>
> PATCH 1
> -------
>
> Implement courtesy behavior *only* for clients that have
> delegations, but no actual opens or locks:
>
> Define new cl_state field with values ACTIVE, COURTESY, and EXPIRABLE.
> Set to ACTIVE on renewal.  Modify the laundromat so that instead of
> expiring any client that's too old, it first checks if a client has
> state consisting only of unconflicted delegations, and, if so, it sets
> COURTESY.
>
> Define try_to_expire_client as above.  In nfsd_break_deleg_cb, call
> try_to_expire_client and queue_work.  (But also continue scheduling the
> recall as we do in the current code, there's no harm to that.)
>
> Modify the laundromat to try to expire old clients with EXPIRED set.
>
> TESTS:
> 	- Establish a client, open a file, get a delegation, close the
> 	  file, wait 2 lease periods, verify that you can still use the
> 	  delegation.
> 	- Establish a client, open a file, get a delegation, close the
> 	  file, wait 2 lease periods, establish a second client, request
> 	  a conflicting open, verify that the open succeeds and that the
> 	  first client is no longer able to use its delegation.
>
>
> PATCH 2
> -------
>
> Extend courtesy client behavior to clients that have opens or
> delegations, but no locks:
>
> Modify the laundromat to set COURTESY on old clients with state
> consisting only of opens or unconflicted delegations.
>
> Add in nfs4_resolve_deny_conflicts_locked and friends as in your patch
> "Update nfs4_get_vfs_file()...", but in the case of a conflict, call
> try_to_expire_client and queue_work(), then modify e.g.
> nfs4_get_vfs_file to flush_workqueue() and then retry after unlocking
> fi_lock.
>
> TESTS:
> 	- establish a client, open a file, wait 2 lease periods, verify
> 	  that you can still use the open stateid.
> 	- establish a client, open a file, wait 2 lease periods,
> 	  establish a second client, request an open with a share mode
> 	  conflicting with the first open, verify that the open succeeds
> 	  and that first client is no longer able to use its open.
>
> PATCH 3
> -------
>
> Minor tweak to prevent the laundromat from being freed out from
> under a thread processing a conflicting lock:
>
> Create and destroy the laundromat workqueue in init_nfsd/exit_nfsd
> instead of where it's done currently.
>
> (That makes the laundromat's lifetime longer than strictly necessary.
> We could do better with a little more work; I think this is OK for now.)
>
> TESTS:
> 	- just rerun any regression tests; this patch shouldn't change
> 	  behavior.
>
> PATCH 4
> -------
>
> Extend courtesy client behavior to any client with state, including
> locks:
>
> Modify the laundromat to set COURTESY on any old client with state.
>
> Add two new lock manager callbacks:
>
> 	void * (*lm_lock_expirable)(struct file_lock *);
> 	bool (*lm_expire_lock)(void *);
>
> If lm_lock_expirable() is called and returns non-NULL, posix_lock_inode
> should drop flc_lock, call lm_expire_lock() with the value returned from
> lm_lock_expirable, and then restart the loop over flc_posix from the
> beginning.
>
> For now, nfsd's lm_lock_expirable will basically just be
>
> 	if (try_to_expire_client()) {
> 		queue_work()
> 		return get_net();
> 	}
> 	return NULL;
>
> and lm_expire_lock will:
>
> 	flush_workqueue()
> 	put_net()
>
> One more subtlety: the moment we drop the flc_lock, it's possible
> another task could race in and free it.  Worse, the nfsd module could be
> removed entirely--so nfsd's lm_expire_lock code could disappear out from
> under us.  To prevent this, I think we need to add a struct module
> *owner field to struct lock_manager_operations, and use it like:
>
> 	owner = fl->fl_lmops->owner;
> 	__get_module(owner);
> 	expire_lock = fl->fl_lmops->lm_expire_lock;
> 	spin_unlock(&ctx->flc_lock);
> 	expire_lock(...);
> 	module_put(owner);
>
> Maybe there's some simpler way, but I don't see it.
>
> TESTS:
> 	- retest courtesy client behavior using file locks this time.
>
> --
>
> That's the basic idea.  I think it should work--though I may have
> overlooked something.
>
> This has us flush the laundromat workqueue while holding mutexes in a
> couple cases.  We could avoid that with a little more work, I think.
> But those mutexes should only be associated with the client requesting a
> new open/lock, and such a client shouldn't be touched by the laundromat,
> so I think we're OK.
>
> It'd also be helpful to update the info file with courtesy client
> information, as you do in your current patches.
>
> Does this make sense?

I think most of the complications in the current patches are due to the
handling of race conditions when a courtesy client reconnects, as well as
creating and removing the client record (which I already addressed in v21).
The new approach here does not cover these race conditions; I guess
these are the details that will show up in the implementation.

I feel like we're going around in circles, but I will implement this
new approach; then we can compare to see if it's simpler than the current
one.

-Dai


>
> --b.
J. Bruce Fields April 13, 2022, 6:42 p.m. UTC | #6
On Wed, Apr 13, 2022 at 11:28:05AM -0700, dai.ngo@oracle.com wrote:
> 
> On 4/13/22 5:55 AM, Bruce Fields wrote:
> >On Fri, Apr 01, 2022 at 12:11:34PM -0700, dai.ngo@oracle.com wrote:
> >>On 4/1/22 8:57 AM, Chuck Lever III wrote:
> >>>>(And to be honest I'd still prefer the original approach where we expire
> >>>>clients from the posix locking code and then retry.  It handles an
> >>>>additional case (the one where reboot happens after a long network
> >>>>partition), and I don't think it requires adding these new client
> >>>>states....)
> >>>The locking of the earlier approach was unworkable.
> >>>
> >>>But, I'm happy to consider that again if you can come up with a way
> >>>of handling it properly and simply.
> >>I will wait for feedback from Bruce before sending v20 with the
> >>above change.
> >OK, I'd like to tweak the design in that direction.
> >
> >I'd like to handle the case where the network goes down for a while, and
> >the server gets power-cycled before the network comes back up.  I think
> >that could easily happen.  There's no reason clients couldn't reclaim
> >all their state in that case.  We should let them.
> >
> >To handle that case, we have to delay removing the client's stable
> >storage record until there's a lock conflict.  That means code that
> >checks for conflicts must be able to sleep.
> >
> >In each case (opens, locks, delegations), conflicts are first detected
> >while holding a spinlock.  So we need to unlock before waiting, and then
> >retry if necessary.
> >
> >We decided instead to remove the stable-storage record when first
> >converting a client to a courtesy client--then we can handle a conflict
> >by just setting a flag on the client that indicates it should no longer
> >be used, no need to drop any locks.
> >
> >That leaves the client in a state where it's still on a bunch of global
> >data structures, but has to be treated as if it no longer exists.  That
> >turns out to require more special handling than expected.  You've shown
> >admirable persistance in handling those cases, but I'm still not
> >completely convinced this is correct.
> >
> >We could avoid that complication, and also solve the
> >server-reboot-during-network-partition problem, if we went back to the
> >first plan and allowed ourselves to sleep at the time we detect a
> >conflict.  I don't think it's that complicated.
> >
> >We end up using a lot of the same logic regardless, so don't throw away
> >the existing patches.
> >
> >My basic plan is:
> >
> >Keep the client state, but with only three values: ACTIVE, COURTESY, and
> >EXPIRABLE.
> >
> >ACTIVE is the initial state, which we return to whenever we renew.  The
> >laundromat sets COURTESY whenever a client isn't renewed for a lease
> >period.  When we run into a conflict with a lock held by a client, we
> >call
> >
> >   static bool try_to_expire_client(struct nfs4_client *clp)
> >   {
> >	return COURTESY == cmpxchg(clp->cl_state, COURTESY, EXPIRABLE);
> >   }
> >
> >If it returns true, that tells us the client was a courtesy client.  We
> >then call queue_work(laundry_wq, &nn->laundromat_work) to tell the
> >laundromat to actually expire the client.  Then if needed we can drop
> >locks, wait for the laundromat to do the work with
> >flush_workqueue(laundry_wq), and retry.
> >
> >All the EXPIRABLE state does is tell the laundromat to expire this
> >client.  It does *not* prevent the client from being renewed and
> >acquiring new locks--if that happens before the laundromat gets to the
> >client, that's fine, we let it return to ACTIVE state and if someone
> >retries the conflicing lock they'll just get a denial.
> >
> >Here's a suggested a rough patch ordering.  If you want to go above and
> >beyond, I also suggest some tests that should pass after each step:
> >
> >
> >PATCH 1
> >-------
> >
> >Implement courtesy behavior *only* for clients that have
> >delegations, but no actual opens or locks:
> >
> >Define new cl_state field with values ACTIVE, COURTESY, and EXPIRABLE.
> >Set to ACTIVE on renewal.  Modify the laundromat so that instead of
> >expiring any client that's too old, it first checks if a client has
> >state consisting only of unconflicted delegations, and, if so, it sets
> >COURTESY.
> >
> >Define try_to_expire_client as above.  In nfsd_break_deleg_cb, call
> >try_to_expire_client and queue_work.  (But also continue scheduling the
> >recall as we do in the current code, there's no harm to that.)
> >
> >Modify the laundromat to try to expire old clients with EXPIRED set.
> >
> >TESTS:
> >	- Establish a client, open a file, get a delegation, close the
> >	  file, wait 2 lease periods, verify that you can still use the
> >	  delegation.
> >	- Establish a client, open a file, get a delegation, close the
> >	  file, wait 2 lease periods, establish a second client, request
> >	  a conflicting open, verify that the open succeeds and that the
> >	  first client is no longer able to use its delegation.
> >
> >
> >PATCH 2
> >-------
> >
> >Extend courtesy client behavior to clients that have opens or
> >delegations, but no locks:
> >
> >Modify the laundromat to set COURTESY on old clients with state
> >consisting only of opens or unconflicted delegations.
> >
> >Add in nfs4_resolve_deny_conflicts_locked and friends as in your patch
> >"Update nfs4_get_vfs_file()...", but in the case of a conflict, call
> >try_to_expire_client and queue_work(), then modify e.g.
> >nfs4_get_vfs_file to flush_workqueue() and then retry after unlocking
> >fi_lock.
> >
> >TESTS:
> >	- establish a client, open a file, wait 2 lease periods, verify
> >	  that you can still use the open stateid.
> >	- establish a client, open a file, wait 2 lease periods,
> >	  establish a second client, request an open with a share mode
> >	  conflicting with the first open, verify that the open succeeds
> >	  and that first client is no longer able to use its open.
> >
> >PATCH 3
> >-------
> >
> >Minor tweak to prevent the laundromat from being freed out from
> >under a thread processing a conflicting lock:
> >
> >Create and destroy the laundromat workqueue in init_nfsd/exit_nfsd
> >instead of where it's done currently.
> >
> >(That makes the laundromat's lifetime longer than strictly necessary.
> >We could do better with a little more work; I think this is OK for now.)
> >
> >TESTS:
> >	- just rerun any regression tests; this patch shouldn't change
> >	  behavior.
> >
> >PATCH 4
> >-------
> >
> >Extend courtesy client behavior to any client with state, including
> >locks:
> >
> >Modify the laundromat to set COURTESY on any old client with state.
> >
> >Add two new lock manager callbacks:
> >
> >	void * (*lm_lock_expirable)(struct file_lock *);
> >	bool (*lm_expire_lock)(void *);
> >
> >If lm_lock_expirable() is called and returns non-NULL, posix_lock_inode
> >should drop flc_lock, call lm_expire_lock() with the value returned from
> >lm_lock_expirable, and then restart the loop over flc_posix from the
> >beginning.
> >
> >For now, nfsd's lm_lock_expirable will basically just be
> >
> >	if (try_to_expire_client()) {
> >		queue_work()
> >		return get_net();
> >	}
> >	return NULL;
> >
> >and lm_expire_lock will:
> >
> >	flush_workqueue()
> >	put_net()
> >
> >One more subtlety: the moment we drop the flc_lock, it's possible
> >another task could race in and free it.  Worse, the nfsd module could be
> >removed entirely--so nfsd's lm_expire_lock code could disappear out from
> >under us.  To prevent this, I think we need to add a struct module
> >*owner field to struct lock_manager_operations, and use it like:
> >
> >	owner = fl->fl_lmops->owner;
> >	__get_module(owner);
> >	expire_lock = fl->fl_lmops->lm_expire_lock;
> >	spin_unlock(&ctx->flc_lock);
> >	expire_lock(...);
> >	module_put(owner);
> >
> >Maybe there's some simpler way, but I don't see it.
> >
> >TESTS:
> >	- retest courtesy client behavior using file locks this time.
> >
> >--
> >
> >That's the basic idea.  I think it should work--though I may have
> >overlooked something.
> >
> >This has us flush the laundromat workqueue while holding mutexes in a
> >couple cases.  We could avoid that with a little more work, I think.
> >But those mutexes should only be associated with the client requesting a
> >new open/lock, and such a client shouldn't be touched by the laundromat,
> >so I think we're OK.
> >
> >It'd also be helpful to update the info file with courtesy client
> >information, as you do in your current patches.
> >
> >Does this make sense?
> 
> I think most of the complications in the current patches is due to the
> handling of race conditions when courtesy client reconnects as well as
> creating and removing client record (which I already addressed in v21).
> The new approach here does not cover these race conditions,

That's the thing, there *is* no reconnection with this approach.  A
"courtesy" client still has a stable storage record and is treated in
every way exactly like a normal active client that just hasn't been
renewed in a long time.  I'm suggesting this approach exactly to avoid
the complications you're talking about.

> I guess
> these are the details that will show up in the implementation.
> 
> I feel like we're going around in the circle but I will implement this
> new approach then we can compare to see if it's simpler than the current
> one.

Thanks for giving it a look.

--b.
Dai Ngo April 15, 2022, 2:47 p.m. UTC | #7
On 4/13/22 5:55 AM, Bruce Fields wrote:
> On Fri, Apr 01, 2022 at 12:11:34PM -0700, dai.ngo@oracle.com wrote:
>> On 4/1/22 8:57 AM, Chuck Lever III wrote:
>>>> (And to be honest I'd still prefer the original approach where we expire
>>>> clients from the posix locking code and then retry.  It handles an
>>>> additional case (the one where reboot happens after a long network
>>>> partition), and I don't think it requires adding these new client
>>>> states....)
>>> The locking of the earlier approach was unworkable.
>>>
>>> But, I'm happy to consider that again if you can come up with a way
>>> of handling it properly and simply.
>> I will wait for feedback from Bruce before sending v20 with the
>> above change.
> OK, I'd like to tweak the design in that direction.
>
> I'd like to handle the case where the network goes down for a while, and
> the server gets power-cycled before the network comes back up.  I think
> that could easily happen.  There's no reason clients couldn't reclaim
> all their state in that case.  We should let them.
>
> To handle that case, we have to delay removing the client's stable
> storage record until there's a lock conflict.  That means code that
> checks for conflicts must be able to sleep.
>
> In each case (opens, locks, delegations), conflicts are first detected
> while holding a spinlock.  So we need to unlock before waiting, and then
> retry if necessary.
>
> We decided instead to remove the stable-storage record when first
> converting a client to a courtesy client--then we can handle a conflict
> by just setting a flag on the client that indicates it should no longer
> be used, no need to drop any locks.
>
> That leaves the client in a state where it's still on a bunch of global
> data structures, but has to be treated as if it no longer exists.  That
> turns out to require more special handling than expected.  You've shown
> admirable persistance in handling those cases, but I'm still not
> completely convinced this is correct.
>
> We could avoid that complication, and also solve the
> server-reboot-during-network-partition problem, if we went back to the
> first plan and allowed ourselves to sleep at the time we detect a
> conflict.  I don't think it's that complicated.
>
> We end up using a lot of the same logic regardless, so don't throw away
> the existing patches.
>
> My basic plan is:
>
> Keep the client state, but with only three values: ACTIVE, COURTESY, and
> EXPIRABLE.
>
> ACTIVE is the initial state, which we return to whenever we renew.  The
> laundromat sets COURTESY whenever a client isn't renewed for a lease
> period.  When we run into a conflict with a lock held by a client, we
> call
>
>    static bool try_to_expire_client(struct nfs4_client *clp)
>    {
> 	return COURTESY == cmpxchg(clp->cl_state, COURTESY, EXPIRABLE);
>    }
>
> If it returns true, that tells us the client was a courtesy client.  We
> then call queue_work(laundry_wq, &nn->laundromat_work) to tell the
> laundromat to actually expire the client.  Then if needed we can drop
> locks, wait for the laundromat to do the work with
> flush_workqueue(laundry_wq), and retry.
>
> All the EXPIRABLE state does is tell the laundromat to expire this
> client.  It does *not* prevent the client from being renewed and
> acquiring new locks--if that happens before the laundromat gets to the
> client, that's fine, we let it return to ACTIVE state and if someone
> retries the conflicing lock they'll just get a denial.
>
> Here's a suggested a rough patch ordering.  If you want to go above and
> beyond, I also suggest some tests that should pass after each step:
>
>
> PATCH 1
> -------
>
> Implement courtesy behavior *only* for clients that have
> delegations, but no actual opens or locks:

We can do a close(2) so that the only remaining state is the
delegation state. However, this causes a problem for doing exactly
what you want in patch 1.

>
> Define new cl_state field with values ACTIVE, COURTESY, and EXPIRABLE.
> Set to ACTIVE on renewal.  Modify the laundromat so that instead of
> expiring any client that's too old, it first checks if a client has
> state consisting only of unconflicted delegations, and, if so, it sets
> COURTESY.
>
> Define try_to_expire_client as above.  In nfsd_break_deleg_cb, call
> try_to_expire_client and queue_work.  (But also continue scheduling the
> recall as we do in the current code, there's no harm to that.)
>
> Modify the laundromat to try to expire old clients with EXPIRED set.
>
> TESTS:
> 	- Establish a client, open a file, get a delegation, close the
> 	  file, wait 2 lease periods, verify that you can still use the
> 	  delegation.

From user space, how do we force the client to use the delegation
state to read the file *after* doing a close(2)?

In my testing, once the read delegation is granted, the Linux client
uses the delegation state to read the file. So the test can do open(2),
read some data (e.g. with more(1)), wait 2 lease periods, and read again
to make sure it works as expected.

Can we leave the open state valid for this patch but not support
share reservation conflicts until Patch 2?

-Dai

> 	- Establish a client, open a file, get a delegation, close the
> 	  file, wait 2 lease periods, establish a second client, request
> 	  a conflicting open, verify that the open succeeds and that the
> 	  first client is no longer able to use its delegation.
>
>
> PATCH 2
> -------
>
> Extend courtesy client behavior to clients that have opens or
> delegations, but no locks:
>
> Modify the laundromat to set COURTESY on old clients with state
> consisting only of opens or unconflicted delegations.
>
> Add in nfs4_resolve_deny_conflicts_locked and friends as in your patch
> "Update nfs4_get_vfs_file()...", but in the case of a conflict, call
> try_to_expire_client and queue_work(), then modify e.g.
> nfs4_get_vfs_file to flush_workqueue() and then retry after unlocking
> fi_lock.
>
> TESTS:
> 	- establish a client, open a file, wait 2 lease periods, verify
> 	  that you can still use the open stateid.
> 	- establish a client, open a file, wait 2 lease periods,
> 	  establish a second client, request an open with a share mode
> 	  conflicting with the first open, verify that the open succeeds
> 	  and that first client is no longer able to use its open.
>
> PATCH 3
> -------
>
> Minor tweak to prevent the laundromat from being freed out from
> under a thread processing a conflicting lock:
>
> Create and destroy the laundromat workqueue in init_nfsd/exit_nfsd
> instead of where it's done currently.
>
> (That makes the laundromat's lifetime longer than strictly necessary.
> We could do better with a little more work; I think this is OK for now.)
>
> TESTS:
> 	- just rerun any regression tests; this patch shouldn't change
> 	  behavior.
>
> PATCH 4
> -------
>
> Extend courtesy client behavior to any client with state, including
> locks:
>
> Modify the laundromat to set COURTESY on any old client with state.
>
> Add two new lock manager callbacks:
>
> 	void * (*lm_lock_expirable)(struct file_lock *);
> 	bool (*lm_expire_lock)(void *);
>
> If lm_lock_expirable() is called and returns non-NULL, posix_lock_inode
> should drop flc_lock, call lm_expire_lock() with the value returned from
> lm_lock_expirable, and then restart the loop over flc_posix from the
> beginning.
>
> For now, nfsd's lm_lock_expirable will basically just be
>
> 	if (try_to_expire_client()) {
> 		queue_work()
> 		return get_net();
> 	}
> 	return NULL;
>
> and lm_expire_lock will:
>
> 	flush_workqueue()
> 	put_net()
>
> One more subtlety: the moment we drop the flc_lock, it's possible
> another task could race in and free it.  Worse, the nfsd module could be
> removed entirely--so nfsd's lm_expire_lock code could disappear out from
> under us.  To prevent this, I think we need to add a struct module
> *owner field to struct lock_manager_operations, and use it like:
>
> 	owner = fl->fl_lmops->owner;
> 	__get_module(owner);
> 	expire_lock = fl->fl_lmops->lm_expire_lock;
> 	spin_unlock(&ctx->flc_lock);
> 	expire_lock(...);
> 	module_put(owner);
>
> Maybe there's some simpler way, but I don't see it.
>
> TESTS:
> 	- retest courtesy client behavior using file locks this time.
>
> --
>
> That's the basic idea.  I think it should work--though I may have
> overlooked something.
>
> This has us flush the laundromat workqueue while holding mutexes in a
> couple cases.  We could avoid that with a little more work, I think.
> But those mutexes should only be associated with the client requesting a
> new open/lock, and such a client shouldn't be touched by the laundromat,
> so I think we're OK.
>
> It'd also be helpful to update the info file with courtesy client
> information, as you do in your current patches.
>
> Does this make sense?
>
> --b.
Dai Ngo April 15, 2022, 2:56 p.m. UTC | #8
On 4/15/22 7:47 AM, dai.ngo@oracle.com wrote:
>
> On 4/13/22 5:55 AM, Bruce Fields wrote:
>> On Fri, Apr 01, 2022 at 12:11:34PM -0700, dai.ngo@oracle.com wrote:
>>> On 4/1/22 8:57 AM, Chuck Lever III wrote:
>>>>> (And to be honest I'd still prefer the original approach where we 
>>>>> expire
>>>>> clients from the posix locking code and then retry.  It handles an
>>>>> additional case (the one where reboot happens after a long network
>>>>> partition), and I don't think it requires adding these new client
>>>>> states....)
>>>> The locking of the earlier approach was unworkable.
>>>>
>>>> But, I'm happy to consider that again if you can come up with a way
>>>> of handling it properly and simply.
>>> I will wait for feedback from Bruce before sending v20 with the
>>> above change.
>> OK, I'd like to tweak the design in that direction.
>>
>> I'd like to handle the case where the network goes down for a while, and
>> the server gets power-cycled before the network comes back up. I think
>> that could easily happen.  There's no reason clients couldn't reclaim
>> all their state in that case.  We should let them.
>>
>> To handle that case, we have to delay removing the client's stable
>> storage record until there's a lock conflict.  That means code that
>> checks for conflicts must be able to sleep.
>>
>> In each case (opens, locks, delegations), conflicts are first detected
>> while holding a spinlock.  So we need to unlock before waiting, and then
>> retry if necessary.
>>
>> We decided instead to remove the stable-storage record when first
>> converting a client to a courtesy client--then we can handle a conflict
>> by just setting a flag on the client that indicates it should no longer
>> be used, no need to drop any locks.
>>
>> That leaves the client in a state where it's still on a bunch of global
>> data structures, but has to be treated as if it no longer exists.  That
>> turns out to require more special handling than expected. You've shown
>> admirable persistance in handling those cases, but I'm still not
>> completely convinced this is correct.
>>
>> We could avoid that complication, and also solve the
>> server-reboot-during-network-partition problem, if we went back to the
>> first plan and allowed ourselves to sleep at the time we detect a
>> conflict.  I don't think it's that complicated.
>>
>> We end up using a lot of the same logic regardless, so don't throw away
>> the existing patches.
>>
>> My basic plan is:
>>
>> Keep the client state, but with only three values: ACTIVE, COURTESY, and
>> EXPIRABLE.
>>
>> ACTIVE is the initial state, which we return to whenever we renew.  The
>> laundromat sets COURTESY whenever a client isn't renewed for a lease
>> period.  When we run into a conflict with a lock held by a client, we
>> call
>>
>>    static bool try_to_expire_client(struct nfs4_client *clp)
>>    {
>>     return COURTESY == cmpxchg(clp->cl_state, COURTESY, EXPIRABLE);
>>    }
>>
>> If it returns true, that tells us the client was a courtesy client.  We
>> then call queue_work(laundry_wq, &nn->laundromat_work) to tell the
>> laundromat to actually expire the client.  Then if needed we can drop
>> locks, wait for the laundromat to do the work with
>> flush_workqueue(laundry_wq), and retry.
>>
>> All the EXPIRABLE state does is tell the laundromat to expire this
>> client.  It does *not* prevent the client from being renewed and
>> acquiring new locks--if that happens before the laundromat gets to the
>> client, that's fine, we let it return to ACTIVE state and if someone
>> retries the conflicing lock they'll just get a denial.
>>
>> Here's a suggested a rough patch ordering.  If you want to go above and
>> beyond, I also suggest some tests that should pass after each step:
>>
>>
>> PATCH 1
>> -------
>>
>> Implement courtesy behavior *only* for clients that have
>> delegations, but no actual opens or locks:
>
> we can do a close(2) so that the only remaining state is the
> delegation state. However this causes problem for doing exactly
> what you want in patch 1.
>
>>
>> Define new cl_state field with values ACTIVE, COURTESY, and EXPIRABLE.
>> Set to ACTIVE on renewal.  Modify the laundromat so that instead of
>> expiring any client that's too old, it first checks if a client has
>> state consisting only of unconflicted delegations, and, if so, it sets
>> COURTESY.
>>
>> Define try_to_expire_client as above.  In nfsd_break_deleg_cb, call
>> try_to_expire_client and queue_work.  (But also continue scheduling the
>> recall as we do in the current code, there's no harm to that.)
>>
>> Modify the laundromat to try to expire old clients with EXPIRED set.
>>
>> TESTS:
>>     - Establish a client, open a file, get a delegation, close the
>>       file, wait 2 lease periods, verify that you can still use the
>>       delegation.
>
> From user space, how do we force the client to use the delegation
> state to read the file *after* doing a close(2)?

I guess we can write a new pynfs test to do this, but I'd like to avoid
it if possible.

-Dai

>
> In my testing, once the read delegation is granted the Linux client
> uses the delegation state to read the file. So the test can do open(2),
> read some (more(1)), wait for 2 lease periods and read again and
> make sure it works as expected.
>
> Can we leave the open state remain valid for this patch but do not
> support share reservation conflict yet until Patch 2?
>
> -Dai
>
>>     - Establish a client, open a file, get a delegation, close the
>>       file, wait 2 lease periods, establish a second client, request
>>       a conflicting open, verify that the open succeeds and that the
>>       first client is no longer able to use its delegation.
>>
>>
>> PATCH 2
>> -------
>>
>> Extend courtesy client behavior to clients that have opens or
>> delegations, but no locks:
>>
>> Modify the laundromat to set COURTESY on old clients with state
>> consisting only of opens or unconflicted delegations.
>>
>> Add in nfs4_resolve_deny_conflicts_locked and friends as in your patch
>> "Update nfs4_get_vfs_file()...", but in the case of a conflict, call
>> try_to_expire_client and queue_work(), then modify e.g.
>> nfs4_get_vfs_file to flush_workqueue() and then retry after unlocking
>> fi_lock.
>>
>> TESTS:
>>     - establish a client, open a file, wait 2 lease periods, verify
>>       that you can still use the open stateid.
>>     - establish a client, open a file, wait 2 lease periods,
>>       establish a second client, request an open with a share mode
>>       conflicting with the first open, verify that the open succeeds
>>       and that first client is no longer able to use its open.
>>
>> PATCH 3
>> -------
>>
>> Minor tweak to prevent the laundromat from being freed out from
>> under a thread processing a conflicting lock:
>>
>> Create and destroy the laundromat workqueue in init_nfsd/exit_nfsd
>> instead of where it's done currently.
>>
>> (That makes the laundromat's lifetime longer than strictly necessary.
>> We could do better with a little more work; I think this is OK for now.)
>>
>> TESTS:
>>     - just rerun any regression tests; this patch shouldn't change
>>       behavior.
>>
>> PATCH 4
>> -------
>>
>> Extend courtesy client behavior to any client with state, including
>> locks:
>>
>> Modify the laundromat to set COURTESY on any old client with state.
>>
>> Add two new lock manager callbacks:
>>
>>     void * (*lm_lock_expirable)(struct file_lock *);
>>     bool (*lm_expire_lock)(void *);
>>
>> If lm_lock_expirable() is called and returns non-NULL, posix_lock_inode
>> should drop flc_lock, call lm_expire_lock() with the value returned from
>> lm_lock_expirable, and then restart the loop over flc_posix from the
>> beginning.
>>
>> For now, nfsd's lm_lock_expirable will basically just be
>>
>>     if (try_to_expire_client()) {
>>         queue_work()
>>         return get_net();
>>     }
>>     return NULL;
>>
>> and lm_expire_lock will:
>>
>>     flush_workqueue()
>>     put_net()
>>
>> One more subtlety: the moment we drop the flc_lock, it's possible
>> another task could race in and free it.  Worse, the nfsd module could be
>> removed entirely--so nfsd's lm_expire_lock code could disappear out from
>> under us.  To prevent this, I think we need to add a struct module
>> *owner field to struct lock_manager_operations, and use it like:
>>
>>     owner = fl->fl_lmops->owner;
>>     __get_module(owner);
>>     expire_lock = fl->fl_lmops->lm_expire_lock;
>>     spin_unlock(&ctx->flc_lock);
>>     expire_lock(...);
>>     module_put(owner);
>>
>> Maybe there's some simpler way, but I don't see it.
>>
>> TESTS:
>>     - retest courtesy client behavior using file locks this time.
>>
>> -- 
>>
>> That's the basic idea.  I think it should work--though I may have
>> overlooked something.
>>
>> This has us flush the laundromat workqueue while holding mutexes in a
>> couple cases.  We could avoid that with a little more work, I think.
>> But those mutexes should only be associated with the client requesting a
>> new open/lock, and such a client shouldn't be touched by the laundromat,
>> so I think we're OK.
>>
>> It'd also be helpful to update the info file with courtesy client
>> information, as you do in your current patches.
>>
>> Does this make sense?
>>
>> --b.
J. Bruce Fields April 15, 2022, 3:19 p.m. UTC | #9
On Fri, Apr 15, 2022 at 07:56:06AM -0700, dai.ngo@oracle.com wrote:
> On 4/15/22 7:47 AM, dai.ngo@oracle.com wrote:
> >From user space, how do we force the client to use the delegation
> >state to read the file *after* doing a close(2)?
> 
> I guess we can write new pynfs test to do this, but I'd like to avoid
> it if possible.

Right, this would require pynfs tests.  If you don't think they'd be
that bad.  But if you don't want to do tests for each step I think
that's not the end of the world, up to you.

--b.
Dai Ngo April 15, 2022, 3:36 p.m. UTC | #10
On 4/15/22 8:19 AM, Bruce Fields wrote:
> On Fri, Apr 15, 2022 at 07:56:06AM -0700, dai.ngo@oracle.com wrote:
>> On 4/15/22 7:47 AM, dai.ngo@oracle.com wrote:
>> >From user space, how do we force the client to use the delegation
>>> state to read the file *after* doing a close(2)?
>> I guess we can write new pynfs test to do this, but I'd like to avoid
>> it if possible.
> Right, this would require pynfs tests.  If you don't think they'd be
> that bad.  But if you don't want to do tests for each step I think
> that's not the end of the world, up to you.

OK, thanks Bruce. I'll leave the open state alone but won't do anything
with share reservations until patch 2.

-Dai
J. Bruce Fields April 15, 2022, 7:53 p.m. UTC | #11
On Fri, Apr 15, 2022 at 11:19:59AM -0400, Bruce Fields wrote:
> On Fri, Apr 15, 2022 at 07:56:06AM -0700, dai.ngo@oracle.com wrote:
> > On 4/15/22 7:47 AM, dai.ngo@oracle.com wrote:
> > >From user space, how do we force the client to use the delegation
> > >state to read the file *after* doing a close(2)?
> > 
> > I guess we can write a new pynfs test to do this, but I'd like to avoid
> > it if possible.
> 
> Right, this would require pynfs tests.  If you don't think they'd be
> that bad.

(Sorry, I meant to write "I don't think they'd be that bad"!  Anyway.)

> But if you don't want to do tests for each step I think
> that's not the end of the world, up to you.
> 
> --b.
J. Bruce Fields April 17, 2022, 7:07 p.m. UTC | #12
On Wed, Apr 13, 2022 at 08:55:50AM -0400, Bruce Fields wrote:
> On Fri, Apr 01, 2022 at 12:11:34PM -0700, dai.ngo@oracle.com wrote:
> > On 4/1/22 8:57 AM, Chuck Lever III wrote:
> > >>(And to be honest I'd still prefer the original approach where we expire
> > >>clients from the posix locking code and then retry.  It handles an
> > >>additional case (the one where reboot happens after a long network
> > >>partition), and I don't think it requires adding these new client
> > >>states....)
> > >The locking of the earlier approach was unworkable.
> > >
> > >But, I'm happy to consider that again if you can come up with a way
> > >of handling it properly and simply.
> > 
> > I will wait for feedback from Bruce before sending v20 with the
> > above change.
> 
> OK, I'd like to tweak the design in that direction.
> 
> I'd like to handle the case where the network goes down for a while, and
> the server gets power-cycled before the network comes back up.  I think
> that could easily happen.  There's no reason clients couldn't reclaim
> all their state in that case.  We should let them.
> 
> To handle that case, we have to delay removing the client's stable
> storage record until there's a lock conflict.  That means code that
> checks for conflicts must be able to sleep.
> 
> In each case (opens, locks, delegations), conflicts are first detected
> while holding a spinlock.  So we need to unlock before waiting, and then
> retry if necessary.
> 
> We decided instead to remove the stable-storage record when first
> converting a client to a courtesy client--then we can handle a conflict
> by just setting a flag on the client that indicates it should no longer
> be used, no need to drop any locks.
> 
> That leaves the client in a state where it's still on a bunch of global
> data structures, but has to be treated as if it no longer exists.  That
> turns out to require more special handling than expected.  You've shown
> admirable persistence in handling those cases, but I'm still not
> completely convinced this is correct.
> 
> We could avoid that complication, and also solve the
> server-reboot-during-network-partition problem, if we went back to the
> first plan and allowed ourselves to sleep at the time we detect a
> conflict.  I don't think it's that complicated.
> 
> We end up using a lot of the same logic regardless, so don't throw away
> the existing patches.
> 
> My basic plan is:
> 
> Keep the client state, but with only three values: ACTIVE, COURTESY, and
> EXPIRABLE.
> 
> ACTIVE is the initial state, which we return to whenever we renew.  The
> laundromat sets COURTESY whenever a client isn't renewed for a lease
> period.  When we run into a conflict with a lock held by a client, we
> call
> 
>   static bool try_to_expire_client(struct nfs4_client *clp)
>   {
> 	return COURTESY == cmpxchg(&clp->cl_state, COURTESY, EXPIRABLE);
>   }
> 
> If it returns true, that tells us the client was a courtesy client.  We
> then call queue_work(laundry_wq, &nn->laundromat_work) to tell the
> laundromat to actually expire the client.  Then if needed we can drop
> locks, wait for the laundromat to do the work with
> flush_workqueue(laundry_wq), and retry.
> 
> All the EXPIRABLE state does is tell the laundromat to expire this
> client.  It does *not* prevent the client from being renewed and
> acquiring new locks--if that happens before the laundromat gets to the
> client, that's fine, we let it return to ACTIVE state and if someone
> retries the conflicting lock they'll just get a denial.
> 
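Taking the three states and try_to_expire_client() above literally, here is a
minimal standalone C model of the transition a conflicting thread performs.
The struct, field, and helper names below are illustrative stand-ins, not the
actual nfsd definitions:

	#include <stdatomic.h>
	#include <stdbool.h>
	#include <stdio.h>

	enum { ACTIVE, COURTESY, EXPIRABLE };

	struct client {
		atomic_int cl_state;	/* modeled on the proposed cl_state field */
	};

	/*
	 * Returns true only if the client was a courtesy client; in that case
	 * it is now marked EXPIRABLE and the caller should kick the laundromat,
	 * drop its locks, flush the workqueue, and retry.
	 */
	static bool try_to_expire_client(struct client *clp)
	{
		int expected = COURTESY;

		return atomic_compare_exchange_strong(&clp->cl_state,
						      &expected, EXPIRABLE);
	}

	int main(void)
	{
		struct client c = { .cl_state = COURTESY };

		printf("first conflict: %s\n",
		       try_to_expire_client(&c) ? "marked EXPIRABLE" : "not courtesy");
		printf("second conflict: %s\n",
		       try_to_expire_client(&c) ? "marked EXPIRABLE" : "not courtesy");
		return 0;
	}

The compare-exchange form matters because an ACTIVE client (one that renewed
in time) is deliberately left alone, and two racing conflict paths cannot both
believe they were the one to expire the client.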
> Here's a suggested rough patch ordering.  If you want to go above and
> beyond, I also suggest some tests that should pass after each step:
> 
> 
> PATCH 1
> -------
> 
> Implement courtesy behavior *only* for clients that have
> delegations, but no actual opens or locks:
> 
> Define new cl_state field with values ACTIVE, COURTESY, and EXPIRABLE.
> Set to ACTIVE on renewal.  Modify the laundromat so that instead of
> expiring any client that's too old, it first checks if a client has
> state consisting only of unconflicted delegations, and, if so, it sets
> COURTESY.
> 
> Define try_to_expire_client as above.  In nfsd_break_deleg_cb, call
> try_to_expire_client and queue_work.  (But also continue scheduling the
> recall as we do in the current code, there's no harm to that.)
> 
> Modify the laundromat to try to expire old clients with EXPIRABLE set.
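As a hedged userspace sketch of what this first laundromat step amounts to
(LEASE_TIME, last_renewal, and delegations_only are illustrative stand-ins,
not the real nfsd names or checks):

	#include <stdatomic.h>
	#include <stdbool.h>
	#include <stdio.h>
	#include <time.h>

	enum { ACTIVE, COURTESY, EXPIRABLE };

	struct client {
		atomic_int cl_state;
		time_t last_renewal;
		bool delegations_only;	/* "state consisting only of unconflicted delegations" */
	};

	#define LEASE_TIME 90		/* seconds; illustrative */

	/*
	 * One laundromat visit per client for this step: expire a client that a
	 * conflict already marked EXPIRABLE; otherwise, if the client is too old,
	 * demote it to COURTESY when it only holds unconflicted delegations and
	 * expire it as before when it holds anything else.
	 */
	static void laundromat_visit(struct client *clp, time_t now)
	{
		if (atomic_load(&clp->cl_state) == EXPIRABLE) {
			printf("expire client (marked by an earlier conflict)\n");
			return;
		}
		if (now - clp->last_renewal < LEASE_TIME)
			return;		/* renewed recently, stays ACTIVE */
		if (clp->delegations_only)
			atomic_store(&clp->cl_state, COURTESY);
		else
			printf("expire client (old, holds opens/locks; later patches relax this)\n");
	}

	int main(void)
	{
		struct client c = { .cl_state = ACTIVE, .last_renewal = 0,
				    .delegations_only = true };

		laundromat_visit(&c, LEASE_TIME + 1);
		printf("client is now %s\n",
		       atomic_load(&c.cl_state) == COURTESY ? "COURTESY" : "not COURTESY");
		return 0;
	}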
> 
> TESTS:
> 	- Establish a client, open a file, get a delegation, close the
> 	  file, wait 2 lease periods, verify that you can still use the
> 	  delegation.
> 	- Establish a client, open a file, get a delegation, close the
> 	  file, wait 2 lease periods, establish a second client, request
> 	  a conflicting open, verify that the open succeeds and that the
> 	  first client is no longer able to use its delegation.
> 
> 
> PATCH 2
> -------
> 
> Extend courtesy client behavior to clients that have opens or
> delegations, but no locks:
> 
> Modify the laundromat to set COURTESY on old clients with state
> consisting only of opens or unconflicted delegations.
> 
> Add in nfs4_resolve_deny_conflicts_locked and friends as in your patch
> "Update nfs4_get_vfs_file()...", but in the case of a conflict, call
> try_to_expire_client and queue_work(), then modify e.g.
> nfs4_get_vfs_file to flush_workqueue() and then retry after unlocking
> fi_lock.
> 
> TESTS:
> 	- establish a client, open a file, wait 2 lease periods, verify
> 	  that you can still use the open stateid.
> 	- establish a client, open a file, wait 2 lease periods,
> 	  establish a second client, request an open with a share mode
> 	  conflicting with the first open, verify that the open succeeds
> 	  and that the first client is no longer able to use its open.
> 
> PATCH 3
> -------
> 
> Minor tweak to prevent the laundromat from being freed out from
> under a thread processing a conflicting lock:
> 
> Create and destroy the laundromat workqueue in init_nfsd/exit_nfsd
> instead of where it's done currently.
> 
> (That makes the laundromat's lifetime longer than strictly necessary.
> We could do better with a little more work; I think this is OK for now.)
> 
> TESTS:
> 	- just rerun any regression tests; this patch shouldn't change
> 	  behavior.
> 
> PATCH 4
> -------
> 
> Extend courtesy client behavior to any client with state, including
> locks:
> 
> Modify the laundromat to set COURTESY on any old client with state.
> 
> Add two new lock manager callbacks:
> 
> 	void * (*lm_lock_expirable)(struct file_lock *);
> 	bool (*lm_expire_lock)(void *);
> 
> If lm_lock_expirable() is called and returns non-NULL, posix_lock_inode
> should drop flc_lock, call lm_expire_lock() with the value returned from
> lm_lock_expirable, and then restart the loop over flc_posix from the
> beginning.
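A small self-contained model of that restart-after-unlock pattern follows.
The names here (posix_lock, lock_ctx, expire_courtesy_clients) are
illustrative stand-ins; the real change would live in fs/locks.c and go
through the new callbacks described above:

	#include <pthread.h>
	#include <stdbool.h>
	#include <stddef.h>

	struct posix_lock {
		struct posix_lock *next;
		bool owner_expirable;	/* stand-in for lm_lock_expirable() saying yes */
	};

	struct lock_ctx {
		pthread_mutex_t flc_lock;
		struct posix_lock *flc_posix;
	};

	/*
	 * Stand-in for the blocking side (nfsd's lm_expire_lock): it takes the
	 * lock itself and drops every lock whose owner was expirable, the way
	 * the laundromat would after queue_work()/flush_workqueue().
	 */
	static void expire_courtesy_clients(struct lock_ctx *ctx)
	{
		struct posix_lock **pp, *fl;

		pthread_mutex_lock(&ctx->flc_lock);
		for (pp = &ctx->flc_posix; (fl = *pp) != NULL; ) {
			if (fl->owner_expirable)
				*pp = fl->next;	/* real code would free the client's state */
			else
				pp = &fl->next;
		}
		pthread_mutex_unlock(&ctx->flc_lock);
	}

	/*
	 * Conflict scan: we may not sleep under flc_lock, so on finding an
	 * expirable owner we drop the lock, do the blocking expiry, retake the
	 * lock, and restart from the head -- the list may have changed while
	 * the lock was dropped.
	 */
	static void scan_for_conflicts(struct lock_ctx *ctx)
	{
		struct posix_lock *fl;

		pthread_mutex_lock(&ctx->flc_lock);
	restart:
		for (fl = ctx->flc_posix; fl != NULL; fl = fl->next) {
			if (fl->owner_expirable) {
				pthread_mutex_unlock(&ctx->flc_lock);
				expire_courtesy_clients(ctx);
				pthread_mutex_lock(&ctx->flc_lock);
				goto restart;
			}
			/* ... normal POSIX conflict checks would go here ... */
		}
		pthread_mutex_unlock(&ctx->flc_lock);
	}

	int main(void)
	{
		struct posix_lock a = { .next = NULL, .owner_expirable = true };
		struct lock_ctx ctx = { .flc_posix = &a };

		pthread_mutex_init(&ctx.flc_lock, NULL);
		scan_for_conflicts(&ctx);	/* terminates: the expirable lock is removed */
		return 0;
	}

The point of the model is only the control flow: nothing found before the
unlock is dereferenced afterwards except through a fresh walk of the list.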
> 
> For now, nfsd's lm_lock_expirable will basically just be
> 
> 	if (try_to_expire_client()) {
> 		queue_work()
> 		return get_net();

Correction: I forgot that the laundromat is global, not per-net.  So, we
can skip the put_net/get_net.  Also, lm_lock_expirable can just return
bool instead of void *, and lm_expire_lock needs no arguments.

--b.

> 	}
> 	return NULL;
> 
> and lm_expire_lock will:
> 
> 	flush_workqueue()
> 	put_net()
> 
> One more subtlety: the moment we drop the flc_lock, it's possible
> another task could race in and free it.  Worse, the nfsd module could be
> removed entirely--so nfsd's lm_expire_lock code could disappear out from
> under us.  To prevent this, I think we need to add a struct module
> *owner field to struct lock_manager_operations, and use it like:
> 
> 	owner = fl->fl_lmops->owner;
> 	__module_get(owner);
> 	expire_lock = fl->fl_lmops->lm_expire_lock;
> 	spin_unlock(&ctx->flc_lock);
> 	expire_lock(...);
> 	module_put(owner);
> 
> Maybe there's some simpler way, but I don't see it.
> 
> TESTS:
> 	- retest courtesy client behavior using file locks this time.
> 
> --
> 
> That's the basic idea.  I think it should work--though I may have
> overlooked something.
> 
> This has us flush the laundromat workqueue while holding mutexes in a
> couple cases.  We could avoid that with a little more work, I think.
> But those mutexes should only be associated with the client requesting a
> new open/lock, and such a client shouldn't be touched by the laundromat,
> so I think we're OK.
> 
> It'd also be helpful to update the info file with courtesy client
> information, as you do in your current patches.
> 
> Does this make sense?
> 
> --b.
Dai Ngo April 18, 2022, 1:18 a.m. UTC | #13
On 4/17/22 12:07 PM, Bruce Fields wrote:
> On Wed, Apr 13, 2022 at 08:55:50AM -0400, Bruce Fields wrote:
>> On Fri, Apr 01, 2022 at 12:11:34PM -0700, dai.ngo@oracle.com wrote:
>>> On 4/1/22 8:57 AM, Chuck Lever III wrote:
>>>>> (And to be honest I'd still prefer the original approach where we expire
>>>>> clients from the posix locking code and then retry.  It handles an
>>>>> additional case (the one where reboot happens after a long network
>>>>> partition), and I don't think it requires adding these new client
>>>>> states....)
>>>> The locking of the earlier approach was unworkable.
>>>>
>>>> But, I'm happy to consider that again if you can come up with a way
>>>> of handling it properly and simply.
>>> I will wait for feedback from Bruce before sending v20 with the
>>> above change.
>> OK, I'd like to tweak the design in that direction.
>>
>> I'd like to handle the case where the network goes down for a while, and
>> the server gets power-cycled before the network comes back up.  I think
>> that could easily happen.  There's no reason clients couldn't reclaim
>> all their state in that case.  We should let them.
>>
>> To handle that case, we have to delay removing the client's stable
>> storage record until there's a lock conflict.  That means code that
>> checks for conflicts must be able to sleep.
>>
>> In each case (opens, locks, delegations), conflicts are first detected
>> while holding a spinlock.  So we need to unlock before waiting, and then
>> retry if necessary.
>>
>> We decided instead to remove the stable-storage record when first
>> converting a client to a courtesy client--then we can handle a conflict
>> by just setting a flag on the client that indicates it should no longer
>> be used, no need to drop any locks.
>>
>> That leaves the client in a state where it's still on a bunch of global
>> data structures, but has to be treated as if it no longer exists.  That
>> turns out to require more special handling than expected.  You've shown
>> admirable persistence in handling those cases, but I'm still not
>> completely convinced this is correct.
>>
>> We could avoid that complication, and also solve the
>> server-reboot-during-network-partition problem, if we went back to the
>> first plan and allowed ourselves to sleep at the time we detect a
>> conflict.  I don't think it's that complicated.
>>
>> We end up using a lot of the same logic regardless, so don't throw away
>> the existing patches.
>>
>> My basic plan is:
>>
>> Keep the client state, but with only three values: ACTIVE, COURTESY, and
>> EXPIRABLE.
>>
>> ACTIVE is the initial state, which we return to whenever we renew.  The
>> laundromat sets COURTESY whenever a client isn't renewed for a lease
>> period.  When we run into a conflict with a lock held by a client, we
>> call
>>
>>    static bool try_to_expire_client(struct nfs4_client *clp)
>>    {
>> 	return COURTESY == cmpxchg(&clp->cl_state, COURTESY, EXPIRABLE);
>>    }
>>
>> If it returns true, that tells us the client was a courtesy client.  We
>> then call queue_work(laundry_wq, &nn->laundromat_work) to tell the
>> laundromat to actually expire the client.  Then if needed we can drop
>> locks, wait for the laundromat to do the work with
>> flush_workqueue(laundry_wq), and retry.
>>
>> All the EXPIRABLE state does is tell the laundromat to expire this
>> client.  It does *not* prevent the client from being renewed and
>> acquiring new locks--if that happens before the laundromat gets to the
>> client, that's fine, we let it return to ACTIVE state and if someone
>> retries the conflicting lock they'll just get a denial.
>>
>> Here's a suggested rough patch ordering.  If you want to go above and
>> beyond, I also suggest some tests that should pass after each step:
>>
>>
>> PATCH 1
>> -------
>>
>> Implement courtesy behavior *only* for clients that have
>> delegations, but no actual opens or locks:
>>
>> Define new cl_state field with values ACTIVE, COURTESY, and EXPIRABLE.
>> Set to ACTIVE on renewal.  Modify the laundromat so that instead of
>> expiring any client that's too old, it first checks if a client has
>> state consisting only of unconflicted delegations, and, if so, it sets
>> COURTESY.
>>
>> Define try_to_expire_client as above.  In nfsd_break_deleg_cb, call
>> try_to_expire_client and queue_work.  (But also continue scheduling the
>> recall as we do in the current code, there's no harm to that.)
>>
>> Modify the laundromat to try to expire old clients with EXPIRABLE set.
>>
>> TESTS:
>> 	- Establish a client, open a file, get a delegation, close the
>> 	  file, wait 2 lease periods, verify that you can still use the
>> 	  delegation.
>> 	- Establish a client, open a file, get a delegation, close the
>> 	  file, wait 2 lease periods, establish a second client, request
>> 	  a conflicting open, verify that the open succeeds and that the
>> 	  first client is no longer able to use its delegation.
>>
>>
>> PATCH 2
>> -------
>>
>> Extend courtesy client behavior to clients that have opens or
>> delegations, but no locks:
>>
>> Modify the laundromat to set COURTESY on old clients with state
>> consisting only of opens or unconflicted delegations.
>>
>> Add in nfs4_resolve_deny_conflicts_locked and friends as in your patch
>> "Update nfs4_get_vfs_file()...", but in the case of a conflict, call
>> try_to_expire_client and queue_work(), then modify e.g.
>> nfs4_get_vfs_file to flush_workqueue() and then retry after unlocking
>> fi_lock.
>>
>> TESTS:
>> 	- establish a client, open a file, wait 2 lease periods, verify
>> 	  that you can still use the open stateid.
>> 	- establish a client, open a file, wait 2 lease periods,
>> 	  establish a second client, request an open with a share mode
>> 	  conflicting with the first open, verify that the open succeeds
>> 	  and that the first client is no longer able to use its open.
>>
>> PATCH 3
>> -------
>>
>> Minor tweak to prevent the laundromat from being freed out from
>> under a thread processing a conflicting lock:
>>
>> Create and destroy the laundromat workqueue in init_nfsd/exit_nfsd
>> instead of where it's done currently.
>>
>> (That makes the laundromat's lifetime longer than strictly necessary.
>> We could do better with a little more work; I think this is OK for now.)
>>
>> TESTS:
>> 	- just rerun any regression tests; this patch shouldn't change
>> 	  behavior.
>>
>> PATCH 4
>> -------
>>
>> Extend courtesy client behavior to any client with state, including
>> locks:
>>
>> Modify the laundromat to set COURTESY on any old client with state.
>>
>> Add two new lock manager callbacks:
>>
>> 	void * (*lm_lock_expirable)(struct file_lock *);
>> 	bool (*lm_expire_lock)(void *);
>>
>> If lm_lock_expirable() is called and returns non-NULL, posix_lock_inode
>> should drop flc_lock, call lm_expire_lock() with the value returned from
>> lm_lock_expirable, and then restart the loop over flc_posix from the
>> beginning.
>>
>> For now, nfsd's lm_lock_expirable will basically just be
>>
>> 	if (try_to_expire_client()) {
>> 		queue_work()
>> 		return get_net();
> Correction: I forgot that the laundromat is global, not per-net.  So, we
> can skip the put_net/get_net.  Also, lm_lock_expirable can just return
> bool instead of void *, and lm_expire_lock needs no arguments.

okay.

-Dai

>
> --b.
>
>> 	}
>> 	return NULL;
>>
>> and lm_expire_lock will:
>>
>> 	flush_workqueue()
>> 	put_net()
>>
>> One more subtlety: the moment we drop the flc_lock, it's possible
>> another task could race in and free it.  Worse, the nfsd module could be
>> removed entirely--so nfsd's lm_expire_lock code could disappear out from
>> under us.  To prevent this, I think we need to add a struct module
>> *owner field to struct lock_manager_operations, and use it like:
>>
>> 	owner = fl->fl_lmops->owner;
>> 	__module_get(owner);
>> 	expire_lock = fl->fl_lmops->lm_expire_lock;
>> 	spin_unlock(&ctx->flc_lock);
>> 	expire_lock(...);
>> 	module_put(owner);
>>
>> Maybe there's some simpler way, but I don't see it.
>>
>> TESTS:
>> 	- retest courtesy client behavior using file locks this time.
>>
>> --
>>
>> That's the basic idea.  I think it should work--though I may have
>> overlooked something.
>>
>> This has us flush the laundromat workqueue while holding mutexes in a
>> couple cases.  We could avoid that with a little more work, I think.
>> But those mutexes should only be associated with the client requesting a
>> new open/lock, and such a client shouldn't be touched by the laundromat,
>> so I think we're OK.
>>
>> It'd also be helpful to update the info file with courtesy client
>> information, as you do in your current patches.
>>
>> Does this make sense?
>>
>> --b.
diff mbox series

Patch

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index fe8969ba94b3..eadce5d19473 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -2893,8 +2893,11 @@  find_clp_in_name_tree(struct xdr_netobj *name, struct rb_root *root)
 			node = node->rb_left;
 		else if (cmp < 0)
 			node = node->rb_right;
-		else
+		else {
+			if (nfsd4_courtesy_clnt_expired(clp))
+				return NULL;
 			return clp;
+		}
 	}
 	return NULL;
 }
@@ -2973,8 +2976,15 @@  static bool clp_used_exchangeid(struct nfs4_client *clp)
 static struct nfs4_client *
 find_confirmed_client_by_name(struct xdr_netobj *name, struct nfsd_net *nn)
 {
+	struct nfs4_client *clp;
+
 	lockdep_assert_held(&nn->client_lock);
-	return find_clp_in_name_tree(name, &nn->conf_name_tree);
+	clp = find_clp_in_name_tree(name, &nn->conf_name_tree);
+	if (clp && clp->cl_cs_client_state == NFSD4_CLIENT_RECONNECTED) {
+		nfsd4_discard_courtesy_clnt(clp);
+		clp = NULL;
+	}
+	return clp;
 }
 
 static struct nfs4_client *
@@ -4091,12 +4101,19 @@  nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	struct nfs4_client	*unconf = NULL;
 	__be32 			status;
 	struct nfsd_net		*nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+	struct nfs4_client	*cclient = NULL;
 
 	new = create_client(clname, rqstp, &clverifier);
 	if (new == NULL)
 		return nfserr_jukebox;
 	spin_lock(&nn->client_lock);
-	conf = find_confirmed_client_by_name(&clname, nn);
+	/* find confirmed client by name */
+	conf = find_clp_in_name_tree(&clname, &nn->conf_name_tree);
+	if (conf && conf->cl_cs_client_state == NFSD4_CLIENT_RECONNECTED) {
+		cclient = conf;
+		conf = NULL;
+	}
+
 	if (conf && client_has_state(conf)) {
 		status = nfserr_clid_inuse;
 		if (clp_used_exchangeid(conf))
@@ -4127,7 +4144,11 @@  nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	new = NULL;
 	status = nfs_ok;
 out:
+	if (cclient)
+		unhash_client_locked(cclient);
 	spin_unlock(&nn->client_lock);
+	if (cclient)
+		expire_client(cclient);
 	if (new)
 		free_client(new);
 	if (unconf) {
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 8b81493ee48a..a2e2ac1a945a 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -735,6 +735,7 @@  extern void nfsd4_client_record_remove(struct nfs4_client *clp);
 extern int nfsd4_client_record_check(struct nfs4_client *clp);
 extern void nfsd4_record_grace_done(struct nfsd_net *nn);
 
+/* courteous server */
 static inline bool
 nfsd4_expire_courtesy_clnt(struct nfs4_client *clp)
 {
@@ -749,4 +750,25 @@  nfsd4_expire_courtesy_clnt(struct nfs4_client *clp)
 	return rc;
 }
 
+static inline void
+nfsd4_discard_courtesy_clnt(struct nfs4_client *clp)
+{
+	spin_lock(&clp->cl_cs_lock);
+	clp->cl_cs_client_state = NFSD4_CLIENT_EXPIRED;
+	spin_unlock(&clp->cl_cs_lock);
+}
+
+static inline bool
+nfsd4_courtesy_clnt_expired(struct nfs4_client *clp)
+{
+	bool rc = false;
+
+	spin_lock(&clp->cl_cs_lock);
+	if (clp->cl_cs_client_state == NFSD4_CLIENT_EXPIRED)
+		rc = true;
+	if (clp->cl_cs_client_state == NFSD4_CLIENT_COURTESY)
+		clp->cl_cs_client_state = NFSD4_CLIENT_RECONNECTED;
+	spin_unlock(&clp->cl_cs_lock);
+	return rc;
+}
 #endif   /* NFSD4_STATE_H */