diff mbox series

[v2,3/3] nfsd: keep a tally of RECLAIM_COMPLETE operations when using nfsdcld

Message ID 20181218142926.27933-4-smayhew@redhat.com (mailing list archive)
State New, archived
Headers show
Series un-deprecate nfsdcld | expand

Commit Message

Scott Mayhew Dec. 18, 2018, 2:29 p.m. UTC
When using nfsdcld for NFSv4 client tracking, track the number of
RECLAIM_COMPLETE operations we receive from "known" clients to help in
deciding if we can lift the grace period early (or whether we need to
start a v4 grace period at all).

Signed-off-by: Scott Mayhew <smayhew@redhat.com>
---
 fs/nfsd/netns.h       |  3 +++
 fs/nfsd/nfs4recover.c |  5 +++++
 fs/nfsd/nfs4state.c   | 52 +++++++++++++++++++++++++++++++++++++++++++
 fs/nfsd/nfsctl.c      |  1 +
 4 files changed, 61 insertions(+)

Comments

J. Bruce Fields Dec. 19, 2018, 5:46 p.m. UTC | #1
On Tue, Dec 18, 2018 at 09:29:26AM -0500, Scott Mayhew wrote:
> +skip_grace:
> +	printk(KERN_INFO "NFSD: no clients to reclaim, skipping NFSv4 grace period (net %x)\n",
> +			net->ns.inum);
> +	queue_delayed_work(laundry_wq, &nn->laundromat_work, nn->nfsd4_lease * HZ);
> +	/*
> +	 * we could call nfsd4_end_grace() here, but it has a dprintk()
> +	 * that would be confusing if debug logging is enabled
> +	 */

In that case, I'd rather pull the dprintk out of nfsd4_end_grace into
its only other caller (nfs4_laundromat), instead of duplicating this
stuff.

> +	nn->grace_ended = true;
> +	nfsd4_record_grace_done(nn);
> +	locks_end_grace(&nn->nfsd4_manager);

(Yes, it's only three lines, but it's a little subtle, I'd rather have
it all in one place.)

--b.

> +	return 0;
>  }
>  
>  /* initialization to perform when the nfsd service is started: */
> diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
> index 6384c9b94898..950ac6683be9 100644
> --- a/fs/nfsd/nfsctl.c
> +++ b/fs/nfsd/nfsctl.c
> @@ -1240,6 +1240,7 @@ static __net_init int nfsd_init_net(struct net *net)
>  	nn->nfsd4_lease = 45;	/* default lease time */
>  	nn->nfsd4_grace = 45;
>  	nn->somebody_reclaimed = false;
> +	nn->track_reclaim_completes = false;
>  	nn->clverifier_counter = prandom_u32();
>  	nn->clientid_counter = prandom_u32();
>  	nn->s2s_cp_cl_id = nn->clientid_counter++;
> -- 
> 2.17.1
J. Bruce Fields Dec. 19, 2018, 6:28 p.m. UTC | #2
On Tue, Dec 18, 2018 at 09:29:26AM -0500, Scott Mayhew wrote:
> diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
> index 89c2a27956d0..ae74814b2397 100644
> --- a/fs/nfsd/nfs4recover.c
> +++ b/fs/nfsd/nfs4recover.c
> @@ -1251,6 +1251,7 @@ nfsd4_cld_grace_done(struct nfsd_net *nn)
>  	free_cld_upcall(cup);
>  out_err:
>  	nfs4_release_reclaim(nn);
> +	atomic_set(&nn->nr_reclaim_complete, 0);
>  	if (ret)
>  		printk(KERN_ERR "NFSD: Unable to end grace period: %d\n", ret);
>  }
> @@ -1270,6 +1271,8 @@ nfs4_cld_state_init(struct net *net)
>  	for (i = 0; i < CLIENT_HASH_SIZE; i++)
>  		INIT_LIST_HEAD(&nn->reclaim_str_hashtbl[i]);
>  	nn->reclaim_str_hashtbl_size = 0;
> +	nn->track_reclaim_completes = true;
> +	atomic_set(&nn->nr_reclaim_complete, 0);
>  
>  	return 0;
>  }
...
> @@ -1318,6 +1322,7 @@ nfsd4_cld_tracking_exit(struct net *net)
>  	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
>  
>  	nfs4_release_reclaim(nn);
> +	atomic_set(&nn->nr_reclaim_complete, 0);
>  	nfsd4_remove_cld_pipe(net);
>  	nfs4_cld_state_shutdown(net);
>  }

We're initializing nr_reclaim_complete in 3 different places, probably
only one of those is really necessary?

--b.
J. Bruce Fields Dec. 19, 2018, 6:36 p.m. UTC | #3
On Tue, Dec 18, 2018 at 09:29:26AM -0500, Scott Mayhew wrote:
> When using nfsdcld for NFSv4 client tracking, track the number of
> RECLAIM_COMPLETE operations we receive from "known" clients to help in
> deciding if we can lift the grace period early (or whether we need to
> start a v4 grace period at all).
> 
> Signed-off-by: Scott Mayhew <smayhew@redhat.com>
> ---
>  fs/nfsd/netns.h       |  3 +++
>  fs/nfsd/nfs4recover.c |  5 +++++
>  fs/nfsd/nfs4state.c   | 52 +++++++++++++++++++++++++++++++++++++++++++
>  fs/nfsd/nfsctl.c      |  1 +
>  4 files changed, 61 insertions(+)
> 
> diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
> index 32cb8c027483..b42aaa22fba2 100644
> --- a/fs/nfsd/netns.h
> +++ b/fs/nfsd/netns.h
> @@ -104,6 +104,9 @@ struct nfsd_net {
>  	time_t nfsd4_grace;
>  	bool somebody_reclaimed;
>  
> +	bool track_reclaim_completes;
> +	atomic_t nr_reclaim_complete;
> +
>  	bool nfsd_net_up;
>  	bool lockd_up;
>  
> diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
> index 89c2a27956d0..ae74814b2397 100644
> --- a/fs/nfsd/nfs4recover.c
> +++ b/fs/nfsd/nfs4recover.c
> @@ -1251,6 +1251,7 @@ nfsd4_cld_grace_done(struct nfsd_net *nn)
>  	free_cld_upcall(cup);
>  out_err:
>  	nfs4_release_reclaim(nn);
> +	atomic_set(&nn->nr_reclaim_complete, 0);
>  	if (ret)
>  		printk(KERN_ERR "NFSD: Unable to end grace period: %d\n", ret);
>  }
> @@ -1270,6 +1271,8 @@ nfs4_cld_state_init(struct net *net)
>  	for (i = 0; i < CLIENT_HASH_SIZE; i++)
>  		INIT_LIST_HEAD(&nn->reclaim_str_hashtbl[i]);
>  	nn->reclaim_str_hashtbl_size = 0;
> +	nn->track_reclaim_completes = true;
> +	atomic_set(&nn->nr_reclaim_complete, 0);
>  
>  	return 0;
>  }
> @@ -1279,6 +1282,7 @@ nfs4_cld_state_shutdown(struct net *net)
>  {
>  	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
>  
> +	nn->track_reclaim_completes = false;
>  	kfree(nn->reclaim_str_hashtbl);
>  }
>  
> @@ -1318,6 +1322,7 @@ nfsd4_cld_tracking_exit(struct net *net)
>  	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
>  
>  	nfs4_release_reclaim(nn);
> +	atomic_set(&nn->nr_reclaim_complete, 0);
>  	nfsd4_remove_cld_pipe(net);
>  	nfs4_cld_state_shutdown(net);
>  }
> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> index 777fbb0d2761..5dfc3cd51d08 100644
> --- a/fs/nfsd/nfs4state.c
> +++ b/fs/nfsd/nfs4state.c
> @@ -77,6 +77,7 @@ static u64 current_sessionid = 1;
>  /* forward declarations */
>  static bool check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner);
>  static void nfs4_free_ol_stateid(struct nfs4_stid *stid);
> +void nfsd4_end_grace(struct nfsd_net *nn);
>  
>  /* Locking: */
>  
> @@ -1988,10 +1989,41 @@ destroy_client(struct nfs4_client *clp)
>  	__destroy_client(clp);
>  }
>  
> +static void inc_reclaim_complete(struct nfs4_client *clp)
> +{
> +	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
> +
> +	if (!nn->track_reclaim_completes)
> +		return;
> +	if (!test_bit(NFSD4_CLIENT_RECLAIM_COMPLETE, &clp->cl_flags))
> +		return;

Looking at the code....  This test will never fail, will it?  We could
make it a WARN_ON_ONCE(!test_bit(...)) but it doesn't look like a likely
bug to me.

> +	if (!nfsd4_find_reclaim_client(clp->cl_name, nn))
> +		return;
> +	if (atomic_inc_return(&nn->nr_reclaim_complete) ==
> +			nn->reclaim_str_hashtbl_size) {
> +		printk(KERN_INFO "NFSD: all clients done reclaiming, ending NFSv4 grace period (net %x)\n",
> +				clp->net->ns.inum);
> +		nfsd4_end_grace(nn);
> +	}
> +}
> +
> +static void dec_reclaim_complete(struct nfs4_client *clp)
> +{
> +	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
> +
> +	if (!nn->track_reclaim_completes)
> +		return;
> +	if (!test_bit(NFSD4_CLIENT_RECLAIM_COMPLETE, &clp->cl_flags))
> +		return;
> +	if (nfsd4_find_reclaim_client(clp->cl_name, nn))
> +		atomic_dec(&nn->nr_reclaim_complete);
> +}
> +
>  static void expire_client(struct nfs4_client *clp)
>  {
>  	unhash_client(clp);
>  	nfsd4_client_record_remove(clp);
> +	dec_reclaim_complete(clp);
>  	__destroy_client(clp);
>  }

This doesn't look right to me.  If a client reclaims and then
immediately calls DESTROY_CLIENTID or something--that should still count
as a reclaim, and that shouldn't prevent us from ending the grace period
early.

I think dec_reclaim_complete is unnecessary.

--b.

>  
> @@ -3339,6 +3371,7 @@ nfsd4_reclaim_complete(struct svc_rqst *rqstp,
>  
>  	status = nfs_ok;
>  	nfsd4_client_record_create(cstate->session->se_client);
> +	inc_reclaim_complete(cstate->session->se_client);
>  out:
>  	return status;
>  }
> @@ -4734,6 +4767,10 @@ static bool clients_still_reclaiming(struct nfsd_net *nn)
>  	unsigned long double_grace_period_end = nn->boot_time +
>  						2 * nn->nfsd4_lease;
>  
> +	if (nn->track_reclaim_completes &&
> +			atomic_read(&nn->nr_reclaim_complete) ==
> +			nn->reclaim_str_hashtbl_size)
> +		return false;
>  	if (!nn->somebody_reclaimed)
>  		return false;
>  	nn->somebody_reclaimed = false;
> @@ -7253,10 +7290,25 @@ nfs4_state_start_net(struct net *net)
>  		return ret;
>  	locks_start_grace(net, &nn->nfsd4_manager);
>  	nfsd4_client_tracking_init(net);
> +	if (nn->track_reclaim_completes && nn->reclaim_str_hashtbl_size == 0)
> +		goto skip_grace;
>  	printk(KERN_INFO "NFSD: starting %ld-second grace period (net %x)\n",
>  	       nn->nfsd4_grace, net->ns.inum);
>  	queue_delayed_work(laundry_wq, &nn->laundromat_work, nn->nfsd4_grace * HZ);
>  	return 0;
> +
> +skip_grace:
> +	printk(KERN_INFO "NFSD: no clients to reclaim, skipping NFSv4 grace period (net %x)\n",
> +			net->ns.inum);
> +	queue_delayed_work(laundry_wq, &nn->laundromat_work, nn->nfsd4_lease * HZ);
> +	/*
> +	 * we could call nfsd4_end_grace() here, but it has a dprintk()
> +	 * that would be confusing if debug logging is enabled
> +	 */
> +	nn->grace_ended = true;
> +	nfsd4_record_grace_done(nn);
> +	locks_end_grace(&nn->nfsd4_manager);
> +	return 0;
>  }
>  
>  /* initialization to perform when the nfsd service is started: */
> diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
> index 6384c9b94898..950ac6683be9 100644
> --- a/fs/nfsd/nfsctl.c
> +++ b/fs/nfsd/nfsctl.c
> @@ -1240,6 +1240,7 @@ static __net_init int nfsd_init_net(struct net *net)
>  	nn->nfsd4_lease = 45;	/* default lease time */
>  	nn->nfsd4_grace = 45;
>  	nn->somebody_reclaimed = false;
> +	nn->track_reclaim_completes = false;
>  	nn->clverifier_counter = prandom_u32();
>  	nn->clientid_counter = prandom_u32();
>  	nn->s2s_cp_cl_id = nn->clientid_counter++;
> -- 
> 2.17.1
Scott Mayhew Dec. 19, 2018, 9:57 p.m. UTC | #4
On Wed, 19 Dec 2018, J. Bruce Fields wrote:

> On Tue, Dec 18, 2018 at 09:29:26AM -0500, Scott Mayhew wrote:
> > +skip_grace:
> > +	printk(KERN_INFO "NFSD: no clients to reclaim, skipping NFSv4 grace period (net %x)\n",
> > +			net->ns.inum);
> > +	queue_delayed_work(laundry_wq, &nn->laundromat_work, nn->nfsd4_lease * HZ);
> > +	/*
> > +	 * we could call nfsd4_end_grace() here, but it has a dprintk()
> > +	 * that would be confusing if debug logging is enabled
> > +	 */
> 
> In that case, I'd rather pull the dprintk out of nfsd4_end_grace into
> its only other caller (nfs4_laundromat), instead of duplicating this
> stuff.

Okay, will do.

-Scott
> 
> > +	nn->grace_ended = true;
> > +	nfsd4_record_grace_done(nn);
> > +	locks_end_grace(&nn->nfsd4_manager);
> 
> (Yes, it's only three lines, but it's a little subtle, I'd rather have
> it all in one place.)
> 
> --b.
> 
> > +	return 0;
> >  }
> >  
> >  /* initialization to perform when the nfsd service is started: */
> > diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
> > index 6384c9b94898..950ac6683be9 100644
> > --- a/fs/nfsd/nfsctl.c
> > +++ b/fs/nfsd/nfsctl.c
> > @@ -1240,6 +1240,7 @@ static __net_init int nfsd_init_net(struct net *net)
> >  	nn->nfsd4_lease = 45;	/* default lease time */
> >  	nn->nfsd4_grace = 45;
> >  	nn->somebody_reclaimed = false;
> > +	nn->track_reclaim_completes = false;
> >  	nn->clverifier_counter = prandom_u32();
> >  	nn->clientid_counter = prandom_u32();
> >  	nn->s2s_cp_cl_id = nn->clientid_counter++;
> > -- 
> > 2.17.1
Scott Mayhew Dec. 19, 2018, 10:01 p.m. UTC | #5
On Wed, 19 Dec 2018, J. Bruce Fields wrote:

> On Tue, Dec 18, 2018 at 09:29:26AM -0500, Scott Mayhew wrote:
> > diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
> > index 89c2a27956d0..ae74814b2397 100644
> > --- a/fs/nfsd/nfs4recover.c
> > +++ b/fs/nfsd/nfs4recover.c
> > @@ -1251,6 +1251,7 @@ nfsd4_cld_grace_done(struct nfsd_net *nn)
> >  	free_cld_upcall(cup);
> >  out_err:
> >  	nfs4_release_reclaim(nn);
> > +	atomic_set(&nn->nr_reclaim_complete, 0);
> >  	if (ret)
> >  		printk(KERN_ERR "NFSD: Unable to end grace period: %d\n", ret);
> >  }
> > @@ -1270,6 +1271,8 @@ nfs4_cld_state_init(struct net *net)
> >  	for (i = 0; i < CLIENT_HASH_SIZE; i++)
> >  		INIT_LIST_HEAD(&nn->reclaim_str_hashtbl[i]);
> >  	nn->reclaim_str_hashtbl_size = 0;
> > +	nn->track_reclaim_completes = true;
> > +	atomic_set(&nn->nr_reclaim_complete, 0);
> >  
> >  	return 0;
> >  }
> ...
> > @@ -1318,6 +1322,7 @@ nfsd4_cld_tracking_exit(struct net *net)
> >  	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
> >  
> >  	nfs4_release_reclaim(nn);
> > +	atomic_set(&nn->nr_reclaim_complete, 0);
> >  	nfsd4_remove_cld_pipe(net);
> >  	nfs4_cld_state_shutdown(net);
> >  }
> 
> We're initializing nr_reclaim_complete in 3 different places, probably
> only one of those is really necessary?

Yes, only the one in nfs4_cld_state_init() is really necessary.  If the
ability to put a running server into grace is ever added, then I think
the counter would need to be reset there too.

-Scott
> 
> --b.
Scott Mayhew Dec. 19, 2018, 10:05 p.m. UTC | #6
On Wed, 19 Dec 2018, J. Bruce Fields wrote:

> On Tue, Dec 18, 2018 at 09:29:26AM -0500, Scott Mayhew wrote:
> > When using nfsdcld for NFSv4 client tracking, track the number of
> > RECLAIM_COMPLETE operations we receive from "known" clients to help in
> > deciding if we can lift the grace period early (or whether we need to
> > start a v4 grace period at all).
> > 
> > Signed-off-by: Scott Mayhew <smayhew@redhat.com>
> > ---
> >  fs/nfsd/netns.h       |  3 +++
> >  fs/nfsd/nfs4recover.c |  5 +++++
> >  fs/nfsd/nfs4state.c   | 52 +++++++++++++++++++++++++++++++++++++++++++
> >  fs/nfsd/nfsctl.c      |  1 +
> >  4 files changed, 61 insertions(+)
> > 
> > diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
> > index 32cb8c027483..b42aaa22fba2 100644
> > --- a/fs/nfsd/netns.h
> > +++ b/fs/nfsd/netns.h
> > @@ -104,6 +104,9 @@ struct nfsd_net {
> >  	time_t nfsd4_grace;
> >  	bool somebody_reclaimed;
> >  
> > +	bool track_reclaim_completes;
> > +	atomic_t nr_reclaim_complete;
> > +
> >  	bool nfsd_net_up;
> >  	bool lockd_up;
> >  
> > diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
> > index 89c2a27956d0..ae74814b2397 100644
> > --- a/fs/nfsd/nfs4recover.c
> > +++ b/fs/nfsd/nfs4recover.c
> > @@ -1251,6 +1251,7 @@ nfsd4_cld_grace_done(struct nfsd_net *nn)
> >  	free_cld_upcall(cup);
> >  out_err:
> >  	nfs4_release_reclaim(nn);
> > +	atomic_set(&nn->nr_reclaim_complete, 0);
> >  	if (ret)
> >  		printk(KERN_ERR "NFSD: Unable to end grace period: %d\n", ret);
> >  }
> > @@ -1270,6 +1271,8 @@ nfs4_cld_state_init(struct net *net)
> >  	for (i = 0; i < CLIENT_HASH_SIZE; i++)
> >  		INIT_LIST_HEAD(&nn->reclaim_str_hashtbl[i]);
> >  	nn->reclaim_str_hashtbl_size = 0;
> > +	nn->track_reclaim_completes = true;
> > +	atomic_set(&nn->nr_reclaim_complete, 0);
> >  
> >  	return 0;
> >  }
> > @@ -1279,6 +1282,7 @@ nfs4_cld_state_shutdown(struct net *net)
> >  {
> >  	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
> >  
> > +	nn->track_reclaim_completes = false;
> >  	kfree(nn->reclaim_str_hashtbl);
> >  }
> >  
> > @@ -1318,6 +1322,7 @@ nfsd4_cld_tracking_exit(struct net *net)
> >  	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
> >  
> >  	nfs4_release_reclaim(nn);
> > +	atomic_set(&nn->nr_reclaim_complete, 0);
> >  	nfsd4_remove_cld_pipe(net);
> >  	nfs4_cld_state_shutdown(net);
> >  }
> > diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> > index 777fbb0d2761..5dfc3cd51d08 100644
> > --- a/fs/nfsd/nfs4state.c
> > +++ b/fs/nfsd/nfs4state.c
> > @@ -77,6 +77,7 @@ static u64 current_sessionid = 1;
> >  /* forward declarations */
> >  static bool check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner);
> >  static void nfs4_free_ol_stateid(struct nfs4_stid *stid);
> > +void nfsd4_end_grace(struct nfsd_net *nn);
> >  
> >  /* Locking: */
> >  
> > @@ -1988,10 +1989,41 @@ destroy_client(struct nfs4_client *clp)
> >  	__destroy_client(clp);
> >  }
> >  
> > +static void inc_reclaim_complete(struct nfs4_client *clp)
> > +{
> > +	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
> > +
> > +	if (!nn->track_reclaim_completes)
> > +		return;
> > +	if (!test_bit(NFSD4_CLIENT_RECLAIM_COMPLETE, &clp->cl_flags))
> > +		return;
> 
> Looking at the code....  This test will never fail, will it?  We could
> make it a WARN_ON_ONCE(!test_bit(...)) but it doesn't look like a likely
> bug to me.

No, that test should never fail... I think I was being overly cautious.

> 
> > +	if (!nfsd4_find_reclaim_client(clp->cl_name, nn))
> > +		return;
> > +	if (atomic_inc_return(&nn->nr_reclaim_complete) ==
> > +			nn->reclaim_str_hashtbl_size) {
> > +		printk(KERN_INFO "NFSD: all clients done reclaiming, ending NFSv4 grace period (net %x)\n",
> > +				clp->net->ns.inum);
> > +		nfsd4_end_grace(nn);
> > +	}
> > +}
> > +
> > +static void dec_reclaim_complete(struct nfs4_client *clp)
> > +{
> > +	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
> > +
> > +	if (!nn->track_reclaim_completes)
> > +		return;
> > +	if (!test_bit(NFSD4_CLIENT_RECLAIM_COMPLETE, &clp->cl_flags))
> > +		return;
> > +	if (nfsd4_find_reclaim_client(clp->cl_name, nn))
> > +		atomic_dec(&nn->nr_reclaim_complete);
> > +}
> > +
> >  static void expire_client(struct nfs4_client *clp)
> >  {
> >  	unhash_client(clp);
> >  	nfsd4_client_record_remove(clp);
> > +	dec_reclaim_complete(clp);
> >  	__destroy_client(clp);
> >  }
> 
> This doesn't look right to me.  If a client reclaims and then
> immediately calls DESTROY_CLIENTID or something--that should still count
> as a reclaim, and that shouldn't prevent us from ending the grace period
> early.
> 
> I think dec_reclaim_complete is unnecessary.

What if a client sends a RECLAIM_COMPLETE, then reboots and sends an
EXCHANGE_ID, CREATE_SESSION, and RECLAIM_COMPLETE while the server is
still in grace?  The count would be too high then and the server could
exit grace before all the clients have reclaimed.  I actually added
that at Jeff's suggestion because he was seeing it with nfs-ganesha.  

-Scott
> 
> --b.
> 
> >  
> > @@ -3339,6 +3371,7 @@ nfsd4_reclaim_complete(struct svc_rqst *rqstp,
> >  
> >  	status = nfs_ok;
> >  	nfsd4_client_record_create(cstate->session->se_client);
> > +	inc_reclaim_complete(cstate->session->se_client);
> >  out:
> >  	return status;
> >  }
> > @@ -4734,6 +4767,10 @@ static bool clients_still_reclaiming(struct nfsd_net *nn)
> >  	unsigned long double_grace_period_end = nn->boot_time +
> >  						2 * nn->nfsd4_lease;
> >  
> > +	if (nn->track_reclaim_completes &&
> > +			atomic_read(&nn->nr_reclaim_complete) ==
> > +			nn->reclaim_str_hashtbl_size)
> > +		return false;
> >  	if (!nn->somebody_reclaimed)
> >  		return false;
> >  	nn->somebody_reclaimed = false;
> > @@ -7253,10 +7290,25 @@ nfs4_state_start_net(struct net *net)
> >  		return ret;
> >  	locks_start_grace(net, &nn->nfsd4_manager);
> >  	nfsd4_client_tracking_init(net);
> > +	if (nn->track_reclaim_completes && nn->reclaim_str_hashtbl_size == 0)
> > +		goto skip_grace;
> >  	printk(KERN_INFO "NFSD: starting %ld-second grace period (net %x)\n",
> >  	       nn->nfsd4_grace, net->ns.inum);
> >  	queue_delayed_work(laundry_wq, &nn->laundromat_work, nn->nfsd4_grace * HZ);
> >  	return 0;
> > +
> > +skip_grace:
> > +	printk(KERN_INFO "NFSD: no clients to reclaim, skipping NFSv4 grace period (net %x)\n",
> > +			net->ns.inum);
> > +	queue_delayed_work(laundry_wq, &nn->laundromat_work, nn->nfsd4_lease * HZ);
> > +	/*
> > +	 * we could call nfsd4_end_grace() here, but it has a dprintk()
> > +	 * that would be confusing if debug logging is enabled
> > +	 */
> > +	nn->grace_ended = true;
> > +	nfsd4_record_grace_done(nn);
> > +	locks_end_grace(&nn->nfsd4_manager);
> > +	return 0;
> >  }
> >  
> >  /* initialization to perform when the nfsd service is started: */
> > diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
> > index 6384c9b94898..950ac6683be9 100644
> > --- a/fs/nfsd/nfsctl.c
> > +++ b/fs/nfsd/nfsctl.c
> > @@ -1240,6 +1240,7 @@ static __net_init int nfsd_init_net(struct net *net)
> >  	nn->nfsd4_lease = 45;	/* default lease time */
> >  	nn->nfsd4_grace = 45;
> >  	nn->somebody_reclaimed = false;
> > +	nn->track_reclaim_completes = false;
> >  	nn->clverifier_counter = prandom_u32();
> >  	nn->clientid_counter = prandom_u32();
> >  	nn->s2s_cp_cl_id = nn->clientid_counter++;
> > -- 
> > 2.17.1
J. Bruce Fields Dec. 19, 2018, 10:21 p.m. UTC | #7
On Wed, Dec 19, 2018 at 05:05:45PM -0500, Scott Mayhew wrote:
> On Wed, 19 Dec 2018, J. Bruce Fields wrote:
> 
> > On Tue, Dec 18, 2018 at 09:29:26AM -0500, Scott Mayhew wrote:
> > > +	if (!nfsd4_find_reclaim_client(clp->cl_name, nn))
> > > +		return;
> > > +	if (atomic_inc_return(&nn->nr_reclaim_complete) ==
> > > +			nn->reclaim_str_hashtbl_size) {
> > > +		printk(KERN_INFO "NFSD: all clients done reclaiming, ending NFSv4 grace period (net %x)\n",
> > > +				clp->net->ns.inum);
> > > +		nfsd4_end_grace(nn);
> > > +	}
> > > +}
> > > +
> > > +static void dec_reclaim_complete(struct nfs4_client *clp)
> > > +{
> > > +	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
> > > +
> > > +	if (!nn->track_reclaim_completes)
> > > +		return;
> > > +	if (!test_bit(NFSD4_CLIENT_RECLAIM_COMPLETE, &clp->cl_flags))
> > > +		return;
> > > +	if (nfsd4_find_reclaim_client(clp->cl_name, nn))
> > > +		atomic_dec(&nn->nr_reclaim_complete);
> > > +}
> > > +
> > >  static void expire_client(struct nfs4_client *clp)
> > >  {
> > >  	unhash_client(clp);
> > >  	nfsd4_client_record_remove(clp);
> > > +	dec_reclaim_complete(clp);
> > >  	__destroy_client(clp);
> > >  }
> > 
> > This doesn't look right to me.  If a client reclaims and then
> > immediately calls DESTROY_CLIENTID or something--that should still count
> > as a reclaim, and that shouldn't prevent us from ending the grace period
> > early.
> > 
> > I think dec_reclaim_complete is unnecessary.
> 
> What if a client sends a RECLAIM_COMPLETE, then reboots and sends an
> EXCHANGE_ID, CREATE_SESSION, and RECLAIM_COMPLETE while the server is
> still in grace?  The count would be too high then and the server could
> exit grace before all the clients have reclaimed.  I actually added
> that at Jeff's suggestion because he was seeing it with nfs-ganesha.  

Oh boy.

(Thinks.)

Once it issues a DESTROY_CLIENTID or an EXCHANGE_ID that removes the
previous client instance's state, it's got no locks to reclaim any more.
(It can't have gotten any *new* ones, since we're still in the grace
period.)

It's effectively a brand new client.  Only reclaiming clients should
bump that counter.

We certainly shouldn't be waiting for it to RECLAIM_COMPLETE to end the
grace period, that client just doesn't matter any more.

I think?

--b.
J. Bruce Fields Dec. 19, 2018, 10:43 p.m. UTC | #8
On Wed, Dec 19, 2018 at 05:21:47PM -0500, J. Bruce Fields wrote:
> On Wed, Dec 19, 2018 at 05:05:45PM -0500, Scott Mayhew wrote:
> > What if a client sends a RECLAIM_COMPLETE, then reboots and sends an
> > EXCHANGE_ID, CREATE_SESSION, and RECLAIM_COMPLETE while the server is
> > still in grace?  The count would be too high then and the server could
> > exit grace before all the clients have reclaimed.  I actually added
> > that at Jeff's suggestion because he was seeing it with nfs-ganesha.  
> 
> Oh boy.
> 
> (Thinks.)
> 
> Once it issues a DESTROY_CLIENTID or an EXCHANGE_ID that removes the
> previous client instance's state, it's got no locks to reclaim any more.
> (It can't have gotten any *new* ones, since we're still in the grace
> period.)
> 
> It's effectively a brand new client.  Only reclaiming clients should
> bump that counter.
> 
> We certainly shouldn't be waiting for it to RECLAIM_COMPLETE to end the
> grace period, that client just doesn't matter any more.

Actually, once the client's destroyed, it shouldn't matter whether the
previous incarnation of the client reclaimed or not.  It's never going
to reclaim now....  So expire_client should probably just be removing
the client from the table of reclaimable clients at the same time that
it removes its stable storage record.

--b.
Scott Mayhew Dec. 20, 2018, 4:36 p.m. UTC | #9
On Wed, 19 Dec 2018, J. Bruce Fields wrote:

> On Wed, Dec 19, 2018 at 05:21:47PM -0500, J. Bruce Fields wrote:
> > On Wed, Dec 19, 2018 at 05:05:45PM -0500, Scott Mayhew wrote:
> > > What if a client sends a RECLAIM_COMPLETE, then reboots and sends an
> > > EXCHANGE_ID, CREATE_SESSION, and RECLAIM_COMPLETE while the server is
> > > still in grace?  The count would be too high then and the server could
> > > exit grace before all the clients have reclaimed.  I actually added
> > > that at Jeff's suggestion because he was seeing it with nfs-ganesha.  
> > 
> > Oh boy.
> > 
> > (Thinks.)
> > 
> > Once it issues a DESTROY_CLIENTID or an EXCHANGE_ID that removes the
> > previous client instance's state, it's got no locks to reclaim any more.
> > (It can't have gotten any *new* ones, since we're still in the grace
> > period.)
> > 
> > It's effectively a brand new client.  Only reclaiming clients should
> > bump that counter.
> > 
> > We certainly shouldn't be waiting for it to RECLAIM_COMPLETE to end the
> > grace period, that client just doesn't matter any more.
> 
> Actually, once the client's destroyed, it shouldn't matter whether the
> previous incarnation of the client reclaimed or not.  It's never going
> to reclaim now....  So expire_client should probably just be removing
> the client from the table of reclaimable clients at the same time that
> it removes its stable storage record.

Okay, come to think of it, if we're in grace then nfsdcld should be
removing the client record from both the current and recovery epoch's db
tables too...
> 
> --b.
Jeff Layton Dec. 20, 2018, 5:29 p.m. UTC | #10
On Wed, 2018-12-19 at 17:21 -0500, J. Bruce Fields wrote:
> On Wed, Dec 19, 2018 at 05:05:45PM -0500, Scott Mayhew wrote:
> > On Wed, 19 Dec 2018, J. Bruce Fields wrote:
> > 
> > > On Tue, Dec 18, 2018 at 09:29:26AM -0500, Scott Mayhew wrote:
> > > > +	if (!nfsd4_find_reclaim_client(clp->cl_name, nn))
> > > > +		return;
> > > > +	if (atomic_inc_return(&nn->nr_reclaim_complete) ==
> > > > +			nn->reclaim_str_hashtbl_size) {
> > > > +		printk(KERN_INFO "NFSD: all clients done reclaiming, ending NFSv4 grace period (net %x)\n",
> > > > +				clp->net->ns.inum);
> > > > +		nfsd4_end_grace(nn);
> > > > +	}
> > > > +}
> > > > +
> > > > +static void dec_reclaim_complete(struct nfs4_client *clp)
> > > > +{
> > > > +	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
> > > > +
> > > > +	if (!nn->track_reclaim_completes)
> > > > +		return;
> > > > +	if (!test_bit(NFSD4_CLIENT_RECLAIM_COMPLETE, &clp->cl_flags))
> > > > +		return;
> > > > +	if (nfsd4_find_reclaim_client(clp->cl_name, nn))
> > > > +		atomic_dec(&nn->nr_reclaim_complete);
> > > > +}
> > > > +
> > > >  static void expire_client(struct nfs4_client *clp)
> > > >  {
> > > >  	unhash_client(clp);
> > > >  	nfsd4_client_record_remove(clp);
> > > > +	dec_reclaim_complete(clp);
> > > >  	__destroy_client(clp);
> > > >  }
> > > 
> > > This doesn't look right to me.  If a client reclaims and then
> > > immediately calls DESTROY_CLIENTID or something--that should still count
> > > as a reclaim, and that shouldn't prevent us from ending the grace period
> > > early.
> > > 
> > > I think dec_reclaim_complete is unnecessary.
> > 
> > What if a client sends a RECLAIM_COMPLETE, then reboots and sends an
> > EXCHANGE_ID, CREATE_SESSION, and RECLAIM_COMPLETE while the server is
> > still in grace?  The count would be too high then and the server could
> > exit grace before all the clients have reclaimed.  I actually added
> > that at Jeff's suggestion because he was seeing it with nfs-ganesha.  
> 
> Oh boy.
> 
> (Thinks.)
> 
> Once it issues a DESTROY_CLIENTID or an EXCHANGE_ID that removes the
> previous client instance's state, it's got no locks to reclaim any more.
> (It can't have gotten any *new* ones, since we're still in the grace
> period.)
> 
> It's effectively a brand new client.  Only reclaiming clients should
> bump that counter.
> 
> We certainly shouldn't be waiting for it to RECLAIM_COMPLETE to end the
> grace period, that client just doesn't matter any more.
> 
> I think?
> 

That wasn't my thinking here.

Suppose we have a client that holds some locks. Server reboots and we do
EXCHANGE_ID and start reclaiming, and eventually send a
RECLAIM_COMPLETE.

Now, there is a network partition and we lose contact with the server
for more than a lease period. The client record gets tossed out. Client
eventually reestablishes the connection before the grace period ends and
attempts to reclaim.

That reclaim should succeed, IMO, as there is no reason that it
shouldn't. Nothing can have claimed competing state since we're still in
the grace period.

The thing you don't want to do here is to double count the
RECLAIM_COMPLETE for this client. So decrementing the counter when you
tear down a client is reasonable.
Jeff Layton Dec. 20, 2018, 5:32 p.m. UTC | #11
On Thu, 2018-12-20 at 11:36 -0500, Scott Mayhew wrote:
> On Wed, 19 Dec 2018, J. Bruce Fields wrote:
> 
> > On Wed, Dec 19, 2018 at 05:21:47PM -0500, J. Bruce Fields wrote:
> > > On Wed, Dec 19, 2018 at 05:05:45PM -0500, Scott Mayhew wrote:
> > > > What if a client sends a RECLAIM_COMPLETE, then reboots and sends an
> > > > EXCHANGE_ID, CREATE_SESSION, and RECLAIM_COMPLETE while the server is
> > > > still in grace?  The count would be too high then and the server could
> > > > exit grace before all the clients have reclaimed.  I actually added
> > > > that at Jeff's suggestion because he was seeing it with nfs-ganesha.  
> > > 
> > > Oh boy.
> > > 
> > > (Thinks.)
> > > 
> > > Once it issues a DESTROY_CLIENTID or an EXCHANGE_ID that removes the
> > > previous client instance's state, it's got no locks to reclaim any more.
> > > (It can't have gotten any *new* ones, since we're still in the grace
> > > period.)
> > > 
> > > It's effectively a brand new client.  Only reclaiming clients should
> > > bump that counter.
> > > 
> > > We certainly shouldn't be waiting for it to RECLAIM_COMPLETE to end the
> > > grace period, that client just doesn't matter any more.
> > 
> > Actually, once the client's destroyed, it shouldn't matter whether the
> > previous incarnation of the client reclaimed or not.  It's never going
> > to reclaim now....  So expire_client should probably just be removing
> > the client from the table of reclaimable clients at the same time that
> > it removes its stable storage record.
> 
> Okay, come to think of it, if we're in grace then nfsdcld should be
> removing the client record from both the current and recovery epoch's db
> tables too...

No!

The recovery table is a record of who held state at the end of the prior
epoch. That does not change just because the client went away during the
current epoch.

You should only ever alter the table for the epoch that you are in. Once
the grace period is lifted, you can delete the tables for any prior
epoch.
J. Bruce Fields Dec. 20, 2018, 6:05 p.m. UTC | #12
On Thu, Dec 20, 2018 at 12:29:43PM -0500, Jeff Layton wrote:
> That wasn't my thinking here.
> 
> Suppose we have a client that holds some locks. Server reboots and we do
> EXCHANGE_ID and start reclaiming, and eventually send a
> RECLAIM_COMPLETE.
> 
> Now, there is a network partition and we lose contact with the server
> for more than a lease period. The client record gets tossed out. Client
> eventually reestablishes the connection before the grace period ends and
> attempts to reclaim.
> 
> That reclaim should succeed, IMO, as there is no reason that it
> shouldn't. Nothing can have claimed competing state since we're still in
> the grace period.

That scenario requires a grace period longer than the lease period,
which isn't impossible but sounds rare?  I guess you're thinking in the
cluster case about the possibility of a second node failure extending
the grace period.

Still, that's different from the case where the client explicitly
destroys its own state.  That could happen in less than a lease period
and in that case there won't be a reclaim.  I think that case could
happen if a client rebooted quickly or maybe just unmounted.

Hm.

--b.
Jeff Layton Dec. 20, 2018, 6:26 p.m. UTC | #13
On Thu, 2018-12-20 at 13:05 -0500, J. Bruce Fields wrote:
> On Thu, Dec 20, 2018 at 12:29:43PM -0500, Jeff Layton wrote:
> > That wasn't my thinking here.
> > 
> > Suppose we have a client that holds some locks. Server reboots and we do
> > EXCHANGE_ID and start reclaiming, and eventually send a
> > RECLAIM_COMPLETE.
> > 
> > Now, there is a network partition and we lose contact with the server
> > for more than a lease period. The client record gets tossed out. Client
> > eventually reestablishes the connection before the grace period ends and
> > attempts to reclaim.
> > 
> > That reclaim should succeed, IMO, as there is no reason that it
> > shouldn't. Nothing can have claimed competing state since we're still in
> > the grace period.
> 
> That scenario requires a grace period longer than the lease period,
> which isn't impossible but sounds rare?  I guess you're thinking in the
> cluster case about the possibility of a second node failure extending
> the grace period.
> 

Isn't our grace period twice the lease period by default? I think we do
have to assume that it may take an entire lease period before the
client notices that the server has rebooted. If grace period == lease
period then you aren't leaving much time for reclaim to occur.

> Still, that's different from the case where the client explicitly
> destroys its own state.  That could happen in less than a lease period
> and in that case there won't be a reclaim.  I think that case could
> happen if a client rebooted quickly or maybe just unmounted.
> 
> Hm.
> 

True. You're right that we don't want to delay lifting the grace period
because we're waiting for clients that have unmounted and aren't coming
back. Unfortunately, it's difficult to distinguish the two cases. Could
we just decrement the counter when we're tearing down a clientid
because of lease expiration and not on DESTROY_CLIENT?
J. Bruce Fields Dec. 20, 2018, 7:02 p.m. UTC | #14
On Thu, Dec 20, 2018 at 01:26:34PM -0500, Jeff Layton wrote:
> On Thu, 2018-12-20 at 13:05 -0500, J. Bruce Fields wrote:
> > On Thu, Dec 20, 2018 at 12:29:43PM -0500, Jeff Layton wrote:
> > > That wasn't my thinking here.
> > > 
> > > Suppose we have a client that holds some locks. Server reboots and we do
> > > EXCHANGE_ID and start reclaiming, and eventually send a
> > > RECLAIM_COMPLETE.
> > > 
> > > Now, there is a network partition and we lose contact with the server
> > > for more than a lease period. The client record gets tossed out. Client
> > > eventually reestablishes the connection before the grace period ends and
> > > attempts to reclaim.
> > > 
> > > That reclaim should succeed, IMO, as there is no reason that it
> > > shouldn't. Nothing can have claimed competing state since we're still in
> > > the grace period.
> > 
> > That scenario requires a grace period longer than the lease period,
> > which isn't impossible but sounds rare?  I guess you're thinking in the
> > cluster case about the possibility of a second node failure extending
> > the grace period.
> 
> Isn't our grace period twice the lease period by default?

Reminding myself.... Upstream now it will end the grace period after one
grace period, but will extend it up to two grace periods if someone has
reclaimed in the last second.

> I think we do
> have to assume that it may take an entire lease period before the
> client notices that the server has rebooted. If grace period == lease
> period then you aren't leaving much time for reclaim to occur.

My assumption is that it's mainly the client's responsibility to allow
enough time, by renewing its lease somewhat more frequently than once
per lease period.

That may be wrong--there's some support for that assumption in
https://tools.ietf.org/html/rfc7530#section-9.5, but that's talking only
about network delays, not about allowing additional time for the
recovery.

> > Still, that's different from the case where the client explicitly
> > destroys its own state.  That could happen in less than a lease period
> > and in that case there won't be a reclaim.  I think that case could
> > happen if a client rebooted quickly or maybe just unmounted.
> > 
> > Hm.
> > 
> 
> True. You're right that we don't want to delay lifting the grace period
> because we're waiting for clients that have unmounted and aren't coming
> back. Unfortunately, it's difficult to distinguish the two cases. Could
> we just decrement the counter when we're tearing down a clientid
> because of lease expiration and not on DESTROY_CLIENT?

Right, either DESTROY_CLIENTID or (in the 4.0 case) a
SETCLIENTID_CONFIRM.  So those two cases wouldn't be difficult to treat
differently.  OK, maybe that's the best choice.

--b.
diff mbox series

Patch

diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
index 32cb8c027483..b42aaa22fba2 100644
--- a/fs/nfsd/netns.h
+++ b/fs/nfsd/netns.h
@@ -104,6 +104,9 @@  struct nfsd_net {
 	time_t nfsd4_grace;
 	bool somebody_reclaimed;
 
+	bool track_reclaim_completes;
+	atomic_t nr_reclaim_complete;
+
 	bool nfsd_net_up;
 	bool lockd_up;
 
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 89c2a27956d0..ae74814b2397 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -1251,6 +1251,7 @@  nfsd4_cld_grace_done(struct nfsd_net *nn)
 	free_cld_upcall(cup);
 out_err:
 	nfs4_release_reclaim(nn);
+	atomic_set(&nn->nr_reclaim_complete, 0);
 	if (ret)
 		printk(KERN_ERR "NFSD: Unable to end grace period: %d\n", ret);
 }
@@ -1270,6 +1271,8 @@  nfs4_cld_state_init(struct net *net)
 	for (i = 0; i < CLIENT_HASH_SIZE; i++)
 		INIT_LIST_HEAD(&nn->reclaim_str_hashtbl[i]);
 	nn->reclaim_str_hashtbl_size = 0;
+	nn->track_reclaim_completes = true;
+	atomic_set(&nn->nr_reclaim_complete, 0);
 
 	return 0;
 }
@@ -1279,6 +1282,7 @@  nfs4_cld_state_shutdown(struct net *net)
 {
 	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
 
+	nn->track_reclaim_completes = false;
 	kfree(nn->reclaim_str_hashtbl);
 }
 
@@ -1318,6 +1322,7 @@  nfsd4_cld_tracking_exit(struct net *net)
 	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
 
 	nfs4_release_reclaim(nn);
+	atomic_set(&nn->nr_reclaim_complete, 0);
 	nfsd4_remove_cld_pipe(net);
 	nfs4_cld_state_shutdown(net);
 }
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 777fbb0d2761..5dfc3cd51d08 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -77,6 +77,7 @@  static u64 current_sessionid = 1;
 /* forward declarations */
 static bool check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner);
 static void nfs4_free_ol_stateid(struct nfs4_stid *stid);
+void nfsd4_end_grace(struct nfsd_net *nn);
 
 /* Locking: */
 
@@ -1988,10 +1989,41 @@  destroy_client(struct nfs4_client *clp)
 	__destroy_client(clp);
 }
 
+static void inc_reclaim_complete(struct nfs4_client *clp)
+{
+	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+
+	if (!nn->track_reclaim_completes)
+		return;
+	if (!test_bit(NFSD4_CLIENT_RECLAIM_COMPLETE, &clp->cl_flags))
+		return;
+	if (!nfsd4_find_reclaim_client(clp->cl_name, nn))
+		return;
+	if (atomic_inc_return(&nn->nr_reclaim_complete) ==
+			nn->reclaim_str_hashtbl_size) {
+		printk(KERN_INFO "NFSD: all clients done reclaiming, ending NFSv4 grace period (net %x)\n",
+				clp->net->ns.inum);
+		nfsd4_end_grace(nn);
+	}
+}
+
+static void dec_reclaim_complete(struct nfs4_client *clp)
+{
+	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+
+	if (!nn->track_reclaim_completes)
+		return;
+	if (!test_bit(NFSD4_CLIENT_RECLAIM_COMPLETE, &clp->cl_flags))
+		return;
+	if (nfsd4_find_reclaim_client(clp->cl_name, nn))
+		atomic_dec(&nn->nr_reclaim_complete);
+}
+
 static void expire_client(struct nfs4_client *clp)
 {
 	unhash_client(clp);
 	nfsd4_client_record_remove(clp);
+	dec_reclaim_complete(clp);
 	__destroy_client(clp);
 }
 
@@ -3339,6 +3371,7 @@  nfsd4_reclaim_complete(struct svc_rqst *rqstp,
 
 	status = nfs_ok;
 	nfsd4_client_record_create(cstate->session->se_client);
+	inc_reclaim_complete(cstate->session->se_client);
 out:
 	return status;
 }
@@ -4734,6 +4767,10 @@  static bool clients_still_reclaiming(struct nfsd_net *nn)
 	unsigned long double_grace_period_end = nn->boot_time +
 						2 * nn->nfsd4_lease;
 
+	if (nn->track_reclaim_completes &&
+			atomic_read(&nn->nr_reclaim_complete) ==
+			nn->reclaim_str_hashtbl_size)
+		return false;
 	if (!nn->somebody_reclaimed)
 		return false;
 	nn->somebody_reclaimed = false;
@@ -7253,10 +7290,25 @@  nfs4_state_start_net(struct net *net)
 		return ret;
 	locks_start_grace(net, &nn->nfsd4_manager);
 	nfsd4_client_tracking_init(net);
+	if (nn->track_reclaim_completes && nn->reclaim_str_hashtbl_size == 0)
+		goto skip_grace;
 	printk(KERN_INFO "NFSD: starting %ld-second grace period (net %x)\n",
 	       nn->nfsd4_grace, net->ns.inum);
 	queue_delayed_work(laundry_wq, &nn->laundromat_work, nn->nfsd4_grace * HZ);
 	return 0;
+
+skip_grace:
+	printk(KERN_INFO "NFSD: no clients to reclaim, skipping NFSv4 grace period (net %x)\n",
+			net->ns.inum);
+	queue_delayed_work(laundry_wq, &nn->laundromat_work, nn->nfsd4_lease * HZ);
+	/*
+	 * we could call nfsd4_end_grace() here, but it has a dprintk()
+	 * that would be confusing if debug logging is enabled
+	 */
+	nn->grace_ended = true;
+	nfsd4_record_grace_done(nn);
+	locks_end_grace(&nn->nfsd4_manager);
+	return 0;
 }
 
 /* initialization to perform when the nfsd service is started: */
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 6384c9b94898..950ac6683be9 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -1240,6 +1240,7 @@  static __net_init int nfsd_init_net(struct net *net)
 	nn->nfsd4_lease = 45;	/* default lease time */
 	nn->nfsd4_grace = 45;
 	nn->somebody_reclaimed = false;
+	nn->track_reclaim_completes = false;
 	nn->clverifier_counter = prandom_u32();
 	nn->clientid_counter = prandom_u32();
 	nn->s2s_cp_cl_id = nn->clientid_counter++;