diff mbox series

[v4,2/2] NFSD: add shrinker to reap courtesy clients on low memory condition

Message ID 1661896113-8013-3-git-send-email-dai.ngo@oracle.com (mailing list archive)
State New, archived
Headers show
Series NFSD: memory shrinker for NFSv4 clients | expand

Commit Message

Dai Ngo Aug. 30, 2022, 9:48 p.m. UTC
Add the courtesy client shrinker to react to low memory condition
triggered by the memory shrinker.

On the shrinker's count callback, we increment a callback counter
and return the number of outstanding courtesy clients. When the
laundromat runs, it checks if this counter is not zero and starts
reaping old courtesy clients. The maximum number of clients to be
reaped is limited to NFSD_CIENT_MAX_TRIM_PER_RUN (128). This limit
is to prevent the laundromat from spending too much time reaping
the clients and not processing other tasks in a timely manner.

The laundromat is rescheduled to run sooner if it detects low
low memory condition and there are more clients to reap.

On the shrinker's scan callback, we return the number of clients
That were reaped since the last scan callback. We can not reap
the clients on the scan callback context since destroying the
client might require call into the underlying filesystem or other
subsystems which might allocate memory which can cause deadlock.

Signed-off-by: Dai Ngo <dai.ngo@oracle.com>
---
 fs/nfsd/netns.h     |  3 +++
 fs/nfsd/nfs4state.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++----
 fs/nfsd/nfsctl.c    |  6 ++++--
 fs/nfsd/nfsd.h      |  9 +++++++--
 4 files changed, 62 insertions(+), 8 deletions(-)

Comments

Chuck Lever III Aug. 31, 2022, 2:30 p.m. UTC | #1
> On Aug 30, 2022, at 5:48 PM, Dai Ngo <dai.ngo@oracle.com> wrote:
> 
> Add the courtesy client shrinker to react to low memory condition
> triggered by the memory shrinker.
> 
> On the shrinker's count callback, we increment a callback counter
> and return the number of outstanding courtesy clients. When the
> laundromat runs, it checks if this counter is not zero and starts
> reaping old courtesy clients. The maximum number of clients to be
> reaped is limited to NFSD_CIENT_MAX_TRIM_PER_RUN (128). This limit
> is to prevent the laundromat from spending too much time reaping
> the clients and not processing other tasks in a timely manner.
> 
> The laundromat is rescheduled to run sooner if it detects low
> low memory condition and there are more clients to reap.
> 
> On the shrinker's scan callback, we return the number of clients
> That were reaped since the last scan callback. We can not reap
> the clients on the scan callback context since destroying the
> client might require call into the underlying filesystem or other
> subsystems which might allocate memory which can cause deadlock.
> 
> Signed-off-by: Dai Ngo <dai.ngo@oracle.com>
> ---
> fs/nfsd/netns.h     |  3 +++
> fs/nfsd/nfs4state.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++----
> fs/nfsd/nfsctl.c    |  6 ++++--
> fs/nfsd/nfsd.h      |  9 +++++++--
> 4 files changed, 62 insertions(+), 8 deletions(-)
> 
> diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
> index 2695dff1378a..2a604951623f 100644
> --- a/fs/nfsd/netns.h
> +++ b/fs/nfsd/netns.h
> @@ -194,6 +194,9 @@ struct nfsd_net {
> 	int			nfs4_max_clients;
> 
> 	atomic_t		nfsd_courtesy_client_count;
> +	atomic_t		nfsd_client_shrinker_cb_count;
> +	atomic_t		nfsd_client_shrinker_reapcount;
> +	struct shrinker		nfsd_client_shrinker;
> };
> 
> /* Simple check to find out if a given net was properly initialized */
> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> index eaf7b4dcea33..9aed9eed1892 100644
> --- a/fs/nfsd/nfs4state.c
> +++ b/fs/nfsd/nfs4state.c
> @@ -4343,7 +4343,40 @@ nfsd4_init_slabs(void)
> 	return -ENOMEM;
> }
> 
> -void nfsd4_init_leases_net(struct nfsd_net *nn)
> +static unsigned long
> +nfsd_courtesy_client_count(struct shrinker *shrink, struct shrink_control *sc)

I find it confusing to have a function and a variable with exactly
the same name, especially if they are related. Maybe the variable
name can be nfsd_courtesy_client_num ?


> +{l
> +	struct nfsd_net *nn = container_of(shrink,
> +			struct nfsd_net, nfsd_client_shrinker);
> +
> +	atomic_inc(&nn->nfsd_client_shrinker_cb_count);
> +	mod_delayed_work(laundry_wq, &nn->laundromat_work, 0);
> +	return (unsigned long)atomic_read(&nn->nfsd_courtesy_client_count);
> +}
> +
> +static unsigned long
> +nfsd_courtesy_client_scan(struct shrinker *shrink, struct shrink_control *sc)
> +{
> +	struct nfsd_net *nn = container_of(shrink,
> +			struct nfsd_net, nfsd_client_shrinker);
> +	unsigned long cnt;
> +
> +	cnt = atomic_read(&nn->nfsd_client_shrinker_reapcount);
> +	atomic_set(&nn->nfsd_client_shrinker_reapcount, 0);
> +	return cnt;
> +}
> +
> +static int
> +nfsd_register_client_shrinker(struct nfsd_net *nn)
> +{
> +	nn->nfsd_client_shrinker.scan_objects = nfsd_courtesy_client_scan;
> +	nn->nfsd_client_shrinker.count_objects = nfsd_courtesy_client_count;
> +	nn->nfsd_client_shrinker.seeks = DEFAULT_SEEKS;
> +	return register_shrinker(&nn->nfsd_client_shrinker, "nfsd-client");
> +}

Nit: Move this into nfsd4_init_leases_net(). I don't see added
value for having a one-time use helper for this code.


> +
> +int
> +nfsd4_init_leases_net(struct nfsd_net *nn)
> {
> 	struct sysinfo si;
> 	u64 max_clients;
> @@ -4364,6 +4397,8 @@ void nfsd4_init_leases_net(struct nfsd_net *nn)
> 	nn->nfs4_max_clients = max_t(int, max_clients, NFS4_CLIENTS_PER_GB);
> 
> 	atomic_set(&nn->nfsd_courtesy_client_count, 0);
> +	atomic_set(&nn->nfsd_client_shrinker_cb_count, 0);
> +	return nfsd_register_client_shrinker(nn);
> }
> 
> static void init_nfs4_replay(struct nfs4_replay *rp)
> @@ -5872,12 +5907,17 @@ static void
> nfs4_get_client_reaplist(struct nfsd_net *nn, struct list_head *reaplist,
> 				struct laundry_time *lt)
> {
> -	unsigned int maxreap, reapcnt = 0;
> +	unsigned int maxreap = 0, reapcnt = 0;
> +	int cb_cnt;

Nit: Reverse christmas tree, please. cb_cnt goes at the end
of the variable definitions.


> 	struct list_head *pos, *next;
> 	struct nfs4_client *clp;
> 
> -	maxreap = (atomic_read(&nn->nfs4_client_count) >= nn->nfs4_max_clients) ?
> -			NFSD_CLIENT_MAX_TRIM_PER_RUN : 0;
> +	cb_cnt = atomic_read(&nn->nfsd_client_shrinker_cb_count);
> +	if (atomic_read(&nn->nfs4_client_count) >= nn->nfs4_max_clients ||
> +							cb_cnt) {
> +		maxreap = NFSD_CLIENT_MAX_TRIM_PER_RUN;
> +		atomic_set(&nn->nfsd_client_shrinker_cb_count, 0);
> +	}

I'm not terribly happy with this, but I don't have a better suggestion
at the moment. Let me think about it.


> 	INIT_LIST_HEAD(reaplist);
> 	spin_lock(&nn->client_lock);
> 	list_for_each_safe(pos, next, &nn->client_lru) {
> @@ -5903,6 +5943,8 @@ nfs4_get_client_reaplist(struct nfsd_net *nn, struct list_head *reaplist,
> 		}
> 	}
> 	spin_unlock(&nn->client_lock);
> +	if (cb_cnt)
> +		atomic_add(reapcnt, &nn->nfsd_client_shrinker_reapcount);
> }
> 
> static time64_t
> @@ -5943,6 +5985,8 @@ nfs4_laundromat(struct nfsd_net *nn)
> 		list_del_init(&clp->cl_lru);
> 		expire_client(clp);
> 	}
> +	if (atomic_read(&nn->nfsd_client_shrinker_cb_count) > 0)
> +		lt.new_timeo = NFSD_LAUNDROMAT_MINTIMEOUT;
> 	spin_lock(&state_lock);
> 	list_for_each_safe(pos, next, &nn->del_recall_lru) {
> 		dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
> diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
> index 917fa1892fd2..597a26ad4183 100644
> --- a/fs/nfsd/nfsctl.c
> +++ b/fs/nfsd/nfsctl.c
> @@ -1481,11 +1481,12 @@ static __net_init int nfsd_init_net(struct net *net)
> 		goto out_idmap_error;
> 	nn->nfsd_versions = NULL;
> 	nn->nfsd4_minorversions = NULL;
> +	retval = nfsd4_init_leases_net(nn);
> +	if (retval)
> +		goto out_drc_error;
> 	retval = nfsd_reply_cache_init(nn);
> 	if (retval)
> 		goto out_drc_error;
> -	nfsd4_init_leases_net(nn);
> -
> 	get_random_bytes(&nn->siphash_key, sizeof(nn->siphash_key));
> 	seqlock_init(&nn->writeverf_lock);
> 
> @@ -1507,6 +1508,7 @@ static __net_exit void nfsd_exit_net(struct net *net)
> 	nfsd_idmap_shutdown(net);
> 	nfsd_export_shutdown(net);
> 	nfsd_netns_free_versions(net_generic(net, nfsd_net_id));
> +	nfsd4_leases_net_shutdown(nn);
> }
> 
> static struct pernet_operations nfsd_net_ops = {
> diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
> index 57a468ed85c3..7e05ab7a3532 100644
> --- a/fs/nfsd/nfsd.h
> +++ b/fs/nfsd/nfsd.h
> @@ -498,7 +498,11 @@ extern void unregister_cld_notifier(void);
> extern void nfsd4_ssc_init_umount_work(struct nfsd_net *nn);
> #endif
> 
> -extern void nfsd4_init_leases_net(struct nfsd_net *nn);
> +extern int nfsd4_init_leases_net(struct nfsd_net *nn);
> +static inline void nfsd4_leases_net_shutdown(struct nfsd_net *nn)
> +{
> +	unregister_shrinker(&nn->nfsd_client_shrinker);
> +};

Nit: please move this into nfs4state.c next to nfsd4_init_leases_net().

static inline is used typically for performance-sensitive helpers, and
this adds a dependency on unregister_shrinker in every file that includes
nfsd.h.


> #else /* CONFIG_NFSD_V4 */
> static inline int nfsd4_is_junction(struct dentry *dentry)
> @@ -506,7 +510,8 @@ static inline int nfsd4_is_junction(struct dentry *dentry)
> 	return 0;
> }
> 
> -static inline void nfsd4_init_leases_net(struct nfsd_net *nn) {};
> +static inline int nfsd4_init_leases_net(struct nfsd_net *nn) { return 0; };
> +static inline void nfsd4_leases_net_shutdown(struct nfsd_net *nn) { };
> 
> #define register_cld_notifier() 0
> #define unregister_cld_notifier() do { } while(0)
> -- 
> 2.9.5
> 

--
Chuck Lever
Dai Ngo Sept. 2, 2022, 1:56 a.m. UTC | #2
Hi Chuck,

On 8/31/22 7:30 AM, Chuck Lever III wrote:
>> 	struct list_head *pos, *next;
>> 	struct nfs4_client *clp;
>>
>> -	maxreap = (atomic_read(&nn->nfs4_client_count) >= nn->nfs4_max_clients) ?
>> -			NFSD_CLIENT_MAX_TRIM_PER_RUN : 0;
>> +	cb_cnt = atomic_read(&nn->nfsd_client_shrinker_cb_count);
>> +	if (atomic_read(&nn->nfs4_client_count) >= nn->nfs4_max_clients ||
>> +							cb_cnt) {
>> +		maxreap = NFSD_CLIENT_MAX_TRIM_PER_RUN;
>> +		atomic_set(&nn->nfsd_client_shrinker_cb_count, 0);
>> +	}
> I'm not terribly happy with this, but I don't have a better suggestion
> at the moment. Let me think about it.

Do you have any suggestion to improve this, I want to incorporate it
before sending out v5?

Thanks,
-Dai
Chuck Lever III Sept. 2, 2022, 4:32 a.m. UTC | #3
> On Sep 1, 2022, at 9:56 PM, Dai Ngo <dai.ngo@oracle.com> wrote:
> 
> Hi Chuck,
> 
> On 8/31/22 7:30 AM, Chuck Lever III wrote:
>>> 	struct list_head *pos, *next;
>>> 	struct nfs4_client *clp;
>>> 
>>> -	maxreap = (atomic_read(&nn->nfs4_client_count) >= nn->nfs4_max_clients) ?
>>> -			NFSD_CLIENT_MAX_TRIM_PER_RUN : 0;
>>> +	cb_cnt = atomic_read(&nn->nfsd_client_shrinker_cb_count);
>>> +	if (atomic_read(&nn->nfs4_client_count) >= nn->nfs4_max_clients ||
>>> +							cb_cnt) {
>>> +		maxreap = NFSD_CLIENT_MAX_TRIM_PER_RUN;
>>> +		atomic_set(&nn->nfsd_client_shrinker_cb_count, 0);
>>> +	}
>> I'm not terribly happy with this, but I don't have a better suggestion
>> at the moment. Let me think about it.
> 
> Do you have any suggestion to improve this, I want to incorporate it
> before sending out v5?

Let's consider some broad outlines...

With regards to parametrizing the reaplist behavior, you want
a normal laundromat run to reap zero or more courtesy clients.
You want a shrinker laundromat run to reap more than zero. I
think you want a minreap variable as well as a maxreap variable
in there to control how the reaplist is built. Making @minreap
a function parameter rather than a global atomic would be a
plus for me, but maybe that's not practical.

But I would prefer a more straightforward approach overall. The
proposed approach seems tricky and brittle, and needs a lot of
explaining to understand. Other subsystems seem to get away with
something simpler.

Can nfsd_courtesy_client_count() simply reduce
nn->nfs4_max_clients, kick the laundromat, and then return 0?
Then get rid of nfsd_courtesy_client_scan().

Or, nfsd_courtesy_client_count() could return
nfsd_couresy_client_count. nfsd_courtesy_client_scan() could
then look something like this:

	if ((sc->gfp_mask & GFP_KERNEL) != GFP_KERNEL)
		return SHRINK_STOP;

	nfsd_get_client_reaplist(nn, reaplist, lt);
	list_for_each_safe(pos, next, &reaplist) {
		clp = list_entry(pos, struct nfs4_client, cl_lru);
		trace_nfsd_clid_purged(&clp->cl_clientid);
		list_del_init(&clp->cl_lru);
		expire_client(clp);
		count++;
	}
	return count;

Obviously you would need to refactor common code into helper
functions.

--
Chuck Lever
Dai Ngo Sept. 2, 2022, 4:44 p.m. UTC | #4
On 9/1/22 9:32 PM, Chuck Lever III wrote:
>
>> On Sep 1, 2022, at 9:56 PM, Dai Ngo <dai.ngo@oracle.com> wrote:
>>
>> Hi Chuck,
>>
>> On 8/31/22 7:30 AM, Chuck Lever III wrote:
>>>> 	struct list_head *pos, *next;
>>>> 	struct nfs4_client *clp;
>>>>
>>>> -	maxreap = (atomic_read(&nn->nfs4_client_count) >= nn->nfs4_max_clients) ?
>>>> -			NFSD_CLIENT_MAX_TRIM_PER_RUN : 0;
>>>> +	cb_cnt = atomic_read(&nn->nfsd_client_shrinker_cb_count);
>>>> +	if (atomic_read(&nn->nfs4_client_count) >= nn->nfs4_max_clients ||
>>>> +							cb_cnt) {
>>>> +		maxreap = NFSD_CLIENT_MAX_TRIM_PER_RUN;
>>>> +		atomic_set(&nn->nfsd_client_shrinker_cb_count, 0);
>>>> +	}
>>> I'm not terribly happy with this, but I don't have a better suggestion
>>> at the moment. Let me think about it.
>> Do you have any suggestion to improve this, I want to incorporate it
>> before sending out v5?
> Let's consider some broad outlines...
>
> With regards to parametrizing the reaplist behavior, you want
> a normal laundromat run to reap zero or more courtesy clients.
> You want a shrinker laundromat run to reap more than zero. I
> think you want a minreap variable as well as a maxreap variable
> in there to control how the reaplist is built. Making @minreap
> a function parameter rather than a global atomic would be a
> plus for me, but maybe that's not practical.

I'm not quite understand how the minreap is used, I think it
will make the code more complex.

>
> But I would prefer a more straightforward approach overall. The
> proposed approach seems tricky and brittle, and needs a lot of
> explaining to understand. Other subsystems seem to get away with
> something simpler.
>
> Can nfsd_courtesy_client_count() simply reduce
> nn->nfs4_max_clients, kick the laundromat, and then return 0?
> Then get rid of nfsd_courtesy_client_scan().

I need to think more about this approach. However at first glance,
nn->nfs4_max_clients is used to control how many clients, including
active and courtesy clients, are allowed in the system. If we lower
this count, it also prevent new clients from connecting to the
system. So now the shrinker mechanism does more than just getting
rid of unused resources, maybe that's ok?

>
> Or, nfsd_courtesy_client_count() could return
> nfsd_couresy_client_count. nfsd_courtesy_client_scan() could
> then look something like this:
>
> 	if ((sc->gfp_mask & GFP_KERNEL) != GFP_KERNEL)
> 		return SHRINK_STOP;
>
> 	nfsd_get_client_reaplist(nn, reaplist, lt);
> 	list_for_each_safe(pos, next, &reaplist) {
> 		clp = list_entry(pos, struct nfs4_client, cl_lru);
> 		trace_nfsd_clid_purged(&clp->cl_clientid);
> 		list_del_init(&clp->cl_lru);
> 		expire_client(clp);
> 		count++;
> 	}
> 	return count;

This does not work, we cannot expire clients on the context of
scan callback due to deadlock problem.

I will experiment with ways to get rid of the scan function to
make the logic simpler.

Thanks,
-Dai

>
> Obviously you would need to refactor common code into helper
> functions.
>
> --
> Chuck Lever
>
Chuck Lever III Sept. 2, 2022, 5:58 p.m. UTC | #5
> On Sep 2, 2022, at 12:44 PM, Dai Ngo <dai.ngo@oracle.com> wrote:
> 
> On 9/1/22 9:32 PM, Chuck Lever III wrote:
>> 
>>> On Sep 1, 2022, at 9:56 PM, Dai Ngo <dai.ngo@oracle.com> wrote:
>>> 
>>> Hi Chuck,
>>> 
>>> On 8/31/22 7:30 AM, Chuck Lever III wrote:
>>>>> 	struct list_head *pos, *next;
>>>>> 	struct nfs4_client *clp;
>>>>> 
>>>>> -	maxreap = (atomic_read(&nn->nfs4_client_count) >= nn->nfs4_max_clients) ?
>>>>> -			NFSD_CLIENT_MAX_TRIM_PER_RUN : 0;
>>>>> +	cb_cnt = atomic_read(&nn->nfsd_client_shrinker_cb_count);
>>>>> +	if (atomic_read(&nn->nfs4_client_count) >= nn->nfs4_max_clients ||
>>>>> +							cb_cnt) {
>>>>> +		maxreap = NFSD_CLIENT_MAX_TRIM_PER_RUN;
>>>>> +		atomic_set(&nn->nfsd_client_shrinker_cb_count, 0);
>>>>> +	}
>>>> I'm not terribly happy with this, but I don't have a better suggestion
>>>> at the moment. Let me think about it.
>>> Do you have any suggestion to improve this, I want to incorporate it
>>> before sending out v5?
>> Let's consider some broad outlines...
>> 
>> With regards to parametrizing the reaplist behavior, you want
>> a normal laundromat run to reap zero or more courtesy clients.
>> You want a shrinker laundromat run to reap more than zero. I
>> think you want a minreap variable as well as a maxreap variable
>> in there to control how the reaplist is built. Making @minreap
>> a function parameter rather than a global atomic would be a
>> plus for me, but maybe that's not practical.
> 
> I'm not quite understand how the minreap is used, I think it
> will make the code more complex.
> 
>> 
>> But I would prefer a more straightforward approach overall. The
>> proposed approach seems tricky and brittle, and needs a lot of
>> explaining to understand. Other subsystems seem to get away with
>> something simpler.
>> 
>> Can nfsd_courtesy_client_count() simply reduce
>> nn->nfs4_max_clients, kick the laundromat, and then return 0?
>> Then get rid of nfsd_courtesy_client_scan().
> 
> I need to think more about this approach. However at first glance,
> nn->nfs4_max_clients is used to control how many clients, including
> active and courtesy clients, are allowed in the system. If we lower
> this count, it also prevent new clients from connecting to the
> system. So now the shrinker mechanism does more than just getting
> rid of unused resources, maybe that's ok?

I don't want to go down the path of kicking out active clients if
we don't need to. Maybe you can make the laundromat code more
careful to make a distinction between active and courtesy clients.

It might sense in the long run to separate the laundromat's
processing of courtesy and active clients anyway.


>> Or, nfsd_courtesy_client_count() could return
>> nfsd_couresy_client_count. nfsd_courtesy_client_scan() could
>> then look something like this:
>> 
>> 	if ((sc->gfp_mask & GFP_KERNEL) != GFP_KERNEL)
>> 		return SHRINK_STOP;
>> 
>> 	nfsd_get_client_reaplist(nn, reaplist, lt);
>> 	list_for_each_safe(pos, next, &reaplist) {
>> 		clp = list_entry(pos, struct nfs4_client, cl_lru);
>> 		trace_nfsd_clid_purged(&clp->cl_clientid);
>> 		list_del_init(&clp->cl_lru);
>> 		expire_client(clp);
>> 		count++;
>> 	}
>> 	return count;
> 
> This does not work, we cannot expire clients on the context of
> scan callback due to deadlock problem.

Correct, we don't want to start shrinker laundromat activity if
the allocation request specified that it cannot wait. But are
you sure it doesn't work if sc_gfp_flags is set to GFP_KERNEL?


> I will experiment with ways to get rid of the scan function to
> make the logic simpler.


--
Chuck Lever
Dai Ngo Sept. 2, 2022, 7:34 p.m. UTC | #6
On 9/2/22 10:58 AM, Chuck Lever III wrote:
>>> Or, nfsd_courtesy_client_count() could return
>>> nfsd_couresy_client_count. nfsd_courtesy_client_scan() could
>>> then look something like this:
>>>
>>> 	if ((sc->gfp_mask & GFP_KERNEL) != GFP_KERNEL)
>>> 		return SHRINK_STOP;
>>>
>>> 	nfsd_get_client_reaplist(nn, reaplist, lt);
>>> 	list_for_each_safe(pos, next, &reaplist) {
>>> 		clp = list_entry(pos, struct nfs4_client, cl_lru);
>>> 		trace_nfsd_clid_purged(&clp->cl_clientid);
>>> 		list_del_init(&clp->cl_lru);
>>> 		expire_client(clp);
>>> 		count++;
>>> 	}
>>> 	return count;
>> This does not work, we cannot expire clients on the context of
>> scan callback due to deadlock problem.
> Correct, we don't want to start shrinker laundromat activity if
> the allocation request specified that it cannot wait. But are
> you sure it doesn't work if sc_gfp_flags is set to GFP_KERNEL?

It can be deadlock due to lock ordering problem:

======================================================
  WARNING: possible circular locking dependency detected
  5.19.0-rc2_sk+ #1 Not tainted
  ------------------------------------------------------
  lck/31847 is trying to acquire lock:
  ffff88811d268850 (&sb->s_type->i_mutex_key#16){+.+.}-{3:3}, at: btrfs_inode_lock+0x38/0x70
  #012but task is already holding lock:
  ffffffffb41848c0 (fs_reclaim){+.+.}-{0:0}, at: __alloc_pages_slowpath.constprop.0+0x506/0x1db0
  #012which lock already depends on the new lock.
  #012the existing dependency chain (in reverse order) is:

  #012-> #1 (fs_reclaim){+.+.}-{0:0}:
        fs_reclaim_acquire+0xc0/0x100
        __kmalloc+0x51/0x320
        btrfs_buffered_write+0x2eb/0xd90
        btrfs_do_write_iter+0x6bf/0x11c0
        do_iter_readv_writev+0x2bb/0x5a0
        do_iter_write+0x131/0x630
        nfsd_vfs_write+0x4da/0x1900 [nfsd]
        nfsd4_write+0x2ac/0x760 [nfsd]
        nfsd4_proc_compound+0xce8/0x23e0 [nfsd]
        nfsd_dispatch+0x4ed/0xc10 [nfsd]
        svc_process_common+0xd3f/0x1b00 [sunrpc]
        svc_process+0x361/0x4f0 [sunrpc]
        nfsd+0x2d6/0x570 [nfsd]
        kthread+0x2a1/0x340
        ret_from_fork+0x22/0x30

  #012-> #0 (&sb->s_type->i_mutex_key#16){+.+.}-{3:3}:
        __lock_acquire+0x318d/0x7830
        lock_acquire+0x1bb/0x500
        down_write+0x82/0x130
        btrfs_inode_lock+0x38/0x70
        btrfs_sync_file+0x280/0x1010
        nfsd_file_flush.isra.0+0x1b/0x220 [nfsd]
        nfsd_file_put+0xd4/0x110 [nfsd]
        release_all_access+0x13a/0x220 [nfsd]
        nfs4_free_ol_stateid+0x40/0x90 [nfsd]
        free_ol_stateid_reaplist+0x131/0x210 [nfsd]
        release_openowner+0xf7/0x160 [nfsd]
        __destroy_client+0x3cc/0x740 [nfsd]
        nfsd_cc_lru_scan+0x271/0x410 [nfsd]
        shrink_slab.constprop.0+0x31e/0x7d0
        shrink_node+0x54b/0xe50
        try_to_free_pages+0x394/0xba0
        __alloc_pages_slowpath.constprop.0+0x5d2/0x1db0
        __alloc_pages+0x4d6/0x580
        __handle_mm_fault+0xc25/0x2810
        handle_mm_fault+0x136/0x480
        do_user_addr_fault+0x3d8/0xec0
        exc_page_fault+0x5d/0xc0
        asm_exc_page_fault+0x27/0x30

-Dai
Chuck Lever III Sept. 3, 2022, 1:26 a.m. UTC | #7
> On Sep 2, 2022, at 3:34 PM, Dai Ngo <dai.ngo@oracle.com> wrote:
> 
> On 9/2/22 10:58 AM, Chuck Lever III wrote:
>>>> Or, nfsd_courtesy_client_count() could return
>>>> nfsd_couresy_client_count. nfsd_courtesy_client_scan() could
>>>> then look something like this:
>>>> 
>>>> 	if ((sc->gfp_mask & GFP_KERNEL) != GFP_KERNEL)
>>>> 		return SHRINK_STOP;
>>>> 
>>>> 	nfsd_get_client_reaplist(nn, reaplist, lt);
>>>> 	list_for_each_safe(pos, next, &reaplist) {
>>>> 		clp = list_entry(pos, struct nfs4_client, cl_lru);
>>>> 		trace_nfsd_clid_purged(&clp->cl_clientid);
>>>> 		list_del_init(&clp->cl_lru);
>>>> 		expire_client(clp);
>>>> 		count++;
>>>> 	}
>>>> 	return count;
>>> This does not work, we cannot expire clients on the context of
>>> scan callback due to deadlock problem.
>> Correct, we don't want to start shrinker laundromat activity if
>> the allocation request specified that it cannot wait. But are
>> you sure it doesn't work if sc_gfp_flags is set to GFP_KERNEL?
> 
> It can be deadlock due to lock ordering problem:
> 
> ======================================================
> WARNING: possible circular locking dependency detected
> 5.19.0-rc2_sk+ #1 Not tainted
> ------------------------------------------------------
> lck/31847 is trying to acquire lock:
> ffff88811d268850 (&sb->s_type->i_mutex_key#16){+.+.}-{3:3}, at: btrfs_inode_lock+0x38/0x70
> #012but task is already holding lock:
> ffffffffb41848c0 (fs_reclaim){+.+.}-{0:0}, at: __alloc_pages_slowpath.constprop.0+0x506/0x1db0
> #012which lock already depends on the new lock.
> #012the existing dependency chain (in reverse order) is:
> 
> #012-> #1 (fs_reclaim){+.+.}-{0:0}:
>       fs_reclaim_acquire+0xc0/0x100
>       __kmalloc+0x51/0x320
>       btrfs_buffered_write+0x2eb/0xd90
>       btrfs_do_write_iter+0x6bf/0x11c0
>       do_iter_readv_writev+0x2bb/0x5a0
>       do_iter_write+0x131/0x630
>       nfsd_vfs_write+0x4da/0x1900 [nfsd]
>       nfsd4_write+0x2ac/0x760 [nfsd]
>       nfsd4_proc_compound+0xce8/0x23e0 [nfsd]
>       nfsd_dispatch+0x4ed/0xc10 [nfsd]
>       svc_process_common+0xd3f/0x1b00 [sunrpc]
>       svc_process+0x361/0x4f0 [sunrpc]
>       nfsd+0x2d6/0x570 [nfsd]
>       kthread+0x2a1/0x340
>       ret_from_fork+0x22/0x30
> 
> #012-> #0 (&sb->s_type->i_mutex_key#16){+.+.}-{3:3}:
>       __lock_acquire+0x318d/0x7830
>       lock_acquire+0x1bb/0x500
>       down_write+0x82/0x130
>       btrfs_inode_lock+0x38/0x70
>       btrfs_sync_file+0x280/0x1010
>       nfsd_file_flush.isra.0+0x1b/0x220 [nfsd]
>       nfsd_file_put+0xd4/0x110 [nfsd]
>       release_all_access+0x13a/0x220 [nfsd]
>       nfs4_free_ol_stateid+0x40/0x90 [nfsd]
>       free_ol_stateid_reaplist+0x131/0x210 [nfsd]
>       release_openowner+0xf7/0x160 [nfsd]
>       __destroy_client+0x3cc/0x740 [nfsd]
>       nfsd_cc_lru_scan+0x271/0x410 [nfsd]
>       shrink_slab.constprop.0+0x31e/0x7d0
>       shrink_node+0x54b/0xe50
>       try_to_free_pages+0x394/0xba0
>       __alloc_pages_slowpath.constprop.0+0x5d2/0x1db0
>       __alloc_pages+0x4d6/0x580
>       __handle_mm_fault+0xc25/0x2810
>       handle_mm_fault+0x136/0x480
>       do_user_addr_fault+0x3d8/0xec0
>       exc_page_fault+0x5d/0xc0
>       asm_exc_page_fault+0x27/0x30

Is this deadlock possible only via the filecache close
paths? I can't think of a reason the laundromat has to
wait for each file to be flushed and closed -- the
laundromat should be able to "put" these files and allow
the filecache LRU to flush and close in the background.


--
Chuck Lever
Dai Ngo Sept. 3, 2022, 5:03 p.m. UTC | #8
On 9/2/22 6:26 PM, Chuck Lever III wrote:
>> On Sep 2, 2022, at 3:34 PM, Dai Ngo <dai.ngo@oracle.com> wrote:
>>
>> On 9/2/22 10:58 AM, Chuck Lever III wrote:
>>>>> Or, nfsd_courtesy_client_count() could return
>>>>> nfsd_couresy_client_count. nfsd_courtesy_client_scan() could
>>>>> then look something like this:
>>>>>
>>>>> 	if ((sc->gfp_mask & GFP_KERNEL) != GFP_KERNEL)
>>>>> 		return SHRINK_STOP;
>>>>>
>>>>> 	nfsd_get_client_reaplist(nn, reaplist, lt);
>>>>> 	list_for_each_safe(pos, next, &reaplist) {
>>>>> 		clp = list_entry(pos, struct nfs4_client, cl_lru);
>>>>> 		trace_nfsd_clid_purged(&clp->cl_clientid);
>>>>> 		list_del_init(&clp->cl_lru);
>>>>> 		expire_client(clp);
>>>>> 		count++;
>>>>> 	}
>>>>> 	return count;
>>>> This does not work, we cannot expire clients on the context of
>>>> scan callback due to deadlock problem.
>>> Correct, we don't want to start shrinker laundromat activity if
>>> the allocation request specified that it cannot wait. But are
>>> you sure it doesn't work if sc_gfp_flags is set to GFP_KERNEL?
>> It can be deadlock due to lock ordering problem:
>>
>> ======================================================
>> WARNING: possible circular locking dependency detected
>> 5.19.0-rc2_sk+ #1 Not tainted
>> ------------------------------------------------------
>> lck/31847 is trying to acquire lock:
>> ffff88811d268850 (&sb->s_type->i_mutex_key#16){+.+.}-{3:3}, at: btrfs_inode_lock+0x38/0x70
>> #012but task is already holding lock:
>> ffffffffb41848c0 (fs_reclaim){+.+.}-{0:0}, at: __alloc_pages_slowpath.constprop.0+0x506/0x1db0
>> #012which lock already depends on the new lock.
>> #012the existing dependency chain (in reverse order) is:
>>
>> #012-> #1 (fs_reclaim){+.+.}-{0:0}:
>>        fs_reclaim_acquire+0xc0/0x100
>>        __kmalloc+0x51/0x320
>>        btrfs_buffered_write+0x2eb/0xd90
>>        btrfs_do_write_iter+0x6bf/0x11c0
>>        do_iter_readv_writev+0x2bb/0x5a0
>>        do_iter_write+0x131/0x630
>>        nfsd_vfs_write+0x4da/0x1900 [nfsd]
>>        nfsd4_write+0x2ac/0x760 [nfsd]
>>        nfsd4_proc_compound+0xce8/0x23e0 [nfsd]
>>        nfsd_dispatch+0x4ed/0xc10 [nfsd]
>>        svc_process_common+0xd3f/0x1b00 [sunrpc]
>>        svc_process+0x361/0x4f0 [sunrpc]
>>        nfsd+0x2d6/0x570 [nfsd]
>>        kthread+0x2a1/0x340
>>        ret_from_fork+0x22/0x30
>>
>> #012-> #0 (&sb->s_type->i_mutex_key#16){+.+.}-{3:3}:
>>        __lock_acquire+0x318d/0x7830
>>        lock_acquire+0x1bb/0x500
>>        down_write+0x82/0x130
>>        btrfs_inode_lock+0x38/0x70
>>        btrfs_sync_file+0x280/0x1010
>>        nfsd_file_flush.isra.0+0x1b/0x220 [nfsd]
>>        nfsd_file_put+0xd4/0x110 [nfsd]
>>        release_all_access+0x13a/0x220 [nfsd]
>>        nfs4_free_ol_stateid+0x40/0x90 [nfsd]
>>        free_ol_stateid_reaplist+0x131/0x210 [nfsd]
>>        release_openowner+0xf7/0x160 [nfsd]
>>        __destroy_client+0x3cc/0x740 [nfsd]
>>        nfsd_cc_lru_scan+0x271/0x410 [nfsd]
>>        shrink_slab.constprop.0+0x31e/0x7d0
>>        shrink_node+0x54b/0xe50
>>        try_to_free_pages+0x394/0xba0
>>        __alloc_pages_slowpath.constprop.0+0x5d2/0x1db0
>>        __alloc_pages+0x4d6/0x580
>>        __handle_mm_fault+0xc25/0x2810
>>        handle_mm_fault+0x136/0x480
>>        do_user_addr_fault+0x3d8/0xec0
>>        exc_page_fault+0x5d/0xc0
>>        asm_exc_page_fault+0x27/0x30
> Is this deadlock possible only via the filecache close
> paths?

I'm not sure, but there is another back trace below that shows
another problem.

>   I can't think of a reason the laundromat has to
> wait for each file to be flushed and closed -- the
> laundromat should be able to "put" these files and allow
> the filecache LRU to flush and close in the background.

ok, what should we do here, enhancing the laundromat to behave
as you describe?

Here is another stack trace of problem with calling expire_client
from 'scan' callback:
Sep  3 09:07:35 nfsvmf24 kernel: ------------[ cut here ]------------
Sep  3 09:07:35 nfsvmf24 kernel: workqueue: PF_MEMALLOC task 3525(gmain) is flushing !WQ_MEM_RECLAIM nfsd4_callbacks:0x0
Sep  3 09:07:35 nfsvmf24 kernel: WARNING: CPU: 0 PID: 3525 at kernel/workqueue.c:2625 check_flush_dependency+0x17a/0x350
Sep  3 09:07:35 nfsvmf24 kernel: Modules linked in: rpcsec_gss_krb5 nfsd nfs_acl lockd grace auth_rpcgss sunrpc
Sep  3 09:07:35 nfsvmf24 kernel: CPU: 0 PID: 3525 Comm: gmain Kdump: loaded Not tainted 6.0.0-rc3+ #1
Sep  3 09:07:35 nfsvmf24 kernel: Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006
Sep  3 09:07:35 nfsvmf24 kernel: RIP: 0010:check_flush_dependency+0x17a/0x350
Sep  3 09:07:35 nfsvmf24 kernel: Code: 48 c1 ee 03 0f b6 04 06 84 c0 74 08 3c 03 0f 8e b8 01 00 00 41 8b b5 90 05 00 00 49 89 e8 48 c7 c7 c0 4d 06 9a e8 c6 a4 7f 02 <0f> 0b eb 4d 65 4c 8b 2c 25 c0 6e 02 00 4c 89 ef e8 71 65 01 00 49
Sep  3 09:07:35 nfsvmf24 kernel: RSP: 0018:ffff88810c73f4e8 EFLAGS: 00010282
Sep  3 09:07:35 nfsvmf24 kernel: RAX: 0000000000000000 RBX: ffff88811129a800 RCX: 0000000000000000
Sep  3 09:07:35 nfsvmf24 kernel: RDX: 0000000000000001 RSI: 0000000000000004 RDI: ffffed10218e7e93
Sep  3 09:07:35 nfsvmf24 kernel: RBP: 0000000000000000 R08: 0000000000000001 R09: ffffed103afc6ed2
Sep  3 09:07:35 nfsvmf24 kernel: R10: ffff8881d7e3768b R11: ffffed103afc6ed1 R12: 0000000000000000
Sep  3 09:07:35 nfsvmf24 kernel: R13: ffff88810d14cac0 R14: 000000000000000d R15: 000000000000000c
Sep  3 09:07:35 nfsvmf24 kernel: FS:  00007fa9a696c700(0000) GS:ffff8881d7e00000(0000) knlGS:0000000000000000
Sep  3 09:07:35 nfsvmf24 kernel: CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
Sep  3 09:07:35 nfsvmf24 kernel: CR2: 00007fe922689c50 CR3: 000000010bdd8000 CR4: 00000000000406f0
Sep  3 09:07:35 nfsvmf24 kernel: Call Trace:
Sep  3 09:07:35 nfsvmf24 kernel: <TASK>
Sep  3 09:07:35 nfsvmf24 kernel: __flush_workqueue+0x32c/0x1350
Sep  3 09:07:35 nfsvmf24 kernel: ? lock_downgrade+0x680/0x680
Sep  3 09:07:35 nfsvmf24 kernel: ? rcu_read_lock_held_common+0xe/0xa0
Sep  3 09:07:35 nfsvmf24 kernel: ? check_flush_dependency+0x350/0x350
Sep  3 09:07:35 nfsvmf24 kernel: ? lock_release+0x485/0x720
Sep  3 09:07:35 nfsvmf24 kernel: ? __queue_work+0x3cc/0xe10
Sep  3 09:07:35 nfsvmf24 kernel: ? trace_hardirqs_on+0x2d/0x110
Sep  3 09:07:35 nfsvmf24 kernel: ? nfsd4_shutdown_callback+0xc5/0x3d0 [nfsd]
Sep  3 09:07:35 nfsvmf24 kernel: nfsd4_shutdown_callback+0xc5/0x3d0 [nfsd]
Sep  3 09:07:35 nfsvmf24 kernel: ? lock_release+0x485/0x720
Sep  3 09:07:35 nfsvmf24 kernel: ? nfsd4_probe_callback_sync+0x20/0x20 [nfsd]
Sep  3 09:07:35 nfsvmf24 kernel: ? lock_downgrade+0x680/0x680
Sep  3 09:07:35 nfsvmf24 kernel: __destroy_client+0x5ec/0xa60 [nfsd]
Sep  3 09:07:35 nfsvmf24 kernel: ? nfsd4_cb_recall_release+0x20/0x20 [nfsd]
Sep  3 09:07:35 nfsvmf24 kernel: nfsd_courtesy_client_scan+0x39f/0x850 [nfsd]
Sep  3 09:07:35 nfsvmf24 kernel: ? preempt_schedule_notrace+0x74/0xe0
Sep  3 09:07:35 nfsvmf24 kernel: ? client_ctl_write+0x7c0/0x7c0 [nfsd]
Sep  3 09:07:35 nfsvmf24 kernel: ? preempt_schedule_notrace_thunk+0x16/0x18
Sep  3 09:07:35 nfsvmf24 kernel: shrink_slab.constprop.0+0x30b/0x7b0
Sep  3 09:07:35 nfsvmf24 kernel: ? unregister_shrinker+0x270/0x270
Sep  3 09:07:35 nfsvmf24 kernel: shrink_node+0x54b/0xe50
Sep  3 09:07:35 nfsvmf24 kernel: try_to_free_pages+0x394/0xba0
Sep  3 09:07:35 nfsvmf24 kernel: ? reclaim_pages+0x3b0/0x3b0
Sep  3 09:07:35 nfsvmf24 kernel: __alloc_pages_slowpath.constprop.0+0x636/0x1f30
Sep  3 09:07:35 nfsvmf24 kernel: ? warn_alloc+0x120/0x120
Sep  3 09:07:35 nfsvmf24 kernel: ? __zone_watermark_ok+0x2d0/0x2d0
Sep  3 09:07:35 nfsvmf24 kernel: __alloc_pages+0x4d6/0x580
Sep  3 09:07:35 nfsvmf24 kernel: ? __alloc_pages_slowpath.constprop.0+0x1f30/0x1f30
Sep  3 09:07:35 nfsvmf24 kernel: ? find_held_lock+0x2c/0x110
Sep  3 09:07:35 nfsvmf24 kernel: ? lockdep_hardirqs_on_prepare+0x17b/0x410
Sep  3 09:07:35 nfsvmf24 kernel: ____cache_alloc.part.0+0x586/0x760
Sep  3 09:07:35 nfsvmf24 kernel: ? kmem_cache_alloc+0x193/0x290
Sep  3 09:07:35 nfsvmf24 kernel: kmem_cache_alloc+0x22f/0x290
Sep  3 09:07:35 nfsvmf24 kernel: getname_flags+0xbe/0x4e0
Sep  3 09:07:35 nfsvmf24 kernel: user_path_at_empty+0x23/0x50
Sep  3 09:07:35 nfsvmf24 kernel: inotify_find_inode+0x28/0x120
Sep  3 09:07:35 nfsvmf24 kernel: ? __fget_light+0x19b/0x210
Sep  3 09:07:35 nfsvmf24 kernel: __x64_sys_inotify_add_watch+0x160/0x290
Sep  3 09:07:35 nfsvmf24 kernel: ? __ia32_sys_inotify_rm_watch+0x170/0x170
Sep  3 09:07:35 nfsvmf24 kernel: do_syscall_64+0x3d/0x90
Sep  3 09:07:35 nfsvmf24 kernel: entry_SYSCALL_64_after_hwframe+0x46/0xb0
Sep  3 09:07:35 nfsvmf24 kernel: RIP: 0033:0x7fa9b1a77f37
Sep  3 09:07:35 nfsvmf24 kernel: Code: f0 ff ff 73 01 c3 48 8b 0d 36 7f 2c 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 b8 fe 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 09 7f 2c 00 f7 d8 64 89 01 48
Sep  3 09:07:35 nfsvmf24 kernel: RSP: 002b:00007fa9a696bb28 EFLAGS: 00000202 ORIG_RAX: 00000000000000fe
Sep  3 09:07:35 nfsvmf24 kernel: RAX: ffffffffffffffda RBX: 00005638abf5adc0 RCX: 00007fa9b1a77f37
Sep  3 09:07:35 nfsvmf24 kernel: RDX: 0000000001002fce RSI: 00005638abf58080 RDI: 000000000000000d
Sep  3 09:07:35 nfsvmf24 kernel: RBP: 00007fa9a696bb54 R08: 00007ffe7f7f1080 R09: 0000000000000000
Sep  3 09:07:35 nfsvmf24 kernel: R10: 00000000002cf948 R11: 0000000000000202 R12: 0000000000000000
Sep  3 09:07:35 nfsvmf24 kernel: R13: 00007fa9b39710e0 R14: 00005638abf5adf0 R15: 00007fa9b317c940

Another behavior about the shrinker is that the 'count' and 'scan'
callbacks are not 1-to-1, meaning the 'scan' is not called after
every 'count' callback. Not sure what criteria the shrinker use to
do the 'scan' callback but it's very late, sometimes after dozens
of 'count' callback then there is a 'scan' callback. If we rely on
the 'scan' callback then I don't think we react in time to release
the courtesy clients.

-Dai

>
>
> --
> Chuck Lever
>
>
>
Chuck Lever III Sept. 3, 2022, 5:29 p.m. UTC | #9
> On Sep 3, 2022, at 1:03 PM, Dai Ngo <dai.ngo@oracle.com> wrote:
> 
> 
> On 9/2/22 6:26 PM, Chuck Lever III wrote:
>>> On Sep 2, 2022, at 3:34 PM, Dai Ngo <dai.ngo@oracle.com> wrote:
>>> 
>>> On 9/2/22 10:58 AM, Chuck Lever III wrote:
>>>>>> Or, nfsd_courtesy_client_count() could return
>>>>>> nfsd_couresy_client_count. nfsd_courtesy_client_scan() could
>>>>>> then look something like this:
>>>>>> 
>>>>>> 	if ((sc->gfp_mask & GFP_KERNEL) != GFP_KERNEL)
>>>>>> 		return SHRINK_STOP;
>>>>>> 
>>>>>> 	nfsd_get_client_reaplist(nn, reaplist, lt);
>>>>>> 	list_for_each_safe(pos, next, &reaplist) {
>>>>>> 		clp = list_entry(pos, struct nfs4_client, cl_lru);
>>>>>> 		trace_nfsd_clid_purged(&clp->cl_clientid);
>>>>>> 		list_del_init(&clp->cl_lru);
>>>>>> 		expire_client(clp);
>>>>>> 		count++;
>>>>>> 	}
>>>>>> 	return count;
>>>>> This does not work, we cannot expire clients on the context of
>>>>> scan callback due to deadlock problem.
>>>> Correct, we don't want to start shrinker laundromat activity if
>>>> the allocation request specified that it cannot wait. But are
>>>> you sure it doesn't work if sc_gfp_flags is set to GFP_KERNEL?
>>> It can be deadlock due to lock ordering problem:
>>> 
>>> ======================================================
>>> WARNING: possible circular locking dependency detected
>>> 5.19.0-rc2_sk+ #1 Not tainted
>>> ------------------------------------------------------
>>> lck/31847 is trying to acquire lock:
>>> ffff88811d268850 (&sb->s_type->i_mutex_key#16){+.+.}-{3:3}, at: btrfs_inode_lock+0x38/0x70
>>> #012but task is already holding lock:
>>> ffffffffb41848c0 (fs_reclaim){+.+.}-{0:0}, at: __alloc_pages_slowpath.constprop.0+0x506/0x1db0
>>> #012which lock already depends on the new lock.
>>> #012the existing dependency chain (in reverse order) is:
>>> 
>>> #012-> #1 (fs_reclaim){+.+.}-{0:0}:
>>>       fs_reclaim_acquire+0xc0/0x100
>>>       __kmalloc+0x51/0x320
>>>       btrfs_buffered_write+0x2eb/0xd90
>>>       btrfs_do_write_iter+0x6bf/0x11c0
>>>       do_iter_readv_writev+0x2bb/0x5a0
>>>       do_iter_write+0x131/0x630
>>>       nfsd_vfs_write+0x4da/0x1900 [nfsd]
>>>       nfsd4_write+0x2ac/0x760 [nfsd]
>>>       nfsd4_proc_compound+0xce8/0x23e0 [nfsd]
>>>       nfsd_dispatch+0x4ed/0xc10 [nfsd]
>>>       svc_process_common+0xd3f/0x1b00 [sunrpc]
>>>       svc_process+0x361/0x4f0 [sunrpc]
>>>       nfsd+0x2d6/0x570 [nfsd]
>>>       kthread+0x2a1/0x340
>>>       ret_from_fork+0x22/0x30
>>> 
>>> #012-> #0 (&sb->s_type->i_mutex_key#16){+.+.}-{3:3}:
>>>       __lock_acquire+0x318d/0x7830
>>>       lock_acquire+0x1bb/0x500
>>>       down_write+0x82/0x130
>>>       btrfs_inode_lock+0x38/0x70
>>>       btrfs_sync_file+0x280/0x1010
>>>       nfsd_file_flush.isra.0+0x1b/0x220 [nfsd]
>>>       nfsd_file_put+0xd4/0x110 [nfsd]
>>>       release_all_access+0x13a/0x220 [nfsd]
>>>       nfs4_free_ol_stateid+0x40/0x90 [nfsd]
>>>       free_ol_stateid_reaplist+0x131/0x210 [nfsd]
>>>       release_openowner+0xf7/0x160 [nfsd]
>>>       __destroy_client+0x3cc/0x740 [nfsd]
>>>       nfsd_cc_lru_scan+0x271/0x410 [nfsd]
>>>       shrink_slab.constprop.0+0x31e/0x7d0
>>>       shrink_node+0x54b/0xe50
>>>       try_to_free_pages+0x394/0xba0
>>>       __alloc_pages_slowpath.constprop.0+0x5d2/0x1db0
>>>       __alloc_pages+0x4d6/0x580
>>>       __handle_mm_fault+0xc25/0x2810
>>>       handle_mm_fault+0x136/0x480
>>>       do_user_addr_fault+0x3d8/0xec0
>>>       exc_page_fault+0x5d/0xc0
>>>       asm_exc_page_fault+0x27/0x30
>> Is this deadlock possible only via the filecache close
>> paths?
> 
> I'm not sure, but there is another back trace below that shows
> another problem.
> 
>>  I can't think of a reason the laundromat has to
>> wait for each file to be flushed and closed -- the
>> laundromat should be able to "put" these files and allow
>> the filecache LRU to flush and close in the background.
> 
> ok, what should we do here, enhancing the laundromat to behave
> as you describe?

What I was suggesting was a longer term strategy for improving the
laundromat. In order to scale well in the number of clients, it
needs to schedule client expiry and deletion without serializing.

(ie, the laundromat itself can identify a set of clients to clean,
but then it should pass that list to other workers so it can run
again as soon as it needs to -- and that also means it can use more
than just one CPU at a time to do its work).

In the meantime, it is still valuable to consider a shrinker built
around using ->nfsd_courtesy_client_count only. That was one of
the two alternatives I suggested before, so I feel we have some
consensus there. Please continue to look into that, and we can
continue poking at laundromat improvements later.


> Here is another stack trace of problem with calling expire_client
> from 'scan' callback:
> Sep  3 09:07:35 nfsvmf24 kernel: ------------[ cut here ]------------
> Sep  3 09:07:35 nfsvmf24 kernel: workqueue: PF_MEMALLOC task 3525(gmain) is flushing !WQ_MEM_RECLAIM nfsd4_callbacks:0x0
> Sep  3 09:07:35 nfsvmf24 kernel: WARNING: CPU: 0 PID: 3525 at kernel/workqueue.c:2625 check_flush_dependency+0x17a/0x350
> Sep  3 09:07:35 nfsvmf24 kernel: Modules linked in: rpcsec_gss_krb5 nfsd nfs_acl lockd grace auth_rpcgss sunrpc
> Sep  3 09:07:35 nfsvmf24 kernel: CPU: 0 PID: 3525 Comm: gmain Kdump: loaded Not tainted 6.0.0-rc3+ #1
> Sep  3 09:07:35 nfsvmf24 kernel: Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006
> Sep  3 09:07:35 nfsvmf24 kernel: RIP: 0010:check_flush_dependency+0x17a/0x350
> Sep  3 09:07:35 nfsvmf24 kernel: Code: 48 c1 ee 03 0f b6 04 06 84 c0 74 08 3c 03 0f 8e b8 01 00 00 41 8b b5 90 05 00 00 49 89 e8 48 c7 c7 c0 4d 06 9a e8 c6 a4 7f 02 <0f> 0b eb 4d 65 4c 8b 2c 25 c0 6e 02 00 4c 89 ef e8 71 65 01 00 49
> Sep  3 09:07:35 nfsvmf24 kernel: RSP: 0018:ffff88810c73f4e8 EFLAGS: 00010282
> Sep  3 09:07:35 nfsvmf24 kernel: RAX: 0000000000000000 RBX: ffff88811129a800 RCX: 0000000000000000
> Sep  3 09:07:35 nfsvmf24 kernel: RDX: 0000000000000001 RSI: 0000000000000004 RDI: ffffed10218e7e93
> Sep  3 09:07:35 nfsvmf24 kernel: RBP: 0000000000000000 R08: 0000000000000001 R09: ffffed103afc6ed2
> Sep  3 09:07:35 nfsvmf24 kernel: R10: ffff8881d7e3768b R11: ffffed103afc6ed1 R12: 0000000000000000
> Sep  3 09:07:35 nfsvmf24 kernel: R13: ffff88810d14cac0 R14: 000000000000000d R15: 000000000000000c
> Sep  3 09:07:35 nfsvmf24 kernel: FS:  00007fa9a696c700(0000) GS:ffff8881d7e00000(0000) knlGS:0000000000000000
> Sep  3 09:07:35 nfsvmf24 kernel: CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> Sep  3 09:07:35 nfsvmf24 kernel: CR2: 00007fe922689c50 CR3: 000000010bdd8000 CR4: 00000000000406f0
> Sep  3 09:07:35 nfsvmf24 kernel: Call Trace:
> Sep  3 09:07:35 nfsvmf24 kernel: <TASK>
> Sep  3 09:07:35 nfsvmf24 kernel: __flush_workqueue+0x32c/0x1350
> Sep  3 09:07:35 nfsvmf24 kernel: ? lock_downgrade+0x680/0x680
> Sep  3 09:07:35 nfsvmf24 kernel: ? rcu_read_lock_held_common+0xe/0xa0
> Sep  3 09:07:35 nfsvmf24 kernel: ? check_flush_dependency+0x350/0x350
> Sep  3 09:07:35 nfsvmf24 kernel: ? lock_release+0x485/0x720
> Sep  3 09:07:35 nfsvmf24 kernel: ? __queue_work+0x3cc/0xe10
> Sep  3 09:07:35 nfsvmf24 kernel: ? trace_hardirqs_on+0x2d/0x110
> Sep  3 09:07:35 nfsvmf24 kernel: ? nfsd4_shutdown_callback+0xc5/0x3d0 [nfsd]
> Sep  3 09:07:35 nfsvmf24 kernel: nfsd4_shutdown_callback+0xc5/0x3d0 [nfsd]
> Sep  3 09:07:35 nfsvmf24 kernel: ? lock_release+0x485/0x720
> Sep  3 09:07:35 nfsvmf24 kernel: ? nfsd4_probe_callback_sync+0x20/0x20 [nfsd]
> Sep  3 09:07:35 nfsvmf24 kernel: ? lock_downgrade+0x680/0x680
> Sep  3 09:07:35 nfsvmf24 kernel: __destroy_client+0x5ec/0xa60 [nfsd]
> Sep  3 09:07:35 nfsvmf24 kernel: ? nfsd4_cb_recall_release+0x20/0x20 [nfsd]
> Sep  3 09:07:35 nfsvmf24 kernel: nfsd_courtesy_client_scan+0x39f/0x850 [nfsd]
> Sep  3 09:07:35 nfsvmf24 kernel: ? preempt_schedule_notrace+0x74/0xe0
> Sep  3 09:07:35 nfsvmf24 kernel: ? client_ctl_write+0x7c0/0x7c0 [nfsd]
> Sep  3 09:07:35 nfsvmf24 kernel: ? preempt_schedule_notrace_thunk+0x16/0x18
> Sep  3 09:07:35 nfsvmf24 kernel: shrink_slab.constprop.0+0x30b/0x7b0
> Sep  3 09:07:35 nfsvmf24 kernel: ? unregister_shrinker+0x270/0x270
> Sep  3 09:07:35 nfsvmf24 kernel: shrink_node+0x54b/0xe50
> Sep  3 09:07:35 nfsvmf24 kernel: try_to_free_pages+0x394/0xba0
> Sep  3 09:07:35 nfsvmf24 kernel: ? reclaim_pages+0x3b0/0x3b0
> Sep  3 09:07:35 nfsvmf24 kernel: __alloc_pages_slowpath.constprop.0+0x636/0x1f30
> Sep  3 09:07:35 nfsvmf24 kernel: ? warn_alloc+0x120/0x120
> Sep  3 09:07:35 nfsvmf24 kernel: ? __zone_watermark_ok+0x2d0/0x2d0
> Sep  3 09:07:35 nfsvmf24 kernel: __alloc_pages+0x4d6/0x580
> Sep  3 09:07:35 nfsvmf24 kernel: ? __alloc_pages_slowpath.constprop.0+0x1f30/0x1f30
> Sep  3 09:07:35 nfsvmf24 kernel: ? find_held_lock+0x2c/0x110
> Sep  3 09:07:35 nfsvmf24 kernel: ? lockdep_hardirqs_on_prepare+0x17b/0x410
> Sep  3 09:07:35 nfsvmf24 kernel: ____cache_alloc.part.0+0x586/0x760
> Sep  3 09:07:35 nfsvmf24 kernel: ? kmem_cache_alloc+0x193/0x290
> Sep  3 09:07:35 nfsvmf24 kernel: kmem_cache_alloc+0x22f/0x290
> Sep  3 09:07:35 nfsvmf24 kernel: getname_flags+0xbe/0x4e0
> Sep  3 09:07:35 nfsvmf24 kernel: user_path_at_empty+0x23/0x50
> Sep  3 09:07:35 nfsvmf24 kernel: inotify_find_inode+0x28/0x120
> Sep  3 09:07:35 nfsvmf24 kernel: ? __fget_light+0x19b/0x210
> Sep  3 09:07:35 nfsvmf24 kernel: __x64_sys_inotify_add_watch+0x160/0x290
> Sep  3 09:07:35 nfsvmf24 kernel: ? __ia32_sys_inotify_rm_watch+0x170/0x170
> Sep  3 09:07:35 nfsvmf24 kernel: do_syscall_64+0x3d/0x90
> Sep  3 09:07:35 nfsvmf24 kernel: entry_SYSCALL_64_after_hwframe+0x46/0xb0
> Sep  3 09:07:35 nfsvmf24 kernel: RIP: 0033:0x7fa9b1a77f37
> Sep  3 09:07:35 nfsvmf24 kernel: Code: f0 ff ff 73 01 c3 48 8b 0d 36 7f 2c 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 b8 fe 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 09 7f 2c 00 f7 d8 64 89 01 48
> Sep  3 09:07:35 nfsvmf24 kernel: RSP: 002b:00007fa9a696bb28 EFLAGS: 00000202 ORIG_RAX: 00000000000000fe
> Sep  3 09:07:35 nfsvmf24 kernel: RAX: ffffffffffffffda RBX: 00005638abf5adc0 RCX: 00007fa9b1a77f37
> Sep  3 09:07:35 nfsvmf24 kernel: RDX: 0000000001002fce RSI: 00005638abf58080 RDI: 000000000000000d
> Sep  3 09:07:35 nfsvmf24 kernel: RBP: 00007fa9a696bb54 R08: 00007ffe7f7f1080 R09: 0000000000000000
> Sep  3 09:07:35 nfsvmf24 kernel: R10: 00000000002cf948 R11: 0000000000000202 R12: 0000000000000000
> Sep  3 09:07:35 nfsvmf24 kernel: R13: 00007fa9b39710e0 R14: 00005638abf5adf0 R15: 00007fa9b317c940
> 
> Another behavior about the shrinker is that the 'count' and 'scan'
> callbacks are not 1-to-1, meaning the 'scan' is not called after
> every 'count' callback. Not sure what criteria the shrinker use to
> do the 'scan' callback but it's very late, sometimes after dozens
> of 'count' callback then there is a 'scan' callback. If we rely on
> the 'scan' callback then I don't think we react in time to release
> the courtesy clients.
> 
> -Dai
> 
>> 
>> 
>> --
>> Chuck Lever

--
Chuck Lever
Dai Ngo Sept. 3, 2022, 5:59 p.m. UTC | #10
On 9/3/22 10:29 AM, Chuck Lever III wrote:
>
>> On Sep 3, 2022, at 1:03 PM, Dai Ngo <dai.ngo@oracle.com> wrote:
>>
>>
>> On 9/2/22 6:26 PM, Chuck Lever III wrote:
>>>> On Sep 2, 2022, at 3:34 PM, Dai Ngo <dai.ngo@oracle.com> wrote:
>>>>
>>>> On 9/2/22 10:58 AM, Chuck Lever III wrote:
>>>>>>> Or, nfsd_courtesy_client_count() could return
>>>>>>> nfsd_couresy_client_count. nfsd_courtesy_client_scan() could
>>>>>>> then look something like this:
>>>>>>>
>>>>>>> 	if ((sc->gfp_mask & GFP_KERNEL) != GFP_KERNEL)
>>>>>>> 		return SHRINK_STOP;
>>>>>>>
>>>>>>> 	nfsd_get_client_reaplist(nn, reaplist, lt);
>>>>>>> 	list_for_each_safe(pos, next, &reaplist) {
>>>>>>> 		clp = list_entry(pos, struct nfs4_client, cl_lru);
>>>>>>> 		trace_nfsd_clid_purged(&clp->cl_clientid);
>>>>>>> 		list_del_init(&clp->cl_lru);
>>>>>>> 		expire_client(clp);
>>>>>>> 		count++;
>>>>>>> 	}
>>>>>>> 	return count;
>>>>>> This does not work, we cannot expire clients on the context of
>>>>>> scan callback due to deadlock problem.
>>>>> Correct, we don't want to start shrinker laundromat activity if
>>>>> the allocation request specified that it cannot wait. But are
>>>>> you sure it doesn't work if sc_gfp_flags is set to GFP_KERNEL?
>>>> It can be deadlock due to lock ordering problem:
>>>>
>>>> ======================================================
>>>> WARNING: possible circular locking dependency detected
>>>> 5.19.0-rc2_sk+ #1 Not tainted
>>>> ------------------------------------------------------
>>>> lck/31847 is trying to acquire lock:
>>>> ffff88811d268850 (&sb->s_type->i_mutex_key#16){+.+.}-{3:3}, at: btrfs_inode_lock+0x38/0x70
>>>> #012but task is already holding lock:
>>>> ffffffffb41848c0 (fs_reclaim){+.+.}-{0:0}, at: __alloc_pages_slowpath.constprop.0+0x506/0x1db0
>>>> #012which lock already depends on the new lock.
>>>> #012the existing dependency chain (in reverse order) is:
>>>>
>>>> #012-> #1 (fs_reclaim){+.+.}-{0:0}:
>>>>        fs_reclaim_acquire+0xc0/0x100
>>>>        __kmalloc+0x51/0x320
>>>>        btrfs_buffered_write+0x2eb/0xd90
>>>>        btrfs_do_write_iter+0x6bf/0x11c0
>>>>        do_iter_readv_writev+0x2bb/0x5a0
>>>>        do_iter_write+0x131/0x630
>>>>        nfsd_vfs_write+0x4da/0x1900 [nfsd]
>>>>        nfsd4_write+0x2ac/0x760 [nfsd]
>>>>        nfsd4_proc_compound+0xce8/0x23e0 [nfsd]
>>>>        nfsd_dispatch+0x4ed/0xc10 [nfsd]
>>>>        svc_process_common+0xd3f/0x1b00 [sunrpc]
>>>>        svc_process+0x361/0x4f0 [sunrpc]
>>>>        nfsd+0x2d6/0x570 [nfsd]
>>>>        kthread+0x2a1/0x340
>>>>        ret_from_fork+0x22/0x30
>>>>
>>>> #012-> #0 (&sb->s_type->i_mutex_key#16){+.+.}-{3:3}:
>>>>        __lock_acquire+0x318d/0x7830
>>>>        lock_acquire+0x1bb/0x500
>>>>        down_write+0x82/0x130
>>>>        btrfs_inode_lock+0x38/0x70
>>>>        btrfs_sync_file+0x280/0x1010
>>>>        nfsd_file_flush.isra.0+0x1b/0x220 [nfsd]
>>>>        nfsd_file_put+0xd4/0x110 [nfsd]
>>>>        release_all_access+0x13a/0x220 [nfsd]
>>>>        nfs4_free_ol_stateid+0x40/0x90 [nfsd]
>>>>        free_ol_stateid_reaplist+0x131/0x210 [nfsd]
>>>>        release_openowner+0xf7/0x160 [nfsd]
>>>>        __destroy_client+0x3cc/0x740 [nfsd]
>>>>        nfsd_cc_lru_scan+0x271/0x410 [nfsd]
>>>>        shrink_slab.constprop.0+0x31e/0x7d0
>>>>        shrink_node+0x54b/0xe50
>>>>        try_to_free_pages+0x394/0xba0
>>>>        __alloc_pages_slowpath.constprop.0+0x5d2/0x1db0
>>>>        __alloc_pages+0x4d6/0x580
>>>>        __handle_mm_fault+0xc25/0x2810
>>>>        handle_mm_fault+0x136/0x480
>>>>        do_user_addr_fault+0x3d8/0xec0
>>>>        exc_page_fault+0x5d/0xc0
>>>>        asm_exc_page_fault+0x27/0x30
>>> Is this deadlock possible only via the filecache close
>>> paths?
>> I'm not sure, but there is another back trace below that shows
>> another problem.
>>
>>>   I can't think of a reason the laundromat has to
>>> wait for each file to be flushed and closed -- the
>>> laundromat should be able to "put" these files and allow
>>> the filecache LRU to flush and close in the background.
>> ok, what should we do here, enhancing the laundromat to behave
>> as you describe?
> What I was suggesting was a longer term strategy for improving the
> laundromat. In order to scale well in the number of clients, it
> needs to schedule client expiry and deletion without serializing.
>
> (ie, the laundromat itself can identify a set of clients to clean,
> but then it should pass that list to other workers so it can run
> again as soon as it needs to -- and that also means it can use more
> than just one CPU at a time to do its work).

I see. Currently on my lowly 1-CPU VM it takes about ~35 secs to
destroy 128 clients, each with only few states (generated by pynfs's
CID5 test). We can improve on this.

>
> In the meantime, it is still valuable to consider a shrinker built
> around using ->nfsd_courtesy_client_count only. That was one of
> the two alternatives I suggested before, so I feel we have some
> consensus there. Please continue to look into that, and we can
> continue poking at laundromat improvements later.

Okay,

Thanks,
-Dai

>
>
>> Here is another stack trace of problem with calling expire_client
>> from 'scan' callback:
>> Sep  3 09:07:35 nfsvmf24 kernel: ------------[ cut here ]------------
>> Sep  3 09:07:35 nfsvmf24 kernel: workqueue: PF_MEMALLOC task 3525(gmain) is flushing !WQ_MEM_RECLAIM nfsd4_callbacks:0x0
>> Sep  3 09:07:35 nfsvmf24 kernel: WARNING: CPU: 0 PID: 3525 at kernel/workqueue.c:2625 check_flush_dependency+0x17a/0x350
>> Sep  3 09:07:35 nfsvmf24 kernel: Modules linked in: rpcsec_gss_krb5 nfsd nfs_acl lockd grace auth_rpcgss sunrpc
>> Sep  3 09:07:35 nfsvmf24 kernel: CPU: 0 PID: 3525 Comm: gmain Kdump: loaded Not tainted 6.0.0-rc3+ #1
>> Sep  3 09:07:35 nfsvmf24 kernel: Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006
>> Sep  3 09:07:35 nfsvmf24 kernel: RIP: 0010:check_flush_dependency+0x17a/0x350
>> Sep  3 09:07:35 nfsvmf24 kernel: Code: 48 c1 ee 03 0f b6 04 06 84 c0 74 08 3c 03 0f 8e b8 01 00 00 41 8b b5 90 05 00 00 49 89 e8 48 c7 c7 c0 4d 06 9a e8 c6 a4 7f 02 <0f> 0b eb 4d 65 4c 8b 2c 25 c0 6e 02 00 4c 89 ef e8 71 65 01 00 49
>> Sep  3 09:07:35 nfsvmf24 kernel: RSP: 0018:ffff88810c73f4e8 EFLAGS: 00010282
>> Sep  3 09:07:35 nfsvmf24 kernel: RAX: 0000000000000000 RBX: ffff88811129a800 RCX: 0000000000000000
>> Sep  3 09:07:35 nfsvmf24 kernel: RDX: 0000000000000001 RSI: 0000000000000004 RDI: ffffed10218e7e93
>> Sep  3 09:07:35 nfsvmf24 kernel: RBP: 0000000000000000 R08: 0000000000000001 R09: ffffed103afc6ed2
>> Sep  3 09:07:35 nfsvmf24 kernel: R10: ffff8881d7e3768b R11: ffffed103afc6ed1 R12: 0000000000000000
>> Sep  3 09:07:35 nfsvmf24 kernel: R13: ffff88810d14cac0 R14: 000000000000000d R15: 000000000000000c
>> Sep  3 09:07:35 nfsvmf24 kernel: FS:  00007fa9a696c700(0000) GS:ffff8881d7e00000(0000) knlGS:0000000000000000
>> Sep  3 09:07:35 nfsvmf24 kernel: CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>> Sep  3 09:07:35 nfsvmf24 kernel: CR2: 00007fe922689c50 CR3: 000000010bdd8000 CR4: 00000000000406f0
>> Sep  3 09:07:35 nfsvmf24 kernel: Call Trace:
>> Sep  3 09:07:35 nfsvmf24 kernel: <TASK>
>> Sep  3 09:07:35 nfsvmf24 kernel: __flush_workqueue+0x32c/0x1350
>> Sep  3 09:07:35 nfsvmf24 kernel: ? lock_downgrade+0x680/0x680
>> Sep  3 09:07:35 nfsvmf24 kernel: ? rcu_read_lock_held_common+0xe/0xa0
>> Sep  3 09:07:35 nfsvmf24 kernel: ? check_flush_dependency+0x350/0x350
>> Sep  3 09:07:35 nfsvmf24 kernel: ? lock_release+0x485/0x720
>> Sep  3 09:07:35 nfsvmf24 kernel: ? __queue_work+0x3cc/0xe10
>> Sep  3 09:07:35 nfsvmf24 kernel: ? trace_hardirqs_on+0x2d/0x110
>> Sep  3 09:07:35 nfsvmf24 kernel: ? nfsd4_shutdown_callback+0xc5/0x3d0 [nfsd]
>> Sep  3 09:07:35 nfsvmf24 kernel: nfsd4_shutdown_callback+0xc5/0x3d0 [nfsd]
>> Sep  3 09:07:35 nfsvmf24 kernel: ? lock_release+0x485/0x720
>> Sep  3 09:07:35 nfsvmf24 kernel: ? nfsd4_probe_callback_sync+0x20/0x20 [nfsd]
>> Sep  3 09:07:35 nfsvmf24 kernel: ? lock_downgrade+0x680/0x680
>> Sep  3 09:07:35 nfsvmf24 kernel: __destroy_client+0x5ec/0xa60 [nfsd]
>> Sep  3 09:07:35 nfsvmf24 kernel: ? nfsd4_cb_recall_release+0x20/0x20 [nfsd]
>> Sep  3 09:07:35 nfsvmf24 kernel: nfsd_courtesy_client_scan+0x39f/0x850 [nfsd]
>> Sep  3 09:07:35 nfsvmf24 kernel: ? preempt_schedule_notrace+0x74/0xe0
>> Sep  3 09:07:35 nfsvmf24 kernel: ? client_ctl_write+0x7c0/0x7c0 [nfsd]
>> Sep  3 09:07:35 nfsvmf24 kernel: ? preempt_schedule_notrace_thunk+0x16/0x18
>> Sep  3 09:07:35 nfsvmf24 kernel: shrink_slab.constprop.0+0x30b/0x7b0
>> Sep  3 09:07:35 nfsvmf24 kernel: ? unregister_shrinker+0x270/0x270
>> Sep  3 09:07:35 nfsvmf24 kernel: shrink_node+0x54b/0xe50
>> Sep  3 09:07:35 nfsvmf24 kernel: try_to_free_pages+0x394/0xba0
>> Sep  3 09:07:35 nfsvmf24 kernel: ? reclaim_pages+0x3b0/0x3b0
>> Sep  3 09:07:35 nfsvmf24 kernel: __alloc_pages_slowpath.constprop.0+0x636/0x1f30
>> Sep  3 09:07:35 nfsvmf24 kernel: ? warn_alloc+0x120/0x120
>> Sep  3 09:07:35 nfsvmf24 kernel: ? __zone_watermark_ok+0x2d0/0x2d0
>> Sep  3 09:07:35 nfsvmf24 kernel: __alloc_pages+0x4d6/0x580
>> Sep  3 09:07:35 nfsvmf24 kernel: ? __alloc_pages_slowpath.constprop.0+0x1f30/0x1f30
>> Sep  3 09:07:35 nfsvmf24 kernel: ? find_held_lock+0x2c/0x110
>> Sep  3 09:07:35 nfsvmf24 kernel: ? lockdep_hardirqs_on_prepare+0x17b/0x410
>> Sep  3 09:07:35 nfsvmf24 kernel: ____cache_alloc.part.0+0x586/0x760
>> Sep  3 09:07:35 nfsvmf24 kernel: ? kmem_cache_alloc+0x193/0x290
>> Sep  3 09:07:35 nfsvmf24 kernel: kmem_cache_alloc+0x22f/0x290
>> Sep  3 09:07:35 nfsvmf24 kernel: getname_flags+0xbe/0x4e0
>> Sep  3 09:07:35 nfsvmf24 kernel: user_path_at_empty+0x23/0x50
>> Sep  3 09:07:35 nfsvmf24 kernel: inotify_find_inode+0x28/0x120
>> Sep  3 09:07:35 nfsvmf24 kernel: ? __fget_light+0x19b/0x210
>> Sep  3 09:07:35 nfsvmf24 kernel: __x64_sys_inotify_add_watch+0x160/0x290
>> Sep  3 09:07:35 nfsvmf24 kernel: ? __ia32_sys_inotify_rm_watch+0x170/0x170
>> Sep  3 09:07:35 nfsvmf24 kernel: do_syscall_64+0x3d/0x90
>> Sep  3 09:07:35 nfsvmf24 kernel: entry_SYSCALL_64_after_hwframe+0x46/0xb0
>> Sep  3 09:07:35 nfsvmf24 kernel: RIP: 0033:0x7fa9b1a77f37
>> Sep  3 09:07:35 nfsvmf24 kernel: Code: f0 ff ff 73 01 c3 48 8b 0d 36 7f 2c 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 b8 fe 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 09 7f 2c 00 f7 d8 64 89 01 48
>> Sep  3 09:07:35 nfsvmf24 kernel: RSP: 002b:00007fa9a696bb28 EFLAGS: 00000202 ORIG_RAX: 00000000000000fe
>> Sep  3 09:07:35 nfsvmf24 kernel: RAX: ffffffffffffffda RBX: 00005638abf5adc0 RCX: 00007fa9b1a77f37
>> Sep  3 09:07:35 nfsvmf24 kernel: RDX: 0000000001002fce RSI: 00005638abf58080 RDI: 000000000000000d
>> Sep  3 09:07:35 nfsvmf24 kernel: RBP: 00007fa9a696bb54 R08: 00007ffe7f7f1080 R09: 0000000000000000
>> Sep  3 09:07:35 nfsvmf24 kernel: R10: 00000000002cf948 R11: 0000000000000202 R12: 0000000000000000
>> Sep  3 09:07:35 nfsvmf24 kernel: R13: 00007fa9b39710e0 R14: 00005638abf5adf0 R15: 00007fa9b317c940
>>
>> Another behavior about the shrinker is that the 'count' and 'scan'
>> callbacks are not 1-to-1, meaning the 'scan' is not called after
>> every 'count' callback. Not sure what criteria the shrinker use to
>> do the 'scan' callback but it's very late, sometimes after dozens
>> of 'count' callback then there is a 'scan' callback. If we rely on
>> the 'scan' callback then I don't think we react in time to release
>> the courtesy clients.
>>
>> -Dai
>>
>>>
>>> --
>>> Chuck Lever
> --
> Chuck Lever
>
>
>
J. Bruce Fields Sept. 6, 2022, 1 p.m. UTC | #11
On Sat, Sep 03, 2022 at 10:59:17AM -0700, dai.ngo@oracle.com wrote:
> On 9/3/22 10:29 AM, Chuck Lever III wrote:
> >What I was suggesting was a longer term strategy for improving the
> >laundromat. In order to scale well in the number of clients, it
> >needs to schedule client expiry and deletion without serializing.
> >
> >(ie, the laundromat itself can identify a set of clients to clean,
> >but then it should pass that list to other workers so it can run
> >again as soon as it needs to -- and that also means it can use more
> >than just one CPU at a time to do its work).
> 
> I see. Currently on my lowly 1-CPU VM it takes about ~35 secs to
> destroy 128 clients, each with only few states (generated by pynfs's
> CID5 test). We can improve on this.

Careful--it's not the CPU that's the issue, it's waiting for disk.

If you're on a hard drive, for example, it's going to take at least one
seek (probably at least 10ms) to expire a single client, so you're never
going to destroy more than 100 per second.  That's what you need to
parallelize.  See item 3 from

	https://lore.kernel.org/linux-nfs/20220523154026.GD24163@fieldses.org/

Also, looks like current nfs-utils is still doing 3 commits per expiry.

Steve, for some reason I think "nfsdcld: use WAL journal for faster
commits" never got applied:

	https://lore.kernel.org/linux-nfs/20220104222445.GF12040@fieldses.org/

--b.
Dai Ngo Sept. 6, 2022, 7:04 p.m. UTC | #12
On 9/6/22 6:00 AM, J. Bruce Fields wrote:
> On Sat, Sep 03, 2022 at 10:59:17AM -0700, dai.ngo@oracle.com wrote:
>> On 9/3/22 10:29 AM, Chuck Lever III wrote:
>>> What I was suggesting was a longer term strategy for improving the
>>> laundromat. In order to scale well in the number of clients, it
>>> needs to schedule client expiry and deletion without serializing.
>>>
>>> (ie, the laundromat itself can identify a set of clients to clean,
>>> but then it should pass that list to other workers so it can run
>>> again as soon as it needs to -- and that also means it can use more
>>> than just one CPU at a time to do its work).
>> I see. Currently on my lowly 1-CPU VM it takes about ~35 secs to
>> destroy 128 clients, each with only few states (generated by pynfs's
>> CID5 test). We can improve on this.
> Careful--it's not the CPU that's the issue, it's waiting for disk.
>
> If you're on a hard drive, for example, it's going to take at least one
> seek (probably at least 10ms) to expire a single client, so you're never
> going to destroy more than 100 per second.  That's what you need to
> parallelize.  See item 3 from
>
> 	https://urldefense.com/v3/__https://lore.kernel.org/linux-nfs/20220523154026.GD24163@fieldses.org/__;!!ACWV5N9M2RV99hQ!KK7Xqkksy2WBks12oxpw0FMQW8z7_FpSDgruhtAIrNCW5kmhvAY7noT5d6ybenxkIowyx9cVXBLDysVq$

Right! thank you for reminding me of this. I'll add it to my plate
if no one gets to it yet.

-Dai

>   
>
> Also, looks like current nfs-utils is still doing 3 commits per expiry.
>
> Steve, for some reason I think "nfsdcld: use WAL journal for faster
> commits" never got applied:
>
> 	https://urldefense.com/v3/__https://lore.kernel.org/linux-nfs/20220104222445.GF12040@fieldses.org/__;!!ACWV5N9M2RV99hQ!KK7Xqkksy2WBks12oxpw0FMQW8z7_FpSDgruhtAIrNCW5kmhvAY7noT5d6ybenxkIowyx9cVXP3Ocamj$
>
> --b.
diff mbox series

Patch

diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
index 2695dff1378a..2a604951623f 100644
--- a/fs/nfsd/netns.h
+++ b/fs/nfsd/netns.h
@@ -194,6 +194,9 @@  struct nfsd_net {
 	int			nfs4_max_clients;
 
 	atomic_t		nfsd_courtesy_client_count;
+	atomic_t		nfsd_client_shrinker_cb_count;
+	atomic_t		nfsd_client_shrinker_reapcount;
+	struct shrinker		nfsd_client_shrinker;
 };
 
 /* Simple check to find out if a given net was properly initialized */
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index eaf7b4dcea33..9aed9eed1892 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -4343,7 +4343,40 @@  nfsd4_init_slabs(void)
 	return -ENOMEM;
 }
 
-void nfsd4_init_leases_net(struct nfsd_net *nn)
+static unsigned long
+nfsd_courtesy_client_count(struct shrinker *shrink, struct shrink_control *sc)
+{
+	struct nfsd_net *nn = container_of(shrink,
+			struct nfsd_net, nfsd_client_shrinker);
+
+	atomic_inc(&nn->nfsd_client_shrinker_cb_count);
+	mod_delayed_work(laundry_wq, &nn->laundromat_work, 0);
+	return (unsigned long)atomic_read(&nn->nfsd_courtesy_client_count);
+}
+
+static unsigned long
+nfsd_courtesy_client_scan(struct shrinker *shrink, struct shrink_control *sc)
+{
+	struct nfsd_net *nn = container_of(shrink,
+			struct nfsd_net, nfsd_client_shrinker);
+	unsigned long cnt;
+
+	cnt = atomic_read(&nn->nfsd_client_shrinker_reapcount);
+	atomic_set(&nn->nfsd_client_shrinker_reapcount, 0);
+	return cnt;
+}
+
+static int
+nfsd_register_client_shrinker(struct nfsd_net *nn)
+{
+	nn->nfsd_client_shrinker.scan_objects = nfsd_courtesy_client_scan;
+	nn->nfsd_client_shrinker.count_objects = nfsd_courtesy_client_count;
+	nn->nfsd_client_shrinker.seeks = DEFAULT_SEEKS;
+	return register_shrinker(&nn->nfsd_client_shrinker, "nfsd-client");
+}
+
+int
+nfsd4_init_leases_net(struct nfsd_net *nn)
 {
 	struct sysinfo si;
 	u64 max_clients;
@@ -4364,6 +4397,8 @@  void nfsd4_init_leases_net(struct nfsd_net *nn)
 	nn->nfs4_max_clients = max_t(int, max_clients, NFS4_CLIENTS_PER_GB);
 
 	atomic_set(&nn->nfsd_courtesy_client_count, 0);
+	atomic_set(&nn->nfsd_client_shrinker_cb_count, 0);
+	return nfsd_register_client_shrinker(nn);
 }
 
 static void init_nfs4_replay(struct nfs4_replay *rp)
@@ -5872,12 +5907,17 @@  static void
 nfs4_get_client_reaplist(struct nfsd_net *nn, struct list_head *reaplist,
 				struct laundry_time *lt)
 {
-	unsigned int maxreap, reapcnt = 0;
+	unsigned int maxreap = 0, reapcnt = 0;
+	int cb_cnt;
 	struct list_head *pos, *next;
 	struct nfs4_client *clp;
 
-	maxreap = (atomic_read(&nn->nfs4_client_count) >= nn->nfs4_max_clients) ?
-			NFSD_CLIENT_MAX_TRIM_PER_RUN : 0;
+	cb_cnt = atomic_read(&nn->nfsd_client_shrinker_cb_count);
+	if (atomic_read(&nn->nfs4_client_count) >= nn->nfs4_max_clients ||
+							cb_cnt) {
+		maxreap = NFSD_CLIENT_MAX_TRIM_PER_RUN;
+		atomic_set(&nn->nfsd_client_shrinker_cb_count, 0);
+	}
 	INIT_LIST_HEAD(reaplist);
 	spin_lock(&nn->client_lock);
 	list_for_each_safe(pos, next, &nn->client_lru) {
@@ -5903,6 +5943,8 @@  nfs4_get_client_reaplist(struct nfsd_net *nn, struct list_head *reaplist,
 		}
 	}
 	spin_unlock(&nn->client_lock);
+	if (cb_cnt)
+		atomic_add(reapcnt, &nn->nfsd_client_shrinker_reapcount);
 }
 
 static time64_t
@@ -5943,6 +5985,8 @@  nfs4_laundromat(struct nfsd_net *nn)
 		list_del_init(&clp->cl_lru);
 		expire_client(clp);
 	}
+	if (atomic_read(&nn->nfsd_client_shrinker_cb_count) > 0)
+		lt.new_timeo = NFSD_LAUNDROMAT_MINTIMEOUT;
 	spin_lock(&state_lock);
 	list_for_each_safe(pos, next, &nn->del_recall_lru) {
 		dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 917fa1892fd2..597a26ad4183 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -1481,11 +1481,12 @@  static __net_init int nfsd_init_net(struct net *net)
 		goto out_idmap_error;
 	nn->nfsd_versions = NULL;
 	nn->nfsd4_minorversions = NULL;
+	retval = nfsd4_init_leases_net(nn);
+	if (retval)
+		goto out_drc_error;
 	retval = nfsd_reply_cache_init(nn);
 	if (retval)
 		goto out_drc_error;
-	nfsd4_init_leases_net(nn);
-
 	get_random_bytes(&nn->siphash_key, sizeof(nn->siphash_key));
 	seqlock_init(&nn->writeverf_lock);
 
@@ -1507,6 +1508,7 @@  static __net_exit void nfsd_exit_net(struct net *net)
 	nfsd_idmap_shutdown(net);
 	nfsd_export_shutdown(net);
 	nfsd_netns_free_versions(net_generic(net, nfsd_net_id));
+	nfsd4_leases_net_shutdown(nn);
 }
 
 static struct pernet_operations nfsd_net_ops = {
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 57a468ed85c3..7e05ab7a3532 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -498,7 +498,11 @@  extern void unregister_cld_notifier(void);
 extern void nfsd4_ssc_init_umount_work(struct nfsd_net *nn);
 #endif
 
-extern void nfsd4_init_leases_net(struct nfsd_net *nn);
+extern int nfsd4_init_leases_net(struct nfsd_net *nn);
+static inline void nfsd4_leases_net_shutdown(struct nfsd_net *nn)
+{
+	unregister_shrinker(&nn->nfsd_client_shrinker);
+};
 
 #else /* CONFIG_NFSD_V4 */
 static inline int nfsd4_is_junction(struct dentry *dentry)
@@ -506,7 +510,8 @@  static inline int nfsd4_is_junction(struct dentry *dentry)
 	return 0;
 }
 
-static inline void nfsd4_init_leases_net(struct nfsd_net *nn) {};
+static inline int nfsd4_init_leases_net(struct nfsd_net *nn) { return 0; };
+static inline void nfsd4_leases_net_shutdown(struct nfsd_net *nn) { };
 
 #define register_cld_notifier() 0
 #define unregister_cld_notifier() do { } while(0)