Message ID | 1661896113-8013-3-git-send-email-dai.ngo@oracle.com (mailing list archive)
---|---
State | New, archived
Series | NFSD: memory shrinker for NFSv4 clients
> On Aug 30, 2022, at 5:48 PM, Dai Ngo <dai.ngo@oracle.com> wrote: > > Add the courtesy client shrinker to react to low memory condition > triggered by the memory shrinker. > > On the shrinker's count callback, we increment a callback counter > and return the number of outstanding courtesy clients. When the > laundromat runs, it checks if this counter is not zero and starts > reaping old courtesy clients. The maximum number of clients to be > reaped is limited to NFSD_CIENT_MAX_TRIM_PER_RUN (128). This limit > is to prevent the laundromat from spending too much time reaping > the clients and not processing other tasks in a timely manner. > > The laundromat is rescheduled to run sooner if it detects low > low memory condition and there are more clients to reap. > > On the shrinker's scan callback, we return the number of clients > That were reaped since the last scan callback. We can not reap > the clients on the scan callback context since destroying the > client might require call into the underlying filesystem or other > subsystems which might allocate memory which can cause deadlock. > > Signed-off-by: Dai Ngo <dai.ngo@oracle.com> > --- > fs/nfsd/netns.h | 3 +++ > fs/nfsd/nfs4state.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++---- > fs/nfsd/nfsctl.c | 6 ++++-- > fs/nfsd/nfsd.h | 9 +++++++-- > 4 files changed, 62 insertions(+), 8 deletions(-) > > diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h > index 2695dff1378a..2a604951623f 100644 > --- a/fs/nfsd/netns.h > +++ b/fs/nfsd/netns.h > @@ -194,6 +194,9 @@ struct nfsd_net { > int nfs4_max_clients; > > atomic_t nfsd_courtesy_client_count; > + atomic_t nfsd_client_shrinker_cb_count; > + atomic_t nfsd_client_shrinker_reapcount; > + struct shrinker nfsd_client_shrinker; > }; > > /* Simple check to find out if a given net was properly initialized */ > diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c > index eaf7b4dcea33..9aed9eed1892 100644 > --- a/fs/nfsd/nfs4state.c > +++ b/fs/nfsd/nfs4state.c > @@ -4343,7 +4343,40 @@ nfsd4_init_slabs(void) > return -ENOMEM; > } > > -void nfsd4_init_leases_net(struct nfsd_net *nn) > +static unsigned long > +nfsd_courtesy_client_count(struct shrinker *shrink, struct shrink_control *sc) I find it confusing to have a function and a variable with exactly the same name, especially if they are related. Maybe the variable name can be nfsd_courtesy_client_num ? > +{l > + struct nfsd_net *nn = container_of(shrink, > + struct nfsd_net, nfsd_client_shrinker); > + > + atomic_inc(&nn->nfsd_client_shrinker_cb_count); > + mod_delayed_work(laundry_wq, &nn->laundromat_work, 0); > + return (unsigned long)atomic_read(&nn->nfsd_courtesy_client_count); > +} > + > +static unsigned long > +nfsd_courtesy_client_scan(struct shrinker *shrink, struct shrink_control *sc) > +{ > + struct nfsd_net *nn = container_of(shrink, > + struct nfsd_net, nfsd_client_shrinker); > + unsigned long cnt; > + > + cnt = atomic_read(&nn->nfsd_client_shrinker_reapcount); > + atomic_set(&nn->nfsd_client_shrinker_reapcount, 0); > + return cnt; > +} > + > +static int > +nfsd_register_client_shrinker(struct nfsd_net *nn) > +{ > + nn->nfsd_client_shrinker.scan_objects = nfsd_courtesy_client_scan; > + nn->nfsd_client_shrinker.count_objects = nfsd_courtesy_client_count; > + nn->nfsd_client_shrinker.seeks = DEFAULT_SEEKS; > + return register_shrinker(&nn->nfsd_client_shrinker, "nfsd-client"); > +} Nit: Move this into nfsd4_init_leases_net(). I don't see added value for having a one-time use helper for this code. 
> + > +int > +nfsd4_init_leases_net(struct nfsd_net *nn) > { > struct sysinfo si; > u64 max_clients; > @@ -4364,6 +4397,8 @@ void nfsd4_init_leases_net(struct nfsd_net *nn) > nn->nfs4_max_clients = max_t(int, max_clients, NFS4_CLIENTS_PER_GB); > > atomic_set(&nn->nfsd_courtesy_client_count, 0); > + atomic_set(&nn->nfsd_client_shrinker_cb_count, 0); > + return nfsd_register_client_shrinker(nn); > } > > static void init_nfs4_replay(struct nfs4_replay *rp) > @@ -5872,12 +5907,17 @@ static void > nfs4_get_client_reaplist(struct nfsd_net *nn, struct list_head *reaplist, > struct laundry_time *lt) > { > - unsigned int maxreap, reapcnt = 0; > + unsigned int maxreap = 0, reapcnt = 0; > + int cb_cnt; Nit: Reverse christmas tree, please. cb_cnt goes at the end of the variable definitions. > struct list_head *pos, *next; > struct nfs4_client *clp; > > - maxreap = (atomic_read(&nn->nfs4_client_count) >= nn->nfs4_max_clients) ? > - NFSD_CLIENT_MAX_TRIM_PER_RUN : 0; > + cb_cnt = atomic_read(&nn->nfsd_client_shrinker_cb_count); > + if (atomic_read(&nn->nfs4_client_count) >= nn->nfs4_max_clients || > + cb_cnt) { > + maxreap = NFSD_CLIENT_MAX_TRIM_PER_RUN; > + atomic_set(&nn->nfsd_client_shrinker_cb_count, 0); > + } I'm not terribly happy with this, but I don't have a better suggestion at the moment. Let me think about it. > INIT_LIST_HEAD(reaplist); > spin_lock(&nn->client_lock); > list_for_each_safe(pos, next, &nn->client_lru) { > @@ -5903,6 +5943,8 @@ nfs4_get_client_reaplist(struct nfsd_net *nn, struct list_head *reaplist, > } > } > spin_unlock(&nn->client_lock); > + if (cb_cnt) > + atomic_add(reapcnt, &nn->nfsd_client_shrinker_reapcount); > } > > static time64_t > @@ -5943,6 +5985,8 @@ nfs4_laundromat(struct nfsd_net *nn) > list_del_init(&clp->cl_lru); > expire_client(clp); > } > + if (atomic_read(&nn->nfsd_client_shrinker_cb_count) > 0) > + lt.new_timeo = NFSD_LAUNDROMAT_MINTIMEOUT; > spin_lock(&state_lock); > list_for_each_safe(pos, next, &nn->del_recall_lru) { > dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); > diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c > index 917fa1892fd2..597a26ad4183 100644 > --- a/fs/nfsd/nfsctl.c > +++ b/fs/nfsd/nfsctl.c > @@ -1481,11 +1481,12 @@ static __net_init int nfsd_init_net(struct net *net) > goto out_idmap_error; > nn->nfsd_versions = NULL; > nn->nfsd4_minorversions = NULL; > + retval = nfsd4_init_leases_net(nn); > + if (retval) > + goto out_drc_error; > retval = nfsd_reply_cache_init(nn); > if (retval) > goto out_drc_error; > - nfsd4_init_leases_net(nn); > - > get_random_bytes(&nn->siphash_key, sizeof(nn->siphash_key)); > seqlock_init(&nn->writeverf_lock); > > @@ -1507,6 +1508,7 @@ static __net_exit void nfsd_exit_net(struct net *net) > nfsd_idmap_shutdown(net); > nfsd_export_shutdown(net); > nfsd_netns_free_versions(net_generic(net, nfsd_net_id)); > + nfsd4_leases_net_shutdown(nn); > } > > static struct pernet_operations nfsd_net_ops = { > diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h > index 57a468ed85c3..7e05ab7a3532 100644 > --- a/fs/nfsd/nfsd.h > +++ b/fs/nfsd/nfsd.h > @@ -498,7 +498,11 @@ extern void unregister_cld_notifier(void); > extern void nfsd4_ssc_init_umount_work(struct nfsd_net *nn); > #endif > > -extern void nfsd4_init_leases_net(struct nfsd_net *nn); > +extern int nfsd4_init_leases_net(struct nfsd_net *nn); > +static inline void nfsd4_leases_net_shutdown(struct nfsd_net *nn) > +{ > + unregister_shrinker(&nn->nfsd_client_shrinker); > +}; Nit: please move this into nfs4state.c next to nfsd4_init_leases_net(). 
static inline is used typically for performance-sensitive helpers, and this adds a dependency on unregister_shrinker in every file that includes nfsd.h. > #else /* CONFIG_NFSD_V4 */ > static inline int nfsd4_is_junction(struct dentry *dentry) > @@ -506,7 +510,8 @@ static inline int nfsd4_is_junction(struct dentry *dentry) > return 0; > } > > -static inline void nfsd4_init_leases_net(struct nfsd_net *nn) {}; > +static inline int nfsd4_init_leases_net(struct nfsd_net *nn) { return 0; }; > +static inline void nfsd4_leases_net_shutdown(struct nfsd_net *nn) { }; > > #define register_cld_notifier() 0 > #define unregister_cld_notifier() do { } while(0) > -- > 2.9.5 > -- Chuck Lever
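Taken together, the two placement nits above amount to something like the following sketch: fold the one-shot registration helper into nfsd4_init_leases_net() and keep the unregister call out of nfsd.h by making nfsd4_leases_net_shutdown() an ordinary function in nfs4state.c. The field, callback and shrinker names are taken from the patch itself; the elided setup is unchanged, and this is only an illustration of the suggested layout, not the applied code.

        /* fs/nfsd/nfs4state.c -- sketch only, not the applied patch */

        int nfsd4_init_leases_net(struct nfsd_net *nn)
        {
                /* ... existing lease and nfs4_max_clients setup elided ... */

                atomic_set(&nn->nfsd_courtesy_client_count, 0);
                atomic_set(&nn->nfsd_client_shrinker_cb_count, 0);

                /* body of the former nfsd_register_client_shrinker() helper */
                nn->nfsd_client_shrinker.scan_objects = nfsd_courtesy_client_scan;
                nn->nfsd_client_shrinker.count_objects = nfsd_courtesy_client_count;
                nn->nfsd_client_shrinker.seeks = DEFAULT_SEEKS;
                return register_shrinker(&nn->nfsd_client_shrinker, "nfsd-client");
        }

        /* out of line so nfsd.h no longer needs unregister_shrinker() */
        void nfsd4_leases_net_shutdown(struct nfsd_net *nn)
        {
                unregister_shrinker(&nn->nfsd_client_shrinker);
        }

fs/nfsd/nfsd.h would then carry only the two prototypes under CONFIG_NFSD_V4 (extern int nfsd4_init_leases_net(struct nfsd_net *nn); extern void nfsd4_leases_net_shutdown(struct nfsd_net *nn);), with the no-op stubs for the !CONFIG_NFSD_V4 case unchanged.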
Hi Chuck, On 8/31/22 7:30 AM, Chuck Lever III wrote: >> struct list_head *pos, *next; >> struct nfs4_client *clp; >> >> - maxreap = (atomic_read(&nn->nfs4_client_count) >= nn->nfs4_max_clients) ? >> - NFSD_CLIENT_MAX_TRIM_PER_RUN : 0; >> + cb_cnt = atomic_read(&nn->nfsd_client_shrinker_cb_count); >> + if (atomic_read(&nn->nfs4_client_count) >= nn->nfs4_max_clients || >> + cb_cnt) { >> + maxreap = NFSD_CLIENT_MAX_TRIM_PER_RUN; >> + atomic_set(&nn->nfsd_client_shrinker_cb_count, 0); >> + } > I'm not terribly happy with this, but I don't have a better suggestion > at the moment. Let me think about it. Do you have any suggestion to improve this, I want to incorporate it before sending out v5? Thanks, -Dai
> On Sep 1, 2022, at 9:56 PM, Dai Ngo <dai.ngo@oracle.com> wrote: > > Hi Chuck, > > On 8/31/22 7:30 AM, Chuck Lever III wrote: >>> struct list_head *pos, *next; >>> struct nfs4_client *clp; >>> >>> - maxreap = (atomic_read(&nn->nfs4_client_count) >= nn->nfs4_max_clients) ? >>> - NFSD_CLIENT_MAX_TRIM_PER_RUN : 0; >>> + cb_cnt = atomic_read(&nn->nfsd_client_shrinker_cb_count); >>> + if (atomic_read(&nn->nfs4_client_count) >= nn->nfs4_max_clients || >>> + cb_cnt) { >>> + maxreap = NFSD_CLIENT_MAX_TRIM_PER_RUN; >>> + atomic_set(&nn->nfsd_client_shrinker_cb_count, 0); >>> + } >> I'm not terribly happy with this, but I don't have a better suggestion >> at the moment. Let me think about it. > > Do you have any suggestion to improve this, I want to incorporate it > before sending out v5? Let's consider some broad outlines... With regards to parametrizing the reaplist behavior, you want a normal laundromat run to reap zero or more courtesy clients. You want a shrinker laundromat run to reap more than zero. I think you want a minreap variable as well as a maxreap variable in there to control how the reaplist is built. Making @minreap a function parameter rather than a global atomic would be a plus for me, but maybe that's not practical. But I would prefer a more straightforward approach overall. The proposed approach seems tricky and brittle, and needs a lot of explaining to understand. Other subsystems seem to get away with something simpler. Can nfsd_courtesy_client_count() simply reduce nn->nfs4_max_clients, kick the laundromat, and then return 0? Then get rid of nfsd_courtesy_client_scan(). Or, nfsd_courtesy_client_count() could return nfsd_couresy_client_count. nfsd_courtesy_client_scan() could then look something like this: if ((sc->gfp_mask & GFP_KERNEL) != GFP_KERNEL) return SHRINK_STOP; nfsd_get_client_reaplist(nn, reaplist, lt); list_for_each_safe(pos, next, &reaplist) { clp = list_entry(pos, struct nfs4_client, cl_lru); trace_nfsd_clid_purged(&clp->cl_clientid); list_del_init(&clp->cl_lru); expire_client(clp); count++; } return count; Obviously you would need to refactor common code into helper functions. -- Chuck Lever
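The first alternative floated above ("reduce nn->nfs4_max_clients, kick the laundromat, and return 0") might look roughly like the sketch below. The choice of floor value and the idea of dropping scan_objects entirely are illustrative assumptions, not anything settled in the thread; the identifiers are the ones used in the patch.

        static unsigned long
        nfsd_courtesy_client_count(struct shrinker *shrink, struct shrink_control *sc)
        {
                struct nfsd_net *nn = container_of(shrink, struct nfsd_net,
                                                   nfsd_client_shrinker);

                /* Hypothetical: tighten the client ceiling by one trim batch so
                 * the next laundromat pass builds a non-empty reaplist;
                 * NFS4_CLIENTS_PER_GB is reused here as an arbitrary floor. */
                if (nn->nfs4_max_clients - NFSD_CLIENT_MAX_TRIM_PER_RUN >= NFS4_CLIENTS_PER_GB)
                        nn->nfs4_max_clients -= NFSD_CLIENT_MAX_TRIM_PER_RUN;

                mod_delayed_work(laundry_wq, &nn->laundromat_work, 0);
                return 0;       /* nothing for the shrinker core to scan directly */
        }

As the replies below point out, the drawback of this shape is that a lowered ceiling throttles new client creation as well, not just courtesy clients.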
On 9/1/22 9:32 PM, Chuck Lever III wrote: > >> On Sep 1, 2022, at 9:56 PM, Dai Ngo <dai.ngo@oracle.com> wrote: >> >> Hi Chuck, >> >> On 8/31/22 7:30 AM, Chuck Lever III wrote: >>>> struct list_head *pos, *next; >>>> struct nfs4_client *clp; >>>> >>>> - maxreap = (atomic_read(&nn->nfs4_client_count) >= nn->nfs4_max_clients) ? >>>> - NFSD_CLIENT_MAX_TRIM_PER_RUN : 0; >>>> + cb_cnt = atomic_read(&nn->nfsd_client_shrinker_cb_count); >>>> + if (atomic_read(&nn->nfs4_client_count) >= nn->nfs4_max_clients || >>>> + cb_cnt) { >>>> + maxreap = NFSD_CLIENT_MAX_TRIM_PER_RUN; >>>> + atomic_set(&nn->nfsd_client_shrinker_cb_count, 0); >>>> + } >>> I'm not terribly happy with this, but I don't have a better suggestion >>> at the moment. Let me think about it. >> Do you have any suggestion to improve this, I want to incorporate it >> before sending out v5? > Let's consider some broad outlines... > > With regards to parametrizing the reaplist behavior, you want > a normal laundromat run to reap zero or more courtesy clients. > You want a shrinker laundromat run to reap more than zero. I > think you want a minreap variable as well as a maxreap variable > in there to control how the reaplist is built. Making @minreap > a function parameter rather than a global atomic would be a > plus for me, but maybe that's not practical. I'm not quite understand how the minreap is used, I think it will make the code more complex. > > But I would prefer a more straightforward approach overall. The > proposed approach seems tricky and brittle, and needs a lot of > explaining to understand. Other subsystems seem to get away with > something simpler. > > Can nfsd_courtesy_client_count() simply reduce > nn->nfs4_max_clients, kick the laundromat, and then return 0? > Then get rid of nfsd_courtesy_client_scan(). I need to think more about this approach. However at first glance, nn->nfs4_max_clients is used to control how many clients, including active and courtesy clients, are allowed in the system. If we lower this count, it also prevent new clients from connecting to the system. So now the shrinker mechanism does more than just getting rid of unused resources, maybe that's ok? > > Or, nfsd_courtesy_client_count() could return > nfsd_couresy_client_count. nfsd_courtesy_client_scan() could > then look something like this: > > if ((sc->gfp_mask & GFP_KERNEL) != GFP_KERNEL) > return SHRINK_STOP; > > nfsd_get_client_reaplist(nn, reaplist, lt); > list_for_each_safe(pos, next, &reaplist) { > clp = list_entry(pos, struct nfs4_client, cl_lru); > trace_nfsd_clid_purged(&clp->cl_clientid); > list_del_init(&clp->cl_lru); > expire_client(clp); > count++; > } > return count; This does not work, we cannot expire clients on the context of scan callback due to deadlock problem. I will experiment with ways to get rid of the scan function to make the logic simpler. Thanks, -Dai > > Obviously you would need to refactor common code into helper > functions. > > -- > Chuck Lever >
> On Sep 2, 2022, at 12:44 PM, Dai Ngo <dai.ngo@oracle.com> wrote: > > On 9/1/22 9:32 PM, Chuck Lever III wrote: >> >>> On Sep 1, 2022, at 9:56 PM, Dai Ngo <dai.ngo@oracle.com> wrote: >>> >>> Hi Chuck, >>> >>> On 8/31/22 7:30 AM, Chuck Lever III wrote: >>>>> struct list_head *pos, *next; >>>>> struct nfs4_client *clp; >>>>> >>>>> - maxreap = (atomic_read(&nn->nfs4_client_count) >= nn->nfs4_max_clients) ? >>>>> - NFSD_CLIENT_MAX_TRIM_PER_RUN : 0; >>>>> + cb_cnt = atomic_read(&nn->nfsd_client_shrinker_cb_count); >>>>> + if (atomic_read(&nn->nfs4_client_count) >= nn->nfs4_max_clients || >>>>> + cb_cnt) { >>>>> + maxreap = NFSD_CLIENT_MAX_TRIM_PER_RUN; >>>>> + atomic_set(&nn->nfsd_client_shrinker_cb_count, 0); >>>>> + } >>>> I'm not terribly happy with this, but I don't have a better suggestion >>>> at the moment. Let me think about it. >>> Do you have any suggestion to improve this, I want to incorporate it >>> before sending out v5? >> Let's consider some broad outlines... >> >> With regards to parametrizing the reaplist behavior, you want >> a normal laundromat run to reap zero or more courtesy clients. >> You want a shrinker laundromat run to reap more than zero. I >> think you want a minreap variable as well as a maxreap variable >> in there to control how the reaplist is built. Making @minreap >> a function parameter rather than a global atomic would be a >> plus for me, but maybe that's not practical. > > I'm not quite understand how the minreap is used, I think it > will make the code more complex. > >> >> But I would prefer a more straightforward approach overall. The >> proposed approach seems tricky and brittle, and needs a lot of >> explaining to understand. Other subsystems seem to get away with >> something simpler. >> >> Can nfsd_courtesy_client_count() simply reduce >> nn->nfs4_max_clients, kick the laundromat, and then return 0? >> Then get rid of nfsd_courtesy_client_scan(). > > I need to think more about this approach. However at first glance, > nn->nfs4_max_clients is used to control how many clients, including > active and courtesy clients, are allowed in the system. If we lower > this count, it also prevent new clients from connecting to the > system. So now the shrinker mechanism does more than just getting > rid of unused resources, maybe that's ok? I don't want to go down the path of kicking out active clients if we don't need to. Maybe you can make the laundromat code more careful to make a distinction between active and courtesy clients. It might sense in the long run to separate the laundromat's processing of courtesy and active clients anyway. >> Or, nfsd_courtesy_client_count() could return >> nfsd_couresy_client_count. nfsd_courtesy_client_scan() could >> then look something like this: >> >> if ((sc->gfp_mask & GFP_KERNEL) != GFP_KERNEL) >> return SHRINK_STOP; >> >> nfsd_get_client_reaplist(nn, reaplist, lt); >> list_for_each_safe(pos, next, &reaplist) { >> clp = list_entry(pos, struct nfs4_client, cl_lru); >> trace_nfsd_clid_purged(&clp->cl_clientid); >> list_del_init(&clp->cl_lru); >> expire_client(clp); >> count++; >> } >> return count; > > This does not work, we cannot expire clients on the context of > scan callback due to deadlock problem. Correct, we don't want to start shrinker laundromat activity if the allocation request specified that it cannot wait. But are you sure it doesn't work if sc_gfp_flags is set to GFP_KERNEL? > I will experiment with ways to get rid of the scan function to > make the logic simpler. -- Chuck Lever
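One way to read the suggestion above about distinguishing active and courtesy clients is a reaplist builder that only ever selects courtesy clients on shrinker-driven runs, so active clients are never pushed out. This sketch assumes the cl_state/NFSD4_COURTESY tracking introduced by the courteous-server work and reuses the existing mark_client_expired_locked() pattern; the function name is hypothetical.

        /* Sketch: shrinker-driven reap pass that considers courtesy clients only. */
        static void
        nfsd_get_courtesy_client_reaplist(struct nfsd_net *nn,
                                          struct list_head *reaplist)
        {
                unsigned int reapcnt = 0;
                struct nfs4_client *clp, *next;

                INIT_LIST_HEAD(reaplist);
                spin_lock(&nn->client_lock);
                list_for_each_entry_safe(clp, next, &nn->client_lru, cl_lru) {
                        if (reapcnt >= NFSD_CLIENT_MAX_TRIM_PER_RUN)
                                break;
                        if (clp->cl_state != NFSD4_COURTESY)
                                continue;
                        if (mark_client_expired_locked(clp))
                                continue;       /* still busy, leave it alone */
                        list_add(&clp->cl_lru, reaplist);
                        reapcnt++;
                }
                spin_unlock(&nn->client_lock);
        }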
On 9/2/22 10:58 AM, Chuck Lever III wrote: >>> Or, nfsd_courtesy_client_count() could return >>> nfsd_couresy_client_count. nfsd_courtesy_client_scan() could >>> then look something like this: >>> >>> if ((sc->gfp_mask & GFP_KERNEL) != GFP_KERNEL) >>> return SHRINK_STOP; >>> >>> nfsd_get_client_reaplist(nn, reaplist, lt); >>> list_for_each_safe(pos, next, &reaplist) { >>> clp = list_entry(pos, struct nfs4_client, cl_lru); >>> trace_nfsd_clid_purged(&clp->cl_clientid); >>> list_del_init(&clp->cl_lru); >>> expire_client(clp); >>> count++; >>> } >>> return count; >> This does not work, we cannot expire clients on the context of >> scan callback due to deadlock problem. > Correct, we don't want to start shrinker laundromat activity if > the allocation request specified that it cannot wait. But are > you sure it doesn't work if sc_gfp_flags is set to GFP_KERNEL? It can be deadlock due to lock ordering problem: ====================================================== WARNING: possible circular locking dependency detected 5.19.0-rc2_sk+ #1 Not tainted ------------------------------------------------------ lck/31847 is trying to acquire lock: ffff88811d268850 (&sb->s_type->i_mutex_key#16){+.+.}-{3:3}, at: btrfs_inode_lock+0x38/0x70 #012but task is already holding lock: ffffffffb41848c0 (fs_reclaim){+.+.}-{0:0}, at: __alloc_pages_slowpath.constprop.0+0x506/0x1db0 #012which lock already depends on the new lock. #012the existing dependency chain (in reverse order) is: #012-> #1 (fs_reclaim){+.+.}-{0:0}: fs_reclaim_acquire+0xc0/0x100 __kmalloc+0x51/0x320 btrfs_buffered_write+0x2eb/0xd90 btrfs_do_write_iter+0x6bf/0x11c0 do_iter_readv_writev+0x2bb/0x5a0 do_iter_write+0x131/0x630 nfsd_vfs_write+0x4da/0x1900 [nfsd] nfsd4_write+0x2ac/0x760 [nfsd] nfsd4_proc_compound+0xce8/0x23e0 [nfsd] nfsd_dispatch+0x4ed/0xc10 [nfsd] svc_process_common+0xd3f/0x1b00 [sunrpc] svc_process+0x361/0x4f0 [sunrpc] nfsd+0x2d6/0x570 [nfsd] kthread+0x2a1/0x340 ret_from_fork+0x22/0x30 #012-> #0 (&sb->s_type->i_mutex_key#16){+.+.}-{3:3}: __lock_acquire+0x318d/0x7830 lock_acquire+0x1bb/0x500 down_write+0x82/0x130 btrfs_inode_lock+0x38/0x70 btrfs_sync_file+0x280/0x1010 nfsd_file_flush.isra.0+0x1b/0x220 [nfsd] nfsd_file_put+0xd4/0x110 [nfsd] release_all_access+0x13a/0x220 [nfsd] nfs4_free_ol_stateid+0x40/0x90 [nfsd] free_ol_stateid_reaplist+0x131/0x210 [nfsd] release_openowner+0xf7/0x160 [nfsd] __destroy_client+0x3cc/0x740 [nfsd] nfsd_cc_lru_scan+0x271/0x410 [nfsd] shrink_slab.constprop.0+0x31e/0x7d0 shrink_node+0x54b/0xe50 try_to_free_pages+0x394/0xba0 __alloc_pages_slowpath.constprop.0+0x5d2/0x1db0 __alloc_pages+0x4d6/0x580 __handle_mm_fault+0xc25/0x2810 handle_mm_fault+0x136/0x480 do_user_addr_fault+0x3d8/0xec0 exc_page_fault+0x5d/0xc0 asm_exc_page_fault+0x27/0x30 -Dai
> On Sep 2, 2022, at 3:34 PM, Dai Ngo <dai.ngo@oracle.com> wrote: > > On 9/2/22 10:58 AM, Chuck Lever III wrote: >>>> Or, nfsd_courtesy_client_count() could return >>>> nfsd_couresy_client_count. nfsd_courtesy_client_scan() could >>>> then look something like this: >>>> >>>> if ((sc->gfp_mask & GFP_KERNEL) != GFP_KERNEL) >>>> return SHRINK_STOP; >>>> >>>> nfsd_get_client_reaplist(nn, reaplist, lt); >>>> list_for_each_safe(pos, next, &reaplist) { >>>> clp = list_entry(pos, struct nfs4_client, cl_lru); >>>> trace_nfsd_clid_purged(&clp->cl_clientid); >>>> list_del_init(&clp->cl_lru); >>>> expire_client(clp); >>>> count++; >>>> } >>>> return count; >>> This does not work, we cannot expire clients on the context of >>> scan callback due to deadlock problem. >> Correct, we don't want to start shrinker laundromat activity if >> the allocation request specified that it cannot wait. But are >> you sure it doesn't work if sc_gfp_flags is set to GFP_KERNEL? > > It can be deadlock due to lock ordering problem: > > ====================================================== > WARNING: possible circular locking dependency detected > 5.19.0-rc2_sk+ #1 Not tainted > ------------------------------------------------------ > lck/31847 is trying to acquire lock: > ffff88811d268850 (&sb->s_type->i_mutex_key#16){+.+.}-{3:3}, at: btrfs_inode_lock+0x38/0x70 > #012but task is already holding lock: > ffffffffb41848c0 (fs_reclaim){+.+.}-{0:0}, at: __alloc_pages_slowpath.constprop.0+0x506/0x1db0 > #012which lock already depends on the new lock. > #012the existing dependency chain (in reverse order) is: > > #012-> #1 (fs_reclaim){+.+.}-{0:0}: > fs_reclaim_acquire+0xc0/0x100 > __kmalloc+0x51/0x320 > btrfs_buffered_write+0x2eb/0xd90 > btrfs_do_write_iter+0x6bf/0x11c0 > do_iter_readv_writev+0x2bb/0x5a0 > do_iter_write+0x131/0x630 > nfsd_vfs_write+0x4da/0x1900 [nfsd] > nfsd4_write+0x2ac/0x760 [nfsd] > nfsd4_proc_compound+0xce8/0x23e0 [nfsd] > nfsd_dispatch+0x4ed/0xc10 [nfsd] > svc_process_common+0xd3f/0x1b00 [sunrpc] > svc_process+0x361/0x4f0 [sunrpc] > nfsd+0x2d6/0x570 [nfsd] > kthread+0x2a1/0x340 > ret_from_fork+0x22/0x30 > > #012-> #0 (&sb->s_type->i_mutex_key#16){+.+.}-{3:3}: > __lock_acquire+0x318d/0x7830 > lock_acquire+0x1bb/0x500 > down_write+0x82/0x130 > btrfs_inode_lock+0x38/0x70 > btrfs_sync_file+0x280/0x1010 > nfsd_file_flush.isra.0+0x1b/0x220 [nfsd] > nfsd_file_put+0xd4/0x110 [nfsd] > release_all_access+0x13a/0x220 [nfsd] > nfs4_free_ol_stateid+0x40/0x90 [nfsd] > free_ol_stateid_reaplist+0x131/0x210 [nfsd] > release_openowner+0xf7/0x160 [nfsd] > __destroy_client+0x3cc/0x740 [nfsd] > nfsd_cc_lru_scan+0x271/0x410 [nfsd] > shrink_slab.constprop.0+0x31e/0x7d0 > shrink_node+0x54b/0xe50 > try_to_free_pages+0x394/0xba0 > __alloc_pages_slowpath.constprop.0+0x5d2/0x1db0 > __alloc_pages+0x4d6/0x580 > __handle_mm_fault+0xc25/0x2810 > handle_mm_fault+0x136/0x480 > do_user_addr_fault+0x3d8/0xec0 > exc_page_fault+0x5d/0xc0 > asm_exc_page_fault+0x27/0x30 Is this deadlock possible only via the filecache close paths? I can't think of a reason the laundromat has to wait for each file to be flushed and closed -- the laundromat should be able to "put" these files and allow the filecache LRU to flush and close in the background. -- Chuck Lever
On 9/2/22 6:26 PM, Chuck Lever III wrote: >> On Sep 2, 2022, at 3:34 PM, Dai Ngo <dai.ngo@oracle.com> wrote: >> >> On 9/2/22 10:58 AM, Chuck Lever III wrote: >>>>> Or, nfsd_courtesy_client_count() could return >>>>> nfsd_couresy_client_count. nfsd_courtesy_client_scan() could >>>>> then look something like this: >>>>> >>>>> if ((sc->gfp_mask & GFP_KERNEL) != GFP_KERNEL) >>>>> return SHRINK_STOP; >>>>> >>>>> nfsd_get_client_reaplist(nn, reaplist, lt); >>>>> list_for_each_safe(pos, next, &reaplist) { >>>>> clp = list_entry(pos, struct nfs4_client, cl_lru); >>>>> trace_nfsd_clid_purged(&clp->cl_clientid); >>>>> list_del_init(&clp->cl_lru); >>>>> expire_client(clp); >>>>> count++; >>>>> } >>>>> return count; >>>> This does not work, we cannot expire clients on the context of >>>> scan callback due to deadlock problem. >>> Correct, we don't want to start shrinker laundromat activity if >>> the allocation request specified that it cannot wait. But are >>> you sure it doesn't work if sc_gfp_flags is set to GFP_KERNEL? >> It can be deadlock due to lock ordering problem: >> >> ====================================================== >> WARNING: possible circular locking dependency detected >> 5.19.0-rc2_sk+ #1 Not tainted >> ------------------------------------------------------ >> lck/31847 is trying to acquire lock: >> ffff88811d268850 (&sb->s_type->i_mutex_key#16){+.+.}-{3:3}, at: btrfs_inode_lock+0x38/0x70 >> #012but task is already holding lock: >> ffffffffb41848c0 (fs_reclaim){+.+.}-{0:0}, at: __alloc_pages_slowpath.constprop.0+0x506/0x1db0 >> #012which lock already depends on the new lock. >> #012the existing dependency chain (in reverse order) is: >> >> #012-> #1 (fs_reclaim){+.+.}-{0:0}: >> fs_reclaim_acquire+0xc0/0x100 >> __kmalloc+0x51/0x320 >> btrfs_buffered_write+0x2eb/0xd90 >> btrfs_do_write_iter+0x6bf/0x11c0 >> do_iter_readv_writev+0x2bb/0x5a0 >> do_iter_write+0x131/0x630 >> nfsd_vfs_write+0x4da/0x1900 [nfsd] >> nfsd4_write+0x2ac/0x760 [nfsd] >> nfsd4_proc_compound+0xce8/0x23e0 [nfsd] >> nfsd_dispatch+0x4ed/0xc10 [nfsd] >> svc_process_common+0xd3f/0x1b00 [sunrpc] >> svc_process+0x361/0x4f0 [sunrpc] >> nfsd+0x2d6/0x570 [nfsd] >> kthread+0x2a1/0x340 >> ret_from_fork+0x22/0x30 >> >> #012-> #0 (&sb->s_type->i_mutex_key#16){+.+.}-{3:3}: >> __lock_acquire+0x318d/0x7830 >> lock_acquire+0x1bb/0x500 >> down_write+0x82/0x130 >> btrfs_inode_lock+0x38/0x70 >> btrfs_sync_file+0x280/0x1010 >> nfsd_file_flush.isra.0+0x1b/0x220 [nfsd] >> nfsd_file_put+0xd4/0x110 [nfsd] >> release_all_access+0x13a/0x220 [nfsd] >> nfs4_free_ol_stateid+0x40/0x90 [nfsd] >> free_ol_stateid_reaplist+0x131/0x210 [nfsd] >> release_openowner+0xf7/0x160 [nfsd] >> __destroy_client+0x3cc/0x740 [nfsd] >> nfsd_cc_lru_scan+0x271/0x410 [nfsd] >> shrink_slab.constprop.0+0x31e/0x7d0 >> shrink_node+0x54b/0xe50 >> try_to_free_pages+0x394/0xba0 >> __alloc_pages_slowpath.constprop.0+0x5d2/0x1db0 >> __alloc_pages+0x4d6/0x580 >> __handle_mm_fault+0xc25/0x2810 >> handle_mm_fault+0x136/0x480 >> do_user_addr_fault+0x3d8/0xec0 >> exc_page_fault+0x5d/0xc0 >> asm_exc_page_fault+0x27/0x30 > Is this deadlock possible only via the filecache close > paths? I'm not sure, but there is another back trace below that shows another problem. > I can't think of a reason the laundromat has to > wait for each file to be flushed and closed -- the > laundromat should be able to "put" these files and allow > the filecache LRU to flush and close in the background. ok, what should we do here, enhancing the laundromat to behave as you describe? 
Here is another stack trace of problem with calling expire_client from 'scan' callback: Sep 3 09:07:35 nfsvmf24 kernel: ------------[ cut here ]------------ Sep 3 09:07:35 nfsvmf24 kernel: workqueue: PF_MEMALLOC task 3525(gmain) is flushing !WQ_MEM_RECLAIM nfsd4_callbacks:0x0 Sep 3 09:07:35 nfsvmf24 kernel: WARNING: CPU: 0 PID: 3525 at kernel/workqueue.c:2625 check_flush_dependency+0x17a/0x350 Sep 3 09:07:35 nfsvmf24 kernel: Modules linked in: rpcsec_gss_krb5 nfsd nfs_acl lockd grace auth_rpcgss sunrpc Sep 3 09:07:35 nfsvmf24 kernel: CPU: 0 PID: 3525 Comm: gmain Kdump: loaded Not tainted 6.0.0-rc3+ #1 Sep 3 09:07:35 nfsvmf24 kernel: Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006 Sep 3 09:07:35 nfsvmf24 kernel: RIP: 0010:check_flush_dependency+0x17a/0x350 Sep 3 09:07:35 nfsvmf24 kernel: Code: 48 c1 ee 03 0f b6 04 06 84 c0 74 08 3c 03 0f 8e b8 01 00 00 41 8b b5 90 05 00 00 49 89 e8 48 c7 c7 c0 4d 06 9a e8 c6 a4 7f 02 <0f> 0b eb 4d 65 4c 8b 2c 25 c0 6e 02 00 4c 89 ef e8 71 65 01 00 49 Sep 3 09:07:35 nfsvmf24 kernel: RSP: 0018:ffff88810c73f4e8 EFLAGS: 00010282 Sep 3 09:07:35 nfsvmf24 kernel: RAX: 0000000000000000 RBX: ffff88811129a800 RCX: 0000000000000000 Sep 3 09:07:35 nfsvmf24 kernel: RDX: 0000000000000001 RSI: 0000000000000004 RDI: ffffed10218e7e93 Sep 3 09:07:35 nfsvmf24 kernel: RBP: 0000000000000000 R08: 0000000000000001 R09: ffffed103afc6ed2 Sep 3 09:07:35 nfsvmf24 kernel: R10: ffff8881d7e3768b R11: ffffed103afc6ed1 R12: 0000000000000000 Sep 3 09:07:35 nfsvmf24 kernel: R13: ffff88810d14cac0 R14: 000000000000000d R15: 000000000000000c Sep 3 09:07:35 nfsvmf24 kernel: FS: 00007fa9a696c700(0000) GS:ffff8881d7e00000(0000) knlGS:0000000000000000 Sep 3 09:07:35 nfsvmf24 kernel: CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 Sep 3 09:07:35 nfsvmf24 kernel: CR2: 00007fe922689c50 CR3: 000000010bdd8000 CR4: 00000000000406f0 Sep 3 09:07:35 nfsvmf24 kernel: Call Trace: Sep 3 09:07:35 nfsvmf24 kernel: <TASK> Sep 3 09:07:35 nfsvmf24 kernel: __flush_workqueue+0x32c/0x1350 Sep 3 09:07:35 nfsvmf24 kernel: ? lock_downgrade+0x680/0x680 Sep 3 09:07:35 nfsvmf24 kernel: ? rcu_read_lock_held_common+0xe/0xa0 Sep 3 09:07:35 nfsvmf24 kernel: ? check_flush_dependency+0x350/0x350 Sep 3 09:07:35 nfsvmf24 kernel: ? lock_release+0x485/0x720 Sep 3 09:07:35 nfsvmf24 kernel: ? __queue_work+0x3cc/0xe10 Sep 3 09:07:35 nfsvmf24 kernel: ? trace_hardirqs_on+0x2d/0x110 Sep 3 09:07:35 nfsvmf24 kernel: ? nfsd4_shutdown_callback+0xc5/0x3d0 [nfsd] Sep 3 09:07:35 nfsvmf24 kernel: nfsd4_shutdown_callback+0xc5/0x3d0 [nfsd] Sep 3 09:07:35 nfsvmf24 kernel: ? lock_release+0x485/0x720 Sep 3 09:07:35 nfsvmf24 kernel: ? nfsd4_probe_callback_sync+0x20/0x20 [nfsd] Sep 3 09:07:35 nfsvmf24 kernel: ? lock_downgrade+0x680/0x680 Sep 3 09:07:35 nfsvmf24 kernel: __destroy_client+0x5ec/0xa60 [nfsd] Sep 3 09:07:35 nfsvmf24 kernel: ? nfsd4_cb_recall_release+0x20/0x20 [nfsd] Sep 3 09:07:35 nfsvmf24 kernel: nfsd_courtesy_client_scan+0x39f/0x850 [nfsd] Sep 3 09:07:35 nfsvmf24 kernel: ? preempt_schedule_notrace+0x74/0xe0 Sep 3 09:07:35 nfsvmf24 kernel: ? client_ctl_write+0x7c0/0x7c0 [nfsd] Sep 3 09:07:35 nfsvmf24 kernel: ? preempt_schedule_notrace_thunk+0x16/0x18 Sep 3 09:07:35 nfsvmf24 kernel: shrink_slab.constprop.0+0x30b/0x7b0 Sep 3 09:07:35 nfsvmf24 kernel: ? unregister_shrinker+0x270/0x270 Sep 3 09:07:35 nfsvmf24 kernel: shrink_node+0x54b/0xe50 Sep 3 09:07:35 nfsvmf24 kernel: try_to_free_pages+0x394/0xba0 Sep 3 09:07:35 nfsvmf24 kernel: ? 
reclaim_pages+0x3b0/0x3b0 Sep 3 09:07:35 nfsvmf24 kernel: __alloc_pages_slowpath.constprop.0+0x636/0x1f30 Sep 3 09:07:35 nfsvmf24 kernel: ? warn_alloc+0x120/0x120 Sep 3 09:07:35 nfsvmf24 kernel: ? __zone_watermark_ok+0x2d0/0x2d0 Sep 3 09:07:35 nfsvmf24 kernel: __alloc_pages+0x4d6/0x580 Sep 3 09:07:35 nfsvmf24 kernel: ? __alloc_pages_slowpath.constprop.0+0x1f30/0x1f30 Sep 3 09:07:35 nfsvmf24 kernel: ? find_held_lock+0x2c/0x110 Sep 3 09:07:35 nfsvmf24 kernel: ? lockdep_hardirqs_on_prepare+0x17b/0x410 Sep 3 09:07:35 nfsvmf24 kernel: ____cache_alloc.part.0+0x586/0x760 Sep 3 09:07:35 nfsvmf24 kernel: ? kmem_cache_alloc+0x193/0x290 Sep 3 09:07:35 nfsvmf24 kernel: kmem_cache_alloc+0x22f/0x290 Sep 3 09:07:35 nfsvmf24 kernel: getname_flags+0xbe/0x4e0 Sep 3 09:07:35 nfsvmf24 kernel: user_path_at_empty+0x23/0x50 Sep 3 09:07:35 nfsvmf24 kernel: inotify_find_inode+0x28/0x120 Sep 3 09:07:35 nfsvmf24 kernel: ? __fget_light+0x19b/0x210 Sep 3 09:07:35 nfsvmf24 kernel: __x64_sys_inotify_add_watch+0x160/0x290 Sep 3 09:07:35 nfsvmf24 kernel: ? __ia32_sys_inotify_rm_watch+0x170/0x170 Sep 3 09:07:35 nfsvmf24 kernel: do_syscall_64+0x3d/0x90 Sep 3 09:07:35 nfsvmf24 kernel: entry_SYSCALL_64_after_hwframe+0x46/0xb0 Sep 3 09:07:35 nfsvmf24 kernel: RIP: 0033:0x7fa9b1a77f37 Sep 3 09:07:35 nfsvmf24 kernel: Code: f0 ff ff 73 01 c3 48 8b 0d 36 7f 2c 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 b8 fe 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 09 7f 2c 00 f7 d8 64 89 01 48 Sep 3 09:07:35 nfsvmf24 kernel: RSP: 002b:00007fa9a696bb28 EFLAGS: 00000202 ORIG_RAX: 00000000000000fe Sep 3 09:07:35 nfsvmf24 kernel: RAX: ffffffffffffffda RBX: 00005638abf5adc0 RCX: 00007fa9b1a77f37 Sep 3 09:07:35 nfsvmf24 kernel: RDX: 0000000001002fce RSI: 00005638abf58080 RDI: 000000000000000d Sep 3 09:07:35 nfsvmf24 kernel: RBP: 00007fa9a696bb54 R08: 00007ffe7f7f1080 R09: 0000000000000000 Sep 3 09:07:35 nfsvmf24 kernel: R10: 00000000002cf948 R11: 0000000000000202 R12: 0000000000000000 Sep 3 09:07:35 nfsvmf24 kernel: R13: 00007fa9b39710e0 R14: 00005638abf5adf0 R15: 00007fa9b317c940 Another behavior about the shrinker is that the 'count' and 'scan' callbacks are not 1-to-1, meaning the 'scan' is not called after every 'count' callback. Not sure what criteria the shrinker use to do the 'scan' callback but it's very late, sometimes after dozens of 'count' callback then there is a 'scan' callback. If we rely on the 'scan' callback then I don't think we react in time to release the courtesy clients. -Dai > > > -- > Chuck Lever > > >
> On Sep 3, 2022, at 1:03 PM, Dai Ngo <dai.ngo@oracle.com> wrote: > > > On 9/2/22 6:26 PM, Chuck Lever III wrote: >>> On Sep 2, 2022, at 3:34 PM, Dai Ngo <dai.ngo@oracle.com> wrote: >>> >>> On 9/2/22 10:58 AM, Chuck Lever III wrote: >>>>>> Or, nfsd_courtesy_client_count() could return >>>>>> nfsd_couresy_client_count. nfsd_courtesy_client_scan() could >>>>>> then look something like this: >>>>>> >>>>>> if ((sc->gfp_mask & GFP_KERNEL) != GFP_KERNEL) >>>>>> return SHRINK_STOP; >>>>>> >>>>>> nfsd_get_client_reaplist(nn, reaplist, lt); >>>>>> list_for_each_safe(pos, next, &reaplist) { >>>>>> clp = list_entry(pos, struct nfs4_client, cl_lru); >>>>>> trace_nfsd_clid_purged(&clp->cl_clientid); >>>>>> list_del_init(&clp->cl_lru); >>>>>> expire_client(clp); >>>>>> count++; >>>>>> } >>>>>> return count; >>>>> This does not work, we cannot expire clients on the context of >>>>> scan callback due to deadlock problem. >>>> Correct, we don't want to start shrinker laundromat activity if >>>> the allocation request specified that it cannot wait. But are >>>> you sure it doesn't work if sc_gfp_flags is set to GFP_KERNEL? >>> It can be deadlock due to lock ordering problem: >>> >>> ====================================================== >>> WARNING: possible circular locking dependency detected >>> 5.19.0-rc2_sk+ #1 Not tainted >>> ------------------------------------------------------ >>> lck/31847 is trying to acquire lock: >>> ffff88811d268850 (&sb->s_type->i_mutex_key#16){+.+.}-{3:3}, at: btrfs_inode_lock+0x38/0x70 >>> #012but task is already holding lock: >>> ffffffffb41848c0 (fs_reclaim){+.+.}-{0:0}, at: __alloc_pages_slowpath.constprop.0+0x506/0x1db0 >>> #012which lock already depends on the new lock. >>> #012the existing dependency chain (in reverse order) is: >>> >>> #012-> #1 (fs_reclaim){+.+.}-{0:0}: >>> fs_reclaim_acquire+0xc0/0x100 >>> __kmalloc+0x51/0x320 >>> btrfs_buffered_write+0x2eb/0xd90 >>> btrfs_do_write_iter+0x6bf/0x11c0 >>> do_iter_readv_writev+0x2bb/0x5a0 >>> do_iter_write+0x131/0x630 >>> nfsd_vfs_write+0x4da/0x1900 [nfsd] >>> nfsd4_write+0x2ac/0x760 [nfsd] >>> nfsd4_proc_compound+0xce8/0x23e0 [nfsd] >>> nfsd_dispatch+0x4ed/0xc10 [nfsd] >>> svc_process_common+0xd3f/0x1b00 [sunrpc] >>> svc_process+0x361/0x4f0 [sunrpc] >>> nfsd+0x2d6/0x570 [nfsd] >>> kthread+0x2a1/0x340 >>> ret_from_fork+0x22/0x30 >>> >>> #012-> #0 (&sb->s_type->i_mutex_key#16){+.+.}-{3:3}: >>> __lock_acquire+0x318d/0x7830 >>> lock_acquire+0x1bb/0x500 >>> down_write+0x82/0x130 >>> btrfs_inode_lock+0x38/0x70 >>> btrfs_sync_file+0x280/0x1010 >>> nfsd_file_flush.isra.0+0x1b/0x220 [nfsd] >>> nfsd_file_put+0xd4/0x110 [nfsd] >>> release_all_access+0x13a/0x220 [nfsd] >>> nfs4_free_ol_stateid+0x40/0x90 [nfsd] >>> free_ol_stateid_reaplist+0x131/0x210 [nfsd] >>> release_openowner+0xf7/0x160 [nfsd] >>> __destroy_client+0x3cc/0x740 [nfsd] >>> nfsd_cc_lru_scan+0x271/0x410 [nfsd] >>> shrink_slab.constprop.0+0x31e/0x7d0 >>> shrink_node+0x54b/0xe50 >>> try_to_free_pages+0x394/0xba0 >>> __alloc_pages_slowpath.constprop.0+0x5d2/0x1db0 >>> __alloc_pages+0x4d6/0x580 >>> __handle_mm_fault+0xc25/0x2810 >>> handle_mm_fault+0x136/0x480 >>> do_user_addr_fault+0x3d8/0xec0 >>> exc_page_fault+0x5d/0xc0 >>> asm_exc_page_fault+0x27/0x30 >> Is this deadlock possible only via the filecache close >> paths? > > I'm not sure, but there is another back trace below that shows > another problem. 
> >> I can't think of a reason the laundromat has to >> wait for each file to be flushed and closed -- the >> laundromat should be able to "put" these files and allow >> the filecache LRU to flush and close in the background. > > ok, what should we do here, enhancing the laundromat to behave > as you describe? What I was suggesting was a longer term strategy for improving the laundromat. In order to scale well in the number of clients, it needs to schedule client expiry and deletion without serializing. (ie, the laundromat itself can identify a set of clients to clean, but then it should pass that list to other workers so it can run again as soon as it needs to -- and that also means it can use more than just one CPU at a time to do its work). In the meantime, it is still valuable to consider a shrinker built around using ->nfsd_courtesy_client_count only. That was one of the two alternatives I suggested before, so I feel we have some consensus there. Please continue to look into that, and we can continue poking at laundromat improvements later. > Here is another stack trace of problem with calling expire_client > from 'scan' callback: > Sep 3 09:07:35 nfsvmf24 kernel: ------------[ cut here ]------------ > Sep 3 09:07:35 nfsvmf24 kernel: workqueue: PF_MEMALLOC task 3525(gmain) is flushing !WQ_MEM_RECLAIM nfsd4_callbacks:0x0 > Sep 3 09:07:35 nfsvmf24 kernel: WARNING: CPU: 0 PID: 3525 at kernel/workqueue.c:2625 check_flush_dependency+0x17a/0x350 > Sep 3 09:07:35 nfsvmf24 kernel: Modules linked in: rpcsec_gss_krb5 nfsd nfs_acl lockd grace auth_rpcgss sunrpc > Sep 3 09:07:35 nfsvmf24 kernel: CPU: 0 PID: 3525 Comm: gmain Kdump: loaded Not tainted 6.0.0-rc3+ #1 > Sep 3 09:07:35 nfsvmf24 kernel: Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006 > Sep 3 09:07:35 nfsvmf24 kernel: RIP: 0010:check_flush_dependency+0x17a/0x350 > Sep 3 09:07:35 nfsvmf24 kernel: Code: 48 c1 ee 03 0f b6 04 06 84 c0 74 08 3c 03 0f 8e b8 01 00 00 41 8b b5 90 05 00 00 49 89 e8 48 c7 c7 c0 4d 06 9a e8 c6 a4 7f 02 <0f> 0b eb 4d 65 4c 8b 2c 25 c0 6e 02 00 4c 89 ef e8 71 65 01 00 49 > Sep 3 09:07:35 nfsvmf24 kernel: RSP: 0018:ffff88810c73f4e8 EFLAGS: 00010282 > Sep 3 09:07:35 nfsvmf24 kernel: RAX: 0000000000000000 RBX: ffff88811129a800 RCX: 0000000000000000 > Sep 3 09:07:35 nfsvmf24 kernel: RDX: 0000000000000001 RSI: 0000000000000004 RDI: ffffed10218e7e93 > Sep 3 09:07:35 nfsvmf24 kernel: RBP: 0000000000000000 R08: 0000000000000001 R09: ffffed103afc6ed2 > Sep 3 09:07:35 nfsvmf24 kernel: R10: ffff8881d7e3768b R11: ffffed103afc6ed1 R12: 0000000000000000 > Sep 3 09:07:35 nfsvmf24 kernel: R13: ffff88810d14cac0 R14: 000000000000000d R15: 000000000000000c > Sep 3 09:07:35 nfsvmf24 kernel: FS: 00007fa9a696c700(0000) GS:ffff8881d7e00000(0000) knlGS:0000000000000000 > Sep 3 09:07:35 nfsvmf24 kernel: CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 > Sep 3 09:07:35 nfsvmf24 kernel: CR2: 00007fe922689c50 CR3: 000000010bdd8000 CR4: 00000000000406f0 > Sep 3 09:07:35 nfsvmf24 kernel: Call Trace: > Sep 3 09:07:35 nfsvmf24 kernel: <TASK> > Sep 3 09:07:35 nfsvmf24 kernel: __flush_workqueue+0x32c/0x1350 > Sep 3 09:07:35 nfsvmf24 kernel: ? lock_downgrade+0x680/0x680 > Sep 3 09:07:35 nfsvmf24 kernel: ? rcu_read_lock_held_common+0xe/0xa0 > Sep 3 09:07:35 nfsvmf24 kernel: ? check_flush_dependency+0x350/0x350 > Sep 3 09:07:35 nfsvmf24 kernel: ? lock_release+0x485/0x720 > Sep 3 09:07:35 nfsvmf24 kernel: ? __queue_work+0x3cc/0xe10 > Sep 3 09:07:35 nfsvmf24 kernel: ? 
trace_hardirqs_on+0x2d/0x110 > Sep 3 09:07:35 nfsvmf24 kernel: ? nfsd4_shutdown_callback+0xc5/0x3d0 [nfsd] > Sep 3 09:07:35 nfsvmf24 kernel: nfsd4_shutdown_callback+0xc5/0x3d0 [nfsd] > Sep 3 09:07:35 nfsvmf24 kernel: ? lock_release+0x485/0x720 > Sep 3 09:07:35 nfsvmf24 kernel: ? nfsd4_probe_callback_sync+0x20/0x20 [nfsd] > Sep 3 09:07:35 nfsvmf24 kernel: ? lock_downgrade+0x680/0x680 > Sep 3 09:07:35 nfsvmf24 kernel: __destroy_client+0x5ec/0xa60 [nfsd] > Sep 3 09:07:35 nfsvmf24 kernel: ? nfsd4_cb_recall_release+0x20/0x20 [nfsd] > Sep 3 09:07:35 nfsvmf24 kernel: nfsd_courtesy_client_scan+0x39f/0x850 [nfsd] > Sep 3 09:07:35 nfsvmf24 kernel: ? preempt_schedule_notrace+0x74/0xe0 > Sep 3 09:07:35 nfsvmf24 kernel: ? client_ctl_write+0x7c0/0x7c0 [nfsd] > Sep 3 09:07:35 nfsvmf24 kernel: ? preempt_schedule_notrace_thunk+0x16/0x18 > Sep 3 09:07:35 nfsvmf24 kernel: shrink_slab.constprop.0+0x30b/0x7b0 > Sep 3 09:07:35 nfsvmf24 kernel: ? unregister_shrinker+0x270/0x270 > Sep 3 09:07:35 nfsvmf24 kernel: shrink_node+0x54b/0xe50 > Sep 3 09:07:35 nfsvmf24 kernel: try_to_free_pages+0x394/0xba0 > Sep 3 09:07:35 nfsvmf24 kernel: ? reclaim_pages+0x3b0/0x3b0 > Sep 3 09:07:35 nfsvmf24 kernel: __alloc_pages_slowpath.constprop.0+0x636/0x1f30 > Sep 3 09:07:35 nfsvmf24 kernel: ? warn_alloc+0x120/0x120 > Sep 3 09:07:35 nfsvmf24 kernel: ? __zone_watermark_ok+0x2d0/0x2d0 > Sep 3 09:07:35 nfsvmf24 kernel: __alloc_pages+0x4d6/0x580 > Sep 3 09:07:35 nfsvmf24 kernel: ? __alloc_pages_slowpath.constprop.0+0x1f30/0x1f30 > Sep 3 09:07:35 nfsvmf24 kernel: ? find_held_lock+0x2c/0x110 > Sep 3 09:07:35 nfsvmf24 kernel: ? lockdep_hardirqs_on_prepare+0x17b/0x410 > Sep 3 09:07:35 nfsvmf24 kernel: ____cache_alloc.part.0+0x586/0x760 > Sep 3 09:07:35 nfsvmf24 kernel: ? kmem_cache_alloc+0x193/0x290 > Sep 3 09:07:35 nfsvmf24 kernel: kmem_cache_alloc+0x22f/0x290 > Sep 3 09:07:35 nfsvmf24 kernel: getname_flags+0xbe/0x4e0 > Sep 3 09:07:35 nfsvmf24 kernel: user_path_at_empty+0x23/0x50 > Sep 3 09:07:35 nfsvmf24 kernel: inotify_find_inode+0x28/0x120 > Sep 3 09:07:35 nfsvmf24 kernel: ? __fget_light+0x19b/0x210 > Sep 3 09:07:35 nfsvmf24 kernel: __x64_sys_inotify_add_watch+0x160/0x290 > Sep 3 09:07:35 nfsvmf24 kernel: ? __ia32_sys_inotify_rm_watch+0x170/0x170 > Sep 3 09:07:35 nfsvmf24 kernel: do_syscall_64+0x3d/0x90 > Sep 3 09:07:35 nfsvmf24 kernel: entry_SYSCALL_64_after_hwframe+0x46/0xb0 > Sep 3 09:07:35 nfsvmf24 kernel: RIP: 0033:0x7fa9b1a77f37 > Sep 3 09:07:35 nfsvmf24 kernel: Code: f0 ff ff 73 01 c3 48 8b 0d 36 7f 2c 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 b8 fe 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 09 7f 2c 00 f7 d8 64 89 01 48 > Sep 3 09:07:35 nfsvmf24 kernel: RSP: 002b:00007fa9a696bb28 EFLAGS: 00000202 ORIG_RAX: 00000000000000fe > Sep 3 09:07:35 nfsvmf24 kernel: RAX: ffffffffffffffda RBX: 00005638abf5adc0 RCX: 00007fa9b1a77f37 > Sep 3 09:07:35 nfsvmf24 kernel: RDX: 0000000001002fce RSI: 00005638abf58080 RDI: 000000000000000d > Sep 3 09:07:35 nfsvmf24 kernel: RBP: 00007fa9a696bb54 R08: 00007ffe7f7f1080 R09: 0000000000000000 > Sep 3 09:07:35 nfsvmf24 kernel: R10: 00000000002cf948 R11: 0000000000000202 R12: 0000000000000000 > Sep 3 09:07:35 nfsvmf24 kernel: R13: 00007fa9b39710e0 R14: 00005638abf5adf0 R15: 00007fa9b317c940 > > Another behavior about the shrinker is that the 'count' and 'scan' > callbacks are not 1-to-1, meaning the 'scan' is not called after > every 'count' callback. 
Not sure what criteria the shrinker use to > do the 'scan' callback but it's very late, sometimes after dozens > of 'count' callback then there is a 'scan' callback. If we rely on > the 'scan' callback then I don't think we react in time to release > the courtesy clients. > > -Dai > >> >> >> -- >> Chuck Lever -- Chuck Lever
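A minimal sketch of the hand-off Chuck describes above: the laundromat builds the reaplist as it does today, then pushes the actual expiry onto another worker so it can be rescheduled immediately and the slow teardown can run on other CPUs. The nfsd_client_reap_work structure, the helper names and the use of system_unbound_wq are all assumptions made for illustration.

        struct nfsd_client_reap_work {          /* hypothetical */
                struct work_struct      work;
                struct list_head        reaplist;
        };

        static void nfsd_reap_clients_fn(struct work_struct *work)
        {
                struct nfsd_client_reap_work *rw =
                        container_of(work, struct nfsd_client_reap_work, work);
                struct nfs4_client *clp, *next;

                list_for_each_entry_safe(clp, next, &rw->reaplist, cl_lru) {
                        list_del_init(&clp->cl_lru);
                        expire_client(clp);     /* may block on callbacks, nfsdcld, I/O */
                }
                kfree(rw);
        }

        /* Called from nfs4_laundromat() after nfs4_get_client_reaplist(). */
        static void nfsd_queue_client_reaps(struct list_head *reaplist)
        {
                struct nfsd_client_reap_work *rw;

                rw = kzalloc(sizeof(*rw), GFP_KERNEL);
                if (!rw)
                        return;         /* caller can fall back to reaping inline */
                INIT_LIST_HEAD(&rw->reaplist);
                list_splice_init(reaplist, &rw->reaplist);
                INIT_WORK(&rw->work, nfsd_reap_clients_fn);
                queue_work(system_unbound_wq, &rw->work);
        }

Splitting a large reaplist across several such work items would let the per-client teardown, which Bruce notes below is mostly waiting on disk, overlap rather than run strictly back to back.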
On 9/3/22 10:29 AM, Chuck Lever III wrote: > >> On Sep 3, 2022, at 1:03 PM, Dai Ngo <dai.ngo@oracle.com> wrote: >> >> >> On 9/2/22 6:26 PM, Chuck Lever III wrote: >>>> On Sep 2, 2022, at 3:34 PM, Dai Ngo <dai.ngo@oracle.com> wrote: >>>> >>>> On 9/2/22 10:58 AM, Chuck Lever III wrote: >>>>>>> Or, nfsd_courtesy_client_count() could return >>>>>>> nfsd_couresy_client_count. nfsd_courtesy_client_scan() could >>>>>>> then look something like this: >>>>>>> >>>>>>> if ((sc->gfp_mask & GFP_KERNEL) != GFP_KERNEL) >>>>>>> return SHRINK_STOP; >>>>>>> >>>>>>> nfsd_get_client_reaplist(nn, reaplist, lt); >>>>>>> list_for_each_safe(pos, next, &reaplist) { >>>>>>> clp = list_entry(pos, struct nfs4_client, cl_lru); >>>>>>> trace_nfsd_clid_purged(&clp->cl_clientid); >>>>>>> list_del_init(&clp->cl_lru); >>>>>>> expire_client(clp); >>>>>>> count++; >>>>>>> } >>>>>>> return count; >>>>>> This does not work, we cannot expire clients on the context of >>>>>> scan callback due to deadlock problem. >>>>> Correct, we don't want to start shrinker laundromat activity if >>>>> the allocation request specified that it cannot wait. But are >>>>> you sure it doesn't work if sc_gfp_flags is set to GFP_KERNEL? >>>> It can be deadlock due to lock ordering problem: >>>> >>>> ====================================================== >>>> WARNING: possible circular locking dependency detected >>>> 5.19.0-rc2_sk+ #1 Not tainted >>>> ------------------------------------------------------ >>>> lck/31847 is trying to acquire lock: >>>> ffff88811d268850 (&sb->s_type->i_mutex_key#16){+.+.}-{3:3}, at: btrfs_inode_lock+0x38/0x70 >>>> #012but task is already holding lock: >>>> ffffffffb41848c0 (fs_reclaim){+.+.}-{0:0}, at: __alloc_pages_slowpath.constprop.0+0x506/0x1db0 >>>> #012which lock already depends on the new lock. >>>> #012the existing dependency chain (in reverse order) is: >>>> >>>> #012-> #1 (fs_reclaim){+.+.}-{0:0}: >>>> fs_reclaim_acquire+0xc0/0x100 >>>> __kmalloc+0x51/0x320 >>>> btrfs_buffered_write+0x2eb/0xd90 >>>> btrfs_do_write_iter+0x6bf/0x11c0 >>>> do_iter_readv_writev+0x2bb/0x5a0 >>>> do_iter_write+0x131/0x630 >>>> nfsd_vfs_write+0x4da/0x1900 [nfsd] >>>> nfsd4_write+0x2ac/0x760 [nfsd] >>>> nfsd4_proc_compound+0xce8/0x23e0 [nfsd] >>>> nfsd_dispatch+0x4ed/0xc10 [nfsd] >>>> svc_process_common+0xd3f/0x1b00 [sunrpc] >>>> svc_process+0x361/0x4f0 [sunrpc] >>>> nfsd+0x2d6/0x570 [nfsd] >>>> kthread+0x2a1/0x340 >>>> ret_from_fork+0x22/0x30 >>>> >>>> #012-> #0 (&sb->s_type->i_mutex_key#16){+.+.}-{3:3}: >>>> __lock_acquire+0x318d/0x7830 >>>> lock_acquire+0x1bb/0x500 >>>> down_write+0x82/0x130 >>>> btrfs_inode_lock+0x38/0x70 >>>> btrfs_sync_file+0x280/0x1010 >>>> nfsd_file_flush.isra.0+0x1b/0x220 [nfsd] >>>> nfsd_file_put+0xd4/0x110 [nfsd] >>>> release_all_access+0x13a/0x220 [nfsd] >>>> nfs4_free_ol_stateid+0x40/0x90 [nfsd] >>>> free_ol_stateid_reaplist+0x131/0x210 [nfsd] >>>> release_openowner+0xf7/0x160 [nfsd] >>>> __destroy_client+0x3cc/0x740 [nfsd] >>>> nfsd_cc_lru_scan+0x271/0x410 [nfsd] >>>> shrink_slab.constprop.0+0x31e/0x7d0 >>>> shrink_node+0x54b/0xe50 >>>> try_to_free_pages+0x394/0xba0 >>>> __alloc_pages_slowpath.constprop.0+0x5d2/0x1db0 >>>> __alloc_pages+0x4d6/0x580 >>>> __handle_mm_fault+0xc25/0x2810 >>>> handle_mm_fault+0x136/0x480 >>>> do_user_addr_fault+0x3d8/0xec0 >>>> exc_page_fault+0x5d/0xc0 >>>> asm_exc_page_fault+0x27/0x30 >>> Is this deadlock possible only via the filecache close >>> paths? >> I'm not sure, but there is another back trace below that shows >> another problem. 
>> >>> I can't think of a reason the laundromat has to >>> wait for each file to be flushed and closed -- the >>> laundromat should be able to "put" these files and allow >>> the filecache LRU to flush and close in the background. >> ok, what should we do here, enhancing the laundromat to behave >> as you describe? > What I was suggesting was a longer term strategy for improving the > laundromat. In order to scale well in the number of clients, it > needs to schedule client expiry and deletion without serializing. > > (ie, the laundromat itself can identify a set of clients to clean, > but then it should pass that list to other workers so it can run > again as soon as it needs to -- and that also means it can use more > than just one CPU at a time to do its work). I see. Currently on my lowly 1-CPU VM it takes about ~35 secs to destroy 128 clients, each with only few states (generated by pynfs's CID5 test). We can improve on this. > > In the meantime, it is still valuable to consider a shrinker built > around using ->nfsd_courtesy_client_count only. That was one of > the two alternatives I suggested before, so I feel we have some > consensus there. Please continue to look into that, and we can > continue poking at laundromat improvements later. Okay, Thanks, -Dai > > >> Here is another stack trace of problem with calling expire_client >> from 'scan' callback: >> Sep 3 09:07:35 nfsvmf24 kernel: ------------[ cut here ]------------ >> Sep 3 09:07:35 nfsvmf24 kernel: workqueue: PF_MEMALLOC task 3525(gmain) is flushing !WQ_MEM_RECLAIM nfsd4_callbacks:0x0 >> Sep 3 09:07:35 nfsvmf24 kernel: WARNING: CPU: 0 PID: 3525 at kernel/workqueue.c:2625 check_flush_dependency+0x17a/0x350 >> Sep 3 09:07:35 nfsvmf24 kernel: Modules linked in: rpcsec_gss_krb5 nfsd nfs_acl lockd grace auth_rpcgss sunrpc >> Sep 3 09:07:35 nfsvmf24 kernel: CPU: 0 PID: 3525 Comm: gmain Kdump: loaded Not tainted 6.0.0-rc3+ #1 >> Sep 3 09:07:35 nfsvmf24 kernel: Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006 >> Sep 3 09:07:35 nfsvmf24 kernel: RIP: 0010:check_flush_dependency+0x17a/0x350 >> Sep 3 09:07:35 nfsvmf24 kernel: Code: 48 c1 ee 03 0f b6 04 06 84 c0 74 08 3c 03 0f 8e b8 01 00 00 41 8b b5 90 05 00 00 49 89 e8 48 c7 c7 c0 4d 06 9a e8 c6 a4 7f 02 <0f> 0b eb 4d 65 4c 8b 2c 25 c0 6e 02 00 4c 89 ef e8 71 65 01 00 49 >> Sep 3 09:07:35 nfsvmf24 kernel: RSP: 0018:ffff88810c73f4e8 EFLAGS: 00010282 >> Sep 3 09:07:35 nfsvmf24 kernel: RAX: 0000000000000000 RBX: ffff88811129a800 RCX: 0000000000000000 >> Sep 3 09:07:35 nfsvmf24 kernel: RDX: 0000000000000001 RSI: 0000000000000004 RDI: ffffed10218e7e93 >> Sep 3 09:07:35 nfsvmf24 kernel: RBP: 0000000000000000 R08: 0000000000000001 R09: ffffed103afc6ed2 >> Sep 3 09:07:35 nfsvmf24 kernel: R10: ffff8881d7e3768b R11: ffffed103afc6ed1 R12: 0000000000000000 >> Sep 3 09:07:35 nfsvmf24 kernel: R13: ffff88810d14cac0 R14: 000000000000000d R15: 000000000000000c >> Sep 3 09:07:35 nfsvmf24 kernel: FS: 00007fa9a696c700(0000) GS:ffff8881d7e00000(0000) knlGS:0000000000000000 >> Sep 3 09:07:35 nfsvmf24 kernel: CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 >> Sep 3 09:07:35 nfsvmf24 kernel: CR2: 00007fe922689c50 CR3: 000000010bdd8000 CR4: 00000000000406f0 >> Sep 3 09:07:35 nfsvmf24 kernel: Call Trace: >> Sep 3 09:07:35 nfsvmf24 kernel: <TASK> >> Sep 3 09:07:35 nfsvmf24 kernel: __flush_workqueue+0x32c/0x1350 >> Sep 3 09:07:35 nfsvmf24 kernel: ? lock_downgrade+0x680/0x680 >> Sep 3 09:07:35 nfsvmf24 kernel: ? rcu_read_lock_held_common+0xe/0xa0 >> Sep 3 09:07:35 nfsvmf24 kernel: ? 
check_flush_dependency+0x350/0x350 >> Sep 3 09:07:35 nfsvmf24 kernel: ? lock_release+0x485/0x720 >> Sep 3 09:07:35 nfsvmf24 kernel: ? __queue_work+0x3cc/0xe10 >> Sep 3 09:07:35 nfsvmf24 kernel: ? trace_hardirqs_on+0x2d/0x110 >> Sep 3 09:07:35 nfsvmf24 kernel: ? nfsd4_shutdown_callback+0xc5/0x3d0 [nfsd] >> Sep 3 09:07:35 nfsvmf24 kernel: nfsd4_shutdown_callback+0xc5/0x3d0 [nfsd] >> Sep 3 09:07:35 nfsvmf24 kernel: ? lock_release+0x485/0x720 >> Sep 3 09:07:35 nfsvmf24 kernel: ? nfsd4_probe_callback_sync+0x20/0x20 [nfsd] >> Sep 3 09:07:35 nfsvmf24 kernel: ? lock_downgrade+0x680/0x680 >> Sep 3 09:07:35 nfsvmf24 kernel: __destroy_client+0x5ec/0xa60 [nfsd] >> Sep 3 09:07:35 nfsvmf24 kernel: ? nfsd4_cb_recall_release+0x20/0x20 [nfsd] >> Sep 3 09:07:35 nfsvmf24 kernel: nfsd_courtesy_client_scan+0x39f/0x850 [nfsd] >> Sep 3 09:07:35 nfsvmf24 kernel: ? preempt_schedule_notrace+0x74/0xe0 >> Sep 3 09:07:35 nfsvmf24 kernel: ? client_ctl_write+0x7c0/0x7c0 [nfsd] >> Sep 3 09:07:35 nfsvmf24 kernel: ? preempt_schedule_notrace_thunk+0x16/0x18 >> Sep 3 09:07:35 nfsvmf24 kernel: shrink_slab.constprop.0+0x30b/0x7b0 >> Sep 3 09:07:35 nfsvmf24 kernel: ? unregister_shrinker+0x270/0x270 >> Sep 3 09:07:35 nfsvmf24 kernel: shrink_node+0x54b/0xe50 >> Sep 3 09:07:35 nfsvmf24 kernel: try_to_free_pages+0x394/0xba0 >> Sep 3 09:07:35 nfsvmf24 kernel: ? reclaim_pages+0x3b0/0x3b0 >> Sep 3 09:07:35 nfsvmf24 kernel: __alloc_pages_slowpath.constprop.0+0x636/0x1f30 >> Sep 3 09:07:35 nfsvmf24 kernel: ? warn_alloc+0x120/0x120 >> Sep 3 09:07:35 nfsvmf24 kernel: ? __zone_watermark_ok+0x2d0/0x2d0 >> Sep 3 09:07:35 nfsvmf24 kernel: __alloc_pages+0x4d6/0x580 >> Sep 3 09:07:35 nfsvmf24 kernel: ? __alloc_pages_slowpath.constprop.0+0x1f30/0x1f30 >> Sep 3 09:07:35 nfsvmf24 kernel: ? find_held_lock+0x2c/0x110 >> Sep 3 09:07:35 nfsvmf24 kernel: ? lockdep_hardirqs_on_prepare+0x17b/0x410 >> Sep 3 09:07:35 nfsvmf24 kernel: ____cache_alloc.part.0+0x586/0x760 >> Sep 3 09:07:35 nfsvmf24 kernel: ? kmem_cache_alloc+0x193/0x290 >> Sep 3 09:07:35 nfsvmf24 kernel: kmem_cache_alloc+0x22f/0x290 >> Sep 3 09:07:35 nfsvmf24 kernel: getname_flags+0xbe/0x4e0 >> Sep 3 09:07:35 nfsvmf24 kernel: user_path_at_empty+0x23/0x50 >> Sep 3 09:07:35 nfsvmf24 kernel: inotify_find_inode+0x28/0x120 >> Sep 3 09:07:35 nfsvmf24 kernel: ? __fget_light+0x19b/0x210 >> Sep 3 09:07:35 nfsvmf24 kernel: __x64_sys_inotify_add_watch+0x160/0x290 >> Sep 3 09:07:35 nfsvmf24 kernel: ? 
__ia32_sys_inotify_rm_watch+0x170/0x170 >> Sep 3 09:07:35 nfsvmf24 kernel: do_syscall_64+0x3d/0x90 >> Sep 3 09:07:35 nfsvmf24 kernel: entry_SYSCALL_64_after_hwframe+0x46/0xb0 >> Sep 3 09:07:35 nfsvmf24 kernel: RIP: 0033:0x7fa9b1a77f37 >> Sep 3 09:07:35 nfsvmf24 kernel: Code: f0 ff ff 73 01 c3 48 8b 0d 36 7f 2c 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 b8 fe 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 09 7f 2c 00 f7 d8 64 89 01 48 >> Sep 3 09:07:35 nfsvmf24 kernel: RSP: 002b:00007fa9a696bb28 EFLAGS: 00000202 ORIG_RAX: 00000000000000fe >> Sep 3 09:07:35 nfsvmf24 kernel: RAX: ffffffffffffffda RBX: 00005638abf5adc0 RCX: 00007fa9b1a77f37 >> Sep 3 09:07:35 nfsvmf24 kernel: RDX: 0000000001002fce RSI: 00005638abf58080 RDI: 000000000000000d >> Sep 3 09:07:35 nfsvmf24 kernel: RBP: 00007fa9a696bb54 R08: 00007ffe7f7f1080 R09: 0000000000000000 >> Sep 3 09:07:35 nfsvmf24 kernel: R10: 00000000002cf948 R11: 0000000000000202 R12: 0000000000000000 >> Sep 3 09:07:35 nfsvmf24 kernel: R13: 00007fa9b39710e0 R14: 00005638abf5adf0 R15: 00007fa9b317c940 >> >> Another behavior about the shrinker is that the 'count' and 'scan' >> callbacks are not 1-to-1, meaning the 'scan' is not called after >> every 'count' callback. Not sure what criteria the shrinker use to >> do the 'scan' callback but it's very late, sometimes after dozens >> of 'count' callback then there is a 'scan' callback. If we rely on >> the 'scan' callback then I don't think we react in time to release >> the courtesy clients. >> >> -Dai >> >>> >>> -- >>> Chuck Lever > -- > Chuck Lever > > >
On Sat, Sep 03, 2022 at 10:59:17AM -0700, dai.ngo@oracle.com wrote: > On 9/3/22 10:29 AM, Chuck Lever III wrote: > >What I was suggesting was a longer term strategy for improving the > >laundromat. In order to scale well in the number of clients, it > >needs to schedule client expiry and deletion without serializing. > > > >(ie, the laundromat itself can identify a set of clients to clean, > >but then it should pass that list to other workers so it can run > >again as soon as it needs to -- and that also means it can use more > >than just one CPU at a time to do its work). > > I see. Currently on my lowly 1-CPU VM it takes about ~35 secs to > destroy 128 clients, each with only few states (generated by pynfs's > CID5 test). We can improve on this. Careful--it's not the CPU that's the issue, it's waiting for disk. If you're on a hard drive, for example, it's going to take at least one seek (probably at least 10ms) to expire a single client, so you're never going to destroy more than 100 per second. That's what you need to parallelize. See item 3 from https://lore.kernel.org/linux-nfs/20220523154026.GD24163@fieldses.org/ Also, looks like current nfs-utils is still doing 3 commits per expiry. Steve, for some reason I think "nfsdcld: use WAL journal for faster commits" never got applied: https://lore.kernel.org/linux-nfs/20220104222445.GF12040@fieldses.org/ --b.
On 9/6/22 6:00 AM, J. Bruce Fields wrote: > On Sat, Sep 03, 2022 at 10:59:17AM -0700, dai.ngo@oracle.com wrote: >> On 9/3/22 10:29 AM, Chuck Lever III wrote: >>> What I was suggesting was a longer term strategy for improving the >>> laundromat. In order to scale well in the number of clients, it >>> needs to schedule client expiry and deletion without serializing. >>> >>> (ie, the laundromat itself can identify a set of clients to clean, >>> but then it should pass that list to other workers so it can run >>> again as soon as it needs to -- and that also means it can use more >>> than just one CPU at a time to do its work). >> I see. Currently on my lowly 1-CPU VM it takes about ~35 secs to >> destroy 128 clients, each with only few states (generated by pynfs's >> CID5 test). We can improve on this. > Careful--it's not the CPU that's the issue, it's waiting for disk. > > If you're on a hard drive, for example, it's going to take at least one > seek (probably at least 10ms) to expire a single client, so you're never > going to destroy more than 100 per second. That's what you need to > parallelize. See item 3 from > > https://lore.kernel.org/linux-nfs/20220523154026.GD24163@fieldses.org/ Right! thank you for reminding me of this. I'll add it to my plate if no one gets to it yet. -Dai > > > Also, looks like current nfs-utils is still doing 3 commits per expiry. > > Steve, for some reason I think "nfsdcld: use WAL journal for faster > commits" never got applied: > > https://lore.kernel.org/linux-nfs/20220104222445.GF12040@fieldses.org/ > > --b.
diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
index 2695dff1378a..2a604951623f 100644
--- a/fs/nfsd/netns.h
+++ b/fs/nfsd/netns.h
@@ -194,6 +194,9 @@ struct nfsd_net {
 	int			nfs4_max_clients;
 
 	atomic_t		nfsd_courtesy_client_count;
+	atomic_t		nfsd_client_shrinker_cb_count;
+	atomic_t		nfsd_client_shrinker_reapcount;
+	struct shrinker		nfsd_client_shrinker;
 };
 
 /* Simple check to find out if a given net was properly initialized */
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index eaf7b4dcea33..9aed9eed1892 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -4343,7 +4343,40 @@ nfsd4_init_slabs(void)
 	return -ENOMEM;
 }
 
-void nfsd4_init_leases_net(struct nfsd_net *nn)
+static unsigned long
+nfsd_courtesy_client_count(struct shrinker *shrink, struct shrink_control *sc)
+{
+	struct nfsd_net *nn = container_of(shrink,
+			struct nfsd_net, nfsd_client_shrinker);
+
+	atomic_inc(&nn->nfsd_client_shrinker_cb_count);
+	mod_delayed_work(laundry_wq, &nn->laundromat_work, 0);
+	return (unsigned long)atomic_read(&nn->nfsd_courtesy_client_count);
+}
+
+static unsigned long
+nfsd_courtesy_client_scan(struct shrinker *shrink, struct shrink_control *sc)
+{
+	struct nfsd_net *nn = container_of(shrink,
+			struct nfsd_net, nfsd_client_shrinker);
+	unsigned long cnt;
+
+	cnt = atomic_read(&nn->nfsd_client_shrinker_reapcount);
+	atomic_set(&nn->nfsd_client_shrinker_reapcount, 0);
+	return cnt;
+}
+
+static int
+nfsd_register_client_shrinker(struct nfsd_net *nn)
+{
+	nn->nfsd_client_shrinker.scan_objects = nfsd_courtesy_client_scan;
+	nn->nfsd_client_shrinker.count_objects = nfsd_courtesy_client_count;
+	nn->nfsd_client_shrinker.seeks = DEFAULT_SEEKS;
+	return register_shrinker(&nn->nfsd_client_shrinker, "nfsd-client");
+}
+
+int
+nfsd4_init_leases_net(struct nfsd_net *nn)
 {
 	struct sysinfo si;
 	u64 max_clients;
@@ -4364,6 +4397,8 @@ void nfsd4_init_leases_net(struct nfsd_net *nn)
 	nn->nfs4_max_clients = max_t(int, max_clients, NFS4_CLIENTS_PER_GB);
 
 	atomic_set(&nn->nfsd_courtesy_client_count, 0);
+	atomic_set(&nn->nfsd_client_shrinker_cb_count, 0);
+	return nfsd_register_client_shrinker(nn);
 }
 
 static void init_nfs4_replay(struct nfs4_replay *rp)
@@ -5872,12 +5907,17 @@ static void
 nfs4_get_client_reaplist(struct nfsd_net *nn, struct list_head *reaplist,
 				struct laundry_time *lt)
 {
-	unsigned int maxreap, reapcnt = 0;
+	unsigned int maxreap = 0, reapcnt = 0;
+	int cb_cnt;
 	struct list_head *pos, *next;
 	struct nfs4_client *clp;
 
-	maxreap = (atomic_read(&nn->nfs4_client_count) >= nn->nfs4_max_clients) ?
-			NFSD_CLIENT_MAX_TRIM_PER_RUN : 0;
+	cb_cnt = atomic_read(&nn->nfsd_client_shrinker_cb_count);
+	if (atomic_read(&nn->nfs4_client_count) >= nn->nfs4_max_clients ||
+			cb_cnt) {
+		maxreap = NFSD_CLIENT_MAX_TRIM_PER_RUN;
+		atomic_set(&nn->nfsd_client_shrinker_cb_count, 0);
+	}
 	INIT_LIST_HEAD(reaplist);
 	spin_lock(&nn->client_lock);
 	list_for_each_safe(pos, next, &nn->client_lru) {
@@ -5903,6 +5943,8 @@ nfs4_get_client_reaplist(struct nfsd_net *nn, struct list_head *reaplist,
 		}
 	}
 	spin_unlock(&nn->client_lock);
+	if (cb_cnt)
+		atomic_add(reapcnt, &nn->nfsd_client_shrinker_reapcount);
 }
 
 static time64_t
@@ -5943,6 +5985,8 @@ nfs4_laundromat(struct nfsd_net *nn)
 		list_del_init(&clp->cl_lru);
 		expire_client(clp);
 	}
+	if (atomic_read(&nn->nfsd_client_shrinker_cb_count) > 0)
+		lt.new_timeo = NFSD_LAUNDROMAT_MINTIMEOUT;
 	spin_lock(&state_lock);
 	list_for_each_safe(pos, next, &nn->del_recall_lru) {
 		dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 917fa1892fd2..597a26ad4183 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -1481,11 +1481,12 @@ static __net_init int nfsd_init_net(struct net *net)
 		goto out_idmap_error;
 	nn->nfsd_versions = NULL;
 	nn->nfsd4_minorversions = NULL;
+	retval = nfsd4_init_leases_net(nn);
+	if (retval)
+		goto out_drc_error;
 	retval = nfsd_reply_cache_init(nn);
 	if (retval)
 		goto out_drc_error;
-	nfsd4_init_leases_net(nn);
-
 	get_random_bytes(&nn->siphash_key, sizeof(nn->siphash_key));
 	seqlock_init(&nn->writeverf_lock);
 
@@ -1507,6 +1508,7 @@ static __net_exit void nfsd_exit_net(struct net *net)
 	nfsd_idmap_shutdown(net);
 	nfsd_export_shutdown(net);
 	nfsd_netns_free_versions(net_generic(net, nfsd_net_id));
+	nfsd4_leases_net_shutdown(nn);
 }
 
 static struct pernet_operations nfsd_net_ops = {
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 57a468ed85c3..7e05ab7a3532 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -498,7 +498,11 @@ extern void unregister_cld_notifier(void);
 extern void nfsd4_ssc_init_umount_work(struct nfsd_net *nn);
 #endif
 
-extern void nfsd4_init_leases_net(struct nfsd_net *nn);
+extern int nfsd4_init_leases_net(struct nfsd_net *nn);
+static inline void nfsd4_leases_net_shutdown(struct nfsd_net *nn)
+{
+	unregister_shrinker(&nn->nfsd_client_shrinker);
+};
 
 #else /* CONFIG_NFSD_V4 */
 
 static inline int nfsd4_is_junction(struct dentry *dentry)
@@ -506,7 +510,8 @@ static inline int nfsd4_is_junction(struct dentry *dentry)
 	return 0;
 }
 
-static inline void nfsd4_init_leases_net(struct nfsd_net *nn) {};
+static inline int nfsd4_init_leases_net(struct nfsd_net *nn) { return 0; };
+static inline void nfsd4_leases_net_shutdown(struct nfsd_net *nn) { };
 
 #define register_cld_notifier() 0
 #define unregister_cld_notifier() do { } while(0)
Add a courtesy client shrinker to react to the low-memory condition
signaled through the memory shrinker infrastructure.

In the shrinker's count callback, we increment a callback counter and
return the number of outstanding courtesy clients. When the laundromat
runs, it checks whether this counter is non-zero and, if so, starts
reaping old courtesy clients. The maximum number of clients reaped per
run is limited to NFSD_CLIENT_MAX_TRIM_PER_RUN (128). This limit
prevents the laundromat from spending so much time reaping clients
that it cannot process its other tasks in a timely manner.

The laundromat is rescheduled to run sooner if it detects a low-memory
condition and there are more clients to reap.

In the shrinker's scan callback, we return the number of clients that
were reaped since the last scan callback. We cannot reap the clients
in the scan callback context, since destroying a client might require
calls into the underlying filesystem or other subsystems that allocate
memory, which could deadlock.

Signed-off-by: Dai Ngo <dai.ngo@oracle.com>
---
 fs/nfsd/netns.h     |  3 +++
 fs/nfsd/nfs4state.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++----
 fs/nfsd/nfsctl.c    |  6 ++++--
 fs/nfsd/nfsd.h      |  9 +++++++--
 4 files changed, 62 insertions(+), 8 deletions(-)