Message ID: 1673333310-24837-1-git-send-email-dai.ngo@oracle.com (mailing list archive)
State: New, archived
Series: [1/1] NFSD: fix WARN_ON_ONCE in __queue_delayed_work
On Mon, 2023-01-09 at 22:48 -0800, Dai Ngo wrote: > Currently nfsd4_state_shrinker_worker can be schduled multiple times > from nfsd4_state_shrinker_count when memory is low. This causes > the WARN_ON_ONCE in __queue_delayed_work to trigger. > > This patch allows only one instance of nfsd4_state_shrinker_worker > at a time using the nfsd_shrinker_active flag, protected by the > client_lock. > > Replace mod_delayed_work with queue_delayed_work since we > don't expect to modify the delay of any pending work. > > Fixes: 44df6f439a17 ("NFSD: add delegation reaper to react to low memory condition") > Reported-by: Mike Galbraith <efault@gmx.de> > Signed-off-by: Dai Ngo <dai.ngo@oracle.com> > --- > fs/nfsd/netns.h | 1 + > fs/nfsd/nfs4state.c | 16 ++++++++++++++-- > 2 files changed, 15 insertions(+), 2 deletions(-) > > diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h > index 8c854ba3285b..801d70926442 100644 > --- a/fs/nfsd/netns.h > +++ b/fs/nfsd/netns.h > @@ -196,6 +196,7 @@ struct nfsd_net { > atomic_t nfsd_courtesy_clients; > struct shrinker nfsd_client_shrinker; > struct delayed_work nfsd_shrinker_work; > + bool nfsd_shrinker_active; > }; > > /* Simple check to find out if a given net was properly initialized */ > diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c > index ee56c9466304..e00551af6a11 100644 > --- a/fs/nfsd/nfs4state.c > +++ b/fs/nfsd/nfs4state.c > @@ -4407,11 +4407,20 @@ nfsd4_state_shrinker_count(struct shrinker *shrink, struct shrink_control *sc) > struct nfsd_net *nn = container_of(shrink, > struct nfsd_net, nfsd_client_shrinker); > > + spin_lock(&nn->client_lock); > + if (nn->nfsd_shrinker_active) { > + spin_unlock(&nn->client_lock); > + return 0; > + } Is this extra machinery really necessary? The bool and spinlock don't seem to be needed. Typically there is no issue with calling queued_delayed_work when the work is already queued. It just returns false in that case without doing anything. > > count = atomic_read(&nn->nfsd_courtesy_clients); > if (!count) > count = atomic_long_read(&num_delegations); > - if (count) > - mod_delayed_work(laundry_wq, &nn->nfsd_shrinker_work, 0); > + if (count) { > + nn->nfsd_shrinker_active = true; > + spin_unlock(&nn->client_lock); > + queue_delayed_work(laundry_wq, &nn->nfsd_shrinker_work, 0); > + } else > + spin_unlock(&nn->client_lock); > return (unsigned long)count; > } > > @@ -6239,6 +6248,9 @@ nfsd4_state_shrinker_worker(struct work_struct *work) > > courtesy_client_reaper(nn); > deleg_reaper(nn); > + spin_lock(&nn->client_lock); > + nn->nfsd_shrinker_active = 0; > + spin_unlock(&nn->client_lock); > } > > static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_stid *stp)
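For reference, the simpler shape Jeff is suggesting looks roughly like the following; this is only a sketch derived from the quoted hunk, not merged code. queue_delayed_work() returns false without doing anything when the work item is already pending, so no flag or lock is needed to guard against concurrent callers:

static unsigned long
nfsd4_state_shrinker_count(struct shrinker *shrink, struct shrink_control *sc)
{
	long count;
	struct nfsd_net *nn = container_of(shrink,
			struct nfsd_net, nfsd_client_shrinker);

	count = atomic_read(&nn->nfsd_courtesy_clients);
	if (!count)
		count = atomic_long_read(&num_delegations);
	if (count)
		/* returns false, and does nothing, if the work is already queued */
		queue_delayed_work(laundry_wq, &nn->nfsd_shrinker_work, 0);
	return (unsigned long)count;
}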
> On Jan 10, 2023, at 1:48 AM, Dai Ngo <dai.ngo@oracle.com> wrote: > > Currently nfsd4_state_shrinker_worker can be schduled multiple times > from nfsd4_state_shrinker_count when memory is low. This causes > the WARN_ON_ONCE in __queue_delayed_work to trigger. > > This patch allows only one instance of nfsd4_state_shrinker_worker > at a time using the nfsd_shrinker_active flag, protected by the > client_lock. > > Replace mod_delayed_work with queue_delayed_work since we > don't expect to modify the delay of any pending work. > > Fixes: 44df6f439a17 ("NFSD: add delegation reaper to react to low memory condition") > Reported-by: Mike Galbraith <efault@gmx.de> > Signed-off-by: Dai Ngo <dai.ngo@oracle.com> > --- > fs/nfsd/netns.h | 1 + > fs/nfsd/nfs4state.c | 16 ++++++++++++++-- > 2 files changed, 15 insertions(+), 2 deletions(-) > > diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h > index 8c854ba3285b..801d70926442 100644 > --- a/fs/nfsd/netns.h > +++ b/fs/nfsd/netns.h > @@ -196,6 +196,7 @@ struct nfsd_net { > atomic_t nfsd_courtesy_clients; > struct shrinker nfsd_client_shrinker; > struct delayed_work nfsd_shrinker_work; > + bool nfsd_shrinker_active; > }; > > /* Simple check to find out if a given net was properly initialized */ > diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c > index ee56c9466304..e00551af6a11 100644 > --- a/fs/nfsd/nfs4state.c > +++ b/fs/nfsd/nfs4state.c > @@ -4407,11 +4407,20 @@ nfsd4_state_shrinker_count(struct shrinker *shrink, struct shrink_control *sc) > struct nfsd_net *nn = container_of(shrink, > struct nfsd_net, nfsd_client_shrinker); > > + spin_lock(&nn->client_lock); > + if (nn->nfsd_shrinker_active) { > + spin_unlock(&nn->client_lock); > + return 0; > + } > count = atomic_read(&nn->nfsd_courtesy_clients); > if (!count) > count = atomic_long_read(&num_delegations); > - if (count) > - mod_delayed_work(laundry_wq, &nn->nfsd_shrinker_work, 0); > + if (count) { > + nn->nfsd_shrinker_active = true; > + spin_unlock(&nn->client_lock); > + queue_delayed_work(laundry_wq, &nn->nfsd_shrinker_work, 0); As Jeff said, try just replacing the mod_delayed_work() call with the queue_delayed_work() call, without the extra gating. > + } else > + spin_unlock(&nn->client_lock); > return (unsigned long)count; > } > > @@ -6239,6 +6248,9 @@ nfsd4_state_shrinker_worker(struct work_struct *work) > > courtesy_client_reaper(nn); > deleg_reaper(nn); > + spin_lock(&nn->client_lock); > + nn->nfsd_shrinker_active = 0; > + spin_unlock(&nn->client_lock); > } > > static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_stid *stp) > -- > 2.9.5 > -- Chuck Lever
On 1/10/23 2:30 AM, Jeff Layton wrote: > On Mon, 2023-01-09 at 22:48 -0800, Dai Ngo wrote: >> Currently nfsd4_state_shrinker_worker can be schduled multiple times >> from nfsd4_state_shrinker_count when memory is low. This causes >> the WARN_ON_ONCE in __queue_delayed_work to trigger. >> >> This patch allows only one instance of nfsd4_state_shrinker_worker >> at a time using the nfsd_shrinker_active flag, protected by the >> client_lock. >> >> Replace mod_delayed_work with queue_delayed_work since we >> don't expect to modify the delay of any pending work. >> >> Fixes: 44df6f439a17 ("NFSD: add delegation reaper to react to low memory condition") >> Reported-by: Mike Galbraith <efault@gmx.de> >> Signed-off-by: Dai Ngo <dai.ngo@oracle.com> >> --- >> fs/nfsd/netns.h | 1 + >> fs/nfsd/nfs4state.c | 16 ++++++++++++++-- >> 2 files changed, 15 insertions(+), 2 deletions(-) >> >> diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h >> index 8c854ba3285b..801d70926442 100644 >> --- a/fs/nfsd/netns.h >> +++ b/fs/nfsd/netns.h >> @@ -196,6 +196,7 @@ struct nfsd_net { >> atomic_t nfsd_courtesy_clients; >> struct shrinker nfsd_client_shrinker; >> struct delayed_work nfsd_shrinker_work; >> + bool nfsd_shrinker_active; >> }; >> >> /* Simple check to find out if a given net was properly initialized */ >> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c >> index ee56c9466304..e00551af6a11 100644 >> --- a/fs/nfsd/nfs4state.c >> +++ b/fs/nfsd/nfs4state.c >> @@ -4407,11 +4407,20 @@ nfsd4_state_shrinker_count(struct shrinker *shrink, struct shrink_control *sc) >> struct nfsd_net *nn = container_of(shrink, >> struct nfsd_net, nfsd_client_shrinker); >> >> + spin_lock(&nn->client_lock); >> + if (nn->nfsd_shrinker_active) { >> + spin_unlock(&nn->client_lock); >> + return 0; >> + } > Is this extra machinery really necessary? The bool and spinlock don't > seem to be needed. Typically there is no issue with calling > queued_delayed_work when the work is already queued. It just returns > false in that case without doing anything. When there are multiple calls to mod_delayed_work/queue_delayed_work we hit the WARN_ON_ONCE's in __queue_delayed_work and __queue_work if the work is queued but not execute yet. This problem was reported by Mike. I initially tried with only the bool but that was not enough that was why the spinlock was added. Mike verified that the patch fixed the problem. -Dai > >> count = atomic_read(&nn->nfsd_courtesy_clients); >> if (!count) >> count = atomic_long_read(&num_delegations); >> - if (count) >> - mod_delayed_work(laundry_wq, &nn->nfsd_shrinker_work, 0); >> + if (count) { >> + nn->nfsd_shrinker_active = true; >> + spin_unlock(&nn->client_lock); >> + queue_delayed_work(laundry_wq, &nn->nfsd_shrinker_work, 0); >> + } else >> + spin_unlock(&nn->client_lock); >> return (unsigned long)count; >> } >> >> @@ -6239,6 +6248,9 @@ nfsd4_state_shrinker_worker(struct work_struct *work) >> >> courtesy_client_reaper(nn); >> deleg_reaper(nn); >> + spin_lock(&nn->client_lock); >> + nn->nfsd_shrinker_active = 0; >> + spin_unlock(&nn->client_lock); >> } >> >> static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_stid *stp)
> On Jan 10, 2023, at 12:33 PM, Dai Ngo <dai.ngo@oracle.com> wrote: > > > On 1/10/23 2:30 AM, Jeff Layton wrote: >> On Mon, 2023-01-09 at 22:48 -0800, Dai Ngo wrote: >>> Currently nfsd4_state_shrinker_worker can be schduled multiple times >>> from nfsd4_state_shrinker_count when memory is low. This causes >>> the WARN_ON_ONCE in __queue_delayed_work to trigger. >>> >>> This patch allows only one instance of nfsd4_state_shrinker_worker >>> at a time using the nfsd_shrinker_active flag, protected by the >>> client_lock. >>> >>> Replace mod_delayed_work with queue_delayed_work since we >>> don't expect to modify the delay of any pending work. >>> >>> Fixes: 44df6f439a17 ("NFSD: add delegation reaper to react to low memory condition") >>> Reported-by: Mike Galbraith <efault@gmx.de> >>> Signed-off-by: Dai Ngo <dai.ngo@oracle.com> >>> --- >>> fs/nfsd/netns.h | 1 + >>> fs/nfsd/nfs4state.c | 16 ++++++++++++++-- >>> 2 files changed, 15 insertions(+), 2 deletions(-) >>> >>> diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h >>> index 8c854ba3285b..801d70926442 100644 >>> --- a/fs/nfsd/netns.h >>> +++ b/fs/nfsd/netns.h >>> @@ -196,6 +196,7 @@ struct nfsd_net { >>> atomic_t nfsd_courtesy_clients; >>> struct shrinker nfsd_client_shrinker; >>> struct delayed_work nfsd_shrinker_work; >>> + bool nfsd_shrinker_active; >>> }; >>> /* Simple check to find out if a given net was properly initialized */ >>> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c >>> index ee56c9466304..e00551af6a11 100644 >>> --- a/fs/nfsd/nfs4state.c >>> +++ b/fs/nfsd/nfs4state.c >>> @@ -4407,11 +4407,20 @@ nfsd4_state_shrinker_count(struct shrinker *shrink, struct shrink_control *sc) >>> struct nfsd_net *nn = container_of(shrink, >>> struct nfsd_net, nfsd_client_shrinker); >>> + spin_lock(&nn->client_lock); >>> + if (nn->nfsd_shrinker_active) { >>> + spin_unlock(&nn->client_lock); >>> + return 0; >>> + } >> Is this extra machinery really necessary? The bool and spinlock don't >> seem to be needed. Typically there is no issue with calling >> queued_delayed_work when the work is already queued. It just returns >> false in that case without doing anything. > > When there are multiple calls to mod_delayed_work/queue_delayed_work > we hit the WARN_ON_ONCE's in __queue_delayed_work and __queue_work if > the work is queued but not execute yet. The delay argument of zero is interesting. If it's set to a value greater than zero, do you still see a problem? > This problem was reported by Mike. I initially tried with only the > bool but that was not enough that was why the spinlock was added. > Mike verified that the patch fixed the problem. > > -Dai > >> >>> count = atomic_read(&nn->nfsd_courtesy_clients); >>> if (!count) >>> count = atomic_long_read(&num_delegations); >>> - if (count) >>> - mod_delayed_work(laundry_wq, &nn->nfsd_shrinker_work, 0); >>> + if (count) { >>> + nn->nfsd_shrinker_active = true; >>> + spin_unlock(&nn->client_lock); >>> + queue_delayed_work(laundry_wq, &nn->nfsd_shrinker_work, 0); >>> + } else >>> + spin_unlock(&nn->client_lock); >>> return (unsigned long)count; >>> } >>> @@ -6239,6 +6248,9 @@ nfsd4_state_shrinker_worker(struct work_struct *work) >>> courtesy_client_reaper(nn); >>> deleg_reaper(nn); >>> + spin_lock(&nn->client_lock); >>> + nn->nfsd_shrinker_active = 0; >>> + spin_unlock(&nn->client_lock); >>> } >>> static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_stid *stp) -- Chuck Lever
On Tue, 2023-01-10 at 18:17 +0000, Chuck Lever III wrote: > > > On Jan 10, 2023, at 12:33 PM, Dai Ngo <dai.ngo@oracle.com> wrote: > > > > > > On 1/10/23 2:30 AM, Jeff Layton wrote: > > > On Mon, 2023-01-09 at 22:48 -0800, Dai Ngo wrote: > > > > Currently nfsd4_state_shrinker_worker can be schduled multiple times > > > > from nfsd4_state_shrinker_count when memory is low. This causes > > > > the WARN_ON_ONCE in __queue_delayed_work to trigger. > > > > > > > > This patch allows only one instance of nfsd4_state_shrinker_worker > > > > at a time using the nfsd_shrinker_active flag, protected by the > > > > client_lock. > > > > > > > > Replace mod_delayed_work with queue_delayed_work since we > > > > don't expect to modify the delay of any pending work. > > > > > > > > Fixes: 44df6f439a17 ("NFSD: add delegation reaper to react to low memory condition") > > > > Reported-by: Mike Galbraith <efault@gmx.de> > > > > Signed-off-by: Dai Ngo <dai.ngo@oracle.com> > > > > --- > > > > fs/nfsd/netns.h | 1 + > > > > fs/nfsd/nfs4state.c | 16 ++++++++++++++-- > > > > 2 files changed, 15 insertions(+), 2 deletions(-) > > > > > > > > diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h > > > > index 8c854ba3285b..801d70926442 100644 > > > > --- a/fs/nfsd/netns.h > > > > +++ b/fs/nfsd/netns.h > > > > @@ -196,6 +196,7 @@ struct nfsd_net { > > > > atomic_t nfsd_courtesy_clients; > > > > struct shrinker nfsd_client_shrinker; > > > > struct delayed_work nfsd_shrinker_work; > > > > + bool nfsd_shrinker_active; > > > > }; > > > > /* Simple check to find out if a given net was properly initialized */ > > > > diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c > > > > index ee56c9466304..e00551af6a11 100644 > > > > --- a/fs/nfsd/nfs4state.c > > > > +++ b/fs/nfsd/nfs4state.c > > > > @@ -4407,11 +4407,20 @@ nfsd4_state_shrinker_count(struct shrinker *shrink, struct shrink_control *sc) > > > > struct nfsd_net *nn = container_of(shrink, > > > > struct nfsd_net, nfsd_client_shrinker); > > > > + spin_lock(&nn->client_lock); > > > > + if (nn->nfsd_shrinker_active) { > > > > + spin_unlock(&nn->client_lock); > > > > + return 0; > > > > + } > > > Is this extra machinery really necessary? The bool and spinlock don't > > > seem to be needed. Typically there is no issue with calling > > > queued_delayed_work when the work is already queued. It just returns > > > false in that case without doing anything. > > > > When there are multiple calls to mod_delayed_work/queue_delayed_work > > we hit the WARN_ON_ONCE's in __queue_delayed_work and __queue_work if > > the work is queued but not execute yet. > > The delay argument of zero is interesting. If it's set to a value > greater than zero, do you still see a problem? > It should be safe to call it with a delay of 0. If it's always queued with a delay of 0 though (and it seems to be), you could save a little space by using a struct work_struct instead. Also, I'm not sure if this is related, but where do you cancel the nfsd_shrinker_work before tearing down the struct nfsd_net? I'm not sure that would explains the problem Mike reported, but I think that needs to be addressed. > > > This problem was reported by Mike. I initially tried with only the > > bool but that was not enough that was why the spinlock was added. > > Mike verified that the patch fixed the problem. 
> > > > -Dai > > > > > > > > > count = atomic_read(&nn->nfsd_courtesy_clients); > > > > if (!count) > > > > count = atomic_long_read(&num_delegations); > > > > - if (count) > > > > - mod_delayed_work(laundry_wq, &nn->nfsd_shrinker_work, 0); > > > > + if (count) { > > > > + nn->nfsd_shrinker_active = true; > > > > + spin_unlock(&nn->client_lock); > > > > + queue_delayed_work(laundry_wq, &nn->nfsd_shrinker_work, 0); > > > > + } else > > > > + spin_unlock(&nn->client_lock); > > > > return (unsigned long)count; > > > > } > > > > @@ -6239,6 +6248,9 @@ nfsd4_state_shrinker_worker(struct work_struct *work) > > > > courtesy_client_reaper(nn); > > > > deleg_reaper(nn); > > > > + spin_lock(&nn->client_lock); > > > > + nn->nfsd_shrinker_active = 0; > > > > + spin_unlock(&nn->client_lock); > > > > } > > > > static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_stid *stp) > > -- > Chuck Lever > > >
On 1/10/23 10:17 AM, Chuck Lever III wrote: > >> On Jan 10, 2023, at 12:33 PM, Dai Ngo <dai.ngo@oracle.com> wrote: >> >> >> On 1/10/23 2:30 AM, Jeff Layton wrote: >>> On Mon, 2023-01-09 at 22:48 -0800, Dai Ngo wrote: >>>> Currently nfsd4_state_shrinker_worker can be schduled multiple times >>>> from nfsd4_state_shrinker_count when memory is low. This causes >>>> the WARN_ON_ONCE in __queue_delayed_work to trigger. >>>> >>>> This patch allows only one instance of nfsd4_state_shrinker_worker >>>> at a time using the nfsd_shrinker_active flag, protected by the >>>> client_lock. >>>> >>>> Replace mod_delayed_work with queue_delayed_work since we >>>> don't expect to modify the delay of any pending work. >>>> >>>> Fixes: 44df6f439a17 ("NFSD: add delegation reaper to react to low memory condition") >>>> Reported-by: Mike Galbraith <efault@gmx.de> >>>> Signed-off-by: Dai Ngo <dai.ngo@oracle.com> >>>> --- >>>> fs/nfsd/netns.h | 1 + >>>> fs/nfsd/nfs4state.c | 16 ++++++++++++++-- >>>> 2 files changed, 15 insertions(+), 2 deletions(-) >>>> >>>> diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h >>>> index 8c854ba3285b..801d70926442 100644 >>>> --- a/fs/nfsd/netns.h >>>> +++ b/fs/nfsd/netns.h >>>> @@ -196,6 +196,7 @@ struct nfsd_net { >>>> atomic_t nfsd_courtesy_clients; >>>> struct shrinker nfsd_client_shrinker; >>>> struct delayed_work nfsd_shrinker_work; >>>> + bool nfsd_shrinker_active; >>>> }; >>>> /* Simple check to find out if a given net was properly initialized */ >>>> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c >>>> index ee56c9466304..e00551af6a11 100644 >>>> --- a/fs/nfsd/nfs4state.c >>>> +++ b/fs/nfsd/nfs4state.c >>>> @@ -4407,11 +4407,20 @@ nfsd4_state_shrinker_count(struct shrinker *shrink, struct shrink_control *sc) >>>> struct nfsd_net *nn = container_of(shrink, >>>> struct nfsd_net, nfsd_client_shrinker); >>>> + spin_lock(&nn->client_lock); >>>> + if (nn->nfsd_shrinker_active) { >>>> + spin_unlock(&nn->client_lock); >>>> + return 0; >>>> + } >>> Is this extra machinery really necessary? The bool and spinlock don't >>> seem to be needed. Typically there is no issue with calling >>> queued_delayed_work when the work is already queued. It just returns >>> false in that case without doing anything. >> When there are multiple calls to mod_delayed_work/queue_delayed_work >> we hit the WARN_ON_ONCE's in __queue_delayed_work and __queue_work if >> the work is queued but not execute yet. > The delay argument of zero is interesting. If it's set to a value > greater than zero, do you still see a problem? I tried and tried but could not reproduce the problem that Mike reported. I guess my VMs don't have fast enough cpus to make it happen. As Jeff mentioned, delay 0 should be safe and we want to run the shrinker as soon as possible when memory is low. -Dai > > >> This problem was reported by Mike. I initially tried with only the >> bool but that was not enough that was why the spinlock was added. >> Mike verified that the patch fixed the problem. 
>> >> -Dai >> >>>> count = atomic_read(&nn->nfsd_courtesy_clients); >>>> if (!count) >>>> count = atomic_long_read(&num_delegations); >>>> - if (count) >>>> - mod_delayed_work(laundry_wq, &nn->nfsd_shrinker_work, 0); >>>> + if (count) { >>>> + nn->nfsd_shrinker_active = true; >>>> + spin_unlock(&nn->client_lock); >>>> + queue_delayed_work(laundry_wq, &nn->nfsd_shrinker_work, 0); >>>> + } else >>>> + spin_unlock(&nn->client_lock); >>>> return (unsigned long)count; >>>> } >>>> @@ -6239,6 +6248,9 @@ nfsd4_state_shrinker_worker(struct work_struct *work) >>>> courtesy_client_reaper(nn); >>>> deleg_reaper(nn); >>>> + spin_lock(&nn->client_lock); >>>> + nn->nfsd_shrinker_active = 0; >>>> + spin_unlock(&nn->client_lock); >>>> } >>>> static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_stid *stp) > -- > Chuck Lever > > >
> On Jan 10, 2023, at 1:46 PM, Dai Ngo <dai.ngo@oracle.com> wrote: > > > On 1/10/23 10:17 AM, Chuck Lever III wrote: >> >>> On Jan 10, 2023, at 12:33 PM, Dai Ngo <dai.ngo@oracle.com> wrote: >>> >>> >>> On 1/10/23 2:30 AM, Jeff Layton wrote: >>>> On Mon, 2023-01-09 at 22:48 -0800, Dai Ngo wrote: >>>>> Currently nfsd4_state_shrinker_worker can be schduled multiple times >>>>> from nfsd4_state_shrinker_count when memory is low. This causes >>>>> the WARN_ON_ONCE in __queue_delayed_work to trigger. >>>>> >>>>> This patch allows only one instance of nfsd4_state_shrinker_worker >>>>> at a time using the nfsd_shrinker_active flag, protected by the >>>>> client_lock. >>>>> >>>>> Replace mod_delayed_work with queue_delayed_work since we >>>>> don't expect to modify the delay of any pending work. >>>>> >>>>> Fixes: 44df6f439a17 ("NFSD: add delegation reaper to react to low memory condition") >>>>> Reported-by: Mike Galbraith <efault@gmx.de> >>>>> Signed-off-by: Dai Ngo <dai.ngo@oracle.com> >>>>> --- >>>>> fs/nfsd/netns.h | 1 + >>>>> fs/nfsd/nfs4state.c | 16 ++++++++++++++-- >>>>> 2 files changed, 15 insertions(+), 2 deletions(-) >>>>> >>>>> diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h >>>>> index 8c854ba3285b..801d70926442 100644 >>>>> --- a/fs/nfsd/netns.h >>>>> +++ b/fs/nfsd/netns.h >>>>> @@ -196,6 +196,7 @@ struct nfsd_net { >>>>> atomic_t nfsd_courtesy_clients; >>>>> struct shrinker nfsd_client_shrinker; >>>>> struct delayed_work nfsd_shrinker_work; >>>>> + bool nfsd_shrinker_active; >>>>> }; >>>>> /* Simple check to find out if a given net was properly initialized */ >>>>> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c >>>>> index ee56c9466304..e00551af6a11 100644 >>>>> --- a/fs/nfsd/nfs4state.c >>>>> +++ b/fs/nfsd/nfs4state.c >>>>> @@ -4407,11 +4407,20 @@ nfsd4_state_shrinker_count(struct shrinker *shrink, struct shrink_control *sc) >>>>> struct nfsd_net *nn = container_of(shrink, >>>>> struct nfsd_net, nfsd_client_shrinker); >>>>> + spin_lock(&nn->client_lock); >>>>> + if (nn->nfsd_shrinker_active) { >>>>> + spin_unlock(&nn->client_lock); >>>>> + return 0; >>>>> + } >>>> Is this extra machinery really necessary? The bool and spinlock don't >>>> seem to be needed. Typically there is no issue with calling >>>> queued_delayed_work when the work is already queued. It just returns >>>> false in that case without doing anything. >>> When there are multiple calls to mod_delayed_work/queue_delayed_work >>> we hit the WARN_ON_ONCE's in __queue_delayed_work and __queue_work if >>> the work is queued but not execute yet. >> The delay argument of zero is interesting. If it's set to a value >> greater than zero, do you still see a problem? > > I tried and tried but could not reproduce the problem that Mike > reported. I guess my VMs don't have fast enough cpus to make it > happen. I'd prefer not to guess... it sounds like we don't have a clear root cause on this one yet. I think I agree with Jeff: a spinlock shouldn't be required to make queuing work safe via this API. > As Jeff mentioned, delay 0 should be safe and we want to run > the shrinker as soon as possible when memory is low. I suggested that because the !delay code paths seem to lead directly to the WARN_ONs in queue_work(). <shrug> > -Dai > >> >> >>> This problem was reported by Mike. I initially tried with only the >>> bool but that was not enough that was why the spinlock was added. >>> Mike verified that the patch fixed the problem. 
>>> >>> -Dai >>> >>>>> count = atomic_read(&nn->nfsd_courtesy_clients); >>>>> if (!count) >>>>> count = atomic_long_read(&num_delegations); >>>>> - if (count) >>>>> - mod_delayed_work(laundry_wq, &nn->nfsd_shrinker_work, 0); >>>>> + if (count) { >>>>> + nn->nfsd_shrinker_active = true; >>>>> + spin_unlock(&nn->client_lock); >>>>> + queue_delayed_work(laundry_wq, &nn->nfsd_shrinker_work, 0); >>>>> + } else >>>>> + spin_unlock(&nn->client_lock); >>>>> return (unsigned long)count; >>>>> } >>>>> @@ -6239,6 +6248,9 @@ nfsd4_state_shrinker_worker(struct work_struct *work) >>>>> courtesy_client_reaper(nn); >>>>> deleg_reaper(nn); >>>>> + spin_lock(&nn->client_lock); >>>>> + nn->nfsd_shrinker_active = 0; >>>>> + spin_unlock(&nn->client_lock); >>>>> } >>>>> static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_stid *stp) >> -- >> Chuck Lever -- Chuck Lever
On 1/10/23 10:53 AM, Chuck Lever III wrote: > >> On Jan 10, 2023, at 1:46 PM, Dai Ngo <dai.ngo@oracle.com> wrote: >> >> >> On 1/10/23 10:17 AM, Chuck Lever III wrote: >>>> On Jan 10, 2023, at 12:33 PM, Dai Ngo <dai.ngo@oracle.com> wrote: >>>> >>>> >>>> On 1/10/23 2:30 AM, Jeff Layton wrote: >>>>> On Mon, 2023-01-09 at 22:48 -0800, Dai Ngo wrote: >>>>>> Currently nfsd4_state_shrinker_worker can be schduled multiple times >>>>>> from nfsd4_state_shrinker_count when memory is low. This causes >>>>>> the WARN_ON_ONCE in __queue_delayed_work to trigger. >>>>>> >>>>>> This patch allows only one instance of nfsd4_state_shrinker_worker >>>>>> at a time using the nfsd_shrinker_active flag, protected by the >>>>>> client_lock. >>>>>> >>>>>> Replace mod_delayed_work with queue_delayed_work since we >>>>>> don't expect to modify the delay of any pending work. >>>>>> >>>>>> Fixes: 44df6f439a17 ("NFSD: add delegation reaper to react to low memory condition") >>>>>> Reported-by: Mike Galbraith <efault@gmx.de> >>>>>> Signed-off-by: Dai Ngo <dai.ngo@oracle.com> >>>>>> --- >>>>>> fs/nfsd/netns.h | 1 + >>>>>> fs/nfsd/nfs4state.c | 16 ++++++++++++++-- >>>>>> 2 files changed, 15 insertions(+), 2 deletions(-) >>>>>> >>>>>> diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h >>>>>> index 8c854ba3285b..801d70926442 100644 >>>>>> --- a/fs/nfsd/netns.h >>>>>> +++ b/fs/nfsd/netns.h >>>>>> @@ -196,6 +196,7 @@ struct nfsd_net { >>>>>> atomic_t nfsd_courtesy_clients; >>>>>> struct shrinker nfsd_client_shrinker; >>>>>> struct delayed_work nfsd_shrinker_work; >>>>>> + bool nfsd_shrinker_active; >>>>>> }; >>>>>> /* Simple check to find out if a given net was properly initialized */ >>>>>> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c >>>>>> index ee56c9466304..e00551af6a11 100644 >>>>>> --- a/fs/nfsd/nfs4state.c >>>>>> +++ b/fs/nfsd/nfs4state.c >>>>>> @@ -4407,11 +4407,20 @@ nfsd4_state_shrinker_count(struct shrinker *shrink, struct shrink_control *sc) >>>>>> struct nfsd_net *nn = container_of(shrink, >>>>>> struct nfsd_net, nfsd_client_shrinker); >>>>>> + spin_lock(&nn->client_lock); >>>>>> + if (nn->nfsd_shrinker_active) { >>>>>> + spin_unlock(&nn->client_lock); >>>>>> + return 0; >>>>>> + } >>>>> Is this extra machinery really necessary? The bool and spinlock don't >>>>> seem to be needed. Typically there is no issue with calling >>>>> queued_delayed_work when the work is already queued. It just returns >>>>> false in that case without doing anything. >>>> When there are multiple calls to mod_delayed_work/queue_delayed_work >>>> we hit the WARN_ON_ONCE's in __queue_delayed_work and __queue_work if >>>> the work is queued but not execute yet. >>> The delay argument of zero is interesting. If it's set to a value >>> greater than zero, do you still see a problem? >> I tried and tried but could not reproduce the problem that Mike >> reported. I guess my VMs don't have fast enough cpus to make it >> happen. > I'd prefer not to guess... it sounds like we don't have a clear > root cause on this one yet. Yes, we do, as I explained in above. The reason I could not reproduce it because my system is not fast enough to get multiple calls to nfsd4_state_shrinker_count simultaneously. > > I think I agree with Jeff: a spinlock shouldn't be required to > make queuing work safe via this API. > > >> As Jeff mentioned, delay 0 should be safe and we want to run >> the shrinker as soon as possible when memory is low. > I suggested that because the !delay code paths seem to lead > directly to the WARN_ONs in queue_work(). 
<shrug> If we use 0 delay then we need the spinlock, as proven by Mike's test. If we use delay != 0 then it may work without the spinlock; we will ask Mike to retest it. You and Jeff decide what the delay is and I will make the change and retest. -Dai > > >> -Dai >> >>> >>>> This problem was reported by Mike. I initially tried with only the >>>> bool but that was not enough that was why the spinlock was added. >>>> Mike verified that the patch fixed the problem. >>>> >>>> -Dai >>>> >>>>>> count = atomic_read(&nn->nfsd_courtesy_clients); >>>>>> if (!count) >>>>>> count = atomic_long_read(&num_delegations); >>>>>> - if (count) >>>>>> - mod_delayed_work(laundry_wq, &nn->nfsd_shrinker_work, 0); >>>>>> + if (count) { >>>>>> + nn->nfsd_shrinker_active = true; >>>>>> + spin_unlock(&nn->client_lock); >>>>>> + queue_delayed_work(laundry_wq, &nn->nfsd_shrinker_work, 0); >>>>>> + } else >>>>>> + spin_unlock(&nn->client_lock); >>>>>> return (unsigned long)count; >>>>>> } >>>>>> @@ -6239,6 +6248,9 @@ nfsd4_state_shrinker_worker(struct work_struct *work) >>>>>> courtesy_client_reaper(nn); >>>>>> deleg_reaper(nn); >>>>>> + spin_lock(&nn->client_lock); >>>>>> + nn->nfsd_shrinker_active = 0; >>>>>> + spin_unlock(&nn->client_lock); >>>>>> } >>>>>> static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_stid *stp) >>> -- >>> Chuck Lever > -- > Chuck Lever > > >
On Tue, 2023-01-10 at 18:53 +0000, Chuck Lever III wrote: > > > On Jan 10, 2023, at 1:46 PM, Dai Ngo <dai.ngo@oracle.com> wrote: > > > > > > On 1/10/23 10:17 AM, Chuck Lever III wrote: > > > > > > > On Jan 10, 2023, at 12:33 PM, Dai Ngo <dai.ngo@oracle.com> wrote: > > > > > > > > > > > > On 1/10/23 2:30 AM, Jeff Layton wrote: > > > > > On Mon, 2023-01-09 at 22:48 -0800, Dai Ngo wrote: > > > > > > Currently nfsd4_state_shrinker_worker can be schduled multiple times > > > > > > from nfsd4_state_shrinker_count when memory is low. This causes > > > > > > the WARN_ON_ONCE in __queue_delayed_work to trigger. > > > > > > > > > > > > This patch allows only one instance of nfsd4_state_shrinker_worker > > > > > > at a time using the nfsd_shrinker_active flag, protected by the > > > > > > client_lock. > > > > > > > > > > > > Replace mod_delayed_work with queue_delayed_work since we > > > > > > don't expect to modify the delay of any pending work. > > > > > > > > > > > > Fixes: 44df6f439a17 ("NFSD: add delegation reaper to react to low memory condition") > > > > > > Reported-by: Mike Galbraith <efault@gmx.de> > > > > > > Signed-off-by: Dai Ngo <dai.ngo@oracle.com> > > > > > > --- > > > > > > fs/nfsd/netns.h | 1 + > > > > > > fs/nfsd/nfs4state.c | 16 ++++++++++++++-- > > > > > > 2 files changed, 15 insertions(+), 2 deletions(-) > > > > > > > > > > > > diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h > > > > > > index 8c854ba3285b..801d70926442 100644 > > > > > > --- a/fs/nfsd/netns.h > > > > > > +++ b/fs/nfsd/netns.h > > > > > > @@ -196,6 +196,7 @@ struct nfsd_net { > > > > > > atomic_t nfsd_courtesy_clients; > > > > > > struct shrinker nfsd_client_shrinker; > > > > > > struct delayed_work nfsd_shrinker_work; > > > > > > + bool nfsd_shrinker_active; > > > > > > }; > > > > > > /* Simple check to find out if a given net was properly initialized */ > > > > > > diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c > > > > > > index ee56c9466304..e00551af6a11 100644 > > > > > > --- a/fs/nfsd/nfs4state.c > > > > > > +++ b/fs/nfsd/nfs4state.c > > > > > > @@ -4407,11 +4407,20 @@ nfsd4_state_shrinker_count(struct shrinker *shrink, struct shrink_control *sc) > > > > > > struct nfsd_net *nn = container_of(shrink, > > > > > > struct nfsd_net, nfsd_client_shrinker); > > > > > > + spin_lock(&nn->client_lock); > > > > > > + if (nn->nfsd_shrinker_active) { > > > > > > + spin_unlock(&nn->client_lock); > > > > > > + return 0; > > > > > > + } > > > > > Is this extra machinery really necessary? The bool and spinlock don't > > > > > seem to be needed. Typically there is no issue with calling > > > > > queued_delayed_work when the work is already queued. It just returns > > > > > false in that case without doing anything. > > > > When there are multiple calls to mod_delayed_work/queue_delayed_work > > > > we hit the WARN_ON_ONCE's in __queue_delayed_work and __queue_work if > > > > the work is queued but not execute yet. > > > The delay argument of zero is interesting. If it's set to a value > > > greater than zero, do you still see a problem? > > > > I tried and tried but could not reproduce the problem that Mike > > reported. I guess my VMs don't have fast enough cpus to make it > > happen. > > I'd prefer not to guess... it sounds like we don't have a clear > root cause on this one yet. > > I think I agree with Jeff: a spinlock shouldn't be required to > make queuing work safe via this API. > > > > As Jeff mentioned, delay 0 should be safe and we want to run > > the shrinker as soon as possible when memory is low. 
> > I suggested that because the !delay code paths seem to lead > directly to the WARN_ONs in queue_work(). <shrug> > > One of the WARNs in that Mike hit was this: WARN_ON_ONCE(timer->function != delayed_work_timer_fn); nfsd isn't doing anything exotic with that function pointer, so that really looks like something got corrupted. Given that this is happening under low-memory conditions, then I have to wonder if we're just ending up with a workqueue job that remained on the queue after the nfsd_net got freed and recycled. I'd start with a patch like this (note, untested): diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 2f4a2449b314..86da6663806e 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -8158,6 +8158,7 @@ nfs4_state_shutdown_net(struct net *net) struct nfsd_net *nn = net_generic(net, nfsd_net_id); unregister_shrinker(&nn->nfsd_client_shrinker); + cancel_delayed_work_sync(&nn->nfsd_shrinker_work); cancel_delayed_work_sync(&nn->laundromat_work); locks_end_grace(&nn->nfsd4_manager); Either way, I think longer nfsd_shrinker_work ought to be converted to a normal work_struct since you don't ever use the delay. > > -Dai > > > > > > > > > > > > This problem was reported by Mike. I initially tried with only the > > > > bool but that was not enough that was why the spinlock was added. > > > > Mike verified that the patch fixed the problem. > > > > > > > > -Dai > > > > > > > > > > count = atomic_read(&nn->nfsd_courtesy_clients); > > > > > > if (!count) > > > > > > count = atomic_long_read(&num_delegations); > > > > > > - if (count) > > > > > > - mod_delayed_work(laundry_wq, &nn->nfsd_shrinker_work, 0); > > > > > > + if (count) { > > > > > > + nn->nfsd_shrinker_active = true; > > > > > > + spin_unlock(&nn->client_lock); > > > > > > + queue_delayed_work(laundry_wq, &nn->nfsd_shrinker_work, 0); > > > > > > + } else > > > > > > + spin_unlock(&nn->client_lock); > > > > > > return (unsigned long)count; > > > > > > } > > > > > > @@ -6239,6 +6248,9 @@ nfsd4_state_shrinker_worker(struct work_struct *work) > > > > > > courtesy_client_reaper(nn); > > > > > > deleg_reaper(nn); > > > > > > + spin_lock(&nn->client_lock); > > > > > > + nn->nfsd_shrinker_active = 0; > > > > > > + spin_unlock(&nn->client_lock); > > > > > > } > > > > > > static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_stid *stp) > > > -- > > > Chuck Lever > > -- > Chuck Lever > > >
On 1/10/23 10:34 AM, Jeff Layton wrote: > On Tue, 2023-01-10 at 18:17 +0000, Chuck Lever III wrote: >>> On Jan 10, 2023, at 12:33 PM, Dai Ngo <dai.ngo@oracle.com> wrote: >>> >>> >>> On 1/10/23 2:30 AM, Jeff Layton wrote: >>>> On Mon, 2023-01-09 at 22:48 -0800, Dai Ngo wrote: >>>>> Currently nfsd4_state_shrinker_worker can be schduled multiple times >>>>> from nfsd4_state_shrinker_count when memory is low. This causes >>>>> the WARN_ON_ONCE in __queue_delayed_work to trigger. >>>>> >>>>> This patch allows only one instance of nfsd4_state_shrinker_worker >>>>> at a time using the nfsd_shrinker_active flag, protected by the >>>>> client_lock. >>>>> >>>>> Replace mod_delayed_work with queue_delayed_work since we >>>>> don't expect to modify the delay of any pending work. >>>>> >>>>> Fixes: 44df6f439a17 ("NFSD: add delegation reaper to react to low memory condition") >>>>> Reported-by: Mike Galbraith <efault@gmx.de> >>>>> Signed-off-by: Dai Ngo <dai.ngo@oracle.com> >>>>> --- >>>>> fs/nfsd/netns.h | 1 + >>>>> fs/nfsd/nfs4state.c | 16 ++++++++++++++-- >>>>> 2 files changed, 15 insertions(+), 2 deletions(-) >>>>> >>>>> diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h >>>>> index 8c854ba3285b..801d70926442 100644 >>>>> --- a/fs/nfsd/netns.h >>>>> +++ b/fs/nfsd/netns.h >>>>> @@ -196,6 +196,7 @@ struct nfsd_net { >>>>> atomic_t nfsd_courtesy_clients; >>>>> struct shrinker nfsd_client_shrinker; >>>>> struct delayed_work nfsd_shrinker_work; >>>>> + bool nfsd_shrinker_active; >>>>> }; >>>>> /* Simple check to find out if a given net was properly initialized */ >>>>> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c >>>>> index ee56c9466304..e00551af6a11 100644 >>>>> --- a/fs/nfsd/nfs4state.c >>>>> +++ b/fs/nfsd/nfs4state.c >>>>> @@ -4407,11 +4407,20 @@ nfsd4_state_shrinker_count(struct shrinker *shrink, struct shrink_control *sc) >>>>> struct nfsd_net *nn = container_of(shrink, >>>>> struct nfsd_net, nfsd_client_shrinker); >>>>> + spin_lock(&nn->client_lock); >>>>> + if (nn->nfsd_shrinker_active) { >>>>> + spin_unlock(&nn->client_lock); >>>>> + return 0; >>>>> + } >>>> Is this extra machinery really necessary? The bool and spinlock don't >>>> seem to be needed. Typically there is no issue with calling >>>> queued_delayed_work when the work is already queued. It just returns >>>> false in that case without doing anything. >>> When there are multiple calls to mod_delayed_work/queue_delayed_work >>> we hit the WARN_ON_ONCE's in __queue_delayed_work and __queue_work if >>> the work is queued but not execute yet. >> The delay argument of zero is interesting. If it's set to a value >> greater than zero, do you still see a problem? >> > It should be safe to call it with a delay of 0. If it's always queued > with a delay of 0 though (and it seems to be), you could save a little > space by using a struct work_struct instead. Can I defer this optimization for now? I need some time to look into it. > > Also, I'm not sure if this is related, but where do you cancel the > nfsd_shrinker_work before tearing down the struct nfsd_net? I'm not sure > that would explains the problem Mike reported, but I think that needs to > be addressed. Yes, good catch. I will add the cancelling in v2 patch. Thanks, -Dai > >>> This problem was reported by Mike. I initially tried with only the >>> bool but that was not enough that was why the spinlock was added. >>> Mike verified that the patch fixed the problem. 
>>> >>> -Dai >>> >>>>> count = atomic_read(&nn->nfsd_courtesy_clients); >>>>> if (!count) >>>>> count = atomic_long_read(&num_delegations); >>>>> - if (count) >>>>> - mod_delayed_work(laundry_wq, &nn->nfsd_shrinker_work, 0); >>>>> + if (count) { >>>>> + nn->nfsd_shrinker_active = true; >>>>> + spin_unlock(&nn->client_lock); >>>>> + queue_delayed_work(laundry_wq, &nn->nfsd_shrinker_work, 0); >>>>> + } else >>>>> + spin_unlock(&nn->client_lock); >>>>> return (unsigned long)count; >>>>> } >>>>> @@ -6239,6 +6248,9 @@ nfsd4_state_shrinker_worker(struct work_struct *work) >>>>> courtesy_client_reaper(nn); >>>>> deleg_reaper(nn); >>>>> + spin_lock(&nn->client_lock); >>>>> + nn->nfsd_shrinker_active = 0; >>>>> + spin_unlock(&nn->client_lock); >>>>> } >>>>> static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_stid *stp) >> -- >> Chuck Lever >> >> >>
On Tue, 2023-01-10 at 11:07 -0800, dai.ngo@oracle.com wrote: > On 1/10/23 10:53 AM, Chuck Lever III wrote: > > > > > On Jan 10, 2023, at 1:46 PM, Dai Ngo <dai.ngo@oracle.com> wrote: > > > > > > > > > On 1/10/23 10:17 AM, Chuck Lever III wrote: > > > > > On Jan 10, 2023, at 12:33 PM, Dai Ngo <dai.ngo@oracle.com> wrote: > > > > > > > > > > > > > > > On 1/10/23 2:30 AM, Jeff Layton wrote: > > > > > > On Mon, 2023-01-09 at 22:48 -0800, Dai Ngo wrote: > > > > > > > Currently nfsd4_state_shrinker_worker can be schduled multiple times > > > > > > > from nfsd4_state_shrinker_count when memory is low. This causes > > > > > > > the WARN_ON_ONCE in __queue_delayed_work to trigger. > > > > > > > > > > > > > > This patch allows only one instance of nfsd4_state_shrinker_worker > > > > > > > at a time using the nfsd_shrinker_active flag, protected by the > > > > > > > client_lock. > > > > > > > > > > > > > > Replace mod_delayed_work with queue_delayed_work since we > > > > > > > don't expect to modify the delay of any pending work. > > > > > > > > > > > > > > Fixes: 44df6f439a17 ("NFSD: add delegation reaper to react to low memory condition") > > > > > > > Reported-by: Mike Galbraith <efault@gmx.de> > > > > > > > Signed-off-by: Dai Ngo <dai.ngo@oracle.com> > > > > > > > --- > > > > > > > fs/nfsd/netns.h | 1 + > > > > > > > fs/nfsd/nfs4state.c | 16 ++++++++++++++-- > > > > > > > 2 files changed, 15 insertions(+), 2 deletions(-) > > > > > > > > > > > > > > diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h > > > > > > > index 8c854ba3285b..801d70926442 100644 > > > > > > > --- a/fs/nfsd/netns.h > > > > > > > +++ b/fs/nfsd/netns.h > > > > > > > @@ -196,6 +196,7 @@ struct nfsd_net { > > > > > > > atomic_t nfsd_courtesy_clients; > > > > > > > struct shrinker nfsd_client_shrinker; > > > > > > > struct delayed_work nfsd_shrinker_work; > > > > > > > + bool nfsd_shrinker_active; > > > > > > > }; > > > > > > > /* Simple check to find out if a given net was properly initialized */ > > > > > > > diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c > > > > > > > index ee56c9466304..e00551af6a11 100644 > > > > > > > --- a/fs/nfsd/nfs4state.c > > > > > > > +++ b/fs/nfsd/nfs4state.c > > > > > > > @@ -4407,11 +4407,20 @@ nfsd4_state_shrinker_count(struct shrinker *shrink, struct shrink_control *sc) > > > > > > > struct nfsd_net *nn = container_of(shrink, > > > > > > > struct nfsd_net, nfsd_client_shrinker); > > > > > > > + spin_lock(&nn->client_lock); > > > > > > > + if (nn->nfsd_shrinker_active) { > > > > > > > + spin_unlock(&nn->client_lock); > > > > > > > + return 0; > > > > > > > + } > > > > > > Is this extra machinery really necessary? The bool and spinlock don't > > > > > > seem to be needed. Typically there is no issue with calling > > > > > > queued_delayed_work when the work is already queued. It just returns > > > > > > false in that case without doing anything. > > > > > When there are multiple calls to mod_delayed_work/queue_delayed_work > > > > > we hit the WARN_ON_ONCE's in __queue_delayed_work and __queue_work if > > > > > the work is queued but not execute yet. > > > > The delay argument of zero is interesting. If it's set to a value > > > > greater than zero, do you still see a problem? > > > I tried and tried but could not reproduce the problem that Mike > > > reported. I guess my VMs don't have fast enough cpus to make it > > > happen. > > I'd prefer not to guess... it sounds like we don't have a clear > > root cause on this one yet. > > Yes, we do, as I explained in above. 
The reason I could not reproduce > it because my system is not fast enough to get multiple calls to > nfsd4_state_shrinker_count simultaneously. > > > > > I think I agree with Jeff: a spinlock shouldn't be required to > > make queuing work safe via this API. > > > > > > > As Jeff mentioned, delay 0 should be safe and we want to run > > > the shrinker as soon as possible when memory is low. > > I suggested that because the !delay code paths seem to lead > > directly to the WARN_ONs in queue_work(). <shrug> > > If we use 0 delay then we need the spinlock, as proven by Mike's test. > If we use delay != 0 then it may work without the spinlock, we will > Mike to retest it. > > You and Jeff decide what the delay is and I will make the change > and retest. > I don't think this is the issue either. It's more likely that this is just changing the timing such that it didn't happen. Looking at the traces that Mike posted, it really looks like the delayed_work is corrupt. > > > > > > > -Dai > > > > > > > > > > > > This problem was reported by Mike. I initially tried with only the > > > > > bool but that was not enough that was why the spinlock was added. > > > > > Mike verified that the patch fixed the problem. > > > > > > > > > > -Dai > > > > > > > > > > > > count = atomic_read(&nn->nfsd_courtesy_clients); > > > > > > > if (!count) > > > > > > > count = atomic_long_read(&num_delegations); > > > > > > > - if (count) > > > > > > > - mod_delayed_work(laundry_wq, &nn->nfsd_shrinker_work, 0); > > > > > > > + if (count) { > > > > > > > + nn->nfsd_shrinker_active = true; > > > > > > > + spin_unlock(&nn->client_lock); > > > > > > > + queue_delayed_work(laundry_wq, &nn->nfsd_shrinker_work, 0); > > > > > > > + } else > > > > > > > + spin_unlock(&nn->client_lock); > > > > > > > return (unsigned long)count; > > > > > > > } > > > > > > > @@ -6239,6 +6248,9 @@ nfsd4_state_shrinker_worker(struct work_struct *work) > > > > > > > courtesy_client_reaper(nn); > > > > > > > deleg_reaper(nn); > > > > > > > + spin_lock(&nn->client_lock); > > > > > > > + nn->nfsd_shrinker_active = 0; > > > > > > > + spin_unlock(&nn->client_lock); > > > > > > > } > > > > > > > static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_stid *stp) > > > > -- > > > > Chuck Lever > > -- > > Chuck Lever > > > > > >
On Tue, 2023-01-10 at 11:17 -0800, dai.ngo@oracle.com wrote: > On 1/10/23 10:34 AM, Jeff Layton wrote: > > On Tue, 2023-01-10 at 18:17 +0000, Chuck Lever III wrote: > > > > On Jan 10, 2023, at 12:33 PM, Dai Ngo <dai.ngo@oracle.com> wrote: > > > > > > > > > > > > On 1/10/23 2:30 AM, Jeff Layton wrote: > > > > > On Mon, 2023-01-09 at 22:48 -0800, Dai Ngo wrote: > > > > > > Currently nfsd4_state_shrinker_worker can be schduled multiple times > > > > > > from nfsd4_state_shrinker_count when memory is low. This causes > > > > > > the WARN_ON_ONCE in __queue_delayed_work to trigger. > > > > > > > > > > > > This patch allows only one instance of nfsd4_state_shrinker_worker > > > > > > at a time using the nfsd_shrinker_active flag, protected by the > > > > > > client_lock. > > > > > > > > > > > > Replace mod_delayed_work with queue_delayed_work since we > > > > > > don't expect to modify the delay of any pending work. > > > > > > > > > > > > Fixes: 44df6f439a17 ("NFSD: add delegation reaper to react to low memory condition") > > > > > > Reported-by: Mike Galbraith <efault@gmx.de> > > > > > > Signed-off-by: Dai Ngo <dai.ngo@oracle.com> > > > > > > --- > > > > > > fs/nfsd/netns.h | 1 + > > > > > > fs/nfsd/nfs4state.c | 16 ++++++++++++++-- > > > > > > 2 files changed, 15 insertions(+), 2 deletions(-) > > > > > > > > > > > > diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h > > > > > > index 8c854ba3285b..801d70926442 100644 > > > > > > --- a/fs/nfsd/netns.h > > > > > > +++ b/fs/nfsd/netns.h > > > > > > @@ -196,6 +196,7 @@ struct nfsd_net { > > > > > > atomic_t nfsd_courtesy_clients; > > > > > > struct shrinker nfsd_client_shrinker; > > > > > > struct delayed_work nfsd_shrinker_work; > > > > > > + bool nfsd_shrinker_active; > > > > > > }; > > > > > > /* Simple check to find out if a given net was properly initialized */ > > > > > > diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c > > > > > > index ee56c9466304..e00551af6a11 100644 > > > > > > --- a/fs/nfsd/nfs4state.c > > > > > > +++ b/fs/nfsd/nfs4state.c > > > > > > @@ -4407,11 +4407,20 @@ nfsd4_state_shrinker_count(struct shrinker *shrink, struct shrink_control *sc) > > > > > > struct nfsd_net *nn = container_of(shrink, > > > > > > struct nfsd_net, nfsd_client_shrinker); > > > > > > + spin_lock(&nn->client_lock); > > > > > > + if (nn->nfsd_shrinker_active) { > > > > > > + spin_unlock(&nn->client_lock); > > > > > > + return 0; > > > > > > + } > > > > > Is this extra machinery really necessary? The bool and spinlock don't > > > > > seem to be needed. Typically there is no issue with calling > > > > > queued_delayed_work when the work is already queued. It just returns > > > > > false in that case without doing anything. > > > > When there are multiple calls to mod_delayed_work/queue_delayed_work > > > > we hit the WARN_ON_ONCE's in __queue_delayed_work and __queue_work if > > > > the work is queued but not execute yet. > > > The delay argument of zero is interesting. If it's set to a value > > > greater than zero, do you still see a problem? > > > > > It should be safe to call it with a delay of 0. If it's always queued > > with a delay of 0 though (and it seems to be), you could save a little > > space by using a struct work_struct instead. > > Can I defer this optimization for now? I need some time to look into it. > I'd like to see it as part of the eventual patch that's merged. There's no reason to use a delayed_work struct here at all. You always want the shrinker work to run ASAP. It should be a simple conversion. 
> > > > Also, I'm not sure if this is related, but where do you cancel the > > nfsd_shrinker_work before tearing down the struct nfsd_net? I'm not sure > > that would explains the problem Mike reported, but I think that needs to > > be addressed. > > Yes, good catch. I will add the cancelling in v2 patch. > > Looking over the traces that Mike posted, I suspect this is the real bug, particularly if the server is being restarted during this test. > > > > > > This problem was reported by Mike. I initially tried with only the > > > > bool but that was not enough that was why the spinlock was added. > > > > Mike verified that the patch fixed the problem. > > > > > > > > -Dai > > > > > > > > > > count = atomic_read(&nn->nfsd_courtesy_clients); > > > > > > if (!count) > > > > > > count = atomic_long_read(&num_delegations); > > > > > > - if (count) > > > > > > - mod_delayed_work(laundry_wq, &nn->nfsd_shrinker_work, 0); > > > > > > + if (count) { > > > > > > + nn->nfsd_shrinker_active = true; > > > > > > + spin_unlock(&nn->client_lock); > > > > > > + queue_delayed_work(laundry_wq, &nn->nfsd_shrinker_work, 0); > > > > > > + } else > > > > > > + spin_unlock(&nn->client_lock); > > > > > > return (unsigned long)count; > > > > > > } > > > > > > @@ -6239,6 +6248,9 @@ nfsd4_state_shrinker_worker(struct work_struct *work) > > > > > > courtesy_client_reaper(nn); > > > > > > deleg_reaper(nn); > > > > > > + spin_lock(&nn->client_lock); > > > > > > + nn->nfsd_shrinker_active = 0; > > > > > > + spin_unlock(&nn->client_lock); > > > > > > } > > > > > > static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_stid *stp) > > > -- > > > Chuck Lever > > > > > > > > >
On 1/10/23 11:30 AM, Jeff Layton wrote: > On Tue, 2023-01-10 at 11:17 -0800, dai.ngo@oracle.com wrote: >> On 1/10/23 10:34 AM, Jeff Layton wrote: >>> On Tue, 2023-01-10 at 18:17 +0000, Chuck Lever III wrote: >>>>> On Jan 10, 2023, at 12:33 PM, Dai Ngo <dai.ngo@oracle.com> wrote: >>>>> >>>>> >>>>> On 1/10/23 2:30 AM, Jeff Layton wrote: >>>>>> On Mon, 2023-01-09 at 22:48 -0800, Dai Ngo wrote: >>>>>>> Currently nfsd4_state_shrinker_worker can be schduled multiple times >>>>>>> from nfsd4_state_shrinker_count when memory is low. This causes >>>>>>> the WARN_ON_ONCE in __queue_delayed_work to trigger. >>>>>>> >>>>>>> This patch allows only one instance of nfsd4_state_shrinker_worker >>>>>>> at a time using the nfsd_shrinker_active flag, protected by the >>>>>>> client_lock. >>>>>>> >>>>>>> Replace mod_delayed_work with queue_delayed_work since we >>>>>>> don't expect to modify the delay of any pending work. >>>>>>> >>>>>>> Fixes: 44df6f439a17 ("NFSD: add delegation reaper to react to low memory condition") >>>>>>> Reported-by: Mike Galbraith <efault@gmx.de> >>>>>>> Signed-off-by: Dai Ngo <dai.ngo@oracle.com> >>>>>>> --- >>>>>>> fs/nfsd/netns.h | 1 + >>>>>>> fs/nfsd/nfs4state.c | 16 ++++++++++++++-- >>>>>>> 2 files changed, 15 insertions(+), 2 deletions(-) >>>>>>> >>>>>>> diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h >>>>>>> index 8c854ba3285b..801d70926442 100644 >>>>>>> --- a/fs/nfsd/netns.h >>>>>>> +++ b/fs/nfsd/netns.h >>>>>>> @@ -196,6 +196,7 @@ struct nfsd_net { >>>>>>> atomic_t nfsd_courtesy_clients; >>>>>>> struct shrinker nfsd_client_shrinker; >>>>>>> struct delayed_work nfsd_shrinker_work; >>>>>>> + bool nfsd_shrinker_active; >>>>>>> }; >>>>>>> /* Simple check to find out if a given net was properly initialized */ >>>>>>> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c >>>>>>> index ee56c9466304..e00551af6a11 100644 >>>>>>> --- a/fs/nfsd/nfs4state.c >>>>>>> +++ b/fs/nfsd/nfs4state.c >>>>>>> @@ -4407,11 +4407,20 @@ nfsd4_state_shrinker_count(struct shrinker *shrink, struct shrink_control *sc) >>>>>>> struct nfsd_net *nn = container_of(shrink, >>>>>>> struct nfsd_net, nfsd_client_shrinker); >>>>>>> + spin_lock(&nn->client_lock); >>>>>>> + if (nn->nfsd_shrinker_active) { >>>>>>> + spin_unlock(&nn->client_lock); >>>>>>> + return 0; >>>>>>> + } >>>>>> Is this extra machinery really necessary? The bool and spinlock don't >>>>>> seem to be needed. Typically there is no issue with calling >>>>>> queued_delayed_work when the work is already queued. It just returns >>>>>> false in that case without doing anything. >>>>> When there are multiple calls to mod_delayed_work/queue_delayed_work >>>>> we hit the WARN_ON_ONCE's in __queue_delayed_work and __queue_work if >>>>> the work is queued but not execute yet. >>>> The delay argument of zero is interesting. If it's set to a value >>>> greater than zero, do you still see a problem? >>>> >>> It should be safe to call it with a delay of 0. If it's always queued >>> with a delay of 0 though (and it seems to be), you could save a little >>> space by using a struct work_struct instead. >> Can I defer this optimization for now? I need some time to look into it. >> > I'd like to see it as part of the eventual patch that's merged. There's > no reason to use a delayed_work struct here at all. You always want the > shrinker work to run ASAP. It should be a simple conversion. ok, I'll make the change in v2. >>> Also, I'm not sure if this is related, but where do you cancel the >>> nfsd_shrinker_work before tearing down the struct nfsd_net? 
I'm not sure >>> that would explains the problem Mike reported, but I think that needs to >>> be addressed. >> Yes, good catch. I will add the cancelling in v2 patch. >> >> > Looking over the traces that Mike posted, I suspect this is the real > bug, particularly if the server is being restarted during this test. Yes, I noticed the WARN_ON_ONCE(timer->function != delayed_work_timer_fn) too and this seems to indicate some kind of corruption. However, I'm not sure if Mike's test restarts the nfs-server service. This could be a bug in work queue module when it's under stress. -Dai > >>>>> This problem was reported by Mike. I initially tried with only the >>>>> bool but that was not enough that was why the spinlock was added. >>>>> Mike verified that the patch fixed the problem. >>>>> >>>>> -Dai >>>>> >>>>>>> count = atomic_read(&nn->nfsd_courtesy_clients); >>>>>>> if (!count) >>>>>>> count = atomic_long_read(&num_delegations); >>>>>>> - if (count) >>>>>>> - mod_delayed_work(laundry_wq, &nn->nfsd_shrinker_work, 0); >>>>>>> + if (count) { >>>>>>> + nn->nfsd_shrinker_active = true; >>>>>>> + spin_unlock(&nn->client_lock); >>>>>>> + queue_delayed_work(laundry_wq, &nn->nfsd_shrinker_work, 0); >>>>>>> + } else >>>>>>> + spin_unlock(&nn->client_lock); >>>>>>> return (unsigned long)count; >>>>>>> } >>>>>>> @@ -6239,6 +6248,9 @@ nfsd4_state_shrinker_worker(struct work_struct *work) >>>>>>> courtesy_client_reaper(nn); >>>>>>> deleg_reaper(nn); >>>>>>> + spin_lock(&nn->client_lock); >>>>>>> + nn->nfsd_shrinker_active = 0; >>>>>>> + spin_unlock(&nn->client_lock); >>>>>>> } >>>>>>> static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_stid *stp) >>>> -- >>>> Chuck Lever >>>> >>>> >>>>
On Tue, 2023-01-10 at 11:58 -0800, dai.ngo@oracle.com wrote: > > On 1/10/23 11:30 AM, Jeff Layton wrote: > > > > > > > > > Looking over the traces that Mike posted, I suspect this is the real > > bug, particularly if the server is being restarted during this test. > > Yes, I noticed the WARN_ON_ONCE(timer->function != delayed_work_timer_fn) > too and this seems to indicate some kind of corruption. However, I'm not > sure if Mike's test restarts the nfs-server service. This could be a bug > in work queue module when it's under stress. My reproducer was to merely mount and traverse/md5sum, while that was going on, fire up LTP's min_free_kbytes testcase (memory hog from hell) on the server. Systemthing may well be restarting the server service in response to oomkill. In fact, the struct delayed_work in question at WARN_ON_ONCE() time didn't look the least bit ready for business. FWIW, I had noticed the missing cancel while eyeballing, and stuck one next to the existing one as a hail-mary, but that helped not at all. crash> delayed_work ffff8881601fab48 struct delayed_work { work = { data = { counter = 1 }, entry = { next = 0x0, prev = 0x0 }, func = 0x0 }, timer = { entry = { next = 0x0, pprev = 0x0 }, expires = 0, function = 0x0, flags = 0 }, wq = 0x0, cpu = 0 }
On Wed, 2023-01-11 at 03:34 +0100, Mike Galbraith wrote: > On Tue, 2023-01-10 at 11:58 -0800, dai.ngo@oracle.com wrote: > > > > On 1/10/23 11:30 AM, Jeff Layton wrote: > > > > > > > > > > > > > Looking over the traces that Mike posted, I suspect this is the real > > > bug, particularly if the server is being restarted during this test. > > > > Yes, I noticed the WARN_ON_ONCE(timer->function != delayed_work_timer_fn) > > too and this seems to indicate some kind of corruption. However, I'm not > > sure if Mike's test restarts the nfs-server service. This could be a bug > > in work queue module when it's under stress. > > My reproducer was to merely mount and traverse/md5sum, while that was > going on, fire up LTP's min_free_kbytes testcase (memory hog from hell) > on the server. Systemthing may well be restarting the server service > in response to oomkill. In fact, the struct delayed_work in question > at WARN_ON_ONCE() time didn't look the least bit ready for business. > > FWIW, I had noticed the missing cancel while eyeballing, and stuck one > next to the existing one as a hail-mary, but that helped not at all. > Ok, thanks, that's good to know. I still doubt that the problem is the race that Dai seems to think it is. The workqueue infrastructure has been fairly stable for years. If there were problems with concurrent tasks queueing the same work, the kernel would be blowing up all over the place. > crash> delayed_work ffff8881601fab48 > struct delayed_work { > work = { > data = { > counter = 1 > }, > entry = { > next = 0x0, > prev = 0x0 > }, > func = 0x0 > }, > timer = { > entry = { > next = 0x0, > pprev = 0x0 > }, > expires = 0, > function = 0x0, > flags = 0 > }, > wq = 0x0, > cpu = 0 > } That looks more like a memory scribble or UAF. Merely having multiple tasks calling queue_work at the same time wouldn't be enough to trigger this, IMO. It's more likely that the extra locking is changing the timing of your reproducer somehow. It might be interesting to turn up KASAN if you're able.
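For reference, "turning up KASAN" means rebuilding the server kernel with something along these lines (exact options depend on arch and config):

    CONFIG_KASAN=y
    CONFIG_KASAN_GENERIC=y
    CONFIG_KASAN_INLINE=y
    # optionally, to cover vmalloc'ed areas:
    CONFIG_KASAN_VMALLOC=y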
On Wed, 2023-01-11 at 05:15 -0500, Jeff Layton wrote: > On Wed, 2023-01-11 at 03:34 +0100, Mike Galbraith wrote: > > On Tue, 2023-01-10 at 11:58 -0800, dai.ngo@oracle.com wrote: > > > > > > On 1/10/23 11:30 AM, Jeff Layton wrote: > > > > > > > > > > > > > > > > > Looking over the traces that Mike posted, I suspect this is the real > > > > bug, particularly if the server is being restarted during this test. > > > > > > Yes, I noticed the WARN_ON_ONCE(timer->function != delayed_work_timer_fn) > > > too and this seems to indicate some kind of corruption. However, I'm not > > > sure if Mike's test restarts the nfs-server service. This could be a bug > > > in work queue module when it's under stress. > > > > My reproducer was to merely mount and traverse/md5sum, while that was > > going on, fire up LTP's min_free_kbytes testcase (memory hog from hell) > > on the server. Systemthing may well be restarting the server service > > in response to oomkill. In fact, the struct delayed_work in question > > at WARN_ON_ONCE() time didn't look the least bit ready for business. > > > > FWIW, I had noticed the missing cancel while eyeballing, and stuck one > > next to the existing one as a hail-mary, but that helped not at all. > > > > Ok, thanks, that's good to know. > > I still doubt that the problem is the race that Dai seems to think it > is. The workqueue infrastructure has been fairly stable for years. If > there were problems with concurrent tasks queueing the same work, the > kernel would be blowing up all over the place. > > > crash> delayed_work ffff8881601fab48 > > struct delayed_work { > > work = { > > data = { > > counter = 1 > > }, > > entry = { > > next = 0x0, > > prev = 0x0 > > }, > > func = 0x0 > > }, > > timer = { > > entry = { > > next = 0x0, > > pprev = 0x0 > > }, > > expires = 0, > > function = 0x0, > > flags = 0 > > }, > > wq = 0x0, > > cpu = 0 > > } > > That looks more like a memory scribble or UAF. Merely having multiple > tasks calling queue_work at the same time wouldn't be enough to trigger > this, IMO. It's more likely that the extra locking is changing the > timing of your reproducer somehow. > > It might be interesting to turn up KASAN if you're able. If you still have this vmcore, it might be interesting to do the pointer math and find the nfsd_net structure that contains the above delayed_work. Does the rest of it also seem to be corrupt? My guess is that the corrupted structure extends beyond just the delayed_work above. Also, it might be helpful to do this: kmem -s ffff8881601fab48 ...which should tell us whether and what part of the slab this object is now a part of. That said, net-namespace object allocations are somewhat weird, and I'm not 100% sure they come out of the slab.
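The pointer math asked for above is just an open-coded container_of(); as a sketch (the variable name is illustrative, and in crash the member offset can be read off with "struct nfsd_net -o"):

    /* recover the enclosing nfsd_net from the address of the delayed_work
     * that tripped the warning */
    struct delayed_work *dwork = (void *)0xffff8881601fab48;   /* from the oops */
    struct nfsd_net *nn = container_of(dwork, struct nfsd_net,
                                       nfsd_shrinker_work);
    /* i.e. (char *)dwork - offsetof(struct nfsd_net, nfsd_shrinker_work) */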
On Wed, 2023-01-11 at 05:55 -0500, Jeff Layton wrote: > > > > > crash> delayed_work ffff8881601fab48 > > > struct delayed_work { > > > work = { > > > data = { > > > counter = 1 > > > }, > > > entry = { > > > next = 0x0, > > > prev = 0x0 > > > }, > > > func = 0x0 > > > }, > > > timer = { > > > entry = { > > > next = 0x0, > > > pprev = 0x0 > > > }, > > > expires = 0, > > > function = 0x0, > > > flags = 0 > > > }, > > > wq = 0x0, > > > cpu = 0 > > > } > > > > That looks more like a memory scribble or UAF. Merely having multiple > > tasks calling queue_work at the same time wouldn't be enough to trigger > > this, IMO. It's more likely that the extra locking is changing the > > timing of your reproducer somehow. > > > > It might be interesting to turn up KASAN if you're able. I can try that. > If you still have this vmcore, it might be interesting to do the pointer > math and find the nfsd_net structure that contains the above > delayed_work. Does the rest of it also seem to be corrupt? My guess is > that the corrupted structure extends beyond just the delayed_work above. > > Also, it might be helpful to do this: > > kmem -s ffff8881601fab48 > > ...which should tell us whether and what part of the slab this object is > now a part of. That said, net-namespace object allocations are somewhat > weird, and I'm not 100% sure they come out of the slab. I tossed the vmcore, but can generate another. I had done kmem sans -s previously, still have that. crash> kmem ffff8881601fab48 CACHE OBJSIZE ALLOCATED TOTAL SLABS SSIZE NAME kmem: kmalloc-1k: partial list slab: ffffea0005b20c08 invalid page.inuse: -1 ffff888100041840 1024 2329 2432 76 32k kmalloc-1k SLAB MEMORY NODE TOTAL ALLOCATED FREE ffffea0005807e00 ffff8881601f8000 0 32 32 0 FREE / [ALLOCATED] [ffff8881601fa800] PAGE PHYSICAL MAPPING INDEX CNT FLAGS ffffea0005807e80 1601fa000 dead000000000400 0 0 200000000000000 crash
On 1/11/23 3:19 AM, Mike Galbraith wrote: > On Wed, 2023-01-11 at 05:55 -0500, Jeff Layton wrote: >>>> crash> delayed_work ffff8881601fab48 >>>> struct delayed_work { >>>> work = { >>>> data = { >>>> counter = 1 >>>> }, >>>> entry = { >>>> next = 0x0, >>>> prev = 0x0 >>>> }, >>>> func = 0x0 >>>> }, >>>> timer = { >>>> entry = { >>>> next = 0x0, >>>> pprev = 0x0 >>>> }, >>>> expires = 0, >>>> function = 0x0, >>>> flags = 0 >>>> }, >>>> wq = 0x0, >>>> cpu = 0 >>>> } >>> That looks more like a memory scribble or UAF. Merely having multiple >>> tasks calling queue_work at the same time wouldn't be enough to trigger >>> this, IMO. It's more likely that the extra locking is changing the >>> timing of your reproducer somehow. >>> >>> It might be interesting to turn up KASAN if you're able. > I can try that. > >> If you still have this vmcore, it might be interesting to do the pointer >> math and find the nfsd_net structure that contains the above >> delayed_work. Does the rest of it also seem to be corrupt? My guess is >> that the corrupted structure extends beyond just the delayed_work above. >> >> Also, it might be helpful to do this: >> >> kmem -s ffff8881601fab48 >> >> ...which should tell us whether and what part of the slab this object is >> now a part of. That said, net-namespace object allocations are somewhat >> weird, and I'm not 100% sure they come out of the slab. > I tossed the vmcore, but can generate another. I had done kmem sans -s > previously, still have that. > > crash> kmem ffff8881601fab48 > CACHE OBJSIZE ALLOCATED TOTAL SLABS SSIZE NAME > kmem: kmalloc-1k: partial list slab: ffffea0005b20c08 invalid page.inuse: -1 > ffff888100041840 1024 2329 2432 76 32k kmalloc-1k > SLAB MEMORY NODE TOTAL ALLOCATED FREE > ffffea0005807e00 ffff8881601f8000 0 32 32 0 > FREE / [ALLOCATED] > [ffff8881601fa800] > > PAGE PHYSICAL MAPPING INDEX CNT FLAGS > ffffea0005807e80 1601fa000 dead000000000400 0 0 200000000000000 > crash Can you try: crash7latest> nfsd_net_id nfsd_net_id = $2 = 9 <<=== crash7latest> struct net.gen init_net gen = 0xffff97fc17d07d80 crash7latest> x /10g 0xffff97fc17d07d80 0xffff97fc17d07d80: 0x000000000000000d 0x0000000000000000 0xffff97fc17d07d90: 0x0000000000000000 0xffff97fc0ac40060 0xffff97fc17d07da0: 0xffff994e7bf87600 0xffff98f731172a20 0xffff97fc17d07db0: 0xffff9844b05d9c00 0xffff9832a6a0add0 0xffff97fc17d07dc0: 0xffff984a4470d740 0xffff984a93eb0600 <<=== entry for nfsd_net_id crash7latest> nfsd_net 0xffff984a93eb0600 Thanks, -Dai >
On Wed, 2023-01-11 at 12:19 +0100, Mike Galbraith wrote: > On Wed, 2023-01-11 at 05:55 -0500, Jeff Layton wrote: > > > > > > > crash> delayed_work ffff8881601fab48 > > > > struct delayed_work { > > > > work = { > > > > data = { > > > > counter = 1 > > > > }, > > > > entry = { > > > > next = 0x0, > > > > prev = 0x0 > > > > }, > > > > func = 0x0 > > > > }, > > > > timer = { > > > > entry = { > > > > next = 0x0, > > > > pprev = 0x0 > > > > }, > > > > expires = 0, > > > > function = 0x0, > > > > flags = 0 > > > > }, > > > > wq = 0x0, > > > > cpu = 0 > > > > } > > > > > > That looks more like a memory scribble or UAF. Merely having multiple > > > tasks calling queue_work at the same time wouldn't be enough to trigger > > > this, IMO. It's more likely that the extra locking is changing the > > > timing of your reproducer somehow. > > > > > > It might be interesting to turn up KASAN if you're able. > > I can try that. > > > If you still have this vmcore, it might be interesting to do the pointer > > math and find the nfsd_net structure that contains the above > > delayed_work. Does the rest of it also seem to be corrupt? My guess is > > that the corrupted structure extends beyond just the delayed_work above. > > > > Also, it might be helpful to do this: > > > > kmem -s ffff8881601fab48 > > > > ...which should tell us whether and what part of the slab this object is > > now a part of. That said, net-namespace object allocations are somewhat > > weird, and I'm not 100% sure they come out of the slab. > > I tossed the vmcore, but can generate another. I had done kmem sans -s > previously, still have that. > > crash> kmem ffff8881601fab48 > CACHE OBJSIZE ALLOCATED TOTAL SLABS SSIZE NAME > kmem: kmalloc-1k: partial list slab: ffffea0005b20c08 invalid page.inuse: -1 > ffff888100041840 1024 2329 2432 76 32k kmalloc-1k > SLAB MEMORY NODE TOTAL ALLOCATED FREE > ffffea0005807e00 ffff8881601f8000 0 32 32 0 > FREE / [ALLOCATED] > [ffff8881601fa800] > > PAGE PHYSICAL MAPPING INDEX CNT FLAGS > ffffea0005807e80 1601fa000 dead000000000400 0 0 200000000000000 > crash > Thanks. The pernet allocations do come out of the slab. The allocation is done in ops_init in net/core/namespace.c. This one is a 1k allocation, which jives with the size of nfsd_net (which is 976 bytes). So, this seems to be consistent with where an nfsd_net would have come from. Maybe not a UAF, but I do think we have some sort of mem corruption going on.
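The allocation path referred to above looks roughly like this, heavily simplified from ops_init() in net/core/net_namespace.c (error handling trimmed):

    static int ops_init(const struct pernet_operations *ops, struct net *net)
    {
            void *data = NULL;

            if (ops->id && ops->size) {
                    /* pernet private data comes from the slab: a zeroed
                     * kmalloc-1k object for the ~976-byte nfsd_net */
                    data = kzalloc(ops->size, GFP_KERNEL);
                    if (!data)
                            return -ENOMEM;
                    net_assign_generic(net, *ops->id, data);
            }
            /* then the subsystem's ->init runs, e.g. nfsd_init_net() */
            if (ops->init)
                    return ops->init(net);
            return 0;
    }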
On Wed, 2023-01-11 at 12:19 +0100, Mike Galbraith wrote: > On Wed, 2023-01-11 at 05:55 -0500, Jeff Layton wrote: > > > > > > > > > > > > It might be interesting to turn up KASAN if you're able. > > I can try that. KASAN did not squeak. > > If you still have this vmcore, it might be interesting to do the pointer > > math and find the nfsd_net structure that contains the above > > delayed_work. Does the rest of it also seem to be corrupt? Virgin source with workqueue.c WARN_ON_ONCE() landmine. crash> nfsd_net -x 0xFFFF8881114E9800 struct nfsd_net { cld_net = 0x0, svc_expkey_cache = 0xffff8881420f8a00, svc_export_cache = 0xffff8881420f8800, idtoname_cache = 0xffff8881420f9a00, nametoid_cache = 0xffff8881420f9c00, nfsd4_manager = { list = { next = 0x0, prev = 0x0 }, block_opens = 0x0 }, grace_ended = 0x0, boot_time = 0x0, nfsd_client_dir = 0x0, reclaim_str_hashtbl = 0x0, reclaim_str_hashtbl_size = 0x0, conf_id_hashtbl = 0x0, conf_name_tree = { rb_node = 0x0 }, unconf_id_hashtbl = 0x0, unconf_name_tree = { rb_node = 0x0 }, sessionid_hashtbl = 0x0, client_lru = { next = 0x0, prev = 0x0 }, close_lru = { next = 0x0, prev = 0x0 }, del_recall_lru = { next = 0x0, prev = 0x0 }, blocked_locks_lru = { next = 0x0, prev = 0x0 }, laundromat_work = { work = { data = { counter = 0x0 }, entry = { next = 0x0, prev = 0x0 }, func = 0x0 }, timer = { entry = { next = 0x0, pprev = 0x0 }, expires = 0x0, function = 0x0, flags = 0x0 }, wq = 0x0, cpu = 0x0 }, client_lock = { { rlock = { raw_lock = { { val = { counter = 0x0 }, { locked = 0x0, pending = 0x0 }, { locked_pending = 0x0, tail = 0x0 } } } } } }, blocked_locks_lock = { { rlock = { raw_lock = { { val = { counter = 0x0 }, { locked = 0x0, pending = 0x0 }, { locked_pending = 0x0, tail = 0x0 } } } } } }, rec_file = 0x0, in_grace = 0x0, client_tracking_ops = 0x0, nfsd4_lease = 0x5a, nfsd4_grace = 0x5a, somebody_reclaimed = 0x0, track_reclaim_completes = 0x0, nr_reclaim_complete = { counter = 0x0 }, nfsd_net_up = 0x0, lockd_up = 0x0, writeverf_lock = { seqcount = { seqcount = { sequence = 0x0 } }, lock = { { rlock = { raw_lock = { { val = { counter = 0x0 }, { locked = 0x0, pending = 0x0 }, { locked_pending = 0x0, tail = 0x0 } } } } } } }, writeverf = "\000\000\000\000\000\000\000", max_connections = 0x0, clientid_base = 0x37b4ca7b, clientid_counter = 0x37b4ca7d, clverifier_counter = 0xa8ee910d, nfsd_serv = 0x0, keep_active = 0x0, s2s_cp_cl_id = 0x37b4ca7c, s2s_cp_stateids = { idr_rt = { xa_lock = { { rlock = { raw_lock = { { val = { counter = 0x0 }, { locked = 0x0, pending = 0x0 }, { locked_pending = 0x0, tail = 0x0 } } } } } }, xa_flags = 0x0, xa_head = 0x0 }, idr_base = 0x0, idr_next = 0x0 }, s2s_cp_lock = { { rlock = { raw_lock = { { val = { counter = 0x0 }, { locked = 0x0, pending = 0x0 }, { locked_pending = 0x0, tail = 0x0 } } } } } }, nfsd_versions = 0x0, nfsd4_minorversions = 0x0, drc_hashtbl = 0xffff88810a2f0000, max_drc_entries = 0x14740, maskbits = 0xb, drc_hashsize = 0x800, num_drc_entries = { counter = 0x0 }, counter = {{ lock = { raw_lock = { { val = { counter = 0x0 }, { locked = 0x0, pending = 0x0 }, { locked_pending = 0x0, tail = 0x0 } } } }, count = 0x0, list = { next = 0xffff888103f98dd0, prev = 0xffff8881114e9a18 }, counters = 0x607dc8402e10 }, { lock = { raw_lock = { { val = { counter = 0x0 }, { locked = 0x0, pending = 0x0 }, { locked_pending = 0x0, tail = 0x0 } } } }, count = 0x0, list = { next = 0xffff8881114e99f0, prev = 0xffff88810b5743e0 }, counters = 0x607dc8402e14 }}, longest_chain = 0x0, longest_chain_cachesize = 0x0, 
nfsd_reply_cache_shrinker = { count_objects = 0xffffffffa0e4e9b0 <nfsd_reply_cache_count>, scan_objects = 0xffffffffa0e4f020 <nfsd_reply_cache_scan>, batch = 0x0, seeks = 0x1, flags = 0x1, list = { next = 0xffff888111daf420, prev = 0xffff8881114e9b30 }, nr_deferred = 0xffff88813a544a00 }, nfsd_ssc_lock = { { rlock = { raw_lock = { { val = { counter = 0x0 }, { locked = 0x0, pending = 0x0 }, { locked_pending = 0x0, tail = 0x0 } } } } } }, nfsd_ssc_mount_list = { next = 0x0, prev = 0x0 }, nfsd_ssc_waitq = { lock = { { rlock = { raw_lock = { { val = { counter = 0x0 }, { locked = 0x0, pending = 0x0 }, { locked_pending = 0x0, tail = 0x0 } } } } } }, head = { next = 0x0, prev = 0x0 } }, nfsd_name = "\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000", fcache_disposal = 0x0, siphash_key = { key = {0x2a5ba10a35b36754, 0xd6b3a5a0e7696876} }, nfs4_client_count = { counter = 0x0 }, nfs4_max_clients = 0x1800, nfsd_courtesy_clients = { counter = 0x0 }, nfsd_client_shrinker = { count_objects = 0xffffffffa0e742c0 <nfsd4_state_shrinker_count>, scan_objects = 0xffffffffa0e73a90 <nfsd4_state_shrinker_scan>, batch = 0x0, seeks = 0x2, flags = 0x1, list = { next = 0xffff8881114e9a58, prev = 0xffffffffa131b280 <mmu_shrinker+32> }, nr_deferred = 0xffff88813a5449d8 }, nfsd_shrinker_work = { work = { data = { counter = 0x1 }, entry = { next = 0x0, prev = 0x0 }, func = 0x0 }, timer = { entry = { next = 0x0, pprev = 0x0 }, expires = 0x0, function = 0x0, flags = 0x0 }, wq = 0x0, cpu = 0x0 } } crash> kmem -s 0xFFFF8881114E9800 CACHE OBJSIZE ALLOCATED TOTAL SLABS SSIZE NAME ffff888100042dc0 1024 18325 18352 1147 32k kmalloc-1k SLAB MEMORY NODE TOTAL ALLOCATED FREE ffffea0004453a00 ffff8881114e8000 0 16 16 0 FREE / [ALLOCATED] [ffff8881114e9800] crash>
On Wed, 2023-01-11 at 03:31 -0800, dai.ngo@oracle.com wrote: > > Can you try: > > crash7latest> nfsd_net_id > nfsd_net_id = $2 = 9 <<=== > crash7latest> struct net.gen init_net > gen = 0xffff97fc17d07d80 > crash7latest> x /10g 0xffff97fc17d07d80 > 0xffff97fc17d07d80: 0x000000000000000d 0x0000000000000000 > 0xffff97fc17d07d90: 0x0000000000000000 0xffff97fc0ac40060 > 0xffff97fc17d07da0: 0xffff994e7bf87600 0xffff98f731172a20 > 0xffff97fc17d07db0: 0xffff9844b05d9c00 0xffff9832a6a0add0 > 0xffff97fc17d07dc0: 0xffff984a4470d740 0xffff984a93eb0600 > <<=== entry for nfsd_net_id > crash7latest> nfsd_net 0xffff984a93eb0600 (monkey see monkey do.. eep eep) crash> nfsd_net_id p: gdb request failed: p nfsd_net_id crash> struct net.gen init_net gen = 0xffff88810b7b8a00, crash> x /10g 0xffff88810b7b8a00 0xffff88810b7b8a00: 0x0000000000000010 0x0000000000000000 0xffff88810b7b8a10: 0x0000000000000000 0xffff888101563380 0xffff88810b7b8a20: 0xffff888101ebd900 0xffff888101ebda00 0xffff88810b7b8a30: 0xffff888101f88b80 0xffff8881022056c0 0xffff88810b7b8a40: 0xffff888133b79e00 0xffff888110a2ca00 crash> nfsd_net 0xffff888110a2ca00 struct nfsd_net { cld_net = 0xffff888131c3c000, svc_expkey_cache = 0xffff888110a2cc00, svc_export_cache = 0xffff888110a2ce00, idtoname_cache = 0xffff8881061a8a00, nametoid_cache = 0xffff8881061a8c00, nfsd4_manager = { list = { next = 0xffff888141efa000, prev = 0xffff888133e6ea00 }, block_opens = false }, grace_ended = false, boot_time = -131387065447864, nfsd_client_dir = 0xffff888110a2ca48, reclaim_str_hashtbl = 0xffff88810bed7408, reclaim_str_hashtbl_size = 1083333640, conf_id_hashtbl = 0x0, conf_name_tree = { rb_node = 0xffff888140925c00 }, unconf_id_hashtbl = 0xffff88810181c800, unconf_name_tree = { rb_node = 0x200000000 }, sessionid_hashtbl = 0x1, client_lru = { next = 0x0, prev = 0x0 }, close_lru = { next = 0xffff888110a2caa0, prev = 0xffff888110a2caa0 }, del_recall_lru = { next = 0x0, prev = 0xffffffffffffffff }, blocked_locks_lru = { next = 0x0, prev = 0xffff88810a0e0f00 }, laundromat_work = { work = { data = { counter = 0 }, entry = { next = 0x0, prev = 0x0 }, func = 0x0 }, timer = { entry = { next = 0x0, pprev = 0x0 }, expires = 520729437059154371, function = 0x0, flags = 3526430787 }, wq = 0x24448948f6314540, cpu = 1133332496 }, client_lock = { { rlock = { raw_lock = { { val = { counter = 344528932 }, { locked = 36 '$', pending = 24 '\030' }, { locked_pending = 6180, tail = 5257 } } } } } }, blocked_locks_lock = { { rlock = { raw_lock = { { val = { counter = 1820937252 }, { locked = 36 '$', pending = 76 'L' }, { locked_pending = 19492, tail = 27785 } } } } } }, rec_file = 0x4808245c89483824, in_grace = 137, client_tracking_ops = 0xe8df8948005d8b, nfsd4_lease = -8266309238763028480, nfsd4_grace = 5476377146897729659, somebody_reclaimed = 139, track_reclaim_completes = 99, nr_reclaim_complete = { counter = -402096755 }, nfsd_net_up = false, lockd_up = false, writeverf_lock = { seqcount = { seqcount = { sequence = 140872013 } }, lock = { { rlock = { raw_lock = { { val = { counter = -387479220 }, { locked = 76 'L', pending = 137 '\211' }, { locked_pending = 35148, tail = 59623 } } } } } } }, writeverf = "\000\000\000\000M\211,$", max_connections = 612141896, clientid_base = 59416, clientid_counter = 2336751616, clverifier_counter = 1275601988, nfsd_serv = 0x1024448b48185889, keep_active = 140740940, s2s_cp_cl_id = 1223133516, s2s_cp_stateids = { idr_rt = { xa_lock = { { rlock = { raw_lock = { { val = { counter = 15205257 }, { locked = 137 '\211', pending = 3 '\003' }, { 
locked_pending = 905, tail = 232 } } } } } }, xa_flags = 1224736768, xa_head = 0xf74f6854d241c89 }, idr_base = 276532552, idr_next = 232 }, s2s_cp_lock = { { rlock = { raw_lock = { { val = { counter = 1933134848 }, { locked = 0 '\000', pending = 76 'L' }, { locked_pending = 19456, tail = 29497 } } } } } }, nfsd_versions = 0x443924048b012404, nfsd4_minorversions = 0x2b4820f2424, drc_hashtbl = 0x8678d4d107b8d48, max_drc_entries = 232, maskbits = 1938508800, drc_hashsize = 4287187984, num_drc_entries = { counter = 232 }, counter = {{ lock = { raw_lock = { { val = { counter = 931745024 }, { locked = 0 '\000', pending = 77 'M' }, { locked_pending = 19712, tail = 14217 } } } }, count = -8858645092202691189, list = { next = 0x24648b4cffffff43, prev = 0x246c8b4c24148b40 }, counters = 0xffffffffa0d0b540 <__this_module> }, { lock = { raw_lock = { { val = { counter = 256 }, { locked = 0 '\000', pending = 1 '\001' }, { locked_pending = 256, tail = 0 } } } }, count = -131387314532352, list = { next = 0x0, prev = 0xffffffffa0c949c0 <svc_udp_ops+1248> }, counters = 0xffffffffa0c67f00 <ip_map_put> }}, longest_chain = 2697366144, longest_chain_cachesize = 4294967295, nfsd_reply_cache_shrinker = { count_objects = 0xffffffffa0c67cd0 <ip_map_request>, scan_objects = 0xffffffffa0c68e40 <ip_map_parse>, batch = -1597606560, seeks = 0, flags = 0, list = { next = 0xffffffffa0c67350 <ip_map_alloc>, prev = 0x0 }, nr_deferred = 0xffffffffa0c68a00 <ip_map_match> }, nfsd_ssc_lock = { { rlock = { raw_lock = { { val = { counter = -1597603936 }, { locked = 160 '\240', pending = 127 '\177' }, { locked_pending = 32672, tail = 41158 } } } } } }, nfsd_ssc_mount_list = { next = 0xffffffffa0c68b10 <update>, prev = 0x49 }, nfsd_ssc_waitq = { lock = { { rlock = { raw_lock = { { val = { counter = -1596979232 }, { locked = 224 '\340', pending = 7 '\a' }, { locked_pending = 2016, tail = 41168 } } } } } }, head = { next = 0xffff888110a2ce88, prev = 0xc2 } }, nfsd_name = "\001\000\000\000\000\000\000\000\200\t\021D\201\210\377\377\200\t\021D\201\210\377\377\001\000\000\000\000\000\000\000\032\000\000\000\000\000\000\000\377\377\377\377\377\377\377\377\000\301\303\061\201\210\377\377@$\234\203\377\377\377\377", fcache_disposal = 0x0, siphash_key = { key = {0, 0} }, nfs4_client_count = { counter = 451 }, nfs4_max_clients = 122552490, nfsd_courtesy_clients = { counter = 0 }, nfsd_client_shrinker = { count_objects = 0xe8000002a0a3894c, scan_objects = 0x98b3894400000000, batch = 5483261796049485826, seeks = 15267721, flags = 1275068416, list = { next = 0x18247c8d4918658b, prev = 0x7c8b4900000000e8 }, nr_deferred = 0x4800000000e81824 }, nfsd_shrinker_work = { work = { data = { counter = -8554306017173128307 }, entry = { next = 0x894c00000000e8c4, prev = 0xf7894c00000268a3 }, func = 0x6d8b4800000000e8 }, timer = { entry = { next = 0x270bb8d4818, pprev = 0xbb8d4800000000e8 }, expires = 8118733695596102332, function = 0xe8000002, flags = 45908935 }, wq = 0x147424e783410000, cpu = 553616193 } } crash>
On Wed, 2023-01-11 at 13:15 +0100, Mike Galbraith wrote: > On Wed, 2023-01-11 at 12:19 +0100, Mike Galbraith wrote: > > On Wed, 2023-01-11 at 05:55 -0500, Jeff Layton wrote: > > > > > > > > > > > > > > > > It might be interesting to turn up KASAN if you're able. > > > > I can try that. > > KASAN did not squeak. > > > > If you still have this vmcore, it might be interesting to do the pointer > > > math and find the nfsd_net structure that contains the above > > > delayed_work. Does the rest of it also seem to be corrupt? > > Virgin source with workqueue.c WARN_ON_ONCE() landmine. > Thanks. Mixed bag here... > crash> nfsd_net -x 0xFFFF8881114E9800 > struct nfsd_net { > cld_net = 0x0, > svc_expkey_cache = 0xffff8881420f8a00, > svc_export_cache = 0xffff8881420f8800, > idtoname_cache = 0xffff8881420f9a00, > nametoid_cache = 0xffff8881420f9c00, > nfsd4_manager = { > list = { > next = 0x0, > prev = 0x0 > }, > block_opens = 0x0 > }, > grace_ended = 0x0, > boot_time = 0x0, > nfsd_client_dir = 0x0, > reclaim_str_hashtbl = 0x0, > reclaim_str_hashtbl_size = 0x0, > conf_id_hashtbl = 0x0, > conf_name_tree = { > rb_node = 0x0 > }, > unconf_id_hashtbl = 0x0, > unconf_name_tree = { > rb_node = 0x0 > }, > sessionid_hashtbl = 0x0, > client_lru = { > next = 0x0, > prev = 0x0 > }, > close_lru = { > next = 0x0, > prev = 0x0 > }, > del_recall_lru = { > next = 0x0, > prev = 0x0 > }, > blocked_locks_lru = { > next = 0x0, > prev = 0x0 > }, All of the above list_heads are zeroed out and they shouldn't be. > laundromat_work = { > work = { > data = { > counter = 0x0 > }, > entry = { > next = 0x0, > prev = 0x0 > }, > func = 0x0 > }, > timer = { > entry = { > next = 0x0, > pprev = 0x0 > }, > expires = 0x0, > function = 0x0, > flags = 0x0 > }, > wq = 0x0, > cpu = 0x0 > }, > client_lock = { > { > rlock = { > raw_lock = { > { > val = { > counter = 0x0 > }, > { > locked = 0x0, > pending = 0x0 > }, > { > locked_pending = 0x0, > tail = 0x0 > } > } > } > } > } > }, > blocked_locks_lock = { > { > rlock = { > raw_lock = { > { > val = { > counter = 0x0 > }, > { > locked = 0x0, > pending = 0x0 > }, > { > locked_pending = 0x0, > tail = 0x0 > } > } > } > } > } > }, > rec_file = 0x0, > in_grace = 0x0, > client_tracking_ops = 0x0, > nfsd4_lease = 0x5a, > nfsd4_grace = 0x5a, The grace and lease times look ok, oddly enough. > somebody_reclaimed = 0x0, > track_reclaim_completes = 0x0, > nr_reclaim_complete = { > counter = 0x0 > }, > nfsd_net_up = 0x0, nfsd_net_up is false, which means that this server isn't running (or that the memory here was scribbled over). 
> lockd_up = 0x0, > writeverf_lock = { > seqcount = { > seqcount = { > sequence = 0x0 > } > }, > lock = { > { > rlock = { > raw_lock = { > { > val = { > counter = 0x0 > }, > { > locked = 0x0, > pending = 0x0 > }, > { > locked_pending = 0x0, > tail = 0x0 > } > } > } > } > } > } > }, > writeverf = "\000\000\000\000\000\000\000", > max_connections = 0x0, > clientid_base = 0x37b4ca7b, > clientid_counter = 0x37b4ca7d, > clverifier_counter = 0xa8ee910d, > nfsd_serv = 0x0, > keep_active = 0x0, > s2s_cp_cl_id = 0x37b4ca7c, > s2s_cp_stateids = { > idr_rt = { > xa_lock = { > { > rlock = { > raw_lock = { > { > val = { > counter = 0x0 > }, > { > locked = 0x0, > pending = 0x0 > }, > { > locked_pending = 0x0, > tail = 0x0 > } > } > } > } > } > }, > xa_flags = 0x0, > xa_head = 0x0 > }, > idr_base = 0x0, > idr_next = 0x0 > }, > s2s_cp_lock = { > { > rlock = { > raw_lock = { > { > val = { > counter = 0x0 > }, > { > locked = 0x0, > pending = 0x0 > }, > { > locked_pending = 0x0, > tail = 0x0 > } > } > } > } > } > }, > nfsd_versions = 0x0, > nfsd4_minorversions = 0x0, > drc_hashtbl = 0xffff88810a2f0000, > max_drc_entries = 0x14740, > maskbits = 0xb, > drc_hashsize = 0x800, > num_drc_entries = { > counter = 0x0 > }, > counter = {{ > lock = { > raw_lock = { > { > val = { > counter = 0x0 > }, > { > locked = 0x0, > pending = 0x0 > }, > { > locked_pending = 0x0, > tail = 0x0 > } > } > } > }, > count = 0x0, > list = { > next = 0xffff888103f98dd0, > prev = 0xffff8881114e9a18 > }, > counters = 0x607dc8402e10 > }, { > lock = { > raw_lock = { > { > val = { > counter = 0x0 > }, > { > locked = 0x0, > pending = 0x0 > }, > { > locked_pending = 0x0, > tail = 0x0 > } > } > } > }, > count = 0x0, > list = { > next = 0xffff8881114e99f0, > prev = 0xffff88810b5743e0 > }, > counters = 0x607dc8402e14 > }}, > longest_chain = 0x0, > longest_chain_cachesize = 0x0, > nfsd_reply_cache_shrinker = { > count_objects = 0xffffffffa0e4e9b0 <nfsd_reply_cache_count>, > scan_objects = 0xffffffffa0e4f020 <nfsd_reply_cache_scan>, Shrinker pointers look ok, as does its list_head. > batch = 0x0, > seeks = 0x1, > flags = 0x1, > list = { > next = 0xffff888111daf420, > prev = 0xffff8881114e9b30 > }, > nr_deferred = 0xffff88813a544a00 > }, > nfsd_ssc_lock = { > { > rlock = { > raw_lock = { > { > val = { > counter = 0x0 > }, > { > locked = 0x0, > pending = 0x0 > }, > { > locked_pending = 0x0, > tail = 0x0 > } > } > } > } > } > }, > nfsd_ssc_mount_list = { > next = 0x0, > prev = 0x0 > }, > nfsd_ssc_waitq = { > lock = { > { > rlock = { > raw_lock = { > { > val = { > counter = 0x0 > }, > { > locked = 0x0, > pending = 0x0 > }, > { > locked_pending = 0x0, > tail = 0x0 > } > } > } > } > } > }, > head = { > next = 0x0, > prev = 0x0 > } > }, > nfsd_name = "\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000", > > nfsd_name is usually set to utsname, so that looks bogus. 
> fcache_disposal = 0x0, > siphash_key = { > key = {0x2a5ba10a35b36754, 0xd6b3a5a0e7696876} > }, > nfs4_client_count = { > counter = 0x0 > }, > nfs4_max_clients = 0x1800, > nfsd_courtesy_clients = { > counter = 0x0 > }, > nfsd_client_shrinker = { > count_objects = 0xffffffffa0e742c0 <nfsd4_state_shrinker_count>, > scan_objects = 0xffffffffa0e73a90 <nfsd4_state_shrinker_scan>, > batch = 0x0, > seeks = 0x2, > flags = 0x1, > list = { > next = 0xffff8881114e9a58, > prev = 0xffffffffa131b280 <mmu_shrinker+32> > }, > nr_deferred = 0xffff88813a5449d8 > }, > nfsd_shrinker_work = { > work = { > data = { > counter = 0x1 > }, > entry = { > next = 0x0, > prev = 0x0 > }, > func = 0x0 > }, > timer = { > entry = { > next = 0x0, > pprev = 0x0 > }, > expires = 0x0, > function = 0x0, > flags = 0x0 > }, > wq = 0x0, > cpu = 0x0 > } > } > crash> kmem -s 0xFFFF8881114E9800 > CACHE OBJSIZE ALLOCATED TOTAL SLABS SSIZE NAME > ffff888100042dc0 1024 18325 18352 1147 32k kmalloc-1k > SLAB MEMORY NODE TOTAL ALLOCATED FREE > ffffea0004453a00 ffff8881114e8000 0 16 16 0 > FREE / [ALLOCATED] > [ffff8881114e9800] > crash> > Bit of a mixed bag here. A lot of these fields are corrupt, but not all of them. One thing that might interesting to rule out a UAF would be to explicitly poison this struct in nfsd_exit_net. Basically do something like this at the end of exit_net: memset(net, 0x7c, sizeof(*net)); That might help trigger an oops sooner after the problem occurs. If you're feeling ambitious, another thing you could do is track down some of the running nfsd's in the vmcore, find their rqstp values and see whether the sockets are pointed at the same nfsd_net as the one you found above (see nfsd() function to see how to get from one to the other). If they're pointed at a different nfsd_net that that would suggest that we are looking at a UAF. If it's the same nfsd_net, then I'd lean more toward some sort of memory scribble.
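The chain from a running nfsd thread back to its nfsd_net, as set up at the top of nfsd() in fs/nfsd/nfssvc.c, is roughly the following (simplified; this is what would be replayed by hand in crash):

    struct svc_rqst *rqstp = vrqstp;        /* the kthread's argument */
    struct svc_xprt *perm_sock =
            list_first_entry(&rqstp->rq_server->sv_permsocks,
                             struct svc_xprt, xpt_list);
    struct net *net = perm_sock->xpt_net;
    struct nfsd_net *nn = net_generic(net, nfsd_net_id);
    /* compare this nn against the corrupt-looking one dumped above */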
On Wed, 2023-01-11 at 13:26 +0100, Mike Galbraith wrote: > On Wed, 2023-01-11 at 03:31 -0800, dai.ngo@oracle.com wrote: > > > > Can you try: > > > > crash7latest> nfsd_net_id > > nfsd_net_id = $2 = 9 <<=== > > crash7latest> struct net.gen init_net > > gen = 0xffff97fc17d07d80 > > crash7latest> x /10g 0xffff97fc17d07d80 > > 0xffff97fc17d07d80: 0x000000000000000d 0x0000000000000000 > > 0xffff97fc17d07d90: 0x0000000000000000 0xffff97fc0ac40060 > > 0xffff97fc17d07da0: 0xffff994e7bf87600 0xffff98f731172a20 > > 0xffff97fc17d07db0: 0xffff9844b05d9c00 0xffff9832a6a0add0 > > 0xffff97fc17d07dc0: 0xffff984a4470d740 0xffff984a93eb0600 > > <<=== entry for nfsd_net_id > > crash7latest> nfsd_net 0xffff984a93eb0600 > > (monkey see monkey do.. eep eep) > > crash> nfsd_net_id > p: gdb request failed: p nfsd_net_id > crash> struct net.gen init_net > gen = 0xffff88810b7b8a00, > crash> x /10g 0xffff88810b7b8a00 > 0xffff88810b7b8a00: 0x0000000000000010 0x0000000000000000 > 0xffff88810b7b8a10: 0x0000000000000000 0xffff888101563380 > 0xffff88810b7b8a20: 0xffff888101ebd900 0xffff888101ebda00 > 0xffff88810b7b8a30: 0xffff888101f88b80 0xffff8881022056c0 > 0xffff88810b7b8a40: 0xffff888133b79e00 0xffff888110a2ca00 > crash> nfsd_net 0xffff888110a2ca00 > struct nfsd_net { > cld_net = 0xffff888131c3c000, > svc_expkey_cache = 0xffff888110a2cc00, > svc_export_cache = 0xffff888110a2ce00, > idtoname_cache = 0xffff8881061a8a00, > nametoid_cache = 0xffff8881061a8c00, > nfsd4_manager = { > list = { > next = 0xffff888141efa000, > prev = 0xffff888133e6ea00 > }, > block_opens = false > }, > grace_ended = false, > boot_time = -131387065447864, > nfsd_client_dir = 0xffff888110a2ca48, > reclaim_str_hashtbl = 0xffff88810bed7408, > reclaim_str_hashtbl_size = 1083333640, > conf_id_hashtbl = 0x0, > conf_name_tree = { > rb_node = 0xffff888140925c00 > }, > unconf_id_hashtbl = 0xffff88810181c800, > unconf_name_tree = { > rb_node = 0x200000000 > }, > sessionid_hashtbl = 0x1, > client_lru = { > next = 0x0, > prev = 0x0 > }, > close_lru = { > next = 0xffff888110a2caa0, > prev = 0xffff888110a2caa0 > }, > del_recall_lru = { > next = 0x0, > prev = 0xffffffffffffffff > }, > blocked_locks_lru = { > next = 0x0, > prev = 0xffff88810a0e0f00 > }, > laundromat_work = { > work = { > data = { > counter = 0 > }, > entry = { > next = 0x0, > prev = 0x0 > }, > func = 0x0 > }, > timer = { > entry = { > next = 0x0, > pprev = 0x0 > }, > expires = 520729437059154371, > function = 0x0, > flags = 3526430787 > }, > wq = 0x24448948f6314540, > cpu = 1133332496 > }, > client_lock = { > { > rlock = { > raw_lock = { > { > val = { > counter = 344528932 > }, > { > locked = 36 '$', > pending = 24 '\030' > }, > { > locked_pending = 6180, > tail = 5257 > } > } > } > } > } > }, > blocked_locks_lock = { > { > rlock = { > raw_lock = { > { > val = { > counter = 1820937252 > }, > { > locked = 36 '$', > pending = 76 'L' > }, > { > locked_pending = 19492, > tail = 27785 > } > } > } > } > } > }, > rec_file = 0x4808245c89483824, > in_grace = 137, > client_tracking_ops = 0xe8df8948005d8b, > nfsd4_lease = -8266309238763028480, > nfsd4_grace = 5476377146897729659, > somebody_reclaimed = 139, > track_reclaim_completes = 99, > nr_reclaim_complete = { > counter = -402096755 > }, > nfsd_net_up = false, > lockd_up = false, > writeverf_lock = { > seqcount = { > seqcount = { > sequence = 140872013 > } > }, > lock = { > { > rlock = { > raw_lock = { > { > val = { > counter = -387479220 > }, > { > locked = 76 'L', > pending = 137 '\211' > }, > { > locked_pending = 35148, > tail = 59623 > } > } > } 
> } > } > } > }, > writeverf = "\000\000\000\000M\211,$", > max_connections = 612141896, > clientid_base = 59416, > clientid_counter = 2336751616, > clverifier_counter = 1275601988, > nfsd_serv = 0x1024448b48185889, > keep_active = 140740940, > s2s_cp_cl_id = 1223133516, > s2s_cp_stateids = { > idr_rt = { > xa_lock = { > { > rlock = { > raw_lock = { > { > val = { > counter = 15205257 > }, > { > locked = 137 '\211', > pending = 3 '\003' > }, > { > locked_pending = 905, > tail = 232 > } > } > } > } > } > }, > xa_flags = 1224736768, > xa_head = 0xf74f6854d241c89 > }, > idr_base = 276532552, > idr_next = 232 > }, > s2s_cp_lock = { > { > rlock = { > raw_lock = { > { > val = { > counter = 1933134848 > }, > { > locked = 0 '\000', > pending = 76 'L' > }, > { > locked_pending = 19456, > tail = 29497 > } > } > } > } > } > }, > nfsd_versions = 0x443924048b012404, > nfsd4_minorversions = 0x2b4820f2424, > drc_hashtbl = 0x8678d4d107b8d48, > max_drc_entries = 232, > maskbits = 1938508800, > drc_hashsize = 4287187984, > num_drc_entries = { > counter = 232 > }, > counter = {{ > lock = { > raw_lock = { > { > val = { > counter = 931745024 > }, > { > locked = 0 '\000', > pending = 77 'M' > }, > { > locked_pending = 19712, > tail = 14217 > } > } > } > }, > count = -8858645092202691189, > list = { > next = 0x24648b4cffffff43, > prev = 0x246c8b4c24148b40 > }, > counters = 0xffffffffa0d0b540 <__this_module> > }, { > lock = { > raw_lock = { > { > val = { > counter = 256 > }, > { > locked = 0 '\000', > pending = 1 '\001' > }, > { > locked_pending = 256, > tail = 0 > } > } > } > }, > count = -131387314532352, > list = { > next = 0x0, > prev = 0xffffffffa0c949c0 <svc_udp_ops+1248> > }, > counters = 0xffffffffa0c67f00 <ip_map_put> > }}, > longest_chain = 2697366144, > longest_chain_cachesize = 4294967295, > nfsd_reply_cache_shrinker = { > count_objects = 0xffffffffa0c67cd0 <ip_map_request>, > scan_objects = 0xffffffffa0c68e40 <ip_map_parse>, Looks like this part of the struct may have been overwritten with ip_map_cache_template ? Nothing else in here looks recognizable, so I have to wonder if you actually have the correct nfsd_net pointer here. 
> batch = -1597606560, > seeks = 0, > flags = 0, > list = { > next = 0xffffffffa0c67350 <ip_map_alloc>, > prev = 0x0 > }, > nr_deferred = 0xffffffffa0c68a00 <ip_map_match> > }, > nfsd_ssc_lock = { > { > rlock = { > raw_lock = { > { > val = { > counter = -1597603936 > }, > { > locked = 160 '\240', > pending = 127 '\177' > }, > { > locked_pending = 32672, > tail = 41158 > } > } > } > } > } > }, > nfsd_ssc_mount_list = { > next = 0xffffffffa0c68b10 <update>, > prev = 0x49 > }, > nfsd_ssc_waitq = { > lock = { > { > rlock = { > raw_lock = { > { > val = { > counter = -1596979232 > }, > { > locked = 224 '\340', > pending = 7 '\a' > }, > { > locked_pending = 2016, > tail = 41168 > } > } > } > } > } > }, > head = { > next = 0xffff888110a2ce88, > prev = 0xc2 > } > }, > nfsd_name = "\001\000\000\000\000\000\000\000\200\t\021D\201\210\377\377\200\t\021D\201\210\377\377\001\000\000\000\000\000\000\000\032\000\000\000\000\000\000\000\377\377\377\377\377\377\377\377\000\301\303\061\201\210\377\377@$\234\203\377\377\377\377", > fcache_disposal = 0x0, > siphash_key = { > key = {0, 0} > }, > nfs4_client_count = { > counter = 451 > }, > nfs4_max_clients = 122552490, > nfsd_courtesy_clients = { > counter = 0 > }, > nfsd_client_shrinker = { > count_objects = 0xe8000002a0a3894c, > scan_objects = 0x98b3894400000000, > batch = 5483261796049485826, > seeks = 15267721, > flags = 1275068416, > list = { > next = 0x18247c8d4918658b, > prev = 0x7c8b4900000000e8 > }, > nr_deferred = 0x4800000000e81824 > }, > nfsd_shrinker_work = { > work = { > data = { > counter = -8554306017173128307 > }, > entry = { > next = 0x894c00000000e8c4, > prev = 0xf7894c00000268a3 > }, > func = 0x6d8b4800000000e8 > }, > timer = { > entry = { > next = 0x270bb8d4818, > pprev = 0xbb8d4800000000e8 > }, > expires = 8118733695596102332, > function = 0xe8000002, > flags = 45908935 > }, > wq = 0x147424e783410000, > cpu = 553616193 > } > } > crash> >
On Wed, 2023-01-11 at 07:33 -0500, Jeff Layton wrote: > > One thing that might interesting to rule out a UAF would be to > explicitly poison this struct in nfsd_exit_net. Basically do something > like this at the end of exit_net: > > memset(net, 0x7c, sizeof(*net)); > > That might help trigger an oops sooner after the problem occurs. Blasting net rendered the VM non-booting. Blasting nn OTOH seems to have changed nothing at all. > If you're feeling ambitious, another thing you could do is track down > some of the running nfsd's in the vmcore, find their rqstp values and > see whether the sockets are pointed at the same nfsd_net as the one you > found above (see nfsd() function to see how to get from one to the > other). > > If they're pointed at a different nfsd_net that that would suggest that > we are looking at a UAF. If it's the same nfsd_net, then I'd lean more > toward some sort of memory scribble. Way better: scrawny NFS chimp hands dump to big/strong NFS gorilla :) -Mike
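For completeness, the poisoning as applied to nn corresponds to roughly this sketch (the real nfsd_exit_net() does its usual teardown first):

    static __net_exit void nfsd_exit_net(struct net *net)
    {
            struct nfsd_net *nn = net_generic(net, nfsd_net_id);

            /* ... existing per-net teardown ... */

            /* poison only the nfsd-private blob; poisoning the whole
             * struct net clobbers state shared with the rest of the
             * kernel, which is presumably why that variant left the VM
             * unbootable */
            memset(nn, 0x7c, sizeof(*nn));
    }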
On Wed, 2023-01-11 at 07:33 -0500, Jeff Layton wrote: > On Wed, 2023-01-11 at 13:15 +0100, Mike Galbraith wrote: > > On Wed, 2023-01-11 at 12:19 +0100, Mike Galbraith wrote: > > > On Wed, 2023-01-11 at 05:55 -0500, Jeff Layton wrote: > > > > > > > > > > > > > > > > > > > > It might be interesting to turn up KASAN if you're able. > > > > > > I can try that. > > > > KASAN did not squeak. > > > > > > If you still have this vmcore, it might be interesting to do the pointer > > > > math and find the nfsd_net structure that contains the above > > > > delayed_work. Does the rest of it also seem to be corrupt? > > > > Virgin source with workqueue.c WARN_ON_ONCE() landmine. > > > > Thanks. Mixed bag here... > > > > crash> nfsd_net -x 0xFFFF8881114E9800 > > struct nfsd_net { > > cld_net = 0x0, > > svc_expkey_cache = 0xffff8881420f8a00, > > svc_export_cache = 0xffff8881420f8800, > > idtoname_cache = 0xffff8881420f9a00, > > nametoid_cache = 0xffff8881420f9c00, > > nfsd4_manager = { > > list = { > > next = 0x0, > > prev = 0x0 > > }, > > block_opens = 0x0 > > }, > > grace_ended = 0x0, > > > > boot_time = 0x0, > > nfsd_client_dir = 0x0, > > reclaim_str_hashtbl = 0x0, > > reclaim_str_hashtbl_size = 0x0, > > conf_id_hashtbl = 0x0, > > conf_name_tree = { > > rb_node = 0x0 > > }, > > unconf_id_hashtbl = 0x0, > > unconf_name_tree = { > > rb_node = 0x0 > > }, > > sessionid_hashtbl = 0x0, > > client_lru = { > > next = 0x0, > > prev = 0x0 > > }, > > close_lru = { > > next = 0x0, > > prev = 0x0 > > }, > > del_recall_lru = { > > next = 0x0, > > prev = 0x0 > > }, > > blocked_locks_lru = { > > next = 0x0, > > prev = 0x0 > > }, > > All of the above list_heads are zeroed out and they shouldn't be. > > > laundromat_work = { > > work = { > > data = { > > counter = 0x0 > > }, > > entry = { > > next = 0x0, > > prev = 0x0 > > }, > > func = 0x0 > > }, > > timer = { > > entry = { > > next = 0x0, > > pprev = 0x0 > > }, > > expires = 0x0, > > function = 0x0, > > flags = 0x0 > > }, > > wq = 0x0, > > cpu = 0x0 > > }, > > client_lock = { > > { > > rlock = { > > raw_lock = { > > { > > val = { > > counter = 0x0 > > }, > > { > > locked = 0x0, > > pending = 0x0 > > }, > > { > > locked_pending = 0x0, > > tail = 0x0 > > } > > } > > } > > } > > } > > }, > > blocked_locks_lock = { > > { > > rlock = { > > raw_lock = { > > { > > val = { > > counter = 0x0 > > }, > > { > > locked = 0x0, > > pending = 0x0 > > }, > > { > > locked_pending = 0x0, > > tail = 0x0 > > } > > } > > } > > } > > } > > }, > > rec_file = 0x0, > > in_grace = 0x0, > > client_tracking_ops = 0x0, > > nfsd4_lease = 0x5a, > > nfsd4_grace = 0x5a, > > The grace and lease times look ok, oddly enough. > > > somebody_reclaimed = 0x0, > > track_reclaim_completes = 0x0, > > nr_reclaim_complete = { > > counter = 0x0 > > }, > > nfsd_net_up = 0x0, > > nfsd_net_up is false, which means that this server isn't running (or > that the memory here was scribbled over). 
> > > lockd_up = 0x0, > > writeverf_lock = { > > seqcount = { > > seqcount = { > > sequence = 0x0 > > } > > }, > > lock = { > > { > > rlock = { > > raw_lock = { > > { > > val = { > > counter = 0x0 > > }, > > { > > locked = 0x0, > > pending = 0x0 > > }, > > { > > locked_pending = 0x0, > > tail = 0x0 > > } > > } > > } > > } > > } > > } > > }, > > writeverf = "\000\000\000\000\000\000\000", > > max_connections = 0x0, > > clientid_base = 0x37b4ca7b, > > clientid_counter = 0x37b4ca7d, > > clverifier_counter = 0xa8ee910d, > > nfsd_serv = 0x0, > > keep_active = 0x0, > > s2s_cp_cl_id = 0x37b4ca7c, > > s2s_cp_stateids = { > > idr_rt = { > > xa_lock = { > > { > > rlock = { > > raw_lock = { > > { > > val = { > > counter = 0x0 > > }, > > { > > locked = 0x0, > > pending = 0x0 > > }, > > { > > locked_pending = 0x0, > > tail = 0x0 > > } > > } > > } > > } > > } > > }, > > xa_flags = 0x0, > > xa_head = 0x0 > > }, > > idr_base = 0x0, > > idr_next = 0x0 > > }, > > s2s_cp_lock = { > > { > > rlock = { > > raw_lock = { > > { > > val = { > > counter = 0x0 > > }, > > { > > locked = 0x0, > > pending = 0x0 > > }, > > { > > locked_pending = 0x0, > > tail = 0x0 > > } > > } > > } > > } > > } > > }, > > nfsd_versions = 0x0, > > nfsd4_minorversions = 0x0, > > drc_hashtbl = 0xffff88810a2f0000, > > max_drc_entries = 0x14740, > > maskbits = 0xb, > > drc_hashsize = 0x800, > > num_drc_entries = { > > counter = 0x0 > > }, > > counter = {{ > > lock = { > > raw_lock = { > > { > > val = { > > counter = 0x0 > > }, > > { > > locked = 0x0, > > pending = 0x0 > > }, > > { > > locked_pending = 0x0, > > tail = 0x0 > > } > > } > > } > > }, > > count = 0x0, > > list = { > > next = 0xffff888103f98dd0, > > prev = 0xffff8881114e9a18 > > }, > > counters = 0x607dc8402e10 > > }, { > > lock = { > > raw_lock = { > > { > > val = { > > counter = 0x0 > > }, > > { > > locked = 0x0, > > pending = 0x0 > > }, > > { > > locked_pending = 0x0, > > tail = 0x0 > > } > > } > > } > > }, > > count = 0x0, > > list = { > > next = 0xffff8881114e99f0, > > prev = 0xffff88810b5743e0 > > }, > > counters = 0x607dc8402e14 > > }}, > > longest_chain = 0x0, > > longest_chain_cachesize = 0x0, > > nfsd_reply_cache_shrinker = { > > count_objects = 0xffffffffa0e4e9b0 <nfsd_reply_cache_count>, > > scan_objects = 0xffffffffa0e4f020 <nfsd_reply_cache_scan>, > > Shrinker pointers look ok, as does its list_head. I think I might see what's going on here: This struct is consistent with an nfsd_net structure that allocated and had its ->init routine run on it, but that has not had nfsd started in its namespace. pernet structs are kzalloc'ed. The shrinker registrations and the lease/grace times are set just after allocation in the ->init routine. The rest of the fields are not set until nfsd is started (specifically when /proc/fs/nfsd/threads is written to). My guess is that Mike is doing some activity that creates new net namespaces, and then once we start getting into OOM conditions the shrinker ends up hitting uninitialized fields in the nfsd_net and kaboom. I haven't yet looked to see when this bug was introduced, but the right fix is probably to ensure that everything important for this job is initialized after the pernet ->init operation runs.
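That split is visible in the pernet init path. Reconstructed from memory of the 6.2-era source (names and details approximate), the ->init routine does little more than this, which matches the dump above: lease/grace set to 90 (0x5a), counters seeded, shrinker registered, everything else still zero:

    /* called from nfsd_init_net(); the list_heads, locks, hash tables etc.
     * are only set up later, when nfsd is started in the namespace */
    int nfsd4_init_leases_net(struct nfsd_net *nn)
    {
            nn->nfsd4_lease = 90;
            nn->nfsd4_grace = 90;
            /* clientid/verifier counters, nfs4_max_clients,
             * nfsd_courtesy_clients initialization ... */

            nn->nfsd_client_shrinker.count_objects = nfsd4_state_shrinker_count;
            nn->nfsd_client_shrinker.scan_objects = nfsd4_state_shrinker_scan;
            nn->nfsd_client_shrinker.seeks = DEFAULT_SEEKS;
            return register_shrinker(&nn->nfsd_client_shrinker, "nfsd-client");
    }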
On Wed, 2023-01-11 at 09:01 -0500, Jeff Layton wrote: > On Wed, 2023-01-11 at 07:33 -0500, Jeff Layton wrote: > > On Wed, 2023-01-11 at 13:15 +0100, Mike Galbraith wrote: > > > On Wed, 2023-01-11 at 12:19 +0100, Mike Galbraith wrote: > > > > On Wed, 2023-01-11 at 05:55 -0500, Jeff Layton wrote: > > > > > > > > > > > > > > > > > > > > > > > > It might be interesting to turn up KASAN if you're able. > > > > > > > > I can try that. > > > > > > KASAN did not squeak. > > > > > > > > If you still have this vmcore, it might be interesting to do the pointer > > > > > math and find the nfsd_net structure that contains the above > > > > > delayed_work. Does the rest of it also seem to be corrupt? > > > > > > Virgin source with workqueue.c WARN_ON_ONCE() landmine. > > > > > > > Thanks. Mixed bag here... > > > > > > > crash> nfsd_net -x 0xFFFF8881114E9800 > > > struct nfsd_net { > > > cld_net = 0x0, > > > svc_expkey_cache = 0xffff8881420f8a00, > > > svc_export_cache = 0xffff8881420f8800, > > > idtoname_cache = 0xffff8881420f9a00, > > > nametoid_cache = 0xffff8881420f9c00, > > > nfsd4_manager = { > > > list = { > > > next = 0x0, > > > prev = 0x0 > > > }, > > > block_opens = 0x0 > > > }, > > > grace_ended = 0x0, > > > > > > > boot_time = 0x0, > > > nfsd_client_dir = 0x0, > > > reclaim_str_hashtbl = 0x0, > > > reclaim_str_hashtbl_size = 0x0, > > > conf_id_hashtbl = 0x0, > > > conf_name_tree = { > > > rb_node = 0x0 > > > }, > > > unconf_id_hashtbl = 0x0, > > > unconf_name_tree = { > > > rb_node = 0x0 > > > }, > > > sessionid_hashtbl = 0x0, > > > client_lru = { > > > next = 0x0, > > > prev = 0x0 > > > }, > > > close_lru = { > > > next = 0x0, > > > prev = 0x0 > > > }, > > > del_recall_lru = { > > > next = 0x0, > > > prev = 0x0 > > > }, > > > blocked_locks_lru = { > > > next = 0x0, > > > prev = 0x0 > > > }, > > > > All of the above list_heads are zeroed out and they shouldn't be. > > > > > laundromat_work = { > > > work = { > > > data = { > > > counter = 0x0 > > > }, > > > entry = { > > > next = 0x0, > > > prev = 0x0 > > > }, > > > func = 0x0 > > > }, > > > timer = { > > > entry = { > > > next = 0x0, > > > pprev = 0x0 > > > }, > > > expires = 0x0, > > > function = 0x0, > > > flags = 0x0 > > > }, > > > wq = 0x0, > > > cpu = 0x0 > > > }, > > > client_lock = { > > > { > > > rlock = { > > > raw_lock = { > > > { > > > val = { > > > counter = 0x0 > > > }, > > > { > > > locked = 0x0, > > > pending = 0x0 > > > }, > > > { > > > locked_pending = 0x0, > > > tail = 0x0 > > > } > > > } > > > } > > > } > > > } > > > }, > > > blocked_locks_lock = { > > > { > > > rlock = { > > > raw_lock = { > > > { > > > val = { > > > counter = 0x0 > > > }, > > > { > > > locked = 0x0, > > > pending = 0x0 > > > }, > > > { > > > locked_pending = 0x0, > > > tail = 0x0 > > > } > > > } > > > } > > > } > > > } > > > }, > > > rec_file = 0x0, > > > in_grace = 0x0, > > > client_tracking_ops = 0x0, > > > nfsd4_lease = 0x5a, > > > nfsd4_grace = 0x5a, > > > > The grace and lease times look ok, oddly enough. > > > > > somebody_reclaimed = 0x0, > > > track_reclaim_completes = 0x0, > > > nr_reclaim_complete = { > > > counter = 0x0 > > > }, > > > nfsd_net_up = 0x0, > > > > nfsd_net_up is false, which means that this server isn't running (or > > that the memory here was scribbled over). 
> > > > > lockd_up = 0x0, > > > writeverf_lock = { > > > seqcount = { > > > seqcount = { > > > sequence = 0x0 > > > } > > > }, > > > lock = { > > > { > > > rlock = { > > > raw_lock = { > > > { > > > val = { > > > counter = 0x0 > > > }, > > > { > > > locked = 0x0, > > > pending = 0x0 > > > }, > > > { > > > locked_pending = 0x0, > > > tail = 0x0 > > > } > > > } > > > } > > > } > > > } > > > } > > > }, > > > writeverf = "\000\000\000\000\000\000\000", > > > max_connections = 0x0, > > > clientid_base = 0x37b4ca7b, > > > clientid_counter = 0x37b4ca7d, > > > clverifier_counter = 0xa8ee910d, > > > nfsd_serv = 0x0, > > > keep_active = 0x0, > > > s2s_cp_cl_id = 0x37b4ca7c, > > > s2s_cp_stateids = { > > > idr_rt = { > > > xa_lock = { > > > { > > > rlock = { > > > raw_lock = { > > > { > > > val = { > > > counter = 0x0 > > > }, > > > { > > > locked = 0x0, > > > pending = 0x0 > > > }, > > > { > > > locked_pending = 0x0, > > > tail = 0x0 > > > } > > > } > > > } > > > } > > > } > > > }, > > > xa_flags = 0x0, > > > xa_head = 0x0 > > > }, > > > idr_base = 0x0, > > > idr_next = 0x0 > > > }, > > > s2s_cp_lock = { > > > { > > > rlock = { > > > raw_lock = { > > > { > > > val = { > > > counter = 0x0 > > > }, > > > { > > > locked = 0x0, > > > pending = 0x0 > > > }, > > > { > > > locked_pending = 0x0, > > > tail = 0x0 > > > } > > > } > > > } > > > } > > > } > > > }, > > > nfsd_versions = 0x0, > > > nfsd4_minorversions = 0x0, > > > drc_hashtbl = 0xffff88810a2f0000, > > > max_drc_entries = 0x14740, > > > maskbits = 0xb, > > > drc_hashsize = 0x800, > > > num_drc_entries = { > > > counter = 0x0 > > > }, > > > counter = {{ > > > lock = { > > > raw_lock = { > > > { > > > val = { > > > counter = 0x0 > > > }, > > > { > > > locked = 0x0, > > > pending = 0x0 > > > }, > > > { > > > locked_pending = 0x0, > > > tail = 0x0 > > > } > > > } > > > } > > > }, > > > count = 0x0, > > > list = { > > > next = 0xffff888103f98dd0, > > > prev = 0xffff8881114e9a18 > > > }, > > > counters = 0x607dc8402e10 > > > }, { > > > lock = { > > > raw_lock = { > > > { > > > val = { > > > counter = 0x0 > > > }, > > > { > > > locked = 0x0, > > > pending = 0x0 > > > }, > > > { > > > locked_pending = 0x0, > > > tail = 0x0 > > > } > > > } > > > } > > > }, > > > count = 0x0, > > > list = { > > > next = 0xffff8881114e99f0, > > > prev = 0xffff88810b5743e0 > > > }, > > > counters = 0x607dc8402e14 > > > }}, > > > longest_chain = 0x0, > > > longest_chain_cachesize = 0x0, > > > nfsd_reply_cache_shrinker = { > > > count_objects = 0xffffffffa0e4e9b0 <nfsd_reply_cache_count>, > > > scan_objects = 0xffffffffa0e4f020 <nfsd_reply_cache_scan>, > > > > Shrinker pointers look ok, as does its list_head. > > I think I might see what's going on here: > > This struct is consistent with an nfsd_net structure that allocated and > had its ->init routine run on it, but that has not had nfsd started in > its namespace. > > pernet structs are kzalloc'ed. The shrinker registrations and the > lease/grace times are set just after allocation in the ->init routine. > The rest of the fields are not set until nfsd is started (specifically > when /proc/fs/nfsd/threads is written to). > > My guess is that Mike is doing some activity that creates new net > namespaces, and then once we start getting into OOM conditions the > shrinker ends up hitting uninitialized fields in the nfsd_net and > kaboom. 
> > I haven't yet looked to see when this bug was introduced, but the right > fix is probably to ensure that everything important for this job is > initialized after the pernet ->init operation runs. It might even be best to just move the shrinker registrations to occur when nfsd is started rather than doing it at nfsd_net initialization time. Obviously, if nfsd isn't running, then it'll have nothing to free, and it's a waste of time to register the shrinker at all.
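A sketch of that direction (not necessarily the patch that eventually landed): register the client shrinker from nfs4_state_start_net(), i.e. when nfsd is started, and unregister it in the matching shutdown path, so the callbacks can never run against a namespace where nfsd was never started. The shrinker's callback fields are assumed to be filled in elsewhere:

    static int nfs4_state_start_net(struct net *net)
    {
            struct nfsd_net *nn = net_generic(net, nfsd_net_id);
            int ret;

            ret = register_shrinker(&nn->nfsd_client_shrinker, "nfsd-client");
            if (ret)
                    return ret;
            /* ... existing hash table / laundromat setup ... */
            return 0;
    }

    static void nfs4_state_shutdown_net(struct net *net)
    {
            struct nfsd_net *nn = net_generic(net, nfsd_net_id);

            unregister_shrinker(&nn->nfsd_client_shrinker);
            /* ... existing shutdown ... */
    }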