
[v5,2/2] mm: vmscan: refactor reclaim_state helpers

Message ID 20230405185427.1246289-3-yosryahmed@google.com (mailing list archive)
State Deferred, archived
Series Ignore non-LRU-based reclaim in memcg reclaim

Commit Message

Yosry Ahmed April 5, 2023, 6:54 p.m. UTC
During reclaim, we keep track of pages reclaimed by means other than
LRU-based reclaim through scan_control->reclaim_state->reclaimed_slab;
a pointer to reclaim_state is stashed in the current task_struct.

However, we keep track of more than just reclaimed slab pages through
this. We also use it for clean file pages dropped through pruned inodes,
and xfs buffer pages freed. Rename reclaimed_slab to reclaimed, and add
a helper function that wraps updating it through current, so that future
changes to this logic are contained within mm/vmscan.c.

Additionally, add a flush_reclaim_state() helper to wrap using
reclaim_state->reclaimed to update sc->nr_reclaimed, and use that
helper to add an elaborate comment about why we only do the update for
global reclaim.

Finally, move set_task_reclaim_state() next to flush_reclaim_state() so
that all reclaim_state helpers are in close proximity for easier
readability.

Signed-off-by: Yosry Ahmed <yosryahmed@google.com>
---
 fs/inode.c           |  3 +-
 fs/xfs/xfs_buf.c     |  3 +-
 include/linux/swap.h | 17 +++++++++-
 mm/slab.c            |  3 +-
 mm/slob.c            |  6 ++--
 mm/slub.c            |  5 ++-
 mm/vmscan.c          | 75 ++++++++++++++++++++++++++++++++------------
 7 files changed, 78 insertions(+), 34 deletions(-)
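
For reference, a rough sketch of how the two vmscan helpers pair up in a
reclaim entry point (illustrative only; simplified, and the entry-point
name here is made up rather than taken from mm/vmscan.c):

	static unsigned long do_reclaim_sketch(struct scan_control *sc)
	{
		struct reclaim_state rs = { .reclaimed = 0 };

		/* Stash the pointer so slab/inode/xfs frees can account pages. */
		set_task_reclaim_state(current, &rs);

		/* ... LRU scanning and shrinker calls happen here ... */

		/* Fold non-LRU reclaim into sc->nr_reclaimed (global reclaim only). */
		flush_reclaim_state(sc, &rs);

		/* Clear the pointer; the helper warns on nulling an already-NULL state. */
		set_task_reclaim_state(current, NULL);

		return sc->nr_reclaimed;
	}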

Comments

Tim Chen April 6, 2023, 5:31 p.m. UTC | #1
On Wed, 2023-04-05 at 18:54 +0000, Yosry Ahmed wrote:
> During reclaim, we keep track of pages reclaimed by means other than
> LRU-based reclaim through scan_control->reclaim_state->reclaimed_slab;
> a pointer to reclaim_state is stashed in the current task_struct.
> 
> However, we keep track of more than just reclaimed slab pages through
> this. We also use it for clean file pages dropped through pruned inodes,
> and xfs buffer pages freed. Rename reclaimed_slab to reclaimed, and add
> a helper function that wraps updating it through current, so that future
> changes to this logic are contained within mm/vmscan.c.
> 
> Additionally, add a flush_reclaim_state() helper to wrap using
> reclaim_state->reclaimed to update sc->nr_reclaimed, and use that
> helper to add an elaborate comment about why we only do the update for
> global reclaim.
> 
> Finally, move set_task_reclaim_state() next to flush_reclaim_state() so
> that all reclaim_state helpers are in close proximity for easier
> readability.
> 
> Signed-off-by: Yosry Ahmed <yosryahmed@google.com>
> ---
>  fs/inode.c           |  3 +-
>  fs/xfs/xfs_buf.c     |  3 +-
>  include/linux/swap.h | 17 +++++++++-
>  mm/slab.c            |  3 +-
>  mm/slob.c            |  6 ++--
>  mm/slub.c            |  5 ++-
>  mm/vmscan.c          | 75 ++++++++++++++++++++++++++++++++------------
>  7 files changed, 78 insertions(+), 34 deletions(-)
> 
> diff --git a/fs/inode.c b/fs/inode.c
> index 4558dc2f13557..e60fcc41faf17 100644
> --- a/fs/inode.c
> +++ b/fs/inode.c
> @@ -864,8 +864,7 @@ static enum lru_status inode_lru_isolate(struct list_head *item,
>  				__count_vm_events(KSWAPD_INODESTEAL, reap);
>  			else
>  				__count_vm_events(PGINODESTEAL, reap);
> -			if (current->reclaim_state)
> -				current->reclaim_state->reclaimed_slab += reap;
> +			mm_account_reclaimed_pages(reap);
>  		}
>  		iput(inode);
>  		spin_lock(lru_lock);
> diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
> index 54c774af6e1c6..15d1e5a7c2d34 100644
> --- a/fs/xfs/xfs_buf.c
> +++ b/fs/xfs/xfs_buf.c
> @@ -286,8 +286,7 @@ xfs_buf_free_pages(
>  		if (bp->b_pages[i])
>  			__free_page(bp->b_pages[i]);
>  	}
> -	if (current->reclaim_state)
> -		current->reclaim_state->reclaimed_slab += bp->b_page_count;
> +	mm_account_reclaimed_pages(bp->b_page_count);
>  
>  	if (bp->b_pages != bp->b_page_array)
>  		kmem_free(bp->b_pages);
> diff --git a/include/linux/swap.h b/include/linux/swap.h
> index 209a425739a9f..e131ac155fb95 100644
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -153,13 +153,28 @@ union swap_header {
>   * memory reclaim
>   */
>  struct reclaim_state {
> -	unsigned long reclaimed_slab;
> +	/* pages reclaimed outside of LRU-based reclaim */
> +	unsigned long reclaimed;
>  #ifdef CONFIG_LRU_GEN
>  	/* per-thread mm walk data */
>  	struct lru_gen_mm_walk *mm_walk;
>  #endif
>  };
>  
> +/*
> + * mm_account_reclaimed_pages(): account reclaimed pages outside of LRU-based
> + * reclaim
> + * @pages: number of pages reclaimed
> + *
> + * If the current process is undergoing a reclaim operation, increment the
> + * number of reclaimed pages by @pages.
> + */
> +static inline void mm_account_reclaimed_pages(unsigned long pages)
> +{
> +	if (current->reclaim_state)
> +		current->reclaim_state->reclaimed += pages;
> +}
> +
>  #ifdef __KERNEL__
>  
>  struct address_space;
> diff --git a/mm/slab.c b/mm/slab.c
> index dabc2a671fc6f..64bf1de817b24 100644
> --- a/mm/slab.c
> +++ b/mm/slab.c
> @@ -1392,8 +1392,7 @@ static void kmem_freepages(struct kmem_cache *cachep, struct slab *slab)
>  	smp_wmb();
>  	__folio_clear_slab(folio);
>  
> -	if (current->reclaim_state)
> -		current->reclaim_state->reclaimed_slab += 1 << order;
> +	mm_account_reclaimed_pages(1 << order);
>  	unaccount_slab(slab, order, cachep);
>  	__free_pages(&folio->page, order);
>  }
> diff --git a/mm/slob.c b/mm/slob.c
> index fe567fcfa3a39..79cc8680c973c 100644
> --- a/mm/slob.c
> +++ b/mm/slob.c
> @@ -61,7 +61,7 @@
>  #include <linux/slab.h>
>  
>  #include <linux/mm.h>
> -#include <linux/swap.h> /* struct reclaim_state */
> +#include <linux/swap.h> /* mm_account_reclaimed_pages() */
>  #include <linux/cache.h>
>  #include <linux/init.h>
>  #include <linux/export.h>
> @@ -211,9 +211,7 @@ static void slob_free_pages(void *b, int order)
>  {
>  	struct page *sp = virt_to_page(b);
>  
> -	if (current->reclaim_state)
> -		current->reclaim_state->reclaimed_slab += 1 << order;
> -
> +	mm_account_reclaimed_pages(1 << order);
>  	mod_node_page_state(page_pgdat(sp), NR_SLAB_UNRECLAIMABLE_B,
>  			    -(PAGE_SIZE << order));
>  	__free_pages(sp, order);
> diff --git a/mm/slub.c b/mm/slub.c
> index 39327e98fce34..7aa30eef82350 100644
> --- a/mm/slub.c
> +++ b/mm/slub.c
> @@ -11,7 +11,7 @@
>   */
>  
>  #include <linux/mm.h>
> -#include <linux/swap.h> /* struct reclaim_state */
> +#include <linux/swap.h> /* mm_account_reclaimed_pages() */
>  #include <linux/module.h>
>  #include <linux/bit_spinlock.h>
>  #include <linux/interrupt.h>
> @@ -2063,8 +2063,7 @@ static void __free_slab(struct kmem_cache *s, struct slab *slab)
>  	/* Make the mapping reset visible before clearing the flag */
>  	smp_wmb();
>  	__folio_clear_slab(folio);
> -	if (current->reclaim_state)
> -		current->reclaim_state->reclaimed_slab += pages;
> +	mm_account_reclaimed_pages(pages);
>  	unaccount_slab(slab, order, s);
>  	__free_pages(&folio->page, order);
>  }
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index c82bd89f90364..049e39202e6ce 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -188,18 +188,6 @@ struct scan_control {
>   */
>  int vm_swappiness = 60;
>  
> -static void set_task_reclaim_state(struct task_struct *task,
> -				   struct reclaim_state *rs)
> -{
> -	/* Check for an overwrite */
> -	WARN_ON_ONCE(rs && task->reclaim_state);
> -
> -	/* Check for the nulling of an already-nulled member */
> -	WARN_ON_ONCE(!rs && !task->reclaim_state);
> -
> -	task->reclaim_state = rs;
> -}
> -
>  LIST_HEAD(shrinker_list);
>  DECLARE_RWSEM(shrinker_rwsem);
>  
> @@ -511,6 +499,59 @@ static bool writeback_throttling_sane(struct scan_control *sc)
>  }
>  #endif
>  
> +static void set_task_reclaim_state(struct task_struct *task,
> +				   struct reclaim_state *rs)
> +{
> +	/* Check for an overwrite */
> +	WARN_ON_ONCE(rs && task->reclaim_state);
> +
> +	/* Check for the nulling of an already-nulled member */
> +	WARN_ON_ONCE(!rs && !task->reclaim_state);
> +
> +	task->reclaim_state = rs;
> +}
> +
> +/*
> + * flush_reclaim_state(): add pages reclaimed outside of LRU-based reclaim to
> + * scan_control->nr_reclaimed.
> + */
> +static void flush_reclaim_state(struct scan_control *sc,
> +				struct reclaim_state *rs)
> +{
> +	/*
> +	 * Currently, reclaim_state->reclaimed includes three types of pages
> +	 * freed outside of vmscan:
> +	 * (1) Slab pages.
> +	 * (2) Clean file pages from pruned inodes.
> +	 * (3) XFS freed buffer pages.
> +	 *
> +	 * For all of these cases, we have no way of finding out whether these
> +	 * pages were related to the memcg under reclaim. For example, a freed
> +	 * slab page could have had only a single object charged to the memcg

Minor nits:
s/could have had/could have

> +	 * under reclaim. Also, populated inodes are not on shrinker LRUs
> +	 * anymore except on highmem systems.
> +	 *
> +	 * Instead of over-reporting the reclaimed pages in a memcg reclaim,
> +	 * only count such pages in global reclaim. This prevents unnecessary

May be clearer to say:
This prevents under-reclaiming the target memcg, and unnecessary

> +	 * retries during memcg charging and false positives from proactive
> +	 * reclaim (memory.reclaim).
> +	 *

Tim
Yosry Ahmed April 6, 2023, 5:43 p.m. UTC | #2
On Thu, Apr 6, 2023 at 10:32 AM Tim Chen <tim.c.chen@linux.intel.com> wrote:
>
> On Wed, 2023-04-05 at 18:54 +0000, Yosry Ahmed wrote:
> > During reclaim, we keep track of pages reclaimed by means other than
> > LRU-based reclaim through scan_control->reclaim_state->reclaimed_slab;
> > a pointer to reclaim_state is stashed in the current task_struct.
> >
> > However, we keep track of more than just reclaimed slab pages through
> > this. We also use it for clean file pages dropped through pruned inodes,
> > and xfs buffer pages freed. Rename reclaimed_slab to reclaimed, and add
> > a helper function that wraps updating it through current, so that future
> > changes to this logic are contained within mm/vmscan.c.
> >
> > Additionally, add a flush_reclaim_state() helper to wrap using
> > reclaim_state->reclaimed to update sc->nr_reclaimed, and use that
> > helper to add an elaborate comment about why we only do the update for
> > global reclaim.
> >
> > Finally, move set_task_reclaim_state() next to flush_reclaim_state() so
> > that all reclaim_state helpers are in close proximity for easier
> > readability.
> >
> > Signed-off-by: Yosry Ahmed <yosryahmed@google.com>
> > ---
> >  fs/inode.c           |  3 +-
> >  fs/xfs/xfs_buf.c     |  3 +-
> >  include/linux/swap.h | 17 +++++++++-
> >  mm/slab.c            |  3 +-
> >  mm/slob.c            |  6 ++--
> >  mm/slub.c            |  5 ++-
> >  mm/vmscan.c          | 75 ++++++++++++++++++++++++++++++++------------
> >  7 files changed, 78 insertions(+), 34 deletions(-)
> >
> > diff --git a/fs/inode.c b/fs/inode.c
> > index 4558dc2f13557..e60fcc41faf17 100644
> > --- a/fs/inode.c
> > +++ b/fs/inode.c
> > @@ -864,8 +864,7 @@ static enum lru_status inode_lru_isolate(struct list_head *item,
> >                               __count_vm_events(KSWAPD_INODESTEAL, reap);
> >                       else
> >                               __count_vm_events(PGINODESTEAL, reap);
> > -                     if (current->reclaim_state)
> > -                             current->reclaim_state->reclaimed_slab += reap;
> > +                     mm_account_reclaimed_pages(reap);
> >               }
> >               iput(inode);
> >               spin_lock(lru_lock);
> > diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
> > index 54c774af6e1c6..15d1e5a7c2d34 100644
> > --- a/fs/xfs/xfs_buf.c
> > +++ b/fs/xfs/xfs_buf.c
> > @@ -286,8 +286,7 @@ xfs_buf_free_pages(
> >               if (bp->b_pages[i])
> >                       __free_page(bp->b_pages[i]);
> >       }
> > -     if (current->reclaim_state)
> > -             current->reclaim_state->reclaimed_slab += bp->b_page_count;
> > +     mm_account_reclaimed_pages(bp->b_page_count);
> >
> >       if (bp->b_pages != bp->b_page_array)
> >               kmem_free(bp->b_pages);
> > diff --git a/include/linux/swap.h b/include/linux/swap.h
> > index 209a425739a9f..e131ac155fb95 100644
> > --- a/include/linux/swap.h
> > +++ b/include/linux/swap.h
> > @@ -153,13 +153,28 @@ union swap_header {
> >   * memory reclaim
> >   */
> >  struct reclaim_state {
> > -     unsigned long reclaimed_slab;
> > +     /* pages reclaimed outside of LRU-based reclaim */
> > +     unsigned long reclaimed;
> >  #ifdef CONFIG_LRU_GEN
> >       /* per-thread mm walk data */
> >       struct lru_gen_mm_walk *mm_walk;
> >  #endif
> >  };
> >
> > +/*
> > + * mm_account_reclaimed_pages(): account reclaimed pages outside of LRU-based
> > + * reclaim
> > + * @pages: number of pages reclaimed
> > + *
> > + * If the current process is undergoing a reclaim operation, increment the
> > + * number of reclaimed pages by @pages.
> > + */
> > +static inline void mm_account_reclaimed_pages(unsigned long pages)
> > +{
> > +     if (current->reclaim_state)
> > +             current->reclaim_state->reclaimed += pages;
> > +}
> > +
> >  #ifdef __KERNEL__
> >
> >  struct address_space;
> > diff --git a/mm/slab.c b/mm/slab.c
> > index dabc2a671fc6f..64bf1de817b24 100644
> > --- a/mm/slab.c
> > +++ b/mm/slab.c
> > @@ -1392,8 +1392,7 @@ static void kmem_freepages(struct kmem_cache *cachep, struct slab *slab)
> >       smp_wmb();
> >       __folio_clear_slab(folio);
> >
> > -     if (current->reclaim_state)
> > -             current->reclaim_state->reclaimed_slab += 1 << order;
> > +     mm_account_reclaimed_pages(1 << order);
> >       unaccount_slab(slab, order, cachep);
> >       __free_pages(&folio->page, order);
> >  }
> > diff --git a/mm/slob.c b/mm/slob.c
> > index fe567fcfa3a39..79cc8680c973c 100644
> > --- a/mm/slob.c
> > +++ b/mm/slob.c
> > @@ -61,7 +61,7 @@
> >  #include <linux/slab.h>
> >
> >  #include <linux/mm.h>
> > -#include <linux/swap.h> /* struct reclaim_state */
> > +#include <linux/swap.h> /* mm_account_reclaimed_pages() */
> >  #include <linux/cache.h>
> >  #include <linux/init.h>
> >  #include <linux/export.h>
> > @@ -211,9 +211,7 @@ static void slob_free_pages(void *b, int order)
> >  {
> >       struct page *sp = virt_to_page(b);
> >
> > -     if (current->reclaim_state)
> > -             current->reclaim_state->reclaimed_slab += 1 << order;
> > -
> > +     mm_account_reclaimed_pages(1 << order);
> >       mod_node_page_state(page_pgdat(sp), NR_SLAB_UNRECLAIMABLE_B,
> >                           -(PAGE_SIZE << order));
> >       __free_pages(sp, order);
> > diff --git a/mm/slub.c b/mm/slub.c
> > index 39327e98fce34..7aa30eef82350 100644
> > --- a/mm/slub.c
> > +++ b/mm/slub.c
> > @@ -11,7 +11,7 @@
> >   */
> >
> >  #include <linux/mm.h>
> > -#include <linux/swap.h> /* struct reclaim_state */
> > +#include <linux/swap.h> /* mm_account_reclaimed_pages() */
> >  #include <linux/module.h>
> >  #include <linux/bit_spinlock.h>
> >  #include <linux/interrupt.h>
> > @@ -2063,8 +2063,7 @@ static void __free_slab(struct kmem_cache *s, struct slab *slab)
> >       /* Make the mapping reset visible before clearing the flag */
> >       smp_wmb();
> >       __folio_clear_slab(folio);
> > -     if (current->reclaim_state)
> > -             current->reclaim_state->reclaimed_slab += pages;
> > +     mm_account_reclaimed_pages(pages);
> >       unaccount_slab(slab, order, s);
> >       __free_pages(&folio->page, order);
> >  }
> > diff --git a/mm/vmscan.c b/mm/vmscan.c
> > index c82bd89f90364..049e39202e6ce 100644
> > --- a/mm/vmscan.c
> > +++ b/mm/vmscan.c
> > @@ -188,18 +188,6 @@ struct scan_control {
> >   */
> >  int vm_swappiness = 60;
> >
> > -static void set_task_reclaim_state(struct task_struct *task,
> > -                                struct reclaim_state *rs)
> > -{
> > -     /* Check for an overwrite */
> > -     WARN_ON_ONCE(rs && task->reclaim_state);
> > -
> > -     /* Check for the nulling of an already-nulled member */
> > -     WARN_ON_ONCE(!rs && !task->reclaim_state);
> > -
> > -     task->reclaim_state = rs;
> > -}
> > -
> >  LIST_HEAD(shrinker_list);
> >  DECLARE_RWSEM(shrinker_rwsem);
> >
> > @@ -511,6 +499,59 @@ static bool writeback_throttling_sane(struct scan_control *sc)
> >  }
> >  #endif
> >
> > +static void set_task_reclaim_state(struct task_struct *task,
> > +                                struct reclaim_state *rs)
> > +{
> > +     /* Check for an overwrite */
> > +     WARN_ON_ONCE(rs && task->reclaim_state);
> > +
> > +     /* Check for the nulling of an already-nulled member */
> > +     WARN_ON_ONCE(!rs && !task->reclaim_state);
> > +
> > +     task->reclaim_state = rs;
> > +}
> > +
> > +/*
> > + * flush_reclaim_state(): add pages reclaimed outside of LRU-based reclaim to
> > + * scan_control->nr_reclaimed.
> > + */
> > +static void flush_reclaim_state(struct scan_control *sc,
> > +                             struct reclaim_state *rs)
> > +{
> > +     /*
> > +      * Currently, reclaim_state->reclaimed includes three types of pages
> > +      * freed outside of vmscan:
> > +      * (1) Slab pages.
> > +      * (2) Clean file pages from pruned inodes.
> > +      * (3) XFS freed buffer pages.
> > +      *
> > +      * For all of these cases, we have no way of finding out whether these
> > +      * pages were related to the memcg under reclaim. For example, a freed
> > +      * slab page could have had only a single object charged to the memcg
>
> Minor nits:
> s/could have had/could have
>
> > +      * under reclaim. Also, populated inodes are not on shrinker LRUs
> > +      * anymore except on highmem systems.
> > +      *
> > +      * Instead of over-reporting the reclaimed pages in a memcg reclaim,
> > +      * only count such pages in global reclaim. This prevents unnecessary
>
> May be clearer to say:
> This prevents under-reclaiming the target memcg, and unnecessary

Thanks, will rephrase for the next version!

>
> > +      * retries during memcg charging and false positives from proactive
> > +      * reclaim (memory.reclaim).
> > +      *
>
> Tim
>
Matthew Wilcox April 6, 2023, 7:42 p.m. UTC | #3
On Thu, Apr 06, 2023 at 10:31:53AM -0700, Tim Chen wrote:
> On Wed, 2023-04-05 at 18:54 +0000, Yosry Ahmed wrote:
> > +	 * For all of these cases, we have no way of finding out whether these
> > +	 * pages were related to the memcg under reclaim. For example, a freed
> > +	 * slab page could have had only a single object charged to the memcg
> 
> Minor nits:
> s/could have had/could have

No ... "could have had" is correct.  I'm a native English speaker, so I
have no idea what the rule here is, but I can ask my linguist wife later
if you want to know ;-)

Maybe it's something like this:
https://www.englishgrammar.org/have-had-and-had-had/
Peter Xu April 6, 2023, 8:45 p.m. UTC | #4
Hi, Yosry,

On Wed, Apr 05, 2023 at 06:54:27PM +0000, Yosry Ahmed wrote:

[...]

> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index c82bd89f90364..049e39202e6ce 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -188,18 +188,6 @@ struct scan_control {
>   */
>  int vm_swappiness = 60;
>  
> -static void set_task_reclaim_state(struct task_struct *task,
> -				   struct reclaim_state *rs)
> -{
> -	/* Check for an overwrite */
> -	WARN_ON_ONCE(rs && task->reclaim_state);
> -
> -	/* Check for the nulling of an already-nulled member */
> -	WARN_ON_ONCE(!rs && !task->reclaim_state);
> -
> -	task->reclaim_state = rs;
> -}
> -
>  LIST_HEAD(shrinker_list);
>  DECLARE_RWSEM(shrinker_rwsem);
>  
> @@ -511,6 +499,59 @@ static bool writeback_throttling_sane(struct scan_control *sc)
>  }
>  #endif
>  
> +static void set_task_reclaim_state(struct task_struct *task,
> +				   struct reclaim_state *rs)
> +{
> +	/* Check for an overwrite */
> +	WARN_ON_ONCE(rs && task->reclaim_state);
> +
> +	/* Check for the nulling of an already-nulled member */
> +	WARN_ON_ONCE(!rs && !task->reclaim_state);
> +
> +	task->reclaim_state = rs;
> +}

Nit: I just think such a movement is not necessary, and it easily loses the
"git blame" information.

Instead of moving this here without major benefit, why not just define
flush_reclaim_state() right after the previous set_task_reclaim_state()?

> +
> +/*
> + * flush_reclaim_state(): add pages reclaimed outside of LRU-based reclaim to
> + * scan_control->nr_reclaimed.
> + */
> +static void flush_reclaim_state(struct scan_control *sc,
> +				struct reclaim_state *rs)
> +{
> +	/*
> +	 * Currently, reclaim_state->reclaimed includes three types of pages
> +	 * freed outside of vmscan:
> +	 * (1) Slab pages.
> +	 * (2) Clean file pages from pruned inodes.
> +	 * (3) XFS freed buffer pages.
> +	 *
> +	 * For all of these cases, we have no way of finding out whether these
> +	 * pages were related to the memcg under reclaim. For example, a freed
> +	 * slab page could have had only a single object charged to the memcg
> +	 * under reclaim. Also, populated inodes are not on shrinker LRUs
> +	 * anymore except on highmem systems.
> +	 *
> +	 * Instead of over-reporting the reclaimed pages in a memcg reclaim,
> +	 * only count such pages in global reclaim. This prevents unnecessary
> +	 * retries during memcg charging and false positives from proactive
> +	 * reclaim (memory.reclaim).
> +	 *
> +	 * For uncommon cases where the freed pages were actually significantly
> +	 * charged to the memcg under reclaim, and we end up under-reporting, it
> +	 * should be fine. The freed pages will be uncharged anyway, even if
> +	 * they are not reported properly, and we will be able to make forward
> +	 * progress in charging (which is usually in a retry loop).
> +	 *
> +	 * We can go one step further, and report the uncharged objcg pages in
> +	 * memcg reclaim, to make reporting more accurate and reduce
> +	 * under-reporting, but it's probably not worth the complexity for now.
> +	 */
> +	if (rs && global_reclaim(sc)) {
> +		sc->nr_reclaimed += rs->reclaimed;
> +		rs->reclaimed = 0;
> +	}
> +}
> +
>  static long xchg_nr_deferred(struct shrinker *shrinker,
>  			     struct shrink_control *sc)
>  {
> @@ -5346,10 +5387,7 @@ static int shrink_one(struct lruvec *lruvec, struct scan_control *sc)
>  		vmpressure(sc->gfp_mask, memcg, false, sc->nr_scanned - scanned,
>  			   sc->nr_reclaimed - reclaimed);
>  
> -	if (global_reclaim(sc)) {
> -		sc->nr_reclaimed += current->reclaim_state->reclaimed_slab;
> -		current->reclaim_state->reclaimed_slab = 0;
> -	}
> +	flush_reclaim_state(sc, current->reclaim_state);
>  
>  	return success ? MEMCG_LRU_YOUNG : 0;
>  }
> @@ -6474,10 +6512,7 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
>  
>  	shrink_node_memcgs(pgdat, sc);
>  
> -	if (reclaim_state && global_reclaim(sc)) {
> -		sc->nr_reclaimed += reclaim_state->reclaimed_slab;
> -		reclaim_state->reclaimed_slab = 0;
> -	}
> +	flush_reclaim_state(sc, reclaim_state);

IIUC reclaim_state here still points to current->reclaim_state.  Could it
change at all?

Is it cleaner to make flush_reclaim_state() take "sc" only, if it always
references current->reclaim_state?

>  
>  	/* Record the subtree's reclaim efficiency */
>  	if (!sc->proactive)
> -- 
> 2.40.0.348.gf938b09366-goog
>
Yosry Ahmed April 7, 2023, 1:02 a.m. UTC | #5
On Thu, Apr 6, 2023 at 1:45 PM Peter Xu <peterx@redhat.com> wrote:
>
> Hi, Yosry,
>
> On Wed, Apr 05, 2023 at 06:54:27PM +0000, Yosry Ahmed wrote:
>
> [...]
>
> > diff --git a/mm/vmscan.c b/mm/vmscan.c
> > index c82bd89f90364..049e39202e6ce 100644
> > --- a/mm/vmscan.c
> > +++ b/mm/vmscan.c
> > @@ -188,18 +188,6 @@ struct scan_control {
> >   */
> >  int vm_swappiness = 60;
> >
> > -static void set_task_reclaim_state(struct task_struct *task,
> > -                                struct reclaim_state *rs)
> > -{
> > -     /* Check for an overwrite */
> > -     WARN_ON_ONCE(rs && task->reclaim_state);
> > -
> > -     /* Check for the nulling of an already-nulled member */
> > -     WARN_ON_ONCE(!rs && !task->reclaim_state);
> > -
> > -     task->reclaim_state = rs;
> > -}
> > -
> >  LIST_HEAD(shrinker_list);
> >  DECLARE_RWSEM(shrinker_rwsem);
> >
> > @@ -511,6 +499,59 @@ static bool writeback_throttling_sane(struct scan_control *sc)
> >  }
> >  #endif
> >
> > +static void set_task_reclaim_state(struct task_struct *task,
> > +                                struct reclaim_state *rs)
> > +{
> > +     /* Check for an overwrite */
> > +     WARN_ON_ONCE(rs && task->reclaim_state);
> > +
> > +     /* Check for the nulling of an already-nulled member */
> > +     WARN_ON_ONCE(!rs && !task->reclaim_state);
> > +
> > +     task->reclaim_state = rs;
> > +}
>
> Nit: I just think such a movement is not necessary, and it easily loses the
> "git blame" information.
>
> Instead of moving this here without major benefit, why not just define
> flush_reclaim_state() right after the previous set_task_reclaim_state()?

An earlier version did that, but we would have to add a forward
declaration of global_reclaim() (or cgroup_reclaim()), as they are
defined after the previous position of set_task_reclaim_state().
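
For illustration, keeping the helpers at the old location would need
something like the following near the top of mm/vmscan.c. This is a sketch
of the alternative being discussed, not what this patch does:

	/* Forward declaration needed because global_reclaim() is defined later. */
	static bool global_reclaim(struct scan_control *sc);

	static void flush_reclaim_state(struct scan_control *sc,
					struct reclaim_state *rs)
	{
		if (rs && global_reclaim(sc)) {
			sc->nr_reclaimed += rs->reclaimed;
			rs->reclaimed = 0;
		}
	}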

>
> > +
> > +/*
> > + * flush_reclaim_state(): add pages reclaimed outside of LRU-based reclaim to
> > + * scan_control->nr_reclaimed.
> > + */
> > +static void flush_reclaim_state(struct scan_control *sc,
> > +                             struct reclaim_state *rs)
> > +{
> > +     /*
> > +      * Currently, reclaim_state->reclaimed includes three types of pages
> > +      * freed outside of vmscan:
> > +      * (1) Slab pages.
> > +      * (2) Clean file pages from pruned inodes.
> > +      * (3) XFS freed buffer pages.
> > +      *
> > +      * For all of these cases, we have no way of finding out whether these
> > +      * pages were related to the memcg under reclaim. For example, a freed
> > +      * slab page could have had only a single object charged to the memcg
> > +      * under reclaim. Also, populated inodes are not on shrinker LRUs
> > +      * anymore except on highmem systems.
> > +      *
> > +      * Instead of over-reporting the reclaimed pages in a memcg reclaim,
> > +      * only count such pages in global reclaim. This prevents unnecessary
> > +      * retries during memcg charging and false positives from proactive
> > +      * reclaim (memory.reclaim).
> > +      *
> > +      * For uncommon cases where the freed pages were actually significantly
> > +      * charged to the memcg under reclaim, and we end up under-reporting, it
> > +      * should be fine. The freed pages will be uncharged anyway, even if
> > +      * they are not reported properly, and we will be able to make forward
> > +      * progress in charging (which is usually in a retry loop).
> > +      *
> > +      * We can go one step further, and report the uncharged objcg pages in
> > +      * memcg reclaim, to make reporting more accurate and reduce
> > +      * under-reporting, but it's probably not worth the complexity for now.
> > +      */
> > +     if (rs && global_reclaim(sc)) {
> > +             sc->nr_reclaimed += rs->reclaimed;
> > +             rs->reclaimed = 0;
> > +     }
> > +}
> > +
> >  static long xchg_nr_deferred(struct shrinker *shrinker,
> >                            struct shrink_control *sc)
> >  {
> > @@ -5346,10 +5387,7 @@ static int shrink_one(struct lruvec *lruvec, struct scan_control *sc)
> >               vmpressure(sc->gfp_mask, memcg, false, sc->nr_scanned - scanned,
> >                          sc->nr_reclaimed - reclaimed);
> >
> > -     if (global_reclaim(sc)) {
> > -             sc->nr_reclaimed += current->reclaim_state->reclaimed_slab;
> > -             current->reclaim_state->reclaimed_slab = 0;
> > -     }
> > +     flush_reclaim_state(sc, current->reclaim_state);
> >
> >       return success ? MEMCG_LRU_YOUNG : 0;
> >  }
> > @@ -6474,10 +6512,7 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
> >
> >       shrink_node_memcgs(pgdat, sc);
> >
> > -     if (reclaim_state && global_reclaim(sc)) {
> > -             sc->nr_reclaimed += reclaim_state->reclaimed_slab;
> > -             reclaim_state->reclaimed_slab = 0;
> > -     }
> > +     flush_reclaim_state(sc, reclaim_state);
>
> IIUC reclaim_state here still points to current->reclaim_state.  Could it
> change at all?
>
> Is it cleaner to make flush_reclaim_state() take "sc" only, if it always
> references current->reclaim_state?

Good point. I think it's always current->reclaim_state.

I think we can make flush_reclaim_state() only take "sc" as an
argument, and remove the "reclaim_state" local variable in
shrink_node() completely.
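
Something like this (a sketch of the possible follow-up, not part of this
patch):

	static void flush_reclaim_state(struct scan_control *sc)
	{
		struct reclaim_state *rs = current->reclaim_state;

		if (rs && global_reclaim(sc)) {
			sc->nr_reclaimed += rs->reclaimed;
			rs->reclaimed = 0;
		}
	}

	/* Callers then become simply: flush_reclaim_state(sc); */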

>
> >
> >       /* Record the subtree's reclaim efficiency */
> >       if (!sc->proactive)
> > --
> > 2.40.0.348.gf938b09366-goog
> >
>
> --
> Peter Xu
>

Patch

diff --git a/fs/inode.c b/fs/inode.c
index 4558dc2f13557..e60fcc41faf17 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -864,8 +864,7 @@  static enum lru_status inode_lru_isolate(struct list_head *item,
 				__count_vm_events(KSWAPD_INODESTEAL, reap);
 			else
 				__count_vm_events(PGINODESTEAL, reap);
-			if (current->reclaim_state)
-				current->reclaim_state->reclaimed_slab += reap;
+			mm_account_reclaimed_pages(reap);
 		}
 		iput(inode);
 		spin_lock(lru_lock);
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 54c774af6e1c6..15d1e5a7c2d34 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -286,8 +286,7 @@  xfs_buf_free_pages(
 		if (bp->b_pages[i])
 			__free_page(bp->b_pages[i]);
 	}
-	if (current->reclaim_state)
-		current->reclaim_state->reclaimed_slab += bp->b_page_count;
+	mm_account_reclaimed_pages(bp->b_page_count);
 
 	if (bp->b_pages != bp->b_page_array)
 		kmem_free(bp->b_pages);
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 209a425739a9f..e131ac155fb95 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -153,13 +153,28 @@  union swap_header {
  * memory reclaim
  */
 struct reclaim_state {
-	unsigned long reclaimed_slab;
+	/* pages reclaimed outside of LRU-based reclaim */
+	unsigned long reclaimed;
 #ifdef CONFIG_LRU_GEN
 	/* per-thread mm walk data */
 	struct lru_gen_mm_walk *mm_walk;
 #endif
 };
 
+/*
+ * mm_account_reclaimed_pages(): account reclaimed pages outside of LRU-based
+ * reclaim
+ * @pages: number of pages reclaimed
+ *
+ * If the current process is undergoing a reclaim operation, increment the
+ * number of reclaimed pages by @pages.
+ */
+static inline void mm_account_reclaimed_pages(unsigned long pages)
+{
+	if (current->reclaim_state)
+		current->reclaim_state->reclaimed += pages;
+}
+
 #ifdef __KERNEL__
 
 struct address_space;
diff --git a/mm/slab.c b/mm/slab.c
index dabc2a671fc6f..64bf1de817b24 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1392,8 +1392,7 @@  static void kmem_freepages(struct kmem_cache *cachep, struct slab *slab)
 	smp_wmb();
 	__folio_clear_slab(folio);
 
-	if (current->reclaim_state)
-		current->reclaim_state->reclaimed_slab += 1 << order;
+	mm_account_reclaimed_pages(1 << order);
 	unaccount_slab(slab, order, cachep);
 	__free_pages(&folio->page, order);
 }
diff --git a/mm/slob.c b/mm/slob.c
index fe567fcfa3a39..79cc8680c973c 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -61,7 +61,7 @@ 
 #include <linux/slab.h>
 
 #include <linux/mm.h>
-#include <linux/swap.h> /* struct reclaim_state */
+#include <linux/swap.h> /* mm_account_reclaimed_pages() */
 #include <linux/cache.h>
 #include <linux/init.h>
 #include <linux/export.h>
@@ -211,9 +211,7 @@  static void slob_free_pages(void *b, int order)
 {
 	struct page *sp = virt_to_page(b);
 
-	if (current->reclaim_state)
-		current->reclaim_state->reclaimed_slab += 1 << order;
-
+	mm_account_reclaimed_pages(1 << order);
 	mod_node_page_state(page_pgdat(sp), NR_SLAB_UNRECLAIMABLE_B,
 			    -(PAGE_SIZE << order));
 	__free_pages(sp, order);
diff --git a/mm/slub.c b/mm/slub.c
index 39327e98fce34..7aa30eef82350 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -11,7 +11,7 @@ 
  */
 
 #include <linux/mm.h>
-#include <linux/swap.h> /* struct reclaim_state */
+#include <linux/swap.h> /* mm_account_reclaimed_pages() */
 #include <linux/module.h>
 #include <linux/bit_spinlock.h>
 #include <linux/interrupt.h>
@@ -2063,8 +2063,7 @@  static void __free_slab(struct kmem_cache *s, struct slab *slab)
 	/* Make the mapping reset visible before clearing the flag */
 	smp_wmb();
 	__folio_clear_slab(folio);
-	if (current->reclaim_state)
-		current->reclaim_state->reclaimed_slab += pages;
+	mm_account_reclaimed_pages(pages);
 	unaccount_slab(slab, order, s);
 	__free_pages(&folio->page, order);
 }
diff --git a/mm/vmscan.c b/mm/vmscan.c
index c82bd89f90364..049e39202e6ce 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -188,18 +188,6 @@  struct scan_control {
  */
 int vm_swappiness = 60;
 
-static void set_task_reclaim_state(struct task_struct *task,
-				   struct reclaim_state *rs)
-{
-	/* Check for an overwrite */
-	WARN_ON_ONCE(rs && task->reclaim_state);
-
-	/* Check for the nulling of an already-nulled member */
-	WARN_ON_ONCE(!rs && !task->reclaim_state);
-
-	task->reclaim_state = rs;
-}
-
 LIST_HEAD(shrinker_list);
 DECLARE_RWSEM(shrinker_rwsem);
 
@@ -511,6 +499,59 @@  static bool writeback_throttling_sane(struct scan_control *sc)
 }
 #endif
 
+static void set_task_reclaim_state(struct task_struct *task,
+				   struct reclaim_state *rs)
+{
+	/* Check for an overwrite */
+	WARN_ON_ONCE(rs && task->reclaim_state);
+
+	/* Check for the nulling of an already-nulled member */
+	WARN_ON_ONCE(!rs && !task->reclaim_state);
+
+	task->reclaim_state = rs;
+}
+
+/*
+ * flush_reclaim_state(): add pages reclaimed outside of LRU-based reclaim to
+ * scan_control->nr_reclaimed.
+ */
+static void flush_reclaim_state(struct scan_control *sc,
+				struct reclaim_state *rs)
+{
+	/*
+	 * Currently, reclaim_state->reclaimed includes three types of pages
+	 * freed outside of vmscan:
+	 * (1) Slab pages.
+	 * (2) Clean file pages from pruned inodes.
+	 * (3) XFS freed buffer pages.
+	 *
+	 * For all of these cases, we have no way of finding out whether these
+	 * pages were related to the memcg under reclaim. For example, a freed
+	 * slab page could have had only a single object charged to the memcg
+	 * under reclaim. Also, populated inodes are not on shrinker LRUs
+	 * anymore except on highmem systems.
+	 *
+	 * Instead of over-reporting the reclaimed pages in a memcg reclaim,
+	 * only count such pages in global reclaim. This prevents unnecessary
+	 * retries during memcg charging and false positives from proactive
+	 * reclaim (memory.reclaim).
+	 *
+	 * For uncommon cases where the freed pages were actually significantly
+	 * charged to the memcg under reclaim, and we end up under-reporting, it
+	 * should be fine. The freed pages will be uncharged anyway, even if
+	 * they are not reported properly, and we will be able to make forward
+	 * progress in charging (which is usually in a retry loop).
+	 *
+	 * We can go one step further, and report the uncharged objcg pages in
+	 * memcg reclaim, to make reporting more accurate and reduce
+	 * under-reporting, but it's probably not worth the complexity for now.
+	 */
+	if (rs && global_reclaim(sc)) {
+		sc->nr_reclaimed += rs->reclaimed;
+		rs->reclaimed = 0;
+	}
+}
+
 static long xchg_nr_deferred(struct shrinker *shrinker,
 			     struct shrink_control *sc)
 {
@@ -5346,10 +5387,7 @@  static int shrink_one(struct lruvec *lruvec, struct scan_control *sc)
 		vmpressure(sc->gfp_mask, memcg, false, sc->nr_scanned - scanned,
 			   sc->nr_reclaimed - reclaimed);
 
-	if (global_reclaim(sc)) {
-		sc->nr_reclaimed += current->reclaim_state->reclaimed_slab;
-		current->reclaim_state->reclaimed_slab = 0;
-	}
+	flush_reclaim_state(sc, current->reclaim_state);
 
 	return success ? MEMCG_LRU_YOUNG : 0;
 }
@@ -6474,10 +6512,7 @@  static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
 
 	shrink_node_memcgs(pgdat, sc);
 
-	if (reclaim_state && global_reclaim(sc)) {
-		sc->nr_reclaimed += reclaim_state->reclaimed_slab;
-		reclaim_state->reclaimed_slab = 0;
-	}
+	flush_reclaim_state(sc, reclaim_state);
 
 	/* Record the subtree's reclaim efficiency */
 	if (!sc->proactive)