
mm/memcontrol: add per-memcg pgpgin/pswpin counter

Message ID 20240830082244.156923-1-jingxiangzeng.cas@gmail.com (mailing list archive)
State: New
Series: mm/memcontrol: add per-memcg pgpgin/pswpin counter

Commit Message

Jingxiang Zeng Aug. 30, 2024, 8:22 a.m. UTC
From: Jingxiang Zeng <linuszeng@tencent.com>

In proactive memory reclamation scenarios, it is necessary to
estimate the pswpin and pswpout metrics of the cgroup to
determine whether to continue reclaiming anonymous pages in
the current batch. This patch will collect these metrics and
expose them.

Signed-off-by: Jingxiang Zeng <linuszeng@tencent.com>
---
 mm/memcontrol-v1.c | 2 ++
 mm/memcontrol.c    | 2 ++
 mm/page_io.c       | 4 ++++
 3 files changed, 8 insertions(+)
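
As a rough illustration of how the exposed counters could be consumed
from userspace (a hedged sketch, not part of the patch: cgroup v2 is
assumed, and the helper name is made up for this example):

/*
 * Parse the per-memcg pswpin/pswpout counters this patch exposes in
 * memory.stat (cgroup v2). Illustrative userspace code only.
 */
#include <stdio.h>
#include <string.h>

static int read_swap_events(const char *memcg_path,
			    unsigned long long *pswpin,
			    unsigned long long *pswpout)
{
	char path[256], key[64];
	unsigned long long val;
	FILE *f;

	snprintf(path, sizeof(path), "%s/memory.stat", memcg_path);
	f = fopen(path, "r");
	if (!f)
		return -1;

	*pswpin = *pswpout = 0;
	while (fscanf(f, "%63s %llu", key, &val) == 2) {
		if (!strcmp(key, "pswpin"))
			*pswpin = val;
		else if (!strcmp(key, "pswpout"))
			*pswpout = val;
	}
	fclose(f);
	return 0;
}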

Comments

Andrew Morton Sept. 9, 2024, 10:29 p.m. UTC | #1
On Fri, 30 Aug 2024 16:22:44 +0800 Jingxiang Zeng <jingxiangzeng.cas@gmail.com> wrote:

> From: Jingxiang Zeng <linuszeng@tencent.com>
> 
> In proactive memory reclamation scenarios, it is necessary to
> estimate the pswpin and pswpout metrics of the cgroup to
> determine whether to continue reclaiming anonymous pages in
> the current batch. This patch will collect these metrics and
> expose them.

Could we have some reviewer input on this please?

> diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c
> index b37c0d870816..44803cbea38a 100644
> --- a/mm/memcontrol-v1.c
> +++ b/mm/memcontrol-v1.c
> @@ -2729,6 +2729,8 @@ static const char *const memcg1_stat_names[] = {
>  static const unsigned int memcg1_events[] = {
>  	PGPGIN,
>  	PGPGOUT,
> +	PSWPIN,
> +	PSWPOUT,
>  	PGFAULT,
>  	PGMAJFAULT,
>  };
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 087a8cb1a6d8..dde3d026f174 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -418,6 +418,8 @@ static const unsigned int memcg_vm_event_stat[] = {
>  	PGPGIN,
>  	PGPGOUT,
>  #endif
> +	PSWPIN,
> +	PSWPOUT,
>  	PGSCAN_KSWAPD,
>  	PGSCAN_DIRECT,
>  	PGSCAN_KHUGEPAGED,
> diff --git a/mm/page_io.c b/mm/page_io.c
> index b6f1519d63b0..4bc77d1c6bfa 100644
> --- a/mm/page_io.c
> +++ b/mm/page_io.c
> @@ -310,6 +310,7 @@ static inline void count_swpout_vm_event(struct folio *folio)
>  	}
>  	count_mthp_stat(folio_order(folio), MTHP_STAT_SWPOUT);
>  #endif
> +	count_memcg_folio_events(folio, PSWPOUT, folio_nr_pages(folio));
>  	count_vm_events(PSWPOUT, folio_nr_pages(folio));
>  }
>  
> @@ -505,6 +506,7 @@ static void sio_read_complete(struct kiocb *iocb, long ret)
>  		for (p = 0; p < sio->pages; p++) {
>  			struct folio *folio = page_folio(sio->bvec[p].bv_page);
>  
> +			count_memcg_folio_events(folio, PSWPIN, folio_nr_pages(folio));
>  			folio_mark_uptodate(folio);
>  			folio_unlock(folio);
>  		}
> @@ -588,6 +590,7 @@ static void swap_read_folio_bdev_sync(struct folio *folio,
>  	 * attempt to access it in the page fault retry time check.
>  	 */
>  	get_task_struct(current);
> +	count_memcg_folio_events(folio, PSWPIN, folio_nr_pages(folio));
>  	count_vm_event(PSWPIN);
>  	submit_bio_wait(&bio);
>  	__end_swap_bio_read(&bio);
> @@ -603,6 +606,7 @@ static void swap_read_folio_bdev_async(struct folio *folio,
>  	bio->bi_iter.bi_sector = swap_folio_sector(folio);
>  	bio->bi_end_io = end_swap_bio_read;
>  	bio_add_folio_nofail(bio, folio, folio_size(folio), 0);
> +	count_memcg_folio_events(folio, PSWPIN, folio_nr_pages(folio));
>  	count_vm_event(PSWPIN);
>  	submit_bio(bio);
>  }
> -- 
> 2.43.5
>
Yosry Ahmed Sept. 9, 2024, 10:45 p.m. UTC | #2
On Fri, Aug 30, 2024 at 1:23 AM Jingxiang Zeng
<jingxiangzeng.cas@gmail.com> wrote:
>
> From: Jingxiang Zeng <linuszeng@tencent.com>
>
> In proactive memory reclamation scenarios, it is necessary to
> estimate the pswpin and pswpout metrics of the cgroup to
> determine whether to continue reclaiming anonymous pages in
> the current batch. This patch will collect these metrics and
> expose them.

Could you add more details about the use case?

By "reclaiming anonymous pages", do you mean using memory.reclaim with
swappiness=200?

Why not just use PGPGOUT to figure out how many pages were reclaimed?
Do you find a significant amount of file pages getting reclaimed with
swappiness=200?

>
> Signed-off-by: Jingxiang Zeng <linuszeng@tencent.com>
> ---
>  mm/memcontrol-v1.c | 2 ++
>  mm/memcontrol.c    | 2 ++
>  mm/page_io.c       | 4 ++++
>  3 files changed, 8 insertions(+)
>
> diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c
> index b37c0d870816..44803cbea38a 100644
> --- a/mm/memcontrol-v1.c
> +++ b/mm/memcontrol-v1.c
> @@ -2729,6 +2729,8 @@ static const char *const memcg1_stat_names[] = {
>  static const unsigned int memcg1_events[] = {
>         PGPGIN,
>         PGPGOUT,
> +       PSWPIN,
> +       PSWPOUT,

memory.reclaim is not exposed in cgroup v1, so assuming these are only
used for such proactive reclaim, we don't need to add them here.

>         PGFAULT,
>         PGMAJFAULT,
>  };
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 087a8cb1a6d8..dde3d026f174 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -418,6 +418,8 @@ static const unsigned int memcg_vm_event_stat[] = {
>         PGPGIN,
>         PGPGOUT,
>  #endif
> +       PSWPIN,
> +       PSWPOUT,
>         PGSCAN_KSWAPD,
>         PGSCAN_DIRECT,
>         PGSCAN_KHUGEPAGED,
> diff --git a/mm/page_io.c b/mm/page_io.c
> index b6f1519d63b0..4bc77d1c6bfa 100644
> --- a/mm/page_io.c
> +++ b/mm/page_io.c
> @@ -310,6 +310,7 @@ static inline void count_swpout_vm_event(struct folio *folio)
>         }
>         count_mthp_stat(folio_order(folio), MTHP_STAT_SWPOUT);
>  #endif
> +       count_memcg_folio_events(folio, PSWPOUT, folio_nr_pages(folio));
>         count_vm_events(PSWPOUT, folio_nr_pages(folio));
>  }
>
> @@ -505,6 +506,7 @@ static void sio_read_complete(struct kiocb *iocb, long ret)
>                 for (p = 0; p < sio->pages; p++) {
>                         struct folio *folio = page_folio(sio->bvec[p].bv_page);
>
> +                       count_memcg_folio_events(folio, PSWPIN, folio_nr_pages(folio));
>                         folio_mark_uptodate(folio);
>                         folio_unlock(folio);
>                 }
> @@ -588,6 +590,7 @@ static void swap_read_folio_bdev_sync(struct folio *folio,
>          * attempt to access it in the page fault retry time check.
>          */
>         get_task_struct(current);
> +       count_memcg_folio_events(folio, PSWPIN, folio_nr_pages(folio));
>         count_vm_event(PSWPIN);
>         submit_bio_wait(&bio);
>         __end_swap_bio_read(&bio);
> @@ -603,6 +606,7 @@ static void swap_read_folio_bdev_async(struct folio *folio,
>         bio->bi_iter.bi_sector = swap_folio_sector(folio);
>         bio->bi_end_io = end_swap_bio_read;
>         bio_add_folio_nofail(bio, folio, folio_size(folio), 0);
> +       count_memcg_folio_events(folio, PSWPIN, folio_nr_pages(folio));
>         count_vm_event(PSWPIN);
>         submit_bio(bio);
>  }
> --
> 2.43.5
>
>
Jingxiang Zeng Sept. 10, 2024, 5:28 a.m. UTC | #3
On Tue, 10 Sept 2024 at 06:46, Yosry Ahmed <yosryahmed@google.com> wrote:
>
> On Fri, Aug 30, 2024 at 1:23 AM Jingxiang Zeng
> <jingxiangzeng.cas@gmail.com> wrote:
> >
> > From: Jingxiang Zeng <linuszeng@tencent.com>
> >
> > In proactive memory reclamation scenarios, it is necessary to
> > estimate the pswpin and pswpout metrics of the cgroup to
> > determine whether to continue reclaiming anonymous pages in
> > the current batch. This patch will collect these metrics and
> > expose them.
>
> Could you add more details about the use case?
>
> By "reclaiming anonymous pages", do you mean using memory.reclaim with
> swappiness=200?

Yes.
>
> Why not just use PGPGOUT to figure out how many pages were reclaimed?
> Do you find a significant amount of file pages getting reclaimed with
> swappiness=200?
>

Currently, it's not possible to know the swap-out situation of a
cgroup. The PGPGOUT metric includes the reclaim counts of both file
pages and anonymous pages, so it cannot accurately reflect how much
was actually swapped out.
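
A minimal sketch of the per-batch delta this enables (assuming the
hypothetical read_swap_events() helper sketched earlier on this page):

/*
 * Illustrative only: take counter deltas across one reclaim batch to
 * see how much of it was genuine anon swap-out, rather than inferring
 * it from the mixed PGPGOUT count.
 */
static unsigned long long pswpout_delta(const char *memcg_path,
					void (*run_batch)(void))
{
	unsigned long long in0, out0, in1, out1;

	read_swap_events(memcg_path, &in0, &out0);
	run_batch();		/* e.g. write to memory.reclaim */
	read_swap_events(memcg_path, &in1, &out1);

	return out1 - out0;	/* pages swapped out by this batch */
}
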
> >
> > Signed-off-by: Jingxiang Zeng <linuszeng@tencent.com>
> > ---
> >  mm/memcontrol-v1.c | 2 ++
> >  mm/memcontrol.c    | 2 ++
> >  mm/page_io.c       | 4 ++++
> >  3 files changed, 8 insertions(+)
> >
> > diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c
> > index b37c0d870816..44803cbea38a 100644
> > --- a/mm/memcontrol-v1.c
> > +++ b/mm/memcontrol-v1.c
> > @@ -2729,6 +2729,8 @@ static const char *const memcg1_stat_names[] = {
> >  static const unsigned int memcg1_events[] = {
> >         PGPGIN,
> >         PGPGOUT,
> > +       PSWPIN,
> > +       PSWPOUT,
>
> memory.reclaim is not exposed in cgroup v1, so assuming these are only
> used for such proactive reclaim, we don't need to add them here.

Your point makes sense. I will remove these fields in the v2 version.
>
> >         PGFAULT,
> >         PGMAJFAULT,
> >  };
> > diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> > index 087a8cb1a6d8..dde3d026f174 100644
> > --- a/mm/memcontrol.c
> > +++ b/mm/memcontrol.c
> > @@ -418,6 +418,8 @@ static const unsigned int memcg_vm_event_stat[] = {
> >         PGPGIN,
> >         PGPGOUT,
> >  #endif
> > +       PSWPIN,
> > +       PSWPOUT,
> >         PGSCAN_KSWAPD,
> >         PGSCAN_DIRECT,
> >         PGSCAN_KHUGEPAGED,
> > diff --git a/mm/page_io.c b/mm/page_io.c
> > index b6f1519d63b0..4bc77d1c6bfa 100644
> > --- a/mm/page_io.c
> > +++ b/mm/page_io.c
> > @@ -310,6 +310,7 @@ static inline void count_swpout_vm_event(struct folio *folio)
> >         }
> >         count_mthp_stat(folio_order(folio), MTHP_STAT_SWPOUT);
> >  #endif
> > +       count_memcg_folio_events(folio, PSWPOUT, folio_nr_pages(folio));
> >         count_vm_events(PSWPOUT, folio_nr_pages(folio));
> >  }
> >
> > @@ -505,6 +506,7 @@ static void sio_read_complete(struct kiocb *iocb, long ret)
> >                 for (p = 0; p < sio->pages; p++) {
> >                         struct folio *folio = page_folio(sio->bvec[p].bv_page);
> >
> > +                       count_memcg_folio_events(folio, PSWPIN, folio_nr_pages(folio));
> >                         folio_mark_uptodate(folio);
> >                         folio_unlock(folio);
> >                 }
> > @@ -588,6 +590,7 @@ static void swap_read_folio_bdev_sync(struct folio *folio,
> >          * attempt to access it in the page fault retry time check.
> >          */
> >         get_task_struct(current);
> > +       count_memcg_folio_events(folio, PSWPIN, folio_nr_pages(folio));
> >         count_vm_event(PSWPIN);
> >         submit_bio_wait(&bio);
> >         __end_swap_bio_read(&bio);
> > @@ -603,6 +606,7 @@ static void swap_read_folio_bdev_async(struct folio *folio,
> >         bio->bi_iter.bi_sector = swap_folio_sector(folio);
> >         bio->bi_end_io = end_swap_bio_read;
> >         bio_add_folio_nofail(bio, folio, folio_size(folio), 0);
> > +       count_memcg_folio_events(folio, PSWPIN, folio_nr_pages(folio));
> >         count_vm_event(PSWPIN);
> >         submit_bio(bio);
> >  }
> > --
> > 2.43.5
> >
> >
>
Shakeel Butt Sept. 10, 2024, 7:08 a.m. UTC | #4
On Fri, Aug 30, 2024 at 04:22:44PM GMT, Jingxiang Zeng wrote:
> From: Jingxiang Zeng <linuszeng@tencent.com>
> 
> In proactive memory reclamation scenarios, it is necessary to
> estimate the pswpin and pswpout metrics of the cgroup to
> determine whether to continue reclaiming anonymous pages in
> the current batch. This patch will collect these metrics and
> expose them.

Please explain a bit more on how these metrics will be used to make
a decision to continue to do proactive reclaim or not.

> 
> Signed-off-by: Jingxiang Zeng <linuszeng@tencent.com>
> ---
>  mm/memcontrol-v1.c | 2 ++
>  mm/memcontrol.c    | 2 ++
>  mm/page_io.c       | 4 ++++
>  3 files changed, 8 insertions(+)
> 
> diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c
> index b37c0d870816..44803cbea38a 100644
> --- a/mm/memcontrol-v1.c
> +++ b/mm/memcontrol-v1.c
> @@ -2729,6 +2729,8 @@ static const char *const memcg1_stat_names[] = {
>  static const unsigned int memcg1_events[] = {
>  	PGPGIN,
>  	PGPGOUT,
> +	PSWPIN,
> +	PSWPOUT,
>  	PGFAULT,
>  	PGMAJFAULT,
>  };

As Yosry said, no need to add these in v1.

thanks,
Shakeel
Jingxiang Zeng Sept. 13, 2024, 8:05 a.m. UTC | #5
On Tue, 10 Sept 2024 at 15:10, Shakeel Butt <shakeel.butt@linux.dev> wrote:
>
> On Fri, Aug 30, 2024 at 04:22:44PM GMT, Jingxiang Zeng wrote:
> > From: Jingxiang Zeng <linuszeng@tencent.com>
> >
> > In proactive memory reclamation scenarios, it is necessary to
> > estimate the pswpin and pswpout metrics of the cgroup to
> > determine whether to continue reclaiming anonymous pages in
> > the current batch. This patch will collect these metrics and
> > expose them.
>
> Please explain a bit more on how these metrics will be used to make
> a decision to continue to do proactive reclaim or not.

Currently there is simply no way to know exactly how many anon pages
were faulted in through swap for each cgroup. One may use workingset
refaults as an indicator, but that is inaccurate due to shadow reclaim.

We have a proactive reclaim agent that sets a forced swappiness
dynamically for each reclaim, so we can strictly reclaim either file
or anon pages. Knowing the anon page swap-in status is a huge win for
estimating the workload status.

The swap-out info is also important for getting an idea of how
effective swap-out is for a cgroup.
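
As a rough sketch of what such an agent's inner loop could look like
(illustrative assumptions throughout: the hypothetical
read_swap_events() helper from earlier on this page, a 4 KiB base page
size, a 64M batch size and a 50% effectiveness threshold; memory.reclaim
with swappiness= is the cgroup v2 interface discussed above):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/* Force one batch of anon-only reclaim via memory.reclaim. */
static int reclaim_anon_batch(const char *memcg_path, const char *req)
{
	char path[256];
	int fd, ret;

	snprintf(path, sizeof(path), "%s/memory.reclaim", memcg_path);
	fd = open(path, O_WRONLY);
	if (fd < 0)
		return -1;
	ret = write(fd, req, strlen(req)) == (ssize_t)strlen(req) ? 0 : -1;
	close(fd);
	return ret;
}

/* Keep reclaiming anon pages until batches stop producing swap-out. */
static void reclaim_until_ineffective(const char *memcg_path)
{
	unsigned long long in, out, prev_out;

	read_swap_events(memcg_path, &in, &prev_out);
	while (!reclaim_anon_batch(memcg_path, "64M swappiness=200")) {
		read_swap_events(memcg_path, &in, &out);
		/* stop once under half the batch was real anon swap-out */
		if (out - prev_out < (64ULL << 20) / 4096 / 2)
			break;
		prev_out = out;
	}
}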

>
> >
> > Signed-off-by: Jingxiang Zeng <linuszeng@tencent.com>
> > ---
> >  mm/memcontrol-v1.c | 2 ++
> >  mm/memcontrol.c    | 2 ++
> >  mm/page_io.c       | 4 ++++
> >  3 files changed, 8 insertions(+)
> >
> > diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c
> > index b37c0d870816..44803cbea38a 100644
> > --- a/mm/memcontrol-v1.c
> > +++ b/mm/memcontrol-v1.c
> > @@ -2729,6 +2729,8 @@ static const char *const memcg1_stat_names[] = {
> >  static const unsigned int memcg1_events[] = {
> >       PGPGIN,
> >       PGPGOUT,
> > +     PSWPIN,
> > +     PSWPOUT,
> >       PGFAULT,
> >       PGMAJFAULT,
> >  };
>
> As Yosry said, no need to add these in v1.
>
> thanks,
> Shakeel
>
Shakeel Butt Sept. 13, 2024, 4:58 p.m. UTC | #6
On Fri, Sep 13, 2024 at 04:05:51PM GMT, jingxiang zeng wrote:
> On Tue, 10 Sept 2024 at 15:10, Shakeel Butt <shakeel.butt@linux.dev> wrote:
> >
> > On Fri, Aug 30, 2024 at 04:22:44PM GMT, Jingxiang Zeng wrote:
> > > From: Jingxiang Zeng <linuszeng@tencent.com>
> > >
> > > In proactive memory reclamation scenarios, it is necessary to
> > > estimate the pswpin and pswpout metrics of the cgroup to
> > > determine whether to continue reclaiming anonymous pages in
> > > the current batch. This patch will collect these metrics and
> > > expose them.
> >
> > Please explain a bit more on how these metrics will be used to make
> > a decision to continue to do proactive reclaim or not.
> 
> Currently there is simply no way to know exactly how many anon pages
> were faulted in through swap for each cgroup. One may use workingset
> refaults as an indicator, but that is inaccurate due to shadow reclaim.
> 
> We have a proactive reclaim agent that sets a forced swappiness
> dynamically for each reclaim, so we can strictly reclaim either file
> or anon pages. Knowing the anon page swap-in status is a huge win for
> estimating the workload status.
> 
> The swap-out info is also important for getting an idea of how
> effective swap-out is for a cgroup.
> 

Please add all these details on your proactive reclaim agent in the
commit message. It would be beneficial to others doing proactive
reclaim.

Patch

diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c
index b37c0d870816..44803cbea38a 100644
--- a/mm/memcontrol-v1.c
+++ b/mm/memcontrol-v1.c
@@ -2729,6 +2729,8 @@ static const char *const memcg1_stat_names[] = {
 static const unsigned int memcg1_events[] = {
 	PGPGIN,
 	PGPGOUT,
+	PSWPIN,
+	PSWPOUT,
 	PGFAULT,
 	PGMAJFAULT,
 };
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 087a8cb1a6d8..dde3d026f174 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -418,6 +418,8 @@ static const unsigned int memcg_vm_event_stat[] = {
 	PGPGIN,
 	PGPGOUT,
 #endif
+	PSWPIN,
+	PSWPOUT,
 	PGSCAN_KSWAPD,
 	PGSCAN_DIRECT,
 	PGSCAN_KHUGEPAGED,
diff --git a/mm/page_io.c b/mm/page_io.c
index b6f1519d63b0..4bc77d1c6bfa 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -310,6 +310,7 @@ static inline void count_swpout_vm_event(struct folio *folio)
 	}
 	count_mthp_stat(folio_order(folio), MTHP_STAT_SWPOUT);
 #endif
+	count_memcg_folio_events(folio, PSWPOUT, folio_nr_pages(folio));
 	count_vm_events(PSWPOUT, folio_nr_pages(folio));
 }
 
@@ -505,6 +506,7 @@ static void sio_read_complete(struct kiocb *iocb, long ret)
 		for (p = 0; p < sio->pages; p++) {
 			struct folio *folio = page_folio(sio->bvec[p].bv_page);
 
+			count_memcg_folio_events(folio, PSWPIN, folio_nr_pages(folio));
 			folio_mark_uptodate(folio);
 			folio_unlock(folio);
 		}
@@ -588,6 +590,7 @@ static void swap_read_folio_bdev_sync(struct folio *folio,
 	 * attempt to access it in the page fault retry time check.
 	 */
 	get_task_struct(current);
+	count_memcg_folio_events(folio, PSWPIN, folio_nr_pages(folio));
 	count_vm_event(PSWPIN);
 	submit_bio_wait(&bio);
 	__end_swap_bio_read(&bio);
@@ -603,6 +606,7 @@ static void swap_read_folio_bdev_async(struct folio *folio,
 	bio->bi_iter.bi_sector = swap_folio_sector(folio);
 	bio->bi_end_io = end_swap_bio_read;
 	bio_add_folio_nofail(bio, folio, folio_size(folio), 0);
+	count_memcg_folio_events(folio, PSWPIN, folio_nr_pages(folio));
 	count_vm_event(PSWPIN);
 	submit_bio(bio);
 }
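
For reference, the memcg helper these hooks rely on looks roughly like
the following in the tree this patch targets (an approximation of the
helper in include/linux/memcontrol.h, shown for context and not part of
the diff; consult the tree for the authoritative definition, including
the no-op stub for !CONFIG_MEMCG builds):

static inline void count_memcg_folio_events(struct folio *folio,
		enum vm_event_item idx, unsigned long nr)
{
	struct mem_cgroup *memcg = folio_memcg(folio);

	if (memcg)
		count_memcg_events(memcg, idx, nr);
}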