diff mbox series

[v3,3/7] mm/swap: always account swapped in page into current memcg

Message ID 20240129175423.1987-4-ryncsn@gmail.com (mailing list archive)
State New
Headers show
Series swapin refactor for optimization and unified readahead | expand

Commit Message

Kairui Song Jan. 29, 2024, 5:54 p.m. UTC
From: Kairui Song <kasong@tencent.com>

Currently, mem_cgroup_swapin_charge_folio is always called with
mm == NULL, except in swapin_direct.

swapin_direct is only used when swapin should skip readahead
and swapcache (SWP_SYNCHRONOUS_IO). All other callers of
mem_cgroup_swapin_charge_folio are for swapin that should
not skip readahead and cache.

This could cause swapin charging to behave differently depending
on swap device, which is unexpected.

This is currently not happening because the only caller of
swapin_direct is the direct anon page fault path, where mm always
equals current->mm, but that will no longer be true if swapin_direct
is shared and has other callers (e.g., swapoff) that reuse the
readahead skipping logic.

So make swapin_direct also pass NULL for mm, so swapin charging
will behave consistently and not be affected by the type of swapin
device or the readahead policy.

After this, the second param of mem_cgroup_swapin_charge_folio is
no longer used, so it can be safely dropped.

Signed-off-by: Kairui Song <kasong@tencent.com>
---
 include/linux/memcontrol.h | 4 ++--
 mm/memcontrol.c            | 5 ++---
 mm/swap_state.c            | 7 +++----
 3 files changed, 7 insertions(+), 9 deletions(-)

Comments

Huang, Ying Jan. 30, 2024, 6:12 a.m. UTC | #1
Kairui Song <ryncsn@gmail.com> writes:

> From: Kairui Song <kasong@tencent.com>
>
> Currently, mem_cgroup_swapin_charge_folio is always called with
> mm == NULL, except in swapin_direct.
>
> swapin_direct is only used when swapin should skip readahead
> and swapcache (SWP_SYNCHRONOUS_IO). All other callers of
> mem_cgroup_swapin_charge_folio are for swapin that should
> not skip readahead and cache.
>
> This could cause swapin charging to behave differently depending
> on swap device, which is unexpected.
>
> This is currently not happening because the only caller of
> swapin_direct is the direct anon page fault path, where mm always
> equals to current->mm, but will no longer be true if swapin_direct
> is shared and have other callers (eg, swapoff) to share the
> readahead skipping logic.
>
> So make swapin_direct also pass NULL for mm, so swpain charge
> will behave consistently and not effected by type of swapin device
> or readahead policy.
>
> After this, the second param of mem_cgroup_swapin_charge_folio is
> never used now, so it can be safely dropped.
>
> Signed-off-by: Kairui Song <kasong@tencent.com>
> ---
>  include/linux/memcontrol.h | 4 ++--
>  mm/memcontrol.c            | 5 ++---
>  mm/swap_state.c            | 7 +++----
>  3 files changed, 7 insertions(+), 9 deletions(-)
>
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 20ff87f8e001..540590d80958 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -693,7 +693,7 @@ static inline int mem_cgroup_charge(struct folio *folio, struct mm_struct *mm,
>  int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg, gfp_t gfp,
>  		long nr_pages);
>  
> -int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm,
> +int mem_cgroup_swapin_charge_folio(struct folio *folio,
>  				  gfp_t gfp, swp_entry_t entry);
>  void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry);
>  
> @@ -1281,7 +1281,7 @@ static inline int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg,
>  }
>  
>  static inline int mem_cgroup_swapin_charge_folio(struct folio *folio,
> -			struct mm_struct *mm, gfp_t gfp, swp_entry_t entry)
> +		gfp_t gfp, swp_entry_t entry)
>  {
>  	return 0;
>  }
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index e4c8735e7c85..5852742df958 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -7306,8 +7306,7 @@ int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg, gfp_t gfp,
>   *
>   * Returns 0 on success. Otherwise, an error code is returned.
>   */
> -int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm,
> -				  gfp_t gfp, swp_entry_t entry)
> +int mem_cgroup_swapin_charge_folio(struct folio *folio, gfp_t gfp, swp_entry_t entry)
>  {
>  	struct mem_cgroup *memcg;
>  	unsigned short id;
> @@ -7320,7 +7319,7 @@ int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm,
>  	rcu_read_lock();
>  	memcg = mem_cgroup_from_id(id);
>  	if (!memcg || !css_tryget_online(&memcg->css))
> -		memcg = get_mem_cgroup_from_mm(mm);
> +		memcg = get_mem_cgroup_from_current();

The behavior of get_mem_cgroup_from_mm(NULL) and
get_mem_cgroup_from_current() isn't exactly the same.  Are you sure that
this is OK?

--
Best Regards,
Huang, Ying


>  	rcu_read_unlock();
>  
>  	ret = charge_memcg(folio, memcg, gfp);
> diff --git a/mm/swap_state.c b/mm/swap_state.c
> index 645f5bcad123..a450d09fc0db 100644
> --- a/mm/swap_state.c
> +++ b/mm/swap_state.c
> @@ -495,7 +495,7 @@ struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
>  	__folio_set_locked(folio);
>  	__folio_set_swapbacked(folio);
>  
> -	if (mem_cgroup_swapin_charge_folio(folio, NULL, gfp_mask, entry))
> +	if (mem_cgroup_swapin_charge_folio(folio, gfp_mask, entry))
>  		goto fail_unlock;
>  
>  	/* May fail (-ENOMEM) if XArray node allocation failed. */
> @@ -884,9 +884,8 @@ struct folio *swapin_direct(swp_entry_t entry, gfp_t gfp_mask,
>  		__folio_set_locked(folio);
>  		__folio_set_swapbacked(folio);
>  
> -		if (mem_cgroup_swapin_charge_folio(folio,
> -					vma->vm_mm, GFP_KERNEL,
> -					entry)) {
> +		if (mem_cgroup_swapin_charge_folio(folio, GFP_KERNEL,
> +						   entry)) {
>  			folio_unlock(folio);
>  			folio_put(folio);
>  			return NULL;
Kairui Song Jan. 30, 2024, 7:01 a.m. UTC | #2
On Tue, Jan 30, 2024 at 2:14 PM Huang, Ying <ying.huang@intel.com> wrote:
>
> Kairui Song <ryncsn@gmail.com> writes:
>
> > From: Kairui Song <kasong@tencent.com>
> >
> > Currently, mem_cgroup_swapin_charge_folio is always called with
> > mm == NULL, except in swapin_direct.
> >
> > swapin_direct is only used when swapin should skip readahead
> > and swapcache (SWP_SYNCHRONOUS_IO). All other callers of
> > mem_cgroup_swapin_charge_folio are for swapin that should
> > not skip readahead and cache.
> >
> > This could cause swapin charging to behave differently depending
> > on swap device, which is unexpected.
> >
> > This is currently not happening because the only caller of
> > swapin_direct is the direct anon page fault path, where mm always
> > equals to current->mm, but will no longer be true if swapin_direct
> > is shared and have other callers (eg, swapoff) to share the
> > readahead skipping logic.
> >
> > So make swapin_direct also pass NULL for mm, so swpain charge
> > will behave consistently and not effected by type of swapin device
> > or readahead policy.
> >
> > After this, the second param of mem_cgroup_swapin_charge_folio is
> > never used now, so it can be safely dropped.
> >
> > Signed-off-by: Kairui Song <kasong@tencent.com>
> > ---
> >  include/linux/memcontrol.h | 4 ++--
> >  mm/memcontrol.c            | 5 ++---
> >  mm/swap_state.c            | 7 +++----
> >  3 files changed, 7 insertions(+), 9 deletions(-)
> >
> > diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> > index 20ff87f8e001..540590d80958 100644
> > --- a/include/linux/memcontrol.h
> > +++ b/include/linux/memcontrol.h
> > @@ -693,7 +693,7 @@ static inline int mem_cgroup_charge(struct folio *folio, struct mm_struct *mm,
> >  int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg, gfp_t gfp,
> >               long nr_pages);
> >
> > -int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm,
> > +int mem_cgroup_swapin_charge_folio(struct folio *folio,
> >                                 gfp_t gfp, swp_entry_t entry);
> >  void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry);
> >
> > @@ -1281,7 +1281,7 @@ static inline int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg,
> >  }
> >
> >  static inline int mem_cgroup_swapin_charge_folio(struct folio *folio,
> > -                     struct mm_struct *mm, gfp_t gfp, swp_entry_t entry)
> > +             gfp_t gfp, swp_entry_t entry)
> >  {
> >       return 0;
> >  }
> > diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> > index e4c8735e7c85..5852742df958 100644
> > --- a/mm/memcontrol.c
> > +++ b/mm/memcontrol.c
> > @@ -7306,8 +7306,7 @@ int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg, gfp_t gfp,
> >   *
> >   * Returns 0 on success. Otherwise, an error code is returned.
> >   */
> > -int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm,
> > -                               gfp_t gfp, swp_entry_t entry)
> > +int mem_cgroup_swapin_charge_folio(struct folio *folio, gfp_t gfp, swp_entry_t entry)
> >  {
> >       struct mem_cgroup *memcg;
> >       unsigned short id;
> > @@ -7320,7 +7319,7 @@ int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm,
> >       rcu_read_lock();
> >       memcg = mem_cgroup_from_id(id);
> >       if (!memcg || !css_tryget_online(&memcg->css))
> > -             memcg = get_mem_cgroup_from_mm(mm);
> > +             memcg = get_mem_cgroup_from_current();
>
> The behavior of get_mem_cgroup_from_mm(NULL) and
> get_mem_cgroup_from_current() isn't same exactly.  Are you sure that
> this is OK?

Hi Ying, thank you very much for the careful review.

IIUC, usually get_mem_cgroup_from_mm(NULL) is for allocations without
mm context (after set_active_memcg), so remote charging cgroup is used
first.

But the swap case is a bit special: all swapins are issued from
userspace, so remote charging isn't useful. I'm not sure whether it
could even lead to charging the wrong cgroup.

And at this callsite, the fallback is only taken when `if (!memcg ||
!css_tryget_online(&memcg->css))` is true; the only cases I know of are
swapoff (where the memcg is dead) or some kind of leak. The behaviour
of the swapoff case has been discussed previously, so currently we just
charge it to the current task's memcg.

This is indeed a potential behaviour change, though. I can change it
back to get_mem_cgroup_from_mm(NULL) and post a separate patch later so
this can be discussed in more detail.

>
> --
> Best Regards,
> Huang, Ying
Kairui Song Jan. 30, 2024, 7:03 a.m. UTC | #3
On Tue, Jan 30, 2024 at 3:01 PM Kairui Song <ryncsn@gmail.com> wrote:
>
> On Tue, Jan 30, 2024 at 2:14 PM Huang, Ying <ying.huang@intel.com> wrote:
> >
> > Kairui Song <ryncsn@gmail.com> writes:
> >
> > > From: Kairui Song <kasong@tencent.com>
> > >
> > > Currently, mem_cgroup_swapin_charge_folio is always called with
> > > mm == NULL, except in swapin_direct.
> > >
> > > swapin_direct is only used when swapin should skip readahead
> > > and swapcache (SWP_SYNCHRONOUS_IO). All other callers of
> > > mem_cgroup_swapin_charge_folio are for swapin that should
> > > not skip readahead and cache.
> > >
> > > This could cause swapin charging to behave differently depending
> > > on swap device, which is unexpected.
> > >
> > > This is currently not happening because the only caller of
> > > swapin_direct is the direct anon page fault path, where mm always
> > > equals to current->mm, but will no longer be true if swapin_direct
> > > is shared and have other callers (eg, swapoff) to share the
> > > readahead skipping logic.
> > >
> > > So make swapin_direct also pass NULL for mm, so swpain charge
> > > will behave consistently and not effected by type of swapin device
> > > or readahead policy.
> > >
> > > After this, the second param of mem_cgroup_swapin_charge_folio is
> > > never used now, so it can be safely dropped.
> > >
> > > Signed-off-by: Kairui Song <kasong@tencent.com>
> > > ---
> > >  include/linux/memcontrol.h | 4 ++--
> > >  mm/memcontrol.c            | 5 ++---
> > >  mm/swap_state.c            | 7 +++----
> > >  3 files changed, 7 insertions(+), 9 deletions(-)
> > >
> > > diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> > > index 20ff87f8e001..540590d80958 100644
> > > --- a/include/linux/memcontrol.h
> > > +++ b/include/linux/memcontrol.h
> > > @@ -693,7 +693,7 @@ static inline int mem_cgroup_charge(struct folio *folio, struct mm_struct *mm,
> > >  int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg, gfp_t gfp,
> > >               long nr_pages);
> > >
> > > -int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm,
> > > +int mem_cgroup_swapin_charge_folio(struct folio *folio,
> > >                                 gfp_t gfp, swp_entry_t entry);
> > >  void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry);
> > >
> > > @@ -1281,7 +1281,7 @@ static inline int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg,
> > >  }
> > >
> > >  static inline int mem_cgroup_swapin_charge_folio(struct folio *folio,
> > > -                     struct mm_struct *mm, gfp_t gfp, swp_entry_t entry)
> > > +             gfp_t gfp, swp_entry_t entry)
> > >  {
> > >       return 0;
> > >  }
> > > diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> > > index e4c8735e7c85..5852742df958 100644
> > > --- a/mm/memcontrol.c
> > > +++ b/mm/memcontrol.c
> > > @@ -7306,8 +7306,7 @@ int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg, gfp_t gfp,
> > >   *
> > >   * Returns 0 on success. Otherwise, an error code is returned.
> > >   */
> > > -int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm,
> > > -                               gfp_t gfp, swp_entry_t entry)
> > > +int mem_cgroup_swapin_charge_folio(struct folio *folio, gfp_t gfp, swp_entry_t entry)
> > >  {
> > >       struct mem_cgroup *memcg;
> > >       unsigned short id;
> > > @@ -7320,7 +7319,7 @@ int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm,
> > >       rcu_read_lock();
> > >       memcg = mem_cgroup_from_id(id);
> > >       if (!memcg || !css_tryget_online(&memcg->css))
> > > -             memcg = get_mem_cgroup_from_mm(mm);
> > > +             memcg = get_mem_cgroup_from_current();
> >
> > The behavior of get_mem_cgroup_from_mm(NULL) and
> > get_mem_cgroup_from_current() isn't same exactly.  Are you sure that
> > this is OK?
>
> Hi Ying, thank you very much for the careful review.
>
> IIUC, usually get_mem_cgroup_from_mm(NULL) is for allocations without
> mm context (after set_active_memcg), so remote charging cgroup is used
> first.
>
> But for swap cases it's a bit special, all swapin are issued from
> userspace, so remote charging isn't useful. Not sure if it even may
> potentially lead to charging into the wrong cgroup.
>
> And for this callsite, it's called only when `if (!memcg ||
> !css_tryget_online(&memcg->css))` is true, only case I know is swapoff
> (and the memcg is dead) case, or there are some leaks. The behaviour

Oh, actually shmem may also have the zombie cgroup issue. The
conclusion is still the same though: the task that accessed the shmem
owns the charge.

> of swapoff case has been discussed previously, so currently we just
> charge it into the current task's memcg.
>
> This is indeed a potential behaviour change though, I can change it
> back to get_mem_cgroup_from_mm(NULL), and post another patch later for
> this and discuss in more details.
>
> >
> > --
> > Best Regards,
> > Huang, Ying
diff mbox series

Patch

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 20ff87f8e001..540590d80958 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -693,7 +693,7 @@  static inline int mem_cgroup_charge(struct folio *folio, struct mm_struct *mm,
 int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg, gfp_t gfp,
 		long nr_pages);
 
-int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm,
+int mem_cgroup_swapin_charge_folio(struct folio *folio,
 				  gfp_t gfp, swp_entry_t entry);
 void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry);
 
@@ -1281,7 +1281,7 @@  static inline int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg,
 }
 
 static inline int mem_cgroup_swapin_charge_folio(struct folio *folio,
-			struct mm_struct *mm, gfp_t gfp, swp_entry_t entry)
+		gfp_t gfp, swp_entry_t entry)
 {
 	return 0;
 }
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e4c8735e7c85..5852742df958 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -7306,8 +7306,7 @@  int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg, gfp_t gfp,
  *
  * Returns 0 on success. Otherwise, an error code is returned.
  */
-int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm,
-				  gfp_t gfp, swp_entry_t entry)
+int mem_cgroup_swapin_charge_folio(struct folio *folio, gfp_t gfp, swp_entry_t entry)
 {
 	struct mem_cgroup *memcg;
 	unsigned short id;
@@ -7320,7 +7319,7 @@  int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm,
 	rcu_read_lock();
 	memcg = mem_cgroup_from_id(id);
 	if (!memcg || !css_tryget_online(&memcg->css))
-		memcg = get_mem_cgroup_from_mm(mm);
+		memcg = get_mem_cgroup_from_current();
 	rcu_read_unlock();
 
 	ret = charge_memcg(folio, memcg, gfp);
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 645f5bcad123..a450d09fc0db 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -495,7 +495,7 @@  struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 	__folio_set_locked(folio);
 	__folio_set_swapbacked(folio);
 
-	if (mem_cgroup_swapin_charge_folio(folio, NULL, gfp_mask, entry))
+	if (mem_cgroup_swapin_charge_folio(folio, gfp_mask, entry))
 		goto fail_unlock;
 
 	/* May fail (-ENOMEM) if XArray node allocation failed. */
@@ -884,9 +884,8 @@  struct folio *swapin_direct(swp_entry_t entry, gfp_t gfp_mask,
 		__folio_set_locked(folio);
 		__folio_set_swapbacked(folio);
 
-		if (mem_cgroup_swapin_charge_folio(folio,
-					vma->vm_mm, GFP_KERNEL,
-					entry)) {
+		if (mem_cgroup_swapin_charge_folio(folio, GFP_KERNEL,
+						   entry)) {
 			folio_unlock(folio);
 			folio_put(folio);
 			return NULL;