
[bpf-next,v5,4/7] memcg: Use trylock to access memcg stock_lock.

Message ID: 20250115021746.34691-5-alexei.starovoitov@gmail.com
State: New
Series: bpf, mm: Introduce try_alloc_pages()

Commit Message

Alexei Starovoitov Jan. 15, 2025, 2:17 a.m. UTC
From: Alexei Starovoitov <ast@kernel.org>

Teach memcg to operate under trylock conditions when spinning locks
cannot be used.

local_trylock might fail, and this would lead to a charge cache bypass if
the calling context doesn't allow spinning (gfpflags_allow_spinning).
In those cases charge the memcg counter directly and fail early if
that is not possible. This might cause a premature charge failure,
but it will allow opportunistic charging that is safe from the
try_alloc_pages path.

Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 mm/memcontrol.c | 24 ++++++++++++++++++++----
 1 file changed, 20 insertions(+), 4 deletions(-)
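
For context, gfpflags_allow_spinning() is introduced earlier in this
series. A minimal sketch of the idea (the exact definition may differ in
the posted patches) is that only gfp masks carrying a reclaim flag are
allowed to spin:

static inline bool gfpflags_allow_spinning(const gfp_t gfp_flags)
{
	/*
	 * !__GFP_DIRECT_RECLAIM -> direct reclaim is not allowed.
	 * !__GFP_KSWAPD_RECLAIM -> it is not safe to wake up kswapd.
	 * All GFP_* flags, including GFP_NOWAIT, use one or both of them.
	 * try_alloc_pages() is the only entry point that specifies neither.
	 */
	return !!(gfp_flags & __GFP_RECLAIM);
}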

Comments

Vlastimil Babka Jan. 15, 2025, 4:07 p.m. UTC | #1
On 1/15/25 03:17, Alexei Starovoitov wrote:
> From: Alexei Starovoitov <ast@kernel.org>
> 
> Teach memcg to operate under trylock conditions when spinning locks
> cannot be used.
> 
> local_trylock might fail, and this would lead to a charge cache bypass if
> the calling context doesn't allow spinning (gfpflags_allow_spinning).
> In those cases charge the memcg counter directly and fail early if
> that is not possible. This might cause a premature charge failure,
> but it will allow opportunistic charging that is safe from the
> try_alloc_pages path.
> 
> Acked-by: Michal Hocko <mhocko@suse.com>
> Signed-off-by: Alexei Starovoitov <ast@kernel.org>

Acked-by: Vlastimil Babka <vbabka@suse.cz>

> ---
>  mm/memcontrol.c | 24 ++++++++++++++++++++----
>  1 file changed, 20 insertions(+), 4 deletions(-)
> 
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 7b3503d12aaf..e4c7049465e0 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -1756,7 +1756,8 @@ static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
>   *
>   * returns true if successful, false otherwise.
>   */
> -static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
> +static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages,
> +			  gfp_t gfp_mask)
>  {
>  	struct memcg_stock_pcp *stock;
>  	unsigned int stock_pages;
> @@ -1766,7 +1767,11 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
>  	if (nr_pages > MEMCG_CHARGE_BATCH)
>  		return ret;
>  
> -	local_lock_irqsave(&memcg_stock.stock_lock, flags);
> +	if (!local_trylock_irqsave(&memcg_stock.stock_lock, flags)) {
> +		if (!gfpflags_allow_spinning(gfp_mask))
> +			return ret;
> +		local_lock_irqsave(&memcg_stock.stock_lock, flags);

The last line can practically only happen on RT, right? On non-RT, irqsave
means we could only fail the trylock from an NMI, and then we should have
gfp_flags that don't allow spinning.

So suppose we used local_trylock(), local_lock() and local_unlock() (no
_irqsave) instead, as I mentioned in reply to 3/7. The RT implementation
would be AFAICS the same. On !RT the trylock could now fail from an IRQ
context in addition to an NMI context, but that should also have a gfp_mask
that does not allow spinning, so it should work fine.

It would, however, mean converting all users of the lock, i.e. also
consume_obj_stock() etc., but AFAIU that will be necessary anyway to have
opportunistic slab allocations?
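
A rough sketch of that conversion in consume_stock() (an illustration of
the suggestion, assuming plain local_trylock()/local_lock() variants are
available as proposed in the reply to 3/7; not the posted patch):

	/* without _irqsave: disables preemption on !RT instead of IRQs */
	if (!local_trylock(&memcg_stock.stock_lock)) {
		if (!gfpflags_allow_spinning(gfp_mask))
			return ret;
		local_lock(&memcg_stock.stock_lock);
	}

	/* ... consume from this_cpu_ptr(&memcg_stock) as before ... */

	local_unlock(&memcg_stock.stock_lock);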

> +	}
>  
>  	stock = this_cpu_ptr(&memcg_stock);
>  	stock_pages = READ_ONCE(stock->nr_pages);
> @@ -1851,7 +1856,14 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
>  {
>  	unsigned long flags;
>  
> -	local_lock_irqsave(&memcg_stock.stock_lock, flags);
> +	if (!local_trylock_irqsave(&memcg_stock.stock_lock, flags)) {
> +		/*
> +		 * In case of unlikely failure to lock percpu stock_lock
> +		 * uncharge memcg directly.
> +		 */
> +		mem_cgroup_cancel_charge(memcg, nr_pages);
> +		return;
> +	}
>  	__refill_stock(memcg, nr_pages);
>  	local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
>  }
> @@ -2196,9 +2208,13 @@ int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
>  	unsigned long pflags;
>  
>  retry:
> -	if (consume_stock(memcg, nr_pages))
> +	if (consume_stock(memcg, nr_pages, gfp_mask))
>  		return 0;
>  
> +	if (!gfpflags_allow_spinning(gfp_mask))
> +		/* Avoid the refill and flush of the older stock */
> +		batch = nr_pages;
> +
>  	if (!do_memsw_account() ||
>  	    page_counter_try_charge(&memcg->memsw, batch, &counter)) {
>  		if (page_counter_try_charge(&memcg->memory, batch, &counter))

Shakeel Butt Jan. 16, 2025, 12:12 a.m. UTC | #2
On Tue, Jan 14, 2025 at 06:17:43PM -0800, Alexei Starovoitov wrote:
> From: Alexei Starovoitov <ast@kernel.org>
> 
> Teach memcg to operate under trylock conditions when spinning locks
> cannot be used.
> 
> local_trylock might fail, and this would lead to a charge cache bypass if
> the calling context doesn't allow spinning (gfpflags_allow_spinning).
> In those cases charge the memcg counter directly and fail early if
> that is not possible. This might cause a premature charge failure,
> but it will allow opportunistic charging that is safe from the
> try_alloc_pages path.
> 
> Acked-by: Michal Hocko <mhocko@suse.com>
> Signed-off-by: Alexei Starovoitov <ast@kernel.org>

Acked-by: Shakeel Butt <shakeel.butt@linux.dev>

> @@ -1851,7 +1856,14 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
>  {
>  	unsigned long flags;
>  
> -	local_lock_irqsave(&memcg_stock.stock_lock, flags);
> +	if (!local_trylock_irqsave(&memcg_stock.stock_lock, flags)) {
> +		/*
> +		 * In case of unlikely failure to lock percpu stock_lock
> +		 * uncharge memcg directly.
> +		 */
> +		mem_cgroup_cancel_charge(memcg, nr_pages);

mem_cgroup_cancel_charge() has been removed by a patch in mm-tree. Maybe
we can either revive mem_cgroup_cancel_charge() or simply inline it
here.
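
For reference, the helper in question was essentially the following
before its removal (a sketch of the pre-cleanup mm/memcontrol.c):

void mem_cgroup_cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
{
	if (mem_cgroup_is_root(memcg))
		return;

	page_counter_uncharge(&memcg->memory, nr_pages);
	if (do_memsw_account())
		page_counter_uncharge(&memcg->memsw, nr_pages);
}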

> +		return;
> +	}
>  	__refill_stock(memcg, nr_pages);
>  	local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
>  }

Alexei Starovoitov Jan. 16, 2025, 2:22 a.m. UTC | #3
On Wed, Jan 15, 2025 at 4:12 PM Shakeel Butt <shakeel.butt@linux.dev> wrote:
>
> On Tue, Jan 14, 2025 at 06:17:43PM -0800, Alexei Starovoitov wrote:
> > From: Alexei Starovoitov <ast@kernel.org>
> >
> > Teach memcg to operate under trylock conditions when spinning locks
> > cannot be used.
> >
> > local_trylock might fail, and this would lead to a charge cache bypass if
> > the calling context doesn't allow spinning (gfpflags_allow_spinning).
> > In those cases charge the memcg counter directly and fail early if
> > that is not possible. This might cause a premature charge failure,
> > but it will allow opportunistic charging that is safe from the
> > try_alloc_pages path.
> >
> > Acked-by: Michal Hocko <mhocko@suse.com>
> > Signed-off-by: Alexei Starovoitov <ast@kernel.org>
>
> Acked-by: Shakeel Butt <shakeel.butt@linux.dev>
>
> > @@ -1851,7 +1856,14 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
> >  {
> >       unsigned long flags;
> >
> > -     local_lock_irqsave(&memcg_stock.stock_lock, flags);
> > +     if (!local_trylock_irqsave(&memcg_stock.stock_lock, flags)) {
> > +             /*
> > +              * In case of unlikely failure to lock percpu stock_lock
> > +              * uncharge memcg directly.
> > +              */
> > +             mem_cgroup_cancel_charge(memcg, nr_pages);
>
> mem_cgroup_cancel_charge() has been removed by a patch in mm-tree. Maybe
> we can either revive mem_cgroup_cancel_charge() or simply inline it
> here.

Ouch.

this one?
https://lore.kernel.org/all/20241211203951.764733-4-joshua.hahnjy@gmail.com/

Joshua,

could you hold on to that cleanup?
Or leave mem_cgroup_cancel_charge() in place?

Joshua Hahn Jan. 16, 2025, 8:07 p.m. UTC | #4
On Wed, 15 Jan 2025 18:22:28 -0800 Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:

> On Wed, Jan 15, 2025 at 4:12 PM Shakeel Butt <shakeel.butt@linux.dev> wrote:
> >
> > On Tue, Jan 14, 2025 at 06:17:43PM -0800, Alexei Starovoitov wrote:
> > > From: Alexei Starovoitov <ast@kernel.org>
> > >
> > > Teach memcg to operate under trylock conditions when spinning locks
> > > cannot be used.
> > >
> > > local_trylock might fail, and this would lead to a charge cache bypass if
> > > the calling context doesn't allow spinning (gfpflags_allow_spinning).
> > > In those cases charge the memcg counter directly and fail early if
> > > that is not possible. This might cause a premature charge failure,
> > > but it will allow opportunistic charging that is safe from the
> > > try_alloc_pages path.
> > >
> > > Acked-by: Michal Hocko <mhocko@suse.com>
> > > Signed-off-by: Alexei Starovoitov <ast@kernel.org>
> >
> > Acked-by: Shakeel Butt <shakeel.butt@linux.dev>
> >
> > > @@ -1851,7 +1856,14 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
> > >  {
> > >       unsigned long flags;
> > >
> > > -     local_lock_irqsave(&memcg_stock.stock_lock, flags);
> > > +     if (!local_trylock_irqsave(&memcg_stock.stock_lock, flags)) {
> > > +             /*
> > > +              * In case of unlikely failure to lock percpu stock_lock
> > > +              * uncharge memcg directly.
> > > +              */
> > > +             mem_cgroup_cancel_charge(memcg, nr_pages);
> >
> > mem_cgroup_cancel_charge() has been removed by a patch in mm-tree. Maybe
> > we can either revive mem_cgroup_cancel_charge() or simply inline it
> > here.
> 
> Ouch.
> 
> this one?
> https://lore.kernel.org/all/20241211203951.764733-4-joshua.hahnjy@gmail.com/
> 
> Joshua,
> 
> could you hold on to that cleanup?
> Or leave mem_cgroup_cancel_charge() in place?

Hi Alexei,

Yes, that makes sense to me. The goal of the patch was to remove the
function's last users and then remove it, but if there are still users,
the patch doesn't make any sense :-)

Have a great day!
Joshua

Hi Andrew,

I think that the patch was moved into mm-stable earlier this week.
I was wondering if it would be possible to revert the patch and
replace it with this one below. The only difference is that I leave
mem_cgroup_cancel_charge untouched in this version.

I'm also not sure if this is the best way to send the revised patch.
Please let me know if there is another way I should do this to make
it easier for you!

Thank you for your time!
Joshua

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index d1ee98dc3a38..c8d0554e5490 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -620,8 +620,6 @@ static inline bool mem_cgroup_below_min(struct mem_cgroup *target,
 		page_counter_read(&memcg->memory);
 }

-void mem_cgroup_commit_charge(struct folio *folio, struct mem_cgroup *memcg);
-
 int __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp);

 /**
@@ -646,9 +644,6 @@ static inline int mem_cgroup_charge(struct folio *folio, struct mm_struct *mm,
 	return __mem_cgroup_charge(folio, mm, gfp);
 }

-int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg, gfp_t gfp,
-		long nr_pages);
-
 int mem_cgroup_charge_hugetlb(struct folio* folio, gfp_t gfp);

 int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm,
@@ -1137,23 +1132,12 @@ static inline bool mem_cgroup_below_min(struct mem_cgroup *target,
 	return false;
 }

-static inline void mem_cgroup_commit_charge(struct folio *folio,
-		struct mem_cgroup *memcg)
-{
-}
-
 static inline int mem_cgroup_charge(struct folio *folio,
 		struct mm_struct *mm, gfp_t gfp)
 {
 	return 0;
 }

-static inline int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg,
-		gfp_t gfp, long nr_pages)
-{
-	return 0;
-}
-
 static inline int mem_cgroup_swapin_charge_folio(struct folio *folio,
 			struct mm_struct *mm, gfp_t gfp, swp_entry_t entry)
 {
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index dd171bdf1bcc..aeff2af8d722 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2402,18 +2402,6 @@ static void commit_charge(struct folio *folio, struct mem_cgroup *memcg)
 	folio->memcg_data = (unsigned long)memcg;
 }

-/**
- * mem_cgroup_commit_charge - commit a previously successful try_charge().
- * @folio: folio to commit the charge to.
- * @memcg: memcg previously charged.
- */
-void mem_cgroup_commit_charge(struct folio *folio, struct mem_cgroup *memcg)
-{
-	css_get(&memcg->css);
-	commit_charge(folio, memcg);
-	memcg1_commit_charge(folio, memcg);
-}
-
 static inline void __mod_objcg_mlstate(struct obj_cgroup *objcg,
 				       struct pglist_data *pgdat,
 				       enum node_stat_item idx, int nr)
@@ -4501,7 +4489,9 @@ static int charge_memcg(struct folio *folio, struct mem_cgroup *memcg,
 	if (ret)
 		goto out;

-	mem_cgroup_commit_charge(folio, memcg);
+	css_get(&memcg->css);
+	commit_charge(folio, memcg);
+	memcg1_commit_charge(folio, memcg);
 out:
 	return ret;
 }
@@ -4527,40 +4517,6 @@ bool memcg_accounts_hugetlb(void)
 #endif
 }

-/**
- * mem_cgroup_hugetlb_try_charge - try to charge the memcg for a hugetlb folio
- * @memcg: memcg to charge.
- * @gfp: reclaim mode.
- * @nr_pages: number of pages to charge.
- *
- * This function is called when allocating a huge page folio to determine if
- * the memcg has the capacity for it. It does not commit the charge yet,
- * as the hugetlb folio itself has not been obtained from the hugetlb pool.
- *
- * Once we have obtained the hugetlb folio, we can call
- * mem_cgroup_commit_charge() to commit the charge. If we fail to obtain the
- * folio, we should instead call mem_cgroup_cancel_charge() to undo the effect
- * of try_charge().
- *
- * Returns 0 on success. Otherwise, an error code is returned.
- */
-int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg, gfp_t gfp,
-			long nr_pages)
-{
-	/*
-	 * If hugetlb memcg charging is not enabled, do not fail hugetlb allocation,
-	 * but do not attempt to commit charge later (or cancel on error) either.
-	 */
-	if (mem_cgroup_disabled() || !memcg ||
-		!cgroup_subsys_on_dfl(memory_cgrp_subsys) || !memcg_accounts_hugetlb())
-		return -EOPNOTSUPP;
-
-	if (try_charge(memcg, gfp, nr_pages))
-		return -ENOMEM;
-
-	return 0;
-}
-
 int mem_cgroup_charge_hugetlb(struct folio *folio, gfp_t gfp)
 {
 	struct mem_cgroup *memcg = get_mem_cgroup_from_current();

Johannes Weiner Jan. 17, 2025, 5:36 p.m. UTC | #5
On Thu, Jan 16, 2025 at 12:07:28PM -0800, Joshua Hahn wrote:
> On Wed, 15 Jan 2025 18:22:28 -0800 Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
> > On Wed, Jan 15, 2025 at 4:12 PM Shakeel Butt <shakeel.butt@linux.dev> wrote:
> > > On Tue, Jan 14, 2025 at 06:17:43PM -0800, Alexei Starovoitov wrote:
> > > > @@ -1851,7 +1856,14 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
> > > >  {
> > > >       unsigned long flags;
> > > >
> > > > -     local_lock_irqsave(&memcg_stock.stock_lock, flags);
> > > > +     if (!local_trylock_irqsave(&memcg_stock.stock_lock, flags)) {
> > > > +             /*
> > > > +              * In case of unlikely failure to lock percpu stock_lock
> > > > +              * uncharge memcg directly.
> > > > +              */
> > > > +             mem_cgroup_cancel_charge(memcg, nr_pages);
> > >
> > > mem_cgroup_cancel_charge() has been removed by a patch in mm-tree. Maybe
> > > we can either revive mem_cgroup_cancel_charge() or simply inline it
> > > here.
> > 
> > Ouch.
> > 
> > this one?
> > https://lore.kernel.org/all/20241211203951.764733-4-joshua.hahnjy@gmail.com/
> > 
> > Joshua,
> > 
> > could you hold on to that cleanup?
> > Or leave mem_cgroup_cancel_charge() in place?
> 
> Hi Andrew,
> 
> I think that the patch was moved into mm-stable earlier this week.
> I was wondering if it would be possible to revert the patch and
> replace it with this one below. The only difference is that I leave
> mem_cgroup_cancel_charge untouched in this version.

Let's not revert.

This is a bit of a weird function to keep around without the rest of
the transaction API. It doesn't need external linkage, either.

Alexei, can you please just open-code the two page_counter calls?
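
Open-coded along those lines, the trylock-failure path in refill_stock()
would look roughly like this (a sketch of the suggestion, not a posted
patch):

	if (!local_trylock_irqsave(&memcg_stock.stock_lock, flags)) {
		/*
		 * In case of unlikely failure to lock percpu stock_lock,
		 * uncharge the page counters directly.
		 */
		if (mem_cgroup_is_root(memcg))
			return;
		page_counter_uncharge(&memcg->memory, nr_pages);
		if (do_memsw_account())
			page_counter_uncharge(&memcg->memsw, nr_pages);
		return;
	}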

Patch

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 7b3503d12aaf..e4c7049465e0 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1756,7 +1756,8 @@ static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
  *
  * returns true if successful, false otherwise.
  */
-static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
+static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages,
+			  gfp_t gfp_mask)
 {
 	struct memcg_stock_pcp *stock;
 	unsigned int stock_pages;
@@ -1766,7 +1767,11 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
 	if (nr_pages > MEMCG_CHARGE_BATCH)
 		return ret;
 
-	local_lock_irqsave(&memcg_stock.stock_lock, flags);
+	if (!local_trylock_irqsave(&memcg_stock.stock_lock, flags)) {
+		if (!gfpflags_allow_spinning(gfp_mask))
+			return ret;
+		local_lock_irqsave(&memcg_stock.stock_lock, flags);
+	}
 
 	stock = this_cpu_ptr(&memcg_stock);
 	stock_pages = READ_ONCE(stock->nr_pages);
@@ -1851,7 +1856,14 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
 {
 	unsigned long flags;
 
-	local_lock_irqsave(&memcg_stock.stock_lock, flags);
+	if (!local_trylock_irqsave(&memcg_stock.stock_lock, flags)) {
+		/*
+		 * In case of unlikely failure to lock percpu stock_lock
+		 * uncharge memcg directly.
+		 */
+		mem_cgroup_cancel_charge(memcg, nr_pages);
+		return;
+	}
 	__refill_stock(memcg, nr_pages);
 	local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
 }
@@ -2196,9 +2208,13 @@ int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
 	unsigned long pflags;
 
 retry:
-	if (consume_stock(memcg, nr_pages))
+	if (consume_stock(memcg, nr_pages, gfp_mask))
 		return 0;
 
+	if (!gfpflags_allow_spinning(gfp_mask))
+		/* Avoid the refill and flush of the older stock */
+		batch = nr_pages;
+
 	if (!do_memsw_account() ||
 	    page_counter_try_charge(&memcg->memsw, batch, &counter)) {
 		if (page_counter_try_charge(&memcg->memory, batch, &counter))