
[bpf-next,v2,2/6] mm, bpf: Introduce free_pages_nolock()

Message ID: 20241210023936.46871-3-alexei.starovoitov@gmail.com (mailing list archive)
State: Changes Requested
Delegated to: BPF
Series: bpf, mm: Introduce __GFP_TRYLOCK

Checks

Context Check Description
bpf/vmtest-bpf-next-VM_Test-30 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-31 success Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17-O2
bpf/vmtest-bpf-next-VM_Test-32 success Logs for x86_64-llvm-17 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-35 success Logs for x86_64-llvm-17 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-36 success Logs for x86_64-llvm-17 / veristat-kernel
bpf/vmtest-bpf-next-VM_Test-37 success Logs for x86_64-llvm-17 / veristat-meta
bpf/vmtest-bpf-next-VM_Test-38 success Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-39 success Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18-O2
bpf/vmtest-bpf-next-VM_Test-40 success Logs for x86_64-llvm-18 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-44 success Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-45 success Logs for x86_64-llvm-18 / veristat-kernel
bpf/vmtest-bpf-next-VM_Test-46 success Logs for x86_64-llvm-18 / veristat-meta
bpf/vmtest-bpf-next-VM_Test-33 success Logs for x86_64-llvm-17 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-34 success Logs for x86_64-llvm-17 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-41 success Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-42 success Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-43 success Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for bpf-next
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 568 this patch: 568
netdev/build_tools success Errors and warnings before: 0 (+0) this patch: 0 (+0)
netdev/cc_maintainers success CCed 3 of 3 maintainers
netdev/build_clang success Errors and warnings before: 980 this patch: 980
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 15903 this patch: 15903
netdev/checkpatch warning CHECK: extern prototypes should be avoided in .h files; WARNING: Missing a blank line after declarations; WARNING: line length of 82 exceeds 80 columns; WARNING: line length of 83 exceeds 80 columns; WARNING: line length of 84 exceeds 80 columns; WARNING: line length of 87 exceeds 80 columns
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 1 this patch: 1
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-next-PR fail PR summary
bpf/vmtest-bpf-next-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-next-VM_Test-4 fail Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-next-VM_Test-5 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-6 success Logs for aarch64-gcc / test
bpf/vmtest-bpf-next-VM_Test-7 success Logs for aarch64-gcc / veristat-kernel
bpf/vmtest-bpf-next-VM_Test-8 success Logs for aarch64-gcc / veristat-meta
bpf/vmtest-bpf-next-VM_Test-9 fail Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-10 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-11 success Logs for s390x-gcc / test
bpf/vmtest-bpf-next-VM_Test-12 success Logs for s390x-gcc / veristat-kernel
bpf/vmtest-bpf-next-VM_Test-13 success Logs for s390x-gcc / veristat-meta
bpf/vmtest-bpf-next-VM_Test-14 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-15 fail Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-16 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-17 success Logs for x86_64-gcc / test
bpf/vmtest-bpf-next-VM_Test-18 success Logs for x86_64-gcc / veristat-kernel
bpf/vmtest-bpf-next-VM_Test-19 success Logs for x86_64-gcc / veristat-meta
bpf/vmtest-bpf-next-VM_Test-20 fail Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-21 fail Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17-O2
bpf/vmtest-bpf-next-VM_Test-22 success Logs for x86_64-llvm-17 / test
bpf/vmtest-bpf-next-VM_Test-23 success Logs for x86_64-llvm-17 / veristat-kernel
bpf/vmtest-bpf-next-VM_Test-24 success Logs for x86_64-llvm-17 / veristat-meta
bpf/vmtest-bpf-next-VM_Test-25 fail Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-26 fail Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18-O2
bpf/vmtest-bpf-next-VM_Test-27 success Logs for x86_64-llvm-18 / test
bpf/vmtest-bpf-next-VM_Test-28 success Logs for x86_64-llvm-18 / veristat-kernel
bpf/vmtest-bpf-next-VM_Test-29 success Logs for x86_64-llvm-18 / veristat-meta

Commit Message

Alexei Starovoitov Dec. 10, 2024, 2:39 a.m. UTC
From: Alexei Starovoitov <ast@kernel.org>

Introduce free_pages_nolock() that can free a page without taking locks.
It relies on trylock only and can be called from any context.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/gfp.h      |  1 +
 include/linux/mm_types.h |  4 +++
 include/linux/mmzone.h   |  3 ++
 mm/page_alloc.c          | 72 +++++++++++++++++++++++++++++++++++-----
 4 files changed, 72 insertions(+), 8 deletions(-)
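
Not part of the patch or its commit message: a minimal usage sketch for orientation, showing the kind of caller this API is meant for. Only trylocks are taken internally, so the call is safe while a raw_spinlock_t is held (or from IRQ), where free_pages()/__free_pages() would not be. "my_lock" and "my_spare_page" are made-up names for illustration.

static DEFINE_RAW_SPINLOCK(my_lock);
static struct page *my_spare_page;

static void my_drop_spare_page(void)
{
	unsigned long flags;

	raw_spin_lock_irqsave(&my_lock, flags);
	if (my_spare_page) {
		/* Safe under a raw spinlock: only trylocks are taken
		 * internally, contended frees are deferred. */
		free_pages_nolock(my_spare_page, 0);
		my_spare_page = NULL;
	}
	raw_spin_unlock_irqrestore(&my_lock, flags);
}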

Comments

Sebastian Andrzej Siewior Dec. 10, 2024, 8:35 a.m. UTC | #1
On 2024-12-09 18:39:32 [-0800], Alexei Starovoitov wrote:
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index d511e68903c6..a969a62ec0c3 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -1251,9 +1254,33 @@ static void free_one_page(struct zone *zone, struct page *page,
>  			  unsigned long pfn, unsigned int order,
>  			  fpi_t fpi_flags)
>  {
> +	struct llist_head *llhead;
>  	unsigned long flags;
>  
> -	spin_lock_irqsave(&zone->lock, flags);
> +	if (!spin_trylock_irqsave(&zone->lock, flags)) {
> +		if (unlikely(fpi_flags & FPI_TRYLOCK)) {
> +			/* Remember the order */
> +			page->order = order;
> +			/* Add the page to the free list */
> +			llist_add(&page->pcp_llist, &zone->trylock_free_pages);
> +			return;
> +		}
> +		spin_lock_irqsave(&zone->lock, flags);
> +	}
> +
> +	/* The lock succeeded. Process deferred pages. */
> +	llhead = &zone->trylock_free_pages;
> +	if (unlikely(!llist_empty(llhead))) {
> +		struct llist_node *llnode;
> +		struct page *p, *tmp;
> +
> +		llnode = llist_del_all(llhead);

Do you really need to turn the list around?

> +		llist_for_each_entry_safe(p, tmp, llnode, pcp_llist) {
> +			unsigned int p_order = p->order;
> +			split_large_buddy(zone, p, page_to_pfn(p), p_order, fpi_flags);
> +			__count_vm_events(PGFREE, 1 << p_order);
> +		}

We had something like that (returning memory in IRQ/ irq-off) in RT tree
and we got rid of it before posting the needed bits to mm.

If we really intend to do something like this, could we please process
this list in an explicitly locked section? I mean not in a try-lock
fashion which might have originated in an IRQ-off region on PREEMPT_RT
but in an explicit locked section which would remain preemptible. This
would also avoid the locking problem down the road when
shuffle_pick_tail() invokes get_random_u64() which in turn acquires a
spinlock_t.

> +	}
>  	split_large_buddy(zone, page, pfn, order, fpi_flags);
>  	spin_unlock_irqrestore(&zone->lock, flags);
>  

Sebastian
Alexei Starovoitov Dec. 10, 2024, 10:49 p.m. UTC | #2
On Tue, Dec 10, 2024 at 12:35 AM Sebastian Andrzej Siewior
<bigeasy@linutronix.de> wrote:
>
> On 2024-12-09 18:39:32 [-0800], Alexei Starovoitov wrote:
> > diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> > index d511e68903c6..a969a62ec0c3 100644
> > --- a/mm/page_alloc.c
> > +++ b/mm/page_alloc.c
> > @@ -1251,9 +1254,33 @@ static void free_one_page(struct zone *zone, struct page *page,
> >                         unsigned long pfn, unsigned int order,
> >                         fpi_t fpi_flags)
> >  {
> > +     struct llist_head *llhead;
> >       unsigned long flags;
> >
> > -     spin_lock_irqsave(&zone->lock, flags);
> > +     if (!spin_trylock_irqsave(&zone->lock, flags)) {
> > +             if (unlikely(fpi_flags & FPI_TRYLOCK)) {
> > +                     /* Remember the order */
> > +                     page->order = order;
> > +                     /* Add the page to the free list */
> > +                     llist_add(&page->pcp_llist, &zone->trylock_free_pages);
> > +                     return;
> > +             }
> > +             spin_lock_irqsave(&zone->lock, flags);
> > +     }
> > +
> > +     /* The lock succeeded. Process deferred pages. */
> > +     llhead = &zone->trylock_free_pages;
> > +     if (unlikely(!llist_empty(llhead))) {
> > +             struct llist_node *llnode;
> > +             struct page *p, *tmp;
> > +
> > +             llnode = llist_del_all(llhead);
>
> Do you really need to turn the list around?

I didn't think LIFO vs FIFO would make a difference.
Why spend time rotating it?

> > +             llist_for_each_entry_safe(p, tmp, llnode, pcp_llist) {
> > +                     unsigned int p_order = p->order;
> > +                     split_large_buddy(zone, p, page_to_pfn(p), p_order, fpi_flags);
> > +                     __count_vm_events(PGFREE, 1 << p_order);
> > +             }
>
> We had something like that (returning memory in IRQ/ irq-off) in RT tree
> and we got rid of it before posting the needed bits to mm.
>
> If we really intend to do something like this, could we please process
> this list in an explicitly locked section? I mean not in a try-lock
> fashion which might have originated in an IRQ-off region on PREEMPT_RT
> but in an explicit locked section which would remain preemptible. This
> would also avoid the locking problem down the road when
> shuffle_pick_tail() invokes get_random_u64() which in turn acquires a
> spinlock_t.

I see. So the concern is that although spin_lock_irqsave(&zone->lock)
is sleepable in RT, the bpf prog might have been called in a context
where preemption is disabled, and doing split_large_buddy() for many
pages might take too much time?
How about kicking irq_work then? The callback is in kthread in RT.
We can irq_work_queue() right after llist_add().

Or we can process only N pages at a time in this loop and
llist_add() leftover back into zone->trylock_free_pages.
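
A rough sketch of the irq_work idea being floated here (illustration only, not the posted patch; it assumes a hypothetical trylock_free_work irq_work field added to struct zone next to trylock_free_pages and initialized with init_irq_work() during zone setup):

static void zone_drain_trylock_free_pages(struct irq_work *work)
{
	struct zone *zone = container_of(work, struct zone, trylock_free_work);
	struct llist_node *llnode;
	struct page *p, *tmp;
	unsigned long flags;

	/* On PREEMPT_RT irq_work callbacks run from a kthread, so taking
	 * zone->lock (a sleeping lock there) stays preemptible. */
	spin_lock_irqsave(&zone->lock, flags);
	llnode = llist_del_all(&zone->trylock_free_pages);
	llist_for_each_entry_safe(p, tmp, llnode, pcp_llist) {
		unsigned int p_order = p->order;

		split_large_buddy(zone, p, page_to_pfn(p), p_order, FPI_NONE);
		__count_vm_events(PGFREE, 1 << p_order);
	}
	spin_unlock_irqrestore(&zone->lock, flags);
}

and in the trylock-failure path of free_one_page():

	page->order = order;
	llist_add(&page->pcp_llist, &zone->trylock_free_pages);
	irq_work_queue(&zone->trylock_free_work);
	return;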
Vlastimil Babka Dec. 11, 2024, 10:11 a.m. UTC | #3
On 12/10/24 03:39, Alexei Starovoitov wrote:
> From: Alexei Starovoitov <ast@kernel.org>
> 
> Introduce free_pages_nolock() that can free a page without taking locks.
> It relies on trylock only and can be called from any context.
> 
> Signed-off-by: Alexei Starovoitov <ast@kernel.org>

<snip>

> +/* Can be called while holding raw_spin_lock or from IRQ. RCU must be watching. */
> +void free_pages_nolock(struct page *page, unsigned int order)

What does "RCU must be watching" mean and why?

> +{
> +	int head = PageHead(page);
> +	struct alloc_tag *tag = pgalloc_tag_get(page);
> +
> +	if (put_page_testzero(page)) {
> +		__free_unref_page(page, order, FPI_TRYLOCK);
> +	} else if (!head) {
> +		pgalloc_tag_sub_pages(tag, (1 << order) - 1);
> +		while (order-- > 0)
> +			__free_unref_page(page + (1 << order), order, FPI_TRYLOCK);

Not your fault for trying to support everything __free_pages did,
specifically order > 0 pages that are not compound and thus needing this
extra !head branch. We'd love to get rid of that eventually in
__free_pages(), but at least I think we don't need to support it in a new
API and instead any users switching to it should know it's not supported.
I suppose BPF doesn't need that, right?
Maybe if the function was taking a folio instead of page, it would be the
best as that has to be order-0 or compound by definition. It also wouldn't
need the order parameter. What do you think, Matthew?

> +	}
> +}
> +
>  void free_pages(unsigned long addr, unsigned int order)
>  {
>  	if (addr != 0) {
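
For reference, a hypothetical sketch of the folio variant suggested above (the name and shape are invented here, not proposed code): since a folio is order-0 or compound by definition, both the !head branch and the order argument go away.

void free_folio_nolock(struct folio *folio)
{
	/* A folio is never a non-compound high-order page, so the
	 * put_page_testzero() equivalent is all that matters here. */
	if (folio_put_testzero(folio))
		__free_unref_page(&folio->page, folio_order(folio), FPI_TRYLOCK);
}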
Alexei Starovoitov Dec. 12, 2024, 1:43 a.m. UTC | #4
On Wed, Dec 11, 2024 at 2:11 AM Vlastimil Babka <vbabka@suse.cz> wrote:
>
> On 12/10/24 03:39, Alexei Starovoitov wrote:
> > From: Alexei Starovoitov <ast@kernel.org>
> >
> > Introduce free_pages_nolock() that can free a page without taking locks.
> > It relies on trylock only and can be called from any context.
> >
> > Signed-off-by: Alexei Starovoitov <ast@kernel.org>
>
> <snip>
>
> > +/* Can be called while holding raw_spin_lock or from IRQ. RCU must be watching. */
> > +void free_pages_nolock(struct page *page, unsigned int order)
>
> What does "RCU must be watching" mean and why?

Meaning that rcu_is_watching() must be true,
because pgalloc_tag_get() relies on RCU.
free_unref_page() and the things it calls don't use RCU directly,
but they call tracepoints, which need RCU too.

Hence the comment, to make clear that free_pages_nolock() cannot be
called from literally _any_ context; some care needs to be taken.
bpf needs RCU, so we don't allow attaching bpf to idle and notrace code,
but, last I heard, the notrace attribute is not 100% reliable on some odd arches.
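
To make the precondition concrete (editorial sketch, not proposed code; the wrapper name is invented), a defensive caller in an unusual context could assert it with rcu_is_watching():

static void my_free_page_checked(struct page *page, unsigned int order)
{
	/* pgalloc_tag_get() and the tracepoints underneath rely on RCU,
	 * so the caller must not be in an RCU-idle context. */
	if (WARN_ON_ONCE(!rcu_is_watching()))
		return;
	free_pages_nolock(page, order);
}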

> > +{
> > +     int head = PageHead(page);
> > +     struct alloc_tag *tag = pgalloc_tag_get(page);
> > +
> > +     if (put_page_testzero(page)) {
> > +             __free_unref_page(page, order, FPI_TRYLOCK);
> > +     } else if (!head) {
> > +             pgalloc_tag_sub_pages(tag, (1 << order) - 1);
> > +             while (order-- > 0)
> > +                     __free_unref_page(page + (1 << order), order, FPI_TRYLOCK);
>
> Not your fault for trying to support everything __free_pages did,
> specifically order > 0 pages that are not compound and thus needing this
> extra !head branch. We'd love to get rid of that eventually in
> __free_pages(), but at least I think we don't need to support it in a new
> API and instead any users switching to it should know it's not supported.

Great. Good to know.

> I suppose BPF doesn't need that, right?

Nope.
Initially I wrote the above bit as:

void free_pages_nolock(struct page *page, unsigned int order)
{
         if (put_page_testzero(page))
                __free_unref_page(page, order, FPI_TRYLOCK);
}

but then I studied Matthew's commit
e320d3012d25 ("mm/page_alloc.c: fix freeing non-compound pages")

and couldn't convince myself that the race he described
+               get_page(page);
+               free_pages(addr, 3);
+               put_page(page);

will never happen.
It won't happen if bpf is the only user of this api,
but who knows what happens in the future.
So copy-pasted this part just in case.
Will be happy to drop it.

> Maybe if the function was taking a folio instead of page, it would be the
> best as that has to be order-0 or compound by definition. It also wouldn't
> need the order parameter. What do you think, Matthew?

For bpf there is no use case for folios.
All we need is an order-0 page and, separately,
kmalloc_nolock() will later call it from new_slab().
And I think new_slab() might need order >= 1.
So that's the reason to have the order parameter.
Sebastian Andrzej Siewior Dec. 12, 2024, 2:44 p.m. UTC | #5
On 2024-12-10 14:49:14 [-0800], Alexei Starovoitov wrote:
> On Tue, Dec 10, 2024 at 12:35 AM Sebastian Andrzej Siewior
> <bigeasy@linutronix.de> wrote:
> >
> > On 2024-12-09 18:39:32 [-0800], Alexei Starovoitov wrote:
> > > diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> > > index d511e68903c6..a969a62ec0c3 100644
> > > --- a/mm/page_alloc.c
> > > +++ b/mm/page_alloc.c
> > > @@ -1251,9 +1254,33 @@ static void free_one_page(struct zone *zone, struct page *page,
> > >                         unsigned long pfn, unsigned int order,
> > >                         fpi_t fpi_flags)
> > >  {
> > > +     struct llist_head *llhead;
> > >       unsigned long flags;
> > >
> > > -     spin_lock_irqsave(&zone->lock, flags);
> > > +     if (!spin_trylock_irqsave(&zone->lock, flags)) {
> > > +             if (unlikely(fpi_flags & FPI_TRYLOCK)) {
> > > +                     /* Remember the order */
> > > +                     page->order = order;
> > > +                     /* Add the page to the free list */
> > > +                     llist_add(&page->pcp_llist, &zone->trylock_free_pages);
> > > +                     return;
> > > +             }
> > > +             spin_lock_irqsave(&zone->lock, flags);
> > > +     }
> > > +
> > > +     /* The lock succeeded. Process deferred pages. */
> > > +     llhead = &zone->trylock_free_pages;
> > > +     if (unlikely(!llist_empty(llhead))) {
> > > +             struct llist_node *llnode;
> > > +             struct page *p, *tmp;
> > > +
> > > +             llnode = llist_del_all(llhead);
> >
> > Do you really need to turn the list around?
> 
> I didn't think LIFO vs FIFO would make a difference.
> Why spend time rotating it?

I'm sorry. I read llist_reverse_order() in there but it is not there. So
it is all good.

> > > +             llist_for_each_entry_safe(p, tmp, llnode, pcp_llist) {
> > > +                     unsigned int p_order = p->order;
> > > +                     split_large_buddy(zone, p, page_to_pfn(p), p_order, fpi_flags);
> > > +                     __count_vm_events(PGFREE, 1 << p_order);
> > > +             }
> >
> > We had something like that (returning memory in IRQ/ irq-off) in RT tree
> > and we got rid of it before posting the needed bits to mm.
> >
> > If we really intend to do something like this, could we please process
> > this list in an explicitly locked section? I mean not in a try-lock
> > fashion which might have originated in an IRQ-off region on PREEMPT_RT
> > but in an explicit locked section which would remain preemptible. This
> > would also avoid the locking problem down the road when
> > shuffle_pick_tail() invokes get_random_u64() which in turn acquires a
> > spinlock_t.
> 
> I see. So the concern is though spin_lock_irqsave(&zone->lock)
> is sleepable in RT, bpf prog might have been called in the context
> where preemption is disabled and do split_large_buddy() for many
> pages might take too much time?
Yes.

> How about kicking irq_work then? The callback is in kthread in RT.
> We can irq_work_queue() right after llist_add().
> 
> Or we can process only N pages at a time in this loop and
> llist_add() leftover back into zone->trylock_free_pages.

It could be simpler to not process the trylock_free_pages list in the
trylock attempt, only in the lock case which is preemptible.

Sebastian
Alexei Starovoitov Dec. 12, 2024, 7:57 p.m. UTC | #6
On Thu, Dec 12, 2024 at 6:44 AM Sebastian Andrzej Siewior
<bigeasy@linutronix.de> wrote:
>
> On 2024-12-10 14:49:14 [-0800], Alexei Starovoitov wrote:
> > On Tue, Dec 10, 2024 at 12:35 AM Sebastian Andrzej Siewior
> > <bigeasy@linutronix.de> wrote:
> > >
> > > On 2024-12-09 18:39:32 [-0800], Alexei Starovoitov wrote:
> > > > diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> > > > index d511e68903c6..a969a62ec0c3 100644
> > > > --- a/mm/page_alloc.c
> > > > +++ b/mm/page_alloc.c
> > > > @@ -1251,9 +1254,33 @@ static void free_one_page(struct zone *zone, struct page *page,
> > > >                         unsigned long pfn, unsigned int order,
> > > >                         fpi_t fpi_flags)
> > > >  {
> > > > +     struct llist_head *llhead;
> > > >       unsigned long flags;
> > > >
> > > > -     spin_lock_irqsave(&zone->lock, flags);
> > > > +     if (!spin_trylock_irqsave(&zone->lock, flags)) {
> > > > +             if (unlikely(fpi_flags & FPI_TRYLOCK)) {
> > > > +                     /* Remember the order */
> > > > +                     page->order = order;
> > > > +                     /* Add the page to the free list */
> > > > +                     llist_add(&page->pcp_llist, &zone->trylock_free_pages);
> > > > +                     return;
> > > > +             }
> > > > +             spin_lock_irqsave(&zone->lock, flags);
> > > > +     }
> > > > +
> > > > +     /* The lock succeeded. Process deferred pages. */
> > > > +     llhead = &zone->trylock_free_pages;
> > > > +     if (unlikely(!llist_empty(llhead))) {
> > > > +             struct llist_node *llnode;
> > > > +             struct page *p, *tmp;
> > > > +
> > > > +             llnode = llist_del_all(llhead);
> > >
> > > Do you really need to turn the list around?
> >
> > I didn't think LIFO vs FIFO would make a difference.
> > Why spend time rotating it?
>
> I'm sorry. I read llist_reverse_order() in there but it is not there. So
> it is all good.
>
> > > > +             llist_for_each_entry_safe(p, tmp, llnode, pcp_llist) {
> > > > +                     unsigned int p_order = p->order;
> > > > +                     split_large_buddy(zone, p, page_to_pfn(p), p_order, fpi_flags);
> > > > +                     __count_vm_events(PGFREE, 1 << p_order);
> > > > +             }
> > >
> > > We had something like that (returning memory in IRQ/ irq-off) in RT tree
> > > and we got rid of it before posting the needed bits to mm.
> > >
> > > If we really intend to do something like this, could we please process
> > > this list in an explicitly locked section? I mean not in a try-lock
> > > fashion which might have originated in an IRQ-off region on PREEMPT_RT
> > > but in an explicit locked section which would remain preemptible. This
> > > would also avoid the locking problem down the road when
> > > shuffle_pick_tail() invokes get_random_u64() which in turn acquires a
> > > spinlock_t.
> >
> > I see. So the concern is though spin_lock_irqsave(&zone->lock)
> > is sleepable in RT, bpf prog might have been called in the context
> > where preemption is disabled and do split_large_buddy() for many
> > pages might take too much time?
> Yes.
>
> > How about kicking irq_work then? The callback is in kthread in RT.
> > We can irq_work_queue() right after llist_add().
> >
> > Or we can process only N pages at a time in this loop and
> > llist_add() leftover back into zone->trylock_free_pages.
>
> It could be simpler to not process the trylock_free_pages list in the
> trylock attempt, only in the lock case which is preemptible.

Makes sense. Will change to:

        /* The lock succeeded. Process deferred pages. */
        llhead = &zone->trylock_free_pages;
-       if (unlikely(!llist_empty(llhead))) {
+       if (unlikely(!llist_empty(llhead) && !(fpi_flags & FPI_TRYLOCK))) {

Patch

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index f68daa9c997b..dcae733ed006 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -394,6 +394,7 @@  __meminit void *alloc_pages_exact_nid_noprof(int nid, size_t size, gfp_t gfp_mas
 	__get_free_pages((gfp_mask) | GFP_DMA, (order))
 
 extern void __free_pages(struct page *page, unsigned int order);
+extern void free_pages_nolock(struct page *page, unsigned int order);
 extern void free_pages(unsigned long addr, unsigned int order);
 
 #define __free_page(page) __free_pages((page), 0)
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 7361a8f3ab68..52547b3e5fd8 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -99,6 +99,10 @@  struct page {
 				/* Or, free page */
 				struct list_head buddy_list;
 				struct list_head pcp_list;
+				struct {
+					struct llist_node pcp_llist;
+					unsigned int order;
+				};
 			};
 			/* See page-flags.h for PAGE_MAPPING_FLAGS */
 			struct address_space *mapping;
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index b36124145a16..1a854e0a9e3b 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -953,6 +953,9 @@  struct zone {
 	/* Primarily protects free_area */
 	spinlock_t		lock;
 
+	/* Pages to be freed when next trylock succeeds */
+	struct llist_head	trylock_free_pages;
+
 	/* Write-intensive fields used by compaction and vmstats. */
 	CACHELINE_PADDING(_pad2_);
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index d511e68903c6..a969a62ec0c3 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -88,6 +88,9 @@  typedef int __bitwise fpi_t;
  */
 #define FPI_TO_TAIL		((__force fpi_t)BIT(1))
 
+/* Free the page without taking locks. Rely on trylock only. */
+#define FPI_TRYLOCK		((__force fpi_t)BIT(2))
+
 /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
 static DEFINE_MUTEX(pcp_batch_high_lock);
 #define MIN_PERCPU_PAGELIST_HIGH_FRACTION (8)
@@ -1251,9 +1254,33 @@  static void free_one_page(struct zone *zone, struct page *page,
 			  unsigned long pfn, unsigned int order,
 			  fpi_t fpi_flags)
 {
+	struct llist_head *llhead;
 	unsigned long flags;
 
-	spin_lock_irqsave(&zone->lock, flags);
+	if (!spin_trylock_irqsave(&zone->lock, flags)) {
+		if (unlikely(fpi_flags & FPI_TRYLOCK)) {
+			/* Remember the order */
+			page->order = order;
+			/* Add the page to the free list */
+			llist_add(&page->pcp_llist, &zone->trylock_free_pages);
+			return;
+		}
+		spin_lock_irqsave(&zone->lock, flags);
+	}
+
+	/* The lock succeeded. Process deferred pages. */
+	llhead = &zone->trylock_free_pages;
+	if (unlikely(!llist_empty(llhead))) {
+		struct llist_node *llnode;
+		struct page *p, *tmp;
+
+		llnode = llist_del_all(llhead);
+		llist_for_each_entry_safe(p, tmp, llnode, pcp_llist) {
+			unsigned int p_order = p->order;
+			split_large_buddy(zone, p, page_to_pfn(p), p_order, fpi_flags);
+			__count_vm_events(PGFREE, 1 << p_order);
+		}
+	}
 	split_large_buddy(zone, page, pfn, order, fpi_flags);
 	spin_unlock_irqrestore(&zone->lock, flags);
 
@@ -2596,7 +2623,7 @@  static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone,
 
 static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp,
 				   struct page *page, int migratetype,
-				   unsigned int order)
+				   unsigned int order, fpi_t fpi_flags)
 {
 	int high, batch;
 	int pindex;
@@ -2631,6 +2658,14 @@  static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp,
 	}
 	if (pcp->free_count < (batch << CONFIG_PCP_BATCH_SCALE_MAX))
 		pcp->free_count += (1 << order);
+
+	if (unlikely(fpi_flags & FPI_TRYLOCK)) {
+		/*
+		 * Do not attempt to take a zone lock. Let pcp->count get
+		 * over high mark temporarily.
+		 */
+		return;
+	}
 	high = nr_pcp_high(pcp, zone, batch, free_high);
 	if (pcp->count >= high) {
 		free_pcppages_bulk(zone, nr_pcp_free(pcp, batch, high, free_high),
@@ -2645,7 +2680,8 @@  static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp,
 /*
  * Free a pcp page
  */
-void free_unref_page(struct page *page, unsigned int order)
+static void __free_unref_page(struct page *page, unsigned int order,
+			      fpi_t fpi_flags)
 {
 	unsigned long __maybe_unused UP_flags;
 	struct per_cpu_pages *pcp;
@@ -2654,7 +2690,7 @@  void free_unref_page(struct page *page, unsigned int order)
 	int migratetype;
 
 	if (!pcp_allowed_order(order)) {
-		__free_pages_ok(page, order, FPI_NONE);
+		__free_pages_ok(page, order, fpi_flags);
 		return;
 	}
 
@@ -2671,7 +2707,7 @@  void free_unref_page(struct page *page, unsigned int order)
 	migratetype = get_pfnblock_migratetype(page, pfn);
 	if (unlikely(migratetype >= MIGRATE_PCPTYPES)) {
 		if (unlikely(is_migrate_isolate(migratetype))) {
-			free_one_page(page_zone(page), page, pfn, order, FPI_NONE);
+			free_one_page(page_zone(page), page, pfn, order, fpi_flags);
 			return;
 		}
 		migratetype = MIGRATE_MOVABLE;
@@ -2681,14 +2717,19 @@  void free_unref_page(struct page *page, unsigned int order)
 	pcp_trylock_prepare(UP_flags);
 	pcp = pcp_spin_trylock(zone->per_cpu_pageset);
 	if (pcp) {
-		free_unref_page_commit(zone, pcp, page, migratetype, order);
+		free_unref_page_commit(zone, pcp, page, migratetype, order, fpi_flags);
 		pcp_spin_unlock(pcp);
 	} else {
-		free_one_page(zone, page, pfn, order, FPI_NONE);
+		free_one_page(zone, page, pfn, order, fpi_flags);
 	}
 	pcp_trylock_finish(UP_flags);
 }
 
+void free_unref_page(struct page *page, unsigned int order)
+{
+	__free_unref_page(page, order, FPI_NONE);
+}
+
 /*
  * Free a batch of folios
  */
@@ -2777,7 +2818,7 @@  void free_unref_folios(struct folio_batch *folios)
 
 		trace_mm_page_free_batched(&folio->page);
 		free_unref_page_commit(zone, pcp, &folio->page, migratetype,
-				order);
+				       order, FPI_NONE);
 	}
 
 	if (pcp) {
@@ -4855,6 +4896,21 @@  void __free_pages(struct page *page, unsigned int order)
 }
 EXPORT_SYMBOL(__free_pages);
 
+/* Can be called while holding raw_spin_lock or from IRQ. RCU must be watching. */
+void free_pages_nolock(struct page *page, unsigned int order)
+{
+	int head = PageHead(page);
+	struct alloc_tag *tag = pgalloc_tag_get(page);
+
+	if (put_page_testzero(page)) {
+		__free_unref_page(page, order, FPI_TRYLOCK);
+	} else if (!head) {
+		pgalloc_tag_sub_pages(tag, (1 << order) - 1);
+		while (order-- > 0)
+			__free_unref_page(page + (1 << order), order, FPI_TRYLOCK);
+	}
+}
+
 void free_pages(unsigned long addr, unsigned int order)
 {
 	if (addr != 0) {