
[v2,bpf-next,12/13] bpf: Introduce bpf_mem_free_rcu() similar to kfree_rcu().

Message ID 20230624031333.96597-13-alexei.starovoitov@gmail.com (mailing list archive)
State Superseded
Series bpf: Introduce bpf_mem_cache_free_rcu().

Commit Message

Alexei Starovoitov June 24, 2023, 3:13 a.m. UTC
From: Alexei Starovoitov <ast@kernel.org>

Introduce bpf_mem_[cache_]free_rcu() similar to kfree_rcu().
Unlike bpf_mem_[cache_]free() that links objects for immediate reuse into
per-cpu free list the _rcu() flavor waits for RCU grace period and then moves
objects into free_by_rcu_ttrace list where they are waiting for RCU
task trace grace period to be freed into slab.

The life cycle of objects:
alloc: dequeue free_llist
free: enqueue free_llist
free_rcu: enqueue free_by_rcu -> waiting_for_gp
free_llist above high watermark -> free_by_rcu_ttrace
after RCU GP waiting_for_gp -> free_by_rcu_ttrace
free_by_rcu_ttrace -> waiting_for_gp_ttrace -> slab

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_mem_alloc.h |   2 +
 kernel/bpf/memalloc.c         | 117 +++++++++++++++++++++++++++++++++-
 2 files changed, 117 insertions(+), 2 deletions(-)
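
For context, a minimal usage sketch of the new API from a hypothetical caller
(illustration only, not part of the series; the wrapper names are made up):

#include <linux/bpf_mem_alloc.h>

/* Immediate-reuse flavor: no reader can still hold 'elem'. */
static void elem_free(struct bpf_mem_alloc *ma, void *elem)
{
	bpf_mem_cache_free(ma, elem);
}

/* Deferred-reuse flavor: RCU readers may still reference 'elem', so reuse is
 * delayed until an RCU GP has elapsed; the object only goes back to slab
 * after a further RCU tasks trace GP.
 */
static void elem_free_rcu(struct bpf_mem_alloc *ma, void *elem)
{
	bpf_mem_cache_free_rcu(ma, elem);
}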

Comments

Hou Tao June 24, 2023, 6:49 a.m. UTC | #1
Hi,

On 6/24/2023 11:13 AM, Alexei Starovoitov wrote:
> From: Alexei Starovoitov <ast@kernel.org>
>
SNIP
>  
> +static void __free_by_rcu(struct rcu_head *head)
> +{
> +	struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu);
> +	struct bpf_mem_cache *tgt = c->tgt;
> +	struct llist_node *llnode;
> +
> +	if (unlikely(READ_ONCE(c->draining)))
> +		goto out;
> +
> +	llnode = llist_del_all(&c->waiting_for_gp);
> +	if (!llnode)
> +		goto out;
> +
> +	if (llist_add_batch(llnode, c->waiting_for_gp_tail, &tgt->free_by_rcu_ttrace))
> +		tgt->free_by_rcu_ttrace_tail = c->waiting_for_gp_tail;
Got a null-ptr dereference oops when running multiple test_maps and
htab-mem benchmark after hacking htab to use bpf_mem_cache_free_rcu().
And I think it happened as follows:

// c->tgt
P1: __free_by_rcu()
        // c->tgt is the same as P1
        P2: __free_by_rcu()

// return true
P1: llist_add_batch(&tgt->free_by_rcu_ttrace)
        // return false
        P2: llist_add_batch(&tgt->free_by_rcu_ttrace)
        P2: do_call_rcu_ttrace
        // return false
        P2: xchg(tgt->call_rcu_ttrace_in_progress, 1)
        // llnode is not NULL
        P2: llnode = llist_del_all(&c->free_by_rcu_ttrace)
        // BAD: c->free_by_rcu_ttrace_tail is NULL, so oops
        P2: __llist_add_batch(llnode, c->free_by_rcu_ttrace_tail)

P1: tgt->free_by_rcu_ttrace_tail = X   

I don't have a good fix for the problem except adding a spin-lock for
free_by_rcu_ttrace and free_by_rcu_ttrace_tail.
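
For illustration, the spin-lock variant could look roughly like this (the lock
field and helper name are hypothetical, not something proposed in the series):

/* Hypothetical: serialize producers so free_by_rcu_ttrace and its _tail
 * pointer cannot get out of sync.
 */
static void add_to_free_by_rcu_ttrace(struct bpf_mem_cache *tgt,
				      struct llist_node *head,
				      struct llist_node *tail)
{
	unsigned long flags;

	raw_spin_lock_irqsave(&tgt->free_by_rcu_ttrace_lock, flags);
	if (llist_add_batch(head, tail, &tgt->free_by_rcu_ttrace))
		tgt->free_by_rcu_ttrace_tail = tail;
	raw_spin_unlock_irqrestore(&tgt->free_by_rcu_ttrace_lock, flags);
}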
Hou Tao June 24, 2023, 7:53 a.m. UTC | #2
Hi,

On 6/24/2023 11:13 AM, Alexei Starovoitov wrote:
> From: Alexei Starovoitov <ast@kernel.org>
>
> Introduce bpf_mem_[cache_]free_rcu() similar to kfree_rcu().
> Unlike bpf_mem_[cache_]free() that links objects for immediate reuse into
> per-cpu free list the _rcu() flavor waits for RCU grace period and then moves
> objects into free_by_rcu_ttrace list where they are waiting for RCU
> task trace grace period to be freed into slab.
>
SNIP
> +static void check_free_by_rcu(struct bpf_mem_cache *c)
> +{
> +	struct llist_node *llnode, *t;
> +
> +	if (llist_empty(&c->free_by_rcu) && llist_empty(&c->free_llist_extra_rcu))
> +		return;
> +
> +	/* drain free_llist_extra_rcu */
> +	llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist_extra_rcu))
> +		if (__llist_add(llnode, &c->free_by_rcu))
> +			c->free_by_rcu_tail = llnode;

Just like add_obj_to_free_list(), we should do conditional
local_irq_save(flags) and local_inc_return(&c->active) as well for
free_by_rcu, otherwise free_by_rcu may be corrupted by a bpf program
running in NMI context (a rough sketch follows at the end of this message).
> +
> +	if (atomic_xchg(&c->call_rcu_in_progress, 1)) {
> +		/*
> +		 * Instead of kmalloc-ing new rcu_head and triggering 10k
> +		 * call_rcu() to hit rcutree.qhimark and force RCU to notice
> +		 * the overload just ask RCU to hurry up. There could be many
> +		 * objects in free_by_rcu list.
> +		 * This hint reduces memory consumption for an artificial
> +		 * benchmark from 2 Gbyte to 150 Mbyte.
> +		 */
> +		rcu_request_urgent_qs_task(current);
> +		return;
> +	}
> +
> +	WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp));
> +
> +	WRITE_ONCE(c->waiting_for_gp.first, __llist_del_all(&c->free_by_rcu));
> +	c->waiting_for_gp_tail = c->free_by_rcu_tail;
> +	call_rcu_hurry(&c->rcu, __free_by_rcu);

The same protection is needed for c->free_by_rcu_tail as well.
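
A rough sketch of that guard in check_free_by_rcu(), mirroring the
add_obj_to_free_list()/unit_free_rcu() pattern (an assumption about the fix,
not the final code):

	struct llist_node *llnode, *t;
	unsigned long flags;

	/* Take the per-cpu c->active guard with IRQs off so a bpf program
	 * running in NMI context cannot touch free_by_rcu/free_by_rcu_tail
	 * concurrently.
	 */
	local_irq_save(flags);
	if (local_inc_return(&c->active) == 1) {
		/* drain free_llist_extra_rcu */
		llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist_extra_rcu))
			if (__llist_add(llnode, &c->free_by_rcu))
				c->free_by_rcu_tail = llnode;
	}
	local_dec(&c->active);
	local_irq_restore(flags);

	/* The later __llist_del_all(&c->free_by_rcu)/c->free_by_rcu_tail
	 * handoff to waiting_for_gp would need the same guard.
	 */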
Hou Tao June 24, 2023, 9:05 a.m. UTC | #3
Hi,

On 6/24/2023 11:13 AM, Alexei Starovoitov wrote:
> From: Alexei Starovoitov <ast@kernel.org>
>
> Introduce bpf_mem_[cache_]free_rcu() similar to kfree_rcu().
> Unlike bpf_mem_[cache_]free() that links objects for immediate reuse into
> per-cpu free list the _rcu() flavor waits for RCU grace period and then moves
> objects into free_by_rcu_ttrace list where they are waiting for RCU
> task trace grace period to be freed into slab.
SNIP
> +static void __free_by_rcu(struct rcu_head *head)
> +{
> +	struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu);
> +	struct bpf_mem_cache *tgt = c->tgt;
> +	struct llist_node *llnode;
> +
> +	if (unlikely(READ_ONCE(c->draining)))
> +		goto out;
Because the reading of c->draining and llist_add_batch(...,
free_by_rcu_ttrace) are lockless, checking draining here cannot
prevent the leak of objects in c->free_by_rcu_ttrace, as shown below
(hope the formatting is OK now). A simple fix is to drain
free_by_rcu_ttrace twice, as suggested before. Or check c->draining
again in __free_by_rcu() when atomic_xchg() returns 1 and call
free_all(free_by_rcu_ttrace) if draining is true (a sketch of the
latter follows at the end of this message).

P1: bpf_mem_alloc_destroy()
    P2: __free_by_rcu()

    // got false
    P2: read c->draining

P1: c->draining = true
P1: llist_del_all(&c->free_by_rcu_ttrace)

    // add to free_by_rcu_ttrace again
    P2: llist_add_batch(..., &tgt->free_by_rcu_ttrace)
        P2: do_call_rcu_ttrace()
            // call_rcu_ttrace_in_progress is 1, so xchg returns 1
            // and the objects are not moved to waiting_for_gp_ttrace
            P2: atomic_xchg(&c->call_rcu_ttrace_in_progress, 1)

// got 1
P1: atomic_read(&c->call_rcu_ttrace_in_progress)
// objects in free_by_rcu_ttrace are leaked

c->draining also can't guarantee that bpf_mem_alloc_destroy() will wait for
the inflight call_rcu_tasks_trace() callback, as shown in the following
two cases (these are the same cases as reported in v1; I only
reformatted the two diagrams). I suggest doing
bpf_mem_alloc_destroy() as follows:
        if (ma->cache) {
                rcu_in_progress = 0;
                for_each_possible_cpu(cpu) {
                        c = per_cpu_ptr(ma->cache, cpu);
                        irq_work_sync(&c->refill_work);
                        drain_mem_cache(c);
                        rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
                }
                for_each_possible_cpu(cpu) {
                        c = per_cpu_ptr(ma->cache, cpu);
                        rcu_in_progress += atomic_read(&c->call_rcu_ttrace_in_progress);
                }


Case 1:

P1: bpf_mem_alloc_destroy()
            P2: __free_by_rcu()

            // got false
            P2: c->draining
P1: c->draining = true

// got 0
P1: atomic_read(&c->call_rcu_ttrace_in_progress)

            P2: do_call_rcu_ttrace()
                // return 0
                P2: atomic_xchg(&c->call_rcu_ttrace_in_progress, 1)
                P2: call_rcu_tasks_trace()
            P2: atomic_set(&c->call_rcu_in_progress, 0)

// also got 0
P1: atomic_read(&c->call_rcu_in_progress)
// won't wait for the inflight __free_rcu_tasks_trace

Case 2:

P1: bpf_mem_alloc_destroy
                P2: __free_by_rcu for c1

                P2: read c1->draining
P1: c0->draining = true
P1: c1->draining = true

// both in_progress counters are 0
P1: read c0->call_rcu_in_progress
P1: read c0->call_rcu_ttrace_in_progress

                // c1->tgt is c0
                // c1->call_rcu_in_progress is 1
                // c0->call_rcu_ttrace_in_progress is 0
                P2: llist_add_batch(..., c0->free_by_rcu_ttrace)
                P2: xchg(c0->call_rcu_ttrace_in_progress, 1)
                P2: call_rcu_tasks_trace(c0)
                P2: c1->call_rcu_in_progress = 0

// both in_progress counters are 0
P1: read c1->call_rcu_in_progress
P1: read c1->call_rcu_ttrace_in_progress

// BAD! There is still inflight tasks trace RCU callback
P1: free_mem_alloc_no_barrier()

> +
> +	llnode = llist_del_all(&c->waiting_for_gp);
> +	if (!llnode)
> +		goto out;
> +
> +	if (llist_add_batch(llnode, c->waiting_for_gp_tail, &tgt->free_by_rcu_ttrace))
> +		tgt->free_by_rcu_ttrace_tail = c->waiting_for_gp_tail;
> +
> +	/* Objects went through regular RCU GP. Send them to RCU tasks trace */
> +	do_call_rcu_ttrace(tgt);
> +out:
> +	atomic_set(&c->call_rcu_in_progress, 0);
> +}
> +
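
A hedged sketch of the second option, with the re-check placed next to the
atomic_xchg() in do_call_rcu_ttrace() (one possible reading of the suggestion,
not the actual fix):

	if (atomic_xchg(&c->call_rcu_ttrace_in_progress, 1)) {
		/* Another tasks-trace callback is already in flight.  If the
		 * destroy path has started draining, it may already have
		 * emptied free_by_rcu_ttrace, so free whatever was added
		 * after the drain instead of leaking it.
		 */
		if (unlikely(READ_ONCE(c->draining)))
			free_all(llist_del_all(&c->free_by_rcu_ttrace), !!c->percpu_size);
		return;
	}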
Hou Tao June 25, 2023, 11:15 a.m. UTC | #4
Hi,

On 6/24/2023 11:13 AM, Alexei Starovoitov wrote:
> From: Alexei Starovoitov <ast@kernel.org>
>
> Introduce bpf_mem_[cache_]free_rcu() similar to kfree_rcu().
> Unlike bpf_mem_[cache_]free() that links objects for immediate reuse into
> per-cpu free list the _rcu() flavor waits for RCU grace period and then moves
> objects into free_by_rcu_ttrace list where they are waiting for RCU
> task trace grace period to be freed into slab.
SNIP
>  
>  static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma)
> @@ -498,8 +566,8 @@ static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma)
>  
>  static void free_mem_alloc(struct bpf_mem_alloc *ma)
>  {
> -	/* waiting_for_gp_ttrace lists was drained, but __free_rcu might
> -	 * still execute. Wait for it now before we freeing percpu caches.
> +	/* waiting_for_gp[_ttrace] lists were drained, but RCU callbacks
> +	 * might still execute. Wait for them.
>  	 *
>  	 * rcu_barrier_tasks_trace() doesn't imply synchronize_rcu_tasks_trace(),
>  	 * but rcu_barrier_tasks_trace() and rcu_barrier() below are only used
I think an extra rcu_barrier() before rcu_barrier_tasks_trace() is still
needed here, otherwise free_mem_alloc() will not wait for inflight
__free_by_rcu() and there will be an oops in rcu_do_batch().
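
For illustration, the resulting ordering in free_mem_alloc() might look like
this (a sketch assuming the rest of the function stays unchanged):

static void free_mem_alloc(struct bpf_mem_alloc *ma)
{
	/* Wait for inflight __free_by_rcu() first; it may still move objects
	 * to free_by_rcu_ttrace and kick off a tasks-trace GP.
	 */
	rcu_barrier();
	/* Then wait for the tasks-trace callbacks and, when tasks-trace GPs
	 * do not imply regular RCU GPs, for the chained call_rcu() callback.
	 */
	rcu_barrier_tasks_trace();
	if (!rcu_trace_implies_rcu_gp())
		rcu_barrier();
	free_mem_alloc_no_barrier(ma);
}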
Hou Tao June 25, 2023, 11:20 a.m. UTC | #5
Hi,

On 6/24/2023 2:49 PM, Hou Tao wrote:
> Hi,
>
> On 6/24/2023 11:13 AM, Alexei Starovoitov wrote:
>> From: Alexei Starovoitov <ast@kernel.org>
>>
> SNIP
>>  
>> +static void __free_by_rcu(struct rcu_head *head)
>> +{
>> +	struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu);
>> +	struct bpf_mem_cache *tgt = c->tgt;
>> +	struct llist_node *llnode;
>> +
>> +	if (unlikely(READ_ONCE(c->draining)))
>> +		goto out;
>> +
>> +	llnode = llist_del_all(&c->waiting_for_gp);
>> +	if (!llnode)
>> +		goto out;
>> +
>> +	if (llist_add_batch(llnode, c->waiting_for_gp_tail, &tgt->free_by_rcu_ttrace))
>> +		tgt->free_by_rcu_ttrace_tail = c->waiting_for_gp_tail;
> Got a null-ptr dereference oops when running multiple test_maps and
> htab-mem benchmark after hacking htab to use bpf_mem_cache_free_rcu().
> And I think it happened as follow:
The same null-ptr dereference happened even before switching htab to use
bpf_mem_cache_free_rcu(). So it seems that after introducing c->tgt, there
could be two concurrent free_bulk() calls with the same c->tgt.
>
> // c->tgt
> P1: __free_by_rcu()
>         // c->tgt is the same as P1
>         P2: __free_by_rcu()
>
> // return true
> P1: llist_add_batch(&tgt->free_by_rcu_ttrace)
>         // return false
>         P2: llist_add_batch(&tgt->free_by_rcu_ttrace)
>         P2: do_call_rcu_ttrace
>         // return false
>         P2: xchg(tgt->call_rcu_ttrace_in_progress, 1)
>         // llnode is not NULL
>         P2: llnode = llist_del_all(&c->free_by_rcu_ttrace)
>         // BAD: c->free_by_rcu_ttrace_tail is NULL, so oops
>         P2: __llist_add_batch(llnode, c->free_by_rcu_ttrace_tail)
>
> P1: tgt->free_by_rcu_ttrace_tail = X   
>
> I don't have a good fix for the problem except adding a spin-lock for
> free_by_rcu_ttrace and free_by_rcu_ttrace_tail.
>
>
> .
Alexei Starovoitov June 28, 2023, 12:52 a.m. UTC | #6
On 6/23/23 11:49 PM, Hou Tao wrote:
> Hi,
> 
> On 6/24/2023 11:13 AM, Alexei Starovoitov wrote:
>> From: Alexei Starovoitov <ast@kernel.org>
>>
> SNIP
>>   
>> +static void __free_by_rcu(struct rcu_head *head)
>> +{
>> +	struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu);
>> +	struct bpf_mem_cache *tgt = c->tgt;
>> +	struct llist_node *llnode;
>> +
>> +	if (unlikely(READ_ONCE(c->draining)))
>> +		goto out;
>> +
>> +	llnode = llist_del_all(&c->waiting_for_gp);
>> +	if (!llnode)
>> +		goto out;
>> +
>> +	if (llist_add_batch(llnode, c->waiting_for_gp_tail, &tgt->free_by_rcu_ttrace))
>> +		tgt->free_by_rcu_ttrace_tail = c->waiting_for_gp_tail;
> Got a null-ptr dereference oops when running multiple test_maps and
> htab-mem benchmark after hacking htab to use bpf_mem_cache_free_rcu().
> And I think it happened as follow:
> 
> // c->tgt
> P1: __free_by_rcu()
>          // c->tgt is the same as P1
>          P2: __free_by_rcu()
> 
> // return true
> P1: llist_add_batch(&tgt->free_by_rcu_ttrace)
>          // return false
>          P2: llist_add_batch(&tgt->free_by_rcu_ttrace)
>          P2: do_call_rcu_ttrace
>          // return false
>          P2: xchg(tgt->call_rcu_ttrace_in_progress, 1)
>          // llnode is not NULL
>          P2: llnode = llist_del_all(&c->free_by_rcu_ttrace)
>          // BAD: c->free_by_rcu_ttrace_tail is NULL, so oops
>          P2: __llist_add_batch(llnode, c->free_by_rcu_ttrace_tail)
> 
> P1: tgt->free_by_rcu_ttrace_tail = X
> 
> I don't have a good fix for the problem except adding a spin-lock for
> free_by_rcu_ttrace and free_by_rcu_ttrace_tail.

null-ptr is probably something else, since the race window is
extremely tiny.
In my testing this optimization doesn't buy much.
So I'll just drop _tail optimization and switch to for_each(del_all)
to move elements. We can revisit later.
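
That is, roughly this in __free_by_rcu(), with the _tail pointers gone (a
sketch of the simplification, not the final code):

	struct llist_node *llnode, *t;

	/* Move the elements one by one; llist_add() is atomic, so concurrent
	 * producers on tgt->free_by_rcu_ttrace are fine without a tail pointer.
	 */
	llist_for_each_safe(llnode, t, llist_del_all(&c->waiting_for_gp))
		llist_add(llnode, &tgt->free_by_rcu_ttrace);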
Alexei Starovoitov June 28, 2023, 12:54 a.m. UTC | #7
On 6/24/23 12:53 AM, Hou Tao wrote:
> Hi,
> 
> On 6/24/2023 11:13 AM, Alexei Starovoitov wrote:
>> From: Alexei Starovoitov <ast@kernel.org>
>>
>> Introduce bpf_mem_[cache_]free_rcu() similar to kfree_rcu().
>> Unlike bpf_mem_[cache_]free() that links objects for immediate reuse into
>> per-cpu free list the _rcu() flavor waits for RCU grace period and then moves
>> objects into free_by_rcu_ttrace list where they are waiting for RCU
>> task trace grace period to be freed into slab.
>>
> SNIP
>> +static void check_free_by_rcu(struct bpf_mem_cache *c)
>> +{
>> +	struct llist_node *llnode, *t;
>> +
>> +	if (llist_empty(&c->free_by_rcu) && llist_empty(&c->free_llist_extra_rcu))
>> +		return;
>> +
>> +	/* drain free_llist_extra_rcu */
>> +	llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist_extra_rcu))
>> +		if (__llist_add(llnode, &c->free_by_rcu))
>> +			c->free_by_rcu_tail = llnode;
> 
> Just like add_obj_to_free_list(),  we should do conditional
> local_irq_save(flags) and local_inc_return(&c->active) as well for
> free_by_rcu, otherwise free_by_rcu may be corrupted by bpf program
> running in a NMI context.

Good catch. Will do.
Alexei Starovoitov June 28, 2023, 12:56 a.m. UTC | #8
On 6/25/23 4:15 AM, Hou Tao wrote:
> Hi,
> 
> On 6/24/2023 11:13 AM, Alexei Starovoitov wrote:
>> From: Alexei Starovoitov <ast@kernel.org>
>>
>> Introduce bpf_mem_[cache_]free_rcu() similar to kfree_rcu().
>> Unlike bpf_mem_[cache_]free() that links objects for immediate reuse into
>> per-cpu free list the _rcu() flavor waits for RCU grace period and then moves
>> objects into free_by_rcu_ttrace list where they are waiting for RCU
>> task trace grace period to be freed into slab.
> SNIP
>>   
>>   static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma)
>> @@ -498,8 +566,8 @@ static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma)
>>   
>>   static void free_mem_alloc(struct bpf_mem_alloc *ma)
>>   {
>> -	/* waiting_for_gp_ttrace lists was drained, but __free_rcu might
>> -	 * still execute. Wait for it now before we freeing percpu caches.
>> +	/* waiting_for_gp[_ttrace] lists were drained, but RCU callbacks
>> +	 * might still execute. Wait for them.
>>   	 *
>>   	 * rcu_barrier_tasks_trace() doesn't imply synchronize_rcu_tasks_trace(),
>>   	 * but rcu_barrier_tasks_trace() and rcu_barrier() below are only used
> I think an extra rcu_barrier() before rcu_barrier_tasks_trace() is still
> needed here, otherwise free_mem_alloc will not wait for inflight
> __free_by_rcu() and there will oops in rcu_do_batch().

Agree. I got confused by rcu_trace_implies_rcu_gp().
rcu_barrier() is necessary.

re: draining.
I'll switch to an 'if (draining) free_all; else call_rcu;' scheme
to address the potential memory leak, though I wasn't able to repro it.
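
A rough sketch of that scheme for __free_by_rcu() (an assumption about what v3
might do; __free_rcu() would get the same treatment):

static void __free_by_rcu(struct rcu_head *head)
{
	struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu);
	struct bpf_mem_cache *tgt = c->tgt;
	struct llist_node *llnode, *t;

	if (unlikely(READ_ONCE(c->draining))) {
		/* Destroy path is draining: free directly instead of queueing
		 * more deferred work that bpf_mem_alloc_destroy() may not
		 * wait for.
		 */
		free_all(llist_del_all(&c->waiting_for_gp), !!c->percpu_size);
		goto out;
	}

	llist_for_each_safe(llnode, t, llist_del_all(&c->waiting_for_gp))
		llist_add(llnode, &tgt->free_by_rcu_ttrace);

	/* Objects went through regular RCU GP. Send them to RCU tasks trace */
	do_call_rcu_ttrace(tgt);
out:
	atomic_set(&c->call_rcu_in_progress, 0);
}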
Hou Tao June 28, 2023, 1:36 a.m. UTC | #9
Hi,

On 6/28/2023 8:52 AM, Alexei Starovoitov wrote:
> On 6/23/23 11:49 PM, Hou Tao wrote:
>> Hi,
>>
>> On 6/24/2023 11:13 AM, Alexei Starovoitov wrote:
>>> From: Alexei Starovoitov <ast@kernel.org>
>>>
>> SNIP
>>>   +static void __free_by_rcu(struct rcu_head *head)
>>> +{
>>> +    struct bpf_mem_cache *c = container_of(head, struct
>>> bpf_mem_cache, rcu);
>>> +    struct bpf_mem_cache *tgt = c->tgt;
>>> +    struct llist_node *llnode;
>>> +
>>> +    if (unlikely(READ_ONCE(c->draining)))
>>> +        goto out;
>>> +
>>> +    llnode = llist_del_all(&c->waiting_for_gp);
>>> +    if (!llnode)
>>> +        goto out;
>>> +
>>> +    if (llist_add_batch(llnode, c->waiting_for_gp_tail,
>>> &tgt->free_by_rcu_ttrace))
>>> +        tgt->free_by_rcu_ttrace_tail = c->waiting_for_gp_tail;
>> Got a null-ptr dereference oops when running multiple test_maps and
>> htab-mem benchmark after hacking htab to use bpf_mem_cache_free_rcu().
>> And I think it happened as follow:
>>
>> // c->tgt
>> P1: __free_by_rcu()
>>          // c->tgt is the same as P1
>>          P2: __free_by_rcu()
>>
>> // return true
>> P1: llist_add_batch(&tgt->free_by_rcu_ttrace)
>>          // return false
>>          P2: llist_add_batch(&tgt->free_by_rcu_ttrace)
>>          P2: do_call_rcu_ttrace
>>          // return false
>>          P2: xchg(tgt->call_rcu_ttrace_in_progress, 1)
>>          // llnode is not NULL
>>          P2: llnode = llist_del_all(&c->free_by_rcu_ttrace)
>>          // BAD: c->free_by_rcu_ttrace_tail is NULL, so oops
>>          P2: __llist_add_batch(llnode, c->free_by_rcu_ttrace_tail)
>>
>> P1: tgt->free_by_rcu_ttrace_tail = X
>>
>> I don't have a good fix for the problem except adding a spin-lock for
>> free_by_rcu_ttrace and free_by_rcu_ttrace_tail.
>
> null-ptr is probably something else, since the race window is
> extremely tiny.

The null-ptr dereference is indeed due to free_by_rcu_ttrace_tail being
NULL. The oops occurred multiple times and I have checked the vmcore to
confirm it.

> In my testing this optimization doesn't buy much.
> So I'll just drop _tail optimization and switch to for_each(del_all)
> to move elements. We can revisit later.

OK
Hou Tao June 28, 2023, 1:43 a.m. UTC | #10
Hi,

On 6/28/2023 8:56 AM, Alexei Starovoitov wrote:
> On 6/25/23 4:15 AM, Hou Tao wrote:
>> Hi,
>>
>> On 6/24/2023 11:13 AM, Alexei Starovoitov wrote:
>>> From: Alexei Starovoitov <ast@kernel.org>
>>>
>>> Introduce bpf_mem_[cache_]free_rcu() similar to kfree_rcu().
>>> Unlike bpf_mem_[cache_]free() that links objects for immediate reuse
>>> into
>>> per-cpu free list the _rcu() flavor waits for RCU grace period and
>>> then moves
>>> objects into free_by_rcu_ttrace list where they are waiting for RCU
>>> task trace grace period to be freed into slab.
>> SNIP
>>>     static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma)
>>> @@ -498,8 +566,8 @@ static void free_mem_alloc_no_barrier(struct
>>> bpf_mem_alloc *ma)
>>>     static void free_mem_alloc(struct bpf_mem_alloc *ma)
>>>   {
>>> -    /* waiting_for_gp_ttrace lists was drained, but __free_rcu might
>>> -     * still execute. Wait for it now before we freeing percpu caches.
>>> +    /* waiting_for_gp[_ttrace] lists were drained, but RCU callbacks
>>> +     * might still execute. Wait for them.
>>>        *
>>>        * rcu_barrier_tasks_trace() doesn't imply
>>> synchronize_rcu_tasks_trace(),
>>>        * but rcu_barrier_tasks_trace() and rcu_barrier() below are
>>> only used
>> I think an extra rcu_barrier() before rcu_barrier_tasks_trace() is still
>> needed here, otherwise free_mem_alloc will not wait for inflight
>> __free_by_rcu() and there will oops in rcu_do_batch().
>
> Agree. I got confused by rcu_trace_implies_rcu_gp().
> rcu_barrier() is necessary.
>
> re: draining.
> I'll switch to do if (draing) free_all; else call_rcu; scheme
> to address potential memory leak though I wasn't able to repro it.
For v2, it was also hard for me to reproduce the leak problem. But after
I injected some delay by using udelay() in __free_by_rcu/__free_rcu()
after reading c->draining, it was relatively easy to reproduce the problems.
Alexei Starovoitov June 28, 2023, 1:51 a.m. UTC | #11
On Tue, Jun 27, 2023 at 6:43 PM Hou Tao <houtao@huaweicloud.com> wrote:
>
> Hi,
>
> On 6/28/2023 8:56 AM, Alexei Starovoitov wrote:
> > On 6/25/23 4:15 AM, Hou Tao wrote:
> >> Hi,
> >>
> >> On 6/24/2023 11:13 AM, Alexei Starovoitov wrote:
> >>> From: Alexei Starovoitov <ast@kernel.org>
> >>>
> >>> Introduce bpf_mem_[cache_]free_rcu() similar to kfree_rcu().
> >>> Unlike bpf_mem_[cache_]free() that links objects for immediate reuse
> >>> into
> >>> per-cpu free list the _rcu() flavor waits for RCU grace period and
> >>> then moves
> >>> objects into free_by_rcu_ttrace list where they are waiting for RCU
> >>> task trace grace period to be freed into slab.
> >> SNIP
> >>>     static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma)
> >>> @@ -498,8 +566,8 @@ static void free_mem_alloc_no_barrier(struct
> >>> bpf_mem_alloc *ma)
> >>>     static void free_mem_alloc(struct bpf_mem_alloc *ma)
> >>>   {
> >>> -    /* waiting_for_gp_ttrace lists was drained, but __free_rcu might
> >>> -     * still execute. Wait for it now before we freeing percpu caches.
> >>> +    /* waiting_for_gp[_ttrace] lists were drained, but RCU callbacks
> >>> +     * might still execute. Wait for them.
> >>>        *
> >>>        * rcu_barrier_tasks_trace() doesn't imply
> >>> synchronize_rcu_tasks_trace(),
> >>>        * but rcu_barrier_tasks_trace() and rcu_barrier() below are
> >>> only used
> >> I think an extra rcu_barrier() before rcu_barrier_tasks_trace() is still
> >> needed here, otherwise free_mem_alloc will not wait for inflight
> >> __free_by_rcu() and there will oops in rcu_do_batch().
> >
> > Agree. I got confused by rcu_trace_implies_rcu_gp().
> > rcu_barrier() is necessary.
> >
> > re: draining.
> > I'll switch to do if (draing) free_all; else call_rcu; scheme
> > to address potential memory leak though I wasn't able to repro it.
> For v2, it was also hard for me to reproduce the leak problem. But after
> I injected some delay by using udelay() in __free_by_rcu/__free_rcu()
> after reading c->draining, it was relatively easy to reproduce the problems.

1. Please respin htab bench.
We're still discussing patching without having the same base line.

2. 'adding udelay()' is too vague. Pls post a diff hunk of what exactly
you mean.

3. I'll send v3 shortly. Let's move discussion there.
Hou Tao June 28, 2023, 2:03 a.m. UTC | #12
Hi,

On 6/28/2023 9:51 AM, Alexei Starovoitov wrote:
> On Tue, Jun 27, 2023 at 6:43 PM Hou Tao <houtao@huaweicloud.com> wrote:
>>
SNIP
>>> re: draining.
>>> I'll switch to do if (draing) free_all; else call_rcu; scheme
>>> to address potential memory leak though I wasn't able to repro it.
>> For v2, it was also hard for me to reproduce the leak problem. But after
>> I injected some delay by using udelay() in __free_by_rcu/__free_rcu()
>> after reading c->draining, it was relatively easy to reproduce the problems.
> 1. Please respin htab bench.
> We're still discussing patching without having the same base line.
Almost done. I need to run the benchmark again to update the numbers in
the commit message.
>
> 2. 'adding udelay()' is too vague. Pls post a diff hunk of what exactly
> you mean.

--- a/kernel/bpf/memalloc.c
+++ b/kernel/bpf/memalloc.c
@@ -4,6 +4,7 @@
 #include <linux/llist.h>
 #include <linux/bpf.h>
 #include <linux/irq_work.h>
+#include <linux/delay.h>
 #include <linux/bpf_mem_alloc.h>
 #include <linux/memcontrol.h>
 #include <asm/local.h>
@@ -261,12 +262,17 @@ static int free_all(struct llist_node *llnode, bool percpu)
        return cnt;
 }

+static unsigned int delay;
+module_param(delay, uint, 0644);
+
 static void __free_rcu(struct rcu_head *head)
 {
        struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu_ttrace);

        if (unlikely(READ_ONCE(c->draining)))
                goto out;
+       if (delay)
+               udelay(delay);
        free_all(llist_del_all(&c->waiting_for_gp_ttrace), !!c->percpu_size);
 out:
        atomic_set(&c->call_rcu_ttrace_in_progress, 0);
@@ -361,6 +367,8 @@ static void __free_by_rcu(struct rcu_head *head)

        if (unlikely(READ_ONCE(c->draining)))
                goto out;
+       if (delay)
+               udelay(delay);

        llnode = llist_del_all(&c->waiting_for_gp);
        if (!llnode)

>
> 3. I'll send v3 shortly. Let's move discussion there.
Sure.

Patch

diff --git a/include/linux/bpf_mem_alloc.h b/include/linux/bpf_mem_alloc.h
index 3929be5743f4..d644bbb298af 100644
--- a/include/linux/bpf_mem_alloc.h
+++ b/include/linux/bpf_mem_alloc.h
@@ -27,10 +27,12 @@  void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma);
 /* kmalloc/kfree equivalent: */
 void *bpf_mem_alloc(struct bpf_mem_alloc *ma, size_t size);
 void bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr);
+void bpf_mem_free_rcu(struct bpf_mem_alloc *ma, void *ptr);
 
 /* kmem_cache_alloc/free equivalent: */
 void *bpf_mem_cache_alloc(struct bpf_mem_alloc *ma);
 void bpf_mem_cache_free(struct bpf_mem_alloc *ma, void *ptr);
+void bpf_mem_cache_free_rcu(struct bpf_mem_alloc *ma, void *ptr);
 void bpf_mem_cache_raw_free(void *ptr);
 void *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags);
 
diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
index 666917c16e87..dc144c54d502 100644
--- a/kernel/bpf/memalloc.c
+++ b/kernel/bpf/memalloc.c
@@ -101,6 +101,15 @@  struct bpf_mem_cache {
 	bool draining;
 	struct bpf_mem_cache *tgt;
 
+	/* list of objects to be freed after RCU GP */
+	struct llist_head free_by_rcu;
+	struct llist_node *free_by_rcu_tail;
+	struct llist_head waiting_for_gp;
+	struct llist_node *waiting_for_gp_tail;
+	struct rcu_head rcu;
+	atomic_t call_rcu_in_progress;
+	struct llist_head free_llist_extra_rcu;
+
 	/* list of objects to be freed after RCU tasks trace GP */
 	struct llist_head free_by_rcu_ttrace;
 	struct llist_node *free_by_rcu_ttrace_tail;
@@ -344,6 +353,60 @@  static void free_bulk(struct bpf_mem_cache *c)
 	do_call_rcu_ttrace(tgt);
 }
 
+static void __free_by_rcu(struct rcu_head *head)
+{
+	struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu);
+	struct bpf_mem_cache *tgt = c->tgt;
+	struct llist_node *llnode;
+
+	if (unlikely(READ_ONCE(c->draining)))
+		goto out;
+
+	llnode = llist_del_all(&c->waiting_for_gp);
+	if (!llnode)
+		goto out;
+
+	if (llist_add_batch(llnode, c->waiting_for_gp_tail, &tgt->free_by_rcu_ttrace))
+		tgt->free_by_rcu_ttrace_tail = c->waiting_for_gp_tail;
+
+	/* Objects went through regular RCU GP. Send them to RCU tasks trace */
+	do_call_rcu_ttrace(tgt);
+out:
+	atomic_set(&c->call_rcu_in_progress, 0);
+}
+
+static void check_free_by_rcu(struct bpf_mem_cache *c)
+{
+	struct llist_node *llnode, *t;
+
+	if (llist_empty(&c->free_by_rcu) && llist_empty(&c->free_llist_extra_rcu))
+		return;
+
+	/* drain free_llist_extra_rcu */
+	llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist_extra_rcu))
+		if (__llist_add(llnode, &c->free_by_rcu))
+			c->free_by_rcu_tail = llnode;
+
+	if (atomic_xchg(&c->call_rcu_in_progress, 1)) {
+		/*
+		 * Instead of kmalloc-ing new rcu_head and triggering 10k
+		 * call_rcu() to hit rcutree.qhimark and force RCU to notice
+		 * the overload just ask RCU to hurry up. There could be many
+		 * objects in free_by_rcu list.
+		 * This hint reduces memory consumption for an artificial
+		 * benchmark from 2 Gbyte to 150 Mbyte.
+		 */
+		rcu_request_urgent_qs_task(current);
+		return;
+	}
+
+	WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp));
+
+	WRITE_ONCE(c->waiting_for_gp.first, __llist_del_all(&c->free_by_rcu));
+	c->waiting_for_gp_tail = c->free_by_rcu_tail;
+	call_rcu_hurry(&c->rcu, __free_by_rcu);
+}
+
 static void bpf_mem_refill(struct irq_work *work)
 {
 	struct bpf_mem_cache *c = container_of(work, struct bpf_mem_cache, refill_work);
@@ -358,6 +421,8 @@  static void bpf_mem_refill(struct irq_work *work)
 		alloc_bulk(c, c->batch, NUMA_NO_NODE);
 	else if (cnt > c->high_watermark)
 		free_bulk(c);
+
+	check_free_by_rcu(c);
 }
 
 static void notrace irq_work_raise(struct bpf_mem_cache *c)
@@ -486,6 +551,9 @@  static void drain_mem_cache(struct bpf_mem_cache *c)
 	free_all(llist_del_all(&c->waiting_for_gp_ttrace), percpu);
 	free_all(__llist_del_all(&c->free_llist), percpu);
 	free_all(__llist_del_all(&c->free_llist_extra), percpu);
+	free_all(__llist_del_all(&c->free_by_rcu), percpu);
+	free_all(__llist_del_all(&c->free_llist_extra_rcu), percpu);
+	free_all(llist_del_all(&c->waiting_for_gp), percpu);
 }
 
 static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma)
@@ -498,8 +566,8 @@  static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma)
 
 static void free_mem_alloc(struct bpf_mem_alloc *ma)
 {
-	/* waiting_for_gp_ttrace lists was drained, but __free_rcu might
-	 * still execute. Wait for it now before we freeing percpu caches.
+	/* waiting_for_gp[_ttrace] lists were drained, but RCU callbacks
+	 * might still execute. Wait for them.
 	 *
 	 * rcu_barrier_tasks_trace() doesn't imply synchronize_rcu_tasks_trace(),
 	 * but rcu_barrier_tasks_trace() and rcu_barrier() below are only used
@@ -564,6 +632,7 @@  void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma)
 			c = per_cpu_ptr(ma->cache, cpu);
 			drain_mem_cache(c);
 			rcu_in_progress += atomic_read(&c->call_rcu_ttrace_in_progress);
+			rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
 		}
 		/* objcg is the same across cpus */
 		if (c->objcg)
@@ -586,6 +655,7 @@  void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma)
 				c = &cc->cache[i];
 				drain_mem_cache(c);
 				rcu_in_progress += atomic_read(&c->call_rcu_ttrace_in_progress);
+				rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
 			}
 		}
 		if (c->objcg)
@@ -670,6 +740,27 @@  static void notrace unit_free(struct bpf_mem_cache *c, void *ptr)
 		irq_work_raise(c);
 }
 
+static void notrace unit_free_rcu(struct bpf_mem_cache *c, void *ptr)
+{
+	struct llist_node *llnode = ptr - LLIST_NODE_SZ;
+	unsigned long flags;
+
+	c->tgt = *(struct bpf_mem_cache **)llnode;
+
+	local_irq_save(flags);
+	if (local_inc_return(&c->active) == 1) {
+		if (__llist_add(llnode, &c->free_by_rcu))
+			c->free_by_rcu_tail = llnode;
+	} else {
+		llist_add(llnode, &c->free_llist_extra_rcu);
+	}
+	local_dec(&c->active);
+	local_irq_restore(flags);
+
+	if (!atomic_read(&c->call_rcu_in_progress))
+		irq_work_raise(c);
+}
+
 /* Called from BPF program or from sys_bpf syscall.
  * In both cases migration is disabled.
  */
@@ -703,6 +794,20 @@  void notrace bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr)
 	unit_free(this_cpu_ptr(ma->caches)->cache + idx, ptr);
 }
 
+void notrace bpf_mem_free_rcu(struct bpf_mem_alloc *ma, void *ptr)
+{
+	int idx;
+
+	if (!ptr)
+		return;
+
+	idx = bpf_mem_cache_idx(ksize(ptr - LLIST_NODE_SZ));
+	if (idx < 0)
+		return;
+
+	unit_free_rcu(this_cpu_ptr(ma->caches)->cache + idx, ptr);
+}
+
 void notrace *bpf_mem_cache_alloc(struct bpf_mem_alloc *ma)
 {
 	void *ret;
@@ -719,6 +824,14 @@  void notrace bpf_mem_cache_free(struct bpf_mem_alloc *ma, void *ptr)
 	unit_free(this_cpu_ptr(ma->cache), ptr);
 }
 
+void notrace bpf_mem_cache_free_rcu(struct bpf_mem_alloc *ma, void *ptr)
+{
+	if (!ptr)
+		return;
+
+	unit_free_rcu(this_cpu_ptr(ma->cache), ptr);
+}
+
 /* Directly does a kfree() without putting 'ptr' back to the free_llist
  * for reuse and without waiting for a rcu_tasks_trace gp.
  * The caller must first go through the rcu_tasks_trace gp for 'ptr'