| Message ID | 20230624031333.96597-13-alexei.starovoitov@gmail.com |
|---|---|
| State | Superseded |
| Series | bpf: Introduce bpf_mem_cache_free_rcu(). |
Hi,

On 6/24/2023 11:13 AM, Alexei Starovoitov wrote:
> From: Alexei Starovoitov <ast@kernel.org>
>
SNIP
>
> +static void __free_by_rcu(struct rcu_head *head)
> +{
> +    struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu);
> +    struct bpf_mem_cache *tgt = c->tgt;
> +    struct llist_node *llnode;
> +
> +    if (unlikely(READ_ONCE(c->draining)))
> +        goto out;
> +
> +    llnode = llist_del_all(&c->waiting_for_gp);
> +    if (!llnode)
> +        goto out;
> +
> +    if (llist_add_batch(llnode, c->waiting_for_gp_tail, &tgt->free_by_rcu_ttrace))
> +        tgt->free_by_rcu_ttrace_tail = c->waiting_for_gp_tail;

Got a null-ptr dereference oops when running multiple test_maps and the
htab-mem benchmark after hacking htab to use bpf_mem_cache_free_rcu().
And I think it happened as follows:

// c->tgt
P1: __free_by_rcu()
// c->tgt is the same as P1
P2: __free_by_rcu()

// return true
P1: llist_add_batch(&tgt->free_by_rcu_ttrace)
// return false
P2: llist_add_batch(&tgt->free_by_rcu_ttrace)
P2: do_call_rcu_ttrace()
// return false
P2: xchg(tgt->call_rcu_ttrace_in_progress, 1)
// llnode is not NULL
P2: llnode = llist_del_all(&c->free_by_rcu_ttrace)
// BAD: c->free_by_rcu_ttrace_tail is NULL, so oops
P2: __llist_add_batch(llnode, c->free_by_rcu_ttrace_tail)

P1: tgt->free_by_rcu_ttrace_tail = X

I don't have a good fix for the problem except adding a spin-lock for
free_by_rcu_ttrace and free_by_rcu_ttrace_tail.
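As an illustration of the spin-lock idea mentioned at the end of the mail above, a minimal sketch follows. The helper and lock field names are hypothetical and are not part of the posted patch (the thread later drops the _tail optimization instead); the point is only that the batch splice and the tail update would have to be made atomic with respect to other producers and to the consumer that empties free_by_rcu_ttrace:

/* Hypothetical sketch, not the posted patch: serialize updates of
 * free_by_rcu_ttrace and its cached tail pointer.  The consumer that
 * does llist_del_all() on free_by_rcu_ttrace would need to take the
 * same lock before trusting free_by_rcu_ttrace_tail.
 */
static void add_batch_to_free_by_rcu_ttrace(struct bpf_mem_cache *tgt,
                                            struct llist_node *head,
                                            struct llist_node *tail)
{
    unsigned long flags;

    /* tgt->lock is a hypothetical raw_spinlock_t field */
    raw_spin_lock_irqsave(&tgt->lock, flags);
    if (llist_add_batch(head, tail, &tgt->free_by_rcu_ttrace))
        tgt->free_by_rcu_ttrace_tail = tail;
    raw_spin_unlock_irqrestore(&tgt->lock, flags);
}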
Hi,

On 6/24/2023 11:13 AM, Alexei Starovoitov wrote:
> From: Alexei Starovoitov <ast@kernel.org>
>
> Introduce bpf_mem_[cache_]free_rcu() similar to kfree_rcu().
> Unlike bpf_mem_[cache_]free() that links objects for immediate reuse into
> per-cpu free list the _rcu() flavor waits for RCU grace period and then moves
> objects into free_by_rcu_ttrace list where they are waiting for RCU
> task trace grace period to be freed into slab.
>
SNIP
> +static void check_free_by_rcu(struct bpf_mem_cache *c)
> +{
> +    struct llist_node *llnode, *t;
> +
> +    if (llist_empty(&c->free_by_rcu) && llist_empty(&c->free_llist_extra_rcu))
> +        return;
> +
> +    /* drain free_llist_extra_rcu */
> +    llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist_extra_rcu))
> +        if (__llist_add(llnode, &c->free_by_rcu))
> +            c->free_by_rcu_tail = llnode;

Just like add_obj_to_free_list(), we should do conditional
local_irq_save(flags) and local_inc_return(&c->active) for free_by_rcu
as well, otherwise free_by_rcu may be corrupted by a bpf program
running in NMI context.

> +
> +    if (atomic_xchg(&c->call_rcu_in_progress, 1)) {
> +        /*
> +         * Instead of kmalloc-ing new rcu_head and triggering 10k
> +         * call_rcu() to hit rcutree.qhimark and force RCU to notice
> +         * the overload just ask RCU to hurry up. There could be many
> +         * objects in free_by_rcu list.
> +         * This hint reduces memory consumption for an artifical
> +         * benchmark from 2 Gbyte to 150 Mbyte.
> +         */
> +        rcu_request_urgent_qs_task(current);
> +        return;
> +    }
> +
> +    WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp));
> +
> +    WRITE_ONCE(c->waiting_for_gp.first, __llist_del_all(&c->free_by_rcu));
> +    c->waiting_for_gp_tail = c->free_by_rcu_tail;
> +    call_rcu_hurry(&c->rcu, __free_by_rcu);

The same protection is needed for c->free_by_rcu_tail as well.
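A sketch of the kind of protection being asked for is shown below, mirroring the conditional local_irq_save()/c->active pattern of add_obj_to_free_list(). The helper name is hypothetical and this is only an illustration of the suggestion, not the exact code that ended up in the series:

/* Illustrative sketch of the suggested protection, assuming the same
 * rules as add_obj_to_free_list(): this runs from irq_work, so on
 * non-RT kernels IRQs are already off and local_inc_return(&c->active)
 * cannot race with unit_free_rcu() on this CPU; on RT, disable IRQs
 * first.  The c->active counter keeps an NMI bpf program from
 * corrupting free_by_rcu and free_by_rcu_tail mid-update.
 */
static void drain_free_llist_extra_rcu(struct bpf_mem_cache *c)
{
    struct llist_node *llnode, *t;
    unsigned long flags;

    if (IS_ENABLED(CONFIG_PREEMPT_RT))
        local_irq_save(flags);
    WARN_ON_ONCE(local_inc_return(&c->active) != 1);
    llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist_extra_rcu))
        if (__llist_add(llnode, &c->free_by_rcu))
            c->free_by_rcu_tail = llnode;
    local_dec(&c->active);
    if (IS_ENABLED(CONFIG_PREEMPT_RT))
        local_irq_restore(flags);
}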
Hi,

On 6/24/2023 11:13 AM, Alexei Starovoitov wrote:
> From: Alexei Starovoitov <ast@kernel.org>
>
> Introduce bpf_mem_[cache_]free_rcu() similar to kfree_rcu().
> Unlike bpf_mem_[cache_]free() that links objects for immediate reuse into
> per-cpu free list the _rcu() flavor waits for RCU grace period and then moves
> objects into free_by_rcu_ttrace list where they are waiting for RCU
> task trace grace period to be freed into slab.
SNIP
> +static void __free_by_rcu(struct rcu_head *head)
> +{
> +    struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu);
> +    struct bpf_mem_cache *tgt = c->tgt;
> +    struct llist_node *llnode;
> +
> +    if (unlikely(READ_ONCE(c->draining)))
> +        goto out;

Because the reads of c->draining and the llist_add_batch(..., free_by_rcu_ttrace)
calls are lockless, checking draining here cannot prevent the leak of objects
in c->free_by_rcu_ttrace, as shown below (hope the formatting is OK now).
A simple fix is to drain free_by_rcu_ttrace twice, as suggested before.
Or check c->draining again in __free_by_rcu() when atomic_xchg() returns 1
and call free_all(free_by_rcu_ttrace) if draining is true.

P1: bpf_mem_alloc_destroy()
P2: __free_by_rcu()
// got false
P2: read c->draining
P1: c->draining = true
P1: llist_del_all(&c->free_by_rcu_ttrace)
// add to free_by_rcu_ttrace again
P2: llist_add_batch(..., &tgt->free_by_rcu_ttrace)
P2: do_call_rcu_ttrace()
// call_rcu_ttrace_in_progress is 1, so xchg returns 1
// and the objects are not moved to waiting_for_gp_ttrace
P2: atomic_xchg(&c->call_rcu_ttrace_in_progress, 1)
// got 1
P1: atomic_read(&c->call_rcu_ttrace_in_progress)
// objects in free_by_rcu_ttrace are leaked

c->draining also can't guarantee that bpf_mem_alloc_destroy() will wait for
the inflight call_rcu_tasks_trace() callback, as shown in the following two
cases (these two cases are the same as reported in v1; I have only
reformatted the diagrams). And I suggest doing bpf_mem_alloc_destroy()
as follows:

    if (ma->cache) {
        rcu_in_progress = 0;
        for_each_possible_cpu(cpu) {
            c = per_cpu_ptr(ma->cache, cpu);
            irq_work_sync(&c->refill_work);
            drain_mem_cache(c);
            rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
        }
        for_each_possible_cpu(cpu) {
            c = per_cpu_ptr(ma->cache, cpu);
            rcu_in_progress += atomic_read(&c->call_rcu_ttrace_in_progress);
        }

Case 1:

P1: bpf_mem_alloc_destroy()
P2: __free_by_rcu()
// got false
P2: read c->draining
P1: c->draining = true
// got 0
P1: atomic_read(&c->call_rcu_ttrace_in_progress)
P2: do_call_rcu_ttrace()
// return 0
P2: atomic_xchg(&c->call_rcu_ttrace_in_progress, 1)
P2: call_rcu_tasks_trace()
P2: atomic_set(&c->call_rcu_in_progress, 0)
// also got 0
P1: atomic_read(&c->call_rcu_in_progress)
// won't wait for the inflight __free_rcu_tasks_trace()

Case 2:

P1: bpf_mem_alloc_destroy()
P2: __free_by_rcu() for c1
P2: read c1->draining
P1: c0->draining = true
P1: c1->draining = true
// both in_progress counters are 0
P1: read c0->call_rcu_in_progress
P1: read c0->call_rcu_ttrace_in_progress
// c1->tgt is c0
// c1->call_rcu_in_progress is 1
// c0->call_rcu_ttrace_in_progress is 0
P2: llist_add_batch(..., c0->free_by_rcu_ttrace)
P2: xchg(c0->call_rcu_ttrace_in_progress, 1)
P2: call_rcu_tasks_trace(c0)
P2: c1->call_rcu_in_progress = 0
// both in_progress counters are 0
P1: read c1->call_rcu_in_progress
P1: read c1->call_rcu_ttrace_in_progress
// BAD! There is still an inflight tasks trace RCU callback
P1: free_mem_alloc_no_barrier()

> +
> +    llnode = llist_del_all(&c->waiting_for_gp);
> +    if (!llnode)
> +        goto out;
> +
> +    if (llist_add_batch(llnode, c->waiting_for_gp_tail, &tgt->free_by_rcu_ttrace))
> +        tgt->free_by_rcu_ttrace_tail = c->waiting_for_gp_tail;
> +
> +    /* Objects went through regular RCU GP. Send them to RCU tasks trace */
> +    do_call_rcu_ttrace(tgt);
> +out:
> +    atomic_set(&c->call_rcu_in_progress, 0);
> +}
> +
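To make the second option above concrete, here is a rough sketch (the helper name is hypothetical and this is not part of the posted patch) of "re-check draining when atomic_xchg() returns 1 and free_all() the pending objects"; it would be called from do_call_rcu_ttrace() on the already-in-progress path:

/* Hypothetical sketch of the "re-check draining" option: called when
 * atomic_xchg(&c->call_rcu_ttrace_in_progress, 1) returned 1, i.e. a
 * tasks-trace callback is already in flight.  If destroy() has started
 * draining, free the pending objects directly instead of leaving them
 * on free_by_rcu_ttrace where bpf_mem_alloc_destroy() would miss them.
 */
static void free_pending_if_draining(struct bpf_mem_cache *c)
{
    if (!READ_ONCE(c->draining))
        return;

    /* free_all() and c->percpu_size come from kernel/bpf/memalloc.c */
    free_all(llist_del_all(&c->free_by_rcu_ttrace), !!c->percpu_size);
}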
Hi,

On 6/24/2023 11:13 AM, Alexei Starovoitov wrote:
> From: Alexei Starovoitov <ast@kernel.org>
>
> Introduce bpf_mem_[cache_]free_rcu() similar to kfree_rcu().
> Unlike bpf_mem_[cache_]free() that links objects for immediate reuse into
> per-cpu free list the _rcu() flavor waits for RCU grace period and then moves
> objects into free_by_rcu_ttrace list where they are waiting for RCU
> task trace grace period to be freed into slab.
SNIP
>
>  static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma)
> @@ -498,8 +566,8 @@ static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma)
>
>  static void free_mem_alloc(struct bpf_mem_alloc *ma)
>  {
> -    /* waiting_for_gp_ttrace lists was drained, but __free_rcu might
> -     * still execute. Wait for it now before we freeing percpu caches.
> +    /* waiting_for_gp[_ttrace] lists were drained, but RCU callbacks
> +     * might still execute. Wait for them.
>       *
>       * rcu_barrier_tasks_trace() doesn't imply synchronize_rcu_tasks_trace(),
>       * but rcu_barrier_tasks_trace() and rcu_barrier() below are only used

I think an extra rcu_barrier() before rcu_barrier_tasks_trace() is still
needed here, otherwise free_mem_alloc() will not wait for inflight
__free_by_rcu() callbacks and there will be an oops in rcu_do_batch().
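For clarity, the ordering being requested amounts to something like the sketch below (not necessarily the exact code of the next revision): a regular rcu_barrier() first, so that every pending __free_by_rcu() has run and queued its objects for tasks-trace RCU, and only then the tasks-trace barrier:

static void free_mem_alloc(struct bpf_mem_alloc *ma)
{
    /* Wait for pending __free_by_rcu() callbacks (regular RCU GP);
     * they may still move objects to free_by_rcu_ttrace and invoke
     * call_rcu_tasks_trace().
     */
    rcu_barrier();
    /* Now wait for the tasks-trace callbacks queued above. */
    rcu_barrier_tasks_trace();
    /* When rcu_trace_implies_rcu_gp(), __free_rcu() runs directly from
     * the tasks-trace callback, so the extra rcu_barrier() is only
     * needed otherwise.
     */
    if (!rcu_trace_implies_rcu_gp())
        rcu_barrier();
    free_mem_alloc_no_barrier(ma);
}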
Hi,

On 6/24/2023 2:49 PM, Hou Tao wrote:
> Hi,
>
> On 6/24/2023 11:13 AM, Alexei Starovoitov wrote:
>> From: Alexei Starovoitov <ast@kernel.org>
>>
> SNIP
>>
>> +static void __free_by_rcu(struct rcu_head *head)
>> +{
>> +    struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu);
>> +    struct bpf_mem_cache *tgt = c->tgt;
>> +    struct llist_node *llnode;
>> +
>> +    if (unlikely(READ_ONCE(c->draining)))
>> +        goto out;
>> +
>> +    llnode = llist_del_all(&c->waiting_for_gp);
>> +    if (!llnode)
>> +        goto out;
>> +
>> +    if (llist_add_batch(llnode, c->waiting_for_gp_tail, &tgt->free_by_rcu_ttrace))
>> +        tgt->free_by_rcu_ttrace_tail = c->waiting_for_gp_tail;
> Got a null-ptr dereference oops when running multiple test_maps and the
> htab-mem benchmark after hacking htab to use bpf_mem_cache_free_rcu().
> And I think it happened as follows:

The same null-ptr dereference happened even before switching htab to use
bpf_mem_cache_free_rcu(). So it seems that after introducing c->tgt, there
could be two concurrent free_bulk() calls with the same c->tgt.

>
> // c->tgt
> P1: __free_by_rcu()
> // c->tgt is the same as P1
> P2: __free_by_rcu()
>
> // return true
> P1: llist_add_batch(&tgt->free_by_rcu_ttrace)
> // return false
> P2: llist_add_batch(&tgt->free_by_rcu_ttrace)
> P2: do_call_rcu_ttrace()
> // return false
> P2: xchg(tgt->call_rcu_ttrace_in_progress, 1)
> // llnode is not NULL
> P2: llnode = llist_del_all(&c->free_by_rcu_ttrace)
> // BAD: c->free_by_rcu_ttrace_tail is NULL, so oops
> P2: __llist_add_batch(llnode, c->free_by_rcu_ttrace_tail)
>
> P1: tgt->free_by_rcu_ttrace_tail = X
>
> I don't have a good fix for the problem except adding a spin-lock for
> free_by_rcu_ttrace and free_by_rcu_ttrace_tail.
On 6/23/23 11:49 PM, Hou Tao wrote:
> Hi,
>
> On 6/24/2023 11:13 AM, Alexei Starovoitov wrote:
>> From: Alexei Starovoitov <ast@kernel.org>
>>
> SNIP
>>
>> +static void __free_by_rcu(struct rcu_head *head)
>> +{
>> +    struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu);
>> +    struct bpf_mem_cache *tgt = c->tgt;
>> +    struct llist_node *llnode;
>> +
>> +    if (unlikely(READ_ONCE(c->draining)))
>> +        goto out;
>> +
>> +    llnode = llist_del_all(&c->waiting_for_gp);
>> +    if (!llnode)
>> +        goto out;
>> +
>> +    if (llist_add_batch(llnode, c->waiting_for_gp_tail, &tgt->free_by_rcu_ttrace))
>> +        tgt->free_by_rcu_ttrace_tail = c->waiting_for_gp_tail;
> Got a null-ptr dereference oops when running multiple test_maps and the
> htab-mem benchmark after hacking htab to use bpf_mem_cache_free_rcu().
> And I think it happened as follows:
>
> // c->tgt
> P1: __free_by_rcu()
> // c->tgt is the same as P1
> P2: __free_by_rcu()
>
> // return true
> P1: llist_add_batch(&tgt->free_by_rcu_ttrace)
> // return false
> P2: llist_add_batch(&tgt->free_by_rcu_ttrace)
> P2: do_call_rcu_ttrace()
> // return false
> P2: xchg(tgt->call_rcu_ttrace_in_progress, 1)
> // llnode is not NULL
> P2: llnode = llist_del_all(&c->free_by_rcu_ttrace)
> // BAD: c->free_by_rcu_ttrace_tail is NULL, so oops
> P2: __llist_add_batch(llnode, c->free_by_rcu_ttrace_tail)
>
> P1: tgt->free_by_rcu_ttrace_tail = X
>
> I don't have a good fix for the problem except adding a spin-lock for
> free_by_rcu_ttrace and free_by_rcu_ttrace_tail.

null-ptr is probably something else, since the race window is extremely tiny.
In my testing this optimization doesn't buy much.
So I'll just drop the _tail optimization and switch to for_each(del_all)
to move elements. We can revisit later.
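For reference, the "drop the _tail optimization and use for_each(del_all)" approach could look roughly like the sketch below (the helper name is hypothetical); moving the nodes one by one removes any reliance on a cached tail pointer:

/* Illustrative sketch only: splice waiting_for_gp into the target's
 * free_by_rcu_ttrace without a *_tail pointer by walking the detached
 * list and re-adding each node individually.
 */
static void move_to_free_by_rcu_ttrace(struct bpf_mem_cache *c,
                                       struct bpf_mem_cache *tgt)
{
    struct llist_node *llnode, *t;

    llist_for_each_safe(llnode, t, llist_del_all(&c->waiting_for_gp))
        llist_add(llnode, &tgt->free_by_rcu_ttrace);
}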
On 6/24/23 12:53 AM, Hou Tao wrote:
> Hi,
>
> On 6/24/2023 11:13 AM, Alexei Starovoitov wrote:
>> From: Alexei Starovoitov <ast@kernel.org>
>>
>> Introduce bpf_mem_[cache_]free_rcu() similar to kfree_rcu().
>> Unlike bpf_mem_[cache_]free() that links objects for immediate reuse into
>> per-cpu free list the _rcu() flavor waits for RCU grace period and then moves
>> objects into free_by_rcu_ttrace list where they are waiting for RCU
>> task trace grace period to be freed into slab.
>>
> SNIP
>> +static void check_free_by_rcu(struct bpf_mem_cache *c)
>> +{
>> +    struct llist_node *llnode, *t;
>> +
>> +    if (llist_empty(&c->free_by_rcu) && llist_empty(&c->free_llist_extra_rcu))
>> +        return;
>> +
>> +    /* drain free_llist_extra_rcu */
>> +    llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist_extra_rcu))
>> +        if (__llist_add(llnode, &c->free_by_rcu))
>> +            c->free_by_rcu_tail = llnode;
>
> Just like add_obj_to_free_list(), we should do conditional
> local_irq_save(flags) and local_inc_return(&c->active) for free_by_rcu
> as well, otherwise free_by_rcu may be corrupted by a bpf program
> running in NMI context.

Good catch. Will do.
On 6/25/23 4:15 AM, Hou Tao wrote:
> Hi,
>
> On 6/24/2023 11:13 AM, Alexei Starovoitov wrote:
>> From: Alexei Starovoitov <ast@kernel.org>
>>
>> Introduce bpf_mem_[cache_]free_rcu() similar to kfree_rcu().
>> Unlike bpf_mem_[cache_]free() that links objects for immediate reuse into
>> per-cpu free list the _rcu() flavor waits for RCU grace period and then moves
>> objects into free_by_rcu_ttrace list where they are waiting for RCU
>> task trace grace period to be freed into slab.
> SNIP
>>
>>  static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma)
>> @@ -498,8 +566,8 @@ static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma)
>>
>>  static void free_mem_alloc(struct bpf_mem_alloc *ma)
>>  {
>> -    /* waiting_for_gp_ttrace lists was drained, but __free_rcu might
>> -     * still execute. Wait for it now before we freeing percpu caches.
>> +    /* waiting_for_gp[_ttrace] lists were drained, but RCU callbacks
>> +     * might still execute. Wait for them.
>>       *
>>       * rcu_barrier_tasks_trace() doesn't imply synchronize_rcu_tasks_trace(),
>>       * but rcu_barrier_tasks_trace() and rcu_barrier() below are only used
>
> I think an extra rcu_barrier() before rcu_barrier_tasks_trace() is still
> needed here, otherwise free_mem_alloc() will not wait for inflight
> __free_by_rcu() callbacks and there will be an oops in rcu_do_batch().

Agree. I got confused by rcu_trace_implies_rcu_gp().
rcu_barrier() is necessary.

re: draining.
I'll switch to the "if (draining) free_all; else call_rcu;" scheme
to address the potential memory leak, though I wasn't able to repro it.
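A minimal sketch of what the "if (draining) free_all; else call_rcu;" scheme amounts to is shown below (illustrative only; the helper name is hypothetical and the exact placement in v3 may differ):

/* Illustrative sketch: when the allocator is being destroyed, free the
 * detached batch immediately instead of handing it to RCU, so nothing
 * is left in flight for bpf_mem_alloc_destroy() to miss.
 */
static void queue_or_free_waiting_for_gp(struct bpf_mem_cache *c, bool percpu)
{
    if (unlikely(READ_ONCE(c->draining))) {
        free_all(llist_del_all(&c->waiting_for_gp), percpu);
        atomic_set(&c->call_rcu_in_progress, 0);
    } else {
        call_rcu_hurry(&c->rcu, __free_by_rcu);
    }
}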
Hi,

On 6/28/2023 8:52 AM, Alexei Starovoitov wrote:
> On 6/23/23 11:49 PM, Hou Tao wrote:
>> Hi,
>>
>> On 6/24/2023 11:13 AM, Alexei Starovoitov wrote:
>>> From: Alexei Starovoitov <ast@kernel.org>
>>>
>> SNIP
>>> +static void __free_by_rcu(struct rcu_head *head)
>>> +{
>>> +    struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu);
>>> +    struct bpf_mem_cache *tgt = c->tgt;
>>> +    struct llist_node *llnode;
>>> +
>>> +    if (unlikely(READ_ONCE(c->draining)))
>>> +        goto out;
>>> +
>>> +    llnode = llist_del_all(&c->waiting_for_gp);
>>> +    if (!llnode)
>>> +        goto out;
>>> +
>>> +    if (llist_add_batch(llnode, c->waiting_for_gp_tail, &tgt->free_by_rcu_ttrace))
>>> +        tgt->free_by_rcu_ttrace_tail = c->waiting_for_gp_tail;
>> Got a null-ptr dereference oops when running multiple test_maps and the
>> htab-mem benchmark after hacking htab to use bpf_mem_cache_free_rcu().
>> And I think it happened as follows:
>>
>> // c->tgt
>> P1: __free_by_rcu()
>> // c->tgt is the same as P1
>> P2: __free_by_rcu()
>>
>> // return true
>> P1: llist_add_batch(&tgt->free_by_rcu_ttrace)
>> // return false
>> P2: llist_add_batch(&tgt->free_by_rcu_ttrace)
>> P2: do_call_rcu_ttrace()
>> // return false
>> P2: xchg(tgt->call_rcu_ttrace_in_progress, 1)
>> // llnode is not NULL
>> P2: llnode = llist_del_all(&c->free_by_rcu_ttrace)
>> // BAD: c->free_by_rcu_ttrace_tail is NULL, so oops
>> P2: __llist_add_batch(llnode, c->free_by_rcu_ttrace_tail)
>>
>> P1: tgt->free_by_rcu_ttrace_tail = X
>>
>> I don't have a good fix for the problem except adding a spin-lock for
>> free_by_rcu_ttrace and free_by_rcu_ttrace_tail.
>
> null-ptr is probably something else, since the race window is
> extremely tiny.

The null-ptr dereference is indeed due to free_by_rcu_ttrace_tail being NULL.
The oops occurred multiple times and I have checked the vmcore to confirm that.

> In my testing this optimization doesn't buy much.
> So I'll just drop the _tail optimization and switch to for_each(del_all)
> to move elements. We can revisit later.

OK
Hi,

On 6/28/2023 8:56 AM, Alexei Starovoitov wrote:
> On 6/25/23 4:15 AM, Hou Tao wrote:
>> Hi,
>>
>> On 6/24/2023 11:13 AM, Alexei Starovoitov wrote:
>>> From: Alexei Starovoitov <ast@kernel.org>
>>>
>>> Introduce bpf_mem_[cache_]free_rcu() similar to kfree_rcu().
>>> Unlike bpf_mem_[cache_]free() that links objects for immediate reuse into
>>> per-cpu free list the _rcu() flavor waits for RCU grace period and then moves
>>> objects into free_by_rcu_ttrace list where they are waiting for RCU
>>> task trace grace period to be freed into slab.
>> SNIP
>>>  static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma)
>>> @@ -498,8 +566,8 @@ static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma)
>>>  static void free_mem_alloc(struct bpf_mem_alloc *ma)
>>>  {
>>> -    /* waiting_for_gp_ttrace lists was drained, but __free_rcu might
>>> -     * still execute. Wait for it now before we freeing percpu caches.
>>> +    /* waiting_for_gp[_ttrace] lists were drained, but RCU callbacks
>>> +     * might still execute. Wait for them.
>>>       *
>>>       * rcu_barrier_tasks_trace() doesn't imply synchronize_rcu_tasks_trace(),
>>>       * but rcu_barrier_tasks_trace() and rcu_barrier() below are only used
>> I think an extra rcu_barrier() before rcu_barrier_tasks_trace() is still
>> needed here, otherwise free_mem_alloc() will not wait for inflight
>> __free_by_rcu() callbacks and there will be an oops in rcu_do_batch().
>
> Agree. I got confused by rcu_trace_implies_rcu_gp().
> rcu_barrier() is necessary.
>
> re: draining.
> I'll switch to the "if (draining) free_all; else call_rcu;" scheme
> to address the potential memory leak, though I wasn't able to repro it.

For v2, it was also hard for me to reproduce the leak problem. But after
I injected some delay by using udelay() in __free_by_rcu()/__free_rcu()
after reading c->draining, it was relatively easy to reproduce the problems.
On Tue, Jun 27, 2023 at 6:43 PM Hou Tao <houtao@huaweicloud.com> wrote:
>
> Hi,
>
> On 6/28/2023 8:56 AM, Alexei Starovoitov wrote:
> > On 6/25/23 4:15 AM, Hou Tao wrote:
> >> Hi,
> >>
> >> On 6/24/2023 11:13 AM, Alexei Starovoitov wrote:
> >>> From: Alexei Starovoitov <ast@kernel.org>
> >>>
> >>> Introduce bpf_mem_[cache_]free_rcu() similar to kfree_rcu().
> >>> Unlike bpf_mem_[cache_]free() that links objects for immediate reuse into
> >>> per-cpu free list the _rcu() flavor waits for RCU grace period and then moves
> >>> objects into free_by_rcu_ttrace list where they are waiting for RCU
> >>> task trace grace period to be freed into slab.
> >> SNIP
> >>>  static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma)
> >>> @@ -498,8 +566,8 @@ static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma)
> >>>  static void free_mem_alloc(struct bpf_mem_alloc *ma)
> >>>  {
> >>> -    /* waiting_for_gp_ttrace lists was drained, but __free_rcu might
> >>> -     * still execute. Wait for it now before we freeing percpu caches.
> >>> +    /* waiting_for_gp[_ttrace] lists were drained, but RCU callbacks
> >>> +     * might still execute. Wait for them.
> >>>       *
> >>>       * rcu_barrier_tasks_trace() doesn't imply synchronize_rcu_tasks_trace(),
> >>>       * but rcu_barrier_tasks_trace() and rcu_barrier() below are only used
> >> I think an extra rcu_barrier() before rcu_barrier_tasks_trace() is still
> >> needed here, otherwise free_mem_alloc() will not wait for inflight
> >> __free_by_rcu() callbacks and there will be an oops in rcu_do_batch().
> >
> > Agree. I got confused by rcu_trace_implies_rcu_gp().
> > rcu_barrier() is necessary.
> >
> > re: draining.
> > I'll switch to the "if (draining) free_all; else call_rcu;" scheme
> > to address the potential memory leak, though I wasn't able to repro it.
> For v2, it was also hard for me to reproduce the leak problem. But after
> I injected some delay by using udelay() in __free_by_rcu()/__free_rcu()
> after reading c->draining, it was relatively easy to reproduce the problems.

1. Please respin the htab bench.
We're still discussing patches without having the same baseline.

2. 'adding udelay()' is too vague. Pls post a diff hunk of what exactly
you mean.

3. I'll send v3 shortly. Let's move the discussion there.
Hi,

On 6/28/2023 9:51 AM, Alexei Starovoitov wrote:
> On Tue, Jun 27, 2023 at 6:43 PM Hou Tao <houtao@huaweicloud.com> wrote:
>>
SNIP
>>> re: draining.
>>> I'll switch to the "if (draining) free_all; else call_rcu;" scheme
>>> to address the potential memory leak, though I wasn't able to repro it.
>> For v2, it was also hard for me to reproduce the leak problem. But after
>> I injected some delay by using udelay() in __free_by_rcu()/__free_rcu()
>> after reading c->draining, it was relatively easy to reproduce the problems.
> 1. Please respin the htab bench.
> We're still discussing patches without having the same baseline.

Almost done. Need to do the benchmark again to update the numbers in the
commit message.

>
> 2. 'adding udelay()' is too vague. Pls post a diff hunk of what exactly
> you mean.

--- a/kernel/bpf/memalloc.c
+++ b/kernel/bpf/memalloc.c
@@ -4,6 +4,7 @@
 #include <linux/llist.h>
 #include <linux/bpf.h>
 #include <linux/irq_work.h>
+#include <linux/delay.h>
 #include <linux/bpf_mem_alloc.h>
 #include <linux/memcontrol.h>
 #include <asm/local.h>
@@ -261,12 +262,17 @@ static int free_all(struct llist_node *llnode, bool percpu)
     return cnt;
 }
 
+static unsigned int delay;
+module_param(delay, uint, 0644);
+
 static void __free_rcu(struct rcu_head *head)
 {
     struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu_ttrace);
 
     if (unlikely(READ_ONCE(c->draining)))
         goto out;
+    if (delay)
+        udelay(delay);
     free_all(llist_del_all(&c->waiting_for_gp_ttrace), !!c->percpu_size);
 out:
     atomic_set(&c->call_rcu_ttrace_in_progress, 0);
@@ -361,6 +367,8 @@ static void __free_by_rcu(struct rcu_head *head)
 
     if (unlikely(READ_ONCE(c->draining)))
         goto out;
+    if (delay)
+        udelay(delay);
 
     llnode = llist_del_all(&c->waiting_for_gp);
     if (!llnode)

>
> 3. I'll send v3 shortly. Let's move the discussion there.

Sure.
diff --git a/include/linux/bpf_mem_alloc.h b/include/linux/bpf_mem_alloc.h
index 3929be5743f4..d644bbb298af 100644
--- a/include/linux/bpf_mem_alloc.h
+++ b/include/linux/bpf_mem_alloc.h
@@ -27,10 +27,12 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma);
 /* kmalloc/kfree equivalent: */
 void *bpf_mem_alloc(struct bpf_mem_alloc *ma, size_t size);
 void bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr);
+void bpf_mem_free_rcu(struct bpf_mem_alloc *ma, void *ptr);
 
 /* kmem_cache_alloc/free equivalent: */
 void *bpf_mem_cache_alloc(struct bpf_mem_alloc *ma);
 void bpf_mem_cache_free(struct bpf_mem_alloc *ma, void *ptr);
+void bpf_mem_cache_free_rcu(struct bpf_mem_alloc *ma, void *ptr);
 void bpf_mem_cache_raw_free(void *ptr);
 void *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags);
 
diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
index 666917c16e87..dc144c54d502 100644
--- a/kernel/bpf/memalloc.c
+++ b/kernel/bpf/memalloc.c
@@ -101,6 +101,15 @@ struct bpf_mem_cache {
     bool draining;
     struct bpf_mem_cache *tgt;
 
+    /* list of objects to be freed after RCU GP */
+    struct llist_head free_by_rcu;
+    struct llist_node *free_by_rcu_tail;
+    struct llist_head waiting_for_gp;
+    struct llist_node *waiting_for_gp_tail;
+    struct rcu_head rcu;
+    atomic_t call_rcu_in_progress;
+    struct llist_head free_llist_extra_rcu;
+
     /* list of objects to be freed after RCU tasks trace GP */
     struct llist_head free_by_rcu_ttrace;
     struct llist_node *free_by_rcu_ttrace_tail;
@@ -344,6 +353,60 @@ static void free_bulk(struct bpf_mem_cache *c)
     do_call_rcu_ttrace(tgt);
 }
 
+static void __free_by_rcu(struct rcu_head *head)
+{
+    struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu);
+    struct bpf_mem_cache *tgt = c->tgt;
+    struct llist_node *llnode;
+
+    if (unlikely(READ_ONCE(c->draining)))
+        goto out;
+
+    llnode = llist_del_all(&c->waiting_for_gp);
+    if (!llnode)
+        goto out;
+
+    if (llist_add_batch(llnode, c->waiting_for_gp_tail, &tgt->free_by_rcu_ttrace))
+        tgt->free_by_rcu_ttrace_tail = c->waiting_for_gp_tail;
+
+    /* Objects went through regular RCU GP. Send them to RCU tasks trace */
+    do_call_rcu_ttrace(tgt);
+out:
+    atomic_set(&c->call_rcu_in_progress, 0);
+}
+
+static void check_free_by_rcu(struct bpf_mem_cache *c)
+{
+    struct llist_node *llnode, *t;
+
+    if (llist_empty(&c->free_by_rcu) && llist_empty(&c->free_llist_extra_rcu))
+        return;
+
+    /* drain free_llist_extra_rcu */
+    llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist_extra_rcu))
+        if (__llist_add(llnode, &c->free_by_rcu))
+            c->free_by_rcu_tail = llnode;
+
+    if (atomic_xchg(&c->call_rcu_in_progress, 1)) {
+        /*
+         * Instead of kmalloc-ing new rcu_head and triggering 10k
+         * call_rcu() to hit rcutree.qhimark and force RCU to notice
+         * the overload just ask RCU to hurry up. There could be many
+         * objects in free_by_rcu list.
+         * This hint reduces memory consumption for an artifical
+         * benchmark from 2 Gbyte to 150 Mbyte.
+         */
+        rcu_request_urgent_qs_task(current);
+        return;
+    }
+
+    WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp));
+
+    WRITE_ONCE(c->waiting_for_gp.first, __llist_del_all(&c->free_by_rcu));
+    c->waiting_for_gp_tail = c->free_by_rcu_tail;
+    call_rcu_hurry(&c->rcu, __free_by_rcu);
+}
+
 static void bpf_mem_refill(struct irq_work *work)
 {
     struct bpf_mem_cache *c = container_of(work, struct bpf_mem_cache, refill_work);
@@ -358,6 +421,8 @@ static void bpf_mem_refill(struct irq_work *work)
         alloc_bulk(c, c->batch, NUMA_NO_NODE);
     else if (cnt > c->high_watermark)
         free_bulk(c);
+
+    check_free_by_rcu(c);
 }
 
 static void notrace irq_work_raise(struct bpf_mem_cache *c)
@@ -486,6 +551,9 @@ static void drain_mem_cache(struct bpf_mem_cache *c)
     free_all(llist_del_all(&c->waiting_for_gp_ttrace), percpu);
     free_all(__llist_del_all(&c->free_llist), percpu);
     free_all(__llist_del_all(&c->free_llist_extra), percpu);
+    free_all(__llist_del_all(&c->free_by_rcu), percpu);
+    free_all(__llist_del_all(&c->free_llist_extra_rcu), percpu);
+    free_all(llist_del_all(&c->waiting_for_gp), percpu);
 }
 
 static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma)
@@ -498,8 +566,8 @@ static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma)
 
 static void free_mem_alloc(struct bpf_mem_alloc *ma)
 {
-    /* waiting_for_gp_ttrace lists was drained, but __free_rcu might
-     * still execute. Wait for it now before we freeing percpu caches.
+    /* waiting_for_gp[_ttrace] lists were drained, but RCU callbacks
+     * might still execute. Wait for them.
      *
      * rcu_barrier_tasks_trace() doesn't imply synchronize_rcu_tasks_trace(),
      * but rcu_barrier_tasks_trace() and rcu_barrier() below are only used
@@ -564,6 +632,7 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma)
             c = per_cpu_ptr(ma->cache, cpu);
             drain_mem_cache(c);
             rcu_in_progress += atomic_read(&c->call_rcu_ttrace_in_progress);
+            rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
         }
         /* objcg is the same across cpus */
         if (c->objcg)
@@ -586,6 +655,7 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma)
                 c = &cc->cache[i];
                 drain_mem_cache(c);
                 rcu_in_progress += atomic_read(&c->call_rcu_ttrace_in_progress);
+                rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
             }
         }
         if (c->objcg)
@@ -670,6 +740,27 @@ static void notrace unit_free(struct bpf_mem_cache *c, void *ptr)
         irq_work_raise(c);
 }
 
+static void notrace unit_free_rcu(struct bpf_mem_cache *c, void *ptr)
+{
+    struct llist_node *llnode = ptr - LLIST_NODE_SZ;
+    unsigned long flags;
+
+    c->tgt = *(struct bpf_mem_cache **)llnode;
+
+    local_irq_save(flags);
+    if (local_inc_return(&c->active) == 1) {
+        if (__llist_add(llnode, &c->free_by_rcu))
+            c->free_by_rcu_tail = llnode;
+    } else {
+        llist_add(llnode, &c->free_llist_extra_rcu);
+    }
+    local_dec(&c->active);
+    local_irq_restore(flags);
+
+    if (!atomic_read(&c->call_rcu_in_progress))
+        irq_work_raise(c);
+}
+
 /* Called from BPF program or from sys_bpf syscall.
  * In both cases migration is disabled.
  */
@@ -703,6 +794,20 @@ void notrace bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr)
     unit_free(this_cpu_ptr(ma->caches)->cache + idx, ptr);
 }
 
+void notrace bpf_mem_free_rcu(struct bpf_mem_alloc *ma, void *ptr)
+{
+    int idx;
+
+    if (!ptr)
+        return;
+
+    idx = bpf_mem_cache_idx(ksize(ptr - LLIST_NODE_SZ));
+    if (idx < 0)
+        return;
+
+    unit_free_rcu(this_cpu_ptr(ma->caches)->cache + idx, ptr);
+}
+
 void notrace *bpf_mem_cache_alloc(struct bpf_mem_alloc *ma)
 {
     void *ret;
@@ -719,6 +824,14 @@ void notrace bpf_mem_cache_free(struct bpf_mem_alloc *ma, void *ptr)
     unit_free(this_cpu_ptr(ma->cache), ptr);
 }
 
+void notrace bpf_mem_cache_free_rcu(struct bpf_mem_alloc *ma, void *ptr)
+{
+    if (!ptr)
+        return;
+
+    unit_free_rcu(this_cpu_ptr(ma->cache), ptr);
+}
+
 /* Directly does a kfree() without putting 'ptr' back to the free_llist
  * for reuse and without waiting for a rcu_tasks_trace gp.
  * The caller must first go through the rcu_tasks_trace gp for 'ptr'
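To show how the new API is meant to be used, here is a hypothetical in-kernel caller (not part of the patch; the struct, function and variable names below are made up for illustration, while bpf_mem_alloc_init(), bpf_mem_cache_alloc() and bpf_mem_alloc_destroy() already exist in bpf_mem_alloc.h). The only difference from bpf_mem_cache_free() is that the object is not reused before an RCU grace period has elapsed:

#include <linux/bpf_mem_alloc.h>

/* Hypothetical example user of the new API, for illustration only. */
struct my_elem {
    int payload;
};

static struct bpf_mem_alloc my_ma;

static int my_cache_init(void)
{
    /* fixed-size cache of struct my_elem, no per-cpu objects */
    return bpf_mem_alloc_init(&my_ma, sizeof(struct my_elem), false);
}

static void my_cache_use(void)
{
    struct my_elem *e = bpf_mem_cache_alloc(&my_ma);

    if (!e)
        return;
    e->payload = 42;
    /* Defer reuse until after an RCU grace period, so concurrent
     * RCU readers that still hold a pointer to 'e' remain safe.
     */
    bpf_mem_cache_free_rcu(&my_ma, e);
}

static void my_cache_exit(void)
{
    bpf_mem_alloc_destroy(&my_ma);
}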