Message ID | 20230621023238.87079-12-alexei.starovoitov@gmail.com |
---|---|
State | Superseded |
Series | bpf: Introduce bpf_mem_cache_free_rcu(). |
Hi, On 6/21/2023 10:32 AM, Alexei Starovoitov wrote: > From: Alexei Starovoitov <ast@kernel.org> > > Introduce bpf_mem_[cache_]free_rcu() similar to kfree_rcu(). > Unlike bpf_mem_[cache_]free() that links objects for immediate reuse into > per-cpu free list the _rcu() flavor waits for RCU grace period and then moves > objects into free_by_rcu_ttrace list where they are waiting for RCU > task trace grace period to be freed into slab. > > The life cycle of objects: > alloc: dequeue free_llist > free: enqeueu free_llist > free_rcu: enqueue free_by_rcu -> waiting_for_gp > free_llist above high watermark -> free_by_rcu_ttrace > after RCU GP waiting_for_gp -> free_by_rcu_ttrace > free_by_rcu_ttrace -> waiting_for_gp_ttrace -> slab > > Signed-off-by: Alexei Starovoitov <ast@kernel.org> > --- > include/linux/bpf_mem_alloc.h | 2 + > kernel/bpf/memalloc.c | 118 ++++++++++++++++++++++++++++++++-- > 2 files changed, 116 insertions(+), 4 deletions(-) > > diff --git a/include/linux/bpf_mem_alloc.h b/include/linux/bpf_mem_alloc.h > index 3929be5743f4..d644bbb298af 100644 > --- a/include/linux/bpf_mem_alloc.h > +++ b/include/linux/bpf_mem_alloc.h > @@ -27,10 +27,12 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma); > /* kmalloc/kfree equivalent: */ > void *bpf_mem_alloc(struct bpf_mem_alloc *ma, size_t size); > void bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr); > +void bpf_mem_free_rcu(struct bpf_mem_alloc *ma, void *ptr); > > /* kmem_cache_alloc/free equivalent: */ > void *bpf_mem_cache_alloc(struct bpf_mem_alloc *ma); > void bpf_mem_cache_free(struct bpf_mem_alloc *ma, void *ptr); > +void bpf_mem_cache_free_rcu(struct bpf_mem_alloc *ma, void *ptr); > void bpf_mem_cache_raw_free(void *ptr); > void *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags); > > diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c > index 10d027674743..4d1002e7b4b5 100644 > --- a/kernel/bpf/memalloc.c > +++ b/kernel/bpf/memalloc.c > @@ -100,6 +100,15 @@ struct bpf_mem_cache { > int percpu_size; > struct bpf_mem_cache *tgt; > > + /* list of objects to be freed after RCU GP */ > + struct llist_head free_by_rcu; > + struct llist_node *free_by_rcu_tail; > + struct llist_head waiting_for_gp; > + struct llist_node *waiting_for_gp_tail; > + struct rcu_head rcu; > + atomic_t call_rcu_in_progress; > + struct llist_head free_llist_extra_rcu; > + > /* list of objects to be freed after RCU tasks trace GP */ > struct llist_head free_by_rcu_ttrace; > struct llist_node *free_by_rcu_ttrace_tail; > @@ -340,6 +349,56 @@ static void free_bulk(struct bpf_mem_cache *c) > do_call_rcu_ttrace(tgt); > } > > +static void __free_by_rcu(struct rcu_head *head) > +{ > + struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu); > + struct bpf_mem_cache *tgt = c->tgt; > + struct llist_node *llnode = llist_del_all(&c->waiting_for_gp); > + > + if (!llnode) > + goto out; > + > + if (llist_add_batch(llnode, c->waiting_for_gp_tail, &tgt->free_by_rcu_ttrace)) > + tgt->free_by_rcu_ttrace_tail = c->waiting_for_gp_tail; > + > + /* Objects went through regular RCU GP. 
Send them to RCU tasks trace */ > + do_call_rcu_ttrace(tgt); > +out: > + atomic_set(&c->call_rcu_in_progress, 0); > +} > + > +static void check_free_by_rcu(struct bpf_mem_cache *c) > +{ > + struct llist_node *llnode, *t; > + > + if (llist_empty(&c->free_by_rcu) && llist_empty(&c->free_llist_extra_rcu)) > + return; > + > + /* drain free_llist_extra_rcu */ > + llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist_extra_rcu)) > + if (__llist_add(llnode, &c->free_by_rcu)) > + c->free_by_rcu_tail = llnode; > + > + if (atomic_xchg(&c->call_rcu_in_progress, 1)) { > + /* > + * Instead of kmalloc-ing new rcu_head and triggering 10k > + * call_rcu() to hit rcutree.qhimark and force RCU to notice > + * the overload just ask RCU to hurry up. There could be many > + * objects in free_by_rcu list. > + * This hint reduces memory consumption for an artifical > + * benchmark from 2 Gbyte to 150 Mbyte. > + */ > + rcu_request_urgent_qs_task(current); > + return; > + } > + > + WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp)); > + > + WRITE_ONCE(c->waiting_for_gp.first, __llist_del_all(&c->free_by_rcu)); > + c->waiting_for_gp_tail = c->free_by_rcu_tail; > + call_rcu_hurry(&c->rcu, __free_by_rcu); > +} > + > static void bpf_mem_refill(struct irq_work *work) > { > struct bpf_mem_cache *c = container_of(work, struct bpf_mem_cache, refill_work); > @@ -354,6 +413,8 @@ static void bpf_mem_refill(struct irq_work *work) > alloc_bulk(c, c->batch, NUMA_NO_NODE); > else if (cnt > c->high_watermark) > free_bulk(c); > + > + check_free_by_rcu(c); > } > > static void notrace irq_work_raise(struct bpf_mem_cache *c) > @@ -482,6 +543,9 @@ static void drain_mem_cache(struct bpf_mem_cache *c) > free_all(llist_del_all(&c->waiting_for_gp_ttrace), percpu); > free_all(__llist_del_all(&c->free_llist), percpu); > free_all(__llist_del_all(&c->free_llist_extra), percpu); > + free_all(__llist_del_all(&c->free_by_rcu), percpu); > + free_all(__llist_del_all(&c->free_llist_extra_rcu), percpu); > + free_all(llist_del_all(&c->waiting_for_gp), percpu); > } > > static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma) > @@ -494,8 +558,8 @@ static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma) > > static void free_mem_alloc(struct bpf_mem_alloc *ma) > { > - /* waiting_for_gp_ttrace lists was drained, but __free_rcu might > - * still execute. Wait for it now before we freeing percpu caches. > + /* waiting_for_gp[_ttrace] lists were drained, but RCU callbacks > + * might still execute. Wait for them. > * > * rcu_barrier_tasks_trace() doesn't imply synchronize_rcu_tasks_trace(), > * but rcu_barrier_tasks_trace() and rcu_barrier() below are only used > @@ -504,9 +568,10 @@ static void free_mem_alloc(struct bpf_mem_alloc *ma) > * rcu_trace_implies_rcu_gp(), it will be OK to skip rcu_barrier() by > * using rcu_trace_implies_rcu_gp() as well. > */ > - rcu_barrier_tasks_trace(); > + rcu_barrier(); /* wait for __free_by_rcu() */ > + rcu_barrier_tasks_trace(); /* wait for __free_rcu() via call_rcu_tasks_trace */ Using rcu_barrier_tasks_trace and rcu_barrier() is not enough, the objects in c->free_by_rcu_ttrace may be leaked as shown below. We may need to add an extra variable to notify __free_by_rcu() to free these elements directly instead of trying to move it into waiting_for_gp_ttrace as I did before. Or we can just drain free_by_rcu_ttrace twice. 
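Before the scenario itself, a minimal sketch of the second option (draining free_by_rcu_ttrace once more). The helper name and its placement in free_mem_alloc() after the barriers are assumptions for illustration, not something posted in this thread:

/* Hypothetical second drain, run from free_mem_alloc() after
 * rcu_barrier()/rcu_barrier_tasks_trace(). By then __free_by_rcu() can no
 * longer re-add objects, so this catches anything that slipped back onto
 * free_by_rcu_ttrace after drain_mem_cache(). Only the ma->cache case is
 * shown.
 */
static void drain_free_by_rcu_ttrace_again(struct bpf_mem_alloc *ma)
{
	struct bpf_mem_cache *c;
	int cpu;

	for_each_possible_cpu(cpu) {
		c = per_cpu_ptr(ma->cache, cpu);
		free_all(llist_del_all(&c->free_by_rcu_ttrace), !!c->percpu_size);
	}
}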
destroy process                              __free_by_rcu

llist_del_all(&c->free_by_rcu_ttrace)

                                             // add to free_by_rcu_ttrace again
                                             llist_add_batch(..., &tgt->free_by_rcu_ttrace)
                                             do_call_rcu_ttrace()
                                             // call_rcu_ttrace_in_progress is 1, so xchg returns 1
                                             // and it will not be moved to waiting_for_gp_ttrace
                                             atomic_xchg(&c->call_rcu_ttrace_in_progress, 1)

// got 1
atomic_read(&c->call_rcu_ttrace_in_progress)

> if (!rcu_trace_implies_rcu_gp())
> - rcu_barrier();
> + rcu_barrier(); /* wait for __free_rcu() via call_rcu */
> free_mem_alloc_no_barrier(ma);
> }
>
> @@ -565,6 +630,7 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma)
> irq_work_sync(&c->refill_work);
> drain_mem_cache(c);
> rcu_in_progress += atomic_read(&c->call_rcu_ttrace_in_progress);
> + rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
> }
> /* objcg is the same across cpus */
> if (c->objcg)
> @@ -580,6 +646,7 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma)
> irq_work_sync(&c->refill_work);
> drain_mem_cache(c);
> rcu_in_progress += atomic_read(&c->call_rcu_ttrace_in_progress);
> + rcu_in_progress += atomic_read(&c->call_rcu_in_progress);

I got an oops in rcu_tasks_invoke_cbs() during stress testing, and it seems we
should do atomic_read(&call_rcu_in_progress) first and then
atomic_read(&call_rcu_ttrace_in_progress) to fix the problem. To prevent memory
reordering on a non-x86 host, a memory barrier (e.g., smp_mb__before_atomic())
is also needed between these two reads. Otherwise it is possible that there are
in-flight RCU callbacks which we don't wait for, as shown in the scenario below:

destroy process                              __free_by_rcu

// got 0
atomic_read(&c->call_rcu_ttrace_in_progress)

                                             do_call_rcu_ttrace()
                                             atomic_xchg(&c->call_rcu_ttrace_in_progress, 1)
                                             atomic_set(&c->call_rcu_in_progress, 0)

// also got 0
atomic_read(&c->call_rcu_in_progress)

The introduction of c->tgt makes the destroy procedure more complicated. Even
with the proposed fix above, there is still an oops in rcu_tasks_invoke_cbs()
and I think it happens as follows:

bpf_mem_alloc_destroy                        free_by_rcu for c1

// both in_progress counters are 0
read c0->call_rcu_in_progress
read c0->call_rcu_ttrace_in_progress

                                             // c1->tgt = c0
                                             // c1->call_rcu_in_progress == 1
                                             add c0->free_by_rcu_ttrace
                                             xchg(c0->call_rcu_ttrace_in_progress, 1)
                                             call_rcu_tasks_trace(c0)
                                             c1->call_rcu_in_progress = 0

// both in_progress counters are 0
read c1->call_rcu_in_progress
read c1->call_rcu_ttrace_in_progress
// BAD! There is still an in-flight tasks trace RCU callback
free_mem_alloc_no_barrier()

My original thought was to do "rcu_in_progress +=
atomic_read(&c->tgt->call_rcu_ttrace_in_progress)" after "rcu_in_progress +=
atomic_read(&c->call_rcu_ttrace_in_progress)" in bpf_mem_alloc_destroy(), but
c->tgt is changed in unit_free(), so c->tgt may have been changed to B after
the setting of A->call_rcu_ttrace_in_progress. I think we could read
c->call_rcu_ttrace_in_progress for all bpf_mem_cache again when the sum of
c->call_rcu_in_progress and c->call_rcu_ttrace_in_progress over all
bpf_mem_cache is 0, because when rcu_in_progress is zero, the setting of
c->tgt->call_rcu_ttrace_in_progress must have completed.
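A minimal sketch of that re-check idea; the helper below and its use from
bpf_mem_alloc_destroy() are assumptions for illustration, not part of the
posted patch (ma->caches would need the same treatment):

static bool all_rcu_cbs_done(struct bpf_mem_alloc *ma)
{
	struct bpf_mem_cache *c;
	int cpu, in_progress = 0;

	for_each_possible_cpu(cpu) {
		c = per_cpu_ptr(ma->cache, cpu);
		in_progress += atomic_read(&c->call_rcu_in_progress);
		in_progress += atomic_read(&c->call_rcu_ttrace_in_progress);
	}
	if (in_progress)
		return false;

	/* The first pass saw zero, i.e. every __free_by_rcu() has finished
	 * publishing into its tgt, so re-reading catches a tasks trace
	 * callback that was queued on another cache via c->tgt.
	 */
	for_each_possible_cpu(cpu) {
		c = per_cpu_ptr(ma->cache, cpu);
		in_progress += atomic_read(&c->call_rcu_ttrace_in_progress);
	}
	return in_progress == 0;
}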
> } > } > if (c->objcg) > @@ -664,6 +731,27 @@ static void notrace unit_free(struct bpf_mem_cache *c, void *ptr) > irq_work_raise(c); > } > > +static void notrace unit_free_rcu(struct bpf_mem_cache *c, void *ptr) > +{ > + struct llist_node *llnode = ptr - LLIST_NODE_SZ; > + unsigned long flags; > + > + c->tgt = *(struct bpf_mem_cache **)llnode; > + > + local_irq_save(flags); > + if (local_inc_return(&c->active) == 1) { > + if (__llist_add(llnode, &c->free_by_rcu)) > + c->free_by_rcu_tail = llnode; > + } else { > + llist_add(llnode, &c->free_llist_extra_rcu); > + } > + local_dec(&c->active); > + local_irq_restore(flags); > + > + if (!atomic_read(&c->call_rcu_in_progress)) > + irq_work_raise(c); > +} > + > /* Called from BPF program or from sys_bpf syscall. > * In both cases migration is disabled. > */ > @@ -697,6 +785,20 @@ void notrace bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr) > unit_free(this_cpu_ptr(ma->caches)->cache + idx, ptr); > } > > +void notrace bpf_mem_free_rcu(struct bpf_mem_alloc *ma, void *ptr) > +{ > + int idx; > + > + if (!ptr) > + return; > + > + idx = bpf_mem_cache_idx(ksize(ptr - LLIST_NODE_SZ)); > + if (idx < 0) > + return; > + > + unit_free_rcu(this_cpu_ptr(ma->caches)->cache + idx, ptr); > +} > + > void notrace *bpf_mem_cache_alloc(struct bpf_mem_alloc *ma) > { > void *ret; > @@ -713,6 +815,14 @@ void notrace bpf_mem_cache_free(struct bpf_mem_alloc *ma, void *ptr) > unit_free(this_cpu_ptr(ma->cache), ptr); > } > > +void notrace bpf_mem_cache_free_rcu(struct bpf_mem_alloc *ma, void *ptr) > +{ > + if (!ptr) > + return; > + > + unit_free_rcu(this_cpu_ptr(ma->cache), ptr); > +} > + > /* Directly does a kfree() without putting 'ptr' back to the free_llist > * for reuse and without waiting for a rcu_tasks_trace gp. > * The caller must first go through the rcu_tasks_trace gp for 'ptr'
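To make the intended use of the new helpers concrete, a caller-side sketch
(struct my_node, my_update() and the RCU-protected slot are made up for
illustration; only the fixed-size kmem_cache-style flavor is shown). Readers
keep dereferencing the old object under RCU, so its reuse must wait for a
regular RCU grace period, and its return to slab additionally for a tasks
trace grace period, which is the life cycle described in the commit message:

struct my_node {
	int key;
	u64 val;
};

/* Swap in a new element; old readers may still hold the previous pointer
 * under rcu_read_lock(), so it is freed with the _rcu flavor instead of
 * bpf_mem_cache_free(). Caller runs with migration disabled, as required
 * by bpf_mem_cache_alloc().
 */
static int my_update(struct bpf_mem_alloc *ma, struct my_node __rcu **slot,
		     int key, u64 val)
{
	struct my_node *new, *old;

	new = bpf_mem_cache_alloc(ma);	/* cache set up via bpf_mem_alloc_init(ma, sizeof(*new), false) */
	if (!new)
		return -ENOMEM;
	new->key = key;
	new->val = val;
	old = rcu_replace_pointer(*slot, new, true);
	if (old)
		bpf_mem_cache_free_rcu(ma, old);
	return 0;
}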
On Wed, Jun 21, 2023 at 08:26:30PM +0800, Hou Tao wrote:
> >  */
> > - rcu_barrier_tasks_trace();
> > + rcu_barrier(); /* wait for __free_by_rcu() */
> > + rcu_barrier_tasks_trace(); /* wait for __free_rcu() via call_rcu_tasks_trace */
> Using rcu_barrier_tasks_trace and rcu_barrier() is not enough, the
> objects in c->free_by_rcu_ttrace may be leaked as shown below. We may
> need to add an extra variable to notify __free_by_rcu() to free these
> elements directly instead of trying to move it into
> waiting_for_gp_ttrace as I did before. Or we can just drain
> free_by_rcu_ttrace twice.
>
> destroy process                              __free_by_rcu
>
> llist_del_all(&c->free_by_rcu_ttrace)
>
>                        // add to free_by_rcu_ttrace again
>                        llist_add_batch(..., &tgt->free_by_rcu_ttrace)
>                        do_call_rcu_ttrace()
>                        // call_rcu_ttrace_in_progress is 1, so xchg returns 1
>                        // and it will not be moved to waiting_for_gp_ttrace
>                        atomic_xchg(&c->call_rcu_ttrace_in_progress, 1)
>
> // got 1
> atomic_read(&c->call_rcu_ttrace_in_progress)

The formatting is off, but I think I got the idea.
Yes. It's a race.

> > rcu_in_progress += atomic_read(&c->call_rcu_ttrace_in_progress);
> > + rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
> I got an oops in rcu_tasks_invoke_cbs() during stress testing and it
> seems we should do atomic_read(&call_rcu_in_progress) first, then do
> atomic_read(&call_rcu_ttrace_in_progress) to fix the problem. And to

Yes, it's a race. As you found out yourself, changing the order won't fix it.

> The introduction of c->tgt makes the destroy procedure more complicated.
> Even with the proposed fix above, there is still an oops in
> rcu_tasks_invoke_cbs() and I think it happens as follows:

Right. That's another issue.

Please send the bench htab test and your special stress test,
so we can have a common ground to reason about.
Also please share your bench htab numbers before/after.

I'm thinking to fix the races in the following way.
Could you please test it with your stress test?

The idea is to set 'draining' first everywhere, so that it makes all rcu
callbacks a nop.
Then drain all link lists. At this point nothing races with them.
And then wait for a single rcu_barrier_tasks_trace() that will make sure
all callbacks are done. At this point the only thing they will do is
"if (c->draining) goto out;".
The barriers are needed so that the access to 'c' is not a UAF.

...

From e20782160166d4327c76b57af160c4973396e0d0 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Wed, 21 Jun 2023 20:11:33 -0700
Subject: [PATCH bpf-next] bpf: Fix races.
Signed-off-by: Alexei Starovoitov <ast@kernel.org> --- kernel/bpf/memalloc.c | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c index 4d1002e7b4b5..75c553b15deb 100644 --- a/kernel/bpf/memalloc.c +++ b/kernel/bpf/memalloc.c @@ -99,6 +99,7 @@ struct bpf_mem_cache { int low_watermark, high_watermark, batch; int percpu_size; struct bpf_mem_cache *tgt; + bool draining; /* list of objects to be freed after RCU GP */ struct llist_head free_by_rcu; @@ -264,7 +265,10 @@ static void __free_rcu(struct rcu_head *head) { struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu_ttrace); + if (unlikely(c->draining)) + goto out; free_all(llist_del_all(&c->waiting_for_gp_ttrace), !!c->percpu_size); +out: atomic_set(&c->call_rcu_ttrace_in_progress, 0); } @@ -353,8 +357,11 @@ static void __free_by_rcu(struct rcu_head *head) { struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu); struct bpf_mem_cache *tgt = c->tgt; - struct llist_node *llnode = llist_del_all(&c->waiting_for_gp); + struct llist_node *llnode; + if (unlikely(c->draining)) + goto out; + llnode = llist_del_all(&c->waiting_for_gp); if (!llnode) goto out; @@ -568,10 +575,9 @@ static void free_mem_alloc(struct bpf_mem_alloc *ma) * rcu_trace_implies_rcu_gp(), it will be OK to skip rcu_barrier() by * using rcu_trace_implies_rcu_gp() as well. */ - rcu_barrier(); /* wait for __free_by_rcu() */ - rcu_barrier_tasks_trace(); /* wait for __free_rcu() via call_rcu_tasks_trace */ + rcu_barrier_tasks_trace(); if (!rcu_trace_implies_rcu_gp()) - rcu_barrier(); /* wait for __free_rcu() via call_rcu */ + rcu_barrier(); free_mem_alloc_no_barrier(ma); } @@ -616,6 +622,10 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma) if (ma->cache) { rcu_in_progress = 0; + for_each_possible_cpu(cpu) { + c = per_cpu_ptr(ma->cache, cpu); + c->draining = true; + } for_each_possible_cpu(cpu) { c = per_cpu_ptr(ma->cache, cpu); /* @@ -639,6 +649,13 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma) } if (ma->caches) { rcu_in_progress = 0; + for_each_possible_cpu(cpu) { + cc = per_cpu_ptr(ma->caches, cpu); + for (i = 0; i < NUM_CACHES; i++) { + c = &cc->cache[i]; + c->draining = true; + } + } for_each_possible_cpu(cpu) { cc = per_cpu_ptr(ma->caches, cpu); for (i = 0; i < NUM_CACHES; i++) {
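Condensed, the teardown ordering the patch above aims for is roughly the
sketch below (ma->cache case only; the ma->caches loop, objcg handling and
the deferred-free path for in-flight callbacks are elided, so this
illustrates the ordering rather than the actual bpf_mem_alloc_destroy()):

static void destroy_ordering_sketch(struct bpf_mem_alloc *ma)
{
	struct bpf_mem_cache *c;
	int cpu;

	/* 1. make every pending __free_rcu()/__free_by_rcu() a nop */
	for_each_possible_cpu(cpu)
		per_cpu_ptr(ma->cache, cpu)->draining = true;

	/* 2. callbacks no longer touch the llists; drain them in place */
	for_each_possible_cpu(cpu) {
		c = per_cpu_ptr(ma->cache, cpu);
		irq_work_sync(&c->refill_work);
		drain_mem_cache(c);
	}

	/* 3. wait for in-flight callbacks so that freeing the percpu memory
	 * below does not leave them with a use-after-free 'c'
	 */
	rcu_barrier_tasks_trace();
	if (!rcu_trace_implies_rcu_gp())
		rcu_barrier();
	free_mem_alloc_no_barrier(ma);
}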
Hi, On 6/22/2023 11:23 AM, Alexei Starovoitov wrote: > On Wed, Jun 21, 2023 at 08:26:30PM +0800, Hou Tao wrote: >>> SNIP >>> rcu_in_progress += atomic_read(&c->call_rcu_ttrace_in_progress); >>> + rcu_in_progress += atomic_read(&c->call_rcu_in_progress); >> I got a oops in rcu_tasks_invoke_cbs() during stressing test and it >> seems we should do atomic_read(&call_rcu_in_progress) first, then do >> atomic_read(&call_rcu_ttrace_in_progress) to fix the problem. And to > yes. it's a race. As you find out yourself changing the order won't fix it. > >> The introduction of c->tgt make the destroy procedure more complicated. >> Even with the proposed fix above, there is still oops in >> rcu_tasks_invoke_cbs() and I think it happens as follows: > Right. That's another issue. > > Please send bench htab test and your special stress test, > so we can have a common ground to reason about. > Also please share your bench htab numbers before/after. Will send htab-mem benchmark next week and update the benchmark results accordingly. There is no peculiarity in my local stress test. I just hacked htab to use bpf_mem_cache_free_rcu() and ran multiple tests_maps, map_perf_test and htab_mem benchmark simultaneously. > > I'm thinking to fix the races in the following way. > Could you please test it with your stress test? > The idea is to set 'draining' first everywhere that it will make all rcu > callbacks a nop. > Then drain all link lists. At this point nothing races with them. > And then wait for single rcu_barrier_tasks_trace() that will make sure > all callbcaks done. At this point the only thing they will do is > if (c->draining) goto out; > The barriers are needed to make 'c' access not uaf. As I replied in the v2, I don't think the lockless checking of c->draining will work. And I will test it anyway. > > ... > > >From e20782160166d4327c76b57af160c4973396e0d0 Mon Sep 17 00:00:00 2001 > From: Alexei Starovoitov <ast@kernel.org> > Date: Wed, 21 Jun 2023 20:11:33 -0700 > Subject: [PATCH bpf-next] bpf: Fix races. > > Signed-off-by: Alexei Starovoitov <ast@kernel.org> > --- > kernel/bpf/memalloc.c | 25 +++++++++++++++++++++---- > 1 file changed, 21 insertions(+), 4 deletions(-) > > diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c > index 4d1002e7b4b5..75c553b15deb 100644 > --- a/kernel/bpf/memalloc.c > +++ b/kernel/bpf/memalloc.c > @@ -99,6 +99,7 @@ struct bpf_mem_cache { > int low_watermark, high_watermark, batch; > int percpu_size; > struct bpf_mem_cache *tgt; > + bool draining; > > /* list of objects to be freed after RCU GP */ > struct llist_head free_by_rcu; > @@ -264,7 +265,10 @@ static void __free_rcu(struct rcu_head *head) > { > struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu_ttrace); > > + if (unlikely(c->draining)) > + goto out; > free_all(llist_del_all(&c->waiting_for_gp_ttrace), !!c->percpu_size); > +out: > atomic_set(&c->call_rcu_ttrace_in_progress, 0); > } > > @@ -353,8 +357,11 @@ static void __free_by_rcu(struct rcu_head *head) > { > struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu); > struct bpf_mem_cache *tgt = c->tgt; > - struct llist_node *llnode = llist_del_all(&c->waiting_for_gp); > + struct llist_node *llnode; > > + if (unlikely(c->draining)) > + goto out; > + llnode = llist_del_all(&c->waiting_for_gp); > if (!llnode) > goto out; > > @@ -568,10 +575,9 @@ static void free_mem_alloc(struct bpf_mem_alloc *ma) > * rcu_trace_implies_rcu_gp(), it will be OK to skip rcu_barrier() by > * using rcu_trace_implies_rcu_gp() as well. 
> */ > - rcu_barrier(); /* wait for __free_by_rcu() */ > - rcu_barrier_tasks_trace(); /* wait for __free_rcu() via call_rcu_tasks_trace */ > + rcu_barrier_tasks_trace(); > if (!rcu_trace_implies_rcu_gp()) > - rcu_barrier(); /* wait for __free_rcu() via call_rcu */ > + rcu_barrier(); > free_mem_alloc_no_barrier(ma); > } > > @@ -616,6 +622,10 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma) > > if (ma->cache) { > rcu_in_progress = 0; > + for_each_possible_cpu(cpu) { > + c = per_cpu_ptr(ma->cache, cpu); > + c->draining = true; > + } > for_each_possible_cpu(cpu) { > c = per_cpu_ptr(ma->cache, cpu); > /* > @@ -639,6 +649,13 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma) > } > if (ma->caches) { > rcu_in_progress = 0; > + for_each_possible_cpu(cpu) { > + cc = per_cpu_ptr(ma->caches, cpu); > + for (i = 0; i < NUM_CACHES; i++) { > + c = &cc->cache[i]; > + c->draining = true; > + } > + } > for_each_possible_cpu(cpu) { > cc = per_cpu_ptr(ma->caches, cpu); > for (i = 0; i < NUM_CACHES; i++) {
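The stress setup described above ("hacked htab to use
bpf_mem_cache_free_rcu()") is not posted in this thread; a guess at what such
a hack could look like, following the shape of htab_elem_free() in
kernel/bpf/hashtab.c (percpu-value handling elided):

static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l)
{
	check_and_free_fields(htab, l);
	/* was: bpf_mem_cache_free(&htab->ma, l); */
	bpf_mem_cache_free_rcu(&htab->ma, l);
}

Running test_maps, map_perf_test and the htab_mem benchmark on top of a change
like this exercises the free_by_rcu -> waiting_for_gp -> free_by_rcu_ttrace
hand-off that the scenarios above are probing.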
diff --git a/include/linux/bpf_mem_alloc.h b/include/linux/bpf_mem_alloc.h index 3929be5743f4..d644bbb298af 100644 --- a/include/linux/bpf_mem_alloc.h +++ b/include/linux/bpf_mem_alloc.h @@ -27,10 +27,12 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma); /* kmalloc/kfree equivalent: */ void *bpf_mem_alloc(struct bpf_mem_alloc *ma, size_t size); void bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr); +void bpf_mem_free_rcu(struct bpf_mem_alloc *ma, void *ptr); /* kmem_cache_alloc/free equivalent: */ void *bpf_mem_cache_alloc(struct bpf_mem_alloc *ma); void bpf_mem_cache_free(struct bpf_mem_alloc *ma, void *ptr); +void bpf_mem_cache_free_rcu(struct bpf_mem_alloc *ma, void *ptr); void bpf_mem_cache_raw_free(void *ptr); void *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags); diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c index 10d027674743..4d1002e7b4b5 100644 --- a/kernel/bpf/memalloc.c +++ b/kernel/bpf/memalloc.c @@ -100,6 +100,15 @@ struct bpf_mem_cache { int percpu_size; struct bpf_mem_cache *tgt; + /* list of objects to be freed after RCU GP */ + struct llist_head free_by_rcu; + struct llist_node *free_by_rcu_tail; + struct llist_head waiting_for_gp; + struct llist_node *waiting_for_gp_tail; + struct rcu_head rcu; + atomic_t call_rcu_in_progress; + struct llist_head free_llist_extra_rcu; + /* list of objects to be freed after RCU tasks trace GP */ struct llist_head free_by_rcu_ttrace; struct llist_node *free_by_rcu_ttrace_tail; @@ -340,6 +349,56 @@ static void free_bulk(struct bpf_mem_cache *c) do_call_rcu_ttrace(tgt); } +static void __free_by_rcu(struct rcu_head *head) +{ + struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu); + struct bpf_mem_cache *tgt = c->tgt; + struct llist_node *llnode = llist_del_all(&c->waiting_for_gp); + + if (!llnode) + goto out; + + if (llist_add_batch(llnode, c->waiting_for_gp_tail, &tgt->free_by_rcu_ttrace)) + tgt->free_by_rcu_ttrace_tail = c->waiting_for_gp_tail; + + /* Objects went through regular RCU GP. Send them to RCU tasks trace */ + do_call_rcu_ttrace(tgt); +out: + atomic_set(&c->call_rcu_in_progress, 0); +} + +static void check_free_by_rcu(struct bpf_mem_cache *c) +{ + struct llist_node *llnode, *t; + + if (llist_empty(&c->free_by_rcu) && llist_empty(&c->free_llist_extra_rcu)) + return; + + /* drain free_llist_extra_rcu */ + llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist_extra_rcu)) + if (__llist_add(llnode, &c->free_by_rcu)) + c->free_by_rcu_tail = llnode; + + if (atomic_xchg(&c->call_rcu_in_progress, 1)) { + /* + * Instead of kmalloc-ing new rcu_head and triggering 10k + * call_rcu() to hit rcutree.qhimark and force RCU to notice + * the overload just ask RCU to hurry up. There could be many + * objects in free_by_rcu list. + * This hint reduces memory consumption for an artifical + * benchmark from 2 Gbyte to 150 Mbyte. 
+ */ + rcu_request_urgent_qs_task(current); + return; + } + + WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp)); + + WRITE_ONCE(c->waiting_for_gp.first, __llist_del_all(&c->free_by_rcu)); + c->waiting_for_gp_tail = c->free_by_rcu_tail; + call_rcu_hurry(&c->rcu, __free_by_rcu); +} + static void bpf_mem_refill(struct irq_work *work) { struct bpf_mem_cache *c = container_of(work, struct bpf_mem_cache, refill_work); @@ -354,6 +413,8 @@ static void bpf_mem_refill(struct irq_work *work) alloc_bulk(c, c->batch, NUMA_NO_NODE); else if (cnt > c->high_watermark) free_bulk(c); + + check_free_by_rcu(c); } static void notrace irq_work_raise(struct bpf_mem_cache *c) @@ -482,6 +543,9 @@ static void drain_mem_cache(struct bpf_mem_cache *c) free_all(llist_del_all(&c->waiting_for_gp_ttrace), percpu); free_all(__llist_del_all(&c->free_llist), percpu); free_all(__llist_del_all(&c->free_llist_extra), percpu); + free_all(__llist_del_all(&c->free_by_rcu), percpu); + free_all(__llist_del_all(&c->free_llist_extra_rcu), percpu); + free_all(llist_del_all(&c->waiting_for_gp), percpu); } static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma) @@ -494,8 +558,8 @@ static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma) static void free_mem_alloc(struct bpf_mem_alloc *ma) { - /* waiting_for_gp_ttrace lists was drained, but __free_rcu might - * still execute. Wait for it now before we freeing percpu caches. + /* waiting_for_gp[_ttrace] lists were drained, but RCU callbacks + * might still execute. Wait for them. * * rcu_barrier_tasks_trace() doesn't imply synchronize_rcu_tasks_trace(), * but rcu_barrier_tasks_trace() and rcu_barrier() below are only used @@ -504,9 +568,10 @@ static void free_mem_alloc(struct bpf_mem_alloc *ma) * rcu_trace_implies_rcu_gp(), it will be OK to skip rcu_barrier() by * using rcu_trace_implies_rcu_gp() as well. */ - rcu_barrier_tasks_trace(); + rcu_barrier(); /* wait for __free_by_rcu() */ + rcu_barrier_tasks_trace(); /* wait for __free_rcu() via call_rcu_tasks_trace */ if (!rcu_trace_implies_rcu_gp()) - rcu_barrier(); + rcu_barrier(); /* wait for __free_rcu() via call_rcu */ free_mem_alloc_no_barrier(ma); } @@ -565,6 +630,7 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma) irq_work_sync(&c->refill_work); drain_mem_cache(c); rcu_in_progress += atomic_read(&c->call_rcu_ttrace_in_progress); + rcu_in_progress += atomic_read(&c->call_rcu_in_progress); } /* objcg is the same across cpus */ if (c->objcg) @@ -580,6 +646,7 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma) irq_work_sync(&c->refill_work); drain_mem_cache(c); rcu_in_progress += atomic_read(&c->call_rcu_ttrace_in_progress); + rcu_in_progress += atomic_read(&c->call_rcu_in_progress); } } if (c->objcg) @@ -664,6 +731,27 @@ static void notrace unit_free(struct bpf_mem_cache *c, void *ptr) irq_work_raise(c); } +static void notrace unit_free_rcu(struct bpf_mem_cache *c, void *ptr) +{ + struct llist_node *llnode = ptr - LLIST_NODE_SZ; + unsigned long flags; + + c->tgt = *(struct bpf_mem_cache **)llnode; + + local_irq_save(flags); + if (local_inc_return(&c->active) == 1) { + if (__llist_add(llnode, &c->free_by_rcu)) + c->free_by_rcu_tail = llnode; + } else { + llist_add(llnode, &c->free_llist_extra_rcu); + } + local_dec(&c->active); + local_irq_restore(flags); + + if (!atomic_read(&c->call_rcu_in_progress)) + irq_work_raise(c); +} + /* Called from BPF program or from sys_bpf syscall. * In both cases migration is disabled. 
*/ @@ -697,6 +785,20 @@ void notrace bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr) unit_free(this_cpu_ptr(ma->caches)->cache + idx, ptr); } +void notrace bpf_mem_free_rcu(struct bpf_mem_alloc *ma, void *ptr) +{ + int idx; + + if (!ptr) + return; + + idx = bpf_mem_cache_idx(ksize(ptr - LLIST_NODE_SZ)); + if (idx < 0) + return; + + unit_free_rcu(this_cpu_ptr(ma->caches)->cache + idx, ptr); +} + void notrace *bpf_mem_cache_alloc(struct bpf_mem_alloc *ma) { void *ret; @@ -713,6 +815,14 @@ void notrace bpf_mem_cache_free(struct bpf_mem_alloc *ma, void *ptr) unit_free(this_cpu_ptr(ma->cache), ptr); } +void notrace bpf_mem_cache_free_rcu(struct bpf_mem_alloc *ma, void *ptr) +{ + if (!ptr) + return; + + unit_free_rcu(this_cpu_ptr(ma->cache), ptr); +} + /* Directly does a kfree() without putting 'ptr' back to the free_llist * for reuse and without waiting for a rcu_tasks_trace gp. * The caller must first go through the rcu_tasks_trace gp for 'ptr'