
[v2,bpf-next,01/12] bpf: Introduce any context BPF specific memory allocator.

Message ID 20220817210419.95560-2-alexei.starovoitov@gmail.com (mailing list archive)
State New
Series bpf: BPF specific memory allocator.

Commit Message

Alexei Starovoitov Aug. 17, 2022, 9:04 p.m. UTC
From: Alexei Starovoitov <ast@kernel.org>

Tracing BPF programs can attach to kprobe and fentry. Hence they
run in unknown context where calling plain kmalloc() might not be safe.

Front-end kmalloc() with minimal per-cpu cache of free elements.
Refill this cache asynchronously from irq_work.

BPF programs always run with migration disabled.
It's safe to allocate from cache of the current cpu with irqs disabled.
Free-ing is always done into bucket of the current cpu as well.
irq_work trims extra free elements from buckets with kfree
and refills them with kmalloc, so global kmalloc logic takes care
of freeing objects allocated by one cpu and freed on another.

struct bpf_mem_alloc supports two modes:
- When size != 0 create kmem_cache and bpf_mem_cache for each cpu.
  This is typical bpf hash map use case when all elements have equal size.
- When size == 0 allocate 11 bpf_mem_cache-s for each cpu, then rely on
  kmalloc/kfree. Max allocation size is 4096 in this case.
  This is bpf_dynptr and bpf_kptr use case.

bpf_mem_alloc/bpf_mem_free are bpf specific 'wrappers' of kmalloc/kfree.
bpf_mem_cache_alloc/bpf_mem_cache_free are 'wrappers' of kmem_cache_alloc/kmem_cache_free.

The allocators are NMI-safe from bpf programs only. They are not NMI-safe in general.
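
Illustrative usage sketch (not part of this patch; 'ma', 'ma_any', 'elem_size'
and the surrounding variables are made up, only the bpf_mem_alloc_* calls are
from the new API):

  struct bpf_mem_alloc ma, ma_any;

  /* fixed-size mode: one kmem_cache, every alloc returns an elem_size object */
  err = bpf_mem_alloc_init(&ma, elem_size);
  obj = bpf_mem_cache_alloc(&ma);      /* safe in any context a bpf prog runs in */
  if (obj)
          bpf_mem_cache_free(&ma, obj);
  bpf_mem_alloc_destroy(&ma);

  /* variable-size mode: size == 0 at init, then kmalloc-style calls up to 4096 */
  err = bpf_mem_alloc_init(&ma_any, 0);
  ptr = bpf_mem_alloc(&ma_any, size);
  if (ptr)
          bpf_mem_free(&ma_any, ptr);
  bpf_mem_alloc_destroy(&ma_any);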

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_mem_alloc.h |  26 ++
 kernel/bpf/Makefile           |   2 +-
 kernel/bpf/memalloc.c         | 526 ++++++++++++++++++++++++++++++++++
 3 files changed, 553 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/bpf_mem_alloc.h
 create mode 100644 kernel/bpf/memalloc.c

Comments

Kumar Kartikeya Dwivedi Aug. 17, 2022, 11:51 p.m. UTC | #1
On Wed, 17 Aug 2022 at 23:04, Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> From: Alexei Starovoitov <ast@kernel.org>
>
> Tracing BPF programs can attach to kprobe and fentry. Hence they
> run in unknown context where calling plain kmalloc() might not be safe.
>
> Front-end kmalloc() with minimal per-cpu cache of free elements.
> Refill this cache asynchronously from irq_work.
>
> BPF programs always run with migration disabled.
> It's safe to allocate from cache of the current cpu with irqs disabled.
> Free-ing is always done into bucket of the current cpu as well.
> irq_work trims extra free elements from buckets with kfree
> and refills them with kmalloc, so global kmalloc logic takes care
> of freeing objects allocated by one cpu and freed on another.
>
> struct bpf_mem_alloc supports two modes:
> - When size != 0 create kmem_cache and bpf_mem_cache for each cpu.
>   This is typical bpf hash map use case when all elements have equal size.
> - When size == 0 allocate 11 bpf_mem_cache-s for each cpu, then rely on
>   kmalloc/kfree. Max allocation size is 4096 in this case.
>   This is bpf_dynptr and bpf_kptr use case.
>
> bpf_mem_alloc/bpf_mem_free are bpf specific 'wrappers' of kmalloc/kfree.
> bpf_mem_cache_alloc/bpf_mem_cache_free are 'wrappers' of kmem_cache_alloc/kmem_cache_free.
>
> The allocators are NMI-safe from bpf programs only. They are not NMI-safe in general.
>
> Signed-off-by: Alexei Starovoitov <ast@kernel.org>
> ---
>  include/linux/bpf_mem_alloc.h |  26 ++
>  kernel/bpf/Makefile           |   2 +-
>  kernel/bpf/memalloc.c         | 526 ++++++++++++++++++++++++++++++++++
>  3 files changed, 553 insertions(+), 1 deletion(-)
>  create mode 100644 include/linux/bpf_mem_alloc.h
>  create mode 100644 kernel/bpf/memalloc.c
>
> diff --git a/include/linux/bpf_mem_alloc.h b/include/linux/bpf_mem_alloc.h
> new file mode 100644
> index 000000000000..804733070f8d
> --- /dev/null
> +++ b/include/linux/bpf_mem_alloc.h
> @@ -0,0 +1,26 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */
> +#ifndef _BPF_MEM_ALLOC_H
> +#define _BPF_MEM_ALLOC_H
> +#include <linux/compiler_types.h>
> +
> +struct bpf_mem_cache;
> +struct bpf_mem_caches;
> +
> +struct bpf_mem_alloc {
> +       struct bpf_mem_caches __percpu *caches;
> +       struct bpf_mem_cache __percpu *cache;
> +};
> +
> +int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size);
> +void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma);
> +
> +/* kmalloc/kfree equivalent: */
> +void *bpf_mem_alloc(struct bpf_mem_alloc *ma, size_t size);
> +void bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr);
> +
> +/* kmem_cache_alloc/free equivalent: */
> +void *bpf_mem_cache_alloc(struct bpf_mem_alloc *ma);
> +void bpf_mem_cache_free(struct bpf_mem_alloc *ma, void *ptr);
> +
> +#endif /* _BPF_MEM_ALLOC_H */
> diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
> index 057ba8e01e70..11fb9220909b 100644
> --- a/kernel/bpf/Makefile
> +++ b/kernel/bpf/Makefile
> @@ -13,7 +13,7 @@ obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o bpf_task_storage.o
>  obj-${CONFIG_BPF_LSM}    += bpf_inode_storage.o
>  obj-$(CONFIG_BPF_SYSCALL) += disasm.o
>  obj-$(CONFIG_BPF_JIT) += trampoline.o
> -obj-$(CONFIG_BPF_SYSCALL) += btf.o
> +obj-$(CONFIG_BPF_SYSCALL) += btf.o memalloc.o
>  obj-$(CONFIG_BPF_JIT) += dispatcher.o
>  ifeq ($(CONFIG_NET),y)
>  obj-$(CONFIG_BPF_SYSCALL) += devmap.o
> diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
> new file mode 100644
> index 000000000000..8de268922380
> --- /dev/null
> +++ b/kernel/bpf/memalloc.c
> @@ -0,0 +1,526 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */
> +#include <linux/mm.h>
> +#include <linux/llist.h>
> +#include <linux/bpf.h>
> +#include <linux/irq_work.h>
> +#include <linux/bpf_mem_alloc.h>
> +#include <linux/memcontrol.h>
> +
> +/* Any context (including NMI) BPF specific memory allocator.
> + *
> + * Tracing BPF programs can attach to kprobe and fentry. Hence they
> + * run in unknown context where calling plain kmalloc() might not be safe.
> + *
> + * Front-end kmalloc() with per-cpu per-bucket cache of free elements.
> + * Refill this cache asynchronously from irq_work.
> + *
> + * CPU_0 buckets
> + * 16 32 64 96 128 192 256 512 1024 2048 4096
> + * ...
> + * CPU_N buckets
> + * 16 32 64 96 128 192 256 512 1024 2048 4096
> + *
> + * The buckets are prefilled at the start.
> + * BPF programs always run with migration disabled.
> + * It's safe to allocate from cache of the current cpu with irqs disabled.
> + * Free-ing is always done into bucket of the current cpu as well.
> + * irq_work trims extra free elements from buckets with kfree
> + * and refills them with kmalloc, so global kmalloc logic takes care
> + * of freeing objects allocated by one cpu and freed on another.
> + *
> + * Every allocated object is padded with an extra 8 bytes that contain
> + * a struct llist_node.
> + */
> +#define LLIST_NODE_SZ sizeof(struct llist_node)
> +
> +/* similar to kmalloc, but sizeof == 8 bucket is gone */
> +static u8 size_index[24] __ro_after_init = {
> +       3,      /* 8 */
> +       3,      /* 16 */
> +       4,      /* 24 */
> +       4,      /* 32 */
> +       5,      /* 40 */
> +       5,      /* 48 */
> +       5,      /* 56 */
> +       5,      /* 64 */
> +       1,      /* 72 */
> +       1,      /* 80 */
> +       1,      /* 88 */
> +       1,      /* 96 */
> +       6,      /* 104 */
> +       6,      /* 112 */
> +       6,      /* 120 */
> +       6,      /* 128 */
> +       2,      /* 136 */
> +       2,      /* 144 */
> +       2,      /* 152 */
> +       2,      /* 160 */
> +       2,      /* 168 */
> +       2,      /* 176 */
> +       2,      /* 184 */
> +       2       /* 192 */
> +};
> +
> +static int bpf_mem_cache_idx(size_t size)
> +{
> +       if (!size || size > 4096)
> +               return -1;
> +
> +       if (size <= 192)
> +               return size_index[(size - 1) / 8] - 1;
> +
> +       return fls(size - 1) - 1;
> +}
> +
> +#define NUM_CACHES 11
> +
> +struct bpf_mem_cache {
> +       /* per-cpu list of free objects of size 'unit_size'.
> +        * All accesses are done with preemption disabled
> +        * with __llist_add() and __llist_del_first().
> +        */
> +       struct llist_head free_llist;
> +
> +       /* NMI only free list.
> +        * All accesses are NMI-safe llist_add() and llist_del_first().
> +        *
> +        * Each allocated object is either on free_llist or on free_llist_nmi.
> +        * One cpu can allocate it from NMI by doing llist_del_first() from
> +        * free_llist_nmi, while another might free it back from non-NMI by
> +        * doing llist_add() into free_llist.
> +        */
> +       struct llist_head free_llist_nmi;
> +
> +       /* kmem_cache != NULL when bpf_mem_alloc was created for specific
> +        * element size.
> +        */
> +       struct kmem_cache *kmem_cache;
> +       struct irq_work refill_work;
> +       struct obj_cgroup *objcg;
> +       int unit_size;
> +       /* count of objects in free_llist */
> +       int free_cnt;
> +       /* count of objects in free_llist_nmi */
> +       atomic_t free_cnt_nmi;
> +       /* flag to refill nmi list too */
> +       bool refill_nmi_list;
> +};
> +
> +struct bpf_mem_caches {
> +       struct bpf_mem_cache cache[NUM_CACHES];
> +};
> +
> +static struct llist_node notrace *__llist_del_first(struct llist_head *head)
> +{
> +       struct llist_node *entry, *next;
> +
> +       entry = head->first;
> +       if (!entry)
> +               return NULL;
> +       next = entry->next;
> +       head->first = next;
> +       return entry;
> +}
> +
> +#define BATCH 48
> +#define LOW_WATERMARK 32
> +#define HIGH_WATERMARK 96
> +/* Assuming the average number of elements per bucket is 64 and the system
> + * has 32 cpus, when all buckets are used the total memory will be:
> + * 64*16*32 + 64*32*32 + 64*64*32 + ... + 64*4096*32 ~ 20Mbyte
> + */
> +
> +/* extra macro useful for testing by randomizing in_nmi condition */
> +#define bpf_in_nmi() in_nmi()
> +
> +static void *__alloc(struct bpf_mem_cache *c, int node)
> +{
> +       /* Allocate, but don't deplete atomic reserves that typical
> +        * GFP_ATOMIC would do. irq_work runs on this cpu and kmalloc
> +        * will allocate from the current numa node which is what we
> +        * want here.
> +        */
> +       gfp_t flags = GFP_NOWAIT | __GFP_NOWARN | __GFP_ACCOUNT;
> +
> +       if (c->kmem_cache)
> +               return kmem_cache_alloc_node(c->kmem_cache, flags, node);
> +
> +       return kmalloc_node(c->unit_size, flags, node);
> +}
> +
> +static struct mem_cgroup *get_memcg(const struct bpf_mem_cache *c)
> +{
> +#ifdef CONFIG_MEMCG_KMEM
> +       if (c->objcg)
> +               return get_mem_cgroup_from_objcg(c->objcg);
> +#endif
> +
> +#ifdef CONFIG_MEMCG
> +       return root_mem_cgroup;
> +#else
> +       return NULL;
> +#endif
> +}
> +
> +/* Mostly runs from irq_work except __init phase. */
> +static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node)
> +{
> +       struct mem_cgroup *memcg = NULL, *old_memcg;
> +       unsigned long flags;
> +       void *obj;
> +       int i;
> +
> +       memcg = get_memcg(c);
> +       old_memcg = set_active_memcg(memcg);
> +       for (i = 0; i < cnt; i++) {
> +               obj = __alloc(c, node);
> +               if (!obj)
> +                       break;
> +               if (IS_ENABLED(CONFIG_PREEMPT_RT))
> +                       /* In RT irq_work runs in per-cpu kthread, so we have
> +                        * to disable interrupts to avoid race with
> +                        * bpf_mem_alloc/free. Note the read of free_cnt in
> +                        * bpf_mem_refill is racy in RT. It's ok to do.
> +                        */
> +                       local_irq_save(flags);
> +               __llist_add(obj, &c->free_llist);
> +               c->free_cnt++;
> +               if (IS_ENABLED(CONFIG_PREEMPT_RT))
> +                       local_irq_restore(flags);
> +       }
> +       set_active_memcg(old_memcg);
> +       mem_cgroup_put(memcg);
> +}
> +
> +/* Refill NMI specific llist. Mostly runs from irq_work except __init phase. */
> +static void alloc_bulk_nmi(struct bpf_mem_cache *c, int cnt, int node)
> +{
> +       struct mem_cgroup *memcg = NULL, *old_memcg;
> +       void *obj;
> +       int i;
> +
> +       memcg = get_memcg(c);
> +       old_memcg = set_active_memcg(memcg);
> +       for (i = 0; i < cnt; i++) {
> +               obj = __alloc(c, node);
> +               if (!obj)
> +                       break;
> +               llist_add(obj, &c->free_llist_nmi);
> +               atomic_inc(&c->free_cnt_nmi);
> +       }
> +       set_active_memcg(old_memcg);
> +       mem_cgroup_put(memcg);
> +}
> +
> +static void free_one(struct bpf_mem_cache *c, void *obj)
> +{
> +       if (c->kmem_cache)
> +               kmem_cache_free(c->kmem_cache, obj);
> +       else
> +               kfree(obj);
> +}
> +
> +static void free_bulk(struct bpf_mem_cache *c)
> +{
> +       struct llist_node *llnode;
> +       unsigned long flags;
> +       int cnt;
> +
> +       do {
> +               if (IS_ENABLED(CONFIG_PREEMPT_RT))
> +                       local_irq_save(flags);
> +               llnode = __llist_del_first(&c->free_llist);
> +               if (llnode)
> +                       cnt = --c->free_cnt;
> +               else
> +                       cnt = 0;
> +               if (IS_ENABLED(CONFIG_PREEMPT_RT))
> +                       local_irq_restore(flags);
> +               free_one(c, llnode);
> +       } while (cnt > (HIGH_WATERMARK + LOW_WATERMARK) / 2);
> +}
> +
> +static void free_bulk_nmi(struct bpf_mem_cache *c)
> +{
> +       struct llist_node *llnode;
> +       int cnt;
> +
> +       do {
> +               llnode = llist_del_first(&c->free_llist_nmi);
> +               if (llnode)
> +                       cnt = atomic_dec_return(&c->free_cnt_nmi);
> +               else
> +                       cnt = 0;
> +               free_one(c, llnode);
> +       } while (cnt > (HIGH_WATERMARK + LOW_WATERMARK) / 2);
> +}
> +
> +static void bpf_mem_refill(struct irq_work *work)
> +{
> +       struct bpf_mem_cache *c = container_of(work, struct bpf_mem_cache, refill_work);
> +       int cnt;
> +
> +       cnt = c->free_cnt;
> +       if (cnt < LOW_WATERMARK)
> +               /* irq_work runs on this cpu and kmalloc will allocate
> +                * from the current numa node which is what we want here.
> +                */
> +               alloc_bulk(c, BATCH, NUMA_NO_NODE);
> +       else if (cnt > HIGH_WATERMARK)
> +               free_bulk(c);
> +
> +       if (!c->refill_nmi_list)
> +               /* don't refill NMI specific freelist
> +                * until alloc/free from NMI.
> +                */
> +               return;
> +       cnt = atomic_read(&c->free_cnt_nmi);
> +       if (cnt < LOW_WATERMARK)
> +               alloc_bulk_nmi(c, BATCH, NUMA_NO_NODE);
> +       else if (cnt > HIGH_WATERMARK)
> +               free_bulk_nmi(c);
> +       c->refill_nmi_list = false;
> +}
> +
> +static void notrace irq_work_raise(struct bpf_mem_cache *c, bool in_nmi)
> +{
> +       if (in_nmi)
> +               /* Raise the flag only if in_nmi. Cannot assign it
> +                * unconditionally since subsequent non-nmi irq_work_raise
> +                * might clear it.
> +                */
> +               c->refill_nmi_list = in_nmi;
> +       irq_work_queue(&c->refill_work);
> +}
> +
> +static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu)
> +{
> +       init_irq_work(&c->refill_work, bpf_mem_refill);
> +       /* To avoid consuming memory assume that 1st run of bpf
> +        * prog won't be doing more than 4 map_update_elem from
> +        * irq disabled region
> +        */
> +       alloc_bulk(c, c->unit_size < 256 ? 4 : 1, cpu_to_node(cpu));
> +
> +       /* NMI progs are rare. Assume they have one map_update
> +        * per prog at the very beginning.
> +        */
> +       alloc_bulk_nmi(c, 1, cpu_to_node(cpu));
> +}
> +
> +/* When size != 0 create kmem_cache and bpf_mem_cache for each cpu.
> + * This is typical bpf hash map use case when all elements have equal size.
> + *
> + * When size == 0 allocate 11 bpf_mem_cache-s for each cpu, then rely on
> + * kmalloc/kfree. Max allocation size is 4096 in this case.
> + * This is bpf_dynptr and bpf_kptr use case.
> + */
> +int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size)
> +{
> +       static u16 sizes[NUM_CACHES] = {96, 192, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096};
> +       struct bpf_mem_caches *cc, __percpu *pcc;
> +       struct bpf_mem_cache *c, __percpu *pc;
> +       struct kmem_cache *kmem_cache;
> +       struct obj_cgroup *objcg = NULL;
> +       char buf[32];
> +       int cpu, i;
> +
> +       if (size) {
> +               pc = __alloc_percpu_gfp(sizeof(*pc), 8, GFP_KERNEL);
> +               if (!pc)
> +                       return -ENOMEM;
> +               size += LLIST_NODE_SZ; /* room for llist_node */
> +               snprintf(buf, sizeof(buf), "bpf-%u", size);
> +               kmem_cache = kmem_cache_create(buf, size, 8, 0, NULL);
> +               if (!kmem_cache) {
> +                       free_percpu(pc);
> +                       return -ENOMEM;
> +               }
> +#ifdef CONFIG_MEMCG_KMEM
> +               objcg = get_obj_cgroup_from_current();
> +#endif
> +               for_each_possible_cpu(cpu) {
> +                       c = per_cpu_ptr(pc, cpu);
> +                       c->kmem_cache = kmem_cache;
> +                       c->unit_size = size;
> +                       c->objcg = objcg;
> +                       prefill_mem_cache(c, cpu);
> +               }
> +               ma->cache = pc;
> +               return 0;
> +       }
> +
> +       pcc = __alloc_percpu_gfp(sizeof(*cc), 8, GFP_KERNEL);
> +       if (!pcc)
> +               return -ENOMEM;
> +#ifdef CONFIG_MEMCG_KMEM
> +       objcg = get_obj_cgroup_from_current();
> +#endif
> +       for_each_possible_cpu(cpu) {
> +               cc = per_cpu_ptr(pcc, cpu);
> +               for (i = 0; i < NUM_CACHES; i++) {
> +                       c = &cc->cache[i];
> +                       c->unit_size = sizes[i];
> +                       c->objcg = objcg;
> +                       prefill_mem_cache(c, cpu);
> +               }
> +       }
> +       ma->caches = pcc;
> +       return 0;
> +}
> +
> +static void drain_mem_cache(struct bpf_mem_cache *c)
> +{
> +       struct llist_node *llnode;
> +
> +       while ((llnode = llist_del_first(&c->free_llist_nmi)))
> +               free_one(c, llnode);
> +       while ((llnode = __llist_del_first(&c->free_llist)))
> +               free_one(c, llnode);
> +}
> +
> +void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma)
> +{
> +       struct bpf_mem_caches *cc;
> +       struct bpf_mem_cache *c;
> +       int cpu, i;
> +
> +       if (ma->cache) {
> +               for_each_possible_cpu(cpu) {
> +                       c = per_cpu_ptr(ma->cache, cpu);
> +                       drain_mem_cache(c);
> +               }
> +               /* kmem_cache and memcg are the same across cpus */
> +               kmem_cache_destroy(c->kmem_cache);
> +               if (c->objcg)
> +                       obj_cgroup_put(c->objcg);
> +               free_percpu(ma->cache);
> +               ma->cache = NULL;
> +       }
> +       if (ma->caches) {
> +               for_each_possible_cpu(cpu) {
> +                       cc = per_cpu_ptr(ma->caches, cpu);
> +                       for (i = 0; i < NUM_CACHES; i++) {
> +                               c = &cc->cache[i];
> +                               drain_mem_cache(c);
> +                       }
> +               }
> +               if (c->objcg)
> +                       obj_cgroup_put(c->objcg);
> +               free_percpu(ma->caches);
> +               ma->caches = NULL;
> +       }
> +}
> +
> +/* notrace is necessary here and in other functions to make sure
> + * bpf programs cannot attach to them and cause llist corruptions.
> + */
> +static void notrace *unit_alloc(struct bpf_mem_cache *c)
> +{
> +       bool in_nmi = bpf_in_nmi();
> +       struct llist_node *llnode;
> +       unsigned long flags;
> +       int cnt = 0;
> +
> +       if (unlikely(in_nmi)) {
> +               llnode = llist_del_first(&c->free_llist_nmi);
> +               if (llnode)
> +                       cnt = atomic_dec_return(&c->free_cnt_nmi);

I am trying to understand which case this
atomic_dec_return/atomic_inc_return protects against in the in_nmi
branch of unit_alloc/unit_free. Is it protecting against a nested NMI
BPF prog interrupting an NMI prog?

In case of perf it seems we use bpf_prog_active, so nested NMI prog
won't be invoked while we are interrupted inside a BPF program in NMI
context. What are the other cases that might cause reentrancy in this
branch such that we need atomics instead of c->free_cnt_nmi--? Or are
you anticipating you might allow this in the future even if it is
disallowed for now?

If programs are allowed to stack like this, and we try to reason about
the safety of llist_del_first operation, the code is:

struct llist_node *llist_del_first(struct llist_head *head)
{
     struct llist_node *entry, *old_entry, *next;

     entry = smp_load_acquire(&head->first);
     for (;;) {
         if (entry == NULL)
             return NULL;
         old_entry = entry;
         next = READ_ONCE(entry->next);
>>>>>>>> Suppose nested NMI comes at this point and BPF prog is invoked.
         entry = cmpxchg(&head->first, old_entry, next);
         if (entry == old_entry)
             break;
     }
     return entry;
}

Assume the current nmi free llist is HEAD -> A -> B -> C -> D -> ...
For our cmpxchg, parameters are going to be cmpxchg(&head->first, A, B);

Now, the nested NMI prog does unit_alloc thrice. This does llist_del_first thrice.
This makes nmi free llist HEAD -> D -> ...
A, B, C are allocated in prog.
Now it does unit_free of all three, but in order of B, C, A.
unit_free does llist_add, nmi free llist becomes HEAD -> A -> C -> B -> D -> ...

Nested NMI prog exits.
We continue with our cmpxchg(&head->first, A, B); It succeeds, A is
returned, but C will be leaked.

> +       } else {
> +               /* Disable irqs to prevent the following race:
> +                * bpf_prog_A
> +                *   bpf_mem_alloc
> +                *      preemption or irq -> bpf_prog_B
> +                *        bpf_mem_alloc
> +                */
> +               local_irq_save(flags);
> +               llnode = __llist_del_first(&c->free_llist);
> +               if (llnode)
> +                       cnt = --c->free_cnt;
> +               local_irq_restore(flags);
> +       }
> +       WARN_ON(cnt < 0);
> +
> +       if (cnt < LOW_WATERMARK)
> +               irq_work_raise(c, in_nmi);
> +       return llnode;
> +}
> +
> +/* Though 'ptr' object could have been allocated on a different cpu
> + * add it to the free_llist of the current cpu.
> + * Let kfree() logic deal with it when it's later called from irq_work.
> + */
> +static void notrace unit_free(struct bpf_mem_cache *c, void *ptr)
> +{
> +       struct llist_node *llnode = ptr - LLIST_NODE_SZ;
> +       bool in_nmi = bpf_in_nmi();
> +       unsigned long flags;
> +       int cnt;
> +
> +       BUILD_BUG_ON(LLIST_NODE_SZ > 8);
> +
> +       if (unlikely(in_nmi)) {
> +               llist_add(llnode, &c->free_llist_nmi);
> +               cnt = atomic_inc_return(&c->free_cnt_nmi);
> +       } else {
> +               local_irq_save(flags);
> +               __llist_add(llnode, &c->free_llist);
> +               cnt = ++c->free_cnt;
> +               local_irq_restore(flags);
> +       }
> +       WARN_ON(cnt <= 0);
> +
> +       if (cnt > HIGH_WATERMARK)
> +               /* free few objects from current cpu into global kmalloc pool */
> +               irq_work_raise(c, in_nmi);
> +}
> +
> +/* Called from BPF program or from sys_bpf syscall.
> + * In both cases migration is disabled.
> + */
> +void notrace *bpf_mem_alloc(struct bpf_mem_alloc *ma, size_t size)
> +{
> +       int idx;
> +       void *ret;
> +
> +       if (!size)
> +               return ZERO_SIZE_PTR;
> +
> +       idx = bpf_mem_cache_idx(size + LLIST_NODE_SZ);
> +       if (idx < 0)
> +               return NULL;
> +
> +       ret = unit_alloc(this_cpu_ptr(ma->caches)->cache + idx);
> +       return !ret ? NULL : ret + LLIST_NODE_SZ;
> +}
> +
> +void notrace bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr)
> +{
> +       int idx;
> +
> +       if (!ptr)
> +               return;
> +
> +       idx = bpf_mem_cache_idx(__ksize(ptr - LLIST_NODE_SZ));
> +       if (idx < 0)
> +               return;
> +
> +       unit_free(this_cpu_ptr(ma->caches)->cache + idx, ptr);
> +}
> +
> +void notrace *bpf_mem_cache_alloc(struct bpf_mem_alloc *ma)
> +{
> +       void *ret;
> +
> +       ret = unit_alloc(this_cpu_ptr(ma->cache));
> +       return !ret ? NULL : ret + LLIST_NODE_SZ;
> +}
> +
> +void notrace bpf_mem_cache_free(struct bpf_mem_alloc *ma, void *ptr)
> +{
> +       if (!ptr)
> +               return;
> +
> +       unit_free(this_cpu_ptr(ma->cache), ptr);
> +}
> --
> 2.30.2
>
Alexei Starovoitov Aug. 18, 2022, 12:39 a.m. UTC | #2
On Thu, Aug 18, 2022 at 01:51:39AM +0200, Kumar Kartikeya Dwivedi wrote:
> > +
> > +/* notrace is necessary here and in other functions to make sure
> > + * bpf programs cannot attach to them and cause llist corruptions.
> > + */
> > +static void notrace *unit_alloc(struct bpf_mem_cache *c)
> > +{
> > +       bool in_nmi = bpf_in_nmi();
> > +       struct llist_node *llnode;
> > +       unsigned long flags;
> > +       int cnt = 0;
> > +
> > +       if (unlikely(in_nmi)) {
> > +               llnode = llist_del_first(&c->free_llist_nmi);
> > +               if (llnode)
> > +                       cnt = atomic_dec_return(&c->free_cnt_nmi);
> 
> I am trying to understand which case this
> atomic_dec_return/atomic_inc_return protects against in the
> unit_alloc/unit_free for in_nmi branch. Is it protecting nested NMI
> BPF prog interrupting NMI prog?
> 
> In case of perf it seems we use bpf_prog_active, 

yes, but bpf_prog_active has plenty of downsides and hopefully
will be replaced eventually with cleaner mechanism.
Separate topic.

> so nested NMI prog
> won't be invoked while we are interrupted inside a BPF program in NMI
> context. Which are the other cases that might cause reentrancy in this
> branch such that we need atomics instead of c->free_cnt_nmi--? Or are
> you anticipating you might allow this in the future even if it is
> disallowed for now?
> 
> If programs are allowed to stack like this, and we try to reason about
> the safety of llist_del_first operation, the code is:
> 
> struct llist_node *llist_del_first(struct llist_head *head)
> {
>      struct llist_node *entry, *old_entry, *next;
> 
>      entry = smp_load_acquire(&head->first);
>      for (;;) {
>          if (entry == NULL)
>              return NULL;
>          old_entry = entry;
>          next = READ_ONCE(entry->next);
> >>>>>>>> Suppose nested NMI comes at this point and BPF prog is invoked.

llist_del_first is notrace.
unit_alloc() above is also notrace. See comment before it.
perf event overflow NMI can happen here, but for some other llist.
Hence we're talking about NMI issues only here. fentry progs do not apply here.

> Assume the current nmi free llist is HEAD -> A -> B -> C -> D -> ...
> For our cmpxchg, parameters are going to be cmpxchg(&head->first, A, B);
> 
> Now, nested NMI prog does unit_alloc thrice. this does llist_del_first thrice

Even double llist_del_first on the same llist is bad. That's a known fact.

> This makes nmi free llist HEAD -> D -> ...
> A, B, C are allocated in prog.
> Now it does unit_free of all three, but in order of B, C, A.
> unit_free does llist_add, nmi free llist becomes HEAD -> A -> C -> B -> D -> ...
> 
> Nested NMI prog exits.
> We continue with our cmpxchg(&head->first, A, B); It succeeds, A is
> returned, but C will be leaked.

This exact scenario cannot happen for bpf_mem_cache's freelist.
unit_alloc is doing llist_del_first on per-cpu freelist.
We can have two perf_event bpf progs. Both progs would
share the same hash map and use the same struct bpf_mem_alloc,
and both call unit_alloc() on the same cpu freelist,
but as you noticed bpf_prog_active covers that case.
bpf_prog_active is too coarse as we discussed in the other thread a
month or so ago. It prevents valid and safe execution of bpf progs, lost
events, etc. We will surely come up with a better mechanism.

Going back to your earlier question:

> Which are the other cases that might cause reentrancy in this
> branch such that we need atomics instead of c->free_cnt_nmi--?

It's the case where a perf_event bpf prog fires while bpf_mem_refill runs in irq_work.
bpf_mem_refill manipulates free_cnt_nmi, and the nmi bpf prog does too through unit_alloc.
Which got me thinking that there is indeed a missing check here.
We need to protect free_bulk_nmi's llist_del_first from unit_alloc's llist_del_first.
bpf_prog_active could be used for that, but let's come up with a cleaner way.
Probably going to add atomic_t flag to bpf_mem_cache and cmpxchg it,
or lock and spin_trylock it. tbd.
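
For illustration only, the cmpxchg-flag variant could look roughly like
this ('popping' would be a new atomic_t in bpf_mem_cache; untested sketch,
watermark handling omitted):

  static struct llist_node *nmi_pop(struct bpf_mem_cache *c)
  {
          struct llist_node *llnode = NULL;

          /* allow only one llist_del_first() on this cpu's nmi list at a time;
           * a nested user sees the flag set and backs off
           */
          if (atomic_cmpxchg(&c->popping, 0, 1) == 0) {
                  llnode = llist_del_first(&c->free_llist_nmi);
                  if (llnode)
                          atomic_dec(&c->free_cnt_nmi);
                  atomic_set(&c->popping, 0);
          }
          return llnode;
  }

unit_alloc() would return NULL and free_bulk_nmi() would simply try again
on the next irq_work when the flag is found busy.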
Kumar Kartikeya Dwivedi Aug. 18, 2022, 12:38 p.m. UTC | #3
On Thu, 18 Aug 2022 at 02:40, Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Thu, Aug 18, 2022 at 01:51:39AM +0200, Kumar Kartikeya Dwivedi wrote:
> > > +
> > > +/* notrace is necessary here and in other functions to make sure
> > > + * bpf programs cannot attach to them and cause llist corruptions.
> > > + */
> > > +static void notrace *unit_alloc(struct bpf_mem_cache *c)
> > > +{
> > > +       bool in_nmi = bpf_in_nmi();
> > > +       struct llist_node *llnode;
> > > +       unsigned long flags;
> > > +       int cnt = 0;
> > > +
> > > +       if (unlikely(in_nmi)) {
> > > +               llnode = llist_del_first(&c->free_llist_nmi);
> > > +               if (llnode)
> > > +                       cnt = atomic_dec_return(&c->free_cnt_nmi);
> >
> > I am trying to understand which case this
> > atomic_dec_return/atomic_inc_return protects against in the
> > unit_alloc/unit_free for in_nmi branch. Is it protecting nested NMI
> > BPF prog interrupting NMI prog?
> >
> > In case of perf it seems we use bpf_prog_active,
>
> yes, but bpf_prog_active has plenty of downsides and hopefully
> will be replaced eventually with cleaner mechanism.
> Separate topic.

I see.

>
> > so nested NMI prog
> > won't be invoked while we are interrupted inside a BPF program in NMI
> > context. Which are the other cases that might cause reentrancy in this
> > branch such that we need atomics instead of c->free_cnt_nmi--? Or are
> > you anticipating you might allow this in the future even if it is
> > disallowed for now?
> >
> > If programs are allowed to stack like this, and we try to reason about
> > the safety of llist_del_first operation, the code is:
> >
> > struct llist_node *llist_del_first(struct llist_head *head)
> > {
> >      struct llist_node *entry, *old_entry, *next;
> >
> >      entry = smp_load_acquire(&head->first);
> >      for (;;) {
> >          if (entry == NULL)
> >              return NULL;
> >          old_entry = entry;
> >          next = READ_ONCE(entry->next);
> > >>>>>>>> Suppose nested NMI comes at this point and BPF prog is invoked.
>
> llist_del_first is notrace.
> unit_alloc() above is also notrace. See comment before it.
> perf event overflow NMI can happen here, but for some other llist.
> Hence we're talking about NMI issues only here. fentry progs do not apply here.

I was not thinking about fentry progs either. I saw perf overflow
progs can't nest, so I was wondering whether there were any other
cases. But you mentioned bpf_mem_refill in irq_work, and a perf prog
in NMI can touch the same lockless list, so the scenario is still possible.

>
> > Assume the current nmi free llist is HEAD -> A -> B -> C -> D -> ...
> > For our cmpxchg, parameters are going to be cmpxchg(&head->first, A, B);
> >
> > Now, nested NMI prog does unit_alloc thrice. this does llist_del_first thrice
>
> Even double llist_del_first on the same llist is bad. That's a known fact.

Well, if you think about it (correct me if I'm wrong), at least in
this kind of nesting scenario on the same CPU, just doing
llist_del_first in NMI prog which interrupts llist_del_first of
bpf_mem_refill isn't a problem. The cmpxchg will fail as head->first
changed. The problem occurs when you combine it with llist_add between
the READ_ONCE(entry->next) and cmpxchg of the interrupted
llist_del_first. The main invariant of llist_del_first is that
entry->next should not change between READ_ONCE and cmpxchg, but if we
construct an ABA scenario like I did in my previous reply, _then_ we
have a problem. Otherwise it will just retry loop on exit if we e.g.
llist_del_first and kptr_xchg the ptr (which won't do llist_add).

>
> > This makes nmi free llist HEAD -> D -> ...
> > A, B, C are allocated in prog.
> > Now it does unit_free of all three, but in order of B, C, A.
> > unit_free does llist_add, nmi free llist becomes HEAD -> A -> C -> B -> D -> ...
> >
> > Nested NMI prog exits.
> > We continue with our cmpxchg(&head->first, A, B); It succeeds, A is
> > returned, but C will be leaked.
>
> This exact scenario cannot happen for bpf_mem_cache's freelist.
> unit_alloc is doing llist_del_first on per-cpu freelist.
> We can have two perf_event bpf progs. Both progs would
> share the same hash map and use the same struct bpf_mem_alloc,
> and both call unit_alloc() on the same cpu freelist,
> but as you noticed bpf_prog_active covers that case.
> bpf_prog_active is too coarse as we discussed in the other thread a
> month or so ago. It prevents valid and safe execution of bpf progs, lost
> events, etc. We will surely come up with a better mechanism.
>
> Going back to your earlier question:
>
> > Which are the other cases that might cause reentrancy in this
> > branch such that we need atomics instead of c->free_cnt_nmi--?
>
> It's the case where perf_event bpf prog happened inside bpf_mem_refill in irq_work.
> bpf_mem_refill manipulates free_cnt_nmi and nmi bpf prog too through unit_alloc.
> Which got me thinking that there is indeed a missing check here.

Aaah, ok, so this is what you wanted to prevent. Makes sense: even
though NMI nesting won't happen in progs (at least for now), this
irq_work refilling can be interrupted by some perf NMI prog, or a
raw_tp tracing prog in NMI context.

> We need to protect free_bulk_nmi's llist_del_first from unit_alloc's llist_del_first.
> bpf_prog_active could be used for that, but let's come up with a cleaner way.
> Probably going to add atomic_t flag to bpf_mem_cache and cmpxchg it,
> or lock and spin_trylock it. tbd.

Hm, can you explain why an atomic flag or lock would be needed, and
why a small busy counter like bpf_prog_active for the NMI free llist
wouldn't work? bpf_mem_cache is already per-CPU so it can just be an
int alongside the llist. You inc it before llist_del_first, and
then assuming inc is atomic across interrupt boundary (which I think
this_cpu_inc_return for bpf_prog_active is already assuming), NMI prog
will see llist as busy and will fail its llist_del_first.
llist_add should still be fine to allow.

Technically we can fail llist_add instead, since doing multiple
llist_del_first won't be an issue, but you can't fail bpf_mem_free,
though you can fail bpf_mem_alloc, so it makes sense to protect only
llist_del_first using the counter.

PS: Also, it would be great if you could add a few words about this
around this code, i.e. which contexts can nest and what problems the
atomic_inc/dec and llist_del_first protection prevents there.
Alexei Starovoitov Aug. 18, 2022, 10:30 p.m. UTC | #4
On Thu, Aug 18, 2022 at 02:38:06PM +0200, Kumar Kartikeya Dwivedi wrote:
> >
> > > Assume the current nmi free llist is HEAD -> A -> B -> C -> D -> ...
> > > For our cmpxchg, parameters are going to be cmpxchg(&head->first, A, B);
> > >
> > > Now, nested NMI prog does unit_alloc thrice. this does llist_del_first thrice
> >
> > Even double llist_del_first on the same llist is bad. That's a known fact.
> 
> Well, if you think about it (correct me if I'm wrong), at least in
> this kind of nesting scenario on the same CPU, just doing
> llist_del_first in NMI prog which interrupts llist_del_first of
> bpf_mem_refill isn't a problem. The cmpxchg will fail as head->first
> changed. The problem occurs when you combine it with llist_add between
> the READ_ONCE(entry->next) and cmpxchg of the interrupted
> llist_del_first. The main invariant of llist_del_first is that
> entry->next should not change between READ_ONCE and cmpxchg, but if we
> construct an ABA scenario like I did in my previous reply, _then_ we
> have a problem. Otherwise it will just retry loop on exit if we e.g.
> llist_del_first and kptr_xchg the ptr (which won't do llist_add).

Of course. In some race scenarios the llist will stay sane.
In others there will be leaks. In others crashes.
Like we don't really need 3 llist_del followed by 3 out of order llist_add-s
to observe bad things. 2 llist_del-s and 1 llist_add are just as bad.
That's why the doc says do one llist_del_first at a time and doesn't
specify all possible bad things.

> >
> > > This makes nmi free llist HEAD -> D -> ...
> > > A, B, C are allocated in prog.
> > > Now it does unit_free of all three, but in order of B, C, A.
> > > unit_free does llist_add, nmi free llist becomes HEAD -> A -> C -> B -> D -> ...
> > >
> > > Nested NMI prog exits.
> > > We continue with our cmpxchg(&head->first, A, B); It succeeds, A is
> > > returned, but C will be leaked.
> >
> > This exact scenario cannot happen for bpf_mem_cache's freelist.
> > unit_alloc is doing llist_del_first on per-cpu freelist.
> > We can have two perf_event bpf progs. Both progs would
> > share the same hash map and use the same struct bpf_mem_alloc,
> > and both call unit_alloc() on the same cpu freelist,
> > but as you noticed bpf_prog_active covers that case.
> > bpf_prog_active is too coarse as we discussed in the other thread a
> > month or so ago. It prevents valid and safe execution of bpf progs, lost
> > events, etc. We will surely come up with a better mechanism.
> >
> > Going back to your earlier question:
> >
> > > Which are the other cases that might cause reentrancy in this
> > > branch such that we need atomics instead of c->free_cnt_nmi--?
> >
> > It's the case where perf_event bpf prog happened inside bpf_mem_refill in irq_work.
> > bpf_mem_refill manipulates free_cnt_nmi and nmi bpf prog too through unit_alloc.
> > Which got me thinking that there is indeed a missing check here.
> 
> Aaah, ok, so this is what you wanted to prevent. Makes sense, even
> though NMI nesting won't happen in progs (atleast for now), this
> irq_work refilling can be interrupted by some perf NMI prog, or raw_tp
> tracing prog in NMI context.

Right. Doesn't matter which prog type that would be.
in_nmi() is the context that needs special handling.
It could happen not only in bpf_prog_type_perf_event.

> > We need to protect free_bulk_nmi's llist_del_first from unit_alloc's llist_del_first.
> > bpf_prog_active could be used for that, but let's come up with a cleaner way.
> > Probably going to add atomic_t flag to bpf_mem_cache and cmpxchg it,
> > or lock and spin_trylock it. tbd.
> 
> Hm, can you explain why an atomic flag or lock would be needed, and
> not just having a small busy counter like bpf_prog_active for the NMI
> free llist will work? bpf_mem_cache is already per-CPU so it can just
> be int alongside the llist. You inc it before llist_del_first, and
> then assuming inc is atomic across interrupt boundary (which I think
> this_cpu_inc_return for bpf_prog_active is already assuming), NMI prog
> will see llist as busy and will fail its llist_del_first.
> llist_add should still be fine to allow.

Good idea. The per-cpu counter is faster and simpler.

> Technically we can fail llist_add instead, since doing multiple
> llist_del_first won't be an issue, but you can't fail bpf_mem_free,
> though you can fail bpf_mem_alloc, so it makes sense to protect only
> llist_del_first using the counter.

Right. We cannot fail in unit_free().
With per-cpu counter both unit_alloc() and free_bulk_nmi() would
potentially fail in such unlikely scenario.
Not a big deal for free_bulk_nmi(). It would pick the element later.
For unit_alloc() return NULL is normal.
Especially since it's so unlikely for nmi to hit right in the middle
of llist_del_first().

Since we'll add this per-cpu counter to solve interrupted llist_del_first()
it feels that the same counter can be used to protect unit_alloc/free/irq_work.
Then we don't need free_llist_nmi. A single free_llist would be fine,
but unit_free() should not fail. If free_llist cannot be accessed
due to the per-cpu counter being busy we have to push somewhere.
So it seems two lists are necessary. Maybe it's still better?
Roughly I'm thinking of the following:
unit_alloc()
{
  llnode = NULL;
  local_irq_save();
  if (__this_cpu_inc_return(c->alloc_active) != 1)
     goto out;
  llnode = __llist_del_first(&c->free_llist);
  if (llnode)
      cnt = --c->free_cnt;
out:
  __this_cpu_dec(c->alloc_active);
  local_irq_restore();
  return llnode;
}
unit_free()
{
  local_irq_save();
  if (__this_cpu_inc_return(c->alloc_active) != 1) {
     llist_add(llnode, &c->free_llist_nmi);
     goto out;
  }
  __llist_add(llnode, &c->free_llist);
  cnt = ++c->free_cnt;
out:
  __this_cpu_dec(c->alloc_active);
  local_irq_restore();
}
alloc_bulk, free_bulk would be protected by alloc_active as well.
alloc_bulk_nmi is gone.
free_bulk_nmi is still there to drain unlucky unit_free,
but it's now alone to do llist_del_first() and it just frees anything
that is in the free_llist_nmi.
The main advantage is that free_llist_nmi doesn't need to be prefilled.
It will be empty most of the time.
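
(untested sketch of what the drain could become, just to illustrate: since
irq_work would then be the only consumer of free_llist_nmi, llist_del_all()
is enough)

static void free_bulk_nmi(struct bpf_mem_cache *c)
{
  struct llist_node *llnode, *pos, *t;

  /* nothing else pops from this list anymore,
   * so detach everything at once and free it
   */
  llnode = llist_del_all(&c->free_llist_nmi);
  llist_for_each_safe(pos, t, llnode)
     free_one(c, pos);
}
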
wdyt?
Kumar Kartikeya Dwivedi Aug. 19, 2022, 2:31 p.m. UTC | #5
On Fri, 19 Aug 2022 at 00:30, Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> Right. We cannot fail in unit_free().
> With per-cpu counter both unit_alloc() and free_bulk_nmi() would
> potentially fail in such unlikely scenario.
> Not a big deal for free_bulk_nmi(). It would pick the element later.
> For unit_alloc() return NULL is normal.
> Especially since it's so unlikely for nmi to hit right in the middle
> of llist_del_first().
>
> Since we'll add this per-cpu counter to solve interrupted llist_del_first()
> it feels that the same counter can be used to protect unit_alloc/free/irq_work.
> Then we don't need free_llist_nmi. Single free_llist would be fine,
> but unit_free() should not fail. If free_list cannot be accessed
> due to per-cpu counter being busy we have to push somewhere.
> So it seems two lists are necessary. Maybe it's still better ?
> Roughly I'm thinking of the following:
> unit_alloc()
> {
>   llnode = NULL;
>   local_irq_save();
>   if (__this_cpu_inc_return(c->alloc_active) != 1))
>      goto out;
>   llnode = __llist_del_first(&c->free_llist);
>   if (llnode)
>       cnt = --c->free_cnt;
> out:
>   __this_cpu_dec(c->alloc_active);
>   local_irq_restore();
>   return ret;
> }
> unit_free()
> {
>   local_irq_save();
>   if (__this_cpu_inc_return(c->alloc_active) != 1)) {
>      llist_add(llnode, &c->free_llist_nmi);
>      goto out;
>   }
>   __llist_add(llnode, &c->free_llist);
>   cnt = ++c->free_cnt;
> out:
>   __this_cpu_dec(c->alloc_active);
>   local_irq_restore();
>   return ret;
> }
> alloc_bulk, free_bulk would be protected by alloc_active as well.
> alloc_bulk_nmi is gone.
> free_bulk_nmi is still there to drain unlucky unit_free,
> but it's now alone to do llist_del_first() and it just frees anything
> that is in the free_llist_nmi.
> The main advantage is that free_llist_nmi doesn't need to prefilled.
> It will be empty most of the time.
> wdyt?

Looks great! The other option would be to not have the overflow
free_llist_nmi list and just allow llist_add to free_llist from the
NMI case even if we interrupt llist_del_first, but then the non-NMI
user needs to use the atomic llist_add version as well (since we may
interrupt it), which won't be great for performance. So having the
extra list is much better.
Alexei Starovoitov Aug. 19, 2022, 5:51 p.m. UTC | #6
On Fri, Aug 19, 2022 at 04:31:11PM +0200, Kumar Kartikeya Dwivedi wrote:
> On Fri, 19 Aug 2022 at 00:30, Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
> >
> > Right. We cannot fail in unit_free().
> > With per-cpu counter both unit_alloc() and free_bulk_nmi() would
> > potentially fail in such unlikely scenario.
> > Not a big deal for free_bulk_nmi(). It would pick the element later.
> > For unit_alloc() return NULL is normal.
> > Especially since it's so unlikely for nmi to hit right in the middle
> > of llist_del_first().
> >
> > Since we'll add this per-cpu counter to solve interrupted llist_del_first()
> > it feels that the same counter can be used to protect unit_alloc/free/irq_work.
> > Then we don't need free_llist_nmi. Single free_llist would be fine,
> > but unit_free() should not fail. If free_list cannot be accessed
> > due to per-cpu counter being busy we have to push somewhere.
> > So it seems two lists are necessary. Maybe it's still better ?
> > Roughly I'm thinking of the following:
> > unit_alloc()
> > {
> >   llnode = NULL;
> >   local_irq_save();
> >   if (__this_cpu_inc_return(c->alloc_active) != 1))
> >      goto out;
> >   llnode = __llist_del_first(&c->free_llist);
> >   if (llnode)
> >       cnt = --c->free_cnt;
> > out:
> >   __this_cpu_dec(c->alloc_active);
> >   local_irq_restore();
> >   return ret;
> > }
> > unit_free()
> > {
> >   local_irq_save();
> >   if (__this_cpu_inc_return(c->alloc_active) != 1)) {
> >      llist_add(llnode, &c->free_llist_nmi);
> >      goto out;
> >   }
> >   __llist_add(llnode, &c->free_llist);
> >   cnt = ++c->free_cnt;
> > out:
> >   __this_cpu_dec(c->alloc_active);
> >   local_irq_restore();
> >   return ret;
> > }
> > alloc_bulk, free_bulk would be protected by alloc_active as well.
> > alloc_bulk_nmi is gone.
> > free_bulk_nmi is still there to drain unlucky unit_free,
> > but it's now alone to do llist_del_first() and it just frees anything
> > that is in the free_llist_nmi.
> > The main advantage is that free_llist_nmi doesn't need to prefilled.
> > It will be empty most of the time.
> > wdyt?
> 
> Looks great! The other option would be to not have the overflow
> free_llist_nmi list and just allowing llist_add to free_llist from the
> NMI case even if we interrupt llist_del_first, but then the non-NMI
> user needs to use the atomic llist_add version as well (since we may
> interrupt it), 

not only llist_add, but unit_alloc would have to use atomic llist_del_first too.
So any operation on the list would have to be with cmpxchg.

> which won't be great for performance.

exactly.

> So having the
> extra list is much better.

yep. same thinking.
I'll refactor the patches and send v3 with this approach.

Patch

diff --git a/include/linux/bpf_mem_alloc.h b/include/linux/bpf_mem_alloc.h
new file mode 100644
index 000000000000..804733070f8d
--- /dev/null
+++ b/include/linux/bpf_mem_alloc.h
@@ -0,0 +1,26 @@ 
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */
+#ifndef _BPF_MEM_ALLOC_H
+#define _BPF_MEM_ALLOC_H
+#include <linux/compiler_types.h>
+
+struct bpf_mem_cache;
+struct bpf_mem_caches;
+
+struct bpf_mem_alloc {
+	struct bpf_mem_caches __percpu *caches;
+	struct bpf_mem_cache __percpu *cache;
+};
+
+int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size);
+void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma);
+
+/* kmalloc/kfree equivalent: */
+void *bpf_mem_alloc(struct bpf_mem_alloc *ma, size_t size);
+void bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr);
+
+/* kmem_cache_alloc/free equivalent: */
+void *bpf_mem_cache_alloc(struct bpf_mem_alloc *ma);
+void bpf_mem_cache_free(struct bpf_mem_alloc *ma, void *ptr);
+
+#endif /* _BPF_MEM_ALLOC_H */
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 057ba8e01e70..11fb9220909b 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -13,7 +13,7 @@  obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o bpf_task_storage.o
 obj-${CONFIG_BPF_LSM}	  += bpf_inode_storage.o
 obj-$(CONFIG_BPF_SYSCALL) += disasm.o
 obj-$(CONFIG_BPF_JIT) += trampoline.o
-obj-$(CONFIG_BPF_SYSCALL) += btf.o
+obj-$(CONFIG_BPF_SYSCALL) += btf.o memalloc.o
 obj-$(CONFIG_BPF_JIT) += dispatcher.o
 ifeq ($(CONFIG_NET),y)
 obj-$(CONFIG_BPF_SYSCALL) += devmap.o
diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
new file mode 100644
index 000000000000..8de268922380
--- /dev/null
+++ b/kernel/bpf/memalloc.c
@@ -0,0 +1,526 @@ 
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */
+#include <linux/mm.h>
+#include <linux/llist.h>
+#include <linux/bpf.h>
+#include <linux/irq_work.h>
+#include <linux/bpf_mem_alloc.h>
+#include <linux/memcontrol.h>
+
+/* Any context (including NMI) BPF specific memory allocator.
+ *
+ * Tracing BPF programs can attach to kprobe and fentry. Hence they
+ * run in unknown context where calling plain kmalloc() might not be safe.
+ *
+ * Front-end kmalloc() with per-cpu per-bucket cache of free elements.
+ * Refill this cache asynchronously from irq_work.
+ *
+ * CPU_0 buckets
+ * 16 32 64 96 128 192 256 512 1024 2048 4096
+ * ...
+ * CPU_N buckets
+ * 16 32 64 96 128 192 256 512 1024 2048 4096
+ *
+ * The buckets are prefilled at the start.
+ * BPF programs always run with migration disabled.
+ * It's safe to allocate from cache of the current cpu with irqs disabled.
+ * Free-ing is always done into bucket of the current cpu as well.
+ * irq_work trims extra free elements from buckets with kfree
+ * and refills them with kmalloc, so global kmalloc logic takes care
+ * of freeing objects allocated by one cpu and freed on another.
+ *
+ * Every allocated object is padded with an extra 8 bytes that contain
+ * a struct llist_node.
+ */
+#define LLIST_NODE_SZ sizeof(struct llist_node)
+
+/* similar to kmalloc, but sizeof == 8 bucket is gone */
+static u8 size_index[24] __ro_after_init = {
+	3,	/* 8 */
+	3,	/* 16 */
+	4,	/* 24 */
+	4,	/* 32 */
+	5,	/* 40 */
+	5,	/* 48 */
+	5,	/* 56 */
+	5,	/* 64 */
+	1,	/* 72 */
+	1,	/* 80 */
+	1,	/* 88 */
+	1,	/* 96 */
+	6,	/* 104 */
+	6,	/* 112 */
+	6,	/* 120 */
+	6,	/* 128 */
+	2,	/* 136 */
+	2,	/* 144 */
+	2,	/* 152 */
+	2,	/* 160 */
+	2,	/* 168 */
+	2,	/* 176 */
+	2,	/* 184 */
+	2	/* 192 */
+};
+
+static int bpf_mem_cache_idx(size_t size)
+{
+	if (!size || size > 4096)
+		return -1;
+
+	if (size <= 192)
+		return size_index[(size - 1) / 8] - 1;
+
+	return fls(size - 1) - 1;
+}
+
+#define NUM_CACHES 11
+
+struct bpf_mem_cache {
+	/* per-cpu list of free objects of size 'unit_size'.
+	 * All accesses are done with preemption disabled
+	 * with __llist_add() and __llist_del_first().
+	 */
+	struct llist_head free_llist;
+
+	/* NMI only free list.
+	 * All accesses are NMI-safe llist_add() and llist_del_first().
+	 *
+	 * Each allocated object is either on free_llist or on free_llist_nmi.
+	 * One cpu can allocate it from NMI by doing llist_del_first() from
+	 * free_llist_nmi, while another might free it back from non-NMI by
+	 * doing llist_add() into free_llist.
+	 */
+	struct llist_head free_llist_nmi;
+
+	/* kmem_cache != NULL when bpf_mem_alloc was created for specific
+	 * element size.
+	 */
+	struct kmem_cache *kmem_cache;
+	struct irq_work refill_work;
+	struct obj_cgroup *objcg;
+	int unit_size;
+	/* count of objects in free_llist */
+	int free_cnt;
+	/* count of objects in free_llist_nmi */
+	atomic_t free_cnt_nmi;
+	/* flag to refill nmi list too */
+	bool refill_nmi_list;
+};
+
+struct bpf_mem_caches {
+	struct bpf_mem_cache cache[NUM_CACHES];
+};
+
+static struct llist_node notrace *__llist_del_first(struct llist_head *head)
+{
+	struct llist_node *entry, *next;
+
+	entry = head->first;
+	if (!entry)
+		return NULL;
+	next = entry->next;
+	head->first = next;
+	return entry;
+}
+
+#define BATCH 48
+#define LOW_WATERMARK 32
+#define HIGH_WATERMARK 96
+/* Assuming the average number of elements per bucket is 64 and the system
+ * has 32 cpus, when all buckets are used the total memory will be:
+ * 64*16*32 + 64*32*32 + 64*64*32 + ... + 64*4096*32 ~ 20Mbyte
+ */
+
+/* extra macro useful for testing by randomizing in_nmi condition */
+#define bpf_in_nmi() in_nmi()
+
+static void *__alloc(struct bpf_mem_cache *c, int node)
+{
+	/* Allocate, but don't deplete atomic reserves that typical
+	 * GFP_ATOMIC would do. irq_work runs on this cpu and kmalloc
+	 * will allocate from the current numa node which is what we
+	 * want here.
+	 */
+	gfp_t flags = GFP_NOWAIT | __GFP_NOWARN | __GFP_ACCOUNT;
+
+	if (c->kmem_cache)
+		return kmem_cache_alloc_node(c->kmem_cache, flags, node);
+
+	return kmalloc_node(c->unit_size, flags, node);
+}
+
+static struct mem_cgroup *get_memcg(const struct bpf_mem_cache *c)
+{
+#ifdef CONFIG_MEMCG_KMEM
+	if (c->objcg)
+		return get_mem_cgroup_from_objcg(c->objcg);
+#endif
+
+#ifdef CONFIG_MEMCG
+	return root_mem_cgroup;
+#else
+	return NULL;
+#endif
+}
+
+/* Mostly runs from irq_work except __init phase. */
+static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node)
+{
+	struct mem_cgroup *memcg = NULL, *old_memcg;
+	unsigned long flags;
+	void *obj;
+	int i;
+
+	memcg = get_memcg(c);
+	old_memcg = set_active_memcg(memcg);
+	for (i = 0; i < cnt; i++) {
+		obj = __alloc(c, node);
+		if (!obj)
+			break;
+		if (IS_ENABLED(CONFIG_PREEMPT_RT))
+			/* In RT irq_work runs in per-cpu kthread, so we have
+			 * to disable interrupts to avoid race with
+			 * bpf_mem_alloc/free. Note the read of free_cnt in
+			 * bpf_mem_refill is racy in RT. It's ok to do.
+			 */
+			local_irq_save(flags);
+		__llist_add(obj, &c->free_llist);
+		c->free_cnt++;
+		if (IS_ENABLED(CONFIG_PREEMPT_RT))
+			local_irq_restore(flags);
+	}
+	set_active_memcg(old_memcg);
+	mem_cgroup_put(memcg);
+}
+
+/* Refill NMI specific llist. Mostly runs from irq_work except __init phase. */
+static void alloc_bulk_nmi(struct bpf_mem_cache *c, int cnt, int node)
+{
+	struct mem_cgroup *memcg = NULL, *old_memcg;
+	void *obj;
+	int i;
+
+	memcg = get_memcg(c);
+	old_memcg = set_active_memcg(memcg);
+	for (i = 0; i < cnt; i++) {
+		obj = __alloc(c, node);
+		if (!obj)
+			break;
+		llist_add(obj, &c->free_llist_nmi);
+		atomic_inc(&c->free_cnt_nmi);
+	}
+	set_active_memcg(old_memcg);
+	mem_cgroup_put(memcg);
+}
+
+static void free_one(struct bpf_mem_cache *c, void *obj)
+{
+	if (c->kmem_cache)
+		kmem_cache_free(c->kmem_cache, obj);
+	else
+		kfree(obj);
+}
+
+static void free_bulk(struct bpf_mem_cache *c)
+{
+	struct llist_node *llnode;
+	unsigned long flags;
+	int cnt;
+
+	do {
+		if (IS_ENABLED(CONFIG_PREEMPT_RT))
+			local_irq_save(flags);
+		llnode = __llist_del_first(&c->free_llist);
+		if (llnode)
+			cnt = --c->free_cnt;
+		else
+			cnt = 0;
+		if (IS_ENABLED(CONFIG_PREEMPT_RT))
+			local_irq_restore(flags);
+		free_one(c, llnode);
+	} while (cnt > (HIGH_WATERMARK + LOW_WATERMARK) / 2);
+}
+
+static void free_bulk_nmi(struct bpf_mem_cache *c)
+{
+	struct llist_node *llnode;
+	int cnt;
+
+	do {
+		llnode = llist_del_first(&c->free_llist_nmi);
+		if (llnode)
+			cnt = atomic_dec_return(&c->free_cnt_nmi);
+		else
+			cnt = 0;
+		free_one(c, llnode);
+	} while (cnt > (HIGH_WATERMARK + LOW_WATERMARK) / 2);
+}
+
+static void bpf_mem_refill(struct irq_work *work)
+{
+	struct bpf_mem_cache *c = container_of(work, struct bpf_mem_cache, refill_work);
+	int cnt;
+
+	cnt = c->free_cnt;
+	if (cnt < LOW_WATERMARK)
+		/* irq_work runs on this cpu and kmalloc will allocate
+		 * from the current numa node which is what we want here.
+		 */
+		alloc_bulk(c, BATCH, NUMA_NO_NODE);
+	else if (cnt > HIGH_WATERMARK)
+		free_bulk(c);
+
+	if (!c->refill_nmi_list)
+		/* don't refill NMI specific freelist
+		 * until alloc/free from NMI.
+		 */
+		return;
+	cnt = atomic_read(&c->free_cnt_nmi);
+	if (cnt < LOW_WATERMARK)
+		alloc_bulk_nmi(c, BATCH, NUMA_NO_NODE);
+	else if (cnt > HIGH_WATERMARK)
+		free_bulk_nmi(c);
+	c->refill_nmi_list = false;
+}
+
+static void notrace irq_work_raise(struct bpf_mem_cache *c, bool in_nmi)
+{
+	if (in_nmi)
+		/* Raise the flag only if in_nmi. Cannot assign it
+		 * unconditionally since subsequent non-nmi irq_work_raise
+		 * might clear it.
+		 */
+		c->refill_nmi_list = in_nmi;
+	irq_work_queue(&c->refill_work);
+}
+
+static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu)
+{
+	init_irq_work(&c->refill_work, bpf_mem_refill);
+	/* To avoid consuming memory assume that 1st run of bpf
+	 * prog won't be doing more than 4 map_update_elem from
+	 * irq disabled region
+	 */
+	alloc_bulk(c, c->unit_size < 256 ? 4 : 1, cpu_to_node(cpu));
+
+	/* NMI progs are rare. Assume they have one map_update
+	 * per prog at the very beginning.
+	 */
+	alloc_bulk_nmi(c, 1, cpu_to_node(cpu));
+}
+
+/* When size != 0 create kmem_cache and bpf_mem_cache for each cpu.
+ * This is typical bpf hash map use case when all elements have equal size.
+ *
+ * When size == 0 allocate 11 bpf_mem_cache-s for each cpu, then rely on
+ * kmalloc/kfree. Max allocation size is 4096 in this case.
+ * This is bpf_dynptr and bpf_kptr use case.
+ */
+int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size)
+{
+	static u16 sizes[NUM_CACHES] = {96, 192, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096};
+	struct bpf_mem_caches *cc, __percpu *pcc;
+	struct bpf_mem_cache *c, __percpu *pc;
+	struct kmem_cache *kmem_cache;
+	struct obj_cgroup *objcg = NULL;
+	char buf[32];
+	int cpu, i;
+
+	if (size) {
+		pc = __alloc_percpu_gfp(sizeof(*pc), 8, GFP_KERNEL);
+		if (!pc)
+			return -ENOMEM;
+		size += LLIST_NODE_SZ; /* room for llist_node */
+		snprintf(buf, sizeof(buf), "bpf-%u", size);
+		kmem_cache = kmem_cache_create(buf, size, 8, 0, NULL);
+		if (!kmem_cache) {
+			free_percpu(pc);
+			return -ENOMEM;
+		}
+#ifdef CONFIG_MEMCG_KMEM
+		objcg = get_obj_cgroup_from_current();
+#endif
+		for_each_possible_cpu(cpu) {
+			c = per_cpu_ptr(pc, cpu);
+			c->kmem_cache = kmem_cache;
+			c->unit_size = size;
+			c->objcg = objcg;
+			prefill_mem_cache(c, cpu);
+		}
+		ma->cache = pc;
+		return 0;
+	}
+
+	pcc = __alloc_percpu_gfp(sizeof(*cc), 8, GFP_KERNEL);
+	if (!pcc)
+		return -ENOMEM;
+#ifdef CONFIG_MEMCG_KMEM
+	objcg = get_obj_cgroup_from_current();
+#endif
+	for_each_possible_cpu(cpu) {
+		cc = per_cpu_ptr(pcc, cpu);
+		for (i = 0; i < NUM_CACHES; i++) {
+			c = &cc->cache[i];
+			c->unit_size = sizes[i];
+			c->objcg = objcg;
+			prefill_mem_cache(c, cpu);
+		}
+	}
+	ma->caches = pcc;
+	return 0;
+}
+
+static void drain_mem_cache(struct bpf_mem_cache *c)
+{
+	struct llist_node *llnode;
+
+	while ((llnode = llist_del_first(&c->free_llist_nmi)))
+		free_one(c, llnode);
+	while ((llnode = __llist_del_first(&c->free_llist)))
+		free_one(c, llnode);
+}
+
+void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma)
+{
+	struct bpf_mem_caches *cc;
+	struct bpf_mem_cache *c;
+	int cpu, i;
+
+	if (ma->cache) {
+		for_each_possible_cpu(cpu) {
+			c = per_cpu_ptr(ma->cache, cpu);
+			drain_mem_cache(c);
+		}
+		/* kmem_cache and memcg are the same across cpus */
+		kmem_cache_destroy(c->kmem_cache);
+		if (c->objcg)
+			obj_cgroup_put(c->objcg);
+		free_percpu(ma->cache);
+		ma->cache = NULL;
+	}
+	if (ma->caches) {
+		for_each_possible_cpu(cpu) {
+			cc = per_cpu_ptr(ma->caches, cpu);
+			for (i = 0; i < NUM_CACHES; i++) {
+				c = &cc->cache[i];
+				drain_mem_cache(c);
+			}
+		}
+		if (c->objcg)
+			obj_cgroup_put(c->objcg);
+		free_percpu(ma->caches);
+		ma->caches = NULL;
+	}
+}
+
+/* notrace is necessary here and in other functions to make sure
+ * bpf programs cannot attach to them and cause llist corruptions.
+ */
+static void notrace *unit_alloc(struct bpf_mem_cache *c)
+{
+	bool in_nmi = bpf_in_nmi();
+	struct llist_node *llnode;
+	unsigned long flags;
+	int cnt = 0;
+
+	if (unlikely(in_nmi)) {
+		llnode = llist_del_first(&c->free_llist_nmi);
+		if (llnode)
+			cnt = atomic_dec_return(&c->free_cnt_nmi);
+	} else {
+		/* Disable irqs to prevent the following race:
+		 * bpf_prog_A
+		 *   bpf_mem_alloc
+		 *      preemption or irq -> bpf_prog_B
+		 *        bpf_mem_alloc
+		 */
+		local_irq_save(flags);
+		llnode = __llist_del_first(&c->free_llist);
+		if (llnode)
+			cnt = --c->free_cnt;
+		local_irq_restore(flags);
+	}
+	WARN_ON(cnt < 0);
+
+	if (cnt < LOW_WATERMARK)
+		irq_work_raise(c, in_nmi);
+	return llnode;
+}
+
+/* Though 'ptr' object could have been allocated on a different cpu
+ * add it to the free_llist of the current cpu.
+ * Let kfree() logic deal with it when it's later called from irq_work.
+ */
+static void notrace unit_free(struct bpf_mem_cache *c, void *ptr)
+{
+	struct llist_node *llnode = ptr - LLIST_NODE_SZ;
+	bool in_nmi = bpf_in_nmi();
+	unsigned long flags;
+	int cnt;
+
+	BUILD_BUG_ON(LLIST_NODE_SZ > 8);
+
+	if (unlikely(in_nmi)) {
+		llist_add(llnode, &c->free_llist_nmi);
+		cnt = atomic_inc_return(&c->free_cnt_nmi);
+	} else {
+		local_irq_save(flags);
+		__llist_add(llnode, &c->free_llist);
+		cnt = ++c->free_cnt;
+		local_irq_restore(flags);
+	}
+	WARN_ON(cnt <= 0);
+
+	if (cnt > HIGH_WATERMARK)
+		/* free few objects from current cpu into global kmalloc pool */
+		irq_work_raise(c, in_nmi);
+}
+
+/* Called from BPF program or from sys_bpf syscall.
+ * In both cases migration is disabled.
+ */
+void notrace *bpf_mem_alloc(struct bpf_mem_alloc *ma, size_t size)
+{
+	int idx;
+	void *ret;
+
+	if (!size)
+		return ZERO_SIZE_PTR;
+
+	idx = bpf_mem_cache_idx(size + LLIST_NODE_SZ);
+	if (idx < 0)
+		return NULL;
+
+	ret = unit_alloc(this_cpu_ptr(ma->caches)->cache + idx);
+	return !ret ? NULL : ret + LLIST_NODE_SZ;
+}
+
+void notrace bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr)
+{
+	int idx;
+
+	if (!ptr)
+		return;
+
+	idx = bpf_mem_cache_idx(__ksize(ptr - LLIST_NODE_SZ));
+	if (idx < 0)
+		return;
+
+	unit_free(this_cpu_ptr(ma->caches)->cache + idx, ptr);
+}
+
+void notrace *bpf_mem_cache_alloc(struct bpf_mem_alloc *ma)
+{
+	void *ret;
+
+	ret = unit_alloc(this_cpu_ptr(ma->cache));
+	return !ret ? NULL : ret + LLIST_NODE_SZ;
+}
+
+void notrace bpf_mem_cache_free(struct bpf_mem_alloc *ma, void *ptr)
+{
+	if (!ptr)
+		return;
+
+	unit_free(this_cpu_ptr(ma->cache), ptr);
+}