diff mbox series

[bpf-next,11/16] libbpf: Add support for bpf_arena.

Message ID 20240206220441.38311-12-alexei.starovoitov@gmail.com (mailing list archive)
State New
Headers show
Series bpf: Introduce BPF arena. | expand

Commit Message

Alexei Starovoitov Feb. 6, 2024, 10:04 p.m. UTC
From: Alexei Starovoitov <ast@kernel.org>

mmap() bpf_arena right after creation, since the kernel needs to
remember the address returned from mmap. This is user_vm_start.
LLVM will generate bpf_arena_cast_user() instructions where
necessary and JIT will add upper 32-bit of user_vm_start
to such pointers.

Use traditional map->value_size * map->max_entries to calculate mmap sz,
though it's not the best fit.

Also don't set BTF at bpf_arena creation time, since the kernel doesn't support BTF for this map type.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/lib/bpf/libbpf.c        | 18 ++++++++++++++++++
 tools/lib/bpf/libbpf_probes.c |  6 ++++++
 2 files changed, 24 insertions(+)

Comments

Andrii Nakryiko Feb. 8, 2024, 1:15 a.m. UTC | #1
On Tue, Feb 6, 2024 at 2:05 PM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> From: Alexei Starovoitov <ast@kernel.org>
>
> mmap() bpf_arena right after creation, since the kernel needs to
> remember the address returned from mmap. This is user_vm_start.
> LLVM will generate bpf_arena_cast_user() instructions where
> necessary and JIT will add upper 32-bit of user_vm_start
> to such pointers.
>
> Use traditional map->value_size * map->max_entries to calculate mmap sz,
> though it's not the best fit.

We should probably make bpf_map_mmap_sz() aware of the specific map type
and do different calculations based on that. It makes sense to have
round_up(PAGE_SIZE) for the BPF arena map, and use just value_size or
max_entries to specify the size (fixing the other to be zero).

>
> Also don't set BTF at bpf_arena creation time, since it doesn't support it.
>
> Signed-off-by: Alexei Starovoitov <ast@kernel.org>
> ---
>  tools/lib/bpf/libbpf.c        | 18 ++++++++++++++++++
>  tools/lib/bpf/libbpf_probes.c |  6 ++++++
>  2 files changed, 24 insertions(+)
>
> diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
> index 01f407591a92..c5ce5946dc6d 100644
> --- a/tools/lib/bpf/libbpf.c
> +++ b/tools/lib/bpf/libbpf.c
> @@ -185,6 +185,7 @@ static const char * const map_type_name[] = {
>         [BPF_MAP_TYPE_BLOOM_FILTER]             = "bloom_filter",
>         [BPF_MAP_TYPE_USER_RINGBUF]             = "user_ringbuf",
>         [BPF_MAP_TYPE_CGRP_STORAGE]             = "cgrp_storage",
> +       [BPF_MAP_TYPE_ARENA]                    = "arena",
>  };
>
>  static const char * const prog_type_name[] = {
> @@ -4852,6 +4853,7 @@ static int bpf_object__create_map(struct bpf_object *obj, struct bpf_map *map, b
>         case BPF_MAP_TYPE_SOCKHASH:
>         case BPF_MAP_TYPE_QUEUE:
>         case BPF_MAP_TYPE_STACK:
> +       case BPF_MAP_TYPE_ARENA:
>                 create_attr.btf_fd = 0;
>                 create_attr.btf_key_type_id = 0;
>                 create_attr.btf_value_type_id = 0;
> @@ -4908,6 +4910,22 @@ static int bpf_object__create_map(struct bpf_object *obj, struct bpf_map *map, b
>         if (map->fd == map_fd)
>                 return 0;
>
> +       if (def->type == BPF_MAP_TYPE_ARENA) {
> +               size_t mmap_sz;
> +
> +               mmap_sz = bpf_map_mmap_sz(def->value_size, def->max_entries);
> +               map->mmaped = mmap((void *)map->map_extra, mmap_sz, PROT_READ | PROT_WRITE,
> +                                  map->map_extra ? MAP_SHARED | MAP_FIXED : MAP_SHARED,
> +                                  map_fd, 0);
> +               if (map->mmaped == MAP_FAILED) {
> +                       err = -errno;
> +                       map->mmaped = NULL;
> +                       pr_warn("map '%s': failed to mmap bpf_arena: %d\n",
> +                               bpf_map__name(map), err);
> +                       return err;

leaking map_fd here, you need to close(map_fd) before erroring out


> +               }
> +       }
> +
>         /* Keep placeholder FD value but now point it to the BPF map object.
>          * This way everything that relied on this map's FD (e.g., relocated
>          * ldimm64 instructions) will stay valid and won't need adjustments.
> diff --git a/tools/lib/bpf/libbpf_probes.c b/tools/lib/bpf/libbpf_probes.c
> index ee9b1dbea9eb..cbc7f4c09060 100644
> --- a/tools/lib/bpf/libbpf_probes.c
> +++ b/tools/lib/bpf/libbpf_probes.c
> @@ -338,6 +338,12 @@ static int probe_map_create(enum bpf_map_type map_type)
>                 key_size = 0;
>                 max_entries = 1;
>                 break;
> +       case BPF_MAP_TYPE_ARENA:
> +               key_size        = sizeof(__u64);
> +               value_size      = sizeof(__u64);
> +               opts.map_extra  = 0; /* can mmap() at any address */
> +               opts.map_flags  = BPF_F_MMAPABLE;
> +               break;
>         case BPF_MAP_TYPE_HASH:
>         case BPF_MAP_TYPE_ARRAY:
>         case BPF_MAP_TYPE_PROG_ARRAY:
> --
> 2.34.1
>
Alexei Starovoitov Feb. 8, 2024, 1:38 a.m. UTC | #2
On Wed, Feb 7, 2024 at 5:15 PM Andrii Nakryiko
<andrii.nakryiko@gmail.com> wrote:
>
> On Tue, Feb 6, 2024 at 2:05 PM Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
> >
> > From: Alexei Starovoitov <ast@kernel.org>
> >
> > mmap() bpf_arena right after creation, since the kernel needs to
> > remember the address returned from mmap. This is user_vm_start.
> > LLVM will generate bpf_arena_cast_user() instructions where
> > necessary and JIT will add upper 32-bit of user_vm_start
> > to such pointers.
> >
> > Use traditional map->value_size * map->max_entries to calculate mmap sz,
> > though it's not the best fit.
>
> We should probably make bpf_map_mmap_sz() aware of specific map type
> and do different calculations based on that. It makes sense to have
> round_up(PAGE_SIZE) for BPF map arena, and use just just value_size or
> max_entries to specify the size (fixing the other to be zero).

I went with value_size == key_size == 8 in order to be able to extend
it in the future and allow map_lookup/update/delete to do something
useful. Ex: lookup/delete can behave just like arena_alloc/free_pages.

Are you proposing to force key/value_size to zero ?
That was my first attempt.
key_size can be zero, but syscall side of lookup/update expects
a non-zero value_size for all maps regardless of type.
We can modify bpf/syscall.c, of course, but it feels arena would be
too different of a map if generic map handling code would need
to be specialized.

Then, since value_size is > 0, what sizes make sense?
When it's 8 it can be an indirection to anything.
key/value would be user pointers to other structs that
would be meaningful for an arena.
Right now it costs nothing to force both to 8 and pick any logic
when we decide what lookup/update should do.

But then, when value_size == 8, making max_entries mean the size of
the arena in bytes or pages starts to look odd and different from all
other maps.

We could go with max_entries==0 and value_size to mean the size of
arena in bytes, but it will prevent us from defining lookup/update
in the future, which doesn't feel right.

Considering all this I went with map->value_size * map->max_entries choice.
Though it's not pretty.

> > @@ -4908,6 +4910,22 @@ static int bpf_object__create_map(struct bpf_object *obj, struct bpf_map *map, b
> >         if (map->fd == map_fd)
> >                 return 0;
> >
> > +       if (def->type == BPF_MAP_TYPE_ARENA) {
> > +               size_t mmap_sz;
> > +
> > +               mmap_sz = bpf_map_mmap_sz(def->value_size, def->max_entries);
> > +               map->mmaped = mmap((void *)map->map_extra, mmap_sz, PROT_READ | PROT_WRITE,
> > +                                  map->map_extra ? MAP_SHARED | MAP_FIXED : MAP_SHARED,
> > +                                  map_fd, 0);
> > +               if (map->mmaped == MAP_FAILED) {
> > +                       err = -errno;
> > +                       map->mmaped = NULL;
> > +                       pr_warn("map '%s': failed to mmap bpf_arena: %d\n",
> > +                               bpf_map__name(map), err);
> > +                       return err;
>
> leaking map_fd here, you need to close(map_fd) before erroring out

ahh. good catch.
Andrii Nakryiko Feb. 8, 2024, 6:29 p.m. UTC | #3
On Wed, Feb 7, 2024 at 5:38 PM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Wed, Feb 7, 2024 at 5:15 PM Andrii Nakryiko
> <andrii.nakryiko@gmail.com> wrote:
> >
> > On Tue, Feb 6, 2024 at 2:05 PM Alexei Starovoitov
> > <alexei.starovoitov@gmail.com> wrote:
> > >
> > > From: Alexei Starovoitov <ast@kernel.org>
> > >
> > > mmap() bpf_arena right after creation, since the kernel needs to
> > > remember the address returned from mmap. This is user_vm_start.
> > > LLVM will generate bpf_arena_cast_user() instructions where
> > > necessary and JIT will add upper 32-bit of user_vm_start
> > > to such pointers.
> > >
> > > Use traditional map->value_size * map->max_entries to calculate mmap sz,
> > > though it's not the best fit.
> >
> > We should probably make bpf_map_mmap_sz() aware of specific map type
> > and do different calculations based on that. It makes sense to have
> > round_up(PAGE_SIZE) for BPF map arena, and use just just value_size or
> > max_entries to specify the size (fixing the other to be zero).
>
> I went with value_size == key_size == 8 in order to be able to extend
> it in the future and allow map_lookup/update/delete to do something
> useful. Ex: lookup/delete can behave just like arena_alloc/free_pages.
>
> Are you proposing to force key/value_size to zero ?

Yeah, I was thinking either (value_size=<size-in-bytes> and
max_entries=0) or (value_size=0 and max_entries=<size-in-bytes>). The
latter is what we do for BPF ringbuf, for example.

What you are saying about lookup/update already seems different from
any "normal" map anyways, so I'm not sure that's a good enough reason
to have hard-coded 8 for value size. And it seems like in practice
instead of doing lookup/update through syscall, the more natural way
of working with arena is going to be mmap() anyways, so not even sure
we need to implement the syscall side of lookup/update.

But just as an extra aside, what you have in mind for lookup/update
for the arena map can be generalized into "partial lookup/update" for
any map where it makes sense. I.e., instead of expecting the user to
read/update the entire value size, we can allow them to provide a
subrange to read/update (i.e., offset+size combo to specify subrange
within full map value range). This will work for the arena, but also
for most other maps (if not all) that currently support LOOKUP/UPDATE.

but specifically for bpf_map_mmap_sz(), regardless of what we decide
we should still change it to be something like:

switch (map_type) {
case BPF_MAP_TYPE_ARRAY:
    return <whatever we are doing right now>;
case BPF_MAP_TYPE_ARENA:
    /* round up to page size */
    return round_up(<whatever based on value_size and/or max_entries>,
page_size);
default:
    return 0; /* not supported */
}

we can also add a still different case for RINGBUF, where it's (2 *
max_entries). The general point is that mmapable size rules differ by
map type, so we best express this explicitly in this helper.


> That was my first attempt.
> key_size can be zero, but syscall side of lookup/update expects
> a non-zero value_size for all maps regardless of type.
> We can modify bpf/syscall.c, of course, but it feels arena would be
> too different of a map if generic map handling code would need
> to be specialized.
>
> Then since value_size is > 0 then what sizes make sense?
> When it's 8 it can be an indirection to anything.
> key/value would be user pointers to other structs that
> would be meaningful for an arena.
> Right now it costs nothing to force both to 8 and pick any logic
> when we decide what lookup/update should do.
>
> But then when value_size == 8 than making max_entries to
> mean the size of arena in bytes or pages.. starting to look odd
> and different from all other maps.
>
> We could go with max_entries==0 and value_size to mean the size of
> arena in bytes, but it will prevent us from defining lookup/update
> in the future, which doesn't feel right.
>
> Considering all this I went with map->value_size * map->max_entries choice.
> Though it's not pretty.
>
> > > @@ -4908,6 +4910,22 @@ static int bpf_object__create_map(struct bpf_object *obj, struct bpf_map *map, b
> > >         if (map->fd == map_fd)
> > >                 return 0;
> > >
> > > +       if (def->type == BPF_MAP_TYPE_ARENA) {
> > > +               size_t mmap_sz;
> > > +
> > > +               mmap_sz = bpf_map_mmap_sz(def->value_size, def->max_entries);
> > > +               map->mmaped = mmap((void *)map->map_extra, mmap_sz, PROT_READ | PROT_WRITE,
> > > +                                  map->map_extra ? MAP_SHARED | MAP_FIXED : MAP_SHARED,
> > > +                                  map_fd, 0);
> > > +               if (map->mmaped == MAP_FAILED) {
> > > +                       err = -errno;
> > > +                       map->mmaped = NULL;
> > > +                       pr_warn("map '%s': failed to mmap bpf_arena: %d\n",
> > > +                               bpf_map__name(map), err);
> > > +                       return err;
> >
> > leaking map_fd here, you need to close(map_fd) before erroring out
>
> ahh. good catch.
Alexei Starovoitov Feb. 8, 2024, 6:45 p.m. UTC | #4
On Thu, Feb 8, 2024 at 10:29 AM Andrii Nakryiko
<andrii.nakryiko@gmail.com> wrote:
>
> On Wed, Feb 7, 2024 at 5:38 PM Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
> >
> > On Wed, Feb 7, 2024 at 5:15 PM Andrii Nakryiko
> > <andrii.nakryiko@gmail.com> wrote:
> > >
> > > On Tue, Feb 6, 2024 at 2:05 PM Alexei Starovoitov
> > > <alexei.starovoitov@gmail.com> wrote:
> > > >
> > > > From: Alexei Starovoitov <ast@kernel.org>
> > > >
> > > > mmap() bpf_arena right after creation, since the kernel needs to
> > > > remember the address returned from mmap. This is user_vm_start.
> > > > LLVM will generate bpf_arena_cast_user() instructions where
> > > > necessary and JIT will add upper 32-bit of user_vm_start
> > > > to such pointers.
> > > >
> > > > Use traditional map->value_size * map->max_entries to calculate mmap sz,
> > > > though it's not the best fit.
> > >
> > > We should probably make bpf_map_mmap_sz() aware of specific map type
> > > and do different calculations based on that. It makes sense to have
> > > round_up(PAGE_SIZE) for BPF map arena, and use just just value_size or
> > > max_entries to specify the size (fixing the other to be zero).
> >
> > I went with value_size == key_size == 8 in order to be able to extend
> > it in the future and allow map_lookup/update/delete to do something
> > useful. Ex: lookup/delete can behave just like arena_alloc/free_pages.
> >
> > Are you proposing to force key/value_size to zero ?
>
> Yeah, I was thinking either (value_size=<size-in-bytes> and
> max_entries=0) or (value_size=0 and max_entries=<size-in-bytes>). The
> latter is what we do for BPF ringbuf, for example.

Ouch. since map_update_elem() does:
        value_size = bpf_map_value_size(map);
        value = kvmemdup_bpfptr(uvalue, value_size);
...
static inline void *kvmemdup_bpfptr(bpfptr_t src, size_t len)
{
        void *p = kvmalloc(len, GFP_USER | __GFP_NOWARN);

        if (!p)
                return ERR_PTR(-ENOMEM);
        if (copy_from_bpfptr(p, src, len)) {
...
        if (unlikely(!size))
                return ZERO_SIZE_PTR;

and it's probably crashing the kernel.

Looks like we have fixes to do anyway :(
Andrii Nakryiko Feb. 8, 2024, 6:54 p.m. UTC | #5
On Thu, Feb 8, 2024 at 10:45 AM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Thu, Feb 8, 2024 at 10:29 AM Andrii Nakryiko
> <andrii.nakryiko@gmail.com> wrote:
> >
> > On Wed, Feb 7, 2024 at 5:38 PM Alexei Starovoitov
> > <alexei.starovoitov@gmail.com> wrote:
> > >
> > > On Wed, Feb 7, 2024 at 5:15 PM Andrii Nakryiko
> > > <andrii.nakryiko@gmail.com> wrote:
> > > >
> > > > On Tue, Feb 6, 2024 at 2:05 PM Alexei Starovoitov
> > > > <alexei.starovoitov@gmail.com> wrote:
> > > > >
> > > > > From: Alexei Starovoitov <ast@kernel.org>
> > > > >
> > > > > mmap() bpf_arena right after creation, since the kernel needs to
> > > > > remember the address returned from mmap. This is user_vm_start.
> > > > > LLVM will generate bpf_arena_cast_user() instructions where
> > > > > necessary and JIT will add upper 32-bit of user_vm_start
> > > > > to such pointers.
> > > > >
> > > > > Use traditional map->value_size * map->max_entries to calculate mmap sz,
> > > > > though it's not the best fit.
> > > >
> > > > We should probably make bpf_map_mmap_sz() aware of specific map type
> > > > and do different calculations based on that. It makes sense to have
> > > > round_up(PAGE_SIZE) for BPF map arena, and use just just value_size or
> > > > max_entries to specify the size (fixing the other to be zero).
> > >
> > > I went with value_size == key_size == 8 in order to be able to extend
> > > it in the future and allow map_lookup/update/delete to do something
> > > useful. Ex: lookup/delete can behave just like arena_alloc/free_pages.
> > >
> > > Are you proposing to force key/value_size to zero ?
> >
> > Yeah, I was thinking either (value_size=<size-in-bytes> and
> > max_entries=0) or (value_size=0 and max_entries=<size-in-bytes>). The
> > latter is what we do for BPF ringbuf, for example.
>
> Ouch. since map_update_elem() does:
>         value_size = bpf_map_value_size(map);
>         value = kvmemdup_bpfptr(uvalue, value_size);
> ...
> static inline void *kvmemdup_bpfptr(bpfptr_t src, size_t len)
> {
>         void *p = kvmalloc(len, GFP_USER | __GFP_NOWARN);
>
>         if (!p)
>                 return ERR_PTR(-ENOMEM);
>         if (copy_from_bpfptr(p, src, len)) {
> ...
>         if (unlikely(!size))
>                 return ZERO_SIZE_PTR;
>
> and it's probably crashing the kernel.

You mean when doing this from SYSCALL program?

>
> Looks like we have fixes to do anyway :(

Yeah, it's kind of weird to first read key/value "memory", and then
getting -ENOTSUP for maps that don't support lookup/update. We should
error out sooner.
Alexei Starovoitov Feb. 8, 2024, 6:59 p.m. UTC | #6
On Thu, Feb 8, 2024 at 10:55 AM Andrii Nakryiko
<andrii.nakryiko@gmail.com> wrote:
>
> You mean when doing this from SYSCALL program?

both. regular syscall too.


> >
> > Looks like we have fixes to do anyway :(
>
> Yeah, it's kind of weird to first read key/value "memory", and then
> getting -ENOTSUP for maps that don't support lookup/update. We should
> error out sooner.

it's all over the place.
Probably better to hack bpf_map_value_size() to never return 0.
diff mbox series

Patch

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 01f407591a92..c5ce5946dc6d 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -185,6 +185,7 @@  static const char * const map_type_name[] = {
 	[BPF_MAP_TYPE_BLOOM_FILTER]		= "bloom_filter",
 	[BPF_MAP_TYPE_USER_RINGBUF]             = "user_ringbuf",
 	[BPF_MAP_TYPE_CGRP_STORAGE]		= "cgrp_storage",
+	[BPF_MAP_TYPE_ARENA]			= "arena",
 };
 
 static const char * const prog_type_name[] = {
@@ -4852,6 +4853,7 @@  static int bpf_object__create_map(struct bpf_object *obj, struct bpf_map *map, b
 	case BPF_MAP_TYPE_SOCKHASH:
 	case BPF_MAP_TYPE_QUEUE:
 	case BPF_MAP_TYPE_STACK:
+	case BPF_MAP_TYPE_ARENA:
 		create_attr.btf_fd = 0;
 		create_attr.btf_key_type_id = 0;
 		create_attr.btf_value_type_id = 0;
@@ -4908,6 +4910,22 @@  static int bpf_object__create_map(struct bpf_object *obj, struct bpf_map *map, b
 	if (map->fd == map_fd)
 		return 0;
 
+	if (def->type == BPF_MAP_TYPE_ARENA) {
+		size_t mmap_sz;
+
+		mmap_sz = bpf_map_mmap_sz(def->value_size, def->max_entries);
+		map->mmaped = mmap((void *)map->map_extra, mmap_sz, PROT_READ | PROT_WRITE,
+				   map->map_extra ? MAP_SHARED | MAP_FIXED : MAP_SHARED,
+				   map_fd, 0);
+		if (map->mmaped == MAP_FAILED) {
+			err = -errno;
+			map->mmaped = NULL;
+			pr_warn("map '%s': failed to mmap bpf_arena: %d\n",
+				bpf_map__name(map), err);
+			return err;
+		}
+	}
+
 	/* Keep placeholder FD value but now point it to the BPF map object.
 	 * This way everything that relied on this map's FD (e.g., relocated
 	 * ldimm64 instructions) will stay valid and won't need adjustments.
diff --git a/tools/lib/bpf/libbpf_probes.c b/tools/lib/bpf/libbpf_probes.c
index ee9b1dbea9eb..cbc7f4c09060 100644
--- a/tools/lib/bpf/libbpf_probes.c
+++ b/tools/lib/bpf/libbpf_probes.c
@@ -338,6 +338,12 @@  static int probe_map_create(enum bpf_map_type map_type)
 		key_size = 0;
 		max_entries = 1;
 		break;
+	case BPF_MAP_TYPE_ARENA:
+		key_size	= sizeof(__u64);
+		value_size	= sizeof(__u64);
+		opts.map_extra	= 0; /* can mmap() at any address */
+		opts.map_flags	= BPF_F_MMAPABLE;
+		break;
 	case BPF_MAP_TYPE_HASH:
 	case BPF_MAP_TYPE_ARRAY:
 	case BPF_MAP_TYPE_PROG_ARRAY: