diff mbox series

[bpf-next,v4,2/6] bpf: Add verifier support for dynptrs and implement malloc dynptrs

Message ID 20220509224257.3222614-3-joannelkoong@gmail.com (mailing list archive)
State Superseded
Delegated to: BPF
Headers show
Series Dynamic pointers | expand

Checks

Context Check Description
netdev/tree_selection success Clearly marked for bpf-next, async
netdev/fixes_present success Fixes tag not required for -next series
netdev/subject_prefix success Link
netdev/cover_letter success Series has a cover letter
netdev/patch_count success Link
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit fail Errors and warnings before: 1818 this patch: 1822
netdev/cc_maintainers warning 6 maintainers not CCed: netdev@vger.kernel.org kafai@fb.com john.fastabend@gmail.com yhs@fb.com kpsingh@kernel.org songliubraving@fb.com
netdev/build_clang fail Errors and warnings before: 196 this patch: 198
netdev/module_param success Was 0 now: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn fail Errors and warnings before: 1828 this patch: 1832
netdev/checkpatch fail ERROR: space prohibited before that ':' (ctx:WxV) WARNING: line length of 114 exceeds 80 columns WARNING: line length of 82 exceeds 80 columns WARNING: line length of 83 exceeds 80 columns WARNING: line length of 84 exceeds 80 columns WARNING: line length of 85 exceeds 80 columns WARNING: line length of 87 exceeds 80 columns WARNING: line length of 88 exceeds 80 columns WARNING: line length of 89 exceeds 80 columns WARNING: line length of 90 exceeds 80 columns WARNING: line length of 91 exceeds 80 columns WARNING: line length of 93 exceeds 80 columns WARNING: line length of 94 exceeds 80 columns WARNING: line length of 95 exceeds 80 columns WARNING: line length of 97 exceeds 80 columns WARNING: line length of 98 exceeds 80 columns
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline fail Was 0 now: 4
bpf/vmtest-bpf-next-PR fail merge-conflict
bpf/vmtest-bpf-next-VM_Test-1 success Logs for Kernel LATEST on ubuntu-latest + selftests
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Kernel LATEST on z15 + selftests

Commit Message

Joanne Koong May 9, 2022, 10:42 p.m. UTC
This patch adds the bulk of the verifier work for supporting dynamic
pointers (dynptrs) in bpf. This patch implements malloc-type dynptrs
through 2 new APIs (bpf_dynptr_alloc and bpf_dynptr_put) that can be
called by a bpf program. Malloc-type dynptrs are dynptrs that dynamically
allocate memory on behalf of the program.

A bpf_dynptr is opaque to the bpf program. It is a 16-byte structure
defined internally as:

struct bpf_dynptr_kern {
    void *data;
    u32 size;
    u32 offset;
} __aligned(8);

The upper 8 bits of *size* are reserved (they contain extra metadata about
read-only status and dynptr type); consequently, a dynptr only supports
memory less than 16 MB.

The 2 new APIs for malloc-type dynptrs are:

long bpf_dynptr_alloc(u32 size, u64 flags, struct bpf_dynptr *ptr);
void bpf_dynptr_put(struct bpf_dynptr *ptr);

Please note that there *must* be a corresponding bpf_dynptr_put for
every bpf_dynptr_alloc (even if the alloc fails). This is enforced
by the verifier.

In the verifier, dynptr state information will be tracked in stack
slots. When the program passes in an uninitialized dynptr
(ARG_PTR_TO_DYNPTR | MEM_UNINIT), the stack slots corresponding
to the frame pointer where the dynptr resides are marked STACK_DYNPTR.

For helper functions that take in initialized dynptrs (eg
bpf_dynptr_read + bpf_dynptr_write which are added later in this
patchset), the verifier enforces that the dynptr has been initialized
properly by checking that their corresponding stack slots have been marked
as STACK_DYNPTR. Dynptr release functions (eg bpf_dynptr_put) will clear
the stack slots. The verifier enforces at program exit that there are no
referenced dynptrs that haven't been released.

Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
---
 include/linux/bpf.h            |  62 ++++++++-
 include/linux/bpf_verifier.h   |  21 +++
 include/uapi/linux/bpf.h       |  30 +++++
 kernel/bpf/helpers.c           |  75 +++++++++++
 kernel/bpf/verifier.c          | 228 ++++++++++++++++++++++++++++++++-
 scripts/bpf_doc.py             |   2 +
 tools/include/uapi/linux/bpf.h |  30 +++++
 7 files changed, 445 insertions(+), 3 deletions(-)

Comments

Daniel Borkmann May 12, 2022, 12:05 a.m. UTC | #1
On 5/10/22 12:42 AM, Joanne Koong wrote:
[...]
> @@ -6498,6 +6523,11 @@ struct bpf_timer {
>   	__u64 :64;
>   } __attribute__((aligned(8)));
>   
> +struct bpf_dynptr {
> +	__u64 :64;
> +	__u64 :64;
> +} __attribute__((aligned(8)));
> +
>   struct bpf_sysctl {
>   	__u32	write;		/* Sysctl is being read (= 0) or written (= 1).
>   				 * Allows 1,2,4-byte read, but no write.
> diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
> index 8a2398ac14c2..a4272e9239ea 100644
> --- a/kernel/bpf/helpers.c
> +++ b/kernel/bpf/helpers.c
> @@ -1396,6 +1396,77 @@ const struct bpf_func_proto bpf_kptr_xchg_proto = {
>   	.arg2_btf_id  = BPF_PTR_POISON,
>   };
>   
> +void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data, enum bpf_dynptr_type type,
> +		     u32 offset, u32 size)
> +{
> +	ptr->data = data;
> +	ptr->offset = offset;
> +	ptr->size = size;
> +	bpf_dynptr_set_type(ptr, type);
> +}
> +
> +void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr)
> +{
> +	memset(ptr, 0, sizeof(*ptr));
> +}
> +
> +BPF_CALL_3(bpf_dynptr_alloc, u32, size, u64, flags, struct bpf_dynptr_kern *, ptr)
> +{
> +	gfp_t gfp_flags = GFP_ATOMIC;

nit: should also have __GFP_NOWARN

I presume mem accounting cannot be done on this one given there is no real "ownership"
of this piece of mem?

Was planning to run some more local tests tomorrow, but from glance at selftest side
I haven't seen sanity checks like these:

bpf_dynptr_alloc(8, 0, &ptr);
data = bpf_dynptr_data(&ptr, 0, 0);
bpf_dynptr_put(&ptr);
*(__u8 *)data = 23;

How is this prevented? I think you do a ptr id check in the is_dynptr_ref_function
check on the acquire function, but with above use, would our data pointer escape, or
get invalidated via last put?

Thanks,
Daniel
Joanne Koong May 12, 2022, 8:03 p.m. UTC | #2
On Wed, May 11, 2022 at 5:05 PM Daniel Borkmann <daniel@iogearbox.net> wrote:
>
> On 5/10/22 12:42 AM, Joanne Koong wrote:
> [...]
> > @@ -6498,6 +6523,11 @@ struct bpf_timer {
> >       __u64 :64;
> >   } __attribute__((aligned(8)));
> >
> > +struct bpf_dynptr {
> > +     __u64 :64;
> > +     __u64 :64;
> > +} __attribute__((aligned(8)));
> > +
> >   struct bpf_sysctl {
> >       __u32   write;          /* Sysctl is being read (= 0) or written (= 1).
> >                                * Allows 1,2,4-byte read, but no write.
> > diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
> > index 8a2398ac14c2..a4272e9239ea 100644
> > --- a/kernel/bpf/helpers.c
> > +++ b/kernel/bpf/helpers.c
> > @@ -1396,6 +1396,77 @@ const struct bpf_func_proto bpf_kptr_xchg_proto = {
> >       .arg2_btf_id  = BPF_PTR_POISON,
> >   };
> >
> > +void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data, enum bpf_dynptr_type type,
> > +                  u32 offset, u32 size)
> > +{
> > +     ptr->data = data;
> > +     ptr->offset = offset;
> > +     ptr->size = size;
> > +     bpf_dynptr_set_type(ptr, type);
> > +}
> > +
> > +void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr)
> > +{
> > +     memset(ptr, 0, sizeof(*ptr));
> > +}
> > +
> > +BPF_CALL_3(bpf_dynptr_alloc, u32, size, u64, flags, struct bpf_dynptr_kern *, ptr)
> > +{
> > +     gfp_t gfp_flags = GFP_ATOMIC;
>
> nit: should also have __GFP_NOWARN
I will add this in to v5
>
> I presume mem accounting cannot be done on this one given there is no real "ownership"
> of this piece of mem?
I'm not too familiar with memory accounting, but I think the ownership
can get ambiguous given that the memory can be persisted in a map and
"owned" by different bpf programs (eg the one that frees it may not be
the same one that allocated it)
>
> Was planning to run some more local tests tomorrow, but from glance at selftest side
> I haven't seen sanity checks like these:
>
> bpf_dynptr_alloc(8, 0, &ptr);
> data = bpf_dynptr_data(&ptr, 0, 0);
> bpf_dynptr_put(&ptr);
> *(__u8 *)data = 23;
>
> How is this prevented? I think you do a ptr id check in the is_dynptr_ref_function
> check on the acquire function, but with above use, would our data pointer escape, or
> get invalidated via last put?

There's a subtest inside the dynptr_fail.c file called
"data_slice_use_after_put" that does:

bpf_dynptr_alloc(8, 0, &ptr);
data = bpf_dynptr_data(&ptr, 0, 8);
bpf_dynptr_put(&ptr);
val = *(__u8 *)data;

and checks that trying to dereference the data slice in that last line
fails the verifier (with error msg "invalid mem access 'scalar'")

In the verifier, the call to bpf_dynptr_put will invalidate any data
slices associated with the dynptr. This happens in
unmark_stack_slots_dynptr() which calls release_reference() which
marks the data slice reg as an unknown scalar value. When you try to
then dereference the data slice, the verifier rejects it with an
"invalid mem access 'scalar'" message.

Thanks for your comments.
>
> Thanks,
> Daniel
Daniel Borkmann May 13, 2022, 1:12 p.m. UTC | #3
On 5/12/22 10:03 PM, Joanne Koong wrote:
> On Wed, May 11, 2022 at 5:05 PM Daniel Borkmann <daniel@iogearbox.net> wrote:
>> On 5/10/22 12:42 AM, Joanne Koong wrote:
>> [...]
>>> @@ -6498,6 +6523,11 @@ struct bpf_timer {
>>>        __u64 :64;
>>>    } __attribute__((aligned(8)));
>>>
>>> +struct bpf_dynptr {
>>> +     __u64 :64;
>>> +     __u64 :64;
>>> +} __attribute__((aligned(8)));
>>> +
>>>    struct bpf_sysctl {
>>>        __u32   write;          /* Sysctl is being read (= 0) or written (= 1).
>>>                                 * Allows 1,2,4-byte read, but no write.
>>> diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
>>> index 8a2398ac14c2..a4272e9239ea 100644
>>> --- a/kernel/bpf/helpers.c
>>> +++ b/kernel/bpf/helpers.c
>>> @@ -1396,6 +1396,77 @@ const struct bpf_func_proto bpf_kptr_xchg_proto = {
>>>        .arg2_btf_id  = BPF_PTR_POISON,
>>>    };
>>>
>>> +void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data, enum bpf_dynptr_type type,
>>> +                  u32 offset, u32 size)
>>> +{
>>> +     ptr->data = data;
>>> +     ptr->offset = offset;
>>> +     ptr->size = size;
>>> +     bpf_dynptr_set_type(ptr, type);
>>> +}
>>> +
>>> +void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr)
>>> +{
>>> +     memset(ptr, 0, sizeof(*ptr));
>>> +}
>>> +
>>> +BPF_CALL_3(bpf_dynptr_alloc, u32, size, u64, flags, struct bpf_dynptr_kern *, ptr)
>>> +{
>>> +     gfp_t gfp_flags = GFP_ATOMIC;
>>
>> nit: should also have __GFP_NOWARN
> I will add this in to v5
>>
>> I presume mem accounting cannot be done on this one given there is no real "ownership"
>> of this piece of mem?
> I'm not too familiar with memory accounting, but I think the ownership
> can get ambiguous given that the memory can be persisted in a map and
> "owned" by different bpf programs (eg the one that frees it may not be
> the same one that allocated it)

Right, it's ambiguous. My worry in particular is that where you added the
BPF_FUNC_dynptr_alloc in the bpf_base_func_proto() with this, e.g. it would
also mean that even unprivileged bpf would be able to allocate huge chunks
of DYNPTR_MAX_SIZE (resp. whatever max kmalloc memory under atomic constraints
can provide) without holding anyone accountable for it.

Thinking more about it, is there even any value for BPF_FUNC_dynptr_* for
fully unpriv BPF if these are rejected anyway by the spectre mitigations
from verifier?

>> Was planning to run some more local tests tomorrow, but from glance at selftest side
>> I haven't seen sanity checks like these:
>>
>> bpf_dynptr_alloc(8, 0, &ptr);
>> data = bpf_dynptr_data(&ptr, 0, 0);
>> bpf_dynptr_put(&ptr);
>> *(__u8 *)data = 23;
>>
>> How is this prevented? I think you do a ptr id check in the is_dynptr_ref_function
>> check on the acquire function, but with above use, would our data pointer escape, or
>> get invalidated via last put?
> 
> There's a subtest inside the dynptr_fail.c file called
> "data_slice_use_after_put" that does:
> 
> bpf_dynptr_alloc(8, 0, &ptr);
> data =bpf_dynptr_data(&ptr, 0, 8);
> bpf_dynptr_put(&ptr);
> val = *(__u8 *)data;
> 
> and checks that trying to dereference the data slice in that last line
> fails the verifier (with error msg "invalid mem access 'scalar'")
> 
> In the verifier, the call to bpf_dynptr_put will invalidate any data
> slices associated with the dyntpr. This happens in
> unmark_stack_slots_dynptr() which calls release_reference() which
> marks the data slice reg as an unknown scalar value. When you try to
> then dereference the data slice, the verifier rejects it with an
> "invalid mem access 'scalar'" message.

Got it, thanks for clarifying! While playing around a bit locally with the
selftests and in relation to above earlier statement, one thing I noticed is
that bpf_dynptr_alloc() and subsequent read is allowed:

bpf_dynptr_alloc(8, 0, &ptr);
bpf_dynptr_read(tmp, sizeof(tmp), &ptr, 0);
bpf_dynptr_put(&ptr);

bpf_dynptr_alloc(8, 0, &ptr);
data = bpf_dynptr_data(&ptr, 0, 8);
// read access uninit data[x]
bpf_dynptr_put(&ptr);

So either alloc always builds in __GFP_ZERO, or bpf_dynptr_alloc()
helper usage should go under perfmon_capable(), where it's allowed to read
kernel mem.

Thanks,
Daniel
Alexei Starovoitov May 13, 2022, 4:39 p.m. UTC | #4
On Fri, May 13, 2022 at 03:12:06PM +0200, Daniel Borkmann wrote:
> 
> Thinking more about it, is there even any value for BPF_FUNC_dynptr_* for
> fully unpriv BPF if these are rejected anyway by the spectre mitigations
> from verifier?
...
> So either for alloc, we always built-in __GFP_ZERO or bpf_dynptr_alloc()
> helper usage should go under perfmon_capable() where it's allowed to read
> kernel mem.

dynptr should probably be cap_bpf and cap_perfmon for now.
Otherwise we will start adding cap_perfmon checks in run-time to helpers
which is not easy to do. Some sort of prog or user context would need
to be passed as hidden arg into helper. That's too much hassle just
to enable dynptr for cap_bpf only.

Similar problem with gfp_account... remembering memcg and passing all
the way to bpf_dynptr_alloc helper is not easy. And it's not clear
which memcg to use. The one of the task that loaded that bpf prog?
That task could be gone and its cgroup in the dying stage.
bpf prog is executing in some context and allocating memory for itself.
Like kernel allocates memory for its needs. It doesn't feel right to
charge prog's memcg in that case. It probably should be an explicit choice
by bpf program author. Maybe in the future we can introduce a fake map
for such accounting needs and bpf prog could pass a map pointer to
bpf_dynptr_alloc. When such fake and empty map is created the memcg
would be recorded the same way we do for existing normal maps.
Then the helper will look like:
bpf_dynptr_alloc(struct bpf_map *map, u32 size, u64 flags, struct bpf_dynptr *ptr)
{
  set_active_memcg(map->memcg);
  kmalloc into dynptr;
}

Should we do this change now and allow NULL to be passed as a map ?
This way the bpf prog will have a choice whether to account into memcg or not.
Maybe it's all overkill and none of this is needed?

On the other side maybe map should be a mandatory argument and dynptr_alloc
can do its own memory accounting for stats ? atomic inc and dec is probably
an acceptable overhead? bpftool will print the dynptr allocation stats.
All sounds nice and extra visibility is great, but the kernel code that
allocates for the kernel doesn't use memcg. bpf progs semantically are part of
the kernel whereas memcg is a mechanism to restrict memory that kernel
allocated on behalf of user tasks. We abused memcg for bpf progs/maps
to have a limit. Not clear whether we should continue doing so for dynptr_alloc
and in the future for kptr_alloc. gfp_account adds overhead too. It's not free.
Thoughts?
Daniel Borkmann May 13, 2022, 7:28 p.m. UTC | #5
On 5/13/22 6:39 PM, Alexei Starovoitov wrote:
> On Fri, May 13, 2022 at 03:12:06PM +0200, Daniel Borkmann wrote:
>>
>> Thinking more about it, is there even any value for BPF_FUNC_dynptr_* for
>> fully unpriv BPF if these are rejected anyway by the spectre mitigations
>> from verifier?
> ...
>> So either for alloc, we always built-in __GFP_ZERO or bpf_dynptr_alloc()
>> helper usage should go under perfmon_capable() where it's allowed to read
>> kernel mem.
> 
> dynptr should probably by cap_bpf and cap_perfmon for now.
> Otherwise we will start adding cap_perfmon checks in run-time to helpers
> which is not easy to do. Some sort of prog or user context would need
> to be passed as hidden arg into helper. That's too much hassle just
> to enable dynptr for cap_bpf only.
> 
> Similar problem with gfp_account... remembering memcg and passing all
> the way to bpf_dynptr_alloc helper is not easy. And it's not clear
> which memcg to use. The one where task was that loaded that bpf prog?
> That task could have been gone and cgroup is in dying stage.
> bpf prog is executing some context and allocating memory for itself.
> Like kernel allocates memory for its needs. It doesn't feel right to
> charge prog's memcg in that case. It probably should be an explicit choice
> by bpf program author. Maybe in the future we can introduce a fake map
> for such accounting needs and bpf prog could pass a map pointer to
> bpf_dynptr_alloc. When such fake and empty map is created the memcg
> would be recorded the same way we do for existing normal maps.
> Then the helper will look like:
> bpf_dynptr_alloc(struct bpf_map *map, u32 size, u64 flags, struct bpf_dynptr *ptr)
> {
>    set_active_memcg(map->memcg);
>    kmalloc into dynptr;
> }
> 
> Should we do this change now and allow NULL to be passed as a map ?

Hm, this looks a bit too much like a hack, I wouldn't do that, fwiw.

> This way the bpf prog will have a choice whether to account into memcg or not.
> Maybe it's all overkill and none of this needed?
> 
> On the other side maybe map should be a mandatory argument and dynptr_alloc
> can do its own memory accounting for stats ? atomic inc and dec is probably
> an acceptable overhead? bpftool will print the dynptr allocation stats.
> All sounds nice and extra visibility is great, but the kernel code that
> allocates for the kernel doesn't use memcg. bpf progs semantically are part of
> the kernel whereas memcg is a mechanism to restrict memory that kernel
> allocated on behalf of user tasks. We abused memcg for bpf progs/maps
> to have a limit. Not clear whether we should continue doing so for dynpr_alloc
> and in the future for kptr_alloc. gfp_account adds overhead too. It's not free.
> Thoughts?

Great question, I think the memcg is useful, just that the ownership for bpf
progs/maps has been relying on current whereas current is not a real 'owner',
just the entity which did the loading.

Maybe we need some sort of memcg object for bpf where we can "bind" the prog
and map to it at load time, which is then different from current and can be
flexibly set, e.g. fd = open(/sys/fs/cgroup/memory/<foo>) and pass that fd to
BPF_PROG_LOAD and BPF_MAP_CREATE via bpf_attr (otherwise, if not set, then
no accounting)?

Thanks,
Daniel
Andrii Nakryiko May 13, 2022, 8:59 p.m. UTC | #6
On Mon, May 9, 2022 at 3:44 PM Joanne Koong <joannelkoong@gmail.com> wrote:
>
> This patch adds the bulk of the verifier work for supporting dynamic
> pointers (dynptrs) in bpf. This patch implements malloc-type dynptrs
> through 2 new APIs (bpf_dynptr_alloc and bpf_dynptr_put) that can be
> called by a bpf program. Malloc-type dynptrs are dynptrs that dynamically
> allocate memory on behalf of the program.
>
> A bpf_dynptr is opaque to the bpf program. It is a 16-byte structure
> defined internally as:
>
> struct bpf_dynptr_kern {
>     void *data;
>     u32 size;
>     u32 offset;
> } __aligned(8);
>
> The upper 8 bits of *size* is reserved (it contains extra metadata about
> read-only status and dynptr type); consequently, a dynptr only supports
> memory less than 16 MB.
>
> The 2 new APIs for malloc-type dynptrs are:
>
> long bpf_dynptr_alloc(u32 size, u64 flags, struct bpf_dynptr *ptr);
> void bpf_dynptr_put(struct bpf_dynptr *ptr);
>
> Please note that there *must* be a corresponding bpf_dynptr_put for
> every bpf_dynptr_alloc (even if the alloc fails). This is enforced
> by the verifier.
>
> In the verifier, dynptr state information will be tracked in stack
> slots. When the program passes in an uninitialized dynptr
> (ARG_PTR_TO_DYNPTR | MEM_UNINIT), the stack slots corresponding
> to the frame pointer where the dynptr resides at are marked STACK_DYNPTR.
>
> For helper functions that take in initialized dynptrs (eg
> bpf_dynptr_read + bpf_dynptr_write which are added later in this
> patchset), the verifier enforces that the dynptr has been initialized
> properly by checking that their corresponding stack slots have been marked
> as STACK_DYNPTR. Dynptr release functions (eg bpf_dynptr_put) will clear
> the stack slots. The verifier enforces at program exit that there are no
> referenced dynptrs that haven't been released.
>
> Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
> ---
>  include/linux/bpf.h            |  62 ++++++++-
>  include/linux/bpf_verifier.h   |  21 +++
>  include/uapi/linux/bpf.h       |  30 +++++
>  kernel/bpf/helpers.c           |  75 +++++++++++
>  kernel/bpf/verifier.c          | 228 ++++++++++++++++++++++++++++++++-
>  scripts/bpf_doc.py             |   2 +
>  tools/include/uapi/linux/bpf.h |  30 +++++
>  7 files changed, 445 insertions(+), 3 deletions(-)
>

Apart from what Daniel and Alexei are discussing, LGTM

Acked-by: Andrii Nakryiko <andrii@kernel.org>

[...]
Andrii Nakryiko May 13, 2022, 9:04 p.m. UTC | #7
On Fri, May 13, 2022 at 12:28 PM Daniel Borkmann <daniel@iogearbox.net> wrote:
>
> On 5/13/22 6:39 PM, Alexei Starovoitov wrote:
> > On Fri, May 13, 2022 at 03:12:06PM +0200, Daniel Borkmann wrote:
> >>
> >> Thinking more about it, is there even any value for BPF_FUNC_dynptr_* for
> >> fully unpriv BPF if these are rejected anyway by the spectre mitigations
> >> from verifier?
> > ...
> >> So either for alloc, we always built-in __GFP_ZERO or bpf_dynptr_alloc()
> >> helper usage should go under perfmon_capable() where it's allowed to read
> >> kernel mem.
> >
> > dynptr should probably by cap_bpf and cap_perfmon for now.
> > Otherwise we will start adding cap_perfmon checks in run-time to helpers
> > which is not easy to do. Some sort of prog or user context would need
> > to be passed as hidden arg into helper. That's too much hassle just
> > to enable dynptr for cap_bpf only.
> >
> > Similar problem with gfp_account... remembering memcg and passing all
> > the way to bpf_dynptr_alloc helper is not easy. And it's not clear
> > which memcg to use. The one where task was that loaded that bpf prog?
> > That task could have been gone and cgroup is in dying stage.
> > bpf prog is executing some context and allocating memory for itself.
> > Like kernel allocates memory for its needs. It doesn't feel right to
> > charge prog's memcg in that case. It probably should be an explicit choice
> > by bpf program author. Maybe in the future we can introduce a fake map
> > for such accounting needs and bpf prog could pass a map pointer to
> > bpf_dynptr_alloc. When such fake and empty map is created the memcg
> > would be recorded the same way we do for existing normal maps.
> > Then the helper will look like:
> > bpf_dynptr_alloc(struct bpf_map *map, u32 size, u64 flags, struct bpf_dynptr *ptr)
> > {
> >    set_active_memcg(map->memcg);
> >    kmalloc into dynptr;
> > }
> >
> > Should we do this change now and allow NULL to be passed as a map ?
>
> Hm, this looks a bit too much like a hack, I wouldn't do that, fwiw.
>
> > This way the bpf prog will have a choice whether to account into memcg or not.
> > Maybe it's all overkill and none of this needed?
> >
> > On the other side maybe map should be a mandatory argument and dynptr_alloc
> > can do its own memory accounting for stats ? atomic inc and dec is probably
> > an acceptable overhead? bpftool will print the dynptr allocation stats.
> > All sounds nice and extra visibility is great, but the kernel code that
> > allocates for the kernel doesn't use memcg. bpf progs semantically are part of
> > the kernel whereas memcg is a mechanism to restrict memory that kernel
> > allocated on behalf of user tasks. We abused memcg for bpf progs/maps
> > to have a limit. Not clear whether we should continue doing so for dynpr_alloc
> > and in the future for kptr_alloc. gfp_account adds overhead too. It's not free.
> > Thoughts?
>
> Great question, I think the memcg is useful, just that the ownership for bpf
> progs/maps has been relying on current whereas current is not a real 'owner',
> just the entity which did the loading.
>
> Maybe we need some sort of memcg object for bpf where we can "bind" the prog
> and map to it at load time, which is then different from current and can be
> flexibly set, e.g. fd = open(/sys/fs/cgroup/memory/<foo>) and pass that fd to
> BPF_PROG_LOAD and BPF_MAP_CREATE via bpf_attr (otherwise, if not set, then
> no accounting)?
>

I think it would be great to have memory accounting for BPF program as
a separate entity from current. BPF program is sort of like a special
process w.r.t. memory that it owns. Good thing is that with
bpf_run_ctx (once wired for all program types) such "ambient" entities
can be easily accessed from helpers to do accounting without any
verifier magic involved.

> Thanks,
> Daniel
David Vernet May 13, 2022, 9:36 p.m. UTC | #8
On Mon, May 09, 2022 at 03:42:53PM -0700, Joanne Koong wrote:
> This patch adds the bulk of the verifier work for supporting dynamic
> pointers (dynptrs) in bpf. This patch implements malloc-type dynptrs
> through 2 new APIs (bpf_dynptr_alloc and bpf_dynptr_put) that can be
> called by a bpf program. Malloc-type dynptrs are dynptrs that dynamically
> allocate memory on behalf of the program.
> 
> A bpf_dynptr is opaque to the bpf program. It is a 16-byte structure
> defined internally as:
> 
> struct bpf_dynptr_kern {
>     void *data;
>     u32 size;
>     u32 offset;
> } __aligned(8);
> 
> The upper 8 bits of *size* is reserved (it contains extra metadata about
> read-only status and dynptr type); consequently, a dynptr only supports
> memory less than 16 MB.
> 

Small nit: s/less than/up to?


[...]

> +/* the implementation of the opaque uapi struct bpf_dynptr */
> +struct bpf_dynptr_kern {
> +	void *data;
> +	/* Size represents the number of usable bytes in the dynptr.
> +	 * If for example the offset is at 200 for a malloc dynptr with
> +	 * allocation size 256, the number of usable bytes is 56.
> +	 *
> +	 * The upper 8 bits are reserved.
> +	 * Bit 31 denotes whether the dynptr is read-only.
> +	 * Bits 28-30 denote the dynptr type.

It's pretty clear from context, but just for completeness, could you also
explicitly specify what bits 0 - 27 denote (24 - 27 reserved, 0 - 23 size)?

> +	 */
> +	u32 size;
> +	u32 offset;
> +} __aligned(8);
> +
> +enum bpf_dynptr_type {
> +	BPF_DYNPTR_TYPE_INVALID,
> +	/* Memory allocated dynamically by the kernel for the dynptr */
> +	BPF_DYNPTR_TYPE_MALLOC,
> +};
> +
> +/* Since the upper 8 bits of dynptr->size is reserved, the
> + * maximum supported size is 2^24 - 1.
> + */
> +#define DYNPTR_MAX_SIZE	((1UL << 24) - 1)
> +#define DYNPTR_SIZE_MASK	0xFFFFFF
> +#define DYNPTR_TYPE_SHIFT	28
> +#define DYNPTR_TYPE_MASK	0x7

Should we add a static_assert(DYNPTR_SIZE_MASK >= DYNPTR_MAX_SIZE);
Potentially overkill, but if we're going to have separate macros for them
it might be prudent to add it?

[...]

> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index 0fe1dea520ae..8cdedc776987 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -187,6 +187,10 @@ struct bpf_verifier_stack_elem {
>  					  POISON_POINTER_DELTA))
>  #define BPF_MAP_PTR(X)		((struct bpf_map *)((X) & ~BPF_MAP_PTR_UNPRIV))
>  
> +static bool arg_type_is_mem_size(enum bpf_arg_type type);
> +static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx);
> +static int release_reference(struct bpf_verifier_env *env, int ref_obj_id);
> +
>  static bool bpf_map_ptr_poisoned(const struct bpf_insn_aux_data *aux)
>  {
>  	return BPF_MAP_PTR(aux->map_ptr_state) == BPF_MAP_PTR_POISON;
> @@ -259,6 +263,7 @@ struct bpf_call_arg_meta {
>  	u32 ret_btf_id;
>  	u32 subprogno;
>  	struct bpf_map_value_off_desc *kptr_off_desc;
> +	u8 uninit_dynptr_regno;
>  };
>  
>  struct btf *btf_vmlinux;
> @@ -580,6 +585,7 @@ static char slot_type_char[] = {
>  	[STACK_SPILL]	= 'r',
>  	[STACK_MISC]	= 'm',
>  	[STACK_ZERO]	= '0',
> +	[STACK_DYNPTR]	= 'd',
>  };
>  
>  static void print_liveness(struct bpf_verifier_env *env,
> @@ -595,6 +601,25 @@ static void print_liveness(struct bpf_verifier_env *env,
>  		verbose(env, "D");
>  }
>  
> +static inline int get_spi(s32 off)
> +{
> +	return (-off - 1) / BPF_REG_SIZE;
> +}

Small / optional nit: It's probably harmless to leave this as inline as the
compiler will almost certainly inline it for you, but to that point, it's
probably not necessary to mark this as inline. It looks like most other
static functions in verifier.c are non-inline, so IMO it's probably best to
follow that lead.

[...]

>  static int check_reg_type(struct bpf_verifier_env *env, u32 regno,
> @@ -5725,7 +5885,16 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
>  
>  skip_type_check:
>  	if (arg_type_is_release(arg_type)) {
> -		if (!reg->ref_obj_id && !register_is_null(reg)) {
> +		if (arg_type_is_dynptr(arg_type)) {
> +			struct bpf_func_state *state = func(env, reg);
> +			int spi = get_spi(reg->off);
> +
> +			if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS) ||
> +			    !state->stack[spi].spilled_ptr.id) {
> +				verbose(env, "arg %d is an unacquired reference\n", regno);
> +				return -EINVAL;
> +			}
> +		} else if (!reg->ref_obj_id && !register_is_null(reg)) {
>  			verbose(env, "R%d must be referenced when passed to release function\n",
>  				regno);
>  			return -EINVAL;
> @@ -5837,6 +6006,43 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
>  		bool zero_size_allowed = (arg_type == ARG_CONST_SIZE_OR_ZERO);
>  
>  		err = check_mem_size_reg(env, reg, regno, zero_size_allowed, meta);
> +	} else if (arg_type_is_dynptr(arg_type)) {
> +		/* Can't pass in a dynptr at a weird offset */
> +		if (reg->off % BPF_REG_SIZE) {
> +			verbose(env, "cannot pass in non-zero dynptr offset\n");
> +			return -EINVAL;
> +		}

Should this check be moved to check_func_arg_reg_off()?

> +
> +		if (arg_type & MEM_UNINIT)  {
> +			if (!is_dynptr_reg_valid_uninit(env, reg)) {
> +				verbose(env, "Arg #%d dynptr has to be an uninitialized dynptr\n",
> +					arg + BPF_REG_1);
> +				return -EINVAL;
> +			}
> +
> +			/* We only support one dynptr being uninitialized at the moment,
> +			 * which is sufficient for the helper functions we have right now.
> +			 */
> +			if (meta->uninit_dynptr_regno) {
> +				verbose(env, "verifier internal error: more than one uninitialized dynptr arg\n");
> +				return -EFAULT;
> +			}
> +
> +			meta->uninit_dynptr_regno = arg + BPF_REG_1;

Can this be simplified to:

meta->uninit_dynptr_regno = regno;

[...]

Looks good otherwise, thanks!

Acked-by: David Vernet <void@manifault.com>
Alexei Starovoitov May 13, 2022, 10:16 p.m. UTC | #9
On Fri, May 13, 2022 at 09:28:03PM +0200, Daniel Borkmann wrote:
> On 5/13/22 6:39 PM, Alexei Starovoitov wrote:
> > On Fri, May 13, 2022 at 03:12:06PM +0200, Daniel Borkmann wrote:
> > > 
> > > Thinking more about it, is there even any value for BPF_FUNC_dynptr_* for
> > > fully unpriv BPF if these are rejected anyway by the spectre mitigations
> > > from verifier?
> > ...
> > > So either for alloc, we always build in __GFP_ZERO or bpf_dynptr_alloc()
> > > helper usage should go under perfmon_capable() where it's allowed to read
> > > kernel mem.
> > 
> > dynptr should probably be cap_bpf and cap_perfmon for now.
> > Otherwise we will start adding cap_perfmon checks in run-time to helpers
> > which is not easy to do. Some sort of prog or user context would need
> > to be passed as hidden arg into helper. That's too much hassle just
> > to enable dynptr for cap_bpf only.
> > 
> > Similar problem with gfp_account... remembering memcg and passing all
> > the way to bpf_dynptr_alloc helper is not easy. And it's not clear
> > which memcg to use. The one of the task that loaded that bpf prog?
> > That task could be gone and its cgroup could be in the dying stage.
> > bpf prog is executing in some context and allocating memory for itself.
> > Like kernel allocates memory for its needs. It doesn't feel right to
> > charge prog's memcg in that case. It probably should be an explicit choice
> > by bpf program author. Maybe in the future we can introduce a fake map
> > for such accounting needs and bpf prog could pass a map pointer to
> > bpf_dynptr_alloc. When such fake and empty map is created the memcg
> > would be recorded the same way we do for existing normal maps.
> > Then the helper will look like:
> > bpf_dynptr_alloc(struct bpf_map *map, u32 size, u64 flags, struct bpf_dynptr *ptr)
> > {
> >    set_active_memcg(map->memcg);
> >    kmalloc into dynptr;
> > }
> > 
> > Should we do this change now and allow NULL to be passed as a map ?
> 
> Hm, this looks a bit too much like a hack, I wouldn't do that, fwiw.
> 
> > This way the bpf prog will have a choice whether to account into memcg or not.
> > Maybe it's all overkill and none of this needed?
> > 
> > On the other side maybe map should be a mandatory argument and dynptr_alloc
> > can do its own memory accounting for stats ? atomic inc and dec is probably
> > an acceptable overhead? bpftool will print the dynptr allocation stats.
> > All sounds nice and extra visibility is great, but the kernel code that
> > allocates for the kernel doesn't use memcg. bpf progs semantically are part of
> > the kernel whereas memcg is a mechanism to restrict memory that kernel
> > allocated on behalf of user tasks. We abused memcg for bpf progs/maps
> > to have a limit. Not clear whether we should continue doing so for dynptr_alloc
> > and in the future for kptr_alloc. gfp_account adds overhead too. It's not free.
> > Thoughts?
> 
> Great question, I think the memcg is useful, just that the ownership for bpf
> progs/maps has been relying on current whereas current is not a real 'owner',
> just the entity which did the loading.
> 
> Maybe we need some sort of memcg object for bpf where we can "bind" the prog
> and map to it at load time, which is then different from current and can be
> flexibly set, e.g. fd = open(/sys/fs/cgroup/memory/<foo>) and pass that fd to
> BPF_PROG_LOAD and BPF_MAP_CREATE via bpf_attr (otherwise, if not set, then
> no accounting)?

Agree. Explicitly specifying memcg by FD would be nice.
It will be useful for normal maps and progs.
This is a bit orthogonal to having a map argument to bpf_dynptr/kptr_alloc.

Here is the main reason why we probably should have it mandatory:
kmalloc cannot be called from nmi and in general cannot be called from tracing.
kprobe/fentry could be inside slab or page alloc path and it might blow up.
That's the reason why hashmap defaults to pre-alloc.
In order to do pre-alloc in bpf_dynptr/kptr_alloc() it has to have a map-like
argument that will keep the info about preallocated memory.

How about the following api:
mem = bpf_map_create(BPF_MAP_TYPE_MEMORY); // from user space
bpf_mem_prealloc(mem, size); // preallocate memory. from sleepable or irqwork
bpf_dynptr_alloc(mem, size, flags, &dynptr); // non-sleepable
// returns 'size' bytes if they were available in preallocated memory

Right now bpf maps are either full prealloc or full kmalloc.
This approach will be a hybrid.
The bpf progs will be using it roughly like this:

// init from user space
mem = bpf_map_create(BPF_MAP_TYPE_MEMORY);
sys_bpf(mem_prealloc, 1Mbyte); // prealloc largest possible single dynptr_alloc

// from bpf prog
bpf_dynptr_alloc(mem, size, flags, &dynptr); // if (size < 1M) all good
bpf_irq_work_queue(replenish_prealloc, size); // refill mem's prealloc

void replenish_prealloc(sz) { bpf_mem_prealloc(mem, sz); }

bpf_dynptr_alloc would need to implement a memory allocator out of
reserved memory. We can probably reuse some of sl[oua]b code.
slob_alloc may fit the best (without dynamic slob_new_pages).
Song's pack_alloc is probably good enough to start.

The gfp_account flag moves into bpf_mem_prealloc() helper.
It doesn't make sense in bpf_dynptr_alloc.
While gfp_zero makes sense only in bpf_dynptr_alloc.

Thoughts?
Joanne Koong May 16, 2022, 8:29 p.m. UTC | #10
On Fri, May 13, 2022 at 3:16 PM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Fri, May 13, 2022 at 09:28:03PM +0200, Daniel Borkmann wrote:
> > On 5/13/22 6:39 PM, Alexei Starovoitov wrote:
> > > On Fri, May 13, 2022 at 03:12:06PM +0200, Daniel Borkmann wrote:
> > > >
> > > > Thinking more about it, is there even any value for BPF_FUNC_dynptr_* for
> > > > fully unpriv BPF if these are rejected anyway by the spectre mitigations
> > > > from verifier?
> > > ...
> > > > > So either for alloc, we always build in __GFP_ZERO or bpf_dynptr_alloc()
> > > > helper usage should go under perfmon_capable() where it's allowed to read
> > > > kernel mem.
> > >
> > > > dynptr should probably be cap_bpf and cap_perfmon for now.
> > > Otherwise we will start adding cap_perfmon checks in run-time to helpers
> > > which is not easy to do. Some sort of prog or user context would need
> > > to be passed as hidden arg into helper. That's too much hassle just
> > > to enable dynptr for cap_bpf only.
> > >
> > > Similar problem with gfp_account... remembering memcg and passing all
> > > the way to bpf_dynptr_alloc helper is not easy. And it's not clear
> > > > which memcg to use. The one of the task that loaded that bpf prog?
> > > > That task could be gone and its cgroup could be in the dying stage.
> > > > bpf prog is executing in some context and allocating memory for itself.
> > > Like kernel allocates memory for its needs. It doesn't feel right to
> > > charge prog's memcg in that case. It probably should be an explicit choice
> > > by bpf program author. Maybe in the future we can introduce a fake map
> > > for such accounting needs and bpf prog could pass a map pointer to
> > > bpf_dynptr_alloc. When such fake and empty map is created the memcg
> > > would be recorded the same way we do for existing normal maps.
> > > Then the helper will look like:
> > > bpf_dynptr_alloc(struct bpf_map *map, u32 size, u64 flags, struct bpf_dynptr *ptr)
> > > {
> > >    set_active_memcg(map->memcg);
> > >    kmalloc into dynptr;
> > > }
> > >
> > > Should we do this change now and allow NULL to be passed as a map ?
> >
> > Hm, this looks a bit too much like a hack, I wouldn't do that, fwiw.
> >
> > > This way the bpf prog will have a choice whether to account into memcg or not.
> > > Maybe it's all overkill and none of this needed?
> > >
> > > On the other side maybe map should be a mandatory argument and dynptr_alloc
> > > can do its own memory accounting for stats ? atomic inc and dec is probably
> > > an acceptable overhead? bpftool will print the dynptr allocation stats.
> > > All sounds nice and extra visibility is great, but the kernel code that
> > > allocates for the kernel doesn't use memcg. bpf progs semantically are part of
> > > the kernel whereas memcg is a mechanism to restrict memory that kernel
> > > allocated on behalf of user tasks. We abused memcg for bpf progs/maps
> > > > to have a limit. Not clear whether we should continue doing so for dynptr_alloc
> > > and in the future for kptr_alloc. gfp_account adds overhead too. It's not free.
> > > Thoughts?
> >
> > Great question, I think the memcg is useful, just that the ownership for bpf
> > progs/maps has been relying on current whereas current is not a real 'owner',
> > just the entity which did the loading.
> >
> > Maybe we need some sort of memcg object for bpf where we can "bind" the prog
> > and map to it at load time, which is then different from current and can be
> > flexibly set, e.g. fd = open(/sys/fs/cgroup/memory/<foo>) and pass that fd to
> > BPF_PROG_LOAD and BPF_MAP_CREATE via bpf_attr (otherwise, if not set, then
> > no accounting)?
>
> Agree. Explicitly specifying memcg by FD would be nice.
> It will be useful for normal maps and progs.
> This is a bit orthogonal to having a map argument to bpf_dynptr/kptr_alloc.
>
> Here is the main reason why we probably should have it mandatory:
> kmalloc cannot be called from nmi and in general cannot be called from tracing.
> kprobe/fentry could be inside slab or page alloc path and it might blow up.
> That's the reason why hashmap defaults to pre-alloc.
> In order to do pre-alloc in bpf_dynptr/kptr_alloc() it has to have a map-like
> argument that will keep the info about preallocated memory.
>
> How about the following api:
> mem = bpf_map_create(BPF_MAP_TYPE_MEMORY); // from user space
> bpf_mem_prealloc(mem, size); // preallocate memory. from sleepable or irqwork
> bpf_dynptr_alloc(mem, size, flags, &dynptr); // non-sleepable
> // returns 'size' bytes if they were available in preallocated memory
>
> Right now bpf maps are either full prealloc or full kmalloc.
> This approach will be a hybrid.
> The bpf progs will be using it roughly like this:
>
> // init from user space
> mem = bpf_map_create(BPF_MAP_TYPE_MEMORY);
> sys_bpf(mem_prealloc, 1Mbyte); // prealloc largest possible single dynptr_alloc
>
> // from bpf prog
> bpf_dynptr_alloc(mem, size, flags, &dynptr); // if (size < 1M) all good
> bpf_irq_work_queue(replenish_prealloc, size); // refill mem's prealloc
>
> void replenish_prealloc(sz) { bpf_mem_prealloc(mem, sz); }
>
> bpf_dynptr_alloc would need to implement a memory allocator out of
> reserved memory. We can probably reuse some of sl[oua]b code.
> slob_alloc may fit the best (without dynamic slob_new_pages).
> Song's pack_alloc is probably good enough to start.
>
> The gfp_account flag moves into bpf_mem_prealloc() helper.
> It doesn't make sense in bpf_dynptr_alloc.
> While gfp_zero makes sense only in bpf_dynptr_alloc.
>
> Thoughts?

Do you envision this also being used for accounting for kfunc memory
allocations?
Joanne Koong May 16, 2022, 8:52 p.m. UTC | #11
On Wed, May 11, 2022 at 5:05 PM Daniel Borkmann <daniel@iogearbox.net> wrote:
>
> On 5/10/22 12:42 AM, Joanne Koong wrote:
> [...]
> > @@ -6498,6 +6523,11 @@ struct bpf_timer {
> >       __u64 :64;
> >   } __attribute__((aligned(8)));
> >
> > +struct bpf_dynptr {
> > +     __u64 :64;
> > +     __u64 :64;
> > +} __attribute__((aligned(8)));
> > +
> >   struct bpf_sysctl {
> >       __u32   write;          /* Sysctl is being read (= 0) or written (= 1).
> >                                * Allows 1,2,4-byte read, but no write.
> > diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
> > index 8a2398ac14c2..a4272e9239ea 100644
> > --- a/kernel/bpf/helpers.c
> > +++ b/kernel/bpf/helpers.c
> > @@ -1396,6 +1396,77 @@ const struct bpf_func_proto bpf_kptr_xchg_proto = {
> >       .arg2_btf_id  = BPF_PTR_POISON,
> >   };
> >
> > +void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data, enum bpf_dynptr_type type,
> > +                  u32 offset, u32 size)
> > +{
> > +     ptr->data = data;
> > +     ptr->offset = offset;
> > +     ptr->size = size;
> > +     bpf_dynptr_set_type(ptr, type);
> > +}
> > +
> > +void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr)
> > +{
> > +     memset(ptr, 0, sizeof(*ptr));
> > +}
> > +
> > +BPF_CALL_3(bpf_dynptr_alloc, u32, size, u64, flags, struct bpf_dynptr_kern *, ptr)
> > +{
> > +     gfp_t gfp_flags = GFP_ATOMIC;
>
> nit: should also have __GFP_NOWARN
>
> I presume mem accounting cannot be done on this one given there is no real "ownership"
> of this piece of mem?
While we figure out the details of memory accounting for allocations,
I will defer the malloc parts of this patchset to the 2nd dynptr
patchset. I will resubmit v5 without malloc-type dynptrs.
>
> Was planning to run some more local tests tomorrow, but from a glance at the selftest side
> I haven't seen sanity checks like these:
>
> bpf_dynptr_alloc(8, 0, &ptr);
> data = bpf_dynptr_data(&ptr, 0, 0);
> bpf_dynptr_put(&ptr);
> *(__u8 *)data = 23;
>
> How is this prevented? I think you do a ptr id check in the is_dynptr_ref_function
> check on the acquire function, but with above use, would our data pointer escape, or
> get invalidated via last put?
>
> Thanks,
> Daniel
diff mbox series

Patch

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index d0c167865504..e078b8a911fe 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -391,9 +391,14 @@  enum bpf_type_flag {
 
 	MEM_UNINIT		= BIT(7 + BPF_BASE_TYPE_BITS),
 
-	__BPF_TYPE_LAST_FLAG	= MEM_UNINIT,
+	/* DYNPTR points to dynamically allocated memory. */
+	DYNPTR_TYPE_MALLOC	= BIT(8 + BPF_BASE_TYPE_BITS),
+
+	__BPF_TYPE_LAST_FLAG	= DYNPTR_TYPE_MALLOC,
 };
 
+#define DYNPTR_TYPE_FLAG_MASK	DYNPTR_TYPE_MALLOC
+
 /* Max number of base types. */
 #define BPF_BASE_TYPE_LIMIT	(1UL << BPF_BASE_TYPE_BITS)
 
@@ -436,6 +441,7 @@  enum bpf_arg_type {
 	ARG_PTR_TO_CONST_STR,	/* pointer to a null terminated read-only string */
 	ARG_PTR_TO_TIMER,	/* pointer to bpf_timer */
 	ARG_PTR_TO_KPTR,	/* pointer to referenced kptr */
+	ARG_PTR_TO_DYNPTR,      /* pointer to bpf_dynptr. See bpf_type_flag for dynptr type */
 	__BPF_ARG_TYPE_MAX,
 
 	/* Extended arg_types. */
@@ -2347,4 +2353,58 @@  int bpf_bprintf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args,
 			u32 **bin_buf, u32 num_args);
 void bpf_bprintf_cleanup(void);
 
+/* the implementation of the opaque uapi struct bpf_dynptr */
+struct bpf_dynptr_kern {
+	void *data;
+	/* Size represents the number of usable bytes in the dynptr.
+	 * If for example the offset is at 200 for a malloc dynptr with
+	 * allocation size 256, the number of usable bytes is 56.
+	 *
+	 * The upper 8 bits are reserved.
+	 * Bit 31 denotes whether the dynptr is read-only.
+	 * Bits 28-30 denote the dynptr type.
+	 */
+	u32 size;
+	u32 offset;
+} __aligned(8);
+
+enum bpf_dynptr_type {
+	BPF_DYNPTR_TYPE_INVALID,
+	/* Memory allocated dynamically by the kernel for the dynptr */
+	BPF_DYNPTR_TYPE_MALLOC,
+};
+
+/* Since the upper 8 bits of dynptr->size is reserved, the
+ * maximum supported size is 2^24 - 1.
+ */
+#define DYNPTR_MAX_SIZE	((1UL << 24) - 1)
+#define DYNPTR_SIZE_MASK	0xFFFFFF
+#define DYNPTR_TYPE_SHIFT	28
+#define DYNPTR_TYPE_MASK	0x7
+
+static inline enum bpf_dynptr_type bpf_dynptr_get_type(struct bpf_dynptr_kern *ptr)
+{
+	return (ptr->size >> DYNPTR_TYPE_SHIFT) & DYNPTR_TYPE_MASK;
+}
+
+static inline void bpf_dynptr_set_type(struct bpf_dynptr_kern *ptr, enum bpf_dynptr_type type)
+{
+	ptr->size |= type << DYNPTR_TYPE_SHIFT;
+}
+
+static inline u32 bpf_dynptr_get_size(struct bpf_dynptr_kern *ptr)
+{
+	return ptr->size & DYNPTR_SIZE_MASK;
+}
+
+static inline int bpf_dynptr_check_size(u32 size)
+{
+	return size > DYNPTR_MAX_SIZE ? -E2BIG : 0;
+}
+
+void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data, enum bpf_dynptr_type type,
+		     u32 offset, u32 size);
+
+void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr);
+
 #endif /* _LINUX_BPF_H */
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 1f1e7f2ea967..830a0e11ae97 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -72,6 +72,18 @@  struct bpf_reg_state {
 
 		u32 mem_size; /* for PTR_TO_MEM | PTR_TO_MEM_OR_NULL */
 
+		/* For dynptr stack slots */
+		struct {
+			enum bpf_dynptr_type type;
+			/* A dynptr is 16 bytes so it takes up 2 stack slots.
+			 * We need to track which slot is the first slot
+			 * to protect against cases where the user may try to
+			 * pass in an address starting at the second slot of the
+			 * dynptr.
+			 */
+			bool first_slot;
+		} dynptr;
+
 		/* Max size from any of the above. */
 		struct {
 			unsigned long raw1;
@@ -88,6 +100,8 @@  struct bpf_reg_state {
 	 * for the purpose of tracking that it's freed.
 	 * For PTR_TO_SOCKET this is used to share which pointers retain the
 	 * same reference to the socket, to determine proper reference freeing.
+	 * For stack slots that are dynptrs, this is used to track references to
+	 * the dynptr to determine proper reference freeing.
 	 */
 	u32 id;
 	/* PTR_TO_SOCKET and PTR_TO_TCP_SOCK could be a ptr returned
@@ -174,9 +188,16 @@  enum bpf_stack_slot_type {
 	STACK_SPILL,      /* register spilled into stack */
 	STACK_MISC,	  /* BPF program wrote some data into this slot */
 	STACK_ZERO,	  /* BPF program wrote constant zero */
+	/* A dynptr is stored in this stack slot. The type of dynptr
+	 * is stored in bpf_stack_state->spilled_ptr.dynptr.type
+	 */
+	STACK_DYNPTR,
 };
 
 #define BPF_REG_SIZE 8	/* size of eBPF register in bytes */
+/* size of a struct bpf_dynptr in bytes */
+#define BPF_DYNPTR_SIZE sizeof(struct bpf_dynptr_kern)
+#define BPF_DYNPTR_NR_SLOTS (BPF_DYNPTR_SIZE / BPF_REG_SIZE)
 
 struct bpf_stack_state {
 	struct bpf_reg_state spilled_ptr;
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 444fe6f1cf35..5a87ed654016 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -5154,6 +5154,29 @@  union bpf_attr {
  *		if not NULL, is a reference which must be released using its
  *		corresponding release function, or moved into a BPF map before
  *		program exit.
+ *
+ * long bpf_dynptr_alloc(u32 size, u64 flags, struct bpf_dynptr *ptr)
+ *	Description
+ *		Allocate memory of *size* bytes.
+ *
+ *		Every call to bpf_dynptr_alloc must have a corresponding
+ *		bpf_dynptr_put, regardless of whether the bpf_dynptr_alloc
+ *		succeeded.
+ *
+ *		The maximum *size* supported is DYNPTR_MAX_SIZE.
+ *		Supported *flags* are __GFP_ZERO.
+ *	Return
+ *		0 on success, -ENOMEM if there is not enough memory for the
+ *		allocation, -E2BIG if the size exceeds DYNPTR_MAX_SIZE, -EINVAL
+ *		if the flags is not supported.
+ *
+ * void bpf_dynptr_put(struct bpf_dynptr *ptr)
+ *	Description
+ *		Free memory allocated by bpf_dynptr_alloc.
+ *
+ *		After this operation, *ptr* will be an invalidated dynptr.
+ *	Return
+ *		Void.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -5351,6 +5374,8 @@  union bpf_attr {
 	FN(skb_set_tstamp),		\
 	FN(ima_file_hash),		\
 	FN(kptr_xchg),			\
+	FN(dynptr_alloc),		\
+	FN(dynptr_put),			\
 	/* */
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
@@ -6498,6 +6523,11 @@  struct bpf_timer {
 	__u64 :64;
 } __attribute__((aligned(8)));
 
+struct bpf_dynptr {
+	__u64 :64;
+	__u64 :64;
+} __attribute__((aligned(8)));
+
 struct bpf_sysctl {
 	__u32	write;		/* Sysctl is being read (= 0) or written (= 1).
 				 * Allows 1,2,4-byte read, but no write.
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 8a2398ac14c2..a4272e9239ea 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1396,6 +1396,77 @@  const struct bpf_func_proto bpf_kptr_xchg_proto = {
 	.arg2_btf_id  = BPF_PTR_POISON,
 };
 
+void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data, enum bpf_dynptr_type type,
+		     u32 offset, u32 size)
+{
+	ptr->data = data;
+	ptr->offset = offset;
+	ptr->size = size;
+	bpf_dynptr_set_type(ptr, type);
+}
+
+void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr)
+{
+	memset(ptr, 0, sizeof(*ptr));
+}
+
+BPF_CALL_3(bpf_dynptr_alloc, u32, size, u64, flags, struct bpf_dynptr_kern *, ptr)
+{
+	gfp_t gfp_flags = GFP_ATOMIC;
+	void *data;
+	int err;
+
+	err = bpf_dynptr_check_size(size);
+	if (err)
+		goto error;
+
+	if (flags) {
+		if (flags == __GFP_ZERO) {
+			gfp_flags |= flags;
+		} else {
+			err = -EINVAL;
+			goto error;
+		}
+	}
+
+	data = kmalloc(size, gfp_flags);
+	if (!data) {
+		err = -ENOMEM;
+		goto error;
+	}
+
+	bpf_dynptr_init(ptr, data, BPF_DYNPTR_TYPE_MALLOC, 0, size);
+
+	return 0;
+
+error:
+	bpf_dynptr_set_null(ptr);
+	return err;
+}
+
+const struct bpf_func_proto bpf_dynptr_alloc_proto = {
+	.func		= bpf_dynptr_alloc,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_ANYTHING,
+	.arg2_type	= ARG_ANYTHING,
+	.arg3_type	= ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_MALLOC | MEM_UNINIT,
+};
+
+BPF_CALL_1(bpf_dynptr_put, struct bpf_dynptr_kern *, dynptr)
+{
+	kfree(dynptr->data);
+	bpf_dynptr_set_null(dynptr);
+	return 0;
+}
+
+const struct bpf_func_proto bpf_dynptr_put_proto = {
+	.func		= bpf_dynptr_put,
+	.gpl_only	= false,
+	.ret_type	= RET_VOID,
+	.arg1_type	= ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_MALLOC | OBJ_RELEASE,
+};
+
 const struct bpf_func_proto bpf_get_current_task_proto __weak;
 const struct bpf_func_proto bpf_get_current_task_btf_proto __weak;
 const struct bpf_func_proto bpf_probe_read_user_proto __weak;
@@ -1448,6 +1519,10 @@  bpf_base_func_proto(enum bpf_func_id func_id)
 		return &bpf_loop_proto;
 	case BPF_FUNC_strncmp:
 		return &bpf_strncmp_proto;
+	case BPF_FUNC_dynptr_alloc:
+		return &bpf_dynptr_alloc_proto;
+	case BPF_FUNC_dynptr_put:
+		return &bpf_dynptr_put_proto;
 	default:
 		break;
 	}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 0fe1dea520ae..8cdedc776987 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -187,6 +187,10 @@  struct bpf_verifier_stack_elem {
 					  POISON_POINTER_DELTA))
 #define BPF_MAP_PTR(X)		((struct bpf_map *)((X) & ~BPF_MAP_PTR_UNPRIV))
 
+static bool arg_type_is_mem_size(enum bpf_arg_type type);
+static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx);
+static int release_reference(struct bpf_verifier_env *env, int ref_obj_id);
+
 static bool bpf_map_ptr_poisoned(const struct bpf_insn_aux_data *aux)
 {
 	return BPF_MAP_PTR(aux->map_ptr_state) == BPF_MAP_PTR_POISON;
@@ -259,6 +263,7 @@  struct bpf_call_arg_meta {
 	u32 ret_btf_id;
 	u32 subprogno;
 	struct bpf_map_value_off_desc *kptr_off_desc;
+	u8 uninit_dynptr_regno;
 };
 
 struct btf *btf_vmlinux;
@@ -580,6 +585,7 @@  static char slot_type_char[] = {
 	[STACK_SPILL]	= 'r',
 	[STACK_MISC]	= 'm',
 	[STACK_ZERO]	= '0',
+	[STACK_DYNPTR]	= 'd',
 };
 
 static void print_liveness(struct bpf_verifier_env *env,
@@ -595,6 +601,25 @@  static void print_liveness(struct bpf_verifier_env *env,
 		verbose(env, "D");
 }
 
+static inline int get_spi(s32 off)
+{
+	return (-off - 1) / BPF_REG_SIZE;
+}
+
+static bool is_spi_bounds_valid(struct bpf_func_state *state, int spi, int nr_slots)
+{
+	int allocated_slots = state->allocated_stack / BPF_REG_SIZE;
+
+	/* We need to check that slots between [spi - nr_slots + 1, spi] are
+	 * within [0, allocated_stack).
+	 *
+	 * Please note that the spi grows downwards. For example, a dynptr
+	 * takes the size of two stack slots; the first slot will be at
+	 * spi and the second slot will be at spi - 1.
+	 */
+	return spi - nr_slots + 1 >= 0 && spi < allocated_slots;
+}
+
 static struct bpf_func_state *func(struct bpf_verifier_env *env,
 				   const struct bpf_reg_state *reg)
 {
@@ -646,6 +671,130 @@  static void mark_verifier_state_scratched(struct bpf_verifier_env *env)
 	env->scratched_stack_slots = ~0ULL;
 }
 
+static int arg_to_dynptr_type(enum bpf_arg_type arg_type)
+{
+	switch (arg_type & DYNPTR_TYPE_FLAG_MASK) {
+	case DYNPTR_TYPE_MALLOC:
+		return BPF_DYNPTR_TYPE_MALLOC;
+	default:
+		return BPF_DYNPTR_TYPE_INVALID;
+	}
+}
+
+static inline bool dynptr_type_refcounted(enum bpf_dynptr_type type)
+{
+	return type == BPF_DYNPTR_TYPE_MALLOC;
+}
+
+static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+				   enum bpf_arg_type arg_type, int insn_idx)
+{
+	struct bpf_func_state *state = cur_func(env);
+	enum bpf_dynptr_type type;
+	int spi, id, i;
+
+	spi = get_spi(reg->off);
+
+	if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS))
+		return -EINVAL;
+
+	for (i = 0; i < BPF_REG_SIZE; i++) {
+		state->stack[spi].slot_type[i] = STACK_DYNPTR;
+		state->stack[spi - 1].slot_type[i] = STACK_DYNPTR;
+	}
+
+	type = arg_to_dynptr_type(arg_type);
+	if (type == BPF_DYNPTR_TYPE_INVALID)
+		return -EINVAL;
+
+	state->stack[spi].spilled_ptr.dynptr.type = type;
+	state->stack[spi - 1].spilled_ptr.dynptr.type = type;
+
+	state->stack[spi].spilled_ptr.dynptr.first_slot = true;
+
+	if (dynptr_type_refcounted(type)) {
+		/* The id is used to track proper releasing */
+		id = acquire_reference_state(env, insn_idx);
+		if (id < 0)
+			return id;
+
+		state->stack[spi].spilled_ptr.id = id;
+		state->stack[spi - 1].spilled_ptr.id = id;
+	}
+
+	return 0;
+}
+
+static int unmark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+{
+	struct bpf_func_state *state = func(env, reg);
+	int spi, i;
+
+	spi = get_spi(reg->off);
+
+	if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS))
+		return -EINVAL;
+
+	for (i = 0; i < BPF_REG_SIZE; i++) {
+		state->stack[spi].slot_type[i] = STACK_INVALID;
+		state->stack[spi - 1].slot_type[i] = STACK_INVALID;
+	}
+
+	/* Invalidate any slices associated with this dynptr */
+	if (dynptr_type_refcounted(state->stack[spi].spilled_ptr.dynptr.type)) {
+		release_reference(env, state->stack[spi].spilled_ptr.id);
+		state->stack[spi].spilled_ptr.id = 0;
+		state->stack[spi - 1].spilled_ptr.id = 0;
+	}
+
+	state->stack[spi].spilled_ptr.dynptr.type = 0;
+	state->stack[spi - 1].spilled_ptr.dynptr.type = 0;
+
+	return 0;
+}
+
+static bool is_dynptr_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+{
+	struct bpf_func_state *state = func(env, reg);
+	int spi = get_spi(reg->off);
+	int i;
+
+	if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS))
+		return true;
+
+	for (i = 0; i < BPF_REG_SIZE; i++) {
+		if (state->stack[spi].slot_type[i] == STACK_DYNPTR ||
+		    state->stack[spi - 1].slot_type[i] == STACK_DYNPTR)
+			return false;
+	}
+
+	return true;
+}
+
+static bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+				     enum bpf_arg_type arg_type)
+{
+	struct bpf_func_state *state = func(env, reg);
+	int spi = get_spi(reg->off);
+	int i;
+
+	if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS) ||
+	    !state->stack[spi].spilled_ptr.dynptr.first_slot)
+		return false;
+
+	for (i = 0; i < BPF_REG_SIZE; i++) {
+		if (state->stack[spi].slot_type[i] != STACK_DYNPTR ||
+		    state->stack[spi - 1].slot_type[i] != STACK_DYNPTR)
+			return false;
+	}
+
+	/* ARG_PTR_TO_DYNPTR takes any type of dynptr */
+	if (arg_type == ARG_PTR_TO_DYNPTR)
+		return true;
+
+	return state->stack[spi].spilled_ptr.dynptr.type == arg_to_dynptr_type(arg_type);
+}
+
 /* The reg state of a pointer or a bounded scalar was saved when
  * it was spilled to the stack.
  */
@@ -5400,6 +5549,16 @@  static bool arg_type_is_release(enum bpf_arg_type type)
 	return type & OBJ_RELEASE;
 }
 
+static inline bool arg_type_is_dynptr(enum bpf_arg_type type)
+{
+	return base_type(type) == ARG_PTR_TO_DYNPTR;
+}
+
+static inline bool arg_type_is_dynptr_uninit(enum bpf_arg_type type)
+{
+	return arg_type_is_dynptr(type) && (type & MEM_UNINIT);
+}
+
 static int int_ptr_type_to_size(enum bpf_arg_type type)
 {
 	if (type == ARG_PTR_TO_INT)
@@ -5539,6 +5698,7 @@  static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = {
 	[ARG_PTR_TO_CONST_STR]		= &const_str_ptr_types,
 	[ARG_PTR_TO_TIMER]		= &timer_types,
 	[ARG_PTR_TO_KPTR]		= &kptr_types,
+	[ARG_PTR_TO_DYNPTR]		= &stack_ptr_types,
 };
 
 static int check_reg_type(struct bpf_verifier_env *env, u32 regno,
@@ -5725,7 +5885,16 @@  static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
 
 skip_type_check:
 	if (arg_type_is_release(arg_type)) {
-		if (!reg->ref_obj_id && !register_is_null(reg)) {
+		if (arg_type_is_dynptr(arg_type)) {
+			struct bpf_func_state *state = func(env, reg);
+			int spi = get_spi(reg->off);
+
+			if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS) ||
+			    !state->stack[spi].spilled_ptr.id) {
+				verbose(env, "arg %d is an unacquired reference\n", regno);
+				return -EINVAL;
+			}
+		} else if (!reg->ref_obj_id && !register_is_null(reg)) {
 			verbose(env, "R%d must be referenced when passed to release function\n",
 				regno);
 			return -EINVAL;
@@ -5837,6 +6006,43 @@  static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
 		bool zero_size_allowed = (arg_type == ARG_CONST_SIZE_OR_ZERO);
 
 		err = check_mem_size_reg(env, reg, regno, zero_size_allowed, meta);
+	} else if (arg_type_is_dynptr(arg_type)) {
+		/* Can't pass in a dynptr at a weird offset */
+		if (reg->off % BPF_REG_SIZE) {
+			verbose(env, "cannot pass in non-zero dynptr offset\n");
+			return -EINVAL;
+		}
+
+		if (arg_type & MEM_UNINIT)  {
+			if (!is_dynptr_reg_valid_uninit(env, reg)) {
+				verbose(env, "Arg #%d dynptr has to be an uninitialized dynptr\n",
+					arg + BPF_REG_1);
+				return -EINVAL;
+			}
+
+			/* We only support one dynptr being uninitialized at the moment,
+			 * which is sufficient for the helper functions we have right now.
+			 */
+			if (meta->uninit_dynptr_regno) {
+				verbose(env, "verifier internal error: more than one uninitialized dynptr arg\n");
+				return -EFAULT;
+			}
+
+			meta->uninit_dynptr_regno = arg + BPF_REG_1;
+		} else if (!is_dynptr_reg_valid_init(env, reg, arg_type)) {
+			const char *err_extra = "";
+
+			switch (arg_type & DYNPTR_TYPE_FLAG_MASK) {
+			case DYNPTR_TYPE_MALLOC:
+				err_extra = "malloc ";
+				break;
+			default:
+				break;
+			}
+			verbose(env, "Expected an initialized %sdynptr as arg #%d\n",
+				err_extra, arg + BPF_REG_1);
+			return -EINVAL;
+		}
 	} else if (arg_type_is_alloc_size(arg_type)) {
 		if (!tnum_is_const(reg->var_off)) {
 			verbose(env, "R%d is not a known constant'\n",
@@ -6963,9 +7169,27 @@  static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 
 	regs = cur_regs(env);
 
+	if (meta.uninit_dynptr_regno) {
+		/* check the write one BPF_DW (8 bytes) at a time */
+		for (i = 0; i < BPF_DYNPTR_SIZE; i += 8) {
+			err = check_mem_access(env, insn_idx, meta.uninit_dynptr_regno,
+					       i, BPF_DW, BPF_WRITE, -1, false);
+			if (err)
+				return err;
+		}
+
+		err = mark_stack_slots_dynptr(env, &regs[meta.uninit_dynptr_regno],
+					      fn->arg_type[meta.uninit_dynptr_regno - BPF_REG_1],
+					      insn_idx);
+		if (err)
+			return err;
+	}
+
 	if (meta.release_regno) {
 		err = -EINVAL;
-		if (meta.ref_obj_id)
+		if (arg_type_is_dynptr(fn->arg_type[meta.release_regno - BPF_REG_1]))
+			err = unmark_stack_slots_dynptr(env, &regs[meta.release_regno]);
+		else if (meta.ref_obj_id)
 			err = release_reference(env, meta.ref_obj_id);
 		/* meta.ref_obj_id can only be 0 if register that is meant to be
 		 * released is NULL, which must be > R0.
diff --git a/scripts/bpf_doc.py b/scripts/bpf_doc.py
index 096625242475..766dcbc73897 100755
--- a/scripts/bpf_doc.py
+++ b/scripts/bpf_doc.py
@@ -633,6 +633,7 @@  class PrinterHelpers(Printer):
             'struct socket',
             'struct file',
             'struct bpf_timer',
+            'struct bpf_dynptr',
     ]
     known_types = {
             '...',
@@ -682,6 +683,7 @@  class PrinterHelpers(Printer):
             'struct socket',
             'struct file',
             'struct bpf_timer',
+            'struct bpf_dynptr',
     }
     mapped_types = {
             'u8': '__u8',
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 444fe6f1cf35..5a87ed654016 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -5154,6 +5154,29 @@  union bpf_attr {
  *		if not NULL, is a reference which must be released using its
  *		corresponding release function, or moved into a BPF map before
  *		program exit.
+ *
+ * long bpf_dynptr_alloc(u32 size, u64 flags, struct bpf_dynptr *ptr)
+ *	Description
+ *		Allocate memory of *size* bytes.
+ *
+ *		Every call to bpf_dynptr_alloc must have a corresponding
+ *		bpf_dynptr_put, regardless of whether the bpf_dynptr_alloc
+ *		succeeded.
+ *
+ *		The maximum *size* supported is DYNPTR_MAX_SIZE.
+ *		Supported *flags* are __GFP_ZERO.
+ *	Return
+ *		0 on success, -ENOMEM if there is not enough memory for the
+ *		allocation, -E2BIG if the size exceeds DYNPTR_MAX_SIZE, -EINVAL
+ *		if the flags are not supported.
+ *
+ * void bpf_dynptr_put(struct bpf_dynptr *ptr)
+ *	Description
+ *		Free memory allocated by bpf_dynptr_alloc.
+ *
+ *		After this operation, *ptr* will be an invalidated dynptr.
+ *	Return
+ *		Void.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -5351,6 +5374,8 @@  union bpf_attr {
 	FN(skb_set_tstamp),		\
 	FN(ima_file_hash),		\
 	FN(kptr_xchg),			\
+	FN(dynptr_alloc),		\
+	FN(dynptr_put),			\
 	/* */
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
@@ -6498,6 +6523,11 @@  struct bpf_timer {
 	__u64 :64;
 } __attribute__((aligned(8)));
 
+struct bpf_dynptr {
+	__u64 :64;
+	__u64 :64;
+} __attribute__((aligned(8)));
+
 struct bpf_sysctl {
 	__u32	write;		/* Sysctl is being read (= 0) or written (= 1).
 				 * Allows 1,2,4-byte read, but no write.