diff mbox series

[bpf-next,v3,01/13] bpf: Add support for non-fix-size percpu mem allocation

Message ID 20230827152734.1995725-1-yonghong.song@linux.dev (mailing list archive)
State Accepted
Commit baa9555b6c1c88e885820c82fcc05d36e30daaa2
Delegated to: BPF
Headers show
Series bpf: Add support for local percpu kptr | expand

Checks

Context Check Description
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for bpf-next, async
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 2828 this patch: 2828
netdev/cc_maintainers warning 7 maintainers not CCed: kpsingh@kernel.org martin.lau@linux.dev john.fastabend@gmail.com sdf@google.com song@kernel.org jolsa@kernel.org haoluo@google.com
netdev/build_clang success Errors and warnings before: 1526 this patch: 1526
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 2856 this patch: 2856
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 67 lines checked
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-next-VM_Test-0 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-5 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-1 success Logs for build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-3 success Logs for build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-4 success Logs for build for x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-2 success Logs for build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-24 success Logs for test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-8 success Logs for test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-9 success Logs for test_maps on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-6 success Logs for test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-12 fail Logs for test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-13 fail Logs for test_progs on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-14 fail Logs for test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-16 fail Logs for test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-17 fail Logs for test_progs_no_alu32 on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-18 success Logs for test_progs_no_alu32_parallel on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-19 success Logs for test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-20 success Logs for test_progs_no_alu32_parallel on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-21 success Logs for test_progs_parallel on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-22 success Logs for test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-23 success Logs for test_progs_parallel on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-26 success Logs for test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-27 success Logs for test_verifier on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-28 success Logs for veristat
bpf/vmtest-bpf-next-VM_Test-10 fail Logs for test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-25 success Logs for test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-7 success Logs for test_maps on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-11 fail Logs for test_progs on s390x with gcc
bpf/vmtest-bpf-next-PR fail PR summary
bpf/vmtest-bpf-next-VM_Test-15 fail Logs for test_progs_no_alu32 on s390x with gcc

Commit Message

Yonghong Song Aug. 27, 2023, 3:27 p.m. UTC
This is needed for later percpu mem allocation when the
allocation is done by bpf program. For such cases, a global
bpf_global_percpu_ma is added where a flexible allocation
size is needed.

Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
---
 include/linux/bpf.h   |  4 ++--
 kernel/bpf/core.c     |  8 +++++---
 kernel/bpf/memalloc.c | 14 ++++++--------
 3 files changed, 13 insertions(+), 13 deletions(-)

Comments

Hou Tao Sept. 13, 2023, 2:10 p.m. UTC | #1
Hi,

On 8/27/2023 11:27 PM, Yonghong Song wrote:
> This is needed for later percpu mem allocation when the
> allocation is done by bpf program. For such cases, a global
> bpf_global_percpu_ma is added where a flexible allocation
> size is needed.
>
> Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
> ---
>  include/linux/bpf.h   |  4 ++--
>  kernel/bpf/core.c     |  8 +++++---
>  kernel/bpf/memalloc.c | 14 ++++++--------
>  3 files changed, 13 insertions(+), 13 deletions(-)
>
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 12596af59c00..144dbddf53bd 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -55,8 +55,8 @@ struct cgroup;
>  extern struct idr btf_idr;
>  extern spinlock_t btf_idr_lock;
>  extern struct kobject *btf_kobj;
> -extern struct bpf_mem_alloc bpf_global_ma;
> -extern bool bpf_global_ma_set;
> +extern struct bpf_mem_alloc bpf_global_ma, bpf_global_percpu_ma;
> +extern bool bpf_global_ma_set, bpf_global_percpu_ma_set;
>  
>  typedef u64 (*bpf_callback_t)(u64, u64, u64, u64, u64);
>  typedef int (*bpf_iter_init_seq_priv_t)(void *private_data,
> diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
> index 0f8f036d8bd1..95599df82ee4 100644
> --- a/kernel/bpf/core.c
> +++ b/kernel/bpf/core.c
> @@ -64,8 +64,8 @@
>  #define OFF	insn->off
>  #define IMM	insn->imm
>  
> -struct bpf_mem_alloc bpf_global_ma;
> -bool bpf_global_ma_set;
> +struct bpf_mem_alloc bpf_global_ma, bpf_global_percpu_ma;
> +bool bpf_global_ma_set, bpf_global_percpu_ma_set;
>  
>  /* No hurry in this branch
>   *
> @@ -2921,7 +2921,9 @@ static int __init bpf_global_ma_init(void)
>  
>  	ret = bpf_mem_alloc_init(&bpf_global_ma, 0, false);
>  	bpf_global_ma_set = !ret;
> -	return ret;
> +	ret = bpf_mem_alloc_init(&bpf_global_percpu_ma, 0, true);
> +	bpf_global_percpu_ma_set = !ret;
> +	return !bpf_global_ma_set || !bpf_global_percpu_ma_set;
>  }
>  late_initcall(bpf_global_ma_init);
>  #endif
> diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
> index 9c49ae53deaf..cb60445de98a 100644
> --- a/kernel/bpf/memalloc.c
> +++ b/kernel/bpf/memalloc.c
> @@ -499,15 +499,16 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
>  	struct obj_cgroup *objcg = NULL;
>  	int cpu, i, unit_size, percpu_size = 0;
>  
> +	/* room for llist_node and per-cpu pointer */
> +	if (percpu)
> +		percpu_size = LLIST_NODE_SZ + sizeof(void *);
> +
>  	if (size) {
>  		pc = __alloc_percpu_gfp(sizeof(*pc), 8, GFP_KERNEL);
>  		if (!pc)
>  			return -ENOMEM;
>  
> -		if (percpu)
> -			/* room for llist_node and per-cpu pointer */
> -			percpu_size = LLIST_NODE_SZ + sizeof(void *);
> -		else
> +		if (!percpu)
>  			size += LLIST_NODE_SZ; /* room for llist_node */
>  		unit_size = size;
>  
> @@ -527,10 +528,6 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
>  		return 0;
>  	}
>  
> -	/* size == 0 && percpu is an invalid combination */
> -	if (WARN_ON_ONCE(percpu))
> -		return -EINVAL;
> -
>  	pcc = __alloc_percpu_gfp(sizeof(*cc), 8, GFP_KERNEL);
>  	if (!pcc)
>  		return -ENOMEM;
> @@ -543,6 +540,7 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
>  			c = &cc->cache[i];
>  			c->unit_size = sizes[i];
>  			c->objcg = objcg;
> +			c->percpu_size = percpu_size;
>  			c->tgt = c;
>  			prefill_mem_cache(c, cpu);
>  		}

It would trigger the WARN_ON_ONCE in free_bulk().  Because for per-cpu
ptr returned by bpf memory allocator, ksize() in bpf_mem_free() will
return 16 constantly, so it will be unmatched with the unit_size of the
source bpf_mem_cache. Have not found a good solution for the problem yet
except disabling the warning for per-cpu bpf memory allocator, because
now per-cpu allocator doesn't have an API to return the size of the
returned ptr. Could we make the verifier to remember the size of the
allocator object and pass it to bpf memory allocator ?
Heiko Carstens Nov. 15, 2023, 3:31 p.m. UTC | #2
On Sun, Aug 27, 2023 at 08:27:34AM -0700, Yonghong Song wrote:
> This is needed for later percpu mem allocation when the
> allocation is done by bpf program. For such cases, a global
> bpf_global_percpu_ma is added where a flexible allocation
> size is needed.
> 
> Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
> ---
>  include/linux/bpf.h   |  4 ++--
>  kernel/bpf/core.c     |  8 +++++---
>  kernel/bpf/memalloc.c | 14 ++++++--------
>  3 files changed, 13 insertions(+), 13 deletions(-)

Both Marc and Mikhail reported out-of-memory conditions on s390 machines,
and bisected it down to this upstream commit 41a5db8d8161 ("bpf: Add
support for non-fix-size percpu mem allocation").
This seems to eat up a lot of memory only based on the number of possible
CPUs.

If we have a machine with 8GB, 6 present CPUs and 512 possible CPUs (yes,
this is a realistic scenario) the memory consumption directly after boot
is:

$ cat /sys/devices/system/cpu/present
0-5
$ cat /sys/devices/system/cpu/possible
0-511

Before this commit:

$ cat /proc/meminfo
MemTotal:        8141924 kB
MemFree:         7639872 kB

With this commit

$ cat /proc/meminfo
MemTotal:        8141924 kB
MemFree:         4852248 kB

So, this appears to be a significant regression.
I'm quoting the rest of the original patch below for reference only.

> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 12596af59c00..144dbddf53bd 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -55,8 +55,8 @@ struct cgroup;
>  extern struct idr btf_idr;
>  extern spinlock_t btf_idr_lock;
>  extern struct kobject *btf_kobj;
> -extern struct bpf_mem_alloc bpf_global_ma;
> -extern bool bpf_global_ma_set;
> +extern struct bpf_mem_alloc bpf_global_ma, bpf_global_percpu_ma;
> +extern bool bpf_global_ma_set, bpf_global_percpu_ma_set;
>  
>  typedef u64 (*bpf_callback_t)(u64, u64, u64, u64, u64);
>  typedef int (*bpf_iter_init_seq_priv_t)(void *private_data,
> diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
> index 0f8f036d8bd1..95599df82ee4 100644
> --- a/kernel/bpf/core.c
> +++ b/kernel/bpf/core.c
> @@ -64,8 +64,8 @@
>  #define OFF	insn->off
>  #define IMM	insn->imm
>  
> -struct bpf_mem_alloc bpf_global_ma;
> -bool bpf_global_ma_set;
> +struct bpf_mem_alloc bpf_global_ma, bpf_global_percpu_ma;
> +bool bpf_global_ma_set, bpf_global_percpu_ma_set;
>  
>  /* No hurry in this branch
>   *
> @@ -2921,7 +2921,9 @@ static int __init bpf_global_ma_init(void)
>  
>  	ret = bpf_mem_alloc_init(&bpf_global_ma, 0, false);
>  	bpf_global_ma_set = !ret;
> -	return ret;
> +	ret = bpf_mem_alloc_init(&bpf_global_percpu_ma, 0, true);
> +	bpf_global_percpu_ma_set = !ret;
> +	return !bpf_global_ma_set || !bpf_global_percpu_ma_set;
>  }
>  late_initcall(bpf_global_ma_init);
>  #endif
> diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
> index 9c49ae53deaf..cb60445de98a 100644
> --- a/kernel/bpf/memalloc.c
> +++ b/kernel/bpf/memalloc.c
> @@ -499,15 +499,16 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
>  	struct obj_cgroup *objcg = NULL;
>  	int cpu, i, unit_size, percpu_size = 0;
>  
> +	/* room for llist_node and per-cpu pointer */
> +	if (percpu)
> +		percpu_size = LLIST_NODE_SZ + sizeof(void *);
> +
>  	if (size) {
>  		pc = __alloc_percpu_gfp(sizeof(*pc), 8, GFP_KERNEL);
>  		if (!pc)
>  			return -ENOMEM;
>  
> -		if (percpu)
> -			/* room for llist_node and per-cpu pointer */
> -			percpu_size = LLIST_NODE_SZ + sizeof(void *);
> -		else
> +		if (!percpu)
>  			size += LLIST_NODE_SZ; /* room for llist_node */
>  		unit_size = size;
>  
> @@ -527,10 +528,6 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
>  		return 0;
>  	}
>  
> -	/* size == 0 && percpu is an invalid combination */
> -	if (WARN_ON_ONCE(percpu))
> -		return -EINVAL;
> -
>  	pcc = __alloc_percpu_gfp(sizeof(*cc), 8, GFP_KERNEL);
>  	if (!pcc)
>  		return -ENOMEM;
> @@ -543,6 +540,7 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
>  			c = &cc->cache[i];
>  			c->unit_size = sizes[i];
>  			c->objcg = objcg;
> +			c->percpu_size = percpu_size;
>  			c->tgt = c;
>  			prefill_mem_cache(c, cpu);
>  		}
> -- 
> 2.34.1
>
Alexei Starovoitov Nov. 15, 2023, 3:54 p.m. UTC | #3
On Wed, Nov 15, 2023 at 7:32 AM Heiko Carstens <hca@linux.ibm.com> wrote:
>
> On Sun, Aug 27, 2023 at 08:27:34AM -0700, Yonghong Song wrote:
> > This is needed for later percpu mem allocation when the
> > allocation is done by bpf program. For such cases, a global
> > bpf_global_percpu_ma is added where a flexible allocation
> > size is needed.
> >
> > Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
> > ---
> >  include/linux/bpf.h   |  4 ++--
> >  kernel/bpf/core.c     |  8 +++++---
> >  kernel/bpf/memalloc.c | 14 ++++++--------
> >  3 files changed, 13 insertions(+), 13 deletions(-)
>
> Both Marc and Mikhail reported out-of-memory conditions on s390 machines,
> and bisected it down to this upstream commit 41a5db8d8161 ("bpf: Add
> support for non-fix-size percpu mem allocation").
> This seems to eat up a lot of memory only based on the number of possible
> CPUs.
>
> If we have a machine with 8GB, 6 present CPUs and 512 possible CPUs (yes,
> this is a realistic scenario) the memory consumption directly after boot
> is:
>
> $ cat /sys/devices/system/cpu/present
> 0-5
> $ cat /sys/devices/system/cpu/possible
> 0-511
>
> Before this commit:
>
> $ cat /proc/meminfo
> MemTotal:        8141924 kB
> MemFree:         7639872 kB
>
> With this commit
>
> $ cat /proc/meminfo
> MemTotal:        8141924 kB
> MemFree:         4852248 kB
>
> So, this appears to be a significant regression.
> I'm quoting the rest of the original patch below for reference only.

Yes. Sorry about this. The issue slipped through.
It's fixed in bpf tree by commit:
https://patchwork.kernel.org/project/netdevbpf/patch/20231111013928.948838-1-yonghong.song@linux.dev/

The fix will be sent to Linus soon.
Hou Tao Nov. 16, 2023, 1:15 a.m. UTC | #4
Hi,

On 11/15/2023 11:31 PM, Heiko Carstens wrote:
> On Sun, Aug 27, 2023 at 08:27:34AM -0700, Yonghong Song wrote:
>> This is needed for later percpu mem allocation when the
>> allocation is done by bpf program. For such cases, a global
>> bpf_global_percpu_ma is added where a flexible allocation
>> size is needed.
>>
>> Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
>> ---
>>  include/linux/bpf.h   |  4 ++--
>>  kernel/bpf/core.c     |  8 +++++---
>>  kernel/bpf/memalloc.c | 14 ++++++--------
>>  3 files changed, 13 insertions(+), 13 deletions(-)
> Both Marc and Mikhail reported out-of-memory conditions on s390 machines,
> and bisected it down to this upstream commit 41a5db8d8161 ("bpf: Add
> support for non-fix-size percpu mem allocation").
> This seems to eat up a lot of memory only based on the number of possible
> CPUs.
>
> If we have a machine with 8GB, 6 present CPUs and 512 possible CPUs (yes,
> this is a realistic scenario) the memory consumption directly after boot
> is:
>
> $ cat /sys/devices/system/cpu/present
> 0-5
> $ cat /sys/devices/system/cpu/possible
> 0-511

Will the present CPUs be hot-added dynamically and eventually increase
to 512 CPUs ? Or will the present CPUs rarely be hot-added ? After all
possible CPUs are online, will these CPUs be hot-plugged dynamically ?
Because I am considering add CPU hotplug support for bpf mem allocator,
so we can allocate memory according to the present CPUs instead of
possible CPUs. But if the present CPUs will be increased to all possible
CPUs quickly, there will be not too much benefit to support hotplug in
bpf mem allocator.
Yonghong Song Nov. 16, 2023, 5:52 a.m. UTC | #5
On 11/15/23 8:15 PM, Hou Tao wrote:
> Hi,
>
> On 11/15/2023 11:31 PM, Heiko Carstens wrote:
>> On Sun, Aug 27, 2023 at 08:27:34AM -0700, Yonghong Song wrote:
>>> This is needed for later percpu mem allocation when the
>>> allocation is done by bpf program. For such cases, a global
>>> bpf_global_percpu_ma is added where a flexible allocation
>>> size is needed.
>>>
>>> Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
>>> ---
>>>   include/linux/bpf.h   |  4 ++--
>>>   kernel/bpf/core.c     |  8 +++++---
>>>   kernel/bpf/memalloc.c | 14 ++++++--------
>>>   3 files changed, 13 insertions(+), 13 deletions(-)
>> Both Marc and Mikhail reported out-of-memory conditions on s390 machines,
>> and bisected it down to this upstream commit 41a5db8d8161 ("bpf: Add
>> support for non-fix-size percpu mem allocation").
>> This seems to eat up a lot of memory only based on the number of possible
>> CPUs.
>>
>> If we have a machine with 8GB, 6 present CPUs and 512 possible CPUs (yes,
>> this is a realistic scenario) the memory consumption directly after boot
>> is:
>>
>> $ cat /sys/devices/system/cpu/present
>> 0-5
>> $ cat /sys/devices/system/cpu/possible
>> 0-511
> Will the present CPUs be hot-added dynamically and eventually increase
> to 512 CPUs ? Or will the present CPUs rarely be hot-added ? After all
> possible CPUs are online, will these CPUs be hot-plugged dynamically ?
> Because I am considering add CPU hotplug support for bpf mem allocator,
> so we can allocate memory according to the present CPUs instead of
> possible CPUs. But if the present CPUs will be increased to all possible
> CPUs quickly, there will be not too much benefit to support hotplug in
> bpf mem allocator.

Thanks, Hou. In my development machine and also checking
some internal production machines, no suprisingly, the 'present' number
cpus is equal to the 'possible' number of cpus. I suspect in most cases,
'present' and 'possible' are the same. But it would be good that
other people can share their 'present != possible' configuration
and explain why.
Heiko Carstens Nov. 16, 2023, 1:54 p.m. UTC | #6
On Thu, Nov 16, 2023 at 09:15:26AM +0800, Hou Tao wrote:
> > If we have a machine with 8GB, 6 present CPUs and 512 possible CPUs (yes,
> > this is a realistic scenario) the memory consumption directly after boot
> > is:
> >
> > $ cat /sys/devices/system/cpu/present
> > 0-5
> > $ cat /sys/devices/system/cpu/possible
> > 0-511
> 
> Will the present CPUs be hot-added dynamically and eventually increase
> to 512 CPUs ? Or will the present CPUs rarely be hot-added ? After all
> possible CPUs are online, will these CPUs be hot-plugged dynamically ?
> Because I am considering add CPU hotplug support for bpf mem allocator,
> so we can allocate memory according to the present CPUs instead of
> possible CPUs. But if the present CPUs will be increased to all possible
> CPUs quickly, there will be not too much benefit to support hotplug in
> bpf mem allocator.

You can assume that the present CPUs would change only very rarely. Even
though we are only talking about virtual CPUs in this case systems are
usually setup in a way that they have enough CPUs for their workload. Only
if that is not the case additional CPUs may be added (and brought online) -
which is usually much later than boot time.

Obviously the above is even more true for systems where you have to add new
CPUs in a physical way in order to change present CPUs.

So I guess it is fair to assume that if there is such a large difference
between present and possible CPUs, that this will also stay that way while
the system is running in most cases.

Or in other words: it sounds like it is worth to add CPU hotplug support
for the the bpf mem allocator (without that I would know what that would
really mean for the bpf code).

Note for the above numbers: I hacked the number of possible CPUs manually
in the kernel code just to illustrate the high memory consumption for the
report. On a real system you would see "0-399" CPUs instead.
But that's just a minor detail.
diff mbox series

Patch

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 12596af59c00..144dbddf53bd 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -55,8 +55,8 @@  struct cgroup;
 extern struct idr btf_idr;
 extern spinlock_t btf_idr_lock;
 extern struct kobject *btf_kobj;
-extern struct bpf_mem_alloc bpf_global_ma;
-extern bool bpf_global_ma_set;
+extern struct bpf_mem_alloc bpf_global_ma, bpf_global_percpu_ma;
+extern bool bpf_global_ma_set, bpf_global_percpu_ma_set;
 
 typedef u64 (*bpf_callback_t)(u64, u64, u64, u64, u64);
 typedef int (*bpf_iter_init_seq_priv_t)(void *private_data,
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 0f8f036d8bd1..95599df82ee4 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -64,8 +64,8 @@ 
 #define OFF	insn->off
 #define IMM	insn->imm
 
-struct bpf_mem_alloc bpf_global_ma;
-bool bpf_global_ma_set;
+struct bpf_mem_alloc bpf_global_ma, bpf_global_percpu_ma;
+bool bpf_global_ma_set, bpf_global_percpu_ma_set;
 
 /* No hurry in this branch
  *
@@ -2921,7 +2921,9 @@  static int __init bpf_global_ma_init(void)
 
 	ret = bpf_mem_alloc_init(&bpf_global_ma, 0, false);
 	bpf_global_ma_set = !ret;
-	return ret;
+	ret = bpf_mem_alloc_init(&bpf_global_percpu_ma, 0, true);
+	bpf_global_percpu_ma_set = !ret;
+	return !bpf_global_ma_set || !bpf_global_percpu_ma_set;
 }
 late_initcall(bpf_global_ma_init);
 #endif
diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
index 9c49ae53deaf..cb60445de98a 100644
--- a/kernel/bpf/memalloc.c
+++ b/kernel/bpf/memalloc.c
@@ -499,15 +499,16 @@  int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
 	struct obj_cgroup *objcg = NULL;
 	int cpu, i, unit_size, percpu_size = 0;
 
+	/* room for llist_node and per-cpu pointer */
+	if (percpu)
+		percpu_size = LLIST_NODE_SZ + sizeof(void *);
+
 	if (size) {
 		pc = __alloc_percpu_gfp(sizeof(*pc), 8, GFP_KERNEL);
 		if (!pc)
 			return -ENOMEM;
 
-		if (percpu)
-			/* room for llist_node and per-cpu pointer */
-			percpu_size = LLIST_NODE_SZ + sizeof(void *);
-		else
+		if (!percpu)
 			size += LLIST_NODE_SZ; /* room for llist_node */
 		unit_size = size;
 
@@ -527,10 +528,6 @@  int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
 		return 0;
 	}
 
-	/* size == 0 && percpu is an invalid combination */
-	if (WARN_ON_ONCE(percpu))
-		return -EINVAL;
-
 	pcc = __alloc_percpu_gfp(sizeof(*cc), 8, GFP_KERNEL);
 	if (!pcc)
 		return -ENOMEM;
@@ -543,6 +540,7 @@  int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
 			c = &cc->cache[i];
 			c->unit_size = sizes[i];
 			c->objcg = objcg;
+			c->percpu_size = percpu_size;
 			c->tgt = c;
 			prefill_mem_cache(c, cpu);
 		}