diff mbox series

[bpf-next,v3] bpf: Do not allocate percpu memory at init stage

Message ID 20231111013928.948838-1-yonghong.song@linux.dev (mailing list archive)
State Accepted
Commit 1fda5bb66ad8fb24ecb3858e61a13a6548428898
Delegated to: BPF
Headers show
Series [bpf-next,v3] bpf: Do not allocate percpu memory at init stage | expand

Checks

Context Check Description
netdev/series_format success Single patches do not need cover letters
netdev/tree_selection success Clearly marked for bpf-next, async
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 2677 this patch: 2677
netdev/cc_maintainers warning 7 maintainers not CCed: jolsa@kernel.org martin.lau@linux.dev song@kernel.org haoluo@google.com john.fastabend@gmail.com sdf@google.com kpsingh@kernel.org
netdev/build_clang success Errors and warnings before: 1296 this patch: 1296
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success Fixes tag looks correct
netdev/build_allmodconfig_warn success Errors and warnings before: 2756 this patch: 2756
netdev/checkpatch warning WARNING: 'upto' may be misspelled - perhaps 'up to'? WARNING: Too many leading tabs - consider code refactoring WARNING: line length of 101 exceeds 80 columns WARNING: line length of 113 exceeds 80 columns WARNING: line length of 82 exceeds 80 columns WARNING: line length of 96 exceeds 80 columns
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-next-PR success PR summary
bpf/vmtest-bpf-next-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Validate matrix.py
bpf/vmtest-bpf-next-VM_Test-3 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-8 success Logs for aarch64-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-7 success Logs for aarch64-gcc / test (test_verifier, false, 360) / test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-4 success Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-5 success Logs for aarch64-gcc / test (test_progs, false, 360) / test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-6 success Logs for aarch64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-9 success Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-17 success Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-15 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-14 success Logs for s390x-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-18 success Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-16 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-19 success Logs for x86_64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-20 success Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-23 success Logs for x86_64-gcc / veristat / veristat on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-21 success Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-22 success Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-24 success Logs for x86_64-llvm-16 / build / build for x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-27 success Logs for x86_64-llvm-16 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-26 success Logs for x86_64-llvm-16 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-28 success Logs for x86_64-llvm-16 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-25 success Logs for x86_64-llvm-16 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-29 success Logs for x86_64-llvm-16 / veristat
bpf/vmtest-bpf-next-VM_Test-13 success Logs for s390x-gcc / test (test_verifier, false, 360) / test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-12 success Logs for s390x-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-11 success Logs for s390x-gcc / test (test_progs, false, 360) / test_progs on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-10 success Logs for s390x-gcc / test (test_maps, false, 360) / test_maps on s390x with gcc

Commit Message

Yonghong Song Nov. 11, 2023, 1:39 a.m. UTC
Kirill Shutemov reported significant percpu memory consumption increase after
booting in 288-cpu VM ([1]) due to commit 41a5db8d8161 ("bpf: Add support for
non-fix-size percpu mem allocation"). The percpu memory consumption is
increased from 111MB to 969MB. The number is from /proc/meminfo.

I tried to reproduce the issue with my local VM which at most supports up to
255 cpus. With 252 cpus, without the above commit, the percpu memory
consumption immediately after boot is 57MB while with the above commit the
percpu memory consumption is 231MB.

This is not good since so far percpu memory from bpf memory allocator is not
widely used yet. Let us change pre-allocation in init stage to on-demand
allocation when verifier detects there is a need of percpu memory for bpf
program. With this change, percpu memory consumption after boot can be reduced
significantly.

  [1] https://lore.kernel.org/lkml/20231109154934.4saimljtqx625l3v@box.shutemov.name/

Fixes: 41a5db8d8161 ("bpf: Add support for non-fix-size percpu mem allocation")
Reported-and-tested-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
---
 include/linux/bpf.h   |  2 +-
 kernel/bpf/core.c     |  8 +++-----
 kernel/bpf/verifier.c | 20 ++++++++++++++++++--
 3 files changed, 22 insertions(+), 8 deletions(-)

Changelog:
  v2 -> v3:
    - Use dedicated mutex lock (bpf_percpu_ma_lock)
  v1 -> v2:
    - Add proper Reported-and-tested-by tag.
    - Do a check of !bpf_global_percpu_ma_set before acquiring verifier_lock.

Comments

Hou Tao Nov. 13, 2023, 12:42 p.m. UTC | #1
Hi,

On 11/11/2023 9:39 AM, Yonghong Song wrote:
> Kirill Shutemov reported significant percpu memory consumption increase after
> booting in 288-cpu VM ([1]) due to commit 41a5db8d8161 ("bpf: Add support for
> non-fix-size percpu mem allocation"). The percpu memory consumption is
> increased from 111MB to 969MB. The number is from /proc/meminfo.
>
> I tried to reproduce the issue with my local VM which at most supports upto
> 255 cpus. With 252 cpus, without the above commit, the percpu memory
> consumption immediately after boot is 57MB while with the above commit the
> percpu memory consumption is 231MB.
>
> This is not good since so far percpu memory from bpf memory allocator is not
> widely used yet. Let us change pre-allocation in init stage to on-demand
> allocation when verifier detects there is a need of percpu memory for bpf
> program. With this change, percpu memory consumption after boot can be reduced
> signicantly.
>
>   [1] https://lore.kernel.org/lkml/20231109154934.4saimljtqx625l3v@box.shutemov.name/
>
> Fixes: 41a5db8d8161 ("bpf: Add support for non-fix-size percpu mem allocation")
> Reported-and-tested-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
> ---
>  include/linux/bpf.h   |  2 +-
>  kernel/bpf/core.c     |  8 +++-----
>  kernel/bpf/verifier.c | 20 ++++++++++++++++++--
>  3 files changed, 22 insertions(+), 8 deletions(-)
>
> Changelog:
>   v2 -> v3:
>     - Use dedicated mutex lock (bpf_percpu_ma_lock)
>   v1 -> v2:
>     - Add proper Reported-and-tested-by tag.
>     - Do a check of !bpf_global_percpu_ma_set before acquiring verifier_lock.
>
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 35bff17396c0..6762dac3ef76 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -56,7 +56,7 @@ extern struct idr btf_idr;
>  extern spinlock_t btf_idr_lock;
>  extern struct kobject *btf_kobj;
>  extern struct bpf_mem_alloc bpf_global_ma, bpf_global_percpu_ma;
> -extern bool bpf_global_ma_set, bpf_global_percpu_ma_set;
> +extern bool bpf_global_ma_set;
>  
>  typedef u64 (*bpf_callback_t)(u64, u64, u64, u64, u64);
>  typedef int (*bpf_iter_init_seq_priv_t)(void *private_data,
> diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
> index 08626b519ce2..cd3afe57ece3 100644
> --- a/kernel/bpf/core.c
> +++ b/kernel/bpf/core.c
> @@ -64,8 +64,8 @@
>  #define OFF	insn->off
>  #define IMM	insn->imm
>  
> -struct bpf_mem_alloc bpf_global_ma, bpf_global_percpu_ma;
> -bool bpf_global_ma_set, bpf_global_percpu_ma_set;
> +struct bpf_mem_alloc bpf_global_ma;
> +bool bpf_global_ma_set;
>  
>  /* No hurry in this branch
>   *
> @@ -2934,9 +2934,7 @@ static int __init bpf_global_ma_init(void)
>  
>  	ret = bpf_mem_alloc_init(&bpf_global_ma, 0, false);
>  	bpf_global_ma_set = !ret;
> -	ret = bpf_mem_alloc_init(&bpf_global_percpu_ma, 0, true);
> -	bpf_global_percpu_ma_set = !ret;
> -	return !bpf_global_ma_set || !bpf_global_percpu_ma_set;
> +	return ret;
>  }
>  late_initcall(bpf_global_ma_init);
>  #endif
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index a2267d5ed14e..6da370a047fe 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -26,6 +26,7 @@
>  #include <linux/poison.h>
>  #include <linux/module.h>
>  #include <linux/cpumask.h>
> +#include <linux/bpf_mem_alloc.h>
>  #include <net/xdp.h>
>  
>  #include "disasm.h"
> @@ -41,6 +42,9 @@ static const struct bpf_verifier_ops * const bpf_verifier_ops[] = {
>  #undef BPF_LINK_TYPE
>  };
>  
> +struct bpf_mem_alloc bpf_global_percpu_ma;
> +static bool bpf_global_percpu_ma_set;
> +
>  /* bpf_check() is a static code analyzer that walks eBPF program
>   * instruction by instruction and updates register/stack state.
>   * All paths of conditional branches are analyzed until 'bpf_exit' insn.
> @@ -336,6 +340,7 @@ struct bpf_kfunc_call_arg_meta {
>  struct btf *btf_vmlinux;
>  
>  static DEFINE_MUTEX(bpf_verifier_lock);
> +static DEFINE_MUTEX(bpf_percpu_ma_lock);
>  
>  static const struct bpf_line_info *
>  find_linfo(const struct bpf_verifier_env *env, u32 insn_off)
> @@ -12091,8 +12096,19 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
>  				if (meta.func_id == special_kfunc_list[KF_bpf_obj_new_impl] && !bpf_global_ma_set)
>  					return -ENOMEM;
>  
> -				if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl] && !bpf_global_percpu_ma_set)
> -					return -ENOMEM;
> +				if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) {
> +					if (!bpf_global_percpu_ma_set) {
> +						mutex_lock(&bpf_percpu_ma_lock);
> +						if (!bpf_global_percpu_ma_set) {
> +							err = bpf_mem_alloc_init(&bpf_global_percpu_ma, 0, true);
> +							if (!err)
> +								bpf_global_percpu_ma_set = true;
> +						}

A dumb question here: do we need some memory barrier to guarantee the
memory order between bpf_global_percpu_ma_set and bpf_global_percpu_ma ?
> +						mutex_unlock(&bpf_percpu_ma_lock);
> +						if (err)
> +							return err;
> +					}
> +				}
>  
>  				if (((u64)(u32)meta.arg_constant.value) != meta.arg_constant.value) {
>  					verbose(env, "local type ID argument must be in range [0, U32_MAX]\n");
Yonghong Song Nov. 14, 2023, 3:23 a.m. UTC | #2
On 11/13/23 4:42 AM, Hou Tao wrote:
> Hi,
>
> On 11/11/2023 9:39 AM, Yonghong Song wrote:
>> Kirill Shutemov reported significant percpu memory consumption increase after
>> booting in 288-cpu VM ([1]) due to commit 41a5db8d8161 ("bpf: Add support for
>> non-fix-size percpu mem allocation"). The percpu memory consumption is
>> increased from 111MB to 969MB. The number is from /proc/meminfo.
>>
>> I tried to reproduce the issue with my local VM which at most supports upto
>> 255 cpus. With 252 cpus, without the above commit, the percpu memory
>> consumption immediately after boot is 57MB while with the above commit the
>> percpu memory consumption is 231MB.
>>
>> This is not good since so far percpu memory from bpf memory allocator is not
>> widely used yet. Let us change pre-allocation in init stage to on-demand
>> allocation when verifier detects there is a need of percpu memory for bpf
>> program. With this change, percpu memory consumption after boot can be reduced
>> signicantly.
>>
>>    [1] https://lore.kernel.org/lkml/20231109154934.4saimljtqx625l3v@box.shutemov.name/
>>
>> Fixes: 41a5db8d8161 ("bpf: Add support for non-fix-size percpu mem allocation")
>> Reported-and-tested-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
>> Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
>> ---
>>   include/linux/bpf.h   |  2 +-
>>   kernel/bpf/core.c     |  8 +++-----
>>   kernel/bpf/verifier.c | 20 ++++++++++++++++++--
>>   3 files changed, 22 insertions(+), 8 deletions(-)
>>
>> Changelog:
>>    v2 -> v3:
>>      - Use dedicated mutex lock (bpf_percpu_ma_lock)
>>    v1 -> v2:
>>      - Add proper Reported-and-tested-by tag.
>>      - Do a check of !bpf_global_percpu_ma_set before acquiring verifier_lock.
>>
>> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
>> index 35bff17396c0..6762dac3ef76 100644
>> --- a/include/linux/bpf.h
>> +++ b/include/linux/bpf.h
>> @@ -56,7 +56,7 @@ extern struct idr btf_idr;
>>   extern spinlock_t btf_idr_lock;
>>   extern struct kobject *btf_kobj;
>>   extern struct bpf_mem_alloc bpf_global_ma, bpf_global_percpu_ma;
>> -extern bool bpf_global_ma_set, bpf_global_percpu_ma_set;
>> +extern bool bpf_global_ma_set;
>>   
>>   typedef u64 (*bpf_callback_t)(u64, u64, u64, u64, u64);
>>   typedef int (*bpf_iter_init_seq_priv_t)(void *private_data,
>> diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
>> index 08626b519ce2..cd3afe57ece3 100644
>> --- a/kernel/bpf/core.c
>> +++ b/kernel/bpf/core.c
>> @@ -64,8 +64,8 @@
>>   #define OFF	insn->off
>>   #define IMM	insn->imm
>>   
>> -struct bpf_mem_alloc bpf_global_ma, bpf_global_percpu_ma;
>> -bool bpf_global_ma_set, bpf_global_percpu_ma_set;
>> +struct bpf_mem_alloc bpf_global_ma;
>> +bool bpf_global_ma_set;
>>   
>>   /* No hurry in this branch
>>    *
>> @@ -2934,9 +2934,7 @@ static int __init bpf_global_ma_init(void)
>>   
>>   	ret = bpf_mem_alloc_init(&bpf_global_ma, 0, false);
>>   	bpf_global_ma_set = !ret;
>> -	ret = bpf_mem_alloc_init(&bpf_global_percpu_ma, 0, true);
>> -	bpf_global_percpu_ma_set = !ret;
>> -	return !bpf_global_ma_set || !bpf_global_percpu_ma_set;
>> +	return ret;
>>   }
>>   late_initcall(bpf_global_ma_init);
>>   #endif
>> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
>> index a2267d5ed14e..6da370a047fe 100644
>> --- a/kernel/bpf/verifier.c
>> +++ b/kernel/bpf/verifier.c
>> @@ -26,6 +26,7 @@
>>   #include <linux/poison.h>
>>   #include <linux/module.h>
>>   #include <linux/cpumask.h>
>> +#include <linux/bpf_mem_alloc.h>
>>   #include <net/xdp.h>
>>   
>>   #include "disasm.h"
>> @@ -41,6 +42,9 @@ static const struct bpf_verifier_ops * const bpf_verifier_ops[] = {
>>   #undef BPF_LINK_TYPE
>>   };
>>   
>> +struct bpf_mem_alloc bpf_global_percpu_ma;
>> +static bool bpf_global_percpu_ma_set;
>> +
>>   /* bpf_check() is a static code analyzer that walks eBPF program
>>    * instruction by instruction and updates register/stack state.
>>    * All paths of conditional branches are analyzed until 'bpf_exit' insn.
>> @@ -336,6 +340,7 @@ struct bpf_kfunc_call_arg_meta {
>>   struct btf *btf_vmlinux;
>>   
>>   static DEFINE_MUTEX(bpf_verifier_lock);
>> +static DEFINE_MUTEX(bpf_percpu_ma_lock);
>>   
>>   static const struct bpf_line_info *
>>   find_linfo(const struct bpf_verifier_env *env, u32 insn_off)
>> @@ -12091,8 +12096,19 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
>>   				if (meta.func_id == special_kfunc_list[KF_bpf_obj_new_impl] && !bpf_global_ma_set)
>>   					return -ENOMEM;
>>   
>> -				if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl] && !bpf_global_percpu_ma_set)
>> -					return -ENOMEM;
>> +				if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) {
>> +					if (!bpf_global_percpu_ma_set) {
>> +						mutex_lock(&bpf_percpu_ma_lock);
>> +						if (!bpf_global_percpu_ma_set) {
>> +							err = bpf_mem_alloc_init(&bpf_global_percpu_ma, 0, true);
>> +							if (!err)
>> +								bpf_global_percpu_ma_set = true;
>> +						}
> A dumb question here: do we need some memory barrier to guarantee the
> memory order between bpf_global_percpu_ma_set and bpf_global_percpu_ma ?

We should be fine. There is a control dependence on '!err' for
'bpf_global_percpu_ma_set = true'.

>> +						mutex_unlock(&bpf_percpu_ma_lock);
>> +						if (err)
>> +							return err;
>> +					}
>> +				}
>>   
>>   				if (((u64)(u32)meta.arg_constant.value) != meta.arg_constant.value) {
>>   					verbose(env, "local type ID argument must be in range [0, U32_MAX]\n");
Hou Tao Nov. 14, 2023, 4:03 a.m. UTC | #3
On 11/14/2023 11:23 AM, Yonghong Song wrote:
>
> On 11/13/23 4:42 AM, Hou Tao wrote:
>> Hi,
>>
>> On 11/11/2023 9:39 AM, Yonghong Song wrote:
>>> Kirill Shutemov reported significant percpu memory consumption
>>> increase after
>>> booting in 288-cpu VM ([1]) due to commit 41a5db8d8161 ("bpf: Add
>>> support for
>>> non-fix-size percpu mem allocation"). The percpu memory consumption is
>>> increased from 111MB to 969MB. The number is from /proc/meminfo.
>>>
>>> I tried to reproduce the issue with my local VM which at most
>>> supports upto
>>> 255 cpus. With 252 cpus, without the above commit, the percpu memory
>>> consumption immediately after boot is 57MB while with the above
>>> commit the
>>> percpu memory consumption is 231MB.
>>>
>>> This is not good since so far percpu memory from bpf memory
>>> allocator is not
>>> widely used yet. Let us change pre-allocation in init stage to
>>> on-demand
>>> allocation when verifier detects there is a need of percpu memory
>>> for bpf
>>> program. With this change, percpu memory consumption after boot can
>>> be reduced
>>> signicantly.
>>>
>>>    [1]
>>> https://lore.kernel.org/lkml/20231109154934.4saimljtqx625l3v@box.shutemov.name/
>>>
>>> Fixes: 41a5db8d8161 ("bpf: Add support for non-fix-size percpu mem
>>> allocation")
>>> Reported-and-tested-by: Kirill A. Shutemov
>>> <kirill.shutemov@linux.intel.com>
>>> Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
>>> ---
>>>   include/linux/bpf.h   |  2 +-
>>>   kernel/bpf/core.c     |  8 +++-----
>>>   kernel/bpf/verifier.c | 20 ++++++++++++++++++--
>>>   3 files changed, 22 insertions(+), 8 deletions(-)
>>>
>>> Changelog:
>>>    v2 -> v3:
>>>      - Use dedicated mutex lock (bpf_percpu_ma_lock)
>>>    v1 -> v2:
>>>      - Add proper Reported-and-tested-by tag.
>>>      - Do a check of !bpf_global_percpu_ma_set before acquiring
>>> verifier_lock.
>>>
>>> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
>>> index 35bff17396c0..6762dac3ef76 100644
>>> --- a/include/linux/bpf.h
>>> +++ b/include/linux/bpf.h
>>> @@ -56,7 +56,7 @@ extern struct idr btf_idr;
>>>   extern spinlock_t btf_idr_lock;
>>>   extern struct kobject *btf_kobj;
>>>   extern struct bpf_mem_alloc bpf_global_ma, bpf_global_percpu_ma;
>>> -extern bool bpf_global_ma_set, bpf_global_percpu_ma_set;
>>> +extern bool bpf_global_ma_set;
>>>     typedef u64 (*bpf_callback_t)(u64, u64, u64, u64, u64);
>>>   typedef int (*bpf_iter_init_seq_priv_t)(void *private_data,
>>> diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
>>> index 08626b519ce2..cd3afe57ece3 100644
>>> --- a/kernel/bpf/core.c
>>> +++ b/kernel/bpf/core.c
>>> @@ -64,8 +64,8 @@
>>>   #define OFF    insn->off
>>>   #define IMM    insn->imm
>>>   -struct bpf_mem_alloc bpf_global_ma, bpf_global_percpu_ma;
>>> -bool bpf_global_ma_set, bpf_global_percpu_ma_set;
>>> +struct bpf_mem_alloc bpf_global_ma;
>>> +bool bpf_global_ma_set;
>>>     /* No hurry in this branch
>>>    *
>>> @@ -2934,9 +2934,7 @@ static int __init bpf_global_ma_init(void)
>>>         ret = bpf_mem_alloc_init(&bpf_global_ma, 0, false);
>>>       bpf_global_ma_set = !ret;
>>> -    ret = bpf_mem_alloc_init(&bpf_global_percpu_ma, 0, true);
>>> -    bpf_global_percpu_ma_set = !ret;
>>> -    return !bpf_global_ma_set || !bpf_global_percpu_ma_set;
>>> +    return ret;
>>>   }
>>>   late_initcall(bpf_global_ma_init);
>>>   #endif
>>> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
>>> index a2267d5ed14e..6da370a047fe 100644
>>> --- a/kernel/bpf/verifier.c
>>> +++ b/kernel/bpf/verifier.c
>>> @@ -26,6 +26,7 @@
>>>   #include <linux/poison.h>
>>>   #include <linux/module.h>
>>>   #include <linux/cpumask.h>
>>> +#include <linux/bpf_mem_alloc.h>
>>>   #include <net/xdp.h>
>>>     #include "disasm.h"
>>> @@ -41,6 +42,9 @@ static const struct bpf_verifier_ops * const
>>> bpf_verifier_ops[] = {
>>>   #undef BPF_LINK_TYPE
>>>   };
>>>   +struct bpf_mem_alloc bpf_global_percpu_ma;
>>> +static bool bpf_global_percpu_ma_set;
>>> +
>>>   /* bpf_check() is a static code analyzer that walks eBPF program
>>>    * instruction by instruction and updates register/stack state.
>>>    * All paths of conditional branches are analyzed until 'bpf_exit'
>>> insn.
>>> @@ -336,6 +340,7 @@ struct bpf_kfunc_call_arg_meta {
>>>   struct btf *btf_vmlinux;
>>>     static DEFINE_MUTEX(bpf_verifier_lock);
>>> +static DEFINE_MUTEX(bpf_percpu_ma_lock);
>>>     static const struct bpf_line_info *
>>>   find_linfo(const struct bpf_verifier_env *env, u32 insn_off)
>>> @@ -12091,8 +12096,19 @@ static int check_kfunc_call(struct
>>> bpf_verifier_env *env, struct bpf_insn *insn,
>>>                   if (meta.func_id ==
>>> special_kfunc_list[KF_bpf_obj_new_impl] && !bpf_global_ma_set)
>>>                       return -ENOMEM;
>>>   -                if (meta.func_id ==
>>> special_kfunc_list[KF_bpf_percpu_obj_new_impl] &&
>>> !bpf_global_percpu_ma_set)
>>> -                    return -ENOMEM;
>>> +                if (meta.func_id ==
>>> special_kfunc_list[KF_bpf_percpu_obj_new_impl]) {
>>> +                    if (!bpf_global_percpu_ma_set) {
>>> +                        mutex_lock(&bpf_percpu_ma_lock);
>>> +                        if (!bpf_global_percpu_ma_set) {
>>> +                            err =
>>> bpf_mem_alloc_init(&bpf_global_percpu_ma, 0, true);
>>> +                            if (!err)
>>> +                                bpf_global_percpu_ma_set = true;
>>> +                        }
>> A dumb question here: do we need some memory barrier to guarantee the
>> memory order between bpf_global_percpu_ma_set and bpf_global_percpu_ma ?
>
> We should be fine. There is a control dependence on '!err' for
> 'bpf_global_percpu_ma_set = true'.

I see. Thanks for the explanation.
>
>>> +                        mutex_unlock(&bpf_percpu_ma_lock);
>>> +                        if (err)
>>> +                            return err;
>>> +                    }
>>> +                }
>>>                     if (((u64)(u32)meta.arg_constant.value) !=
>>> meta.arg_constant.value) {
>>>                       verbose(env, "local type ID argument must be
>>> in range [0, U32_MAX]\n");
>
> .
Hou Tao Nov. 14, 2023, 4:12 a.m. UTC | #4
On 11/11/2023 9:39 AM, Yonghong Song wrote:
> Kirill Shutemov reported significant percpu memory consumption increase after
> booting in 288-cpu VM ([1]) due to commit 41a5db8d8161 ("bpf: Add support for
> non-fix-size percpu mem allocation"). The percpu memory consumption is
> increased from 111MB to 969MB. The number is from /proc/meminfo.
>
> I tried to reproduce the issue with my local VM which at most supports upto
> 255 cpus. With 252 cpus, without the above commit, the percpu memory
> consumption immediately after boot is 57MB while with the above commit the
> percpu memory consumption is 231MB.
>
> This is not good since so far percpu memory from bpf memory allocator is not
> widely used yet. Let us change pre-allocation in init stage to on-demand
> allocation when verifier detects there is a need of percpu memory for bpf
> program. With this change, percpu memory consumption after boot can be reduced
> signicantly.
>
>   [1] https://lore.kernel.org/lkml/20231109154934.4saimljtqx625l3v@box.shutemov.name/
>
> Fixes: 41a5db8d8161 ("bpf: Add support for non-fix-size percpu mem allocation")
> Reported-and-tested-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> Signed-off-by: Yonghong Song <yonghong.song@linux.dev>

Acked-by: Hou Tao <houtao1@huawei.com>
patchwork-bot+netdevbpf@kernel.org Nov. 15, 2023, 4 p.m. UTC | #5
Hello:

This patch was applied to bpf/bpf.git (master)
by Alexei Starovoitov <ast@kernel.org>:

On Fri, 10 Nov 2023 17:39:28 -0800 you wrote:
> Kirill Shutemov reported significant percpu memory consumption increase after
> booting in 288-cpu VM ([1]) due to commit 41a5db8d8161 ("bpf: Add support for
> non-fix-size percpu mem allocation"). The percpu memory consumption is
> increased from 111MB to 969MB. The number is from /proc/meminfo.
> 
> I tried to reproduce the issue with my local VM which at most supports upto
> 255 cpus. With 252 cpus, without the above commit, the percpu memory
> consumption immediately after boot is 57MB while with the above commit the
> percpu memory consumption is 231MB.
> 
> [...]

Here is the summary with links:
  - [bpf-next,v3] bpf: Do not allocate percpu memory at init stage
    https://git.kernel.org/bpf/bpf/c/1fda5bb66ad8

You are awesome, thank you!
diff mbox series

Patch

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 35bff17396c0..6762dac3ef76 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -56,7 +56,7 @@  extern struct idr btf_idr;
 extern spinlock_t btf_idr_lock;
 extern struct kobject *btf_kobj;
 extern struct bpf_mem_alloc bpf_global_ma, bpf_global_percpu_ma;
-extern bool bpf_global_ma_set, bpf_global_percpu_ma_set;
+extern bool bpf_global_ma_set;
 
 typedef u64 (*bpf_callback_t)(u64, u64, u64, u64, u64);
 typedef int (*bpf_iter_init_seq_priv_t)(void *private_data,
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 08626b519ce2..cd3afe57ece3 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -64,8 +64,8 @@ 
 #define OFF	insn->off
 #define IMM	insn->imm
 
-struct bpf_mem_alloc bpf_global_ma, bpf_global_percpu_ma;
-bool bpf_global_ma_set, bpf_global_percpu_ma_set;
+struct bpf_mem_alloc bpf_global_ma;
+bool bpf_global_ma_set;
 
 /* No hurry in this branch
  *
@@ -2934,9 +2934,7 @@  static int __init bpf_global_ma_init(void)
 
 	ret = bpf_mem_alloc_init(&bpf_global_ma, 0, false);
 	bpf_global_ma_set = !ret;
-	ret = bpf_mem_alloc_init(&bpf_global_percpu_ma, 0, true);
-	bpf_global_percpu_ma_set = !ret;
-	return !bpf_global_ma_set || !bpf_global_percpu_ma_set;
+	return ret;
 }
 late_initcall(bpf_global_ma_init);
 #endif
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index a2267d5ed14e..6da370a047fe 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -26,6 +26,7 @@ 
 #include <linux/poison.h>
 #include <linux/module.h>
 #include <linux/cpumask.h>
+#include <linux/bpf_mem_alloc.h>
 #include <net/xdp.h>
 
 #include "disasm.h"
@@ -41,6 +42,9 @@  static const struct bpf_verifier_ops * const bpf_verifier_ops[] = {
 #undef BPF_LINK_TYPE
 };
 
+struct bpf_mem_alloc bpf_global_percpu_ma;
+static bool bpf_global_percpu_ma_set;
+
 /* bpf_check() is a static code analyzer that walks eBPF program
  * instruction by instruction and updates register/stack state.
  * All paths of conditional branches are analyzed until 'bpf_exit' insn.
@@ -336,6 +340,7 @@  struct bpf_kfunc_call_arg_meta {
 struct btf *btf_vmlinux;
 
 static DEFINE_MUTEX(bpf_verifier_lock);
+static DEFINE_MUTEX(bpf_percpu_ma_lock);
 
 static const struct bpf_line_info *
 find_linfo(const struct bpf_verifier_env *env, u32 insn_off)
@@ -12091,8 +12096,19 @@  static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 				if (meta.func_id == special_kfunc_list[KF_bpf_obj_new_impl] && !bpf_global_ma_set)
 					return -ENOMEM;
 
-				if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl] && !bpf_global_percpu_ma_set)
-					return -ENOMEM;
+				if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) {
+					if (!bpf_global_percpu_ma_set) {
+						mutex_lock(&bpf_percpu_ma_lock);
+						if (!bpf_global_percpu_ma_set) {
+							err = bpf_mem_alloc_init(&bpf_global_percpu_ma, 0, true);
+							if (!err)
+								bpf_global_percpu_ma_set = true;
+						}
+						mutex_unlock(&bpf_percpu_ma_lock);
+						if (err)
+							return err;
+					}
+				}
 
 				if (((u64)(u32)meta.arg_constant.value) != meta.arg_constant.value) {
 					verbose(env, "local type ID argument must be in range [0, U32_MAX]\n");