Message ID | 20230827152734.1995725-1-yonghong.song@linux.dev (mailing list archive) |
---|---|
State | Accepted |
Commit | baa9555b6c1c88e885820c82fcc05d36e30daaa2 |
Delegated to: | BPF |
Headers | show |
Series | bpf: Add support for local percpu kptr | expand |
Hi, On 8/27/2023 11:27 PM, Yonghong Song wrote: > This is needed for later percpu mem allocation when the > allocation is done by bpf program. For such cases, a global > bpf_global_percpu_ma is added where a flexible allocation > size is needed. > > Signed-off-by: Yonghong Song <yonghong.song@linux.dev> > --- > include/linux/bpf.h | 4 ++-- > kernel/bpf/core.c | 8 +++++--- > kernel/bpf/memalloc.c | 14 ++++++-------- > 3 files changed, 13 insertions(+), 13 deletions(-) > > diff --git a/include/linux/bpf.h b/include/linux/bpf.h > index 12596af59c00..144dbddf53bd 100644 > --- a/include/linux/bpf.h > +++ b/include/linux/bpf.h > @@ -55,8 +55,8 @@ struct cgroup; > extern struct idr btf_idr; > extern spinlock_t btf_idr_lock; > extern struct kobject *btf_kobj; > -extern struct bpf_mem_alloc bpf_global_ma; > -extern bool bpf_global_ma_set; > +extern struct bpf_mem_alloc bpf_global_ma, bpf_global_percpu_ma; > +extern bool bpf_global_ma_set, bpf_global_percpu_ma_set; > > typedef u64 (*bpf_callback_t)(u64, u64, u64, u64, u64); > typedef int (*bpf_iter_init_seq_priv_t)(void *private_data, > diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c > index 0f8f036d8bd1..95599df82ee4 100644 > --- a/kernel/bpf/core.c > +++ b/kernel/bpf/core.c > @@ -64,8 +64,8 @@ > #define OFF insn->off > #define IMM insn->imm > > -struct bpf_mem_alloc bpf_global_ma; > -bool bpf_global_ma_set; > +struct bpf_mem_alloc bpf_global_ma, bpf_global_percpu_ma; > +bool bpf_global_ma_set, bpf_global_percpu_ma_set; > > /* No hurry in this branch > * > @@ -2921,7 +2921,9 @@ static int __init bpf_global_ma_init(void) > > ret = bpf_mem_alloc_init(&bpf_global_ma, 0, false); > bpf_global_ma_set = !ret; > - return ret; > + ret = bpf_mem_alloc_init(&bpf_global_percpu_ma, 0, true); > + bpf_global_percpu_ma_set = !ret; > + return !bpf_global_ma_set || !bpf_global_percpu_ma_set; > } > late_initcall(bpf_global_ma_init); > #endif > diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c > index 9c49ae53deaf..cb60445de98a 
100644 > --- a/kernel/bpf/memalloc.c > +++ b/kernel/bpf/memalloc.c > @@ -499,15 +499,16 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu) > struct obj_cgroup *objcg = NULL; > int cpu, i, unit_size, percpu_size = 0; > > + /* room for llist_node and per-cpu pointer */ > + if (percpu) > + percpu_size = LLIST_NODE_SZ + sizeof(void *); > + > if (size) { > pc = __alloc_percpu_gfp(sizeof(*pc), 8, GFP_KERNEL); > if (!pc) > return -ENOMEM; > > - if (percpu) > - /* room for llist_node and per-cpu pointer */ > - percpu_size = LLIST_NODE_SZ + sizeof(void *); > - else > + if (!percpu) > size += LLIST_NODE_SZ; /* room for llist_node */ > unit_size = size; > > @@ -527,10 +528,6 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu) > return 0; > } > > - /* size == 0 && percpu is an invalid combination */ > - if (WARN_ON_ONCE(percpu)) > - return -EINVAL; > - > pcc = __alloc_percpu_gfp(sizeof(*cc), 8, GFP_KERNEL); > if (!pcc) > return -ENOMEM; > @@ -543,6 +540,7 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu) > c = &cc->cache[i]; > c->unit_size = sizes[i]; > c->objcg = objcg; > + c->percpu_size = percpu_size; > c->tgt = c; > prefill_mem_cache(c, cpu); > } It would trigger the WARN_ON_ONCE in free_bulk(). Because for per-cpu ptr returned by bpf memory allocator, ksize() in bpf_mem_free() will return 16 constantly, so it will be unmatched with the unit_size of the source bpf_mem_cache. Have not found a good solution for the problem yet except disabling the warning for per-cpu bpf memory allocator, because now per-cpu allocator doesn't have an API to return the size of the returned ptr. Could we make the verifier to remember the size of the allocator object and pass it to bpf memory allocator ?
On Sun, Aug 27, 2023 at 08:27:34AM -0700, Yonghong Song wrote: > This is needed for later percpu mem allocation when the > allocation is done by bpf program. For such cases, a global > bpf_global_percpu_ma is added where a flexible allocation > size is needed. > > Signed-off-by: Yonghong Song <yonghong.song@linux.dev> > --- > include/linux/bpf.h | 4 ++-- > kernel/bpf/core.c | 8 +++++--- > kernel/bpf/memalloc.c | 14 ++++++-------- > 3 files changed, 13 insertions(+), 13 deletions(-) Both Marc and Mikhail reported out-of-memory conditions on s390 machines, and bisected it down to this upstream commit 41a5db8d8161 ("bpf: Add support for non-fix-size percpu mem allocation"). This seems to eat up a lot of memory only based on the number of possible CPUs. If we have a machine with 8GB, 6 present CPUs and 512 possible CPUs (yes, this is a realistic scenario) the memory consumption directly after boot is: $ cat /sys/devices/system/cpu/present 0-5 $ cat /sys/devices/system/cpu/possible 0-511 Before this commit: $ cat /proc/meminfo MemTotal: 8141924 kB MemFree: 7639872 kB With this commit $ cat /proc/meminfo MemTotal: 8141924 kB MemFree: 4852248 kB So, this appears to be a significant regression. I'm quoting the rest of the original patch below for reference only. 
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h > index 12596af59c00..144dbddf53bd 100644 > --- a/include/linux/bpf.h > +++ b/include/linux/bpf.h > @@ -55,8 +55,8 @@ struct cgroup; > extern struct idr btf_idr; > extern spinlock_t btf_idr_lock; > extern struct kobject *btf_kobj; > -extern struct bpf_mem_alloc bpf_global_ma; > -extern bool bpf_global_ma_set; > +extern struct bpf_mem_alloc bpf_global_ma, bpf_global_percpu_ma; > +extern bool bpf_global_ma_set, bpf_global_percpu_ma_set; > > typedef u64 (*bpf_callback_t)(u64, u64, u64, u64, u64); > typedef int (*bpf_iter_init_seq_priv_t)(void *private_data, > diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c > index 0f8f036d8bd1..95599df82ee4 100644 > --- a/kernel/bpf/core.c > +++ b/kernel/bpf/core.c > @@ -64,8 +64,8 @@ > #define OFF insn->off > #define IMM insn->imm > > -struct bpf_mem_alloc bpf_global_ma; > -bool bpf_global_ma_set; > +struct bpf_mem_alloc bpf_global_ma, bpf_global_percpu_ma; > +bool bpf_global_ma_set, bpf_global_percpu_ma_set; > > /* No hurry in this branch > * > @@ -2921,7 +2921,9 @@ static int __init bpf_global_ma_init(void) > > ret = bpf_mem_alloc_init(&bpf_global_ma, 0, false); > bpf_global_ma_set = !ret; > - return ret; > + ret = bpf_mem_alloc_init(&bpf_global_percpu_ma, 0, true); > + bpf_global_percpu_ma_set = !ret; > + return !bpf_global_ma_set || !bpf_global_percpu_ma_set; > } > late_initcall(bpf_global_ma_init); > #endif > diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c > index 9c49ae53deaf..cb60445de98a 100644 > --- a/kernel/bpf/memalloc.c > +++ b/kernel/bpf/memalloc.c > @@ -499,15 +499,16 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu) > struct obj_cgroup *objcg = NULL; > int cpu, i, unit_size, percpu_size = 0; > > + /* room for llist_node and per-cpu pointer */ > + if (percpu) > + percpu_size = LLIST_NODE_SZ + sizeof(void *); > + > if (size) { > pc = __alloc_percpu_gfp(sizeof(*pc), 8, GFP_KERNEL); > if (!pc) > return -ENOMEM; > > - if (percpu) 
> - /* room for llist_node and per-cpu pointer */ > - percpu_size = LLIST_NODE_SZ + sizeof(void *); > - else > + if (!percpu) > size += LLIST_NODE_SZ; /* room for llist_node */ > unit_size = size; > > @@ -527,10 +528,6 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu) > return 0; > } > > - /* size == 0 && percpu is an invalid combination */ > - if (WARN_ON_ONCE(percpu)) > - return -EINVAL; > - > pcc = __alloc_percpu_gfp(sizeof(*cc), 8, GFP_KERNEL); > if (!pcc) > return -ENOMEM; > @@ -543,6 +540,7 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu) > c = &cc->cache[i]; > c->unit_size = sizes[i]; > c->objcg = objcg; > + c->percpu_size = percpu_size; > c->tgt = c; > prefill_mem_cache(c, cpu); > } > -- > 2.34.1 >
On Wed, Nov 15, 2023 at 7:32 AM Heiko Carstens <hca@linux.ibm.com> wrote: > > On Sun, Aug 27, 2023 at 08:27:34AM -0700, Yonghong Song wrote: > > This is needed for later percpu mem allocation when the > > allocation is done by bpf program. For such cases, a global > > bpf_global_percpu_ma is added where a flexible allocation > > size is needed. > > > > Signed-off-by: Yonghong Song <yonghong.song@linux.dev> > > --- > > include/linux/bpf.h | 4 ++-- > > kernel/bpf/core.c | 8 +++++--- > > kernel/bpf/memalloc.c | 14 ++++++-------- > > 3 files changed, 13 insertions(+), 13 deletions(-) > > Both Marc and Mikhail reported out-of-memory conditions on s390 machines, > and bisected it down to this upstream commit 41a5db8d8161 ("bpf: Add > support for non-fix-size percpu mem allocation"). > This seems to eat up a lot of memory only based on the number of possible > CPUs. > > If we have a machine with 8GB, 6 present CPUs and 512 possible CPUs (yes, > this is a realistic scenario) the memory consumption directly after boot > is: > > $ cat /sys/devices/system/cpu/present > 0-5 > $ cat /sys/devices/system/cpu/possible > 0-511 > > Before this commit: > > $ cat /proc/meminfo > MemTotal: 8141924 kB > MemFree: 7639872 kB > > With this commit > > $ cat /proc/meminfo > MemTotal: 8141924 kB > MemFree: 4852248 kB > > So, this appears to be a significant regression. > I'm quoting the rest of the original patch below for reference only. Yes. Sorry about this. The issue slipped through. It's fixed in bpf tree by commit: https://patchwork.kernel.org/project/netdevbpf/patch/20231111013928.948838-1-yonghong.song@linux.dev/ The fix will be sent to Linus soon.
Hi, On 11/15/2023 11:31 PM, Heiko Carstens wrote: > On Sun, Aug 27, 2023 at 08:27:34AM -0700, Yonghong Song wrote: >> This is needed for later percpu mem allocation when the >> allocation is done by bpf program. For such cases, a global >> bpf_global_percpu_ma is added where a flexible allocation >> size is needed. >> >> Signed-off-by: Yonghong Song <yonghong.song@linux.dev> >> --- >> include/linux/bpf.h | 4 ++-- >> kernel/bpf/core.c | 8 +++++--- >> kernel/bpf/memalloc.c | 14 ++++++-------- >> 3 files changed, 13 insertions(+), 13 deletions(-) > Both Marc and Mikhail reported out-of-memory conditions on s390 machines, > and bisected it down to this upstream commit 41a5db8d8161 ("bpf: Add > support for non-fix-size percpu mem allocation"). > This seems to eat up a lot of memory only based on the number of possible > CPUs. > > If we have a machine with 8GB, 6 present CPUs and 512 possible CPUs (yes, > this is a realistic scenario) the memory consumption directly after boot > is: > > $ cat /sys/devices/system/cpu/present > 0-5 > $ cat /sys/devices/system/cpu/possible > 0-511 Will the present CPUs be hot-added dynamically and eventually increase to 512 CPUs ? Or will the present CPUs rarely be hot-added ? After all possible CPUs are online, will these CPUs be hot-plugged dynamically ? Because I am considering add CPU hotplug support for bpf mem allocator, so we can allocate memory according to the present CPUs instead of possible CPUs. But if the present CPUs will be increased to all possible CPUs quickly, there will be not too much benefit to support hotplug in bpf mem allocator.
On 11/15/23 8:15 PM, Hou Tao wrote: > Hi, > > On 11/15/2023 11:31 PM, Heiko Carstens wrote: >> On Sun, Aug 27, 2023 at 08:27:34AM -0700, Yonghong Song wrote: >>> This is needed for later percpu mem allocation when the >>> allocation is done by bpf program. For such cases, a global >>> bpf_global_percpu_ma is added where a flexible allocation >>> size is needed. >>> >>> Signed-off-by: Yonghong Song <yonghong.song@linux.dev> >>> --- >>> include/linux/bpf.h | 4 ++-- >>> kernel/bpf/core.c | 8 +++++--- >>> kernel/bpf/memalloc.c | 14 ++++++-------- >>> 3 files changed, 13 insertions(+), 13 deletions(-) >> Both Marc and Mikhail reported out-of-memory conditions on s390 machines, >> and bisected it down to this upstream commit 41a5db8d8161 ("bpf: Add >> support for non-fix-size percpu mem allocation"). >> This seems to eat up a lot of memory only based on the number of possible >> CPUs. >> >> If we have a machine with 8GB, 6 present CPUs and 512 possible CPUs (yes, >> this is a realistic scenario) the memory consumption directly after boot >> is: >> >> $ cat /sys/devices/system/cpu/present >> 0-5 >> $ cat /sys/devices/system/cpu/possible >> 0-511 > Will the present CPUs be hot-added dynamically and eventually increase > to 512 CPUs ? Or will the present CPUs rarely be hot-added ? After all > possible CPUs are online, will these CPUs be hot-plugged dynamically ? > Because I am considering add CPU hotplug support for bpf mem allocator, > so we can allocate memory according to the present CPUs instead of > possible CPUs. But if the present CPUs will be increased to all possible > CPUs quickly, there will be not too much benefit to support hotplug in > bpf mem allocator. Thanks, Hou. On my development machine and also checking some internal production machines, not surprisingly, the 'present' number of cpus is equal to the 'possible' number of cpus. I suspect in most cases, 'present' and 'possible' are the same. 
But it would be good if other people could share their 'present != possible' configurations and explain why.
On Thu, Nov 16, 2023 at 09:15:26AM +0800, Hou Tao wrote: > > If we have a machine with 8GB, 6 present CPUs and 512 possible CPUs (yes, > > this is a realistic scenario) the memory consumption directly after boot > > is: > > > > $ cat /sys/devices/system/cpu/present > > 0-5 > > $ cat /sys/devices/system/cpu/possible > > 0-511 > > Will the present CPUs be hot-added dynamically and eventually increase > to 512 CPUs ? Or will the present CPUs rarely be hot-added ? After all > possible CPUs are online, will these CPUs be hot-plugged dynamically ? > Because I am considering add CPU hotplug support for bpf mem allocator, > so we can allocate memory according to the present CPUs instead of > possible CPUs. But if the present CPUs will be increased to all possible > CPUs quickly, there will be not too much benefit to support hotplug in > bpf mem allocator. You can assume that the present CPUs would change only very rarely. Even though we are only talking about virtual CPUs in this case systems are usually setup in a way that they have enough CPUs for their workload. Only if that is not the case additional CPUs may be added (and brought online) - which is usually much later than boot time. Obviously the above is even more true for systems where you have to add new CPUs in a physical way in order to change present CPUs. So I guess it is fair to assume that if there is such a large difference between present and possible CPUs, that this will also stay that way while the system is running in most cases. Or in other words: it sounds like it is worth adding CPU hotplug support for the bpf mem allocator (without that I wouldn't know what that would really mean for the bpf code). Note for the above numbers: I hacked the number of possible CPUs manually in the kernel code just to illustrate the high memory consumption for the report. On a real system you would see "0-399" CPUs instead. But that's just a minor detail.
diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 12596af59c00..144dbddf53bd 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -55,8 +55,8 @@ struct cgroup; extern struct idr btf_idr; extern spinlock_t btf_idr_lock; extern struct kobject *btf_kobj; -extern struct bpf_mem_alloc bpf_global_ma; -extern bool bpf_global_ma_set; +extern struct bpf_mem_alloc bpf_global_ma, bpf_global_percpu_ma; +extern bool bpf_global_ma_set, bpf_global_percpu_ma_set; typedef u64 (*bpf_callback_t)(u64, u64, u64, u64, u64); typedef int (*bpf_iter_init_seq_priv_t)(void *private_data, diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 0f8f036d8bd1..95599df82ee4 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -64,8 +64,8 @@ #define OFF insn->off #define IMM insn->imm -struct bpf_mem_alloc bpf_global_ma; -bool bpf_global_ma_set; +struct bpf_mem_alloc bpf_global_ma, bpf_global_percpu_ma; +bool bpf_global_ma_set, bpf_global_percpu_ma_set; /* No hurry in this branch * @@ -2921,7 +2921,9 @@ static int __init bpf_global_ma_init(void) ret = bpf_mem_alloc_init(&bpf_global_ma, 0, false); bpf_global_ma_set = !ret; - return ret; + ret = bpf_mem_alloc_init(&bpf_global_percpu_ma, 0, true); + bpf_global_percpu_ma_set = !ret; + return !bpf_global_ma_set || !bpf_global_percpu_ma_set; } late_initcall(bpf_global_ma_init); #endif diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c index 9c49ae53deaf..cb60445de98a 100644 --- a/kernel/bpf/memalloc.c +++ b/kernel/bpf/memalloc.c @@ -499,15 +499,16 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu) struct obj_cgroup *objcg = NULL; int cpu, i, unit_size, percpu_size = 0; + /* room for llist_node and per-cpu pointer */ + if (percpu) + percpu_size = LLIST_NODE_SZ + sizeof(void *); + if (size) { pc = __alloc_percpu_gfp(sizeof(*pc), 8, GFP_KERNEL); if (!pc) return -ENOMEM; - if (percpu) - /* room for llist_node and per-cpu pointer */ - percpu_size = LLIST_NODE_SZ + sizeof(void *); - else + if (!percpu) 
size += LLIST_NODE_SZ; /* room for llist_node */ unit_size = size; @@ -527,10 +528,6 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu) return 0; } - /* size == 0 && percpu is an invalid combination */ - if (WARN_ON_ONCE(percpu)) - return -EINVAL; - pcc = __alloc_percpu_gfp(sizeof(*cc), 8, GFP_KERNEL); if (!pcc) return -ENOMEM; @@ -543,6 +540,7 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu) c = &cc->cache[i]; c->unit_size = sizes[i]; c->objcg = objcg; + c->percpu_size = percpu_size; c->tgt = c; prefill_mem_cache(c, cpu); }
This is needed for later percpu mem allocation when the allocation is done by bpf program. For such cases, a global bpf_global_percpu_ma is added where a flexible allocation size is needed. Signed-off-by: Yonghong Song <yonghong.song@linux.dev> --- include/linux/bpf.h | 4 ++-- kernel/bpf/core.c | 8 +++++--- kernel/bpf/memalloc.c | 14 ++++++-------- 3 files changed, 13 insertions(+), 13 deletions(-)