Message ID | 20230306084216.3186830-13-martin.lau@linux.dev (mailing list archive) |
---|---|
State | Superseded |
Delegated to: | BPF |
Series | bpf: Use bpf_mem_cache_alloc/free in bpf_local_storage |
On Mon, Mar 6, 2023 at 12:43 AM Martin KaFai Lau <martin.lau@linux.dev> wrote:
>
> From: Martin KaFai Lau <martin.lau@kernel.org>
>
> This patch uses bpf_mem_cache_alloc/free in bpf_selem_alloc/free.
>
> The ____cacheline_aligned attribute is no longer needed
> in 'struct bpf_local_storage_elem'. bpf_mem_cache_alloc will
> have 'struct llist_node' in front of the 'struct bpf_local_storage_elem'.
> It will use the 8bytes hole in the bpf_local_storage_elem.
>
> After bpf_mem_cache_alloc(), the SDATA(selem)->data is zero-ed because
> bpf_mem_cache_alloc() could return a reused selem. It is to keep
> the existing bpf_map_kzalloc() behavior. Only SDATA(selem)->data
> is zero-ed. SDATA(selem)->data is the visible part to the bpf prog.
> No need to use zero_map_value() to do the zeroing because
> bpf_selem_free() ensures no bpf prog is using the selem before
> returning the selem through bpf_mem_cache_free(). For the internal
> fields of selem, they will be initialized when linking to the
> new smap and the new local_storage.
>
> When bpf_mem_cache_alloc() fails, bpf_selem_alloc() will try to
> fallback to kzalloc only if the caller has GFP_KERNEL flag set (ie. from
> sleepable bpf prog that should not cause deadlock). BPF_MA_SIZE
> and BPF_MA_PTR macro are added for that.
>
> For the common selem free path where the selem is freed when its owner
> is also being freed, reuse_now == true and selem can be reused
> immediately. bpf_selem_free() uses bpf_mem_cache_free() where
> selem will be considered for immediate reuse.
>
> For the uncommon path that the bpf prog explicitly deletes the selem (by
> using the helper bpf_*_storage_delete), the selem cannot be reused
> immediately. reuse_now == false and bpf_selem_free() will stay with
> the current call_rcu_tasks_trace. BPF_MA_NODE macro is added to get
> the correct address for the kfree.
>
> mem_charge and mem_uncharge are changed to use the BPF_MA_SIZE
> macro. It will have a temporarily over-charge for the
> bpf_local_storage_alloc() because bpf_local_storage is not
> moved to bpf_mem_cache_alloc in this patch but it will be done
> in the next patch.
>
> Cc: Namhyung Kim <namhyung@kernel.org>
> Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
> ---
>  include/linux/bpf_local_storage.h |  8 ++---
>  include/linux/bpf_mem_alloc.h     |  5 +++
>  kernel/bpf/bpf_local_storage.c    | 56 +++++++++++++++++++++++++------
>  3 files changed, 53 insertions(+), 16 deletions(-)
>
> diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h
> index adb5023a1af5..a236c9b964cf 100644
> --- a/include/linux/bpf_local_storage.h
> +++ b/include/linux/bpf_local_storage.h
> @@ -13,6 +13,7 @@
>  #include <linux/list.h>
>  #include <linux/hash.h>
>  #include <linux/types.h>
> +#include <linux/bpf_mem_alloc.h>
>  #include <uapi/linux/btf.h>
>
>  #define BPF_LOCAL_STORAGE_CACHE_SIZE        16
> @@ -55,6 +56,7 @@ struct bpf_local_storage_map {
>          u32 bucket_log;
>          u16 elem_size;
>          u16 cache_idx;
> +        struct bpf_mem_alloc selem_ma;
>  };
>
>  struct bpf_local_storage_data {
> @@ -74,11 +76,7 @@ struct bpf_local_storage_elem {
>          struct hlist_node snode;        /* Linked to bpf_local_storage */
>          struct bpf_local_storage __rcu *local_storage;
>          struct rcu_head rcu;
> -        /* 8 bytes hole */
> -        /* The data is stored in another cacheline to minimize
> -         * the number of cachelines access during a cache hit.
> -         */
> -        struct bpf_local_storage_data sdata ____cacheline_aligned;
> +        struct bpf_local_storage_data sdata;
>  };
>
>  struct bpf_local_storage {
> diff --git a/include/linux/bpf_mem_alloc.h b/include/linux/bpf_mem_alloc.h
> index a7104af61ab4..0ab16fb0ab50 100644
> --- a/include/linux/bpf_mem_alloc.h
> +++ b/include/linux/bpf_mem_alloc.h
> @@ -5,6 +5,11 @@
>  #include <linux/compiler_types.h>
>  #include <linux/workqueue.h>
>
> +#define BPF_MA_NODE_SZ sizeof(struct llist_node)
> +#define BPF_MA_SIZE(_size) ((_size) + BPF_MA_NODE_SZ)
> +#define BPF_MA_PTR(_node) ((void *)(_node) + BPF_MA_NODE_SZ)
> +#define BPF_MA_NODE(_ptr) ((void *)(_ptr) - BPF_MA_NODE_SZ)
> +
>  struct bpf_mem_cache;
>  struct bpf_mem_caches;
>
> diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
> index 532b82084ba7..d3c0dd5737d6 100644
> --- a/kernel/bpf/bpf_local_storage.c
> +++ b/kernel/bpf/bpf_local_storage.c
> @@ -31,7 +31,7 @@ static int mem_charge(struct bpf_local_storage_map *smap, void *owner, u32 size)
>          if (!map->ops->map_local_storage_charge)
>                  return 0;
>
> -        return map->ops->map_local_storage_charge(smap, owner, size);
> +        return map->ops->map_local_storage_charge(smap, owner, BPF_MA_SIZE(size));
>  }
>
>  static void mem_uncharge(struct bpf_local_storage_map *smap, void *owner,
> @@ -40,7 +40,7 @@ static void mem_uncharge(struct bpf_local_storage_map *smap, void *owner,
>          struct bpf_map *map = &smap->map;
>
>          if (map->ops->map_local_storage_uncharge)
> -                map->ops->map_local_storage_uncharge(smap, owner, size);
> +                map->ops->map_local_storage_uncharge(smap, owner, BPF_MA_SIZE(size));
>  }
>
>  static struct bpf_local_storage __rcu **
> @@ -80,12 +80,32 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
>          if (charge_mem && mem_charge(smap, owner, smap->elem_size))
>                  return NULL;
>
> -        selem = bpf_map_kzalloc(&smap->map, smap->elem_size,
> -                                gfp_flags | __GFP_NOWARN);
> +        migrate_disable();
> +        selem = bpf_mem_cache_alloc(&smap->selem_ma);
> +        migrate_enable();
> +        if (!selem && (gfp_flags & GFP_KERNEL)) {
> +                void *ma_node;
> +
> +                ma_node = bpf_map_kzalloc(&smap->map,
> +                                          BPF_MA_SIZE(smap->elem_size),
> +                                          gfp_flags | __GFP_NOWARN);
> +                if (ma_node)
> +                        selem = BPF_MA_PTR(ma_node);
> +        }

If I understand it correctly the code is not trying
to free selem the same way it allocated it.
So we can have kzalloc-ed selems freed into bpf_mem_cache_alloc free-list.
That feels dangerous.
I don't think we can do such things in local storage,
but if we add this api to bpf_mem_alloc it might be acceptable.
I mean mem alloc will try to take from the free list and if empty
and GFP_KERNEL it will kzalloc it.
The knowledge of hidden llist_node shouldn't leave the bpf/memalloc.c file.
reuse_now should probably be a memalloc api flag too.
The implementation detail that it's scary but ok-ish to kfree or
bpf_mem_cache_free depending on circumstances should stay in memalloc.c
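A minimal sketch of the API shape suggested here, with both the free-list fast
path and the GFP_KERNEL fallback kept inside kernel/bpf/memalloc.c. The helper
name bpf_mem_cache_alloc_flags() matches the follow-up reply; take_from_free_list()
and cache_unit_size() are hypothetical stand-ins for memalloc.c internals, and
LLIST_NODE_SZ is its sizeof(struct llist_node):

void *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags)
{
        void *obj;

        /* Fast path: reuse an object from the per-cpu free list. */
        obj = take_from_free_list(ma);
        if (!obj && (flags & GFP_KERNEL)) {
                void *raw;

                /* Sleepable caller: fall back to a zeroed kmalloc, keeping
                 * room for the hidden llist_node in front of the object so
                 * it can later be freed into the cache like any other unit.
                 */
                raw = kzalloc(cache_unit_size(ma) + LLIST_NODE_SZ,
                              flags | __GFP_NOWARN);
                obj = raw ? raw + LLIST_NODE_SZ : NULL;
        }
        return obj;
}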
On 3/6/23 7:47 PM, Alexei Starovoitov wrote:
>> @@ -80,12 +80,32 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
>>          if (charge_mem && mem_charge(smap, owner, smap->elem_size))
>>                  return NULL;
>>
>> -        selem = bpf_map_kzalloc(&smap->map, smap->elem_size,
>> -                                gfp_flags | __GFP_NOWARN);
>> +        migrate_disable();
>> +        selem = bpf_mem_cache_alloc(&smap->selem_ma);
>> +        migrate_enable();
>> +        if (!selem && (gfp_flags & GFP_KERNEL)) {
>> +                void *ma_node;
>> +
>> +                ma_node = bpf_map_kzalloc(&smap->map,
>> +                                          BPF_MA_SIZE(smap->elem_size),
>> +                                          gfp_flags | __GFP_NOWARN);
>> +                if (ma_node)
>> +                        selem = BPF_MA_PTR(ma_node);
>> +        }
>
> If I understand it correctly the code is not trying
> to free selem the same way it allocated it.
> So we can have kzalloc-ed selems freed into bpf_mem_cache_alloc free-list.
> That feels dangerous.
> I don't think we can do such things in local storage,
> but if we add this api to bpf_mem_alloc it might be acceptable.
> I mean mem alloc will try to take from the free list and if empty
> and GFP_KERNEL it will kzalloc it.
> The knowledge of hidden llist_node shouldn't leave the bpf/memalloc.c file.
> reuse_now should probably be a memalloc api flag too.
> The implementation detail that it's scary but ok-ish to kfree or
> bpf_mem_cache_free depending on circumstances should stay in memalloc.c

All make sense. I will create a bpf_mem_cache_alloc_flags(..., gfp_t flags)
to hide the llist_node and kzalloc details.

For free, local storage still needs to use the selem->rcu head in its
call_rcu_tasks_trace(), so I will create a bpf_mem_cache_raw_free(void *ptr)
to hide the llist_node details, like:

/* 'struct bpf_mem_alloc *ma' is not available at this
 * point but the caller knows it is percpu or not and
 * call different raw_free function.
 */
void bpf_mem_cache_raw_free(void *ptr)
{
        kfree(ptr - LLIST_NODE_SZ);
}
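For illustration only, a rough sketch of how the local-storage side could then
look once those two helpers exist; selem_alloc_sketch() and
selem_free_rcu_sketch() are hypothetical wrappers, not code from the series:

/* Alloc path: the GFP_KERNEL kzalloc fallback is hidden behind
 * bpf_mem_cache_alloc_flags(), so no BPF_MA_* macro is needed here.
 */
static struct bpf_local_storage_elem *
selem_alloc_sketch(struct bpf_local_storage_map *smap, gfp_t gfp_flags)
{
        struct bpf_local_storage_elem *selem;

        migrate_disable();
        selem = bpf_mem_cache_alloc_flags(&smap->selem_ma, gfp_flags);
        migrate_enable();
        return selem;
}

/* reuse_now == false path: the rcu callback frees through
 * bpf_mem_cache_raw_free(), which handles the hidden llist_node offset.
 */
static void selem_free_rcu_sketch(struct rcu_head *rcu)
{
        struct bpf_local_storage_elem *selem;

        selem = container_of(rcu, struct bpf_local_storage_elem, rcu);
        bpf_mem_cache_raw_free(selem);
}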
diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h
index adb5023a1af5..a236c9b964cf 100644
--- a/include/linux/bpf_local_storage.h
+++ b/include/linux/bpf_local_storage.h
@@ -13,6 +13,7 @@
 #include <linux/list.h>
 #include <linux/hash.h>
 #include <linux/types.h>
+#include <linux/bpf_mem_alloc.h>
 #include <uapi/linux/btf.h>
 
 #define BPF_LOCAL_STORAGE_CACHE_SIZE        16
@@ -55,6 +56,7 @@ struct bpf_local_storage_map {
         u32 bucket_log;
         u16 elem_size;
         u16 cache_idx;
+        struct bpf_mem_alloc selem_ma;
 };
 
 struct bpf_local_storage_data {
@@ -74,11 +76,7 @@ struct bpf_local_storage_elem {
         struct hlist_node snode;        /* Linked to bpf_local_storage */
         struct bpf_local_storage __rcu *local_storage;
         struct rcu_head rcu;
-        /* 8 bytes hole */
-        /* The data is stored in another cacheline to minimize
-         * the number of cachelines access during a cache hit.
-         */
-        struct bpf_local_storage_data sdata ____cacheline_aligned;
+        struct bpf_local_storage_data sdata;
 };
 
 struct bpf_local_storage {
diff --git a/include/linux/bpf_mem_alloc.h b/include/linux/bpf_mem_alloc.h
index a7104af61ab4..0ab16fb0ab50 100644
--- a/include/linux/bpf_mem_alloc.h
+++ b/include/linux/bpf_mem_alloc.h
@@ -5,6 +5,11 @@
 #include <linux/compiler_types.h>
 #include <linux/workqueue.h>
 
+#define BPF_MA_NODE_SZ sizeof(struct llist_node)
+#define BPF_MA_SIZE(_size) ((_size) + BPF_MA_NODE_SZ)
+#define BPF_MA_PTR(_node) ((void *)(_node) + BPF_MA_NODE_SZ)
+#define BPF_MA_NODE(_ptr) ((void *)(_ptr) - BPF_MA_NODE_SZ)
+
 struct bpf_mem_cache;
 struct bpf_mem_caches;
 
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index 532b82084ba7..d3c0dd5737d6 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -31,7 +31,7 @@ static int mem_charge(struct bpf_local_storage_map *smap, void *owner, u32 size)
         if (!map->ops->map_local_storage_charge)
                 return 0;
 
-        return map->ops->map_local_storage_charge(smap, owner, size);
+        return map->ops->map_local_storage_charge(smap, owner, BPF_MA_SIZE(size));
 }
 
 static void mem_uncharge(struct bpf_local_storage_map *smap, void *owner,
@@ -40,7 +40,7 @@ static void mem_uncharge(struct bpf_local_storage_map *smap, void *owner,
         struct bpf_map *map = &smap->map;
 
         if (map->ops->map_local_storage_uncharge)
-                map->ops->map_local_storage_uncharge(smap, owner, size);
+                map->ops->map_local_storage_uncharge(smap, owner, BPF_MA_SIZE(size));
 }
 
 static struct bpf_local_storage __rcu **
@@ -80,12 +80,32 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
         if (charge_mem && mem_charge(smap, owner, smap->elem_size))
                 return NULL;
 
-        selem = bpf_map_kzalloc(&smap->map, smap->elem_size,
-                                gfp_flags | __GFP_NOWARN);
+        migrate_disable();
+        selem = bpf_mem_cache_alloc(&smap->selem_ma);
+        migrate_enable();
+        if (!selem && (gfp_flags & GFP_KERNEL)) {
+                void *ma_node;
+
+                ma_node = bpf_map_kzalloc(&smap->map,
+                                          BPF_MA_SIZE(smap->elem_size),
+                                          gfp_flags | __GFP_NOWARN);
+                if (ma_node)
+                        selem = BPF_MA_PTR(ma_node);
+        }
+
         if (selem) {
                 if (value)
                         copy_map_value(&smap->map, SDATA(selem)->data, value);
-                /* No need to call check_and_init_map_value as memory is zero init */
+                else
+                        /* Keep the original bpf_map_kzalloc behavior
+                         * before started using the bpf_mem_cache_alloc.
+                         *
+                         * No need to use zero_map_value. The bpf_selem_free()
+                         * only does bpf_mem_cache_free when there is
+                         * no other bpf prog is using the selem.
+                         */
+                        memset(SDATA(selem)->data, 0, smap->map.value_size);
+
                 return selem;
         }
 
@@ -129,7 +149,7 @@ static void bpf_selem_free_rcu(struct rcu_head *rcu)
         struct bpf_local_storage_elem *selem;
 
         selem = container_of(rcu, struct bpf_local_storage_elem, rcu);
-        kfree(selem);
+        kfree(BPF_MA_NODE(selem));
 }
 
 static void bpf_selem_free_trace_rcu(struct rcu_head *rcu)
@@ -145,10 +165,13 @@ void bpf_selem_free(struct bpf_local_storage_elem *selem,
                     bool reuse_now)
 {
         bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
-        if (!reuse_now)
+        if (!reuse_now) {
                 call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_trace_rcu);
-        else
-                call_rcu(&selem->rcu, bpf_selem_free_rcu);
+        } else {
+                migrate_disable();
+                bpf_mem_cache_free(&smap->selem_ma, selem);
+                migrate_enable();
+        }
 }
 
 /* local_storage->lock must be held and selem->local_storage == local_storage.
@@ -651,6 +674,7 @@ bpf_local_storage_map_alloc(union bpf_attr *attr,
         struct bpf_local_storage_map *smap;
         unsigned int i;
         u32 nbuckets;
+        int err;
 
         smap = bpf_map_area_alloc(sizeof(*smap), NUMA_NO_NODE);
         if (!smap)
@@ -665,8 +689,8 @@ bpf_local_storage_map_alloc(union bpf_attr *attr,
         smap->buckets = bpf_map_kvcalloc(&smap->map, sizeof(*smap->buckets),
                                          nbuckets, GFP_USER | __GFP_NOWARN);
         if (!smap->buckets) {
-                bpf_map_area_free(smap);
-                return ERR_PTR(-ENOMEM);
+                err = -ENOMEM;
+                goto free_smap;
         }
 
         for (i = 0; i < nbuckets; i++) {
@@ -677,8 +701,17 @@ bpf_local_storage_map_alloc(union bpf_attr *attr,
         smap->elem_size = offsetof(struct bpf_local_storage_elem,
                                    sdata.data[attr->value_size]);
 
+        err = bpf_mem_alloc_init(&smap->selem_ma, smap->elem_size, false);
+        if (err)
+                goto free_smap;
+
         smap->cache_idx = bpf_local_storage_cache_idx_get(cache);
         return &smap->map;
+
+free_smap:
+        kvfree(smap->buckets);
+        bpf_map_area_free(smap);
+        return ERR_PTR(err);
 }
 
 void bpf_local_storage_map_free(struct bpf_map *map,
@@ -744,6 +777,7 @@ void bpf_local_storage_map_free(struct bpf_map *map,
          */
         synchronize_rcu();
 
+        bpf_mem_alloc_destroy(&smap->selem_ma);
         kvfree(smap->buckets);
         bpf_map_area_free(smap);
 }
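As a side note on the BPF_MA_* macros used in this version of the patch: a
fallback object allocated with kzalloc has to be converted with BPF_MA_PTR()
after allocation and back with BPF_MA_NODE() before kfree(), so both sides
account for the same hidden llist_node header. The two functions below are
illustrative only, not part of the patch:

static struct bpf_local_storage_elem *
fallback_alloc_example(struct bpf_local_storage_map *smap, gfp_t gfp_flags)
{
        void *ma_node;

        /* Allocate elem_size plus room for the hidden llist_node. */
        ma_node = bpf_map_kzalloc(&smap->map, BPF_MA_SIZE(smap->elem_size),
                                  gfp_flags | __GFP_NOWARN);
        return ma_node ? BPF_MA_PTR(ma_node) : NULL;
}

static void fallback_free_example(struct bpf_local_storage_elem *selem)
{
        /* BPF_MA_NODE() undoes BPF_MA_PTR(): kfree() must see the address
         * that bpf_map_kzalloc() originally returned.
         */
        kfree(BPF_MA_NODE(selem));
}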