Message ID | 20240221225911.757861-3-thinker.li@gmail.com (mailing list archive) |
---|---|
State | Superseded |
Delegated to: | BPF |
Headers | show |
Series | Allow struct_ops maps with a large number of programs | expand |
On 2/21/24 2:59 PM, thinker.li@gmail.com wrote: > @@ -531,10 +567,10 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, > const struct btf_type *module_type; > const struct btf_member *member; > const struct btf_type *t = st_ops_desc->type; > + void *image = NULL, *image_end = NULL; > struct bpf_tramp_links *tlinks; > void *udata, *kdata; > int prog_fd, err; > - void *image, *image_end; > u32 i; > > if (flags) > @@ -573,15 +609,14 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, > > udata = &uvalue->data; > kdata = &kvalue->data; > - image = st_map->image; > - image_end = st_map->image + PAGE_SIZE; > > module_type = btf_type_by_id(btf_vmlinux, st_ops_ids[IDX_MODULE_ID]); > for_each_member(i, t, member) { > const struct btf_type *mtype, *ptype; > struct bpf_prog *prog; > struct bpf_tramp_link *link; > - u32 moff; > + u32 moff, tflags; > + int tsize; > > moff = __btf_member_bit_offset(t, member) / 8; > ptype = btf_type_resolve_ptr(st_map->btf, member->type, NULL); > @@ -653,10 +688,38 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, > &bpf_struct_ops_link_lops, prog); > st_map->links[i] = &link->link; > > - err = bpf_struct_ops_prepare_trampoline(tlinks, link, > - &st_ops->func_models[i], > - *(void **)(st_ops->cfi_stubs + moff), > - image, image_end); > + tflags = BPF_TRAMP_F_INDIRECT; > + if (st_ops->func_models[i].ret_size > 0) > + tflags |= BPF_TRAMP_F_RET_FENTRY_RET; > + > + /* Compute the size of the trampoline */ > + tlinks[BPF_TRAMP_FENTRY].links[0] = link; > + tlinks[BPF_TRAMP_FENTRY].nr_links = 1; > + tsize = arch_bpf_trampoline_size(&st_ops->func_models[i], > + tflags, tlinks, NULL); > + if (tsize < 0) { > + err = tsize; > + goto reset_unlock; > + } > + > + /* Allocate pages */ > + if (tsize > (unsigned long)image_end - (unsigned long)image) { > + if (tsize > PAGE_SIZE) { > + err = -E2BIG; > + goto reset_unlock; > + } > + image = bpf_struct_ops_map_inc_image(st_map); > + 
if (IS_ERR(image)) { > + err = PTR_ERR(image); > + goto reset_unlock; > + } > + image_end = image + PAGE_SIZE; > + } > + > + err = arch_prepare_bpf_trampoline(NULL, image, image_end, > + &st_ops->func_models[i], > + tflags, tlinks, > + *(void **)(st_ops->cfi_stubs + moff)); I don't prefer to copy the BPF_TRAMP_F_* setting on tflags, tlinks, and the arch_*_trampoline_*() logic from bpf_struct_ops_prepare_trampoline() which is used by the bpf_dummy_ops for testing also. Considering struct_ops supports kernel module now, in the future, it is better to move bpf_dummy_ops out to the bpf_testmod somehow and avoid its bpf_struct_ops_prepare_trampoline() usage. For now, it is still better to keep bpf_struct_ops_prepare_trampoline() to be reusable by both. Have you thought about the earlier suggestion in v1 to do arch_alloc_bpf_trampoline() in bpf_struct_ops_prepare_trampoline() instead of copying codes from bpf_struct_ops_prepare_trampoline() to bpf_struct_ops_map_update_elem()? Something like this (untested code): void *bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks, struct bpf_tramp_link *link, const struct btf_func_model *model, void *stub_func, void *image, u32 *image_off, bool allow_alloc) { u32 flags = BPF_TRAMP_F_INDIRECT; void *new_image = NULL; int size; tlinks[BPF_TRAMP_FENTRY].links[0] = link; tlinks[BPF_TRAMP_FENTRY].nr_links = 1; if (model->ret_size > 0) flags |= BPF_TRAMP_F_RET_FENTRY_RET; size = arch_bpf_trampoline_size(model, flags, tlinks, NULL); if (size < 0) return ERR_PTR(size); if (!image || size > PAGE_SIZE - *image_off) { int err; if (!allow_alloc) return ERR_PTR(-E2BIG); err = bpf_jit_charge_modmem(PAGE_SIZE); if (err) return ERR_PTR(err); new_image = image = arch_alloc_bpf_trampoline(PAGE_SIZE); if (!new_image) { bpf_jit_uncharge_modmem(PAGE_SIZE); return ERR_PTR(-ENOMEM); } *image_off = 0; } size = arch_prepare_bpf_trampoline(NULL, image + *image_off, image + PAGE_SIZE, model, flags, tlinks, stub_func); if (size >= 0) { 
*image_off += size; return image; } if (new_image) { bpf_jit_uncharge_modmem(PAGE_SIZE); arch_free_bpf_trampoline(new_image, PAGE_SIZE); } return ERR_PTR(size); } ---- pw-bot: cr > if (err < 0) > goto reset_unlock; > > @@ -672,10 +735,11 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, > if (err) > goto reset_unlock; > } > + for (i = 0; i < st_map->image_pages_cnt; i++) > + arch_protect_bpf_trampoline(st_map->image_pages[i], PAGE_SIZE); > > if (st_map->map.map_flags & BPF_F_LINK) { > err = 0; > - arch_protect_bpf_trampoline(st_map->image, PAGE_SIZE); > /* Let bpf_link handle registration & unregistration. > * > * Pair with smp_load_acquire() during lookup_elem(). > @@ -684,7 +748,6 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, > goto unlock; > } > > - arch_protect_bpf_trampoline(st_map->image, PAGE_SIZE); > err = st_ops->reg(kdata); > if (likely(!err)) { > /* This refcnt increment on the map here after > @@ -707,9 +770,9 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, > * there was a race in registering the struct_ops (under the same name) to > * a sub-system through different struct_ops's maps. 
> */ > - arch_unprotect_bpf_trampoline(st_map->image, PAGE_SIZE); > > reset_unlock: > + bpf_struct_ops_map_free_image(st_map); > bpf_struct_ops_map_put_progs(st_map); > memset(uvalue, 0, map->value_size); > memset(kvalue, 0, map->value_size); > @@ -776,10 +839,7 @@ static void __bpf_struct_ops_map_free(struct bpf_map *map) > if (st_map->links) > bpf_struct_ops_map_put_progs(st_map); > bpf_map_area_free(st_map->links); > - if (st_map->image) { > - arch_free_bpf_trampoline(st_map->image, PAGE_SIZE); > - bpf_jit_uncharge_modmem(PAGE_SIZE); > - } > + bpf_struct_ops_map_free_image(st_map); > bpf_map_area_free(st_map->uvalue); > bpf_map_area_free(st_map); > } > @@ -889,20 +949,6 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr) > st_map->st_ops_desc = st_ops_desc; > map = &st_map->map; > > - ret = bpf_jit_charge_modmem(PAGE_SIZE); > - if (ret) > - goto errout_free; > - > - st_map->image = arch_alloc_bpf_trampoline(PAGE_SIZE); > - if (!st_map->image) { > - /* __bpf_struct_ops_map_free() uses st_map->image as flag > - * for "charged or not". In this case, we need to unchange > - * here. > - */ > - bpf_jit_uncharge_modmem(PAGE_SIZE); > - ret = -ENOMEM; > - goto errout_free; > - } > st_map->uvalue = bpf_map_area_alloc(vt->size, NUMA_NO_NODE); > st_map->links_cnt = btf_type_vlen(t); > st_map->links =
On 2/22/24 16:33, Martin KaFai Lau wrote: > On 2/21/24 2:59 PM, thinker.li@gmail.com wrote: >> @@ -531,10 +567,10 @@ static long >> bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, >> const struct btf_type *module_type; >> const struct btf_member *member; >> const struct btf_type *t = st_ops_desc->type; >> + void *image = NULL, *image_end = NULL; >> struct bpf_tramp_links *tlinks; >> void *udata, *kdata; >> int prog_fd, err; >> - void *image, *image_end; >> u32 i; >> if (flags) >> @@ -573,15 +609,14 @@ static long >> bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, >> udata = &uvalue->data; >> kdata = &kvalue->data; >> - image = st_map->image; >> - image_end = st_map->image + PAGE_SIZE; >> module_type = btf_type_by_id(btf_vmlinux, >> st_ops_ids[IDX_MODULE_ID]); >> for_each_member(i, t, member) { >> const struct btf_type *mtype, *ptype; >> struct bpf_prog *prog; >> struct bpf_tramp_link *link; >> - u32 moff; >> + u32 moff, tflags; >> + int tsize; >> moff = __btf_member_bit_offset(t, member) / 8; >> ptype = btf_type_resolve_ptr(st_map->btf, member->type, NULL); >> @@ -653,10 +688,38 @@ static long >> bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, >> &bpf_struct_ops_link_lops, prog); >> st_map->links[i] = &link->link; >> - err = bpf_struct_ops_prepare_trampoline(tlinks, link, >> - &st_ops->func_models[i], >> - *(void **)(st_ops->cfi_stubs + moff), >> - image, image_end); >> + tflags = BPF_TRAMP_F_INDIRECT; >> + if (st_ops->func_models[i].ret_size > 0) >> + tflags |= BPF_TRAMP_F_RET_FENTRY_RET; >> + >> + /* Compute the size of the trampoline */ >> + tlinks[BPF_TRAMP_FENTRY].links[0] = link; >> + tlinks[BPF_TRAMP_FENTRY].nr_links = 1; >> + tsize = arch_bpf_trampoline_size(&st_ops->func_models[i], >> + tflags, tlinks, NULL); >> + if (tsize < 0) { >> + err = tsize; >> + goto reset_unlock; >> + } >> + >> + /* Allocate pages */ >> + if (tsize > (unsigned long)image_end - (unsigned long)image) { >> + if (tsize > PAGE_SIZE) { >> + 
err = -E2BIG; >> + goto reset_unlock; >> + } >> + image = bpf_struct_ops_map_inc_image(st_map); >> + if (IS_ERR(image)) { >> + err = PTR_ERR(image); >> + goto reset_unlock; >> + } >> + image_end = image + PAGE_SIZE; >> + } >> + >> + err = arch_prepare_bpf_trampoline(NULL, image, image_end, >> + &st_ops->func_models[i], >> + tflags, tlinks, >> + *(void **)(st_ops->cfi_stubs + moff)); > > I don't prefer to copy the BPF_TRAMP_F_* setting on tflags, tlinks, and > the arch_*_trampoline_*() logic from bpf_struct_ops_prepare_trampoline() > which is used by the bpf_dummy_ops for testing also. Considering > struct_ops supports kernel module now, in the future, it is better to > move bpf_dummy_ops out to the bpf_testmod somehow and avoid its > bpf_struct_ops_prepare_trampoline() usage. For now, it is still better > to keep bpf_struct_ops_prepare_trampoline() to be reusable by both. > > Have you thought about the earlier suggestion in v1 to do > arch_alloc_bpf_trampoline() in bpf_struct_ops_prepare_trampoline() > instead of copying codes from bpf_struct_ops_prepare_trampoline() to > bpf_struct_ops_map_update_elem()? > > Something like this (untested code): > > void *bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks, > struct bpf_tramp_link *link, > const struct btf_func_model *model, > void *stub_func, void *image, > u32 *image_off, > bool allow_alloc) How about pass a struct bpf_struct_ops_map to bpf_struct_ops_prepare_trampoline(). If the pointer of struct bpf_struct_ops_map is not NULL, try to allocate new pages for the map? For example, static int _bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks, struct bpf_tramp_link *link, const struct btf_func_model *model, void *stub_func, void *image, void *image_end, struct bpf_struct_ops_map *st_map) { ... 
if (!image || size > PAGE_SIZE - *image_off) { if (!st_map) return -E2BIG; image = bpf_struct_ops_map_inc_image(st_map); if (IS_ERR(image)) return PTR_ERR(image); image_end = image + PAGE_SIZE; } ... } int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks, struct bpf_tramp_link *link, const struct btf_func_model*model, void *stub_func, void *image, void *image_end) { return _bpf_struct_ops_prepare_trampoline(tlinks, link, model, stub_func, image, image_end, NULL); } > { > u32 flags = BPF_TRAMP_F_INDIRECT; > void *new_image = NULL; > int size; > > tlinks[BPF_TRAMP_FENTRY].links[0] = link; > tlinks[BPF_TRAMP_FENTRY].nr_links = 1; > > if (model->ret_size > 0) > flags |= BPF_TRAMP_F_RET_FENTRY_RET; > > size = arch_bpf_trampoline_size(model, flags, tlinks, NULL); > if (size < 0) > return ERR_PTR(size); > if (!image || size > PAGE_SIZE - *image_off) { > int err; > > if (!allow_alloc) > return ERR_PTR(-E2BIG); > > err = bpf_jit_charge_modmem(PAGE_SIZE); > if (err) > return ERR_PTR(err); > > new_image = image = arch_alloc_bpf_trampoline(PAGE_SIZE); > if (!new_image) { > bpf_jit_uncharge_modmem(PAGE_SIZE); > return ERR_PTR(-ENOMEM); > } > *image_off = 0; > } > > > size = arch_prepare_bpf_trampoline(NULL, image + *image_off, > image + PAGE_SIZE, > model, flags, tlinks, stub_func); > if (size >= 0) { > *image_off += size; > return image; > } > > if (new_image) { > bpf_jit_uncharge_modmem(PAGE_SIZE); > arch_free_bpf_trampoline(new_image, PAGE_SIZE); > } > > return ERR_PTR(size); > } > > ---- > > pw-bot: cr > >> if (err < 0) >> goto reset_unlock; >> @@ -672,10 +735,11 @@ static long >> bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, >> if (err) >> goto reset_unlock; >> } >> + for (i = 0; i < st_map->image_pages_cnt; i++) >> + arch_protect_bpf_trampoline(st_map->image_pages[i], PAGE_SIZE); >> if (st_map->map.map_flags & BPF_F_LINK) { >> err = 0; >> - arch_protect_bpf_trampoline(st_map->image, PAGE_SIZE); >> /* Let bpf_link handle registration & 
unregistration. >> * >> * Pair with smp_load_acquire() during lookup_elem(). >> @@ -684,7 +748,6 @@ static long bpf_struct_ops_map_update_elem(struct >> bpf_map *map, void *key, >> goto unlock; >> } >> - arch_protect_bpf_trampoline(st_map->image, PAGE_SIZE); >> err = st_ops->reg(kdata); >> if (likely(!err)) { >> /* This refcnt increment on the map here after >> @@ -707,9 +770,9 @@ static long bpf_struct_ops_map_update_elem(struct >> bpf_map *map, void *key, >> * there was a race in registering the struct_ops (under the >> same name) to >> * a sub-system through different struct_ops's maps. >> */ >> - arch_unprotect_bpf_trampoline(st_map->image, PAGE_SIZE); >> reset_unlock: >> + bpf_struct_ops_map_free_image(st_map); >> bpf_struct_ops_map_put_progs(st_map); >> memset(uvalue, 0, map->value_size); >> memset(kvalue, 0, map->value_size); >> @@ -776,10 +839,7 @@ static void __bpf_struct_ops_map_free(struct >> bpf_map *map) >> if (st_map->links) >> bpf_struct_ops_map_put_progs(st_map); >> bpf_map_area_free(st_map->links); >> - if (st_map->image) { >> - arch_free_bpf_trampoline(st_map->image, PAGE_SIZE); >> - bpf_jit_uncharge_modmem(PAGE_SIZE); >> - } >> + bpf_struct_ops_map_free_image(st_map); >> bpf_map_area_free(st_map->uvalue); >> bpf_map_area_free(st_map); >> } >> @@ -889,20 +949,6 @@ static struct bpf_map >> *bpf_struct_ops_map_alloc(union bpf_attr *attr) >> st_map->st_ops_desc = st_ops_desc; >> map = &st_map->map; >> - ret = bpf_jit_charge_modmem(PAGE_SIZE); >> - if (ret) >> - goto errout_free; >> - >> - st_map->image = arch_alloc_bpf_trampoline(PAGE_SIZE); >> - if (!st_map->image) { >> - /* __bpf_struct_ops_map_free() uses st_map->image as flag >> - * for "charged or not". In this case, we need to unchange >> - * here. >> - */ >> - bpf_jit_uncharge_modmem(PAGE_SIZE); >> - ret = -ENOMEM; >> - goto errout_free; >> - } >> st_map->uvalue = bpf_map_area_alloc(vt->size, NUMA_NO_NODE); >> st_map->links_cnt = btf_type_vlen(t); >> st_map->links = >
On 2/22/24 5:35 PM, Kui-Feng Lee wrote: > > > On 2/22/24 16:33, Martin KaFai Lau wrote: >> On 2/21/24 2:59 PM, thinker.li@gmail.com wrote: >>> @@ -531,10 +567,10 @@ static long bpf_struct_ops_map_update_elem(struct >>> bpf_map *map, void *key, >>> const struct btf_type *module_type; >>> const struct btf_member *member; >>> const struct btf_type *t = st_ops_desc->type; >>> + void *image = NULL, *image_end = NULL; >>> struct bpf_tramp_links *tlinks; >>> void *udata, *kdata; >>> int prog_fd, err; >>> - void *image, *image_end; >>> u32 i; >>> if (flags) >>> @@ -573,15 +609,14 @@ static long bpf_struct_ops_map_update_elem(struct >>> bpf_map *map, void *key, >>> udata = &uvalue->data; >>> kdata = &kvalue->data; >>> - image = st_map->image; >>> - image_end = st_map->image + PAGE_SIZE; >>> module_type = btf_type_by_id(btf_vmlinux, st_ops_ids[IDX_MODULE_ID]); >>> for_each_member(i, t, member) { >>> const struct btf_type *mtype, *ptype; >>> struct bpf_prog *prog; >>> struct bpf_tramp_link *link; >>> - u32 moff; >>> + u32 moff, tflags; >>> + int tsize; >>> moff = __btf_member_bit_offset(t, member) / 8; >>> ptype = btf_type_resolve_ptr(st_map->btf, member->type, NULL); >>> @@ -653,10 +688,38 @@ static long bpf_struct_ops_map_update_elem(struct >>> bpf_map *map, void *key, >>> &bpf_struct_ops_link_lops, prog); >>> st_map->links[i] = &link->link; >>> - err = bpf_struct_ops_prepare_trampoline(tlinks, link, >>> - &st_ops->func_models[i], >>> - *(void **)(st_ops->cfi_stubs + moff), >>> - image, image_end); >>> + tflags = BPF_TRAMP_F_INDIRECT; >>> + if (st_ops->func_models[i].ret_size > 0) >>> + tflags |= BPF_TRAMP_F_RET_FENTRY_RET; >>> + >>> + /* Compute the size of the trampoline */ >>> + tlinks[BPF_TRAMP_FENTRY].links[0] = link; >>> + tlinks[BPF_TRAMP_FENTRY].nr_links = 1; >>> + tsize = arch_bpf_trampoline_size(&st_ops->func_models[i], >>> + tflags, tlinks, NULL); >>> + if (tsize < 0) { >>> + err = tsize; >>> + goto reset_unlock; >>> + } >>> + >>> + /* Allocate pages */ >>> + if 
(tsize > (unsigned long)image_end - (unsigned long)image) { >>> + if (tsize > PAGE_SIZE) { >>> + err = -E2BIG; >>> + goto reset_unlock; >>> + } >>> + image = bpf_struct_ops_map_inc_image(st_map); >>> + if (IS_ERR(image)) { >>> + err = PTR_ERR(image); >>> + goto reset_unlock; >>> + } >>> + image_end = image + PAGE_SIZE; >>> + } >>> + >>> + err = arch_prepare_bpf_trampoline(NULL, image, image_end, >>> + &st_ops->func_models[i], >>> + tflags, tlinks, >>> + *(void **)(st_ops->cfi_stubs + moff)); >> >> I don't prefer to copy the BPF_TRAMP_F_* setting on tflags, tlinks, and the >> arch_*_trampoline_*() logic from bpf_struct_ops_prepare_trampoline() which is >> used by the bpf_dummy_ops for testing also. Considering struct_ops supports >> kernel module now, in the future, it is better to move bpf_dummy_ops out to >> the bpf_testmod somehow and avoid its bpf_struct_ops_prepare_trampoline() >> usage. For now, it is still better to keep bpf_struct_ops_prepare_trampoline() >> to be reusable by both. >> >> Have you thought about the earlier suggestion in v1 to do >> arch_alloc_bpf_trampoline() in bpf_struct_ops_prepare_trampoline() instead of >> copying codes from bpf_struct_ops_prepare_trampoline() to >> bpf_struct_ops_map_update_elem()? 
>> >> Something like this (untested code): >> >> void *bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks, >> struct bpf_tramp_link *link, >> const struct btf_func_model *model, >> void *stub_func, void *image, >> u32 *image_off, >> bool allow_alloc) To be a little more specific, the changes in bpf_struct_ops_map_update_elem() could be mostly like this (untested): ret_image = bpf_struct_ops_prepare_trampoline(tlinks, link, &st_ops->func_models[i], *(void **)(st_ops->cfi_stubs + moff), image, &image_off, st_map->image_pages_cnt < MAX_TRAMP_IMAGE_PAGES); if (IS_ERR(ret_image)) goto reset_unlock; if (image != ret_image) { image = ret_image; st_map->image_pages[st_map->image_pages_cnt++] = image; } > > > How about pass a struct bpf_struct_ops_map to > bpf_struct_ops_prepare_trampoline(). If the pointer of struct > bpf_struct_ops_map is not NULL, try to allocate new pages for the map? > > For example, > > static int > _bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks, > > struct bpf_tramp_link *link, > > const struct btf_func_model *model, > > void *stub_func, void *image, > void *image_end, > struct bpf_struct_ops_map *st_map) > { > > ... > > if (!image || size > PAGE_SIZE - *image_off) { > if (!st_map) Why only limit to st_map != NULL? arch_alloc_bpf_trampoline() is also called in bpf_dummy_ops. If bpf_struct_ops_prepare_trampoline() can do the alloc, it may as well simplify bpf_dummy_ops and just use bpf_struct_ops_prepare_trampoline() to alloc. > return -E2BIG; > image = bpf_struct_ops_map_inc_image(st_map); > if (IS_ERR(image)) > return PTR_ERR(image); > image_end = image + PAGE_SIZE; > } > ... 
> } > > int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks, > > struct bpf_tramp_link *link, > > const struct btf_func_model*model, > void *stub_func, void *image, > void *image_end) > { > return _bpf_struct_ops_prepare_trampoline(tlinks, link, model, > stub_func, image, > image_end, NULL); > } > >> { >> u32 flags = BPF_TRAMP_F_INDIRECT; >> void *new_image = NULL; >> int size; >> >> tlinks[BPF_TRAMP_FENTRY].links[0] = link; >> tlinks[BPF_TRAMP_FENTRY].nr_links = 1; >> >> if (model->ret_size > 0) >> flags |= BPF_TRAMP_F_RET_FENTRY_RET; >> >> size = arch_bpf_trampoline_size(model, flags, tlinks, NULL); >> if (size < 0) >> return ERR_PTR(size); >> if (!image || size > PAGE_SIZE - *image_off) { >> int err; >> >> if (!allow_alloc) >> return ERR_PTR(-E2BIG); >> >> err = bpf_jit_charge_modmem(PAGE_SIZE); >> if (err) >> return ERR_PTR(err); >> >> new_image = image = arch_alloc_bpf_trampoline(PAGE_SIZE); >> if (!new_image) { >> bpf_jit_uncharge_modmem(PAGE_SIZE); >> return ERR_PTR(-ENOMEM); >> } >> *image_off = 0; >> } >> >> >> size = arch_prepare_bpf_trampoline(NULL, image + *image_off, >> image + PAGE_SIZE, >> model, flags, tlinks, stub_func); >> if (size >= 0) { >> *image_off += size; >> return image; >> } >> >> if (new_image) { >> bpf_jit_uncharge_modmem(PAGE_SIZE); >> arch_free_bpf_trampoline(new_image, PAGE_SIZE); >> } >> >> return ERR_PTR(size); >> } >> >> ---- >> >> pw-bot: cr >> >>> if (err < 0) >>> goto reset_unlock; >>> @@ -672,10 +735,11 @@ static long bpf_struct_ops_map_update_elem(struct >>> bpf_map *map, void *key, >>> if (err) >>> goto reset_unlock; >>> } >>> + for (i = 0; i < st_map->image_pages_cnt; i++) >>> + arch_protect_bpf_trampoline(st_map->image_pages[i], PAGE_SIZE); >>> if (st_map->map.map_flags & BPF_F_LINK) { >>> err = 0; >>> - arch_protect_bpf_trampoline(st_map->image, PAGE_SIZE); >>> /* Let bpf_link handle registration & unregistration. >>> * >>> * Pair with smp_load_acquire() during lookup_elem(). 
>>> @@ -684,7 +748,6 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map >>> *map, void *key, >>> goto unlock; >>> } >>> - arch_protect_bpf_trampoline(st_map->image, PAGE_SIZE); >>> err = st_ops->reg(kdata); >>> if (likely(!err)) { >>> /* This refcnt increment on the map here after >>> @@ -707,9 +770,9 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map >>> *map, void *key, >>> * there was a race in registering the struct_ops (under the same name) to >>> * a sub-system through different struct_ops's maps. >>> */ >>> - arch_unprotect_bpf_trampoline(st_map->image, PAGE_SIZE); >>> reset_unlock: >>> + bpf_struct_ops_map_free_image(st_map); >>> bpf_struct_ops_map_put_progs(st_map); >>> memset(uvalue, 0, map->value_size); >>> memset(kvalue, 0, map->value_size); >>> @@ -776,10 +839,7 @@ static void __bpf_struct_ops_map_free(struct bpf_map *map) >>> if (st_map->links) >>> bpf_struct_ops_map_put_progs(st_map); >>> bpf_map_area_free(st_map->links); >>> - if (st_map->image) { >>> - arch_free_bpf_trampoline(st_map->image, PAGE_SIZE); >>> - bpf_jit_uncharge_modmem(PAGE_SIZE); >>> - } >>> + bpf_struct_ops_map_free_image(st_map); >>> bpf_map_area_free(st_map->uvalue); >>> bpf_map_area_free(st_map); >>> } >>> @@ -889,20 +949,6 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union >>> bpf_attr *attr) >>> st_map->st_ops_desc = st_ops_desc; >>> map = &st_map->map; >>> - ret = bpf_jit_charge_modmem(PAGE_SIZE); >>> - if (ret) >>> - goto errout_free; >>> - >>> - st_map->image = arch_alloc_bpf_trampoline(PAGE_SIZE); >>> - if (!st_map->image) { >>> - /* __bpf_struct_ops_map_free() uses st_map->image as flag >>> - * for "charged or not". In this case, we need to unchange >>> - * here. >>> - */ >>> - bpf_jit_uncharge_modmem(PAGE_SIZE); >>> - ret = -ENOMEM; >>> - goto errout_free; >>> - } >>> st_map->uvalue = bpf_map_area_alloc(vt->size, NUMA_NO_NODE); >>> st_map->links_cnt = btf_type_vlen(t); >>> st_map->links = >>
On 2/22/24 18:16, Martin KaFai Lau wrote: > On 2/22/24 5:35 PM, Kui-Feng Lee wrote: >> >> >> On 2/22/24 16:33, Martin KaFai Lau wrote: >>> On 2/21/24 2:59 PM, thinker.li@gmail.com wrote: >>>> @@ -531,10 +567,10 @@ static long >>>> bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, >>>> const struct btf_type *module_type; >>>> const struct btf_member *member; >>>> const struct btf_type *t = st_ops_desc->type; >>>> + void *image = NULL, *image_end = NULL; >>>> struct bpf_tramp_links *tlinks; >>>> void *udata, *kdata; >>>> int prog_fd, err; >>>> - void *image, *image_end; >>>> u32 i; >>>> if (flags) >>>> @@ -573,15 +609,14 @@ static long >>>> bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, >>>> udata = &uvalue->data; >>>> kdata = &kvalue->data; >>>> - image = st_map->image; >>>> - image_end = st_map->image + PAGE_SIZE; >>>> module_type = btf_type_by_id(btf_vmlinux, >>>> st_ops_ids[IDX_MODULE_ID]); >>>> for_each_member(i, t, member) { >>>> const struct btf_type *mtype, *ptype; >>>> struct bpf_prog *prog; >>>> struct bpf_tramp_link *link; >>>> - u32 moff; >>>> + u32 moff, tflags; >>>> + int tsize; >>>> moff = __btf_member_bit_offset(t, member) / 8; >>>> ptype = btf_type_resolve_ptr(st_map->btf, member->type, >>>> NULL); >>>> @@ -653,10 +688,38 @@ static long >>>> bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, >>>> &bpf_struct_ops_link_lops, prog); >>>> st_map->links[i] = &link->link; >>>> - err = bpf_struct_ops_prepare_trampoline(tlinks, link, >>>> - &st_ops->func_models[i], >>>> - *(void **)(st_ops->cfi_stubs + moff), >>>> - image, image_end); >>>> + tflags = BPF_TRAMP_F_INDIRECT; >>>> + if (st_ops->func_models[i].ret_size > 0) >>>> + tflags |= BPF_TRAMP_F_RET_FENTRY_RET; >>>> + >>>> + /* Compute the size of the trampoline */ >>>> + tlinks[BPF_TRAMP_FENTRY].links[0] = link; >>>> + tlinks[BPF_TRAMP_FENTRY].nr_links = 1; >>>> + tsize = arch_bpf_trampoline_size(&st_ops->func_models[i], >>>> + tflags, tlinks, NULL); >>>> + if 
(tsize < 0) { >>>> + err = tsize; >>>> + goto reset_unlock; >>>> + } >>>> + >>>> + /* Allocate pages */ >>>> + if (tsize > (unsigned long)image_end - (unsigned long)image) { >>>> + if (tsize > PAGE_SIZE) { >>>> + err = -E2BIG; >>>> + goto reset_unlock; >>>> + } >>>> + image = bpf_struct_ops_map_inc_image(st_map); >>>> + if (IS_ERR(image)) { >>>> + err = PTR_ERR(image); >>>> + goto reset_unlock; >>>> + } >>>> + image_end = image + PAGE_SIZE; >>>> + } >>>> + >>>> + err = arch_prepare_bpf_trampoline(NULL, image, image_end, >>>> + &st_ops->func_models[i], >>>> + tflags, tlinks, >>>> + *(void **)(st_ops->cfi_stubs + moff)); >>> >>> I don't prefer to copy the BPF_TRAMP_F_* setting on tflags, tlinks, >>> and the arch_*_trampoline_*() logic from >>> bpf_struct_ops_prepare_trampoline() which is used by the >>> bpf_dummy_ops for testing also. Considering struct_ops supports >>> kernel module now, in the future, it is better to move bpf_dummy_ops >>> out to the bpf_testmod somehow and avoid its >>> bpf_struct_ops_prepare_trampoline() usage. For now, it is still >>> better to keep bpf_struct_ops_prepare_trampoline() to be reusable by >>> both. >>> >>> Have you thought about the earlier suggestion in v1 to do >>> arch_alloc_bpf_trampoline() in bpf_struct_ops_prepare_trampoline() >>> instead of copying codes from bpf_struct_ops_prepare_trampoline() to >>> bpf_struct_ops_map_update_elem()? 
>>> >>> Something like this (untested code): >>> >>> void *bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks, >>> struct bpf_tramp_link *link, >>> const struct btf_func_model *model, >>> void *stub_func, void *image, >>> u32 *image_off, >>> bool allow_alloc) > > To be a little more specific, the changes in > bpf_struct_ops_map_update_elem() > could be mostly like this (untested): > > ret_image = bpf_struct_ops_prepare_trampoline(tlinks, link, > &st_ops->func_models[i], > *(void **)(st_ops->cfi_stubs + moff), > image, &image_off, > st_map->image_pages_cnt < > MAX_TRAMP_IMAGE_PAGES); > if (IS_ERR(ret_image)) > goto reset_unlock; > > if (image != ret_image) { > image = ret_image; > st_map->image_pages[st_map->image_pages_cnt++] = image; > } > What I don't like is the memory management code was in two named functions, bpf_struct_ops_map_free_image() and bpf_struct_ops_map_inc_image(). Now, it falls apart. Allocate in one place, keep accounting in another place, and free yet at the 3rd place. >> >> >> How about pass a struct bpf_struct_ops_map to >> bpf_struct_ops_prepare_trampoline(). If the pointer of struct >> bpf_struct_ops_map is not NULL, try to allocate new pages for the map? >> >> For example, >> >> static int >> _bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks, >> >> struct bpf_tramp_link *link, >> >> const struct btf_func_model *model, >> >> void *stub_func, void *image, >> void *image_end, >> struct bpf_struct_ops_map *st_map) >> { >> >> ... >> >> if (!image || size > PAGE_SIZE - *image_off) { >> if (!st_map) > > Why only limit to st_map != NULL? > > arch_alloc_bpf_trampoline() is also called in bpf_dummy_ops. > If bpf_struct_ops_prepare_trampoline() can do the alloc, it may as well > simplify > bpf_dummy_ops and just use bpf_struct_ops_prepare_trampoline() to alloc. Yes, it can save a few lines from bpf_dummy_ops. But, bpf_dummy_ops still need to free the memory. And, it doesn't pair alloc and free in the same function. 
Usually, pairing alloc and free in the same function, nearby, or the same module is easier to understand. [ skip ]
On 2/22/24 7:01 PM, Kui-Feng Lee wrote: > > > > On 2/22/24 18:16, Martin KaFai Lau wrote: >> On 2/22/24 5:35 PM, Kui-Feng Lee wrote: >>> >>> >>> On 2/22/24 16:33, Martin KaFai Lau wrote: >>>> On 2/21/24 2:59 PM, thinker.li@gmail.com wrote: >>>>> @@ -531,10 +567,10 @@ static long bpf_struct_ops_map_update_elem(struct >>>>> bpf_map *map, void *key, >>>>> const struct btf_type *module_type; >>>>> const struct btf_member *member; >>>>> const struct btf_type *t = st_ops_desc->type; >>>>> + void *image = NULL, *image_end = NULL; >>>>> struct bpf_tramp_links *tlinks; >>>>> void *udata, *kdata; >>>>> int prog_fd, err; >>>>> - void *image, *image_end; >>>>> u32 i; >>>>> if (flags) >>>>> @@ -573,15 +609,14 @@ static long bpf_struct_ops_map_update_elem(struct >>>>> bpf_map *map, void *key, >>>>> udata = &uvalue->data; >>>>> kdata = &kvalue->data; >>>>> - image = st_map->image; >>>>> - image_end = st_map->image + PAGE_SIZE; >>>>> module_type = btf_type_by_id(btf_vmlinux, st_ops_ids[IDX_MODULE_ID]); >>>>> for_each_member(i, t, member) { >>>>> const struct btf_type *mtype, *ptype; >>>>> struct bpf_prog *prog; >>>>> struct bpf_tramp_link *link; >>>>> - u32 moff; >>>>> + u32 moff, tflags; >>>>> + int tsize; >>>>> moff = __btf_member_bit_offset(t, member) / 8; >>>>> ptype = btf_type_resolve_ptr(st_map->btf, member->type, NULL); >>>>> @@ -653,10 +688,38 @@ static long bpf_struct_ops_map_update_elem(struct >>>>> bpf_map *map, void *key, >>>>> &bpf_struct_ops_link_lops, prog); >>>>> st_map->links[i] = &link->link; >>>>> - err = bpf_struct_ops_prepare_trampoline(tlinks, link, >>>>> - &st_ops->func_models[i], >>>>> - *(void **)(st_ops->cfi_stubs + moff), >>>>> - image, image_end); >>>>> + tflags = BPF_TRAMP_F_INDIRECT; >>>>> + if (st_ops->func_models[i].ret_size > 0) >>>>> + tflags |= BPF_TRAMP_F_RET_FENTRY_RET; >>>>> + >>>>> + /* Compute the size of the trampoline */ >>>>> + tlinks[BPF_TRAMP_FENTRY].links[0] = link; >>>>> + tlinks[BPF_TRAMP_FENTRY].nr_links = 1; >>>>> + tsize = 
arch_bpf_trampoline_size(&st_ops->func_models[i], >>>>> + tflags, tlinks, NULL); >>>>> + if (tsize < 0) { >>>>> + err = tsize; >>>>> + goto reset_unlock; >>>>> + } >>>>> + >>>>> + /* Allocate pages */ >>>>> + if (tsize > (unsigned long)image_end - (unsigned long)image) { >>>>> + if (tsize > PAGE_SIZE) { >>>>> + err = -E2BIG; >>>>> + goto reset_unlock; >>>>> + } >>>>> + image = bpf_struct_ops_map_inc_image(st_map); >>>>> + if (IS_ERR(image)) { >>>>> + err = PTR_ERR(image); >>>>> + goto reset_unlock; >>>>> + } >>>>> + image_end = image + PAGE_SIZE; >>>>> + } >>>>> + >>>>> + err = arch_prepare_bpf_trampoline(NULL, image, image_end, >>>>> + &st_ops->func_models[i], >>>>> + tflags, tlinks, >>>>> + *(void **)(st_ops->cfi_stubs + moff)); >>>> >>>> I don't prefer to copy the BPF_TRAMP_F_* setting on tflags, tlinks, and the >>>> arch_*_trampoline_*() logic from bpf_struct_ops_prepare_trampoline() which >>>> is used by the bpf_dummy_ops for testing also. Considering struct_ops >>>> supports kernel module now, in the future, it is better to move >>>> bpf_dummy_ops out to the bpf_testmod somehow and avoid its >>>> bpf_struct_ops_prepare_trampoline() usage. For now, it is still better to >>>> keep bpf_struct_ops_prepare_trampoline() to be reusable by both. >>>> >>>> Have you thought about the earlier suggestion in v1 to do >>>> arch_alloc_bpf_trampoline() in bpf_struct_ops_prepare_trampoline() instead >>>> of copying codes from bpf_struct_ops_prepare_trampoline() to >>>> bpf_struct_ops_map_update_elem()? 
>>>> >>>> Something like this (untested code): >>>> >>>> void *bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks, >>>> struct bpf_tramp_link *link, >>>> const struct btf_func_model *model, >>>> void *stub_func, void *image, >>>> u32 *image_off, >>>> bool allow_alloc) >> >> To be a little more specific, the changes in bpf_struct_ops_map_update_elem() >> could be mostly like this (untested): >> >> ret_image = bpf_struct_ops_prepare_trampoline(tlinks, link, >> &st_ops->func_models[i], >> *(void **)(st_ops->cfi_stubs + moff), >> image, &image_off, >> st_map->image_pages_cnt < >> MAX_TRAMP_IMAGE_PAGES); >> if (IS_ERR(ret_image)) >> goto reset_unlock; >> >> if (image != ret_image) { >> image = ret_image; >> st_map->image_pages[st_map->image_pages_cnt++] = image; >> } >> > > What I don't like is the memory management code was in two named > functions, bpf_struct_ops_map_free_image() and > bpf_struct_ops_map_inc_image(). bpf_struct_ops_map_inc_image() is not needed. > Now, it falls apart. Allocate in one place, keep accounting in another > place, and free yet at the 3rd place. > >>> >>> >>> How about pass a struct bpf_struct_ops_map to >>> bpf_struct_ops_prepare_trampoline(). If the pointer of struct >>> bpf_struct_ops_map is not NULL, try to allocate new pages for the map? >>> >>> For example, >>> >>> static int >>> _bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks, >>> >>> struct bpf_tramp_link *link, >>> >>> const struct btf_func_model *model, >>> >>> void *stub_func, void *image, >>> void *image_end, >>> struct bpf_struct_ops_map *st_map) >>> { >>> >>> ... >>> >>> if (!image || size > PAGE_SIZE - *image_off) { >>> if (!st_map) >> >> Why only limit to st_map != NULL? >> >> arch_alloc_bpf_trampoline() is also called in bpf_dummy_ops. >> If bpf_struct_ops_prepare_trampoline() can do the alloc, it may as well simplify >> bpf_dummy_ops and just use bpf_struct_ops_prepare_trampoline() to alloc. 
> > > Yes, it can save a few lines from bpf_dummy_ops. But, bpf_dummy_ops > still need to free the memory. And, it doesn't pair alloc and free in > the same function. Usually, paring alloc and free in the same function, > nearby, or the same module is easier to understand. It is not only about saving a few lines. It just does not make sense to add all this complexity and another "_"bpf_struct_ops_prepare_trampoline() variant to make it conform to the alloc/free pair idea. To be clear, I don't see alloc/free pair is a must have in all cases. There are many situations that non-alloc named function calls multiple kmalloc() in different places and one xyz_free() releases everything. Even alloc/free is really preferred, there has to be a better way or else need to make a trade off. I suggested the high level ideal on making bpf_struct_ops_prepare_trampoline() to allocate page. It can sure add a bpf_struct_ops_free_trampoline() if you see fit to make it match with bpf_struct_ops_prepare_trampoline() as alloc/free pair, for example, void bpf_struct_ops_free_trampoline(void *image) { bpf_jit_uncharge_modmem(PAGE_SIZE); arch_free_bpf_trampoline(image, PAGE_SIZE); } and make bpf_struct_ops_map_free_image() to use bpf_struct_ops_free_trampoline() static void bpf_struct_ops_map_free_image(struct bpf_struct_ops_map *st_map) { int i; for (i = 0; i < st_map->image_pages_cnt; i++) { bpf_struct_ops_free_trampoline(st_map->image_pages[i]); st_map->image_pages[i] = NULL; } st_map->image_pages_cnt = 0; } Then it should work like alloc/free pair.
On 2/22/24 21:25, Martin KaFai Lau wrote: > On 2/22/24 7:01 PM, Kui-Feng Lee wrote: >> >> >> >> On 2/22/24 18:16, Martin KaFai Lau wrote: >>> On 2/22/24 5:35 PM, Kui-Feng Lee wrote: >>>> >>>> >>>> On 2/22/24 16:33, Martin KaFai Lau wrote: >>>>> On 2/21/24 2:59 PM, thinker.li@gmail.com wrote: >>>>>> @@ -531,10 +567,10 @@ static long >>>>>> bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, >>>>>> const struct btf_type *module_type; >>>>>> const struct btf_member *member; >>>>>> const struct btf_type *t = st_ops_desc->type; >>>>>> + void *image = NULL, *image_end = NULL; >>>>>> struct bpf_tramp_links *tlinks; >>>>>> void *udata, *kdata; >>>>>> int prog_fd, err; >>>>>> - void *image, *image_end; >>>>>> u32 i; >>>>>> if (flags) >>>>>> @@ -573,15 +609,14 @@ static long >>>>>> bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, >>>>>> udata = &uvalue->data; >>>>>> kdata = &kvalue->data; >>>>>> - image = st_map->image; >>>>>> - image_end = st_map->image + PAGE_SIZE; >>>>>> module_type = btf_type_by_id(btf_vmlinux, >>>>>> st_ops_ids[IDX_MODULE_ID]); >>>>>> for_each_member(i, t, member) { >>>>>> const struct btf_type *mtype, *ptype; >>>>>> struct bpf_prog *prog; >>>>>> struct bpf_tramp_link *link; >>>>>> - u32 moff; >>>>>> + u32 moff, tflags; >>>>>> + int tsize; >>>>>> moff = __btf_member_bit_offset(t, member) / 8; >>>>>> ptype = btf_type_resolve_ptr(st_map->btf, member->type, >>>>>> NULL); >>>>>> @@ -653,10 +688,38 @@ static long >>>>>> bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, >>>>>> &bpf_struct_ops_link_lops, prog); >>>>>> st_map->links[i] = &link->link; >>>>>> - err = bpf_struct_ops_prepare_trampoline(tlinks, link, >>>>>> - &st_ops->func_models[i], >>>>>> - *(void **)(st_ops->cfi_stubs + moff), >>>>>> - image, image_end); >>>>>> + tflags = BPF_TRAMP_F_INDIRECT; >>>>>> + if (st_ops->func_models[i].ret_size > 0) >>>>>> + tflags |= BPF_TRAMP_F_RET_FENTRY_RET; >>>>>> + >>>>>> + /* Compute the size of the trampoline */ >>>>>> + 
tlinks[BPF_TRAMP_FENTRY].links[0] = link; >>>>>> + tlinks[BPF_TRAMP_FENTRY].nr_links = 1; >>>>>> + tsize = arch_bpf_trampoline_size(&st_ops->func_models[i], >>>>>> + tflags, tlinks, NULL); >>>>>> + if (tsize < 0) { >>>>>> + err = tsize; >>>>>> + goto reset_unlock; >>>>>> + } >>>>>> + >>>>>> + /* Allocate pages */ >>>>>> + if (tsize > (unsigned long)image_end - (unsigned >>>>>> long)image) { >>>>>> + if (tsize > PAGE_SIZE) { >>>>>> + err = -E2BIG; >>>>>> + goto reset_unlock; >>>>>> + } >>>>>> + image = bpf_struct_ops_map_inc_image(st_map); >>>>>> + if (IS_ERR(image)) { >>>>>> + err = PTR_ERR(image); >>>>>> + goto reset_unlock; >>>>>> + } >>>>>> + image_end = image + PAGE_SIZE; >>>>>> + } >>>>>> + >>>>>> + err = arch_prepare_bpf_trampoline(NULL, image, image_end, >>>>>> + &st_ops->func_models[i], >>>>>> + tflags, tlinks, >>>>>> + *(void **)(st_ops->cfi_stubs + moff)); >>>>> >>>>> I don't prefer to copy the BPF_TRAMP_F_* setting on tflags, tlinks, >>>>> and the arch_*_trampoline_*() logic from >>>>> bpf_struct_ops_prepare_trampoline() which is used by the >>>>> bpf_dummy_ops for testing also. Considering struct_ops supports >>>>> kernel module now, in the future, it is better to move >>>>> bpf_dummy_ops out to the bpf_testmod somehow and avoid its >>>>> bpf_struct_ops_prepare_trampoline() usage. For now, it is still >>>>> better to keep bpf_struct_ops_prepare_trampoline() to be reusable >>>>> by both. >>>>> >>>>> Have you thought about the earlier suggestion in v1 to do >>>>> arch_alloc_bpf_trampoline() in bpf_struct_ops_prepare_trampoline() >>>>> instead of copying codes from bpf_struct_ops_prepare_trampoline() >>>>> to bpf_struct_ops_map_update_elem()? 
>>>>> >>>>> Something like this (untested code): >>>>> >>>>> void *bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links >>>>> *tlinks, >>>>> struct bpf_tramp_link *link, >>>>> const struct btf_func_model *model, >>>>> void *stub_func, void *image, >>>>> u32 *image_off, >>>>> bool allow_alloc) >>> >>> To be a little more specific, the changes in >>> bpf_struct_ops_map_update_elem() >>> could be mostly like this (untested): >>> >>> ret_image = bpf_struct_ops_prepare_trampoline(tlinks, link, >>> &st_ops->func_models[i], >>> *(void **)(st_ops->cfi_stubs + moff), >>> image, &image_off, >>> st_map->image_pages_cnt < >>> MAX_TRAMP_IMAGE_PAGES); >>> if (IS_ERR(ret_image)) >>> goto reset_unlock; >>> >>> if (image != ret_image) { >>> image = ret_image; >>> st_map->image_pages[st_map->image_pages_cnt++] = image; >>> } >>> >> >> What I don't like is the memory management code was in two named >> functions, bpf_struct_ops_map_free_image() and >> bpf_struct_ops_map_inc_image(). > > bpf_struct_ops_map_inc_image() is not needed. > >> Now, it falls apart. Allocate in one place, keep accounting in another >> place, and free yet at the 3rd place. >> >>>> >>>> >>>> How about pass a struct bpf_struct_ops_map to >>>> bpf_struct_ops_prepare_trampoline(). If the pointer of struct >>>> bpf_struct_ops_map is not NULL, try to allocate new pages for the map? >>>> >>>> For example, >>>> >>>> static int >>>> _bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks, >>>> >>>> struct bpf_tramp_link *link, >>>> >>>> const struct btf_func_model *model, >>>> >>>> void *stub_func, void *image, >>>> void *image_end, >>>> struct bpf_struct_ops_map *st_map) >>>> { >>>> >>>> ... >>>> >>>> if (!image || size > PAGE_SIZE - *image_off) { >>>> if (!st_map) >>> >>> Why only limit to st_map != NULL? >>> >>> arch_alloc_bpf_trampoline() is also called in bpf_dummy_ops. 
>>> If bpf_struct_ops_prepare_trampoline() can do the alloc, it may as >>> well simplify >>> bpf_dummy_ops and just use bpf_struct_ops_prepare_trampoline() to alloc. >> >> >> Yes, it can save a few lines from bpf_dummy_ops. But, bpf_dummy_ops >> still need to free the memory. And, it doesn't pair alloc and free in >> the same function. Usually, paring alloc and free in the same function, >> nearby, or the same module is easier to understand. > > It is not only about saving a few lines. It just does not make sense to > add all this complexity and another "_"bpf_struct_ops_prepare_trampoline() > variant to make it conform to the alloc/free pair idea. To be clear, I > don't To be clear, we are not talking computation or memory complexity here. I consider the complexity in another way. When I look at the code of bpf_dummy_ops, and see it free the memory at the very end of a function. I have to guess who allocate the memory by looking around without a clear sign or hint if we move the allocation to bpf_struct_ops_prepare_trampoline(). It is a source of complexity. Very often, a duplication is much more simple and easy to understand. Especially, when the duplication is in a very well know/recognized pattern. Here will create a unusual way to replace a well recognized one to simplify the code. My reason of duplicating the code from bpf_struct_ops_prepare_trampoline() was we don't need bpf_struct_ops_prepare_trampoline() in future if we were going to move bpf_dummy_ops out. But, just like you said, we still have bpf_dummy_ops now, so it is a good trade of to move memory allocation into bpf_struct_ops_prepare_trampoline() to avoid the duplication the code about flags and tlinks. But, the trade off we are talking here goes to an opposite way. By the way, I am not insisting on these tiny details. I am just trying to explain what I don't like here. > see alloc/free pair is a must have in all cases. 
There are many > situations that > non-alloc named function calls multiple kmalloc() in different places and > one xyz_free() releases everything. Even alloc/free is really preferred, > there has to be a better way or else need to make a trade off. > > I suggested the high level ideal on making > bpf_struct_ops_prepare_trampoline() to allocate page. It can sure add a > bpf_struct_ops_free_trampoline() if you see fit to make it match with > bpf_struct_ops_prepare_trampoline() as alloc/free pair, for example, > > void bpf_struct_ops_free_trampoline(void *image) > { > bpf_jit_uncharge_modmem(PAGE_SIZE); > arch_free_bpf_trampoline(image, PAGE_SIZE); > } > > and make bpf_struct_ops_map_free_image() to use > bpf_struct_ops_free_trampoline() > > static void bpf_struct_ops_map_free_image(struct bpf_struct_ops_map > *st_map) > { > int i; > > for (i = 0; i < st_map->image_pages_cnt; i++) { > bpf_struct_ops_free_trampoline(st_map->image_pages[i]); > st_map->image_pages[i] = NULL; > } > st_map->image_pages_cnt = 0; > } > > Then it should work like alloc/free pair.
On 2/23/24 09:36, Kui-Feng Lee wrote: > > > On 2/22/24 21:25, Martin KaFai Lau wrote: >> On 2/22/24 7:01 PM, Kui-Feng Lee wrote: >>> >>> >>> >>> On 2/22/24 18:16, Martin KaFai Lau wrote: >>>> On 2/22/24 5:35 PM, Kui-Feng Lee wrote: >>>>> >>>>> >>>>> On 2/22/24 16:33, Martin KaFai Lau wrote: >>>>>> On 2/21/24 2:59 PM, thinker.li@gmail.com wrote: >>>>>>> @@ -531,10 +567,10 @@ static long >>>>>>> bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, >>>>>>> const struct btf_type *module_type; >>>>>>> const struct btf_member *member; >>>>>>> const struct btf_type *t = st_ops_desc->type; >>>>>>> + void *image = NULL, *image_end = NULL; >>>>>>> struct bpf_tramp_links *tlinks; >>>>>>> void *udata, *kdata; >>>>>>> int prog_fd, err; >>>>>>> - void *image, *image_end; >>>>>>> u32 i; >>>>>>> if (flags) >>>>>>> @@ -573,15 +609,14 @@ static long >>>>>>> bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, >>>>>>> udata = &uvalue->data; >>>>>>> kdata = &kvalue->data; >>>>>>> - image = st_map->image; >>>>>>> - image_end = st_map->image + PAGE_SIZE; >>>>>>> module_type = btf_type_by_id(btf_vmlinux, >>>>>>> st_ops_ids[IDX_MODULE_ID]); >>>>>>> for_each_member(i, t, member) { >>>>>>> const struct btf_type *mtype, *ptype; >>>>>>> struct bpf_prog *prog; >>>>>>> struct bpf_tramp_link *link; >>>>>>> - u32 moff; >>>>>>> + u32 moff, tflags; >>>>>>> + int tsize; >>>>>>> moff = __btf_member_bit_offset(t, member) / 8; >>>>>>> ptype = btf_type_resolve_ptr(st_map->btf, member->type, >>>>>>> NULL); >>>>>>> @@ -653,10 +688,38 @@ static long >>>>>>> bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, >>>>>>> &bpf_struct_ops_link_lops, prog); >>>>>>> st_map->links[i] = &link->link; >>>>>>> - err = bpf_struct_ops_prepare_trampoline(tlinks, link, >>>>>>> - &st_ops->func_models[i], >>>>>>> - *(void **)(st_ops->cfi_stubs + moff), >>>>>>> - image, image_end); >>>>>>> + tflags = BPF_TRAMP_F_INDIRECT; >>>>>>> + if (st_ops->func_models[i].ret_size > 0) >>>>>>> + tflags |= 
BPF_TRAMP_F_RET_FENTRY_RET; >>>>>>> + >>>>>>> + /* Compute the size of the trampoline */ >>>>>>> + tlinks[BPF_TRAMP_FENTRY].links[0] = link; >>>>>>> + tlinks[BPF_TRAMP_FENTRY].nr_links = 1; >>>>>>> + tsize = arch_bpf_trampoline_size(&st_ops->func_models[i], >>>>>>> + tflags, tlinks, NULL); >>>>>>> + if (tsize < 0) { >>>>>>> + err = tsize; >>>>>>> + goto reset_unlock; >>>>>>> + } >>>>>>> + >>>>>>> + /* Allocate pages */ >>>>>>> + if (tsize > (unsigned long)image_end - (unsigned >>>>>>> long)image) { >>>>>>> + if (tsize > PAGE_SIZE) { >>>>>>> + err = -E2BIG; >>>>>>> + goto reset_unlock; >>>>>>> + } >>>>>>> + image = bpf_struct_ops_map_inc_image(st_map); >>>>>>> + if (IS_ERR(image)) { >>>>>>> + err = PTR_ERR(image); >>>>>>> + goto reset_unlock; >>>>>>> + } >>>>>>> + image_end = image + PAGE_SIZE; >>>>>>> + } >>>>>>> + >>>>>>> + err = arch_prepare_bpf_trampoline(NULL, image, image_end, >>>>>>> + &st_ops->func_models[i], >>>>>>> + tflags, tlinks, >>>>>>> + *(void **)(st_ops->cfi_stubs + moff)); >>>>>> >>>>>> I don't prefer to copy the BPF_TRAMP_F_* setting on tflags, >>>>>> tlinks, and the arch_*_trampoline_*() logic from >>>>>> bpf_struct_ops_prepare_trampoline() which is used by the >>>>>> bpf_dummy_ops for testing also. Considering struct_ops supports >>>>>> kernel module now, in the future, it is better to move >>>>>> bpf_dummy_ops out to the bpf_testmod somehow and avoid its >>>>>> bpf_struct_ops_prepare_trampoline() usage. For now, it is still >>>>>> better to keep bpf_struct_ops_prepare_trampoline() to be reusable >>>>>> by both. >>>>>> >>>>>> Have you thought about the earlier suggestion in v1 to do >>>>>> arch_alloc_bpf_trampoline() in bpf_struct_ops_prepare_trampoline() >>>>>> instead of copying codes from bpf_struct_ops_prepare_trampoline() >>>>>> to bpf_struct_ops_map_update_elem()? 
>>>>>> >>>>>> Something like this (untested code): >>>>>> >>>>>> void *bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links >>>>>> *tlinks, >>>>>> struct bpf_tramp_link *link, >>>>>> const struct btf_func_model *model, >>>>>> void *stub_func, void *image, >>>>>> u32 *image_off, >>>>>> bool allow_alloc) >>>> >>>> To be a little more specific, the changes in >>>> bpf_struct_ops_map_update_elem() >>>> could be mostly like this (untested): >>>> >>>> ret_image = bpf_struct_ops_prepare_trampoline(tlinks, link, >>>> &st_ops->func_models[i], >>>> *(void **)(st_ops->cfi_stubs + >>>> moff), >>>> image, &image_off, >>>> st_map->image_pages_cnt < >>>> MAX_TRAMP_IMAGE_PAGES); >>>> if (IS_ERR(ret_image)) >>>> goto reset_unlock; >>>> >>>> if (image != ret_image) { >>>> image = ret_image; >>>> st_map->image_pages[st_map->image_pages_cnt++] = image; >>>> } >>>> >>> >>> What I don't like is the memory management code was in two named >>> functions, bpf_struct_ops_map_free_image() and >>> bpf_struct_ops_map_inc_image(). >> >> bpf_struct_ops_map_inc_image() is not needed. >> >>> Now, it falls apart. Allocate in one place, keep accounting in another >>> place, and free yet at the 3rd place. >>> >>>>> >>>>> >>>>> How about pass a struct bpf_struct_ops_map to >>>>> bpf_struct_ops_prepare_trampoline(). If the pointer of struct >>>>> bpf_struct_ops_map is not NULL, try to allocate new pages for the map? >>>>> >>>>> For example, >>>>> >>>>> static int >>>>> _bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks, >>>>> >>>>> struct bpf_tramp_link *link, >>>>> >>>>> const struct btf_func_model >>>>> *model, >>>>> >>>>> void *stub_func, void *image, >>>>> void *image_end, >>>>> struct bpf_struct_ops_map *st_map) >>>>> { >>>>> >>>>> ... >>>>> >>>>> if (!image || size > PAGE_SIZE - *image_off) { >>>>> if (!st_map) >>>> >>>> Why only limit to st_map != NULL? >>>> >>>> arch_alloc_bpf_trampoline() is also called in bpf_dummy_ops. 
>>>> If bpf_struct_ops_prepare_trampoline() can do the alloc, it may as >>>> well simplify >>>> bpf_dummy_ops and just use bpf_struct_ops_prepare_trampoline() to >>>> alloc. >>> >>> >>> Yes, it can save a few lines from bpf_dummy_ops. But, bpf_dummy_ops >>> still need to free the memory. And, it doesn't pair alloc and free in >>> the same function. Usually, paring alloc and free in the same function, >>> nearby, or the same module is easier to understand. >> >> It is not only about saving a few lines. It just does not make sense to >> add all this complexity and another >> "_"bpf_struct_ops_prepare_trampoline() >> variant to make it conform to the alloc/free pair idea. To be clear, I >> don't > > To be clear, we are not talking computation or memory complexity here. > I consider the complexity in another way. When I look at the code of > bpf_dummy_ops, and see it free the memory at the very end of a function. > I have to guess who allocate the memory by looking around without a > clear sign or hint if we move the allocation to > bpf_struct_ops_prepare_trampoline(). It is a source of complexity. > Very often, a duplication is much more simple and easy to understand. > Especially, when the duplication is in a very well know/recognized > pattern. Here will create a unusual way to replace a well recognized one > to simplify the code. > > My reason of duplicating the code from > bpf_struct_ops_prepare_trampoline() was we don't need > bpf_struct_ops_prepare_trampoline() in future if we were going to move > bpf_dummy_ops out. But, just like you said, we still have bpf_dummy_ops > now, so it is a good trade of to move memory allocation into > bpf_struct_ops_prepare_trampoline() to avoid the duplication the code > about flags and tlinks. But, the trade off we are talking here goes to > an opposite way. > > By the way, I am not insisting on these tiny details. I am just trying > to explain what I don't like here. 
One thing I forgot to mention is that bpf_dummy_ops has to call bpf_jit_uncharge_modmem(PAGE_SIZE) as well. The other option is to move bpf_jit_charge_modmem() out of bpf_struct_ops_prepare_trampoline(), meaning bpf_struct_ops_map_update_elem() should handle the case that the allocation in bpf_struct_ops_prepare_trampoline() successes, but bpf_jit_charge_modmem() fails. > >> see alloc/free pair is a must have in all cases. There are many >> situations that >> non-alloc named function calls multiple kmalloc() in different places and >> one xyz_free() releases everything. Even alloc/free is really preferred, >> there has to be a better way or else need to make a trade off. >> >> I suggested the high level ideal on making >> bpf_struct_ops_prepare_trampoline() to allocate page. It can sure add a >> bpf_struct_ops_free_trampoline() if you see fit to make it match with >> bpf_struct_ops_prepare_trampoline() as alloc/free pair, for example, >> >> void bpf_struct_ops_free_trampoline(void *image) >> { >> bpf_jit_uncharge_modmem(PAGE_SIZE); >> arch_free_bpf_trampoline(image, PAGE_SIZE); >> } >> >> and make bpf_struct_ops_map_free_image() to use >> bpf_struct_ops_free_trampoline() >> >> static void bpf_struct_ops_map_free_image(struct bpf_struct_ops_map >> *st_map) >> { >> int i; >> >> for (i = 0; i < st_map->image_pages_cnt; i++) { >> bpf_struct_ops_free_trampoline(st_map->image_pages[i]); >> st_map->image_pages[i] = NULL; >> } >> st_map->image_pages_cnt = 0; >> } >> >> Then it should work like alloc/free pair.
On 2/23/24 9:36 AM, Kui-Feng Lee wrote: > > To be clear, we are not talking computation or memory complexity here. > I consider the complexity in another way. When I look at the code of > bpf_dummy_ops, and see it free the memory at the very end of a function. > I have to guess who allocate the memory by looking around without a > clear sign or hint if we move the allocation to > bpf_struct_ops_prepare_trampoline(). It is a source of complexity. It still sounds like a naming perception issue more than a practical code-wise complexity/readability. Rename it to bpf_struct_ops_"s/alloc/prepare/"_trampoline() if it can make it more obvious that it is an alloc function. imo, that function returning a page is a clear sign that it can alloc but I don't mind renaming it if it can help to make it sounds more like alloc and free pair. > Very often, a duplication is much more simple and easy to understand. > Especially, when the duplication is in a very well know/recognized > pattern. Here will create a unusual way to replace a well recognized one > to simplify the code. Sorry, I don't agree on this where this patch is duplicating lines of code which is not obvious like setting BPF_TRAMP_F_*. At least I often have to go back to arch_prepare_bpf_trampoline() to understand how it works. Not copy-and-pasting this piece of codes everywhere is more important than making bpf_dummy_ops looks better. > > My reason of duplicating the code from > bpf_struct_ops_prepare_trampoline() was we don't need > bpf_struct_ops_prepare_trampoline() in future if we were going to move > bpf_dummy_ops out. But, just like you said, we still have bpf_dummy_ops Yep, it will be great to move bpf_dummy_ops out but how it can be done and whether it can remove its bpf_struct_ops_prepare_trampoline() usage is still TBD. I think it should be possible. Even it is moved out in the future, bpf_struct_ops_(prepare|alloc)_trampoline() can be keep as is. 
> now, so it is a good trade-off to move memory allocation into > bpf_struct_ops_prepare_trampoline() to avoid duplicating the code > about flags and tlinks. But, the trade-off we are talking about here goes in > the opposite direction.
On 2/23/24 10:29 AM, Kui-Feng Lee wrote: > One thing I forgot to mention is that bpf_dummy_ops has to call > bpf_jit_uncharge_modmem(PAGE_SIZE) as well. The other option is to move > bpf_jit_charge_modmem() out of bpf_struct_ops_prepare_trampoline(), > meaning bpf_struct_ops_map_update_elem() should handle the case that the > allocation in bpf_struct_ops_prepare_trampoline() succeeds, but > bpf_jit_charge_modmem() fails. Keep the charge/uncharge in bpf_struct_ops_prepare_trampoline(). It is fine to have bpf_dummy_ops charge and then uncharge a PAGE_SIZE. There is no need to optimize for bpf_dummy_ops. Use bpf_struct_ops_free_trampoline() in bpf_dummy_ops to uncharge and free. >>> void bpf_struct_ops_free_trampoline(void *image) >>> { >>> bpf_jit_uncharge_modmem(PAGE_SIZE); >>> arch_free_bpf_trampoline(image, PAGE_SIZE); >>> } >>>
On 2/23/24 10:42, Martin KaFai Lau wrote: > On 2/23/24 10:29 AM, Kui-Feng Lee wrote: >> One thing I forgot to mention is that bpf_dummy_ops has to call >> bpf_jit_uncharge_modmem(PAGE_SIZE) as well. The other option is to move >> bpf_jit_charge_modmem() out of bpf_struct_ops_prepare_trampoline(), >> meaning bpf_struct_ops_map_update_elem() should handle the case that the >> allocation in bpf_struct_ops_prepare_trampoline() successes, but >> bpf_jit_charge_modmem() fails. > > Keep the charge/uncharge in bpf_struct_ops_prepare_trampoline(). > > It is fine to have bpf_dummy_ops charge and then uncharge a PAGE_SIZE. > There is no need to optimize for bpf_dummy_ops. Use > bpf_struct_ops_free_trampoline() in bpf_dummy_ops to uncharge and free. Then, I don't get the point here. I agree with moving the allocation into bpf_struct_ops_prepare_trampoline() to avoid duplication of the code about flags and tlinks. It really simplifies the code with the fact that bpf_dummy_ops is still there. So, I tried to pass a st_map to bpf_struct_ops_prepare_trampoline() to keep page managements code together. But, you said to simplify the code of bpf_dummy_ops by allocating pages in bpf_struct_ops_prepare_trampoline(), do bookkeeping in bpf_struct_ops_map_update_elem(), so bpf_dummy_ops doesn't have to allocate memory. But, we have to move a bpf_jit_uncharge_modmem() to bpf_dummy_ops. For me, this trade-off that include removing an allocation and adding a bpf_jit_uncharge_modmem() make no sense. > > >>>> void bpf_struct_ops_free_trampoline(void *image) >>>> { >>>> bpf_jit_uncharge_modmem(PAGE_SIZE); >>>> arch_free_bpf_trampoline(image, PAGE_SIZE); >>>> } >>>> >
On 2/23/24 11:05 AM, Kui-Feng Lee wrote: > > > > On 2/23/24 10:42, Martin KaFai Lau wrote: >> On 2/23/24 10:29 AM, Kui-Feng Lee wrote: >>> One thing I forgot to mention is that bpf_dummy_ops has to call >>> bpf_jit_uncharge_modmem(PAGE_SIZE) as well. The other option is to move >>> bpf_jit_charge_modmem() out of bpf_struct_ops_prepare_trampoline(), >>> meaning bpf_struct_ops_map_update_elem() should handle the case that the >>> allocation in bpf_struct_ops_prepare_trampoline() successes, but >>> bpf_jit_charge_modmem() fails. >> >> Keep the charge/uncharge in bpf_struct_ops_prepare_trampoline(). >> >> It is fine to have bpf_dummy_ops charge and then uncharge a PAGE_SIZE. There >> is no need to optimize for bpf_dummy_ops. Use bpf_struct_ops_free_trampoline() >> in bpf_dummy_ops to uncharge and free. > > > Then, I don't get the point here. > I agree with moving the allocation into > bpf_struct_ops_prepare_trampoline() to avoid duplication of the code > about flags and tlinks. It really simplifies the code with the fact > that bpf_dummy_ops is still there. So, I tried to pass a st_map to > bpf_struct_ops_prepare_trampoline() to keep page managements code > together. But, you said to simplify the code of bpf_dummy_ops by > allocating pages in bpf_struct_ops_prepare_trampoline(), do bookkeeping > in bpf_struct_ops_map_update_elem(), so bpf_dummy_ops doesn't have to I don't think I ever mentioned to do book keeping in bpf_struct_ops_map_update_elem(). Have you looked at my earlier code in bpf_struct_ops_prepare_trampoline() which also does the memory charging also? > allocate memory. But, we have to move a bpf_jit_uncharge_modmem() to > bpf_dummy_ops. For me, this trade-off that include removing an > allocation and adding a bpf_jit_uncharge_modmem() make no sense. Which part make no sense? Having bpf_dummy_ops charge/uncharge memory also? The bpf_dummy_ops() uses the below bpf_struct_ops_free_trampoline() which does uncharge and free. 
bpf_struct_ops_prepare_trampoline() does charge and alloc. charge/alloc matches with uncharge/free. > >> >> >>>>> void bpf_struct_ops_free_trampoline(void *image) >>>>> { >>>>> bpf_jit_uncharge_modmem(PAGE_SIZE); >>>>> arch_free_bpf_trampoline(image, PAGE_SIZE); >>>>> } >>>>> >>
On 2/23/24 11:15, Martin KaFai Lau wrote: > On 2/23/24 11:05 AM, Kui-Feng Lee wrote: >> >> >> >> On 2/23/24 10:42, Martin KaFai Lau wrote: >>> On 2/23/24 10:29 AM, Kui-Feng Lee wrote: >>>> One thing I forgot to mention is that bpf_dummy_ops has to call >>>> bpf_jit_uncharge_modmem(PAGE_SIZE) as well. The other option is to move >>>> bpf_jit_charge_modmem() out of bpf_struct_ops_prepare_trampoline(), >>>> meaning bpf_struct_ops_map_update_elem() should handle the case that >>>> the >>>> allocation in bpf_struct_ops_prepare_trampoline() successes, but >>>> bpf_jit_charge_modmem() fails. >>> >>> Keep the charge/uncharge in bpf_struct_ops_prepare_trampoline(). >>> >>> It is fine to have bpf_dummy_ops charge and then uncharge a >>> PAGE_SIZE. There is no need to optimize for bpf_dummy_ops. Use >>> bpf_struct_ops_free_trampoline() in bpf_dummy_ops to uncharge and free. >> >> >> Then, I don't get the point here. >> I agree with moving the allocation into >> bpf_struct_ops_prepare_trampoline() to avoid duplication of the code >> about flags and tlinks. It really simplifies the code with the fact >> that bpf_dummy_ops is still there. So, I tried to pass a st_map to >> bpf_struct_ops_prepare_trampoline() to keep page managements code >> together. But, you said to simplify the code of bpf_dummy_ops by >> allocating pages in bpf_struct_ops_prepare_trampoline(), do bookkeeping >> in bpf_struct_ops_map_update_elem(), so bpf_dummy_ops doesn't have to > > I don't think I ever mentioned to do book keeping in > bpf_struct_ops_map_update_elem(). Have you looked at my earlier code in > bpf_struct_ops_prepare_trampoline() which also does the memory charging > also? The bookkeeping that I am saying is about maintaining image_pages and image_pages_cnt. > >> allocate memory. But, we have to move a bpf_jit_uncharge_modmem() to >> bpf_dummy_ops. For me, this trade-off that include removing an >> allocation and adding a bpf_jit_uncharge_modmem() make no sense. > > Which part make no sense? 
Having bpf_dummy_ops charge/uncharge memory also? Simplifying bpf_dummy_ops by removing the duty of allocation but adding the duty of uncharge memory doesn't make sense to me in terms of simplification. Although the lines of code would be similar, it actually makes it more complicated than before. > > The bpf_dummy_ops() uses the below bpf_struct_ops_free_trampoline() > which does uncharge and free. bpf_struct_ops_prepare_trampoline() does > charge and alloc. > charge/alloc matches with uncharge/free. > >> >>> >>> >>>>>> void bpf_struct_ops_free_trampoline(void *image) >>>>>> { >>>>>> bpf_jit_uncharge_modmem(PAGE_SIZE); >>>>>> arch_free_bpf_trampoline(image, PAGE_SIZE); >>>>>> } >>>>>> >>> >
On 2/23/24 2:06 PM, Kui-Feng Lee wrote: > > > On 2/23/24 11:15, Martin KaFai Lau wrote: >> On 2/23/24 11:05 AM, Kui-Feng Lee wrote: >>> >>> >>> >>> On 2/23/24 10:42, Martin KaFai Lau wrote: >>>> On 2/23/24 10:29 AM, Kui-Feng Lee wrote: >>>>> One thing I forgot to mention is that bpf_dummy_ops has to call >>>>> bpf_jit_uncharge_modmem(PAGE_SIZE) as well. The other option is to move >>>>> bpf_jit_charge_modmem() out of bpf_struct_ops_prepare_trampoline(), >>>>> meaning bpf_struct_ops_map_update_elem() should handle the case that the >>>>> allocation in bpf_struct_ops_prepare_trampoline() successes, but >>>>> bpf_jit_charge_modmem() fails. >>>> >>>> Keep the charge/uncharge in bpf_struct_ops_prepare_trampoline(). >>>> >>>> It is fine to have bpf_dummy_ops charge and then uncharge a PAGE_SIZE. There >>>> is no need to optimize for bpf_dummy_ops. Use >>>> bpf_struct_ops_free_trampoline() in bpf_dummy_ops to uncharge and free. >>> >>> >>> Then, I don't get the point here. >>> I agree with moving the allocation into >>> bpf_struct_ops_prepare_trampoline() to avoid duplication of the code >>> about flags and tlinks. It really simplifies the code with the fact >>> that bpf_dummy_ops is still there. So, I tried to pass a st_map to >>> bpf_struct_ops_prepare_trampoline() to keep page managements code >>> together. But, you said to simplify the code of bpf_dummy_ops by >>> allocating pages in bpf_struct_ops_prepare_trampoline(), do bookkeeping >>> in bpf_struct_ops_map_update_elem(), so bpf_dummy_ops doesn't have to >> >> I don't think I ever mentioned to do book keeping in >> bpf_struct_ops_map_update_elem(). Have you looked at my earlier code in >> bpf_struct_ops_prepare_trampoline() which also does the memory charging also? > > The bookkeeping that I am saying is about maintaining image_pages and > image_pages_cnt. image_pages and image_pages_cnt are in the st_map (struct_ops map) itself. 
Why the bpf_struct_ops_map_update_elem() cannot keep track of the pages itself and need "_"bpf_struct_ops_prepare_trampoline() to handle it? It is not like refactoring the image_pages[_cnt] handling to _bpf_struct_ops_prepare_trampoline() and it can be reused by another caller. bpf_struct_ops_map_update_elem() is the only one needing to keep track of its own image_pages and image_pages_cnt. > >> >>> allocate memory. But, we have to move a bpf_jit_uncharge_modmem() to >>> bpf_dummy_ops. For me, this trade-off that include removing an >>> allocation and adding a bpf_jit_uncharge_modmem() make no sense. >> >> Which part make no sense? Having bpf_dummy_ops charge/uncharge memory also? > > Simplifying bpf_dummy_ops by removing the duty of allocation but adding > the duty of uncharge memory doesn't make sense to me in terms of > simplification. Although the lines of code would be similar, it actually > makes it more complicated than before. The bpf_dummy_ops does not need to call uncharge. It does not need to know the page is charged or not. It uses bpf_struct_ops_prepare_trampoline() to get a prepared trampoline page and bpf_struct_ops_free_trampoline() which does the uncharge+free. It is the alloc/free concept that you have been proposing which fits well here. The bpf_dummy_ops is doing arch_alloc_bpf_trampoline and arch_free_bpf_trampoline. The arch_alloc_bpf_trampoline will be gone (-1). The arch_free_bpf_trampoline will be replaced (+0) by bpf_struct_ops_free_trampoline. 
Untested code: diff --git a/net/bpf/bpf_dummy_struct_ops.c b/net/bpf/bpf_dummy_struct_ops.c index 02de71719aed..a3bb89eb037f 100644 --- a/net/bpf/bpf_dummy_struct_ops.c +++ b/net/bpf/bpf_dummy_struct_ops.c @@ -89,8 +89,8 @@ int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr, struct bpf_dummy_ops_test_args *args; struct bpf_tramp_links *tlinks; struct bpf_tramp_link *link = NULL; + unsigned int op_idx, image_off = 0; void *image = NULL; - unsigned int op_idx; int prog_ret; s32 type_id; int err; @@ -114,12 +114,6 @@ int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr, goto out; } - image = arch_alloc_bpf_trampoline(PAGE_SIZE); - if (!image) { - err = -ENOMEM; - goto out; - } - link = kzalloc(sizeof(*link), GFP_USER); if (!link) { err = -ENOMEM; @@ -130,12 +124,14 @@ int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr, bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS, &bpf_struct_ops_link_lops, prog); op_idx = prog->expected_attach_type; - err = bpf_struct_ops_prepare_trampoline(tlinks, link, - &st_ops->func_models[op_idx], - &dummy_ops_test_ret_function, - image, image + PAGE_SIZE); - if (err < 0) + image = bpf_struct_ops_prepare_trampoline(tlinks, link, + &st_ops->func_models[op_idx], + &dummy_ops_test_ret_function, + NULL, &image_off, true); + if (IS_ERR(image)) { + err = PTR_ERR(image); goto out; + } arch_protect_bpf_trampoline(image, PAGE_SIZE); prog_ret = dummy_ops_call_op(image, args); @@ -147,7 +143,8 @@ int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr, err = -EFAULT; out: kfree(args); - arch_free_bpf_trampoline(image, PAGE_SIZE); + if (!IS_ERR_OR_NULL(image)) + bpf_struct_ops_free_trampoline(image); if (link) bpf_link_put(&link->link); kfree(tlinks); >>>>>>> void bpf_struct_ops_free_trampoline(void *image) >>>>>>> { >>>>>>> bpf_jit_uncharge_modmem(PAGE_SIZE); >>>>>>> arch_free_bpf_trampoline(image, PAGE_SIZE); >>>>>>> } ^^^^^^^^^^^^^^^ To be 
clear, this bpf_struct_ops_free_trampoline() function that does the uncharge+free. ~~~~~~~~~~~~~~~~ Let's summarize a bit of the thread: . I think we agree duplicating code from bpf_struct_ops_prepare_trampoline() is not good. . bpf_struct_ops_prepare_trampoline() can allocate page. . The first concern was there is no alloc/free pairing. Then adding bpf_struct_ops_free_trampoline was suggested to do the uncharge+free together, while bpf_struct_ops_prepare_trampoline does the charge+alloc . The second concern was bpf_struct_ops_prepare_trampoline() is not obvious that a free(page) needs to be done. It is more like a naming perception since returning a page is already a good signal that needs to be freed. However, it is open for a function name change if it can make the "alloc" part more obvious. . Another concern was bpf_dummy_ops now needs to remember it needs to uncharge. bpf_struct_ops_free_trampoline() should address it also since it does uncharge+free. . Another point brought up was having bpf_struct_ops_prepare_trampoline() store the allocated page in st_map->image_pages instead of map_update doing it itself which was covered in the first part of this email. I hope the code pieces have addressed the points brought up above. I have combined these code pieces in a patch which is only compiler tested. Hope it will make things clearer (first patch): https://git.kernel.org/pub/scm/linux/kernel/git/martin.lau/bpf-next.git/log/?h=struct_ops.images
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index c244ed5114fd..15bf8058161b 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -18,6 +18,8 @@ struct bpf_struct_ops_value { char data[] ____cacheline_aligned_in_smp; }; +#define MAX_TRAMP_IMAGE_PAGES 8 + struct bpf_struct_ops_map { struct bpf_map map; struct rcu_head rcu; @@ -30,12 +32,11 @@ struct bpf_struct_ops_map { */ struct bpf_link **links; u32 links_cnt; - /* image is a page that has all the trampolines + u32 image_pages_cnt; + /* image_pages is an array of pages that has all the trampolines * that stores the func args before calling the bpf_prog. - * A PAGE_SIZE "image" is enough to store all trampoline for - * "links[]". */ - void *image; + void *image_pages[MAX_TRAMP_IMAGE_PAGES]; /* The owner moduler's btf. */ struct btf *btf; /* uvalue->data stores the kernel struct @@ -456,6 +457,41 @@ static void bpf_struct_ops_map_put_progs(struct bpf_struct_ops_map *st_map) } } +static void bpf_struct_ops_map_free_image(struct bpf_struct_ops_map *st_map) +{ + int i; + void *image; + + bpf_jit_uncharge_modmem(PAGE_SIZE * st_map->image_pages_cnt); + for (i = 0; i < st_map->image_pages_cnt; i++) { + image = st_map->image_pages[i]; + arch_free_bpf_trampoline(image, PAGE_SIZE); + } + st_map->image_pages_cnt = 0; +} + +static void *bpf_struct_ops_map_inc_image(struct bpf_struct_ops_map *st_map) +{ + void *image; + int err; + + if (st_map->image_pages_cnt >= MAX_TRAMP_IMAGE_PAGES) + return ERR_PTR(-E2BIG); + + err = bpf_jit_charge_modmem(PAGE_SIZE); + if (err) + return ERR_PTR(err); + + image = arch_alloc_bpf_trampoline(PAGE_SIZE); + if (!image) { + bpf_jit_uncharge_modmem(PAGE_SIZE); + return ERR_PTR(-ENOMEM); + } + st_map->image_pages[st_map->image_pages_cnt++] = image; + + return image; +} + static int check_zero_holes(const struct btf *btf, const struct btf_type *t, void *data) { const struct btf_member *member; @@ -531,10 +567,10 @@ static long 
bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, const struct btf_type *module_type; const struct btf_member *member; const struct btf_type *t = st_ops_desc->type; + void *image = NULL, *image_end = NULL; struct bpf_tramp_links *tlinks; void *udata, *kdata; int prog_fd, err; - void *image, *image_end; u32 i; if (flags) @@ -573,15 +609,14 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, udata = &uvalue->data; kdata = &kvalue->data; - image = st_map->image; - image_end = st_map->image + PAGE_SIZE; module_type = btf_type_by_id(btf_vmlinux, st_ops_ids[IDX_MODULE_ID]); for_each_member(i, t, member) { const struct btf_type *mtype, *ptype; struct bpf_prog *prog; struct bpf_tramp_link *link; - u32 moff; + u32 moff, tflags; + int tsize; moff = __btf_member_bit_offset(t, member) / 8; ptype = btf_type_resolve_ptr(st_map->btf, member->type, NULL); @@ -653,10 +688,38 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, &bpf_struct_ops_link_lops, prog); st_map->links[i] = &link->link; - err = bpf_struct_ops_prepare_trampoline(tlinks, link, - &st_ops->func_models[i], - *(void **)(st_ops->cfi_stubs + moff), - image, image_end); + tflags = BPF_TRAMP_F_INDIRECT; + if (st_ops->func_models[i].ret_size > 0) + tflags |= BPF_TRAMP_F_RET_FENTRY_RET; + + /* Compute the size of the trampoline */ + tlinks[BPF_TRAMP_FENTRY].links[0] = link; + tlinks[BPF_TRAMP_FENTRY].nr_links = 1; + tsize = arch_bpf_trampoline_size(&st_ops->func_models[i], + tflags, tlinks, NULL); + if (tsize < 0) { + err = tsize; + goto reset_unlock; + } + + /* Allocate pages */ + if (tsize > (unsigned long)image_end - (unsigned long)image) { + if (tsize > PAGE_SIZE) { + err = -E2BIG; + goto reset_unlock; + } + image = bpf_struct_ops_map_inc_image(st_map); + if (IS_ERR(image)) { + err = PTR_ERR(image); + goto reset_unlock; + } + image_end = image + PAGE_SIZE; + } + + err = arch_prepare_bpf_trampoline(NULL, image, image_end, + &st_ops->func_models[i], + 
tflags, tlinks, + *(void **)(st_ops->cfi_stubs + moff)); if (err < 0) goto reset_unlock; @@ -672,10 +735,11 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, if (err) goto reset_unlock; } + for (i = 0; i < st_map->image_pages_cnt; i++) + arch_protect_bpf_trampoline(st_map->image_pages[i], PAGE_SIZE); if (st_map->map.map_flags & BPF_F_LINK) { err = 0; - arch_protect_bpf_trampoline(st_map->image, PAGE_SIZE); /* Let bpf_link handle registration & unregistration. * * Pair with smp_load_acquire() during lookup_elem(). @@ -684,7 +748,6 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, goto unlock; } - arch_protect_bpf_trampoline(st_map->image, PAGE_SIZE); err = st_ops->reg(kdata); if (likely(!err)) { /* This refcnt increment on the map here after @@ -707,9 +770,9 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, * there was a race in registering the struct_ops (under the same name) to * a sub-system through different struct_ops's maps. 
*/ - arch_unprotect_bpf_trampoline(st_map->image, PAGE_SIZE); reset_unlock: + bpf_struct_ops_map_free_image(st_map); bpf_struct_ops_map_put_progs(st_map); memset(uvalue, 0, map->value_size); memset(kvalue, 0, map->value_size); @@ -776,10 +839,7 @@ static void __bpf_struct_ops_map_free(struct bpf_map *map) if (st_map->links) bpf_struct_ops_map_put_progs(st_map); bpf_map_area_free(st_map->links); - if (st_map->image) { - arch_free_bpf_trampoline(st_map->image, PAGE_SIZE); - bpf_jit_uncharge_modmem(PAGE_SIZE); - } + bpf_struct_ops_map_free_image(st_map); bpf_map_area_free(st_map->uvalue); bpf_map_area_free(st_map); } @@ -889,20 +949,6 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr) st_map->st_ops_desc = st_ops_desc; map = &st_map->map; - ret = bpf_jit_charge_modmem(PAGE_SIZE); - if (ret) - goto errout_free; - - st_map->image = arch_alloc_bpf_trampoline(PAGE_SIZE); - if (!st_map->image) { - /* __bpf_struct_ops_map_free() uses st_map->image as flag - * for "charged or not". In this case, we need to unchange - * here. - */ - bpf_jit_uncharge_modmem(PAGE_SIZE); - ret = -ENOMEM; - goto errout_free; - } st_map->uvalue = bpf_map_area_alloc(vt->size, NUMA_NO_NODE); st_map->links_cnt = btf_type_vlen(t); st_map->links =