Message ID | 20240524223036.318800-3-thinker.li@gmail.com (mailing list archive) |
---|---|
State | Superseded |
Delegated to: | BPF |
Headers | show |
Series | Notify user space when a struct_ops object is detached/unregistered | expand |
On 5/24/24 3:30 PM, Kui-Feng Lee wrote: > +static int bpf_struct_ops_map_link_detach(struct bpf_link *link) > +{ > + struct bpf_struct_ops_link *st_link = container_of(link, struct bpf_struct_ops_link, link); > + struct bpf_struct_ops_map *st_map; > + struct bpf_map *map; > + > + mutex_lock(&update_mutex); update_mutex is needed to detach. > + > + map = rcu_dereference_protected(st_link->map, lockdep_is_held(&update_mutex)); > + if (!map) { > + mutex_unlock(&update_mutex); > + return 0; > + } > + st_map = container_of(map, struct bpf_struct_ops_map, map); > + > + st_map->st_ops_desc->st_ops->unreg(&st_map->kvalue.data, link); > + > + rcu_assign_pointer(st_link->map, NULL); > + /* Pair with bpf_map_get() in bpf_struct_ops_link_create() or > + * bpf_map_inc() in bpf_struct_ops_map_link_update(). > + */ > + bpf_map_put(&st_map->map); > + > + mutex_unlock(&update_mutex); > + > + return 0; > +} > + > static const struct bpf_link_ops bpf_struct_ops_map_lops = { > .dealloc = bpf_struct_ops_map_link_dealloc, > + .detach = bpf_struct_ops_map_link_detach, > .show_fdinfo = bpf_struct_ops_map_link_show_fdinfo, > .fill_link_info = bpf_struct_ops_map_link_fill_link_info, > .update_map = bpf_struct_ops_map_link_update, > @@ -1176,13 +1208,22 @@ int bpf_struct_ops_link_create(union bpf_attr *attr) > if (err) > goto err_out; > > + /* Init link->map before calling reg() in case being detached > + * immediately. > + */ With update_mutex held in link_create here, the parallel detach can still happen before the link is fully initialized (the link->map pointer here in particular)? > + RCU_INIT_POINTER(link->map, map); > + > + mutex_lock(&update_mutex); > err = st_map->st_ops_desc->st_ops->reg(st_map->kvalue.data, &link->link); > if (err) { > + RCU_INIT_POINTER(link->map, NULL); I was hoping by holding the the update_mutex, it can avoid this link->map init dance, like RCU_INIT_POINTER(link->map, map) above and then resetting here on the error case. > + mutex_unlock(&update_mutex); > bpf_link_cleanup(&link_primer); > + /* The link has been free by bpf_link_cleanup() */ > link = NULL; > goto err_out; > } > - RCU_INIT_POINTER(link->map, map); If only init link->map once here like the existing code (and the init is protected by the update_mutex), the subsystem should not be able to detach until the link->map is fully initialized. or I am missing something obvious. Can you explain why this link->map init dance is still needed? > + mutex_unlock(&update_mutex);
On Tue, May 28, 2024 at 11:17 PM Martin KaFai Lau <martin.lau@linux.dev> wrote: > > On 5/24/24 3:30 PM, Kui-Feng Lee wrote: > > +static int bpf_struct_ops_map_link_detach(struct bpf_link *link) > > +{ > > + struct bpf_struct_ops_link *st_link = container_of(link, struct bpf_struct_ops_link, link); > > + struct bpf_struct_ops_map *st_map; > > + struct bpf_map *map; > > + > > + mutex_lock(&update_mutex); > > update_mutex is needed to detach. > > > + > > + map = rcu_dereference_protected(st_link->map, lockdep_is_held(&update_mutex)); > > + if (!map) { > > + mutex_unlock(&update_mutex); > > + return 0; > > + } > > + st_map = container_of(map, struct bpf_struct_ops_map, map); > > + > > + st_map->st_ops_desc->st_ops->unreg(&st_map->kvalue.data, link); > > + > > + rcu_assign_pointer(st_link->map, NULL); > > + /* Pair with bpf_map_get() in bpf_struct_ops_link_create() or > > + * bpf_map_inc() in bpf_struct_ops_map_link_update(). > > + */ > > + bpf_map_put(&st_map->map); > > + > > + mutex_unlock(&update_mutex); > > + > > + return 0; > > +} > > + > > static const struct bpf_link_ops bpf_struct_ops_map_lops = { > > .dealloc = bpf_struct_ops_map_link_dealloc, > > + .detach = bpf_struct_ops_map_link_detach, > > .show_fdinfo = bpf_struct_ops_map_link_show_fdinfo, > > .fill_link_info = bpf_struct_ops_map_link_fill_link_info, > > .update_map = bpf_struct_ops_map_link_update, > > @@ -1176,13 +1208,22 @@ int bpf_struct_ops_link_create(union bpf_attr *attr) > > if (err) > > goto err_out; > > > > + /* Init link->map before calling reg() in case being detached > > + * immediately. > > + */ > > With update_mutex held in link_create here, the parallel detach can still happen > before the link is fully initialized (the link->map pointer here in particular)? > > > + RCU_INIT_POINTER(link->map, map); > > + > > + mutex_lock(&update_mutex); > > err = st_map->st_ops_desc->st_ops->reg(st_map->kvalue.data, &link->link); > > if (err) { > > + RCU_INIT_POINTER(link->map, NULL); > > I was hoping by holding the the update_mutex, it can avoid this link->map init > dance, like RCU_INIT_POINTER(link->map, map) above and then resetting here on > the error case. > > > + mutex_unlock(&update_mutex); > > bpf_link_cleanup(&link_primer); > > + /* The link has been free by bpf_link_cleanup() */ > > link = NULL; > > goto err_out; > > } > > - RCU_INIT_POINTER(link->map, map); > > If only init link->map once here like the existing code (and the init is > protected by the update_mutex), the subsystem should not be able to detach until > the link->map is fully initialized. > > or I am missing something obvious. Can you explain why this link->map init dance > is still needed? Ok, I get what you mean. I will move RCU_INIT_POINTER() back to its original place, and move the check on the value of "err" to the place after mutext_unlock(). Is it what you like? > > > + mutex_unlock(&update_mutex); >
On 5/29/24 8:04 AM, Kuifeng Lee wrote: > On Tue, May 28, 2024 at 11:17 PM Martin KaFai Lau <martin.lau@linux.dev> wrote: >> >> On 5/24/24 3:30 PM, Kui-Feng Lee wrote: >>> +static int bpf_struct_ops_map_link_detach(struct bpf_link *link) >>> +{ >>> + struct bpf_struct_ops_link *st_link = container_of(link, struct bpf_struct_ops_link, link); >>> + struct bpf_struct_ops_map *st_map; >>> + struct bpf_map *map; >>> + >>> + mutex_lock(&update_mutex); >> >> update_mutex is needed to detach. >> >>> + >>> + map = rcu_dereference_protected(st_link->map, lockdep_is_held(&update_mutex)); >>> + if (!map) { >>> + mutex_unlock(&update_mutex); >>> + return 0; >>> + } >>> + st_map = container_of(map, struct bpf_struct_ops_map, map); >>> + >>> + st_map->st_ops_desc->st_ops->unreg(&st_map->kvalue.data, link); >>> + >>> + rcu_assign_pointer(st_link->map, NULL); >>> + /* Pair with bpf_map_get() in bpf_struct_ops_link_create() or >>> + * bpf_map_inc() in bpf_struct_ops_map_link_update(). >>> + */ >>> + bpf_map_put(&st_map->map); >>> + >>> + mutex_unlock(&update_mutex); >>> + >>> + return 0; >>> +} >>> + >>> static const struct bpf_link_ops bpf_struct_ops_map_lops = { >>> .dealloc = bpf_struct_ops_map_link_dealloc, >>> + .detach = bpf_struct_ops_map_link_detach, >>> .show_fdinfo = bpf_struct_ops_map_link_show_fdinfo, >>> .fill_link_info = bpf_struct_ops_map_link_fill_link_info, >>> .update_map = bpf_struct_ops_map_link_update, >>> @@ -1176,13 +1208,22 @@ int bpf_struct_ops_link_create(union bpf_attr *attr) >>> if (err) >>> goto err_out; >>> >>> + /* Init link->map before calling reg() in case being detached >>> + * immediately. >>> + */ >> >> With update_mutex held in link_create here, the parallel detach can still happen >> before the link is fully initialized (the link->map pointer here in particular)? >> >>> + RCU_INIT_POINTER(link->map, map); >>> + >>> + mutex_lock(&update_mutex); >>> err = st_map->st_ops_desc->st_ops->reg(st_map->kvalue.data, &link->link); >>> if (err) { >>> + RCU_INIT_POINTER(link->map, NULL); >> >> I was hoping by holding the the update_mutex, it can avoid this link->map init >> dance, like RCU_INIT_POINTER(link->map, map) above and then resetting here on >> the error case. >> >>> + mutex_unlock(&update_mutex); >>> bpf_link_cleanup(&link_primer); >>> + /* The link has been free by bpf_link_cleanup() */ >>> link = NULL; >>> goto err_out; >>> } >>> - RCU_INIT_POINTER(link->map, map); >> >> If only init link->map once here like the existing code (and the init is >> protected by the update_mutex), the subsystem should not be able to detach until >> the link->map is fully initialized. >> >> or I am missing something obvious. Can you explain why this link->map init dance >> is still needed? > > Ok, I get what you mean. > > I will move RCU_INIT_POINTER() back to its original place, and move the check > on the value of "err" to the place after mutext_unlock(). The RCU_INIT_POINTER(link->map, map) needs to be done with update_mutex held and it should be init after the err check, so the err check needs to be inside update_mutex lock also. Something like this (untested): mutex_lock(&update_mutex); err = st_map->st_ops_desc->st_ops->reg(st_map->kvalue.data, &link->link); if (err) { mutex_unlock(&update_mutex); bpf_link_cleanup(&link_primer); link = NULL; goto err_out; } RCU_INIT_POINTER(link->map, map); mutex_unlock(&update_mutex); > >> >>> + mutex_unlock(&update_mutex); >>
On Wed, May 29, 2024 at 3:38 PM Martin KaFai Lau <martin.lau@linux.dev> wrote: > > On 5/29/24 8:04 AM, Kuifeng Lee wrote: > > On Tue, May 28, 2024 at 11:17 PM Martin KaFai Lau <martin.lau@linux.dev> wrote: > >> > >> On 5/24/24 3:30 PM, Kui-Feng Lee wrote: > >>> +static int bpf_struct_ops_map_link_detach(struct bpf_link *link) > >>> +{ > >>> + struct bpf_struct_ops_link *st_link = container_of(link, struct bpf_struct_ops_link, link); > >>> + struct bpf_struct_ops_map *st_map; > >>> + struct bpf_map *map; > >>> + > >>> + mutex_lock(&update_mutex); > >> > >> update_mutex is needed to detach. > >> > >>> + > >>> + map = rcu_dereference_protected(st_link->map, lockdep_is_held(&update_mutex)); > >>> + if (!map) { > >>> + mutex_unlock(&update_mutex); > >>> + return 0; > >>> + } > >>> + st_map = container_of(map, struct bpf_struct_ops_map, map); > >>> + > >>> + st_map->st_ops_desc->st_ops->unreg(&st_map->kvalue.data, link); > >>> + > >>> + rcu_assign_pointer(st_link->map, NULL); > >>> + /* Pair with bpf_map_get() in bpf_struct_ops_link_create() or > >>> + * bpf_map_inc() in bpf_struct_ops_map_link_update(). > >>> + */ > >>> + bpf_map_put(&st_map->map); > >>> + > >>> + mutex_unlock(&update_mutex); > >>> + > >>> + return 0; > >>> +} > >>> + > >>> static const struct bpf_link_ops bpf_struct_ops_map_lops = { > >>> .dealloc = bpf_struct_ops_map_link_dealloc, > >>> + .detach = bpf_struct_ops_map_link_detach, > >>> .show_fdinfo = bpf_struct_ops_map_link_show_fdinfo, > >>> .fill_link_info = bpf_struct_ops_map_link_fill_link_info, > >>> .update_map = bpf_struct_ops_map_link_update, > >>> @@ -1176,13 +1208,22 @@ int bpf_struct_ops_link_create(union bpf_attr *attr) > >>> if (err) > >>> goto err_out; > >>> > >>> + /* Init link->map before calling reg() in case being detached > >>> + * immediately. > >>> + */ > >> > >> With update_mutex held in link_create here, the parallel detach can still happen > >> before the link is fully initialized (the link->map pointer here in particular)? > >> > >>> + RCU_INIT_POINTER(link->map, map); > >>> + > >>> + mutex_lock(&update_mutex); > >>> err = st_map->st_ops_desc->st_ops->reg(st_map->kvalue.data, &link->link); > >>> if (err) { > >>> + RCU_INIT_POINTER(link->map, NULL); > >> > >> I was hoping by holding the the update_mutex, it can avoid this link->map init > >> dance, like RCU_INIT_POINTER(link->map, map) above and then resetting here on > >> the error case. > >> > >>> + mutex_unlock(&update_mutex); > >>> bpf_link_cleanup(&link_primer); > >>> + /* The link has been free by bpf_link_cleanup() */ > >>> link = NULL; > >>> goto err_out; > >>> } > >>> - RCU_INIT_POINTER(link->map, map); > >> > >> If only init link->map once here like the existing code (and the init is > >> protected by the update_mutex), the subsystem should not be able to detach until > >> the link->map is fully initialized. > >> > >> or I am missing something obvious. Can you explain why this link->map init dance > >> is still needed? > > > > Ok, I get what you mean. > > > > I will move RCU_INIT_POINTER() back to its original place, and move the check > > on the value of "err" to the place after mutext_unlock(). > The RCU_INIT_POINTER(link->map, map) needs to be done with update_mutex held and > it should be init after the err check, so the err check needs to be inside > update_mutex lock also. > > Something like this (untested): > > mutex_lock(&update_mutex); > > err = st_map->st_ops_desc->st_ops->reg(st_map->kvalue.data, &link->link); > if (err) { > mutex_unlock(&update_mutex); > bpf_link_cleanup(&link_primer); > link = NULL; > goto err_out; > } > RCU_INIT_POINTER(link->map, map); > > mutex_unlock(&update_mutex); > Sure! According to what we discussed off-line, the RCU_INIT_POINTER() will be moved back to its original place. Subsystems should not try to access link->map. > > > > >> > >>> + mutex_unlock(&update_mutex); > >> >
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 1542dded7489..f2439acd9757 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -1057,9 +1057,6 @@ static void bpf_struct_ops_map_link_dealloc(struct bpf_link *link) st_map = (struct bpf_struct_ops_map *) rcu_dereference_protected(st_link->map, true); if (st_map) { - /* st_link->map can be NULL if - * bpf_struct_ops_link_create() fails to register. - */ st_map->st_ops_desc->st_ops->unreg(&st_map->kvalue.data, link); bpf_map_put(&st_map->map); } @@ -1075,7 +1072,8 @@ static void bpf_struct_ops_map_link_show_fdinfo(const struct bpf_link *link, st_link = container_of(link, struct bpf_struct_ops_link, link); rcu_read_lock(); map = rcu_dereference(st_link->map); - seq_printf(seq, "map_id:\t%d\n", map->id); + if (map) + seq_printf(seq, "map_id:\t%d\n", map->id); rcu_read_unlock(); } @@ -1088,7 +1086,8 @@ static int bpf_struct_ops_map_link_fill_link_info(const struct bpf_link *link, st_link = container_of(link, struct bpf_struct_ops_link, link); rcu_read_lock(); map = rcu_dereference(st_link->map); - info->struct_ops.map_id = map->id; + if (map) + info->struct_ops.map_id = map->id; rcu_read_unlock(); return 0; } @@ -1113,6 +1112,10 @@ static int bpf_struct_ops_map_link_update(struct bpf_link *link, struct bpf_map mutex_lock(&update_mutex); old_map = rcu_dereference_protected(st_link->map, lockdep_is_held(&update_mutex)); + if (!old_map) { + err = -ENOLINK; + goto err_out; + } if (expected_old_map && old_map != expected_old_map) { err = -EPERM; goto err_out; @@ -1139,8 +1142,37 @@ static int bpf_struct_ops_map_link_update(struct bpf_link *link, struct bpf_map return err; } +static int bpf_struct_ops_map_link_detach(struct bpf_link *link) +{ + struct bpf_struct_ops_link *st_link = container_of(link, struct bpf_struct_ops_link, link); + struct bpf_struct_ops_map *st_map; + struct bpf_map *map; + + mutex_lock(&update_mutex); + + map = rcu_dereference_protected(st_link->map, lockdep_is_held(&update_mutex)); + if (!map) { + mutex_unlock(&update_mutex); + return 0; + } + st_map = container_of(map, struct bpf_struct_ops_map, map); + + st_map->st_ops_desc->st_ops->unreg(&st_map->kvalue.data, link); + + rcu_assign_pointer(st_link->map, NULL); + /* Pair with bpf_map_get() in bpf_struct_ops_link_create() or + * bpf_map_inc() in bpf_struct_ops_map_link_update(). + */ + bpf_map_put(&st_map->map); + + mutex_unlock(&update_mutex); + + return 0; +} + static const struct bpf_link_ops bpf_struct_ops_map_lops = { .dealloc = bpf_struct_ops_map_link_dealloc, + .detach = bpf_struct_ops_map_link_detach, .show_fdinfo = bpf_struct_ops_map_link_show_fdinfo, .fill_link_info = bpf_struct_ops_map_link_fill_link_info, .update_map = bpf_struct_ops_map_link_update, @@ -1176,13 +1208,22 @@ int bpf_struct_ops_link_create(union bpf_attr *attr) if (err) goto err_out; + /* Init link->map before calling reg() in case being detached + * immediately. + */ + RCU_INIT_POINTER(link->map, map); + + mutex_lock(&update_mutex); err = st_map->st_ops_desc->st_ops->reg(st_map->kvalue.data, &link->link); if (err) { + RCU_INIT_POINTER(link->map, NULL); + mutex_unlock(&update_mutex); bpf_link_cleanup(&link_primer); + /* The link has been free by bpf_link_cleanup() */ link = NULL; goto err_out; } - RCU_INIT_POINTER(link->map, map); + mutex_unlock(&update_mutex); return bpf_link_settle(&link_primer);
Implement the detach callback in bpf_link_ops for struct_ops so that user programs can detach a struct_ops link. The subsystems that struct_ops objects are registered to can also use this callback to detach the links being passed to them. Signed-off-by: Kui-Feng Lee <thinker.li@gmail.com> --- kernel/bpf/bpf_struct_ops.c | 53 ++++++++++++++++++++++++++++++++----- 1 file changed, 47 insertions(+), 6 deletions(-)