Message ID | 20220720151234.3873008-2-jiri@resnulli.us (mailing list archive) |
---|---|
State | Changes Requested |
Delegated to: | Netdev Maintainers |
Headers | show |
Series | mlxsw: Implement dev info and dev flash for line cards | expand |
> -----Original Message----- > From: Jiri Pirko <jiri@resnulli.us> > Sent: Wednesday, July 20, 2022 8:12 AM > To: netdev@vger.kernel.org > Cc: davem@davemloft.net; kuba@kernel.org; idosch@nvidia.com; > petrm@nvidia.com; pabeni@redhat.com; edumazet@google.com; > mlxsw@nvidia.com; saeedm@nvidia.com; snelson@pensando.io > Subject: [patch net-next v3 01/11] net: devlink: make sure that devlink_try_get() > works with valid pointer during xarray iteration > > From: Jiri Pirko <jiri@nvidia.com> > > Remove dependency on devlink_mutex during devlinks xarray iteration. > > The reason is that devlink_register/unregister() functions taking > devlink_mutex would deadlock during devlink reload operation of devlink > instance which registers/unregisters nested devlink instances. > > The devlinks xarray consistency is ensured internally by xarray. > There is a reference taken when working with devlink using > devlink_try_get(). But there is no guarantee that devlink pointer > picked during xarray iteration is not freed before devlink_try_get() > is called. > > Make sure that devlink_try_get() works with valid pointer. > Achieve it by: > 1) Splitting devlink_put() so the completion is sent only > after grace period. Completion unblocks the devlink_unregister() > routine, which is followed-up by devlink_free() > 2) Iterate the devlink xarray holding RCU read lock. > > Signed-off-by: Jiri Pirko <jiri@nvidia.com> This makes sense as long as its ok to drop the rcu_read_lock while in the body of the xa loops. That feels a bit odd to me... 
> --- > v2->v3: > - s/enf/end/ in devlink_put() comment > - added missing rcu_read_lock() call to info_get_dumpit() > - extended patch description by motivation > - removed an extra "by" from patch description > v1->v2: > - new patch (originally part of different patchset) > --- > net/core/devlink.c | 114 ++++++++++++++++++++++++++++++++++++++------- > 1 file changed, 96 insertions(+), 18 deletions(-) > > diff --git a/net/core/devlink.c b/net/core/devlink.c > index 98d79feeb3dc..6a3931a8e338 100644 > --- a/net/core/devlink.c > +++ b/net/core/devlink.c > @@ -70,6 +70,7 @@ struct devlink { > u8 reload_failed:1; > refcount_t refcount; > struct completion comp; > + struct rcu_head rcu; > char priv[] __aligned(NETDEV_ALIGN); > }; > > @@ -221,8 +222,6 @@ static DEFINE_XARRAY_FLAGS(devlinks, > XA_FLAGS_ALLOC); > /* devlink_mutex > * > * An overall lock guarding every operation coming from userspace. > - * It also guards devlink devices list and it is taken when > - * driver registers/unregisters it. > */ > static DEFINE_MUTEX(devlink_mutex); > > @@ -232,10 +231,21 @@ struct net *devlink_net(const struct devlink *devlink) > } > EXPORT_SYMBOL_GPL(devlink_net); > > +static void __devlink_put_rcu(struct rcu_head *head) > +{ > + struct devlink *devlink = container_of(head, struct devlink, rcu); > + > + complete(&devlink->comp); > +} > + > void devlink_put(struct devlink *devlink) > { > if (refcount_dec_and_test(&devlink->refcount)) > - complete(&devlink->comp); > + /* Make sure unregister operation that may await the completion > + * is unblocked only after all users are after the end of > + * RCU grace period. 
> + */ > + call_rcu(&devlink->rcu, __devlink_put_rcu); > } > > struct devlink *__must_check devlink_try_get(struct devlink *devlink) > @@ -295,6 +305,7 @@ static struct devlink *devlink_get_from_attrs(struct net > *net, > > lockdep_assert_held(&devlink_mutex); > > + rcu_read_lock(); > xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { > if (strcmp(devlink->dev->bus->name, busname) == 0 && > strcmp(dev_name(devlink->dev), devname) == 0 && > @@ -306,6 +317,7 @@ static struct devlink *devlink_get_from_attrs(struct net > *net, > > if (!found || !devlink_try_get(devlink)) > devlink = ERR_PTR(-ENODEV); > + rcu_read_unlock(); > > return devlink; > } > @@ -1329,9 +1341,11 @@ static int devlink_nl_cmd_rate_get_dumpit(struct > sk_buff *msg, > int err = 0; > > mutex_lock(&devlink_mutex); > + rcu_read_lock(); > xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { > if (!devlink_try_get(devlink)) > continue; > + rcu_read_unlock(); > > if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) > goto retry; > @@ -1358,7 +1372,9 @@ static int devlink_nl_cmd_rate_get_dumpit(struct > sk_buff *msg, > devl_unlock(devlink); > retry: > devlink_put(devlink); > + rcu_read_lock(); > } > + rcu_read_unlock(); > out: > mutex_unlock(&devlink_mutex); > if (err != -EMSGSIZE) > @@ -1432,29 +1448,32 @@ static int devlink_nl_cmd_get_dumpit(struct sk_buff > *msg, > int err; > > mutex_lock(&devlink_mutex); > + rcu_read_lock(); > xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { > if (!devlink_try_get(devlink)) > continue; > + rcu_read_unlock(); > Is it safe to rcu_read_unlock here while we're still in the middle of the array processing? What happens if something else updates the xarray? is the for_each_marked safe? 
> - if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) { > - devlink_put(devlink); > - continue; > - } > + if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) > + goto retry; > Ahh retry is at the end of the loop, so we'll just skip this one and move to the next one without needing to duplicate both devlink_put and rcu_read_lock.. ok. > - if (idx < start) { > - idx++; > - devlink_put(devlink); > - continue; > - } > + if (idx < start) > + goto inc; > > err = devlink_nl_fill(msg, devlink, DEVLINK_CMD_NEW, > NETLINK_CB(cb->skb).portid, > cb->nlh->nlmsg_seq, NLM_F_MULTI); > - devlink_put(devlink); > - if (err) > + if (err) { > + devlink_put(devlink); > goto out; > + } > +inc: > idx++; > +retry: > + devlink_put(devlink); > + rcu_read_lock(); > } > + rcu_read_unlock(); > out: > mutex_unlock(&devlink_mutex); > > @@ -1495,9 +1514,11 @@ static int devlink_nl_cmd_port_get_dumpit(struct > sk_buff *msg, > int err; > > mutex_lock(&devlink_mutex); > + rcu_read_lock(); > xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { > if (!devlink_try_get(devlink)) > continue; > + rcu_read_unlock(); > > if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) > goto retry; > @@ -1523,7 +1544,9 @@ static int devlink_nl_cmd_port_get_dumpit(struct > sk_buff *msg, > devl_unlock(devlink); > retry: > devlink_put(devlink); > + rcu_read_lock(); > } > + rcu_read_unlock(); > out: > mutex_unlock(&devlink_mutex); > > @@ -2177,9 +2200,11 @@ static int devlink_nl_cmd_linecard_get_dumpit(struct > sk_buff *msg, > int err; > > mutex_lock(&devlink_mutex); > + rcu_read_lock(); > xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { > if (!devlink_try_get(devlink)) > continue; > + rcu_read_unlock(); > > if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) > goto retry; > @@ -2208,7 +2233,9 @@ static int devlink_nl_cmd_linecard_get_dumpit(struct > sk_buff *msg, > mutex_unlock(&devlink->linecards_lock); > retry: > devlink_put(devlink); > + rcu_read_lock(); > } > + 
rcu_read_unlock(); > out: > mutex_unlock(&devlink_mutex); > > @@ -2449,9 +2476,11 @@ static int devlink_nl_cmd_sb_get_dumpit(struct > sk_buff *msg, > int err; > > mutex_lock(&devlink_mutex); > + rcu_read_lock(); > xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { > if (!devlink_try_get(devlink)) > continue; > + rcu_read_unlock(); > > if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) > goto retry; > @@ -2477,7 +2506,9 @@ static int devlink_nl_cmd_sb_get_dumpit(struct > sk_buff *msg, > devl_unlock(devlink); > retry: > devlink_put(devlink); > + rcu_read_lock(); > } > + rcu_read_unlock(); > out: > mutex_unlock(&devlink_mutex); > > @@ -2601,9 +2632,11 @@ static int devlink_nl_cmd_sb_pool_get_dumpit(struct > sk_buff *msg, > int err = 0; > > mutex_lock(&devlink_mutex); > + rcu_read_lock(); > xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { > if (!devlink_try_get(devlink)) > continue; > + rcu_read_unlock(); > > if (!net_eq(devlink_net(devlink), sock_net(msg->sk)) || > !devlink->ops->sb_pool_get) > @@ -2626,7 +2659,9 @@ static int devlink_nl_cmd_sb_pool_get_dumpit(struct > sk_buff *msg, > devl_unlock(devlink); > retry: > devlink_put(devlink); > + rcu_read_lock(); > } > + rcu_read_unlock(); > out: > mutex_unlock(&devlink_mutex); > > @@ -2822,9 +2857,11 @@ static int > devlink_nl_cmd_sb_port_pool_get_dumpit(struct sk_buff *msg, > int err = 0; > > mutex_lock(&devlink_mutex); > + rcu_read_lock(); > xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { > if (!devlink_try_get(devlink)) > continue; > + rcu_read_unlock(); > > if (!net_eq(devlink_net(devlink), sock_net(msg->sk)) || > !devlink->ops->sb_port_pool_get) > @@ -2847,7 +2884,9 @@ static int > devlink_nl_cmd_sb_port_pool_get_dumpit(struct sk_buff *msg, > devl_unlock(devlink); > retry: > devlink_put(devlink); > + rcu_read_lock(); > } > + rcu_read_unlock(); > out: > mutex_unlock(&devlink_mutex); > > @@ -3071,9 +3110,11 @@ devlink_nl_cmd_sb_tc_pool_bind_get_dumpit(struct > 
sk_buff *msg, > int err = 0; > > mutex_lock(&devlink_mutex); > + rcu_read_lock(); > xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { > if (!devlink_try_get(devlink)) > continue; > + rcu_read_unlock(); > > if (!net_eq(devlink_net(devlink), sock_net(msg->sk)) || > !devlink->ops->sb_tc_pool_bind_get) > @@ -3097,7 +3138,9 @@ devlink_nl_cmd_sb_tc_pool_bind_get_dumpit(struct > sk_buff *msg, > devl_unlock(devlink); > retry: > devlink_put(devlink); > + rcu_read_lock(); > } > + rcu_read_unlock(); > out: > mutex_unlock(&devlink_mutex); > > @@ -5158,9 +5201,11 @@ static int devlink_nl_cmd_param_get_dumpit(struct > sk_buff *msg, > int err = 0; > > mutex_lock(&devlink_mutex); > + rcu_read_lock(); > xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { > if (!devlink_try_get(devlink)) > continue; > + rcu_read_unlock(); > > if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) > goto retry; > @@ -5188,7 +5233,9 @@ static int devlink_nl_cmd_param_get_dumpit(struct > sk_buff *msg, > devl_unlock(devlink); > retry: > devlink_put(devlink); > + rcu_read_lock(); > } > + rcu_read_unlock(); > out: > mutex_unlock(&devlink_mutex); > > @@ -5393,9 +5440,11 @@ static int > devlink_nl_cmd_port_param_get_dumpit(struct sk_buff *msg, > int err = 0; > > mutex_lock(&devlink_mutex); > + rcu_read_lock(); > xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { > if (!devlink_try_get(devlink)) > continue; > + rcu_read_unlock(); > > if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) > goto retry; > @@ -5428,7 +5477,9 @@ static int > devlink_nl_cmd_port_param_get_dumpit(struct sk_buff *msg, > devl_unlock(devlink); > retry: > devlink_put(devlink); > + rcu_read_lock(); > } > + rcu_read_unlock(); > out: > mutex_unlock(&devlink_mutex); > > @@ -5977,9 +6028,11 @@ static int devlink_nl_cmd_region_get_dumpit(struct > sk_buff *msg, > int err = 0; > > mutex_lock(&devlink_mutex); > + rcu_read_lock(); > xa_for_each_marked(&devlinks, index, devlink, 
DEVLINK_REGISTERED) { > if (!devlink_try_get(devlink)) > continue; > + rcu_read_unlock(); > > if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) > goto retry; > @@ -5990,7 +6043,9 @@ static int devlink_nl_cmd_region_get_dumpit(struct > sk_buff *msg, > devlink_put(devlink); > if (err) > goto out; > + rcu_read_lock(); > } > + rcu_read_unlock(); > out: > mutex_unlock(&devlink_mutex); > cb->args[0] = idx; > @@ -6511,9 +6566,11 @@ static int devlink_nl_cmd_info_get_dumpit(struct > sk_buff *msg, > int err = 0; > > mutex_lock(&devlink_mutex); > + rcu_read_lock(); > xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { > if (!devlink_try_get(devlink)) > continue; > + rcu_read_unlock(); > > if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) > goto retry; > @@ -6531,13 +6588,16 @@ static int devlink_nl_cmd_info_get_dumpit(struct > sk_buff *msg, > err = 0; > else if (err) { > devlink_put(devlink); > + rcu_read_lock(); > break; > } > inc: > idx++; > retry: > devlink_put(devlink); > + rcu_read_lock(); > } > + rcu_read_unlock(); > mutex_unlock(&devlink_mutex); > > if (err != -EMSGSIZE) > @@ -7691,9 +7751,11 @@ devlink_nl_cmd_health_reporter_get_dumpit(struct > sk_buff *msg, > int err; > > mutex_lock(&devlink_mutex); > + rcu_read_lock(); > xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { > if (!devlink_try_get(devlink)) > continue; > + rcu_read_unlock(); > > if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) > goto retry_rep; > @@ -7719,11 +7781,13 @@ devlink_nl_cmd_health_reporter_get_dumpit(struct > sk_buff *msg, > mutex_unlock(&devlink->reporters_lock); > retry_rep: > devlink_put(devlink); > + rcu_read_lock(); > } > > xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { > if (!devlink_try_get(devlink)) > continue; > + rcu_read_unlock(); > > if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) > goto retry_port; > @@ -7754,7 +7818,9 @@ devlink_nl_cmd_health_reporter_get_dumpit(struct > sk_buff *msg, > 
devl_unlock(devlink); > retry_port: > devlink_put(devlink); > + rcu_read_lock(); > } > + rcu_read_unlock(); > out: > mutex_unlock(&devlink_mutex); > > @@ -8291,9 +8357,11 @@ static int devlink_nl_cmd_trap_get_dumpit(struct > sk_buff *msg, > int err; > > mutex_lock(&devlink_mutex); > + rcu_read_lock(); > xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { > if (!devlink_try_get(devlink)) > continue; > + rcu_read_unlock(); > > if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) > goto retry; > @@ -8319,7 +8387,9 @@ static int devlink_nl_cmd_trap_get_dumpit(struct > sk_buff *msg, > devl_unlock(devlink); > retry: > devlink_put(devlink); > + rcu_read_lock(); > } > + rcu_read_unlock(); > out: > mutex_unlock(&devlink_mutex); > > @@ -8518,9 +8588,11 @@ static int > devlink_nl_cmd_trap_group_get_dumpit(struct sk_buff *msg, > int err; > > mutex_lock(&devlink_mutex); > + rcu_read_lock(); > xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { > if (!devlink_try_get(devlink)) > continue; > + rcu_read_unlock(); > > if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) > goto retry; > @@ -8547,7 +8619,9 @@ static int > devlink_nl_cmd_trap_group_get_dumpit(struct sk_buff *msg, > devl_unlock(devlink); > retry: > devlink_put(devlink); > + rcu_read_lock(); > } > + rcu_read_unlock(); > out: > mutex_unlock(&devlink_mutex); > > @@ -8832,9 +8906,11 @@ static int > devlink_nl_cmd_trap_policer_get_dumpit(struct sk_buff *msg, > int err; > > mutex_lock(&devlink_mutex); > + rcu_read_lock(); > xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { > if (!devlink_try_get(devlink)) > continue; > + rcu_read_unlock(); > > if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) > goto retry; > @@ -8861,7 +8937,9 @@ static int > devlink_nl_cmd_trap_policer_get_dumpit(struct sk_buff *msg, > devl_unlock(devlink); > retry: > devlink_put(devlink); > + rcu_read_lock(); > } > + rcu_read_unlock(); > out: > mutex_unlock(&devlink_mutex); > > @@ -9589,10 +9667,8 @@ 
void devlink_register(struct devlink *devlink) > ASSERT_DEVLINK_NOT_REGISTERED(devlink); > /* Make sure that we are in .probe() routine */ > > - mutex_lock(&devlink_mutex); > xa_set_mark(&devlinks, devlink->index, DEVLINK_REGISTERED); > devlink_notify_register(devlink); > - mutex_unlock(&devlink_mutex); > } > EXPORT_SYMBOL_GPL(devlink_register); > > @@ -9609,10 +9685,8 @@ void devlink_unregister(struct devlink *devlink) > devlink_put(devlink); > wait_for_completion(&devlink->comp); > > - mutex_lock(&devlink_mutex); > devlink_notify_unregister(devlink); > xa_clear_mark(&devlinks, devlink->index, DEVLINK_REGISTERED); > - mutex_unlock(&devlink_mutex); > } > EXPORT_SYMBOL_GPL(devlink_unregister); > > @@ -12281,9 +12355,11 @@ static void __net_exit > devlink_pernet_pre_exit(struct net *net) > * all devlink instances from this namespace into init_net. > */ > mutex_lock(&devlink_mutex); > + rcu_read_lock(); > xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { > if (!devlink_try_get(devlink)) > continue; > + rcu_read_unlock(); > > if (!net_eq(devlink_net(devlink), net)) > goto retry; > @@ -12297,7 +12373,9 @@ static void __net_exit devlink_pernet_pre_exit(struct > net *net) > pr_warn("Failed to reload devlink instance into > init_net\n"); > retry: > devlink_put(devlink); > + rcu_read_lock(); > } > + rcu_read_unlock(); > mutex_unlock(&devlink_mutex); > } > > -- > 2.35.3
On Wed, 20 Jul 2022 17:12:24 +0200 Jiri Pirko wrote: > +static void __devlink_put_rcu(struct rcu_head *head) > +{ > + struct devlink *devlink = container_of(head, struct devlink, rcu); > + > + complete(&devlink->comp); > +} > + > void devlink_put(struct devlink *devlink) > { > if (refcount_dec_and_test(&devlink->refcount)) > - complete(&devlink->comp); > + /* Make sure unregister operation that may await the completion > + * is unblocked only after all users are after the end of > + * RCU grace period. > + */ > + call_rcu(&devlink->rcu, __devlink_put_rcu); > } Hm. I always assumed we'd just use the xa_lock(). Unmarking the instance as registered takes that lock which provides a natural barrier for others trying to take a reference. Something along these lines (untested): diff --git a/net/core/devlink.c b/net/core/devlink.c index 98d79feeb3dc..6321ea123f79 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -278,6 +278,38 @@ void devl_unlock(struct devlink *devlink) } EXPORT_SYMBOL_GPL(devl_unlock); +static struct devlink *devlink_iter_next(unsigned long *index) +{ + struct devlink *devlink; + + xa_lock(&devlinks); + devlink = xa_find_after(&devlinks, index, ULONG_MAX, + DEVLINK_REGISTERED); + if (devlink && !refcount_inc_not_zero(&devlink->refcount)) + devlink = NULL; + xa_unlock(&devlinks); + + return devlink ?: devlink_iter_next(index); +} + +static struct devlink *devlink_iter_start(unsigned long *index) +{ + struct devlink *devlink; + + xa_lock(&devlinks); + devlink = xa_find(&devlinks, index, ULONG_MAX, DEVLINK_REGISTERED); + if (devlink && !refcount_inc_not_zero(&devlink->refcount)) + devlink = NULL; + xa_unlock(&devlinks); + + return devlink ?: devlink_iter_next(index); +} + +#define devlink_for_each_get(index, entry) \ + for (index = 0, entry = devlink_iter_start(&index); \ + entry; entry = devlink_iter_next(&index)) + static struct devlink *devlink_get_from_attrs(struct net *net, struct nlattr **attrs) { @@ -1329,10 +1361,7 @@ static int 
devlink_nl_cmd_rate_get_dumpit(struct sk_buff *msg, int err = 0; mutex_lock(&devlink_mutex); - xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { - if (!devlink_try_get(devlink)) - continue; - + devlink_for_each_get(index, devlink) { if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) goto retry; etc. Plus we need to be more careful about the unregistering order, I believe the correct ordering is: clear_unmark() put() wait() notify() but I believe we'll run afoul of Leon's notification suppression. So I guess notify() has to go before clear_unmark(), but we should unmark before we wait otherwise we could live lock (once the mutex is really gone, I mean).
Thu, Jul 21, 2022 at 12:25:54AM CEST, jacob.e.keller@intel.com wrote: > > >> -----Original Message----- >> From: Jiri Pirko <jiri@resnulli.us> >> Sent: Wednesday, July 20, 2022 8:12 AM >> To: netdev@vger.kernel.org >> Cc: davem@davemloft.net; kuba@kernel.org; idosch@nvidia.com; >> petrm@nvidia.com; pabeni@redhat.com; edumazet@google.com; >> mlxsw@nvidia.com; saeedm@nvidia.com; snelson@pensando.io >> Subject: [patch net-next v3 01/11] net: devlink: make sure that devlink_try_get() >> works with valid pointer during xarray iteration >> >> From: Jiri Pirko <jiri@nvidia.com> >> >> Remove dependency on devlink_mutex during devlinks xarray iteration. >> >> The reason is that devlink_register/unregister() functions taking >> devlink_mutex would deadlock during devlink reload operation of devlink >> instance which registers/unregisters nested devlink instances. >> >> The devlinks xarray consistency is ensured internally by xarray. >> There is a reference taken when working with devlink using >> devlink_try_get(). But there is no guarantee that devlink pointer >> picked during xarray iteration is not freed before devlink_try_get() >> is called. >> >> Make sure that devlink_try_get() works with valid pointer. >> Achieve it by: >> 1) Splitting devlink_put() so the completion is sent only >> after grace period. Completion unblocks the devlink_unregister() >> routine, which is followed-up by devlink_free() >> 2) Iterate the devlink xarray holding RCU read lock. >> >> Signed-off-by: Jiri Pirko <jiri@nvidia.com> > > >This makes sense as long as its ok to drop the rcu_read_lock while in the body of the xa loops. That feels a bit odd to me... Yes, it is okay. See my comment below. 
> >> --- >> v2->v3: >> - s/enf/end/ in devlink_put() comment >> - added missing rcu_read_lock() call to info_get_dumpit() >> - extended patch description by motivation >> - removed an extra "by" from patch description >> v1->v2: >> - new patch (originally part of different patchset) >> --- >> net/core/devlink.c | 114 ++++++++++++++++++++++++++++++++++++++------- >> 1 file changed, 96 insertions(+), 18 deletions(-) >> >> diff --git a/net/core/devlink.c b/net/core/devlink.c >> index 98d79feeb3dc..6a3931a8e338 100644 >> --- a/net/core/devlink.c >> +++ b/net/core/devlink.c >> @@ -70,6 +70,7 @@ struct devlink { >> u8 reload_failed:1; >> refcount_t refcount; >> struct completion comp; >> + struct rcu_head rcu; >> char priv[] __aligned(NETDEV_ALIGN); >> }; >> >> @@ -221,8 +222,6 @@ static DEFINE_XARRAY_FLAGS(devlinks, >> XA_FLAGS_ALLOC); >> /* devlink_mutex >> * >> * An overall lock guarding every operation coming from userspace. >> - * It also guards devlink devices list and it is taken when >> - * driver registers/unregisters it. >> */ >> static DEFINE_MUTEX(devlink_mutex); >> >> @@ -232,10 +231,21 @@ struct net *devlink_net(const struct devlink *devlink) >> } >> EXPORT_SYMBOL_GPL(devlink_net); >> >> +static void __devlink_put_rcu(struct rcu_head *head) >> +{ >> + struct devlink *devlink = container_of(head, struct devlink, rcu); >> + >> + complete(&devlink->comp); >> +} >> + >> void devlink_put(struct devlink *devlink) >> { >> if (refcount_dec_and_test(&devlink->refcount)) >> - complete(&devlink->comp); >> + /* Make sure unregister operation that may await the completion >> + * is unblocked only after all users are after the end of >> + * RCU grace period. 
>> + */ >> + call_rcu(&devlink->rcu, __devlink_put_rcu); >> } >> >> struct devlink *__must_check devlink_try_get(struct devlink *devlink) >> @@ -295,6 +305,7 @@ static struct devlink *devlink_get_from_attrs(struct net >> *net, >> >> lockdep_assert_held(&devlink_mutex); >> >> + rcu_read_lock(); >> xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { >> if (strcmp(devlink->dev->bus->name, busname) == 0 && >> strcmp(dev_name(devlink->dev), devname) == 0 && >> @@ -306,6 +317,7 @@ static struct devlink *devlink_get_from_attrs(struct net >> *net, >> >> if (!found || !devlink_try_get(devlink)) >> devlink = ERR_PTR(-ENODEV); >> + rcu_read_unlock(); >> >> return devlink; >> } >> @@ -1329,9 +1341,11 @@ static int devlink_nl_cmd_rate_get_dumpit(struct >> sk_buff *msg, >> int err = 0; >> >> mutex_lock(&devlink_mutex); >> + rcu_read_lock(); >> xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { >> if (!devlink_try_get(devlink)) >> continue; >> + rcu_read_unlock(); >> >> if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) >> goto retry; >> @@ -1358,7 +1372,9 @@ static int devlink_nl_cmd_rate_get_dumpit(struct >> sk_buff *msg, >> devl_unlock(devlink); >> retry: >> devlink_put(devlink); >> + rcu_read_lock(); >> } >> + rcu_read_unlock(); >> out: >> mutex_unlock(&devlink_mutex); >> if (err != -EMSGSIZE) >> @@ -1432,29 +1448,32 @@ static int devlink_nl_cmd_get_dumpit(struct sk_buff >> *msg, >> int err; >> >> mutex_lock(&devlink_mutex); >> + rcu_read_lock(); >> xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { >> if (!devlink_try_get(devlink)) >> continue; >> + rcu_read_unlock(); >> > >Is it safe to rcu_read_unlock here while we're still in the middle of the array processing? What happens if something else updates the xarray? is the for_each_marked safe? Sure, you don't need to hold rcu_read_lock during call to xa_for_each_marked. The consistency of xarray is itself guaranteed. 
The only reason to take rcu_read_lock outside is that the devlink pointer which is rcu_dereference_check()'ed inside xa_for_each_marked() is still valid once we devlink_try_get() it. > >> - if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) { >> - devlink_put(devlink); >> - continue; >> - } >> + if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) >> + goto retry; >> > >Ahh retry is at the end of the loop, so we'll just skip this one and move to the next one without needing to duplicate both devlink_put and rcu_read_lock.. ok. Yep. > >> - if (idx < start) { >> - idx++; >> - devlink_put(devlink); >> - continue; >> - } >> + if (idx < start) >> + goto inc; >> >> err = devlink_nl_fill(msg, devlink, DEVLINK_CMD_NEW, >> NETLINK_CB(cb->skb).portid, >> cb->nlh->nlmsg_seq, NLM_F_MULTI); >> - devlink_put(devlink); >> - if (err) >> + if (err) { >> + devlink_put(devlink); >> goto out; >> + } >> +inc: >> idx++; >> +retry: >> + devlink_put(devlink); >> + rcu_read_lock(); >> } >> + rcu_read_unlock(); >> out: >> mutex_unlock(&devlink_mutex); >> [...]
Thu, Jul 21, 2022 at 02:49:53AM CEST, kuba@kernel.org wrote: >On Wed, 20 Jul 2022 17:12:24 +0200 Jiri Pirko wrote: >> +static void __devlink_put_rcu(struct rcu_head *head) >> +{ >> + struct devlink *devlink = container_of(head, struct devlink, rcu); >> + >> + complete(&devlink->comp); >> +} >> + >> void devlink_put(struct devlink *devlink) >> { >> if (refcount_dec_and_test(&devlink->refcount)) >> - complete(&devlink->comp); >> + /* Make sure unregister operation that may await the completion >> + * is unblocked only after all users are after the end of >> + * RCU grace period. >> + */ >> + call_rcu(&devlink->rcu, __devlink_put_rcu); >> } > >Hm. I always assumed we'd just use the xa_lock(). Unmarking the >instance as registered takes that lock which provides a natural >barrier for others trying to take a reference. I guess that the xa_lock() scheme could work, as far as I see it. But what's wrong with the rcu scheme? I actually find it quite neat. No need to have another odd iteration helpers. We just benefit of xa_array rcu internals to make sure devlink pointer is valid at the time we make a reference. Very clear. 
> >Something along these lines (untested): > >diff --git a/net/core/devlink.c b/net/core/devlink.c >index 98d79feeb3dc..6321ea123f79 100644 >--- a/net/core/devlink.c >+++ b/net/core/devlink.c >@@ -278,6 +278,38 @@ void devl_unlock(struct devlink *devlink) > } > EXPORT_SYMBOL_GPL(devl_unlock); > >+static struct devlink *devlink_iter_next(unsigned long *index) >+{ >+ struct devlink *devlink; >+ >+ xa_lock(&devlinks); >+ devlink = xa_find_after(&devlinks, index, ULONG_MAX, >+ DEVLINK_REGISTERED); >+ if (devlink && !refcount_inc_not_zero(&devlink->refcount)) >+ devlink = NULL; >+ xa_unlock(&devlinks); >+ >+ return devlink ?: devlink_iter_next(index); >+} >+ >+static struct devlink *devlink_iter_start(unsigned long *index) >+{ >+ struct devlink *devlink; >+ >+ xa_lock(&devlinks); >+ devlink = xa_find(&devlinks, index, ULONG_MAX, DEVLINK_REGISTERED); >+ if (devlink && !refcount_inc_not_zero(&devlink->refcount)) >+ devlink = NULL; >+ xa_unlock(&devlinks); >+ >+ return devlink ?: devlink_iter_next(index); >+} >+ >+#define devlink_for_each_get(index, entry) \ >+ for (index = 0, entry = devlink_iter_start(&index); \ >+ entry; entry = devlink_iter_next(&index)) >+ > static struct devlink *devlink_get_from_attrs(struct net *net, > struct nlattr **attrs) > { >@@ -1329,10 +1361,7 @@ static int devlink_nl_cmd_rate_get_dumpit(struct sk_buff *msg, > int err = 0; > > mutex_lock(&devlink_mutex); >- xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { >- if (!devlink_try_get(devlink)) >- continue; >- >+ devlink_for_each_get(index, devlink) { > if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) > goto retry; > >etc. > >Plus we need to be more careful about the unregistering order, I >believe the correct ordering is: > > clear_unmark() > put() > wait() > notify() > >but I believe we'll run afoul of Leon's notification suppression. 
>So I guess notify() has to go before clear_unmark(), but we should >unmark before we wait otherwise we could live lock (once the mutex >is really gone, I mean). Will check.
On Thu, 21 Jul 2022 07:51:37 +0200 Jiri Pirko wrote: > >Hm. I always assumed we'd just use the xa_lock(). Unmarking the > >instance as registered takes that lock which provides a natural > >barrier for others trying to take a reference. > > I guess that the xa_lock() scheme could work, as far as I see it. But > what's wrong with the rcu scheme? I actually find it quite neat. No need > to have another odd iteration helpers. We just benefit of xa_array rcu > internals to make sure devlink pointer is valid at the time we make a > reference. Very clear. Nothing strongly against the RCU scheme, TBH. Just didn't expect it. I can concoct some argument like it's one extra sync primitive we haven't had to think about in devlink so far, but really if you prefer RCU, I don't mind. I do like the idea of wrapping the iteration into our own helper, though. It contains the implementation details of the iteration nicely. I didn't look in sufficient detail but I would have even considered rolling the namespace check into it for dump.
Thu, Jul 21, 2022 at 08:22:58AM CEST, kuba@kernel.org wrote: >On Thu, 21 Jul 2022 07:51:37 +0200 Jiri Pirko wrote: >> >Hm. I always assumed we'd just use the xa_lock(). Unmarking the >> >instance as registered takes that lock which provides a natural >> >barrier for others trying to take a reference. >> >> I guess that the xa_lock() scheme could work, as far as I see it. But >> what's wrong with the rcu scheme? I actually find it quite neat. No need >> to have another odd iteration helpers. We just benefit of xa_array rcu >> internals to make sure devlink pointer is valid at the time we make a >> reference. Very clear. > >Nothing strongly against the RCU scheme, TBH. Just didn't expect it. >I can concoct some argument like it's one extra sync primitive we >haven't had to think about in devlink so far, but really if you prefer >RCU, I don't mind. > >I do like the idea of wrapping the iteration into our own helper, tho. >Contains the implementation details of the iteration nicely. I didn't >look in sufficient detail but I would have even considered rolling the >namespace check into it for dump. Hmm, okay. I will think about helpers to contain the iteration/rcu/refget stuff. Thanks!
> -----Original Message----- > From: Jiri Pirko <jiri@resnulli.us> > Sent: Wednesday, July 20, 2022 10:45 PM > To: Keller, Jacob E <jacob.e.keller@intel.com> > Cc: netdev@vger.kernel.org; davem@davemloft.net; kuba@kernel.org; > idosch@nvidia.com; petrm@nvidia.com; pabeni@redhat.com; > edumazet@google.com; mlxsw@nvidia.com; saeedm@nvidia.com; > snelson@pensando.io > Subject: Re: [patch net-next v3 01/11] net: devlink: make sure that > devlink_try_get() works with valid pointer during xarray iteration > > >Is it safe to rcu_read_unlock here while we're still in the middle of the array > processing? What happens if something else updates the xarray? is the > for_each_marked safe? > > Sure, you don't need to hold rcu_read_lock during call to xa_for_each_marked. > The consistency of xarray is itself guaranteed. The only reason to take > rcu_read_lock outside is that the devlink pointer which is > rcu_dereference_check()'ed inside xa_for_each_marked() is still valid > once we devlink_try_get() it. > Excellent, ok. Basically we need the RCU for protecting just the pointer until we get a reference to it separately. Thanks! In that case: Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
Thu, Jul 21, 2022 at 02:49:53AM CEST, kuba@kernel.org wrote: >On Wed, 20 Jul 2022 17:12:24 +0200 Jiri Pirko wrote: >> +static void __devlink_put_rcu(struct rcu_head *head) >> +{ >> + struct devlink *devlink = container_of(head, struct devlink, rcu); >> + >> + complete(&devlink->comp); >> +} >> + >> void devlink_put(struct devlink *devlink) >> { >> if (refcount_dec_and_test(&devlink->refcount)) >> - complete(&devlink->comp); >> + /* Make sure unregister operation that may await the completion >> + * is unblocked only after all users are after the end of >> + * RCU grace period. >> + */ >> + call_rcu(&devlink->rcu, __devlink_put_rcu); >> } > >Hm. I always assumed we'd just use the xa_lock(). Unmarking the >instance as registered takes that lock which provides a natural >barrier for others trying to take a reference. > >Something along these lines (untested): > >diff --git a/net/core/devlink.c b/net/core/devlink.c >index 98d79feeb3dc..6321ea123f79 100644 >--- a/net/core/devlink.c >+++ b/net/core/devlink.c >@@ -278,6 +278,38 @@ void devl_unlock(struct devlink *devlink) > } > EXPORT_SYMBOL_GPL(devl_unlock); > >+static struct devlink *devlink_iter_next(unsigned long *index) >+{ >+ struct devlink *devlink; >+ >+ xa_lock(&devlinks); >+ devlink = xa_find_after(&devlinks, index, ULONG_MAX, >+ DEVLINK_REGISTERED); >+ if (devlink && !refcount_inc_not_zero(&devlink->refcount)) >+ devlink = NULL; >+ xa_unlock(&devlinks); >+ >+ return devlink ?: devlink_iter_next(index); >+} >+ >+static struct devlink *devlink_iter_start(unsigned long *index) >+{ >+ struct devlink *devlink; >+ >+ xa_lock(&devlinks); >+ devlink = xa_find(&devlinks, index, ULONG_MAX, DEVLINK_REGISTERED); >+ if (devlink && !refcount_inc_not_zero(&devlink->refcount)) >+ devlink = NULL; >+ xa_unlock(&devlinks); >+ >+ return devlink ?: devlink_iter_next(index); >+} >+ >+#define devlink_for_each_get(index, entry) \ >+ for (index = 0, entry = devlink_iter_start(&index); \ >+ entry; entry = devlink_iter_next(&index)) >+ > 
static struct devlink *devlink_get_from_attrs(struct net *net, > struct nlattr **attrs) > { >@@ -1329,10 +1361,7 @@ static int devlink_nl_cmd_rate_get_dumpit(struct sk_buff *msg, > int err = 0; > > mutex_lock(&devlink_mutex); >- xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { >- if (!devlink_try_get(devlink)) >- continue; >- >+ devlink_for_each_get(index, devlink) { > if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) > goto retry; > >etc. > >Plus we need to be more careful about the unregistering order, I >believe the correct ordering is: > > clear_unmark() > put() > wait() > notify() Fixed. > >but I believe we'll run afoul of Leon's notification suppression. >So I guess notify() has to go before clear_unmark(), but we should >unmark before we wait otherwise we could live lock (once the mutex >is really gone, I mean).
Thu, Jul 21, 2022 at 08:55:04PM CEST, jacob.e.keller@intel.com wrote: > > >> -----Original Message----- >> From: Jiri Pirko <jiri@resnulli.us> >> Sent: Wednesday, July 20, 2022 10:45 PM >> To: Keller, Jacob E <jacob.e.keller@intel.com> >> Cc: netdev@vger.kernel.org; davem@davemloft.net; kuba@kernel.org; >> idosch@nvidia.com; petrm@nvidia.com; pabeni@redhat.com; >> edumazet@google.com; mlxsw@nvidia.com; saeedm@nvidia.com; >> snelson@pensando.io >> Subject: Re: [patch net-next v3 01/11] net: devlink: make sure that >> devlink_try_get() works with valid pointer during xarray iteration >> >> >Is it safe to rcu_read_unlock here while we're still in the middle of the array >> processing? What happens if something else updates the xarray? is the >> for_each_marked safe? >> >> Sure, you don't need to hold rcu_read_lock during call to xa_for_each_marked. >> The consistency of xarray is itself guaranteed. The only reason to take >> rcu_read_lock outside is that the devlink pointer which is >> rcu_dereference_check()'ed inside xa_for_each_marked() is still valid >> once we devlink_try_get() it. >> > >Excellent, ok. Basically we need the RCU for protecting just the pointer until we get a reference to it separately. Yep. > >Thanks! > >In that case: > >Reviewed-by: Jacob Keller <jacob.e.keller@intel.com> Thanks. I will send v4 soon wrapping this up into helper as Jakub requested.
Thu, Jul 21, 2022 at 02:49:53AM CEST, kuba@kernel.org wrote: >On Wed, 20 Jul 2022 17:12:24 +0200 Jiri Pirko wrote: [...] >Plus we need to be more careful about the unregistering order, I >believe the correct ordering is: > > clear_unmark() > put() > wait() > notify() > >but I believe we'll run afoul of Leon's notification suppression. >So I guess notify() has to go before clear_unmark(), but we should >unmark before we wait otherwise we could live lock (once the mutex >is really gone, I mean). Kuba, could you elaborate a bit more about the live lock problem here? Thanks!
On Fri, 22 Jul 2022 17:50:17 +0200 Jiri Pirko wrote: > >Plus we need to be more careful about the unregistering order, I > >believe the correct ordering is: > > > > clear_unmark() > > put() > > wait() > > notify() > > > >but I believe we'll run afoul of Leon's notification suppression. > >So I guess notify() has to go before clear_unmark(), but we should > >unmark before we wait otherwise we could live lock (once the mutex > >is really gone, I mean). > > Kuba, could you elaborate a bit more about the live lock problem here? Once the devlink_mutex lock is gone - (unprivileged) user space dumping devlink objects could prevent any de-registration from happening because it can keep the reference of the instance up. So we should mark the instance as not REGISTERED first, then go to wait. Pretty theoretical, I guess, but I wanted to mention it in case you can figure out a solution along the way :S I don't think it's a blocker right now since we still have the mutex.
Fri, Jul 22, 2022 at 08:23:48PM CEST, kuba@kernel.org wrote: >On Fri, 22 Jul 2022 17:50:17 +0200 Jiri Pirko wrote: >> >Plus we need to be more careful about the unregistering order, I >> >believe the correct ordering is: >> > >> > clear_unmark() >> > put() >> > wait() >> > notify() >> > >> >but I believe we'll run afoul of Leon's notification suppression. >> >So I guess notify() has to go before clear_unmark(), but we should >> >unmark before we wait otherwise we could live lock (once the mutex >> >is really gone, I mean). >> >> Kuba, could you elaborate a bit more about the live lock problem here? > >Once the devlink_mutex lock is gone - (unprivileged) user space dumping >devlink objects could prevent any de-registration from happening >because it can keep the reference of the instance up. So we should mark >the instance as not REGISTERED first, then go to wait. Yeah, that is what I thought. I resolved it as you wrote. I removed the WARN_ON from devlink_notify(). It is really not good for anything anyway. > >Pretty theoretical, I guess, but I wanted to mention it in case you can >figure out a solution along the way :S I don't think it's a blocker >right now since we still have the mutex. Got it.
Sat, Jul 23, 2022 at 05:41:08PM CEST, jiri@resnulli.us wrote: >Fri, Jul 22, 2022 at 08:23:48PM CEST, kuba@kernel.org wrote: >>On Fri, 22 Jul 2022 17:50:17 +0200 Jiri Pirko wrote: >>> >Plus we need to be more careful about the unregistering order, I >>> >believe the correct ordering is: >>> > >>> > clear_unmark() >>> > put() >>> > wait() >>> > notify() >>> > >>> >but I believe we'll run afoul of Leon's notification suppression. >>> >So I guess notify() has to go before clear_unmark(), but we should >>> >unmark before we wait otherwise we could live lock (once the mutex >>> >is really gone, I mean). >>> >>> Kuba, could you elaborate a bit more about the live lock problem here? >> >>Once the devlink_mutex lock is gone - (unprivileged) user space dumping >>devlink objects could prevent any de-registration from happening >>because it can keep the reference of the instance up. So we should mark >>the instance as not REGISTERED first, then go to wait. > >Yeah, that is what I thought. I resolved it as you wrote. I removed the >WARN_ON from devlink_notify(). It is really not good for anything >anyway. The check for "registered" is in more notifications. I will handle this in the next patchset, you are right, it is not needed to handle here. Sending v4. Thanks! > > >> >>Pretty theoretical, I guess, but I wanted to mention it in case you can >>figure out a solution along the way :S I don't think it's a blocker >>right now since we still have the mutex. > >Got it.
diff --git a/net/core/devlink.c b/net/core/devlink.c index 98d79feeb3dc..6a3931a8e338 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -70,6 +70,7 @@ struct devlink { u8 reload_failed:1; refcount_t refcount; struct completion comp; + struct rcu_head rcu; char priv[] __aligned(NETDEV_ALIGN); }; @@ -221,8 +222,6 @@ static DEFINE_XARRAY_FLAGS(devlinks, XA_FLAGS_ALLOC); /* devlink_mutex * * An overall lock guarding every operation coming from userspace. - * It also guards devlink devices list and it is taken when - * driver registers/unregisters it. */ static DEFINE_MUTEX(devlink_mutex); @@ -232,10 +231,21 @@ struct net *devlink_net(const struct devlink *devlink) } EXPORT_SYMBOL_GPL(devlink_net); +static void __devlink_put_rcu(struct rcu_head *head) +{ + struct devlink *devlink = container_of(head, struct devlink, rcu); + + complete(&devlink->comp); +} + void devlink_put(struct devlink *devlink) { if (refcount_dec_and_test(&devlink->refcount)) - complete(&devlink->comp); + /* Make sure unregister operation that may await the completion + * is unblocked only after all users are after the end of + * RCU grace period. 
+ */ + call_rcu(&devlink->rcu, __devlink_put_rcu); } struct devlink *__must_check devlink_try_get(struct devlink *devlink) @@ -295,6 +305,7 @@ static struct devlink *devlink_get_from_attrs(struct net *net, lockdep_assert_held(&devlink_mutex); + rcu_read_lock(); xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { if (strcmp(devlink->dev->bus->name, busname) == 0 && strcmp(dev_name(devlink->dev), devname) == 0 && @@ -306,6 +317,7 @@ static struct devlink *devlink_get_from_attrs(struct net *net, if (!found || !devlink_try_get(devlink)) devlink = ERR_PTR(-ENODEV); + rcu_read_unlock(); return devlink; } @@ -1329,9 +1341,11 @@ static int devlink_nl_cmd_rate_get_dumpit(struct sk_buff *msg, int err = 0; mutex_lock(&devlink_mutex); + rcu_read_lock(); xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { if (!devlink_try_get(devlink)) continue; + rcu_read_unlock(); if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) goto retry; @@ -1358,7 +1372,9 @@ static int devlink_nl_cmd_rate_get_dumpit(struct sk_buff *msg, devl_unlock(devlink); retry: devlink_put(devlink); + rcu_read_lock(); } + rcu_read_unlock(); out: mutex_unlock(&devlink_mutex); if (err != -EMSGSIZE) @@ -1432,29 +1448,32 @@ static int devlink_nl_cmd_get_dumpit(struct sk_buff *msg, int err; mutex_lock(&devlink_mutex); + rcu_read_lock(); xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { if (!devlink_try_get(devlink)) continue; + rcu_read_unlock(); - if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) { - devlink_put(devlink); - continue; - } + if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + goto retry; - if (idx < start) { - idx++; - devlink_put(devlink); - continue; - } + if (idx < start) + goto inc; err = devlink_nl_fill(msg, devlink, DEVLINK_CMD_NEW, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI); - devlink_put(devlink); - if (err) + if (err) { + devlink_put(devlink); goto out; + } +inc: idx++; +retry: + devlink_put(devlink); + 
rcu_read_lock(); } + rcu_read_unlock(); out: mutex_unlock(&devlink_mutex); @@ -1495,9 +1514,11 @@ static int devlink_nl_cmd_port_get_dumpit(struct sk_buff *msg, int err; mutex_lock(&devlink_mutex); + rcu_read_lock(); xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { if (!devlink_try_get(devlink)) continue; + rcu_read_unlock(); if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) goto retry; @@ -1523,7 +1544,9 @@ static int devlink_nl_cmd_port_get_dumpit(struct sk_buff *msg, devl_unlock(devlink); retry: devlink_put(devlink); + rcu_read_lock(); } + rcu_read_unlock(); out: mutex_unlock(&devlink_mutex); @@ -2177,9 +2200,11 @@ static int devlink_nl_cmd_linecard_get_dumpit(struct sk_buff *msg, int err; mutex_lock(&devlink_mutex); + rcu_read_lock(); xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { if (!devlink_try_get(devlink)) continue; + rcu_read_unlock(); if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) goto retry; @@ -2208,7 +2233,9 @@ static int devlink_nl_cmd_linecard_get_dumpit(struct sk_buff *msg, mutex_unlock(&devlink->linecards_lock); retry: devlink_put(devlink); + rcu_read_lock(); } + rcu_read_unlock(); out: mutex_unlock(&devlink_mutex); @@ -2449,9 +2476,11 @@ static int devlink_nl_cmd_sb_get_dumpit(struct sk_buff *msg, int err; mutex_lock(&devlink_mutex); + rcu_read_lock(); xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { if (!devlink_try_get(devlink)) continue; + rcu_read_unlock(); if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) goto retry; @@ -2477,7 +2506,9 @@ static int devlink_nl_cmd_sb_get_dumpit(struct sk_buff *msg, devl_unlock(devlink); retry: devlink_put(devlink); + rcu_read_lock(); } + rcu_read_unlock(); out: mutex_unlock(&devlink_mutex); @@ -2601,9 +2632,11 @@ static int devlink_nl_cmd_sb_pool_get_dumpit(struct sk_buff *msg, int err = 0; mutex_lock(&devlink_mutex); + rcu_read_lock(); xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { if (!devlink_try_get(devlink)) 
continue; + rcu_read_unlock(); if (!net_eq(devlink_net(devlink), sock_net(msg->sk)) || !devlink->ops->sb_pool_get) @@ -2626,7 +2659,9 @@ static int devlink_nl_cmd_sb_pool_get_dumpit(struct sk_buff *msg, devl_unlock(devlink); retry: devlink_put(devlink); + rcu_read_lock(); } + rcu_read_unlock(); out: mutex_unlock(&devlink_mutex); @@ -2822,9 +2857,11 @@ static int devlink_nl_cmd_sb_port_pool_get_dumpit(struct sk_buff *msg, int err = 0; mutex_lock(&devlink_mutex); + rcu_read_lock(); xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { if (!devlink_try_get(devlink)) continue; + rcu_read_unlock(); if (!net_eq(devlink_net(devlink), sock_net(msg->sk)) || !devlink->ops->sb_port_pool_get) @@ -2847,7 +2884,9 @@ static int devlink_nl_cmd_sb_port_pool_get_dumpit(struct sk_buff *msg, devl_unlock(devlink); retry: devlink_put(devlink); + rcu_read_lock(); } + rcu_read_unlock(); out: mutex_unlock(&devlink_mutex); @@ -3071,9 +3110,11 @@ devlink_nl_cmd_sb_tc_pool_bind_get_dumpit(struct sk_buff *msg, int err = 0; mutex_lock(&devlink_mutex); + rcu_read_lock(); xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { if (!devlink_try_get(devlink)) continue; + rcu_read_unlock(); if (!net_eq(devlink_net(devlink), sock_net(msg->sk)) || !devlink->ops->sb_tc_pool_bind_get) @@ -3097,7 +3138,9 @@ devlink_nl_cmd_sb_tc_pool_bind_get_dumpit(struct sk_buff *msg, devl_unlock(devlink); retry: devlink_put(devlink); + rcu_read_lock(); } + rcu_read_unlock(); out: mutex_unlock(&devlink_mutex); @@ -5158,9 +5201,11 @@ static int devlink_nl_cmd_param_get_dumpit(struct sk_buff *msg, int err = 0; mutex_lock(&devlink_mutex); + rcu_read_lock(); xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { if (!devlink_try_get(devlink)) continue; + rcu_read_unlock(); if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) goto retry; @@ -5188,7 +5233,9 @@ static int devlink_nl_cmd_param_get_dumpit(struct sk_buff *msg, devl_unlock(devlink); retry: devlink_put(devlink); + 
rcu_read_lock(); } + rcu_read_unlock(); out: mutex_unlock(&devlink_mutex); @@ -5393,9 +5440,11 @@ static int devlink_nl_cmd_port_param_get_dumpit(struct sk_buff *msg, int err = 0; mutex_lock(&devlink_mutex); + rcu_read_lock(); xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { if (!devlink_try_get(devlink)) continue; + rcu_read_unlock(); if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) goto retry; @@ -5428,7 +5477,9 @@ static int devlink_nl_cmd_port_param_get_dumpit(struct sk_buff *msg, devl_unlock(devlink); retry: devlink_put(devlink); + rcu_read_lock(); } + rcu_read_unlock(); out: mutex_unlock(&devlink_mutex); @@ -5977,9 +6028,11 @@ static int devlink_nl_cmd_region_get_dumpit(struct sk_buff *msg, int err = 0; mutex_lock(&devlink_mutex); + rcu_read_lock(); xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { if (!devlink_try_get(devlink)) continue; + rcu_read_unlock(); if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) goto retry; @@ -5990,7 +6043,9 @@ static int devlink_nl_cmd_region_get_dumpit(struct sk_buff *msg, devlink_put(devlink); if (err) goto out; + rcu_read_lock(); } + rcu_read_unlock(); out: mutex_unlock(&devlink_mutex); cb->args[0] = idx; @@ -6511,9 +6566,11 @@ static int devlink_nl_cmd_info_get_dumpit(struct sk_buff *msg, int err = 0; mutex_lock(&devlink_mutex); + rcu_read_lock(); xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { if (!devlink_try_get(devlink)) continue; + rcu_read_unlock(); if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) goto retry; @@ -6531,13 +6588,16 @@ static int devlink_nl_cmd_info_get_dumpit(struct sk_buff *msg, err = 0; else if (err) { devlink_put(devlink); + rcu_read_lock(); break; } inc: idx++; retry: devlink_put(devlink); + rcu_read_lock(); } + rcu_read_unlock(); mutex_unlock(&devlink_mutex); if (err != -EMSGSIZE) @@ -7691,9 +7751,11 @@ devlink_nl_cmd_health_reporter_get_dumpit(struct sk_buff *msg, int err; mutex_lock(&devlink_mutex); + rcu_read_lock(); 
xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { if (!devlink_try_get(devlink)) continue; + rcu_read_unlock(); if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) goto retry_rep; @@ -7719,11 +7781,13 @@ devlink_nl_cmd_health_reporter_get_dumpit(struct sk_buff *msg, mutex_unlock(&devlink->reporters_lock); retry_rep: devlink_put(devlink); + rcu_read_lock(); } xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { if (!devlink_try_get(devlink)) continue; + rcu_read_unlock(); if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) goto retry_port; @@ -7754,7 +7818,9 @@ devlink_nl_cmd_health_reporter_get_dumpit(struct sk_buff *msg, devl_unlock(devlink); retry_port: devlink_put(devlink); + rcu_read_lock(); } + rcu_read_unlock(); out: mutex_unlock(&devlink_mutex); @@ -8291,9 +8357,11 @@ static int devlink_nl_cmd_trap_get_dumpit(struct sk_buff *msg, int err; mutex_lock(&devlink_mutex); + rcu_read_lock(); xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { if (!devlink_try_get(devlink)) continue; + rcu_read_unlock(); if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) goto retry; @@ -8319,7 +8387,9 @@ static int devlink_nl_cmd_trap_get_dumpit(struct sk_buff *msg, devl_unlock(devlink); retry: devlink_put(devlink); + rcu_read_lock(); } + rcu_read_unlock(); out: mutex_unlock(&devlink_mutex); @@ -8518,9 +8588,11 @@ static int devlink_nl_cmd_trap_group_get_dumpit(struct sk_buff *msg, int err; mutex_lock(&devlink_mutex); + rcu_read_lock(); xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { if (!devlink_try_get(devlink)) continue; + rcu_read_unlock(); if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) goto retry; @@ -8547,7 +8619,9 @@ static int devlink_nl_cmd_trap_group_get_dumpit(struct sk_buff *msg, devl_unlock(devlink); retry: devlink_put(devlink); + rcu_read_lock(); } + rcu_read_unlock(); out: mutex_unlock(&devlink_mutex); @@ -8832,9 +8906,11 @@ static int devlink_nl_cmd_trap_policer_get_dumpit(struct 
sk_buff *msg, int err; mutex_lock(&devlink_mutex); + rcu_read_lock(); xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { if (!devlink_try_get(devlink)) continue; + rcu_read_unlock(); if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) goto retry; @@ -8861,7 +8937,9 @@ static int devlink_nl_cmd_trap_policer_get_dumpit(struct sk_buff *msg, devl_unlock(devlink); retry: devlink_put(devlink); + rcu_read_lock(); } + rcu_read_unlock(); out: mutex_unlock(&devlink_mutex); @@ -9589,10 +9667,8 @@ void devlink_register(struct devlink *devlink) ASSERT_DEVLINK_NOT_REGISTERED(devlink); /* Make sure that we are in .probe() routine */ - mutex_lock(&devlink_mutex); xa_set_mark(&devlinks, devlink->index, DEVLINK_REGISTERED); devlink_notify_register(devlink); - mutex_unlock(&devlink_mutex); } EXPORT_SYMBOL_GPL(devlink_register); @@ -9609,10 +9685,8 @@ void devlink_unregister(struct devlink *devlink) devlink_put(devlink); wait_for_completion(&devlink->comp); - mutex_lock(&devlink_mutex); devlink_notify_unregister(devlink); xa_clear_mark(&devlinks, devlink->index, DEVLINK_REGISTERED); - mutex_unlock(&devlink_mutex); } EXPORT_SYMBOL_GPL(devlink_unregister); @@ -12281,9 +12355,11 @@ static void __net_exit devlink_pernet_pre_exit(struct net *net) * all devlink instances from this namespace into init_net. */ mutex_lock(&devlink_mutex); + rcu_read_lock(); xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { if (!devlink_try_get(devlink)) continue; + rcu_read_unlock(); if (!net_eq(devlink_net(devlink), net)) goto retry; @@ -12297,7 +12373,9 @@ static void __net_exit devlink_pernet_pre_exit(struct net *net) pr_warn("Failed to reload devlink instance into init_net\n"); retry: devlink_put(devlink); + rcu_read_lock(); } + rcu_read_unlock(); mutex_unlock(&devlink_mutex); }