[net-next] devlink: don't take instance lock for nested handle put

Message ID 20231003074349.1435667-1-jiri@resnulli.us (mailing list archive)
State Superseded
Delegated to: Netdev Maintainers
Series [net-next] devlink: don't take instance lock for nested handle put

Checks

Context Check Description
netdev/series_format success Single patches do not need cover letters
netdev/tree_selection success Clearly marked for net-next
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 1342 this patch: 1342
netdev/cc_maintainers success CCed 6 of 6 maintainers
netdev/build_clang success Errors and warnings before: 1364 this patch: 1364
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success Fixes tag looks correct
netdev/build_allmodconfig_warn success Errors and warnings before: 1365 this patch: 1365
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 72 lines checked
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Jiri Pirko Oct. 3, 2023, 7:43 a.m. UTC
From: Jiri Pirko <jiri@nvidia.com>

Lockdep reports the following issue:

WARNING: possible circular locking dependency detected
------------------------------------------------------
devlink/8191 is trying to acquire lock:
ffff88813f32c250 (&devlink->lock_key#14){+.+.}-{3:3}, at: devlink_rel_devlink_handle_put+0x11e/0x2d0

                           but task is already holding lock:
ffffffff8511eca8 (rtnl_mutex){+.+.}-{3:3}, at: unregister_netdev+0xe/0x20

                           which lock already depends on the new lock.

                           the existing dependency chain (in reverse order) is:

                           -> #3 (rtnl_mutex){+.+.}-{3:3}:
       lock_acquire+0x1c3/0x500
       __mutex_lock+0x14c/0x1b20
       register_netdevice_notifier_net+0x13/0x30
       mlx5_lag_add_mdev+0x51c/0xa00 [mlx5_core]
       mlx5_load+0x222/0xc70 [mlx5_core]
       mlx5_init_one_devl_locked+0x4a0/0x1310 [mlx5_core]
       mlx5_init_one+0x3b/0x60 [mlx5_core]
       probe_one+0x786/0xd00 [mlx5_core]
       local_pci_probe+0xd7/0x180
       pci_device_probe+0x231/0x720
       really_probe+0x1e4/0xb60
       __driver_probe_device+0x261/0x470
       driver_probe_device+0x49/0x130
       __driver_attach+0x215/0x4c0
       bus_for_each_dev+0xf0/0x170
       bus_add_driver+0x21d/0x590
       driver_register+0x133/0x460
       vdpa_match_remove+0x89/0xc0 [vdpa]
       do_one_initcall+0xc4/0x360
       do_init_module+0x22d/0x760
       load_module+0x51d7/0x6750
       init_module_from_file+0xd2/0x130
       idempotent_init_module+0x326/0x5a0
       __x64_sys_finit_module+0xc1/0x130
       do_syscall_64+0x3d/0x90
       entry_SYSCALL_64_after_hwframe+0x46/0xb0

                           -> #2 (mlx5_intf_mutex){+.+.}-{3:3}:
       lock_acquire+0x1c3/0x500
       __mutex_lock+0x14c/0x1b20
       mlx5_register_device+0x3e/0xd0 [mlx5_core]
       mlx5_init_one_devl_locked+0x8fa/0x1310 [mlx5_core]
       mlx5_devlink_reload_up+0x147/0x170 [mlx5_core]
       devlink_reload+0x203/0x380
       devlink_nl_cmd_reload+0xb84/0x10e0
       genl_family_rcv_msg_doit+0x1cc/0x2a0
       genl_rcv_msg+0x3c9/0x670
       netlink_rcv_skb+0x12c/0x360
       genl_rcv+0x24/0x40
       netlink_unicast+0x435/0x6f0
       netlink_sendmsg+0x7a0/0xc70
       sock_sendmsg+0xc5/0x190
       __sys_sendto+0x1c8/0x290
       __x64_sys_sendto+0xdc/0x1b0
       do_syscall_64+0x3d/0x90
       entry_SYSCALL_64_after_hwframe+0x46/0xb0

                           -> #1 (&dev->lock_key#8){+.+.}-{3:3}:
       lock_acquire+0x1c3/0x500
       __mutex_lock+0x14c/0x1b20
       mlx5_init_one_devl_locked+0x45/0x1310 [mlx5_core]
       mlx5_devlink_reload_up+0x147/0x170 [mlx5_core]
       devlink_reload+0x203/0x380
       devlink_nl_cmd_reload+0xb84/0x10e0
       genl_family_rcv_msg_doit+0x1cc/0x2a0
       genl_rcv_msg+0x3c9/0x670
       netlink_rcv_skb+0x12c/0x360
       genl_rcv+0x24/0x40
       netlink_unicast+0x435/0x6f0
       netlink_sendmsg+0x7a0/0xc70
       sock_sendmsg+0xc5/0x190
       __sys_sendto+0x1c8/0x290
       __x64_sys_sendto+0xdc/0x1b0
       do_syscall_64+0x3d/0x90
       entry_SYSCALL_64_after_hwframe+0x46/0xb0

                           -> #0 (&devlink->lock_key#14){+.+.}-{3:3}:
       check_prev_add+0x1af/0x2300
       __lock_acquire+0x31d7/0x4eb0
       lock_acquire+0x1c3/0x500
       __mutex_lock+0x14c/0x1b20
       devlink_rel_devlink_handle_put+0x11e/0x2d0
       devlink_nl_port_fill+0xddf/0x1b00
       devlink_port_notify+0xb5/0x220
       __devlink_port_type_set+0x151/0x510
       devlink_port_netdevice_event+0x17c/0x220
       notifier_call_chain+0x97/0x240
       unregister_netdevice_many_notify+0x876/0x1790
       unregister_netdevice_queue+0x274/0x350
       unregister_netdev+0x18/0x20
       mlx5e_vport_rep_unload+0xc5/0x1c0 [mlx5_core]
       __esw_offloads_unload_rep+0xd8/0x130 [mlx5_core]
       mlx5_esw_offloads_rep_unload+0x52/0x70 [mlx5_core]
       mlx5_esw_offloads_unload_rep+0x85/0xc0 [mlx5_core]
       mlx5_eswitch_unload_sf_vport+0x41/0x90 [mlx5_core]
       mlx5_devlink_sf_port_del+0x120/0x280 [mlx5_core]
       genl_family_rcv_msg_doit+0x1cc/0x2a0
       genl_rcv_msg+0x3c9/0x670
       netlink_rcv_skb+0x12c/0x360
       genl_rcv+0x24/0x40
       netlink_unicast+0x435/0x6f0
       netlink_sendmsg+0x7a0/0xc70
       sock_sendmsg+0xc5/0x190
       __sys_sendto+0x1c8/0x290
       __x64_sys_sendto+0xdc/0x1b0
       do_syscall_64+0x3d/0x90
       entry_SYSCALL_64_after_hwframe+0x46/0xb0

                           other info that might help us debug this:

Chain exists of:
                             &devlink->lock_key#14 --> mlx5_intf_mutex --> rtnl_mutex

 Possible unsafe locking scenario:

       CPU0                    CPU1
       ----                    ----
  lock(rtnl_mutex);
                               lock(mlx5_intf_mutex);
                               lock(rtnl_mutex);
  lock(&devlink->lock_key#14);

The problem is taking the devlink instance lock of a nested instance while
RTNL is already held.

To fix this, don't take the devlink instance lock when putting a nested
handle. Instead, rely on the devlink reference to access the relevant pointers
within the devlink structure. Also, make sure the underlying struct device does
not disappear by taking a reference to it in devlink_alloc_ns().
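
In short, the change boils down to the following fragments (condensed from the
hunks below; not standalone code):

	rel_devlink = devlink_rel_devlink_get(rel_index);	/* reference only, no devl_lock() */
	err = devlink_nl_put_nested_handle(msg, net, rel_devlink, attrtype);
	devlink_put(rel_devlink);

	devlink->dev = get_device(dev);		/* devlink_alloc_ns(): pin the struct device */
	...
	put_device(devlink->dev);		/* devlink_release(): drop it just before kfree() */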

Fixes: c137743bce02 ("devlink: introduce object and nested devlink relationship infra")
Signed-off-by: Jiri Pirko <jiri@nvidia.com>
---
 net/devlink/core.c    | 20 +++++---------------
 net/devlink/netlink.c |  6 +++---
 2 files changed, 8 insertions(+), 18 deletions(-)

Comments

Jakub Kicinski Oct. 6, 2023, 1:30 a.m. UTC | #1
On Tue,  3 Oct 2023 09:43:49 +0200 Jiri Pirko wrote:
> To fix this, don't take the devlink instance lock when putting nested
> handle. Instead, rely on devlink reference to access relevant pointers
> within devlink structure. Also, make sure that the device does

struct device ?

> not disappear by taking a reference in devlink_alloc_ns().

> @@ -310,6 +299,7 @@ static void devlink_release(struct work_struct *work)
>  
>  	mutex_destroy(&devlink->lock);
>  	lockdep_unregister_key(&devlink->lock_key);
> +	put_device(devlink->dev);

IDK.. holding references until all references are gone may lead 
to reference cycles :(

>  	kfree(devlink);
>  }

> @@ -92,9 +93,8 @@ int devlink_nl_put_nested_handle(struct sk_buff *msg, struct net *net,
>  		return -EMSGSIZE;
>  	if (devlink_nl_put_handle(msg, devlink))
>  		goto nla_put_failure;
> -	if (!net_eq(net, devlink_net(devlink))) {
> -		int id = peernet2id_alloc(net, devlink_net(devlink),
> -					  GFP_KERNEL);
> +	if (!net_eq(net, devl_net)) {
> +		int id = peernet2id_alloc(net, devl_net, GFP_KERNEL);
>  
>  		if (nla_put_s32(msg, DEVLINK_ATTR_NETNS_ID, id))
>  			return -EMSGSIZE;

Looks like pure refactoring. But are you sure that the netns can't
disappear? We're not holding the lock, the instance may get moved.

Overall I feel like recording the references on the objects will be
an endless source of locking pain. Would it be insane if we held 
the relationships as independent objects? Not as attributes of either
side?
Jiri Pirko Oct. 6, 2023, 7:22 a.m. UTC | #2
Fri, Oct 06, 2023 at 03:30:29AM CEST, kuba@kernel.org wrote:
>On Tue,  3 Oct 2023 09:43:49 +0200 Jiri Pirko wrote:
>> To fix this, don't take the devlink instance lock when putting nested
>> handle. Instead, rely on devlink reference to access relevant pointers
>> within devlink structure. Also, make sure that the device does
>
>struct device ?

Yes.


>
>> not disappear by taking a reference in devlink_alloc_ns().
>
>> @@ -310,6 +299,7 @@ static void devlink_release(struct work_struct *work)
>>  
>>  	mutex_destroy(&devlink->lock);
>>  	lockdep_unregister_key(&devlink->lock_key);
>> +	put_device(devlink->dev);
>
>IDK.. holding references until all references are gone may lead 
>to reference cycles :(

I don't follow. What seems to be the problematic flow? I can't spot any
reference cycle, do you?


>
>>  	kfree(devlink);
>>  }
>
>> @@ -92,9 +93,8 @@ int devlink_nl_put_nested_handle(struct sk_buff *msg, struct net *net,
>>  		return -EMSGSIZE;
>>  	if (devlink_nl_put_handle(msg, devlink))
>>  		goto nla_put_failure;
>> -	if (!net_eq(net, devlink_net(devlink))) {
>> -		int id = peernet2id_alloc(net, devlink_net(devlink),
>> -					  GFP_KERNEL);
>> +	if (!net_eq(net, devl_net)) {
>> +		int id = peernet2id_alloc(net, devl_net, GFP_KERNEL);
>>  
>>  		if (nla_put_s32(msg, DEVLINK_ATTR_NETNS_ID, id))
>>  			return -EMSGSIZE;
>
>Looks like pure refactoring. But are you sure that the netns can't
>disappear? We're not holding the lock, the instance may get moved.

Yeah, I think you are right. I can do peernet2id_alloc during devlink
init/netns change and store the id into the devlink structure. That should
solve this.


>
>Overall I feel like recording the references on the objects will be
>an endless source of locking pain. Would it be insane if we held 
>the relationships as independent objects? Not as attributes of either
>side? 

How exactly do you envision this? rel struct would hold the bus/name
strings directly?
Jakub Kicinski Oct. 6, 2023, 2:48 p.m. UTC | #3
On Fri, 6 Oct 2023 09:22:01 +0200 Jiri Pirko wrote:
> Fri, Oct 06, 2023 at 03:30:29AM CEST, kuba@kernel.org wrote:
> >> @@ -310,6 +299,7 @@ static void devlink_release(struct work_struct *work)
> >>  
> >>  	mutex_destroy(&devlink->lock);
> >>  	lockdep_unregister_key(&devlink->lock_key);
> >> +	put_device(devlink->dev);  
> >
> >IDK.. holding references until all references are gone may lead 
> >to reference cycles :(  
> 
> I don't follow. What seems to be the problematic flow? I can't spot any
> reference cycle, do you?

I can't remember to be honest. But we already assume that we can access
struct device of a devlink instance without holding the instance lock.
Because the relationship between devlink objects is usually fairly
straightforward and non-cyclical.

Isn't the "rel infrastructure"... well.. over-designed?

The user creates a port on an instance A, which spawns instance B.
Instance A links instance B to itself.
Instance A cannot disappear before instance B disappears.
Also instance A is what controls the destruction of instance B
so it can unlink it.

We can tell lockdep how the locks nest, too.

> >Overall I feel like recording the references on the objects will be
> >an endless source of locking pain. Would it be insane if we held 
> >the relationships as independent objects? Not as attributes of either
> >side?   
> 
> How exactly do you envision this? rel struct would hold the bus/name
> strings directly?

Not exactly. If we want bi-directional relationships we can create
the link struct as a:

rel {
	u32 rel_id;
	struct devlink *instanceA, *instanceB; // hold reference
	struct list_head rel_listA, rel_listB; // under instance locks
	u32 state;
	struct list_head ntf_process_queue;
}

Operations on relationship can take the instance locks (sequentially).
Notifications from a workqueue.
Instance dumps would only report rel IDs, but to get the "members" of
the relationship user needs to issue a separate DL command / syscall.
Jiri Pirko Oct. 6, 2023, 5:07 p.m. UTC | #4
Fri, Oct 06, 2023 at 04:48:42PM CEST, kuba@kernel.org wrote:
>On Fri, 6 Oct 2023 09:22:01 +0200 Jiri Pirko wrote:
>> Fri, Oct 06, 2023 at 03:30:29AM CEST, kuba@kernel.org wrote:
>> >> @@ -310,6 +299,7 @@ static void devlink_release(struct work_struct *work)
>> >>  
>> >>  	mutex_destroy(&devlink->lock);
>> >>  	lockdep_unregister_key(&devlink->lock_key);
>> >> +	put_device(devlink->dev);  
>> >
>> >IDK.. holding references until all references are gone may lead 
>> >to reference cycles :(  
>> 
>> I don't follow. What seems to be the problematic flow? I can't spot any
>> reference cycle, do you?
>
>I can't remember to be honest. But we already assume that we can access
>struct device of a devlink instance without holding the instance lock.
>Because the relationship between devlink objects is usually fairly
>straightforward and non-cyclical.
>
>Isn't the "rel infrastructure"... well.. over-designed?
>
>The user creates a port on an instance A, which spawns instance B.
>Instance A links instance B to itself.
>Instance A cannot disappear before instance B disappears.

It can. mlx5 port sf removal is very nice example of that. It just tells
the FW to remove the sf and returns. The actual SF removal is spawned
after that when processing FW events.


>Also instance A is what controls the destruction of instance B
>so it can unlink it.
>
>We can tell lockdep how the locks nest, too.
>
>> >Overall I feel like recording the references on the objects will be
>> >an endless source of locking pain. Would it be insane if we held 
>> >the relationships as independent objects? Not as attributes of either
>> >side?   
>> 
>> How exactly do you envision this? rel struct would hold the bus/name
>> strings directly?
>
>Not exactly. If we want bi-directional relationships we can create
>the link struct as a:
>
>rel {
>	u32 rel_id;
>	struct devlink *instanceA, *instanceB; // hold reference

Sometimes a port, sometimes a linecard is on one side (A).


>	struct list_head rel_listA, rel_listB; // under instance locks
>	u32 state;
>	struct list_head ntf_process_queue;
>}
>
>Operations on relationship can take the instance locks (sequentially).
>Notifications from a workqueue.

That is pretty much how that works now.


>Instance dumps would only report rel IDs, but to get the "members" of
>the relationship user needs to issue a separate DL command / syscall.

Oh! At least with process listening on notifications, this may be a bit
painful :/
I need some time to digest this.
Jakub Kicinski Oct. 6, 2023, 10:14 p.m. UTC | #5
On Fri, 6 Oct 2023 19:07:34 +0200 Jiri Pirko wrote:
> >The user creates a port on an instance A, which spawns instance B.
> >Instance A links instance B to itself.
> >Instance A cannot disappear before instance B disappears.  
> 
> It can. mlx5 port sf removal is very nice example of that. It just tells
> the FW to remove the sf and returns. The actual SF removal is spawned
> after that when processing FW events.

Isn't the PF driver processing the "FW events"? A is PF here, and B 
is SF, are you saying that the PF devlink instance can be completely
removed (not just unregistered, freed) before the SF instance is
unregistered?
Jiri Pirko Oct. 7, 2023, 10:17 a.m. UTC | #6
Sat, Oct 07, 2023 at 12:14:46AM CEST, kuba@kernel.org wrote:
>On Fri, 6 Oct 2023 19:07:34 +0200 Jiri Pirko wrote:
>> >The user creates a port on an instance A, which spawns instance B.
>> >Instance A links instance B to itself.
>> >Instance A cannot disappear before instance B disappears.  
>> 
>> It can. mlx5 port sf removal is very nice example of that. It just tells
>> the FW to remove the sf and returns. The actual SF removal is spawned
>> after that when processing FW events.
>
>Isn't the PF driver processing the "FW events"? A is PF here, and B 
>is SF, are you saying that the PF devlink instance can be completely
>removed (not just unregistered, freed) before the SF instance is
>unregistered?

Kernel-wise, yes. The FW probably holds necessary resource until SF goes
away.
Jakub Kicinski Oct. 9, 2023, 3:15 p.m. UTC | #7
On Sat, 7 Oct 2023 12:17:31 +0200 Jiri Pirko wrote:
>> Isn't the PF driver processing the "FW events"? A is PF here, and B 
>> is SF, are you saying that the PF devlink instance can be completely
>> removed (not just unregistered, freed) before the SF instance is
>> unregistered?  
> 
> Kernel-wise, yes. The FW probably holds necessary resource until SF goes
> away.

I think kernel assuming that this should not happen and requiring 
the PF driver to work around potentially stupid FW designs should
be entirely within our rights.
Jiri Pirko Oct. 9, 2023, 3:37 p.m. UTC | #8
Mon, Oct 09, 2023 at 05:15:32PM CEST, kuba@kernel.org wrote:
>On Sat, 7 Oct 2023 12:17:31 +0200 Jiri Pirko wrote:
>>> Isn't the PF driver processing the "FW events"? A is PF here, and B 
>>> is SF, are you saying that the PF devlink instance can be completely
>>> removed (not just unregistered, freed) before the SF instance is
>>> unregistered?  
>> 
>> Kernel-wise, yes. The FW probably holds necessary resource until SF goes
>> away.
>
>I think kernel assuming that this should not happen and requiring 
>the PF driver to work around potentially stupid FW designs should
>be entirely within our rights.

But why is it stupid? The SF may be spawned on the same host, but it
could be spawned on another one. The FW creates SF internally and shows
that to the kernel. Symmetrically, the FW is asked to remove SF and it
tells to the host that the SF is going away. Flows have to go
through FW.
Jakub Kicinski Oct. 9, 2023, 4:31 p.m. UTC | #9
On Mon, 9 Oct 2023 17:37:27 +0200 Jiri Pirko wrote:
> >I think kernel assuming that this should not happen and requiring 
> >the PF driver to work around potentially stupid FW designs should
> >be entirely within our rights.
> 
> But why is it stupid? The SF may be spawned on the same host, but it
> could be spawned on another one. The FW creates SF internally and shows
> that to the kernel. Symmetrically, the FW is asked to remove SF and it
> tells to the host that the SF is going away. Flows have to go
> through FW.

In Linux the PF is what controls the SFs, right?
Privileges, configuration/admin, resource control.
How can the parent disappear and children still exist.

You can make it work with putting the proprietary FW in the center.
But Linux as a project has its own objectives.
Jiri Pirko Oct. 10, 2023, 7:31 a.m. UTC | #10
Mon, Oct 09, 2023 at 06:31:29PM CEST, kuba@kernel.org wrote:
>On Mon, 9 Oct 2023 17:37:27 +0200 Jiri Pirko wrote:
>> >I think kernel assuming that this should not happen and requiring 
>> >the PF driver to work around potentially stupid FW designs should
>> >be entirely within our rights.
>> 
>> But why is it stupid? The SF may be spawned on the same host, but it
>> could be spawned on another one. The FW creates SF internally and shows
>> that to the kernel. Symmetrically, the FW is asked to remove SF and it
>> tells to the host that the SF is going away. Flows have to go
>> through FW.
>
>In Linux the PF is what controls the SFs, right?
>Privileges, configuration/admin, resource control.
>How can the parent disappear and children still exist.

It's not like the PF instance disappears, the devlink port related to
the SF is removed. When user does it, driver asks FW to shutdown the SF.
That invokes FW flow which eventually leads to event delivered back to
driver that removes the SF instance itself.


>
>You can make it work with putting the proprietary FW in the center.
>But Linux as a project has its own objectives.
Jakub Kicinski Oct. 10, 2023, 2:52 p.m. UTC | #11
On Tue, 10 Oct 2023 09:31:20 +0200 Jiri Pirko wrote:
>> In Linux the PF is what controls the SFs, right?
>> Privileges, configuration/admin, resource control.
>> How can the parent disappear and children still exist.  
> 
> It's not like the PF instance disappears, the devlink port related to
> the SF is removed. When user does it, driver asks FW to shutdown the SF.
> That invokes FW flow which eventually leads to event delivered back to
> driver that removes the SF instance itself.

You understand what I'm saying tho, right?

If we can depend on the parent not disappearing before the child,
and the hierarchy is a DAG - the locking is much easier, because
parent can lock the child.

If it's only nVidia that put the control in hands of FW we shouldn't
complicate the core for y'all.
Jiri Pirko Oct. 10, 2023, 3:56 p.m. UTC | #12
Tue, Oct 10, 2023 at 04:52:31PM CEST, kuba@kernel.org wrote:
>On Tue, 10 Oct 2023 09:31:20 +0200 Jiri Pirko wrote:
>>> In Linux the PF is what controls the SFs, right?
>>> Privileges, configuration/admin, resource control.
>>> How can the parent disappear and children still exist.  
>> 
>> It's not like the PF instance disappears, the devlink port related to
>> the SF is removed. When user does it, driver asks FW to shutdown the SF.
>> That invokes FW flow which eventually leads to event delivered back to
>> driver that removes the SF instance itself.
>
>You understand what I'm saying tho, right?
>
>If we can depend on the parent not disappearing before the child,
>and the hierarchy is a DAG - the locking is much easier, because
>parent can lock the child.

It won't help with the locking though. During GET, the devlink lock
is taken and within it, you need to access the nested devlink attributes.

And during reload->notify, we still need work so the locks are taken in
proper order.

It would only make the rel infrastructure a bit simpler. I will look
into that. But it's parallel to this patchset really.

>
>If it's only nVidia that put the control in hands of FW we shouldn't
>complicate the core for y'all.
Jakub Kicinski Oct. 10, 2023, 6:16 p.m. UTC | #13
On Tue, 10 Oct 2023 17:56:36 +0200 Jiri Pirko wrote:
> >You understand what I'm saying tho, right?
> >
> >If we can depend on the parent not disappearing before the child,
> >and the hierarchy is a DAG - the locking is much easier, because
> >parent can lock the child.  
> 
> It won't help with the locking though. During GET, the devlink lock
> is taken and within it, you need to access the nested devlink attributes.
> 
> And during reload->notify, we still need work so the locks are taken in
> proper order.

If parent is guaranteed to exist the read only fields can be accessed
freely and the read-write fields can be cached on children.
Parent has a list of children, it can store/cache a netns pointer on all
of them. When reload happens lock them and update that pointer.
At which point children do not have to lock the parent.
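
A rough sketch of that shape, purely to illustrate the idea; the list, node and
cached-netns field names here are made up and nothing like this exists in the
tree:

	/* Hypothetical: each child caches the parent's netns so readers never
	 * need to lock the parent; the parent refreshes the caches on reload
	 * (netns change), taking only the children's instance locks. */
	static void devlink_reload_update_children(struct devlink *parent,
						   struct net *dest_net)
	{
		struct devlink *child;

		list_for_each_entry(child, &parent->children, nested_node) {
			devl_lock(child);
			write_pnet(&child->cached_parent_net, dest_net);
			devl_unlock(child);
		}
	}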

> It would only make the rel infrastructure a bit simpler. I will look
> into that. But it's parallel to this patchset really.
Jiri Pirko Oct. 11, 2023, 1:34 p.m. UTC | #14
Tue, Oct 10, 2023 at 08:16:05PM CEST, kuba@kernel.org wrote:
>On Tue, 10 Oct 2023 17:56:36 +0200 Jiri Pirko wrote:
>> >You understand what I'm saying tho, right?
>> >
>> >If we can depend on the parent not disappearing before the child,
>> >and the hierarchy is a DAG - the locking is much easier, because
>> >parent can lock the child.  
>> 
>> It won't help with the locking though. During GET, the devlink lock
>> is taken and within it, you need to access the nested devlink attributes.
>> 
>> And during reload->notify, we still need work so the locks are taken in
>> proper order.
>
>If parent is guaranteed to exist the read only fields can be accessed
>freely and the read-write fields can be cached on children.

Only reason to access parent currently is netns change notification.
See devlink_rel_nested_in_notify().
It basically just scheduled delayed work by calling:
devlink_rel_nested_in_notify_work_schedule().

When work is processed in
devlink_rel_nested_in_notify_work()
There is no guarantee the parent exists, therefore devlink_index is used
to get the instance and then obj_index to get port/linecard index.

notify_cb() basically sends notification of parent object and that needs
parent instance lock. <--- This is why you need to lock the parent.

I see no way how to cache anything on children as you describe in this
scenario.
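
For reference, a boiled-down illustration of that flow; paraphrased, with
simplified field names and the rel refcounting omitted, so not the literal
net/devlink/core.c code:

	static void devlink_rel_nested_in_notify_work(struct work_struct *work)
	{
		struct devlink_rel *rel = container_of(work, struct devlink_rel,
						       nested_in.notify_work.work);
		struct devlink *devlink;

		/* The parent may already be gone; look it up by devlink_index. */
		devlink = devlinks_xa_get(rel->nested_in.devlink_index);
		if (!devlink)
			return;
		devl_lock(devlink);	/* parent instance lock, taken from the work */
		/* notify_cb() re-sends the notification for the parent object
		 * (port/linecard found via obj_index) under the parent lock. */
		if (devl_is_registered(devlink))
			rel->nested_in.notify_cb(devlink, rel->nested_in.obj_index);
		devl_unlock(devlink);
		devlink_put(devlink);
	}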


>Parent has a list of children, it can store/cache a netns pointer on all
>of them. When reload happens lock them and update that pointer.
>At which point children do not have to lock the parent.

Access of netns pointer is not a problem. See my latest version (v2)
where rcu is used in order to make sure peernet2id_alloc() call is safe:

devlink: call peernet2id_alloc() with net pointer under RCU read lock

       rcu_read_lock();
       devl_net = read_pnet_rcu(&devlink->_net);
       if (!net_eq(net, devl_net)) {
               int id = peernet2id_alloc(net, devl_net, GFP_ATOMIC);

               rcu_read_unlock();
               if (nla_put_s32(msg, DEVLINK_ATTR_NETNS_ID, id))
                       return -EMSGSIZE;
       } else {
               rcu_read_unlock();
       }


>
>> It would only make the rel infrastructure a bit simpler. I will look
>> into that. But it's parallel to this patchset really.
>
Jakub Kicinski Oct. 12, 2023, 12:20 a.m. UTC | #15
On Wed, 11 Oct 2023 15:34:59 +0200 Jiri Pirko wrote:
> >If parent is guaranteed to exist the read only fields can be accessed
> >freely and the read-write fields can be cached on children.  
> 
> Only reason to access parent currently is netns change notification.
> See devlink_rel_nested_in_notify().
> It basically just scheduled delayed work by calling:
> devlink_rel_nested_in_notify_work_schedule().
> 
> When work is processed in
> devlink_rel_nested_in_notify_work()
> There is no guarantee the parent exists, therefore devlink_index is used
> to get the instance and then obj_index to get port/linecard index.
> 
> notify_cb() basically sends notification of parent object and that needs
> parent instance lock. <--- This is why you need to lock the parent.
> 
> I see no way how to cache anything on children as you describe in this
> scenario.
> 
> 
> >Parent has a list of children, it can store/cache a netns pointer on all
> >of them. When reload happens lock them and update that pointer.
> >At which point children do not have to lock the parent.  
> 
> Access of netns pointer is not a problem. 

The current code is a problem in itself. You added another xarray,
with some mark, callbacks and unclear locking semantics. All of it
completely undocumented.

The RCU lock on top is just fixing one obvious bug I pointed out to you.

Maybe this is completely unfair but I feel like devlink locking has
been haphazard and semi-broken since the inception. I had to step in 
to fix it. And now a year later we're back to weird locking and random
dependencies. The only reason it was merged is because I was on PTO.
Jiri Pirko Oct. 12, 2023, 6:14 a.m. UTC | #16
Thu, Oct 12, 2023 at 02:20:25AM CEST, kuba@kernel.org wrote:
>On Wed, 11 Oct 2023 15:34:59 +0200 Jiri Pirko wrote:
>> >If parent is guaranteed to exist the read only fields can be accessed
>> >freely and the read-write fields can be cached on children.  
>> 
>> Only reason to access parent currently is netns change notification.
>> See devlink_rel_nested_in_notify().
>> It basically just scheduled delayed work by calling:
>> devlink_rel_nested_in_notify_work_schedule().
>> 
>> When work is processed in
>> devlink_rel_nested_in_notify_work()
>> There is no guarantee the parent exists, therefore devlink_index is used
>> to get the instance and then obj_index to get port/linecard index.
>> 
>> notify_cb() basically sends notification of parent object and that needs
>> parent instance lock. <--- This is why you need to lock the parent.
>> 
>> I see no way how to cache anything on children as you describe in this
>> scenario.
>> 
>> 
>> >Parent has a list of children, it can store/cache a netns pointer on all
>> >of them. When reload happens lock them and update that pointer.
>> >At which point children do not have to lock the parent.  
>> 
>> Access of netns pointer is not a problem. 
>
>The current code is a problem in itself. You added another xarray,
>with some mark, callbacks and unclear locking semantics. All of it
>completely undocumented.

Okay, I will add the documentation. But I thought it is clear. The parent
instance lock needs to be taken out of child lock. The problem this
patch tries to fix is when the rtnl comes into the picture in one flow,
see the patch description.

>
>The RCU lock on top is just fixing one obvious bug I pointed out to you.

Not sure what obvious bug you mean. If you mean the parent-child
lifetime change, I don't know how that would help here. I don't see how.

Plus it has performance implications. When user removes SF port under
instance lock, the SF itself is removed asynchonously out of the lock.
You suggest to remove it synchronously holding the instance lock,
correct? SF removal does not need that lock. Removing thousands of SFs
would take much longer as currently, they are removed in parallel.
You would serialize the removals for no good reason.


>
>Maybe this is completely unfair but I feel like devlink locking has
>been haphazard and semi-broken since the inception. I had to step in 

Well, it got broken over time. I appreciate you helped to fix it.


>to fix it. And now a year later we're back to weird locking and random
>dependencies. The only reason it was merged is because I was on PTO.

Not sure what you mean by that. Locking is quite clear. Why weird?
What's weird exactly? What do you mean by "random dependencies"?

I have to say I feel we got a bit lost in the conversation.
Jakub Kicinski Oct. 13, 2023, 3:39 p.m. UTC | #17
On Thu, 12 Oct 2023 08:14:03 +0200 Jiri Pirko wrote:
> >The current code is a problem in itself. You added another xarray,
> >with some mark, callbacks and unclear locking semantics. All of it
> >completely undocumented.  
> 
> Okay, I will add the documentation. But I thought it is clear. The parent
> instance lock needs to be taken out of child lock. The problem this
> patch tries to fix is when the rtnl comes into the picture in one flow,
> see the patch description.
> 
> >The RCU lock on top is just fixing one obvious bug I pointed out to you.  
> 
> Not sure what obvious bug you mean. If you mean the parent-child
> lifetime change, I don't know how that would help here. I don't see how.
> 
> Plus it has performance implications. When user removes SF port under
> instance lock, the SF itself is removed asynchronously out of the lock.
> You suggest to remove it synchronously holding the instance lock,
> correct? 

The SF is deleted by calling ->port_del() on the PF instance, correct?

> SF removal does not need that lock. Removing thousands of SFs
> would take much longer as currently, they are removed in parallel.
> You would serialize the removals for no good reason.

First of all IDK what the removal rate you're targeting is, and what
is achievable under PF's lock. Handwaving "we need parallelism" without
data is not a serious argument.

> >Maybe this is completely unfair but I feel like devlink locking has
> >been haphazard and semi-broken since the inception. I had to step in   
> 
> Well, it got broken over time. I appreciate you helped to fix it.
> 
> >to fix it. And now a year later we're back to weird locking and random
> >dependencies. The only reason it was merged is because I was on PTO.  
> 
> Not sure what you mean by that. Locking is quite clear. Why weird?
> What's weird exactly? What do you mean by "random dependencies"?
> 
> I have to say I feel we got a bit lost in the conversation.

You have a rel object, which is refcounted, xarray with a lock, and 
an async work for notifications.

All you need is a list, and a requirement that the PF can't disappear
before SF (which your version also has, BTW).
Jiri Pirko Oct. 13, 2023, 5:07 p.m. UTC | #18
Fri, Oct 13, 2023 at 05:39:45PM CEST, kuba@kernel.org wrote:
>On Thu, 12 Oct 2023 08:14:03 +0200 Jiri Pirko wrote:
>> >The current code is a problem in itself. You added another xarray,
>> >with some mark, callbacks and unclear locking semantics. All of it
>> >completely undocumented.  
>> 
>> Okay, I will add the documentation. But I thought it is clear. The parent
>> instance lock needs to be taken out of child lock. The problem this
>> patch tries to fix is when the rtnl comes into the picture in one flow,
>> see the patch description.
>> 
>> >The RCU lock on top is just fixing one obvious bug I pointed out to you.  
>> 
>> Not sure what obvious bug you mean. If you mean the parent-child
>> lifetime change, I don't know how that would help here. I don't see how.
>> 
>> Plus it has performance implications. When user removes SF port under
>> instance lock, the SF itself is removed asynchronously out of the lock.
>> You suggest to remove it synchronously holding the instance lock,
>> correct? 
>
>The SF is deleted by calling ->port_del() on the PF instance, correct?

That or setting opstate "inactive".


>
>> SF removal does not need that lock. Removing thousands of SFs
>> would take much longer as currently, they are removed in parallel.
>> You would serialize the removals for no good reason.
>
>First of all IDK what the removal rate you're targeting is, and what
>is achievable under PF's lock. Handwaving "we need parallelism" without
>data is not a serious argument.

Oh there are data and there is a need. My colleagues are working
on parallel creation/removal within mlx5 driver as we speak. What you
suggest would be huge setback :/


>
>> >Maybe this is completely unfair but I feel like devlink locking has
>> >been haphazard and semi-broken since the inception. I had to step in   
>> 
>> Well, it got broken over time. I appreciate you helped to fix it.
>> 
>> >to fix it. And now a year later we're back to weird locking and random
>> >dependencies. The only reason it was merged is because I was on PTO.  
>> 
>> Not sure what you mean by that. Locking is quite clear. Why weird?
>> What's weird exactly? What do you mean by "random dependencies"?
>> 
>> I have to say I feel we got a bit lost in the conversation.
>
>You have a rel object, which is refcounted, xarray with a lock, and 
>an async work for notifications.

Yes. The async work for notification is something you would need anyway,
even with object lifetime change you suggest. It's about locking order.
Please see the patchset I sent today (v3), I did put in a documentation
describing that (3 last patches). That should make it clear.


>
>All you need is a list, and a requirement that the PF can't disappear
>before SF (which your version also has, BTW).

It's not PF, but object contained in PF. Just to be clear. That does not
scale as we discuss above :/
Jakub Kicinski Oct. 13, 2023, 8:01 p.m. UTC | #19
On Fri, 13 Oct 2023 19:07:05 +0200 Jiri Pirko wrote:
> >> Not sure what obvious bug you mean. If you mean the parent-child
> >> lifetime change, I don't know how that would help here. I don't see how.
> >> 
> >> Plus it has performance implications. When user removes SF port under
> >> instance lock, the SF itself is removed asynchronously out of the lock.
> >> You suggest to remove it synchronously holding the instance lock,
> >> correct?   
> >
> >The SF is deleted by calling ->port_del() on the PF instance, correct?  
> 
> That or setting opstate "inactive".

The opstate also set on the port (i.e. from the PF), right?

> >> SF removal does not need that lock. Removing thousands of SFs
> >> would take much longer as currently, they are removed in parallel.
> >> You would serialize the removals for no good reason.  
> >
> >First of all IDK what the removal rate you're targeting is, and what
> >is achievable under PF's lock. Handwaving "we need parallelism" without
> >data is not a serious argument.  
> 
> Oh there are data and there is a need. My colleagues are working
> on parallel creation/removal within mlx5 driver as we speak. What you
> suggest would be huge setback :/

The only part that needs to be synchronous is un-linking.
Once the SF is designated for destruction we can live without the link,
it's just waiting to be garbage-collected.

> >> Not sure what you mean by that. Locking is quite clear. Why weird?
> >> What's weird exactly? What do you mean by "random dependencies"?
> >> 
> >> I have to say I feel we got a bit lost in the conversation.  
> >
> >You have a rel object, which is refcounted, xarray with a lock, and 
> >an async work for notifications.  
> 
> Yes. The async work for notification is something you would need anyway,
> even with object lifetime change you suggest. It's about locking order.

I don't think I would. If linking is always done under PF's lock we can
safely send any ntf.

> Please see the patchset I sent today (v3), I did put in a documentation
> describing that (3 last patches). That should make it clear.

It's unnecessarily complicated, but whatever, I'm not touching it.
Jiri Pirko Oct. 15, 2023, 11:12 a.m. UTC | #20
Fri, Oct 13, 2023 at 10:01:00PM CEST, kuba@kernel.org wrote:
>On Fri, 13 Oct 2023 19:07:05 +0200 Jiri Pirko wrote:
>> >> Not sure what obvious bug you mean. If you mean the parent-child
>> >> lifetime change, I don't know how that would help here. I don't see how.
>> >> 
>> >> Plus it has performance implications. When user removes SF port under
>> >> instance lock, the SF itself is removed asynchronously out of the lock.
>> >> You suggest to remove it synchronously holding the instance lock,
>> >> correct?   
>> >
>> >The SF is deleted by calling ->port_del() on the PF instance, correct?  
>> 
>> That or setting opstate "inactive".
>
>The opstate also set on the port (i.e. from the PF), right?

Correct. But the problem is elsewhere. The actual SF auxdev lifecycle,
see below.


>
>> >> SF removal does not need that lock. Removing thousands of SFs
>> >> would take much longer as currently, they are removed in parallel.
>> >> You would serialize the removals for no good reason.  
>> >
>> >First of all IDK what the removal rate you're targeting is, and what
>> >is achievable under PF's lock. Handwaving "we need parallelism" without
>> >data is not a serious argument.  
>> 
>> Oh there are data and there is a need. My colleagues are working
>> on parallel creation/removal within mlx5 driver as we speak. What you
>> suggest would be huge setback :/
>
>The only part that needs to be synchronous is un-linking.
>Once the SF is designated for destruction we can live without the link,
>it's just waiting to be garbage-collected.

Yeah. Another flow to consider is SF unbind. When the user unbinds the SF
manually, the PF lock is not involved at all (can't be, really). The SF needs
to be unlisted as well, as the SF devlink instance goes away. That was
another reason for the rel infrastructure.


>
>> >> Not sure what you mean by that. Locking is quite clear. Why weird?
>> >> What's weird exactly? What do you mean by "random dependencies"?
>> >> 
>> >> I have to say I feel we got a bit lost in the conversation.  
>> >
>> >You have a rel object, which is refcounted, xarray with a lock, and 
>> >an async work for notifications.  
>> 
>> Yes. The async work for notification is something you would need anyway,
>> even with object lifetime change you suggest. It's about locking order.
>
>I don't think I would. If linking is always done under PF's lock we can
>safely send any ntf.

It is not. Manual bind over sysfs of the SF auxiliary device is
called without any lock held.

There are multiple race conditions to consider when probing/removing the
SF auxiliary device in parallel with operations like PF devlink reload, port
function removal, etc. The rel infra considers and resolves them all.


>
>> Please see the patchset I sent today (v3), I did put in a documentation
>> describing that (3 last patches). That should make it clear.
>
>It's unnecessarily complicated, but whatever, I'm not touching it.

Patch

diff --git a/net/devlink/core.c b/net/devlink/core.c
index bcbbb952569f..655903ddbdfd 100644
--- a/net/devlink/core.c
+++ b/net/devlink/core.c
@@ -183,9 +183,8 @@  static struct devlink_rel *devlink_rel_find(unsigned long rel_index)
 		       DEVLINK_REL_IN_USE);
 }
 
-static struct devlink *devlink_rel_devlink_get_lock(u32 rel_index)
+static struct devlink *devlink_rel_devlink_get(u32 rel_index)
 {
-	struct devlink *devlink;
 	struct devlink_rel *rel;
 	u32 devlink_index;
 
@@ -198,16 +197,7 @@  static struct devlink *devlink_rel_devlink_get_lock(u32 rel_index)
 	xa_unlock(&devlink_rels);
 	if (!rel)
 		return NULL;
-	devlink = devlinks_xa_get(devlink_index);
-	if (!devlink)
-		return NULL;
-	devl_lock(devlink);
-	if (!devl_is_registered(devlink)) {
-		devl_unlock(devlink);
-		devlink_put(devlink);
-		return NULL;
-	}
-	return devlink;
+	return devlinks_xa_get(devlink_index);
 }
 
 int devlink_rel_devlink_handle_put(struct sk_buff *msg, struct devlink *devlink,
@@ -218,11 +208,10 @@  int devlink_rel_devlink_handle_put(struct sk_buff *msg, struct devlink *devlink,
 	struct devlink *rel_devlink;
 	int err;
 
-	rel_devlink = devlink_rel_devlink_get_lock(rel_index);
+	rel_devlink = devlink_rel_devlink_get(rel_index);
 	if (!rel_devlink)
 		return 0;
 	err = devlink_nl_put_nested_handle(msg, net, rel_devlink, attrtype);
-	devl_unlock(rel_devlink);
 	devlink_put(rel_devlink);
 	if (!err && msg_updated)
 		*msg_updated = true;
@@ -310,6 +299,7 @@  static void devlink_release(struct work_struct *work)
 
 	mutex_destroy(&devlink->lock);
 	lockdep_unregister_key(&devlink->lock_key);
+	put_device(devlink->dev);
 	kfree(devlink);
 }
 
@@ -425,7 +415,7 @@  struct devlink *devlink_alloc_ns(const struct devlink_ops *ops,
 	if (ret < 0)
 		goto err_xa_alloc;
 
-	devlink->dev = dev;
+	devlink->dev = get_device(dev);
 	devlink->ops = ops;
 	xa_init_flags(&devlink->ports, XA_FLAGS_ALLOC);
 	xa_init_flags(&devlink->params, XA_FLAGS_ALLOC);
diff --git a/net/devlink/netlink.c b/net/devlink/netlink.c
index 499304d9de49..2685c8fcf124 100644
--- a/net/devlink/netlink.c
+++ b/net/devlink/netlink.c
@@ -85,6 +85,7 @@  static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
 int devlink_nl_put_nested_handle(struct sk_buff *msg, struct net *net,
 				 struct devlink *devlink, int attrtype)
 {
+	struct net *devl_net = devlink_net(devlink);
 	struct nlattr *nested_attr;
 
 	nested_attr = nla_nest_start(msg, attrtype);
@@ -92,9 +93,8 @@  int devlink_nl_put_nested_handle(struct sk_buff *msg, struct net *net,
 		return -EMSGSIZE;
 	if (devlink_nl_put_handle(msg, devlink))
 		goto nla_put_failure;
-	if (!net_eq(net, devlink_net(devlink))) {
-		int id = peernet2id_alloc(net, devlink_net(devlink),
-					  GFP_KERNEL);
+	if (!net_eq(net, devl_net)) {
+		int id = peernet2id_alloc(net, devl_net, GFP_KERNEL);
 
 		if (nla_put_s32(msg, DEVLINK_ATTR_NETNS_ID, id))
 			return -EMSGSIZE;