diff mbox series

[rdma-rc] IB/core: fix a UAF for netdev in netdevice_event process

Message ID 20211025034258.2426872-1-william.xuanziyang@huawei.com (mailing list archive)
State Changes Requested
Delegated to: Jason Gunthorpe
Headers show
Series [rdma-rc] IB/core: fix a UAF for netdev in netdevice_event process | expand

Commit Message

Ziyang Xuan (William) Oct. 25, 2021, 3:42 a.m. UTC
When a vlan netdev enters the netdevice_event path, it is passed to
netdevice_event_work_handler() for processing even though it is not a
RoCE netdev. To keep the netdev alive after netdevice_event() returns,
netdevice_queue_work() calls dev_hold() on it. However, this does not
cover the real_dev of a vlan netdev: the real_dev can be freed before
netdevice_event_work_handler() is scheduled. This triggers a UAF on the
real_dev, as shown below:

==================================================================
BUG: KASAN: use-after-free in vlan_dev_real_dev+0xf9/0x120
Read of size 4 at addr ffff88801648a0c4 by task kworker/u8:0/8
Workqueue: gid-cache-wq netdevice_event_work_handler
Call Trace:
 dump_stack_lvl+0xcd/0x134
 print_address_description.constprop.0.cold+0x93/0x334
 kasan_report.cold+0x83/0xdf
 vlan_dev_real_dev+0xf9/0x120
 is_eth_port_of_netdev_filter.part.0+0xb1/0x2c0
 is_eth_port_of_netdev_filter+0x28/0x40
 ib_enum_roce_netdev+0x1a3/0x300
 ib_enum_all_roce_netdevs+0xc7/0x140
 netdevice_event_work_handler+0x9d/0x210
...

Allocated by task 9289:
 kasan_save_stack+0x1b/0x40
 __kasan_kmalloc+0x9b/0xd0
 __kmalloc_node+0x20a/0x330
 kvmalloc_node+0x61/0xf0
 alloc_netdev_mqs+0x9d/0x1140
 rtnl_create_link+0x955/0xb70
 __rtnl_newlink+0xe10/0x15b0
 rtnl_newlink+0x64/0xa0
...

Freed by task 9288:
 kasan_save_stack+0x1b/0x40
 kasan_set_track+0x1c/0x30
 kasan_set_free_info+0x20/0x30
 __kasan_slab_free+0xfc/0x130
 slab_free_freelist_hook+0xdd/0x240
 kfree+0xe4/0x690
 kvfree+0x42/0x50
 device_release+0x9f/0x240
 kobject_put+0x1c8/0x530
 put_device+0x1b/0x30
 free_netdev+0x370/0x540
 ppp_destroy_interface+0x313/0x3d0
 ppp_release+0x1bf/0x240
...

Hold the real_dev of a vlan netdev in netdevice_queue_work() and release
it in netdevice_event_work_handler() to fix the UAF problem.

Fixes: 238fdf48f2b5 ("IB/core: Add RoCE table bonding support")
Reported-by: syzbot+e4df4e1389e28972e955@syzkaller.appspotmail.com
Signed-off-by: Ziyang Xuan <william.xuanziyang@huawei.com>
---
 drivers/infiniband/core/roce_gid_mgmt.c | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

Comments

Leon Romanovsky Oct. 25, 2021, 7:33 a.m. UTC | #1
On Mon, Oct 25, 2021 at 11:42:58AM +0800, Ziyang Xuan wrote:
> When a vlan netdev enter netdevice_event process although it is not a
> roce netdev, it will be passed to netdevice_event_work_handler() to
> process. In order to hold the netdev of netdevice_event after
> netdevice_event() return, call dev_hold() to hold the netdev in
> netdevice_queue_work(). But that did not consider the real_dev of a vlan
> netdev, the real_dev can be freed within netdevice_event_work_handler()
> be scheduled. It would trigger the UAF problem for the real_dev like
> following:
> 
> ==================================================================
> BUG: KASAN: use-after-free in vlan_dev_real_dev+0xf9/0x120
> Read of size 4 at addr ffff88801648a0c4 by task kworker/u8:0/8
> Workqueue: gid-cache-wq netdevice_event_work_handler
> Call Trace:
>  dump_stack_lvl+0xcd/0x134
>  print_address_description.constprop.0.cold+0x93/0x334
>  kasan_report.cold+0x83/0xdf
>  vlan_dev_real_dev+0xf9/0x120
>  is_eth_port_of_netdev_filter.part.0+0xb1/0x2c0
>  is_eth_port_of_netdev_filter+0x28/0x40
>  ib_enum_roce_netdev+0x1a3/0x300
>  ib_enum_all_roce_netdevs+0xc7/0x140
>  netdevice_event_work_handler+0x9d/0x210
> ...
> 
> Allocated by task 9289:
>  kasan_save_stack+0x1b/0x40
>  __kasan_kmalloc+0x9b/0xd0
>  __kmalloc_node+0x20a/0x330
>  kvmalloc_node+0x61/0xf0
>  alloc_netdev_mqs+0x9d/0x1140
>  rtnl_create_link+0x955/0xb70
>  __rtnl_newlink+0xe10/0x15b0
>  rtnl_newlink+0x64/0xa0
> ...
> 
> Freed by task 9288:
>  kasan_save_stack+0x1b/0x40
>  kasan_set_track+0x1c/0x30
>  kasan_set_free_info+0x20/0x30
>  __kasan_slab_free+0xfc/0x130
>  slab_free_freelist_hook+0xdd/0x240
>  kfree+0xe4/0x690
>  kvfree+0x42/0x50
>  device_release+0x9f/0x240
>  kobject_put+0x1c8/0x530
>  put_device+0x1b/0x30
>  free_netdev+0x370/0x540
>  ppp_destroy_interface+0x313/0x3d0
>  ppp_release+0x1bf/0x240
> ...
> 
> Hold the real_dev for a vlan netdev in netdevice_event_work_handler()
> to fix the UAF problem.
> 
> Fixes: 238fdf48f2b5 ("IB/core: Add RoCE table bonding support")
> Reported-by: syzbot+e4df4e1389e28972e955@syzkaller.appspotmail.com
> Signed-off-by: Ziyang Xuan <william.xuanziyang@huawei.com>
> ---
>  drivers/infiniband/core/roce_gid_mgmt.c | 16 +++++++++++++++-
>  1 file changed, 15 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/infiniband/core/roce_gid_mgmt.c b/drivers/infiniband/core/roce_gid_mgmt.c
> index 68197e576433..063dbe72b7c2 100644
> --- a/drivers/infiniband/core/roce_gid_mgmt.c
> +++ b/drivers/infiniband/core/roce_gid_mgmt.c
> @@ -621,6 +621,7 @@ static void netdevice_event_work_handler(struct work_struct *_work)
>  {
>  	struct netdev_event_work *work =
>  		container_of(_work, struct netdev_event_work, work);
> +	struct net_device *real_dev;
>  	unsigned int i;
>  
>  	for (i = 0; i < ARRAY_SIZE(work->cmds) && work->cmds[i].cb; i++) {
> @@ -628,6 +629,12 @@ static void netdevice_event_work_handler(struct work_struct *_work)
>  					 work->cmds[i].filter_ndev,
>  					 work->cmds[i].cb,
>  					 work->cmds[i].ndev);
> +		real_dev = rdma_vlan_dev_real_dev(work->cmds[i].ndev);
> +		if (real_dev)
> +			dev_put(real_dev);
> +		real_dev = rdma_vlan_dev_real_dev(work->cmds[i].filter_ndev);
> +		if (real_dev)
> +			dev_put(real_dev);
>  		dev_put(work->cmds[i].ndev);
>  		dev_put(work->cmds[i].filter_ndev);
>  	}
> @@ -638,9 +645,10 @@ static void netdevice_event_work_handler(struct work_struct *_work)
>  static int netdevice_queue_work(struct netdev_event_work_cmd *cmds,
>  				struct net_device *ndev)
>  {
> -	unsigned int i;
>  	struct netdev_event_work *ndev_work =
>  		kmalloc(sizeof(*ndev_work), GFP_KERNEL);
> +	struct net_device *real_dev;
> +	unsigned int i;
>  
>  	if (!ndev_work)
>  		return NOTIFY_DONE;
> @@ -653,6 +661,12 @@ static int netdevice_queue_work(struct netdev_event_work_cmd *cmds,
>  			ndev_work->cmds[i].filter_ndev = ndev;
>  		dev_hold(ndev_work->cmds[i].ndev);
>  		dev_hold(ndev_work->cmds[i].filter_ndev);
> +		real_dev = rdma_vlan_dev_real_dev(ndev_work->cmds[i].ndev);
> +		if (real_dev)
> +			dev_hold(real_dev);
> +		real_dev = rdma_vlan_dev_real_dev(ndev_work->cmds[i].filter_ndev);
> +		if (real_dev)
> +			dev_hold(real_dev);
>  	}
>  	INIT_WORK(&ndev_work->work, netdevice_event_work_handler);

This is probably the right change, but I don't know that part of the
code well enough. What prevents "real_dev" from disappearing right after
your call to rdma_vlan_dev_real_dev()?

Thanks

>  
> -- 
> 2.25.1
>
Ziyang Xuan (William) Oct. 25, 2021, 8:37 a.m. UTC | #2
> On Mon, Oct 25, 2021 at 11:42:58AM +0800, Ziyang Xuan wrote:
>> When a vlan netdev enter netdevice_event process although it is not a
>> roce netdev, it will be passed to netdevice_event_work_handler() to
>> process. In order to hold the netdev of netdevice_event after
>> netdevice_event() return, call dev_hold() to hold the netdev in
>> netdevice_queue_work(). But that did not consider the real_dev of a vlan
>> netdev, the real_dev can be freed within netdevice_event_work_handler()
>> be scheduled. It would trigger the UAF problem for the real_dev like
>> following:
>>
>> ==================================================================
>> BUG: KASAN: use-after-free in vlan_dev_real_dev+0xf9/0x120
>> Read of size 4 at addr ffff88801648a0c4 by task kworker/u8:0/8
>> Workqueue: gid-cache-wq netdevice_event_work_handler
>> Call Trace:
>>  dump_stack_lvl+0xcd/0x134
>>  print_address_description.constprop.0.cold+0x93/0x334
>>  kasan_report.cold+0x83/0xdf
>>  vlan_dev_real_dev+0xf9/0x120
>>  is_eth_port_of_netdev_filter.part.0+0xb1/0x2c0
>>  is_eth_port_of_netdev_filter+0x28/0x40
>>  ib_enum_roce_netdev+0x1a3/0x300
>>  ib_enum_all_roce_netdevs+0xc7/0x140
>>  netdevice_event_work_handler+0x9d/0x210
>> ...
>>
>> Allocated by task 9289:
>>  kasan_save_stack+0x1b/0x40
>>  __kasan_kmalloc+0x9b/0xd0
>>  __kmalloc_node+0x20a/0x330
>>  kvmalloc_node+0x61/0xf0
>>  alloc_netdev_mqs+0x9d/0x1140
>>  rtnl_create_link+0x955/0xb70
>>  __rtnl_newlink+0xe10/0x15b0
>>  rtnl_newlink+0x64/0xa0
>> ...
>>
>> Freed by task 9288:
>>  kasan_save_stack+0x1b/0x40
>>  kasan_set_track+0x1c/0x30
>>  kasan_set_free_info+0x20/0x30
>>  __kasan_slab_free+0xfc/0x130
>>  slab_free_freelist_hook+0xdd/0x240
>>  kfree+0xe4/0x690
>>  kvfree+0x42/0x50
>>  device_release+0x9f/0x240
>>  kobject_put+0x1c8/0x530
>>  put_device+0x1b/0x30
>>  free_netdev+0x370/0x540
>>  ppp_destroy_interface+0x313/0x3d0
>>  ppp_release+0x1bf/0x240
>> ...
>>
>> Hold the real_dev for a vlan netdev in netdevice_event_work_handler()
>> to fix the UAF problem.
>>
>> Fixes: 238fdf48f2b5 ("IB/core: Add RoCE table bonding support")
>> Reported-by: syzbot+e4df4e1389e28972e955@syzkaller.appspotmail.com
>> Signed-off-by: Ziyang Xuan <william.xuanziyang@huawei.com>
>> ---
>>  drivers/infiniband/core/roce_gid_mgmt.c | 16 +++++++++++++++-
>>  1 file changed, 15 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/infiniband/core/roce_gid_mgmt.c b/drivers/infiniband/core/roce_gid_mgmt.c
>> index 68197e576433..063dbe72b7c2 100644
>> --- a/drivers/infiniband/core/roce_gid_mgmt.c
>> +++ b/drivers/infiniband/core/roce_gid_mgmt.c
>> @@ -621,6 +621,7 @@ static void netdevice_event_work_handler(struct work_struct *_work)
>>  {
>>  	struct netdev_event_work *work =
>>  		container_of(_work, struct netdev_event_work, work);
>> +	struct net_device *real_dev;
>>  	unsigned int i;
>>  
>>  	for (i = 0; i < ARRAY_SIZE(work->cmds) && work->cmds[i].cb; i++) {
>> @@ -628,6 +629,12 @@ static void netdevice_event_work_handler(struct work_struct *_work)
>>  					 work->cmds[i].filter_ndev,
>>  					 work->cmds[i].cb,
>>  					 work->cmds[i].ndev);
>> +		real_dev = rdma_vlan_dev_real_dev(work->cmds[i].ndev);
>> +		if (real_dev)
>> +			dev_put(real_dev);
>> +		real_dev = rdma_vlan_dev_real_dev(work->cmds[i].filter_ndev);
>> +		if (real_dev)
>> +			dev_put(real_dev);
>>  		dev_put(work->cmds[i].ndev);
>>  		dev_put(work->cmds[i].filter_ndev);
>>  	}
>> @@ -638,9 +645,10 @@ static void netdevice_event_work_handler(struct work_struct *_work)
>>  static int netdevice_queue_work(struct netdev_event_work_cmd *cmds,
>>  				struct net_device *ndev)
>>  {
>> -	unsigned int i;
>>  	struct netdev_event_work *ndev_work =
>>  		kmalloc(sizeof(*ndev_work), GFP_KERNEL);
>> +	struct net_device *real_dev;
>> +	unsigned int i;
>>  
>>  	if (!ndev_work)
>>  		return NOTIFY_DONE;
>> @@ -653,6 +661,12 @@ static int netdevice_queue_work(struct netdev_event_work_cmd *cmds,
>>  			ndev_work->cmds[i].filter_ndev = ndev;
>>  		dev_hold(ndev_work->cmds[i].ndev);
>>  		dev_hold(ndev_work->cmds[i].filter_ndev);
>> +		real_dev = rdma_vlan_dev_real_dev(ndev_work->cmds[i].ndev);
>> +		if (real_dev)
>> +			dev_hold(real_dev);
>> +		real_dev = rdma_vlan_dev_real_dev(ndev_work->cmds[i].filter_ndev);
>> +		if (real_dev)
>> +			dev_hold(real_dev);
>>  	}
>>  	INIT_WORK(&ndev_work->work, netdevice_event_work_handler);
> 
> Probably, this is the right change, but I don't know well enough that
> part of code. What prevents from "real_dev" to disappear right after
> your call to rdma_vlan_dev_real_dev()?
> 

A net_device is not freed until its dev_refcnt drops to one; see
netdev_run_todo() for the details. The dev_refcnt of a vlan net_device's
real_dev will reach one after unregister_netdevice(&real_dev) and
unregister_vlan_dev(&vlan_ndev, ...), but the dev_refcnt of the vlan
net_device itself is still greater than one because
netdevice_queue_work() holds the vlan net_device. So my solution is to
hold the real_dev in netdevice_queue_work() as well.

> Thanks
> 
>>  
>> -- 
>> 2.25.1
>>
> .
>
Leon Romanovsky Oct. 25, 2021, 11:06 a.m. UTC | #3
On Mon, Oct 25, 2021 at 04:37:41PM +0800, Ziyang Xuan (William) wrote:
> > On Mon, Oct 25, 2021 at 11:42:58AM +0800, Ziyang Xuan wrote:
> >> When a vlan netdev enter netdevice_event process although it is not a
> >> roce netdev, it will be passed to netdevice_event_work_handler() to
> >> process. In order to hold the netdev of netdevice_event after
> >> netdevice_event() return, call dev_hold() to hold the netdev in
> >> netdevice_queue_work(). But that did not consider the real_dev of a vlan
> >> netdev, the real_dev can be freed within netdevice_event_work_handler()
> >> be scheduled. It would trigger the UAF problem for the real_dev like
> >> following:
> >>
> >> ==================================================================
> >> BUG: KASAN: use-after-free in vlan_dev_real_dev+0xf9/0x120
> >> Read of size 4 at addr ffff88801648a0c4 by task kworker/u8:0/8
> >> Workqueue: gid-cache-wq netdevice_event_work_handler
> >> Call Trace:
> >>  dump_stack_lvl+0xcd/0x134
> >>  print_address_description.constprop.0.cold+0x93/0x334
> >>  kasan_report.cold+0x83/0xdf
> >>  vlan_dev_real_dev+0xf9/0x120
> >>  is_eth_port_of_netdev_filter.part.0+0xb1/0x2c0
> >>  is_eth_port_of_netdev_filter+0x28/0x40
> >>  ib_enum_roce_netdev+0x1a3/0x300
> >>  ib_enum_all_roce_netdevs+0xc7/0x140
> >>  netdevice_event_work_handler+0x9d/0x210
> >> ...
> >>
> >> Allocated by task 9289:
> >>  kasan_save_stack+0x1b/0x40
> >>  __kasan_kmalloc+0x9b/0xd0
> >>  __kmalloc_node+0x20a/0x330
> >>  kvmalloc_node+0x61/0xf0
> >>  alloc_netdev_mqs+0x9d/0x1140
> >>  rtnl_create_link+0x955/0xb70
> >>  __rtnl_newlink+0xe10/0x15b0
> >>  rtnl_newlink+0x64/0xa0
> >> ...
> >>
> >> Freed by task 9288:
> >>  kasan_save_stack+0x1b/0x40
> >>  kasan_set_track+0x1c/0x30
> >>  kasan_set_free_info+0x20/0x30
> >>  __kasan_slab_free+0xfc/0x130
> >>  slab_free_freelist_hook+0xdd/0x240
> >>  kfree+0xe4/0x690
> >>  kvfree+0x42/0x50
> >>  device_release+0x9f/0x240
> >>  kobject_put+0x1c8/0x530
> >>  put_device+0x1b/0x30
> >>  free_netdev+0x370/0x540
> >>  ppp_destroy_interface+0x313/0x3d0
> >>  ppp_release+0x1bf/0x240
> >> ...
> >>
> >> Hold the real_dev for a vlan netdev in netdevice_event_work_handler()
> >> to fix the UAF problem.
> >>
> >> Fixes: 238fdf48f2b5 ("IB/core: Add RoCE table bonding support")
> >> Reported-by: syzbot+e4df4e1389e28972e955@syzkaller.appspotmail.com
> >> Signed-off-by: Ziyang Xuan <william.xuanziyang@huawei.com>
> >> ---
> >>  drivers/infiniband/core/roce_gid_mgmt.c | 16 +++++++++++++++-
> >>  1 file changed, 15 insertions(+), 1 deletion(-)
> >>
> >> diff --git a/drivers/infiniband/core/roce_gid_mgmt.c b/drivers/infiniband/core/roce_gid_mgmt.c
> >> index 68197e576433..063dbe72b7c2 100644
> >> --- a/drivers/infiniband/core/roce_gid_mgmt.c
> >> +++ b/drivers/infiniband/core/roce_gid_mgmt.c
> >> @@ -621,6 +621,7 @@ static void netdevice_event_work_handler(struct work_struct *_work)
> >>  {
> >>  	struct netdev_event_work *work =
> >>  		container_of(_work, struct netdev_event_work, work);
> >> +	struct net_device *real_dev;
> >>  	unsigned int i;
> >>  
> >>  	for (i = 0; i < ARRAY_SIZE(work->cmds) && work->cmds[i].cb; i++) {
> >> @@ -628,6 +629,12 @@ static void netdevice_event_work_handler(struct work_struct *_work)
> >>  					 work->cmds[i].filter_ndev,
> >>  					 work->cmds[i].cb,
> >>  					 work->cmds[i].ndev);
> >> +		real_dev = rdma_vlan_dev_real_dev(work->cmds[i].ndev);
> >> +		if (real_dev)
> >> +			dev_put(real_dev);
> >> +		real_dev = rdma_vlan_dev_real_dev(work->cmds[i].filter_ndev);
> >> +		if (real_dev)
> >> +			dev_put(real_dev);
> >>  		dev_put(work->cmds[i].ndev);
> >>  		dev_put(work->cmds[i].filter_ndev);
> >>  	}
> >> @@ -638,9 +645,10 @@ static void netdevice_event_work_handler(struct work_struct *_work)
> >>  static int netdevice_queue_work(struct netdev_event_work_cmd *cmds,
> >>  				struct net_device *ndev)
> >>  {
> >> -	unsigned int i;
> >>  	struct netdev_event_work *ndev_work =
> >>  		kmalloc(sizeof(*ndev_work), GFP_KERNEL);
> >> +	struct net_device *real_dev;
> >> +	unsigned int i;
> >>  
> >>  	if (!ndev_work)
> >>  		return NOTIFY_DONE;
> >> @@ -653,6 +661,12 @@ static int netdevice_queue_work(struct netdev_event_work_cmd *cmds,
> >>  			ndev_work->cmds[i].filter_ndev = ndev;
> >>  		dev_hold(ndev_work->cmds[i].ndev);
> >>  		dev_hold(ndev_work->cmds[i].filter_ndev);
> >> +		real_dev = rdma_vlan_dev_real_dev(ndev_work->cmds[i].ndev);
> >> +		if (real_dev)
> >> +			dev_hold(real_dev);
> >> +		real_dev = rdma_vlan_dev_real_dev(ndev_work->cmds[i].filter_ndev);
> >> +		if (real_dev)
> >> +			dev_hold(real_dev);
> >>  	}
> >>  	INIT_WORK(&ndev_work->work, netdevice_event_work_handler);
> > 
> > Probably, this is the right change, but I don't know well enough that
> > part of code. What prevents from "real_dev" to disappear right after
> > your call to rdma_vlan_dev_real_dev()?
> > 
> 
> It is known that free the net_device until its dev_refcnt is one. The
> detail realization see netdev_run_todo().The real_dev's dev_refcnt of
> a vlan net_device will reach one after unregister_netdevice(&real_dev)
> and unregister_vlan_dev(&vlan_ndev, ...) but the dev_refcnt of the vlan
> net_device is bigger than one because netdevice_queue_work() will hold
> the vlan net_device. So my solution is hold the real_dev too in
> netdevice_queue_work().

              dev_hold(ndev_work->cmds[i].filter_ndev);
 +            real_dev = rdma_vlan_dev_real_dev(ndev_work->cmds[i].ndev);
 +            if (real_dev)
                  <------------ real_dev is released here.
 +                    dev_hold(real_dev);


> 
> > Thanks
> > 
> >>  
> >> -- 
> >> 2.25.1
> >>
> > .
> >
Jason Gunthorpe Oct. 25, 2021, 4:39 p.m. UTC | #4
On Mon, Oct 25, 2021 at 11:42:58AM +0800, Ziyang Xuan wrote:
> When a vlan netdev enter netdevice_event process although it is not a
> roce netdev, it will be passed to netdevice_event_work_handler() to
> process. In order to hold the netdev of netdevice_event after
> netdevice_event() return, call dev_hold() to hold the netdev in
> netdevice_queue_work(). But that did not consider the real_dev of a vlan
> netdev, the real_dev can be freed within netdevice_event_work_handler()
> be scheduled. It would trigger the UAF problem for the real_dev like
> following:

I think this is a netdev bug. Under rtnl vlan_dev_real_dev() should
return NULL if the vlan device has passed unregister_vlan_dev()

diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
index 55275ef9a31a7c..1106da84e72559 100644
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -126,6 +126,7 @@ void unregister_vlan_dev(struct net_device *dev, struct list_head *head)
 
        /* Get rid of the vlan's reference to real_dev */
        dev_put(real_dev);
+       vlan->real_dev = NULL;
 }
 
 int vlan_check_real_dev(struct net_device *real_dev,

I'm assuming there is more to it than this, but it is a starting
point.

Jason
Ziyang Xuan (William) Oct. 26, 2021, 3:14 a.m. UTC | #5
>>>> diff --git a/drivers/infiniband/core/roce_gid_mgmt.c b/drivers/infiniband/core/roce_gid_mgmt.c
>>>> index 68197e576433..063dbe72b7c2 100644
>>>> --- a/drivers/infiniband/core/roce_gid_mgmt.c
>>>> +++ b/drivers/infiniband/core/roce_gid_mgmt.c
>>>> @@ -621,6 +621,7 @@ static void netdevice_event_work_handler(struct work_struct *_work)
>>>>  {
>>>>  	struct netdev_event_work *work =
>>>>  		container_of(_work, struct netdev_event_work, work);
>>>> +	struct net_device *real_dev;
>>>>  	unsigned int i;
>>>>  
>>>>  	for (i = 0; i < ARRAY_SIZE(work->cmds) && work->cmds[i].cb; i++) {
>>>> @@ -628,6 +629,12 @@ static void netdevice_event_work_handler(struct work_struct *_work)
>>>>  					 work->cmds[i].filter_ndev,
>>>>  					 work->cmds[i].cb,
>>>>  					 work->cmds[i].ndev);
>>>> +		real_dev = rdma_vlan_dev_real_dev(work->cmds[i].ndev);
>>>> +		if (real_dev)
>>>> +			dev_put(real_dev);
>>>> +		real_dev = rdma_vlan_dev_real_dev(work->cmds[i].filter_ndev);
>>>> +		if (real_dev)
>>>> +			dev_put(real_dev);
>>>>  		dev_put(work->cmds[i].ndev);
>>>>  		dev_put(work->cmds[i].filter_ndev);
>>>>  	}
>>>> @@ -638,9 +645,10 @@ static void netdevice_event_work_handler(struct work_struct *_work)
>>>>  static int netdevice_queue_work(struct netdev_event_work_cmd *cmds,
>>>>  				struct net_device *ndev)
>>>>  {
>>>> -	unsigned int i;
>>>>  	struct netdev_event_work *ndev_work =
>>>>  		kmalloc(sizeof(*ndev_work), GFP_KERNEL);
>>>> +	struct net_device *real_dev;
>>>> +	unsigned int i;
>>>>  
>>>>  	if (!ndev_work)
>>>>  		return NOTIFY_DONE;
>>>> @@ -653,6 +661,12 @@ static int netdevice_queue_work(struct netdev_event_work_cmd *cmds,
>>>>  			ndev_work->cmds[i].filter_ndev = ndev;
>>>>  		dev_hold(ndev_work->cmds[i].ndev);
>>>>  		dev_hold(ndev_work->cmds[i].filter_ndev);
>>>> +		real_dev = rdma_vlan_dev_real_dev(ndev_work->cmds[i].ndev);
>>>> +		if (real_dev)
>>>> +			dev_hold(real_dev);
>>>> +		real_dev = rdma_vlan_dev_real_dev(ndev_work->cmds[i].filter_ndev);
>>>> +		if (real_dev)
>>>> +			dev_hold(real_dev);
>>>>  	}
>>>>  	INIT_WORK(&ndev_work->work, netdevice_event_work_handler);
>>>
>>> Probably, this is the right change, but I don't know well enough that
>>> part of code. What prevents from "real_dev" to disappear right after
>>> your call to rdma_vlan_dev_real_dev()?
>>>
>>
>> It is known that free the net_device until its dev_refcnt is one. The
>> detail realization see netdev_run_todo().The real_dev's dev_refcnt of
>> a vlan net_device will reach one after unregister_netdevice(&real_dev)
>> and unregister_vlan_dev(&vlan_ndev, ...) but the dev_refcnt of the vlan
>> net_device is bigger than one because netdevice_queue_work() will hold
>> the vlan net_device. So my solution is hold the real_dev too in
>> netdevice_queue_work().
> 
>               dev_hold(ndev_work->cmds[i].filter_ndev);
>  +            real_dev = rdma_vlan_dev_real_dev(ndev_work->cmds[i].ndev);
>  +            if (real_dev)
>                   <------------ real_dev is released here.
>  +                    dev_hold(real_dev);

At first, I thought the real_dev's dev_refcnt would still be greater than
one when the NETDEV_UNREGISTER notifier event of the vlan net_device is
raised, because unregister_vlan_dev() calls dev_put(real_dev) only after
calling unregister_netdevice_queue(dev, head). I assumed that
unregister_netdevice_queue() would issue the NETDEV_UNREGISTER notifier
event for the vlan net_device, so I could hold the real_dev in
netdevice_queue_work() while handling that event.

But I read unregister_vlan_dev() again and found that the
unregister_netdevice_queue() call in unregister_vlan_dev() just moves the
vlan net_device to a list to be unregistered later. So it is possible that
the real_dev has already been freed when we access it in
netdevice_queue_work(), although the probability is very small.

So the patch needs to be improved, for example by setting
vlan->real_dev = NULL after dev_put(real_dev) in unregister_vlan_dev(),
as proposed by Jason Gunthorpe.

Do you have any other good ideas?

Thank you!
Leon Romanovsky Oct. 26, 2021, 9:03 a.m. UTC | #6
On Tue, Oct 26, 2021 at 11:14:01AM +0800, Ziyang Xuan (William) wrote:
> >>>> diff --git a/drivers/infiniband/core/roce_gid_mgmt.c b/drivers/infiniband/core/roce_gid_mgmt.c
> >>>> index 68197e576433..063dbe72b7c2 100644
> >>>> --- a/drivers/infiniband/core/roce_gid_mgmt.c
> >>>> +++ b/drivers/infiniband/core/roce_gid_mgmt.c
> >>>> @@ -621,6 +621,7 @@ static void netdevice_event_work_handler(struct work_struct *_work)
> >>>>  {
> >>>>  	struct netdev_event_work *work =
> >>>>  		container_of(_work, struct netdev_event_work, work);
> >>>> +	struct net_device *real_dev;
> >>>>  	unsigned int i;
> >>>>  
> >>>>  	for (i = 0; i < ARRAY_SIZE(work->cmds) && work->cmds[i].cb; i++) {
> >>>> @@ -628,6 +629,12 @@ static void netdevice_event_work_handler(struct work_struct *_work)
> >>>>  					 work->cmds[i].filter_ndev,
> >>>>  					 work->cmds[i].cb,
> >>>>  					 work->cmds[i].ndev);
> >>>> +		real_dev = rdma_vlan_dev_real_dev(work->cmds[i].ndev);
> >>>> +		if (real_dev)
> >>>> +			dev_put(real_dev);
> >>>> +		real_dev = rdma_vlan_dev_real_dev(work->cmds[i].filter_ndev);
> >>>> +		if (real_dev)
> >>>> +			dev_put(real_dev);
> >>>>  		dev_put(work->cmds[i].ndev);
> >>>>  		dev_put(work->cmds[i].filter_ndev);
> >>>>  	}
> >>>> @@ -638,9 +645,10 @@ static void netdevice_event_work_handler(struct work_struct *_work)
> >>>>  static int netdevice_queue_work(struct netdev_event_work_cmd *cmds,
> >>>>  				struct net_device *ndev)
> >>>>  {
> >>>> -	unsigned int i;
> >>>>  	struct netdev_event_work *ndev_work =
> >>>>  		kmalloc(sizeof(*ndev_work), GFP_KERNEL);
> >>>> +	struct net_device *real_dev;
> >>>> +	unsigned int i;
> >>>>  
> >>>>  	if (!ndev_work)
> >>>>  		return NOTIFY_DONE;
> >>>> @@ -653,6 +661,12 @@ static int netdevice_queue_work(struct netdev_event_work_cmd *cmds,
> >>>>  			ndev_work->cmds[i].filter_ndev = ndev;
> >>>>  		dev_hold(ndev_work->cmds[i].ndev);
> >>>>  		dev_hold(ndev_work->cmds[i].filter_ndev);
> >>>> +		real_dev = rdma_vlan_dev_real_dev(ndev_work->cmds[i].ndev);
> >>>> +		if (real_dev)
> >>>> +			dev_hold(real_dev);
> >>>> +		real_dev = rdma_vlan_dev_real_dev(ndev_work->cmds[i].filter_ndev);
> >>>> +		if (real_dev)
> >>>> +			dev_hold(real_dev);
> >>>>  	}
> >>>>  	INIT_WORK(&ndev_work->work, netdevice_event_work_handler);
> >>>
> >>> Probably, this is the right change, but I don't know well enough that
> >>> part of code. What prevents from "real_dev" to disappear right after
> >>> your call to rdma_vlan_dev_real_dev()?
> >>>
> >>
> >> It is known that free the net_device until its dev_refcnt is one. The
> >> detail realization see netdev_run_todo().The real_dev's dev_refcnt of
> >> a vlan net_device will reach one after unregister_netdevice(&real_dev)
> >> and unregister_vlan_dev(&vlan_ndev, ...) but the dev_refcnt of the vlan
> >> net_device is bigger than one because netdevice_queue_work() will hold
> >> the vlan net_device. So my solution is hold the real_dev too in
> >> netdevice_queue_work().
> > 
> >               dev_hold(ndev_work->cmds[i].filter_ndev);
> >  +            real_dev = rdma_vlan_dev_real_dev(ndev_work->cmds[i].ndev);
> >  +            if (real_dev)
> >                   <------------ real_dev is released here.
> >  +                    dev_hold(real_dev);
> 
> At first, I thought the real_dev's dev_refcnt is bigger than one before
> NETDEV_UNREGISTER notifier event of the vlan net_device because it calls
> dev_put(real_dev) after calling unregister_netdevice_queue(dev, head).
> I thought unregister_netdevice_queue() would issue NETDEV_UNREGISTER
> notifier event of the vlan net_device, I can hold the real_dev in
> NETDEV_UNREGISTER notifier event handler netdevice_queue_work().
> 
> But I read unregister_vlan_dev() again, found unregister_netdevice_queue()
> in unregister_vlan_dev() just move the vlan net_device to a list to unregister
> later. So it is possible the real_dev has been freed when we access in
> netdevice_queue_work() although the probability is very small.
> 
> So the modification need to improve. For example set vlan->real_dev = NULL
> after dev_put(real_dev) in unregister_vlan_dev() proposed by Jason Gunthorpe.
> 
> Do you have any other good ideas?

It is hard to tell; this implementation has existed almost from day one.

Thanks

> 
> Thank you!
diff mbox series

Patch

diff --git a/drivers/infiniband/core/roce_gid_mgmt.c b/drivers/infiniband/core/roce_gid_mgmt.c
index 68197e576433..063dbe72b7c2 100644
--- a/drivers/infiniband/core/roce_gid_mgmt.c
+++ b/drivers/infiniband/core/roce_gid_mgmt.c
@@ -621,6 +621,7 @@  static void netdevice_event_work_handler(struct work_struct *_work)
 {
 	struct netdev_event_work *work =
 		container_of(_work, struct netdev_event_work, work);
+	struct net_device *real_dev;
 	unsigned int i;
 
 	for (i = 0; i < ARRAY_SIZE(work->cmds) && work->cmds[i].cb; i++) {
@@ -628,6 +629,12 @@  static void netdevice_event_work_handler(struct work_struct *_work)
 					 work->cmds[i].filter_ndev,
 					 work->cmds[i].cb,
 					 work->cmds[i].ndev);
+		real_dev = rdma_vlan_dev_real_dev(work->cmds[i].ndev);
+		if (real_dev)
+			dev_put(real_dev);
+		real_dev = rdma_vlan_dev_real_dev(work->cmds[i].filter_ndev);
+		if (real_dev)
+			dev_put(real_dev);
 		dev_put(work->cmds[i].ndev);
 		dev_put(work->cmds[i].filter_ndev);
 	}
@@ -638,9 +645,10 @@  static void netdevice_event_work_handler(struct work_struct *_work)
 static int netdevice_queue_work(struct netdev_event_work_cmd *cmds,
 				struct net_device *ndev)
 {
-	unsigned int i;
 	struct netdev_event_work *ndev_work =
 		kmalloc(sizeof(*ndev_work), GFP_KERNEL);
+	struct net_device *real_dev;
+	unsigned int i;
 
 	if (!ndev_work)
 		return NOTIFY_DONE;
@@ -653,6 +661,12 @@  static int netdevice_queue_work(struct netdev_event_work_cmd *cmds,
 			ndev_work->cmds[i].filter_ndev = ndev;
 		dev_hold(ndev_work->cmds[i].ndev);
 		dev_hold(ndev_work->cmds[i].filter_ndev);
+		real_dev = rdma_vlan_dev_real_dev(ndev_work->cmds[i].ndev);
+		if (real_dev)
+			dev_hold(real_dev);
+		real_dev = rdma_vlan_dev_real_dev(ndev_work->cmds[i].filter_ndev);
+		if (real_dev)
+			dev_hold(real_dev);
 	}
 	INIT_WORK(&ndev_work->work, netdevice_event_work_handler);