Message ID | 20240724085428.3813-1-showrya@chelsio.com (mailing list archive) |
---|---|
State | Accepted |
Headers | show |
Series | [for-rc] RDMA/siw: Remove NETDEV_GOING_DOWN event handler | expand |
在 2024/7/24 10:54, Showrya M N 写道: > Toggling link while running NVME-oF over siw hits a kernel panic > due to race condition within siw_handler and ib_destroy_qp(). > The IB_EVENT_PORT_ERR event can alone handle destroying qps. > therefore remove unwanted processing in siw. In the link: https://lore.kernel.org/all/000000000000fe34b1061e0ffa36@google.com/T/ The Call Trace is as below. Not sure if this call trace is the same with this commit. Call Trace: <TASK> __debug_object_init+0x2a9/0x400 lib/debugobjects.c:654 siw_device_goes_down drivers/infiniband/sw/siw/siw_main.c:395 [inline] siw_netdev_event+0x3bd/0x620 drivers/infiniband/sw/siw/siw_main.c:422 notifier_call_chain+0x19f/0x3e0 kernel/notifier.c:93 call_netdevice_notifiers_extack net/core/dev.c:2032 [inline] call_netdevice_notifiers net/core/dev.c:2046 [inline] __dev_close_many+0x146/0x300 net/core/dev.c:1532 __dev_close net/core/dev.c:1570 [inline] __dev_change_flags+0x30e/0x6f0 net/core/dev.c:8835 dev_change_flags+0x8b/0x1a0 net/core/dev.c:8909 do_setlink+0xccd/0x41f0 net/core/rtnetlink.c:2900 rtnl_setlink+0x40d/0x5a0 net/core/rtnetlink.c:3201 rtnetlink_rcv_msg+0x73f/0xcf0 net/core/rtnetlink.c:6647 netlink_rcv_skb+0x1e3/0x430 net/netlink/af_netlink.c:2550 netlink_unicast_kernel net/netlink/af_netlink.c:1331 [inline] netlink_unicast+0x7f0/0x990 net/netlink/af_netlink.c:1357 netlink_sendmsg+0x8e4/0xcb0 net/netlink/af_netlink.c:1901 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg+0x221/0x270 net/socket.c:745 sock_write_iter+0x2dd/0x400 net/socket.c:1160 do_iter_readv_writev+0x60a/0x890 vfs_writev+0x37c/0xbb0 fs/read_write.c:971 do_writev+0x1b1/0x350 fs/read_write.c:1018 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7fda35175f19 Code: ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 a8 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007fda35fff048 EFLAGS: 00000246 ORIG_RAX: 0000000000000014 RAX: ffffffffffffffda RBX: 00007fda35305f60 RCX: 00007fda35175f19 RDX: 0000000000000001 RSI: 00000000200003c0 RDI: 0000000000000006 RBP: 00007fda351e4e68 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 R13: 000000000000000b R14: 00007fda35305f60 R15: 00007ffc0c742898 </TASK> Zhu Yanjun > > Suggested-by: Bernard Metzler <bmt@zurich.ibm.com> > Signed-off-by: Showrya M N <showrya@chelsio.com> > Signed-off-by: Potnuri Bharat Teja <bharat@chelsio.com> > --- > drivers/infiniband/sw/siw/siw.h | 2 -- > drivers/infiniband/sw/siw/siw_main.c | 37 ---------------------------- > 2 files changed, 39 deletions(-) > > diff --git a/drivers/infiniband/sw/siw/siw.h b/drivers/infiniband/sw/siw/siw.h > index 75253f2b3e3d..86d4d6a2170e 100644 > --- a/drivers/infiniband/sw/siw/siw.h > +++ b/drivers/infiniband/sw/siw/siw.h > @@ -94,8 +94,6 @@ struct siw_device { > atomic_t num_mr; > atomic_t num_srq; > atomic_t num_ctx; > - > - struct work_struct netdev_down; > }; > > struct siw_ucontext { > diff --git a/drivers/infiniband/sw/siw/siw_main.c b/drivers/infiniband/sw/siw/siw_main.c > index 61ad8ca3d1a2..9a50a9dcce39 100644 > --- a/drivers/infiniband/sw/siw/siw_main.c > +++ b/drivers/infiniband/sw/siw/siw_main.c > @@ -364,39 +364,6 @@ static struct siw_device *siw_device_create(struct net_device *netdev) > return NULL; > } > > -/* > - * Network link becomes unavailable. Mark all > - * affected QP's accordingly. > - */ > -static void siw_netdev_down(struct work_struct *work) > -{ > - struct siw_device *sdev = > - container_of(work, struct siw_device, netdev_down); > - > - struct siw_qp_attrs qp_attrs; > - struct list_head *pos, *tmp; > - > - memset(&qp_attrs, 0, sizeof(qp_attrs)); > - qp_attrs.state = SIW_QP_STATE_ERROR; > - > - list_for_each_safe(pos, tmp, &sdev->qp_list) { > - struct siw_qp *qp = list_entry(pos, struct siw_qp, devq); > - > - down_write(&qp->state_lock); > - WARN_ON(siw_qp_modify(qp, &qp_attrs, SIW_QP_ATTR_STATE)); > - up_write(&qp->state_lock); > - } > - ib_device_put(&sdev->base_dev); > -} > - > -static void siw_device_goes_down(struct siw_device *sdev) > -{ > - if (ib_device_try_get(&sdev->base_dev)) { > - INIT_WORK(&sdev->netdev_down, siw_netdev_down); > - schedule_work(&sdev->netdev_down); > - } > -} > - > static int siw_netdev_event(struct notifier_block *nb, unsigned long event, > void *arg) > { > @@ -418,10 +385,6 @@ static int siw_netdev_event(struct notifier_block *nb, unsigned long event, > siw_port_event(sdev, 1, IB_EVENT_PORT_ACTIVE); > break; > > - case NETDEV_GOING_DOWN: > - siw_device_goes_down(sdev); > - break; > - > case NETDEV_DOWN: > sdev->state = IB_PORT_DOWN; > siw_port_event(sdev, 1, IB_EVENT_PORT_ERR);
On Wed, 24 Jul 2024 14:24:28 +0530, Showrya M N wrote: > Toggling link while running NVME-oF over siw hits a kernel panic > due to race condition within siw_handler and ib_destroy_qp(). > The IB_EVENT_PORT_ERR event can alone handle destroying qps. > therefore remove unwanted processing in siw. > > Applied, thanks! [1/1] RDMA/siw: Remove NETDEV_GOING_DOWN event handler https://git.kernel.org/rdma/rdma/c/60dc7fcafea817 Best regards,
diff --git a/drivers/infiniband/sw/siw/siw.h b/drivers/infiniband/sw/siw/siw.h index 75253f2b3e3d..86d4d6a2170e 100644 --- a/drivers/infiniband/sw/siw/siw.h +++ b/drivers/infiniband/sw/siw/siw.h @@ -94,8 +94,6 @@ struct siw_device { atomic_t num_mr; atomic_t num_srq; atomic_t num_ctx; - - struct work_struct netdev_down; }; struct siw_ucontext { diff --git a/drivers/infiniband/sw/siw/siw_main.c b/drivers/infiniband/sw/siw/siw_main.c index 61ad8ca3d1a2..9a50a9dcce39 100644 --- a/drivers/infiniband/sw/siw/siw_main.c +++ b/drivers/infiniband/sw/siw/siw_main.c @@ -364,39 +364,6 @@ static struct siw_device *siw_device_create(struct net_device *netdev) return NULL; } -/* - * Network link becomes unavailable. Mark all - * affected QP's accordingly. - */ -static void siw_netdev_down(struct work_struct *work) -{ - struct siw_device *sdev = - container_of(work, struct siw_device, netdev_down); - - struct siw_qp_attrs qp_attrs; - struct list_head *pos, *tmp; - - memset(&qp_attrs, 0, sizeof(qp_attrs)); - qp_attrs.state = SIW_QP_STATE_ERROR; - - list_for_each_safe(pos, tmp, &sdev->qp_list) { - struct siw_qp *qp = list_entry(pos, struct siw_qp, devq); - - down_write(&qp->state_lock); - WARN_ON(siw_qp_modify(qp, &qp_attrs, SIW_QP_ATTR_STATE)); - up_write(&qp->state_lock); - } - ib_device_put(&sdev->base_dev); -} - -static void siw_device_goes_down(struct siw_device *sdev) -{ - if (ib_device_try_get(&sdev->base_dev)) { - INIT_WORK(&sdev->netdev_down, siw_netdev_down); - schedule_work(&sdev->netdev_down); - } -} - static int siw_netdev_event(struct notifier_block *nb, unsigned long event, void *arg) { @@ -418,10 +385,6 @@ static int siw_netdev_event(struct notifier_block *nb, unsigned long event, siw_port_event(sdev, 1, IB_EVENT_PORT_ACTIVE); break; - case NETDEV_GOING_DOWN: - siw_device_goes_down(sdev); - break; - case NETDEV_DOWN: sdev->state = IB_PORT_DOWN; siw_port_event(sdev, 1, IB_EVENT_PORT_ERR);