Message ID | 20240711014006.11294-1-honggangli@163.com |
---|---|
State | Rejected |
Series | RDMA/rxe: Restore tasklet call for rxe_cq.c |
On Thu, Jul 11, 2024 at 9:41 AM Honggang LI <honggangli@163.com> wrote:
>
> If ib_req_notify_cq() was called in complete handler, deadlock occurs
> in receive path.
>
> rxe_req_notify_cq+0x21/0x70 [rdma_rxe]
> krping_cq_event_handler+0x26f/0x2c0 [rdma_krping]

What is rdma_krping? What is the deadlock?

Please explain the deadlock in details.

> rxe_cq_post+0x128/0x1d0 [rdma_rxe]
> ? copy_data+0xe0/0x230 [rdma_rxe]
> rxe_responder+0xbe8/0x23a0 [rdma_rxe]
> do_task+0x68/0x1e0 [rdma_rxe]
> ? __pfx_rxe_udp_encap_recv+0x10/0x10 [rdma_rxe]
> rxe_udp_encap_recv+0x79/0xe0 [rdma_rxe]
> udp_queue_rcv_one_skb+0x272/0x540
> udp_unicast_rcv_skb+0x76/0x90
> __udp4_lib_rcv+0xab2/0xd60
> ? raw_local_deliver+0xd2/0x2a0
> ip_protocol_deliver_rcu+0xd1/0x1b0
> ip_local_deliver_finish+0x76/0xa0
> ip_local_deliver+0x68/0x100
> ? ip_rcv_finish_core.isra.0+0xc2/0x430
> ip_sublist_rcv+0x2a0/0x340
> ip_list_rcv+0x13b/0x170
> __netif_receive_skb_list_core+0x2bb/0x2e0
> netif_receive_skb_list_internal+0x1cd/0x300
> napi_complete_done+0x72/0x200
> e1000e_poll+0xcf/0x320 [e1000e]
> __napi_poll+0x2b/0x160
> net_rx_action+0x2c6/0x3b0
> ? enqueue_hrtimer+0x42/0xa0
> handle_softirqs+0xf7/0x340
> ? sched_clock_cpu+0xf/0x1f0
> __irq_exit_rcu+0x97/0xb0
> common_interrupt+0x85/0xa0
>
> Fixes: 0c7e314a6352 ("RDMA/rxe: Fix rxe_cq_post")
> Fixes: 78b26a335310 ("RDMA/rxe: Remove tasklet call from rxe_cq.c")
> Signed-off-by: Honggang LI <honggangli@163.com>
> ---
>  drivers/infiniband/sw/rxe/rxe_cq.c    | 35 ++++++++++++++++++++++++---
>  drivers/infiniband/sw/rxe/rxe_loc.h   |  2 ++
>  drivers/infiniband/sw/rxe/rxe_verbs.c |  2 ++
>  drivers/infiniband/sw/rxe/rxe_verbs.h |  2 ++
>  4 files changed, 37 insertions(+), 4 deletions(-)
>
> [snip: quoted diff; the full patch appears below]
>
> --
> 2.45.2
On Thu, Jul 11, 2024 at 11:06:06AM +0800, Greg Sword wrote:
> Subject: Re: [PATCH] RDMA/rxe: Restore tasklet call for rxe_cq.c
> From: Greg Sword <gregsword0@gmail.com>
> Date: Thu, 11 Jul 2024 11:06:06 +0800
>
> On Thu, Jul 11, 2024 at 9:41 AM Honggang LI <honggangli@163.com> wrote:
> >
> > If ib_req_notify_cq() was called in complete handler, deadlock occurs
> > in receive path.
> >
> > rxe_req_notify_cq+0x21/0x70 [rdma_rxe]
> > krping_cq_event_handler+0x26f/0x2c0 [rdma_krping]
>
> What is rdma_krping? What is the deadlock?

https://github.com/larrystevenwise/krping.git

> Please explain the deadlock in details.

 88 int rxe_cq_post(struct rxe_cq *cq, struct rxe_cqe *cqe, int solicited)
 89 {
 90         struct ib_event ev;
 91         int full;
 92         void *addr;
 93         unsigned long flags;
 94
 95         spin_lock_irqsave(&cq->cq_lock, flags);   // Lock!
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 96
 97         full = queue_full(cq->queue, QUEUE_TYPE_TO_CLIENT);
 98         if (unlikely(full)) {
 99                 rxe_err_cq(cq, "queue full\n");
100                 spin_unlock_irqrestore(&cq->cq_lock, flags);
101                 if (cq->ibcq.event_handler) {
102                         ev.device = cq->ibcq.device;
103                         ev.element.cq = &cq->ibcq;
104                         ev.event = IB_EVENT_CQ_ERR;
105                         cq->ibcq.event_handler(&ev, cq->ibcq.cq_context);
106                 }
107
108                 return -EBUSY;
109         }
110
111         addr = queue_producer_addr(cq->queue, QUEUE_TYPE_TO_CLIENT);
112         memcpy(addr, cqe, sizeof(*cqe));
113
114         queue_advance_producer(cq->queue, QUEUE_TYPE_TO_CLIENT);
115
116         if ((cq->notify & IB_CQ_NEXT_COMP) ||
117             (cq->notify & IB_CQ_SOLICITED && solicited)) {
118                 cq->notify = 0;
119                 cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context);
                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
                    call the complete handler krping_cq_event_handler()
120         }
121
122         spin_unlock_irqrestore(&cq->cq_lock, flags);

static void krping_cq_event_handler(struct ib_cq *cq, void *ctx)
{
	struct krping_cb *cb = ctx;
	struct ib_wc wc;
	const struct ib_recv_wr *bad_wr;
	int ret;

	BUG_ON(cb->cq != cq);
	if (cb->state == ERROR) {
		printk(KERN_ERR PFX "cq completion in ERROR state\n");
		return;
	}
	if (cb->frtest) {
		printk(KERN_ERR PFX "cq completion event in frtest!\n");
		return;
	}
	if (!cb->wlat && !cb->rlat && !cb->bw)
		ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP);
	^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
	while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 1) {
		if (wc.status) {

static int rxe_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags)
{
	struct rxe_cq *cq = to_rcq(ibcq);
	int ret = 0;
	int empty;
	unsigned long irq_flags;

	spin_lock_irqsave(&cq->cq_lock, irq_flags);
	^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Deadlock
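The net of the three excerpts above: rxe_cq_post() takes cq->cq_lock and calls the complete handler inline; krping_cq_event_handler() calls ib_req_notify_cq(), which lands back in rxe_req_notify_cq() and tries to take the same cq->cq_lock on the same CPU. Reduced to a minimal sketch of that AA deadlock pattern (hypothetical names standing in for the real call chain, not rxe code):

#include <linux/spinlock.h>

static DEFINE_SPINLOCK(lock);		/* models cq->cq_lock */

static void req_notify(void)		/* models rxe_req_notify_cq() */
{
	unsigned long flags;

	/* Second acquisition on the same CPU: spins forever, because
	 * post_one_cqe() below still holds the lock. */
	spin_lock_irqsave(&lock, flags);
	spin_unlock_irqrestore(&lock, flags);
}

static void comp_handler(void)		/* models krping_cq_event_handler() */
{
	req_notify();			/* ib_req_notify_cq(cq, IB_CQ_NEXT_COMP) */
}

static void post_one_cqe(void)		/* models rxe_cq_post() */
{
	unsigned long flags;

	spin_lock_irqsave(&lock, flags);
	comp_handler();			/* handler invoked with the lock held */
	spin_unlock_irqrestore(&lock, flags);
}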
On 2024/7/11 9:01, Honggang LI wrote:
> On Thu, Jul 11, 2024 at 11:06:06AM +0800, Greg Sword wrote:
>> Subject: Re: [PATCH] RDMA/rxe: Restore tasklet call for rxe_cq.c
>> From: Greg Sword <gregsword0@gmail.com>
>> Date: Thu, 11 Jul 2024 11:06:06 +0800
>>
>> On Thu, Jul 11, 2024 at 9:41 AM Honggang LI <honggangli@163.com> wrote:
>>>
>>> If ib_req_notify_cq() was called in complete handler, deadlock occurs
>>> in receive path.
>>>
>>> rxe_req_notify_cq+0x21/0x70 [rdma_rxe]
>>> krping_cq_event_handler+0x26f/0x2c0 [rdma_krping]
>>
>> What is rdma_krping? What is the deadlock?
>
> https://github.com/larrystevenwise/krping.git
>
>> Please explain the deadlock in details.

I read the discussion carefully. I have the following:

1). This problem is not from RXE. It seems to be related with the
krping module. As such, the root cause is not in RXE. It is not good to
fix this problem in RXE.

2). In the kernel upstream, tasklets are marked obsolete and have some
design flaws, so replacing the workqueue with a tasklet in RXE does not
keep up with the kernel upstream.

https://patchwork.kernel.org/project/linux-rdma/cover/20240621050525.3720069-1-allen.lkml@gmail.com/

In this link, there is some work to replace tasklets with BH
workqueues. As such, it is not good to replace the workqueue with a
tasklet.

From the above, for now I cannot agree with you. These are just my
2-cent suggestions. I am not sure if others have better suggestions
about this commit.

Zhu Yanjun

> [snip: the rxe_cq_post()/krping_cq_event_handler()/rxe_req_notify_cq()
> excerpts quoted in the previous message]
On 2024/7/11 9:01, Honggang LI wrote:
> On Thu, Jul 11, 2024 at 11:06:06AM +0800, Greg Sword wrote:
>> Subject: Re: [PATCH] RDMA/rxe: Restore tasklet call for rxe_cq.c
>> From: Greg Sword <gregsword0@gmail.com>
>> Date: Thu, 11 Jul 2024 11:06:06 +0800
>>
>> On Thu, Jul 11, 2024 at 9:41 AM Honggang LI <honggangli@163.com> wrote:
>>>
>>> If ib_req_notify_cq() was called in complete handler, deadlock occurs
>>> in receive path.
>>>
>>> rxe_req_notify_cq+0x21/0x70 [rdma_rxe]
>>> krping_cq_event_handler+0x26f/0x2c0 [rdma_krping]
>>
>> What is rdma_krping? What is the deadlock?
>
> https://github.com/larrystevenwise/krping.git
>
>> Please explain the deadlock in details.
>
> [snip: the rxe_cq_post() and krping_cq_event_handler() excerpts quoted
> in the previous message, down to:]
>
>         if (!cb->wlat && !cb->rlat && !cb->bw)
>                 ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP)
>         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

IMO, here, we can use a BH workqueue to execute this notification? Or
add an event handler?

Please reference other ULP kernel modules.

Thanks,
Zhu Yanjun

> [snip: the remaining ib_poll_cq() loop and the rxe_req_notify_cq()
> excerpt ending at the spin_lock_irqsave() that deadlocks]
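A minimal sketch of that suggestion (the names and plumbing here are made up, not krping code): hand the ib_req_notify_cq() call to a work item, so it runs after rxe_cq_post() has dropped cq->cq_lock. On kernels that have BH workqueues, the same work could be queued to system_bh_wq to stay in softirq context:

#include <linux/workqueue.h>
#include <rdma/ib_verbs.h>

struct notify_ctx {			/* hypothetical helper */
	struct work_struct work;
	struct ib_cq *cq;
};

static void notify_work_fn(struct work_struct *work)
{
	struct notify_ctx *nc = container_of(work, struct notify_ctx, work);

	/* Runs from the workqueue, after rxe_cq_post() has released
	 * cq->cq_lock, so rxe_req_notify_cq() cannot self-deadlock. */
	ib_req_notify_cq(nc->cq, IB_CQ_NEXT_COMP);
}

static void notify_ctx_init(struct notify_ctx *nc, struct ib_cq *cq)
{
	nc->cq = cq;
	INIT_WORK(&nc->work, notify_work_fn);
}

/* Called from the CQ completion handler instead of ib_req_notify_cq().
 * queue_work(system_bh_wq, &nc->work) would keep it in BH context on
 * kernels that provide system_bh_wq. */
static void request_notify_deferred(struct notify_ctx *nc)
{
	schedule_work(&nc->work);
}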
On Thu, Jul 11, 2024 at 03:46:24PM +0200, Zhu Yanjun wrote:
> Subject: Re: [PATCH] RDMA/rxe: Restore tasklet call for rxe_cq.c
>
> > > Please explain the deadlock in details.
>
> I read the discussion carefully. I have the following:
>
> 1). This problem is not from RXE. It seems to be related with the
> krping module. As such, the root cause is not in RXE. It is not good to
> fix this problem in RXE.

Can't say it is a problem in krping, as krping works over
irdma/mlx5/siw. The deadlock only happens with rxe.

> 2). In the kernel upstream, tasklets are marked obsolete and have some
> design flaws, so replacing the workqueue with a tasklet in RXE does not
> keep up with the kernel upstream.
>
> https://patchwork.kernel.org/project/linux-rdma/cover/20240621050525.3720069-1-allen.lkml@gmail.com/
>
> In this link, there is some work to replace tasklets with BH
> workqueues. As such, it is not good to replace the workqueue with a
> tasklet.

Yes, you are right. Tasklets are obsolete. We need a better solution
for this issue. Please feel free to drop the patch.
On 2024/7/12 7:25, Zhu Yanjun wrote:
> On 2024/7/11 9:01, Honggang LI wrote:
>> On Thu, Jul 11, 2024 at 11:06:06AM +0800, Greg Sword wrote:
>>> Subject: Re: [PATCH] RDMA/rxe: Restore tasklet call for rxe_cq.c
>>> From: Greg Sword <gregsword0@gmail.com>
>>> Date: Thu, 11 Jul 2024 11:06:06 +0800
>>>
>>> On Thu, Jul 11, 2024 at 9:41 AM Honggang LI <honggangli@163.com> wrote:
>>>>
>>>> If ib_req_notify_cq() was called in complete handler, deadlock occurs
>>>> in receive path.
>>>>
>>>> rxe_req_notify_cq+0x21/0x70 [rdma_rxe]
>>>> krping_cq_event_handler+0x26f/0x2c0 [rdma_krping]
>>>
>>> What is rdma_krping? What is the deadlock?
>>
>> https://github.com/larrystevenwise/krping.git
>>
>>> Please explain the deadlock in details.
>>
>> [snip: the rxe_cq_post() excerpt quoted earlier, down to the
>> cq->ibcq.comp_handler() call that invokes krping_cq_event_handler()]

Today I moved krping_cq_event_handler into a workqueue, and krping can
work well on SoftRoCE. That is, I modified the source code of krping
and moved krping_cq_event_handler into the system workqueue. Then I ran
the krping tests on SoftRoCE. The krping tests work well on SoftRoCE.

Zhu Yanjun

>> [snip: the remaining quoted excerpts and the earlier BH-workqueue
>> suggestion]
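Presumably the experiment looked something like the following sketch (an assumption, not Zhu's actual diff; it would live in krping.c next to the original handler, and the wrapper names are invented): the callback registered with the CQ only queues a work item, and the original krping_cq_event_handler() body runs later from the system workqueue, outside rxe_cq_post() and its cq->cq_lock:

#include <linux/slab.h>
#include <linux/workqueue.h>
#include <rdma/ib_verbs.h>

struct comp_work {			/* hypothetical wrapper */
	struct work_struct work;
	struct ib_cq *cq;
	void *ctx;
};

static void comp_work_fn(struct work_struct *work)
{
	struct comp_work *cw = container_of(work, struct comp_work, work);

	/* The original handler, including its ib_req_notify_cq() call,
	 * now runs in process context with no rxe lock held. */
	krping_cq_event_handler(cw->cq, cw->ctx);
	kfree(cw);
}

/* Registered as ib_cq.comp_handler in place of the original handler. */
static void krping_cq_event_deferred(struct ib_cq *cq, void *ctx)
{
	/* GFP_ATOMIC: this is called from softirq context by rxe */
	struct comp_work *cw = kmalloc(sizeof(*cw), GFP_ATOMIC);

	if (!cw)
		return;

	INIT_WORK(&cw->work, comp_work_fn);
	cw->cq = cq;
	cw->ctx = ctx;
	schedule_work(&cw->work);	/* system workqueue */
}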
diff --git a/drivers/infiniband/sw/rxe/rxe_cq.c b/drivers/infiniband/sw/rxe/rxe_cq.c
index fec87c9030ab..97a537994aee 100644
--- a/drivers/infiniband/sw/rxe/rxe_cq.c
+++ b/drivers/infiniband/sw/rxe/rxe_cq.c
@@ -39,6 +39,21 @@ int rxe_cq_chk_attr(struct rxe_dev *rxe, struct rxe_cq *cq,
 	return -EINVAL;
 }
 
+static void rxe_send_complete(struct tasklet_struct *t)
+{
+	struct rxe_cq *cq = from_tasklet(cq, t, comp_task);
+	unsigned long flags;
+
+	spin_lock_irqsave(&cq->cq_lock, flags);
+	if (cq->is_dying) {
+		spin_unlock_irqrestore(&cq->cq_lock, flags);
+		return;
+	}
+	spin_unlock_irqrestore(&cq->cq_lock, flags);
+
+	cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context);
+}
+
 int rxe_cq_from_init(struct rxe_dev *rxe, struct rxe_cq *cq, int cqe,
 		     int comp_vector, struct ib_udata *udata,
 		     struct rxe_create_cq_resp __user *uresp)
@@ -64,6 +79,10 @@ int rxe_cq_from_init(struct rxe_dev *rxe, struct rxe_cq *cq, int cqe,
 
 	cq->is_user = uresp;
 
+	cq->is_dying = false;
+
+	tasklet_setup(&cq->comp_task, rxe_send_complete);
+
 	spin_lock_init(&cq->cq_lock);
 	cq->ibcq.cqe = cqe;
 	return 0;
@@ -84,7 +103,6 @@ int rxe_cq_resize_queue(struct rxe_cq *cq, int cqe,
 	return err;
 }
 
-/* caller holds reference to cq */
 int rxe_cq_post(struct rxe_cq *cq, struct rxe_cqe *cqe, int solicited)
 {
 	struct ib_event ev;
@@ -113,17 +131,26 @@ int rxe_cq_post(struct rxe_cq *cq, struct rxe_cqe *cqe, int solicited)
 
 	queue_advance_producer(cq->queue, QUEUE_TYPE_TO_CLIENT);
 
+	spin_unlock_irqrestore(&cq->cq_lock, flags);
+
 	if ((cq->notify & IB_CQ_NEXT_COMP) ||
 	    (cq->notify & IB_CQ_SOLICITED && solicited)) {
 		cq->notify = 0;
-		cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context);
+		tasklet_schedule(&cq->comp_task);
 	}
 
-	spin_unlock_irqrestore(&cq->cq_lock, flags);
-
 	return 0;
 }
 
+void rxe_cq_disable(struct rxe_cq *cq)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&cq->cq_lock, flags);
+	cq->is_dying = true;
+	spin_unlock_irqrestore(&cq->cq_lock, flags);
+}
+
 void rxe_cq_cleanup(struct rxe_pool_elem *elem)
 {
 	struct rxe_cq *cq = container_of(elem, typeof(*cq), elem);
diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/infiniband/sw/rxe/rxe_loc.h
index ded46119151b..ba84f780aa3d 100644
--- a/drivers/infiniband/sw/rxe/rxe_loc.h
+++ b/drivers/infiniband/sw/rxe/rxe_loc.h
@@ -31,6 +31,8 @@ int rxe_cq_resize_queue(struct rxe_cq *cq, int new_cqe,
 
 int rxe_cq_post(struct rxe_cq *cq, struct rxe_cqe *cqe, int solicited);
 
+void rxe_cq_disable(struct rxe_cq *cq);
+
 void rxe_cq_cleanup(struct rxe_pool_elem *elem);
 
 /* rxe_mcast.c */
diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c
index de6238ee4379..a964d86789f6 100644
--- a/drivers/infiniband/sw/rxe/rxe_verbs.c
+++ b/drivers/infiniband/sw/rxe/rxe_verbs.c
@@ -1205,6 +1205,8 @@ static int rxe_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
 		goto err_out;
 	}
 
+	rxe_cq_disable(cq);
+
 	err = rxe_cleanup(cq);
 	if (err)
 		rxe_err_cq(cq, "cleanup failed, err = %d\n", err);
diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h
index 3c1354f82283..03df97e83570 100644
--- a/drivers/infiniband/sw/rxe/rxe_verbs.h
+++ b/drivers/infiniband/sw/rxe/rxe_verbs.h
@@ -63,7 +63,9 @@ struct rxe_cq {
 	struct rxe_queue	*queue;
 	spinlock_t		cq_lock;
 	u8			notify;
+	bool			is_dying;
 	bool			is_user;
+	struct tasklet_struct	comp_task;
 	atomic_t		num_wq;
 };
If ib_req_notify_cq() was called in complete handler, deadlock occurs
in receive path.

rxe_req_notify_cq+0x21/0x70 [rdma_rxe]
krping_cq_event_handler+0x26f/0x2c0 [rdma_krping]
rxe_cq_post+0x128/0x1d0 [rdma_rxe]
? copy_data+0xe0/0x230 [rdma_rxe]
rxe_responder+0xbe8/0x23a0 [rdma_rxe]
do_task+0x68/0x1e0 [rdma_rxe]
? __pfx_rxe_udp_encap_recv+0x10/0x10 [rdma_rxe]
rxe_udp_encap_recv+0x79/0xe0 [rdma_rxe]
udp_queue_rcv_one_skb+0x272/0x540
udp_unicast_rcv_skb+0x76/0x90
__udp4_lib_rcv+0xab2/0xd60
? raw_local_deliver+0xd2/0x2a0
ip_protocol_deliver_rcu+0xd1/0x1b0
ip_local_deliver_finish+0x76/0xa0
ip_local_deliver+0x68/0x100
? ip_rcv_finish_core.isra.0+0xc2/0x430
ip_sublist_rcv+0x2a0/0x340
ip_list_rcv+0x13b/0x170
__netif_receive_skb_list_core+0x2bb/0x2e0
netif_receive_skb_list_internal+0x1cd/0x300
napi_complete_done+0x72/0x200
e1000e_poll+0xcf/0x320 [e1000e]
__napi_poll+0x2b/0x160
net_rx_action+0x2c6/0x3b0
? enqueue_hrtimer+0x42/0xa0
handle_softirqs+0xf7/0x340
? sched_clock_cpu+0xf/0x1f0
__irq_exit_rcu+0x97/0xb0
common_interrupt+0x85/0xa0

Fixes: 0c7e314a6352 ("RDMA/rxe: Fix rxe_cq_post")
Fixes: 78b26a335310 ("RDMA/rxe: Remove tasklet call from rxe_cq.c")
Signed-off-by: Honggang LI <honggangli@163.com>
---
 drivers/infiniband/sw/rxe/rxe_cq.c    | 35 ++++++++++++++++++++++++---
 drivers/infiniband/sw/rxe/rxe_loc.h   |  2 ++
 drivers/infiniband/sw/rxe/rxe_verbs.c |  2 ++
 drivers/infiniband/sw/rxe/rxe_verbs.h |  2 ++
 4 files changed, 37 insertions(+), 4 deletions(-)
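For reference, the tasklet pattern the diff relies on, reduced to a standalone sketch (my_cq and the function names are hypothetical, not rxe code): tasklet_schedule() is safe to call from atomic context, and the callback runs later in softirq context, outside whatever locks the scheduling site holds, which is why the patch can schedule the completion handler while still inside the receive path:

#include <linux/interrupt.h>

struct my_cq {				/* hypothetical stand-in for rxe_cq */
	struct tasklet_struct comp_task;
};

static void my_send_complete(struct tasklet_struct *t)
{
	/* from_tasklet() is container_of() keyed on the tasklet member */
	struct my_cq *cq = from_tasklet(cq, t, comp_task);

	/* runs in softirq context, after the scheduling site returned */
	(void)cq;
}

static void my_cq_init(struct my_cq *cq)
{
	tasklet_setup(&cq->comp_task, my_send_complete);
}

static void my_cq_notify(struct my_cq *cq)
{
	/* legal from any context, including with spinlocks held */
	tasklet_schedule(&cq->comp_task);
}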