diff mbox series

RDMA/rxe: Restore tasklet call for rxe_cq.c

Message ID 20240711014006.11294-1-honggangli@163.com (mailing list archive)
State Rejected
Headers show
Series RDMA/rxe: Restore tasklet call for rxe_cq.c | expand

Commit Message

Honggang LI July 11, 2024, 1:40 a.m. UTC
If ib_req_notify_cq() is called from within the completion handler, a
deadlock occurs in the receive path: rxe_cq_post() invokes the handler
while still holding cq->cq_lock, so the handler's call back into
rxe_req_notify_cq() tries to take the same lock again on the same CPU.

rxe_req_notify_cq+0x21/0x70 [rdma_rxe]
krping_cq_event_handler+0x26f/0x2c0 [rdma_krping]
rxe_cq_post+0x128/0x1d0 [rdma_rxe]
? copy_data+0xe0/0x230 [rdma_rxe]
rxe_responder+0xbe8/0x23a0 [rdma_rxe]
do_task+0x68/0x1e0 [rdma_rxe]
? __pfx_rxe_udp_encap_recv+0x10/0x10 [rdma_rxe]
rxe_udp_encap_recv+0x79/0xe0 [rdma_rxe]
udp_queue_rcv_one_skb+0x272/0x540
udp_unicast_rcv_skb+0x76/0x90
__udp4_lib_rcv+0xab2/0xd60
? raw_local_deliver+0xd2/0x2a0
ip_protocol_deliver_rcu+0xd1/0x1b0
ip_local_deliver_finish+0x76/0xa0
ip_local_deliver+0x68/0x100
? ip_rcv_finish_core.isra.0+0xc2/0x430
ip_sublist_rcv+0x2a0/0x340
ip_list_rcv+0x13b/0x170
__netif_receive_skb_list_core+0x2bb/0x2e0
netif_receive_skb_list_internal+0x1cd/0x300
napi_complete_done+0x72/0x200
e1000e_poll+0xcf/0x320 [e1000e]
__napi_poll+0x2b/0x160
net_rx_action+0x2c6/0x3b0
? enqueue_hrtimer+0x42/0xa0
handle_softirqs+0xf7/0x340
? sched_clock_cpu+0xf/0x1f0
__irq_exit_rcu+0x97/0xb0
common_interrupt+0x85/0xa0

Fixes: 0c7e314a6352 ("RDMA/rxe: Fix rxe_cq_post")
Fixes: 78b26a335310 ("RDMA/rxe: Remove tasklet call from rxe_cq.c")
Signed-off-by: Honggang LI <honggangli@163.com>
---
 drivers/infiniband/sw/rxe/rxe_cq.c    | 35 ++++++++++++++++++++++++---
 drivers/infiniband/sw/rxe/rxe_loc.h   |  2 ++
 drivers/infiniband/sw/rxe/rxe_verbs.c |  2 ++
 drivers/infiniband/sw/rxe/rxe_verbs.h |  2 ++
 4 files changed, 37 insertions(+), 4 deletions(-)

Comments

Greg Sword July 11, 2024, 3:06 a.m. UTC | #1
On Thu, Jul 11, 2024 at 9:41 AM Honggang LI <honggangli@163.com> wrote:
>
> If ib_req_notify_cq() is called from within the completion handler, a
> deadlock occurs in the receive path.
>
> rxe_req_notify_cq+0x21/0x70 [rdma_rxe]
> krping_cq_event_handler+0x26f/0x2c0 [rdma_krping]

What is rdma_krping? What is the deadlock?
Please explain the deadlock in detail.


> rxe_cq_post+0x128/0x1d0 [rdma_rxe]
> ? copy_data+0xe0/0x230 [rdma_rxe]
> rxe_responder+0xbe8/0x23a0 [rdma_rxe]
> do_task+0x68/0x1e0 [rdma_rxe]
> ? __pfx_rxe_udp_encap_recv+0x10/0x10 [rdma_rxe]
> rxe_udp_encap_recv+0x79/0xe0 [rdma_rxe]
> udp_queue_rcv_one_skb+0x272/0x540
> udp_unicast_rcv_skb+0x76/0x90
> __udp4_lib_rcv+0xab2/0xd60
> ? raw_local_deliver+0xd2/0x2a0
> ip_protocol_deliver_rcu+0xd1/0x1b0
> ip_local_deliver_finish+0x76/0xa0
> ip_local_deliver+0x68/0x100
> ? ip_rcv_finish_core.isra.0+0xc2/0x430
> ip_sublist_rcv+0x2a0/0x340
> ip_list_rcv+0x13b/0x170
> __netif_receive_skb_list_core+0x2bb/0x2e0
> netif_receive_skb_list_internal+0x1cd/0x300
> napi_complete_done+0x72/0x200
> e1000e_poll+0xcf/0x320 [e1000e]
> __napi_poll+0x2b/0x160
> net_rx_action+0x2c6/0x3b0
> ? enqueue_hrtimer+0x42/0xa0
> handle_softirqs+0xf7/0x340
> ? sched_clock_cpu+0xf/0x1f0
> __irq_exit_rcu+0x97/0xb0
> common_interrupt+0x85/0xa0
>
> Fixes: 0c7e314a6352 ("RDMA/rxe: Fix rxe_cq_post")
> Fixes: 78b26a335310 ("RDMA/rxe: Remove tasklet call from rxe_cq.c")
> Signed-off-by: Honggang LI <honggangli@163.com>
> ---
>  drivers/infiniband/sw/rxe/rxe_cq.c    | 35 ++++++++++++++++++++++++---
>  drivers/infiniband/sw/rxe/rxe_loc.h   |  2 ++
>  drivers/infiniband/sw/rxe/rxe_verbs.c |  2 ++
>  drivers/infiniband/sw/rxe/rxe_verbs.h |  2 ++
>  4 files changed, 37 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/infiniband/sw/rxe/rxe_cq.c b/drivers/infiniband/sw/rxe/rxe_cq.c
> index fec87c9030ab..97a537994aee 100644
> --- a/drivers/infiniband/sw/rxe/rxe_cq.c
> +++ b/drivers/infiniband/sw/rxe/rxe_cq.c
> @@ -39,6 +39,21 @@ int rxe_cq_chk_attr(struct rxe_dev *rxe, struct rxe_cq *cq,
>         return -EINVAL;
>  }
>
> +static void rxe_send_complete(struct tasklet_struct *t)
> +{
> +       struct rxe_cq *cq = from_tasklet(cq, t, comp_task);
> +       unsigned long flags;
> +
> +       spin_lock_irqsave(&cq->cq_lock, flags);
> +       if (cq->is_dying) {
> +               spin_unlock_irqrestore(&cq->cq_lock, flags);
> +               return;
> +       }
> +       spin_unlock_irqrestore(&cq->cq_lock, flags);
> +
> +       cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context);
> +}
> +
>  int rxe_cq_from_init(struct rxe_dev *rxe, struct rxe_cq *cq, int cqe,
>                      int comp_vector, struct ib_udata *udata,
>                      struct rxe_create_cq_resp __user *uresp)
> @@ -64,6 +79,10 @@ int rxe_cq_from_init(struct rxe_dev *rxe, struct rxe_cq *cq, int cqe,
>
>         cq->is_user = uresp;
>
> +       cq->is_dying = false;
> +
> +       tasklet_setup(&cq->comp_task, rxe_send_complete);
> +
>         spin_lock_init(&cq->cq_lock);
>         cq->ibcq.cqe = cqe;
>         return 0;
> @@ -84,7 +103,6 @@ int rxe_cq_resize_queue(struct rxe_cq *cq, int cqe,
>         return err;
>  }
>
> -/* caller holds reference to cq */
>  int rxe_cq_post(struct rxe_cq *cq, struct rxe_cqe *cqe, int solicited)
>  {
>         struct ib_event ev;
> @@ -113,17 +131,26 @@ int rxe_cq_post(struct rxe_cq *cq, struct rxe_cqe *cqe, int solicited)
>
>         queue_advance_producer(cq->queue, QUEUE_TYPE_TO_CLIENT);
>
> +       spin_unlock_irqrestore(&cq->cq_lock, flags);
> +
>         if ((cq->notify & IB_CQ_NEXT_COMP) ||
>             (cq->notify & IB_CQ_SOLICITED && solicited)) {
>                 cq->notify = 0;
> -               cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context);
> +               tasklet_schedule(&cq->comp_task);
>         }
>
> -       spin_unlock_irqrestore(&cq->cq_lock, flags);
> -
>         return 0;
>  }
>
> +void rxe_cq_disable(struct rxe_cq *cq)
> +{
> +       unsigned long flags;
> +
> +       spin_lock_irqsave(&cq->cq_lock, flags);
> +       cq->is_dying = true;
> +       spin_unlock_irqrestore(&cq->cq_lock, flags);
> +}
> +
>  void rxe_cq_cleanup(struct rxe_pool_elem *elem)
>  {
>         struct rxe_cq *cq = container_of(elem, typeof(*cq), elem);
> diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/infiniband/sw/rxe/rxe_loc.h
> index ded46119151b..ba84f780aa3d 100644
> --- a/drivers/infiniband/sw/rxe/rxe_loc.h
> +++ b/drivers/infiniband/sw/rxe/rxe_loc.h
> @@ -31,6 +31,8 @@ int rxe_cq_resize_queue(struct rxe_cq *cq, int new_cqe,
>
>  int rxe_cq_post(struct rxe_cq *cq, struct rxe_cqe *cqe, int solicited);
>
> +void rxe_cq_disable(struct rxe_cq *cq);
> +
>  void rxe_cq_cleanup(struct rxe_pool_elem *elem);
>
>  /* rxe_mcast.c */
> diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c
> index de6238ee4379..a964d86789f6 100644
> --- a/drivers/infiniband/sw/rxe/rxe_verbs.c
> +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c
> @@ -1205,6 +1205,8 @@ static int rxe_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
>                 goto err_out;
>         }
>
> +       rxe_cq_disable(cq);
> +
>         err = rxe_cleanup(cq);
>         if (err)
>                 rxe_err_cq(cq, "cleanup failed, err = %d\n", err);
> diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h
> index 3c1354f82283..03df97e83570 100644
> --- a/drivers/infiniband/sw/rxe/rxe_verbs.h
> +++ b/drivers/infiniband/sw/rxe/rxe_verbs.h
> @@ -63,7 +63,9 @@ struct rxe_cq {
>         struct rxe_queue        *queue;
>         spinlock_t              cq_lock;
>         u8                      notify;
> +       bool                    is_dying;
>         bool                    is_user;
> +       struct tasklet_struct   comp_task;
>         atomic_t                num_wq;
>  };
>
> --
> 2.45.2
>
>
Honggang LI July 11, 2024, 7:01 a.m. UTC | #2
On Thu, Jul 11, 2024 at 11:06:06AM +0800, Greg Sword wrote:
> Subject: Re: [PATCH] RDMA/rxe: Restore tasklet call for rxe_cq.c
> From: Greg Sword <gregsword0@gmail.com>
> Date: Thu, 11 Jul 2024 11:06:06 +0800
> 
> On Thu, Jul 11, 2024 at 9:41 AM Honggang LI <honggangli@163.com> wrote:
> >
> > If ib_req_notify_cq() is called from within the completion handler, a
> > deadlock occurs in the receive path.
> >
> > rxe_req_notify_cq+0x21/0x70 [rdma_rxe]
> > krping_cq_event_handler+0x26f/0x2c0 [rdma_krping]
> 
> What is rdma_krping? What is the deadlock?

https://github.com/larrystevenwise/krping.git

> Please explain the deadlock in detail.

   88 int rxe_cq_post(struct rxe_cq *cq, struct rxe_cqe *cqe, int solicited)
   89 {
   90         struct ib_event ev;
   91         int full;
   92         void *addr;
   93         unsigned long flags;
   94 
   95         spin_lock_irqsave(&cq->cq_lock, flags);  // Lock!
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
   96 
   97         full = queue_full(cq->queue, QUEUE_TYPE_TO_CLIENT);
   98         if (unlikely(full)) {
   99                 rxe_err_cq(cq, "queue full\n");
  100                 spin_unlock_irqrestore(&cq->cq_lock, flags);
  101                 if (cq->ibcq.event_handler) {
  102                         ev.device = cq->ibcq.device;
  103                         ev.element.cq = &cq->ibcq;
  104                         ev.event = IB_EVENT_CQ_ERR;
  105                         cq->ibcq.event_handler(&ev, cq->ibcq.cq_context);
  106                 }
  107 
  108                 return -EBUSY;
  109         }
  110 
  111         addr = queue_producer_addr(cq->queue, QUEUE_TYPE_TO_CLIENT);
  112         memcpy(addr, cqe, sizeof(*cqe));
  113 
  114         queue_advance_producer(cq->queue, QUEUE_TYPE_TO_CLIENT);
  115 
  116         if ((cq->notify & IB_CQ_NEXT_COMP) ||
  117             (cq->notify & IB_CQ_SOLICITED && solicited)) {
  118                 cq->notify = 0;
  119                 cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context);
                      ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
		      calls the completion handler krping_cq_event_handler()
  120         }
  121 
  122         spin_unlock_irqrestore(&cq->cq_lock, flags);



static void krping_cq_event_handler(struct ib_cq *cq, void *ctx)
{
        struct krping_cb *cb = ctx;
        struct ib_wc wc;
        const struct ib_recv_wr *bad_wr;
        int ret;

        BUG_ON(cb->cq != cq);
        if (cb->state == ERROR) {
                printk(KERN_ERR PFX "cq completion in ERROR state\n");
                return;
        }
        if (cb->frtest) {
                printk(KERN_ERR PFX "cq completion event in frtest!\n");
                return;
        }
        if (!cb->wlat && !cb->rlat && !cb->bw)
                ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP);
		^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
        while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 1) {
                if (wc.status) {

static int rxe_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags)
{
        struct rxe_cq *cq = to_rcq(ibcq);
        int ret = 0;
        int empty;
        unsigned long irq_flags;

        spin_lock_irqsave(&cq->cq_lock, irq_flags);
	^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Deadlock
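
In short: rxe_cq_post() invokes cq->ibcq.comp_handler() while still
holding cq->cq_lock, so a handler that calls back into
ib_req_notify_cq() re-acquires the same non-recursive spinlock on the
same CPU and spins forever with interrupts disabled. A minimal
stand-alone sketch of the pattern (illustrative only; hypothetical
names, not rxe code):

#include <linux/spinlock.h>

static DEFINE_SPINLOCK(demo_lock);

/* Stands in for rxe_req_notify_cq(). */
static void demo_notify(void)
{
	unsigned long flags;

	/* Second acquire on the same CPU: spins forever. */
	spin_lock_irqsave(&demo_lock, flags);
	spin_unlock_irqrestore(&demo_lock, flags);
}

/* Stands in for rxe_cq_post(). */
static void demo_post(void)
{
	unsigned long flags;

	spin_lock_irqsave(&demo_lock, flags);	/* first acquire */
	demo_notify();	/* comp_handler -> ib_req_notify_cq() -> same lock */
	spin_unlock_irqrestore(&demo_lock, flags);
}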
Zhu Yanjun July 11, 2024, 1:46 p.m. UTC | #3
On 2024/7/11 9:01, Honggang LI wrote:
> On Thu, Jul 11, 2024 at 11:06:06AM +0800, Greg Sword wrote:
>> Subject: Re: [PATCH] RDMA/rxe: Restore tasklet call for rxe_cq.c
>> From: Greg Sword <gregsword0@gmail.com>
>> Date: Thu, 11 Jul 2024 11:06:06 +0800
>>
>> On Thu, Jul 11, 2024 at 9:41 AM Honggang LI <honggangli@163.com> wrote:
>>>
>>> If ib_req_notify_cq() is called from within the completion handler, a
>>> deadlock occurs in the receive path.
>>>
>>> rxe_req_notify_cq+0x21/0x70 [rdma_rxe]
>>> krping_cq_event_handler+0x26f/0x2c0 [rdma_krping]
>>
>> What is rdma_krping? What is the deadlock?
> 
> https://github.com/larrystevenwise/krping.git
> 
>> Please explain the deadlock in detail.

I read the discussion carefully. I have the following comments:
1). This problem does not come from RXE; it seems to be related to the
krping module. As such, the root cause is not in RXE, and it is not
good to fix this problem in RXE.

2). In the upstream kernel, tasklets are marked obsolete and have some
design flaws, so replacing the workqueue with a tasklet in RXE does not
keep up with upstream.

https://patchwork.kernel.org/project/linux-rdma/cover/20240621050525.3720069-1-allen.lkml@gmail.com/
In this link, there is ongoing work to replace tasklets with BH
workqueues. As such, it is not good to replace the workqueue with a
tasklet.
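
For reference, the mechanical shape of that conversion is roughly the
following sketch (untested; comp_work and rxe_send_complete_work are
hypothetical names, and it assumes the BH workqueue API, e.g.
system_bh_wq, from the series above):

/* tasklet version (what this patch restores): */
tasklet_setup(&cq->comp_task, rxe_send_complete);
tasklet_schedule(&cq->comp_task);

/* BH workqueue equivalent; note the handler takes a
 * struct work_struct * instead of a struct tasklet_struct *:
 */
INIT_WORK(&cq->comp_work, rxe_send_complete_work);
queue_work(system_bh_wq, &cq->comp_work);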

Given the above, for now I cannot agree with you. This is just my two
cents.

I am not sure whether others have better suggestions about this commit.

Zhu Yanjun
> 
>     88 int rxe_cq_post(struct rxe_cq *cq, struct rxe_cqe *cqe, int solicited)
>     89 {
>     90         struct ib_event ev;
>     91         int full;
>     92         void *addr;
>     93         unsigned long flags;
>     94
>     95         spin_lock_irqsave(&cq->cq_lock, flags);  // Lock!
>                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
>     96
>     97         full = queue_full(cq->queue, QUEUE_TYPE_TO_CLIENT);
>     98         if (unlikely(full)) {
>     99                 rxe_err_cq(cq, "queue full\n");
>    100                 spin_unlock_irqrestore(&cq->cq_lock, flags);
>    101                 if (cq->ibcq.event_handler) {
>    102                         ev.device = cq->ibcq.device;
>    103                         ev.element.cq = &cq->ibcq;
>    104                         ev.event = IB_EVENT_CQ_ERR;
>    105                         cq->ibcq.event_handler(&ev, cq->ibcq.cq_context);
>    106                 }
>    107
>    108                 return -EBUSY;
>    109         }
>    110
>    111         addr = queue_producer_addr(cq->queue, QUEUE_TYPE_TO_CLIENT);
>    112         memcpy(addr, cqe, sizeof(*cqe));
>    113
>    114         queue_advance_producer(cq->queue, QUEUE_TYPE_TO_CLIENT);
>    115
>    116         if ((cq->notify & IB_CQ_NEXT_COMP) ||
>    117             (cq->notify & IB_CQ_SOLICITED && solicited)) {
>    118                 cq->notify = 0;
>    119                 cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context);
>                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
> 		      calls the completion handler krping_cq_event_handler()
>    120         }
>    121
>    122         spin_unlock_irqrestore(&cq->cq_lock, flags);
> 
> 
> 
> static void krping_cq_event_handler(struct ib_cq *cq, void *ctx)
> {
>          struct krping_cb *cb = ctx;
>          struct ib_wc wc;
>          const struct ib_recv_wr *bad_wr;
>          int ret;
> 
>          BUG_ON(cb->cq != cq);
>          if (cb->state == ERROR) {
>                  printk(KERN_ERR PFX "cq completion in ERROR state\n");
>                  return;
>          }
>          if (cb->frtest) {
>                  printk(KERN_ERR PFX "cq completion event in frtest!\n");
>                  return;
>          }
>          if (!cb->wlat && !cb->rlat && !cb->bw)
>                  ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP);
> 		^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
>          while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 1) {
>                  if (wc.status) {
> 
> static int rxe_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags)
> {
>          struct rxe_cq *cq = to_rcq(ibcq);
>          int ret = 0;
>          int empty;
>          unsigned long irq_flags;
> 
>          spin_lock_irqsave(&cq->cq_lock, irq_flags);
> 	^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Deadlock
>
Zhu Yanjun July 11, 2024, 11:25 p.m. UTC | #4
On 2024/7/11 9:01, Honggang LI wrote:
> On Thu, Jul 11, 2024 at 11:06:06AM +0800, Greg Sword wrote:
>> Subject: Re: [PATCH] RDMA/rxe: Restore tasklet call for rxe_cq.c
>> From: Greg Sword <gregsword0@gmail.com>
>> Date: Thu, 11 Jul 2024 11:06:06 +0800
>>
>> On Thu, Jul 11, 2024 at 9:41 AM Honggang LI <honggangli@163.com> wrote:
>>>
>>> If ib_req_notify_cq() is called from within the completion handler, a
>>> deadlock occurs in the receive path.
>>>
>>> rxe_req_notify_cq+0x21/0x70 [rdma_rxe]
>>> krping_cq_event_handler+0x26f/0x2c0 [rdma_krping]
>>
>> What is rdma_krping? What is the deadlock?
> 
> https://github.com/larrystevenwise/krping.git
> 
>> Please explain the deadlock in detail.
> 
>     88 int rxe_cq_post(struct rxe_cq *cq, struct rxe_cqe *cqe, int solicited)
>     89 {
>     90         struct ib_event ev;
>     91         int full;
>     92         void *addr;
>     93         unsigned long flags;
>     94
>     95         spin_lock_irqsave(&cq->cq_lock, flags);  // Lock!
>                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
>     96
>     97         full = queue_full(cq->queue, QUEUE_TYPE_TO_CLIENT);
>     98         if (unlikely(full)) {
>     99                 rxe_err_cq(cq, "queue full\n");
>    100                 spin_unlock_irqrestore(&cq->cq_lock, flags);
>    101                 if (cq->ibcq.event_handler) {
>    102                         ev.device = cq->ibcq.device;
>    103                         ev.element.cq = &cq->ibcq;
>    104                         ev.event = IB_EVENT_CQ_ERR;
>    105                         cq->ibcq.event_handler(&ev, cq->ibcq.cq_context);
>    106                 }
>    107
>    108                 return -EBUSY;
>    109         }
>    110
>    111         addr = queue_producer_addr(cq->queue, QUEUE_TYPE_TO_CLIENT);
>    112         memcpy(addr, cqe, sizeof(*cqe));
>    113
>    114         queue_advance_producer(cq->queue, QUEUE_TYPE_TO_CLIENT);
>    115
>    116         if ((cq->notify & IB_CQ_NEXT_COMP) ||
>    117             (cq->notify & IB_CQ_SOLICITED && solicited)) {
>    118                 cq->notify = 0;
>    119                 cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context);
>                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
> 		      calls the completion handler krping_cq_event_handler()
>    120         }
>    121
>    122         spin_unlock_irqrestore(&cq->cq_lock, flags);
> 
> 
> 
> static void krping_cq_event_handler(struct ib_cq *cq, void *ctx)
> {
>          struct krping_cb *cb = ctx;
>          struct ib_wc wc;
>          const struct ib_recv_wr *bad_wr;
>          int ret;
> 
>          BUG_ON(cb->cq != cq);
>          if (cb->state == ERROR) {
>                  printk(KERN_ERR PFX "cq completion in ERROR state\n");
>                  return;
>          }
>          if (cb->frtest) {
>                  printk(KERN_ERR PFX "cq completion event in frtest!\n");
>                  return;
>          }
>          if (!cb->wlat && !cb->rlat && !cb->bw)
>                  ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP);
> 		^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

IMO, here we could use a BH workqueue to execute this notification, or
add an event handler.

Please refer to other ULP kernel modules.
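
For example, a minimal sketch (untested; assumes the BH workqueue API,
system_bh_wq, from the tasklet-conversion series; the work item and
helper names are hypothetical) that defers only the notification out
of the completion handler:

#include <linux/workqueue.h>
#include <rdma/ib_verbs.h>

/* Assumed additions to krping's control block: */
struct krping_cb {
	struct ib_cq *cq;
	struct work_struct notify_work;
	/* ... rest of krping_cb as in krping ... */
};

static void krping_notify_work(struct work_struct *work)
{
	struct krping_cb *cb = container_of(work, struct krping_cb,
					    notify_work);

	/* Runs outside rxe_cq_post(), so cq->cq_lock is not held here. */
	ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP);
}

/* Set up once with INIT_WORK(&cb->notify_work, krping_notify_work);
 * then in krping_cq_event_handler(), instead of calling
 * ib_req_notify_cq() directly:
 */
static void krping_request_notify(struct krping_cb *cb)
{
	queue_work(system_bh_wq, &cb->notify_work);
}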

Thanks,
Zhu Yanjun

>          while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 1) {
>                  if (wc.status) {
> 
> static int rxe_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags)
> {
>          struct rxe_cq *cq = to_rcq(ibcq);
>          int ret = 0;
>          int empty;
>          unsigned long irq_flags;
> 
>          spin_lock_irqsave(&cq->cq_lock, irq_flags);
> 	^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Deadlock
>
Honggang LI July 12, 2024, 2:46 a.m. UTC | #5
On Thu, Jul 11, 2024 at 03:46:24PM +0200, Zhu Yanjun wrote:
> Subject: Re: [PATCH] RDMA/rxe: Restore tasklet call for rxe_cq.c
> > 
> > > Please explain the deadlock in detail.
> 
> I read the discussion carefully. I have the following comments:
> 1). This problem does not come from RXE; it seems to be related to the
> krping module. As such, the root cause is not in RXE, and it is not
> good to fix this problem in RXE.

I can't say it is a problem in krping, as krping works over
irdma/mlx5/siw; the deadlock only happens with rxe.

> 
> 2). In the upstream kernel, tasklets are marked obsolete and have some
> design flaws, so replacing the workqueue with a tasklet in RXE does not
> keep up with upstream.
> 
> https://patchwork.kernel.org/project/linux-rdma/cover/20240621050525.3720069-1-allen.lkml@gmail.com/
> In this link, there is ongoing work to replace tasklets with BH
> workqueues. As such, it is not good to replace the workqueue with a tasklet.

Yes, you are right. Tasklets are obsolete. We need a better solution
for this issue; please feel free to drop the patch.
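
One possible rxe-side direction (just a sketch, untested, not this
patch): sample and clear cq->notify under cq_lock, but invoke the
completion handler only after dropping the lock, so a handler that
re-enters ib_req_notify_cq() cannot self-deadlock:

/* Hypothetical helper for the tail of rxe_cq_post(), after the CQE
 * has been queued; assumes the types from rxe_verbs.h.
 */
static void rxe_cq_notify_unlocked(struct rxe_cq *cq, int solicited)
{
	unsigned long flags;
	bool notify;

	spin_lock_irqsave(&cq->cq_lock, flags);
	notify = (cq->notify & IB_CQ_NEXT_COMP) ||
		 (cq->notify & IB_CQ_SOLICITED && solicited);
	if (notify)
		cq->notify = 0;
	spin_unlock_irqrestore(&cq->cq_lock, flags);

	/* Handler runs without cq_lock held. */
	if (notify)
		cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context);
}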
Zhu Yanjun Sept. 15, 2024, 12:26 p.m. UTC | #6
On 2024/7/12 7:25, Zhu Yanjun wrote:
> On 2024/7/11 9:01, Honggang LI wrote:
>> On Thu, Jul 11, 2024 at 11:06:06AM +0800, Greg Sword wrote:
>>> Subject: Re: [PATCH] RDMA/rxe: Restore tasklet call for rxe_cq.c
>>> From: Greg Sword <gregsword0@gmail.com>
>>> Date: Thu, 11 Jul 2024 11:06:06 +0800
>>>
>>> On Thu, Jul 11, 2024 at 9:41 AM Honggang LI <honggangli@163.com> wrote:
>>>>
>>>> If ib_req_notify_cq() is called from within the completion handler, a
>>>> deadlock occurs in the receive path.
>>>>
>>>> rxe_req_notify_cq+0x21/0x70 [rdma_rxe]
>>>> krping_cq_event_handler+0x26f/0x2c0 [rdma_krping]
>>>
>>> What is rdma_krping? What is the deadlock?
>>
>> https://github.com/larrystevenwise/krping.git
>>
>>> Please explain the deadlock in detail.
>>
>>     88 int rxe_cq_post(struct rxe_cq *cq, struct rxe_cqe *cqe, int solicited)
>>     89 {
>>     90         struct ib_event ev;
>>     91         int full;
>>     92         void *addr;
>>     93         unsigned long flags;
>>     94
>>     95         spin_lock_irqsave(&cq->cq_lock, flags);  // Lock!
>>                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
>>     96
>>     97         full = queue_full(cq->queue, QUEUE_TYPE_TO_CLIENT);
>>     98         if (unlikely(full)) {
>>     99                 rxe_err_cq(cq, "queue full\n");
>>    100                 spin_unlock_irqrestore(&cq->cq_lock, flags);
>>    101                 if (cq->ibcq.event_handler) {
>>    102                         ev.device = cq->ibcq.device;
>>    103                         ev.element.cq = &cq->ibcq;
>>    104                         ev.event = IB_EVENT_CQ_ERR;
>>    105                         cq->ibcq.event_handler(&ev, cq->ibcq.cq_context);
>>    106                 }
>>    107
>>    108                 return -EBUSY;
>>    109         }
>>    110
>>    111         addr = queue_producer_addr(cq->queue, QUEUE_TYPE_TO_CLIENT);
>>    112         memcpy(addr, cqe, sizeof(*cqe));
>>    113
>>    114         queue_advance_producer(cq->queue, QUEUE_TYPE_TO_CLIENT);
>>    115
>>    116         if ((cq->notify & IB_CQ_NEXT_COMP) ||
>>    117             (cq->notify & IB_CQ_SOLICITED && solicited)) {
>>    118                 cq->notify = 0;
>>    119                 cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context);
>>                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
>>               calls the completion handler krping_cq_event_handler()

Today I moved krping_cq_event_handler into a workqueue, and krping now
works well on SoftRoCE.

That is, I modified the source code of krping and moved
krping_cq_event_handler into the system workqueue.

Then I ran the krping tests on SoftRoCE; they work well.
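
Roughly, a sketch of that kind of change (untested; cq_work and
krping_cq_worker are hypothetical names):

/* Assumed addition to krping_cb: struct work_struct cq_work;
 * set up once with INIT_WORK(&cb->cq_work, krping_cq_worker).
 */
static void krping_cq_worker(struct work_struct *work)
{
	struct krping_cb *cb = container_of(work, struct krping_cb, cq_work);

	/* Former body of krping_cq_event_handler(): now runs in the
	 * system workqueue, so ib_req_notify_cq() and the ib_poll_cq()
	 * loop no longer nest inside rxe_cq_post() under cq->cq_lock.
	 */
	if (!cb->wlat && !cb->rlat && !cb->bw)
		ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP);
	/* ... ib_poll_cq() loop as before ... */
}

static void krping_cq_event_handler(struct ib_cq *cq, void *ctx)
{
	struct krping_cb *cb = ctx;

	/* Defer all CQ handling out of the rxe completion path. */
	schedule_work(&cb->cq_work);
}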

Zhu Yanjun

>>    120         }
>>    121
>>    122         spin_unlock_irqrestore(&cq->cq_lock, flags);
>>
>>
>>
>> static void krping_cq_event_handler(struct ib_cq *cq, void *ctx)
>> {
>>          struct krping_cb *cb = ctx;
>>          struct ib_wc wc;
>>          const struct ib_recv_wr *bad_wr;
>>          int ret;
>>
>>          BUG_ON(cb->cq != cq);
>>          if (cb->state == ERROR) {
>>                  printk(KERN_ERR PFX "cq completion in ERROR state\n");
>>                  return;
>>          }
>>          if (cb->frtest) {
>>                  printk(KERN_ERR PFX "cq completion event in frtest!\n");
>>                  return;
>>          }
>>          if (!cb->wlat && !cb->rlat && !cb->bw)
>>                  ib_req_notify_cq(cb->cq, IB_CQ_NEXT_COMP);
>>                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
>
> IMO, here we could use a BH workqueue to execute this notification, or
> add an event handler.
>
> Please refer to other ULP kernel modules.
>
> Thanks,
> Zhu Yanjun
>
>>          while ((ret = ib_poll_cq(cb->cq, 1, &wc)) == 1) {
>>                  if (wc.status) {
>>
>> static int rxe_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags)
>> {
>>          struct rxe_cq *cq = to_rcq(ibcq);
>>          int ret = 0;
>>          int empty;
>>          unsigned long irq_flags;
>>
>>          spin_lock_irqsave(&cq->cq_lock, irq_flags);
>>     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Deadlock
>>
>
diff mbox series

Patch

diff --git a/drivers/infiniband/sw/rxe/rxe_cq.c b/drivers/infiniband/sw/rxe/rxe_cq.c
index fec87c9030ab..97a537994aee 100644
--- a/drivers/infiniband/sw/rxe/rxe_cq.c
+++ b/drivers/infiniband/sw/rxe/rxe_cq.c
@@ -39,6 +39,21 @@  int rxe_cq_chk_attr(struct rxe_dev *rxe, struct rxe_cq *cq,
 	return -EINVAL;
 }
 
+static void rxe_send_complete(struct tasklet_struct *t)
+{
+	struct rxe_cq *cq = from_tasklet(cq, t, comp_task);
+	unsigned long flags;
+
+	spin_lock_irqsave(&cq->cq_lock, flags);
+	if (cq->is_dying) {
+		spin_unlock_irqrestore(&cq->cq_lock, flags);
+		return;
+	}
+	spin_unlock_irqrestore(&cq->cq_lock, flags);
+
+	cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context);
+}
+
 int rxe_cq_from_init(struct rxe_dev *rxe, struct rxe_cq *cq, int cqe,
 		     int comp_vector, struct ib_udata *udata,
 		     struct rxe_create_cq_resp __user *uresp)
@@ -64,6 +79,10 @@  int rxe_cq_from_init(struct rxe_dev *rxe, struct rxe_cq *cq, int cqe,
 
 	cq->is_user = uresp;
 
+	cq->is_dying = false;
+
+	tasklet_setup(&cq->comp_task, rxe_send_complete);
+
 	spin_lock_init(&cq->cq_lock);
 	cq->ibcq.cqe = cqe;
 	return 0;
@@ -84,7 +103,6 @@  int rxe_cq_resize_queue(struct rxe_cq *cq, int cqe,
 	return err;
 }
 
-/* caller holds reference to cq */
 int rxe_cq_post(struct rxe_cq *cq, struct rxe_cqe *cqe, int solicited)
 {
 	struct ib_event ev;
@@ -113,17 +131,26 @@  int rxe_cq_post(struct rxe_cq *cq, struct rxe_cqe *cqe, int solicited)
 
 	queue_advance_producer(cq->queue, QUEUE_TYPE_TO_CLIENT);
 
+	spin_unlock_irqrestore(&cq->cq_lock, flags);
+
 	if ((cq->notify & IB_CQ_NEXT_COMP) ||
 	    (cq->notify & IB_CQ_SOLICITED && solicited)) {
 		cq->notify = 0;
-		cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context);
+		tasklet_schedule(&cq->comp_task);
 	}
 
-	spin_unlock_irqrestore(&cq->cq_lock, flags);
-
 	return 0;
 }
 
+void rxe_cq_disable(struct rxe_cq *cq)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&cq->cq_lock, flags);
+	cq->is_dying = true;
+	spin_unlock_irqrestore(&cq->cq_lock, flags);
+}
+
 void rxe_cq_cleanup(struct rxe_pool_elem *elem)
 {
 	struct rxe_cq *cq = container_of(elem, typeof(*cq), elem);
diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/infiniband/sw/rxe/rxe_loc.h
index ded46119151b..ba84f780aa3d 100644
--- a/drivers/infiniband/sw/rxe/rxe_loc.h
+++ b/drivers/infiniband/sw/rxe/rxe_loc.h
@@ -31,6 +31,8 @@  int rxe_cq_resize_queue(struct rxe_cq *cq, int new_cqe,
 
 int rxe_cq_post(struct rxe_cq *cq, struct rxe_cqe *cqe, int solicited);
 
+void rxe_cq_disable(struct rxe_cq *cq);
+
 void rxe_cq_cleanup(struct rxe_pool_elem *elem);
 
 /* rxe_mcast.c */
diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c
index de6238ee4379..a964d86789f6 100644
--- a/drivers/infiniband/sw/rxe/rxe_verbs.c
+++ b/drivers/infiniband/sw/rxe/rxe_verbs.c
@@ -1205,6 +1205,8 @@  static int rxe_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
 		goto err_out;
 	}
 
+	rxe_cq_disable(cq);
+
 	err = rxe_cleanup(cq);
 	if (err)
 		rxe_err_cq(cq, "cleanup failed, err = %d\n", err);
diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h
index 3c1354f82283..03df97e83570 100644
--- a/drivers/infiniband/sw/rxe/rxe_verbs.h
+++ b/drivers/infiniband/sw/rxe/rxe_verbs.h
@@ -63,7 +63,9 @@  struct rxe_cq {
 	struct rxe_queue	*queue;
 	spinlock_t		cq_lock;
 	u8			notify;
+	bool			is_dying;
 	bool			is_user;
+	struct tasklet_struct	comp_task;
 	atomic_t		num_wq;
 };