
[for-rc] IB/isert: Fix hang in iscsit_wait_for_tag

Message ID 20230119210659.1871-1-shiraz.saleem@intel.com (mailing list archive)
State: Changes Requested
Series: [for-rc] IB/isert: Fix hang in iscsit_wait_for_tag

Commit Message

Shiraz Saleem Jan. 19, 2023, 9:07 p.m. UTC
From: Mustafa Ismail <mustafa.ismail@intel.com>

Running fio can occasionally cause a hang when sbitmap_queue_get() fails to
return a tag in iscsit_allocate_cmd(): iscsit_wait_for_tag() is then called
and never returns from schedule(). This is because the CQ polling thread is
suspended and will not poll for an SQ completion that would free up a tag.

Fix this by creating a separate CQ for the SQ, so that send completions are
processed in a separate context and are not blocked when the RQ CQ is
stalled.

Fixes: 10e9cbb6b531 ("scsi: target: Convert target drivers to use sbitmap")
Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com>
Signed-off-by: Mustafa Ismail <mustafa.ismail@intel.com>
Signed-off-by: Shiraz Saleem <shiraz.saleem@intel.com>
---
 drivers/infiniband/ulp/isert/ib_isert.c | 33 +++++++++++++++++++++++----------
 drivers/infiniband/ulp/isert/ib_isert.h |  3 ++-
 2 files changed, 25 insertions(+), 11 deletions(-)

Comments

Sagi Grimberg Jan. 23, 2023, 9:19 a.m. UTC | #1
> From: Mustafa Ismail <mustafa.ismail@intel.com>
> 
> Running fio can occasionally cause a hang when sbitmap_queue_get() fails to
> return a tag in iscsit_allocate_cmd() and iscsit_wait_for_tag() is called
> and will never return from the schedule(). This is because the polling
> thread of the CQ is suspended, and will not poll for a SQ completion which
> would free up a tag.
> Fix this by creating a separate CQ for the SQ so that send completions are
> processed on a separate thread and are not blocked when the RQ CQ is
> stalled.
> 
> Fixes: 10e9cbb6b531 ("scsi: target: Convert target drivers to use sbitmap")

Is this the real offending commit? What prevented this from happening
before?

> Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com>
> Signed-off-by: Mustafa Ismail <mustafa.ismail@intel.com>
> Signed-off-by: Shiraz Saleem <shiraz.saleem@intel.com>
> ---
>   drivers/infiniband/ulp/isert/ib_isert.c | 33 +++++++++++++++++++++++----------
>   drivers/infiniband/ulp/isert/ib_isert.h |  3 ++-
>   2 files changed, 25 insertions(+), 11 deletions(-)
> 
> diff --git a/drivers/infiniband/ulp/isert/ib_isert.c b/drivers/infiniband/ulp/isert/ib_isert.c
> index 7540488..f827b91 100644
> --- a/drivers/infiniband/ulp/isert/ib_isert.c
> +++ b/drivers/infiniband/ulp/isert/ib_isert.c
> @@ -109,19 +109,27 @@ static int isert_sg_tablesize_set(const char *val, const struct kernel_param *kp
>   	struct ib_qp_init_attr attr;
>   	int ret, factor;
>   
> -	isert_conn->cq = ib_cq_pool_get(ib_dev, cq_size, -1, IB_POLL_WORKQUEUE);
> -	if (IS_ERR(isert_conn->cq)) {
> -		isert_err("Unable to allocate cq\n");
> -		ret = PTR_ERR(isert_conn->cq);
> +	isert_conn->snd_cq = ib_cq_pool_get(ib_dev, cq_size, -1,
> +					    IB_POLL_WORKQUEUE);
> +	if (IS_ERR(isert_conn->snd_cq)) {
> +		isert_err("Unable to allocate send cq\n");
> +		ret = PTR_ERR(isert_conn->snd_cq);
>   		return ERR_PTR(ret);
>   	}
> +	isert_conn->rcv_cq = ib_cq_pool_get(ib_dev, cq_size, -1,
> +					    IB_POLL_WORKQUEUE);
> +	if (IS_ERR(isert_conn->rcv_cq)) {
> +		isert_err("Unable to allocate receive cq\n");
> +		ret = PTR_ERR(isert_conn->rcv_cq);
> +		goto create_cq_err;
> +	}

Does this have any noticeable performance implications?

Also I wonder if there are any other assumptions in the code
about having a single context processing completions...

It'd be much easier if iscsi_allocate_cmd could accept
a timeout to fail...
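
For illustration, a purely hypothetical sketch of what a bounded wait could
look like (the function name, the timeout parameter and its placement are
assumptions, not existing kernel API):

/*
 * Hypothetical: a variant of iscsit_wait_for_tag() that gives up after
 * @timeout jiffies instead of sleeping in schedule() indefinitely.
 */
static int iscsit_wait_for_tag_timeout(struct se_session *se_sess, int state,
                                       int *cpup, unsigned long timeout)
{
    struct sbitmap_queue *sbq = &se_sess->sess_tag_pool;
    struct sbq_wait_state *ws = &sbq->ws[0];
    DEFINE_SBQ_WAIT(wait);
    int tag = -1;

    if (state == TASK_RUNNING)
        return tag;

    for (;;) {
        sbitmap_prepare_to_wait(sbq, ws, &wait, state);
        if (signal_pending_state(state, current))
            break;
        tag = sbitmap_queue_get(sbq, cpup);
        if (tag >= 0)
            break;
        timeout = schedule_timeout(timeout);
        if (!timeout)
            break;  /* no tag was freed in time: fail the allocation */
    }

    sbitmap_finish_wait(sbq, ws, &wait);
    return tag;
}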

CCing target-devel and Mike.
Sindhu Devale Jan. 30, 2023, 6:22 p.m. UTC | #2
> Subject: Re: [PATCH for-rc] IB/isert: Fix hang in iscsit_wait_for_tag
> 
> 
> > From: Mustafa Ismail <mustafa.ismail@intel.com>
> >
> > Running fio can occasionally cause a hang when sbitmap_queue_get()
> > fails to return a tag in iscsit_allocate_cmd() and
> > iscsit_wait_for_tag() is called and will never return from the
> > schedule(). This is because the polling thread of the CQ is suspended,
> > and will not poll for a SQ completion which would free up a tag.
> > Fix this by creating a separate CQ for the SQ so that send completions
> > are processed on a separate thread and are not blocked when the RQ CQ
> > is stalled.
> >
> > Fixes: 10e9cbb6b531 ("scsi: target: Convert target drivers to use
> > sbitmap")
> 
> Is this the real offending commit? What prevented this from happening
> before?

Maybe going to a global bitmap instead of a per-CPU IDA makes it less likely to occur.
Going to a single CQ may be the real root cause, in commit 6f0fae3d7797 ("iser-target: Use single CQ for TX and RX").

> > Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com>
> > Signed-off-by: Mustafa Ismail <mustafa.ismail@intel.com>
> > Signed-off-by: Shiraz Saleem <shiraz.saleem@intel.com>
> > ---
> >   drivers/infiniband/ulp/isert/ib_isert.c | 33 +++++++++++++++++++++++--
> --------
> >   drivers/infiniband/ulp/isert/ib_isert.h |  3 ++-
> >   2 files changed, 25 insertions(+), 11 deletions(-)
> >
> > diff --git a/drivers/infiniband/ulp/isert/ib_isert.c
> > b/drivers/infiniband/ulp/isert/ib_isert.c
> > index 7540488..f827b91 100644
> > --- a/drivers/infiniband/ulp/isert/ib_isert.c
> > +++ b/drivers/infiniband/ulp/isert/ib_isert.c
> > @@ -109,19 +109,27 @@ static int isert_sg_tablesize_set(const char *val,
> const struct kernel_param *kp
> >   	struct ib_qp_init_attr attr;
> >   	int ret, factor;
> >
> > -	isert_conn->cq = ib_cq_pool_get(ib_dev, cq_size, -1,
> IB_POLL_WORKQUEUE);
> > -	if (IS_ERR(isert_conn->cq)) {
> > -		isert_err("Unable to allocate cq\n");
> > -		ret = PTR_ERR(isert_conn->cq);
> > +	isert_conn->snd_cq = ib_cq_pool_get(ib_dev, cq_size, -1,
> > +					    IB_POLL_WORKQUEUE);
> > +	if (IS_ERR(isert_conn->snd_cq)) {
> > +		isert_err("Unable to allocate send cq\n");
> > +		ret = PTR_ERR(isert_conn->snd_cq);
> >   		return ERR_PTR(ret);
> >   	}
> > +	isert_conn->rcv_cq = ib_cq_pool_get(ib_dev, cq_size, -1,
> > +					    IB_POLL_WORKQUEUE);
> > +	if (IS_ERR(isert_conn->rcv_cq)) {
> > +		isert_err("Unable to allocate receive cq\n");
> > +		ret = PTR_ERR(isert_conn->rcv_cq);
> > +		goto create_cq_err;
> > +	}
> 
> Does this have any noticeable performance implications?

Initial testing seems to indicate this change causes significant performance variability, specifically with 2K writes.
We suspect that may be due to an unfortunate vector placement where the snd_cq and rcv_cq land on different NUMA nodes.
We can alter the second CQ creation in the patch to pass a comp_vector, to ensure both CQs are hinted to the same affinity.
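
Concretely, something along these lines (a sketch only, on top of this patch;
the helper and how comp_vector is chosen are assumptions, not code that has
been posted):

/*
 * Sketch: request the same completion vector hint for both CQs so they
 * get the same CPU/NUMA affinity. Selecting comp_vector (e.g. from
 * device/port affinity) is deliberately left open here.
 */
static int isert_alloc_cqs(struct isert_conn *isert_conn,
                           struct ib_device *ib_dev, unsigned int cq_size,
                           int comp_vector)
{
    isert_conn->snd_cq = ib_cq_pool_get(ib_dev, cq_size, comp_vector,
                                        IB_POLL_WORKQUEUE);
    if (IS_ERR(isert_conn->snd_cq))
        return PTR_ERR(isert_conn->snd_cq);

    isert_conn->rcv_cq = ib_cq_pool_get(ib_dev, cq_size, comp_vector,
                                        IB_POLL_WORKQUEUE);
    if (IS_ERR(isert_conn->rcv_cq)) {
        ib_cq_pool_put(isert_conn->snd_cq, cq_size);
        return PTR_ERR(isert_conn->rcv_cq);
    }

    isert_conn->cq_size = cq_size;
    return 0;
}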

> Also I wander if there are any other assumptions in the code for having a
> single context processing completions...

We don't see any.

> It'd be much easier if iscsi_allocate_cmd could accept a timeout to fail...
> 
> CCing target-devel and Mike.

Do you mean adding a timeout to the wait, or removing the call to iscsit_wait_for_tag() from iscsit_allocate_cmd()?
Sagi Grimberg Feb. 13, 2023, 11:27 a.m. UTC | #3
On 1/30/23 20:22, Devale, Sindhu wrote:
> 
> 
>> Subject: Re: [PATCH for-rc] IB/isert: Fix hang in iscsit_wait_for_tag
>>
>>
>>> From: Mustafa Ismail <mustafa.ismail@intel.com>
>>>
>>> Running fio can occasionally cause a hang when sbitmap_queue_get()
>>> fails to return a tag in iscsit_allocate_cmd() and
>>> iscsit_wait_for_tag() is called and will never return from the
>>> schedule(). This is because the polling thread of the CQ is suspended,
>>> and will not poll for a SQ completion which would free up a tag.
>>> Fix this by creating a separate CQ for the SQ so that send completions
>>> are processed on a separate thread and are not blocked when the RQ CQ
>>> is stalled.
>>>
>>> Fixes: 10e9cbb6b531 ("scsi: target: Convert target drivers to use
>>> sbitmap")
>>
>> Is this the real offending commit? What prevented this from happening
>> before?
> 
> Maybe going to a global bitmap instead of per cpu ida makes it less likely to occur.
> Going to single CQ maybe the real root cause in this commit:6f0fae3d7797("iser-target: Use single CQ for TX and RX")

Yes, this is more likely.

> 
>>> Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com>
>>> Signed-off-by: Mustafa Ismail <mustafa.ismail@intel.com>
>>> Signed-off-by: Shiraz Saleem <shiraz.saleem@intel.com>
>>> ---
>>>    drivers/infiniband/ulp/isert/ib_isert.c | 33 +++++++++++++++++++++++--
>> --------
>>>    drivers/infiniband/ulp/isert/ib_isert.h |  3 ++-
>>>    2 files changed, 25 insertions(+), 11 deletions(-)
>>>
>>> diff --git a/drivers/infiniband/ulp/isert/ib_isert.c
>>> b/drivers/infiniband/ulp/isert/ib_isert.c
>>> index 7540488..f827b91 100644
>>> --- a/drivers/infiniband/ulp/isert/ib_isert.c
>>> +++ b/drivers/infiniband/ulp/isert/ib_isert.c
>>> @@ -109,19 +109,27 @@ static int isert_sg_tablesize_set(const char *val,
>> const struct kernel_param *kp
>>>    	struct ib_qp_init_attr attr;
>>>    	int ret, factor;
>>>
>>> -	isert_conn->cq = ib_cq_pool_get(ib_dev, cq_size, -1,
>> IB_POLL_WORKQUEUE);
>>> -	if (IS_ERR(isert_conn->cq)) {
>>> -		isert_err("Unable to allocate cq\n");
>>> -		ret = PTR_ERR(isert_conn->cq);
>>> +	isert_conn->snd_cq = ib_cq_pool_get(ib_dev, cq_size, -1,
>>> +					    IB_POLL_WORKQUEUE);
>>> +	if (IS_ERR(isert_conn->snd_cq)) {
>>> +		isert_err("Unable to allocate send cq\n");
>>> +		ret = PTR_ERR(isert_conn->snd_cq);
>>>    		return ERR_PTR(ret);
>>>    	}
>>> +	isert_conn->rcv_cq = ib_cq_pool_get(ib_dev, cq_size, -1,
>>> +					    IB_POLL_WORKQUEUE);
>>> +	if (IS_ERR(isert_conn->rcv_cq)) {
>>> +		isert_err("Unable to allocate receive cq\n");
>>> +		ret = PTR_ERR(isert_conn->rcv_cq);
>>> +		goto create_cq_err;
>>> +	}
>>
>> Does this have any noticeable performance implications?
> 
> Initial testing seems to indicate this change causes significant performance variability specifically only with 2K Writes.
> We suspect that may be due an unfortunate vector placement where the snd_cq and rcv_cq are on different numa nodes.
> We can, in the patch, alter the second CQ creation to pass comp_vector to insure they are hinted to the same affinity.

Even so, there are still two threads competing to process completions.

> 
>> Also I wander if there are any other assumptions in the code for having a
>> single context processing completions...
> 
> We don't see any.
> 
>> It'd be much easier if iscsi_allocate_cmd could accept a timeout to fail...
>>
>> CCing target-devel and Mike.
> 
> Do you mean add a timeout to the wait or removing the call to iscsit_wait_for_tag() iscsit_allocate_cmd()?

Looking at the code, passing it TASK_RUNNING will make it fail if there is
no available tag (and hence drop the received command and let the initiator
retry). But I also think that isert may need a deeper default queue
depth...
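
For reference, a paraphrased sketch of the allocation path (simplified, not
part of this patch; iscsit_cmd_from_tag() is a hypothetical helper standing
in for the real slot lookup in the preallocated tag map):

struct iscsit_cmd *iscsit_allocate_cmd(struct iscsit_conn *conn, int state)
{
    struct se_session *se_sess = conn->sess->se_sess;
    struct iscsit_cmd *cmd;
    int tag, cpu;

    tag = sbitmap_queue_get(&se_sess->sess_tag_pool, &cpu);
    if (tag < 0 && state != TASK_RUNNING)
        tag = iscsit_wait_for_tag(se_sess, state, &cpu);
    if (tag < 0)
        return NULL;    /* with TASK_RUNNING: fail fast, the PDU is dropped and retried */

    cmd = iscsit_cmd_from_tag(se_sess, tag);    /* hypothetical: index the preallocated command */
    cmd->se_cmd.map_tag = tag;
    cmd->se_cmd.map_cpu = cpu;
    return cmd;
}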
Shiraz Saleem March 7, 2023, 12:09 a.m. UTC | #4
> Subject: Re: [PATCH for-rc] IB/isert: Fix hang in iscsit_wait_for_tag
> 
> 
> 
> On 1/30/23 20:22, Devale, Sindhu wrote:
> >
> >
> >> Subject: Re: [PATCH for-rc] IB/isert: Fix hang in iscsit_wait_for_tag
> >>
> >>
> >>> From: Mustafa Ismail <mustafa.ismail@intel.com>
> >>>
> >>> Running fio can occasionally cause a hang when sbitmap_queue_get()
> >>> fails to return a tag in iscsit_allocate_cmd() and
> >>> iscsit_wait_for_tag() is called and will never return from the
> >>> schedule(). This is because the polling thread of the CQ is
> >>> suspended, and will not poll for a SQ completion which would free up a tag.
> >>> Fix this by creating a separate CQ for the SQ so that send
> >>> completions are processed on a separate thread and are not blocked
> >>> when the RQ CQ is stalled.
> >>>
> >>> Fixes: 10e9cbb6b531 ("scsi: target: Convert target drivers to use
> >>> sbitmap")
> >>
> >> Is this the real offending commit? What prevented this from happening
> >> before?
> >
> > Maybe going to a global bitmap instead of per cpu ida makes it less likely to
> occur.
> > Going to single CQ maybe the real root cause in this
> > commit:6f0fae3d7797("iser-target: Use single CQ for TX and RX")
> 
> Yes this is more likely.
> 
> >
> >>> Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com>
> >>> Signed-off-by: Mustafa Ismail <mustafa.ismail@intel.com>
> >>> Signed-off-by: Shiraz Saleem <shiraz.saleem@intel.com>
> >>> ---
> >>>    drivers/infiniband/ulp/isert/ib_isert.c | 33
> >>> +++++++++++++++++++++++--
> >> --------
> >>>    drivers/infiniband/ulp/isert/ib_isert.h |  3 ++-
> >>>    2 files changed, 25 insertions(+), 11 deletions(-)
> >>>
> >>> diff --git a/drivers/infiniband/ulp/isert/ib_isert.c
> >>> b/drivers/infiniband/ulp/isert/ib_isert.c
> >>> index 7540488..f827b91 100644
> >>> --- a/drivers/infiniband/ulp/isert/ib_isert.c
> >>> +++ b/drivers/infiniband/ulp/isert/ib_isert.c
> >>> @@ -109,19 +109,27 @@ static int isert_sg_tablesize_set(const char
> >>> *val,
> >> const struct kernel_param *kp
> >>>    	struct ib_qp_init_attr attr;
> >>>    	int ret, factor;
> >>>
> >>> -	isert_conn->cq = ib_cq_pool_get(ib_dev, cq_size, -1,
> >> IB_POLL_WORKQUEUE);
> >>> -	if (IS_ERR(isert_conn->cq)) {
> >>> -		isert_err("Unable to allocate cq\n");
> >>> -		ret = PTR_ERR(isert_conn->cq);
> >>> +	isert_conn->snd_cq = ib_cq_pool_get(ib_dev, cq_size, -1,
> >>> +					    IB_POLL_WORKQUEUE);
> >>> +	if (IS_ERR(isert_conn->snd_cq)) {
> >>> +		isert_err("Unable to allocate send cq\n");
> >>> +		ret = PTR_ERR(isert_conn->snd_cq);
> >>>    		return ERR_PTR(ret);
> >>>    	}
> >>> +	isert_conn->rcv_cq = ib_cq_pool_get(ib_dev, cq_size, -1,
> >>> +					    IB_POLL_WORKQUEUE);
> >>> +	if (IS_ERR(isert_conn->rcv_cq)) {
> >>> +		isert_err("Unable to allocate receive cq\n");
> >>> +		ret = PTR_ERR(isert_conn->rcv_cq);
> >>> +		goto create_cq_err;
> >>> +	}
> >>
> >> Does this have any noticeable performance implications?
> >
> > Initial testing seems to indicate this change causes significant performance
> variability specifically only with 2K Writes.
> > We suspect that may be due an unfortunate vector placement where the
> snd_cq and rcv_cq are on different numa nodes.
> > We can, in the patch, alter the second CQ creation to pass comp_vector to
> insure they are hinted to the same affinity.
> 
> Even so, still there are now two competing threads for completion processing.
> 
> >
> >> Also I wander if there are any other assumptions in the code for
> >> having a single context processing completions...
> >
> > We don't see any.
> >
> >> It'd be much easier if iscsi_allocate_cmd could accept a timeout to fail...
> >>
> >> CCing target-devel and Mike.
> >
> > Do you mean add a timeout to the wait or removing the call
> to iscsit_wait_for_tag() iscsit_allocate_cmd()?
> 
> Looking at the code, passing it TASK_RUNNING will make it fail if there no
> available tag (and hence drop the received command, let the initiator retry). But I
> also think that isert may need a deeper default queue depth...

Hi Sagi -


Mustafa reports - "The problem is not easily reproduced, so I reduce the amount of map_tags allocated when I testing a potential fix. Passing TASK_RUNNING and I got the following call trace:

[  220.131709] isert: isert_allocate_cmd: Unable to allocate iscsit_cmd + isert_cmd
[  220.131712] isert: isert_allocate_cmd: Unable to allocate iscsit_cmd + isert_cmd
[  280.862544] ABORT_TASK: Found referenced iSCSI task_tag: 70
[  313.265156] iSCSI Login timeout on Network Portal 5.1.1.21:3260
[  334.769268] INFO: task kworker/32:3:1285 blocked for more than 30 seconds.
[  334.769272]       Tainted: G           OE      6.2.0-rc3 #6
[  334.769274] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[  334.769275] task:kworker/32:3    state:D stack:0     pid:1285  ppid:2      flags:0x00004000
[  334.769279] Workqueue: events target_tmr_work [target_core_mod]
[  334.769307] Call Trace:
[  334.769308]  <TASK>
[  334.769310]  __schedule+0x318/0xa30
[  334.769316]  ? _prb_read_valid+0x22e/0x2b0
[  334.769319]  ? __pfx_schedule_timeout+0x10/0x10
[  334.769322]  ? __wait_for_common+0xd3/0x1e0
[  334.769323]  schedule+0x57/0xd0
[  334.769325]  schedule_timeout+0x273/0x320
[  334.769327]  ? __irq_work_queue_local+0x39/0x80
[  334.769330]  ? irq_work_queue+0x3f/0x60
[  334.769332]  ? __pfx_schedule_timeout+0x10/0x10
[  334.769333]  __wait_for_common+0xf9/0x1e0
[  334.769335]  target_put_cmd_and_wait+0x59/0x80 [target_core_mod]
[  334.769351]  core_tmr_abort_task.cold.8+0x187/0x202 [target_core_mod]
[  334.769369]  target_tmr_work+0xa1/0x110 [target_core_mod]
[  334.769384]  process_one_work+0x1b0/0x390
[  334.769387]  worker_thread+0x40/0x380
[  334.769389]  ? __pfx_worker_thread+0x10/0x10
[  334.769391]  kthread+0xfa/0x120
[  334.769393]  ? __pfx_kthread+0x10/0x10
[  334.769395]  ret_from_fork+0x29/0x50
[  334.769399]  </TASK>
[  334.769442] INFO: task iscsi_np:5337 blocked for more than 30 seconds.
[  334.769444]       Tainted: G           OE      6.2.0-rc3 #6
[  334.769444] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[  334.769445] task:iscsi_np        state:D stack:0     pid:5337  ppid:2      flags:0x00004004
[  334.769447] Call Trace:
[  334.769447]  <TASK>
[  334.769448]  __schedule+0x318/0xa30
[  334.769451]  ? __pfx_schedule_timeout+0x10/0x10
[  334.769453]  ? __wait_for_common+0xd3/0x1e0
[  334.769454]  schedule+0x57/0xd0
[  334.769456]  schedule_timeout+0x273/0x320
[  334.769459]  ? iscsi_update_param_value+0x27/0x70 [iscsi_target_mod]
[  334.769476]  ? __kmalloc_node_track_caller+0x52/0x130
[  334.769478]  ? __pfx_schedule_timeout+0x10/0x10
[  334.769480]  __wait_for_common+0xf9/0x1e0
[  334.769481]  iscsi_check_for_session_reinstatement+0x1e8/0x280 [iscsi_target_mod]
[  334.769496]  iscsi_target_do_login+0x23b/0x570 [iscsi_target_mod]
[  334.769508]  iscsi_target_start_negotiation+0x55/0xc0 [iscsi_target_mod]
[  334.769519]  iscsi_target_login_thread+0x675/0xeb0 [iscsi_target_mod]
[  334.769531]  ? __pfx_iscsi_target_login_thread+0x10/0x10 [iscsi_target_mod]
[  334.769541]  kthread+0xfa/0x120
[  334.769543]  ? __pfx_kthread+0x10/0x10
[  334.769544]  ret_from_fork+0x29/0x50
[  334.769547]  </TASK>


[  185.734571] isert: isert_allocate_cmd: Unable to allocate iscsit_cmd + isert_cmd
[  246.032360] ABORT_TASK: Found referenced iSCSI task_tag: 75
[  278.442726] iSCSI Login timeout on Network Portal 5.1.1.21:3260


By the way, increasing tag_num in iscsi_target_locate_portal() will also avoid the issue."
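
(For context, a paraphrased sketch of the sizing Mustafa refers to; exact
constants and surrounding code vary between kernel versions:

    /* iscsi_target_locate_portal(): size the per-session tag pool from
     * the negotiated queue depth plus a margin for login/extra commands;
     * the session's sbitmap tag pool is then allocated with tag_num tags.
     */
    tag_num = max_t(u32, ISCSIT_MIN_TAGS, queue_depth);
    tag_num = (tag_num * 2) + ISCSIT_EXTRA_TAGS;

Bumping tag_num makes tag exhaustion, and hence the wait, less likely.)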

Any thoughts on what could be causing this hang?

Shiraz
Sagi Grimberg March 7, 2023, 11:47 a.m. UTC | #5
On 3/7/23 02:09, Saleem, Shiraz wrote:
>> Subject: Re: [PATCH for-rc] IB/isert: Fix hang in iscsit_wait_for_tag
>>
>>
>>
>> On 1/30/23 20:22, Devale, Sindhu wrote:
>>>
>>>
>>>> Subject: Re: [PATCH for-rc] IB/isert: Fix hang in iscsit_wait_for_tag
>>>>
>>>>
>>>>> From: Mustafa Ismail <mustafa.ismail@intel.com>
>>>>>
>>>>> Running fio can occasionally cause a hang when sbitmap_queue_get()
>>>>> fails to return a tag in iscsit_allocate_cmd() and
>>>>> iscsit_wait_for_tag() is called and will never return from the
>>>>> schedule(). This is because the polling thread of the CQ is
>>>>> suspended, and will not poll for a SQ completion which would free up a tag.
>>>>> Fix this by creating a separate CQ for the SQ so that send
>>>>> completions are processed on a separate thread and are not blocked
>>>>> when the RQ CQ is stalled.
>>>>>
>>>>> Fixes: 10e9cbb6b531 ("scsi: target: Convert target drivers to use
>>>>> sbitmap")
>>>>
>>>> Is this the real offending commit? What prevented this from happening
>>>> before?
>>>
>>> Maybe going to a global bitmap instead of per cpu ida makes it less likely to
>> occur.
>>> Going to single CQ maybe the real root cause in this
>>> commit:6f0fae3d7797("iser-target: Use single CQ for TX and RX")
>>
>> Yes this is more likely.
>>
>>>
>>>>> Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com>
>>>>> Signed-off-by: Mustafa Ismail <mustafa.ismail@intel.com>
>>>>> Signed-off-by: Shiraz Saleem <shiraz.saleem@intel.com>
>>>>> ---
>>>>>     drivers/infiniband/ulp/isert/ib_isert.c | 33
>>>>> +++++++++++++++++++++++--
>>>> --------
>>>>>     drivers/infiniband/ulp/isert/ib_isert.h |  3 ++-
>>>>>     2 files changed, 25 insertions(+), 11 deletions(-)
>>>>>
>>>>> diff --git a/drivers/infiniband/ulp/isert/ib_isert.c
>>>>> b/drivers/infiniband/ulp/isert/ib_isert.c
>>>>> index 7540488..f827b91 100644
>>>>> --- a/drivers/infiniband/ulp/isert/ib_isert.c
>>>>> +++ b/drivers/infiniband/ulp/isert/ib_isert.c
>>>>> @@ -109,19 +109,27 @@ static int isert_sg_tablesize_set(const char
>>>>> *val,
>>>> const struct kernel_param *kp
>>>>>     	struct ib_qp_init_attr attr;
>>>>>     	int ret, factor;
>>>>>
>>>>> -	isert_conn->cq = ib_cq_pool_get(ib_dev, cq_size, -1,
>>>> IB_POLL_WORKQUEUE);
>>>>> -	if (IS_ERR(isert_conn->cq)) {
>>>>> -		isert_err("Unable to allocate cq\n");
>>>>> -		ret = PTR_ERR(isert_conn->cq);
>>>>> +	isert_conn->snd_cq = ib_cq_pool_get(ib_dev, cq_size, -1,
>>>>> +					    IB_POLL_WORKQUEUE);
>>>>> +	if (IS_ERR(isert_conn->snd_cq)) {
>>>>> +		isert_err("Unable to allocate send cq\n");
>>>>> +		ret = PTR_ERR(isert_conn->snd_cq);
>>>>>     		return ERR_PTR(ret);
>>>>>     	}
>>>>> +	isert_conn->rcv_cq = ib_cq_pool_get(ib_dev, cq_size, -1,
>>>>> +					    IB_POLL_WORKQUEUE);
>>>>> +	if (IS_ERR(isert_conn->rcv_cq)) {
>>>>> +		isert_err("Unable to allocate receive cq\n");
>>>>> +		ret = PTR_ERR(isert_conn->rcv_cq);
>>>>> +		goto create_cq_err;
>>>>> +	}
>>>>
>>>> Does this have any noticeable performance implications?
>>>
>>> Initial testing seems to indicate this change causes significant performance
>> variability specifically only with 2K Writes.
>>> We suspect that may be due an unfortunate vector placement where the
>> snd_cq and rcv_cq are on different numa nodes.
>>> We can, in the patch, alter the second CQ creation to pass comp_vector to
>> insure they are hinted to the same affinity.
>>
>> Even so, still there are now two competing threads for completion processing.
>>
>>>
>>>> Also I wander if there are any other assumptions in the code for
>>>> having a single context processing completions...
>>>
>>> We don't see any.
>>>
>>>> It'd be much easier if iscsi_allocate_cmd could accept a timeout to fail...
>>>>
>>>> CCing target-devel and Mike.
>>>
>>> Do you mean add a timeout to the wait or removing the call
>> to iscsit_wait_for_tag() iscsit_allocate_cmd()?
>>
>> Looking at the code, passing it TASK_RUNNING will make it fail if there no
>> available tag (and hence drop the received command, let the initiator retry). But I
>> also think that isert may need a deeper default queue depth...
> 
> Hi Sagi -
> 
> 
> Mustafa reports - "The problem is not easily reproduced, so I reduce the amount of map_tags allocated when I testing a potential fix. Passing TASK_RUNNING and I got the following call trace:
> 
> [  220.131709] isert: isert_allocate_cmd: Unable to allocate iscsit_cmd + isert_cmd
> [  220.131712] isert: isert_allocate_cmd: Unable to allocate iscsit_cmd + isert_cmd
> [  280.862544] ABORT_TASK: Found referenced iSCSI task_tag: 70
> [  313.265156] iSCSI Login timeout on Network Portal 5.1.1.21:3260
> [  334.769268] INFO: task kworker/32:3:1285 blocked for more than 30 seconds.
> [  334.769272]       Tainted: G           OE      6.2.0-rc3 #6
> [  334.769274] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> [  334.769275] task:kworker/32:3    state:D stack:0     pid:1285  ppid:2      flags:0x00004000
> [  334.769279] Workqueue: events target_tmr_work [target_core_mod]
> [  334.769307] Call Trace:
> [  334.769308]  <TASK>
> [  334.769310]  __schedule+0x318/0xa30
> [  334.769316]  ? _prb_read_valid+0x22e/0x2b0
> [  334.769319]  ? __pfx_schedule_timeout+0x10/0x10
> [  334.769322]  ? __wait_for_common+0xd3/0x1e0
> [  334.769323]  schedule+0x57/0xd0
> [  334.769325]  schedule_timeout+0x273/0x320
> [  334.769327]  ? __irq_work_queue_local+0x39/0x80
> [  334.769330]  ? irq_work_queue+0x3f/0x60
> [  334.769332]  ? __pfx_schedule_timeout+0x10/0x10
> [  334.769333]  __wait_for_common+0xf9/0x1e0
> [  334.769335]  target_put_cmd_and_wait+0x59/0x80 [target_core_mod]
> [  334.769351]  core_tmr_abort_task.cold.8+0x187/0x202 [target_core_mod]
> [  334.769369]  target_tmr_work+0xa1/0x110 [target_core_mod]
> [  334.769384]  process_one_work+0x1b0/0x390
> [  334.769387]  worker_thread+0x40/0x380
> [  334.769389]  ? __pfx_worker_thread+0x10/0x10
> [  334.769391]  kthread+0xfa/0x120
> [  334.769393]  ? __pfx_kthread+0x10/0x10
> [  334.769395]  ret_from_fork+0x29/0x50
> [  334.769399]  </TASK>
> [  334.769442] INFO: task iscsi_np:5337 blocked for more than 30 seconds.
> [  334.769444]       Tainted: G           OE      6.2.0-rc3 #6
> [  334.769444] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> [  334.769445] task:iscsi_np        state:D stack:0     pid:5337  ppid:2      flags:0x00004004
> [  334.769447] Call Trace:
> [  334.769447]  <TASK>
> [  334.769448]  __schedule+0x318/0xa30
> [  334.769451]  ? __pfx_schedule_timeout+0x10/0x10
> [  334.769453]  ? __wait_for_common+0xd3/0x1e0
> [  334.769454]  schedule+0x57/0xd0
> [  334.769456]  schedule_timeout+0x273/0x320
> [  334.769459]  ? iscsi_update_param_value+0x27/0x70 [iscsi_target_mod]
> [  334.769476]  ? __kmalloc_node_track_caller+0x52/0x130
> [  334.769478]  ? __pfx_schedule_timeout+0x10/0x10
> [  334.769480]  __wait_for_common+0xf9/0x1e0
> [  334.769481]  iscsi_check_for_session_reinstatement+0x1e8/0x280 [iscsi_target_mod]
> [  334.769496]  iscsi_target_do_login+0x23b/0x570 [iscsi_target_mod]
> [  334.769508]  iscsi_target_start_negotiation+0x55/0xc0 [iscsi_target_mod]
> [  334.769519]  iscsi_target_login_thread+0x675/0xeb0 [iscsi_target_mod]
> [  334.769531]  ? __pfx_iscsi_target_login_thread+0x10/0x10 [iscsi_target_mod]
> [  334.769541]  kthread+0xfa/0x120
> [  334.769543]  ? __pfx_kthread+0x10/0x10
> [  334.769544]  ret_from_fork+0x29/0x50
> [  334.769547]  </TASK>
> 
> 
> [  185.734571] isert: isert_allocate_cmd: Unable to allocate iscsit_cmd + isert_cmd
> [  246.032360] ABORT_TASK: Found referenced iSCSI task_tag: 75
> [  278.442726] iSCSI Login timeout on Network Portal 5.1.1.21:3260
> 
> 
> By the way increasing tag_num in iscsi_target_locate_portal() will also avoid the issue"
> 
> Any thoughts on what could be causing this hang?

I know that Mike just did a set of fixes on the session teardown area...
Perhaps you should try with the patchset "target: TMF and recovery
fixes" applied?
Mike Christie March 7, 2023, 5:05 p.m. UTC | #6
On 3/7/23 5:47 AM, Sagi Grimberg wrote:
>> [  220.131709] isert: isert_allocate_cmd: Unable to allocate iscsit_cmd + isert_cmd
>> [  220.131712] isert: isert_allocate_cmd: Unable to allocate iscsit_cmd + isert_cmd
>> [  280.862544] ABORT_TASK: Found referenced iSCSI task_tag: 70
>> [  313.265156] iSCSI Login timeout on Network Portal 5.1.1.21:3260
>> [  334.769268] INFO: task kworker/32:3:1285 blocked for more than 30 seconds.
>> [  334.769272]       Tainted: G           OE      6.2.0-rc3 #6
>> [  334.769274] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
>> [  334.769275] task:kworker/32:3    state:D stack:0     pid:1285  ppid:2      flags:0x00004000
>> [  334.769279] Workqueue: events target_tmr_work [target_core_mod]
>> [  334.769307] Call Trace:
>> [  334.769308]  <TASK>
>> [  334.769310]  __schedule+0x318/0xa30
>> [  334.769316]  ? _prb_read_valid+0x22e/0x2b0
>> [  334.769319]  ? __pfx_schedule_timeout+0x10/0x10
>> [  334.769322]  ? __wait_for_common+0xd3/0x1e0
>> [  334.769323]  schedule+0x57/0xd0
>> [  334.769325]  schedule_timeout+0x273/0x320
>> [  334.769327]  ? __irq_work_queue_local+0x39/0x80
>> [  334.769330]  ? irq_work_queue+0x3f/0x60
>> [  334.769332]  ? __pfx_schedule_timeout+0x10/0x10
>> [  334.769333]  __wait_for_common+0xf9/0x1e0
>> [  334.769335]  target_put_cmd_and_wait+0x59/0x80 [target_core_mod]
>> [  334.769351]  core_tmr_abort_task.cold.8+0x187/0x202 [target_core_mod]
>> [  334.769369]  target_tmr_work+0xa1/0x110 [target_core_mod]
>> [  334.769384]  process_one_work+0x1b0/0x390
>> [  334.769387]  worker_thread+0x40/0x380
>> [  334.769389]  ? __pfx_worker_thread+0x10/0x10
>> [  334.769391]  kthread+0xfa/0x120
>> [  334.769393]  ? __pfx_kthread+0x10/0x10
>> [  334.769395]  ret_from_fork+0x29/0x50
>> [  334.769399]  </TASK>
>> [  334.769442] INFO: task iscsi_np:5337 blocked for more than 30 seconds.
>> [  334.769444]       Tainted: G           OE      6.2.0-rc3 #6
>> [  334.769444] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
>> [  334.769445] task:iscsi_np        state:D stack:0     pid:5337  ppid:2      flags:0x00004004
>> [  334.769447] Call Trace:
>> [  334.769447]  <TASK>
>> [  334.769448]  __schedule+0x318/0xa30
>> [  334.769451]  ? __pfx_schedule_timeout+0x10/0x10
>> [  334.769453]  ? __wait_for_common+0xd3/0x1e0
>> [  334.769454]  schedule+0x57/0xd0
>> [  334.769456]  schedule_timeout+0x273/0x320
>> [  334.769459]  ? iscsi_update_param_value+0x27/0x70 [iscsi_target_mod]
>> [  334.769476]  ? __kmalloc_node_track_caller+0x52/0x130
>> [  334.769478]  ? __pfx_schedule_timeout+0x10/0x10
>> [  334.769480]  __wait_for_common+0xf9/0x1e0
>> [  334.769481]  iscsi_check_for_session_reinstatement+0x1e8/0x280 [iscsi_target_mod]

The hang here might be this issue:

https://lore.kernel.org/linux-scsi/c1a395a3-74e2-c77f-c8e6-1cade30dfac6@oracle.com/T/#mdb29702f7c345eb7e3631d58e3ac7fac26e15fee

That version had some bugs, so I'm working on a new version.


>> [  334.769496]  iscsi_target_do_login+0x23b/0x570 [iscsi_target_mod]
>> [  334.769508]  iscsi_target_start_negotiation+0x55/0xc0 [iscsi_target_mod]
>> [  334.769519]  iscsi_target_login_thread+0x675/0xeb0 [iscsi_target_mod]
>> [  334.769531]  ? __pfx_iscsi_target_login_thread+0x10/0x10 [iscsi_target_mod]
>> [  334.769541]  kthread+0xfa/0x120
>> [  334.769543]  ? __pfx_kthread+0x10/0x10
>> [  334.769544]  ret_from_fork+0x29/0x50
>> [  334.769547]  </TASK>
>>
>>
>> [  185.734571] isert: isert_allocate_cmd: Unable to allocate iscsit_cmd + isert_cmd
>> [  246.032360] ABORT_TASK: Found referenced iSCSI task_tag: 75

Or, if there is only one session, then LIO might be waiting for commands to complete
before allowing a new login.

Or, it could be a combo of both.


>> [  278.442726] iSCSI Login timeout on Network Portal 5.1.1.21:3260
>>
>>
>> By the way increasing tag_num in iscsi_target_locate_portal() will also avoid the issue"
>>
>> Any thoughts on what could be causing this hang?
> 
> I know that Mike just did a set of fixes on the session teardown area...
> Perhaps you should try with the patchset "target: TMF and recovery
> fixes" applied?

Patch

diff --git a/drivers/infiniband/ulp/isert/ib_isert.c b/drivers/infiniband/ulp/isert/ib_isert.c
index 7540488..f827b91 100644
--- a/drivers/infiniband/ulp/isert/ib_isert.c
+++ b/drivers/infiniband/ulp/isert/ib_isert.c
@@ -109,19 +109,27 @@  static int isert_sg_tablesize_set(const char *val, const struct kernel_param *kp
 	struct ib_qp_init_attr attr;
 	int ret, factor;
 
-	isert_conn->cq = ib_cq_pool_get(ib_dev, cq_size, -1, IB_POLL_WORKQUEUE);
-	if (IS_ERR(isert_conn->cq)) {
-		isert_err("Unable to allocate cq\n");
-		ret = PTR_ERR(isert_conn->cq);
+	isert_conn->snd_cq = ib_cq_pool_get(ib_dev, cq_size, -1,
+					    IB_POLL_WORKQUEUE);
+	if (IS_ERR(isert_conn->snd_cq)) {
+		isert_err("Unable to allocate send cq\n");
+		ret = PTR_ERR(isert_conn->snd_cq);
 		return ERR_PTR(ret);
 	}
+	isert_conn->rcv_cq = ib_cq_pool_get(ib_dev, cq_size, -1,
+					    IB_POLL_WORKQUEUE);
+	if (IS_ERR(isert_conn->rcv_cq)) {
+		isert_err("Unable to allocate receive cq\n");
+		ret = PTR_ERR(isert_conn->rcv_cq);
+		goto create_cq_err;
+	}
 	isert_conn->cq_size = cq_size;
 
 	memset(&attr, 0, sizeof(struct ib_qp_init_attr));
 	attr.event_handler = isert_qp_event_callback;
 	attr.qp_context = isert_conn;
-	attr.send_cq = isert_conn->cq;
-	attr.recv_cq = isert_conn->cq;
+	attr.send_cq = isert_conn->snd_cq;
+	attr.recv_cq = isert_conn->rcv_cq;
 	attr.cap.max_send_wr = ISERT_QP_MAX_REQ_DTOS + 1;
 	attr.cap.max_recv_wr = ISERT_QP_MAX_RECV_DTOS + 1;
 	factor = rdma_rw_mr_factor(device->ib_device, cma_id->port_num,
@@ -137,12 +145,16 @@  static int isert_sg_tablesize_set(const char *val, const struct kernel_param *kp
 	ret = rdma_create_qp(cma_id, device->pd, &attr);
 	if (ret) {
 		isert_err("rdma_create_qp failed for cma_id %d\n", ret);
-		ib_cq_pool_put(isert_conn->cq, isert_conn->cq_size);
-
-		return ERR_PTR(ret);
+		goto create_qp_err;
 	}
 
 	return cma_id->qp;
+create_qp_err:
+	ib_cq_pool_put(isert_conn->rcv_cq, isert_conn->cq_size);
+create_cq_err:
+	ib_cq_pool_put(isert_conn->snd_cq, isert_conn->cq_size);
+
+	return ERR_PTR(ret);
 }
 
 static int
@@ -409,7 +421,8 @@  static int isert_sg_tablesize_set(const char *val, const struct kernel_param *kp
 isert_destroy_qp(struct isert_conn *isert_conn)
 {
 	ib_destroy_qp(isert_conn->qp);
-	ib_cq_pool_put(isert_conn->cq, isert_conn->cq_size);
+	ib_cq_pool_put(isert_conn->snd_cq, isert_conn->cq_size);
+	ib_cq_pool_put(isert_conn->rcv_cq, isert_conn->cq_size);
 }
 
 static int
diff --git a/drivers/infiniband/ulp/isert/ib_isert.h b/drivers/infiniband/ulp/isert/ib_isert.h
index 0b2dfd6..0cd43af 100644
--- a/drivers/infiniband/ulp/isert/ib_isert.h
+++ b/drivers/infiniband/ulp/isert/ib_isert.h
@@ -180,7 +180,8 @@  struct isert_conn {
 	struct iser_tx_desc	login_tx_desc;
 	struct rdma_cm_id	*cm_id;
 	struct ib_qp		*qp;
-	struct ib_cq		*cq;
+	struct ib_cq		*snd_cq;
+	struct ib_cq		*rcv_cq;
 	u32			cq_size;
 	struct isert_device	*device;
 	struct mutex		mutex;