diff mbox series

[4/5] megaraid_sas: TM command refire leads to controller firmware crash

Message ID 20200508085242.23406-1-chandrakanth.patil@broadcom.com (mailing list archive)
State Accepted
Headers show
Series None | expand

Commit Message

Chandrakanth Patil May 8, 2020, 8:52 a.m. UTC
Issue: When a TM (task management) command times out, the driver invokes
a controller reset. After the reset, the driver re-fires the pending TM
commands, which leads to a firmware crash.

Fix: After the controller reset, return the pending TM commands back to the OS instead of re-firing them.

Cc: stable@vger.kernel.org
Signed-off-by: Sumit Saxena <sumit.saxena@broadcom.com>
Signed-off-by: Chandrakanth Patil <chandrakanth.patil@broadcom.com>
---
 drivers/scsi/megaraid/megaraid_sas_fusion.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

Comments

Hannes Reinecke May 13, 2020, 6:19 a.m. UTC | #1
On 5/8/20 10:52 AM, Chandrakanth Patil wrote:
> Issue: When TM command times-out driver invokes the controller
> reset. Post reset, driver re-fires pended TM commands which leads
> to firmware crash.
> 
> Fix: Post controller reset, return pended TM commands back to OS.
> 
> Cc: stable@vger.kernel.org
> Signed-off-by: Sumit Saxena <sumit.saxena@broadcom.com>
> Signed-off-by: Chandrakanth Patil <chandrakanth.patil@broadcom.com>
> ---
>   drivers/scsi/megaraid/megaraid_sas_fusion.c | 7 ++++++-
>   1 file changed, 6 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/scsi/megaraid/megaraid_sas_fusion.c b/drivers/scsi/megaraid/megaraid_sas_fusion.c
> index 87f91a38..319f241 100644
> --- a/drivers/scsi/megaraid/megaraid_sas_fusion.c
> +++ b/drivers/scsi/megaraid/megaraid_sas_fusion.c
> @@ -4180,6 +4180,7 @@ static void megasas_refire_mgmt_cmd(struct megasas_instance *instance,
>   	struct fusion_context *fusion;
>   	struct megasas_cmd *cmd_mfi;
>   	union MEGASAS_REQUEST_DESCRIPTOR_UNION *req_desc;
> +	struct MPI2_RAID_SCSI_IO_REQUEST *scsi_io_req;
>   	u16 smid;
>   	bool refire_cmd = false;
>   	u8 result;
> @@ -4247,6 +4248,11 @@ static void megasas_refire_mgmt_cmd(struct megasas_instance *instance,
>   			result = COMPLETE_CMD;
>   		}
>   
> +		scsi_io_req = (struct MPI2_RAID_SCSI_IO_REQUEST *)
> +				cmd_fusion->io_request;
> +		if (scsi_io_req->Function == MPI2_FUNCTION_SCSI_TASK_MGMT)
> +			result = RETURN_CMD;
> +
>   		switch (result) {
>   		case REFIRE_CMD:
>   			megasas_fire_cmd_fusion(instance, req_desc);
> @@ -4475,7 +4481,6 @@ megasas_issue_tm(struct megasas_instance *instance, u16 device_handle,
>   	if (!timeleft) {
>   		dev_err(&instance->pdev->dev,
>   			"task mgmt type 0x%x timed out\n", type);
> -		cmd_mfi->flags |= DRV_DCMD_SKIP_REFIRE;
>   		mutex_unlock(&instance->reset_mutex);
>   		rc = megasas_reset_fusion(instance->host, MFI_IO_TIMEOUT_OCR);
>   		mutex_lock(&instance->reset_mutex);
> 
Why didn't the 'DRV_DCMD_SKIP_REFIRE' work?
And if it doesn't work, can't it be removed completely?

Cheers,

Hannes
Chandrakanth Patil May 16, 2020, 6:56 a.m. UTC | #2
> > Issue: When TM command times-out driver invokes the controller reset.
> > Post reset, driver re-fires pended TM commands which leads to firmware
> > crash.
> >
> > Fix: Post controller reset, return pended TM commands back to OS.
> >
> > Cc: stable@vger.kernel.org
> > Signed-off-by: Sumit Saxena <sumit.saxena@broadcom.com>
> > Signed-off-by: Chandrakanth Patil <chandrakanth.patil@broadcom.com>
> > ---
> >   drivers/scsi/megaraid/megaraid_sas_fusion.c | 7 ++++++-
> >   1 file changed, 6 insertions(+), 1 deletion(-)
> >
> > diff --git a/drivers/scsi/megaraid/megaraid_sas_fusion.c b/drivers/scsi/megaraid/megaraid_sas_fusion.c
> > index 87f91a38..319f241 100644
> > --- a/drivers/scsi/megaraid/megaraid_sas_fusion.c
> > +++ b/drivers/scsi/megaraid/megaraid_sas_fusion.c
> > @@ -4180,6 +4180,7 @@ static void megasas_refire_mgmt_cmd(struct megasas_instance *instance,
> >   	struct fusion_context *fusion;
> >   	struct megasas_cmd *cmd_mfi;
> >   	union MEGASAS_REQUEST_DESCRIPTOR_UNION *req_desc;
> > +	struct MPI2_RAID_SCSI_IO_REQUEST *scsi_io_req;
> >   	u16 smid;
> >   	bool refire_cmd = false;
> >   	u8 result;
> > @@ -4247,6 +4248,11 @@ static void megasas_refire_mgmt_cmd(struct megasas_instance *instance,
> >   			result = COMPLETE_CMD;
> >   		}
> >
> > +		scsi_io_req = (struct MPI2_RAID_SCSI_IO_REQUEST *)
> > +				cmd_fusion->io_request;
> > +		if (scsi_io_req->Function == MPI2_FUNCTION_SCSI_TASK_MGMT)
> > +			result = RETURN_CMD;
> > +
> >   		switch (result) {
> >   		case REFIRE_CMD:
> >   			megasas_fire_cmd_fusion(instance, req_desc);
> > @@ -4475,7 +4481,6 @@ megasas_issue_tm(struct megasas_instance *instance, u16 device_handle,
> >   	if (!timeleft) {
> >   		dev_err(&instance->pdev->dev,
> >   			"task mgmt type 0x%x timed out\n", type);
> > -		cmd_mfi->flags |= DRV_DCMD_SKIP_REFIRE;
> >   		mutex_unlock(&instance->reset_mutex);
> >   		rc = megasas_reset_fusion(instance->host, MFI_IO_TIMEOUT_OCR);
> >   		mutex_lock(&instance->reset_mutex);
> >
> Why didn't the 'DRV_DCMD_SKIP_REFIRE' work?
> And if it doesn't work, can't it be removed completely?

The re-fire logic doesn't check the 'DRV_DCMD_SKIP_REFIRE' flag for TM
commands, but it does check the flag for DCMDs.
Hence, the 'DRV_DCMD_SKIP_REFIRE' flag is removed only for TM commands.

-Chandrakanth Patil
diff mbox series

Patch

diff --git a/drivers/scsi/megaraid/megaraid_sas_fusion.c b/drivers/scsi/megaraid/megaraid_sas_fusion.c
index 87f91a38..319f241 100644
--- a/drivers/scsi/megaraid/megaraid_sas_fusion.c
+++ b/drivers/scsi/megaraid/megaraid_sas_fusion.c
@@ -4180,6 +4180,7 @@  static void megasas_refire_mgmt_cmd(struct megasas_instance *instance,
 	struct fusion_context *fusion;
 	struct megasas_cmd *cmd_mfi;
 	union MEGASAS_REQUEST_DESCRIPTOR_UNION *req_desc;
+	struct MPI2_RAID_SCSI_IO_REQUEST *scsi_io_req;
 	u16 smid;
 	bool refire_cmd = false;
 	u8 result;
@@ -4247,6 +4248,11 @@  static void megasas_refire_mgmt_cmd(struct megasas_instance *instance,
 			result = COMPLETE_CMD;
 		}
 
+		scsi_io_req = (struct MPI2_RAID_SCSI_IO_REQUEST *)
+				cmd_fusion->io_request;
+		if (scsi_io_req->Function == MPI2_FUNCTION_SCSI_TASK_MGMT)
+			result = RETURN_CMD;
+
 		switch (result) {
 		case REFIRE_CMD:
 			megasas_fire_cmd_fusion(instance, req_desc);
@@ -4475,7 +4481,6 @@  megasas_issue_tm(struct megasas_instance *instance, u16 device_handle,
 	if (!timeleft) {
 		dev_err(&instance->pdev->dev,
 			"task mgmt type 0x%x timed out\n", type);
-		cmd_mfi->flags |= DRV_DCMD_SKIP_REFIRE;
 		mutex_unlock(&instance->reset_mutex);
 		rc = megasas_reset_fusion(instance->host, MFI_IO_TIMEOUT_OCR);
 		mutex_lock(&instance->reset_mutex);