diff mbox series

[3/3] mpt3sas: Handle FWfault while second half of IOC Init

Message ID 20210518051625.1596742-4-suganath-prabu.subramani@broadcom.com (mailing list archive)
State Accepted
Headers show
Series Gracefully handle FW faults during HBA initialization | expand

Commit Message

Suganath Prabu S May 18, 2021, 5:16 a.m. UTC
Currently if any firmware fault occurs while scanning the devices during
IOC Initialization then the driver issues the hard reset operation to
recover the IOC. But driver is not issuing Port enable request
messageĀ as part of hard reset operation during IOC initialization time.
Due to this driver won't get any device discovery related events and
hence devices won't be accessible to the users.

Through this patch, the driver gracefully handles the Firmware
fault while scanning the target devices during IOC initialization time,
by allowing the driver to issue the port enable request message as part
of hard reset operation during IOC initialization time. So that driver
receives the device discovery related events from the firmware after the
hard reset operation and target devices are added to SML.

Signed-off-by: Suganath Prabu S <suganath-prabu.subramani@broadcom.com>
---
 drivers/scsi/mpt3sas/mpt3sas_base.c  |   8 +-
 drivers/scsi/mpt3sas/mpt3sas_base.h  |   1 +
 drivers/scsi/mpt3sas/mpt3sas_scsih.c | 152 ++++++++++++++++++++++++---
 3 files changed, 145 insertions(+), 16 deletions(-)
diff mbox series

Patch

diff --git a/drivers/scsi/mpt3sas/mpt3sas_base.c b/drivers/scsi/mpt3sas/mpt3sas_base.c
index 97e63a5..e592b1a 100644
--- a/drivers/scsi/mpt3sas/mpt3sas_base.c
+++ b/drivers/scsi/mpt3sas/mpt3sas_base.c
@@ -7205,7 +7205,7 @@  mpt3sas_port_enable_done(struct MPT3SAS_ADAPTER *ioc, u16 smid, u8 msix_index,
 	if (ioc_status != MPI2_IOCSTATUS_SUCCESS)
 		ioc->port_enable_failed = 1;
 
-	if (ioc->is_driver_loading) {
+	if (ioc->port_enable_cmds.status & MPT3_CMD_COMPLETE_ASYNC) {
 		if (ioc_status == MPI2_IOCSTATUS_SUCCESS) {
 			mpt3sas_port_enable_complete(ioc);
 			return 1;
@@ -7214,6 +7214,7 @@  mpt3sas_port_enable_done(struct MPT3SAS_ADAPTER *ioc, u16 smid, u8 msix_index,
 			ioc->start_scan = 0;
 			return 1;
 		}
+		ioc->port_enable_cmds.status &= ~MPT3_CMD_COMPLETE_ASYNC;
 	}
 	complete(&ioc->port_enable_cmds.done);
 	return 1;
@@ -7308,6 +7309,7 @@  mpt3sas_port_enable(struct MPT3SAS_ADAPTER *ioc)
 	}
 	ioc->drv_internal_flags |= MPT_DRV_INERNAL_FIRST_PE_ISSUED;
 	ioc->port_enable_cmds.status = MPT3_CMD_PENDING;
+	ioc->port_enable_cmds.status |= MPT3_CMD_COMPLETE_ASYNC;
 	mpi_request = mpt3sas_base_get_msg_frame(ioc, smid);
 	ioc->port_enable_cmds.smid = smid;
 	memset(mpi_request, 0, sizeof(Mpi2PortEnableRequest_t));
@@ -7856,7 +7858,7 @@  _base_make_ioc_operational(struct MPT3SAS_ADAPTER *ioc)
 	if (r)
 		return r;
 
-	if (ioc->is_driver_loading) {
+	if (!ioc->shost_recovery) {
 
 		if (ioc->is_warpdrive && ioc->manu_pg10.OEMIdentifier
 		    == 0x80) {
@@ -8276,8 +8278,6 @@  _base_clear_outstanding_mpt_commands(struct MPT3SAS_ADAPTER *ioc)
 			ioc->start_scan_failed =
 				MPI2_IOCSTATUS_INTERNAL_ERROR;
 			ioc->start_scan = 0;
-			ioc->port_enable_cmds.status =
-				MPT3_CMD_NOT_USED;
 		} else {
 			complete(&ioc->port_enable_cmds.done);
 		}
diff --git a/drivers/scsi/mpt3sas/mpt3sas_base.h b/drivers/scsi/mpt3sas/mpt3sas_base.h
index a8100a9..020f411 100644
--- a/drivers/scsi/mpt3sas/mpt3sas_base.h
+++ b/drivers/scsi/mpt3sas/mpt3sas_base.h
@@ -500,6 +500,7 @@  struct MPT3SAS_DEVICE {
 #define MPT3_CMD_PENDING	0x0002	/* pending */
 #define MPT3_CMD_REPLY_VALID	0x0004	/* reply is valid */
 #define MPT3_CMD_RESET		0x0008	/* host reset dropped the command */
+#define MPT3_CMD_COMPLETE_ASYNC 0x0010  /* tells whether cmd completes in same thread or not */
 
 /**
  * struct _internal_cmd - internal commands struct
diff --git a/drivers/scsi/mpt3sas/mpt3sas_scsih.c b/drivers/scsi/mpt3sas/mpt3sas_scsih.c
index 79e34b5..0d8b2e9 100644
--- a/drivers/scsi/mpt3sas/mpt3sas_scsih.c
+++ b/drivers/scsi/mpt3sas/mpt3sas_scsih.c
@@ -78,6 +78,7 @@  static void _scsih_pcie_device_remove_from_sml(struct MPT3SAS_ADAPTER *ioc,
 static void
 _scsih_pcie_check_device(struct MPT3SAS_ADAPTER *ioc, u16 handle);
 static u8 _scsih_check_for_pending_tm(struct MPT3SAS_ADAPTER *ioc, u16 smid);
+static void _scsih_complete_devices_scanning(struct MPT3SAS_ADAPTER *ioc);
 
 /* global parameters */
 LIST_HEAD(mpt3sas_ioc_list);
@@ -3631,8 +3632,6 @@  _scsih_error_recovery_delete_devices(struct MPT3SAS_ADAPTER *ioc)
 {
 	struct fw_event_work *fw_event;
 
-	if (ioc->is_driver_loading)
-		return;
 	fw_event = alloc_fw_event_work(0);
 	if (!fw_event)
 		return;
@@ -3693,6 +3692,14 @@  _scsih_fw_event_cleanup_queue(struct MPT3SAS_ADAPTER *ioc)
 	if ((list_empty(&ioc->fw_event_list) && !ioc->current_event) ||
 	    !ioc->firmware_event_thread)
 		return;
+	/*
+	 * Set current running event as ignore, so that
+	 * current running event will be exit quickely.
+	 * As diag reset has occurred, so it is of no use
+	 * to proccess remaing stale even data enties.
+	 */
+	if (ioc->shost_recovery && ioc->current_event)
+		ioc->current_event->ignore = 1;
 
 	ioc->fw_events_cleanup = 1;
 	while ((fw_event = dequeue_next_fw_event(ioc)) ||
@@ -3719,6 +3726,19 @@  _scsih_fw_event_cleanup_queue(struct MPT3SAS_ADAPTER *ioc)
 			continue;
 		}
 
+		/*
+		 * Driver has to clear ioc->start_scan flag when
+		 * it is cleanup the MPT3SAS_PORT_ENABLE_COMPLETE,
+		 * otherwise scsi_scan_host() API waits still
+		 * 5 mints timeout timer to expire. If we exit from
+		 * scsi_scan_host() quickly then we can smoothly issue
+		 * new port enable request as part of current diag reset.
+		 */
+		if (fw_event->event == MPT3SAS_PORT_ENABLE_COMPLETE) {
+			ioc->port_enable_cmds.status |= MPT3_CMD_RESET;
+			ioc->start_scan = 0;
+		}
+
 		/*
 		 * Wait on the fw_event to complete. If this returns 1, then
 		 * the event was never executed, and we need a put for the
@@ -10140,6 +10160,17 @@  _scsih_remove_unresponding_devices(struct MPT3SAS_ADAPTER *ioc)
 	 * owner for the reference the list had on any object we prune.
 	 */
 	spin_lock_irqsave(&ioc->sas_device_lock, flags);
+
+	/*
+	 * Cleanup the sas_device_init_list list as
+	 * driver goes for fresh scan as part of diag reset.
+	 */
+	list_for_each_entry_safe(sas_device, sas_device_next,
+	    &ioc->sas_device_init_list, list) {
+		list_del_init(&sas_device->list);
+		sas_device_put(sas_device);
+	}
+
 	list_for_each_entry_safe(sas_device, sas_device_next,
 	    &ioc->sas_device_list, list) {
 		if (!sas_device->responding)
@@ -10161,6 +10192,16 @@  _scsih_remove_unresponding_devices(struct MPT3SAS_ADAPTER *ioc)
 	ioc_info(ioc, "Removing unresponding devices: pcie end-devices\n");
 	INIT_LIST_HEAD(&head);
 	spin_lock_irqsave(&ioc->pcie_device_lock, flags);
+	/*
+	 * Cleanup the pcie_device_init_list list as
+	 * driver goes for fresh scan as part of diag reset.
+	 */
+	list_for_each_entry_safe(pcie_device, pcie_device_next,
+	    &ioc->pcie_device_init_list, list) {
+		list_del_init(&pcie_device->list);
+		pcie_device_put(pcie_device);
+	}
+
 	list_for_each_entry_safe(pcie_device, pcie_device_next,
 	    &ioc->pcie_device_list, list) {
 		if (!pcie_device->responding)
@@ -10563,8 +10604,7 @@  void
 mpt3sas_scsih_reset_done_handler(struct MPT3SAS_ADAPTER *ioc)
 {
 	dtmprintk(ioc, ioc_info(ioc, "%s: MPT3_IOC_DONE_RESET\n", __func__));
-	if ((!ioc->is_driver_loading) && !(disable_discovery > 0 &&
-					   !ioc->sas_hba.num_phys)) {
+	if (!(disable_discovery > 0 && !ioc->sas_hba.num_phys)) {
 		if (ioc->multipath_on_hba) {
 			_scsih_sas_port_refresh(ioc);
 			_scsih_update_vphys_after_reset(ioc);
@@ -10619,6 +10659,18 @@  _mpt3sas_fw_work(struct MPT3SAS_ADAPTER *ioc, struct fw_event_work *fw_event)
 		_scsih_del_dirty_vphy(ioc);
 		_scsih_del_dirty_port_entries(ioc);
 		_scsih_scan_for_devices_after_reset(ioc);
+		/*
+		 * If diag reset has occurred during the driver load
+		 * then driver has to complete the driver load operation
+		 * by executing below items,
+		 *- Register the devices from sas_device_init_list to SML
+		 *- clear is_driver_loading flag,
+		 *- start the watchdog thread.
+		 * In happy driver load path, above things are taken care when
+		 * driver executes scsih_scan_finished().
+		 */
+		if (ioc->is_driver_loading)
+			_scsih_complete_devices_scanning(ioc);
 		_scsih_set_nvme_max_shutdown_latency(ioc);
 		break;
 	case MPT3SAS_PORT_ENABLE_COMPLETE:
@@ -10764,11 +10816,23 @@  mpt3sas_scsih_event_callback(struct MPT3SAS_ADAPTER *ioc, u8 msix_index,
 		_scsih_check_topo_delete_events(ioc,
 		    (Mpi2EventDataSasTopologyChangeList_t *)
 		    mpi_reply->EventData);
+		/*
+		 * No need to add the topology change list
+		 * event to fw event work queue when
+		 * diag reset is going on. Since during diag
+		 * reset driver scan the devices by reading
+		 * sas device page0's not by processing the
+		 * events.
+		 */
+		if (ioc->shost_recovery)
+			return 1;
 		break;
 	case MPI2_EVENT_PCIE_TOPOLOGY_CHANGE_LIST:
 	_scsih_check_pcie_topo_remove_events(ioc,
 		    (Mpi26EventDataPCIeTopologyChangeList_t *)
 		    mpi_reply->EventData);
+		if (ioc->shost_recovery)
+			return 1;
 		break;
 	case MPI2_EVENT_IR_CONFIGURATION_CHANGE_LIST:
 		_scsih_check_ir_config_unhide_events(ioc,
@@ -11284,13 +11348,27 @@  _scsih_probe_boot_devices(struct MPT3SAS_ADAPTER *ioc)
 
 	if (channel == RAID_CHANNEL) {
 		raid_device = device;
+		/*
+		 * If this boot vd is already registered with SML then
+		 * no need to register it again as part of device scanning
+		 * after diag reset during driver load operation.
+		 */
+		if (raid_device->starget)
+			return;
 		rc = scsi_add_device(ioc->shost, RAID_CHANNEL,
 		    raid_device->id, 0);
 		if (rc)
 			_scsih_raid_device_remove(ioc, raid_device);
 	} else if (channel == PCIE_CHANNEL) {
-		spin_lock_irqsave(&ioc->pcie_device_lock, flags);
 		pcie_device = device;
+		/*
+		 * If this boot NVMe device is already registered with SML then
+		 * no need to register it again as part of device scanning
+		 * after diag reset during driver load operation.
+		 */
+		if (pcie_device->starget)
+			return;
+		spin_lock_irqsave(&ioc->pcie_device_lock, flags);
 		tid = pcie_device->id;
 		list_move_tail(&pcie_device->list, &ioc->pcie_device_list);
 		spin_unlock_irqrestore(&ioc->pcie_device_lock, flags);
@@ -11298,8 +11376,15 @@  _scsih_probe_boot_devices(struct MPT3SAS_ADAPTER *ioc)
 		if (rc)
 			_scsih_pcie_device_remove(ioc, pcie_device);
 	} else {
-		spin_lock_irqsave(&ioc->sas_device_lock, flags);
 		sas_device = device;
+		/*
+		 * If this boot sas/sata device is already registered with SML
+		 * then no need to register it again as part of device scanning
+		 * after diag reset during driver load operation.
+		 */
+		if (sas_device->starget)
+			return;
+		spin_lock_irqsave(&ioc->sas_device_lock, flags);
 		handle = sas_device->handle;
 		sas_address_parent = sas_device->sas_address_parent;
 		sas_address = sas_device->sas_address;
@@ -11597,6 +11682,25 @@  scsih_scan_start(struct Scsi_Host *shost)
 		ioc_info(ioc, "port enable: FAILED\n");
 }
 
+/**
+ * _scsih_complete_devices_scanning - add the devices to sml and
+ * complete ioc initialization.
+ * @ioc: per adapter object
+ *
+ * Return nothing.
+ */
+static void _scsih_complete_devices_scanning(struct MPT3SAS_ADAPTER *ioc)
+{
+
+	if (ioc->wait_for_discovery_to_complete) {
+		ioc->wait_for_discovery_to_complete = 0;
+		_scsih_probe_devices(ioc);
+	}
+
+	mpt3sas_base_start_watchdog(ioc);
+	ioc->is_driver_loading = 0;
+}
+
 /**
  * scsih_scan_finished - scsi lld callback for .scan_finished
  * @shost: SCSI host pointer
@@ -11610,6 +11714,8 @@  static int
 scsih_scan_finished(struct Scsi_Host *shost, unsigned long time)
 {
 	struct MPT3SAS_ADAPTER *ioc = shost_priv(shost);
+	u32 ioc_state;
+	int issue_hard_reset = 0;
 
 	if (disable_discovery > 0) {
 		ioc->is_driver_loading = 0;
@@ -11624,9 +11730,30 @@  scsih_scan_finished(struct Scsi_Host *shost, unsigned long time)
 		return 1;
 	}
 
-	if (ioc->start_scan)
+	if (ioc->start_scan) {
+		ioc_state = mpt3sas_base_get_iocstate(ioc, 0);
+		if ((ioc_state & MPI2_IOC_STATE_MASK) == MPI2_IOC_STATE_FAULT) {
+			mpt3sas_print_fault_code(ioc, ioc_state &
+			    MPI2_DOORBELL_DATA_MASK);
+			issue_hard_reset = 1;
+			goto out;
+		} else if ((ioc_state & MPI2_IOC_STATE_MASK) ==
+				MPI2_IOC_STATE_COREDUMP) {
+			mpt3sas_base_coredump_info(ioc, ioc_state &
+			    MPI2_DOORBELL_DATA_MASK);
+			mpt3sas_base_wait_for_coredump_completion(ioc, __func__);
+			issue_hard_reset = 1;
+			goto out;
+		}
 		return 0;
+	}
 
+	if (ioc->port_enable_cmds.status & MPT3_CMD_RESET) {
+		ioc_info(ioc,
+		    "port enable: aborted due to diag reset\n");
+		ioc->port_enable_cmds.status = MPT3_CMD_NOT_USED;
+		goto out;
+	}
 	if (ioc->start_scan_failed) {
 		ioc_info(ioc, "port enable: FAILED with (ioc_status=0x%08x)\n",
 			 ioc->start_scan_failed);
@@ -11638,13 +11765,14 @@  scsih_scan_finished(struct Scsi_Host *shost, unsigned long time)
 
 	ioc_info(ioc, "port enable: SUCCESS\n");
 	ioc->port_enable_cmds.status = MPT3_CMD_NOT_USED;
+	_scsih_complete_devices_scanning(ioc);
 
-	if (ioc->wait_for_discovery_to_complete) {
-		ioc->wait_for_discovery_to_complete = 0;
-		_scsih_probe_devices(ioc);
+out:
+	if (issue_hard_reset) {
+		ioc->port_enable_cmds.status = MPT3_CMD_NOT_USED;
+		if (mpt3sas_base_hard_reset_handler(ioc, SOFT_RESET))
+			ioc->is_driver_loading = 0;
 	}
-	mpt3sas_base_start_watchdog(ioc);
-	ioc->is_driver_loading = 0;
 	return 1;
 }