diff mbox

[lpfc,05/19] Fix driver unload/reload operation.

Message ID 1491326232-11255-6-git-send-email-rkennedy@broadcom.com (mailing list archive)
State Changes Requested, archived
Headers show

Commit Message

Dick Kennedy April 4, 2017, 5:16 p.m. UTC
From: Dick Kennedy <rkennedy@lvnvda1400.lvn.broadcom.net>

Two issues: (1) driver could not be unloaded and
reloaded without some Oops or Panic occurring. (2) The
driver was panicking because of a corruption in the Memory
Manager when the iocb list was getting allocated.

Root cause for the memory corruption was a double
free of the Work Queue ring pointer memory - Freed once
in the lpfc_sli4_queue_free when the CQ was destroyed
and again in lpfc_sli4_queue_free when the WQ was
destroyed.  While the linux memory manager protects
against NULL pointers, it can't protect against stale
pointer kfree calls.

There are other fixes in this patch found during testing.
While they are not a direct corruption cause, they should
be fixed to ensure consistent unload/reload.

Signed-off-by: Dick Kennedy <dick.kennedy@broadcom.com>
Signed-off-by: James Smart <james.smart@broadcom.com>
---
 drivers/scsi/lpfc/lpfc_init.c | 46 +++++++++++++++++++++++--------------------
 drivers/scsi/lpfc/lpfc_sli.c  |  7 ++++++-
 2 files changed, 31 insertions(+), 22 deletions(-)

Comments

Johannes Thumshirn April 5, 2017, 7:20 a.m. UTC | #1
On Tue, Apr 04, 2017 at 10:16:58AM -0700, Dick Kennedy wrote:
> From: Dick Kennedy <rkennedy@lvnvda1400.lvn.broadcom.net>
> 
> Two issues: (1) driver could not be unloaded and
> reloaded without some Oops or Panic occurring. (2) The
> driver was panicking because of a corruption in the Memory
> Manager when the iocb list was getting allocated.
> 
> Root cause for the memory corruption was a double
> free of the Work Queue ring pointer memory - Freed once
> in the lpfc_sli4_queue_free when the CQ was destroyed
> and again in lpfc_sli4_queue_free when the WQ was
> destroyed.  While the linux memory manager protects
> against NULL pointers, it can't protect against stale
> pointer kfree calls.
> 
> There are other fixes in this patch found during testing.
> While they are not a direct corruption cause, they should
> be fixed to ensure consistent unload/reload.
> 
> Signed-off-by: Dick Kennedy <dick.kennedy@broadcom.com>
> Signed-off-by: James Smart <james.smart@broadcom.com>
> ---

Is this the regression Junichi Nomura reported? If yes please Cc him to get a
test result (plus proper Reported-by: attribution).

Note there was also a fix from Mauricio Faria de Oliveira [1] which IIRC
Junichi successfully tested.

[1] http://www.spinics.net/lists/linux-scsi/msg106877.html
Mauricio Faria de Oliveira April 5, 2017, 1:30 p.m. UTC | #2
Hi Dick,

If you don't mind me being pedantic, I've noticed a couple of things.

First, is your different email addresses in from/signed-off-by lines:

On 04/04/2017 02:16 PM, Dick Kennedy wrote:
> From: Dick Kennedy <rkennedy@lvnvda1400.lvn.broadcom.net>
[...]
> Signed-off-by: Dick Kennedy <dick.kennedy@broadcom.com>

Second, is there are several indentation/line-break changes in this
'fix' patch too, of which at least one seems inconsistent in the code
(see below) -- that would be better split off to another patch IMHO.

Maybe that would be covered in the scrubbing process you mentioned in
the other/regression thread.


> @@ -10312,6 +10315,7 @@ lpfc_pci_probe_one_s3(struct pci_dev *pdev, const struct pci_device_id *pid)
>  	}
>
>  	/* Initialize and populate the iocb list per host */
> +
>  	error = lpfc_init_iocb_list(phba, LPFC_IOCB_LIST_CNT);
>  	if (error) {
>  		lpfc_printf_log(phba, KERN_ERR, LOG_INIT,
> @@ -11092,7 +11096,6 @@ lpfc_pci_probe_one_s4(struct pci_dev *pdev, const struct pci_device_id *pid)
>  	}
>
>  	/* Initialize and populate the iocb list per host */
> -
>  	lpfc_printf_log(phba, KERN_INFO, LOG_INIT,
>  			"2821 initialize iocb list %d.\n",
>  			phba->cfg_iocb_cnt*1024);


cheers,
diff mbox

Patch

diff --git a/drivers/scsi/lpfc/lpfc_init.c b/drivers/scsi/lpfc/lpfc_init.c
index b56da01..cca7f81 100644
--- a/drivers/scsi/lpfc/lpfc_init.c
+++ b/drivers/scsi/lpfc/lpfc_init.c
@@ -5815,6 +5815,12 @@  lpfc_sli4_driver_resource_setup(struct lpfc_hba *phba)
 	INIT_LIST_HEAD(&phba->sli4_hba.lpfc_vfi_blk_list);
 	INIT_LIST_HEAD(&phba->lpfc_vpi_blk_list);
 
+	/* Initialize mboxq lists. If the early init routines fail
+	 * these lists need to be correctly initialized.
+	 */
+	INIT_LIST_HEAD(&phba->sli.mboxq);
+	INIT_LIST_HEAD(&phba->sli.mboxq_cmpl);
+
 	/* initialize optic_state to 0xFF */
 	phba->sli4_hba.lnk_info.optic_state = 0xff;
 
@@ -5880,6 +5886,7 @@  lpfc_sli4_driver_resource_setup(struct lpfc_hba *phba)
 					"READ_NV, mbxStatus x%x\n",
 					bf_get(lpfc_mqe_command, &mboxq->u.mqe),
 					bf_get(lpfc_mqe_status, &mboxq->u.mqe));
+			mempool_free(mboxq, phba->mbox_mem_pool);
 			rc = -EIO;
 			goto out_free_bsmbx;
 		}
@@ -7805,7 +7812,7 @@  lpfc_alloc_fcp_wq_cq(struct lpfc_hba *phba, int wqidx)
 
 	/* Create Fast Path FCP WQs */
 	wqesize = (phba->fcp_embed_io) ?
-				LPFC_WQE128_SIZE : phba->sli4_hba.wq_esize;
+		LPFC_WQE128_SIZE : phba->sli4_hba.wq_esize;
 	qdesc = lpfc_sli4_queue_alloc(phba, wqesize, phba->sli4_hba.wq_ecount);
 	if (!qdesc) {
 		lpfc_printf_log(phba, KERN_ERR, LOG_INIT,
@@ -7836,7 +7843,7 @@  int
 lpfc_sli4_queue_create(struct lpfc_hba *phba)
 {
 	struct lpfc_queue *qdesc;
-	int idx, io_channel, max;
+	int idx, io_channel;
 
 	/*
 	 * Create HBA Record arrays.
@@ -7997,15 +8004,6 @@  lpfc_sli4_queue_create(struct lpfc_hba *phba)
 		if (lpfc_alloc_nvme_wq_cq(phba, idx))
 			goto out_error;
 
-	/* allocate MRQ CQs */
-	max = phba->cfg_nvme_io_channel;
-	if (max < phba->cfg_nvmet_mrq)
-		max = phba->cfg_nvmet_mrq;
-
-	for (idx = 0; idx < max; idx++)
-		if (lpfc_alloc_nvme_wq_cq(phba, idx))
-			goto out_error;
-
 	if (phba->nvmet_support) {
 		for (idx = 0; idx < phba->cfg_nvmet_mrq; idx++) {
 			qdesc = lpfc_sli4_queue_alloc(phba,
@@ -8227,11 +8225,11 @@  lpfc_sli4_queue_destroy(struct lpfc_hba *phba)
 
 	/* Release FCP cqs */
 	lpfc_sli4_release_queues(&phba->sli4_hba.fcp_cq,
-					phba->cfg_fcp_io_channel);
+				 phba->cfg_fcp_io_channel);
 
 	/* Release FCP wqs */
 	lpfc_sli4_release_queues(&phba->sli4_hba.fcp_wq,
-					phba->cfg_fcp_io_channel);
+				 phba->cfg_fcp_io_channel);
 
 	/* Release FCP CQ mapping array */
 	lpfc_sli4_release_queue_map(&phba->sli4_hba.fcp_cq_map);
@@ -8577,15 +8575,15 @@  lpfc_sli4_queue_setup(struct lpfc_hba *phba)
 		lpfc_printf_log(phba, KERN_ERR, LOG_INIT,
 				"0528 %s not allocated\n",
 				phba->sli4_hba.mbx_cq ?
-						"Mailbox WQ" : "Mailbox CQ");
+				"Mailbox WQ" : "Mailbox CQ");
 		rc = -ENOMEM;
 		goto out_destroy;
 	}
 
 	rc = lpfc_create_wq_cq(phba, phba->sli4_hba.hba_eq[0],
-					phba->sli4_hba.mbx_cq,
-					phba->sli4_hba.mbx_wq,
-					NULL, 0, LPFC_MBOX);
+			       phba->sli4_hba.mbx_cq,
+			       phba->sli4_hba.mbx_wq,
+			       NULL, 0, LPFC_MBOX);
 	if (rc) {
 		lpfc_printf_log(phba, KERN_ERR, LOG_INIT,
 			"0529 Failed setup of mailbox WQ/CQ: rc = 0x%x\n",
@@ -10054,9 +10052,14 @@  lpfc_sli4_hba_unset(struct lpfc_hba *phba)
 	/* Stop kthread signal shall trigger work_done one more time */
 	kthread_stop(phba->worker_thread);
 
+	/* Unset the queues shared with the hardware then release all
+	 * allocated resources.
+	 */
+	lpfc_sli4_queue_unset(phba);
+	lpfc_sli4_queue_destroy(phba);
+
 	/* Reset SLI4 HBA FCoE function */
 	lpfc_pci_function_reset(phba);
-	lpfc_sli4_queue_destroy(phba);
 
 	/* Stop the SLI4 device port */
 	phba->pport->work_port_events = 0;
@@ -10312,6 +10315,7 @@  lpfc_pci_probe_one_s3(struct pci_dev *pdev, const struct pci_device_id *pid)
 	}
 
 	/* Initialize and populate the iocb list per host */
+
 	error = lpfc_init_iocb_list(phba, LPFC_IOCB_LIST_CNT);
 	if (error) {
 		lpfc_printf_log(phba, KERN_ERR, LOG_INIT,
@@ -11092,7 +11096,6 @@  lpfc_pci_probe_one_s4(struct pci_dev *pdev, const struct pci_device_id *pid)
 	}
 
 	/* Initialize and populate the iocb list per host */
-
 	lpfc_printf_log(phba, KERN_INFO, LOG_INIT,
 			"2821 initialize iocb list %d.\n",
 			phba->cfg_iocb_cnt*1024);
@@ -11183,7 +11186,9 @@  lpfc_pci_probe_one_s4(struct pci_dev *pdev, const struct pci_device_id *pid)
 	if ((phba->nvmet_support == 0) &&
 	    (phba->cfg_enable_fc4_type & LPFC_ENABLE_NVME)) {
 		/* Create NVME binding with nvme_fc_transport. This
-		 * ensures the vport is initialized.
+		 * ensures the vport is initialized.  If the localport
+		 * create fails, it should not unload the driver to
+		 * support field issues.
 		 */
 		error = lpfc_nvme_create_localport(vport);
 		if (error) {
@@ -11191,7 +11196,6 @@  lpfc_pci_probe_one_s4(struct pci_dev *pdev, const struct pci_device_id *pid)
 					"6004 NVME registration failed, "
 					"error x%x\n",
 					error);
-			goto out_disable_intr;
 		}
 	}
 
diff --git a/drivers/scsi/lpfc/lpfc_sli.c b/drivers/scsi/lpfc/lpfc_sli.c
index 7087c55..d606944 100644
--- a/drivers/scsi/lpfc/lpfc_sli.c
+++ b/drivers/scsi/lpfc/lpfc_sli.c
@@ -13758,7 +13758,10 @@  lpfc_sli4_queue_free(struct lpfc_queue *queue)
 		lpfc_free_rq_buffer(queue->phba, queue);
 		kfree(queue->rqbp);
 	}
-	kfree(queue->pring);
+
+	if (!list_empty(&queue->wq_list))
+		list_del(&queue->wq_list);
+
 	kfree(queue);
 	return;
 }
@@ -15561,6 +15564,8 @@  lpfc_wq_destroy(struct lpfc_hba *phba, struct lpfc_queue *wq)
 	}
 	/* Remove wq from any list */
 	list_del_init(&wq->list);
+	kfree(wq->pring);
+	wq->pring = NULL;
 	mempool_free(mbox, wq->phba->mbox_mem_pool);
 	return status;
 }