diff mbox series

[ndctl] ndctl/dimm: Attempt an abort upon firmware-update-busy status

Message ID 160859948599.1811202.6750689494442813307.stgit@dwillia2-desk3.amr.corp.intel.com (mailing list archive)
State Accepted
Commit b43645707540f425ce905912b8539eb4821182e7
Headers show
Series [ndctl] ndctl/dimm: Attempt an abort upon firmware-update-busy status | expand

Commit Message

Dan Williams Dec. 22, 2020, 1:11 a.m. UTC
Mark reports that if a previous firmware update is blocked due to a
background ARS then ndctl fails to start another firmware-update
request until the platform is rebooted.

Teach 'ndctl update-firmware' to abort previous firmware-update sessions
when '--force' is specified.

Link: https://github.com/pmem/ndctl/issues/155
Link: http://lore.kernel.org/r/20201222005704.2355076-1-jane.chu@oracle.com
Reported-by: Mark Baker <mark.a.baker@oracle.com>
Tested-by: Mark Baker <mark.a.baker@oracle.com>
Tested-by: Jane Chu <jane.chu@oracle.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---

Needs the fix from Jane mentioned in the link above; with that fix
included, Jane and Mark report that this works.

 ndctl/dimm.c |  109 ++++++++++++++++++++++++++++++++++++----------------------
 1 file changed, 67 insertions(+), 42 deletions(-)
diff mbox series

Patch

diff --git a/ndctl/dimm.c b/ndctl/dimm.c
index 8e85d692afd3..167c3f1bc7c7 100644
--- a/ndctl/dimm.c
+++ b/ndctl/dimm.c
@@ -504,6 +504,36 @@  out:
 	return rc;
 }
 
+static int submit_abort_firmware(struct ndctl_dimm *dimm,
+		struct action_context *actx)
+{
+	struct update_context *uctx = &actx->update;
+	struct ndctl_cmd *cmd;
+	int rc;
+	enum ND_FW_STATUS status;
+
+	cmd = ndctl_dimm_cmd_new_fw_abort(uctx->start);
+	if (!cmd)
+		return -ENXIO;
+
+	rc = ndctl_cmd_submit(cmd);
+	if (rc < 0)
+		goto out;
+
+	status = ndctl_cmd_fw_xlat_firmware_status(cmd);
+	if (!(status & ND_CMD_STATUS_FIN_ABORTED)) {
+		fprintf(stderr,
+			"Firmware update abort on DIMM %s failed: %#x\n",
+			ndctl_dimm_get_devname(dimm), status);
+		rc = -ENXIO;
+		goto out;
+	}
+
+out:
+	ndctl_cmd_unref(cmd);
+	return rc;
+}
+
 static int submit_start_firmware_upload(struct ndctl_dimm *dimm,
 		struct action_context *actx)
 {
@@ -511,8 +541,8 @@  static int submit_start_firmware_upload(struct ndctl_dimm *dimm,
 	struct update_context *uctx = &actx->update;
 	struct fw_info *fw = &uctx->dimm_fw;
 	struct ndctl_cmd *cmd;
-	int rc;
 	enum ND_FW_STATUS status;
+	int rc;
 
 	cmd = ndctl_dimm_cmd_new_fw_start_update(dimm);
 	if (!cmd)
@@ -520,27 +550,46 @@  static int submit_start_firmware_upload(struct ndctl_dimm *dimm,
 
 	rc = ndctl_cmd_submit(cmd);
 	if (rc < 0)
-		return rc;
+		goto err;
 
+	uctx->start = cmd;
 	status = ndctl_cmd_fw_xlat_firmware_status(cmd);
 	if (status == FW_EBUSY) {
-		err("%s: busy with another firmware update", devname);
-		return -EBUSY;
+		if (param.force) {
+			rc = submit_abort_firmware(dimm, actx);
+			if (rc < 0) {
+				err("%s: busy with another firmware update, "
+				    "abort failed", devname);
+				rc = -EBUSY;
+				goto err;
+			}
+			rc = -EAGAIN;
+			goto err;
+		} else {
+			err("%s: busy with another firmware update", devname);
+			rc = -EBUSY;
+			goto err;
+		}
 	}
 	if (status != FW_SUCCESS) {
 		err("%s: failed to create start context", devname);
-		return -ENXIO;
+		rc = -ENXIO;
+		goto err;
 	}
 
 	fw->context = ndctl_cmd_fw_start_get_context(cmd);
 	if (fw->context == UINT_MAX) {
 		err("%s: failed to retrieve start context", devname);
-		return -ENXIO;
+		rc = -ENXIO;
+		goto err;
 	}
 
-	uctx->start = cmd;
-
 	return 0;
+
+err:
+	uctx->start = NULL;
+	ndctl_cmd_unref(cmd);
+	return rc;
 }
 
 static int get_fw_data_from_file(FILE *file, void *buf, uint32_t len)
@@ -659,36 +708,6 @@  out:
 	return rc;
 }
 
-static int submit_abort_firmware(struct ndctl_dimm *dimm,
-		struct action_context *actx)
-{
-	struct update_context *uctx = &actx->update;
-	struct ndctl_cmd *cmd;
-	int rc;
-	enum ND_FW_STATUS status;
-
-	cmd = ndctl_dimm_cmd_new_fw_abort(uctx->start);
-	if (!cmd)
-		return -ENXIO;
-
-	rc = ndctl_cmd_submit(cmd);
-	if (rc < 0)
-		goto out;
-
-	status = ndctl_cmd_fw_xlat_firmware_status(cmd);
-	if (!(status & ND_CMD_STATUS_FIN_ABORTED)) {
-		fprintf(stderr,
-			"Firmware update abort on DIMM %s failed: %#x\n",
-			ndctl_dimm_get_devname(dimm), status);
-		rc = -ENXIO;
-		goto out;
-	}
-
-out:
-	ndctl_cmd_unref(cmd);
-	return rc;
-}
-
 static enum ndctl_fwa_state fw_update_arm(struct ndctl_dimm *dimm)
 {
 	struct ndctl_bus *bus = ndctl_dimm_get_bus(dimm);
@@ -856,15 +875,21 @@  static int update_firmware(struct ndctl_dimm *dimm,
 		struct action_context *actx)
 {
 	const char *devname = ndctl_dimm_get_devname(dimm);
-	int rc;
+	int rc, i;
 
 	rc = submit_get_firmware_info(dimm, actx);
 	if (rc < 0)
 		return rc;
 
-	rc = submit_start_firmware_upload(dimm, actx);
-	if (rc < 0)
-		return rc;
+	/* try a few times in the --force and state busy case */
+	for (i = 0; i < 3; i++) {
+		rc = submit_start_firmware_upload(dimm, actx);
+		if (rc == -EAGAIN)
+			continue;
+		if (rc < 0)
+			return rc;
+		break;
+	}
 
 	if (param.verbose)
 		fprintf(stderr, "%s: uploading firmware\n", devname);