diff mbox series

[v2,1/5] PCI/switchtec: Error out MRPC execution when MMIO reads fail

Message ID 20211014141859.11444-2-kelvin.cao@microchip.com (mailing list archive)
State Accepted
Delegated to: Bjorn Helgaas
Headers show
Series Switchtec Fixes and Improvements | expand

Commit Message

Kelvin Cao Oct. 14, 2021, 2:18 p.m. UTC
From: Kelvin Cao <kelvin.cao@microchip.com>

A firmware hard reset may be initiated by various mechanisms including a
UART interface, TWI sideband interface from BMC, MRPC command from
userspace, etc. The switchtec management driver is unaware of these
resets.

The reset clears PCI state including the BARs and Memory Space Enable
bits, so the device no longer responds to the MMIO accesses the driver
uses to operate it.

MMIO reads to the device will fail with a PCIe error. When the root
complex handles that error, it typically fabricates ~0 data to complete
the CPU read.

Check for this sort of error by reading the device ID from MMIO space.
This ID can never be ~0, so if we see that value, it probably means the
PCIe Memory Read failed and we should return an error indication to the
application using the switchtec driver.

Signed-off-by: Kelvin Cao <kelvin.cao@microchip.com>
---
 drivers/pci/switch/switchtec.c | 67 ++++++++++++++++++++++++++++++----
 1 file changed, 60 insertions(+), 7 deletions(-)

Comments

Krzysztof Wilczyński Oct. 15, 2021, 1:21 a.m. UTC | #1
Hi Kelvin,

Thank you for sending the series over!

I am terribly sorry for a very late comment, especially since Bjorn already
accepted this series to be included, but allow me for a small question
below.

[...]
> @@ -113,6 +127,7 @@ static void stuser_set_state(struct switchtec_user *stuser,
>  		[MRPC_QUEUED] = "QUEUED",
>  		[MRPC_RUNNING] = "RUNNING",
>  		[MRPC_DONE] = "DONE",
> +		[MRPC_IO_ERROR] = "IO_ERROR",

Looking at the above, and then looking at stuser_set_state(), which
contains the following local array definition:

	const char * const state_names[] = {
		[MRPC_IDLE] = "IDLE",
		[MRPC_QUEUED] = "QUEUED",
		[MRPC_RUNNING] = "RUNNING",
		[MRPC_DONE] = "DONE",
	};

I was wondering if there might be a small benefit of declaring this array
state_names[], or list of states if you wish, as static so that we avoid
having to allocate space and fill it in with values every time this
functions runs?

The function itself if referenced in few places as per:

  Index File                           Line Content
      1 drivers/pci/switch/switchtec.c  159 stuser_set_state(stuser, MRPC_RUNNING);
      2 drivers/pci/switch/switchtec.c  178 stuser_set_state(stuser, MRPC_QUEUED);
      3 drivers/pci/switch/switchtec.c  206 stuser_set_state(stuser, MRPC_DONE);
      4 drivers/pci/switch/switchtec.c  567 stuser_set_state(stuser, MRPC_IDLE);

Even though the string representation of the state is ever only printed if
a debug logging is requested, we would allocate and popular this array
every time anyway, regardless of whether we print any debug information or
not.

What do you think?

	Krzysztof
Kelvin Cao Oct. 15, 2021, 10:13 p.m. UTC | #2
On Fri, 2021-10-15 at 03:21 +0200, Krzysztof Wilczyński wrote:
> Hi Kelvin,
> 
> Thank you for sending the series over!
> 
> I am terribly sorry for a very late comment, especially since Bjorn
> already
> accepted this series to be included, but allow me for a small
> question
> below.
> 
> [...]
> > @@ -113,6 +127,7 @@ static void stuser_set_state(struct
> > switchtec_user *stuser,
> >               [MRPC_QUEUED] = "QUEUED",
> >               [MRPC_RUNNING] = "RUNNING",
> >               [MRPC_DONE] = "DONE",
> > +             [MRPC_IO_ERROR] = "IO_ERROR",
> 
> Looking at the above, and then looking at stuser_set_state(), which
> contains the following local array definition:
> 
>         const char * const state_names[] = {
>                 [MRPC_IDLE] = "IDLE",
>                 [MRPC_QUEUED] = "QUEUED",
>                 [MRPC_RUNNING] = "RUNNING",
>                 [MRPC_DONE] = "DONE",
>         };
> 
> I was wondering if there might be a small benefit of declaring this
> array
> state_names[], or list of states if you wish, as static so that we
> avoid
> having to allocate space and fill it in with values every time this
> functions runs?
> 
> The function itself if referenced in few places as per:
> 
>   Index File                           Line Content
>       1 drivers/pci/switch/switchtec.c  159 stuser_set_state(stuser,
> MRPC_RUNNING);
>       2 drivers/pci/switch/switchtec.c  178 stuser_set_state(stuser,
> MRPC_QUEUED);
>       3 drivers/pci/switch/switchtec.c  206 stuser_set_state(stuser,
> MRPC_DONE);
>       4 drivers/pci/switch/switchtec.c  567 stuser_set_state(stuser,
> MRPC_IDLE);
> 
> Even though the string representation of the state is ever only
> printed if
> a debug logging is requested, we would allocate and popular this
> array
> every time anyway, regardless of whether we print any debug
> information or
> not.
> 
> What do you think?

Thank you Krzysztof. That will be an improvement. I can probably tweak
it in the next patchset (coming soon). 

Kelvin
> 
>         Krzysztof
diff mbox series

Patch

diff --git a/drivers/pci/switch/switchtec.c b/drivers/pci/switch/switchtec.c
index 0b301f8be9ed..e5bb2ac0e7bb 100644
--- a/drivers/pci/switch/switchtec.c
+++ b/drivers/pci/switch/switchtec.c
@@ -45,6 +45,7 @@  enum mrpc_state {
 	MRPC_QUEUED,
 	MRPC_RUNNING,
 	MRPC_DONE,
+	MRPC_IO_ERROR,
 };
 
 struct switchtec_user {
@@ -66,6 +67,19 @@  struct switchtec_user {
 	int event_cnt;
 };
 
+/*
+ * The MMIO reads to the device_id register should always return the device ID
+ * of the device, otherwise the firmware is probably stuck or unreachable
+ * due to a firmware reset which clears PCI state including the BARs and Memory
+ * Space Enable bits.
+ */
+static int is_firmware_running(struct switchtec_dev *stdev)
+{
+	u32 device = ioread32(&stdev->mmio_sys_info->device_id);
+
+	return stdev->pdev->device == device;
+}
+
 static struct switchtec_user *stuser_create(struct switchtec_dev *stdev)
 {
 	struct switchtec_user *stuser;
@@ -113,6 +127,7 @@  static void stuser_set_state(struct switchtec_user *stuser,
 		[MRPC_QUEUED] = "QUEUED",
 		[MRPC_RUNNING] = "RUNNING",
 		[MRPC_DONE] = "DONE",
+		[MRPC_IO_ERROR] = "IO_ERROR",
 	};
 
 	stuser->state = state;
@@ -184,9 +199,26 @@  static int mrpc_queue_cmd(struct switchtec_user *stuser)
 	return 0;
 }
 
+static void mrpc_cleanup_cmd(struct switchtec_dev *stdev)
+{
+	/* requires the mrpc_mutex to already be held when called */
+
+	struct switchtec_user *stuser = list_entry(stdev->mrpc_queue.next,
+						   struct switchtec_user, list);
+
+	stuser->cmd_done = true;
+	wake_up_interruptible(&stuser->cmd_comp);
+	list_del_init(&stuser->list);
+	stuser_put(stuser);
+	stdev->mrpc_busy = 0;
+
+	mrpc_cmd_submit(stdev);
+}
+
 static void mrpc_complete_cmd(struct switchtec_dev *stdev)
 {
 	/* requires the mrpc_mutex to already be held when called */
+
 	struct switchtec_user *stuser;
 
 	if (list_empty(&stdev->mrpc_queue))
@@ -223,13 +255,7 @@  static void mrpc_complete_cmd(struct switchtec_dev *stdev)
 		memcpy_fromio(stuser->data, &stdev->mmio_mrpc->output_data,
 			      stuser->read_len);
 out:
-	stuser->cmd_done = true;
-	wake_up_interruptible(&stuser->cmd_comp);
-	list_del_init(&stuser->list);
-	stuser_put(stuser);
-	stdev->mrpc_busy = 0;
-
-	mrpc_cmd_submit(stdev);
+	mrpc_cleanup_cmd(stdev);
 }
 
 static void mrpc_event_work(struct work_struct *work)
@@ -246,6 +272,23 @@  static void mrpc_event_work(struct work_struct *work)
 	mutex_unlock(&stdev->mrpc_mutex);
 }
 
+static void mrpc_error_complete_cmd(struct switchtec_dev *stdev)
+{
+	/* requires the mrpc_mutex to already be held when called */
+
+	struct switchtec_user *stuser;
+
+	if (list_empty(&stdev->mrpc_queue))
+		return;
+
+	stuser = list_entry(stdev->mrpc_queue.next,
+			    struct switchtec_user, list);
+
+	stuser_set_state(stuser, MRPC_IO_ERROR);
+
+	mrpc_cleanup_cmd(stdev);
+}
+
 static void mrpc_timeout_work(struct work_struct *work)
 {
 	struct switchtec_dev *stdev;
@@ -257,6 +300,11 @@  static void mrpc_timeout_work(struct work_struct *work)
 
 	mutex_lock(&stdev->mrpc_mutex);
 
+	if (!is_firmware_running(stdev)) {
+		mrpc_error_complete_cmd(stdev);
+		goto out;
+	}
+
 	if (stdev->dma_mrpc)
 		status = stdev->dma_mrpc->status;
 	else
@@ -544,6 +592,11 @@  static ssize_t switchtec_dev_read(struct file *filp, char __user *data,
 	if (rc)
 		return rc;
 
+	if (stuser->state == MRPC_IO_ERROR) {
+		mutex_unlock(&stdev->mrpc_mutex);
+		return -EIO;
+	}
+
 	if (stuser->state != MRPC_DONE) {
 		mutex_unlock(&stdev->mrpc_mutex);
 		return -EBADE;