
[05/10] accel/ivpu: Fix potential infinite loops in IRQ handlers

Message ID 20240105112218.351265-6-jacek.lawrynowicz@linux.intel.com (mailing list archive)
State New, archived
Series accel/ivpu fixes for 6.8

Commit Message

Jacek Lawrynowicz Jan. 5, 2024, 11:22 a.m. UTC
Limit number of iterations in ivpu_mmu_irq_evtq_handler() and
ivpu_ipc_irq_handler().

Signed-off-by: Jacek Lawrynowicz <jacek.lawrynowicz@linux.intel.com>
---
 drivers/accel/ivpu/ivpu_ipc.c |  6 ++++++
 drivers/accel/ivpu/ivpu_mmu.c | 21 +++++++++++++--------
 2 files changed, 19 insertions(+), 8 deletions(-)

Comments

Jeffrey Hugo Jan. 5, 2024, 4:35 p.m. UTC | #1
On 1/5/2024 4:22 AM, Jacek Lawrynowicz wrote:
> Limit number of iterations in ivpu_mmu_irq_evtq_handler() and
> ivpu_ipc_irq_handler().

"potential infinite loops" sounds like something that has not been 
observed.  Has a problem actually occurred?

Are you concerned that the FW is broken and spamming Linux with events?

Why a limit of 100 events?  Seems arbitrary.

I suspect threaded irqs might be useful here, but it is hard to tell.
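
(For reference, a minimal sketch of what the threaded-irq approach could look
like. The handler names and wiring below are illustrative assumptions, not the
driver's actual code; request_threaded_irq() and IRQF_ONESHOT are real kernel
API.)

#include <linux/interrupt.h>
#include <linux/pci.h>

/* Hard handler: runs in IRQ context, only defers the heavy work. */
static irqreturn_t ivpu_hard_irq(int irq, void *arg)
{
	return IRQ_WAKE_THREAD;
}

/* Thread handler: runs in a preemptible kernel thread, so a long
 * queue drain no longer monopolizes a CPU the way an IRQ-context
 * loop can. */
static irqreturn_t ivpu_thread_irq(int irq, void *arg)
{
	struct ivpu_device *vdev = arg;

	ivpu_mmu_irq_evtq_handler(vdev);
	return IRQ_HANDLED;
}

/* Hypothetical probe-time wiring (error handling omitted). */
static int ivpu_irq_init(struct ivpu_device *vdev, struct pci_dev *pdev)
{
	return request_threaded_irq(pdev->irq, ivpu_hard_irq, ivpu_thread_irq,
				    IRQF_ONESHOT, KBUILD_MODNAME, vdev);
}

With IRQF_ONESHOT the interrupt line stays masked until the thread function
returns, so the hard handler can even be NULL.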

Jacek Lawrynowicz Jan. 9, 2024, 12:34 p.m. UTC | #2
On 05.01.2024 17:35, Jeffrey Hugo wrote:
> On 1/5/2024 4:22 AM, Jacek Lawrynowicz wrote:
>> Limit number of iterations in ivpu_mmu_irq_evtq_handler() and
>> ivpu_ipc_irq_handler().
> 
> "potential infinite loops" sounds like something that has not been observed.  Has a problem actually occurred?
> 
> Are you concerned that the FW is broken and spamming Linux with events?
> 
> Why a limit of 100 events?  Seems arbitrary.
> 
> I suspect threaded irqs might be useful here, but it is hard to tell.

No, these problems have not been observed yet.
I think the small size of the IPC/MMU queues and the relatively fast host CPU prevent the host from being overloaded.
Maybe let's drop this patch until there is a real issue then :)

Patch

diff --git a/drivers/accel/ivpu/ivpu_ipc.c b/drivers/accel/ivpu/ivpu_ipc.c
index e86621f16f85..f69780248803 100644
--- a/drivers/accel/ivpu/ivpu_ipc.c
+++ b/drivers/accel/ivpu/ivpu_ipc.c
@@ -389,12 +389,18 @@ void ivpu_ipc_irq_handler(struct ivpu_device *vdev, bool *wake_thread)
 	unsigned long flags;
 	bool dispatched;
 	u32 vpu_addr;
+	int msg_count = 0;
 
 	/*
 	 * Driver needs to purge all messages from IPC FIFO to clear IPC interrupt.
 	 * Without purge IPC FIFO to 0 next IPC interrupts won't be generated.
 	 */
 	while (ivpu_hw_reg_ipc_rx_count_get(vdev)) {
+		if (++msg_count > IPC_MAX_RX_MSG) {
+			ivpu_pm_schedule_recovery(vdev);
+			return;
+		}
+
 		vpu_addr = ivpu_hw_reg_ipc_rx_addr_get(vdev);
 		if (vpu_addr == REG_IO_ERROR) {
 			ivpu_err_ratelimited(vdev, "Failed to read IPC rx addr register\n");
diff --git a/drivers/accel/ivpu/ivpu_mmu.c b/drivers/accel/ivpu/ivpu_mmu.c
index 1f813625aab3..c82929b0ae9d 100644
--- a/drivers/accel/ivpu/ivpu_mmu.c
+++ b/drivers/accel/ivpu/ivpu_mmu.c
@@ -236,6 +236,8 @@
 #define IVPU_MMU_CERROR_ABT          0x2
 #define IVPU_MMU_CERROR_ATC_INV_SYNC 0x3
 
+#define IVPU_MMU_MAX_EVENT_COUNT     100
+
 static const char *ivpu_mmu_event_to_str(u32 cmd)
 {
 	switch (cmd) {
@@ -887,7 +889,7 @@ static u32 *ivpu_mmu_get_event(struct ivpu_device *vdev)
 
 void ivpu_mmu_irq_evtq_handler(struct ivpu_device *vdev)
 {
-	bool schedule_recovery = false;
+	int event_count = 0;
 	u32 *event;
 	u32 ssid;
 
@@ -895,16 +897,19 @@ void ivpu_mmu_irq_evtq_handler(struct ivpu_device *vdev)
 
 	while ((event = ivpu_mmu_get_event(vdev)) != NULL) {
 		ivpu_mmu_dump_event(vdev, event);
+		if (++event_count > IVPU_MMU_MAX_EVENT_COUNT) {
+			ivpu_pm_schedule_recovery(vdev);
+			return;
+		}
 
 		ssid = FIELD_GET(IVPU_MMU_EVT_SSID_MASK, event[0]);
-		if (ssid == IVPU_GLOBAL_CONTEXT_MMU_SSID)
-			schedule_recovery = true;
-		else
-			ivpu_mmu_user_context_mark_invalid(vdev, ssid);
-	}
+		if (ssid == IVPU_GLOBAL_CONTEXT_MMU_SSID) {
+			ivpu_pm_schedule_recovery(vdev);
+			return;
+		}
 
-	if (schedule_recovery)
-		ivpu_pm_schedule_recovery(vdev);
+		ivpu_mmu_user_context_mark_invalid(vdev, ssid);
+	}
 }
 
 void ivpu_mmu_evtq_dump(struct ivpu_device *vdev)
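
Both hunks apply the same pattern: drain a hardware queue, but give up after a
fixed budget and escalate to device recovery instead of looping forever in IRQ
context. A distilled, driver-agnostic sketch of that pattern follows;
hw_queue_not_empty() and process_one_entry() are hypothetical stand-ins for
the real register reads and dispatch calls.

#define MAX_DRAIN_ITERATIONS 100	/* iteration budget; the MMU hunk uses 100 */

static void bounded_drain(struct ivpu_device *vdev)
{
	int count = 0;

	while (hw_queue_not_empty(vdev)) {
		if (++count > MAX_DRAIN_ITERATIONS) {
			/* A queue that never drains is treated as a hung or
			 * misbehaving device: stop looping and let the
			 * recovery path reset it. */
			ivpu_pm_schedule_recovery(vdev);
			return;
		}
		process_one_entry(vdev);	/* read one entry and dispatch it */
	}
}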