diff mbox series

[v2,5/6] drm/msm: Add a module param to force coredump

Message ID 20211124024436.v2.5.I7e56690d7d94973520d4f4b7d9441e2b39ac2224@changeid (mailing list archive)
State Not Applicable
Headers show
Series [v2,1/6] drm/msm: Increase gpu boost interval | expand

Commit Message

Akhil P Oommen Nov. 23, 2021, 9:17 p.m. UTC
Add a module param "force_gpu_coredump" to force coredump on relatively
harmless gpu hw errors.

Signed-off-by: Akhil P Oommen <akhilpo@codeaurora.org>
---

(no changes since v1)

 drivers/gpu/drm/msm/adreno/a5xx_gpu.c      | 33 ++++++++++++++++++--------
 drivers/gpu/drm/msm/adreno/a6xx_gpu.c      | 38 +++++++++++++++++++++---------
 drivers/gpu/drm/msm/adreno/adreno_device.c |  4 ++++
 3 files changed, 54 insertions(+), 21 deletions(-)
diff mbox series

Patch

diff --git a/drivers/gpu/drm/msm/adreno/a5xx_gpu.c b/drivers/gpu/drm/msm/adreno/a5xx_gpu.c
index 5e2750e..1861e9a 100644
--- a/drivers/gpu/drm/msm/adreno/a5xx_gpu.c
+++ b/drivers/gpu/drm/msm/adreno/a5xx_gpu.c
@@ -14,6 +14,7 @@ 
 #include "a5xx_gpu.h"
 
 extern bool hang_debug;
+extern bool force_gpu_coredump;
 static void a5xx_dump(struct msm_gpu *gpu);
 
 #define GPU_PAS_ID 13
@@ -1237,11 +1238,6 @@  static void a5xx_fault_detect_irq(struct msm_gpu *gpu)
 		gpu_read(gpu, REG_A5XX_CP_IB1_BUFSZ),
 		gpu_read64(gpu, REG_A5XX_CP_IB2_BASE, REG_A5XX_CP_IB2_BASE_HI),
 		gpu_read(gpu, REG_A5XX_CP_IB2_BUFSZ));
-
-	/* Turn off the hangcheck timer to keep it from bothering us */
-	del_timer(&gpu->hangcheck_timer);
-
-	kthread_queue_work(gpu->worker, &gpu->recover_work);
 }
 
 #define RBBM_ERROR_MASK \
@@ -1255,6 +1251,7 @@  static void a5xx_fault_detect_irq(struct msm_gpu *gpu)
 static irqreturn_t a5xx_irq(struct msm_gpu *gpu)
 {
 	u32 status = gpu_read(gpu, REG_A5XX_RBBM_INT_0_STATUS);
+	bool coredump = false;
 
 	/*
 	 * Clear all the interrupts except RBBM_AHB_ERROR - if we clear it
@@ -1264,20 +1261,30 @@  static irqreturn_t a5xx_irq(struct msm_gpu *gpu)
 		status & ~A5XX_RBBM_INT_0_MASK_RBBM_AHB_ERROR);
 
 	/* Pass status to a5xx_rbbm_err_irq because we've already cleared it */
-	if (status & RBBM_ERROR_MASK)
+	if (status & RBBM_ERROR_MASK) {
 		a5xx_rbbm_err_irq(gpu, status);
+		coredump |= force_gpu_coredump;
+	}
 
-	if (status & A5XX_RBBM_INT_0_MASK_CP_HW_ERROR)
+	if (status & A5XX_RBBM_INT_0_MASK_CP_HW_ERROR) {
 		a5xx_cp_err_irq(gpu);
+		coredump |= force_gpu_coredump;
+	}
 
-	if (status & A5XX_RBBM_INT_0_MASK_MISC_HANG_DETECT)
+	if (status & A5XX_RBBM_INT_0_MASK_MISC_HANG_DETECT) {
 		a5xx_fault_detect_irq(gpu);
+		coredump = true;
+	}
 
-	if (status & A5XX_RBBM_INT_0_MASK_UCHE_OOB_ACCESS)
+	if (status & A5XX_RBBM_INT_0_MASK_UCHE_OOB_ACCESS) {
 		a5xx_uche_err_irq(gpu);
+		coredump |= force_gpu_coredump;
+	}
 
-	if (status & A5XX_RBBM_INT_0_MASK_GPMU_VOLTAGE_DROOP)
+	if (status & A5XX_RBBM_INT_0_MASK_GPMU_VOLTAGE_DROOP) {
 		a5xx_gpmu_err_irq(gpu);
+		coredump |= force_gpu_coredump;
+	}
 
 	if (status & A5XX_RBBM_INT_0_MASK_CP_CACHE_FLUSH_TS) {
 		a5xx_preempt_trigger(gpu);
@@ -1287,6 +1294,12 @@  static irqreturn_t a5xx_irq(struct msm_gpu *gpu)
 	if (status & A5XX_RBBM_INT_0_MASK_CP_SW)
 		a5xx_preempt_irq(gpu);
 
+	if (coredump) {
+		/* Turn off the hangcheck timer to keep it from bothering us */
+		del_timer(&gpu->hangcheck_timer);
+		kthread_queue_work(gpu->worker, &gpu->recover_work);
+	}
+
 	return IRQ_HANDLED;
 }
 
diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
index 6c2edce..f96587f 100644
--- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
+++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
@@ -15,6 +15,8 @@ 
 
 #define GPU_PAS_ID 13
 
+extern bool force_gpu_coredump;
+
 static inline bool _a6xx_check_idle(struct msm_gpu *gpu)
 {
 	struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
@@ -1354,40 +1356,54 @@  static void a6xx_fault_detect_irq(struct msm_gpu *gpu)
 		gpu_read(gpu, REG_A6XX_CP_IB1_REM_SIZE),
 		gpu_read64(gpu, REG_A6XX_CP_IB2_BASE, REG_A6XX_CP_IB2_BASE_HI),
 		gpu_read(gpu, REG_A6XX_CP_IB2_REM_SIZE));
-
-	/* Turn off the hangcheck timer to keep it from bothering us */
-	del_timer(&gpu->hangcheck_timer);
-
-	kthread_queue_work(gpu->worker, &gpu->recover_work);
 }
 
 static irqreturn_t a6xx_irq(struct msm_gpu *gpu)
 {
 	u32 status = gpu_read(gpu, REG_A6XX_RBBM_INT_0_STATUS);
+	bool coredump = false;
 
 	gpu_write(gpu, REG_A6XX_RBBM_INT_CLEAR_CMD, status);
 
-	if (status & A6XX_RBBM_INT_0_MASK_RBBM_HANG_DETECT)
+	if (status & A6XX_RBBM_INT_0_MASK_RBBM_HANG_DETECT) {
 		a6xx_fault_detect_irq(gpu);
+		coredump = true;
+	}
 
-	if (status & A6XX_RBBM_INT_0_MASK_CP_AHB_ERROR)
+	if (status & A6XX_RBBM_INT_0_MASK_CP_AHB_ERROR) {
 		dev_err_ratelimited(&gpu->pdev->dev, "CP | AHB bus error\n");
+		coredump |= force_gpu_coredump;
+	}
 
-	if (status & A6XX_RBBM_INT_0_MASK_CP_HW_ERROR)
+	if (status & A6XX_RBBM_INT_0_MASK_CP_HW_ERROR) {
 		a6xx_cp_hw_err_irq(gpu);
+		coredump |= force_gpu_coredump;
+	}
 
-	if (status & A6XX_RBBM_INT_0_MASK_RBBM_ATB_ASYNCFIFO_OVERFLOW)
+	if (status & A6XX_RBBM_INT_0_MASK_RBBM_ATB_ASYNCFIFO_OVERFLOW) {
 		dev_err_ratelimited(&gpu->pdev->dev, "RBBM | ATB ASYNC overflow\n");
+		coredump |= force_gpu_coredump;
+	}
 
-	if (status & A6XX_RBBM_INT_0_MASK_RBBM_ATB_BUS_OVERFLOW)
+	if (status & A6XX_RBBM_INT_0_MASK_RBBM_ATB_BUS_OVERFLOW) {
 		dev_err_ratelimited(&gpu->pdev->dev, "RBBM | ATB bus overflow\n");
+		coredump |= force_gpu_coredump;
+	}
 
-	if (status & A6XX_RBBM_INT_0_MASK_UCHE_OOB_ACCESS)
+	if (status & A6XX_RBBM_INT_0_MASK_UCHE_OOB_ACCESS) {
 		dev_err_ratelimited(&gpu->pdev->dev, "UCHE | Out of bounds access\n");
+		coredump |= force_gpu_coredump;
+	}
 
 	if (status & A6XX_RBBM_INT_0_MASK_CP_CACHE_FLUSH_TS)
 		msm_gpu_retire(gpu);
 
+	if (coredump) {
+		/* Turn off the hangcheck timer to keep it from bothering us */
+		del_timer(&gpu->hangcheck_timer);
+		kthread_queue_work(gpu->worker, &gpu->recover_work);
+	}
+
 	return IRQ_HANDLED;
 }
 
diff --git a/drivers/gpu/drm/msm/adreno/adreno_device.c b/drivers/gpu/drm/msm/adreno/adreno_device.c
index 2a6ce76..a159cb9 100644
--- a/drivers/gpu/drm/msm/adreno/adreno_device.c
+++ b/drivers/gpu/drm/msm/adreno/adreno_device.c
@@ -20,6 +20,10 @@  bool allow_vram_carveout = false;
 MODULE_PARM_DESC(allow_vram_carveout, "Allow using VRAM Carveout, in place of IOMMU");
 module_param_named(allow_vram_carveout, allow_vram_carveout, bool, 0600);
 
+bool force_gpu_coredump = false;
+MODULE_PARM_DESC(snapshot_debugbus, "Force gpu coredump on hw errors which are usually harmless");
+module_param_named(force_gpu_coredump, force_gpu_coredump, bool, 0600);
+
 static const struct adreno_info gpulist[] = {
 	{
 		.rev   = ADRENO_REV(2, 0, 0, 0),