@@ -4092,7 +4092,7 @@ i915_wedged_set(void *data, u64 val)
engine->hangcheck.stalled = true;
}
- i915_handle_error(i915, val, "Manually setting wedged to %llu", val);
+ i915_handle_error(i915, true, val, "Manually setting wedged to %llu", val);
wait_on_bit(&i915->gpu_error.flags,
I915_RESET_HANDOFF,
@@ -1004,6 +1004,8 @@ struct i915_gpu_state {
u32 *pages[0];
} *ringbuffer, *batchbuffer, *wa_batchbuffer, *ctx, *hws_page;
+ bool batch_had_hang_signature;
+
struct drm_i915_error_object **user_bo;
long user_bo_count;
@@ -3372,8 +3374,9 @@ static inline void i915_queue_hangcheck(struct drm_i915_private *dev_priv)
&dev_priv->gpu_error.hangcheck_work, delay);
}
-__printf(3, 4)
+__printf(4, 5)
void i915_handle_error(struct drm_i915_private *dev_priv,
+ bool intentional,
u32 engine_mask,
const char *fmt, ...);
@@ -3960,6 +3963,7 @@ static inline void i915_error_state_buf_release(
struct i915_gpu_state *i915_capture_gpu_state(struct drm_i915_private *i915);
void i915_capture_error_state(struct drm_i915_private *dev_priv,
+ bool intentional,
u32 engine_mask,
const char *error_msg);
@@ -918,15 +918,52 @@ void __i915_gpu_state_free(struct kref *error_ref)
kfree(error);
}
+#ifdef CONFIG_DRM_I915_DEBUG_GEM
+
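+ /* Trailing dword igt writes after MI_BATCH_BUFFER_END in a batch meant to hang */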
+#define HANG_SIGNATURE_MAGIC 0xc5c5c5c5
+
+static bool batch_has_hang_signature(const u32 * const batch, u64 len)
+{
+ struct marker { u32 val; u64 offset; } m[2] = {
+ { MI_BATCH_BUFFER_END, -1 },
+ { HANG_SIGNATURE_MAGIC, -1 }
+ };
+ u64 i, j;
+
+ len = min_t(u64, len, PAGE_SIZE);
+ len = DIV_ROUND_UP(len, 4);
+
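+ /*
+ * Look for MI_BATCH_BUFFER_END followed two dwords later by the hang
+ * signature; the dword in between (MI_NOOP in igt batches) is not
+ * inspected.
+ */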
+ for (i = 0, j = 0; i < len && j < ARRAY_SIZE(m); i++) {
+ if (m[j].offset == -1 && batch[i] == m[j].val)
+ m[j++].offset = i;
+
+ if (m[0].offset != -1 && i - m[0].offset > 2)
+ break;
+ }
+
+ if (m[0].offset == -1 || m[1].offset == -1)
+ return false;
+
+ return m[1].offset - m[0].offset == 2;
+}
+#else
+static bool batch_has_hang_signature(const u32 * const batch, u64 len)
+{
+ return false;
+}
+#endif
+
static struct drm_i915_error_object *
-i915_error_object_create(struct drm_i915_private *i915,
- struct i915_vma *vma)
+__i915_error_object_create(struct drm_i915_private *i915,
+ struct i915_vma *vma,
+ bool *hangsig)
{
struct i915_ggtt *ggtt = &i915->ggtt;
const u64 slot = ggtt->error_capture.start;
struct drm_i915_error_object *dst;
struct compress compress;
- unsigned long num_pages;
+ unsigned long num_pages, src_pages;
+ unsigned long page_count = 0;
struct sgt_iter iter;
dma_addr_t dma;
@@ -934,6 +971,7 @@ i915_error_object_create(struct drm_i915_private *i915,
return NULL;
num_pages = min_t(u64, vma->size, vma->obj->base.size) >> PAGE_SHIFT;
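+ /* Remember the source page count so we can identify the batch's last page */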
+ src_pages = num_pages;
num_pages = DIV_ROUND_UP(10 * num_pages, 8); /* worstcase zlib growth */
dst = kmalloc(sizeof(*dst) + num_pages * sizeof(u32 *),
GFP_ATOMIC | __GFP_NOWARN);
@@ -958,9 +996,16 @@ i915_error_object_create(struct drm_i915_private *i915,
I915_CACHE_NONE, 0);
s = io_mapping_map_atomic_wc(&ggtt->mappable, slot);
+
+ /* Check last page only for intentional hang signature */
+ if (hangsig && page_count == src_pages - 1)
+ *hangsig = batch_has_hang_signature((void __force *)s,
+ vma->size);
+
ret = compress_page(&compress, (void __force *)s, dst);
io_mapping_unmap_atomic(s);
+ page_count++;
if (ret)
goto unwind;
}
@@ -978,6 +1023,21 @@ i915_error_object_create(struct drm_i915_private *i915,
return dst;
}
+static struct drm_i915_error_object *
+i915_error_object_create(struct drm_i915_private *i915,
+ struct i915_vma *vma)
+{
+ return __i915_error_object_create(i915, vma, NULL);
+}
+
+static struct drm_i915_error_object *
+i915_error_batch_object_create(struct drm_i915_private *i915,
+ struct i915_vma *vma,
+ bool *signature)
+{
+ return __i915_error_object_create(i915, vma, signature);
+}
+
/* The error capture is special as it tries to run underneath the normal
* locking rules - so we use the raw version of the i915_gem_active lookup.
*/
@@ -1484,8 +1544,9 @@ static void i915_gem_record_rings(struct drm_i915_private *dev_priv,
* by userspace.
*/
ee->batchbuffer =
- i915_error_object_create(dev_priv,
- request->batch);
+ i915_error_batch_object_create(dev_priv,
+ request->batch,
+ &ee->batch_had_hang_signature);
if (HAS_BROKEN_CS_TLB(dev_priv))
ee->wa_batchbuffer =
@@ -1701,7 +1762,8 @@ static void i915_capture_reg_state(struct drm_i915_private *dev_priv,
static void i915_error_capture_msg(struct drm_i915_private *dev_priv,
struct i915_gpu_state *error,
u32 engine_mask,
- const char *error_msg)
+ const char *error_msg,
+ bool simulated)
{
u32 ecode;
int engine_id = -1, len;
@@ -1709,7 +1771,8 @@ static void i915_error_capture_msg(struct drm_i915_private *dev_priv,
ecode = i915_error_generate_code(dev_priv, error, &engine_id);
len = scnprintf(error->error_msg, sizeof(error->error_msg),
- "GPU HANG: ecode %d:%d:0x%08x",
+ "GPU HANG%s: ecode %d:%d:0x%08x",
+ simulated ? " (simulated)" : "",
INTEL_GEN(dev_priv), engine_id, ecode);
if (engine_id != -1 && error->engine[engine_id].context.pid)
@@ -1803,6 +1866,18 @@ i915_capture_gpu_state(struct drm_i915_private *i915)
return error;
}
+static bool error_state_has_hang_signature(const struct i915_gpu_state *error)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
+ if (error->engine[i].batch_had_hang_signature)
+ return true;
+ }
+
+ return false;
+}
+
/**
* i915_capture_error_state - capture an error record for later analysis
- * @dev: drm device
+ * @dev_priv: i915 device private
+ * @intentional: error was intentionally injected
@@ -1813,12 +1888,14 @@ i915_capture_gpu_state(struct drm_i915_private *i915)
* to pick up.
*/
void i915_capture_error_state(struct drm_i915_private *dev_priv,
+ bool intentional,
u32 engine_mask,
const char *error_msg)
{
static bool warned;
struct i915_gpu_state *error;
unsigned long flags;
+ bool expected;
if (!i915_modparams.error_capture)
return;
@@ -1832,8 +1909,13 @@ void i915_capture_error_state(struct drm_i915_private *dev_priv,
return;
}
- i915_error_capture_msg(dev_priv, error, engine_mask, error_msg);
- DRM_INFO("%s\n", error->error_msg);
+ expected = intentional || error_state_has_hang_signature(error);
+ i915_error_capture_msg(dev_priv, error, engine_mask, error_msg, expected);
+
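+ /* Expected (igt-induced) hangs get an info message, real hangs a warning */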
+ if (expected)
+ DRM_INFO("%s\n", error->error_msg);
+ else
+ DRM_WARN("%s\n", error->error_msg);
if (!error->simulated) {
spin_lock_irqsave(&dev_priv->gpu_error.lock, flags);
@@ -2828,6 +2828,7 @@ static void i915_clear_error_registers(struct drm_i915_private *dev_priv)
/**
* i915_handle_error - handle a gpu error
* @dev_priv: i915 device private
+ * @intentional: error was intentionally injected
* @engine_mask: mask representing engines that are hung
* @fmt: Error message format string
*
@@ -2838,6 +2839,7 @@ static void i915_clear_error_registers(struct drm_i915_private *dev_priv)
* of a ring dump etc.).
*/
void i915_handle_error(struct drm_i915_private *dev_priv,
+ bool intentional,
u32 engine_mask,
const char *fmt, ...)
{
@@ -2859,7 +2861,7 @@ void i915_handle_error(struct drm_i915_private *dev_priv,
*/
intel_runtime_pm_get(dev_priv);
- i915_capture_error_state(dev_priv, engine_mask, error_msg);
+ i915_capture_error_state(dev_priv, intentional, engine_mask, error_msg);
i915_clear_error_registers(dev_priv);
/*
@@ -266,7 +266,7 @@ engine_stuck(struct intel_engine_cs *engine, u64 acthd)
*/
tmp = I915_READ_CTL(engine);
if (tmp & RING_WAIT) {
- i915_handle_error(dev_priv, 0,
+ i915_handle_error(dev_priv, false, 0,
"Kicking stuck wait on %s",
engine->name);
I915_WRITE_CTL(engine, tmp);
@@ -278,7 +278,7 @@ engine_stuck(struct intel_engine_cs *engine, u64 acthd)
default:
return ENGINE_DEAD;
case 1:
- i915_handle_error(dev_priv, 0,
+ i915_handle_error(dev_priv, false, 0,
"Kicking stuck semaphore on %s",
engine->name);
I915_WRITE_CTL(engine, tmp);
@@ -407,7 +407,7 @@ static void hangcheck_declare_hang(struct drm_i915_private *i915,
"%s, ", engine->name);
msg[len-2] = '\0';
- return i915_handle_error(i915, hung, "%s", msg);
+ return i915_handle_error(i915, false, hung, "%s", msg);
}
/*
@@ -846,7 +846,7 @@ static int igt_handle_error(void *arg)
engine->hangcheck.stalled = true;
engine->hangcheck.seqno = intel_engine_get_seqno(engine);
- i915_handle_error(i915, intel_engine_flag(engine), "%s", __func__);
+ i915_handle_error(i915, true, intel_engine_flag(engine), "%s", __func__);
xchg(&i915->gpu_error.first_error, error);

We have a problem distinguishing intended hangs submitted by igt
during CI/BAT from unintended hangs that happen in close proximity.
As we know how igt constructs a batch intended to hang the GPU, we
can use this to our advantage when the error state is constructed:
the signature of an intended hang can be parsed out at this stage.
Batches that are expected to hang are watermarked with:

	batch[i++] = MI_BATCH_BUFFER_END;
	batch[i++] = MI_NOOP;
	batch[i++] = IGT_HANG_SIGNATURE; /* 0xc5c5c5c5 */

Note that we do the parsing only with gem debug builds.

Later, when we emit the notification about the hang into dmesg, we
use DRM_INFO for intended hangs and DRM_WARN for unintended ones.
This way auxiliary tools can make a better judgement on what to
consider important for reporting.

Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Signed-off-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
---
 drivers/gpu/drm/i915/i915_debugfs.c              |   2 +-
 drivers/gpu/drm/i915/i915_drv.h                  |   6 +-
 drivers/gpu/drm/i915/i915_gpu_error.c            | 100 +++++++++++++++++++++--
 drivers/gpu/drm/i915/i915_irq.c                  |   4 +-
 drivers/gpu/drm/i915/intel_hangcheck.c           |   6 +-
 drivers/gpu/drm/i915/selftests/intel_hangcheck.c |   2 +-
 6 files changed, 104 insertions(+), 16 deletions(-)
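For reference, a minimal sketch of the userspace side, assuming a
hypothetical emit_hang_signature() helper: only the three-dword tail
layout and the 0xc5c5c5c5 value come from this patch; the MI_*
encodings follow the i915 command streamer definitions. Since the
kernel inspects only the batch's final page, the watermark must land
there.

	#include <stdint.h>

	#define MI_NOOP			0x00000000
	#define MI_BATCH_BUFFER_END	(0x0a << 23)
	#define IGT_HANG_SIGNATURE	0xc5c5c5c5

	/* Hypothetical helper: terminate a batch that is expected to
	 * hang with the watermark batch_has_hang_signature() scans for. */
	static uint32_t emit_hang_signature(uint32_t *batch, uint32_t i)
	{
		batch[i++] = MI_BATCH_BUFFER_END;
		batch[i++] = MI_NOOP;
		batch[i++] = IGT_HANG_SIGNATURE;

		return i; /* next free dword offset */
	}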
We have a problem of distinguishing intended hangs submitted by igt during CI/bat and hangs that are nonintended happening in close proximity. As we know how igt constructs a batch intended to hang the gpu, we can use this in our advantage when error state is constructed. The signature of a intended hang can be parsed out in this stage. Batches that are expected to hang can be watermarked by: batch[i++] = MI_BATCH_BUFFER_END; batch[i++] = MI_NOOP; batch[i++] = IGT_HANG_SIGNATURE; /* 0xc5c5c5c5 */ Note that we do the parsing only with gem debug builds. Later when we spew out notification about the hang into the dmesg, we can use DRM_INFO for intended hangs and DRM_WARN for nonintended. This way auxiliary tools can make better judgement on what to consider important for reporting. Cc: Chris Wilson <chris@chris-wilson.co.uk> Cc: Daniel Vetter <daniel.vetter@ffwll.ch> Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com> Signed-off-by: Mika Kuoppala <mika.kuoppala@linux.intel.com> --- drivers/gpu/drm/i915/i915_debugfs.c | 2 +- drivers/gpu/drm/i915/i915_drv.h | 6 +- drivers/gpu/drm/i915/i915_gpu_error.c | 100 +++++++++++++++++++++-- drivers/gpu/drm/i915/i915_irq.c | 4 +- drivers/gpu/drm/i915/intel_hangcheck.c | 6 +- drivers/gpu/drm/i915/selftests/intel_hangcheck.c | 2 +- 6 files changed, 104 insertions(+), 16 deletions(-)