@@ -885,10 +885,14 @@ int i915_reset(struct drm_device *dev)
mutex_lock(&dev->struct_mutex);
+ /* Cleared here; i915_gem_reset() sets this if a guilty context gets banned */
+ dev_priv->gpu_error.ctx_banned = false;
+
i915_gem_reset(dev);
ret = -ENODEV;
- if (get_seconds() - dev_priv->gpu_error.last_reset < 5)
+ if (!dev_priv->gpu_error.ctx_banned &&
+ get_seconds() - dev_priv->gpu_error.last_reset < 5)
DRM_ERROR("GPU hanging too fast, declaring wedged!\n");
else
ret = intel_gpu_reset(dev);
@@ -459,6 +459,12 @@ struct i915_ctx_hang_stats {
/* This context had batch active when hang was declared */
unsigned batch_active;
+
+ /* Time when this context was last blamed for a GPU reset */
+ unsigned long batch_active_reset_ts;
+
+ /* This context is banned from submitting more work */
+ bool banned;
};
/* This must match up with the value previously used for execbuf2.rsvd1. */
@@ -835,6 +841,9 @@ struct i915_gpu_error {
unsigned long last_reset;
+ /* During reset handling, guilty context found and banned */
+ bool ctx_banned;
+
/**
* State variable and reset counter controlling the reset flow
*
@@ -2147,6 +2147,7 @@ static void i915_set_reset_status(struct intel_ring_buffer *ring,
struct drm_i915_gem_request *request,
u32 acthd)
{
+ struct drm_i915_private *dev_priv = ring->dev->dev_private;
struct i915_ctx_hang_stats *hs = NULL;
bool inside, guilty;
@@ -2175,10 +2176,19 @@ static void i915_set_reset_status(struct intel_ring_buffer *ring,
hs = &request->file_priv->hang_stats;
if (hs) {
- if (guilty)
+ if (guilty) {
+ if (!hs->banned &&
+ get_seconds() - hs->batch_active_reset_ts < 5) {
+ hs->banned = true;
+ DRM_ERROR("context hanging too fast, "
+ "declaring banned\n");
+ dev_priv->gpu_error.ctx_banned = true;
+ }
hs->batch_active++;
- else
+ hs->batch_active_reset_ts = get_seconds();
+ } else {
hs->batch_pending++;
+ }
}
}
@@ -844,6 +844,7 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
struct drm_clip_rect *cliprects = NULL;
struct intel_ring_buffer *ring;
struct i915_hw_context *ctx;
+ struct i915_ctx_hang_stats *hs;
u32 ctx_id = i915_execbuffer2_get_context_id(*args);
u32 exec_start, exec_len;
u32 mask, flags;
@@ -1026,6 +1027,18 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
if (ret)
goto err;
+ hs = i915_gem_context_get_hang_stats(&dev_priv->ring[RCS],
+ file, ctx_id);
+ if (IS_ERR(hs)) {
+ ret = PTR_ERR(hs);
+ goto err;
+ }
+
+ if (hs->banned) {
+ ret = -EIO;
+ goto err;
+ }
+
ctx = i915_switch_context(ring, file, ctx_id);
if (IS_ERR(ctx)) {
ret = PTR_ERR(ctx);
If a context has recently submitted faulty batchbuffers guilty of a GPU hang and decides to keep submitting more crap, ban it permanently. v2: Store the guilty ban status bool in gpu_error instead of pointers that might become dangling before the hang is declared. Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com> --- drivers/gpu/drm/i915/i915_drv.c | 6 +++++- drivers/gpu/drm/i915/i915_drv.h | 9 +++++++++ drivers/gpu/drm/i915/i915_gem.c | 14 ++++++++++++-- drivers/gpu/drm/i915/i915_gem_execbuffer.c | 13 +++++++++++++ 4 files changed, 39 insertions(+), 3 deletions(-)