--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -883,16 +883,17 @@ int i915_reset(struct drm_device *dev)
{
drm_i915_private_t *dev_priv = dev->dev_private;
int ret;
+ bool ctx_banned;
if (!i915_try_reset)
return 0;
mutex_lock(&dev->struct_mutex);
- i915_gem_reset(dev);
+ ctx_banned = i915_gem_reset(dev);
ret = -ENODEV;
- if (get_seconds() - dev_priv->gpu_error.last_reset < 5)
+ if (!ctx_banned && get_seconds() - dev_priv->gpu_error.last_reset < 5)
DRM_ERROR("GPU hanging too fast, declaring wedged!\n");
else
ret = intel_gpu_reset(dev);
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -459,6 +459,12 @@ struct i915_ctx_hang_stats {
/* This context had batch active when hang was declared */
unsigned batch_active;
+
+ /* Time when this context was last blamed for a GPU reset */
+ unsigned long batch_active_reset_ts;
+
+ /* This context is banned from submitting more work */
+ bool banned;
};
/* This must match up with the value previously used for execbuf2.rsvd1. */
@@ -1662,7 +1668,7 @@ static inline bool i915_terminally_wedged(struct i915_gpu_error *error)
return atomic_read(&error->reset_counter) == I915_WEDGED;
}
-void i915_gem_reset(struct drm_device *dev);
+bool i915_gem_reset(struct drm_device *dev);
void i915_gem_clflush_object(struct drm_i915_gem_object *obj);
int __must_check i915_gem_object_set_domain(struct drm_i915_gem_object *obj,
uint32_t read_domains,
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2143,15 +2143,15 @@ static bool i915_request_guilty(struct drm_i915_gem_request *request,
return false;
}
-static void i915_set_reset_status(struct intel_ring_buffer *ring,
+static bool i915_set_reset_status(struct intel_ring_buffer *ring,
struct drm_i915_gem_request *request,
u32 acthd)
{
struct i915_ctx_hang_stats *hs = NULL;
- bool inside, guilty;
+ bool inside, guilty, banned;
/* Innocent until proven guilty */
- guilty = false;
+ guilty = banned = false;
if (!ring->hangcheck.was_waiting &&
i915_request_guilty(request, acthd, &inside)) {
@@ -2175,11 +2175,21 @@ static void i915_set_reset_status(struct intel_ring_buffer *ring,
hs = &request->file_priv->hang_stats;
if (hs) {
- if (guilty)
+ if (guilty) {
+ if (!hs->banned &&
+ get_seconds() - hs->batch_active_reset_ts < 5) {
+ hs->banned = banned = true;
+ DRM_ERROR("context hanging too fast, "
+ "declaring banned\n");
+ }
hs->batch_active++;
- else
+ hs->batch_active_reset_ts = get_seconds();
+ } else {
hs->batch_pending++;
+ }
}
+
+ return banned;
}
static void i915_gem_free_request(struct drm_i915_gem_request *request)
@@ -2193,11 +2203,12 @@ static void i915_gem_free_request(struct drm_i915_gem_request *request)
kfree(request);
}
-static void i915_gem_reset_ring_lists(struct drm_i915_private *dev_priv,
+static bool i915_gem_reset_ring_lists(struct drm_i915_private *dev_priv,
struct intel_ring_buffer *ring)
{
u32 completed_seqno;
u32 acthd;
+ bool ctx_banned = false;
acthd = intel_ring_get_active_head(ring);
completed_seqno = ring->get_seqno(ring, false);
@@ -2210,7 +2221,8 @@ static void i915_gem_reset_ring_lists(struct drm_i915_private *dev_priv,
list);
if (request->seqno > completed_seqno)
- i915_set_reset_status(ring, request, acthd);
+ ctx_banned |= i915_set_reset_status(ring,
+ request, acthd);
i915_gem_free_request(request);
}
@@ -2224,6 +2236,8 @@ static void i915_gem_reset_ring_lists(struct drm_i915_private *dev_priv,
i915_gem_object_move_to_inactive(obj);
}
+
+ return ctx_banned;
}
static void i915_gem_reset_fences(struct drm_device *dev)
@@ -2247,15 +2261,16 @@ static void i915_gem_reset_fences(struct drm_device *dev)
INIT_LIST_HEAD(&dev_priv->mm.fence_list);
}
-void i915_gem_reset(struct drm_device *dev)
+bool i915_gem_reset(struct drm_device *dev)
{
struct drm_i915_private *dev_priv = dev->dev_private;
struct drm_i915_gem_object *obj;
struct intel_ring_buffer *ring;
int i;
+ bool ctx_banned = false;
for_each_ring(ring, dev_priv, i)
- i915_gem_reset_ring_lists(dev_priv, ring);
+ ctx_banned |= i915_gem_reset_ring_lists(dev_priv, ring);
/* Move everything out of the GPU domains to ensure we do any
* necessary invalidation upon reuse.
@@ -2269,6 +2284,8 @@ void i915_gem_reset(struct drm_device *dev)
/* The fence registers are invalidated so clear them out */
i915_gem_reset_fences(dev);
+
+ return ctx_banned;
}
/**
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -844,6 +844,7 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
struct drm_clip_rect *cliprects = NULL;
struct intel_ring_buffer *ring;
struct i915_hw_context *ctx;
+ struct i915_ctx_hang_stats *hs;
u32 ctx_id = i915_execbuffer2_get_context_id(*args);
u32 exec_start, exec_len;
u32 mask, flags;
@@ -1026,6 +1027,18 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
if (ret)
goto err;
+ hs = i915_gem_context_get_hang_stats(&dev_priv->ring[RCS],
+ file, ctx_id);
+ if (IS_ERR(hs)) {
+ ret = PTR_ERR(hs);
+ goto err;
+ }
+
+ if (hs->banned) {
+ ret = -EIO;
+ goto err;
+ }
+
ctx = i915_switch_context(ring, file, ctx_id);
if (IS_ERR(ctx)) {
ret = PTR_ERR(ctx);
If a context has recently submitted a faulty batchbuffer found guilty of a
GPU hang and keeps submitting more bad work, ban it permanently.

v2: Store the guilty/banned status bool in gpu_error instead of pointers
    that might become dangling before the hang is declared.

v3: Use the return value for banned status instead of stashing state
    into gpu_error (Chris Wilson)

Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
---
 drivers/gpu/drm/i915/i915_drv.c            |  5 ++--
 drivers/gpu/drm/i915/i915_drv.h            |  8 ++++++-
 drivers/gpu/drm/i915/i915_gem.c            | 35 +++++++++++++++++++++-------
 drivers/gpu/drm/i915/i915_gem_execbuffer.c | 13 +++++++++++
 4 files changed, 49 insertions(+), 12 deletions(-)
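
For illustration, here is a self-contained C restatement of the ban heuristic
that i915_set_reset_status() implements above: a context found guilty of a
hang within five seconds of its previous guilty verdict is banned. The names
ctx_hang_stats, note_guilty_hang and the caller-supplied `now` are
hypothetical stand-ins for the driver's i915_ctx_hang_stats bookkeeping and
get_seconds(); only the 5-second window and the returned newly-banned flag
mirror the patch.

/*
 * Hypothetical stand-in for the i915_ctx_hang_stats bookkeeping in the
 * patch above; not driver code.
 */
#include <stdbool.h>

struct ctx_hang_stats {
	unsigned long batch_active_reset_ts;	/* last guilty verdict, in seconds */
	unsigned int batch_active;		/* guilty hangs attributed to this ctx */
	bool banned;				/* refuse further submissions */
};

/*
 * Record a guilty verdict; returns true if this verdict newly banned
 * the context.  As in the kernel, a zero timestamp is safely in the
 * distant past because `now` is epoch-based wall-clock seconds
 * (get_seconds() in the patch).
 */
static bool note_guilty_hang(struct ctx_hang_stats *hs, unsigned long now)
{
	bool newly_banned = false;

	if (!hs->banned && now - hs->batch_active_reset_ts < 5) {
		hs->banned = true;
		newly_banned = true;
	}
	hs->batch_active++;
	hs->batch_active_reset_ts = now;
	return newly_banned;
}

The returned flag, OR-ed across rings and requests exactly as the patch does
with ctx_banned, is what lets i915_reset() proceed with a reset that would
otherwise be declared "hanging too fast": since the offender can no longer
submit, a quick follow-up reset is worth attempting.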
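
From the other side of the ioctl, the execbuffer hunk means a banned
context's submissions now fail with EIO. A hedged userspace sketch, assuming
libdrm's xf86drm.h and the kernel's i915_drm.h uapi header; batch and
relocation setup is elided, `objs` is assumed to be a populated exec-object
array and `ctx_id` to come from a prior context-create ioctl, so this shows
the shape of the error path rather than a complete submission.

/*
 * Hedged userspace sketch (not part of the patch): an execbuffer2
 * ioctl naming a banned context's ID fails with EIO.
 */
#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <xf86drm.h>
#include <i915_drm.h>

static int submit_batch(int fd, uint32_t ctx_id,
			struct drm_i915_gem_exec_object2 *objs,
			uint32_t count)
{
	struct drm_i915_gem_execbuffer2 execbuf;

	memset(&execbuf, 0, sizeof(execbuf));
	execbuf.buffers_ptr = (uintptr_t)objs;	/* last entry is the batch */
	execbuf.buffer_count = count;
	execbuf.flags = I915_EXEC_RENDER;
	i915_execbuffer2_set_context_id(execbuf, ctx_id);

	if (drmIoctl(fd, DRM_IOCTL_I915_GEM_EXECBUFFER2, &execbuf)) {
		if (errno == EIO)
			fprintf(stderr, "ctx %u banned or GPU wedged\n",
				ctx_id);
		return -errno;
	}
	return 0;
}

Note that EIO was already the errno for a terminally wedged GPU, so to
userspace a banned context looks like one whose GPU is permanently gone and
existing "stop submitting on EIO" handling should cover it.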