@@ -804,6 +804,7 @@ int i915_reset(struct drm_device *dev)
{
drm_i915_private_t *dev_priv = dev->dev_private;
bool simulated;
+ bool ctx_banned;
int ret;
if (!i915_try_reset)
@@ -811,11 +812,12 @@ int i915_reset(struct drm_device *dev)
mutex_lock(&dev->struct_mutex);
- i915_gem_reset(dev);
+ ctx_banned = i915_gem_reset(dev);
simulated = dev_priv->gpu_error.stop_rings != 0;
- if (!simulated && get_seconds() - dev_priv->gpu_error.last_reset <
+ if (!(simulated || ctx_banned) &&
+ get_seconds() - dev_priv->gpu_error.last_reset <
DRM_I915_WEDGE_PERIOD) {
DRM_ERROR("GPU hanging too fast, declaring wedged!\n");
ret = -ENODEV;
@@ -575,6 +575,12 @@ struct i915_ctx_hang_stats {
/* This context had batch active when hang was declared */
unsigned batch_active;
+
+ /* Time when this context was last blamed for a GPU reset */
+ unsigned long batch_active_reset_ts;
+
+ /* This context is banned from submitting more work */
+ bool banned;
};
/* This must match up with the value previously used for execbuf2.rsvd1. */
@@ -978,6 +984,8 @@ struct i915_gpu_error {
#define DRM_I915_HANGCHECK_JIFFIES msecs_to_jiffies(DRM_I915_HANGCHECK_PERIOD)
/* Hangcheck needs >2 periods to declare a hang */
#define DRM_I915_WEDGE_PERIOD DIV_ROUND_UP(5*DRM_I915_HANGCHECK_PERIOD, 1000)
+ /* Hang gpu twice in this window and your context gets banned */
+#define DRM_I915_CONTEXT_BAN_PERIOD (2 * DRM_I915_WEDGE_PERIOD)
struct timer_list hangcheck_timer;
@@ -1922,7 +1930,7 @@ static inline bool i915_terminally_wedged(struct i915_gpu_error *error)
return atomic_read(&error->reset_counter) == I915_WEDGED;
}
-void i915_gem_reset(struct drm_device *dev);
+bool i915_gem_reset(struct drm_device *dev);
bool i915_gem_clflush_object(struct drm_i915_gem_object *obj, bool force);
int __must_check i915_gem_object_finish_gpu(struct drm_i915_gem_object *obj);
int __must_check i915_gem_init(struct drm_device *dev);
@@ -2201,16 +2201,16 @@ static bool i915_request_guilty(struct drm_i915_gem_request *request,
return false;
}
-static void i915_set_reset_status(struct intel_ring_buffer *ring,
+static bool i915_set_reset_status(struct intel_ring_buffer *ring,
struct drm_i915_gem_request *request,
u32 acthd)
{
struct i915_ctx_hang_stats *hs = NULL;
- bool inside, guilty;
+ bool inside, guilty, banned;
unsigned long offset = 0;
/* Innocent until proven guilty */
- guilty = false;
+ guilty = banned = false;
if (request->batch_obj)
offset = i915_gem_obj_offset(request->batch_obj,
@@ -2237,11 +2237,21 @@ static void i915_set_reset_status(struct intel_ring_buffer *ring,
hs = &request->file_priv->hang_stats;
if (hs) {
- if (guilty)
+ if (guilty) {
+ if (!hs->banned &&
+ get_seconds() - hs->batch_active_reset_ts <
+ DRM_I915_CONTEXT_BAN_PERIOD) {
+ hs->banned = banned = true;
+ DRM_ERROR("context hanging too fast, declaring banned\n");
+ }
hs->batch_active++;
- else
+ hs->batch_active_reset_ts = get_seconds();
+ } else {
hs->batch_pending++;
+ }
}
+
+ return banned;
}
static void i915_gem_free_request(struct drm_i915_gem_request *request)
@@ -2255,11 +2265,12 @@ static void i915_gem_free_request(struct drm_i915_gem_request *request)
kfree(request);
}
-static void i915_gem_reset_ring_lists(struct drm_i915_private *dev_priv,
+static bool i915_gem_reset_ring_lists(struct drm_i915_private *dev_priv,
struct intel_ring_buffer *ring)
{
u32 completed_seqno;
u32 acthd;
+ bool ctx_banned = false;
acthd = intel_ring_get_active_head(ring);
completed_seqno = ring->get_seqno(ring, false);
@@ -2272,7 +2283,8 @@ static void i915_gem_reset_ring_lists(struct drm_i915_private *dev_priv,
list);
if (request->seqno > completed_seqno)
- i915_set_reset_status(ring, request, acthd);
+ ctx_banned |= i915_set_reset_status(ring,
+ request, acthd);
i915_gem_free_request(request);
}
@@ -2286,6 +2298,8 @@ static void i915_gem_reset_ring_lists(struct drm_i915_private *dev_priv,
i915_gem_object_move_to_inactive(obj);
}
+
+ return ctx_banned;
}
void i915_gem_restore_fences(struct drm_device *dev)
@@ -2309,16 +2323,19 @@ void i915_gem_restore_fences(struct drm_device *dev)
}
}
-void i915_gem_reset(struct drm_device *dev)
+bool i915_gem_reset(struct drm_device *dev)
{
struct drm_i915_private *dev_priv = dev->dev_private;
struct intel_ring_buffer *ring;
int i;
+ bool ctx_banned = false;
for_each_ring(ring, dev_priv, i)
- i915_gem_reset_ring_lists(dev_priv, ring);
+ ctx_banned |= i915_gem_reset_ring_lists(dev_priv, ring);
i915_gem_restore_fences(dev);
+
+ return ctx_banned;
}
/**
@@ -886,6 +886,7 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
struct drm_i915_gem_object *batch_obj;
struct drm_clip_rect *cliprects = NULL;
struct intel_ring_buffer *ring;
+ struct i915_ctx_hang_stats *hs;
u32 ctx_id = i915_execbuffer2_get_context_id(*args);
u32 exec_start, exec_len;
u32 mask, flags;
@@ -1077,6 +1078,17 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
if (ret)
goto err;
+ hs = i915_gem_context_get_hang_stats(dev, file, ctx_id);
+ if (IS_ERR(hs)) {
+ ret = PTR_ERR(hs);
+ goto err;
+ }
+
+ if (hs->banned) {
+ ret = -EIO;
+ goto err;
+ }
+
ret = i915_switch_context(ring, file, ctx_id);
if (ret)
goto err;
Now that we have a mechanism in place to track which context was guilty of hanging the gpu, it is possible to punish bad behaviour. If a context has recently submitted faulty batchbuffers guilty of a gpu hang and then submits another batch which hangs the gpu within a 16 second time window, ban it permanently. Ban means that batchbuffers from this fd/context will not be accepted anymore for execution. v2: Store guilty ban status bool in gpu_error instead of pointers that might become dangling before hang is declared. v3: Use return value for banned status instead of stashing state into gpu_error (Chris Wilson) v4: - rebase on top of fixed hang stats api - add define for ban period - rename commit and improve commit msg Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com> --- drivers/gpu/drm/i915/i915_drv.c | 6 +++-- drivers/gpu/drm/i915/i915_drv.h | 10 +++++++- drivers/gpu/drm/i915/i915_gem.c | 35 +++++++++++++++++++++------- drivers/gpu/drm/i915/i915_gem_execbuffer.c | 12 ++++++++++ 4 files changed, 51 insertions(+), 12 deletions(-)