@@ -815,6 +815,8 @@ int intel_gpu_reset(struct drm_device *dev)
int i915_reset(struct drm_device *dev)
{
drm_i915_private_t *dev_priv = dev->dev_private;
+ struct ctx_reset_state *gstate;
+ bool do_wedge = true;
int ret;
if (!i915_try_reset)
@@ -822,10 +824,29 @@ int i915_reset(struct drm_device *dev)
mutex_lock(&dev->struct_mutex);
+ /* i915_gem_reset() will set this if it finds a guilty context */
+ dev_priv->gpu_error.guilty_state = NULL;
+
i915_gem_reset(dev);
+ gstate = dev_priv->gpu_error.guilty_state;
+
+ if (gstate) {
+ if (gstate->guilty == 1) {
+ do_wedge = false;
+ } else if (!gstate->banned &&
+ get_seconds() - gstate->last_guilty_reset < 5) {
+ gstate->banned = true;
+ do_wedge = false;
+ }
+
+ gstate->last_guilty_reset = get_seconds();
+ }
+
+ dev_priv->gpu_error.guilty_state = NULL;
+
ret = -ENODEV;
- if (get_seconds() - dev_priv->gpu_error.last_reset < 5)
+ if (do_wedge && get_seconds() - dev_priv->gpu_error.last_reset < 5)
DRM_ERROR("GPU hanging too fast, declaring wedged!\n");
else
ret = intel_gpu_reset(dev);
@@ -437,6 +437,10 @@ struct ctx_reset_state {
u32 total;
u32 innocent;
u32 guilty;
+ unsigned long last_guilty_reset;
+
+ /* banned from submitting more work */
+ bool banned;
};
/* This must match up with the value previously used for execbuf2.rsvd1. */
@@ -810,6 +814,7 @@ struct i915_gpu_error {
struct work_struct work;
unsigned long last_reset;
+ struct ctx_reset_state *guilty_state;
/**
* State variable and reset counter controlling the reset flow
@@ -2143,6 +2143,7 @@ static void i915_set_reset_status(struct intel_ring_buffer *ring,
struct drm_i915_gem_request *request,
u32 acthd)
{
+ drm_i915_private_t *dev_priv = ring->dev->dev_private;
bool inside;
struct ctx_reset_state *rs = NULL;
bool guilty;
@@ -2174,10 +2175,13 @@ static void i915_set_reset_status(struct intel_ring_buffer *ring,
if (rs) {
rs->total++;
- if (guilty)
+ if (guilty) {
rs->guilty++;
- else
+
+ dev_priv->gpu_error.guilty_state = rs;
+ } else {
rs->innocent++;
+ }
}
}
@@ -837,6 +837,7 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
struct drm_clip_rect *cliprects = NULL;
struct intel_ring_buffer *ring;
struct i915_hw_context *ctx;
+ struct ctx_reset_state *rs;
u32 ctx_id = i915_execbuffer2_get_context_id(*args);
u32 exec_start, exec_len;
u32 mask, flags;
@@ -1020,6 +1021,17 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
if (ret)
goto err;
+
+ ret = i915_gem_context_get_reset_state(&dev_priv->ring[RCS],
+ file, ctx_id, &rs);
+ if (ret)
+ goto err;
+
+ if (rs->banned) {
+ ret = -EIO;
+ goto err;
+ }
+
ret = i915_switch_context(ring, file, ctx_id, &ctx);
if (ret)
goto err;
If a context has recently submitted a faulty batchbuffer that was found guilty of a GPU hang and keeps submitting more faulty work, ban it permanently. Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com> --- drivers/gpu/drm/i915/i915_drv.c | 23 ++++++++++++++++++++++- drivers/gpu/drm/i915/i915_drv.h | 5 +++++ drivers/gpu/drm/i915/i915_gem.c | 8 ++++++-- drivers/gpu/drm/i915/i915_gem_execbuffer.c | 12 ++++++++++++ 4 files changed, 45 insertions(+), 3 deletions(-)