diff mbox

[v3,14/16] drm/i915: refuse to submit more batchbuffers from guilty context

Message ID 1365089568-20457-15-git-send-email-mika.kuoppala@intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

Mika Kuoppala April 4, 2013, 3:32 p.m. UTC
If context has recently submitted a faulty batchbuffers guilty of
gpu hang and decides to keep submitting more crap, ban it permanently.

Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
---
 drivers/gpu/drm/i915/i915_drv.c            |   23 ++++++++++++++++++++++-
 drivers/gpu/drm/i915/i915_drv.h            |    7 +++++++
 drivers/gpu/drm/i915/i915_gem.c            |    7 +++++--
 drivers/gpu/drm/i915/i915_gem_execbuffer.c |   13 +++++++++++++
 4 files changed, 47 insertions(+), 3 deletions(-)

Comments

Mika Kuoppala April 11, 2013, 3:13 p.m. UTC | #1
Mika Kuoppala <mika.kuoppala@linux.intel.com> writes:

> If context has recently submitted a faulty batchbuffers guilty of
> gpu hang and decides to keep submitting more crap, ban it permanently.
>
> Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_drv.c            |   23 ++++++++++++++++++++++-
>  drivers/gpu/drm/i915/i915_drv.h            |    7 +++++++
>  drivers/gpu/drm/i915/i915_gem.c            |    7 +++++--
>  drivers/gpu/drm/i915/i915_gem_execbuffer.c |   13 +++++++++++++
>  4 files changed, 47 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
> index a5b8aa9..0928f11 100644
> --- a/drivers/gpu/drm/i915/i915_drv.c
> +++ b/drivers/gpu/drm/i915/i915_drv.c
> @@ -852,6 +852,8 @@ int intel_gpu_reset(struct drm_device *dev)
>  int i915_reset(struct drm_device *dev)
>  {
>  	drm_i915_private_t *dev_priv = dev->dev_private;
> +	struct i915_ctx_hang_stats *hs;
> +	bool do_wedge = true;
>  	int ret;
>  
>  	if (!i915_try_reset)
> @@ -859,10 +861,29 @@ int i915_reset(struct drm_device *dev)
>  
>  	mutex_lock(&dev->struct_mutex);
>  
> +	/* i915_gem_reset will set this if it finds guilty context */
> +	dev_priv->gpu_error.hang_stats = NULL;
> +
>  	i915_gem_reset(dev);
>  
> +	hs = dev_priv->gpu_error.hang_stats;
> +
> +	if (hs) {
> +		if (hs->batch_active == 1) {
> +			do_wedge = false;
> +		} else if (!hs->banned &&
> +			   get_seconds() - hs->batch_active_reset_ts < 5) {
> +			hs->banned = true;
> +			do_wedge = false;
> +		}
> +
> +		hs->batch_active_reset_ts = get_seconds();
> +	}
> +
> +	dev_priv->gpu_error.hang_stats = NULL;
> +
>  	ret = -ENODEV;
> -	if (get_seconds() - dev_priv->gpu_error.last_reset < 5)
> +	if (do_wedge && get_seconds() - dev_priv->gpu_error.last_reset < 5)
>  		DRM_ERROR("GPU hanging too fast, declaring wedged!\n");
>  	else
>  		ret = intel_gpu_reset(dev);
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 8223908..30ba79c 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -458,6 +458,12 @@ struct i915_ctx_hang_stats {
>  
>  	/* This context had batch active when hang was declared */
>  	unsigned batch_active;
> +
> +	/* Time when this context was last blamed for a GPU reset */
> +	unsigned long batch_active_reset_ts;
> +
> +	/* This context is banned to submit more work */
> +	bool banned;
>  };
>  
>  /* This must match up with the value previously used for execbuf2.rsvd1. */
> @@ -831,6 +837,7 @@ struct i915_gpu_error {
>  	struct work_struct work;
>  
>  	unsigned long last_reset;
> +	struct i915_ctx_hang_stats *hang_stats;
>  
>  	/**
>  	 * State variable and reset counter controlling the reset flow
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 475b6ad..ca5c9c3 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -2147,6 +2147,7 @@ static void i915_set_reset_status(struct intel_ring_buffer *ring,
>  				  struct drm_i915_gem_request *request,
>  				  u32 acthd)
>  {
> +	struct drm_i915_private *dev_priv = ring->dev->dev_private;
>  	struct i915_ctx_hang_stats *hs = NULL;
>  	bool inside, guilty;
>  
> @@ -2175,10 +2176,12 @@ static void i915_set_reset_status(struct intel_ring_buffer *ring,
>  		hs = &request->file_priv->hang_stats;
>  
>  	if (hs) {
> -		if (guilty)
> +		if (guilty) {
>  			hs->batch_active++;
> -		else
> +			dev_priv->gpu_error.hang_stats = hs;

This is wrong. The context can be destroyed shortly after,
before reset is handled making this a dangling pointer.


> +		} else {
>  			hs->batch_pending++;
> +		}
>  	}
>  }
>  
> diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> index bd1750a..f1b1ea9 100644
> --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> @@ -844,6 +844,7 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
>  	struct drm_clip_rect *cliprects = NULL;
>  	struct intel_ring_buffer *ring;
>  	struct i915_hw_context *ctx;
> +	struct i915_ctx_hang_stats *hs;
>  	u32 ctx_id = i915_execbuffer2_get_context_id(*args);
>  	u32 exec_start, exec_len;
>  	u32 mask, flags;
> @@ -1026,6 +1027,18 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
>  	if (ret)
>  		goto err;
>  
> +	hs = i915_gem_context_get_hang_stats(&dev_priv->ring[RCS],
> +					     file, ctx_id);
> +	if (IS_ERR(hs)) {
> +		ret = PTR_ERR(hs);
> +		goto err;
> +	}
> +
> +	if (hs->banned) {
> +		ret = -EIO;
> +		goto err;
> +	}
> +
>  	ctx = i915_switch_context(ring, file, ctx_id);
>  	if (IS_ERR(ctx)) {
>  		ret = PTR_ERR(ctx);
> -- 
> 1.7.9.5
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index a5b8aa9..0928f11 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -852,6 +852,8 @@  int intel_gpu_reset(struct drm_device *dev)
 int i915_reset(struct drm_device *dev)
 {
 	drm_i915_private_t *dev_priv = dev->dev_private;
+	struct i915_ctx_hang_stats *hs;
+	bool do_wedge = true;
 	int ret;
 
 	if (!i915_try_reset)
@@ -859,10 +861,29 @@  int i915_reset(struct drm_device *dev)
 
 	mutex_lock(&dev->struct_mutex);
 
+	/* i915_gem_reset will set this if it finds guilty context */
+	dev_priv->gpu_error.hang_stats = NULL;
+
 	i915_gem_reset(dev);
 
+	hs = dev_priv->gpu_error.hang_stats;
+
+	if (hs) {
+		if (hs->batch_active == 1) {
+			do_wedge = false;
+		} else if (!hs->banned &&
+			   get_seconds() - hs->batch_active_reset_ts < 5) {
+			hs->banned = true;
+			do_wedge = false;
+		}
+
+		hs->batch_active_reset_ts = get_seconds();
+	}
+
+	dev_priv->gpu_error.hang_stats = NULL;
+
 	ret = -ENODEV;
-	if (get_seconds() - dev_priv->gpu_error.last_reset < 5)
+	if (do_wedge && get_seconds() - dev_priv->gpu_error.last_reset < 5)
 		DRM_ERROR("GPU hanging too fast, declaring wedged!\n");
 	else
 		ret = intel_gpu_reset(dev);
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 8223908..30ba79c 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -458,6 +458,12 @@  struct i915_ctx_hang_stats {
 
 	/* This context had batch active when hang was declared */
 	unsigned batch_active;
+
+	/* Time when this context was last blamed for a GPU reset */
+	unsigned long batch_active_reset_ts;
+
+	/* This context is banned to submit more work */
+	bool banned;
 };
 
 /* This must match up with the value previously used for execbuf2.rsvd1. */
@@ -831,6 +837,7 @@  struct i915_gpu_error {
 	struct work_struct work;
 
 	unsigned long last_reset;
+	struct i915_ctx_hang_stats *hang_stats;
 
 	/**
 	 * State variable and reset counter controlling the reset flow
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 475b6ad..ca5c9c3 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2147,6 +2147,7 @@  static void i915_set_reset_status(struct intel_ring_buffer *ring,
 				  struct drm_i915_gem_request *request,
 				  u32 acthd)
 {
+	struct drm_i915_private *dev_priv = ring->dev->dev_private;
 	struct i915_ctx_hang_stats *hs = NULL;
 	bool inside, guilty;
 
@@ -2175,10 +2176,12 @@  static void i915_set_reset_status(struct intel_ring_buffer *ring,
 		hs = &request->file_priv->hang_stats;
 
 	if (hs) {
-		if (guilty)
+		if (guilty) {
 			hs->batch_active++;
-		else
+			dev_priv->gpu_error.hang_stats = hs;
+		} else {
 			hs->batch_pending++;
+		}
 	}
 }
 
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index bd1750a..f1b1ea9 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -844,6 +844,7 @@  i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 	struct drm_clip_rect *cliprects = NULL;
 	struct intel_ring_buffer *ring;
 	struct i915_hw_context *ctx;
+	struct i915_ctx_hang_stats *hs;
 	u32 ctx_id = i915_execbuffer2_get_context_id(*args);
 	u32 exec_start, exec_len;
 	u32 mask, flags;
@@ -1026,6 +1027,18 @@  i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 	if (ret)
 		goto err;
 
+	hs = i915_gem_context_get_hang_stats(&dev_priv->ring[RCS],
+					     file, ctx_id);
+	if (IS_ERR(hs)) {
+		ret = PTR_ERR(hs);
+		goto err;
+	}
+
+	if (hs->banned) {
+		ret = -EIO;
+		goto err;
+	}
+
 	ctx = i915_switch_context(ring, file, ctx_id);
 	if (IS_ERR(ctx)) {
 		ret = PTR_ERR(ctx);