diff mbox

drm/i915: Reset logical ring contexts' head and tail during GPU reset

Message ID 1424103173-17387-1-git-send-email-thomas.daniel@intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

Thomas Daniel Feb. 16, 2015, 4:12 p.m. UTC
Work was getting left behind in LRC contexts during reset.  This causes a hang
if the GPU is reset when HEAD==TAIL because the context's ringbuffer head and
tail don't get reset and retiring a request doesn't alter them, so the ring
still appears full.

Added a function intel_lr_context_reset() to reset head and tail on a LRC and
its ringbuffer.

Call intel_lr_context_reset() for each context in i915_gem_context_reset() when
in execlists mode.

Testcase: igt/pm_rps --run-subtest reset #bdw
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=88096
Signed-off-by: Thomas Daniel <thomas.daniel@intel.com>
---
 drivers/gpu/drm/i915/i915_gem_context.c |   12 +++++++----
 drivers/gpu/drm/i915/intel_lrc.c        |   34 +++++++++++++++++++++++++++++++
 drivers/gpu/drm/i915/intel_lrc.h        |    2 ++
 3 files changed, 44 insertions(+), 4 deletions(-)

Comments

Shuang He Feb. 17, 2015, 3:08 a.m. UTC | #1
Tested-By: PRC QA PRTS (Patch Regression Test System Contact: shuang.he@intel.com)
Task id: 5785
-------------------------------------Summary-------------------------------------
Platform          Delta          drm-intel-nightly          Series Applied
PNV                 -5              277/277              272/277
ILK                                  313/313              313/313
SNB                 -1              309/309              308/309
IVB                 -1              382/382              381/382
BYT                                  296/296              296/296
HSW                                  425/425              425/425
BDW                 -1              318/318              317/318
-------------------------------------Detailed-------------------------------------
Platform  Test                                drm-intel-nightly          Series Applied
*PNV  igt_gem_fence_thrash_bo-write-verify-none      NRUN(1)PASS(2)      FAIL(1)PASS(1)
*PNV  igt_gem_fence_thrash_bo-write-verify-x      PASS(3)      FAIL(1)PASS(1)
*PNV  igt_gem_fence_thrash_bo-write-verify-y      PASS(3)      FAIL(1)PASS(1)
 PNV  igt_gem_userptr_blits_coherency-sync      CRASH(1)PASS(3)      CRASH(1)PASS(1)
 PNV  igt_gem_userptr_blits_coherency-unsync      CRASH(1)PASS(2)      CRASH(1)PASS(1)
*SNB  igt_kms_pipe_crc_basic_read-crc-pipe-A      PASS(2)      DMESG_WARN(1)PASS(1)
*IVB  igt_gem_storedw_batches_loop_normal      PASS(3)      DMESG_WARN(1)PASS(1)
*BDW  igt_gem_gtt_hog      PASS(5)      DMESG_WARN(1)PASS(1)
Note: You need to pay more attention to lines starting with '*'
Dave Gordon Feb. 17, 2015, 11:34 a.m. UTC | #2
On 16/02/15 16:12, Thomas Daniel wrote:
> Work was getting left behind in LRC contexts during reset.  This causes a hang
> if the GPU is reset when HEAD==TAIL because the context's ringbuffer head and
> tail don't get reset and retiring a request doesn't alter them, so the ring
> still appears full.
> 
> Added a function intel_lr_context_reset() to reset head and tail on a LRC and
> its ringbuffer.
> 
> Call intel_lr_context_reset() for each context in i915_gem_context_reset() when
> in execlists mode.
> 
> Testcase: igt/pm_rps --run-subtest reset #bdw
> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=88096
> Signed-off-by: Thomas Daniel <thomas.daniel@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_gem_context.c |   12 +++++++----
>  drivers/gpu/drm/i915/intel_lrc.c        |   34 +++++++++++++++++++++++++++++++
>  drivers/gpu/drm/i915/intel_lrc.h        |    2 ++
>  3 files changed, 44 insertions(+), 4 deletions(-)

A couple of minor points below, but not ones that require changes, so:

Reviewed-by: Dave Gordon <david.s.gordon@intel.com>

> diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
> index 8603bf4..70346b0 100644
> --- a/drivers/gpu/drm/i915/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/i915_gem_context.c
> @@ -296,11 +296,15 @@ void i915_gem_context_reset(struct drm_device *dev)
>  	struct drm_i915_private *dev_priv = dev->dev_private;
>  	int i;
>  
> -	/* In execlists mode we will unreference the context when the execlist
> -	 * queue is cleared and the requests destroyed.
> -	 */
> -	if (i915.enable_execlists)
> +	if (i915.enable_execlists) {
> +		struct intel_context *ctx;
> +
> +		list_for_each_entry(ctx, &dev_priv->context_list, link) {
> +			intel_lr_context_reset(dev, ctx);
> +		}
> +
>  		return;
> +	}
>  
>  	for (i = 0; i < I915_NUM_RINGS; i++) {
>  		struct intel_engine_cs *ring = &dev_priv->ring[i];
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index aafcef3..1946bb9 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -1950,3 +1950,37 @@ error_unpin_ctx:
>  	drm_gem_object_unreference(&ctx_obj->base);
>  	return ret;
>  }
> +
> +void intel_lr_context_reset(struct drm_device *dev,
> +			struct intel_context *ctx)
> +{
> +	struct drm_i915_private *dev_priv = dev->dev_private;
> +	struct intel_engine_cs *ring;
> +	int i;
> +
> +	for_each_ring(ring, dev_priv, i) {
> +		struct drm_i915_gem_object *ctx_obj =
> +				ctx->engine[ring->id].state;
> +		if (ctx_obj) {
> +			struct intel_ringbuffer *ringbuf =
> +					ctx->engine[ring->id].ringbuf;
> +			uint32_t *reg_state;
> +			struct page *page;
> +
> +			if (i915_gem_object_get_pages(ctx_obj)) {
> +				WARN(1, "Failed get_pages for context obj\n");
> +				continue;
> +			}

This could be folded into a single "if (WARN_ON(...)) continue;"

> +			page = i915_gem_object_get_page(ctx_obj, 1);

Isn't it a pity that we have i915_gem_object_get_page() and
i915_gem_object_get_pages() which look so similar but do completely
different things :(

> +			reg_state = kmap_atomic(page);
> +
> +			reg_state[CTX_RING_HEAD+1] = 0;
> +			reg_state[CTX_RING_TAIL+1] = 0;
> +
> +			kunmap_atomic(reg_state);
> +
> +			ringbuf->head = 0;
> +			ringbuf->tail = 0;
> +		}
> +	}
> +}
> diff --git a/drivers/gpu/drm/i915/intel_lrc.h b/drivers/gpu/drm/i915/intel_lrc.h
> index f635735..5dd0eca 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.h
> +++ b/drivers/gpu/drm/i915/intel_lrc.h
> @@ -73,6 +73,8 @@ int intel_lr_context_deferred_create(struct intel_context *ctx,
>  				     struct intel_engine_cs *ring);
>  void intel_lr_context_unpin(struct intel_engine_cs *ring,
>  		struct intel_context *ctx);
> +void intel_lr_context_reset(struct drm_device *dev,
> +			struct intel_context *ctx);
>  
>  /* Execlists */
>  int intel_sanitize_enable_execlists(struct drm_device *dev, int enable_execlists);
>
Daniel Vetter Feb. 23, 2015, 11:21 p.m. UTC | #3
On Tue, Feb 17, 2015 at 11:34:31AM +0000, Dave Gordon wrote:
> On 16/02/15 16:12, Thomas Daniel wrote:
> > +			if (i915_gem_object_get_pages(ctx_obj)) {
> > +				WARN(1, "Failed get_pages for context obj\n");
> > +				continue;
> > +			}
> 
> This could be folded into a single "if (WARN_ON(...)) continue;"

It's also a bit bad style to not handle this case at all. But meh ;-)

> > +			page = i915_gem_object_get_page(ctx_obj, 1);
> 
> Isn't it a pity that we have i915_gem_object_get_page() and
> i915_gem_object_get_pages() which look so similar but do completely
> different things :(

lookup_page might be a more suitable name for this one indeed ... patch
highly welcome.
-Daniel
Daniel Vetter Feb. 23, 2015, 11:21 p.m. UTC | #4
On Mon, Feb 16, 2015 at 04:12:53PM +0000, Thomas Daniel wrote:
> Work was getting left behind in LRC contexts during reset.  This causes a hang
> if the GPU is reset when HEAD==TAIL because the context's ringbuffer head and
> tail don't get reset and retiring a request doesn't alter them, so the ring
> still appears full.
> 
> Added a function intel_lr_context_reset() to reset head and tail on a LRC and
> its ringbuffer.
> 
> Call intel_lr_context_reset() for each context in i915_gem_context_reset() when
> in execlists mode.
> 
> Testcase: igt/pm_rps --run-subtest reset #bdw
> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=88096
> Signed-off-by: Thomas Daniel <thomas.daniel@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_gem_context.c |   12 +++++++----
>  drivers/gpu/drm/i915/intel_lrc.c        |   34 +++++++++++++++++++++++++++++++
>  drivers/gpu/drm/i915/intel_lrc.h        |    2 ++
>  3 files changed, 44 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
> index 8603bf4..70346b0 100644
> --- a/drivers/gpu/drm/i915/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/i915_gem_context.c
> @@ -296,11 +296,15 @@ void i915_gem_context_reset(struct drm_device *dev)
>  	struct drm_i915_private *dev_priv = dev->dev_private;
>  	int i;
>  
> -	/* In execlists mode we will unreference the context when the execlist
> -	 * queue is cleared and the requests destroyed.
> -	 */
> -	if (i915.enable_execlists)
> +	if (i915.enable_execlists) {
> +		struct intel_context *ctx;
> +
> +		list_for_each_entry(ctx, &dev_priv->context_list, link) {
> +			intel_lr_context_reset(dev, ctx);
> +		}
> +
>  		return;
> +	}

I'm somewhat voting for a vfunc for engine state reset since hiding the
lrc callback by going through legacy code doesn't look too pretty.
Follow-up patch if you're bored would be neat ;-)

>  
>  	for (i = 0; i < I915_NUM_RINGS; i++) {
>  		struct intel_engine_cs *ring = &dev_priv->ring[i];
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index aafcef3..1946bb9 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -1950,3 +1950,37 @@ error_unpin_ctx:
>  	drm_gem_object_unreference(&ctx_obj->base);
>  	return ret;
>  }
> +
> +void intel_lr_context_reset(struct drm_device *dev,
> +			struct intel_context *ctx)
> +{
> +	struct drm_i915_private *dev_priv = dev->dev_private;
> +	struct intel_engine_cs *ring;
> +	int i;
> +
> +	for_each_ring(ring, dev_priv, i) {
> +		struct drm_i915_gem_object *ctx_obj =
> +				ctx->engine[ring->id].state;
> +		if (ctx_obj) {

The common pattern is
		
		if (!ctx_obj)
			continue;

which allows you to fold out one indent level. I couldn't resist and
did this while applying ...

Queued for -next, thanks for the patch.
-Daniel
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index 8603bf4..70346b0 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -296,11 +296,15 @@  void i915_gem_context_reset(struct drm_device *dev)
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	int i;
 
-	/* In execlists mode we will unreference the context when the execlist
-	 * queue is cleared and the requests destroyed.
-	 */
-	if (i915.enable_execlists)
+	if (i915.enable_execlists) {
+		struct intel_context *ctx;
+
+		list_for_each_entry(ctx, &dev_priv->context_list, link) {
+			intel_lr_context_reset(dev, ctx);
+		}
+
 		return;
+	}
 
 	for (i = 0; i < I915_NUM_RINGS; i++) {
 		struct intel_engine_cs *ring = &dev_priv->ring[i];
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index aafcef3..1946bb9 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -1950,3 +1950,37 @@  error_unpin_ctx:
 	drm_gem_object_unreference(&ctx_obj->base);
 	return ret;
 }
+
+void intel_lr_context_reset(struct drm_device *dev,
+			struct intel_context *ctx)
+{
+	struct drm_i915_private *dev_priv = dev->dev_private;
+	struct intel_engine_cs *ring;
+	int i;
+
+	for_each_ring(ring, dev_priv, i) {
+		struct drm_i915_gem_object *ctx_obj =
+				ctx->engine[ring->id].state;
+		if (ctx_obj) {
+			struct intel_ringbuffer *ringbuf =
+					ctx->engine[ring->id].ringbuf;
+			uint32_t *reg_state;
+			struct page *page;
+
+			if (i915_gem_object_get_pages(ctx_obj)) {
+				WARN(1, "Failed get_pages for context obj\n");
+				continue;
+			}
+			page = i915_gem_object_get_page(ctx_obj, 1);
+			reg_state = kmap_atomic(page);
+
+			reg_state[CTX_RING_HEAD+1] = 0;
+			reg_state[CTX_RING_TAIL+1] = 0;
+
+			kunmap_atomic(reg_state);
+
+			ringbuf->head = 0;
+			ringbuf->tail = 0;
+		}
+	}
+}
diff --git a/drivers/gpu/drm/i915/intel_lrc.h b/drivers/gpu/drm/i915/intel_lrc.h
index f635735..5dd0eca 100644
--- a/drivers/gpu/drm/i915/intel_lrc.h
+++ b/drivers/gpu/drm/i915/intel_lrc.h
@@ -73,6 +73,8 @@  int intel_lr_context_deferred_create(struct intel_context *ctx,
 				     struct intel_engine_cs *ring);
 void intel_lr_context_unpin(struct intel_engine_cs *ring,
 		struct intel_context *ctx);
+void intel_lr_context_reset(struct drm_device *dev,
+			struct intel_context *ctx);
 
 /* Execlists */
 int intel_sanitize_enable_execlists(struct drm_device *dev, int enable_execlists);