drm/i915: avoid concurrent writes to aux_inv

Message ID	20220302182657.1483623-1-fei.yang@intel.com (mailing list archive)
State	New, archived
Headers	show Return-Path: <dri-devel-bounces@lists.freedesktop.org> From: fei.yang@intel.com To: intel-gfx@lists.freedesktop.org, dri-devel@lists.freedesktop.org Subject: [PATCH] drm/i915: avoid concurrent writes to aux_inv Date: Wed, 2 Mar 2022 10:26:57 -0800 Message-Id: <20220302182657.1483623-1-fei.yang@intel.com> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit Precedence: list Cc: Fei Yang <fei.yang@intel.com> Errors-To: dri-devel-bounces@lists.freedesktop.org Sender: "dri-devel" <dri-devel-bounces@lists.freedesktop.org>
Series	drm/i915: avoid concurrent writes to aux_inv \| expand drm/i915: avoid concurrent writes to aux_inv

Message ID

20220302182657.1483623-1-fei.yang@intel.com (mailing list archive)

State

New, archived

Headers

From: fei.yang@intel.com
To: intel-gfx@lists.freedesktop.org,
	dri-devel@lists.freedesktop.org
Subject: [PATCH] drm/i915: avoid concurrent writes to aux_inv
Date: Wed,  2 Mar 2022 10:26:57 -0800
Message-Id: <20220302182657.1483623-1-fei.yang@intel.com>
MIME-Version: 1.0
Content-Transfer-Encoding: 8bit
Precedence: list
Cc: Fei Yang <fei.yang@intel.com>
Errors-To: dri-devel-bounces@lists.freedesktop.org
Sender: "dri-devel" <dri-devel-bounces@lists.freedesktop.org>

Series

drm/i915: avoid concurrent writes to aux_inv | expand

Commit Message

Yang, Fei March 2, 2022, 6:26 p.m. UTC

From: Fei Yang <fei.yang@intel.com>

GPU hangs have been observed when multiple engines write to the
same aux_inv register at the same time. To avoid this each engine
should only invalidate its own auxiliary table. The function
gen12_emit_flush_xcs() currently invalidate the auxiliary table for
all engines because the rq->engine is not necessarily the engine
eventually carrying out the request, and potentially the engine
could even be a virtual one (with engine->instance being -1).
With this patch, auxiliary table invalidation is done only for the
engine executing the request. And the mmio address for the aux_inv
register is set after the engine instance becomes certain.

Signed-off-by: Chris Wilson <chris.p.wilson@intel.com>
Signed-off-by: Fei Yang <fei.yang@intel.com>
---
 drivers/gpu/drm/i915/gt/gen8_engine_cs.c      | 41 ++++---------------
 .../drm/i915/gt/intel_execlists_submission.c  | 38 +++++++++++++++++
 drivers/gpu/drm/i915/i915_request.h           |  2 +
 3 files changed, 47 insertions(+), 34 deletions(-)

Comments

Chris Wilson March 2, 2022, 8:50 p.m. UTC | #1

Quoting fei.yang@intel.com (2022-03-02 18:26:57)
> From: Fei Yang <fei.yang@intel.com>
> 
> GPU hangs have been observed when multiple engines write to the
> same aux_inv register at the same time. To avoid this each engine
> should only invalidate its own auxiliary table. The function
> gen12_emit_flush_xcs() currently invalidate the auxiliary table for
> all engines because the rq->engine is not necessarily the engine
> eventually carrying out the request, and potentially the engine
> could even be a virtual one (with engine->instance being -1).
> With this patch, auxiliary table invalidation is done only for the
> engine executing the request. And the mmio address for the aux_inv
> register is set after the engine instance becomes certain.
> 
> Signed-off-by: Chris Wilson <chris.p.wilson@intel.com>
> Signed-off-by: Fei Yang <fei.yang@intel.com>
> ---
>  drivers/gpu/drm/i915/gt/gen8_engine_cs.c      | 41 ++++---------------
>  .../drm/i915/gt/intel_execlists_submission.c  | 38 +++++++++++++++++
>  drivers/gpu/drm/i915/i915_request.h           |  2 +
>  3 files changed, 47 insertions(+), 34 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gt/gen8_engine_cs.c b/drivers/gpu/drm/i915/gt/gen8_engine_cs.c
> index b1b9c3fd7bf9..af62e2bc2c9b 100644
> --- a/drivers/gpu/drm/i915/gt/gen8_engine_cs.c
> +++ b/drivers/gpu/drm/i915/gt/gen8_engine_cs.c
> @@ -165,30 +165,6 @@ static u32 preparser_disable(bool state)
>         return MI_ARB_CHECK | 1 << 8 | state;
>  }
>  
> -static i915_reg_t aux_inv_reg(const struct intel_engine_cs *engine)
> -{
> -       static const i915_reg_t vd[] = {
> -               GEN12_VD0_AUX_NV,
> -               GEN12_VD1_AUX_NV,
> -               GEN12_VD2_AUX_NV,
> -               GEN12_VD3_AUX_NV,
> -       };
> -
> -       static const i915_reg_t ve[] = {
> -               GEN12_VE0_AUX_NV,
> -               GEN12_VE1_AUX_NV,
> -       };
> -
> -       if (engine->class == VIDEO_DECODE_CLASS)
> -               return vd[engine->instance];
> -
> -       if (engine->class == VIDEO_ENHANCEMENT_CLASS)
> -               return ve[engine->instance];
> -
> -       GEM_BUG_ON("unknown aux_inv reg\n");
> -       return INVALID_MMIO_REG;
> -}
> -
>  static u32 *gen12_emit_aux_table_inv(const i915_reg_t inv_reg, u32 *cs)
>  {
>         *cs++ = MI_LOAD_REGISTER_IMM(1);
> @@ -288,7 +264,7 @@ int gen12_emit_flush_xcs(struct i915_request *rq, u32 mode)
>         if (mode & EMIT_INVALIDATE)
>                 aux_inv = rq->engine->mask & ~BIT(BCS0);
>         if (aux_inv)
> -               cmd += 2 * hweight32(aux_inv) + 2;
> +               cmd += 4;
>  
>         cs = intel_ring_begin(rq, cmd);
>         if (IS_ERR(cs))
> @@ -319,16 +295,13 @@ int gen12_emit_flush_xcs(struct i915_request *rq, u32 mode)
>         *cs++ = 0; /* value */
>  
>         if (aux_inv) { /* hsdes: 1809175790 */
> -               struct intel_engine_cs *engine;
> -               unsigned int tmp;
> -
> -               *cs++ = MI_LOAD_REGISTER_IMM(hweight32(aux_inv));
> -               for_each_engine_masked(engine, rq->engine->gt, aux_inv, tmp) {
> -                       *cs++ = i915_mmio_reg_offset(aux_inv_reg(engine));
> -                       *cs++ = AUX_INV;
> -               }
> +               *cs++ = MI_LOAD_REGISTER_IMM(1);
> +               rq->vd_ve_aux_inv = cs;
> +               *cs++ = 0; /* address to be set at submission to HW */
> +               *cs++ = AUX_INV;
>                 *cs++ = MI_NOOP;
> -       }
> +       } else
> +               rq->vd_ve_aux_inv = NULL;
>  
>         if (mode & EMIT_INVALIDATE)
>                 *cs++ = preparser_disable(false);
> diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
> index 1c602d4ae297..a018de6dcac5 100644
> --- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
> +++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
> @@ -1258,6 +1258,34 @@ static bool completed(const struct i915_request *rq)
>         return __i915_request_is_complete(rq);
>  }
>  
> +static i915_reg_t aux_inv_reg(const struct intel_engine_cs *engine)
> +{
> +       static const i915_reg_t vd[] = {
> +               GEN12_VD0_AUX_NV,
> +               GEN12_VD1_AUX_NV,
> +               GEN12_VD2_AUX_NV,
> +               GEN12_VD3_AUX_NV,
> +       };
> +
> +       static const i915_reg_t ve[] = {
> +               GEN12_VE0_AUX_NV,
> +               GEN12_VE1_AUX_NV,
> +       };
> +
> +       if (engine->class == VIDEO_DECODE_CLASS) {
> +               GEM_BUG_ON(engine->instance >= ARRAY_SIZE(vd));
> +               return vd[engine->instance];
> +       }
> +
> +       if (engine->class == VIDEO_ENHANCEMENT_CLASS) {
> +               GEM_BUG_ON(engine->instance >= ARRAY_SIZE(ve));
> +               return ve[engine->instance];
> +       }
> +
> +       GEM_BUG_ON("unknown aux_inv reg\n");
> +       return INVALID_MMIO_REG;
> +}
> +
>  static void execlists_dequeue(struct intel_engine_cs *engine)
>  {
>         struct intel_engine_execlists * const execlists = &engine->execlists;
> @@ -1538,6 +1566,16 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>                         }
>  
>                         if (__i915_request_submit(rq)) {
> +                               /* hsdes: 1809175790 */
> +                               if ((GRAPHICS_VER(engine->i915) == 12) &&
> +                                   rq->vd_ve_aux_inv &&
> +                                   (engine->class == VIDEO_DECODE_CLASS ||
> +                                    engine->class == VIDEO_ENHANCEMENT_CLASS)) {

We do not need the extra checks here; we just do as we are told. We only
tell ourselves to apply the fixup when required.

> +                                       *rq->vd_ve_aux_inv = i915_mmio_reg_offset

Likewise, vd_ve is overspecific, aux_inv_fixup or aux_inv_wa (or
wa_aux_iv, fixup_aux_inv).

> +                                               (aux_inv_reg(engine));
> +                                       rq->vd_ve_aux_inv = NULL;

Move this to i915_request initialisation so that we only set aux_inv
when required, which probably explains the extra defence.

> +                                       rq->execution_mask = engine->mask;
> +                               }
>                                 if (!merge) {
>                                         *port++ = i915_request_get(last);
>                                         last = NULL;
> diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h
> index 28b1f9db5487..69de32e5e15d 100644
> --- a/drivers/gpu/drm/i915/i915_request.h
> +++ b/drivers/gpu/drm/i915/i915_request.h
> @@ -350,6 +350,8 @@ struct i915_request {
>                 struct list_head link;
>                 unsigned long delay;
>         } mock;)
> +
> +       u32 *vd_ve_aux_inv;

Not at the end of the struct; that's where we put things in the dungeon.
The selftest struct should be last; I do hope no one has been putting
things at random places in the struct without considering the layout and
semantics. From the flow, this is akin to batch, capture_list; before
emitted_jiffies would be a good spot.
-Chris

diff --git a/drivers/gpu/drm/i915/gt/gen8_engine_cs.c b/drivers/gpu/drm/i915/gt/gen8_engine_cs.c
index b1b9c3fd7bf9..af62e2bc2c9b 100644
--- a/drivers/gpu/drm/i915/gt/gen8_engine_cs.c
+++ b/drivers/gpu/drm/i915/gt/gen8_engine_cs.c
@@ -165,30 +165,6 @@  static u32 preparser_disable(bool state)
 	return MI_ARB_CHECK | 1 << 8 | state;
 }
 
-static i915_reg_t aux_inv_reg(const struct intel_engine_cs *engine)
-{
-	static const i915_reg_t vd[] = {
-		GEN12_VD0_AUX_NV,
-		GEN12_VD1_AUX_NV,
-		GEN12_VD2_AUX_NV,
-		GEN12_VD3_AUX_NV,
-	};
-
-	static const i915_reg_t ve[] = {
-		GEN12_VE0_AUX_NV,
-		GEN12_VE1_AUX_NV,
-	};
-
-	if (engine->class == VIDEO_DECODE_CLASS)
-		return vd[engine->instance];
-
-	if (engine->class == VIDEO_ENHANCEMENT_CLASS)
-		return ve[engine->instance];
-
-	GEM_BUG_ON("unknown aux_inv reg\n");
-	return INVALID_MMIO_REG;
-}
-
 static u32 *gen12_emit_aux_table_inv(const i915_reg_t inv_reg, u32 *cs)
 {
 	*cs++ = MI_LOAD_REGISTER_IMM(1);
@@ -288,7 +264,7 @@  int gen12_emit_flush_xcs(struct i915_request *rq, u32 mode)
 	if (mode & EMIT_INVALIDATE)
 		aux_inv = rq->engine->mask & ~BIT(BCS0);
 	if (aux_inv)
-		cmd += 2 * hweight32(aux_inv) + 2;
+		cmd += 4;
 
 	cs = intel_ring_begin(rq, cmd);
 	if (IS_ERR(cs))
@@ -319,16 +295,13 @@  int gen12_emit_flush_xcs(struct i915_request *rq, u32 mode)
 	*cs++ = 0; /* value */
 
 	if (aux_inv) { /* hsdes: 1809175790 */
-		struct intel_engine_cs *engine;
-		unsigned int tmp;
-
-		*cs++ = MI_LOAD_REGISTER_IMM(hweight32(aux_inv));
-		for_each_engine_masked(engine, rq->engine->gt, aux_inv, tmp) {
-			*cs++ = i915_mmio_reg_offset(aux_inv_reg(engine));
-			*cs++ = AUX_INV;
-		}
+		*cs++ = MI_LOAD_REGISTER_IMM(1);
+		rq->vd_ve_aux_inv = cs;
+		*cs++ = 0; /* address to be set at submission to HW */
+		*cs++ = AUX_INV;
 		*cs++ = MI_NOOP;
-	}
+	} else
+		rq->vd_ve_aux_inv = NULL;
 
 	if (mode & EMIT_INVALIDATE)
 		*cs++ = preparser_disable(false);
diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
index 1c602d4ae297..a018de6dcac5 100644
--- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
+++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
@@ -1258,6 +1258,34 @@  static bool completed(const struct i915_request *rq)
 	return __i915_request_is_complete(rq);
 }
 
+static i915_reg_t aux_inv_reg(const struct intel_engine_cs *engine)
+{
+	static const i915_reg_t vd[] = {
+		GEN12_VD0_AUX_NV,
+		GEN12_VD1_AUX_NV,
+		GEN12_VD2_AUX_NV,
+		GEN12_VD3_AUX_NV,
+	};
+
+	static const i915_reg_t ve[] = {
+		GEN12_VE0_AUX_NV,
+		GEN12_VE1_AUX_NV,
+	};
+
+	if (engine->class == VIDEO_DECODE_CLASS) {
+		GEM_BUG_ON(engine->instance >= ARRAY_SIZE(vd));
+		return vd[engine->instance];
+	}
+
+	if (engine->class == VIDEO_ENHANCEMENT_CLASS) {
+		GEM_BUG_ON(engine->instance >= ARRAY_SIZE(ve));
+		return ve[engine->instance];
+	}
+
+	GEM_BUG_ON("unknown aux_inv reg\n");
+	return INVALID_MMIO_REG;
+}
+
 static void execlists_dequeue(struct intel_engine_cs *engine)
 {
 	struct intel_engine_execlists * const execlists = &engine->execlists;
@@ -1538,6 +1566,16 @@  static void execlists_dequeue(struct intel_engine_cs *engine)
 			}
 
 			if (__i915_request_submit(rq)) {
+				/* hsdes: 1809175790 */
+				if ((GRAPHICS_VER(engine->i915) == 12) &&
+				    rq->vd_ve_aux_inv &&
+				    (engine->class == VIDEO_DECODE_CLASS ||
+				     engine->class == VIDEO_ENHANCEMENT_CLASS)) {
+					*rq->vd_ve_aux_inv = i915_mmio_reg_offset
+						(aux_inv_reg(engine));
+					rq->vd_ve_aux_inv = NULL;
+					rq->execution_mask = engine->mask;
+				}
 				if (!merge) {
 					*port++ = i915_request_get(last);
 					last = NULL;
diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h
index 28b1f9db5487..69de32e5e15d 100644
--- a/drivers/gpu/drm/i915/i915_request.h
+++ b/drivers/gpu/drm/i915/i915_request.h
@@ -350,6 +350,8 @@  struct i915_request {
 		struct list_head link;
 		unsigned long delay;
 	} mock;)
+
+	u32 *vd_ve_aux_inv;
 };
 
 #define I915_FENCE_GFP (GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN)

drm/i915: avoid concurrent writes to aux_inv

Commit Message

Comments

Patch