diff mbox series

[2/2] drm/i915/gt: preempt and reset based on reset domain

Message ID 20220316130754.813761-2-tejaskumarx.surendrakumar.upadhyay@intel.com (mailing list archive)
State New, archived
Headers show
Series [1/2] drm/i915/gt: preempt engine to idle before reset | expand

Commit Message

Tejas Upadhyay March 16, 2022, 1:07 p.m. UTC
When we have shared reset domains, as we the engine
may be indirectly coupled to the stalled engine, and
we need to idle the current context to prevent
collateral damage.

Suggested-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Tejas Upadhyay <tejaskumarx.surendrakumar.upadhyay@intel.com>
---
 drivers/gpu/drm/i915/gt/intel_engine.h               | 3 ++-
 drivers/gpu/drm/i915/gt/intel_engine_cs.c            | 6 ++++++
 drivers/gpu/drm/i915/gt/intel_engine_types.h         | 8 ++++++++
 drivers/gpu/drm/i915/gt/intel_execlists_submission.c | 7 ++++++-
 drivers/gpu/drm/i915/gt/selftest_engine_heartbeat.c  | 2 +-
 drivers/gpu/drm/i915/gt/selftest_execlists.c         | 4 ++--
 drivers/gpu/drm/i915/selftests/i915_request.c        | 5 ++---
 7 files changed, 27 insertions(+), 8 deletions(-)

Comments

kernel test robot March 16, 2022, 8:19 p.m. UTC | #1
Hi Tejas,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on drm-tip/drm-tip]
[cannot apply to drm-intel/for-linux-next v5.17-rc8 next-20220316]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Tejas-Upadhyay/drm-i915-gt-preempt-engine-to-idle-before-reset/20220316-215054
base:   git://anongit.freedesktop.org/drm/drm-tip drm-tip
config: x86_64-randconfig-a005 (https://download.01.org/0day-ci/archive/20220317/202203170430.cDmFRpYE-lkp@intel.com/config)
compiler: clang version 15.0.0 (https://github.com/llvm/llvm-project a6ec1e3d798f8eab43fb3a91028c6ab04e115fcb)
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/0day-ci/linux/commit/06139e9768f3f8e43bf061ae6292feb7daf07944
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Tejas-Upadhyay/drm-i915-gt-preempt-engine-to-idle-before-reset/20220316-215054
        git checkout 06139e9768f3f8e43bf061ae6292feb7daf07944
        # save the config file to linux build tree
        mkdir build_dir
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=x86_64 SHELL=/bin/bash drivers/gpu/drm/i915/

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All warnings (new ones prefixed by >>):

>> drivers/gpu/drm/i915/gt/intel_execlists_submission.c:2524:7: warning: variable 'err' is used uninitialized whenever 'if' condition is false [-Wsometimes-uninitialized]
                   if (READ_ONCE(el->pending[0])) {
                       ^~~~~~~~~~~~~~~~~~~~~~~~~
   include/asm-generic/rwonce.h:47:28: note: expanded from macro 'READ_ONCE'
   #define READ_ONCE(x)                                                    \
                                                                           ^
   drivers/gpu/drm/i915/gt/intel_execlists_submission.c:2538:9: note: uninitialized use occurs here
           return err;
                  ^~~
   drivers/gpu/drm/i915/gt/intel_execlists_submission.c:2524:3: note: remove the 'if' if its condition is always true
                   if (READ_ONCE(el->pending[0])) {
                   ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   drivers/gpu/drm/i915/gt/intel_execlists_submission.c:2489:6: warning: variable 'err' is used uninitialized whenever 'if' condition is false [-Wsometimes-uninitialized]
           if (*el->active) { /* preempt to idle required */
               ^~~~~~~~~~~
   drivers/gpu/drm/i915/gt/intel_execlists_submission.c:2538:9: note: uninitialized use occurs here
           return err;
                  ^~~
   drivers/gpu/drm/i915/gt/intel_execlists_submission.c:2489:2: note: remove the 'if' if its condition is always true
           if (*el->active) { /* preempt to idle required */
           ^~~~~~~~~~~~~~~~~
   drivers/gpu/drm/i915/gt/intel_execlists_submission.c:2462:9: note: initialize the variable 'err' to silence this warning
           int err;
                  ^
                   = 0
   2 warnings generated.


vim +2524 drivers/gpu/drm/i915/gt/intel_execlists_submission.c

c3b2b6ffcd31318 Chris Wilson   2022-03-16  2453  
c3b2b6ffcd31318 Chris Wilson   2022-03-16  2454  /* XXX return error and force a full reset if we fail to
c3b2b6ffcd31318 Chris Wilson   2022-03-16  2455   * preempt-to-idle
c3b2b6ffcd31318 Chris Wilson   2022-03-16  2456   */
c3b2b6ffcd31318 Chris Wilson   2022-03-16  2457  static int execlists_suspend(struct intel_engine_cs *engine)
c3b2b6ffcd31318 Chris Wilson   2022-03-16  2458  {
c3b2b6ffcd31318 Chris Wilson   2022-03-16  2459  	struct i915_sched_engine *se = engine->sched_engine;
c3b2b6ffcd31318 Chris Wilson   2022-03-16  2460  	struct intel_engine_execlists * const el = &engine->execlists;
c3b2b6ffcd31318 Chris Wilson   2022-03-16  2461  	unsigned long timeout;
c3b2b6ffcd31318 Chris Wilson   2022-03-16  2462  	int err;
c3b2b6ffcd31318 Chris Wilson   2022-03-16  2463  
06139e9768f3f8e Tejas Upadhyay 2022-03-16  2464  	if (!intel_engine_pm_get_if_awake(engine))
06139e9768f3f8e Tejas Upadhyay 2022-03-16  2465  		return 0;
06139e9768f3f8e Tejas Upadhyay 2022-03-16  2466  	ENGINE_TRACE(engine, "supending active engine\n");
c3b2b6ffcd31318 Chris Wilson   2022-03-16  2467  	/* Stop further submissions, but listen for our own preempt-to-idle */
c3b2b6ffcd31318 Chris Wilson   2022-03-16  2468  	tasklet_disable(&se->tasklet);
c3b2b6ffcd31318 Chris Wilson   2022-03-16  2469  	se->tasklet.callback = suspend_tasklet;
c3b2b6ffcd31318 Chris Wilson   2022-03-16  2470  	tasklet_enable(&se->tasklet);
c3b2b6ffcd31318 Chris Wilson   2022-03-16  2471  
c3b2b6ffcd31318 Chris Wilson   2022-03-16  2472  	/*
c3b2b6ffcd31318 Chris Wilson   2022-03-16  2473  	 * We have to wait for the HW to complete a pending context switch
c3b2b6ffcd31318 Chris Wilson   2022-03-16  2474  	 * before we can write to ELS[PQ] again. Otherwise the behaviour
c3b2b6ffcd31318 Chris Wilson   2022-03-16  2475  	 * is undefined...
c3b2b6ffcd31318 Chris Wilson   2022-03-16  2476  	 *
c3b2b6ffcd31318 Chris Wilson   2022-03-16  2477  	 * If the engine is truly hung, it will neither clear pending
c3b2b6ffcd31318 Chris Wilson   2022-03-16  2478  	 * nor respond to our preemption request. In the later case,
c3b2b6ffcd31318 Chris Wilson   2022-03-16  2479  	 * we have the dilemma of how to restore hang detection...
c3b2b6ffcd31318 Chris Wilson   2022-03-16  2480  	 */
c3b2b6ffcd31318 Chris Wilson   2022-03-16  2481  	timeout = jiffies + HZ / 2;
c3b2b6ffcd31318 Chris Wilson   2022-03-16  2482  	while (READ_ONCE(el->pending[0]) && time_before(jiffies, timeout))
c3b2b6ffcd31318 Chris Wilson   2022-03-16  2483  		intel_engine_flush_submission(engine);
c3b2b6ffcd31318 Chris Wilson   2022-03-16  2484  	if (READ_ONCE(el->pending[0])) {
c3b2b6ffcd31318 Chris Wilson   2022-03-16  2485  		err = -EBUSY;
c3b2b6ffcd31318 Chris Wilson   2022-03-16  2486  		goto err;
c3b2b6ffcd31318 Chris Wilson   2022-03-16  2487  	}
c3b2b6ffcd31318 Chris Wilson   2022-03-16  2488  
c3b2b6ffcd31318 Chris Wilson   2022-03-16  2489  	if (*el->active) { /* preempt to idle required */
c3b2b6ffcd31318 Chris Wilson   2022-03-16  2490  		struct i915_request **pending = el->pending;
c3b2b6ffcd31318 Chris Wilson   2022-03-16  2491  		struct intel_context *ce = el->preempt_context;
c3b2b6ffcd31318 Chris Wilson   2022-03-16  2492  		u64 desc;
c3b2b6ffcd31318 Chris Wilson   2022-03-16  2493  		int n;
c3b2b6ffcd31318 Chris Wilson   2022-03-16  2494  
c3b2b6ffcd31318 Chris Wilson   2022-03-16  2495  		/* Always submit an empty / idle context */
c3b2b6ffcd31318 Chris Wilson   2022-03-16  2496  		desc = lrc_update_regs(ce, engine, ce->ring->tail);
c3b2b6ffcd31318 Chris Wilson   2022-03-16  2497  
c3b2b6ffcd31318 Chris Wilson   2022-03-16  2498  		/*
c3b2b6ffcd31318 Chris Wilson   2022-03-16  2499  		 * As we submit a dummy context, we will get two events.
c3b2b6ffcd31318 Chris Wilson   2022-03-16  2500  		 * First a preemption of the running context, causing us
c3b2b6ffcd31318 Chris Wilson   2022-03-16  2501  		 * to promote el->pending to el->inflight. And then
c3b2b6ffcd31318 Chris Wilson   2022-03-16  2502  		 * we will receive a completion event as our context
c3b2b6ffcd31318 Chris Wilson   2022-03-16  2503  		 * idles.
c3b2b6ffcd31318 Chris Wilson   2022-03-16  2504  		 *
c3b2b6ffcd31318 Chris Wilson   2022-03-16  2505  		 * We can use any dummy request here for tracking the
c3b2b6ffcd31318 Chris Wilson   2022-03-16  2506  		 * preemption events.
c3b2b6ffcd31318 Chris Wilson   2022-03-16  2507  		 */
c3b2b6ffcd31318 Chris Wilson   2022-03-16  2508  		execlists_schedule_in(*el->active, 0);
c3b2b6ffcd31318 Chris Wilson   2022-03-16  2509  		*pending++ = i915_request_get(*el->active);
c3b2b6ffcd31318 Chris Wilson   2022-03-16  2510  		*pending++ = NULL;
c3b2b6ffcd31318 Chris Wilson   2022-03-16  2511  
c3b2b6ffcd31318 Chris Wilson   2022-03-16  2512  		/* Tell the HW to preempt to our special context */
c3b2b6ffcd31318 Chris Wilson   2022-03-16  2513  		for (n = execlists_num_ports(el); --n; )
c3b2b6ffcd31318 Chris Wilson   2022-03-16  2514  			write_desc(el, 0, n);
c3b2b6ffcd31318 Chris Wilson   2022-03-16  2515  		write_desc(el, desc, 0);
c3b2b6ffcd31318 Chris Wilson   2022-03-16  2516  		if (el->ctrl_reg)
c3b2b6ffcd31318 Chris Wilson   2022-03-16  2517  			writel(EL_CTRL_LOAD, el->ctrl_reg);
c3b2b6ffcd31318 Chris Wilson   2022-03-16  2518  
c3b2b6ffcd31318 Chris Wilson   2022-03-16  2519  		timeout = jiffies + HZ / 2;
c3b2b6ffcd31318 Chris Wilson   2022-03-16  2520  		while (READ_ONCE(el->pending[0]) &&
c3b2b6ffcd31318 Chris Wilson   2022-03-16  2521  		       time_before(jiffies, timeout))
c3b2b6ffcd31318 Chris Wilson   2022-03-16  2522  			intel_engine_flush_submission(engine);
c3b2b6ffcd31318 Chris Wilson   2022-03-16  2523  
c3b2b6ffcd31318 Chris Wilson   2022-03-16 @2524  		if (READ_ONCE(el->pending[0])) {
c3b2b6ffcd31318 Chris Wilson   2022-03-16  2525  			err = -EIO;
c3b2b6ffcd31318 Chris Wilson   2022-03-16  2526  			goto err;
c3b2b6ffcd31318 Chris Wilson   2022-03-16  2527  		}
c3b2b6ffcd31318 Chris Wilson   2022-03-16  2528  	}
c3b2b6ffcd31318 Chris Wilson   2022-03-16  2529  
06139e9768f3f8e Tejas Upadhyay 2022-03-16  2530  	goto out;
c3b2b6ffcd31318 Chris Wilson   2022-03-16  2531  
c3b2b6ffcd31318 Chris Wilson   2022-03-16  2532  err:
c3b2b6ffcd31318 Chris Wilson   2022-03-16  2533  	tasklet_disable(&se->tasklet);
c3b2b6ffcd31318 Chris Wilson   2022-03-16  2534  	se->tasklet.callback = execlists_submission_tasklet;
c3b2b6ffcd31318 Chris Wilson   2022-03-16  2535  	tasklet_enable(&se->tasklet);
06139e9768f3f8e Tejas Upadhyay 2022-03-16  2536  out:
06139e9768f3f8e Tejas Upadhyay 2022-03-16  2537  	intel_engine_pm_put(engine);
c3b2b6ffcd31318 Chris Wilson   2022-03-16  2538  	return err;
c3b2b6ffcd31318 Chris Wilson   2022-03-16  2539  }
c3b2b6ffcd31318 Chris Wilson   2022-03-16  2540  

---
0-DAY CI Kernel Test Service
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/gt/intel_engine.h b/drivers/gpu/drm/i915/gt/intel_engine.h
index 1c0ab05c3c40..a6ea0cdd8b53 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine.h
+++ b/drivers/gpu/drm/i915/gt/intel_engine.h
@@ -282,7 +282,8 @@  intel_engine_has_preempt_reset(const struct intel_engine_cs *engine)
 	if (!CONFIG_DRM_I915_PREEMPT_TIMEOUT)
 		return false;
 
-	return intel_engine_has_preemption(engine);
+	return intel_engine_has_preemption(engine) &&
+	       !intel_engine_has_shared_reset_domain(engine);
 }
 
 #define FORCE_VIRTUAL	BIT(0)
diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
index 8080479f27aa..b28120f0158a 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
@@ -472,7 +472,13 @@  static int intel_engine_setup(struct intel_gt *gt, enum intel_engine_id id,
 static void __setup_engine_capabilities(struct intel_engine_cs *engine)
 {
 	struct drm_i915_private *i915 = engine->i915;
+	enum intel_engine_id id;
+	struct intel_engine_cs *e;
 
+	for_each_engine(e, engine->gt, id)
+		if ((e->reset_domain & engine->reset_domain) &&
+		    e->id != engine->id)
+			engine->flags |= I915_ENGINE_HAS_SHARED_RESET_DOMAIN;
 	if (engine->class == VIDEO_DECODE_CLASS) {
 		/*
 		 * HEVC support is present on first engine instance
diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h
index 194155de900d..d27103b23318 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_types.h
+++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h
@@ -531,6 +531,8 @@  struct intel_engine_cs {
 #define I915_ENGINE_HAS_RCS_REG_STATE  BIT(9)
 #define I915_ENGINE_HAS_EU_PRIORITY    BIT(10)
 #define I915_ENGINE_FIRST_RENDER_COMPUTE BIT(11)
+#define I915_ENGINE_HAS_SHARED_RESET_DOMAIN BIT(9)
+
 	unsigned int flags;
 
 	/*
@@ -598,6 +600,12 @@  intel_engine_supports_stats(const struct intel_engine_cs *engine)
 	return engine->flags & I915_ENGINE_SUPPORTS_STATS;
 }
 
+static inline bool
+intel_engine_has_shared_reset_domain(const struct intel_engine_cs *engine)
+{
+	return engine->flags & I915_ENGINE_HAS_SHARED_RESET_DOMAIN;
+}
+
 static inline bool
 intel_engine_has_preemption(const struct intel_engine_cs *engine)
 {
diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
index 006e2d9a53e3..9dda02956494 100644
--- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
+++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
@@ -2461,6 +2461,9 @@  static int execlists_suspend(struct intel_engine_cs *engine)
 	unsigned long timeout;
 	int err;
 
+	if (!intel_engine_pm_get_if_awake(engine))
+		return 0;
+	ENGINE_TRACE(engine, "supending active engine\n");
 	/* Stop further submissions, but listen for our own preempt-to-idle */
 	tasklet_disable(&se->tasklet);
 	se->tasklet.callback = suspend_tasklet;
@@ -2524,12 +2527,14 @@  static int execlists_suspend(struct intel_engine_cs *engine)
 		}
 	}
 
-	return 0;
+	goto out;
 
 err:
 	tasklet_disable(&se->tasklet);
 	se->tasklet.callback = execlists_submission_tasklet;
 	tasklet_enable(&se->tasklet);
+out:
+	intel_engine_pm_put(engine);
 	return err;
 }
 
diff --git a/drivers/gpu/drm/i915/gt/selftest_engine_heartbeat.c b/drivers/gpu/drm/i915/gt/selftest_engine_heartbeat.c
index 273d440a53e3..939bbea7ce1b 100644
--- a/drivers/gpu/drm/i915/gt/selftest_engine_heartbeat.c
+++ b/drivers/gpu/drm/i915/gt/selftest_engine_heartbeat.c
@@ -356,7 +356,7 @@  static int live_heartbeat_off(void *arg)
 		return 0;
 
 	for_each_engine(engine, gt, id) {
-		if (!intel_engine_has_preemption(engine))
+		if (!intel_engine_has_preempt_reset(engine))
 			continue;
 
 		err = __live_heartbeat_off(engine);
diff --git a/drivers/gpu/drm/i915/gt/selftest_execlists.c b/drivers/gpu/drm/i915/gt/selftest_execlists.c
index 09f8cd2d0e2c..3eb3496cfb7e 100644
--- a/drivers/gpu/drm/i915/gt/selftest_execlists.c
+++ b/drivers/gpu/drm/i915/gt/selftest_execlists.c
@@ -2389,7 +2389,7 @@  static int live_preempt_cancel(void *arg)
 		goto err_client_a;
 
 	for_each_engine(data.engine, gt, id) {
-		if (!intel_engine_has_preemption(data.engine))
+		if (!intel_engine_has_preempt_reset(data.engine))
 			continue;
 
 		err = __cancel_active0(&data);
@@ -3399,7 +3399,7 @@  static int live_preempt_timeout(void *arg)
 		unsigned long saved_timeout;
 		struct i915_request *rq;
 
-		if (!intel_engine_has_preemption(engine))
+		if (!intel_engine_has_preempt_reset(engine))
 			continue;
 
 		rq = spinner_create_request(&spin_lo, ctx_lo, engine,
diff --git a/drivers/gpu/drm/i915/selftests/i915_request.c b/drivers/gpu/drm/i915/selftests/i915_request.c
index c56a0c2cd2f7..e80363c81d6b 100644
--- a/drivers/gpu/drm/i915/selftests/i915_request.c
+++ b/drivers/gpu/drm/i915/selftests/i915_request.c
@@ -803,8 +803,7 @@  static int __cancel_reset(struct drm_i915_private *i915,
 	unsigned long preempt_timeout_ms;
 	int err = 0;
 
-	if (!CONFIG_DRM_I915_PREEMPT_TIMEOUT ||
-	    !intel_has_reset_engine(engine->gt))
+	if (!intel_engine_has_preempt_reset(engine))
 		return 0;
 
 	preempt_timeout_ms = engine->props.preempt_timeout_ms;
@@ -906,7 +905,7 @@  static int live_cancel_request(void *arg)
 		struct igt_live_test t;
 		int err, err2;
 
-		if (!intel_engine_has_preemption(engine))
+		if (!intel_engine_has_preempt_reset(engine))
 			continue;
 
 		err = igt_live_test_begin(&t, i915, __func__, engine->name);