Message ID | 1452706112-8617-6-git-send-email-arun.siluvery@linux.intel.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Wed, Jan 13, 2016 at 05:28:17PM +0000, Arun Siluvery wrote: > From: Tomas Elf <tomas.elf@intel.com> > > i915_gem_wedge now returns a non-zero result in three different cases: > > 1. Legacy: A hang has been detected and full GPU reset is in progress. > > 2. Per-engine recovery: > > a. A single engine reference can be passed to the function, in which > case only that engine will be checked. If that particular engine is > detected to be hung and is to be reset this will yield a non-zero > result but not if reset is in progress for any other engine. > > b. No engine reference is passed to the function, in which case all > engines are checked for ongoing per-engine hang recovery. > > Also, i915_wait_request was updated to take advantage of this new > functionality. This is important since the TDR hang recovery mechanism needs a > way to force waiting threads that hold the struct_mutex to give up the > struct_mutex and try again after the hang recovery has completed. If > i915_wait_request does not take per-engine hang recovery into account there is > no way for a waiting thread to know that a per-engine recovery is about to > happen and that it needs to back off. 
> > Signed-off-by: Tomas Elf <tomas.elf@intel.com> > Signed-off-by: Arun Siluvery <arun.siluvery@intel.com> > Signed-off-by: Ian Lister <ian.lister@intel.com> > Cc: Chris Wilson <chris@chris-wilson.co.uk> > Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com> > --- > drivers/gpu/drm/i915/i915_drv.h | 3 +- > drivers/gpu/drm/i915/i915_gem.c | 60 +++++++++++++++++++++++++++------ > drivers/gpu/drm/i915/intel_lrc.c | 4 +-- > drivers/gpu/drm/i915/intel_ringbuffer.c | 4 ++- > 4 files changed, 56 insertions(+), 15 deletions(-) > > diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h > index 85cf692..5be7d3e 100644 > --- a/drivers/gpu/drm/i915/i915_drv.h > +++ b/drivers/gpu/drm/i915/i915_drv.h > @@ -3033,7 +3033,8 @@ i915_gem_find_active_request(struct intel_engine_cs *ring); > > bool i915_gem_retire_requests(struct drm_device *dev); > void i915_gem_retire_requests_ring(struct intel_engine_cs *ring); > -int __must_check i915_gem_check_wedge(struct i915_gpu_error *error, > +int __must_check i915_gem_check_wedge(struct drm_i915_private *dev_priv, > + struct intel_engine_cs *engine, > bool interruptible); > > static inline bool i915_reset_in_progress(struct i915_gpu_error *error) > diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c > index e3cfed2..e6eb45d 100644 > --- a/drivers/gpu/drm/i915/i915_gem.c > +++ b/drivers/gpu/drm/i915/i915_gem.c > @@ -80,12 +80,38 @@ static void i915_gem_info_remove_obj(struct drm_i915_private *dev_priv, > spin_unlock(&dev_priv->mm.object_stat_lock); > } > > +static inline int > +i915_engine_reset_in_progress(struct drm_i915_private *dev_priv, > + struct intel_engine_cs *engine) > +{ > + int ret = 0; > + > + if (engine) { > + ret = !!(atomic_read(&dev_priv->ring[engine->id].hangcheck.flags) > + & I915_ENGINE_RESET_IN_PROGRESS); > + } else { > + int i; > + > + for (i = 0; i < I915_NUM_RINGS; i++) > + if (atomic_read(&dev_priv->ring[i].hangcheck.flags) > + & I915_ENGINE_RESET_IN_PROGRESS) { > + > + 
ret = 1; > + break; > + } > + } Since this side will be called far more often than the writer, could you not make this more convenient for the reader and move it to a global set of flags in dev_priv->gpu_error? To avoid regressing on the EIO front, the waiter sequence should look like if (req->reset_counter != i915_reset_counter(&req->i915->gpu_error)) return 0; if (flags & LOCKED && i915_engine_reset_in_process(&req->i915->gpu_error, req->engine)) return -EAGAIN; Oh, and don't add a second boolean to __i915_wait_request, just transform the first into flags. -Chris
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 85cf692..5be7d3e 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -3033,7 +3033,8 @@ i915_gem_find_active_request(struct intel_engine_cs *ring); bool i915_gem_retire_requests(struct drm_device *dev); void i915_gem_retire_requests_ring(struct intel_engine_cs *ring); -int __must_check i915_gem_check_wedge(struct i915_gpu_error *error, +int __must_check i915_gem_check_wedge(struct drm_i915_private *dev_priv, + struct intel_engine_cs *engine, bool interruptible); static inline bool i915_reset_in_progress(struct i915_gpu_error *error) diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index e3cfed2..e6eb45d 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -80,12 +80,38 @@ static void i915_gem_info_remove_obj(struct drm_i915_private *dev_priv, spin_unlock(&dev_priv->mm.object_stat_lock); } +static inline int +i915_engine_reset_in_progress(struct drm_i915_private *dev_priv, + struct intel_engine_cs *engine) +{ + int ret = 0; + + if (engine) { + ret = !!(atomic_read(&dev_priv->ring[engine->id].hangcheck.flags) + & I915_ENGINE_RESET_IN_PROGRESS); + } else { + int i; + + for (i = 0; i < I915_NUM_RINGS; i++) + if (atomic_read(&dev_priv->ring[i].hangcheck.flags) + & I915_ENGINE_RESET_IN_PROGRESS) { + + ret = 1; + break; + } + } + + return ret; +} + static int -i915_gem_wait_for_error(struct i915_gpu_error *error) +i915_gem_wait_for_error(struct drm_i915_private *dev_priv) { int ret; + struct i915_gpu_error *error = &dev_priv->gpu_error; #define EXIT_COND (!i915_reset_in_progress(error) || \ + !i915_engine_reset_in_progress(dev_priv, NULL) || \ i915_terminally_wedged(error)) if (EXIT_COND) return 0; @@ -114,7 +140,7 @@ int i915_mutex_lock_interruptible(struct drm_device *dev) struct drm_i915_private *dev_priv = dev->dev_private; int ret; - ret = i915_gem_wait_for_error(&dev_priv->gpu_error); + 
ret = i915_gem_wait_for_error(dev_priv); if (ret) return ret; @@ -1110,10 +1136,15 @@ put_rpm: } int -i915_gem_check_wedge(struct i915_gpu_error *error, +i915_gem_check_wedge(struct drm_i915_private *dev_priv, + struct intel_engine_cs *engine, bool interruptible) { - if (i915_reset_in_progress(error)) { + struct i915_gpu_error *error = &dev_priv->gpu_error; + + if (i915_reset_in_progress(error) || + i915_engine_reset_in_progress(dev_priv, engine)) { + /* Non-interruptible callers can't handle -EAGAIN, hence return * -EIO unconditionally for these. */ if (!interruptible) @@ -1253,6 +1284,7 @@ int __i915_wait_request(struct drm_i915_gem_request *req, unsigned long timeout_expire; s64 before, now; int ret; + int reset_in_progress = 0; WARN(!intel_irqs_enabled(dev_priv), "IRQs disabled"); @@ -1297,11 +1329,17 @@ int __i915_wait_request(struct drm_i915_gem_request *req, /* We need to check whether any gpu reset happened in between * the caller grabbing the seqno and now ... */ - if (reset_counter != atomic_read(&dev_priv->gpu_error.reset_counter)) { + reset_in_progress = + i915_gem_check_wedge(ring->dev->dev_private, NULL, interruptible); + + if ((reset_counter != atomic_read(&dev_priv->gpu_error.reset_counter)) || + reset_in_progress) { + /* ... but upgrade the -EAGAIN to an -EIO if the gpu * is truly gone. 
*/ - ret = i915_gem_check_wedge(&dev_priv->gpu_error, interruptible); - if (ret == 0) + if (reset_in_progress) + ret = reset_in_progress; + else ret = -EAGAIN; break; } @@ -1470,7 +1508,7 @@ i915_wait_request(struct drm_i915_gem_request *req) BUG_ON(!mutex_is_locked(&dev->struct_mutex)); - ret = i915_gem_check_wedge(&dev_priv->gpu_error, interruptible); + ret = i915_gem_check_wedge(dev_priv, NULL, interruptible); if (ret) return ret; @@ -1560,7 +1598,7 @@ i915_gem_object_wait_rendering__nonblocking(struct drm_i915_gem_object *obj, if (!obj->active) return 0; - ret = i915_gem_check_wedge(&dev_priv->gpu_error, true); + ret = i915_gem_check_wedge(dev_priv, NULL, true); if (ret) return ret; @@ -4104,11 +4142,11 @@ i915_gem_ring_throttle(struct drm_device *dev, struct drm_file *file) unsigned reset_counter; int ret; - ret = i915_gem_wait_for_error(&dev_priv->gpu_error); + ret = i915_gem_wait_for_error(dev_priv); if (ret) return ret; - ret = i915_gem_check_wedge(&dev_priv->gpu_error, false); + ret = i915_gem_check_wedge(dev_priv, NULL, false); if (ret) return ret; diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c index fcec476..a2e56d4 100644 --- a/drivers/gpu/drm/i915/intel_lrc.c +++ b/drivers/gpu/drm/i915/intel_lrc.c @@ -1066,8 +1066,8 @@ int intel_logical_ring_begin(struct drm_i915_gem_request *req, int num_dwords) WARN_ON(req == NULL); dev_priv = req->ring->dev->dev_private; - - ret = i915_gem_check_wedge(&dev_priv->gpu_error, + ret = i915_gem_check_wedge(dev_priv, + req->ring, dev_priv->mm.interruptible); if (ret) return ret; diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c index def0dcf..f959326 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.c +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c @@ -2501,8 +2501,10 @@ int intel_ring_begin(struct drm_i915_gem_request *req, ring = req->ring; dev_priv = ring->dev->dev_private; - ret = i915_gem_check_wedge(&dev_priv->gpu_error, + ret = 
i915_gem_check_wedge(dev_priv, + ring, dev_priv->mm.interruptible); + if (ret) return ret;