@@ -1884,6 +1884,19 @@ void i915_reset(struct drm_i915_private *dev_priv)
goto finish;
}
+/**
+ * i915_reset_engine - reset GPU engine to recover from a hang
+ * @engine: engine to reset
+ *
+ * Reset a specific GPU engine. Useful if a hang is detected.
+ * Returns zero on successful reset or otherwise an error code.
+ */
+int i915_reset_engine(struct intel_engine_cs *engine)
+{
+ /* FIXME: replace me with engine reset sequence */
+ return -ENODEV;
+}
+
static int i915_pm_suspend(struct device *kdev)
{
struct pci_dev *pdev = to_pci_dev(kdev);
@@ -753,6 +753,7 @@ struct intel_csr {
func(has_csr); \
func(has_ddi); \
func(has_dp_mst); \
+ func(has_reset_engine); \
func(has_fbc); \
func(has_fpga_dbg); \
func(has_full_ppgtt); \
@@ -1548,6 +1549,12 @@ struct i915_gpu_error {
* inspect the bit and do the reset directly, otherwise the worker
* waits for the struct_mutex.
*
+ * #I915_RESET_ENGINE[num_engines] - Since the driver doesn't need to
+ * acquire the struct_mutex to reset an engine, we need an explicit
+ * flag to prevent two concurrent reset attempts in the same engine.
+ * As the number of engines continues to grow, allocate the flags from
+ * the most significant bits.
+ *
* #I915_WEDGED - If reset fails and we can no longer use the GPU,
* we set the #I915_WEDGED bit. Prior to command submission, e.g.
* i915_gem_request_alloc(), this bit is checked and the sequence
@@ -1557,6 +1564,7 @@ struct i915_gpu_error {
#define I915_RESET_BACKOFF 0
#define I915_RESET_HANDOFF 1
#define I915_WEDGED (BITS_PER_LONG - 1)
+#define I915_RESET_ENGINE (I915_WEDGED - I915_NUM_ENGINES)
/**
* Waitqueue to signal when a hang is detected. Used to for waiters
@@ -3056,6 +3064,8 @@ extern void i915_driver_unload(struct drm_device *dev);
extern int intel_gpu_reset(struct drm_i915_private *dev_priv, u32 engine_mask);
extern bool intel_has_gpu_reset(struct drm_i915_private *dev_priv);
extern void i915_reset(struct drm_i915_private *dev_priv);
+extern int i915_reset_engine(struct intel_engine_cs *engine);
+extern bool intel_has_reset_engine(struct drm_i915_private *dev_priv);
extern int intel_guc_reset(struct drm_i915_private *dev_priv);
extern void intel_engine_init_hangcheck(struct intel_engine_cs *engine);
extern void intel_hangcheck_init(struct drm_i915_private *dev_priv);
@@ -2722,6 +2722,8 @@ void i915_handle_error(struct drm_i915_private *dev_priv,
u32 engine_mask,
const char *fmt, ...)
{
+ struct intel_engine_cs *engine;
+ unsigned int tmp;
va_list args;
char error_msg[80];
@@ -2744,12 +2746,55 @@ void i915_handle_error(struct drm_i915_private *dev_priv,
if (!engine_mask)
goto out;
+ /*
+ * Try engine reset when available. We fall back to full reset if
+ * single reset fails.
+ */
+ if (intel_has_reset_engine(dev_priv)) {
+ for_each_engine_masked(engine, dev_priv, engine_mask, tmp) {
+ BUILD_BUG_ON(I915_RESET_HANDOFF >= I915_RESET_ENGINE);
+ if (test_and_set_bit(I915_RESET_ENGINE + engine->id,
+ &dev_priv->gpu_error.flags))
+ continue;
+
+ if (i915_reset_engine(engine) == 0)
+ engine_mask &= ~engine_mask;
+
+ /*
+ * Clear unconditionally because full reset won't
+ * care about reset-engine flags.
+ */
+ clear_bit(I915_RESET_ENGINE + engine->id,
+ &dev_priv->gpu_error.flags);
+ wake_up_bit(&dev_priv->gpu_error.flags,
+ I915_RESET_ENGINE + engine->id);
+ }
+
+ if (!engine_mask)
+ goto out;
+ }
+
+ /* full reset needs the mutex, stop any other user trying to do so */
if (test_and_set_bit(I915_RESET_BACKOFF,
&dev_priv->gpu_error.flags))
goto out;
+ /* prevent any other reset-engine attempt */
+ for_each_engine(engine, dev_priv, tmp) {
+ while (test_and_set_bit(I915_RESET_ENGINE + engine->id,
+ &dev_priv->gpu_error.flags))
+ wait_on_bit(&dev_priv->gpu_error.flags,
+ I915_RESET_ENGINE + engine->id,
+ TASK_UNINTERRUPTIBLE);
+ }
+
i915_reset_and_wakeup(dev_priv);
+ for_each_engine(engine, dev_priv, tmp) {
+ clear_bit(I915_RESET_ENGINE + engine->id,
+ &dev_priv->gpu_error.flags);
+ }
+
out:
intel_runtime_pm_put(dev_priv);
}
@@ -310,7 +310,8 @@ static const struct intel_device_info intel_haswell_info = {
BDW_COLORS, \
.has_logical_ring_contexts = 1, \
.has_full_48bit_ppgtt = 1, \
- .has_64bit_reloc = 1
+ .has_64bit_reloc = 1, \
+ .has_reset_engine = 1
static const struct intel_device_info intel_broadwell_info = {
BDW_FEATURES,
@@ -341,6 +342,7 @@ static const struct intel_device_info intel_cherryview_info = {
.has_gmch_display = 1,
.has_aliasing_ppgtt = 1,
.has_full_ppgtt = 1,
+ .has_reset_engine = 1,
.display_mmio_offset = VLV_DISPLAY_BASE,
GEN_CHV_PIPEOFFSETS,
CURSOR_OFFSETS,
@@ -388,6 +390,7 @@ static const struct intel_device_info intel_skylake_gt3_info = {
.has_aliasing_ppgtt = 1, \
.has_full_ppgtt = 1, \
.has_full_48bit_ppgtt = 1, \
+ .has_reset_engine = 1, \
GEN_DEFAULT_PIPEOFFSETS, \
IVB_CURSOR_OFFSETS, \
BDW_COLORS
@@ -1719,6 +1719,17 @@ bool intel_has_gpu_reset(struct drm_i915_private *dev_priv)
return intel_get_gpu_reset(dev_priv) != NULL;
}
+/*
+ * When GuC submission is enabled, GuC manages ELSP and can initiate the
+ * engine reset too. For now, fall back to full GPU reset if it is enabled.
+ */
+bool intel_has_reset_engine(struct drm_i915_private *dev_priv)
+{
+ return (dev_priv->info.has_reset_engine &&
+ !dev_priv->guc.execbuf_client &&
+ i915.reset >= 2);
+}
+
int intel_guc_reset(struct drm_i915_private *dev_priv)
{
int ret;