
[v3,12/14] drm/i915/guc: Preemption! With GuC

Message ID 20171019183619.6235-13-michal.winiarski@intel.com (mailing list archive)
State New, archived

Commit Message

Michał Winiarski Oct. 19, 2017, 6:36 p.m. UTC
Pretty similar to what we have on execlists.
We're reusing most of the GEM code; however, due to GuC quirks we need a
couple of extra bits.
Preemption is implemented as a GuC action, and actions can be pretty slow.
Because of that, we're using a mutex to serialize them. Since we're
requesting preemption from the tasklet, the task of creating a workitem
and wrapping it in a GuC action is delegated to a worker.

To detect that preemption has finished, we're using an additional piece
of the HWSP, and since we're not getting context switch interrupts,
we're also adding a user interrupt.

The fact that our special preempt context has completed unfortunately
doesn't mean that we're ready to submit new work. We also need to wait
for GuC to finish its own processing.
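
In rough pseudocode, the flow is as follows (a simplified sketch reusing the
helper names from the patch below, not the exact code):

	/* tasklet, i915_guc_dequeue(): a higher priority request is pending */
	if (rb_entry(rb, struct i915_priolist, node)->priority >
	    max(port_request(port)->priotree.priority, 0)) {
		WRITE_ONCE(execlists->preempt, true);
		queue_work(guc->preempt_wq, &preempt_work->work);
	}

	/*
	 * worker, inject_preempt_context(): emit a GUC_PREEMPT_FINISHED write
	 * to the HWSP plus MI_USER_INTERRUPT into the preempt context ring,
	 * add a workitem and send INTEL_GUC_ACTION_REQUEST_PREEMPTION.
	 */

	/* tasklet, i915_guc_irq_handler(), on the resulting user interrupt */
	if (READ_ONCE(execlists->preempt) &&
	    intel_read_status_page(engine, I915_GEM_HWS_PREEMPT_INDEX) ==
	    GUC_PREEMPT_FINISHED) {
		execlists_cancel_port_requests(execlists);
		execlists_unwind_incomplete_requests(execlists);
		wait_for_guc_preempt_report(engine); /* GuC postprocessing */
		WRITE_ONCE(execlists->preempt, false);
	}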

v2: Don't compile out the wait for GuC, handle workqueue flush on reset,
no need for ordered workqueue, put on a reviewer hat when looking at my own
patches (Chris)
Move struct work around in intel_guc, move user interrupt outside of
conditional (Michał)
Keep ring around rather than chase through intel_context

v3: Extract WA for flushing ggtt writes to a helper (Chris)
Keep work_struct in intel_guc rather than engine (Michał)
Use ordered workqueue for inject_preempt worker to avoid GuC quirks.

Signed-off-by: Michał Winiarski <michal.winiarski@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Jeff McGee <jeff.mcgee@intel.com>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Oscar Mateo <oscar.mateo@intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 drivers/gpu/drm/i915/i915_drv.c            |   3 +-
 drivers/gpu/drm/i915/i915_gem.c            |  10 ++
 drivers/gpu/drm/i915/i915_guc_submission.c | 206 +++++++++++++++++++++++++++--
 drivers/gpu/drm/i915/intel_guc.h           |   8 ++
 drivers/gpu/drm/i915/intel_lrc.c           |   4 +-
 drivers/gpu/drm/i915/intel_lrc.h           |   1 -
 drivers/gpu/drm/i915/intel_ringbuffer.h    |   8 ++
 7 files changed, 222 insertions(+), 18 deletions(-)

Comments

Chris Wilson Oct. 19, 2017, 7:14 p.m. UTC | #1
Quoting Michał Winiarski (2017-10-19 19:36:17)
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index ef14c6d570dc..7d52baf4f3bd 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -2904,6 +2904,16 @@ i915_gem_reset_prepare_engine(struct intel_engine_cs *engine)
>         tasklet_kill(&engine->execlists.irq_tasklet);
>         tasklet_disable(&engine->execlists.irq_tasklet);
>  
> +       /*
> +        * We're using a worker to queue preemption requests from the tasklet in
> +        * GuC submission mode.
> +        * Even though the tasklet was disabled, we may still have a worker queued.
> +        * Let's make sure that all workers scheduled before disabling the
> +        * tasklet are completed before continuing with the reset.
> +        */
> +       if (i915_modparams.enable_guc_submission)

Don't we have engine->i915->guc.preempt_ctx or something a bit more
specific than a modparam?

> +               flush_workqueue(engine->i915->guc.preempt_wq);

Ok, after some thought, this is the preferred order. If we do the flush
early, we may end up with a worker queued before we kill the tasklet. Too
late and tasklet_action spins a little; that's better than parallel
writes into the hw.
-Chris
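
Spelling the ordering out (a sketch annotating the hunk above; only the
comments are new):

	/*
	 * Kill and disable the tasklet first, so that it can no longer
	 * queue preempt workers...
	 */
	tasklet_kill(&engine->execlists.irq_tasklet);
	tasklet_disable(&engine->execlists.irq_tasklet);

	/*
	 * ...and only then drain the workers queued before the tasklet died.
	 * Flushing earlier leaves a window for the still-live tasklet to
	 * queue yet another worker, i.e. a worker writing into the hw in
	 * parallel with the reset. Flushing this late merely costs a little
	 * tasklet_action spinning.
	 */
	if (i915_modparams.enable_guc_submission)
		flush_workqueue(engine->i915->guc.preempt_wq);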
Chris Wilson Oct. 19, 2017, 7:16 p.m. UTC | #2
Quoting Michał Winiarski (2017-10-19 19:36:17)
> diff --git a/drivers/gpu/drm/i915/intel_lrc.h b/drivers/gpu/drm/i915/intel_lrc.h
> index 689fde1a63a9..17182ce29674 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.h
> +++ b/drivers/gpu/drm/i915/intel_lrc.h
> @@ -107,7 +107,6 @@ intel_lr_context_descriptor(struct i915_gem_context *ctx,
>         return ctx->engine[engine->id].lrc_desc;
>  }
>  
> -
>  /* Execlists */
>  int intel_sanitize_enable_execlists(struct drm_i915_private *dev_priv,
>                                     int enable_execlists);
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index 8ad9a33e803b..5a68499bf3eb 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -525,6 +525,12 @@ struct intel_engine_cs {
>         u32 (*get_cmd_length_mask)(u32 cmd_header);
>  };
>  
> +void
> +execlists_cancel_port_requests(struct intel_engine_execlists * const execlists);
> +
> +void execlists_unwind_incomplete_requests(
> +               const struct intel_engine_execlists * const execlists);
> +
>  static inline unsigned int
>  execlists_num_ports(const struct intel_engine_execlists * const execlists)
>  {
> @@ -543,6 +549,8 @@ execlists_port_complete(struct intel_engine_execlists * const execlists,
>         memset(port + m, 0, sizeof(struct execlist_port));
>  }
>  
> +
> +
>  static inline unsigned int

You were doing such a good job at cleaning up the unwanted newlines....
-Chris
Chris Wilson Oct. 20, 2017, 9 a.m. UTC | #3
Quoting Michał Winiarski (2017-10-19 19:36:17)
> @@ -686,10 +802,23 @@ static void i915_guc_irq_handler(unsigned long data)
>         struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
>         struct intel_engine_execlists * const execlists = &engine->execlists;
>         struct execlist_port *port = execlists->port;
> -       const struct execlist_port * const last_port =
> -               &execlists->port[execlists->port_mask];
>         struct drm_i915_gem_request *rq;
>  
> +       if (READ_ONCE(execlists->preempt) &&
> +           intel_read_status_page(engine, I915_GEM_HWS_PREEMPT_INDEX) ==
> +           GUC_PREEMPT_FINISHED) {
> +               execlists_cancel_port_requests(&engine->execlists);
> +
> +               spin_lock_irq(&engine->timeline->lock);
> +               execlists_unwind_incomplete_requests(execlists);
> +               spin_unlock_irq(&engine->timeline->lock);
> +
> +               wait_for_guc_preempt_report(engine);
> +
> +               WRITE_ONCE(execlists->preempt, false);
> +               intel_write_status_page(engine, I915_GEM_HWS_PREEMPT_INDEX, 0);
> +       }
> +
>         rq = port_request(&port[0]);
>         while (rq && i915_gem_request_completed(rq)) {
>                 trace_i915_gem_request_out(rq);
> @@ -701,7 +830,7 @@ static void i915_guc_irq_handler(unsigned long data)
>                 rq = port_request(&port[0]);
>         }
>  
> -       if (!port_isset(last_port))
> +       if (!READ_ONCE(execlists->preempt))
>                 i915_guc_dequeue(engine);
>  }

    kworker/u9:1-1687  [000] d..1  2215.658429: inject_preempt_context: GUC WQ tail=330, head=320
    kworker/u9:1-1687  [000] d..1  2215.658429: inject_preempt_context: RING tail=1a0
    kworker/u9:1-1687  [000] ....  2215.658429: inject_preempt_context: PREEMPT: WQI ADDED ring=2, guc_id=1
    kworker/u9:1-1687  [000] ....  2215.658429: inject_preempt_context: PREEMPT: GUC ACTION SENDING ring=2, guc_id=1
          <idle>-0     [003] d.h1  2215.658456: gen8_cs_irq_handler: USER INTERRUPT ring=2, guc_id=1
          <idle>-0     [003] ..s1  2215.658460: i915_guc_irq_handler: PREEMPT: WAITING FOR GUC COMPLETE ring=2, guc_id=1
          <idle>-0     [000] d.h1  2215.658462: gen8_cs_irq_handler: USER INTERRUPT ring=2, guc_id=1
          <idle>-0     [003] ..s1  2215.658473: i915_guc_irq_handler: PREEMPT: FINISHED ring=2, guc_id=1
          <idle>-0     [003] d.s2  2215.658473: i915_gem_request_execute: dev=0, ring=2, ctx=2983, seqno=6, global=0
          <idle>-0     [003] d.s2  2215.658476: i915_gem_request_in: dev=0, ring=2, ctx=2983, seqno=6, global=916, port=0
          <idle>-0     [003] d.s2  2215.658476: i915_gem_request_execute: dev=0, ring=2, ctx=3298, seqno=6, global=0
          <idle>-0     [003] d.s2  2215.658477: i915_gem_request_in: dev=0, ring=2, ctx=3298, seqno=6, global=917, port=1
          <idle>-0     [002] d.h1  2215.658510: gen8_cs_irq_handler: USER INTERRUPT ring=2, guc_id=1
          <idle>-0     [000] d.h1  2215.658517: gen8_cs_irq_handler: USER INTERRUPT ring=2, guc_id=1
          <idle>-0     [002] ..s1  2215.658520: i915_gem_request_out: dev=0, ring=2, ctx=2983, seqno=6, global=916
          <idle>-0     [002] ..s1  2215.658521: i915_gem_request_out: dev=0, ring=2, ctx=3298, seqno=6, global=917
          <idle>-0     [002] d.s2  2215.658521: i915_gem_request_execute: dev=0, ring=2, ctx=3323, seqno=5, global=0
          <idle>-0     [002] d.s2  2215.658522: i915_gem_request_in: dev=0, ring=2, ctx=3323, seqno=5, global=918, port=0
          <idle>-0     [002] d.s2  2215.658523: i915_gem_request_execute: dev=0, ring=2, ctx=3363, seqno=4, global=0
          <idle>-0     [002] d.s2  2215.658524: i915_gem_request_in: dev=0, ring=2, ctx=3363, seqno=4, global=919, port=1
          <idle>-0     [002] d.h1  2215.658569: gen8_cs_irq_handler: USER INTERRUPT ring=2, guc_id=1
          <idle>-0     [002] ..s1  2215.658572: i915_gem_request_out: dev=0, ring=2, ctx=3323, seqno=5, global=918
          <idle>-0     [002] ..s1  2215.658572: i915_gem_request_out: dev=0, ring=2, ctx=3363, seqno=4, global=919
          <idle>-0     [002] d.s2  2215.658573: i915_gem_request_execute: dev=0, ring=2, ctx=3673, seqno=2, global=0
          <idle>-0     [000] d.h1  2215.658574: gen8_cs_irq_handler: USER INTERRUPT ring=2, guc_id=1
          <idle>-0     [002] d.s2  2215.658576: i915_gem_request_in: dev=0, ring=2, ctx=3673, seqno=2, global=920, port=0
          <idle>-0     [002] d.s2  2215.658576: i915_gem_request_execute: dev=0, ring=2, ctx=2783, seqno=3, global=0
          <idle>-0     [002] d.s2  2215.658578: i915_gem_request_in: dev=0, ring=2, ctx=2783, seqno=3, global=921, port=1
     ksoftirqd/0-7     [000] d.H2  2215.658587: i915_gem_request_submit: dev=0, ring=2, ctx=3003, seqno=3, global=0
     ksoftirqd/0-7     [000] d.s1  2215.658591: i915_guc_irq_handler: PREEMPT: WORKER QUEUED ring=2, guc_id=1
          <idle>-0     [002] d.h1  2215.658649: gen8_cs_irq_handler: USER INTERRUPT ring=2, guc_id=1
          <idle>-0     [000] d.h1  2215.658656: gen8_cs_irq_handler: USER INTERRUPT ring=2, guc_id=1
          <idle>-0     [002] ..s1  2215.658659: i915_guc_irq_handler: PREEMPT: IN PROGRESS, NOT DONE ring=2, guc_id=1
          <idle>-0     [002] ..s1  2215.658659: i915_gem_request_out: dev=0, ring=2, ctx=3673, seqno=2, global=920
          <idle>-0     [002] ..s1  2215.658660: i915_gem_request_out: dev=0, ring=2, ctx=2783, seqno=3, global=921
          <idle>-0     [001] d.h2  2215.658663: i915_gem_request_submit: dev=0, ring=2, ctx=4013, seqno=6, global=0
          <idle>-0     [002] dNh2  2215.658725: i915_gem_request_submit: dev=0, ring=2, ctx=2928, seqno=6, global=0
    kworker/u9:1-1687  [000] d..1  2215.660436: inject_preempt_context: GUC WQ tail=340, head=330
    kworker/u9:1-1687  [000] d..1  2215.660436: inject_preempt_context: RING tail=1c0
    kworker/u9:1-1687  [000] ....  2215.660437: inject_preempt_context: PREEMPT: WQI ADDED ring=2, guc_id=1
    kworker/u9:1-1687  [000] ....  2215.660437: inject_preempt_context: PREEMPT: GUC ACTION SENDING ring=2, guc_id=1

This suggests that the decision to queue the preempt-worker is racy, and
not evidence that the USER_INTERRUPT went missing. Do you have a better
example?
-Chris
Chris Wilson Oct. 20, 2017, 5 p.m. UTC | #4
Quoting Michał Winiarski (2017-10-19 19:36:17)
> @@ -686,10 +802,23 @@ static void i915_guc_irq_handler(unsigned long data)
>         struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
>         struct intel_engine_execlists * const execlists = &engine->execlists;
>         struct execlist_port *port = execlists->port;
> -       const struct execlist_port * const last_port =
> -               &execlists->port[execlists->port_mask];
>         struct drm_i915_gem_request *rq;
>  
> +       if (READ_ONCE(execlists->preempt) &&
> +           intel_read_status_page(engine, I915_GEM_HWS_PREEMPT_INDEX) ==
> +           GUC_PREEMPT_FINISHED) {
> +               execlists_cancel_port_requests(&engine->execlists);
> +
> +               spin_lock_irq(&engine->timeline->lock);
> +               execlists_unwind_incomplete_requests(execlists);
> +               spin_unlock_irq(&engine->timeline->lock);
> +
> +               wait_for_guc_preempt_report(engine);
> +
> +               WRITE_ONCE(execlists->preempt, false);
> +               intel_write_status_page(engine, I915_GEM_HWS_PREEMPT_INDEX, 0);
> +       }
> +
>         rq = port_request(&port[0]);
>         while (rq && i915_gem_request_completed(rq)) {
>                 trace_i915_gem_request_out(rq);

I don't think we have the ordering right here for
cancel_port_requests(). It should only be called for what are
effectively incomplete requests, or else the
INTEL_CONTEXT_SCHEDULE_PREEMPTED is a misnomer. It doesn't look like it
makes any great difference in the overall scheme of things, but I'm tempted
to put a GEM_BUG_ON(i915_gem_request_completed(rq)) there.
-Chris
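
For illustration, the assertion would sit in the cancel loop, roughly like
this (a sketch of the loop's shape, not the actual function body):

	unsigned int n = execlists_num_ports(execlists);

	while (n-- && port_isset(port)) {
		struct drm_i915_gem_request *rq = port_request(port);

		/* a completed request has nothing left to preempt */
		GEM_BUG_ON(i915_gem_request_completed(rq));
		...
		port++;
	}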
Daniele Ceraolo Spurio Oct. 23, 2017, 11:14 p.m. UTC | #5
<snip>

> +
> +#define GUC_PREEMPT_FINISHED 0x1
> +#define GUC_PREEMPT_BREADCRUMB_DWORDS 0x8
> +static void inject_preempt_context(struct work_struct *work)
> +{
> +	struct guc_preempt_work *preempt_work =
> +		container_of(work, typeof(*preempt_work), work);
> +	struct intel_engine_cs *engine = preempt_work->engine;
> +	struct intel_guc *guc = &engine->i915->guc;
> +	struct i915_guc_client *client = guc->client[PREEMPT];
> +	struct intel_ring *ring = client->owner->engine[engine->id].ring;
> +	u32 ctx_desc = lower_32_bits(intel_lr_context_descriptor(client->owner,
> +								 engine));
> +	u32 *cs = ring->vaddr + ring->tail;
> +	u32 data[7];
> +
> +	if (engine->id == RCS) {
> +		cs = gen8_emit_ggtt_write_rcs(cs, GUC_PREEMPT_FINISHED,
> +				intel_hws_preempt_done_address(engine));
> +	} else {
> +		cs = gen8_emit_ggtt_write(cs, GUC_PREEMPT_FINISHED,
> +				intel_hws_preempt_done_address(engine));
> +		*cs++ = MI_NOOP;
> +		*cs++ = MI_NOOP;
> +	}
> +	*cs++ = MI_USER_INTERRUPT;
> +	*cs++ = MI_NOOP;
> +
> +	GEM_BUG_ON(!IS_ALIGNED(ring->size,
> +			       GUC_PREEMPT_BREADCRUMB_DWORDS * sizeof(u32)));
> +	GEM_BUG_ON((void*)cs - (ring->vaddr + ring->tail) !=
> +		   GUC_PREEMPT_BREADCRUMB_DWORDS * sizeof(u32));
> +
> +	ring->tail += GUC_PREEMPT_BREADCRUMB_DWORDS * sizeof(u32);
> +	ring->tail &= (ring->size - 1);
> +
> +	flush_ggtt_writes(ring->vma);
> +
> +	spin_lock_irq(&client->wq_lock);
> +	guc_wq_item_append(client, engine->guc_id, ctx_desc,
> +			   ring->tail / sizeof(u64), 0);
> +	spin_unlock_irq(&client->wq_lock);
> +
> +	data[0] = INTEL_GUC_ACTION_REQUEST_PREEMPTION;
> +	data[1] = client->stage_id;
> +	data[2] = INTEL_GUC_PREEMPT_OPTION_IMMEDIATE |

I was looking at how the GuC handles these flags, and from what I've seen
INTEL_GUC_PREEMPT_OPTION_IMMEDIATE doesn't seem to be used anywhere in
GuC FW anymore (even if it still exists in the interface), so I believe
it should be safe to drop it (see the sketch after the quoted hunk below).

Daniele

> +		  INTEL_GUC_PREEMPT_OPTION_DROP_WORK_Q |
> +		  INTEL_GUC_PREEMPT_OPTION_DROP_SUBMIT_Q;
> +	data[3] = engine->guc_id;
> +	data[4] = guc->client[SUBMIT]->priority;
> +	data[5] = guc->client[SUBMIT]->stage_id;
> +	data[6] = guc_ggtt_offset(guc->shared_data);
> +
> +	if (WARN_ON(intel_guc_send(guc, data, ARRAY_SIZE(data)))) {
> +		WRITE_ONCE(engine->execlists.preempt, false);
> +		tasklet_schedule(&engine->execlists.irq_tasklet);
> +	}
> +}
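
Concretely, dropping the flag would leave the action options as (a sketch,
assuming nothing else in the action changes):

	data[2] = INTEL_GUC_PREEMPT_OPTION_DROP_WORK_Q |
		  INTEL_GUC_PREEMPT_OPTION_DROP_SUBMIT_Q;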
Chris Wilson Oct. 25, 2017, 6:15 p.m. UTC | #6
Quoting Michał Winiarski (2017-10-19 19:36:17)
> Pretty similar to what we have on execlists.
> We're reusing most of the GEM code; however, due to GuC quirks we need a
> couple of extra bits.
> Preemption is implemented as a GuC action, and actions can be pretty slow.
> Because of that, we're using a mutex to serialize them. Since we're
> requesting preemption from the tasklet, the task of creating a workitem
> and wrapping it in a GuC action is delegated to a worker.
> 
> To detect that preemption has finished, we're using an additional piece
> of the HWSP, and since we're not getting context switch interrupts,
> we're also adding a user interrupt.
> 
> The fact that our special preempt context has completed unfortunately
> doesn't mean that we're ready to submit new work. We also need to wait
> for GuC to finish its own processing.

Should we be concerned about the latency of a preemption switch here?
For execlists, the impact on a stress test like whisper/contexts-priority
is barely noticeable; the same cannot be said here.

Hmm, I guess I need to amend gem_exec_latency to include a measurement.
And benchmarks/ needs some TLC. Not that I really expect it to show up
at e.g. the GL client level, but one day someone may care enough to
complain; better to be prepared.
-Chris

Patch

diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index 7b871802ae36..af745749509c 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -373,8 +373,7 @@  static int i915_getparam(struct drm_device *dev, void *data,
 			value |= I915_SCHEDULER_CAP_PRIORITY;
 
 			if (HAS_LOGICAL_RING_PREEMPTION(dev_priv) &&
-			    i915_modparams.enable_execlists &&
-			    !i915_modparams.enable_guc_submission)
+			    i915_modparams.enable_execlists)
 				value |= I915_SCHEDULER_CAP_PREEMPTION;
 		}
 		break;
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index ef14c6d570dc..7d52baf4f3bd 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2904,6 +2904,16 @@  i915_gem_reset_prepare_engine(struct intel_engine_cs *engine)
 	tasklet_kill(&engine->execlists.irq_tasklet);
 	tasklet_disable(&engine->execlists.irq_tasklet);
 
+	/*
+	 * We're using a worker to queue preemption requests from the tasklet in
+	 * GuC submission mode.
+	 * Even though the tasklet was disabled, we may still have a worker queued.
+	 * Let's make sure that all workers scheduled before disabling the
+	 * tasklet are completed before continuing with the reset.
+	 */
+	if (i915_modparams.enable_guc_submission)
+		flush_workqueue(engine->i915->guc.preempt_wq);
+
 	if (engine->irq_seqno_barrier)
 		engine->irq_seqno_barrier(engine);
 
diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c
index e47a5000fc03..a11ed4deff4b 100644
--- a/drivers/gpu/drm/i915/i915_guc_submission.c
+++ b/drivers/gpu/drm/i915/i915_guc_submission.c
@@ -559,6 +559,105 @@  static void guc_add_request(struct intel_guc *guc,
 	spin_unlock(&client->wq_lock);
 }
 
+/*
+ * When we're doing submissions using the regular execlists backend, writing
+ * to ELSP from the CPU side is enough to make sure that writes to ringbuffer
+ * pages pinned in the mappable aperture portion of GGTT are visible to the
+ * command streamer. Writes done by GuC on our behalf do not guarantee such
+ * ordering; therefore, to ensure the flush, we're issuing a POSTING READ.
+ */
+static void flush_ggtt_writes(struct i915_vma *vma)
+{
+	struct drm_i915_private *dev_priv = to_i915(vma->obj->base.dev);
+
+	if (i915_vma_is_map_and_fenceable(vma))
+		POSTING_READ_FW(GUC_STATUS);
+}
+
+#define GUC_PREEMPT_FINISHED 0x1
+#define GUC_PREEMPT_BREADCRUMB_DWORDS 0x8
+static void inject_preempt_context(struct work_struct *work)
+{
+	struct guc_preempt_work *preempt_work =
+		container_of(work, typeof(*preempt_work), work);
+	struct intel_engine_cs *engine = preempt_work->engine;
+	struct intel_guc *guc = &engine->i915->guc;
+	struct i915_guc_client *client = guc->client[PREEMPT];
+	struct intel_ring *ring = client->owner->engine[engine->id].ring;
+	u32 ctx_desc = lower_32_bits(intel_lr_context_descriptor(client->owner,
+								 engine));
+	u32 *cs = ring->vaddr + ring->tail;
+	u32 data[7];
+
+	if (engine->id == RCS) {
+		cs = gen8_emit_ggtt_write_rcs(cs, GUC_PREEMPT_FINISHED,
+				intel_hws_preempt_done_address(engine));
+	} else {
+		cs = gen8_emit_ggtt_write(cs, GUC_PREEMPT_FINISHED,
+				intel_hws_preempt_done_address(engine));
+		*cs++ = MI_NOOP;
+		*cs++ = MI_NOOP;
+	}
+	*cs++ = MI_USER_INTERRUPT;
+	*cs++ = MI_NOOP;
+
+	GEM_BUG_ON(!IS_ALIGNED(ring->size,
+			       GUC_PREEMPT_BREADCRUMB_DWORDS * sizeof(u32)));
+	GEM_BUG_ON((void*)cs - (ring->vaddr + ring->tail) !=
+		   GUC_PREEMPT_BREADCRUMB_DWORDS * sizeof(u32));
+
+	ring->tail += GUC_PREEMPT_BREADCRUMB_DWORDS * sizeof(u32);
+	ring->tail &= (ring->size - 1);
+
+	flush_ggtt_writes(ring->vma);
+
+	spin_lock_irq(&client->wq_lock);
+	guc_wq_item_append(client, engine->guc_id, ctx_desc,
+			   ring->tail / sizeof(u64), 0);
+	spin_unlock_irq(&client->wq_lock);
+
+	data[0] = INTEL_GUC_ACTION_REQUEST_PREEMPTION;
+	data[1] = client->stage_id;
+	data[2] = INTEL_GUC_PREEMPT_OPTION_IMMEDIATE |
+		  INTEL_GUC_PREEMPT_OPTION_DROP_WORK_Q |
+		  INTEL_GUC_PREEMPT_OPTION_DROP_SUBMIT_Q;
+	data[3] = engine->guc_id;
+	data[4] = guc->client[SUBMIT]->priority;
+	data[5] = guc->client[SUBMIT]->stage_id;
+	data[6] = guc_ggtt_offset(guc->shared_data);
+
+	if (WARN_ON(intel_guc_send(guc, data, ARRAY_SIZE(data)))) {
+		WRITE_ONCE(engine->execlists.preempt, false);
+		tasklet_schedule(&engine->execlists.irq_tasklet);
+	}
+}
+
+/*
+ * We're using a user interrupt and a HWSP value to mark that preemption has
+ * finished and the GPU is idle. Normally, we could unwind and continue as on
+ * the execlists submission path. Unfortunately, with GuC we also need to wait
+ * for it to finish its own postprocessing before attempting to submit. Otherwise
+ * GuC may silently ignore our submissions, and thus we risk losing a request at
+ * best, executing out-of-order and causing a kernel panic at worst.
+ */
+#define GUC_PREEMPT_POSTPROCESS_DELAY_MS 10
+static void wait_for_guc_preempt_report(struct intel_engine_cs *engine)
+{
+	struct intel_guc *guc = &engine->i915->guc;
+	struct guc_shared_ctx_data *data = guc->shared_data_vaddr;
+	struct guc_ctx_report *report = &data->preempt_ctx_report[engine->guc_id];
+
+	WARN_ON(wait_for_atomic(report->report_return_status ==
+				INTEL_GUC_REPORT_STATUS_COMPLETE,
+				GUC_PREEMPT_POSTPROCESS_DELAY_MS));
+	/* GuC is expecting that we're also going to clear the affected context
+	 * counter; let's also reset the return status to not depend on GuC
+	 * resetting it after receiving another preempt action */
+	report->affected_count = 0;
+	report->report_return_status = INTEL_GUC_REPORT_STATUS_UNKNOWN;
+}
+
+
 /**
  * i915_guc_submit() - Submit commands through GuC
  * @engine: engine associated with the commands
@@ -568,8 +667,7 @@  static void guc_add_request(struct intel_guc *guc,
  */
 static void i915_guc_submit(struct intel_engine_cs *engine)
 {
-	struct drm_i915_private *dev_priv = engine->i915;
-	struct intel_guc *guc = &dev_priv->guc;
+	struct intel_guc *guc = &engine->i915->guc;
 	struct intel_engine_execlists * const execlists = &engine->execlists;
 	struct execlist_port *port = execlists->port;
 	unsigned int n;
@@ -582,8 +680,7 @@  static void i915_guc_submit(struct intel_engine_cs *engine)
 		if (rq && count == 0) {
 			port_set(&port[n], port_pack(rq, ++count));
 
-			if (i915_vma_is_map_and_fenceable(rq->ring->vma))
-				POSTING_READ_FW(GUC_STATUS);
+			flush_ggtt_writes(rq->ring->vma);
 
 			guc_add_request(guc, rq);
 		}
@@ -635,13 +732,31 @@  static void i915_guc_dequeue(struct intel_engine_cs *engine)
 	bool submit = false;
 	struct rb_node *rb;
 
-	if (port_isset(port))
-		port++;
-
 	spin_lock_irq(&engine->timeline->lock);
 	rb = execlists->first;
 	GEM_BUG_ON(rb_first(&execlists->queue) != rb);
-	while (rb) {
+
+	if (!rb)
+		goto unlock;
+
+	if (HAS_LOGICAL_RING_PREEMPTION(engine->i915) && port_isset(port)) {
+		struct guc_preempt_work *preempt_work =
+			&engine->i915->guc.preempt_work[engine->id];
+
+		if (rb_entry(rb, struct i915_priolist, node)->priority >
+		    max(port_request(port)->priotree.priority, 0)) {
+			WRITE_ONCE(execlists->preempt, true);
+			queue_work(engine->i915->guc.preempt_wq,
+				   &preempt_work->work);
+			goto unlock;
+		} else if (port_isset(last_port)) {
+			goto unlock;
+		}
+
+		port++;
+	}
+
+	do {
 		struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
 		struct drm_i915_gem_request *rq, *rn;
 
@@ -671,13 +786,14 @@  static void i915_guc_dequeue(struct intel_engine_cs *engine)
 		INIT_LIST_HEAD(&p->requests);
 		if (p->priority != I915_PRIORITY_NORMAL)
 			kmem_cache_free(engine->i915->priorities, p);
-	}
+	} while (rb);
 done:
 	execlists->first = rb;
 	if (submit) {
 		port_assign(port, last);
 		i915_guc_submit(engine);
 	}
+unlock:
 	spin_unlock_irq(&engine->timeline->lock);
 }
 
@@ -686,10 +802,23 @@  static void i915_guc_irq_handler(unsigned long data)
 	struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
 	struct intel_engine_execlists * const execlists = &engine->execlists;
 	struct execlist_port *port = execlists->port;
-	const struct execlist_port * const last_port =
-		&execlists->port[execlists->port_mask];
 	struct drm_i915_gem_request *rq;
 
+	if (READ_ONCE(execlists->preempt) &&
+	    intel_read_status_page(engine, I915_GEM_HWS_PREEMPT_INDEX) ==
+	    GUC_PREEMPT_FINISHED) {
+		execlists_cancel_port_requests(&engine->execlists);
+
+		spin_lock_irq(&engine->timeline->lock);
+		execlists_unwind_incomplete_requests(execlists);
+		spin_unlock_irq(&engine->timeline->lock);
+
+		wait_for_guc_preempt_report(engine);
+
+		WRITE_ONCE(execlists->preempt, false);
+		intel_write_status_page(engine, I915_GEM_HWS_PREEMPT_INDEX, 0);
+	}
+
 	rq = port_request(&port[0]);
 	while (rq && i915_gem_request_completed(rq)) {
 		trace_i915_gem_request_out(rq);
@@ -701,7 +830,7 @@  static void i915_guc_irq_handler(unsigned long data)
 		rq = port_request(&port[0]);
 	}
 
-	if (!port_isset(last_port))
+	if (!READ_ONCE(execlists->preempt))
 		i915_guc_dequeue(engine);
 }
 
@@ -1073,6 +1202,50 @@  static void guc_ads_destroy(struct intel_guc *guc)
 	i915_vma_unpin_and_release(&guc->ads_vma);
 }
 
+static int guc_preempt_work_create(struct intel_guc *guc)
+{
+	struct drm_i915_private *dev_priv = guc_to_i915(guc);
+	struct intel_engine_cs *engine;
+	enum intel_engine_id id;
+
+	/*
+	 * Even though both sending a GuC action and adding a new workitem
+	 * to the GuC workqueue are serialized (each with its own locking),
+	 * since we're using multiple engines, it's possible that we're going
+	 * to issue a preempt request with two (or more - each for a
+	 * different engine) workitems in the GuC queue. In this situation,
+	 * GuC may submit all of them, which will make us very confused.
+	 * Our preemption contexts may even already be complete - before we
+	 * even had the chance to send the preempt action to GuC! Rather
+	 * than introducing yet another lock, we can just use an ordered
+	 * workqueue to make sure we're always sending a single preemption
+	 * request with a single workitem.
+	 */
+	guc->preempt_wq = alloc_ordered_workqueue("i915-guc_preempt",
+						  WQ_HIGHPRI);
+	if (!guc->preempt_wq)
+		return -ENOMEM;
+
+	for_each_engine(engine, dev_priv, id) {
+		guc->preempt_work[id].engine = engine;
+		INIT_WORK(&guc->preempt_work[id].work, inject_preempt_context);
+	}
+
+	return 0;
+}
+
+static void guc_preempt_work_destroy(struct intel_guc *guc)
+{
+	struct drm_i915_private *dev_priv = guc_to_i915(guc);
+	struct intel_engine_cs *engine;
+	enum intel_engine_id id;
+
+	for_each_engine(engine, dev_priv, id)
+		cancel_work_sync(&guc->preempt_work[id].work);
+
+	destroy_workqueue(guc->preempt_wq);
+}
+
 /*
  * Set up the memory resources to be shared with the GuC (via the GGTT)
  * at firmware loading time.
@@ -1097,12 +1270,18 @@  int i915_guc_submission_init(struct drm_i915_private *dev_priv)
 	if (ret < 0)
 		goto err_shared_data;
 
+	ret = guc_preempt_work_create(guc);
+	if (ret)
+		goto err_log;
+
 	ret = guc_ads_create(guc);
 	if (ret < 0)
-		goto err_log;
+		goto err_wq;
 
 	return 0;
 
+err_wq:
+	guc_preempt_work_destroy(guc);
 err_log:
 	intel_guc_log_destroy(guc);
 err_shared_data:
@@ -1117,6 +1296,7 @@  void i915_guc_submission_fini(struct drm_i915_private *dev_priv)
 	struct intel_guc *guc = &dev_priv->guc;
 
 	guc_ads_destroy(guc);
+	guc_preempt_work_destroy(guc);
 	intel_guc_log_destroy(guc);
 	guc_shared_data_destroy(guc);
 	guc_stage_desc_pool_destroy(guc);
diff --git a/drivers/gpu/drm/i915/intel_guc.h b/drivers/gpu/drm/i915/intel_guc.h
index 15eadf27b7ae..7273a6be7dc1 100644
--- a/drivers/gpu/drm/i915/intel_guc.h
+++ b/drivers/gpu/drm/i915/intel_guc.h
@@ -46,6 +46,11 @@  enum i915_guc_client_id {
 	PREEMPT
 };
 
+struct guc_preempt_work {
+	struct intel_engine_cs *engine;
+	struct work_struct work;
+};
+
 struct intel_guc {
 	struct intel_uc_fw fw;
 	struct intel_guc_log log;
@@ -66,6 +71,9 @@  struct intel_guc {
 
 	struct i915_guc_client *client[I915_GUC_NUM_CLIENTS];
 
+	struct guc_preempt_work preempt_work[I915_NUM_ENGINES];
+	struct workqueue_struct *preempt_wq;
+
 	DECLARE_BITMAP(doorbell_bitmap, GUC_NUM_DOORBELLS);
 	/* Cyclic counter mod pagesize	*/
 	u32 db_cacheline;
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index dd4708904c85..d2327b36cf71 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -354,7 +354,7 @@  static void unwind_wa_tail(struct drm_i915_gem_request *rq)
 	assert_ring_tail_valid(rq->ring, rq->tail);
 }
 
-static void execlists_unwind_incomplete_requests(
+void execlists_unwind_incomplete_requests(
 		const struct intel_engine_execlists * const execlists)
 {
 	struct intel_engine_cs *engine =
@@ -685,7 +685,7 @@  static void execlists_dequeue(struct intel_engine_cs *engine)
 		execlists_submit_ports(engine);
 }
 
-static void
+void
 execlists_cancel_port_requests(struct intel_engine_execlists * const execlists)
 {
 	struct execlist_port *port = execlists->port;
diff --git a/drivers/gpu/drm/i915/intel_lrc.h b/drivers/gpu/drm/i915/intel_lrc.h
index 689fde1a63a9..17182ce29674 100644
--- a/drivers/gpu/drm/i915/intel_lrc.h
+++ b/drivers/gpu/drm/i915/intel_lrc.h
@@ -107,7 +107,6 @@  intel_lr_context_descriptor(struct i915_gem_context *ctx,
 	return ctx->engine[engine->id].lrc_desc;
 }
 
-
 /* Execlists */
 int intel_sanitize_enable_execlists(struct drm_i915_private *dev_priv,
 				    int enable_execlists);
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 8ad9a33e803b..5a68499bf3eb 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -525,6 +525,12 @@  struct intel_engine_cs {
 	u32 (*get_cmd_length_mask)(u32 cmd_header);
 };
 
+void
+execlists_cancel_port_requests(struct intel_engine_execlists * const execlists);
+
+void execlists_unwind_incomplete_requests(
+		const struct intel_engine_execlists * const execlists);
+
 static inline unsigned int
 execlists_num_ports(const struct intel_engine_execlists * const execlists)
 {
@@ -543,6 +549,8 @@  execlists_port_complete(struct intel_engine_execlists * const execlists,
 	memset(port + m, 0, sizeof(struct execlist_port));
 }
 
+
+
 static inline unsigned int
 intel_engine_flag(const struct intel_engine_cs *engine)
 {