
[09/10] drm/i915/guc: Preemption! With GuC.

Message ID 20171005092005.10559-4-michal.winiarski@intel.com (mailing list archive)
State New, archived

Commit Message

Michał Winiarski Oct. 5, 2017, 9:20 a.m. UTC
This is pretty similar to what we have on execlists.
We're reusing most of the GEM code; however, due to GuC quirks we need a
couple of extra bits.
Preemption is implemented as a GuC action, and actions can be pretty
slow. Because of that, we're using a mutex to serialize them. Since
we're requesting preemption from the tasklet, the task of creating a
workitem and wrapping it in a GuC action is delegated to a worker.

To detect that preemption has finished, we're using an additional piece
of the HWSP, and since we're not getting context switch interrupts,
we're also adding a user interrupt.

The fact that our special preempt context has completed unfortunately
doesn't mean that we're ready to submit new work. We also need to wait
for the GuC to finish its own processing.

Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Jeff Mcgee <jeff.mcgee@intel.com>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Oscar Mateo <oscar.mateo@intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Signed-off-by: Michał Winiarski <michal.winiarski@intel.com>
---
 drivers/gpu/drm/i915/i915_drv.c            |   3 +-
 drivers/gpu/drm/i915/i915_guc_submission.c | 139 +++++++++++++++++++++++++++--
 drivers/gpu/drm/i915/intel_lrc.c           |   4 +-
 drivers/gpu/drm/i915/intel_lrc.h           |   4 +
 drivers/gpu/drm/i915/intel_ringbuffer.h    |   2 +
 drivers/gpu/drm/i915/intel_uc.c            |   2 +
 drivers/gpu/drm/i915/intel_uc.h            |   3 +
 7 files changed, 144 insertions(+), 13 deletions(-)

Comments

Chris Wilson Oct. 5, 2017, 10:04 a.m. UTC | #1
Quoting Michał Winiarski (2017-10-05 10:20:04)
> Pretty similar to what we have on execlists.
> We're reusing most of the GEM code, however, due to GuC quirks we need a
> couple of extra bits.
> Preemption is implemented as GuC action, and actions can be pretty slow.
> Because of that, we're using a mutex to serialize them. Since we're
> requesting preemption from the tasklet, the task of creating a workitem
> and wrapping it in GuC action is delegated to a worker.
> 
> To distinguish that preemption has finished, we're using additional
> piece of HWSP, and since we're not getting context switch interrupts,
> we're also adding a user interrupt.
> 
> The fact that our special preempt context has completed unfortunately
> doesn't mean that we're ready to submit new work. We also need to wait
> for GuC to finish its own processing.
> 
> Cc: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Jeff Mcgee <jeff.mcgee@intel.com>
> Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
> Cc: Oscar Mateo <oscar.mateo@intel.com>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> Signed-off-by: Michał Winiarski <michal.winiarski@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_drv.c            |   3 +-
>  drivers/gpu/drm/i915/i915_guc_submission.c | 139 +++++++++++++++++++++++++++--
>  drivers/gpu/drm/i915/intel_lrc.c           |   4 +-
>  drivers/gpu/drm/i915/intel_lrc.h           |   4 +
>  drivers/gpu/drm/i915/intel_ringbuffer.h    |   2 +
>  drivers/gpu/drm/i915/intel_uc.c            |   2 +
>  drivers/gpu/drm/i915/intel_uc.h            |   3 +
>  7 files changed, 144 insertions(+), 13 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
> index 66fc156b294a..06670cc3ece0 100644
> --- a/drivers/gpu/drm/i915/i915_drv.c
> +++ b/drivers/gpu/drm/i915/i915_drv.c
> @@ -373,8 +373,7 @@ static int i915_getparam(struct drm_device *dev, void *data,
>                         value |= I915_SCHEDULER_CAP_PRIORITY;
>  
>                         if (INTEL_INFO(dev_priv)->has_logical_ring_preemption &&
> -                           i915_modparams.enable_execlists &&
> -                           !i915_modparams.enable_guc_submission)
> +                           i915_modparams.enable_execlists)
>                                 value |= I915_SCHEDULER_CAP_PREEMPTION;
>                 }
>                 break;
> diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c
> index 0f36bba9fc9e..21412ea3dc0e 100644
> --- a/drivers/gpu/drm/i915/i915_guc_submission.c
> +++ b/drivers/gpu/drm/i915/i915_guc_submission.c
> @@ -498,6 +498,88 @@ static void guc_add_request(struct intel_guc *guc,
>         spin_unlock(&client->wq_lock);
>  }
>  
> +#define GUC_PREEMPT_FINISHED 0x1
> +#define GUC_PREEMPT_BREADCRUMB_DWORDS 0x8
> +static void inject_preempt_context(struct work_struct *work)
> +{
> +       struct intel_engine_cs *engine =
> +               container_of(work, typeof(*engine), guc_preempt_work);
> +       struct drm_i915_private *dev_priv = engine->i915;
> +       struct intel_guc *guc = &engine->i915->guc;
> +       struct i915_guc_client *client = guc->preempt_client;
> +       struct intel_context *ce = &client->owner->engine[engine->id];
> +       u32 ctx_desc = lower_32_bits(intel_lr_context_descriptor(client->owner,
> +                                                                engine));
> +       u32 *cs = ce->ring->vaddr + ce->ring->tail;
> +       u32 data[7];
> +
> +       if (engine->id == RCS) {
> +               cs = gen8_emit_ggtt_write_render(
> +                               intel_hws_preempt_done_address(engine),
> +                               GUC_PREEMPT_FINISHED, cs);
> +               *cs++ = MI_USER_INTERRUPT;
> +               *cs++ = MI_NOOP;
> +       } else {
> +               cs = gen8_emit_ggtt_write(
> +                               intel_hws_preempt_done_address(engine),
> +                               GUC_PREEMPT_FINISHED, cs);
> +               *cs++ = MI_USER_INTERRUPT;
> +               *cs++ = MI_NOOP;
> +               *cs++ = MI_NOOP;
> +               *cs++ = MI_NOOP;
> +       }
> +
> +       ce->ring->tail += GUC_PREEMPT_BREADCRUMB_DWORDS * sizeof(u32);
> +       ce->ring->tail &= (ce->ring->size - 1);

My turn! What guarantees do you give me that everything is nicely
aligned such that we don't overflow the ring?
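One way to make that guarantee explicit (a sketch only, not part of the
posted patch; both assertions are illustrative) would be to check that the
fixed-size breadcrumb can never straddle the ring wrap:

	/*
	 * Illustrative assertions: the breadcrumb is a fixed
	 * GUC_PREEMPT_BREADCRUMB_DWORDS write, so as long as the ring
	 * size is a multiple of that and the tail stays qword-aligned,
	 * the write cannot wrap mid-packet.
	 */
	GEM_BUG_ON(!IS_ALIGNED(ce->ring->size,
			       GUC_PREEMPT_BREADCRUMB_DWORDS * sizeof(u32)));
	GEM_BUG_ON(!IS_ALIGNED(ce->ring->tail, sizeof(u64)));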

> +       if (i915_vma_is_map_and_fenceable(ce->ring->vma))
> +               POSTING_READ_FW(GUC_STATUS);

Push this to

flush_ggtt_writes(ce->ring->vma);

It'll hide the dev_priv, and we can explain it a bit more carefully,
for both users.
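For illustration, such a helper might look roughly like this (hypothetical:
the name follows Chris's suggestion, the body is lifted from the
open-coded pattern in the patch):

static void flush_ggtt_writes(struct i915_vma *vma)
{
	struct drm_i915_private *dev_priv = vma->vm->i915;

	/* WC writes to the ring via the GGTT aperture may linger in the
	 * chipset write buffer; a posting read from any register in the
	 * same MMIO range flushes them before the GuC samples the new
	 * tail.
	 */
	if (i915_vma_is_map_and_fenceable(vma))
		POSTING_READ_FW(GUC_STATUS);
}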

> +       spin_lock_irq(&client->wq_lock);
> +       guc_wq_item_append(client, engine->guc_id, ctx_desc,
> +                          ce->ring->tail / sizeof(u64), 0);
> +       spin_unlock_irq(&client->wq_lock);
> +
> +       data[0] = INTEL_GUC_ACTION_REQUEST_PREEMPTION;
> +       data[1] = client->stage_id;
> +       data[2] = INTEL_GUC_PREEMPT_OPTION_IMMEDIATE |
> +                 INTEL_GUC_PREEMPT_OPTION_DROP_WORK_Q |
> +                 INTEL_GUC_PREEMPT_OPTION_DROP_SUBMIT_Q;
> +       data[3] = engine->guc_id;
> +       data[4] = guc->execbuf_client->priority;
> +       data[5] = guc->execbuf_client->stage_id;
> +       data[6] = guc->shared_data_offset;

On the surface, this all makes sense. I'll let those in the know check
that this is what the guc wants.

> +       if (WARN_ON(intel_guc_send(guc, data, ARRAY_SIZE(data)))) {
> +               WRITE_ONCE(engine->execlists.preempt, false);
> +               tasklet_schedule(&engine->execlists.irq_tasklet);
> +       }
> +}
> +
> +/*
> + * We're using user interrupt and HWSP value to mark that preemption has
> + * finished and GPU is idle. Normally, we could unwind and continue similar to
> + * execlists submission path. Unfortunately, with GuC we also need to wait for
> + * it to finish its own postprocessing, before attempting to submit. Otherwise
> + * GuC may silently ignore our submissions, and thus we risk losing request at
> + * best, executing out-of-order and causing kernel panic at worst.
> + */
> +#define GUC_PREEMPT_POSTPROCESS_DELAY_MS 10
> +static void wait_for_guc_preempt_report(struct intel_engine_cs *engine)
> +{
> +       struct intel_guc *guc = &engine->i915->guc;
> +       struct guc_shared_ctx_data *data = guc->shared_data_addr;
> +       struct guc_ctx_report *report = &data->preempt_ctx_report[engine->guc_id];
> +
> +       GEM_BUG_ON(wait_for_atomic(report->report_return_status ==
> +                                  INTEL_GUC_REPORT_STATUS_COMPLETE,
> +                                  GUC_PREEMPT_POSTPROCESS_DELAY_MS));

This will be compiled out for !debug. Is that intentional? The comment
above suggests it maybe is, but also that we do want to try and wait.
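An always-on variant (again just a sketch; the error message is
illustrative) could do the wait unconditionally and only complain on
timeout:

	if (wait_for_atomic(report->report_return_status ==
			    INTEL_GUC_REPORT_STATUS_COMPLETE,
			    GUC_PREEMPT_POSTPROCESS_DELAY_MS))
		DRM_ERROR("Timed out waiting for GuC preemption report\n");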

> +       /* GuC is expecting that we're also going to clear the affected context
> +        * counter */
> +       report->affected_count = 0;
> +}
> +
> +
>  /**
>   * i915_guc_submit() - Submit commands through GuC
>   * @engine: engine associated with the commands
> @@ -574,13 +656,28 @@ static void i915_guc_dequeue(struct intel_engine_cs *engine)
>         bool submit = false;
>         struct rb_node *rb;
>  
> -       if (port_isset(port))
> -               port++;
> -
>         spin_lock_irq(&engine->timeline->lock);
>         rb = execlists->first;
>         GEM_BUG_ON(rb_first(&execlists->queue) != rb);
> -       while (rb) {
> +
> +       if (!rb)
> +               goto unlock;
> +
> +       if (port_isset(port)) {

Feeling that we should do can_preempt() here as well, just to have that
get-out-of-jail clause.
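Something like this, perhaps (entirely hypothetical, only sketching the
escape-hatch idea):

static bool can_preempt(struct intel_engine_cs *engine)
{
	/* Hypothetical check: only attempt GuC preemption when the
	 * dedicated preempt client was successfully created.
	 */
	return engine->i915->guc.preempt_client != NULL;
}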

> +               if (rb_entry(rb, struct i915_priolist, node)->priority >
> +                   max(port_request(port)->priotree.priority, 0)) {
> +                       WRITE_ONCE(execlists->preempt, true);
> +                       queue_work(engine->i915->guc.preempt_wq,
> +                                  &engine->guc_preempt_work);
> +                       goto unlock;
> +               } else if (port_isset(last_port)) {
> +                       goto unlock;
> +               }
> +
> +               port++;
> +       }
> +
> +       do {
>                 struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
>                 struct drm_i915_gem_request *rq, *rn;
>  
> @@ -610,13 +707,14 @@ static void i915_guc_dequeue(struct intel_engine_cs *engine)
>                 INIT_LIST_HEAD(&p->requests);
>                 if (p->priority != I915_PRIORITY_NORMAL)
>                         kmem_cache_free(engine->i915->priorities, p);
> -       }
> +       } while (rb);
>  done:
>         execlists->first = rb;
>         if (submit) {
>                 port_assign(port, last);
>                 i915_guc_submit(engine);
>         }
> +unlock:
>         spin_unlock_irq(&engine->timeline->lock);
>  }
>  
> @@ -625,10 +723,23 @@ static void i915_guc_irq_handler(unsigned long data)
>         struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
>         struct intel_engine_execlists * const execlists = &engine->execlists;
>         struct execlist_port *port = execlists->port;
> -       const struct execlist_port * const last_port =
> -               &execlists->port[execlists->port_mask];
>         struct drm_i915_gem_request *rq;
>  
> +       if (READ_ONCE(execlists->preempt) &&
> +           intel_read_status_page(engine, I915_GEM_HWS_PREEMPT_INDEX) ==
> +           GUC_PREEMPT_FINISHED) {
> +               execlist_cancel_port_requests(&engine->execlists);
> +
> +               spin_lock_irq(&engine->timeline->lock);
> +               unwind_incomplete_requests(engine);
> +               spin_unlock_irq(&engine->timeline->lock);
> +
> +               wait_for_guc_preempt_report(engine);
> +
> +               WRITE_ONCE(execlists->preempt, false);
> +               intel_write_status_page(engine, I915_GEM_HWS_PREEMPT_INDEX, 0);
> +       }
> +
>         rq = port_request(&port[0]);
>         while (rq && i915_gem_request_completed(rq)) {
>                 trace_i915_gem_request_out(rq);
> @@ -640,7 +751,7 @@ static void i915_guc_irq_handler(unsigned long data)
>                 rq = port_request(&port[0]);
>         }
>  
> -       if (!port_isset(last_port))
> +       if (!READ_ONCE(execlists->preempt))
>                 i915_guc_dequeue(engine);
>  }
>  
> @@ -1046,14 +1157,22 @@ int i915_guc_submission_init(struct drm_i915_private *dev_priv)
>         if (ret < 0)
>                 goto err_vaddr;
>  

/* Please explain your choice. */

Ordered? I presume because we are serialised by the guc send mutex, you
don't want to have engines preempting in parallel. Makes sense, but
please do record rationale for the reader.

How should we cancel the preempt work on reset?
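One possible answer for the reset question, sketched here (the helper and
its call site are hypothetical, not from the series): cancel the per-engine
worker before resetting, so a stale preemption request cannot be injected
onto freshly reset hardware.

static void guc_cancel_preempt_work(struct drm_i915_private *dev_priv)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	/* Make sure no preemption request is in flight before the reset. */
	for_each_engine(engine, dev_priv, id)
		cancel_work_sync(&engine->guc_preempt_work);
}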

> +       guc->preempt_wq = alloc_ordered_workqueue("i915-guc_preempt", WQ_HIGHPRI);
> +       if (!guc->preempt_wq) {
> +               ret = -ENOMEM;
> +               goto err_log;
> +       }
> +
>         ret = guc_ads_create(guc);
>         if (ret < 0)
> -               goto err_log;
> +               goto err_wq;
>  
>         ida_init(&guc->stage_ids);
>  
>         return 0;
>  
> +err_wq:
> +       destroy_workqueue(guc->preempt_wq);
>  err_log:
>         intel_guc_log_destroy(guc);
>  err_vaddr:
> @@ -1069,6 +1188,7 @@ void i915_guc_submission_fini(struct drm_i915_private *dev_priv)
>  
>         ida_destroy(&guc->stage_ids);
>         guc_ads_destroy(guc);
> +       destroy_workqueue(guc->preempt_wq);
>         intel_guc_log_destroy(guc);
>         i915_gem_object_unpin_map(guc->stage_desc_pool->obj);
>         i915_vma_unpin_and_release(&guc->stage_desc_pool);
> @@ -1204,6 +1324,7 @@ int i915_guc_submission_enable(struct drm_i915_private *dev_priv)
>         for_each_engine(engine, dev_priv, id) {
>                 struct intel_engine_execlists * const execlists = &engine->execlists;
>                 execlists->irq_tasklet.func = i915_guc_irq_handler;
> +               INIT_WORK(&engine->guc_preempt_work, inject_preempt_context);
>         }
>  
>         return 0;
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index e6bbdc5dd01b..affef8e808be 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -354,7 +354,7 @@ static void unwind_wa_tail(struct drm_i915_gem_request *rq)
>         assert_ring_tail_valid(rq->ring, rq->tail);
>  }
>  
> -static void unwind_incomplete_requests(struct intel_engine_cs *engine)
> +void unwind_incomplete_requests(struct intel_engine_cs *engine)
>  {
>         struct drm_i915_gem_request *rq, *rn;
>         struct i915_priolist *uninitialized_var(p);
> @@ -687,7 +687,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>                 execlists_submit_ports(engine);
>  }
>  
> -static void
> +void
>  execlist_cancel_port_requests(struct intel_engine_execlists *execlists)
>  {
>         struct execlist_port *port = execlists->port;
> diff --git a/drivers/gpu/drm/i915/intel_lrc.h b/drivers/gpu/drm/i915/intel_lrc.h
> index 689fde1a63a9..e650610de226 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.h
> +++ b/drivers/gpu/drm/i915/intel_lrc.h
> @@ -107,6 +107,10 @@ intel_lr_context_descriptor(struct i915_gem_context *ctx,
>         return ctx->engine[engine->id].lrc_desc;
>  }
>  
> +void unwind_incomplete_requests(struct intel_engine_cs *engine);
> +void
> +execlist_cancel_port_requests(struct intel_engine_execlists *execlists);

We are using execlists as our prefix, but only so far for internal
intel_lrc.c usage, so perhaps intel_execlists_ for sharing (verbosity is
unfortunately the way of unambiguity). And we should convert
unwind_incomplete_requests() over.
-Chris
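
For illustration, the renamed exports Chris is asking for might end up
declared along these lines (hypothetical names and signatures):

void intel_execlists_unwind_incomplete_requests(struct intel_engine_cs *engine);
void intel_execlists_cancel_port_requests(struct intel_engine_execlists *execlists);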
Chris Wilson Oct. 5, 2017, 10:31 a.m. UTC | #2
Quoting Michał Winiarski (2017-10-05 10:20:04)
> @@ -625,10 +723,23 @@ static void i915_guc_irq_handler(unsigned long data)
>         struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
>         struct intel_engine_execlists * const execlists = &engine->execlists;
>         struct execlist_port *port = execlists->port;
> -       const struct execlist_port * const last_port =
> -               &execlists->port[execlists->port_mask];
>         struct drm_i915_gem_request *rq;
>  
> +       if (READ_ONCE(execlists->preempt) &&
> +           intel_read_status_page(engine, I915_GEM_HWS_PREEMPT_INDEX) ==
> +           GUC_PREEMPT_FINISHED) {
> +               execlist_cancel_port_requests(&engine->execlists);
> +
> +               spin_lock_irq(&engine->timeline->lock);
> +               unwind_incomplete_requests(engine);
> +               spin_unlock_irq(&engine->timeline->lock);
> +
> +               wait_for_guc_preempt_report(engine);
> +
> +               WRITE_ONCE(execlists->preempt, false);
> +               intel_write_status_page(engine, I915_GEM_HWS_PREEMPT_INDEX, 0);

Ok. So we injected the preempt context, observed it complete. What stops
the GuC from switching back to the normal context and draining its wq?
(Those requests we've just put back to be re-executed later.)
-Chris
Michal Wajdeczko Oct. 5, 2017, 3 p.m. UTC | #3
On Thu, 05 Oct 2017 11:20:04 +0200, Michał Winiarski  
<michal.winiarski@intel.com> wrote:

> Pretty similar to what we have on execlists.
> We're reusing most of the GEM code, however, due to GuC quirks we need a
> couple of extra bits.
> Preemption is implemented as GuC action, and actions can be pretty slow.
> Because of that, we're using a mutex to serialize them. Since we're
> requesting preemption from the tasklet, the task of creating a workitem
> and wrapping it in GuC action is delegated to a worker.
>
> To distinguish that preemption has finished, we're using additional
> piece of HWSP, and since we're not getting context switch interrupts,
> we're also adding a user interrupt.
>
> The fact that our special preempt context has completed unfortunately
> doesn't mean that we're ready to submit new work. We also need to wait
> for GuC to finish its own processing.
>
> Cc: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Jeff Mcgee <jeff.mcgee@intel.com>
> Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
> Cc: Oscar Mateo <oscar.mateo@intel.com>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> Signed-off-by: Michał Winiarski <michal.winiarski@intel.com>
> ---

<snip>

> diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c  
> b/drivers/gpu/drm/i915/i915_guc_submission.c
> index 0f36bba9fc9e..21412ea3dc0e 100644
> --- a/drivers/gpu/drm/i915/i915_guc_submission.c
> +++ b/drivers/gpu/drm/i915/i915_guc_submission.c
> @@ -498,6 +498,88 @@ static void guc_add_request(struct intel_guc *guc,
>  	spin_unlock(&client->wq_lock);
>  }
> +#define GUC_PREEMPT_FINISHED 0x1
> +#define GUC_PREEMPT_BREADCRUMB_DWORDS 0x8
> +static void inject_preempt_context(struct work_struct *work)
> +{
> +	struct intel_engine_cs *engine =
> +		container_of(work, typeof(*engine), guc_preempt_work);
> +	struct drm_i915_private *dev_priv = engine->i915;
> +	struct intel_guc *guc = &engine->i915->guc;

guc = &dev_priv->guc;

> +	struct i915_guc_client *client = guc->preempt_client;
> +	struct intel_context *ce = &client->owner->engine[engine->id];
> +	u32 ctx_desc = lower_32_bits(intel_lr_context_descriptor(client->owner,
> +								 engine));
> +	u32 *cs = ce->ring->vaddr + ce->ring->tail;
> +	u32 data[7];
> +
> +	if (engine->id == RCS) {
> +		cs = gen8_emit_ggtt_write_render(
> +				intel_hws_preempt_done_address(engine),
> +				GUC_PREEMPT_FINISHED, cs);
> +		*cs++ = MI_USER_INTERRUPT;
> +		*cs++ = MI_NOOP;
> +	} else {
> +		cs = gen8_emit_ggtt_write(
> +				intel_hws_preempt_done_address(engine),
> +				GUC_PREEMPT_FINISHED, cs);
> +		*cs++ = MI_USER_INTERRUPT;
> +		*cs++ = MI_NOOP;
> +		*cs++ = MI_NOOP;
> +		*cs++ = MI_NOOP;
> +	}

Maybe like this:

	cs = gen8_emit_ggtt_write_render(
		intel_hws_preempt_done_address(engine),
		GUC_PREEMPT_FINISHED, cs);
	*cs++ = MI_USER_INTERRUPT;
	*cs++ = MI_NOOP;
	if (engine->id != RCS) {
		*cs++ = MI_NOOP;
		*cs++ = MI_NOOP;
	}

> +
> +	ce->ring->tail += GUC_PREEMPT_BREADCRUMB_DWORDS * sizeof(u32);
> +	ce->ring->tail &= (ce->ring->size - 1);
> +
> +	if (i915_vma_is_map_and_fenceable(ce->ring->vma))
> +		POSTING_READ_FW(GUC_STATUS);
> +
> +	spin_lock_irq(&client->wq_lock);
> +	guc_wq_item_append(client, engine->guc_id, ctx_desc,
> +			   ce->ring->tail / sizeof(u64), 0);
> +	spin_unlock_irq(&client->wq_lock);
> +
> +	data[0] = INTEL_GUC_ACTION_REQUEST_PREEMPTION;
> +	data[1] = client->stage_id;
> +	data[2] = INTEL_GUC_PREEMPT_OPTION_IMMEDIATE |
> +		  INTEL_GUC_PREEMPT_OPTION_DROP_WORK_Q |
> +		  INTEL_GUC_PREEMPT_OPTION_DROP_SUBMIT_Q;
> +	data[3] = engine->guc_id;
> +	data[4] = guc->execbuf_client->priority;

Hmm, this is always GUC_CLIENT_PRIORITY_KMD_NORMAL, right?

> +	data[5] = guc->execbuf_client->stage_id;
> +	data[6] = guc->shared_data_offset;
> +

Please pull the above action code into a new helper:

	int guc_send_preemption_request(guc, engine_guc_id)
	{
		data[] { ... };
		return intel_guc_send(guc, data, ARRAY_SIZE(data));
	}
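
Filled in with the values from the patch, that helper could look roughly
like this (the helper itself is still hypothetical):

	static int guc_send_preemption_request(struct intel_guc *guc,
					       struct i915_guc_client *client,
					       u32 engine_guc_id)
	{
		u32 data[7];

		data[0] = INTEL_GUC_ACTION_REQUEST_PREEMPTION;
		data[1] = client->stage_id;
		data[2] = INTEL_GUC_PREEMPT_OPTION_IMMEDIATE |
			  INTEL_GUC_PREEMPT_OPTION_DROP_WORK_Q |
			  INTEL_GUC_PREEMPT_OPTION_DROP_SUBMIT_Q;
		data[3] = engine_guc_id;
		data[4] = guc->execbuf_client->priority;
		data[5] = guc->execbuf_client->stage_id;
		data[6] = guc->shared_data_offset;

		return intel_guc_send(guc, data, ARRAY_SIZE(data));
	}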

> +	if (WARN_ON(intel_guc_send(guc, data, ARRAY_SIZE(data)))) {
> +		WRITE_ONCE(engine->execlists.preempt, false);
> +		tasklet_schedule(&engine->execlists.irq_tasklet);
> +	}
> +}
> +

<snip>

> @@ -177,6 +177,8 @@ static int guc_enable_communication(struct intel_guc  
> *guc)
> 	guc->shared_data_offset = guc_ggtt_offset(ctx->engine[RCS].state) +
>  		LRC_GUCSHR_PN * PAGE_SIZE;
> +	guc->shared_data_addr = (void*)ctx->engine[RCS].lrc_reg_state -
> +		LRC_STATE_PN * PAGE_SIZE;

Please pick a better place (see my comment in 1/10)

> 	if (HAS_GUC_CT(dev_priv))
>  		return intel_guc_enable_ct(guc);
> diff --git a/drivers/gpu/drm/i915/intel_uc.h  
> b/drivers/gpu/drm/i915/intel_uc.h
> index c6c6f8513bbf..8e2818e5741e 100644
> --- a/drivers/gpu/drm/i915/intel_uc.h
> +++ b/drivers/gpu/drm/i915/intel_uc.h
> @@ -124,6 +124,9 @@ struct intel_guc {
> 	/* Kernel context state ggtt offset, first page is shared with GuC */
>  	u32 shared_data_offset;
> +	void *shared_data_addr;
> +
> +	struct workqueue_struct *preempt_wq;

Please pick a better place (see my comment in 1/10)

> 	/* GuC's FW specific send function */
>  	int (*send)(struct intel_guc *guc, const u32 *data, u32 len);
Michał Winiarski Oct. 5, 2017, 3:42 p.m. UTC | #4
On Thu, Oct 05, 2017 at 03:00:31PM +0000, Michal Wajdeczko wrote:
> On Thu, 05 Oct 2017 11:20:04 +0200, Michał Winiarski
> <michal.winiarski@intel.com> wrote:
> 
> > Pretty similar to what we have on execlists.
> > We're reusing most of the GEM code, however, due to GuC quirks we need a
> > couple of extra bits.
> > Preemption is implemented as GuC action, and actions can be pretty slow.
> > Because of that, we're using a mutex to serialize them. Since we're
> > requesting preemption from the tasklet, the task of creating a workitem
> > and wrapping it in GuC action is delegated to a worker.
> > 
> > To distinguish that preemption has finished, we're using additional
> > piece of HWSP, and since we're not getting context switch interrupts,
> > we're also adding a user interrupt.
> > 
> > The fact that our special preempt context has completed unfortunately
> > doesn't mean that we're ready to submit new work. We also need to wait
> > for GuC to finish its own processing.
> > 
> > Cc: Chris Wilson <chris@chris-wilson.co.uk>
> > Cc: Jeff Mcgee <jeff.mcgee@intel.com>
> > Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
> > Cc: Oscar Mateo <oscar.mateo@intel.com>
> > Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> > Signed-off-by: Michał Winiarski <michal.winiarski@intel.com>
> > ---
> 
> <snip>
> 
> > diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c
> > b/drivers/gpu/drm/i915/i915_guc_submission.c
> > index 0f36bba9fc9e..21412ea3dc0e 100644
> > --- a/drivers/gpu/drm/i915/i915_guc_submission.c
> > +++ b/drivers/gpu/drm/i915/i915_guc_submission.c
> > @@ -498,6 +498,88 @@ static void guc_add_request(struct intel_guc *guc,
> >  	spin_unlock(&client->wq_lock);
> >  }
> > +#define GUC_PREEMPT_FINISHED 0x1
> > +#define GUC_PREEMPT_BREADCRUMB_DWORDS 0x8
> > +static void inject_preempt_context(struct work_struct *work)
> > +{
> > +	struct intel_engine_cs *engine =
> > +		container_of(work, typeof(*engine), guc_preempt_work);
> > +	struct drm_i915_private *dev_priv = engine->i915;
> > +	struct intel_guc *guc = &engine->i915->guc;
> 
> guc = &dev_priv->guc;
> 
> > +	struct i915_guc_client *client = guc->preempt_client;
> > +	struct intel_context *ce = &client->owner->engine[engine->id];
> > +	u32 ctx_desc = lower_32_bits(intel_lr_context_descriptor(client->owner,
> > +								 engine));
> > +	u32 *cs = ce->ring->vaddr + ce->ring->tail;
> > +	u32 data[7];
> > +
> > +	if (engine->id == RCS) {
> > +		cs = gen8_emit_ggtt_write_render(
> > +				intel_hws_preempt_done_address(engine),
> > +				GUC_PREEMPT_FINISHED, cs);
> > +		*cs++ = MI_USER_INTERRUPT;
> > +		*cs++ = MI_NOOP;
> > +	} else {
> > +		cs = gen8_emit_ggtt_write(
> > +				intel_hws_preempt_done_address(engine),
> > +				GUC_PREEMPT_FINISHED, cs);
> > +		*cs++ = MI_USER_INTERRUPT;
> > +		*cs++ = MI_NOOP;
> > +		*cs++ = MI_NOOP;
> > +		*cs++ = MI_NOOP;
> > +	}
> 
> Maybe like this:
> 
> 	cs = gen8_emit_ggtt_write_render(
> 		intel_hws_preempt_done_address(engine),
> 		GUC_PREEMPT_FINISHED, cs);
> 	*cs++ = MI_USER_INTERRUPT;
> 	*cs++ = MI_NOOP;
> 	if (engine->id != RCS) {
> 		*cs++ = MI_NOOP;
> 		*cs++ = MI_NOOP;
> 	}

Did you mean:

if (engine->id == RCS) {
	cs = gen8_emit_ggtt_write_render()
} else {
	cs = gen8_emit_ggtt_write()
	*cs++ = MI_NOOP;
	*cs++ = MI_NOOP;
}
*cs++ = MI_USER_INTERRUPT
*cs++ = MI_NOOP

?

[SNIP]

> > diff --git a/drivers/gpu/drm/i915/intel_uc.h
> > b/drivers/gpu/drm/i915/intel_uc.h
> > index c6c6f8513bbf..8e2818e5741e 100644
> > --- a/drivers/gpu/drm/i915/intel_uc.h
> > +++ b/drivers/gpu/drm/i915/intel_uc.h
> > @@ -124,6 +124,9 @@ struct intel_guc {
> > 	/* Kernel context state ggtt offset, first page is shared with GuC */
> >  	u32 shared_data_offset;
> > +	void *shared_data_addr;
> > +
> > +	struct workqueue_struct *preempt_wq;
> 
> Please pick a better place (see my comment in 1/10)

I'll go with a helper for shared_data_* instead.

-Michał

> 
> > 	/* GuC's FW specific send function */
> >  	int (*send)(struct intel_guc *guc, const u32 *data, u32 len);
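
The shared_data helper Michał mentions above could, for illustration, take
a shape like this (hypothetical name; the derivation mirrors the existing
lrc_reg_state arithmetic in the patch):

static void *guc_shared_data_addr(struct drm_i915_private *dev_priv)
{
	struct i915_gem_context *ctx = dev_priv->kernel_context;

	/* The first page of the kernel context state is shared with the GuC. */
	return (void *)ctx->engine[RCS].lrc_reg_state - LRC_STATE_PN * PAGE_SIZE;
}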
Michał Winiarski Oct. 5, 2017, 6:38 p.m. UTC | #5
On Thu, Oct 05, 2017 at 10:31:34AM +0000, Chris Wilson wrote:
> Quoting Michał Winiarski (2017-10-05 10:20:04)
> > @@ -625,10 +723,23 @@ static void i915_guc_irq_handler(unsigned long data)
> >         struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
> >         struct intel_engine_execlists * const execlists = &engine->execlists;
> >         struct execlist_port *port = execlists->port;
> > -       const struct execlist_port * const last_port =
> > -               &execlists->port[execlists->port_mask];
> >         struct drm_i915_gem_request *rq;
> >  
> > +       if (READ_ONCE(execlists->preempt) &&
> > +           intel_read_status_page(engine, I915_GEM_HWS_PREEMPT_INDEX) ==
> > +           GUC_PREEMPT_FINISHED) {
> > +               execlist_cancel_port_requests(&engine->execlists);
> > +
> > +               spin_lock_irq(&engine->timeline->lock);
> > +               unwind_incomplete_requests(engine);
> > +               spin_unlock_irq(&engine->timeline->lock);
> > +
> > +               wait_for_guc_preempt_report(engine);
> > +
> > +               WRITE_ONCE(execlists->preempt, false);
> > +               intel_write_status_page(engine, I915_GEM_HWS_PREEMPT_INDEX, 0);
> 
> Ok. So we injected the preempt context, observed it complete. What stops
> the GuC from switching back to the normal context and draining its wq?
> (Those requests we've just put back to be re-executed later.)
> -Chris

We're sending the GuC action with:
INTEL_GUC_PREEMPT_OPTION_DROP_WORK_Q | INTEL_GUC_PREEMPT_OPTION_DROP_SUBMIT_Q

which prevents the GuC from doing exactly that.

I'll add a comment in inject_preempt_context().

-Michał
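
A comment along those lines might read roughly as follows (wording is
illustrative, not taken from a posted revision):

	/*
	 * The preemption request is sent with
	 * INTEL_GUC_PREEMPT_OPTION_DROP_WORK_Q and
	 * INTEL_GUC_PREEMPT_OPTION_DROP_SUBMIT_Q, so once the preempt
	 * context completes the GuC discards anything still sitting in
	 * the client's work/submit queues rather than resubmitting it.
	 * The unwound requests only run again when i915 resubmits them.
	 */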

Patch

diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index 66fc156b294a..06670cc3ece0 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -373,8 +373,7 @@  static int i915_getparam(struct drm_device *dev, void *data,
 			value |= I915_SCHEDULER_CAP_PRIORITY;
 
 			if (INTEL_INFO(dev_priv)->has_logical_ring_preemption &&
-			    i915_modparams.enable_execlists &&
-			    !i915_modparams.enable_guc_submission)
+			    i915_modparams.enable_execlists)
 				value |= I915_SCHEDULER_CAP_PREEMPTION;
 		}
 		break;
diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c
index 0f36bba9fc9e..21412ea3dc0e 100644
--- a/drivers/gpu/drm/i915/i915_guc_submission.c
+++ b/drivers/gpu/drm/i915/i915_guc_submission.c
@@ -498,6 +498,88 @@  static void guc_add_request(struct intel_guc *guc,
 	spin_unlock(&client->wq_lock);
 }
 
+#define GUC_PREEMPT_FINISHED 0x1
+#define GUC_PREEMPT_BREADCRUMB_DWORDS 0x8
+static void inject_preempt_context(struct work_struct *work)
+{
+	struct intel_engine_cs *engine =
+		container_of(work, typeof(*engine), guc_preempt_work);
+	struct drm_i915_private *dev_priv = engine->i915;
+	struct intel_guc *guc = &engine->i915->guc;
+	struct i915_guc_client *client = guc->preempt_client;
+	struct intel_context *ce = &client->owner->engine[engine->id];
+	u32 ctx_desc = lower_32_bits(intel_lr_context_descriptor(client->owner,
+								 engine));
+	u32 *cs = ce->ring->vaddr + ce->ring->tail;
+	u32 data[7];
+
+	if (engine->id == RCS) {
+		cs = gen8_emit_ggtt_write_render(
+				intel_hws_preempt_done_address(engine),
+				GUC_PREEMPT_FINISHED, cs);
+		*cs++ = MI_USER_INTERRUPT;
+		*cs++ = MI_NOOP;
+	} else {
+		cs = gen8_emit_ggtt_write(
+				intel_hws_preempt_done_address(engine),
+				GUC_PREEMPT_FINISHED, cs);
+		*cs++ = MI_USER_INTERRUPT;
+		*cs++ = MI_NOOP;
+		*cs++ = MI_NOOP;
+		*cs++ = MI_NOOP;
+	}
+
+	ce->ring->tail += GUC_PREEMPT_BREADCRUMB_DWORDS * sizeof(u32);
+	ce->ring->tail &= (ce->ring->size - 1);
+
+	if (i915_vma_is_map_and_fenceable(ce->ring->vma))
+		POSTING_READ_FW(GUC_STATUS);
+
+	spin_lock_irq(&client->wq_lock);
+	guc_wq_item_append(client, engine->guc_id, ctx_desc,
+			   ce->ring->tail / sizeof(u64), 0);
+	spin_unlock_irq(&client->wq_lock);
+
+	data[0] = INTEL_GUC_ACTION_REQUEST_PREEMPTION;
+	data[1] = client->stage_id;
+	data[2] = INTEL_GUC_PREEMPT_OPTION_IMMEDIATE |
+		  INTEL_GUC_PREEMPT_OPTION_DROP_WORK_Q |
+		  INTEL_GUC_PREEMPT_OPTION_DROP_SUBMIT_Q;
+	data[3] = engine->guc_id;
+	data[4] = guc->execbuf_client->priority;
+	data[5] = guc->execbuf_client->stage_id;
+	data[6] = guc->shared_data_offset;
+
+	if (WARN_ON(intel_guc_send(guc, data, ARRAY_SIZE(data)))) {
+		WRITE_ONCE(engine->execlists.preempt, false);
+		tasklet_schedule(&engine->execlists.irq_tasklet);
+	}
+}
+
+/*
+ * We're using user interrupt and HWSP value to mark that preemption has
+ * finished and GPU is idle. Normally, we could unwind and continue similar to
+ * execlists submission path. Unfortunately, with GuC we also need to wait for
+ * it to finish its own postprocessing, before attempting to submit. Otherwise
+ * GuC may silently ignore our submissions, and thus we risk losing request at
+ * best, executing out-of-order and causing kernel panic at worst.
+ */
+#define GUC_PREEMPT_POSTPROCESS_DELAY_MS 10
+static void wait_for_guc_preempt_report(struct intel_engine_cs *engine)
+{
+	struct intel_guc *guc = &engine->i915->guc;
+	struct guc_shared_ctx_data *data = guc->shared_data_addr;
+	struct guc_ctx_report *report = &data->preempt_ctx_report[engine->guc_id];
+
+	GEM_BUG_ON(wait_for_atomic(report->report_return_status ==
+				   INTEL_GUC_REPORT_STATUS_COMPLETE,
+				   GUC_PREEMPT_POSTPROCESS_DELAY_MS));
+	/* GuC is expecting that we're also going to clear the affected context
+	 * counter */
+	report->affected_count = 0;
+}
+
+
 /**
  * i915_guc_submit() - Submit commands through GuC
  * @engine: engine associated with the commands
@@ -574,13 +656,28 @@  static void i915_guc_dequeue(struct intel_engine_cs *engine)
 	bool submit = false;
 	struct rb_node *rb;
 
-	if (port_isset(port))
-		port++;
-
 	spin_lock_irq(&engine->timeline->lock);
 	rb = execlists->first;
 	GEM_BUG_ON(rb_first(&execlists->queue) != rb);
-	while (rb) {
+
+	if (!rb)
+		goto unlock;
+
+	if (port_isset(port)) {
+		if (rb_entry(rb, struct i915_priolist, node)->priority >
+		    max(port_request(port)->priotree.priority, 0)) {
+			WRITE_ONCE(execlists->preempt, true);
+			queue_work(engine->i915->guc.preempt_wq,
+				   &engine->guc_preempt_work);
+			goto unlock;
+		} else if (port_isset(last_port)) {
+			goto unlock;
+		}
+
+		port++;
+	}
+
+	do {
 		struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
 		struct drm_i915_gem_request *rq, *rn;
 
@@ -610,13 +707,14 @@  static void i915_guc_dequeue(struct intel_engine_cs *engine)
 		INIT_LIST_HEAD(&p->requests);
 		if (p->priority != I915_PRIORITY_NORMAL)
 			kmem_cache_free(engine->i915->priorities, p);
-	}
+	} while (rb);
 done:
 	execlists->first = rb;
 	if (submit) {
 		port_assign(port, last);
 		i915_guc_submit(engine);
 	}
+unlock:
 	spin_unlock_irq(&engine->timeline->lock);
 }
 
@@ -625,10 +723,23 @@  static void i915_guc_irq_handler(unsigned long data)
 	struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
 	struct intel_engine_execlists * const execlists = &engine->execlists;
 	struct execlist_port *port = execlists->port;
-	const struct execlist_port * const last_port =
-		&execlists->port[execlists->port_mask];
 	struct drm_i915_gem_request *rq;
 
+	if (READ_ONCE(execlists->preempt) &&
+	    intel_read_status_page(engine, I915_GEM_HWS_PREEMPT_INDEX) ==
+	    GUC_PREEMPT_FINISHED) {
+		execlist_cancel_port_requests(&engine->execlists);
+
+		spin_lock_irq(&engine->timeline->lock);
+		unwind_incomplete_requests(engine);
+		spin_unlock_irq(&engine->timeline->lock);
+
+		wait_for_guc_preempt_report(engine);
+
+		WRITE_ONCE(execlists->preempt, false);
+		intel_write_status_page(engine, I915_GEM_HWS_PREEMPT_INDEX, 0);
+	}
+
 	rq = port_request(&port[0]);
 	while (rq && i915_gem_request_completed(rq)) {
 		trace_i915_gem_request_out(rq);
@@ -640,7 +751,7 @@  static void i915_guc_irq_handler(unsigned long data)
 		rq = port_request(&port[0]);
 	}
 
-	if (!port_isset(last_port))
+	if (!READ_ONCE(execlists->preempt))
 		i915_guc_dequeue(engine);
 }
 
@@ -1046,14 +1157,22 @@  int i915_guc_submission_init(struct drm_i915_private *dev_priv)
 	if (ret < 0)
 		goto err_vaddr;
 
+	guc->preempt_wq = alloc_ordered_workqueue("i915-guc_preempt", WQ_HIGHPRI);
+	if (!guc->preempt_wq) {
+		ret = -ENOMEM;
+		goto err_log;
+	}
+
 	ret = guc_ads_create(guc);
 	if (ret < 0)
-		goto err_log;
+		goto err_wq;
 
 	ida_init(&guc->stage_ids);
 
 	return 0;
 
+err_wq:
+	destroy_workqueue(guc->preempt_wq);
 err_log:
 	intel_guc_log_destroy(guc);
 err_vaddr:
@@ -1069,6 +1188,7 @@  void i915_guc_submission_fini(struct drm_i915_private *dev_priv)
 
 	ida_destroy(&guc->stage_ids);
 	guc_ads_destroy(guc);
+	destroy_workqueue(guc->preempt_wq);
 	intel_guc_log_destroy(guc);
 	i915_gem_object_unpin_map(guc->stage_desc_pool->obj);
 	i915_vma_unpin_and_release(&guc->stage_desc_pool);
@@ -1204,6 +1324,7 @@  int i915_guc_submission_enable(struct drm_i915_private *dev_priv)
 	for_each_engine(engine, dev_priv, id) {
 		struct intel_engine_execlists * const execlists = &engine->execlists;
 		execlists->irq_tasklet.func = i915_guc_irq_handler;
+		INIT_WORK(&engine->guc_preempt_work, inject_preempt_context);
 	}
 
 	return 0;
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index e6bbdc5dd01b..affef8e808be 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -354,7 +354,7 @@  static void unwind_wa_tail(struct drm_i915_gem_request *rq)
 	assert_ring_tail_valid(rq->ring, rq->tail);
 }
 
-static void unwind_incomplete_requests(struct intel_engine_cs *engine)
+void unwind_incomplete_requests(struct intel_engine_cs *engine)
 {
 	struct drm_i915_gem_request *rq, *rn;
 	struct i915_priolist *uninitialized_var(p);
@@ -687,7 +687,7 @@  static void execlists_dequeue(struct intel_engine_cs *engine)
 		execlists_submit_ports(engine);
 }
 
-static void
+void
 execlist_cancel_port_requests(struct intel_engine_execlists *execlists)
 {
 	struct execlist_port *port = execlists->port;
diff --git a/drivers/gpu/drm/i915/intel_lrc.h b/drivers/gpu/drm/i915/intel_lrc.h
index 689fde1a63a9..e650610de226 100644
--- a/drivers/gpu/drm/i915/intel_lrc.h
+++ b/drivers/gpu/drm/i915/intel_lrc.h
@@ -107,6 +107,10 @@  intel_lr_context_descriptor(struct i915_gem_context *ctx,
 	return ctx->engine[engine->id].lrc_desc;
 }
 
+void unwind_incomplete_requests(struct intel_engine_cs *engine);
+void
+execlist_cancel_port_requests(struct intel_engine_execlists *execlists);
+
 
 /* Execlists */
 int intel_sanitize_enable_execlists(struct drm_i915_private *dev_priv,
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index dd3b8727bf6d..2da1869fe4e3 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -498,6 +498,8 @@  struct intel_engine_cs {
 
 	bool needs_cmd_parser;
 
+	struct work_struct guc_preempt_work;
+
 	/*
 	 * Table of commands the command parser needs to know about
 	 * for this engine.
diff --git a/drivers/gpu/drm/i915/intel_uc.c b/drivers/gpu/drm/i915/intel_uc.c
index f4893c85e54a..ca66b61c799e 100644
--- a/drivers/gpu/drm/i915/intel_uc.c
+++ b/drivers/gpu/drm/i915/intel_uc.c
@@ -177,6 +177,8 @@  static int guc_enable_communication(struct intel_guc *guc)
 
 	guc->shared_data_offset = guc_ggtt_offset(ctx->engine[RCS].state) +
 		LRC_GUCSHR_PN * PAGE_SIZE;
+	guc->shared_data_addr = (void*)ctx->engine[RCS].lrc_reg_state -
+		LRC_STATE_PN * PAGE_SIZE;
 
 	if (HAS_GUC_CT(dev_priv))
 		return intel_guc_enable_ct(guc);
diff --git a/drivers/gpu/drm/i915/intel_uc.h b/drivers/gpu/drm/i915/intel_uc.h
index c6c6f8513bbf..8e2818e5741e 100644
--- a/drivers/gpu/drm/i915/intel_uc.h
+++ b/drivers/gpu/drm/i915/intel_uc.h
@@ -124,6 +124,9 @@  struct intel_guc {
 
 	/* Kernel context state ggtt offset, first page is shared with GuC */
 	u32 shared_data_offset;
+	void *shared_data_addr;
+
+	struct workqueue_struct *preempt_wq;
 
 	/* GuC's FW specific send function */
 	int (*send)(struct intel_guc *guc, const u32 *data, u32 len);