diff mbox

[3/4] drm/i915/guc: Make adding GuC work items lockless

Message ID 20170912124726.19689-3-michal.winiarski@intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

Michał Winiarski Sept. 12, 2017, 12:47 p.m. UTC
We can get rid of a spinlock by updating the tail directly using
cmpxchg. We can also put the GuC client on a diet by removing some
constants from the struct.
This causes a small change in one of the GuC debugfs files.
We're no longer reporting constant values (which I don't think is a
problem), but we're also no longer reporting the tail (does anyone care?).

Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
Cc: Michal Wajdeczko <michal.wajdeczko@intel.com>
Cc: Oscar Mateo <oscar.mateo@intel.com>
Suggested-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Michał Winiarski <michal.winiarski@intel.com>
---
 drivers/gpu/drm/i915/i915_debugfs.c        |  2 --
 drivers/gpu/drm/i915/i915_guc_submission.c | 39 ++++++++++--------------------
 drivers/gpu/drm/i915/intel_uc.h            |  5 ----
 3 files changed, 13 insertions(+), 33 deletions(-)

Comments

Chris Wilson Sept. 12, 2017, 1:40 p.m. UTC | #1
Quoting Michał Winiarski (2017-09-12 13:47:25)
> @@ -416,15 +416,15 @@ static void guc_wq_item_append(struct i915_guc_client *client,
>         struct intel_engine_cs *engine = rq->engine;
>         struct guc_process_desc *desc = __get_process_desc(client);
>         struct guc_wq_item *wqi;
> -       u32 freespace, tail, wq_off;
> +       u32 freespace, ring_tail, wq_off, wq_next;
>  
>         /* Free space is guaranteed */
> -       freespace = CIRC_SPACE(client->wq_tail, desc->head, client->wq_size);
> +       freespace = CIRC_SPACE(desc->tail, desc->head, GUC_WQ_SIZE);
>         GEM_BUG_ON(freespace < wqi_size);

Fwiw, I would move this to the cmpxchg loop.

GEM_BUG_ON(CIRC_SPACE(wq_off, READ_ONCE(desc->head), GUC_WQ_SIZE) < wqi_size);
>  
>         /* The GuC firmware wants the tail index in QWords, not bytes */
> -       tail = intel_ring_set_tail(rq->ring, rq->tail) >> 3;
> -       GEM_BUG_ON(tail > WQ_RING_TAIL_MAX);
> +       ring_tail = intel_ring_set_tail(rq->ring, rq->tail) >> 3;
> +       GEM_BUG_ON(ring_tail > WQ_RING_TAIL_MAX);
>  
>         /* For now workqueue item is 4 DWs; workqueue buffer is 2 pages. So we
>          * should not have the case where structure wqi is across page, neither
> @@ -435,11 +435,12 @@ static void guc_wq_item_append(struct i915_guc_client *client,
>          */
>         BUILD_BUG_ON(wqi_size != 16);
>  
> -       /* postincrement WQ tail for next time */
> -       wq_off = client->wq_tail;
> +       /* Find our offset and postincrement WQ tail for next time */
> +       do {
> +               wq_off = desc->tail;

wq_off = READ_ONCE(desc->tail);

> +               wq_next = (wq_off + wqi_size) & (GUC_WQ_SIZE - 1);
> +       } while (cmpxchg(&desc->tail, wq_off, wq_next) != wq_off);
>         GEM_BUG_ON(wq_off & (wqi_size - 1));
> -       client->wq_tail += wqi_size;
> -       client->wq_tail &= client->wq_size - 1;
>  
>         /* WQ starts from the page after doorbell / process_desc */
>         wqi = client->vaddr + wq_off + GUC_DB_SIZE;
> @@ -453,7 +454,7 @@ static void guc_wq_item_append(struct i915_guc_client *client,
>         /* The GuC wants only the low-order word of the context descriptor */
>         wqi->context_desc = (u32)intel_lr_context_descriptor(rq->ctx, engine);
>  
> -       wqi->submit_element_info = tail << WQ_RING_TAIL_SHIFT;
> +       wqi->submit_element_info = ring_tail << WQ_RING_TAIL_SHIFT;
>         wqi->fence_id = rq->global_seqno;
>  }
>  
> @@ -463,20 +464,14 @@ static void guc_reset_wq(struct i915_guc_client *client)
>  
>         desc->head = 0;
>         desc->tail = 0;
> -
> -       client->wq_tail = 0;
>  }
>  
>  static int guc_ring_doorbell(struct i915_guc_client *client)
>  {
> -       struct guc_process_desc *desc = __get_process_desc(client);
>         union guc_doorbell_qw db_cmp, db_exc, db_ret;
>         union guc_doorbell_qw *db;
>         int attempt = 2, ret = -EAGAIN;
>  
> -       /* Update the tail so it is visible to GuC */
> -       desc->tail = client->wq_tail;
> -
>         /* current cookie */
>         db_cmp.db_status = GUC_DOORBELL_ENABLED;
>         db_cmp.cookie = client->doorbell_cookie;
> @@ -535,7 +530,6 @@ static void i915_guc_submit(struct intel_engine_cs *engine)
>         struct execlist_port *port = engine->execlist_port;
>         unsigned int engine_id = engine->id;
>         unsigned int n;
> -       unsigned long flags;
>  
>         for (n = 0; n < ARRAY_SIZE(engine->execlist_port); n++) {
>                 struct drm_i915_gem_request *rq;
> @@ -548,14 +542,10 @@ static void i915_guc_submit(struct intel_engine_cs *engine)
>                         if (i915_vma_is_map_and_fenceable(rq->ring->vma))
>                                 POSTING_READ_FW(GUC_STATUS);
>  
> -                       spin_lock_irqsave(&client->wq_lock, flags);
> -
>                         guc_wq_item_append(client, rq);
>                         WARN_ON(guc_ring_doorbell(client));
>  
>                         client->submissions[engine_id] += 1;

Per-engine, so this is actually serialized by the tasklet. Hmm, double
accounting after reset. But do I care? I consider it to be pointless
since we are counting at the wrong boundary.

I think we need the READ_ONCE to be clear to both the compiler and
ourselves that we are reading transient values shared with the GuC. But
since that's the only issue I could see,
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
-Chris
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index f5fd00cfb3b0..e124e91aefcf 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -2447,8 +2447,6 @@  static void i915_guc_client_info(struct seq_file *m,
 		client->priority, client->stage_id, client->proc_desc_offset);
 	seq_printf(m, "\tDoorbell id %d, offset: 0x%lx, cookie 0x%x\n",
 		client->doorbell_id, client->doorbell_offset, client->doorbell_cookie);
-	seq_printf(m, "\tWQ size %d, offset: 0x%x, tail %d\n",
-		client->wq_size, client->wq_offset, client->wq_tail);
 
 	for_each_engine(engine, dev_priv, id) {
 		u64 submissions = client->submissions[id];
diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c
index 6f0adcd2a058..3a8a77ae2af8 100644
--- a/drivers/gpu/drm/i915/i915_guc_submission.c
+++ b/drivers/gpu/drm/i915/i915_guc_submission.c
@@ -306,7 +306,7 @@  static void guc_proc_desc_init(struct intel_guc *guc,
 	desc->db_base_addr = 0;
 
 	desc->stage_id = client->stage_id;
-	desc->wq_size_bytes = client->wq_size;
+	desc->wq_size_bytes = GUC_WQ_SIZE;
 	desc->wq_status = WQ_STATUS_ACTIVE;
 	desc->priority = client->priority;
 }
@@ -391,8 +391,8 @@  static void guc_stage_desc_init(struct intel_guc *guc,
 	desc->db_trigger_cpu = (uintptr_t)__get_doorbell(client);
 	desc->db_trigger_uk = gfx_addr + client->doorbell_offset;
 	desc->process_desc = gfx_addr + client->proc_desc_offset;
-	desc->wq_addr = gfx_addr + client->wq_offset;
-	desc->wq_size = client->wq_size;
+	desc->wq_addr = gfx_addr + GUC_DB_SIZE;
+	desc->wq_size = GUC_WQ_SIZE;
 
 	desc->desc_private = (uintptr_t)client;
 }
@@ -416,15 +416,15 @@  static void guc_wq_item_append(struct i915_guc_client *client,
 	struct intel_engine_cs *engine = rq->engine;
 	struct guc_process_desc *desc = __get_process_desc(client);
 	struct guc_wq_item *wqi;
-	u32 freespace, tail, wq_off;
+	u32 freespace, ring_tail, wq_off, wq_next;
 
 	/* Free space is guaranteed */
-	freespace = CIRC_SPACE(client->wq_tail, desc->head, client->wq_size);
+	freespace = CIRC_SPACE(desc->tail, desc->head, GUC_WQ_SIZE);
 	GEM_BUG_ON(freespace < wqi_size);
 
 	/* The GuC firmware wants the tail index in QWords, not bytes */
-	tail = intel_ring_set_tail(rq->ring, rq->tail) >> 3;
-	GEM_BUG_ON(tail > WQ_RING_TAIL_MAX);
+	ring_tail = intel_ring_set_tail(rq->ring, rq->tail) >> 3;
+	GEM_BUG_ON(ring_tail > WQ_RING_TAIL_MAX);
 
 	/* For now workqueue item is 4 DWs; workqueue buffer is 2 pages. So we
 	 * should not have the case where structure wqi is across page, neither
@@ -435,11 +435,12 @@  static void guc_wq_item_append(struct i915_guc_client *client,
 	 */
 	BUILD_BUG_ON(wqi_size != 16);
 
-	/* postincrement WQ tail for next time */
-	wq_off = client->wq_tail;
+	/* Find our offset and postincrement WQ tail for next time */
+	do {
+		wq_off = desc->tail;
+		wq_next = (wq_off + wqi_size) & (GUC_WQ_SIZE - 1);
+	} while (cmpxchg(&desc->tail, wq_off, wq_next) != wq_off);
 	GEM_BUG_ON(wq_off & (wqi_size - 1));
-	client->wq_tail += wqi_size;
-	client->wq_tail &= client->wq_size - 1;
 
 	/* WQ starts from the page after doorbell / process_desc */
 	wqi = client->vaddr + wq_off + GUC_DB_SIZE;
@@ -453,7 +454,7 @@  static void guc_wq_item_append(struct i915_guc_client *client,
 	/* The GuC wants only the low-order word of the context descriptor */
 	wqi->context_desc = (u32)intel_lr_context_descriptor(rq->ctx, engine);
 
-	wqi->submit_element_info = tail << WQ_RING_TAIL_SHIFT;
+	wqi->submit_element_info = ring_tail << WQ_RING_TAIL_SHIFT;
 	wqi->fence_id = rq->global_seqno;
 }
 
@@ -463,20 +464,14 @@  static void guc_reset_wq(struct i915_guc_client *client)
 
 	desc->head = 0;
 	desc->tail = 0;
-
-	client->wq_tail = 0;
 }
 
 static int guc_ring_doorbell(struct i915_guc_client *client)
 {
-	struct guc_process_desc *desc = __get_process_desc(client);
 	union guc_doorbell_qw db_cmp, db_exc, db_ret;
 	union guc_doorbell_qw *db;
 	int attempt = 2, ret = -EAGAIN;
 
-	/* Update the tail so it is visible to GuC */
-	desc->tail = client->wq_tail;
-
 	/* current cookie */
 	db_cmp.db_status = GUC_DOORBELL_ENABLED;
 	db_cmp.cookie = client->doorbell_cookie;
@@ -535,7 +530,6 @@  static void i915_guc_submit(struct intel_engine_cs *engine)
 	struct execlist_port *port = engine->execlist_port;
 	unsigned int engine_id = engine->id;
 	unsigned int n;
-	unsigned long flags;
 
 	for (n = 0; n < ARRAY_SIZE(engine->execlist_port); n++) {
 		struct drm_i915_gem_request *rq;
@@ -548,14 +542,10 @@  static void i915_guc_submit(struct intel_engine_cs *engine)
 			if (i915_vma_is_map_and_fenceable(rq->ring->vma))
 				POSTING_READ_FW(GUC_STATUS);
 
-			spin_lock_irqsave(&client->wq_lock, flags);
-
 			guc_wq_item_append(client, rq);
 			WARN_ON(guc_ring_doorbell(client));
 
 			client->submissions[engine_id] += 1;
-
-			spin_unlock_irqrestore(&client->wq_lock, flags);
 		}
 	}
 }
@@ -848,9 +838,6 @@  guc_client_alloc(struct drm_i915_private *dev_priv,
 	client->engines = engines;
 	client->priority = priority;
 	client->doorbell_id = GUC_DOORBELL_INVALID;
-	client->wq_offset = GUC_DB_SIZE;
-	client->wq_size = GUC_WQ_SIZE;
-	spin_lock_init(&client->wq_lock);
 
 	ret = ida_simple_get(&guc->stage_ids, 0, GUC_MAX_STAGE_DESCRIPTORS,
 				GFP_KERNEL);
diff --git a/drivers/gpu/drm/i915/intel_uc.h b/drivers/gpu/drm/i915/intel_uc.h
index d41051688221..851b4f173781 100644
--- a/drivers/gpu/drm/i915/intel_uc.h
+++ b/drivers/gpu/drm/i915/intel_uc.h
@@ -68,11 +68,6 @@  struct i915_guc_client {
 	unsigned long doorbell_offset;
 	u32 doorbell_cookie;
 
-	spinlock_t wq_lock;
-	uint32_t wq_offset;
-	uint32_t wq_size;
-	uint32_t wq_tail;
-
 	/* Per-engine counts of GuC submissions */
 	uint64_t submissions[I915_NUM_ENGINES];
 };