Message ID | 20170522220755.12400-3-michal.winiarski@intel.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On 22/05/17 15:07, Michał Winiarski wrote: > To create an upper bound on number of GuC workitems, we need to change > the way that requests are being submitted. Rather than submitting each > request as an individual workitem, we can do coalescing in a similar way > we're handlig execlist submission ports. We also need to stop pretending > that we're doing "lite-restore" in GuC submission (we would create a > workitem each time we hit this condition). > > v2: Also coalesce when replaying on reset (Daniele) > v3: Consistent wq_resv - per-request (Daniele) > > Cc: Chris Wilson <chris@chris-wilson.co.uk> > Cc: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com> > Cc: Jeff McGee <jeff.mcgee@intel.com> > Cc: Michal Wajdeczko <michal.wajdeczko@intel.com> > Cc: Oscar Mateo <oscar.mateo@intel.com> > Signed-off-by: Michał Winiarski <michal.winiarski@intel.com> > --- > drivers/gpu/drm/i915/i915_guc_submission.c | 72 +++++++++++++++--------------- > 1 file changed, 37 insertions(+), 35 deletions(-) > > diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c > index e6e0c6e..2a0c3161 100644 > --- a/drivers/gpu/drm/i915/i915_guc_submission.c > +++ b/drivers/gpu/drm/i915/i915_guc_submission.c > @@ -491,14 +491,12 @@ static void guc_wq_item_append(struct i915_guc_client *client, > * workqueue buffer dw by dw. 
> */ > BUILD_BUG_ON(wqi_size != 16); > - GEM_BUG_ON(client->wq_rsvd < wqi_size); > > /* postincrement WQ tail for next time */ > wq_off = client->wq_tail; > GEM_BUG_ON(wq_off & (wqi_size - 1)); > client->wq_tail += wqi_size; > client->wq_tail &= client->wq_size - 1; > - client->wq_rsvd -= wqi_size; > > /* WQ starts from the page after doorbell / process_desc */ > wqi = client->vaddr + wq_off + GUC_DB_SIZE; > @@ -580,7 +578,7 @@ static int guc_ring_doorbell(struct i915_guc_client *client) > } > > /** > - * __i915_guc_submit() - Submit commands through GuC > + * i915_guc_submit() - Submit commands through GuC > * @rq: request associated with the commands > * > * The caller must have already called i915_guc_wq_reserve() above with > @@ -594,7 +592,7 @@ static int guc_ring_doorbell(struct i915_guc_client *client) > * The only error here arises if the doorbell hardware isn't functioning > * as expected, which really shouln't happen. > */ > -static void __i915_guc_submit(struct drm_i915_gem_request *rq) > +static void i915_guc_submit(struct drm_i915_gem_request *rq) > { > struct drm_i915_private *dev_priv = rq->i915; > struct intel_engine_cs *engine = rq->engine; > @@ -618,12 +616,6 @@ static void __i915_guc_submit(struct drm_i915_gem_request *rq) > spin_unlock_irqrestore(&client->wq_lock, flags); > } > > -static void i915_guc_submit(struct drm_i915_gem_request *rq) > -{ > - __i915_gem_request_submit(rq); > - __i915_guc_submit(rq); > -} > - > static void nested_enable_signaling(struct drm_i915_gem_request *rq) > { > /* If we use dma_fence_enable_sw_signaling() directly, lockdep > @@ -659,12 +651,15 @@ static void port_assign(struct execlist_port *port, > nested_enable_signaling(rq); > } > > -static bool i915_guc_dequeue(struct intel_engine_cs *engine) > +static void i915_guc_dequeue(struct intel_engine_cs *engine) > { > struct execlist_port *port = engine->execlist_port; > - struct drm_i915_gem_request *last = port_request(port); > - struct rb_node *rb; > + struct 
drm_i915_gem_request *last = NULL; > bool submit = false; > + struct rb_node *rb; > + > + if (port_request(port)) > + port++; > > spin_lock_irq(&engine->timeline->lock); > rb = engine->execlist_first; > @@ -681,18 +676,22 @@ static bool i915_guc_dequeue(struct intel_engine_cs *engine) > goto done; > } > > - if (submit) > + if (submit) { > port_assign(port, last); > + i915_guc_submit(last); > + submit = false; > + } > port++; > } > > INIT_LIST_HEAD(&rq->priotree.link); > rq->priotree.priority = INT_MAX; > > - i915_guc_submit(rq); > + __i915_gem_request_submit(rq); > trace_i915_gem_request_in(rq, port_index(port, engine)); > last = rq; > submit = true; > + i915_guc_wq_unreserve(rq); Bikeshed: doesn't moving the removal of the reservation to before the actual WQ tail update actually theoretically introduce the possibility of racing over-reserving? It can't hurt anyway because of the coalescing, but it might be worth squashing the next patch into this one to make things cleaner. > } > > rb = rb_next(rb); > @@ -703,11 +702,11 @@ static bool i915_guc_dequeue(struct intel_engine_cs *engine) > } > done: > engine->execlist_first = rb; > - if (submit) > + if (submit) { > port_assign(port, last); > + i915_guc_submit(last); > + } > spin_unlock_irq(&engine->timeline->lock); > - > - return submit; > } > > static void i915_guc_irq_handler(unsigned long data) > @@ -715,24 +714,20 @@ static void i915_guc_irq_handler(unsigned long data) > struct intel_engine_cs *engine = (struct intel_engine_cs *)data; > struct execlist_port *port = engine->execlist_port; > struct drm_i915_gem_request *rq; > - bool submit; > > - do { > - rq = port_request(&port[0]); > - while (rq && i915_gem_request_completed(rq)) { > - trace_i915_gem_request_out(rq); > - i915_gem_request_put(rq); > + rq = port_request(&port[0]); > + while (rq && i915_gem_request_completed(rq)) { > + trace_i915_gem_request_out(rq); > + i915_gem_request_put(rq); > > - port[0] = port[1]; > - memset(&port[1], 0, sizeof(port[1])); > + 
port[0] = port[1]; > + memset(&port[1], 0, sizeof(port[1])); > > - rq = port_request(&port[0]); > - } > + rq = port_request(&port[0]); > + } > > - submit = false; > - if (!port_count(&port[1])) > - submit = i915_guc_dequeue(engine); > - } while (submit); > + if (!port_isset(&port[1])) > + i915_guc_dequeue(engine); > } > > /* > @@ -1250,6 +1245,8 @@ int i915_guc_submission_enable(struct drm_i915_private *dev_priv) > for_each_engine(engine, dev_priv, id) { > const int wqi_size = sizeof(struct guc_wq_item); > struct drm_i915_gem_request *rq; > + struct execlist_port *port = engine->execlist_port; > + int n; > > /* The tasklet was initialised by execlists, and may be in > * a state of flux (across a reset) and so we just want to > @@ -1261,11 +1258,16 @@ int i915_guc_submission_enable(struct drm_i915_private *dev_priv) > > /* Replay the current set of previously submitted requests */ > spin_lock_irq(&engine->timeline->lock); > - list_for_each_entry(rq, &engine->timeline->requests, link) { > + list_for_each_entry(rq, &engine->timeline->requests, link) > guc_client_update_wq_rsvd(client, wqi_size); Can we just drop this entirely due to the new changes in this version of the patch? We don't require wq_rsvd to be > 0 in i915_guc_submit anymore and we're guaranteed to have space (the wq is empty and we're only going to submit up to 2 items). Otherwise, aren't we going to leak this rsvd space since we don't clean it in i915_guc_submit anymore? Thanks, Daniele > - __i915_guc_submit(rq); > - } > spin_unlock_irq(&engine->timeline->lock); > + > + for (n = 0; n < ARRAY_SIZE(engine->execlist_port); n++) { > + if (!port_isset(&port[n])) > + break; > + > + i915_guc_submit(port_request(&port[n])); > + } > } > > return 0; >
On Tue, May 23, 2017 at 12:07:50AM +0200, Michał Winiarski wrote: > To create an upper bound on number of GuC workitems, we need to change > the way that requests are being submitted. Rather than submitting each > request as an individual workitem, we can do coalescing in a similar way > we're handlig execlist submission ports. We also need to stop pretending > that we're doing "lite-restore" in GuC submission (we would create a > workitem each time we hit this condition). > > v2: Also coalesce when replaying on reset (Daniele) > v3: Consistent wq_resv - per-request (Daniele) > > Cc: Chris Wilson <chris@chris-wilson.co.uk> > Cc: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com> > Cc: Jeff McGee <jeff.mcgee@intel.com> > Cc: Michal Wajdeczko <michal.wajdeczko@intel.com> > Cc: Oscar Mateo <oscar.mateo@intel.com> > Signed-off-by: Michał Winiarski <michal.winiarski@intel.com> > --- > drivers/gpu/drm/i915/i915_guc_submission.c | 72 +++++++++++++++--------------- > 1 file changed, 37 insertions(+), 35 deletions(-) > > diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c > index e6e0c6e..2a0c3161 100644 > --- a/drivers/gpu/drm/i915/i915_guc_submission.c > +++ b/drivers/gpu/drm/i915/i915_guc_submission.c > @@ -491,14 +491,12 @@ static void guc_wq_item_append(struct i915_guc_client *client, > * workqueue buffer dw by dw. > */ > BUILD_BUG_ON(wqi_size != 16); > - GEM_BUG_ON(client->wq_rsvd < wqi_size); > > /* postincrement WQ tail for next time */ > wq_off = client->wq_tail; > GEM_BUG_ON(wq_off & (wqi_size - 1)); > client->wq_tail += wqi_size; > client->wq_tail &= client->wq_size - 1; > - client->wq_rsvd -= wqi_size; Stray chunk for another patch. 
> > /* WQ starts from the page after doorbell / process_desc */ > wqi = client->vaddr + wq_off + GUC_DB_SIZE; > @@ -580,7 +578,7 @@ static int guc_ring_doorbell(struct i915_guc_client *client) > } > > /** > - * __i915_guc_submit() - Submit commands through GuC > + * i915_guc_submit() - Submit commands through GuC > * @rq: request associated with the commands > * > * The caller must have already called i915_guc_wq_reserve() above with > @@ -594,7 +592,7 @@ static int guc_ring_doorbell(struct i915_guc_client *client) > * The only error here arises if the doorbell hardware isn't functioning > * as expected, which really shouln't happen. > */ > -static void __i915_guc_submit(struct drm_i915_gem_request *rq) > +static void i915_guc_submit(struct drm_i915_gem_request *rq) > { > struct drm_i915_private *dev_priv = rq->i915; > struct intel_engine_cs *engine = rq->engine; > @@ -618,12 +616,6 @@ static void __i915_guc_submit(struct drm_i915_gem_request *rq) > spin_unlock_irqrestore(&client->wq_lock, flags); > } > > -static void i915_guc_submit(struct drm_i915_gem_request *rq) > -{ > - __i915_gem_request_submit(rq); > - __i915_guc_submit(rq); > -} > - > static void nested_enable_signaling(struct drm_i915_gem_request *rq) > { > /* If we use dma_fence_enable_sw_signaling() directly, lockdep > @@ -659,12 +651,15 @@ static void port_assign(struct execlist_port *port, > nested_enable_signaling(rq); > } > > -static bool i915_guc_dequeue(struct intel_engine_cs *engine) > +static void i915_guc_dequeue(struct intel_engine_cs *engine) > { > struct execlist_port *port = engine->execlist_port; > - struct drm_i915_gem_request *last = port_request(port); > - struct rb_node *rb; > + struct drm_i915_gem_request *last = NULL; > bool submit = false; > + struct rb_node *rb; > + > + if (port_request(port)) port_isset() > + port++; > > spin_lock_irq(&engine->timeline->lock); > rb = engine->execlist_first; > @@ -681,18 +676,22 @@ static bool i915_guc_dequeue(struct intel_engine_cs *engine) > goto 
done; > } > > - if (submit) > + if (submit) { > port_assign(port, last); > + i915_guc_submit(last); > + submit = false; The way I did it was to make it look more like execlists again by passing engine to i915_guc_submit() and have it iterate over the ports, so that is only called at the end and can be done outside of the spinlock. (Which is an issue as this spinlock is highly contended on guc.) I then used port_count to avoid resubmitting requests. > + } > port++; > } > > INIT_LIST_HEAD(&rq->priotree.link); > rq->priotree.priority = INT_MAX; > > - i915_guc_submit(rq); > + __i915_gem_request_submit(rq); > trace_i915_gem_request_in(rq, port_index(port, engine)); > last = rq; > submit = true; > + i915_guc_wq_unreserve(rq); > } > > rb = rb_next(rb); > @@ -703,11 +702,11 @@ static bool i915_guc_dequeue(struct intel_engine_cs *engine) > } > done: > engine->execlist_first = rb; > - if (submit) > + if (submit) { > port_assign(port, last); > + i915_guc_submit(last); > + } > spin_unlock_irq(&engine->timeline->lock); > - > - return submit; > } > > static void i915_guc_irq_handler(unsigned long data) > @@ -715,24 +714,20 @@ static void i915_guc_irq_handler(unsigned long data) > struct intel_engine_cs *engine = (struct intel_engine_cs *)data; > struct execlist_port *port = engine->execlist_port; > struct drm_i915_gem_request *rq; > - bool submit; > > - do { > - rq = port_request(&port[0]); > - while (rq && i915_gem_request_completed(rq)) { > - trace_i915_gem_request_out(rq); > - i915_gem_request_put(rq); > + rq = port_request(&port[0]); > + while (rq && i915_gem_request_completed(rq)) { > + trace_i915_gem_request_out(rq); > + i915_gem_request_put(rq); > > - port[0] = port[1]; > - memset(&port[1], 0, sizeof(port[1])); > + port[0] = port[1]; > + memset(&port[1], 0, sizeof(port[1])); > > - rq = port_request(&port[0]); > - } > + rq = port_request(&port[0]); > + } > > - submit = false; > - if (!port_count(&port[1])) > - submit = i915_guc_dequeue(engine); > - } while (submit); > + if 
(!port_isset(&port[1])) > + i915_guc_dequeue(engine); This is a change not recorded in the changelog, so separate patch for justification (e.g. it is nothing to do with lite-restore). > } > > /* > @@ -1250,6 +1245,8 @@ int i915_guc_submission_enable(struct drm_i915_private *dev_priv) > for_each_engine(engine, dev_priv, id) { > const int wqi_size = sizeof(struct guc_wq_item); > struct drm_i915_gem_request *rq; > + struct execlist_port *port = engine->execlist_port; > + int n; > > /* The tasklet was initialised by execlists, and may be in > * a state of flux (across a reset) and so we just want to > @@ -1261,11 +1258,16 @@ int i915_guc_submission_enable(struct drm_i915_private *dev_priv) > > /* Replay the current set of previously submitted requests */ > spin_lock_irq(&engine->timeline->lock); > - list_for_each_entry(rq, &engine->timeline->requests, link) { > + list_for_each_entry(rq, &engine->timeline->requests, link) > guc_client_update_wq_rsvd(client, wqi_size); > - __i915_guc_submit(rq); > - } > spin_unlock_irq(&engine->timeline->lock); > + > + for (n = 0; n < ARRAY_SIZE(engine->execlist_port); n++) { > + if (!port_isset(&port[n])) > + break; > + > + i915_guc_submit(port_request(&port[n])); > + } ... which just falls out if you passed engine to i915_guc_submit(). -Chris
diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c index e6e0c6e..2a0c3161 100644 --- a/drivers/gpu/drm/i915/i915_guc_submission.c +++ b/drivers/gpu/drm/i915/i915_guc_submission.c @@ -491,14 +491,12 @@ static void guc_wq_item_append(struct i915_guc_client *client, * workqueue buffer dw by dw. */ BUILD_BUG_ON(wqi_size != 16); - GEM_BUG_ON(client->wq_rsvd < wqi_size); /* postincrement WQ tail for next time */ wq_off = client->wq_tail; GEM_BUG_ON(wq_off & (wqi_size - 1)); client->wq_tail += wqi_size; client->wq_tail &= client->wq_size - 1; - client->wq_rsvd -= wqi_size; /* WQ starts from the page after doorbell / process_desc */ wqi = client->vaddr + wq_off + GUC_DB_SIZE; @@ -580,7 +578,7 @@ static int guc_ring_doorbell(struct i915_guc_client *client) } /** - * __i915_guc_submit() - Submit commands through GuC + * i915_guc_submit() - Submit commands through GuC * @rq: request associated with the commands * * The caller must have already called i915_guc_wq_reserve() above with @@ -594,7 +592,7 @@ static int guc_ring_doorbell(struct i915_guc_client *client) * The only error here arises if the doorbell hardware isn't functioning * as expected, which really shouln't happen. 
*/ -static void __i915_guc_submit(struct drm_i915_gem_request *rq) +static void i915_guc_submit(struct drm_i915_gem_request *rq) { struct drm_i915_private *dev_priv = rq->i915; struct intel_engine_cs *engine = rq->engine; @@ -618,12 +616,6 @@ static void __i915_guc_submit(struct drm_i915_gem_request *rq) spin_unlock_irqrestore(&client->wq_lock, flags); } -static void i915_guc_submit(struct drm_i915_gem_request *rq) -{ - __i915_gem_request_submit(rq); - __i915_guc_submit(rq); -} - static void nested_enable_signaling(struct drm_i915_gem_request *rq) { /* If we use dma_fence_enable_sw_signaling() directly, lockdep @@ -659,12 +651,15 @@ static void port_assign(struct execlist_port *port, nested_enable_signaling(rq); } -static bool i915_guc_dequeue(struct intel_engine_cs *engine) +static void i915_guc_dequeue(struct intel_engine_cs *engine) { struct execlist_port *port = engine->execlist_port; - struct drm_i915_gem_request *last = port_request(port); - struct rb_node *rb; + struct drm_i915_gem_request *last = NULL; bool submit = false; + struct rb_node *rb; + + if (port_request(port)) + port++; spin_lock_irq(&engine->timeline->lock); rb = engine->execlist_first; @@ -681,18 +676,22 @@ static bool i915_guc_dequeue(struct intel_engine_cs *engine) goto done; } - if (submit) + if (submit) { port_assign(port, last); + i915_guc_submit(last); + submit = false; + } port++; } INIT_LIST_HEAD(&rq->priotree.link); rq->priotree.priority = INT_MAX; - i915_guc_submit(rq); + __i915_gem_request_submit(rq); trace_i915_gem_request_in(rq, port_index(port, engine)); last = rq; submit = true; + i915_guc_wq_unreserve(rq); } rb = rb_next(rb); @@ -703,11 +702,11 @@ static bool i915_guc_dequeue(struct intel_engine_cs *engine) } done: engine->execlist_first = rb; - if (submit) + if (submit) { port_assign(port, last); + i915_guc_submit(last); + } spin_unlock_irq(&engine->timeline->lock); - - return submit; } static void i915_guc_irq_handler(unsigned long data) @@ -715,24 +714,20 @@ static void 
i915_guc_irq_handler(unsigned long data) struct intel_engine_cs *engine = (struct intel_engine_cs *)data; struct execlist_port *port = engine->execlist_port; struct drm_i915_gem_request *rq; - bool submit; - do { - rq = port_request(&port[0]); - while (rq && i915_gem_request_completed(rq)) { - trace_i915_gem_request_out(rq); - i915_gem_request_put(rq); + rq = port_request(&port[0]); + while (rq && i915_gem_request_completed(rq)) { + trace_i915_gem_request_out(rq); + i915_gem_request_put(rq); - port[0] = port[1]; - memset(&port[1], 0, sizeof(port[1])); + port[0] = port[1]; + memset(&port[1], 0, sizeof(port[1])); - rq = port_request(&port[0]); - } + rq = port_request(&port[0]); + } - submit = false; - if (!port_count(&port[1])) - submit = i915_guc_dequeue(engine); - } while (submit); + if (!port_isset(&port[1])) + i915_guc_dequeue(engine); } /* @@ -1250,6 +1245,8 @@ int i915_guc_submission_enable(struct drm_i915_private *dev_priv) for_each_engine(engine, dev_priv, id) { const int wqi_size = sizeof(struct guc_wq_item); struct drm_i915_gem_request *rq; + struct execlist_port *port = engine->execlist_port; + int n; /* The tasklet was initialised by execlists, and may be in * a state of flux (across a reset) and so we just want to @@ -1261,11 +1258,16 @@ int i915_guc_submission_enable(struct drm_i915_private *dev_priv) /* Replay the current set of previously submitted requests */ spin_lock_irq(&engine->timeline->lock); - list_for_each_entry(rq, &engine->timeline->requests, link) { + list_for_each_entry(rq, &engine->timeline->requests, link) guc_client_update_wq_rsvd(client, wqi_size); - __i915_guc_submit(rq); - } spin_unlock_irq(&engine->timeline->lock); + + for (n = 0; n < ARRAY_SIZE(engine->execlist_port); n++) { + if (!port_isset(&port[n])) + break; + + i915_guc_submit(port_request(&port[n])); + } } return 0;
To create an upper bound on number of GuC workitems, we need to change the way that requests are being submitted. Rather than submitting each request as an individual workitem, we can do coalescing in a similar way we're handling execlist submission ports. We also need to stop pretending that we're doing "lite-restore" in GuC submission (we would create a workitem each time we hit this condition). v2: Also coalesce when replaying on reset (Daniele) v3: Consistent wq_resv - per-request (Daniele) Cc: Chris Wilson <chris@chris-wilson.co.uk> Cc: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com> Cc: Jeff McGee <jeff.mcgee@intel.com> Cc: Michal Wajdeczko <michal.wajdeczko@intel.com> Cc: Oscar Mateo <oscar.mateo@intel.com> Signed-off-by: Michał Winiarski <michal.winiarski@intel.com> --- drivers/gpu/drm/i915/i915_guc_submission.c | 72 +++++++++++++++--------------- 1 file changed, 37 insertions(+), 35 deletions(-)