diff mbox

[v2,2/6] drm/i915/guc: Submit GuC workitems containing coalesced requests

Message ID 20170519132321.11953-2-michal.winiarski@intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

Michał Winiarski May 19, 2017, 1:23 p.m. UTC
To create an upper bound on the number of GuC workitems, we need to change
the way that requests are being submitted. Rather than submitting each
request as an individual workitem, we can do coalescing in a similar way
we're handling execlist submission ports. We also need to stop pretending
that we're doing "lite-restore" in GuC submission (we would create a
workitem each time we hit this condition).

v2: Also coalesce when replaying on reset (Daniele)

Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
Cc: Michal Wajdeczko <michal.wajdeczko@intel.com>
Cc: Oscar Mateo <oscar.mateo@intel.com>
Signed-off-by: Michał Winiarski <michal.winiarski@intel.com>
---
 drivers/gpu/drm/i915/i915_guc_submission.c | 68 ++++++++++++++++--------------
 1 file changed, 36 insertions(+), 32 deletions(-)

Comments

Daniele Ceraolo Spurio May 19, 2017, 6:08 p.m. UTC | #1
On 19/05/17 06:23, Michał Winiarski wrote:
> To create an upper bound on number of GuC workitems, we need to change
> the way that requests are being submitted. Rather than submitting each
> request as an individual workitem, we can do coalescing in a similar way
> we're handlig execlist submission ports. We also need to stop pretending
> that we're doing "lite-restore" in GuC submission (we would create a
> workitem each time we hit this condition).
>
> v2: Also coalesce when replaying on reset (Daniele)
>
> Cc: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
> Cc: Michal Wajdeczko <michal.wajdeczko@intel.com>
> Cc: Oscar Mateo <oscar.mateo@intel.com>
> Signed-off-by: Michał Winiarski <michal.winiarski@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_guc_submission.c | 68 ++++++++++++++++--------------
>  1 file changed, 36 insertions(+), 32 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c
> index b3da056..a586998 100644
> --- a/drivers/gpu/drm/i915/i915_guc_submission.c
> +++ b/drivers/gpu/drm/i915/i915_guc_submission.c
> @@ -580,7 +580,7 @@ static int guc_ring_doorbell(struct i915_guc_client *client)
>  }
>
>  /**
> - * __i915_guc_submit() - Submit commands through GuC
> + * i915_guc_submit() - Submit commands through GuC
>   * @rq:		request associated with the commands
>   *
>   * The caller must have already called i915_guc_wq_reserve() above with
> @@ -594,7 +594,7 @@ static int guc_ring_doorbell(struct i915_guc_client *client)
>   * The only error here arises if the doorbell hardware isn't functioning
>   * as expected, which really shouln't happen.
>   */
> -static void __i915_guc_submit(struct drm_i915_gem_request *rq)
> +static void i915_guc_submit(struct drm_i915_gem_request *rq)
>  {
>  	struct drm_i915_private *dev_priv = rq->i915;
>  	struct intel_engine_cs *engine = rq->engine;
> @@ -624,12 +624,6 @@ static void __i915_guc_submit(struct drm_i915_gem_request *rq)
>  	spin_unlock_irqrestore(&client->wq_lock, flags);
>  }
>
> -static void i915_guc_submit(struct drm_i915_gem_request *rq)
> -{
> -	__i915_gem_request_submit(rq);
> -	__i915_guc_submit(rq);
> -}
> -
>  static void nested_enable_signaling(struct drm_i915_gem_request *rq)
>  {
>  	/* If we use dma_fence_enable_sw_signaling() directly, lockdep
> @@ -665,12 +659,15 @@ static void port_assign(struct execlist_port *port,
>  	nested_enable_signaling(rq);
>  }
>
> -static bool i915_guc_dequeue(struct intel_engine_cs *engine)
> +static void i915_guc_dequeue(struct intel_engine_cs *engine)
>  {
>  	struct execlist_port *port = engine->execlist_port;
> -	struct drm_i915_gem_request *last = port_request(port);
> -	struct rb_node *rb;
> +	struct drm_i915_gem_request *last = NULL;
>  	bool submit = false;
> +	struct rb_node *rb;
> +
> +	if (port_request(port))
> +		port++;
>
>  	spin_lock_irq(&engine->timeline->lock);
>  	rb = engine->execlist_first;
> @@ -689,12 +686,16 @@ static bool i915_guc_dequeue(struct intel_engine_cs *engine)
>
>  				port_assign(port, last);
>  				port++;
> +				if (submit) {
> +					i915_guc_submit(last);
> +					submit = false;
> +				}
>  			}
>
>  			INIT_LIST_HEAD(&rq->priotree.link);
>  			rq->priotree.priority = INT_MAX;
>
> -			i915_guc_submit(rq);
> +			__i915_gem_request_submit(rq);
>  			trace_i915_gem_request_in(rq, port_index(port, engine));
>  			last = rq;
>  			submit = true;
> @@ -708,11 +709,11 @@ static bool i915_guc_dequeue(struct intel_engine_cs *engine)
>  	}
>  done:
>  	engine->execlist_first = rb;
> -	if (submit)
> +	if (submit) {
>  		port_assign(port, last);
> +		i915_guc_submit(last);
> +	}

Since now we only call both port_assign and i915_guc_submit once per 
port, can we move both usages in the loop, before the break, and remove 
it from here? i.e.:

     if (last && rq->ctx != last->ctx) {

         if (submit) {
             port_assign(port, last);
             i915_guc_submit(last);
         }
         if (port != engine->execlist_port) {
             __list_del_many(&p->requests,
                     &rq->priotree.link);
             goto done;
         }
         port++;
     }

Or do we have issues with the ordering of operations?


>  	spin_unlock_irq(&engine->timeline->lock);
> -
> -	return submit;
>  }
>
>  static void i915_guc_irq_handler(unsigned long data)
> @@ -720,24 +721,20 @@ static void i915_guc_irq_handler(unsigned long data)
>  	struct intel_engine_cs *engine = (struct intel_engine_cs *)data;
>  	struct execlist_port *port = engine->execlist_port;
>  	struct drm_i915_gem_request *rq;
> -	bool submit;
>
> -	do {
> -		rq = port_request(&port[0]);
> -		while (rq && i915_gem_request_completed(rq)) {
> -			trace_i915_gem_request_out(rq);
> -			i915_gem_request_put(rq);
> +	rq = port_request(&port[0]);
> +	while (rq && i915_gem_request_completed(rq)) {
> +		trace_i915_gem_request_out(rq);
> +		i915_gem_request_put(rq);
>
> -			port[0] = port[1];
> -			memset(&port[1], 0, sizeof(port[1]));
> +		port[0] = port[1];
> +		memset(&port[1], 0, sizeof(port[1]));
>
> -			rq = port_request(&port[0]);
> -		}
> +		rq = port_request(&port[0]);
> +	}
>
> -		submit = false;
> -		if (!port_count(&port[1]))
> -			submit = i915_guc_dequeue(engine);
> -	} while (submit);
> +	if (!port_isset(&port[1]))
> +		i915_guc_dequeue(engine);
>  }
>
>  /*
> @@ -1255,6 +1252,8 @@ int i915_guc_submission_enable(struct drm_i915_private *dev_priv)
>  	for_each_engine(engine, dev_priv, id) {
>  		const int wqi_size = sizeof(struct guc_wq_item);
>  		struct drm_i915_gem_request *rq;
> +		struct execlist_port *port = engine->execlist_port;
> +		int n;
>
>  		/* The tasklet was initialised by execlists, and may be in
>  		 * a state of flux (across a reset) and so we just want to
> @@ -1266,11 +1265,16 @@ int i915_guc_submission_enable(struct drm_i915_private *dev_priv)
>
>  		/* Replay the current set of previously submitted requests */
>  		spin_lock_irq(&engine->timeline->lock);
> -		list_for_each_entry(rq, &engine->timeline->requests, link) {
> +		list_for_each_entry(rq, &engine->timeline->requests, link)
>  			guc_client_update_wq_rsvd(client, wqi_size);

Should we move the reservation to just above i915_guc_submit()? we use 1 
wq item per port and not per request starting this patch. Code is going 
away in the next patch anyway so I'm happy if you skip this change to 
keep things simple.

Thanks,
Daniele

> -			__i915_guc_submit(rq);
> -		}
>  		spin_unlock_irq(&engine->timeline->lock);
> +
> +		for (n = 0; n < ARRAY_SIZE(engine->execlist_port); n++) {
> +			if (!port_isset(&port[n]))
> +				break;
> +
> +			i915_guc_submit(port_request(&port[n]));
> +		}
>  	}
>
>  	return 0;
>
Michał Winiarski May 22, 2017, 9:53 p.m. UTC | #2
On Fri, May 19, 2017 at 11:08:05AM -0700, Daniele Ceraolo Spurio wrote:
> 
> 
> On 19/05/17 06:23, Michał Winiarski wrote:
> > To create an upper bound on number of GuC workitems, we need to change
> > the way that requests are being submitted. Rather than submitting each
> > request as an individual workitem, we can do coalescing in a similar way
> > we're handlig execlist submission ports. We also need to stop pretending
> > that we're doing "lite-restore" in GuC submission (we would create a
> > workitem each time we hit this condition).
> > 
> > v2: Also coalesce when replaying on reset (Daniele)
> > 
> > Cc: Chris Wilson <chris@chris-wilson.co.uk>
> > Cc: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
> > Cc: Michal Wajdeczko <michal.wajdeczko@intel.com>
> > Cc: Oscar Mateo <oscar.mateo@intel.com>
> > Signed-off-by: Michał Winiarski <michal.winiarski@intel.com>
> > ---
> >  drivers/gpu/drm/i915/i915_guc_submission.c | 68 ++++++++++++++++--------------
> >  1 file changed, 36 insertions(+), 32 deletions(-)
> > 

[SNIP]

> > @@ -689,12 +686,16 @@ static bool i915_guc_dequeue(struct intel_engine_cs *engine)
> > 
> >  				port_assign(port, last);
> >  				port++;
> > +				if (submit) {
> > +					i915_guc_submit(last);
> > +					submit = false;
> > +				}
> >  			}
> > 
> >  			INIT_LIST_HEAD(&rq->priotree.link);
> >  			rq->priotree.priority = INT_MAX;
> > 
> > -			i915_guc_submit(rq);
> > +			__i915_gem_request_submit(rq);
> >  			trace_i915_gem_request_in(rq, port_index(port, engine));
> >  			last = rq;
> >  			submit = true;
> > @@ -708,11 +709,11 @@ static bool i915_guc_dequeue(struct intel_engine_cs *engine)
> >  	}
> >  done:
> >  	engine->execlist_first = rb;
> > -	if (submit)
> > +	if (submit) {
> >  		port_assign(port, last);
> > +		i915_guc_submit(last);
> > +	}
> 
> Since now we only call both port_assign and i915_guc_submit once per port,
> can we move both usages in the loop, before the break, and remove it from
> here? i.e.:
> 
>     if (last && rq->ctx != last->ctx) {
> 
>         if (submit) {
>             port_assign(port, last);
>             i915_guc_submit(last);
>         }
>         if (port != engine->execlist_port) {
>             __list_del_many(&p->requests,
>                     &rq->priotree.link);
>             goto done;
>         }
>         port++;
>     }
> 
> Or do we have issues with the ordering of operations?

No, not with the ordering, it's just that we won't handle the case where we've
run out of requests (we're done iterating through the list, but we still have to
submit the "last" request). Still, I think moving port_assign and port++ under
the submit conditional makes the code a bit easier on the reader - I'll do that
in the next version.

> 
> 
> >  	spin_unlock_irq(&engine->timeline->lock);
> > -
> > -	return submit;
> >  }
> > 
> >  static void i915_guc_irq_handler(unsigned long data)
> > @@ -720,24 +721,20 @@ static void i915_guc_irq_handler(unsigned long data)
> >  	struct intel_engine_cs *engine = (struct intel_engine_cs *)data;
> >  	struct execlist_port *port = engine->execlist_port;
> >  	struct drm_i915_gem_request *rq;
> > -	bool submit;
> > 
> > -	do {
> > -		rq = port_request(&port[0]);
> > -		while (rq && i915_gem_request_completed(rq)) {
> > -			trace_i915_gem_request_out(rq);
> > -			i915_gem_request_put(rq);
> > +	rq = port_request(&port[0]);
> > +	while (rq && i915_gem_request_completed(rq)) {
> > +		trace_i915_gem_request_out(rq);
> > +		i915_gem_request_put(rq);
> > 
> > -			port[0] = port[1];
> > -			memset(&port[1], 0, sizeof(port[1]));
> > +		port[0] = port[1];
> > +		memset(&port[1], 0, sizeof(port[1]));
> > 
> > -			rq = port_request(&port[0]);
> > -		}
> > +		rq = port_request(&port[0]);
> > +	}
> > 
> > -		submit = false;
> > -		if (!port_count(&port[1]))
> > -			submit = i915_guc_dequeue(engine);
> > -	} while (submit);
> > +	if (!port_isset(&port[1]))
> > +		i915_guc_dequeue(engine);
> >  }
> > 
> >  /*
> > @@ -1255,6 +1252,8 @@ int i915_guc_submission_enable(struct drm_i915_private *dev_priv)
> >  	for_each_engine(engine, dev_priv, id) {
> >  		const int wqi_size = sizeof(struct guc_wq_item);
> >  		struct drm_i915_gem_request *rq;
> > +		struct execlist_port *port = engine->execlist_port;
> > +		int n;
> > 
> >  		/* The tasklet was initialised by execlists, and may be in
> >  		 * a state of flux (across a reset) and so we just want to
> > @@ -1266,11 +1265,16 @@ int i915_guc_submission_enable(struct drm_i915_private *dev_priv)
> > 
> >  		/* Replay the current set of previously submitted requests */
> >  		spin_lock_irq(&engine->timeline->lock);
> > -		list_for_each_entry(rq, &engine->timeline->requests, link) {
> > +		list_for_each_entry(rq, &engine->timeline->requests, link)
> >  			guc_client_update_wq_rsvd(client, wqi_size);
> 
> Should we move the reservation to just above i915_guc_submit()? we use 1 wq
> item per port and not per request starting this patch. Code is going away in
> the next patch anyway so I'm happy if you skip this change to keep things
> simple.

Yeah, actually, wq_resv is broken in this version. We're reserving on a
per-request basis, but we're unreserving on a per-workitem basis, which will
eventually lead to wq_resv going full. I still need to fix it for bisectability.

-Michał
 
> Thanks,
> Daniele
> 
> > -			__i915_guc_submit(rq);
> > -		}
> >  		spin_unlock_irq(&engine->timeline->lock);
> > +
> > +		for (n = 0; n < ARRAY_SIZE(engine->execlist_port); n++) {
> > +			if (!port_isset(&port[n]))
> > +				break;
> > +
> > +			i915_guc_submit(port_request(&port[n]));
> > +		}
> >  	}
> > 
> >  	return 0;
> >
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c
index b3da056..a586998 100644
--- a/drivers/gpu/drm/i915/i915_guc_submission.c
+++ b/drivers/gpu/drm/i915/i915_guc_submission.c
@@ -580,7 +580,7 @@  static int guc_ring_doorbell(struct i915_guc_client *client)
 }
 
 /**
- * __i915_guc_submit() - Submit commands through GuC
+ * i915_guc_submit() - Submit commands through GuC
  * @rq:		request associated with the commands
  *
  * The caller must have already called i915_guc_wq_reserve() above with
@@ -594,7 +594,7 @@  static int guc_ring_doorbell(struct i915_guc_client *client)
  * The only error here arises if the doorbell hardware isn't functioning
  * as expected, which really shouln't happen.
  */
-static void __i915_guc_submit(struct drm_i915_gem_request *rq)
+static void i915_guc_submit(struct drm_i915_gem_request *rq)
 {
 	struct drm_i915_private *dev_priv = rq->i915;
 	struct intel_engine_cs *engine = rq->engine;
@@ -624,12 +624,6 @@  static void __i915_guc_submit(struct drm_i915_gem_request *rq)
 	spin_unlock_irqrestore(&client->wq_lock, flags);
 }
 
-static void i915_guc_submit(struct drm_i915_gem_request *rq)
-{
-	__i915_gem_request_submit(rq);
-	__i915_guc_submit(rq);
-}
-
 static void nested_enable_signaling(struct drm_i915_gem_request *rq)
 {
 	/* If we use dma_fence_enable_sw_signaling() directly, lockdep
@@ -665,12 +659,15 @@  static void port_assign(struct execlist_port *port,
 	nested_enable_signaling(rq);
 }
 
-static bool i915_guc_dequeue(struct intel_engine_cs *engine)
+static void i915_guc_dequeue(struct intel_engine_cs *engine)
 {
 	struct execlist_port *port = engine->execlist_port;
-	struct drm_i915_gem_request *last = port_request(port);
-	struct rb_node *rb;
+	struct drm_i915_gem_request *last = NULL;
 	bool submit = false;
+	struct rb_node *rb;
+
+	if (port_request(port))
+		port++;
 
 	spin_lock_irq(&engine->timeline->lock);
 	rb = engine->execlist_first;
@@ -689,12 +686,16 @@  static bool i915_guc_dequeue(struct intel_engine_cs *engine)
 
 				port_assign(port, last);
 				port++;
+				if (submit) {
+					i915_guc_submit(last);
+					submit = false;
+				}
 			}
 
 			INIT_LIST_HEAD(&rq->priotree.link);
 			rq->priotree.priority = INT_MAX;
 
-			i915_guc_submit(rq);
+			__i915_gem_request_submit(rq);
 			trace_i915_gem_request_in(rq, port_index(port, engine));
 			last = rq;
 			submit = true;
@@ -708,11 +709,11 @@  static bool i915_guc_dequeue(struct intel_engine_cs *engine)
 	}
 done:
 	engine->execlist_first = rb;
-	if (submit)
+	if (submit) {
 		port_assign(port, last);
+		i915_guc_submit(last);
+	}
 	spin_unlock_irq(&engine->timeline->lock);
-
-	return submit;
 }
 
 static void i915_guc_irq_handler(unsigned long data)
@@ -720,24 +721,20 @@  static void i915_guc_irq_handler(unsigned long data)
 	struct intel_engine_cs *engine = (struct intel_engine_cs *)data;
 	struct execlist_port *port = engine->execlist_port;
 	struct drm_i915_gem_request *rq;
-	bool submit;
 
-	do {
-		rq = port_request(&port[0]);
-		while (rq && i915_gem_request_completed(rq)) {
-			trace_i915_gem_request_out(rq);
-			i915_gem_request_put(rq);
+	rq = port_request(&port[0]);
+	while (rq && i915_gem_request_completed(rq)) {
+		trace_i915_gem_request_out(rq);
+		i915_gem_request_put(rq);
 
-			port[0] = port[1];
-			memset(&port[1], 0, sizeof(port[1]));
+		port[0] = port[1];
+		memset(&port[1], 0, sizeof(port[1]));
 
-			rq = port_request(&port[0]);
-		}
+		rq = port_request(&port[0]);
+	}
 
-		submit = false;
-		if (!port_count(&port[1]))
-			submit = i915_guc_dequeue(engine);
-	} while (submit);
+	if (!port_isset(&port[1]))
+		i915_guc_dequeue(engine);
 }
 
 /*
@@ -1255,6 +1252,8 @@  int i915_guc_submission_enable(struct drm_i915_private *dev_priv)
 	for_each_engine(engine, dev_priv, id) {
 		const int wqi_size = sizeof(struct guc_wq_item);
 		struct drm_i915_gem_request *rq;
+		struct execlist_port *port = engine->execlist_port;
+		int n;
 
 		/* The tasklet was initialised by execlists, and may be in
 		 * a state of flux (across a reset) and so we just want to
@@ -1266,11 +1265,16 @@  int i915_guc_submission_enable(struct drm_i915_private *dev_priv)
 
 		/* Replay the current set of previously submitted requests */
 		spin_lock_irq(&engine->timeline->lock);
-		list_for_each_entry(rq, &engine->timeline->requests, link) {
+		list_for_each_entry(rq, &engine->timeline->requests, link)
 			guc_client_update_wq_rsvd(client, wqi_size);
-			__i915_guc_submit(rq);
-		}
 		spin_unlock_irq(&engine->timeline->lock);
+
+		for (n = 0; n < ARRAY_SIZE(engine->execlist_port); n++) {
+			if (!port_isset(&port[n]))
+				break;
+
+			i915_guc_submit(port_request(&port[n]));
+		}
 	}
 
 	return 0;