diff mbox

[4/4] drm/i915: Opportunistically reduce flushing at execbuf

Message ID 1418526504-26316-5-git-send-email-benjamin.widawsky@intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

Ben Widawsky Dec. 14, 2014, 3:08 a.m. UTC
If we're moving a bunch of buffers from the CPU domain to the GPU domain, and
we've already blown out the entire cache via a wbinvd, there is nothing more to
do.

With this and the previous patches, I am seeing a 3x FPS increase on a certain
benchmark which uses a giant 2d array texture. Unless I missed something in the
code, it should only affect non-LLC i915 platforms.

I haven't yet run any numbers for other benchmarks, nor have I attempted to
check if various conformance tests still pass.

NOTE: As mentioned in the previous patch, if one can easily obtain the largest
buffer and attempt to flush it first, the results would be even more desirable.

Cc: DRI Development <dri-devel@lists.freedesktop.org>
Signed-off-by: Ben Widawsky <ben@bwidawsk.net>
---
 drivers/gpu/drm/i915/i915_drv.h            |  3 ++-
 drivers/gpu/drm/i915/i915_gem.c            | 12 +++++-------
 drivers/gpu/drm/i915/i915_gem_execbuffer.c |  8 +++++---
 drivers/gpu/drm/i915/intel_lrc.c           |  8 +++++---
 4 files changed, 17 insertions(+), 14 deletions(-)

Comments

Ville Syrjala Dec. 14, 2014, 1:12 p.m. UTC | #1
On Sat, Dec 13, 2014 at 07:08:24PM -0800, Ben Widawsky wrote:
> If we're moving a bunch of buffers from the CPU domain to the GPU domain, and
> we've already blown out the entire cache via a wbinvd, there is nothing more to
> do.
> 
> With this and the previous patches, I am seeing a 3x FPS increase on a certain
> benchmark which uses a giant 2d array texture. Unless I missed something in the
> code, it should only effect non-LLC i915 platforms.
> 
> I haven't yet run any numbers for other benchmarks, nor have I attempted to
> check if various conformance tests still pass.
> 
> NOTE: As mentioned in the previous patch, if one can easily obtain the largest
> buffer and attempt to flush it first, the results would be even more desirable.

So even with that optimization if you only have tons of small buffers
that need to be flushed you'd still take the clflush path for every
single one.

How difficult would it be to calculate the total size to be flushed first,
and then make the clflush vs. wbinvd decision based on that?

> 
> Cc: DRI Development <dri-devel@lists.freedesktop.org>
> Signed-off-by: Ben Widawsky <ben@bwidawsk.net>
> ---
>  drivers/gpu/drm/i915/i915_drv.h            |  3 ++-
>  drivers/gpu/drm/i915/i915_gem.c            | 12 +++++-------
>  drivers/gpu/drm/i915/i915_gem_execbuffer.c |  8 +++++---
>  drivers/gpu/drm/i915/intel_lrc.c           |  8 +++++---
>  4 files changed, 17 insertions(+), 14 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index d68c75f..fdb92a3 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -2642,7 +2642,8 @@ static inline bool i915_stop_ring_allow_warn(struct drm_i915_private *dev_priv)
>  }
>  
>  void i915_gem_reset(struct drm_device *dev);
> -bool i915_gem_clflush_object(struct drm_i915_gem_object *obj, bool force);
> +enum drm_cache_flush
> +i915_gem_clflush_object(struct drm_i915_gem_object *obj, bool force);
>  int __must_check i915_gem_object_finish_gpu(struct drm_i915_gem_object *obj);
>  int __must_check i915_gem_init(struct drm_device *dev);
>  int i915_gem_init_rings(struct drm_device *dev);
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index de241eb..3746738 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -3608,7 +3608,7 @@ err_unpin:
>  	return vma;
>  }
>  
> -bool
> +enum drm_cache_flush
>  i915_gem_clflush_object(struct drm_i915_gem_object *obj,
>  			bool force)
>  {
> @@ -3617,14 +3617,14 @@ i915_gem_clflush_object(struct drm_i915_gem_object *obj,
>  	 * again at bind time.
>  	 */
>  	if (obj->pages == NULL)
> -		return false;
> +		return DRM_CACHE_FLUSH_NONE;
>  
>  	/*
>  	 * Stolen memory is always coherent with the GPU as it is explicitly
>  	 * marked as wc by the system, or the system is cache-coherent.
>  	 */
>  	if (obj->stolen || obj->phys_handle)
> -		return false;
> +		return DRM_CACHE_FLUSH_NONE;
>  
>  	/* If the GPU is snooping the contents of the CPU cache,
>  	 * we do not need to manually clear the CPU cache lines.  However,
> @@ -3635,12 +3635,10 @@ i915_gem_clflush_object(struct drm_i915_gem_object *obj,
>  	 * tracking.
>  	 */
>  	if (!force && cpu_cache_is_coherent(obj->base.dev, obj->cache_level))
> -		return false;
> +		return DRM_CACHE_FLUSH_NONE;
>  
>  	trace_i915_gem_object_clflush(obj);
> -	drm_clflush_sg(obj->pages);
> -
> -	return true;
> +	return drm_clflush_sg(obj->pages);
>  }
>  
>  /** Flushes the GTT write domain for the object if it's dirty. */
> diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> index 0c25f62..e8eb9e9 100644
> --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> @@ -827,7 +827,7 @@ i915_gem_execbuffer_move_to_gpu(struct intel_engine_cs *ring,
>  {
>  	struct i915_vma *vma;
>  	uint32_t flush_domains = 0;
> -	bool flush_chipset = false;
> +	enum drm_cache_flush flush_chipset = DRM_CACHE_FLUSH_NONE;
>  	int ret;
>  
>  	list_for_each_entry(vma, vmas, exec_list) {
> @@ -836,8 +836,10 @@ i915_gem_execbuffer_move_to_gpu(struct intel_engine_cs *ring,
>  		if (ret)
>  			return ret;
>  
> -		if (obj->base.write_domain & I915_GEM_DOMAIN_CPU)
> -			flush_chipset |= i915_gem_clflush_object(obj, false);
> +		if (obj->base.write_domain & I915_GEM_DOMAIN_CPU &&
> +		    flush_chipset != DRM_CACHE_FLUSH_WBINVD) {
> +			flush_chipset = i915_gem_clflush_object(obj, false);
> +		}
>  
>  		flush_domains |= obj->base.write_domain;
>  	}
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index 89b5577..a6c6ebd 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -611,7 +611,7 @@ static int execlists_move_to_gpu(struct intel_ringbuffer *ringbuf,
>  	struct intel_engine_cs *ring = ringbuf->ring;
>  	struct i915_vma *vma;
>  	uint32_t flush_domains = 0;
> -	bool flush_chipset = false;
> +	enum drm_cache_flush flush_chipset = DRM_CACHE_FLUSH_NONE;
>  	int ret;
>  
>  	list_for_each_entry(vma, vmas, exec_list) {
> @@ -621,8 +621,10 @@ static int execlists_move_to_gpu(struct intel_ringbuffer *ringbuf,
>  		if (ret)
>  			return ret;
>  
> -		if (obj->base.write_domain & I915_GEM_DOMAIN_CPU)
> -			flush_chipset |= i915_gem_clflush_object(obj, false);
> +		if (obj->base.write_domain & I915_GEM_DOMAIN_CPU &&
> +		    flush_chipset != DRM_CACHE_FLUSH_WBINVD) {
> +			flush_chipset = i915_gem_clflush_object(obj, false);
> +		}
>  
>  		flush_domains |= obj->base.write_domain;
>  	}
> -- 
> 2.1.3
> 
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx
Ben Widawsky Dec. 14, 2014, 11:37 p.m. UTC | #2
On Sun, Dec 14, 2014 at 03:12:21PM +0200, Ville Syrjälä wrote:
> On Sat, Dec 13, 2014 at 07:08:24PM -0800, Ben Widawsky wrote:
> > If we're moving a bunch of buffers from the CPU domain to the GPU domain, and
> > we've already blown out the entire cache via a wbinvd, there is nothing more to
> > do.
> > 
> > With this and the previous patches, I am seeing a 3x FPS increase on a certain
> > benchmark which uses a giant 2d array texture. Unless I missed something in the
> > code, it should only effect non-LLC i915 platforms.
> > 
> > I haven't yet run any numbers for other benchmarks, nor have I attempted to
> > check if various conformance tests still pass.
> > 
> > NOTE: As mentioned in the previous patch, if one can easily obtain the largest
> > buffer and attempt to flush it first, the results would be even more desirable.
> 
> So even with that optimization if you only have tons of small buffers
> that need to be flushed you'd still take the clflush path for every
> single one.
> 
> How difficult would it to calculate the total size to be flushed first,
> and then make the clflush vs. wbinvd decision base on that?
> 

I'll write the patch and send it to Eero for test.

It's not hard, and I think that's a good idea as well. One reason I didn't put
such code in this series is that it moves away from a global DRM solution (and like
I said in the cover-letter, I am fine with that). Implementing this, I think in
the i915 code we'd just iterate through the BOs until we got to a certain
threshold, then just call wbinvd() from i915 and not even bother with drm_cache.
You could also maybe try to shortcut if there are more than X buffers.

However, for what you describe, I think it might make more sense to let
userspace specify an execbuf flag to do the wbinvd(). Userspace can trivially
determine such info, it prevents having to iterate through the buffers an extra
time in the kernel.

I wonder if the clflushing many small objects is showing up on profiles? So far,
this specific microbenchmark was the only profile I'd seen where the clflushes
show up.

Thanks.

[snip]
Daniel Vetter Dec. 15, 2014, 7:55 a.m. UTC | #3
On Sun, Dec 14, 2014 at 03:37:36PM -0800, Ben Widawsky wrote:
> On Sun, Dec 14, 2014 at 03:12:21PM +0200, Ville Syrjälä wrote:
> > On Sat, Dec 13, 2014 at 07:08:24PM -0800, Ben Widawsky wrote:
> > > If we're moving a bunch of buffers from the CPU domain to the GPU domain, and
> > > we've already blown out the entire cache via a wbinvd, there is nothing more to
> > > do.
> > > 
> > > With this and the previous patches, I am seeing a 3x FPS increase on a certain
> > > benchmark which uses a giant 2d array texture. Unless I missed something in the
> > > code, it should only effect non-LLC i915 platforms.
> > > 
> > > I haven't yet run any numbers for other benchmarks, nor have I attempted to
> > > check if various conformance tests still pass.
> > > 
> > > NOTE: As mentioned in the previous patch, if one can easily obtain the largest
> > > buffer and attempt to flush it first, the results would be even more desirable.
> > 
> > So even with that optimization if you only have tons of small buffers
> > that need to be flushed you'd still take the clflush path for every
> > single one.
> > 
> > How difficult would it to calculate the total size to be flushed first,
> > and then make the clflush vs. wbinvd decision base on that?
> > 
> 
> I'll write the patch and send it to Eero for test.
> 
> It's not hard, and I think that's a good idea as well. One reason I didn't put
> such code in this series is that moves away from a global DRM solution (and like
> I said in the cover-letter, I am fine with that). Implementing this, I think in
> the i915 code we'd just iterate through the BOs until we got to a certain
> threshold, then just call wbinvd() from i915 and not even both with drm_cache.
> You could also maybe try to shorcut if there are more than X buffers.

I don't mind an i915 specific solution (we have them already in many
places). So will wait for the results of this experiments before merging
more patches.

> However, for what you describe, I think it might make more sense to let
> userspace specify an execbuf flag to do the wbinvd(). Userspace can trivially
> determine such info, it prevents having to iterate through the buffers an extra
> time in the kernel.

If we keep the running tally of clflushing in the reservation step and
then use that in move_to_gpu we shouldn't need an additional loop. And
even if the estimate from the first loop is off a bit (e.g. due to
eviction to make space for new buffers) it won't matter - it's just an
optimization. Imo asking userspace to do this for the kernel doesn't make
much sense since the kernel already keeps track of domains.

Cheers, Daniel
Chris Wilson Dec. 15, 2014, 8:20 a.m. UTC | #4
On Mon, Dec 15, 2014 at 08:55:32AM +0100, Daniel Vetter wrote:
> On Sun, Dec 14, 2014 at 03:37:36PM -0800, Ben Widawsky wrote:
> > On Sun, Dec 14, 2014 at 03:12:21PM +0200, Ville Syrjälä wrote:
> > > On Sat, Dec 13, 2014 at 07:08:24PM -0800, Ben Widawsky wrote:
> > > > If we're moving a bunch of buffers from the CPU domain to the GPU domain, and
> > > > we've already blown out the entire cache via a wbinvd, there is nothing more to
> > > > do.
> > > > 
> > > > With this and the previous patches, I am seeing a 3x FPS increase on a certain
> > > > benchmark which uses a giant 2d array texture. Unless I missed something in the
> > > > code, it should only effect non-LLC i915 platforms.
> > > > 
> > > > I haven't yet run any numbers for other benchmarks, nor have I attempted to
> > > > check if various conformance tests still pass.
> > > > 
> > > > NOTE: As mentioned in the previous patch, if one can easily obtain the largest
> > > > buffer and attempt to flush it first, the results would be even more desirable.
> > > 
> > > So even with that optimization if you only have tons of small buffers
> > > that need to be flushed you'd still take the clflush path for every
> > > single one.
> > > 
> > > How difficult would it to calculate the total size to be flushed first,
> > > and then make the clflush vs. wbinvd decision base on that?
> > > 
> > 
> > I'll write the patch and send it to Eero for test.
> > 
> > It's not hard, and I think that's a good idea as well. One reason I didn't put
> > such code in this series is that moves away from a global DRM solution (and like
> > I said in the cover-letter, I am fine with that). Implementing this, I think in
> > the i915 code we'd just iterate through the BOs until we got to a certain
> > threshold, then just call wbinvd() from i915 and not even both with drm_cache.
> > You could also maybe try to shorcut if there are more than X buffers.
> 
> I don't mind an i915 specific solution (we have them already in many
> places). So will wait for the results of this experiments before merging
> more patches.

I actually think an i915 specific solution is required, as making
drm_clflush_pages autoselect is likely to cause regressions on unaware
drivers (e.g. anything that has a reservation loop in execbuf like i915).

I do think that execbuf regularly hitting clflush is something to be
concerned about in userspace (even a wbinvd every execbuf is not going
to be nice, even an mfence per execbuf shows up on profiles!), but we do
have to occasionally flush large objects like framebuffers which
probably would be faster using wbinvd.
-Chris
Ben Widawsky Dec. 15, 2014, 7:56 p.m. UTC | #5
On Mon, Dec 15, 2014 at 08:20:50AM +0000, Chris Wilson wrote:
> On Mon, Dec 15, 2014 at 08:55:32AM +0100, Daniel Vetter wrote:
> > On Sun, Dec 14, 2014 at 03:37:36PM -0800, Ben Widawsky wrote:
> > > On Sun, Dec 14, 2014 at 03:12:21PM +0200, Ville Syrjälä wrote:
> > > > On Sat, Dec 13, 2014 at 07:08:24PM -0800, Ben Widawsky wrote:
> > > > > If we're moving a bunch of buffers from the CPU domain to the GPU domain, and
> > > > > we've already blown out the entire cache via a wbinvd, there is nothing more to
> > > > > do.
> > > > > 
> > > > > With this and the previous patches, I am seeing a 3x FPS increase on a certain
> > > > > benchmark which uses a giant 2d array texture. Unless I missed something in the
> > > > > code, it should only effect non-LLC i915 platforms.
> > > > > 
> > > > > I haven't yet run any numbers for other benchmarks, nor have I attempted to
> > > > > check if various conformance tests still pass.
> > > > > 
> > > > > NOTE: As mentioned in the previous patch, if one can easily obtain the largest
> > > > > buffer and attempt to flush it first, the results would be even more desirable.
> > > > 
> > > > So even with that optimization if you only have tons of small buffers
> > > > that need to be flushed you'd still take the clflush path for every
> > > > single one.
> > > > 
> > > > How difficult would it to calculate the total size to be flushed first,
> > > > and then make the clflush vs. wbinvd decision base on that?
> > > > 
> > > 
> > > I'll write the patch and send it to Eero for test.
> > > 
> > > It's not hard, and I think that's a good idea as well. One reason I didn't put
> > > such code in this series is that moves away from a global DRM solution (and like
> > > I said in the cover-letter, I am fine with that). Implementing this, I think in
> > > the i915 code we'd just iterate through the BOs until we got to a certain
> > > threshold, then just call wbinvd() from i915 and not even both with drm_cache.
> > > You could also maybe try to shorcut if there are more than X buffers.
> > 
> > I don't mind an i915 specific solution (we have them already in many
> > places). So will wait for the results of this experiments before merging
> > more patches.
> 
> I actually think an i915 specific solution is required, as the making
> drm_clflush_pages autoselect is likely to cause regressions on unaware
> drivers (e.g. anything that has a reservation loop in execbuf like i915).

Assuming the stall is gone as Jesse said in the other thread, I can't envision a
scenario where wbinvd would do worse on large objects.

> 
> I do think that execbuf regularly hitting clflush is something to be
> concerned about in userspace (even a wbinvd every execbuf is not going
> to be nice, even an mfence per execbuf shows up on profiles!), but we do
> have to occassionally flush large objects like framebuffers which
> probably would be faster using wbinvd.
> -Chris
> 
> -- 
> Chris Wilson, Intel Open Source Technology Centre
Chris Wilson Dec. 15, 2014, 8:39 p.m. UTC | #6
On Mon, Dec 15, 2014 at 11:56:05AM -0800, Ben Widawsky wrote:
> On Mon, Dec 15, 2014 at 08:20:50AM +0000, Chris Wilson wrote:
> > On Mon, Dec 15, 2014 at 08:55:32AM +0100, Daniel Vetter wrote:
> > > On Sun, Dec 14, 2014 at 03:37:36PM -0800, Ben Widawsky wrote:
> > > > On Sun, Dec 14, 2014 at 03:12:21PM +0200, Ville Syrjälä wrote:
> > > > > On Sat, Dec 13, 2014 at 07:08:24PM -0800, Ben Widawsky wrote:
> > > > > > If we're moving a bunch of buffers from the CPU domain to the GPU domain, and
> > > > > > we've already blown out the entire cache via a wbinvd, there is nothing more to
> > > > > > do.
> > > > > > 
> > > > > > With this and the previous patches, I am seeing a 3x FPS increase on a certain
> > > > > > benchmark which uses a giant 2d array texture. Unless I missed something in the
> > > > > > code, it should only effect non-LLC i915 platforms.
> > > > > > 
> > > > > > I haven't yet run any numbers for other benchmarks, nor have I attempted to
> > > > > > check if various conformance tests still pass.
> > > > > > 
> > > > > > NOTE: As mentioned in the previous patch, if one can easily obtain the largest
> > > > > > buffer and attempt to flush it first, the results would be even more desirable.
> > > > > 
> > > > > So even with that optimization if you only have tons of small buffers
> > > > > that need to be flushed you'd still take the clflush path for every
> > > > > single one.
> > > > > 
> > > > > How difficult would it to calculate the total size to be flushed first,
> > > > > and then make the clflush vs. wbinvd decision base on that?
> > > > > 
> > > > 
> > > > I'll write the patch and send it to Eero for test.
> > > > 
> > > > It's not hard, and I think that's a good idea as well. One reason I didn't put
> > > > such code in this series is that moves away from a global DRM solution (and like
> > > > I said in the cover-letter, I am fine with that). Implementing this, I think in
> > > > the i915 code we'd just iterate through the BOs until we got to a certain
> > > > threshold, then just call wbinvd() from i915 and not even both with drm_cache.
> > > > You could also maybe try to shorcut if there are more than X buffers.
> > > 
> > > I don't mind an i915 specific solution (we have them already in many
> > > places). So will wait for the results of this experiments before merging
> > > more patches.
> > 
> > I actually think an i915 specific solution is required, as the making
> > drm_clflush_pages autoselect is likely to cause regressions on unaware
> > drivers (e.g. anything that has a reservation loop in execbuf like i915).
> 
> Assuming the stall is gone as Jesse said in the other thread, I can't envision a
> scenario where wbinvd would do worse on large objects.

It is the multiple wbinvd performed at each execbuffer that is worrisome.
-Chris
Ben Widawsky Dec. 15, 2014, 9:06 p.m. UTC | #7
On Mon, Dec 15, 2014 at 08:39:35PM +0000, Chris Wilson wrote:
> On Mon, Dec 15, 2014 at 11:56:05AM -0800, Ben Widawsky wrote:
> > On Mon, Dec 15, 2014 at 08:20:50AM +0000, Chris Wilson wrote:
> > > On Mon, Dec 15, 2014 at 08:55:32AM +0100, Daniel Vetter wrote:
> > > > On Sun, Dec 14, 2014 at 03:37:36PM -0800, Ben Widawsky wrote:
> > > > > On Sun, Dec 14, 2014 at 03:12:21PM +0200, Ville Syrjälä wrote:
> > > > > > On Sat, Dec 13, 2014 at 07:08:24PM -0800, Ben Widawsky wrote:
> > > > > > > If we're moving a bunch of buffers from the CPU domain to the GPU domain, and
> > > > > > > we've already blown out the entire cache via a wbinvd, there is nothing more to
> > > > > > > do.
> > > > > > > 
> > > > > > > With this and the previous patches, I am seeing a 3x FPS increase on a certain
> > > > > > > benchmark which uses a giant 2d array texture. Unless I missed something in the
> > > > > > > code, it should only effect non-LLC i915 platforms.
> > > > > > > 
> > > > > > > I haven't yet run any numbers for other benchmarks, nor have I attempted to
> > > > > > > check if various conformance tests still pass.
> > > > > > > 
> > > > > > > NOTE: As mentioned in the previous patch, if one can easily obtain the largest
> > > > > > > buffer and attempt to flush it first, the results would be even more desirable.
> > > > > > 
> > > > > > So even with that optimization if you only have tons of small buffers
> > > > > > that need to be flushed you'd still take the clflush path for every
> > > > > > single one.
> > > > > > 
> > > > > > How difficult would it to calculate the total size to be flushed first,
> > > > > > and then make the clflush vs. wbinvd decision base on that?
> > > > > > 
> > > > > 
> > > > > I'll write the patch and send it to Eero for test.
> > > > > 
> > > > > It's not hard, and I think that's a good idea as well. One reason I didn't put
> > > > > such code in this series is that moves away from a global DRM solution (and like
> > > > > I said in the cover-letter, I am fine with that). Implementing this, I think in
> > > > > the i915 code we'd just iterate through the BOs until we got to a certain
> > > > > threshold, then just call wbinvd() from i915 and not even both with drm_cache.
> > > > > You could also maybe try to shorcut if there are more than X buffers.
> > > > 
> > > > I don't mind an i915 specific solution (we have them already in many
> > > > places). So will wait for the results of this experiments before merging
> > > > more patches.
> > > 
> > > I actually think an i915 specific solution is required, as the making
> > > drm_clflush_pages autoselect is likely to cause regressions on unaware
> > > drivers (e.g. anything that has a reservation loop in execbuf like i915).
> > 
> > Assuming the stall is gone as Jesse said in the other thread, I can't envision a
> > scenario where wbinvd would do worse on large objects.
> 
> It is the multiple wbinvd performed at each execbuffer that is worrisome.
> -Chris
> 

This patch attempts to avoid that by dropping all flushing after the first
WBINVD.
Chris Wilson Dec. 16, 2014, 7:57 a.m. UTC | #8
On Mon, Dec 15, 2014 at 01:06:40PM -0800, Ben Widawsky wrote:
> On Mon, Dec 15, 2014 at 08:39:35PM +0000, Chris Wilson wrote:
> > On Mon, Dec 15, 2014 at 11:56:05AM -0800, Ben Widawsky wrote:
> > > On Mon, Dec 15, 2014 at 08:20:50AM +0000, Chris Wilson wrote:
> > > > On Mon, Dec 15, 2014 at 08:55:32AM +0100, Daniel Vetter wrote:
> > > > > On Sun, Dec 14, 2014 at 03:37:36PM -0800, Ben Widawsky wrote:
> > > > > > It's not hard, and I think that's a good idea as well. One reason I didn't put
> > > > > > such code in this series is that moves away from a global DRM solution (and like
> > > > > > I said in the cover-letter, I am fine with that). Implementing this, I think in
> > > > > > the i915 code we'd just iterate through the BOs until we got to a certain
> > > > > > threshold, then just call wbinvd() from i915 and not even both with drm_cache.
> > > > > > You could also maybe try to shorcut if there are more than X buffers.
> > > > > 
> > > > > I don't mind an i915 specific solution (we have them already in many
> > > > > places). So will wait for the results of this experiments before merging
> > > > > more patches.
> > > > 
> > > > I actually think an i915 specific solution is required, as the making
> > > > drm_clflush_pages autoselect is likely to cause regressions on unaware
> > > > drivers (e.g. anything that has a reservation loop in execbuf like i915).
> > > 
> > > Assuming the stall is gone as Jesse said in the other thread, I can't envision a
> > > scenario where wbinvd would do worse on large objects.
> > 
> > It is the multiple wbinvd performed at each execbuffer that is worrisome.
> 
> This patch attempts to avoid that by dropping all flushing after the first
> WBINVD.

But you can't make the central change to drm_clflush_* without driver
specific workarounds like this patch...
-Chris
Ben Widawsky Dec. 16, 2014, 7:38 p.m. UTC | #9
On Tue, Dec 16, 2014 at 07:57:39AM +0000, Chris Wilson wrote:
> On Mon, Dec 15, 2014 at 01:06:40PM -0800, Ben Widawsky wrote:
> > On Mon, Dec 15, 2014 at 08:39:35PM +0000, Chris Wilson wrote:
> > > On Mon, Dec 15, 2014 at 11:56:05AM -0800, Ben Widawsky wrote:
> > > > On Mon, Dec 15, 2014 at 08:20:50AM +0000, Chris Wilson wrote:
> > > > > On Mon, Dec 15, 2014 at 08:55:32AM +0100, Daniel Vetter wrote:
> > > > > > On Sun, Dec 14, 2014 at 03:37:36PM -0800, Ben Widawsky wrote:
> > > > > > > It's not hard, and I think that's a good idea as well. One reason I didn't put
> > > > > > > such code in this series is that moves away from a global DRM solution (and like
> > > > > > > I said in the cover-letter, I am fine with that). Implementing this, I think in
> > > > > > > the i915 code we'd just iterate through the BOs until we got to a certain
> > > > > > > threshold, then just call wbinvd() from i915 and not even both with drm_cache.
> > > > > > > You could also maybe try to shorcut if there are more than X buffers.
> > > > > > 
> > > > > > I don't mind an i915 specific solution (we have them already in many
> > > > > > places). So will wait for the results of this experiments before merging
> > > > > > more patches.
> > > > > 
> > > > > I actually think an i915 specific solution is required, as the making
> > > > > drm_clflush_pages autoselect is likely to cause regressions on unaware
> > > > > drivers (e.g. anything that has a reservation loop in execbuf like i915).
> > > > 
> > > > Assuming the stall is gone as Jesse said in the other thread, I can't envision a
> > > > scenario where wbinvd would do worse on large objects.
> > > 
> > > It is the multiple wbinvd performed at each execbuffer that is worrisome.
> > 
> > This patch attempts to avoid that by dropping all flushing after the first
> > WBINVD.
> 
> But you can't make the central change to drm_clflush_* without driver
> specific workarounds like this patch...

Good point. I'll do an i915 one when I find the time. I was still hoping to get
some numbers from Eero on this series.
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index d68c75f..fdb92a3 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2642,7 +2642,8 @@  static inline bool i915_stop_ring_allow_warn(struct drm_i915_private *dev_priv)
 }
 
 void i915_gem_reset(struct drm_device *dev);
-bool i915_gem_clflush_object(struct drm_i915_gem_object *obj, bool force);
+enum drm_cache_flush
+i915_gem_clflush_object(struct drm_i915_gem_object *obj, bool force);
 int __must_check i915_gem_object_finish_gpu(struct drm_i915_gem_object *obj);
 int __must_check i915_gem_init(struct drm_device *dev);
 int i915_gem_init_rings(struct drm_device *dev);
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index de241eb..3746738 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -3608,7 +3608,7 @@  err_unpin:
 	return vma;
 }
 
-bool
+enum drm_cache_flush
 i915_gem_clflush_object(struct drm_i915_gem_object *obj,
 			bool force)
 {
@@ -3617,14 +3617,14 @@  i915_gem_clflush_object(struct drm_i915_gem_object *obj,
 	 * again at bind time.
 	 */
 	if (obj->pages == NULL)
-		return false;
+		return DRM_CACHE_FLUSH_NONE;
 
 	/*
 	 * Stolen memory is always coherent with the GPU as it is explicitly
 	 * marked as wc by the system, or the system is cache-coherent.
 	 */
 	if (obj->stolen || obj->phys_handle)
-		return false;
+		return DRM_CACHE_FLUSH_NONE;
 
 	/* If the GPU is snooping the contents of the CPU cache,
 	 * we do not need to manually clear the CPU cache lines.  However,
@@ -3635,12 +3635,10 @@  i915_gem_clflush_object(struct drm_i915_gem_object *obj,
 	 * tracking.
 	 */
 	if (!force && cpu_cache_is_coherent(obj->base.dev, obj->cache_level))
-		return false;
+		return DRM_CACHE_FLUSH_NONE;
 
 	trace_i915_gem_object_clflush(obj);
-	drm_clflush_sg(obj->pages);
-
-	return true;
+	return drm_clflush_sg(obj->pages);
 }
 
 /** Flushes the GTT write domain for the object if it's dirty. */
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 0c25f62..e8eb9e9 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -827,7 +827,7 @@  i915_gem_execbuffer_move_to_gpu(struct intel_engine_cs *ring,
 {
 	struct i915_vma *vma;
 	uint32_t flush_domains = 0;
-	bool flush_chipset = false;
+	enum drm_cache_flush flush_chipset = DRM_CACHE_FLUSH_NONE;
 	int ret;
 
 	list_for_each_entry(vma, vmas, exec_list) {
@@ -836,8 +836,10 @@  i915_gem_execbuffer_move_to_gpu(struct intel_engine_cs *ring,
 		if (ret)
 			return ret;
 
-		if (obj->base.write_domain & I915_GEM_DOMAIN_CPU)
-			flush_chipset |= i915_gem_clflush_object(obj, false);
+		if (obj->base.write_domain & I915_GEM_DOMAIN_CPU &&
+		    flush_chipset != DRM_CACHE_FLUSH_WBINVD) {
+			flush_chipset = i915_gem_clflush_object(obj, false);
+		}
 
 		flush_domains |= obj->base.write_domain;
 	}
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 89b5577..a6c6ebd 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -611,7 +611,7 @@  static int execlists_move_to_gpu(struct intel_ringbuffer *ringbuf,
 	struct intel_engine_cs *ring = ringbuf->ring;
 	struct i915_vma *vma;
 	uint32_t flush_domains = 0;
-	bool flush_chipset = false;
+	enum drm_cache_flush flush_chipset = DRM_CACHE_FLUSH_NONE;
 	int ret;
 
 	list_for_each_entry(vma, vmas, exec_list) {
@@ -621,8 +621,10 @@  static int execlists_move_to_gpu(struct intel_ringbuffer *ringbuf,
 		if (ret)
 			return ret;
 
-		if (obj->base.write_domain & I915_GEM_DOMAIN_CPU)
-			flush_chipset |= i915_gem_clflush_object(obj, false);
+		if (obj->base.write_domain & I915_GEM_DOMAIN_CPU &&
+		    flush_chipset != DRM_CACHE_FLUSH_WBINVD) {
+			flush_chipset = i915_gem_clflush_object(obj, false);
+		}
 
 		flush_domains |= obj->base.write_domain;
 	}