diff mbox

[6/9] drm/i915: add frontbuffer tracking to FBC

Message ID 1423500395-1787-7-git-send-email-przanoni@gmail.com (mailing list archive)
State New, archived
Headers show

Commit Message

Paulo Zanoni Feb. 9, 2015, 4:46 p.m. UTC
From: Paulo Zanoni <paulo.r.zanoni@intel.com>

Kill the blt/render tracking we currently have and use the frontbuffer
tracking infrastructure.

Don't enable things by default yet.

v2: (Rodrigo) Fix small conflict on rebase and typo at subject.
v3: (Paulo) Rebase on RENDER_CS change.
v4: (Paulo) Rebase.

Reviewed-by: Rodrigo Vivi <rodrigo.vivi@intel.com> (v2)
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
Signed-off-by: Paulo Zanoni <paulo.r.zanoni@intel.com>
---
 drivers/gpu/drm/i915/i915_drv.h          |   9 +--
 drivers/gpu/drm/i915/intel_drv.h         |   6 +-
 drivers/gpu/drm/i915/intel_fbc.c         | 107 ++++++++++++++++++++-----------
 drivers/gpu/drm/i915/intel_frontbuffer.c |  14 +---
 drivers/gpu/drm/i915/intel_ringbuffer.c  |  41 +-----------
 drivers/gpu/drm/i915/intel_ringbuffer.h  |   1 -
 6 files changed, 80 insertions(+), 98 deletions(-)

Comments

Daniel Vetter Feb. 9, 2015, 7:05 p.m. UTC | #1
On Mon, Feb 09, 2015 at 02:46:32PM -0200, Paulo Zanoni wrote:
> From: Paulo Zanoni <paulo.r.zanoni@intel.com>
> 
> Kill the blt/render tracking we currently have and use the frontbuffer
> tracking infrastructure.
> 
> Don't enable things by default yet.
> 
> v2: (Rodrigo) Fix small conflict on rebase and typo at subject.
> v3: (Paulo) Rebase on RENDER_CS change.
> v4: (Paulo) Rebase.
> 
> Reviewed-by: Rodrigo Vivi <rodrigo.vivi@intel.com> (v2)
> Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
> Signed-off-by: Paulo Zanoni <paulo.r.zanoni@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_drv.h          |   9 +--
>  drivers/gpu/drm/i915/intel_drv.h         |   6 +-
>  drivers/gpu/drm/i915/intel_fbc.c         | 107 ++++++++++++++++++++-----------
>  drivers/gpu/drm/i915/intel_frontbuffer.c |  14 +---
>  drivers/gpu/drm/i915/intel_ringbuffer.c  |  41 +-----------
>  drivers/gpu/drm/i915/intel_ringbuffer.h  |   1 -
>  6 files changed, 80 insertions(+), 98 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index d578211..dc002df 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -782,6 +782,7 @@ struct i915_fbc {
>  	unsigned long uncompressed_size;
>  	unsigned threshold;
>  	unsigned int fb_id;
> +	unsigned int possible_framebuffer_bits;
>  	struct intel_crtc *crtc;
>  	int y;
>  
> @@ -794,14 +795,6 @@ struct i915_fbc {
>  	 * possible. */
>  	bool enabled;
>  
> -	/* On gen8 some rings cannont perform fbc clean operation so for now
> -	 * we are doing this on SW with mmio.
> -	 * This variable works in the opposite information direction
> -	 * of ring->fbc_dirty telling software on frontbuffer tracking
> -	 * to perform the cache clean on sw side.
> -	 */
> -	bool need_sw_cache_clean;
> -
>  	struct intel_fbc_work {
>  		struct delayed_work work;
>  		struct drm_crtc *crtc;
> diff --git a/drivers/gpu/drm/i915/intel_drv.h b/drivers/gpu/drm/i915/intel_drv.h
> index a73d091..ed85df8 100644
> --- a/drivers/gpu/drm/i915/intel_drv.h
> +++ b/drivers/gpu/drm/i915/intel_drv.h
> @@ -1101,7 +1101,11 @@ bool intel_fbc_enabled(struct drm_device *dev);
>  void intel_fbc_update(struct drm_device *dev);
>  void intel_fbc_init(struct drm_i915_private *dev_priv);
>  void intel_fbc_disable(struct drm_device *dev);
> -void bdw_fbc_sw_flush(struct drm_device *dev, u32 value);
> +void intel_fbc_invalidate(struct drm_i915_private *dev_priv,
> +			  unsigned int frontbuffer_bits,
> +			  enum fb_op_origin origin);
> +void intel_fbc_flush(struct drm_i915_private *dev_priv,
> +		     unsigned int frontbuffer_bits, enum fb_op_origin origin);
>  
>  /* intel_hdmi.c */
>  void intel_hdmi_init(struct drm_device *dev, int hdmi_reg, enum port port);
> diff --git a/drivers/gpu/drm/i915/intel_fbc.c b/drivers/gpu/drm/i915/intel_fbc.c
> index 7341e87..26c4f9c 100644
> --- a/drivers/gpu/drm/i915/intel_fbc.c
> +++ b/drivers/gpu/drm/i915/intel_fbc.c
> @@ -174,29 +174,10 @@ static bool g4x_fbc_enabled(struct drm_device *dev)
>  	return I915_READ(DPFC_CONTROL) & DPFC_CTL_EN;
>  }
>  
> -static void snb_fbc_blit_update(struct drm_device *dev)
> +static void intel_fbc_nuke(struct drm_i915_private *dev_priv)
>  {
> -	struct drm_i915_private *dev_priv = dev->dev_private;
> -	u32 blt_ecoskpd;
> -
> -	/* Make sure blitter notifies FBC of writes */
> -
> -	/* Blitter is part of Media powerwell on VLV. No impact of
> -	 * his param in other platforms for now */
> -	intel_uncore_forcewake_get(dev_priv, FORCEWAKE_MEDIA);
> -
> -	blt_ecoskpd = I915_READ(GEN6_BLITTER_ECOSKPD);
> -	blt_ecoskpd |= GEN6_BLITTER_FBC_NOTIFY <<
> -		GEN6_BLITTER_LOCK_SHIFT;
> -	I915_WRITE(GEN6_BLITTER_ECOSKPD, blt_ecoskpd);
> -	blt_ecoskpd |= GEN6_BLITTER_FBC_NOTIFY;
> -	I915_WRITE(GEN6_BLITTER_ECOSKPD, blt_ecoskpd);
> -	blt_ecoskpd &= ~(GEN6_BLITTER_FBC_NOTIFY <<
> -			 GEN6_BLITTER_LOCK_SHIFT);
> -	I915_WRITE(GEN6_BLITTER_ECOSKPD, blt_ecoskpd);
> -	POSTING_READ(GEN6_BLITTER_ECOSKPD);
> -
> -	intel_uncore_forcewake_put(dev_priv, FORCEWAKE_MEDIA);
> +	I915_WRITE(MSG_FBC_REND_STATE, FBC_REND_NUKE);
> +	POSTING_READ(MSG_FBC_REND_STATE);
>  }
>  
>  static void ilk_fbc_enable(struct drm_crtc *crtc)
> @@ -239,9 +220,10 @@ static void ilk_fbc_enable(struct drm_crtc *crtc)
>  		I915_WRITE(SNB_DPFC_CTL_SA,
>  			   SNB_CPU_FENCE_ENABLE | obj->fence_reg);
>  		I915_WRITE(DPFC_CPU_FENCE_OFFSET, crtc->y);
> -		snb_fbc_blit_update(dev);
>  	}
>  
> +	intel_fbc_nuke(dev_priv);
> +
>  	DRM_DEBUG_KMS("enabled fbc on plane %c\n", plane_name(intel_crtc->plane));
>  }
>  
> @@ -320,7 +302,7 @@ static void gen7_fbc_enable(struct drm_crtc *crtc)
>  		   SNB_CPU_FENCE_ENABLE | obj->fence_reg);
>  	I915_WRITE(DPFC_CPU_FENCE_OFFSET, crtc->y);
>  
> -	snb_fbc_blit_update(dev);
> +	intel_fbc_nuke(dev_priv);
>  
>  	DRM_DEBUG_KMS("enabled fbc on plane %c\n", plane_name(intel_crtc->plane));
>  }
> @@ -340,19 +322,6 @@ bool intel_fbc_enabled(struct drm_device *dev)
>  	return dev_priv->fbc.enabled;
>  }
>  
> -void bdw_fbc_sw_flush(struct drm_device *dev, u32 value)
> -{
> -	struct drm_i915_private *dev_priv = dev->dev_private;
> -
> -	if (!IS_GEN8(dev))
> -		return;
> -
> -	if (!intel_fbc_enabled(dev))
> -		return;
> -
> -	I915_WRITE(MSG_FBC_REND_STATE, value);
> -}
> -
>  static void intel_fbc_work_fn(struct work_struct *__work)
>  {
>  	struct intel_fbc_work *work =
> @@ -658,6 +627,63 @@ out_disable:
>  	i915_gem_stolen_cleanup_compression(dev);
>  }
>  
> +void intel_fbc_invalidate(struct drm_i915_private *dev_priv,
> +			  unsigned int frontbuffer_bits,
> +			  enum fb_op_origin origin)
> +{
> +	struct drm_device *dev = dev_priv->dev;
> +	unsigned int fbc_bits;
> +
> +	if (origin == ORIGIN_GTT)
> +		return;
> +
> +	if (dev_priv->fbc.enabled)
> +		fbc_bits = INTEL_FRONTBUFFER_PRIMARY(dev_priv->fbc.crtc->pipe);
> +	else if (dev_priv->fbc.fbc_work)
> +		fbc_bits = INTEL_FRONTBUFFER_PRIMARY(
> +			to_intel_crtc(dev_priv->fbc.fbc_work->crtc)->pipe);
> +	else
> +		fbc_bits = dev_priv->fbc.possible_framebuffer_bits;
> +
> +	if ((fbc_bits & frontbuffer_bits) == 0)
> +		return;

The fbc code should store all the currently invalidate fb_bits somewhere,
so that it knows when it can reenable fbc. Copypasting from the psr code:

dev_priv->fbc.busy_frontbuffer_bits |= fbc_bits & frontbuffer_bits;

Of course there should only ever be one bit (for the primary plane on the
crtc fbc is enabled on), so a boolean should be enough. But we could use
the additional bits for a bit of consistency checks.

> +
> +	intel_fbc_disable(dev);
> +}
> +
> +void intel_fbc_flush(struct drm_i915_private *dev_priv,
> +		     unsigned int frontbuffer_bits,
> +		     enum fb_op_origin origin)
> +{
> +	struct drm_device *dev = dev_priv->dev;
> +	unsigned int fbc_bits = 0;
> +	bool fbc_enabled;
> +
> +	if (dev_priv->fbc.enabled)
> +		fbc_bits = INTEL_FRONTBUFFER_PRIMARY(dev_priv->fbc.crtc->pipe);
> +	else
> +		fbc_bits = dev_priv->fbc.possible_framebuffer_bits;
> +
> +	/* These are not the planes you are looking for. */
> +	if ((frontbuffer_bits & fbc_bits) == 0)
> +		return;
> +
> +	fbc_enabled = intel_fbc_enabled(dev);
> +
> +	if ((dev_priv->fb_tracking.busy_bits & frontbuffer_bits &
> +	     fbc_bits) == 0) {
> +		if (fbc_enabled) {
> +			if (origin == ORIGIN_CS || origin == ORIGIN_CPU)
> +				intel_fbc_nuke(dev_priv);
> +		} else {
> +			intel_fbc_update(dev);
> +		}

Ok, here comes the gist of what needs to be changed.
- You need to check the fbc-private copy of busy bits, otherwise the
  filtering for the GTT origin won't work. We want to keep fbc enabled (or
  renable) even when gtt is still invalidated somehow.
- That means when you decide to reenable fbc (i.e.
  dev_priv->fbc.busy_frontbuffer_bits == 0) then it's guranteed that fbc
  is already disabled, and a plain intel_fbc_enable is good enough.
- nuke in flush is not enough and too late - the frontbuffer rendering
  could have been going on for a while since you've seen the invalidate,
  the flush only happens when everything is quiet again. A nuke at the end
  is hence too late and not enough.

This way you don't need the origin information in the flush helper.

One bit I'm not yet sure off is where to put intel_fbc_update to
reallocate buffers over pageflips. The complication since you've written
these patches is the atomic flips, which can now asynchronously update the
primary plane (and change the pixel format), but different from the
pageflip code the atomic paths don't disable/enable fbc over a flip.

Which means that doing the intel_fbc_update just somewhere in between the
disable and enable won't be enough to keep everything in sync.

I think what we ultimately need is a bit of logic in the atomic_check part
which decides whether fbc must be disabled/enabled (e.g. when we need to
realloc the cfb) around the update. And then we'll place explicit
intel_fbc_disable/enable calls in the atomic_begin/flush callbacks.

The other case is just updating the gtt fence data. At least on g4x+ we
should try not to disable fbc but keep it going over pageflips. Which
means there could be interesting races between the flip and gtt writes
right afterwards (which aren't synchronized in any way).

I see two options:
- We update the fence register whenever we want, but while a pageflip is
  pending we don't filter out gtt invalidates (since we kinda don't know
  whether the hw invalidate will hit before/after the flip happens). This
  one is a bit tricky, we'd need to add fbc callbacks to
  intel_frontbuffer_flip_prepare/complete.
- We do a manual nuke after each flip by adding calls to intel_fbc_flip to
  intel_frontbuffer_flip and flip_complete. This one has the downside that
  proper userspace which doesn't do gtt writes will get hurt by nuking the
  fbc twice: Once for the flip and once right afterwards.

Cheers, Daniel
> +	} else {
> +		if (fbc_enabled)
> +			intel_fbc_disable(dev);
> +	}
> +}
> +
>  /**
>   * intel_fbc_init - Initialize FBC
>   * @dev_priv: the i915 device
> @@ -666,12 +692,19 @@ out_disable:
>   */
>  void intel_fbc_init(struct drm_i915_private *dev_priv)
>  {
> +	enum pipe pipe;
> +
>  	if (!HAS_FBC(dev_priv)) {
>  		dev_priv->fbc.enabled = false;
>  		dev_priv->fbc.no_fbc_reason = FBC_UNSUPPORTED;
>  		return;
>  	}
>  
> +	/* TODO: some platforms have FBC tied to a specific plane! */
> +	for_each_pipe(dev_priv, pipe)
> +		dev_priv->fbc.possible_framebuffer_bits |=
> +				INTEL_FRONTBUFFER_PRIMARY(pipe);
> +
>  	if (INTEL_INFO(dev_priv)->gen >= 7) {
>  		dev_priv->display.fbc_enabled = ilk_fbc_enabled;
>  		dev_priv->display.enable_fbc = gen7_fbc_enable;
> diff --git a/drivers/gpu/drm/i915/intel_frontbuffer.c b/drivers/gpu/drm/i915/intel_frontbuffer.c
> index 0a773981..2fc059c 100644
> --- a/drivers/gpu/drm/i915/intel_frontbuffer.c
> +++ b/drivers/gpu/drm/i915/intel_frontbuffer.c
> @@ -118,8 +118,6 @@ static void intel_mark_fb_busy(struct drm_device *dev,
>  			continue;
>  
>  		intel_increase_pllclock(dev, pipe);
> -		if (ring && intel_fbc_enabled(dev))
> -			ring->fbc_dirty = true;
>  	}
>  }
>  
> @@ -160,6 +158,7 @@ void intel_fb_obj_invalidate(struct drm_i915_gem_object *obj,
>  
>  	intel_psr_invalidate(dev, obj->frontbuffer_bits);
>  	intel_edp_drrs_invalidate(dev, obj->frontbuffer_bits);
> +	intel_fbc_invalidate(dev_priv, obj->frontbuffer_bits, origin);
>  }
>  
>  /**
> @@ -189,16 +188,7 @@ void intel_frontbuffer_flush(struct drm_device *dev,
>  
>  	intel_edp_drrs_flush(dev, frontbuffer_bits);
>  	intel_psr_flush(dev, frontbuffer_bits);
> -
> -	/*
> -	 * FIXME: Unconditional fbc flushing here is a rather gross hack and
> -	 * needs to be reworked into a proper frontbuffer tracking scheme like
> -	 * psr employs.
> -	 */
> -	if (dev_priv->fbc.need_sw_cache_clean) {
> -		dev_priv->fbc.need_sw_cache_clean = false;
> -		bdw_fbc_sw_flush(dev, FBC_REND_CACHE_CLEAN);
> -	}
> +	intel_fbc_flush(dev_priv, frontbuffer_bits, origin);
>  }
>  
>  /**
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
> index 573b80f..48dd8a2 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
> @@ -317,29 +317,6 @@ gen7_render_ring_cs_stall_wa(struct intel_engine_cs *ring)
>  	return 0;
>  }
>  
> -static int gen7_ring_fbc_flush(struct intel_engine_cs *ring, u32 value)
> -{
> -	int ret;
> -
> -	if (!ring->fbc_dirty)
> -		return 0;
> -
> -	ret = intel_ring_begin(ring, 6);
> -	if (ret)
> -		return ret;
> -	/* WaFbcNukeOn3DBlt:ivb/hsw */
> -	intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(1));
> -	intel_ring_emit(ring, MSG_FBC_REND_STATE);
> -	intel_ring_emit(ring, value);
> -	intel_ring_emit(ring, MI_STORE_REGISTER_MEM(1) | MI_SRM_LRM_GLOBAL_GTT);
> -	intel_ring_emit(ring, MSG_FBC_REND_STATE);
> -	intel_ring_emit(ring, ring->scratch.gtt_offset + 256);
> -	intel_ring_advance(ring);
> -
> -	ring->fbc_dirty = false;
> -	return 0;
> -}
> -
>  static int
>  gen7_render_ring_flush(struct intel_engine_cs *ring,
>  		       u32 invalidate_domains, u32 flush_domains)
> @@ -398,9 +375,6 @@ gen7_render_ring_flush(struct intel_engine_cs *ring,
>  	intel_ring_emit(ring, 0);
>  	intel_ring_advance(ring);
>  
> -	if (!invalidate_domains && flush_domains)
> -		return gen7_ring_fbc_flush(ring, FBC_REND_NUKE);
> -
>  	return 0;
>  }
>  
> @@ -462,9 +436,6 @@ gen8_render_ring_flush(struct intel_engine_cs *ring,
>  	if (ret)
>  		return ret;
>  
> -	if (!invalidate_domains && flush_domains)
> -		return gen7_ring_fbc_flush(ring, FBC_REND_NUKE);
> -
>  	return 0;
>  }
>  
> @@ -2382,7 +2353,6 @@ static int gen6_ring_flush(struct intel_engine_cs *ring,
>  			   u32 invalidate, u32 flush)
>  {
>  	struct drm_device *dev = ring->dev;
> -	struct drm_i915_private *dev_priv = dev->dev_private;
>  	uint32_t cmd;
>  	int ret;
>  
> @@ -2391,7 +2361,7 @@ static int gen6_ring_flush(struct intel_engine_cs *ring,
>  		return ret;
>  
>  	cmd = MI_FLUSH_DW;
> -	if (INTEL_INFO(ring->dev)->gen >= 8)
> +	if (INTEL_INFO(dev)->gen >= 8)
>  		cmd += 1;
>  	/*
>  	 * Bspec vol 1c.3 - blitter engine command streamer:
> @@ -2404,7 +2374,7 @@ static int gen6_ring_flush(struct intel_engine_cs *ring,
>  			MI_FLUSH_DW_OP_STOREDW;
>  	intel_ring_emit(ring, cmd);
>  	intel_ring_emit(ring, I915_GEM_HWS_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT);
> -	if (INTEL_INFO(ring->dev)->gen >= 8) {
> +	if (INTEL_INFO(dev)->gen >= 8) {
>  		intel_ring_emit(ring, 0); /* upper addr */
>  		intel_ring_emit(ring, 0); /* value */
>  	} else  {
> @@ -2413,13 +2383,6 @@ static int gen6_ring_flush(struct intel_engine_cs *ring,
>  	}
>  	intel_ring_advance(ring);
>  
> -	if (!invalidate && flush) {
> -		if (IS_GEN7(dev))
> -			return gen7_ring_fbc_flush(ring, FBC_REND_CACHE_CLEAN);
> -		else if (IS_BROADWELL(dev))
> -			dev_priv->fbc.need_sw_cache_clean = true;
> -	}
> -
>  	return 0;
>  }
>  
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index 714f3fd..f8dc0c6 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -267,7 +267,6 @@ struct  intel_engine_cs {
>  	 */
>  	struct drm_i915_gem_request *outstanding_lazy_request;
>  	bool gpu_caches_dirty;
> -	bool fbc_dirty;
>  
>  	wait_queue_head_t irq_queue;
>  
> -- 
> 2.1.4
> 
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx
Paulo Zanoni Feb. 10, 2015, 12:19 p.m. UTC | #2
2015-02-09 17:05 GMT-02:00 Daniel Vetter <daniel@ffwll.ch>:
> On Mon, Feb 09, 2015 at 02:46:32PM -0200, Paulo Zanoni wrote:
>> From: Paulo Zanoni <paulo.r.zanoni@intel.com>
>>
>> Kill the blt/render tracking we currently have and use the frontbuffer
>> tracking infrastructure.
>>
>> Don't enable things by default yet.
>>
>> v2: (Rodrigo) Fix small conflict on rebase and typo at subject.
>> v3: (Paulo) Rebase on RENDER_CS change.
>> v4: (Paulo) Rebase.
>>
>> Reviewed-by: Rodrigo Vivi <rodrigo.vivi@intel.com> (v2)
>> Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
>> Signed-off-by: Paulo Zanoni <paulo.r.zanoni@intel.com>
>> ---
>>  drivers/gpu/drm/i915/i915_drv.h          |   9 +--
>>  drivers/gpu/drm/i915/intel_drv.h         |   6 +-
>>  drivers/gpu/drm/i915/intel_fbc.c         | 107 ++++++++++++++++++++-----------
>>  drivers/gpu/drm/i915/intel_frontbuffer.c |  14 +---
>>  drivers/gpu/drm/i915/intel_ringbuffer.c  |  41 +-----------
>>  drivers/gpu/drm/i915/intel_ringbuffer.h  |   1 -
>>  6 files changed, 80 insertions(+), 98 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
>> index d578211..dc002df 100644
>> --- a/drivers/gpu/drm/i915/i915_drv.h
>> +++ b/drivers/gpu/drm/i915/i915_drv.h
>> @@ -782,6 +782,7 @@ struct i915_fbc {
>>       unsigned long uncompressed_size;
>>       unsigned threshold;
>>       unsigned int fb_id;
>> +     unsigned int possible_framebuffer_bits;
>>       struct intel_crtc *crtc;
>>       int y;
>>
>> @@ -794,14 +795,6 @@ struct i915_fbc {
>>        * possible. */
>>       bool enabled;
>>
>> -     /* On gen8 some rings cannont perform fbc clean operation so for now
>> -      * we are doing this on SW with mmio.
>> -      * This variable works in the opposite information direction
>> -      * of ring->fbc_dirty telling software on frontbuffer tracking
>> -      * to perform the cache clean on sw side.
>> -      */
>> -     bool need_sw_cache_clean;
>> -
>>       struct intel_fbc_work {
>>               struct delayed_work work;
>>               struct drm_crtc *crtc;
>> diff --git a/drivers/gpu/drm/i915/intel_drv.h b/drivers/gpu/drm/i915/intel_drv.h
>> index a73d091..ed85df8 100644
>> --- a/drivers/gpu/drm/i915/intel_drv.h
>> +++ b/drivers/gpu/drm/i915/intel_drv.h
>> @@ -1101,7 +1101,11 @@ bool intel_fbc_enabled(struct drm_device *dev);
>>  void intel_fbc_update(struct drm_device *dev);
>>  void intel_fbc_init(struct drm_i915_private *dev_priv);
>>  void intel_fbc_disable(struct drm_device *dev);
>> -void bdw_fbc_sw_flush(struct drm_device *dev, u32 value);
>> +void intel_fbc_invalidate(struct drm_i915_private *dev_priv,
>> +                       unsigned int frontbuffer_bits,
>> +                       enum fb_op_origin origin);
>> +void intel_fbc_flush(struct drm_i915_private *dev_priv,
>> +                  unsigned int frontbuffer_bits, enum fb_op_origin origin);
>>
>>  /* intel_hdmi.c */
>>  void intel_hdmi_init(struct drm_device *dev, int hdmi_reg, enum port port);
>> diff --git a/drivers/gpu/drm/i915/intel_fbc.c b/drivers/gpu/drm/i915/intel_fbc.c
>> index 7341e87..26c4f9c 100644
>> --- a/drivers/gpu/drm/i915/intel_fbc.c
>> +++ b/drivers/gpu/drm/i915/intel_fbc.c
>> @@ -174,29 +174,10 @@ static bool g4x_fbc_enabled(struct drm_device *dev)
>>       return I915_READ(DPFC_CONTROL) & DPFC_CTL_EN;
>>  }
>>
>> -static void snb_fbc_blit_update(struct drm_device *dev)
>> +static void intel_fbc_nuke(struct drm_i915_private *dev_priv)
>>  {
>> -     struct drm_i915_private *dev_priv = dev->dev_private;
>> -     u32 blt_ecoskpd;
>> -
>> -     /* Make sure blitter notifies FBC of writes */
>> -
>> -     /* Blitter is part of Media powerwell on VLV. No impact of
>> -      * his param in other platforms for now */
>> -     intel_uncore_forcewake_get(dev_priv, FORCEWAKE_MEDIA);
>> -
>> -     blt_ecoskpd = I915_READ(GEN6_BLITTER_ECOSKPD);
>> -     blt_ecoskpd |= GEN6_BLITTER_FBC_NOTIFY <<
>> -             GEN6_BLITTER_LOCK_SHIFT;
>> -     I915_WRITE(GEN6_BLITTER_ECOSKPD, blt_ecoskpd);
>> -     blt_ecoskpd |= GEN6_BLITTER_FBC_NOTIFY;
>> -     I915_WRITE(GEN6_BLITTER_ECOSKPD, blt_ecoskpd);
>> -     blt_ecoskpd &= ~(GEN6_BLITTER_FBC_NOTIFY <<
>> -                      GEN6_BLITTER_LOCK_SHIFT);
>> -     I915_WRITE(GEN6_BLITTER_ECOSKPD, blt_ecoskpd);
>> -     POSTING_READ(GEN6_BLITTER_ECOSKPD);
>> -
>> -     intel_uncore_forcewake_put(dev_priv, FORCEWAKE_MEDIA);
>> +     I915_WRITE(MSG_FBC_REND_STATE, FBC_REND_NUKE);
>> +     POSTING_READ(MSG_FBC_REND_STATE);
>>  }
>>
>>  static void ilk_fbc_enable(struct drm_crtc *crtc)
>> @@ -239,9 +220,10 @@ static void ilk_fbc_enable(struct drm_crtc *crtc)
>>               I915_WRITE(SNB_DPFC_CTL_SA,
>>                          SNB_CPU_FENCE_ENABLE | obj->fence_reg);
>>               I915_WRITE(DPFC_CPU_FENCE_OFFSET, crtc->y);
>> -             snb_fbc_blit_update(dev);
>>       }
>>
>> +     intel_fbc_nuke(dev_priv);
>> +
>>       DRM_DEBUG_KMS("enabled fbc on plane %c\n", plane_name(intel_crtc->plane));
>>  }
>>
>> @@ -320,7 +302,7 @@ static void gen7_fbc_enable(struct drm_crtc *crtc)
>>                  SNB_CPU_FENCE_ENABLE | obj->fence_reg);
>>       I915_WRITE(DPFC_CPU_FENCE_OFFSET, crtc->y);
>>
>> -     snb_fbc_blit_update(dev);
>> +     intel_fbc_nuke(dev_priv);
>>
>>       DRM_DEBUG_KMS("enabled fbc on plane %c\n", plane_name(intel_crtc->plane));
>>  }
>> @@ -340,19 +322,6 @@ bool intel_fbc_enabled(struct drm_device *dev)
>>       return dev_priv->fbc.enabled;
>>  }
>>
>> -void bdw_fbc_sw_flush(struct drm_device *dev, u32 value)
>> -{
>> -     struct drm_i915_private *dev_priv = dev->dev_private;
>> -
>> -     if (!IS_GEN8(dev))
>> -             return;
>> -
>> -     if (!intel_fbc_enabled(dev))
>> -             return;
>> -
>> -     I915_WRITE(MSG_FBC_REND_STATE, value);
>> -}
>> -
>>  static void intel_fbc_work_fn(struct work_struct *__work)
>>  {
>>       struct intel_fbc_work *work =
>> @@ -658,6 +627,63 @@ out_disable:
>>       i915_gem_stolen_cleanup_compression(dev);
>>  }
>>
>> +void intel_fbc_invalidate(struct drm_i915_private *dev_priv,
>> +                       unsigned int frontbuffer_bits,
>> +                       enum fb_op_origin origin)
>> +{
>> +     struct drm_device *dev = dev_priv->dev;
>> +     unsigned int fbc_bits;
>> +
>> +     if (origin == ORIGIN_GTT)
>> +             return;
>> +
>> +     if (dev_priv->fbc.enabled)
>> +             fbc_bits = INTEL_FRONTBUFFER_PRIMARY(dev_priv->fbc.crtc->pipe);
>> +     else if (dev_priv->fbc.fbc_work)
>> +             fbc_bits = INTEL_FRONTBUFFER_PRIMARY(
>> +                     to_intel_crtc(dev_priv->fbc.fbc_work->crtc)->pipe);
>> +     else
>> +             fbc_bits = dev_priv->fbc.possible_framebuffer_bits;
>> +
>> +     if ((fbc_bits & frontbuffer_bits) == 0)
>> +             return;
>
> The fbc code should store all the currently invalidate fb_bits somewhere,
> so that it knows when it can reenable fbc. Copypasting from the psr code:
>
> dev_priv->fbc.busy_frontbuffer_bits |= fbc_bits & frontbuffer_bits;
>
> Of course there should only ever be one bit (for the primary plane on the
> crtc fbc is enabled on), so a boolean should be enough. But we could use
> the additional bits for a bit of consistency checks.
>
>> +
>> +     intel_fbc_disable(dev);
>> +}
>> +
>> +void intel_fbc_flush(struct drm_i915_private *dev_priv,
>> +                  unsigned int frontbuffer_bits,
>> +                  enum fb_op_origin origin)
>> +{
>> +     struct drm_device *dev = dev_priv->dev;
>> +     unsigned int fbc_bits = 0;
>> +     bool fbc_enabled;
>> +
>> +     if (dev_priv->fbc.enabled)
>> +             fbc_bits = INTEL_FRONTBUFFER_PRIMARY(dev_priv->fbc.crtc->pipe);
>> +     else
>> +             fbc_bits = dev_priv->fbc.possible_framebuffer_bits;
>> +
>> +     /* These are not the planes you are looking for. */
>> +     if ((frontbuffer_bits & fbc_bits) == 0)
>> +             return;
>> +
>> +     fbc_enabled = intel_fbc_enabled(dev);
>> +
>> +     if ((dev_priv->fb_tracking.busy_bits & frontbuffer_bits &
>> +          fbc_bits) == 0) {
>> +             if (fbc_enabled) {
>> +                     if (origin == ORIGIN_CS || origin == ORIGIN_CPU)
>> +                             intel_fbc_nuke(dev_priv);
>> +             } else {
>> +                     intel_fbc_update(dev);
>> +             }
>
> Ok, here comes the gist of what needs to be changed.
> - You need to check the fbc-private copy of busy bits, otherwise the
>   filtering for the GTT origin won't work. We want to keep fbc enabled (or
>   renable) even when gtt is still invalidated somehow.
> - That means when you decide to reenable fbc (i.e.
>   dev_priv->fbc.busy_frontbuffer_bits == 0) then it's guranteed that fbc
>   is already disabled, and a plain intel_fbc_enable is good enough.
> - nuke in flush is not enough and too late - the frontbuffer rendering
>   could have been going on for a while since you've seen the invalidate,
>   the flush only happens when everything is quiet again. A nuke at the end
>   is hence too late and not enough.
>
> This way you don't need the origin information in the flush helper.
>
> One bit I'm not yet sure off is where to put intel_fbc_update to
> reallocate buffers over pageflips. The complication since you've written
> these patches is the atomic flips, which can now asynchronously update the
> primary plane (and change the pixel format), but different from the
> pageflip code the atomic paths don't disable/enable fbc over a flip.
>
> Which means that doing the intel_fbc_update just somewhere in between the
> disable and enable won't be enough to keep everything in sync.
>
> I think what we ultimately need is a bit of logic in the atomic_check part
> which decides whether fbc must be disabled/enabled (e.g. when we need to
> realloc the cfb) around the update. And then we'll place explicit
> intel_fbc_disable/enable calls in the atomic_begin/flush callbacks.
>
> The other case is just updating the gtt fence data. At least on g4x+ we
> should try not to disable fbc but keep it going over pageflips. Which
> means there could be interesting races between the flip and gtt writes
> right afterwards (which aren't synchronized in any way).
>
> I see two options:
> - We update the fence register whenever we want, but while a pageflip is
>   pending we don't filter out gtt invalidates (since we kinda don't know
>   whether the hw invalidate will hit before/after the flip happens). This
>   one is a bit tricky, we'd need to add fbc callbacks to
>   intel_frontbuffer_flip_prepare/complete.
> - We do a manual nuke after each flip by adding calls to intel_fbc_flip to
>   intel_frontbuffer_flip and flip_complete. This one has the downside that
>   proper userspace which doesn't do gtt writes will get hurt by nuking the
>   fbc twice: Once for the flip and once right afterwards.

Thanks for the review!

If you can think of any way to write a test case to reproduce the
problems you pointed above, please give me ideas! I'll try to do this
while rewriting the patches, but maybe you already have something in
your mind? I guess we need to try to find a way to get an invalidate
but only get the corresponding flush a long time (> 1 frame) after
that? Maybe keep the frame busy with tons of operations that don't
change the CRC?


>
> Cheers, Daniel
>> +     } else {
>> +             if (fbc_enabled)
>> +                     intel_fbc_disable(dev);
>> +     }
>> +}
>> +
>>  /**
>>   * intel_fbc_init - Initialize FBC
>>   * @dev_priv: the i915 device
>> @@ -666,12 +692,19 @@ out_disable:
>>   */
>>  void intel_fbc_init(struct drm_i915_private *dev_priv)
>>  {
>> +     enum pipe pipe;
>> +
>>       if (!HAS_FBC(dev_priv)) {
>>               dev_priv->fbc.enabled = false;
>>               dev_priv->fbc.no_fbc_reason = FBC_UNSUPPORTED;
>>               return;
>>       }
>>
>> +     /* TODO: some platforms have FBC tied to a specific plane! */
>> +     for_each_pipe(dev_priv, pipe)
>> +             dev_priv->fbc.possible_framebuffer_bits |=
>> +                             INTEL_FRONTBUFFER_PRIMARY(pipe);
>> +
>>       if (INTEL_INFO(dev_priv)->gen >= 7) {
>>               dev_priv->display.fbc_enabled = ilk_fbc_enabled;
>>               dev_priv->display.enable_fbc = gen7_fbc_enable;
>> diff --git a/drivers/gpu/drm/i915/intel_frontbuffer.c b/drivers/gpu/drm/i915/intel_frontbuffer.c
>> index 0a773981..2fc059c 100644
>> --- a/drivers/gpu/drm/i915/intel_frontbuffer.c
>> +++ b/drivers/gpu/drm/i915/intel_frontbuffer.c
>> @@ -118,8 +118,6 @@ static void intel_mark_fb_busy(struct drm_device *dev,
>>                       continue;
>>
>>               intel_increase_pllclock(dev, pipe);
>> -             if (ring && intel_fbc_enabled(dev))
>> -                     ring->fbc_dirty = true;
>>       }
>>  }
>>
>> @@ -160,6 +158,7 @@ void intel_fb_obj_invalidate(struct drm_i915_gem_object *obj,
>>
>>       intel_psr_invalidate(dev, obj->frontbuffer_bits);
>>       intel_edp_drrs_invalidate(dev, obj->frontbuffer_bits);
>> +     intel_fbc_invalidate(dev_priv, obj->frontbuffer_bits, origin);
>>  }
>>
>>  /**
>> @@ -189,16 +188,7 @@ void intel_frontbuffer_flush(struct drm_device *dev,
>>
>>       intel_edp_drrs_flush(dev, frontbuffer_bits);
>>       intel_psr_flush(dev, frontbuffer_bits);
>> -
>> -     /*
>> -      * FIXME: Unconditional fbc flushing here is a rather gross hack and
>> -      * needs to be reworked into a proper frontbuffer tracking scheme like
>> -      * psr employs.
>> -      */
>> -     if (dev_priv->fbc.need_sw_cache_clean) {
>> -             dev_priv->fbc.need_sw_cache_clean = false;
>> -             bdw_fbc_sw_flush(dev, FBC_REND_CACHE_CLEAN);
>> -     }
>> +     intel_fbc_flush(dev_priv, frontbuffer_bits, origin);
>>  }
>>
>>  /**
>> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
>> index 573b80f..48dd8a2 100644
>> --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
>> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
>> @@ -317,29 +317,6 @@ gen7_render_ring_cs_stall_wa(struct intel_engine_cs *ring)
>>       return 0;
>>  }
>>
>> -static int gen7_ring_fbc_flush(struct intel_engine_cs *ring, u32 value)
>> -{
>> -     int ret;
>> -
>> -     if (!ring->fbc_dirty)
>> -             return 0;
>> -
>> -     ret = intel_ring_begin(ring, 6);
>> -     if (ret)
>> -             return ret;
>> -     /* WaFbcNukeOn3DBlt:ivb/hsw */
>> -     intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(1));
>> -     intel_ring_emit(ring, MSG_FBC_REND_STATE);
>> -     intel_ring_emit(ring, value);
>> -     intel_ring_emit(ring, MI_STORE_REGISTER_MEM(1) | MI_SRM_LRM_GLOBAL_GTT);
>> -     intel_ring_emit(ring, MSG_FBC_REND_STATE);
>> -     intel_ring_emit(ring, ring->scratch.gtt_offset + 256);
>> -     intel_ring_advance(ring);
>> -
>> -     ring->fbc_dirty = false;
>> -     return 0;
>> -}
>> -
>>  static int
>>  gen7_render_ring_flush(struct intel_engine_cs *ring,
>>                      u32 invalidate_domains, u32 flush_domains)
>> @@ -398,9 +375,6 @@ gen7_render_ring_flush(struct intel_engine_cs *ring,
>>       intel_ring_emit(ring, 0);
>>       intel_ring_advance(ring);
>>
>> -     if (!invalidate_domains && flush_domains)
>> -             return gen7_ring_fbc_flush(ring, FBC_REND_NUKE);
>> -
>>       return 0;
>>  }
>>
>> @@ -462,9 +436,6 @@ gen8_render_ring_flush(struct intel_engine_cs *ring,
>>       if (ret)
>>               return ret;
>>
>> -     if (!invalidate_domains && flush_domains)
>> -             return gen7_ring_fbc_flush(ring, FBC_REND_NUKE);
>> -
>>       return 0;
>>  }
>>
>> @@ -2382,7 +2353,6 @@ static int gen6_ring_flush(struct intel_engine_cs *ring,
>>                          u32 invalidate, u32 flush)
>>  {
>>       struct drm_device *dev = ring->dev;
>> -     struct drm_i915_private *dev_priv = dev->dev_private;
>>       uint32_t cmd;
>>       int ret;
>>
>> @@ -2391,7 +2361,7 @@ static int gen6_ring_flush(struct intel_engine_cs *ring,
>>               return ret;
>>
>>       cmd = MI_FLUSH_DW;
>> -     if (INTEL_INFO(ring->dev)->gen >= 8)
>> +     if (INTEL_INFO(dev)->gen >= 8)
>>               cmd += 1;
>>       /*
>>        * Bspec vol 1c.3 - blitter engine command streamer:
>> @@ -2404,7 +2374,7 @@ static int gen6_ring_flush(struct intel_engine_cs *ring,
>>                       MI_FLUSH_DW_OP_STOREDW;
>>       intel_ring_emit(ring, cmd);
>>       intel_ring_emit(ring, I915_GEM_HWS_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT);
>> -     if (INTEL_INFO(ring->dev)->gen >= 8) {
>> +     if (INTEL_INFO(dev)->gen >= 8) {
>>               intel_ring_emit(ring, 0); /* upper addr */
>>               intel_ring_emit(ring, 0); /* value */
>>       } else  {
>> @@ -2413,13 +2383,6 @@ static int gen6_ring_flush(struct intel_engine_cs *ring,
>>       }
>>       intel_ring_advance(ring);
>>
>> -     if (!invalidate && flush) {
>> -             if (IS_GEN7(dev))
>> -                     return gen7_ring_fbc_flush(ring, FBC_REND_CACHE_CLEAN);
>> -             else if (IS_BROADWELL(dev))
>> -                     dev_priv->fbc.need_sw_cache_clean = true;
>> -     }
>> -
>>       return 0;
>>  }
>>
>> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
>> index 714f3fd..f8dc0c6 100644
>> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
>> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
>> @@ -267,7 +267,6 @@ struct  intel_engine_cs {
>>        */
>>       struct drm_i915_gem_request *outstanding_lazy_request;
>>       bool gpu_caches_dirty;
>> -     bool fbc_dirty;
>>
>>       wait_queue_head_t irq_queue;
>>
>> --
>> 2.1.4
>>
>> _______________________________________________
>> Intel-gfx mailing list
>> Intel-gfx@lists.freedesktop.org
>> http://lists.freedesktop.org/mailman/listinfo/intel-gfx
>
> --
> Daniel Vetter
> Software Engineer, Intel Corporation
> +41 (0) 79 365 57 48 - http://blog.ffwll.ch
Daniel Vetter Feb. 11, 2015, 8:25 a.m. UTC | #3
On Tue, Feb 10, 2015 at 10:19:10AM -0200, Paulo Zanoni wrote:
> 2015-02-09 17:05 GMT-02:00 Daniel Vetter <daniel@ffwll.ch>:
> > Ok, here comes the gist of what needs to be changed.
> > - You need to check the fbc-private copy of busy bits, otherwise the
> >   filtering for the GTT origin won't work. We want to keep fbc enabled (or
> >   renable) even when gtt is still invalidated somehow.
> > - That means when you decide to reenable fbc (i.e.
> >   dev_priv->fbc.busy_frontbuffer_bits == 0) then it's guranteed that fbc
> >   is already disabled, and a plain intel_fbc_enable is good enough.
> > - nuke in flush is not enough and too late - the frontbuffer rendering
> >   could have been going on for a while since you've seen the invalidate,
> >   the flush only happens when everything is quiet again. A nuke at the end
> >   is hence too late and not enough.
> >
> > This way you don't need the origin information in the flush helper.
> >
> > One bit I'm not yet sure off is where to put intel_fbc_update to
> > reallocate buffers over pageflips. The complication since you've written
> > these patches is the atomic flips, which can now asynchronously update the
> > primary plane (and change the pixel format), but different from the
> > pageflip code the atomic paths don't disable/enable fbc over a flip.
> >
> > Which means that doing the intel_fbc_update just somewhere in between the
> > disable and enable won't be enough to keep everything in sync.
> >
> > I think what we ultimately need is a bit of logic in the atomic_check part
> > which decides whether fbc must be disabled/enabled (e.g. when we need to
> > realloc the cfb) around the update. And then we'll place explicit
> > intel_fbc_disable/enable calls in the atomic_begin/flush callbacks.
> >
> > The other case is just updating the gtt fence data. At least on g4x+ we
> > should try not to disable fbc but keep it going over pageflips. Which
> > means there could be interesting races between the flip and gtt writes
> > right afterwards (which aren't synchronized in any way).
> >
> > I see two options:
> > - We update the fence register whenever we want, but while a pageflip is
> >   pending we don't filter out gtt invalidates (since we kinda don't know
> >   whether the hw invalidate will hit before/after the flip happens). This
> >   one is a bit tricky, we'd need to add fbc callbacks to
> >   intel_frontbuffer_flip_prepare/complete.
> > - We do a manual nuke after each flip by adding calls to intel_fbc_flip to
> >   intel_frontbuffer_flip and flip_complete. This one has the downside that
> >   proper userspace which doesn't do gtt writes will get hurt by nuking the
> >   fbc twice: Once for the flip and once right afterwards.
> 
> Thanks for the review!
> 
> If you can think of any way to write a test case to reproduce the
> problems you pointed above, please give me ideas! I'll try to do this
> while rewriting the patches, but maybe you already have something in
> your mind? I guess we need to try to find a way to get an invalidate
> but only get the corresponding flush a long time (> 1 frame) after
> that? Maybe keep the frame busy with tons of operations that don't
> change the CRC?

Well that's going to be fairly hard since you need to race pageflips with
gtt writes and then try to sneak in gtt writes for the new buffer in
before the fence and everything is set up correctly.

The time between invalidate and flush isn't a problem since for GTT mmaps
it's forever by default. So as long as you don't do any set_domain/execbuf
or anything else GTT mmaps will stay invalidated. But since we want to
filter out GTT invalidates (the hw actually works for that case!) the
problem creeps in when we update the gtt fbc fence in a racy way. The
frontbuffer tracking stuff is race-free (avoiding that race is why there's
the flip_prepare/complete split) itself.

If you want to try the following might work to reproduce this case:

Prerequisites: We don't disable fbc any more over pageflips, which we
  can do on g4x as long as the pixel format stays the same.

1. Take reference crc for black, pain test buffer white.
2. Pageflip to that white buffer.
3. Immediately afterwards (i.e. before the pageflip completed) draw the
buffer black through gtt mmaps with all the set_domain stuff that's
required.
4. Wait a few frames, grab crc.

And now that I've describe in detail there's actually no race here as long
as we update the fbc fence _before_ we do the pageflip. Some invalidates
will hit the old buffer still, but the pageflip will invalidate everything
anyway. And any gtt writes after the flip will do line invalidation
correctly.

So I don't think we'll have a problem, and I also don't think we need
another testcase really.
-Daniel
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index d578211..dc002df 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -782,6 +782,7 @@  struct i915_fbc {
 	unsigned long uncompressed_size;
 	unsigned threshold;
 	unsigned int fb_id;
+	unsigned int possible_framebuffer_bits;
 	struct intel_crtc *crtc;
 	int y;
 
@@ -794,14 +795,6 @@  struct i915_fbc {
 	 * possible. */
 	bool enabled;
 
-	/* On gen8 some rings cannont perform fbc clean operation so for now
-	 * we are doing this on SW with mmio.
-	 * This variable works in the opposite information direction
-	 * of ring->fbc_dirty telling software on frontbuffer tracking
-	 * to perform the cache clean on sw side.
-	 */
-	bool need_sw_cache_clean;
-
 	struct intel_fbc_work {
 		struct delayed_work work;
 		struct drm_crtc *crtc;
diff --git a/drivers/gpu/drm/i915/intel_drv.h b/drivers/gpu/drm/i915/intel_drv.h
index a73d091..ed85df8 100644
--- a/drivers/gpu/drm/i915/intel_drv.h
+++ b/drivers/gpu/drm/i915/intel_drv.h
@@ -1101,7 +1101,11 @@  bool intel_fbc_enabled(struct drm_device *dev);
 void intel_fbc_update(struct drm_device *dev);
 void intel_fbc_init(struct drm_i915_private *dev_priv);
 void intel_fbc_disable(struct drm_device *dev);
-void bdw_fbc_sw_flush(struct drm_device *dev, u32 value);
+void intel_fbc_invalidate(struct drm_i915_private *dev_priv,
+			  unsigned int frontbuffer_bits,
+			  enum fb_op_origin origin);
+void intel_fbc_flush(struct drm_i915_private *dev_priv,
+		     unsigned int frontbuffer_bits, enum fb_op_origin origin);
 
 /* intel_hdmi.c */
 void intel_hdmi_init(struct drm_device *dev, int hdmi_reg, enum port port);
diff --git a/drivers/gpu/drm/i915/intel_fbc.c b/drivers/gpu/drm/i915/intel_fbc.c
index 7341e87..26c4f9c 100644
--- a/drivers/gpu/drm/i915/intel_fbc.c
+++ b/drivers/gpu/drm/i915/intel_fbc.c
@@ -174,29 +174,10 @@  static bool g4x_fbc_enabled(struct drm_device *dev)
 	return I915_READ(DPFC_CONTROL) & DPFC_CTL_EN;
 }
 
-static void snb_fbc_blit_update(struct drm_device *dev)
+static void intel_fbc_nuke(struct drm_i915_private *dev_priv)
 {
-	struct drm_i915_private *dev_priv = dev->dev_private;
-	u32 blt_ecoskpd;
-
-	/* Make sure blitter notifies FBC of writes */
-
-	/* Blitter is part of Media powerwell on VLV. No impact of
-	 * his param in other platforms for now */
-	intel_uncore_forcewake_get(dev_priv, FORCEWAKE_MEDIA);
-
-	blt_ecoskpd = I915_READ(GEN6_BLITTER_ECOSKPD);
-	blt_ecoskpd |= GEN6_BLITTER_FBC_NOTIFY <<
-		GEN6_BLITTER_LOCK_SHIFT;
-	I915_WRITE(GEN6_BLITTER_ECOSKPD, blt_ecoskpd);
-	blt_ecoskpd |= GEN6_BLITTER_FBC_NOTIFY;
-	I915_WRITE(GEN6_BLITTER_ECOSKPD, blt_ecoskpd);
-	blt_ecoskpd &= ~(GEN6_BLITTER_FBC_NOTIFY <<
-			 GEN6_BLITTER_LOCK_SHIFT);
-	I915_WRITE(GEN6_BLITTER_ECOSKPD, blt_ecoskpd);
-	POSTING_READ(GEN6_BLITTER_ECOSKPD);
-
-	intel_uncore_forcewake_put(dev_priv, FORCEWAKE_MEDIA);
+	I915_WRITE(MSG_FBC_REND_STATE, FBC_REND_NUKE);
+	POSTING_READ(MSG_FBC_REND_STATE);
 }
 
 static void ilk_fbc_enable(struct drm_crtc *crtc)
@@ -239,9 +220,10 @@  static void ilk_fbc_enable(struct drm_crtc *crtc)
 		I915_WRITE(SNB_DPFC_CTL_SA,
 			   SNB_CPU_FENCE_ENABLE | obj->fence_reg);
 		I915_WRITE(DPFC_CPU_FENCE_OFFSET, crtc->y);
-		snb_fbc_blit_update(dev);
 	}
 
+	intel_fbc_nuke(dev_priv);
+
 	DRM_DEBUG_KMS("enabled fbc on plane %c\n", plane_name(intel_crtc->plane));
 }
 
@@ -320,7 +302,7 @@  static void gen7_fbc_enable(struct drm_crtc *crtc)
 		   SNB_CPU_FENCE_ENABLE | obj->fence_reg);
 	I915_WRITE(DPFC_CPU_FENCE_OFFSET, crtc->y);
 
-	snb_fbc_blit_update(dev);
+	intel_fbc_nuke(dev_priv);
 
 	DRM_DEBUG_KMS("enabled fbc on plane %c\n", plane_name(intel_crtc->plane));
 }
@@ -340,19 +322,6 @@  bool intel_fbc_enabled(struct drm_device *dev)
 	return dev_priv->fbc.enabled;
 }
 
-void bdw_fbc_sw_flush(struct drm_device *dev, u32 value)
-{
-	struct drm_i915_private *dev_priv = dev->dev_private;
-
-	if (!IS_GEN8(dev))
-		return;
-
-	if (!intel_fbc_enabled(dev))
-		return;
-
-	I915_WRITE(MSG_FBC_REND_STATE, value);
-}
-
 static void intel_fbc_work_fn(struct work_struct *__work)
 {
 	struct intel_fbc_work *work =
@@ -658,6 +627,63 @@  out_disable:
 	i915_gem_stolen_cleanup_compression(dev);
 }
 
+void intel_fbc_invalidate(struct drm_i915_private *dev_priv,
+			  unsigned int frontbuffer_bits,
+			  enum fb_op_origin origin)
+{
+	struct drm_device *dev = dev_priv->dev;
+	unsigned int fbc_bits;
+
+	if (origin == ORIGIN_GTT)
+		return;
+
+	if (dev_priv->fbc.enabled)
+		fbc_bits = INTEL_FRONTBUFFER_PRIMARY(dev_priv->fbc.crtc->pipe);
+	else if (dev_priv->fbc.fbc_work)
+		fbc_bits = INTEL_FRONTBUFFER_PRIMARY(
+			to_intel_crtc(dev_priv->fbc.fbc_work->crtc)->pipe);
+	else
+		fbc_bits = dev_priv->fbc.possible_framebuffer_bits;
+
+	if ((fbc_bits & frontbuffer_bits) == 0)
+		return;
+
+	intel_fbc_disable(dev);
+}
+
+void intel_fbc_flush(struct drm_i915_private *dev_priv,
+		     unsigned int frontbuffer_bits,
+		     enum fb_op_origin origin)
+{
+	struct drm_device *dev = dev_priv->dev;
+	unsigned int fbc_bits = 0;
+	bool fbc_enabled;
+
+	if (dev_priv->fbc.enabled)
+		fbc_bits = INTEL_FRONTBUFFER_PRIMARY(dev_priv->fbc.crtc->pipe);
+	else
+		fbc_bits = dev_priv->fbc.possible_framebuffer_bits;
+
+	/* These are not the planes you are looking for. */
+	if ((frontbuffer_bits & fbc_bits) == 0)
+		return;
+
+	fbc_enabled = intel_fbc_enabled(dev);
+
+	if ((dev_priv->fb_tracking.busy_bits & frontbuffer_bits &
+	     fbc_bits) == 0) {
+		if (fbc_enabled) {
+			if (origin == ORIGIN_CS || origin == ORIGIN_CPU)
+				intel_fbc_nuke(dev_priv);
+		} else {
+			intel_fbc_update(dev);
+		}
+	} else {
+		if (fbc_enabled)
+			intel_fbc_disable(dev);
+	}
+}
+
 /**
  * intel_fbc_init - Initialize FBC
  * @dev_priv: the i915 device
@@ -666,12 +692,19 @@  out_disable:
  */
 void intel_fbc_init(struct drm_i915_private *dev_priv)
 {
+	enum pipe pipe;
+
 	if (!HAS_FBC(dev_priv)) {
 		dev_priv->fbc.enabled = false;
 		dev_priv->fbc.no_fbc_reason = FBC_UNSUPPORTED;
 		return;
 	}
 
+	/* TODO: some platforms have FBC tied to a specific plane! */
+	for_each_pipe(dev_priv, pipe)
+		dev_priv->fbc.possible_framebuffer_bits |=
+				INTEL_FRONTBUFFER_PRIMARY(pipe);
+
 	if (INTEL_INFO(dev_priv)->gen >= 7) {
 		dev_priv->display.fbc_enabled = ilk_fbc_enabled;
 		dev_priv->display.enable_fbc = gen7_fbc_enable;
diff --git a/drivers/gpu/drm/i915/intel_frontbuffer.c b/drivers/gpu/drm/i915/intel_frontbuffer.c
index 0a773981..2fc059c 100644
--- a/drivers/gpu/drm/i915/intel_frontbuffer.c
+++ b/drivers/gpu/drm/i915/intel_frontbuffer.c
@@ -118,8 +118,6 @@  static void intel_mark_fb_busy(struct drm_device *dev,
 			continue;
 
 		intel_increase_pllclock(dev, pipe);
-		if (ring && intel_fbc_enabled(dev))
-			ring->fbc_dirty = true;
 	}
 }
 
@@ -160,6 +158,7 @@  void intel_fb_obj_invalidate(struct drm_i915_gem_object *obj,
 
 	intel_psr_invalidate(dev, obj->frontbuffer_bits);
 	intel_edp_drrs_invalidate(dev, obj->frontbuffer_bits);
+	intel_fbc_invalidate(dev_priv, obj->frontbuffer_bits, origin);
 }
 
 /**
@@ -189,16 +188,7 @@  void intel_frontbuffer_flush(struct drm_device *dev,
 
 	intel_edp_drrs_flush(dev, frontbuffer_bits);
 	intel_psr_flush(dev, frontbuffer_bits);
-
-	/*
-	 * FIXME: Unconditional fbc flushing here is a rather gross hack and
-	 * needs to be reworked into a proper frontbuffer tracking scheme like
-	 * psr employs.
-	 */
-	if (dev_priv->fbc.need_sw_cache_clean) {
-		dev_priv->fbc.need_sw_cache_clean = false;
-		bdw_fbc_sw_flush(dev, FBC_REND_CACHE_CLEAN);
-	}
+	intel_fbc_flush(dev_priv, frontbuffer_bits, origin);
 }
 
 /**
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index 573b80f..48dd8a2 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -317,29 +317,6 @@  gen7_render_ring_cs_stall_wa(struct intel_engine_cs *ring)
 	return 0;
 }
 
-static int gen7_ring_fbc_flush(struct intel_engine_cs *ring, u32 value)
-{
-	int ret;
-
-	if (!ring->fbc_dirty)
-		return 0;
-
-	ret = intel_ring_begin(ring, 6);
-	if (ret)
-		return ret;
-	/* WaFbcNukeOn3DBlt:ivb/hsw */
-	intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(1));
-	intel_ring_emit(ring, MSG_FBC_REND_STATE);
-	intel_ring_emit(ring, value);
-	intel_ring_emit(ring, MI_STORE_REGISTER_MEM(1) | MI_SRM_LRM_GLOBAL_GTT);
-	intel_ring_emit(ring, MSG_FBC_REND_STATE);
-	intel_ring_emit(ring, ring->scratch.gtt_offset + 256);
-	intel_ring_advance(ring);
-
-	ring->fbc_dirty = false;
-	return 0;
-}
-
 static int
 gen7_render_ring_flush(struct intel_engine_cs *ring,
 		       u32 invalidate_domains, u32 flush_domains)
@@ -398,9 +375,6 @@  gen7_render_ring_flush(struct intel_engine_cs *ring,
 	intel_ring_emit(ring, 0);
 	intel_ring_advance(ring);
 
-	if (!invalidate_domains && flush_domains)
-		return gen7_ring_fbc_flush(ring, FBC_REND_NUKE);
-
 	return 0;
 }
 
@@ -462,9 +436,6 @@  gen8_render_ring_flush(struct intel_engine_cs *ring,
 	if (ret)
 		return ret;
 
-	if (!invalidate_domains && flush_domains)
-		return gen7_ring_fbc_flush(ring, FBC_REND_NUKE);
-
 	return 0;
 }
 
@@ -2382,7 +2353,6 @@  static int gen6_ring_flush(struct intel_engine_cs *ring,
 			   u32 invalidate, u32 flush)
 {
 	struct drm_device *dev = ring->dev;
-	struct drm_i915_private *dev_priv = dev->dev_private;
 	uint32_t cmd;
 	int ret;
 
@@ -2391,7 +2361,7 @@  static int gen6_ring_flush(struct intel_engine_cs *ring,
 		return ret;
 
 	cmd = MI_FLUSH_DW;
-	if (INTEL_INFO(ring->dev)->gen >= 8)
+	if (INTEL_INFO(dev)->gen >= 8)
 		cmd += 1;
 	/*
 	 * Bspec vol 1c.3 - blitter engine command streamer:
@@ -2404,7 +2374,7 @@  static int gen6_ring_flush(struct intel_engine_cs *ring,
 			MI_FLUSH_DW_OP_STOREDW;
 	intel_ring_emit(ring, cmd);
 	intel_ring_emit(ring, I915_GEM_HWS_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT);
-	if (INTEL_INFO(ring->dev)->gen >= 8) {
+	if (INTEL_INFO(dev)->gen >= 8) {
 		intel_ring_emit(ring, 0); /* upper addr */
 		intel_ring_emit(ring, 0); /* value */
 	} else  {
@@ -2413,13 +2383,6 @@  static int gen6_ring_flush(struct intel_engine_cs *ring,
 	}
 	intel_ring_advance(ring);
 
-	if (!invalidate && flush) {
-		if (IS_GEN7(dev))
-			return gen7_ring_fbc_flush(ring, FBC_REND_CACHE_CLEAN);
-		else if (IS_BROADWELL(dev))
-			dev_priv->fbc.need_sw_cache_clean = true;
-	}
-
 	return 0;
 }
 
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 714f3fd..f8dc0c6 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -267,7 +267,6 @@  struct  intel_engine_cs {
 	 */
 	struct drm_i915_gem_request *outstanding_lazy_request;
 	bool gpu_caches_dirty;
-	bool fbc_dirty;
 
 	wait_queue_head_t irq_queue;