diff mbox

[v3] drm/i915/bdw: BDW Software Turbo

Message ID 1407780518-26610-1-git-send-email-daisy.sun@intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

Daisy Sun Aug. 11, 2014, 6:08 p.m. UTC
BDW supports GT C0 residency reporting in constant time units. The driver
calculates GT utilization based on C0 residency and adjusts the RP
frequency up/down accordingly. For offscreen workloads specifically,
set the frequency to RP0.

Offscreen tasks are not restricted by frame rate; they can be
executed as soon as possible. Transcoding and serialized workloads
between CPU and GPU both need high GT performance, so RP0 is a good
option in this case. RC6 will kick in to compensate for power
consumption when GT is not active.

v2: Rebase on recent drm-intel-nightly
v3: Add flip timeout monitor; when no flip is detected within
100ms, set frequency to RP0.

Signed-off-by: Daisy Sun <daisy.sun@intel.com>
[torourke: rebased on latest and resolved conflict]
Signed-off-by: Tom O'Rourke <Tom.O'Rourke@intel.com>
---
 drivers/gpu/drm/i915/i915_drv.h      |  22 ++++
 drivers/gpu/drm/i915/i915_irq.c      |  21 ++++
 drivers/gpu/drm/i915/i915_reg.h      |   4 +
 drivers/gpu/drm/i915/intel_display.c |   3 +
 drivers/gpu/drm/i915/intel_pm.c      | 230 +++++++++++++++++++++++++++++------
 5 files changed, 241 insertions(+), 39 deletions(-)

Comments

Daniel Vetter Aug. 11, 2014, 9:33 p.m. UTC | #1
On Mon, Aug 11, 2014 at 11:08:38AM -0700, Daisy Sun wrote:
> BDW supports GT C0 residency reporting in constant time unit. Driver
> calculates GT utilization based on C0 residency and adjusts RP
> frequency up/down accordingly. For offscreen workload specificly,
> set frequency to RP0.
> 
> Offscreen task is not restricted by frame rate, it can be
> executed as soon as possible. Transcoding and serilized workload
> between CPU and GPU both need high GT performance, RP0 is a good
> option in this case. RC6 will kick in to compensate power
> consumption when GT is not active.
> 
> v2: Rebase on recent drm-intel-nightly
> v3: Add flip timerout monitor, when no flip is deteced within
> 100ms, set frequency to RP0.

Ok, let's make this really clear:

If you wire this into the flip handling in any way, I will not merge your
patch. The timer should be fully independent and tie into the gpu
busy/idle handling we already have.

Thanks, Daniel

> 
> Signed-off-by: Daisy Sun <daisy.sun@intel.com>
> [torourke: rebased on latest and resolved conflict]
> Signed-off-by: Tom O'Rourke <Tom.O'Rourke@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_drv.h      |  22 ++++
>  drivers/gpu/drm/i915/i915_irq.c      |  21 ++++
>  drivers/gpu/drm/i915/i915_reg.h      |   4 +
>  drivers/gpu/drm/i915/intel_display.c |   3 +
>  drivers/gpu/drm/i915/intel_pm.c      | 230 +++++++++++++++++++++++++++++------
>  5 files changed, 241 insertions(+), 39 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index ef38c3b..f1c4c5b 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -915,6 +915,23 @@ struct intel_rps_ei {
>  	u32 media_c0;
>  };
>  
> +struct intel_rps_bdw_cal {
> +	u32 it_threshold_pct; /* interrupt, in percentage */
> +	u32 eval_interval; /* evaluation interval, in us */
> +	u32 last_ts;
> +	u32 last_c0;
> +	bool is_up;
> +};
> +
> +struct intel_rps_bdw_turbo {
> +	struct intel_rps_bdw_cal up;
> +	struct intel_rps_bdw_cal down;
> +	struct timer_list flip_timer;
> +	u32 timeout;
> +	atomic_t flip_received;
> +	struct work_struct work_max_freq;
> +};
> +
>  struct intel_gen6_power_mgmt {
>  	/* work and pm_iir are protected by dev_priv->irq_lock */
>  	struct work_struct work;
> @@ -948,6 +965,9 @@ struct intel_gen6_power_mgmt {
>  	bool enabled;
>  	struct delayed_work delayed_resume_work;
>  
> +	bool is_bdw_sw_turbo;	/* Switch of BDW software turbo */
> +	struct intel_rps_bdw_turbo sw_turbo; /* Calculate RP interrupt timing */
> +
>  	/* manual wa residency calculations */
>  	struct intel_rps_ei up_ei, down_ei;
>  
> @@ -2703,6 +2723,8 @@ extern void intel_disable_fbc(struct drm_device *dev);
>  extern bool ironlake_set_drps(struct drm_device *dev, u8 val);
>  extern void intel_init_pch_refclk(struct drm_device *dev);
>  extern void gen6_set_rps(struct drm_device *dev, u8 val);
> +extern void bdw_software_turbo(struct drm_device *dev);
> +extern void gen8_flip_interrupt(struct drm_device *dev);
>  extern void valleyview_set_rps(struct drm_device *dev, u8 val);
>  extern void intel_set_memory_cxsr(struct drm_i915_private *dev_priv,
>  				  bool enable);
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index 6ef9d6f..367f8e1 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -1961,6 +1961,27 @@ static void i9xx_pipe_crc_irq_handler(struct drm_device *dev, enum pipe pipe)
>  				     res1, res2);
>  }
>  
> +void gen8_flip_interrupt(struct drm_device *dev)
> +{
> +	struct drm_i915_private *dev_priv = dev->dev_private;
> +
> +	if (!dev_priv->rps.is_bdw_sw_turbo)
> +		return;
> +
> +	if(atomic_read(&dev_priv->rps.sw_turbo.flip_received)) {
> +		mod_timer(&dev_priv->rps.sw_turbo.flip_timer,
> +				usecs_to_jiffies(dev_priv->rps.sw_turbo.timeout) + jiffies);
> +	}
> +	else {
> +		dev_priv->rps.sw_turbo.flip_timer.expires =
> +				usecs_to_jiffies(dev_priv->rps.sw_turbo.timeout) + jiffies;
> +		add_timer(&dev_priv->rps.sw_turbo.flip_timer);
> +		atomic_set(&dev_priv->rps.sw_turbo.flip_received, true);
> +	}
> +
> +	bdw_software_turbo(dev);
> +}
> +
>  /* The RPS events need forcewake, so we add them to a work queue and mask their
>   * IMR bits until the work is done. Other interrupts can be processed without
>   * the work queue. */
> diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
> index fe5c276..088e0e1 100644
> --- a/drivers/gpu/drm/i915/i915_reg.h
> +++ b/drivers/gpu/drm/i915/i915_reg.h
> @@ -5453,6 +5453,10 @@ enum punit_power_well {
>  #define GEN8_UCGCTL6				0x9430
>  #define   GEN8_SDEUNIT_CLOCK_GATE_DISABLE	(1<<14)
>  
> +#define TIMESTAMP_CTR		0x44070
> +#define FREQ_1_28_US(us)	(((us) * 100) >> 7)
> +#define MCHBAR_PCU_C0		(MCHBAR_MIRROR_BASE_SNB + 0x5960)
> +
>  #define GEN6_GFXPAUSE				0xA000
>  #define GEN6_RPNSWREQ				0xA008
>  #define   GEN6_TURBO_DISABLE			(1<<31)
> diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
> index 99eb7ca..1dd8a7c 100644
> --- a/drivers/gpu/drm/i915/intel_display.c
> +++ b/drivers/gpu/drm/i915/intel_display.c
> @@ -9661,6 +9661,9 @@ static int intel_crtc_page_flip(struct drm_crtc *crtc,
>  	unsigned long flags;
>  	int ret;
>  
> +	//trigger software GT busyness calculation
> +	gen8_flip_interrupt(dev);
> +
>  	/*
>  	 * drm_mode_page_flip_ioctl() should already catch this, but double
>  	 * check to be safe.  In the future we may enable pageflipping from
> diff --git a/drivers/gpu/drm/i915/intel_pm.c b/drivers/gpu/drm/i915/intel_pm.c
> index 3f88f29..e13d0ff 100644
> --- a/drivers/gpu/drm/i915/intel_pm.c
> +++ b/drivers/gpu/drm/i915/intel_pm.c
> @@ -2122,7 +2122,6 @@ int ilk_wm_max_level(const struct drm_device *dev)
>  	else
>  		return 2;
>  }
> -
>  static void intel_print_wm_latency(struct drm_device *dev,
>  				   const char *name,
>  				   const uint16_t wm[5])
> @@ -3091,6 +3090,9 @@ static void gen6_set_rps_thresholds(struct drm_i915_private *dev_priv, u8 val)
>  {
>  	int new_power;
>  
> +	if (dev_priv->rps.is_bdw_sw_turbo)
> +		return;
> +
>  	new_power = dev_priv->rps.power;
>  	switch (dev_priv->rps.power) {
>  	case LOW_POWER:
> @@ -3298,8 +3300,11 @@ void gen6_rps_idle(struct drm_i915_private *dev_priv)
>  			valleyview_set_rps(dev_priv->dev, dev_priv->rps.min_freq_softlimit);
>  		else if (IS_VALLEYVIEW(dev))
>  			vlv_set_rps_idle(dev_priv);
> -		else
> +		else if (!dev_priv->rps.is_bdw_sw_turbo
> +					|| atomic_read(&dev_priv->rps.sw_turbo.flip_received)){
>  			gen6_set_rps(dev_priv->dev, dev_priv->rps.min_freq_softlimit);
> +		}
> +
>  		dev_priv->rps.last_adj = 0;
>  	}
>  	mutex_unlock(&dev_priv->rps.hw_lock);
> @@ -3313,8 +3318,11 @@ void gen6_rps_boost(struct drm_i915_private *dev_priv)
>  	if (dev_priv->rps.enabled) {
>  		if (IS_VALLEYVIEW(dev))
>  			valleyview_set_rps(dev_priv->dev, dev_priv->rps.max_freq_softlimit);
> -		else
> +		else if (!dev_priv->rps.is_bdw_sw_turbo
> +					|| atomic_read(&dev_priv->rps.sw_turbo.flip_received)){
>  			gen6_set_rps(dev_priv->dev, dev_priv->rps.max_freq_softlimit);
> +		}
> +
>  		dev_priv->rps.last_adj = 0;
>  	}
>  	mutex_unlock(&dev_priv->rps.hw_lock);
> @@ -3345,21 +3353,26 @@ void valleyview_set_rps(struct drm_device *dev, u8 val)
>  static void gen8_disable_rps_interrupts(struct drm_device *dev)
>  {
>  	struct drm_i915_private *dev_priv = dev->dev_private;
> +	if (IS_BROADWELL(dev) && dev_priv->rps.is_bdw_sw_turbo){
> +		if (atomic_read(&dev_priv->rps.sw_turbo.flip_received))
> +			del_timer(&dev_priv->rps.sw_turbo.flip_timer);
> +		dev_priv-> rps.is_bdw_sw_turbo = false;
> +	} else {
> +		I915_WRITE(GEN6_PMINTRMSK, ~GEN8_PMINTR_REDIRECT_TO_NON_DISP);
> +		I915_WRITE(GEN8_GT_IER(2), I915_READ(GEN8_GT_IER(2)) &
> +					   ~dev_priv->pm_rps_events);
> +		/* Complete PM interrupt masking here doesn't race with the rps work
> +		 * item again unmasking PM interrupts because that is using a different
> +		 * register (GEN8_GT_IMR(2)) to mask PM interrupts. The only risk is in
> +		 * leaving stale bits in GEN8_GT_IIR(2) and GEN8_GT_IMR(2) which
> +		 * gen8_enable_rps will clean up. */
>  
> -	I915_WRITE(GEN6_PMINTRMSK, ~GEN8_PMINTR_REDIRECT_TO_NON_DISP);
> -	I915_WRITE(GEN8_GT_IER(2), I915_READ(GEN8_GT_IER(2)) &
> -				   ~dev_priv->pm_rps_events);
> -	/* Complete PM interrupt masking here doesn't race with the rps work
> -	 * item again unmasking PM interrupts because that is using a different
> -	 * register (GEN8_GT_IMR(2)) to mask PM interrupts. The only risk is in
> -	 * leaving stale bits in GEN8_GT_IIR(2) and GEN8_GT_IMR(2) which
> -	 * gen8_enable_rps will clean up. */
> -
> -	spin_lock_irq(&dev_priv->irq_lock);
> -	dev_priv->rps.pm_iir = 0;
> -	spin_unlock_irq(&dev_priv->irq_lock);
> +		spin_lock_irq(&dev_priv->irq_lock);
> +		dev_priv->rps.pm_iir = 0;
> +		spin_unlock_irq(&dev_priv->irq_lock);
>  
> -	I915_WRITE(GEN8_GT_IIR(2), dev_priv->pm_rps_events);
> +		I915_WRITE(GEN8_GT_IIR(2), dev_priv->pm_rps_events);
> +	}
>  }
>  
>  static void gen6_disable_rps_interrupts(struct drm_device *dev)
> @@ -3511,13 +3524,111 @@ static void parse_rp_state_cap(struct drm_i915_private *dev_priv, u32 rp_state_c
>  		dev_priv->rps.min_freq_softlimit = dev_priv->rps.min_freq;
>  }
>  
> +static void bdw_sw_calculate_freq(struct drm_device *dev,
> +		struct intel_rps_bdw_cal *c, u32 *cur_time, u32 *c0)
> +{
> +	struct drm_i915_private *dev_priv = dev->dev_private;
> +	u64 busy = 0;
> +	u32 busyness_pct = 0;
> +	u32 elapsed_time = 0;
> +	u16 new_freq = 0;
> +
> +	if (!c || !cur_time || !c0)
> +		return;
> +
> +	if (0 == c->last_c0)
> +		goto out;
> +
> +	/* Check Evaluation interval */
> +	elapsed_time = *cur_time - c->last_ts;
> +	if (elapsed_time < c->eval_interval)
> +		return;
> +
> +	mutex_lock(&dev_priv->rps.hw_lock);
> +
> +	/*
> +	 * c0 unit in 32*1.28 usec, elapsed_time unit in 1 usec.
> +	 * Whole busyness_pct calculation should be
> +	 *     busy = ((u64)(*c0 - c->last_c0) << 5 << 7) / 100;
> +	 *     busyness_pct = (u32)(busy * 100 / elapsed_time);
> +	 * The final formula is to simplify CPU calculation
> +	 */
> +	busy = (u64)(*c0 - c->last_c0) << 12;
> +	do_div(busy, elapsed_time);
> +	busyness_pct = (u32)busy;
> +
> +	if (c->is_up && busyness_pct >= c->it_threshold_pct)
> +		new_freq = (u16)dev_priv->rps.cur_freq + 3;
> +	if (!c->is_up && busyness_pct <= c->it_threshold_pct)
> +		new_freq = (u16)dev_priv->rps.cur_freq - 1;
> +
> +	/* Adjust to new frequency busyness and compare with threshold */
> +	if (0 != new_freq) {
> +		if (new_freq > dev_priv->rps.max_freq_softlimit)
> +			new_freq = dev_priv->rps.max_freq_softlimit;
> +		else if (new_freq < dev_priv->rps.min_freq_softlimit)
> +			new_freq = dev_priv->rps.min_freq_softlimit;
> +
> +		gen6_set_rps(dev, new_freq);
> +	}
> +
> +	mutex_unlock(&dev_priv->rps.hw_lock);
> +
> +out:
> +	c->last_c0 = *c0;
> +	c->last_ts = *cur_time;
> +}
> +
> +static void gen8_set_frequency_RP0(struct work_struct *work)
> +{
> +	struct intel_rps_bdw_turbo *p_bdw_turbo =
> +			container_of(work, struct intel_rps_bdw_turbo, work_max_freq);
> +	struct intel_gen6_power_mgmt *p_power_mgmt =
> +			container_of(p_bdw_turbo, struct intel_gen6_power_mgmt, sw_turbo);
> +	struct drm_i915_private *dev_priv =
> +			container_of(p_power_mgmt, struct drm_i915_private, rps);
> +
> +	mutex_lock(&dev_priv->rps.hw_lock);
> +	gen6_set_rps(dev_priv->dev, dev_priv->rps.rp0_freq);
> +	mutex_unlock(&dev_priv->rps.hw_lock);
> +}
> +
> +static void flip_active_timeout_handler(unsigned long var)
> +{
> +	struct drm_i915_private *dev_priv = (struct drm_i915_private *) var;
> +
> +	del_timer(&dev_priv->rps.sw_turbo.flip_timer);
> +	atomic_set(&dev_priv->rps.sw_turbo.flip_received, false);
> +
> +	queue_work(dev_priv->wq, &dev_priv->rps.sw_turbo.work_max_freq);
> +}
> +
> +void bdw_software_turbo(struct drm_device *dev)
> +{
> +	struct drm_i915_private *dev_priv = dev->dev_private;
> +
> +	u32 current_time = I915_READ(TIMESTAMP_CTR); /* unit in usec */
> +	u32 current_c0 = I915_READ(MCHBAR_PCU_C0); /* unit in 32*1.28 usec */
> +
> +	bdw_sw_calculate_freq(dev, &dev_priv->rps.sw_turbo.up,
> +			&current_time, &current_c0);
> +	bdw_sw_calculate_freq(dev, &dev_priv->rps.sw_turbo.down,
> +			&current_time, &current_c0);
> +}
> +
>  static void gen8_enable_rps(struct drm_device *dev)
>  {
>  	struct drm_i915_private *dev_priv = dev->dev_private;
>  	struct intel_engine_cs *ring;
>  	uint32_t rc6_mask = 0, rp_state_cap;
> +	uint32_t threshold_up_pct, threshold_down_pct;
> +	uint32_t ei_up, ei_down; /* up and down evaluation interval */
> +	u32 rp_ctl_flag;
>  	int unused;
>  
> +	/* Use software Turbo for BDW */
> +	dev_priv->rps.is_bdw_sw_turbo = IS_BROADWELL(dev);
> +
>  	/* 1a: Software RC state - RC0 */
>  	I915_WRITE(GEN6_RC_STATE, 0);
>  
> @@ -3561,35 +3672,74 @@ static void gen8_enable_rps(struct drm_device *dev)
>  		   HSW_FREQUENCY(dev_priv->rps.rp1_freq));
>  	I915_WRITE(GEN6_RC_VIDEO_FREQ,
>  		   HSW_FREQUENCY(dev_priv->rps.rp1_freq));
> -	/* NB: Docs say 1s, and 1000000 - which aren't equivalent */
> -	I915_WRITE(GEN6_RP_DOWN_TIMEOUT, 100000000 / 128); /* 1 second timeout */
> +	ei_up = 84480; /* 84.48ms */
> +	ei_down = 448000;
> +	threshold_up_pct = 90; /* x percent busy */
> +	threshold_down_pct = 70;
> +
> +	if (dev_priv->rps.is_bdw_sw_turbo) {
> +		dev_priv->rps.sw_turbo.up.it_threshold_pct = threshold_up_pct;
> +		dev_priv->rps.sw_turbo.up.eval_interval = ei_up;
> +		dev_priv->rps.sw_turbo.up.is_up = true;
> +		dev_priv->rps.sw_turbo.up.last_ts = 0;
> +		dev_priv->rps.sw_turbo.up.last_c0 = 0;
> +
> +		dev_priv->rps.sw_turbo.down.it_threshold_pct = threshold_down_pct;
> +		dev_priv->rps.sw_turbo.down.eval_interval = ei_down;
> +		dev_priv->rps.sw_turbo.down.is_up = false;
> +		dev_priv->rps.sw_turbo.down.last_ts = 0;
> +		dev_priv->rps.sw_turbo.down.last_c0 = 0;
> +
> +		/* Start the timer to track if flip comes*/
> +		dev_priv->rps.sw_turbo.timeout = 200*1000; /* in us */
> +
> +		init_timer(&dev_priv->rps.sw_turbo.flip_timer);
> +		dev_priv->rps.sw_turbo.flip_timer.function = flip_active_timeout_handler;
> +		dev_priv->rps.sw_turbo.flip_timer.data  = (unsigned long) dev_priv;
> +		dev_priv->rps.sw_turbo.flip_timer.expires =
> +			usecs_to_jiffies(dev_priv->rps.sw_turbo.timeout) + jiffies;
> +		add_timer(&dev_priv->rps.sw_turbo.flip_timer);
> +		INIT_WORK(&dev_priv->rps.sw_turbo.work_max_freq, gen8_set_frequency_RP0);
> +
> +		atomic_set(&dev_priv->rps.sw_turbo.flip_received, true);
> +	} else {
> +		/* NB: Docs say 1s, and 1000000 - which aren't equivalent
> +		 * 1 second timeout*/
> +		I915_WRITE(GEN6_RP_DOWN_TIMEOUT, FREQ_1_28_US(1000000));
>  
> -	/* Docs recommend 900MHz, and 300 MHz respectively */
> -	I915_WRITE(GEN6_RP_INTERRUPT_LIMITS,
> -		   dev_priv->rps.max_freq_softlimit << 24 |
> -		   dev_priv->rps.min_freq_softlimit << 16);
> +		/* Docs recommend 900MHz, and 300 MHz respectively */
> +		I915_WRITE(GEN6_RP_INTERRUPT_LIMITS,
> +			   dev_priv->rps.max_freq_softlimit << 24 |
> +			   dev_priv->rps.min_freq_softlimit << 16);
>  
> -	I915_WRITE(GEN6_RP_UP_THRESHOLD, 7600000 / 128); /* 76ms busyness per EI, 90% */
> -	I915_WRITE(GEN6_RP_DOWN_THRESHOLD, 31300000 / 128); /* 313ms busyness per EI, 70%*/
> -	I915_WRITE(GEN6_RP_UP_EI, 66000); /* 84.48ms, XXX: random? */
> -	I915_WRITE(GEN6_RP_DOWN_EI, 350000); /* 448ms, XXX: random? */
> +		I915_WRITE(GEN6_RP_UP_THRESHOLD,
> +			FREQ_1_28_US(ei_up * threshold_up_pct / 100));
> +		I915_WRITE(GEN6_RP_DOWN_THRESHOLD,
> +			FREQ_1_28_US(ei_down * threshold_down_pct / 100));
> +		I915_WRITE(GEN6_RP_UP_EI,
> +			FREQ_1_28_US(ei_up));
> +		I915_WRITE(GEN6_RP_DOWN_EI,
> +			FREQ_1_28_US(ei_down));
>  
> -	I915_WRITE(GEN6_RP_IDLE_HYSTERSIS, 10);
> +		I915_WRITE(GEN6_RP_IDLE_HYSTERSIS, 10);
> +	}
>  
>  	/* 5: Enable RPS */
> -	I915_WRITE(GEN6_RP_CONTROL,
> -		   GEN6_RP_MEDIA_TURBO |
> -		   GEN6_RP_MEDIA_HW_NORMAL_MODE |
> -		   GEN6_RP_MEDIA_IS_GFX |
> -		   GEN6_RP_ENABLE |
> -		   GEN6_RP_UP_BUSY_AVG |
> -		   GEN6_RP_DOWN_IDLE_AVG);
> -
> -	/* 6: Ring frequency + overclocking (our driver does this later */
> -
> +	rp_ctl_flag = GEN6_RP_MEDIA_TURBO |
> +					GEN6_RP_MEDIA_HW_NORMAL_MODE |
> +					GEN6_RP_MEDIA_IS_GFX |
> +					GEN6_RP_UP_BUSY_AVG |
> +					GEN6_RP_DOWN_IDLE_AVG;
> +	if (!dev_priv->rps.is_bdw_sw_turbo)
> +		rp_ctl_flag |= GEN6_RP_ENABLE;
> +
> +	I915_WRITE(GEN6_RP_CONTROL, rp_ctl_flag);
> +
> +	/* 6: Ring frequency + overclocking
> +	 * (our driver does this later */
>  	gen6_set_rps(dev, (I915_READ(GEN6_GT_PERF_STATUS) & 0xff00) >> 8);
> -
> -	gen8_enable_rps_interrupts(dev);
> +	if (!dev_priv->rps.is_bdw_sw_turbo)
> +		gen8_enable_rps_interrupts(dev);
>  
>  	gen6_gt_force_wake_put(dev_priv, FORCEWAKE_ALL);
>  }
> @@ -5018,6 +5168,8 @@ static void intel_gen6_powersave_work(struct work_struct *work)
>  			     rps.delayed_resume_work.work);
>  	struct drm_device *dev = dev_priv->dev;
>  
> +	dev_priv->rps.is_bdw_sw_turbo = false;
> +
>  	mutex_lock(&dev_priv->rps.hw_lock);
>  
>  	if (IS_CHERRYVIEW(dev)) {
> -- 
> 1.9.1
> 
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx
Daisy Sun Aug. 11, 2014, 10:13 p.m. UTC | #2
The expected design is a constant timer which will trigger the GT busyness
calculation steadily — I understand.
Yet the situation is that we have to wrap up BDW-related work, as we
discussed with Jesse; there are not enough resources to do further
development on the constant-timer scheme, so I'm sorry that I'll not be
able to rework this patch.

It would be great if Linux validation could at least do a
power/performance comparison study on the current solution to settle the
concerns. We feel this proposal has minimal package C-state impact and
hopefully a better performance/watt impact.

Thanks,
Daisy

On 8/11/2014 2:33 PM, Daniel Vetter wrote:
> On Mon, Aug 11, 2014 at 11:08:38AM -0700, Daisy Sun wrote:
>> BDW supports GT C0 residency reporting in constant time unit. Driver
>> calculates GT utilization based on C0 residency and adjusts RP
>> frequency up/down accordingly. For offscreen workload specificly,
>> set frequency to RP0.
>>
>> Offscreen task is not restricted by frame rate, it can be
>> executed as soon as possible. Transcoding and serilized workload
>> between CPU and GPU both need high GT performance, RP0 is a good
>> option in this case. RC6 will kick in to compensate power
>> consumption when GT is not active.
>>
>> v2: Rebase on recent drm-intel-nightly
>> v3: Add flip timerout monitor, when no flip is deteced within
>> 100ms, set frequency to RP0.
> Ok, let's make this really clear:
>
> If you wire this into the flip handling in any way, I will not merge your
> patch. The timer should be fully independant and tie into the gpu
> busy/idle handling we already have.
>
> Thanks, Daniel
>
>> Signed-off-by: Daisy Sun <daisy.sun@intel.com>
>> [torourke: rebased on latest and resolved conflict]
>> Signed-off-by: Tom O'Rourke <Tom.O'Rourke@intel.com>
>> ---
>>   drivers/gpu/drm/i915/i915_drv.h      |  22 ++++
>>   drivers/gpu/drm/i915/i915_irq.c      |  21 ++++
>>   drivers/gpu/drm/i915/i915_reg.h      |   4 +
>>   drivers/gpu/drm/i915/intel_display.c |   3 +
>>   drivers/gpu/drm/i915/intel_pm.c      | 230 +++++++++++++++++++++++++++++------
>>   5 files changed, 241 insertions(+), 39 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
>> index ef38c3b..f1c4c5b 100644
>> --- a/drivers/gpu/drm/i915/i915_drv.h
>> +++ b/drivers/gpu/drm/i915/i915_drv.h
>> @@ -915,6 +915,23 @@ struct intel_rps_ei {
>>   	u32 media_c0;
>>   };
>>   
>> +struct intel_rps_bdw_cal {
>> +	u32 it_threshold_pct; /* interrupt, in percentage */
>> +	u32 eval_interval; /* evaluation interval, in us */
>> +	u32 last_ts;
>> +	u32 last_c0;
>> +	bool is_up;
>> +};
>> +
>> +struct intel_rps_bdw_turbo {
>> +	struct intel_rps_bdw_cal up;
>> +	struct intel_rps_bdw_cal down;
>> +	struct timer_list flip_timer;
>> +	u32 timeout;
>> +	atomic_t flip_received;
>> +	struct work_struct work_max_freq;
>> +};
>> +
>>   struct intel_gen6_power_mgmt {
>>   	/* work and pm_iir are protected by dev_priv->irq_lock */
>>   	struct work_struct work;
>> @@ -948,6 +965,9 @@ struct intel_gen6_power_mgmt {
>>   	bool enabled;
>>   	struct delayed_work delayed_resume_work;
>>   
>> +	bool is_bdw_sw_turbo;	/* Switch of BDW software turbo */
>> +	struct intel_rps_bdw_turbo sw_turbo; /* Calculate RP interrupt timing */
>> +
>>   	/* manual wa residency calculations */
>>   	struct intel_rps_ei up_ei, down_ei;
>>   
>> @@ -2703,6 +2723,8 @@ extern void intel_disable_fbc(struct drm_device *dev);
>>   extern bool ironlake_set_drps(struct drm_device *dev, u8 val);
>>   extern void intel_init_pch_refclk(struct drm_device *dev);
>>   extern void gen6_set_rps(struct drm_device *dev, u8 val);
>> +extern void bdw_software_turbo(struct drm_device *dev);
>> +extern void gen8_flip_interrupt(struct drm_device *dev);
>>   extern void valleyview_set_rps(struct drm_device *dev, u8 val);
>>   extern void intel_set_memory_cxsr(struct drm_i915_private *dev_priv,
>>   				  bool enable);
>> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
>> index 6ef9d6f..367f8e1 100644
>> --- a/drivers/gpu/drm/i915/i915_irq.c
>> +++ b/drivers/gpu/drm/i915/i915_irq.c
>> @@ -1961,6 +1961,27 @@ static void i9xx_pipe_crc_irq_handler(struct drm_device *dev, enum pipe pipe)
>>   				     res1, res2);
>>   }
>>   
>> +void gen8_flip_interrupt(struct drm_device *dev)
>> +{
>> +	struct drm_i915_private *dev_priv = dev->dev_private;
>> +
>> +	if (!dev_priv->rps.is_bdw_sw_turbo)
>> +		return;
>> +
>> +	if(atomic_read(&dev_priv->rps.sw_turbo.flip_received)) {
>> +		mod_timer(&dev_priv->rps.sw_turbo.flip_timer,
>> +				usecs_to_jiffies(dev_priv->rps.sw_turbo.timeout) + jiffies);
>> +	}
>> +	else {
>> +		dev_priv->rps.sw_turbo.flip_timer.expires =
>> +				usecs_to_jiffies(dev_priv->rps.sw_turbo.timeout) + jiffies;
>> +		add_timer(&dev_priv->rps.sw_turbo.flip_timer);
>> +		atomic_set(&dev_priv->rps.sw_turbo.flip_received, true);
>> +	}
>> +
>> +	bdw_software_turbo(dev);
>> +}
>> +
>>   /* The RPS events need forcewake, so we add them to a work queue and mask their
>>    * IMR bits until the work is done. Other interrupts can be processed without
>>    * the work queue. */
>> diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
>> index fe5c276..088e0e1 100644
>> --- a/drivers/gpu/drm/i915/i915_reg.h
>> +++ b/drivers/gpu/drm/i915/i915_reg.h
>> @@ -5453,6 +5453,10 @@ enum punit_power_well {
>>   #define GEN8_UCGCTL6				0x9430
>>   #define   GEN8_SDEUNIT_CLOCK_GATE_DISABLE	(1<<14)
>>   
>> +#define TIMESTAMP_CTR		0x44070
>> +#define FREQ_1_28_US(us)	(((us) * 100) >> 7)
>> +#define MCHBAR_PCU_C0		(MCHBAR_MIRROR_BASE_SNB + 0x5960)
>> +
>>   #define GEN6_GFXPAUSE				0xA000
>>   #define GEN6_RPNSWREQ				0xA008
>>   #define   GEN6_TURBO_DISABLE			(1<<31)
>> diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
>> index 99eb7ca..1dd8a7c 100644
>> --- a/drivers/gpu/drm/i915/intel_display.c
>> +++ b/drivers/gpu/drm/i915/intel_display.c
>> @@ -9661,6 +9661,9 @@ static int intel_crtc_page_flip(struct drm_crtc *crtc,
>>   	unsigned long flags;
>>   	int ret;
>>   
>> +	//trigger software GT busyness calculation
>> +	gen8_flip_interrupt(dev);
>> +
>>   	/*
>>   	 * drm_mode_page_flip_ioctl() should already catch this, but double
>>   	 * check to be safe.  In the future we may enable pageflipping from
>> diff --git a/drivers/gpu/drm/i915/intel_pm.c b/drivers/gpu/drm/i915/intel_pm.c
>> index 3f88f29..e13d0ff 100644
>> --- a/drivers/gpu/drm/i915/intel_pm.c
>> +++ b/drivers/gpu/drm/i915/intel_pm.c
>> @@ -2122,7 +2122,6 @@ int ilk_wm_max_level(const struct drm_device *dev)
>>   	else
>>   		return 2;
>>   }
>> -
>>   static void intel_print_wm_latency(struct drm_device *dev,
>>   				   const char *name,
>>   				   const uint16_t wm[5])
>> @@ -3091,6 +3090,9 @@ static void gen6_set_rps_thresholds(struct drm_i915_private *dev_priv, u8 val)
>>   {
>>   	int new_power;
>>   
>> +	if (dev_priv->rps.is_bdw_sw_turbo)
>> +		return;
>> +
>>   	new_power = dev_priv->rps.power;
>>   	switch (dev_priv->rps.power) {
>>   	case LOW_POWER:
>> @@ -3298,8 +3300,11 @@ void gen6_rps_idle(struct drm_i915_private *dev_priv)
>>   			valleyview_set_rps(dev_priv->dev, dev_priv->rps.min_freq_softlimit);
>>   		else if (IS_VALLEYVIEW(dev))
>>   			vlv_set_rps_idle(dev_priv);
>> -		else
>> +		else if (!dev_priv->rps.is_bdw_sw_turbo
>> +					|| atomic_read(&dev_priv->rps.sw_turbo.flip_received)){
>>   			gen6_set_rps(dev_priv->dev, dev_priv->rps.min_freq_softlimit);
>> +		}
>> +
>>   		dev_priv->rps.last_adj = 0;
>>   	}
>>   	mutex_unlock(&dev_priv->rps.hw_lock);
>> @@ -3313,8 +3318,11 @@ void gen6_rps_boost(struct drm_i915_private *dev_priv)
>>   	if (dev_priv->rps.enabled) {
>>   		if (IS_VALLEYVIEW(dev))
>>   			valleyview_set_rps(dev_priv->dev, dev_priv->rps.max_freq_softlimit);
>> -		else
>> +		else if (!dev_priv->rps.is_bdw_sw_turbo
>> +					|| atomic_read(&dev_priv->rps.sw_turbo.flip_received)){
>>   			gen6_set_rps(dev_priv->dev, dev_priv->rps.max_freq_softlimit);
>> +		}
>> +
>>   		dev_priv->rps.last_adj = 0;
>>   	}
>>   	mutex_unlock(&dev_priv->rps.hw_lock);
>> @@ -3345,21 +3353,26 @@ void valleyview_set_rps(struct drm_device *dev, u8 val)
>>   static void gen8_disable_rps_interrupts(struct drm_device *dev)
>>   {
>>   	struct drm_i915_private *dev_priv = dev->dev_private;
>> +	if (IS_BROADWELL(dev) && dev_priv->rps.is_bdw_sw_turbo){
>> +		if (atomic_read(&dev_priv->rps.sw_turbo.flip_received))
>> +			del_timer(&dev_priv->rps.sw_turbo.flip_timer);
>> +		dev_priv-> rps.is_bdw_sw_turbo = false;
>> +	} else {
>> +		I915_WRITE(GEN6_PMINTRMSK, ~GEN8_PMINTR_REDIRECT_TO_NON_DISP);
>> +		I915_WRITE(GEN8_GT_IER(2), I915_READ(GEN8_GT_IER(2)) &
>> +					   ~dev_priv->pm_rps_events);
>> +		/* Complete PM interrupt masking here doesn't race with the rps work
>> +		 * item again unmasking PM interrupts because that is using a different
>> +		 * register (GEN8_GT_IMR(2)) to mask PM interrupts. The only risk is in
>> +		 * leaving stale bits in GEN8_GT_IIR(2) and GEN8_GT_IMR(2) which
>> +		 * gen8_enable_rps will clean up. */
>>   
>> -	I915_WRITE(GEN6_PMINTRMSK, ~GEN8_PMINTR_REDIRECT_TO_NON_DISP);
>> -	I915_WRITE(GEN8_GT_IER(2), I915_READ(GEN8_GT_IER(2)) &
>> -				   ~dev_priv->pm_rps_events);
>> -	/* Complete PM interrupt masking here doesn't race with the rps work
>> -	 * item again unmasking PM interrupts because that is using a different
>> -	 * register (GEN8_GT_IMR(2)) to mask PM interrupts. The only risk is in
>> -	 * leaving stale bits in GEN8_GT_IIR(2) and GEN8_GT_IMR(2) which
>> -	 * gen8_enable_rps will clean up. */
>> -
>> -	spin_lock_irq(&dev_priv->irq_lock);
>> -	dev_priv->rps.pm_iir = 0;
>> -	spin_unlock_irq(&dev_priv->irq_lock);
>> +		spin_lock_irq(&dev_priv->irq_lock);
>> +		dev_priv->rps.pm_iir = 0;
>> +		spin_unlock_irq(&dev_priv->irq_lock);
>>   
>> -	I915_WRITE(GEN8_GT_IIR(2), dev_priv->pm_rps_events);
>> +		I915_WRITE(GEN8_GT_IIR(2), dev_priv->pm_rps_events);
>> +	}
>>   }
>>   
>>   static void gen6_disable_rps_interrupts(struct drm_device *dev)
>> @@ -3511,13 +3524,111 @@ static void parse_rp_state_cap(struct drm_i915_private *dev_priv, u32 rp_state_c
>>   		dev_priv->rps.min_freq_softlimit = dev_priv->rps.min_freq;
>>   }
>>   
>> +static void bdw_sw_calculate_freq(struct drm_device *dev,
>> +		struct intel_rps_bdw_cal *c, u32 *cur_time, u32 *c0)
>> +{
>> +	struct drm_i915_private *dev_priv = dev->dev_private;
>> +	u64 busy = 0;
>> +	u32 busyness_pct = 0;
>> +	u32 elapsed_time = 0;
>> +	u16 new_freq = 0;
>> +
>> +	if (!c || !cur_time || !c0)
>> +		return;
>> +
>> +	if (0 == c->last_c0)
>> +		goto out;
>> +
>> +	/* Check Evaluation interval */
>> +	elapsed_time = *cur_time - c->last_ts;
>> +	if (elapsed_time < c->eval_interval)
>> +		return;
>> +
>> +	mutex_lock(&dev_priv->rps.hw_lock);
>> +
>> +	/*
>> +	 * c0 unit in 32*1.28 usec, elapsed_time unit in 1 usec.
>> +	 * Whole busyness_pct calculation should be
>> +	 *     busy = ((u64)(*c0 - c->last_c0) << 5 << 7) / 100;
>> +	 *     busyness_pct = (u32)(busy * 100 / elapsed_time);
>> +	 * The final formula is to simplify CPU calculation
>> +	 */
>> +	busy = (u64)(*c0 - c->last_c0) << 12;
>> +	do_div(busy, elapsed_time);
>> +	busyness_pct = (u32)busy;
>> +
>> +	if (c->is_up && busyness_pct >= c->it_threshold_pct)
>> +		new_freq = (u16)dev_priv->rps.cur_freq + 3;
>> +	if (!c->is_up && busyness_pct <= c->it_threshold_pct)
>> +		new_freq = (u16)dev_priv->rps.cur_freq - 1;
>> +
>> +	/* Adjust to new frequency busyness and compare with threshold */
>> +	if (0 != new_freq) {
>> +		if (new_freq > dev_priv->rps.max_freq_softlimit)
>> +			new_freq = dev_priv->rps.max_freq_softlimit;
>> +		else if (new_freq < dev_priv->rps.min_freq_softlimit)
>> +			new_freq = dev_priv->rps.min_freq_softlimit;
>> +
>> +		gen6_set_rps(dev, new_freq);
>> +	}
>> +
>> +	mutex_unlock(&dev_priv->rps.hw_lock);
>> +
>> +out:
>> +	c->last_c0 = *c0;
>> +	c->last_ts = *cur_time;
>> +}
>> +
>> +static void gen8_set_frequency_RP0(struct work_struct *work)
>> +{
>> +	struct intel_rps_bdw_turbo *p_bdw_turbo =
>> +			container_of(work, struct intel_rps_bdw_turbo, work_max_freq);
>> +	struct intel_gen6_power_mgmt *p_power_mgmt =
>> +			container_of(p_bdw_turbo, struct intel_gen6_power_mgmt, sw_turbo);
>> +	struct drm_i915_private *dev_priv =
>> +			container_of(p_power_mgmt, struct drm_i915_private, rps);
>> +
>> +	mutex_lock(&dev_priv->rps.hw_lock);
>> +	gen6_set_rps(dev_priv->dev, dev_priv->rps.rp0_freq);
>> +	mutex_unlock(&dev_priv->rps.hw_lock);
>> +}
>> +
>> +static void flip_active_timeout_handler(unsigned long var)
>> +{
>> +	struct drm_i915_private *dev_priv = (struct drm_i915_private *) var;
>> +
>> +	del_timer(&dev_priv->rps.sw_turbo.flip_timer);
>> +	atomic_set(&dev_priv->rps.sw_turbo.flip_received, false);
>> +
>> +	queue_work(dev_priv->wq, &dev_priv->rps.sw_turbo.work_max_freq);
>> +}
>> +
>> +void bdw_software_turbo(struct drm_device *dev)
>> +{
>> +	struct drm_i915_private *dev_priv = dev->dev_private;
>> +
>> +	u32 current_time = I915_READ(TIMESTAMP_CTR); /* unit in usec */
>> +	u32 current_c0 = I915_READ(MCHBAR_PCU_C0); /* unit in 32*1.28 usec */
>> +
>> +	bdw_sw_calculate_freq(dev, &dev_priv->rps.sw_turbo.up,
>> +			&current_time, &current_c0);
>> +	bdw_sw_calculate_freq(dev, &dev_priv->rps.sw_turbo.down,
>> +			&current_time, &current_c0);
>> +}
>> +
>>   static void gen8_enable_rps(struct drm_device *dev)
>>   {
>>   	struct drm_i915_private *dev_priv = dev->dev_private;
>>   	struct intel_engine_cs *ring;
>>   	uint32_t rc6_mask = 0, rp_state_cap;
>> +	uint32_t threshold_up_pct, threshold_down_pct;
>> +	uint32_t ei_up, ei_down; /* up and down evaluation interval */
>> +	u32 rp_ctl_flag;
>>   	int unused;
>>   
>> +	/* Use software Turbo for BDW */
>> +	dev_priv->rps.is_bdw_sw_turbo = IS_BROADWELL(dev);
>> +
>>   	/* 1a: Software RC state - RC0 */
>>   	I915_WRITE(GEN6_RC_STATE, 0);
>>   
>> @@ -3561,35 +3672,74 @@ static void gen8_enable_rps(struct drm_device *dev)
>>   		   HSW_FREQUENCY(dev_priv->rps.rp1_freq));
>>   	I915_WRITE(GEN6_RC_VIDEO_FREQ,
>>   		   HSW_FREQUENCY(dev_priv->rps.rp1_freq));
>> -	/* NB: Docs say 1s, and 1000000 - which aren't equivalent */
>> -	I915_WRITE(GEN6_RP_DOWN_TIMEOUT, 100000000 / 128); /* 1 second timeout */
>> +	ei_up = 84480; /* 84.48ms */
>> +	ei_down = 448000;
>> +	threshold_up_pct = 90; /* x percent busy */
>> +	threshold_down_pct = 70;
>> +
>> +	if (dev_priv->rps.is_bdw_sw_turbo) {
>> +		dev_priv->rps.sw_turbo.up.it_threshold_pct = threshold_up_pct;
>> +		dev_priv->rps.sw_turbo.up.eval_interval = ei_up;
>> +		dev_priv->rps.sw_turbo.up.is_up = true;
>> +		dev_priv->rps.sw_turbo.up.last_ts = 0;
>> +		dev_priv->rps.sw_turbo.up.last_c0 = 0;
>> +
>> +		dev_priv->rps.sw_turbo.down.it_threshold_pct = threshold_down_pct;
>> +		dev_priv->rps.sw_turbo.down.eval_interval = ei_down;
>> +		dev_priv->rps.sw_turbo.down.is_up = false;
>> +		dev_priv->rps.sw_turbo.down.last_ts = 0;
>> +		dev_priv->rps.sw_turbo.down.last_c0 = 0;
>> +
>> +		/* Start the timer to track if flip comes*/
>> +		dev_priv->rps.sw_turbo.timeout = 200*1000; /* in us */
>> +
>> +		init_timer(&dev_priv->rps.sw_turbo.flip_timer);
>> +		dev_priv->rps.sw_turbo.flip_timer.function = flip_active_timeout_handler;
>> +		dev_priv->rps.sw_turbo.flip_timer.data  = (unsigned long) dev_priv;
>> +		dev_priv->rps.sw_turbo.flip_timer.expires =
>> +			usecs_to_jiffies(dev_priv->rps.sw_turbo.timeout) + jiffies;
>> +		add_timer(&dev_priv->rps.sw_turbo.flip_timer);
>> +		INIT_WORK(&dev_priv->rps.sw_turbo.work_max_freq, gen8_set_frequency_RP0);
>> +
>> +		atomic_set(&dev_priv->rps.sw_turbo.flip_received, true);
>> +	} else {
>> +		/* NB: Docs say 1s, and 1000000 - which aren't equivalent
>> +		 * 1 second timeout*/
>> +		I915_WRITE(GEN6_RP_DOWN_TIMEOUT, FREQ_1_28_US(1000000));
>>   
>> -	/* Docs recommend 900MHz, and 300 MHz respectively */
>> -	I915_WRITE(GEN6_RP_INTERRUPT_LIMITS,
>> -		   dev_priv->rps.max_freq_softlimit << 24 |
>> -		   dev_priv->rps.min_freq_softlimit << 16);
>> +		/* Docs recommend 900MHz, and 300 MHz respectively */
>> +		I915_WRITE(GEN6_RP_INTERRUPT_LIMITS,
>> +			   dev_priv->rps.max_freq_softlimit << 24 |
>> +			   dev_priv->rps.min_freq_softlimit << 16);
>>   
>> -	I915_WRITE(GEN6_RP_UP_THRESHOLD, 7600000 / 128); /* 76ms busyness per EI, 90% */
>> -	I915_WRITE(GEN6_RP_DOWN_THRESHOLD, 31300000 / 128); /* 313ms busyness per EI, 70%*/
>> -	I915_WRITE(GEN6_RP_UP_EI, 66000); /* 84.48ms, XXX: random? */
>> -	I915_WRITE(GEN6_RP_DOWN_EI, 350000); /* 448ms, XXX: random? */
>> +		I915_WRITE(GEN6_RP_UP_THRESHOLD,
>> +			FREQ_1_28_US(ei_up * threshold_up_pct / 100));
>> +		I915_WRITE(GEN6_RP_DOWN_THRESHOLD,
>> +			FREQ_1_28_US(ei_down * threshold_down_pct / 100));
>> +		I915_WRITE(GEN6_RP_UP_EI,
>> +			FREQ_1_28_US(ei_up));
>> +		I915_WRITE(GEN6_RP_DOWN_EI,
>> +			FREQ_1_28_US(ei_down));
>>   
>> -	I915_WRITE(GEN6_RP_IDLE_HYSTERSIS, 10);
>> +		I915_WRITE(GEN6_RP_IDLE_HYSTERSIS, 10);
>> +	}
>>   
>>   	/* 5: Enable RPS */
>> -	I915_WRITE(GEN6_RP_CONTROL,
>> -		   GEN6_RP_MEDIA_TURBO |
>> -		   GEN6_RP_MEDIA_HW_NORMAL_MODE |
>> -		   GEN6_RP_MEDIA_IS_GFX |
>> -		   GEN6_RP_ENABLE |
>> -		   GEN6_RP_UP_BUSY_AVG |
>> -		   GEN6_RP_DOWN_IDLE_AVG);
>> -
>> -	/* 6: Ring frequency + overclocking (our driver does this later */
>> -
>> +	rp_ctl_flag = GEN6_RP_MEDIA_TURBO |
>> +					GEN6_RP_MEDIA_HW_NORMAL_MODE |
>> +					GEN6_RP_MEDIA_IS_GFX |
>> +					GEN6_RP_UP_BUSY_AVG |
>> +					GEN6_RP_DOWN_IDLE_AVG;
>> +	if (!dev_priv->rps.is_bdw_sw_turbo)
>> +		rp_ctl_flag |= GEN6_RP_ENABLE;
>> +
>> +	I915_WRITE(GEN6_RP_CONTROL, rp_ctl_flag);
>> +
>> +	/* 6: Ring frequency + overclocking
>> +	 * (our driver does this later */
>>   	gen6_set_rps(dev, (I915_READ(GEN6_GT_PERF_STATUS) & 0xff00) >> 8);
>> -
>> -	gen8_enable_rps_interrupts(dev);
>> +	if (!dev_priv->rps.is_bdw_sw_turbo)
>> +		gen8_enable_rps_interrupts(dev);
>>   
>>   	gen6_gt_force_wake_put(dev_priv, FORCEWAKE_ALL);
>>   }
>> @@ -5018,6 +5168,8 @@ static void intel_gen6_powersave_work(struct work_struct *work)
>>   			     rps.delayed_resume_work.work);
>>   	struct drm_device *dev = dev_priv->dev;
>>   
>> +	dev_priv->rps.is_bdw_sw_turbo = false;
>> +
>>   	mutex_lock(&dev_priv->rps.hw_lock);
>>   
>>   	if (IS_CHERRYVIEW(dev)) {
>> -- 
>> 1.9.1
>>
>> _______________________________________________
>> Intel-gfx mailing list
>> Intel-gfx@lists.freedesktop.org
>> http://lists.freedesktop.org/mailman/listinfo/intel-gfx
Jesse Barnes Aug. 14, 2014, 7:37 p.m. UTC | #3
On Mon, 11 Aug 2014 23:33:57 +0200
Daniel Vetter <daniel@ffwll.ch> wrote:

> On Mon, Aug 11, 2014 at 11:08:38AM -0700, Daisy Sun wrote:
> > BDW supports GT C0 residency reporting in constant time unit. Driver
> > calculates GT utilization based on C0 residency and adjusts RP
> > frequency up/down accordingly. For offscreen workload specifically,
> > set frequency to RP0.
> > 
> > Offscreen task is not restricted by frame rate, it can be
> > executed as soon as possible. Transcoding and serialized workload
> > between CPU and GPU both need high GT performance, RP0 is a good
> > option in this case. RC6 will kick in to compensate power
> > consumption when GT is not active.
> > 
> > v2: Rebase on recent drm-intel-nightly
> > v3: Add flip timeout monitor, when no flip is detected within
> > 100ms, set frequency to RP0.
> 
> Ok, let's make this really clear:
> 
> If you wire this into the flip handling in any way, I will not merge your
> patch. The timer should be fully independant and tie into the gpu
> busy/idle handling we already have.

Sounds like Daisy won't be able to spend any more time on this either.

So we're left with this patch, which does improve things for most
cases, or no patch, which leaves things universally bad.

Unless someone wants to pick up the additional work and testing of
using a timer scheme, making sure we don't have needless wakeups, and
generally improve power/perf across even more cases than this patch.
Daniel Vetter Aug. 27, 2014, 6:57 p.m. UTC | #4
On Thu, Aug 14, 2014 at 12:37:53PM -0700, Jesse Barnes wrote:
> On Mon, 11 Aug 2014 23:33:57 +0200
> Daniel Vetter <daniel@ffwll.ch> wrote:
> 
> > On Mon, Aug 11, 2014 at 11:08:38AM -0700, Daisy Sun wrote:
> > > BDW supports GT C0 residency reporting in constant time unit. Driver
> > > calculates GT utilization based on C0 residency and adjusts RP
> > > frequency up/down accordingly. For offscreen workload specifically,
> > > set frequency to RP0.
> > > 
> > > Offscreen task is not restricted by frame rate, it can be
> > > executed as soon as possible. Transcoding and serialized workload
> > > between CPU and GPU both need high GT performance, RP0 is a good
> > > option in this case. RC6 will kick in to compensate power
> > > consumption when GT is not active.
> > > 
> > > v2: Rebase on recent drm-intel-nightly
> > > v3: Add flip timeout monitor, when no flip is detected within
> > > 100ms, set frequency to RP0.
> > 
> > Ok, let's make this really clear:
> > 
> > If you wire this into the flip handling in any way, I will not merge your
> > patch. The timer should be fully independant and tie into the gpu
> > busy/idle handling we already have.
> 
> Sounds like Daisy won't be able to spend any more time on this either.
> 
> So we're left with this patch, which does improve things for most
> cases, or no patch, which leaves things universally bad.
> 
> Unless someone wants to pick up the additional work and testing of
> using a timer scheme, making sure we don't have needless wakeups, and
> generally improve power/perf across even more cases than this patch.

I'm taking this as an ack from you and pulled the patch into dinq.

Thanks, Daniel
Chris Wilson Aug. 28, 2014, 9:10 a.m. UTC | #5
On Wed, Aug 27, 2014 at 08:57:56PM +0200, Daniel Vetter wrote:
> On Thu, Aug 14, 2014 at 12:37:53PM -0700, Jesse Barnes wrote:
> > On Mon, 11 Aug 2014 23:33:57 +0200
> > Daniel Vetter <daniel@ffwll.ch> wrote:
> > 
> > > On Mon, Aug 11, 2014 at 11:08:38AM -0700, Daisy Sun wrote:
> > > > BDW supports GT C0 residency reporting in constant time unit. Driver
> > > > calculates GT utilization based on C0 residency and adjusts RP
> > > > frequency up/down accordingly. For offscreen workload specifically,
> > > > set frequency to RP0.
> > > > 
> > > > Offscreen task is not restricted by frame rate, it can be
> > > > executed as soon as possible. Transcoding and serialized workload
> > > > between CPU and GPU both need high GT performance, RP0 is a good
> > > > option in this case. RC6 will kick in to compensate power
> > > > consumption when GT is not active.
> > > > 
> > > > v2: Rebase on recent drm-intel-nightly
> > > > v3: Add flip timeout monitor, when no flip is detected within
> > > > 100ms, set frequency to RP0.
> > > 
> > > Ok, let's make this really clear:
> > > 
> > > If you wire this into the flip handling in any way, I will not merge your
> > > patch. The timer should be fully independant and tie into the gpu
> > > busy/idle handling we already have.
> > 
> > Sounds like Daisy won't be able to spend any more time on this either.
> > 
> > So we're left with this patch, which does improve things for most
> > cases, or no patch, which leaves things universally bad.
> > 
> > Unless someone wants to pick up the additional work and testing of
> > using a timer scheme, making sure we don't have needless wakeups, and
> > generally improve power/perf across even more cases than this patch.
> 
> I'm taking this as an ack from you and pulled the patch into dinq.

Maybe also a nak from me for the bad design and poor integration with
the existing RPS infrastructure?
-Chris
Paulo Zanoni Sept. 4, 2014, 8:59 p.m. UTC | #6
2014-08-28 6:10 GMT-03:00 Chris Wilson <chris@chris-wilson.co.uk>:
> On Wed, Aug 27, 2014 at 08:57:56PM +0200, Daniel Vetter wrote:
>> On Thu, Aug 14, 2014 at 12:37:53PM -0700, Jesse Barnes wrote:
>> > On Mon, 11 Aug 2014 23:33:57 +0200
>> > Daniel Vetter <daniel@ffwll.ch> wrote:
>> >
>> > > On Mon, Aug 11, 2014 at 11:08:38AM -0700, Daisy Sun wrote:
>> > > > BDW supports GT C0 residency reporting in constant time unit. Driver
>> > > > calculates GT utilization based on C0 residency and adjusts RP
>> > > > frequency up/down accordingly. For offscreen workload specifically,
>> > > > set frequency to RP0.
>> > > >
>> > > > Offscreen task is not restricted by frame rate, it can be
>> > > > executed as soon as possible. Transcoding and serialized workload
>> > > > between CPU and GPU both need high GT performance, RP0 is a good
>> > > > option in this case. RC6 will kick in to compensate power
>> > > > consumption when GT is not active.
>> > > >
>> > > > v2: Rebase on recent drm-intel-nightly
>> > > > v3: Add flip timeout monitor, when no flip is detected within
>> > > > 100ms, set frequency to RP0.
>> > >
>> > > Ok, let's make this really clear:
>> > >
>> > > If you wire this into the flip handling in any way, I will not merge your
>> > > patch. The timer should be fully independant and tie into the gpu
>> > > busy/idle handling we already have.
>> >
>> > Sounds like Daisy won't be able to spend any more time on this either.
>> >
>> > So we're left with this patch, which does improve things for most
>> > cases, or no patch, which leaves things universally bad.
>> >
>> > Unless someone wants to pick up the additional work and testing of
>> > using a timer scheme, making sure we don't have needless wakeups, and
>> > generally improve power/perf across even more cases than this patch.
>>
>> I'm taking this as an ack from you and pulled the patch into dinq.
>
> Maybe also a nak from me for the bad design and poor integration with
> the existing RPS infrastructue?

And I just concluded this is the first bad commit for
igt/pm_rpm/gem-execbuf. You have to run the test a few times (less
than 10 is enough), then check for a highly polluted dmesg. A simple
"git revert" is still possible on -nightly, and apparently fixes the
issue for me.

I am also right now trying to write a patch that fixes the bug, so I
guess we can avoid the revert if we want.

[   68.892465] ------------[ cut here ]------------
[   68.892547] WARNING: CPU: 1 PID: 142 at
drivers/gpu/drm/i915/intel_uncore.c:47
assert_device_not_suspended.isra.8+0x43/0x50 [i915]()
[   68.892552] Device suspended
[   68.892556] Modules linked in: snd_hda_codec_hdmi intel_rapl
x86_pkg_temp_thermal intel_powerclamp serio_raw efivars btusb iwlmvm
iwlwifi mei_me snd_hda_intel snd_hda_controller mei snd_hda_codec
snd_hwdep snd_pcm_oss snd_mixer_oss snd_pcm snd_timer int3403_thermal
i2c_designware_platform i2c_designware_core acpi_pad fuse nls_utf8
nls_cp437 vfat fat sd_mod ahci libahci i915 drm_kms_helper sdhci_pci
drm e1000e sdhci_acpi sdhci
[   68.892646] CPU: 1 PID: 142 Comm: kworker/u16:2 Tainted: G        W
     3.17.0-rc2.1409041708pz+ #1073
[   68.892652] Hardware name: Intel Corporation Broadwell Client
platform/Wilson Beach SDS, BIOS BDW-E2R1.86C.0072.R03.1405072127
05/07/2014
[   68.892683] Workqueue: i915 gen8_set_frequency_RP0 [i915]
[   68.892690]  0000000000000009 ffff880240557c50 ffffffff816f6d23
ffff880240557c98
[   68.892702]  ffff880240557c88 ffffffff8107b368 ffff8800377e0000
000000000000a008
[   68.892713]  000000000000a008 ffff8800377e0068 ffff8802406b5000
ffff880240557ce8
[   68.892724] Call Trace:
[   68.892743]  [<ffffffff816f6d23>] dump_stack+0x4d/0x66
[   68.892755]  [<ffffffff8107b368>] warn_slowpath_common+0x78/0xa0
[   68.892764]  [<ffffffff8107b3d7>] warn_slowpath_fmt+0x47/0x50
[   68.892816]  [<ffffffffa0129aa3>]
assert_device_not_suspended.isra.8+0x43/0x50 [i915]
[   68.892863]  [<ffffffffa012da15>] gen8_write32+0x35/0x180 [i915]
[   68.892894]  [<ffffffffa00eeb39>] gen6_set_rps+0x219/0x430 [i915]
[   68.892924]  [<ffffffffa00eee7b>] gen8_set_frequency_RP0+0x2b/0x40 [i915]
[   68.892935]  [<ffffffff81094a4a>] process_one_work+0x1da/0x510
[   68.892944]  [<ffffffff810949ea>] ? process_one_work+0x17a/0x510
[   68.892957]  [<ffffffff8109504b>] worker_thread+0x6b/0x4a0
[   68.892967]  [<ffffffff81094fe0>] ? rescuer_thread+0x260/0x260
[   68.892978]  [<ffffffff81099d30>] kthread+0x100/0x120
[   68.892991]  [<ffffffff81099c30>] ? kthread_create_on_node+0x230/0x230
[   68.893002]  [<ffffffff81700fac>] ret_from_fork+0x7c/0xb0
[   68.893013]  [<ffffffff81099c30>] ? kthread_create_on_node+0x230/0x230
[   68.893019] ---[ end trace 999107c7a7ea5be9 ]---
[   68.893027] ------------[ cut here ]------------

> -Chris
>
> --
> Chris Wilson, Intel Open Source Technology Centre
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index ef38c3b..f1c4c5b 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -915,6 +915,23 @@  struct intel_rps_ei {
 	u32 media_c0;
 };
 
+struct intel_rps_bdw_cal {
+	u32 it_threshold_pct; /* interrupt, in percentage */
+	u32 eval_interval; /* evaluation interval, in us */
+	u32 last_ts;
+	u32 last_c0;
+	bool is_up;
+};
+
+struct intel_rps_bdw_turbo {
+	struct intel_rps_bdw_cal up;
+	struct intel_rps_bdw_cal down;
+	struct timer_list flip_timer;
+	u32 timeout;
+	atomic_t flip_received;
+	struct work_struct work_max_freq;
+};
+
 struct intel_gen6_power_mgmt {
 	/* work and pm_iir are protected by dev_priv->irq_lock */
 	struct work_struct work;
@@ -948,6 +965,9 @@  struct intel_gen6_power_mgmt {
 	bool enabled;
 	struct delayed_work delayed_resume_work;
 
+	bool is_bdw_sw_turbo;	/* Switch of BDW software turbo */
+	struct intel_rps_bdw_turbo sw_turbo; /* Calculate RP interrupt timing */
+
 	/* manual wa residency calculations */
 	struct intel_rps_ei up_ei, down_ei;
 
@@ -2703,6 +2723,8 @@  extern void intel_disable_fbc(struct drm_device *dev);
 extern bool ironlake_set_drps(struct drm_device *dev, u8 val);
 extern void intel_init_pch_refclk(struct drm_device *dev);
 extern void gen6_set_rps(struct drm_device *dev, u8 val);
+extern void bdw_software_turbo(struct drm_device *dev);
+extern void gen8_flip_interrupt(struct drm_device *dev);
 extern void valleyview_set_rps(struct drm_device *dev, u8 val);
 extern void intel_set_memory_cxsr(struct drm_i915_private *dev_priv,
 				  bool enable);
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 6ef9d6f..367f8e1 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -1961,6 +1961,27 @@  static void i9xx_pipe_crc_irq_handler(struct drm_device *dev, enum pipe pipe)
 				     res1, res2);
 }
 
+void gen8_flip_interrupt(struct drm_device *dev)
+{
+	struct drm_i915_private *dev_priv = dev->dev_private;
+
+	if (!dev_priv->rps.is_bdw_sw_turbo)
+		return;
+
+	if(atomic_read(&dev_priv->rps.sw_turbo.flip_received)) {
+		mod_timer(&dev_priv->rps.sw_turbo.flip_timer,
+				usecs_to_jiffies(dev_priv->rps.sw_turbo.timeout) + jiffies);
+	}
+	else {
+		dev_priv->rps.sw_turbo.flip_timer.expires =
+				usecs_to_jiffies(dev_priv->rps.sw_turbo.timeout) + jiffies;
+		add_timer(&dev_priv->rps.sw_turbo.flip_timer);
+		atomic_set(&dev_priv->rps.sw_turbo.flip_received, true);
+	}
+
+	bdw_software_turbo(dev);
+}
+
 /* The RPS events need forcewake, so we add them to a work queue and mask their
  * IMR bits until the work is done. Other interrupts can be processed without
  * the work queue. */
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index fe5c276..088e0e1 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -5453,6 +5453,10 @@  enum punit_power_well {
 #define GEN8_UCGCTL6				0x9430
 #define   GEN8_SDEUNIT_CLOCK_GATE_DISABLE	(1<<14)
 
+#define TIMESTAMP_CTR		0x44070
+#define FREQ_1_28_US(us)	(((us) * 100) >> 7)
+#define MCHBAR_PCU_C0		(MCHBAR_MIRROR_BASE_SNB + 0x5960)
+
 #define GEN6_GFXPAUSE				0xA000
 #define GEN6_RPNSWREQ				0xA008
 #define   GEN6_TURBO_DISABLE			(1<<31)
diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
index 99eb7ca..1dd8a7c 100644
--- a/drivers/gpu/drm/i915/intel_display.c
+++ b/drivers/gpu/drm/i915/intel_display.c
@@ -9661,6 +9661,9 @@  static int intel_crtc_page_flip(struct drm_crtc *crtc,
 	unsigned long flags;
 	int ret;
 
+	//trigger software GT busyness calculation
+	gen8_flip_interrupt(dev);
+
 	/*
 	 * drm_mode_page_flip_ioctl() should already catch this, but double
 	 * check to be safe.  In the future we may enable pageflipping from
diff --git a/drivers/gpu/drm/i915/intel_pm.c b/drivers/gpu/drm/i915/intel_pm.c
index 3f88f29..e13d0ff 100644
--- a/drivers/gpu/drm/i915/intel_pm.c
+++ b/drivers/gpu/drm/i915/intel_pm.c
@@ -2122,7 +2122,6 @@  int ilk_wm_max_level(const struct drm_device *dev)
 	else
 		return 2;
 }
-
 static void intel_print_wm_latency(struct drm_device *dev,
 				   const char *name,
 				   const uint16_t wm[5])
@@ -3091,6 +3090,9 @@  static void gen6_set_rps_thresholds(struct drm_i915_private *dev_priv, u8 val)
 {
 	int new_power;
 
+	if (dev_priv->rps.is_bdw_sw_turbo)
+		return;
+
 	new_power = dev_priv->rps.power;
 	switch (dev_priv->rps.power) {
 	case LOW_POWER:
@@ -3298,8 +3300,11 @@  void gen6_rps_idle(struct drm_i915_private *dev_priv)
 			valleyview_set_rps(dev_priv->dev, dev_priv->rps.min_freq_softlimit);
 		else if (IS_VALLEYVIEW(dev))
 			vlv_set_rps_idle(dev_priv);
-		else
+		else if (!dev_priv->rps.is_bdw_sw_turbo
+					|| atomic_read(&dev_priv->rps.sw_turbo.flip_received)){
 			gen6_set_rps(dev_priv->dev, dev_priv->rps.min_freq_softlimit);
+		}
+
 		dev_priv->rps.last_adj = 0;
 	}
 	mutex_unlock(&dev_priv->rps.hw_lock);
@@ -3313,8 +3318,11 @@  void gen6_rps_boost(struct drm_i915_private *dev_priv)
 	if (dev_priv->rps.enabled) {
 		if (IS_VALLEYVIEW(dev))
 			valleyview_set_rps(dev_priv->dev, dev_priv->rps.max_freq_softlimit);
-		else
+		else if (!dev_priv->rps.is_bdw_sw_turbo
+					|| atomic_read(&dev_priv->rps.sw_turbo.flip_received)){
 			gen6_set_rps(dev_priv->dev, dev_priv->rps.max_freq_softlimit);
+		}
+
 		dev_priv->rps.last_adj = 0;
 	}
 	mutex_unlock(&dev_priv->rps.hw_lock);
@@ -3345,21 +3353,26 @@  void valleyview_set_rps(struct drm_device *dev, u8 val)
 static void gen8_disable_rps_interrupts(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
+	if (IS_BROADWELL(dev) && dev_priv->rps.is_bdw_sw_turbo){
+		if (atomic_read(&dev_priv->rps.sw_turbo.flip_received))
+			del_timer(&dev_priv->rps.sw_turbo.flip_timer);
+		dev_priv-> rps.is_bdw_sw_turbo = false;
+	} else {
+		I915_WRITE(GEN6_PMINTRMSK, ~GEN8_PMINTR_REDIRECT_TO_NON_DISP);
+		I915_WRITE(GEN8_GT_IER(2), I915_READ(GEN8_GT_IER(2)) &
+					   ~dev_priv->pm_rps_events);
+		/* Complete PM interrupt masking here doesn't race with the rps work
+		 * item again unmasking PM interrupts because that is using a different
+		 * register (GEN8_GT_IMR(2)) to mask PM interrupts. The only risk is in
+		 * leaving stale bits in GEN8_GT_IIR(2) and GEN8_GT_IMR(2) which
+		 * gen8_enable_rps will clean up. */
 
-	I915_WRITE(GEN6_PMINTRMSK, ~GEN8_PMINTR_REDIRECT_TO_NON_DISP);
-	I915_WRITE(GEN8_GT_IER(2), I915_READ(GEN8_GT_IER(2)) &
-				   ~dev_priv->pm_rps_events);
-	/* Complete PM interrupt masking here doesn't race with the rps work
-	 * item again unmasking PM interrupts because that is using a different
-	 * register (GEN8_GT_IMR(2)) to mask PM interrupts. The only risk is in
-	 * leaving stale bits in GEN8_GT_IIR(2) and GEN8_GT_IMR(2) which
-	 * gen8_enable_rps will clean up. */
-
-	spin_lock_irq(&dev_priv->irq_lock);
-	dev_priv->rps.pm_iir = 0;
-	spin_unlock_irq(&dev_priv->irq_lock);
+		spin_lock_irq(&dev_priv->irq_lock);
+		dev_priv->rps.pm_iir = 0;
+		spin_unlock_irq(&dev_priv->irq_lock);
 
-	I915_WRITE(GEN8_GT_IIR(2), dev_priv->pm_rps_events);
+		I915_WRITE(GEN8_GT_IIR(2), dev_priv->pm_rps_events);
+	}
 }
 
 static void gen6_disable_rps_interrupts(struct drm_device *dev)
@@ -3511,13 +3524,111 @@  static void parse_rp_state_cap(struct drm_i915_private *dev_priv, u32 rp_state_c
 		dev_priv->rps.min_freq_softlimit = dev_priv->rps.min_freq;
 }
 
+static void bdw_sw_calculate_freq(struct drm_device *dev,
+		struct intel_rps_bdw_cal *c, u32 *cur_time, u32 *c0)
+{
+	struct drm_i915_private *dev_priv = dev->dev_private;
+	u64 busy = 0;
+	u32 busyness_pct = 0;
+	u32 elapsed_time = 0;
+	u16 new_freq = 0;
+
+	if (!c || !cur_time || !c0)
+		return;
+
+	if (0 == c->last_c0)
+		goto out;
+
+	/* Check Evaluation interval */
+	elapsed_time = *cur_time - c->last_ts;
+	if (elapsed_time < c->eval_interval)
+		return;
+
+	mutex_lock(&dev_priv->rps.hw_lock);
+
+	/*
+	 * c0 unit in 32*1.28 usec, elapsed_time unit in 1 usec.
+	 * Whole busyness_pct calculation should be
+	 *     busy = ((u64)(*c0 - c->last_c0) << 5 << 7) / 100;
+	 *     busyness_pct = (u32)(busy * 100 / elapsed_time);
+	 * The final formula is to simplify CPU calculation
+	 */
+	busy = (u64)(*c0 - c->last_c0) << 12;
+	do_div(busy, elapsed_time);
+	busyness_pct = (u32)busy;
+
+	if (c->is_up && busyness_pct >= c->it_threshold_pct)
+		new_freq = (u16)dev_priv->rps.cur_freq + 3;
+	if (!c->is_up && busyness_pct <= c->it_threshold_pct)
+		new_freq = (u16)dev_priv->rps.cur_freq - 1;
+
+	/* Adjust to new frequency busyness and compare with threshold */
+	if (0 != new_freq) {
+		if (new_freq > dev_priv->rps.max_freq_softlimit)
+			new_freq = dev_priv->rps.max_freq_softlimit;
+		else if (new_freq < dev_priv->rps.min_freq_softlimit)
+			new_freq = dev_priv->rps.min_freq_softlimit;
+
+		gen6_set_rps(dev, new_freq);
+	}
+
+	mutex_unlock(&dev_priv->rps.hw_lock);
+
+out:
+	c->last_c0 = *c0;
+	c->last_ts = *cur_time;
+}
+
+static void gen8_set_frequency_RP0(struct work_struct *work)
+{
+	struct intel_rps_bdw_turbo *p_bdw_turbo =
+			container_of(work, struct intel_rps_bdw_turbo, work_max_freq);
+	struct intel_gen6_power_mgmt *p_power_mgmt =
+			container_of(p_bdw_turbo, struct intel_gen6_power_mgmt, sw_turbo);
+	struct drm_i915_private *dev_priv =
+			container_of(p_power_mgmt, struct drm_i915_private, rps);
+
+	mutex_lock(&dev_priv->rps.hw_lock);
+	gen6_set_rps(dev_priv->dev, dev_priv->rps.rp0_freq);
+	mutex_unlock(&dev_priv->rps.hw_lock);
+}
+
+static void flip_active_timeout_handler(unsigned long var)
+{
+	struct drm_i915_private *dev_priv = (struct drm_i915_private *) var;
+
+	del_timer(&dev_priv->rps.sw_turbo.flip_timer);
+	atomic_set(&dev_priv->rps.sw_turbo.flip_received, false);
+
+	queue_work(dev_priv->wq, &dev_priv->rps.sw_turbo.work_max_freq);
+}
+
+void bdw_software_turbo(struct drm_device *dev)
+{
+	struct drm_i915_private *dev_priv = dev->dev_private;
+
+	u32 current_time = I915_READ(TIMESTAMP_CTR); /* unit in usec */
+	u32 current_c0 = I915_READ(MCHBAR_PCU_C0); /* unit in 32*1.28 usec */
+
+	bdw_sw_calculate_freq(dev, &dev_priv->rps.sw_turbo.up,
+			&current_time, &current_c0);
+	bdw_sw_calculate_freq(dev, &dev_priv->rps.sw_turbo.down,
+			&current_time, &current_c0);
+}
+
 static void gen8_enable_rps(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	struct intel_engine_cs *ring;
 	uint32_t rc6_mask = 0, rp_state_cap;
+	uint32_t threshold_up_pct, threshold_down_pct;
+	uint32_t ei_up, ei_down; /* up and down evaluation interval */
+	u32 rp_ctl_flag;
 	int unused;
 
+	/* Use software Turbo for BDW */
+	dev_priv->rps.is_bdw_sw_turbo = IS_BROADWELL(dev);
+
 	/* 1a: Software RC state - RC0 */
 	I915_WRITE(GEN6_RC_STATE, 0);
 
@@ -3561,35 +3672,74 @@  static void gen8_enable_rps(struct drm_device *dev)
 		   HSW_FREQUENCY(dev_priv->rps.rp1_freq));
 	I915_WRITE(GEN6_RC_VIDEO_FREQ,
 		   HSW_FREQUENCY(dev_priv->rps.rp1_freq));
-	/* NB: Docs say 1s, and 1000000 - which aren't equivalent */
-	I915_WRITE(GEN6_RP_DOWN_TIMEOUT, 100000000 / 128); /* 1 second timeout */
+	ei_up = 84480; /* 84.48ms */
+	ei_down = 448000;
+	threshold_up_pct = 90; /* x percent busy */
+	threshold_down_pct = 70;
+
+	if (dev_priv->rps.is_bdw_sw_turbo) {
+		dev_priv->rps.sw_turbo.up.it_threshold_pct = threshold_up_pct;
+		dev_priv->rps.sw_turbo.up.eval_interval = ei_up;
+		dev_priv->rps.sw_turbo.up.is_up = true;
+		dev_priv->rps.sw_turbo.up.last_ts = 0;
+		dev_priv->rps.sw_turbo.up.last_c0 = 0;
+
+		dev_priv->rps.sw_turbo.down.it_threshold_pct = threshold_down_pct;
+		dev_priv->rps.sw_turbo.down.eval_interval = ei_down;
+		dev_priv->rps.sw_turbo.down.is_up = false;
+		dev_priv->rps.sw_turbo.down.last_ts = 0;
+		dev_priv->rps.sw_turbo.down.last_c0 = 0;
+
+		/* Start the timer to track if flip comes*/
+		dev_priv->rps.sw_turbo.timeout = 200*1000; /* in us */
+
+		init_timer(&dev_priv->rps.sw_turbo.flip_timer);
+		dev_priv->rps.sw_turbo.flip_timer.function = flip_active_timeout_handler;
+		dev_priv->rps.sw_turbo.flip_timer.data  = (unsigned long) dev_priv;
+		dev_priv->rps.sw_turbo.flip_timer.expires =
+			usecs_to_jiffies(dev_priv->rps.sw_turbo.timeout) + jiffies;
+		add_timer(&dev_priv->rps.sw_turbo.flip_timer);
+		INIT_WORK(&dev_priv->rps.sw_turbo.work_max_freq, gen8_set_frequency_RP0);
+
+		atomic_set(&dev_priv->rps.sw_turbo.flip_received, true);
+	} else {
+		/* NB: Docs say 1s, and 1000000 - which aren't equivalent
+		 * 1 second timeout*/
+		I915_WRITE(GEN6_RP_DOWN_TIMEOUT, FREQ_1_28_US(1000000));
 
-	/* Docs recommend 900MHz, and 300 MHz respectively */
-	I915_WRITE(GEN6_RP_INTERRUPT_LIMITS,
-		   dev_priv->rps.max_freq_softlimit << 24 |
-		   dev_priv->rps.min_freq_softlimit << 16);
+		/* Docs recommend 900MHz, and 300 MHz respectively */
+		I915_WRITE(GEN6_RP_INTERRUPT_LIMITS,
+			   dev_priv->rps.max_freq_softlimit << 24 |
+			   dev_priv->rps.min_freq_softlimit << 16);
 
-	I915_WRITE(GEN6_RP_UP_THRESHOLD, 7600000 / 128); /* 76ms busyness per EI, 90% */
-	I915_WRITE(GEN6_RP_DOWN_THRESHOLD, 31300000 / 128); /* 313ms busyness per EI, 70%*/
-	I915_WRITE(GEN6_RP_UP_EI, 66000); /* 84.48ms, XXX: random? */
-	I915_WRITE(GEN6_RP_DOWN_EI, 350000); /* 448ms, XXX: random? */
+		I915_WRITE(GEN6_RP_UP_THRESHOLD,
+			FREQ_1_28_US(ei_up * threshold_up_pct / 100));
+		I915_WRITE(GEN6_RP_DOWN_THRESHOLD,
+			FREQ_1_28_US(ei_down * threshold_down_pct / 100));
+		I915_WRITE(GEN6_RP_UP_EI,
+			FREQ_1_28_US(ei_up));
+		I915_WRITE(GEN6_RP_DOWN_EI,
+			FREQ_1_28_US(ei_down));
 
-	I915_WRITE(GEN6_RP_IDLE_HYSTERSIS, 10);
+		I915_WRITE(GEN6_RP_IDLE_HYSTERSIS, 10);
+	}
 
 	/* 5: Enable RPS */
-	I915_WRITE(GEN6_RP_CONTROL,
-		   GEN6_RP_MEDIA_TURBO |
-		   GEN6_RP_MEDIA_HW_NORMAL_MODE |
-		   GEN6_RP_MEDIA_IS_GFX |
-		   GEN6_RP_ENABLE |
-		   GEN6_RP_UP_BUSY_AVG |
-		   GEN6_RP_DOWN_IDLE_AVG);
-
-	/* 6: Ring frequency + overclocking (our driver does this later */
-
+	rp_ctl_flag = GEN6_RP_MEDIA_TURBO |
+					GEN6_RP_MEDIA_HW_NORMAL_MODE |
+					GEN6_RP_MEDIA_IS_GFX |
+					GEN6_RP_UP_BUSY_AVG |
+					GEN6_RP_DOWN_IDLE_AVG;
+	if (!dev_priv->rps.is_bdw_sw_turbo)
+		rp_ctl_flag |= GEN6_RP_ENABLE;
+
+	I915_WRITE(GEN6_RP_CONTROL, rp_ctl_flag);
+
+	/* 6: Ring frequency + overclocking
+	 * (our driver does this later */
 	gen6_set_rps(dev, (I915_READ(GEN6_GT_PERF_STATUS) & 0xff00) >> 8);
-
-	gen8_enable_rps_interrupts(dev);
+	if (!dev_priv->rps.is_bdw_sw_turbo)
+		gen8_enable_rps_interrupts(dev);
 
 	gen6_gt_force_wake_put(dev_priv, FORCEWAKE_ALL);
 }
@@ -5018,6 +5168,8 @@  static void intel_gen6_powersave_work(struct work_struct *work)
 			     rps.delayed_resume_work.work);
 	struct drm_device *dev = dev_priv->dev;
 
+	dev_priv->rps.is_bdw_sw_turbo = false;
+
 	mutex_lock(&dev_priv->rps.hw_lock);
 
 	if (IS_CHERRYVIEW(dev)) {