diff mbox

drm/i915: HSW GT3 Slices: exec flag to warn kernel that userspace is using predication

Message ID 1383260829-2819-1-git-send-email-rodrigo.vivi@gmail.com (mailing list archive)
State New, archived
Headers show

Commit Message

Rodrigo Vivi Oct. 31, 2013, 11:07 p.m. UTC
If Userspace isn't using MI_PREDICATE all slices must be enabled for
backward compatibility.

If I915_EXEC_USE_PREDICATE isn't set and the default is set to half, the kernel will force
all slices on.

v2: fix the inverted logic for backwards compatibility
    USE_PREDICATE unset forces gt_full when default is half
    instead of GT_FULL flag.

v3: Accepting Chris's suggestions: better variable names;
    better logic around state_default x legacy_userspace_busy;
    remove unnecessary mutex;

v4: Accepting more suggestions from Chris:
    * Send all LRIs in only one block and don't ignore if it fails.
    * function name and cleaner code on forcing_full.

v5: fix mutex_lock use by Chris.

CC: Chris Wilson <chris@chris-wilson.co.uk>
CC: Eric Anholt <eric@anholt.net>
CC: Kenneth Graunke <kenneth@whitecape.org>
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@gmail.com>
---
 drivers/gpu/drm/i915/i915_drv.h            |  8 ++++
 drivers/gpu/drm/i915/i915_gem_execbuffer.c | 64 ++++++++++++++++++++++++++++++
 drivers/gpu/drm/i915/i915_reg.h            | 11 +++++
 drivers/gpu/drm/i915/i915_sysfs.c          |  7 ++++
 drivers/gpu/drm/i915/intel_display.c       | 17 ++++++++
 drivers/gpu/drm/i915/intel_drv.h           |  1 +
 drivers/gpu/drm/i915/intel_pm.c            | 41 ++++++++++++++++++-
 include/uapi/drm/i915_drm.h                |  8 +++-
 8 files changed, 154 insertions(+), 3 deletions(-)

Comments

Chris Wilson Nov. 1, 2013, 10:39 a.m. UTC | #1
On Thu, Oct 31, 2013 at 09:07:09PM -0200, Rodrigo Vivi wrote:
> If Userspace isn't using MI_PREDICATE all slices must be enabled for
> backward compatibility.
> 
> If I915_EXEC_USE_PREDICATE isn't set and defaul is set to half, kernel will force
> all slices on.
> 
> v2: fix the inverted logic for backwards compatibility
>     USE_PREDICATE unset force gt_full when defaul is half
>     instead of GT_FULL flag.
> 
> v3: Accepting Chris's suggestions: better variable names;
>     better logic around state_default x legacy_userspace_busy;
>     remove unecessary mutex;
> 
> v4: Accepting more suggestions from Chris:
>     * Send all LRIs in only one block and don't ignore if it fails.
>     * function name and cleaner code on forcing_full.
> 
> v5: fix mutex_lock use by Chris.
> 
> CC: Chris Wilson <chris@chris-wilson.co.uk>
> CC: Eric Anholt <eric@anholt.net>
> CC: Kenneth Graunke <kenneth@whitecape.org>
> Signed-off-by: Rodrigo Vivi <rodrigo.vivi@gmail.com>

Locking needs major work still.

> @@ -935,6 +985,7 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
>  	struct drm_clip_rect *cliprects = NULL;
>  	struct intel_ring_buffer *ring;
>  	struct i915_ctx_hang_stats *hs;
> +	struct i915_gt_slices *gt_slices = &dev_priv->gt_slices;
>  	u32 ctx_id = i915_execbuffer2_get_context_id(*args);
>  	u32 exec_start, exec_len;
>  	u32 mask, flags;
> @@ -999,6 +1050,19 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
>  		return -EINVAL;
>  	}
>  
> +	if (gt_legacy_userspace(ring, args)) {
> +		mutex_lock(&gt_slices->lock);
> +		if (gt_slices->state_default == 0 &&
> +		    !gt_slices->legacy_userspace_busy) {

You need to set legacy_userspace_busy if gt_slices->state_default is
already 0. Why 0? Why not 1? 1 - for one slice, 2 - for two slices, etc.

> +			ret = gt_legacy_userspace_busy(ring);
> +			if (ret == 0)
> +				gt_slices->legacy_userspace_busy = true;
> +		}
> +		mutex_unlock(&gt_slices->lock);
> +		if (ret)
> +			return ret;
> +	}
> +
>  	mode = args->flags & I915_EXEC_CONSTANTS_MASK;
>  	mask = I915_EXEC_CONSTANTS_MASK;
>  	switch (mode) {

> diff --git a/drivers/gpu/drm/i915/i915_sysfs.c b/drivers/gpu/drm/i915/i915_sysfs.c
> index 86ccd52..a821499 100644
> --- a/drivers/gpu/drm/i915/i915_sysfs.c
> +++ b/drivers/gpu/drm/i915/i915_sysfs.c
> @@ -135,16 +135,23 @@ static ssize_t gt_slice_config_store(struct device *kdev,
>  {
>  	struct drm_minor *minor = container_of(kdev, struct drm_minor, kdev);
>  	struct drm_device *dev = minor->dev;
> +	struct drm_i915_private *dev_priv = dev->dev_private;
>  	int ret;
>  
>  	if (!strncmp(buf, "full", sizeof("full") - 1)) {
>  		ret = intel_set_gt_full(dev);
>  		if (ret)
>  			return ret;
> +		mutex_lock(&dev_priv->gt_slices.lock);
> +		dev_priv->gt_slices.state_default = 1;
> +		mutex_unlock(&dev_priv->gt_slices.lock);
>  	} else if (!strncmp(buf, "half", sizeof("half") - 1)) {
>  		ret = intel_set_gt_half(dev);
>  		if (ret)
>  			return ret;
> +		mutex_lock(&dev_priv->gt_slices.lock);
> +		dev_priv->gt_slices.state_default = 0;
> +		mutex_unlock(&dev_priv->gt_slices.lock);
>  	} else
>  		return -EINVAL;

This is the clearest example that the locking is fubar. Consider a
second process that simultaneously tries to change slice config. What
state is recorded? What state is the hardware actually in?
-Chris
Rodrigo Vivi Nov. 2, 2013, 12:49 p.m. UTC | #2
On Fri, Nov 1, 2013 at 8:39 AM, Chris Wilson <chris@chris-wilson.co.uk> wrote:
> On Thu, Oct 31, 2013 at 09:07:09PM -0200, Rodrigo Vivi wrote:
>> If Userspace isn't using MI_PREDICATE all slices must be enabled for
>> backward compatibility.
>>
>> If I915_EXEC_USE_PREDICATE isn't set and defaul is set to half, kernel will force
>> all slices on.
>>
>> v2: fix the inverted logic for backwards compatibility
>>     USE_PREDICATE unset force gt_full when defaul is half
>>     instead of GT_FULL flag.
>>
>> v3: Accepting Chris's suggestions: better variable names;
>>     better logic around state_default x legacy_userspace_busy;
>>     remove unecessary mutex;
>>
>> v4: Accepting more suggestions from Chris:
>>     * Send all LRIs in only one block and don't ignore if it fails.
>>     * function name and cleaner code on forcing_full.
>>
>> v5: fix mutex_lock use by Chris.
>>
>> CC: Chris Wilson <chris@chris-wilson.co.uk>
>> CC: Eric Anholt <eric@anholt.net>
>> CC: Kenneth Graunke <kenneth@whitecape.org>
>> Signed-off-by: Rodrigo Vivi <rodrigo.vivi@gmail.com>
>
> Locking needs major work still.

What else is wrong besides what you pointed on sysfs?

>
>> @@ -935,6 +985,7 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
>>       struct drm_clip_rect *cliprects = NULL;
>>       struct intel_ring_buffer *ring;
>>       struct i915_ctx_hang_stats *hs;
>> +     struct i915_gt_slices *gt_slices = &dev_priv->gt_slices;
>>       u32 ctx_id = i915_execbuffer2_get_context_id(*args);
>>       u32 exec_start, exec_len;
>>       u32 mask, flags;
>> @@ -999,6 +1050,19 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
>>               return -EINVAL;
>>       }
>>
>> +     if (gt_legacy_userspace(ring, args)) {
>> +             mutex_lock(&gt_slices->lock);
>> +             if (gt_slices->state_default == 0 &&
>> +                 !gt_slices->legacy_userspace_busy) {
>
> You need to set legacy_userspace_busy if gt_slices->state_default is
> already 0. Why 0? Why not 1? 1 - for one slice, 2 - for two slices, etc.

I used 0 for half and 1 for full like I used on the boot parameter.
I don't mind in changing that if you believe 1 and 2 is more clear...
but I would have to change all other patches and maybe the sysfs
interface?

>
>> +                     ret = gt_legacy_userspace_busy(ring);
>> +                     if (ret == 0)
>> +                             gt_slices->legacy_userspace_busy = true;
>> +             }
>> +             mutex_unlock(&gt_slices->lock);
>> +             if (ret)
>> +                     return ret;
>> +     }
>> +
>>       mode = args->flags & I915_EXEC_CONSTANTS_MASK;
>>       mask = I915_EXEC_CONSTANTS_MASK;
>>       switch (mode) {
>
>> diff --git a/drivers/gpu/drm/i915/i915_sysfs.c b/drivers/gpu/drm/i915/i915_sysfs.c
>> index 86ccd52..a821499 100644
>> --- a/drivers/gpu/drm/i915/i915_sysfs.c
>> +++ b/drivers/gpu/drm/i915/i915_sysfs.c
>> @@ -135,16 +135,23 @@ static ssize_t gt_slice_config_store(struct device *kdev,
>>  {
>>       struct drm_minor *minor = container_of(kdev, struct drm_minor, kdev);
>>       struct drm_device *dev = minor->dev;
>> +     struct drm_i915_private *dev_priv = dev->dev_private;
>>       int ret;
>>
>>       if (!strncmp(buf, "full", sizeof("full") - 1)) {
>>               ret = intel_set_gt_full(dev);
>>               if (ret)
>>                       return ret;
>> +             mutex_lock(&dev_priv->gt_slices.lock);
>> +             dev_priv->gt_slices.state_default = 1;
>> +             mutex_unlock(&dev_priv->gt_slices.lock);
>>       } else if (!strncmp(buf, "half", sizeof("half") - 1)) {
>>               ret = intel_set_gt_half(dev);
>>               if (ret)
>>                       return ret;
>> +             mutex_lock(&dev_priv->gt_slices.lock);
>> +             dev_priv->gt_slices.state_default = 0;
>> +             mutex_unlock(&dev_priv->gt_slices.lock);
>>       } else
>>               return -EINVAL;
>
> This is the clearest example that the locking is fubar. Consider a
> second process that simultaneously tries to change slice config. What
> state is recorded? What state is the hardware actually in?

agree... I will just remove it, but what else you see wrong with locking?

> -Chris
>
> --
> Chris Wilson, Intel Open Source Technology Centre


Thanks,
Rodrigo.
Chris Wilson Nov. 2, 2013, 12:54 p.m. UTC | #3
On Sat, Nov 02, 2013 at 10:49:32AM -0200, Rodrigo Vivi wrote:
> On Fri, Nov 1, 2013 at 8:39 AM, Chris Wilson <chris@chris-wilson.co.uk> wrote:
> > On Thu, Oct 31, 2013 at 09:07:09PM -0200, Rodrigo Vivi wrote:
> >> If Userspace isn't using MI_PREDICATE all slices must be enabled for
> >> backward compatibility.
> >>
> >> If I915_EXEC_USE_PREDICATE isn't set and defaul is set to half, kernel will force
> >> all slices on.
> >>
> >> v2: fix the inverted logic for backwards compatibility
> >>     USE_PREDICATE unset force gt_full when defaul is half
> >>     instead of GT_FULL flag.
> >>
> >> v3: Accepting Chris's suggestions: better variable names;
> >>     better logic around state_default x legacy_userspace_busy;
> >>     remove unecessary mutex;
> >>
> >> v4: Accepting more suggestions from Chris:
> >>     * Send all LRIs in only one block and don't ignore if it fails.
> >>     * function name and cleaner code on forcing_full.
> >>
> >> v5: fix mutex_lock use by Chris.
> >>
> >> CC: Chris Wilson <chris@chris-wilson.co.uk>
> >> CC: Eric Anholt <eric@anholt.net>
> >> CC: Kenneth Graunke <kenneth@whitecape.org>
> >> Signed-off-by: Rodrigo Vivi <rodrigo.vivi@gmail.com>
> >
> > Locking needs major work still.
> 
> What else is wrong besides what you pointed on sysfs?

It has a ripple effect.
 
> >
> >> @@ -935,6 +985,7 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
> >>       struct drm_clip_rect *cliprects = NULL;
> >>       struct intel_ring_buffer *ring;
> >>       struct i915_ctx_hang_stats *hs;
> >> +     struct i915_gt_slices *gt_slices = &dev_priv->gt_slices;
> >>       u32 ctx_id = i915_execbuffer2_get_context_id(*args);
> >>       u32 exec_start, exec_len;
> >>       u32 mask, flags;
> >> @@ -999,6 +1050,19 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
> >>               return -EINVAL;
> >>       }
> >>
> >> +     if (gt_legacy_userspace(ring, args)) {
> >> +             mutex_lock(&gt_slices->lock);
> >> +             if (gt_slices->state_default == 0 &&
> >> +                 !gt_slices->legacy_userspace_busy) {
> >
> > You need to set legacy_userspace_busy if gt_slices->state_default is
> > already 0. Why 0? Why not 1? 1 - for one slice, 2 - for two slices, etc.
> 
> I used 0 for half and 1 for full like I used on the boot parameter.
> I don't mind in changing that if you believe 1 and 2 is more clear...
> but I would have to change all other patches and maybe the sysfs
> interface?

half,full seems reasonable enough, and it will be easy to extend that
interface to include an integer slice count.  Having it as a slice count
it is a lot easier to understand the implications of different values. 
I would also argue that the module parameter should match the sysfs
interface.

> 
> >
> >> +                     ret = gt_legacy_userspace_busy(ring);
> >> +                     if (ret == 0)
> >> +                             gt_slices->legacy_userspace_busy = true;
> >> +             }
> >> +             mutex_unlock(&gt_slices->lock);
> >> +             if (ret)
> >> +                     return ret;
> >> +     }
> >> +
> >>       mode = args->flags & I915_EXEC_CONSTANTS_MASK;
> >>       mask = I915_EXEC_CONSTANTS_MASK;
> >>       switch (mode) {
> >
> >> diff --git a/drivers/gpu/drm/i915/i915_sysfs.c b/drivers/gpu/drm/i915/i915_sysfs.c
> >> index 86ccd52..a821499 100644
> >> --- a/drivers/gpu/drm/i915/i915_sysfs.c
> >> +++ b/drivers/gpu/drm/i915/i915_sysfs.c
> >> @@ -135,16 +135,23 @@ static ssize_t gt_slice_config_store(struct device *kdev,
> >>  {
> >>       struct drm_minor *minor = container_of(kdev, struct drm_minor, kdev);
> >>       struct drm_device *dev = minor->dev;
> >> +     struct drm_i915_private *dev_priv = dev->dev_private;
> >>       int ret;
> >>
> >>       if (!strncmp(buf, "full", sizeof("full") - 1)) {
> >>               ret = intel_set_gt_full(dev);
> >>               if (ret)
> >>                       return ret;
> >> +             mutex_lock(&dev_priv->gt_slices.lock);
> >> +             dev_priv->gt_slices.state_default = 1;
> >> +             mutex_unlock(&dev_priv->gt_slices.lock);
> >>       } else if (!strncmp(buf, "half", sizeof("half") - 1)) {
> >>               ret = intel_set_gt_half(dev);
> >>               if (ret)
> >>                       return ret;
> >> +             mutex_lock(&dev_priv->gt_slices.lock);
> >> +             dev_priv->gt_slices.state_default = 0;
> >> +             mutex_unlock(&dev_priv->gt_slices.lock);
> >>       } else
> >>               return -EINVAL;
> >
> > This is the clearest example that the locking is fubar. Consider a
> > second process that simultaneously tries to change slice config. What
> > state is recorded? What state is the hardware actually in?
> 
> agree... I will just remove it, but what else you see wrong with locking?

You can't just remove the locking here either. The locking has to
protect all register access and bookkeeping so that they are always in
sync.
-Chris
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 685fb1d..67bbbce 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1219,6 +1219,12 @@  struct i915_package_c8 {
 	} regsave;
 };
 
+struct i915_gt_slices {
+	int state_default; /* 0: half slices, 1: full slices */
+	int legacy_userspace_busy; /* set after execbuf queued the force-full LRIs */
+	struct mutex lock; /* locks access to this struct and slice registers */
+};
+
 typedef struct drm_i915_private {
 	struct drm_device *dev;
 	struct kmem_cache *slab;
@@ -1418,6 +1424,8 @@  typedef struct drm_i915_private {
 
 	struct i915_package_c8 pc8;
 
+	struct i915_gt_slices gt_slices;
+
 	/* Old dri1 support infrastructure, beware the dragons ya fools entering
 	 * here! */
 	struct i915_dri1_state dri1;
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 0ce0d47..3ada5b4 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -922,6 +922,56 @@  i915_reset_gen7_sol_offsets(struct drm_device *dev,
 	return 0;
 }
 
+static int gt_legacy_userspace_busy(struct intel_ring_buffer *ring)
+{
+	int ret;
+	/* Queue LRIs forcing full slices; returns 0 or intel_ring_begin()'s error. */
+	ret = intel_ring_begin(ring, 18); /* 6 LRI packets x 3 dwords each */
+	if (ret)
+		return ret;
+	/* Select both slices: same value intel_set_gt_full() writes via MMIO. */
+	intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(1));
+	intel_ring_emit(ring, HSW_GT_SLICE_INFO);
+	intel_ring_emit(ring, SLICE_SEL_BOTH);
+	/* As in intel_set_gt_full(): predicate result marks lower slice enabled. */
+	intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(1));
+	intel_ring_emit(ring, MI_PREDICATE_RESULT_2);
+	intel_ring_emit(ring, LOWER_SLICE_ENABLED);
+	/* NOTE(review): ~SLICE_SHUTDOWN also sets every other bit - confirm. */
+	intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(1));
+	intel_ring_emit(ring, HSW_SLICESHUTDOWN);
+	intel_ring_emit(ring, ~SLICE_SHUTDOWN);
+	/* Presumably shortens the idle count to provoke RC6 - TODO confirm. */
+	intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(1));
+	intel_ring_emit(ring, RC_IDLE_MAX_COUNT);
+	intel_ring_emit(ring, CS_IDLE_COUNT_1US);
+
+	intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(1));
+	intel_ring_emit(ring, WAIT_FOR_RC6_EXIT);
+	intel_ring_emit(ring, _MASKED_BIT_ENABLE(WAIT_RC6_EXIT));
+	/* RC_IDLE_MAX_COUNT is written a second time, now with the 5us value. */
+	intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(1));
+	intel_ring_emit(ring, RC_IDLE_MAX_COUNT);
+	intel_ring_emit(ring, CS_IDLE_COUNT_5US);
+	/* Commands are only queued; nothing here waits for the slice change. */
+	intel_ring_advance(ring);
+	return 0;
+}
+
+static bool gt_legacy_userspace(struct intel_ring_buffer *ring,
+				struct drm_i915_gem_execbuffer2 *args)
+{
+	drm_i915_private_t *dev_priv = ring->dev->dev_private;
+	/* NOTE(review): dev_priv is never named below; unused unless a macro needs it. */
+	if (ring->id == BCS)
+		return false;
+
+	if (!HAS_SLICE_SHUTDOWN(ring->dev))
+		return false;
+	/* Legacy userspace: the execbuf did not set I915_EXEC_USE_PREDICATE. */
+	return (args->flags & I915_EXEC_USE_PREDICATE) == 0;
+}
+
 static int
 i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 		       struct drm_file *file,
@@ -935,6 +985,7 @@  i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 	struct drm_clip_rect *cliprects = NULL;
 	struct intel_ring_buffer *ring;
 	struct i915_ctx_hang_stats *hs;
+	struct i915_gt_slices *gt_slices = &dev_priv->gt_slices;
 	u32 ctx_id = i915_execbuffer2_get_context_id(*args);
 	u32 exec_start, exec_len;
 	u32 mask, flags;
@@ -999,6 +1050,19 @@  i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 		return -EINVAL;
 	}
 
+	if (gt_legacy_userspace(ring, args)) {
+		mutex_lock(&gt_slices->lock);
+		if (gt_slices->state_default == 0 &&
+		    !gt_slices->legacy_userspace_busy) {
+			ret = gt_legacy_userspace_busy(ring);
+			if (ret == 0)
+				gt_slices->legacy_userspace_busy = true;
+		}
+		mutex_unlock(&gt_slices->lock);
+		if (ret)
+			return ret;
+	}
+
 	mode = args->flags & I915_EXEC_CONSTANTS_MASK;
 	mask = I915_EXEC_CONSTANTS_MASK;
 	switch (mode) {
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index 497c441..0146bef 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -277,6 +277,17 @@ 
 #define   SLICE_STATUS_MAIN_ON	(2<<0)
 #define   SLICE_STATUS_BOTH_ON	(3<<0)
 
+#define HSW_SLICESHUTDOWN	0xA190
+#define   SLICE_SHUTDOWN	(1<<0)
+
+#define RC_IDLE_MAX_COUNT	0x2054
+#define   CS_IDLE_COUNT_1US	(1<<1)
+#define   CS_IDLE_COUNT_5US	(1<<3)
+
+#define WAIT_FOR_RC6_EXIT	0x20CC
+#define   WAIT_RC6_EXIT		(1<<0)
+#define   MASK_WAIT_RC6_EXIT	(1<<16)
+
 /*
  * 3D instructions used by the kernel
  */
diff --git a/drivers/gpu/drm/i915/i915_sysfs.c b/drivers/gpu/drm/i915/i915_sysfs.c
index 86ccd52..a821499 100644
--- a/drivers/gpu/drm/i915/i915_sysfs.c
+++ b/drivers/gpu/drm/i915/i915_sysfs.c
@@ -135,16 +135,23 @@  static ssize_t gt_slice_config_store(struct device *kdev,
 {
 	struct drm_minor *minor = container_of(kdev, struct drm_minor, kdev);
 	struct drm_device *dev = minor->dev;
+	struct drm_i915_private *dev_priv = dev->dev_private;
 	int ret;
 
 	if (!strncmp(buf, "full", sizeof("full") - 1)) {
 		ret = intel_set_gt_full(dev);
 		if (ret)
 			return ret;
+		mutex_lock(&dev_priv->gt_slices.lock);
+		dev_priv->gt_slices.state_default = 1;
+		mutex_unlock(&dev_priv->gt_slices.lock);
 	} else if (!strncmp(buf, "half", sizeof("half") - 1)) {
 		ret = intel_set_gt_half(dev);
 		if (ret)
 			return ret;
+		mutex_lock(&dev_priv->gt_slices.lock);
+		dev_priv->gt_slices.state_default = 0;
+		mutex_unlock(&dev_priv->gt_slices.lock);
 	} else
 		return -EINVAL;
 	return count;
diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
index 4f1b636..eec4c0e 100644
--- a/drivers/gpu/drm/i915/intel_display.c
+++ b/drivers/gpu/drm/i915/intel_display.c
@@ -7759,6 +7759,20 @@  void intel_mark_busy(struct drm_device *dev)
 	i915_update_gfx_val(dev_priv);
 }
 
+static bool intel_need_shutdown_slices(struct drm_device *dev)
+{
+	struct drm_i915_private *dev_priv = dev->dev_private;
+	/* Test-and-clear legacy_userspace_busy under the lock; true if it was set. */
+	mutex_lock(&dev_priv->gt_slices.lock);
+	if (dev_priv->gt_slices.legacy_userspace_busy) {
+		dev_priv->gt_slices.legacy_userspace_busy = false;
+		mutex_unlock(&dev_priv->gt_slices.lock);
+		return true;
+	}
+	mutex_unlock(&dev_priv->gt_slices.lock);
+	return false;
+}
+
 void intel_mark_idle(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
@@ -7778,6 +7792,9 @@  void intel_mark_idle(struct drm_device *dev)
 
 	if (dev_priv->info->gen >= 6)
 		gen6_rps_idle(dev->dev_private);
+
+	if (intel_need_shutdown_slices(dev))
+		intel_set_gt_half_async(dev);
 }
 
 void intel_mark_fb_busy(struct drm_i915_gem_object *obj,
diff --git a/drivers/gpu/drm/i915/intel_drv.h b/drivers/gpu/drm/i915/intel_drv.h
index a9abbb5..98cd63e 100644
--- a/drivers/gpu/drm/i915/intel_drv.h
+++ b/drivers/gpu/drm/i915/intel_drv.h
@@ -836,6 +836,7 @@  void intel_disable_gt_powersave(struct drm_device *dev);
 void ironlake_teardown_rc6(struct drm_device *dev);
 int intel_set_gt_full(struct drm_device *dev);
 int intel_set_gt_half(struct drm_device *dev);
+void intel_set_gt_half_async(struct drm_device *dev);
 void intel_init_gt_slices(struct drm_device *dev);
 void gen6_update_ring_freq(struct drm_device *dev);
 void gen6_rps_idle(struct drm_i915_private *dev_priv);
diff --git a/drivers/gpu/drm/i915/intel_pm.c b/drivers/gpu/drm/i915/intel_pm.c
index 63af075..b3bd70f 100644
--- a/drivers/gpu/drm/i915/intel_pm.c
+++ b/drivers/gpu/drm/i915/intel_pm.c
@@ -3873,6 +3873,7 @@  int intel_set_gt_full(struct drm_device *dev)
 	if (!HAS_SLICE_SHUTDOWN(dev))
 		return -ENODEV;
 
+	mutex_lock(&dev_priv->gt_slices.lock);
 	I915_WRITE(HSW_GT_SLICE_INFO, SLICE_SEL_BOTH);
 
 	/* Slices are enabled on RC6 exit */
@@ -3881,13 +3882,18 @@  int intel_set_gt_full(struct drm_device *dev)
 	if (wait_for(((I915_READ(HSW_GT_SLICE_INFO) & SLICE_STATUS_MASK) ==
 		      SLICE_STATUS_BOTH_ON), 2000)) {
 		DRM_ERROR("Timeout enabling full gt slices\n");
+
 		I915_WRITE(HSW_GT_SLICE_INFO, ~SLICE_SEL_BOTH);
 		I915_WRITE(MI_PREDICATE_RESULT_2, LOWER_SLICE_DISABLED);
+
 		gen6_gt_force_wake_put(dev_priv);
+		mutex_unlock(&dev_priv->gt_slices.lock);
 		return -ETIMEDOUT;
 	}
+
 	I915_WRITE(MI_PREDICATE_RESULT_2, LOWER_SLICE_ENABLED);
 	gen6_gt_force_wake_put(dev_priv);
+	mutex_unlock(&dev_priv->gt_slices.lock);
 
 	return 0;
 }
@@ -3899,6 +3905,7 @@  int intel_set_gt_half(struct drm_device *dev)
 	if (!HAS_SLICE_SHUTDOWN(dev))
 		return -ENODEV;
 
+	mutex_lock(&dev_priv->gt_slices.lock);
 	I915_WRITE(HSW_GT_SLICE_INFO, ~SLICE_SEL_BOTH);
 
 	/* Slices are disabled on RC6 exit */
@@ -3907,16 +3914,42 @@  int intel_set_gt_half(struct drm_device *dev)
 	if (wait_for(((I915_READ(HSW_GT_SLICE_INFO) & SLICE_STATUS_MASK) ==
 		      SLICE_STATUS_MAIN_ON), 2000)) {
 		DRM_ERROR("Timed out disabling half gt slices\n");
+
 		I915_WRITE(HSW_GT_SLICE_INFO, SLICE_SEL_BOTH);
 		I915_WRITE(MI_PREDICATE_RESULT_2, LOWER_SLICE_ENABLED);
+
 		gen6_gt_force_wake_put(dev_priv);
+		mutex_unlock(&dev_priv->gt_slices.lock);
 		return -ETIMEDOUT;
 	}
+
 	I915_WRITE(MI_PREDICATE_RESULT_2, LOWER_SLICE_DISABLED);
 	gen6_gt_force_wake_put(dev_priv);
+
+	mutex_unlock(&dev_priv->gt_slices.lock);
 	return 0;
 }
 
+/**
+ * On Haswell, slice on/off transitions are done via the RC6 sequence.
+ * This async function requests slice shutdown without waiting for it to
+ * take effect. Slices will be disabled on the next RC6 exit.
+ */
+void intel_set_gt_half_async(struct drm_device *dev)
+{
+	struct drm_i915_private *dev_priv = dev->dev_private;
+
+	if (!HAS_SLICE_SHUTDOWN(dev))
+		return;
+	/* Registers are written only while state_default is 0 (half). */
+	mutex_lock(&dev_priv->gt_slices.lock);
+	if (dev_priv->gt_slices.state_default == 0) {
+		I915_WRITE(HSW_GT_SLICE_INFO, ~SLICE_SEL_BOTH);
+		I915_WRITE(MI_PREDICATE_RESULT_2, LOWER_SLICE_DISABLED);
+	}
+	mutex_unlock(&dev_priv->gt_slices.lock);
+}
+
 void intel_init_gt_slices(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
@@ -3927,9 +3960,13 @@  void intel_init_gt_slices(struct drm_device *dev)
 	if (!HAS_SLICE_SHUTDOWN(dev))
 		return;
 
+	dev_priv->gt_slices.state_default = 1;
+	dev_priv->gt_slices.legacy_userspace_busy = false;
+	mutex_init(&dev_priv->gt_slices.lock);
+
 	if (!i915_gt_slice_config) {
-		I915_WRITE(HSW_GT_SLICE_INFO, ~SLICE_SEL_BOTH);
-		I915_WRITE(MI_PREDICATE_RESULT_2, LOWER_SLICE_DISABLED);
+		dev_priv->gt_slices.state_default = 0;
+		intel_set_gt_half_async(dev);
 	}
 }
 
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 3a4e97b..3fa3e24 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -731,7 +731,13 @@  struct drm_i915_gem_execbuffer2 {
  */
 #define I915_EXEC_HANDLE_LUT		(1<<12)
 
-#define __I915_EXEC_UNKNOWN_FLAGS -(I915_EXEC_HANDLE_LUT<<1)
+/* If this flag is set userspace is using predicate and half slices can be
+ * left disabled for power saving. Otherwise use all slices even when disabled
+ * by boot parameter or via sysfs interface
+ */
+#define I915_EXEC_USE_PREDICATE		(1<<13)
+
+#define __I915_EXEC_UNKNOWN_FLAGS -(I915_EXEC_USE_PREDICATE<<1)
 
 #define I915_EXEC_CONTEXT_ID_MASK	(0xffffffff)
 #define i915_execbuffer2_set_context_id(eb2, context) \