[RFC/RFT,2/2] cpufreq: schedutil: Utilization aggregation

Message ID 2242635.g1ACnTm5vK@aspire.rjw.lan (mailing list archive)
State RFC

Commit Message

Rafael J. Wysocki April 10, 2017, 12:11 a.m. UTC
From: Rafael J. Wysocki <rafael.j.wysocki@intel.com>

Due to the limitation of the rate of frequency changes the schedutil
governor only estimates the CPU utilization entirely when it is about
to update the frequency for the corresponding cpufreq policy.  As a
result, the intermediate utilization values are discarded by it,
but that is not appropriate in general (like, for example, when
tasks migrate from one CPU to another or exit, in which cases the
utilization measured by PELT may change abruptly between frequency
updates).

For this reason, modify schedutil to estimate CPU utilization
completely whenever it is invoked for the given CPU and store the
maximum encountered value of it as input for subsequent new frequency
computations.  This way the new frequency is always based on the
maximum utilization value seen by the governor after the previous
frequency update which effectively prevents intermittent utilization
variations from causing it to be reduced unnecessarily.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/sched/cpufreq_schedutil.c |   90 +++++++++++++++++++++------------------
 1 file changed, 50 insertions(+), 40 deletions(-)
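
To make the scheme concrete, here is a minimal, self-contained sketch of the
max-based aggregation (the util_agg_* names are made up for this illustration
and are not part of the patch): every scheduler callback folds its utilization
sample into a running maximum, and the next frequency update consumes and then
resets it.

struct util_agg {
	unsigned long util;	/* highest utilization sample seen so far */
	unsigned long max;	/* capacity that sample was measured against */
};

/* Called on every utilization update between two frequency updates. */
static void util_agg_add(struct util_agg *a, unsigned long util,
			 unsigned long max)
{
	/* Keep the sample with the larger util/max ratio (cross-multiplied). */
	if (a->util * max < a->max * util) {
		a->util = util;
		a->max = max;
	}
}

/* Called right after a new frequency has been committed. */
static void util_agg_reset(struct util_agg *a)
{
	a->util = 0;
	a->max = 1;	/* non-zero so the ratio stays well defined */
}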

Comments

Joel Fernandes April 10, 2017, 6:39 a.m. UTC | #1
Hi Rafael,

On Sun, Apr 9, 2017 at 5:11 PM, Rafael J. Wysocki <rjw@rjwysocki.net> wrote:
> From: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
>
> Due to the limitation of the rate of frequency changes the schedutil
> governor only estimates the CPU utilization entirely when it is about
> to update the frequency for the corresponding cpufreq policy.  As a
> result, the intermediate utilization values are discarded by it,
> but that is not appropriate in general (like, for example, when
> tasks migrate from one CPU to another or exit, in which cases the
> utilization measured by PELT may change abruptly between frequency
> updates).
>
> For this reason, modify schedutil to estimate CPU utilization
> completely whenever it is invoked for the given CPU and store the
> maximum encountered value of it as input for subsequent new frequency
> computations.  This way the new frequency is always based on the
> maximum utilization value seen by the governor after the previous
> frequency update which effectively prevents intermittent utilization
> variations from causing it to be reduced unnecessarily.
>
> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
> ---
>  kernel/sched/cpufreq_schedutil.c |   90 +++++++++++++++++++++------------------
>  1 file changed, 50 insertions(+), 40 deletions(-)
>
> Index: linux-pm/kernel/sched/cpufreq_schedutil.c
> ===================================================================
> --- linux-pm.orig/kernel/sched/cpufreq_schedutil.c
> +++ linux-pm/kernel/sched/cpufreq_schedutil.c
> @@ -57,7 +57,6 @@ struct sugov_cpu {
>         unsigned long iowait_boost_max;
>         u64 last_update;
>
> -       /* The fields below are only needed when sharing a policy. */
>         unsigned long util;
>         unsigned long max;
>         unsigned int flags;
> @@ -154,22 +153,30 @@ static unsigned int get_next_freq(struct
>         return cpufreq_driver_resolve_freq(policy, freq);
>  }
>
> -static void sugov_get_util(unsigned long *util, unsigned long *max)
> +static void sugov_get_util(struct sugov_cpu *sg_cpu, unsigned int flags)
>  {
> +       unsigned long cfs_util, cfs_max;
>         struct rq *rq = this_rq();
> -       unsigned long cfs_max;
>
> -       cfs_max = arch_scale_cpu_capacity(NULL, smp_processor_id());
> +       sg_cpu->flags |= flags & SCHED_CPUFREQ_RT_DL;
> +       if (sg_cpu->flags & SCHED_CPUFREQ_RT_DL)
> +               return;
>
> -       *util = min(rq->cfs.avg.util_avg, cfs_max);
> -       *max = cfs_max;
> +       cfs_max = arch_scale_cpu_capacity(NULL, smp_processor_id());
> +       cfs_util = min(rq->cfs.avg.util_avg, cfs_max);
> +       if (sg_cpu->util * cfs_max < sg_cpu->max * cfs_util) {

Assuming all CPUs have equal compute capacity, doesn't this mean that
sg_cpu->util is updated only if cfs_util > sg_cpu->util?

Maybe I missed something, but wouldn't we want sg_cpu->util to be
reduced as well when cfs_util reduces? Doesn't this condition
basically discard all updates to sg_cpu->util that could have reduced
it?

> +               sg_cpu->util = cfs_util;
> +               sg_cpu->max = cfs_max;
> +       }
>  }

Thanks,
Joel
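
For reference, the condition above compares the stored and the new util/max
ratio by cross-multiplying, so no division is needed:

	sg_cpu->util / sg_cpu->max  <  cfs_util / cfs_max
	<=>  sg_cpu->util * cfs_max  <  sg_cpu->max * cfs_util

With made-up numbers and equal capacities (sg_cpu->max = cfs_max = 1024):

	stored util = 400, new cfs_util = 600:
		400 * 1024 < 1024 * 600, so the stored value is raised to 600
	stored util = 400, new cfs_util = 300:
		400 * 1024 > 1024 * 300, so the sample is discarded

In other words, as the replies below confirm, only samples that raise the
ratio are kept until the next frequency update.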
Juri Lelli April 10, 2017, 11:26 a.m. UTC | #2
Hi Rafael,

thanks for this set. I'll give it a try (together with your previous
patch) in the next few days.

A question below.

On 10/04/17 02:11, Rafael J. Wysocki wrote:
> From: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
> 
> Due to the limitation of the rate of frequency changes the schedutil
> governor only estimates the CPU utilization entirely when it is about
> to update the frequency for the corresponding cpufreq policy.  As a
> result, the intermediate utilization values are discarded by it,
> but that is not appropriate in general (like, for example, when
> tasks migrate from one CPU to another or exit, in which cases the
> utilization measured by PELT may change abruptly between frequency
> updates).
> 
> For this reason, modify schedutil to estimate CPU utilization
> completely whenever it is invoked for the given CPU and store the
> maximum encountered value of it as input for subsequent new frequency
> computations.  This way the new frequency is always based on the
> maximum utilization value seen by the governor after the previous
> frequency update which effectively prevents intermittent utilization
> variations from causing it to be reduced unnecessarily.
> 
> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
> ---

[...]

> -static void sugov_get_util(unsigned long *util, unsigned long *max)
> +static void sugov_get_util(struct sugov_cpu *sg_cpu, unsigned int flags)
>  {
> +	unsigned long cfs_util, cfs_max;
>  	struct rq *rq = this_rq();
> -	unsigned long cfs_max;
>  
> -	cfs_max = arch_scale_cpu_capacity(NULL, smp_processor_id());
> +	sg_cpu->flags |= flags & SCHED_CPUFREQ_RT_DL;
> +	if (sg_cpu->flags & SCHED_CPUFREQ_RT_DL)
> +		return;
>  

IIUC, with this you also keep track of any RT/DL tasks that woke up
during the last throttling period, and react accordingly as soon as a
triggering event happens after the throttling period elapses.

Given that for RT (and still for DL as well) the next event is a
periodic tick, couldn't it happen that the required frequency transition
for an RT task that unfortunately woke up before the end of a throttling
period gets delayed by a tick interval (at least 4ms on ARM)?
Don't we need to treat such wake up events (RT/DL) in a special way and
maybe set a timer to fire and process them as soon as the current
throttling period elapses? Might be a patch on top of this I guess.

Best,

- Juri
Rafael J. Wysocki April 10, 2017, 8:59 p.m. UTC | #3
On Mon, Apr 10, 2017 at 8:39 AM, Joel Fernandes <joelaf@google.com> wrote:
> Hi Rafael,

Hi,

> On Sun, Apr 9, 2017 at 5:11 PM, Rafael J. Wysocki <rjw@rjwysocki.net> wrote:
>> From: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
>>

[cut]

>> @@ -154,22 +153,30 @@ static unsigned int get_next_freq(struct
>>         return cpufreq_driver_resolve_freq(policy, freq);
>>  }
>>
>> -static void sugov_get_util(unsigned long *util, unsigned long *max)
>> +static void sugov_get_util(struct sugov_cpu *sg_cpu, unsigned int flags)
>>  {
>> +       unsigned long cfs_util, cfs_max;
>>         struct rq *rq = this_rq();
>> -       unsigned long cfs_max;
>>
>> -       cfs_max = arch_scale_cpu_capacity(NULL, smp_processor_id());
>> +       sg_cpu->flags |= flags & SCHED_CPUFREQ_RT_DL;
>> +       if (sg_cpu->flags & SCHED_CPUFREQ_RT_DL)
>> +               return;
>>
>> -       *util = min(rq->cfs.avg.util_avg, cfs_max);
>> -       *max = cfs_max;
>> +       cfs_max = arch_scale_cpu_capacity(NULL, smp_processor_id());
>> +       cfs_util = min(rq->cfs.avg.util_avg, cfs_max);
>> +       if (sg_cpu->util * cfs_max < sg_cpu->max * cfs_util) {
>
> Assuming all CPUs have equal compute capacity, doesn't this mean that
> sg_cpu->util is updated only if cfs_util > sg_cpu->util?

Yes, it does.

> Maybe I missed something, but wouldn't we want sg_cpu->util to be
> reduced as well when cfs_util reduces? Doesn't this condition
> basically discard all updates to sg_cpu->util that could have reduced
> it?
>
>> +               sg_cpu->util = cfs_util;
>> +               sg_cpu->max = cfs_max;
>> +       }
>>  }


Well, that's the idea. :-)

During the discussion at the OSPM-summit we concluded that discarding
all of the utilization changes between the points at which frequency
updates actually happened was not a good idea, so they needed to be
aggregated somehow.

There are a few ways to aggregate them, but the most straightforward
one (and one which actually makes sense) is to take the maximum as the
aggregate value.

Of course, this means that we skew things towards performance here,
but I'm not worried that much. :-)

Thanks,
Rafael
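
A small worked example of that trade-off, with made-up numbers and a CPU
capacity of 1024:

	samples seen between two frequency updates:
		t0: util = 700   (a task is running on the CPU)
		t1: util = 250   (the task has migrated away)
		t2: frequency update point

	Using only the last sample, the next frequency would be sized for
	util = 250; with max aggregation it is sized for util = 700, so the
	CPU may run faster than strictly necessary for up to one throttling
	period -- the skew towards performance mentioned above.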
Rafael J. Wysocki April 10, 2017, 9:13 p.m. UTC | #4
On Mon, Apr 10, 2017 at 1:26 PM, Juri Lelli <juri.lelli@arm.com> wrote:
> Hi Rafael,

Hi,

> thanks for this set. I'll give it a try (together with your previous
> patch) in the next few days.
>
> A question below.
>
> On 10/04/17 02:11, Rafael J. Wysocki wrote:
>> From: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
>>
>> Due to the limitation of the rate of frequency changes the schedutil
>> governor only estimates the CPU utilization entirely when it is about
>> to update the frequency for the corresponding cpufreq policy.  As a
>> result, the intermediate utilization values are discarded by it,
>> but that is not appropriate in general (like, for example, when
>> tasks migrate from one CPU to another or exit, in which cases the
>> utilization measured by PELT may change abruptly between frequency
>> updates).
>>
>> For this reason, modify schedutil to estimate CPU utilization
>> completely whenever it is invoked for the given CPU and store the
>> maximum encountered value of it as input for subsequent new frequency
>> computations.  This way the new frequency is always based on the
>> maximum utilization value seen by the governor after the previous
>> frequency update which effectively prevents intermittent utilization
>> variations from causing it to be reduced unnecessarily.
>>
>> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
>> ---
>
> [...]
>
>> -static void sugov_get_util(unsigned long *util, unsigned long *max)
>> +static void sugov_get_util(struct sugov_cpu *sg_cpu, unsigned int flags)
>>  {
>> +     unsigned long cfs_util, cfs_max;
>>       struct rq *rq = this_rq();
>> -     unsigned long cfs_max;
>>
>> -     cfs_max = arch_scale_cpu_capacity(NULL, smp_processor_id());
>> +     sg_cpu->flags |= flags & SCHED_CPUFREQ_RT_DL;
>> +     if (sg_cpu->flags & SCHED_CPUFREQ_RT_DL)
>> +             return;
>>
>
> IIUC, with this you also keep track of any RT/DL tasks that woke up
> during the last throttling period, and react accordingly as soon as a
> triggering event happens after the throttling period elapses.

Right (that's the idea at least).

> Given that for RT (and still for DL as well) the next event is a
> periodic tick, couldn't it happen that the required frequency transition
> for an RT task that unfortunately woke up before the end of a throttling
> period gets delayed by a tick interval (at least 4ms on ARM)?

No, that won't be an entire tick unless it wakes up exactly at the
update time AFAICS.

> Don't we need to treat such wake up events (RT/DL) in a special way and
> maybe set a timer to fire and process them as soon as the current
> throttling period elapses? Might be a patch on top of this I guess.

Setting a timer won't be a good idea at all, as it would need to be a
deferrable one and Thomas would not like that (I'm sure).

We could in principle add some special casing around that, like for
example pass flags to sugov_should_update_freq() and opportunistically
ignore freq_update_delay_ns if SCHED_CPUFREQ_RT_DL is set in there,
but that would lead to extra overhead on systems where frequency
updates happen in-context.

Also the case looks somewhat corner to me to be honest.

Thanks,
Rafael
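
For reference, a rough sketch of that special-casing, purely illustrative and
not part of the posted patch (the actual sugov_should_update_freq() takes no
flags argument, and details such as the need_freq_update handling are left
out here):

static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time,
				     unsigned int flags)
{
	s64 delta_ns;

	if (sg_policy->work_in_progress)
		return false;

	/* Hypothetical: let RT/DL requests bypass the rate limit entirely. */
	if (flags & SCHED_CPUFREQ_RT_DL)
		return true;

	delta_ns = time - sg_policy->last_freq_update_time;
	return delta_ns >= sg_policy->freq_update_delay_ns;
}

As noted above, the cost would be extra overhead on systems where frequency
updates happen in-context.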
Joel Fernandes April 11, 2017, 1:57 a.m. UTC | #5
On Mon, Apr 10, 2017 at 1:59 PM, Rafael J. Wysocki <rafael@kernel.org> wrote:
[..]
>>> +               sg_cpu->util = cfs_util;
>>> +               sg_cpu->max = cfs_max;
>>> +       }
>>>  }
>
>
> Well, that's the idea. :-)
>
> During the discussion at the OSPM-summit we concluded that discarding
> all of the utilization changes between the points at which frequency
> updates actually happened was not a good idea, so they needed to be
> aggregated somehow.
>
> There are a few ways to aggregate them, but the most straightforward
> one (and one which actually makes sense) is to take the maximum as the
> aggregate value.
>
> Of course, this means that we skew things towards performance here,
> but I'm not worried that much. :-)

Does this increase the chance of going to idle at higher frequency?
Say in the last rate limit window, we have a high request followed by
a low request. After the window closes, by this algorithm we ignore
the low request and take the higher valued request, and then enter
idle. Then, wouldn't we be idling at higher frequency? I guess if you
enter "cluster-idle" then probably this isn't a big deal (like on the
ARM64 platforms I am working on). But I wasn't sure how expensive
entering C-states at a higher frequency is on Intel platforms, or if it
is even a concern. :-D

Thanks,
Joel
Juri Lelli April 11, 2017, 7 a.m. UTC | #6
On 10/04/17 23:13, Rafael J. Wysocki wrote:
> On Mon, Apr 10, 2017 at 1:26 PM, Juri Lelli <juri.lelli@arm.com> wrote:

[...]

> > Given that for RT (and still for DL as well) the next event is a
> > periodic tick, couldn't it happen that the required frequency transition
> > for an RT task that unfortunately woke up before the end of a throttling
> > period gets delayed by a tick interval (at least 4ms on ARM)?
> 
> No, that won't be an entire tick unless it wakes up exactly at the
> update time AFAICS.
> 

Right. I was trying to think about the worst case, as I'm considering
RT-type tasks.

> > Don't we need to treat such wake up events (RT/DL) in a special way and
> > maybe set a timer to fire and process them as soon as the current
> > throttling period elapses? Might be a patch on top of this I guess.
> 
> Setting a timer won't be a good idea at all, as it would need to be a
> deferrable one and Thomas would not like that (I'm sure).
> 

Why deferrable? IMHO, we should be servicing RT requests as soon as the
HW is capable of it. Even a small delay of, say, a couple of ms could be
causing deadline misses.

> We could in principle add some special casing around that, like for
> example pass flags to sugov_should_update_freq() and opportunistically
> ignore freq_update_delay_ns if SCHED_CPUFREQ_RT_DL is set in there,
> but that would lead to extra overhead on systems where frequency
> updates happen in-context.
> 

Also, it still looks event-driven to me. If the RT task is the only
thing running, nothing will trigger a potential frequency change
re-evaluation before the next tick.

> Also the case looks somewhat corner to me to be honest.
> 

Sure. Only thinking about potential problems here. However, playing with
my DL patches I noticed that this can actually be a problem, as for DL,
for example, we trigger a frequency switch when the task wakes up, but
then we don't do anything during the tick (because it doesn't seem to
make sense to do anything :). So, if we missed the opportunity to
increase frequency at enqueue time, the task is hopelessly done. :(

Anyway, since this looks like something that we might want on top of
your patches, I'll play with the idea when refreshing my set and see
what I get.

Thanks,

- Juri
Rafael J. Wysocki April 11, 2017, 8:53 p.m. UTC | #7
On Tue, Apr 11, 2017 at 3:57 AM, Joel Fernandes <joelaf@google.com> wrote:
> On Mon, Apr 10, 2017 at 1:59 PM, Rafael J. Wysocki <rafael@kernel.org> wrote:
> [..]
>>>> +               sg_cpu->util = cfs_util;
>>>> +               sg_cpu->max = cfs_max;
>>>> +       }
>>>>  }
>>
>>
>> Well, that's the idea. :-)
>>
>> During the discussion at the OSPM-summit we concluded that discarding
>> all of the utilization changes between the points at which frequency
>> updates actually happened was not a good idea, so they needed to be
>> aggregated somehow.
>>
>> There are a few ways to aggregate them, but the most straightforward
>> one (and one which actually makes sense) is to take the maximum as the
>> aggregate value.
>>
>> Of course, this means that we skew things towards performance here,
>> but I'm not worried that much. :-)
>
> Does this increase the chance of going to idle at higher frequency?
> Say in the last rate limit window, we have a high request followed by
> a low request. After the window closes, by this algorithm we ignore
> the low request and take the higher valued request, and then enter
> idle. Then, wouldn't we be idling at higher frequency? I guess if you
> enter "cluster-idle" then probably this isn't a big deal (like on the
> ARM64 platforms I am working on). But I wasn't sure how expensive
> entering C-states at a higher frequency is on Intel platforms, or if it
> is even a concern. :-D

It isn't a concern at all AFAICS.

Thanks,
Rafael
Rafael J. Wysocki April 11, 2017, 9:03 p.m. UTC | #8
On Tue, Apr 11, 2017 at 9:00 AM, Juri Lelli <juri.lelli@arm.com> wrote:
> On 10/04/17 23:13, Rafael J. Wysocki wrote:
>> On Mon, Apr 10, 2017 at 1:26 PM, Juri Lelli <juri.lelli@arm.com> wrote:
>
> [...]
>
>> > Given that for RT (and still for DL as well) the next event is a
>> > periodic tick, couldn't it happen that the required frequency transition
>> > for an RT task that unfortunately woke up before the end of a throttling
>> > period gets delayed by a tick interval (at least 4ms on ARM)?
>>
>> No, that won't be an entire tick unless it wakes up exactly at the
>> update time AFAICS.
>>
>
> Right. I was trying to think about worst case, as I'm considering RT
> type of tasks.
>
>> > Don't we need to treat such wake up events (RT/DL) in a special way and
>> > maybe set a timer to fire and process them as soon as the current
>> > throttling period elapses? Might be a patch on top of this I guess.
>>
>> Setting a timer won't be a good idea at all, as it would need to be a
>> deferrable one and Thomas would not like that (I'm sure).
>>
>
> Why deferrable? IMHO, we should be servicing RT requests as soon as the
> HW is capable of it. Even a small delay of, say, a couple of ms could be
> causing deadline misses.

If it is not deferrable, it will wake up the CPU from idle, but that's
not a concern here, because we're assuming that the CPU is not idle
anyway, so fair enough.

>> We could in principle add some special casing around that, like for
>> example pass flags to sugov_should_update_freq() and opportunistically
>> ignore freq_update_delay_ns if SCHED_CPUFREQ_RT_DL is set in there,
>> but that would lead to extra overhead on systems where frequency
>> updates happen in-context.
>>
>
> Also, it looks still event driven to me. If the RT task is the only
> thing running, nothing will trigger a potential frequency change
> re-evaluation before the next tick.

If freq_update_delay_ns is opportunistically ignored for
SCHED_CPUFREQ_RT_DL set in the flags by sugov_should_update_freq(),
then all of the updates with that flag set will cause a frequency
update to happen immediately *except* *for* the ones that require us
to wait for work_in_progress to become false, but in that case the
kthread might trigger an update (eg. by scheduling an irq_work) after
it has cleared work_in_progress.

No timers needed I guess after all? :-)

>> Also the case looks somewhat corner to me to be honest.
>>
>
> Sure. Only thinking about potential problems here. However, playing with
> my DL patches I noticed that this can be actually a problem, as for DL,
> for example, we trigger a frequency switch when the task wakes up, but
> then we don't do anything during the tick (because it doesn't seem to
> make sense to do anything :). So, if we missed the opportunity to
> increase frequency at enqueue time, the task is hopelessly done. :(
>
> Anyway, since this looks like something that we might want on top of
> your patches, I'll play with the idea when refreshing my set and see
> what I get.

Sounds good.

Thanks,
Rafael
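
A sketch of that completion path, again only illustrative (the helper
sugov_kick_pending() is invented for this example; the real sugov_work()
simply performs the frequency change and clears work_in_progress): after the
governor kthread finishes a frequency change, it could check whether an RT/DL
request arrived while work_in_progress was set and, if so, queue another
irq_work right away instead of waiting for the next scheduler event.

static void sugov_work(struct kthread_work *work)
{
	struct sugov_policy *sg_policy = container_of(work, struct sugov_policy, work);

	mutex_lock(&sg_policy->work_lock);
	__cpufreq_driver_target(sg_policy->policy, sg_policy->next_freq,
				CPUFREQ_RELATION_L);
	mutex_unlock(&sg_policy->work_lock);

	sg_policy->work_in_progress = false;

	/*
	 * Hypothetical addition: if an RT/DL request was noted while the
	 * change above was in flight, kick another update immediately.
	 */
	if (sugov_kick_pending(sg_policy))
		irq_work_queue(&sg_policy->irq_work);
}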

Patch

Index: linux-pm/kernel/sched/cpufreq_schedutil.c
===================================================================
--- linux-pm.orig/kernel/sched/cpufreq_schedutil.c
+++ linux-pm/kernel/sched/cpufreq_schedutil.c
@@ -57,7 +57,6 @@  struct sugov_cpu {
 	unsigned long iowait_boost_max;
 	u64 last_update;
 
-	/* The fields below are only needed when sharing a policy. */
 	unsigned long util;
 	unsigned long max;
 	unsigned int flags;
@@ -154,22 +153,30 @@  static unsigned int get_next_freq(struct
 	return cpufreq_driver_resolve_freq(policy, freq);
 }
 
-static void sugov_get_util(unsigned long *util, unsigned long *max)
+static void sugov_get_util(struct sugov_cpu *sg_cpu, unsigned int flags)
 {
+	unsigned long cfs_util, cfs_max;
 	struct rq *rq = this_rq();
-	unsigned long cfs_max;
 
-	cfs_max = arch_scale_cpu_capacity(NULL, smp_processor_id());
+	sg_cpu->flags |= flags & SCHED_CPUFREQ_RT_DL;
+	if (sg_cpu->flags & SCHED_CPUFREQ_RT_DL)
+		return;
 
-	*util = min(rq->cfs.avg.util_avg, cfs_max);
-	*max = cfs_max;
+	cfs_max = arch_scale_cpu_capacity(NULL, smp_processor_id());
+	cfs_util = min(rq->cfs.avg.util_avg, cfs_max);
+	if (sg_cpu->util * cfs_max < sg_cpu->max * cfs_util) {
+		sg_cpu->util = cfs_util;
+		sg_cpu->max = cfs_max;
+	}
 }
 
-static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
-				   unsigned int flags)
+static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
+			       unsigned int flags)
 {
+	unsigned long boost_util, boost_max = sg_cpu->iowait_boost_max;
+
 	if (flags & SCHED_CPUFREQ_IOWAIT) {
-		sg_cpu->iowait_boost = sg_cpu->iowait_boost_max;
+		sg_cpu->iowait_boost = boost_max;
 	} else if (sg_cpu->iowait_boost) {
 		s64 delta_ns = time - sg_cpu->last_update;
 
@@ -177,22 +184,15 @@  static void sugov_set_iowait_boost(struc
 		if (delta_ns > TICK_NSEC)
 			sg_cpu->iowait_boost = 0;
 	}
-}
 
-static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util,
-			       unsigned long *max)
-{
-	unsigned long boost_util = sg_cpu->iowait_boost;
-	unsigned long boost_max = sg_cpu->iowait_boost_max;
-
-	if (!boost_util)
+	boost_util = sg_cpu->iowait_boost;
+	if (!boost_util || sg_cpu->flags & SCHED_CPUFREQ_RT_DL)
 		return;
 
-	if (*util * boost_max < *max * boost_util) {
-		*util = boost_util;
-		*max = boost_max;
+	if (sg_cpu->util * boost_max < sg_cpu->max * boost_util) {
+		sg_cpu->util = boost_util;
+		sg_cpu->max = boost_max;
 	}
-	sg_cpu->iowait_boost >>= 1;
 }
 
 #ifdef CONFIG_NO_HZ_COMMON
@@ -208,30 +208,42 @@  static bool sugov_cpu_is_busy(struct sug
 static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; }
 #endif /* CONFIG_NO_HZ_COMMON */
 
+static void sugov_cpu_update(struct sugov_cpu *sg_cpu, u64 time,
+			     unsigned int flags)
+{
+	sugov_get_util(sg_cpu, flags);
+	sugov_iowait_boost(sg_cpu, time, flags);
+	sg_cpu->last_update = time;
+}
+
+static void sugov_reset_util(struct sugov_cpu *sg_cpu)
+{
+	sg_cpu->util = 0;
+	sg_cpu->max = 1;
+	sg_cpu->flags = 0;
+	sg_cpu->iowait_boost >>= 1;
+}
+
 static void sugov_update_single(struct update_util_data *hook, u64 time,
 				unsigned int flags)
 {
 	struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
 	struct sugov_policy *sg_policy = sg_cpu->sg_policy;
 	struct cpufreq_policy *policy = sg_policy->policy;
-	unsigned long util, max;
 	unsigned int next_f;
 	bool busy;
 
-	sugov_set_iowait_boost(sg_cpu, time, flags);
-	sg_cpu->last_update = time;
+	sugov_cpu_update(sg_cpu, time, flags);
 
 	if (!sugov_should_update_freq(sg_policy, time))
 		return;
 
 	busy = sugov_cpu_is_busy(sg_cpu);
 
-	if (flags & SCHED_CPUFREQ_RT_DL) {
+	if (sg_cpu->flags & SCHED_CPUFREQ_RT_DL) {
 		next_f = policy->cpuinfo.max_freq;
 	} else {
-		sugov_get_util(&util, &max);
-		sugov_iowait_boost(sg_cpu, &util, &max);
-		next_f = get_next_freq(sg_policy, util, max);
+		next_f = get_next_freq(sg_policy, sg_cpu->util, sg_cpu->max);
 		/*
 		 * Do not reduce the frequency if the CPU has not been idle
 		 * recently, as the reduction is likely to be premature then.
@@ -240,6 +252,7 @@  static void sugov_update_single(struct u
 			next_f = sg_policy->next_freq;
 	}
 	sugov_update_commit(sg_policy, time, next_f);
+	sugov_reset_util(sg_cpu);
 }
 
 static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu)
@@ -276,8 +289,6 @@  static unsigned int sugov_next_freq_shar
 			util = j_util;
 			max = j_max;
 		}
-
-		sugov_iowait_boost(j_sg_cpu, &util, &max);
 	}
 
 	return get_next_freq(sg_policy, util, max);
@@ -288,27 +299,25 @@  static void sugov_update_shared(struct u
 {
 	struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
 	struct sugov_policy *sg_policy = sg_cpu->sg_policy;
-	unsigned long util, max;
-	unsigned int next_f;
-
-	sugov_get_util(&util, &max);
 
 	raw_spin_lock(&sg_policy->update_lock);
 
-	sg_cpu->util = util;
-	sg_cpu->max = max;
-	sg_cpu->flags = flags;
-
-	sugov_set_iowait_boost(sg_cpu, time, flags);
-	sg_cpu->last_update = time;
+	sugov_cpu_update(sg_cpu, time, flags);
 
 	if (sugov_should_update_freq(sg_policy, time)) {
+		struct cpufreq_policy *policy = sg_policy->policy;
+		unsigned int next_f;
+		unsigned int j;
+
 		if (flags & SCHED_CPUFREQ_RT_DL)
 			next_f = sg_policy->policy->cpuinfo.max_freq;
 		else
 			next_f = sugov_next_freq_shared(sg_cpu);
 
 		sugov_update_commit(sg_policy, time, next_f);
+
+		for_each_cpu(j, policy->cpus)
+			sugov_reset_util(&per_cpu(sugov_cpu, j));
 	}
 
 	raw_spin_unlock(&sg_policy->update_lock);
@@ -606,6 +615,7 @@  static int sugov_start(struct cpufreq_po
 		sg_cpu->sg_policy = sg_policy;
 		sg_cpu->flags = SCHED_CPUFREQ_RT;
 		sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq;
+		sugov_reset_util(sg_cpu);
 		cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util,
 					     policy_is_shared(policy) ?
 							sugov_update_shared :