diff mbox

[RFCv5,41/46] sched/fair: add triggers for OPP change requests

Message ID 1436293469-25707-42-git-send-email-morten.rasmussen@arm.com (mailing list archive)
State RFC
Headers show

Commit Message

Morten Rasmussen July 7, 2015, 6:24 p.m. UTC
From: Juri Lelli <juri.lelli@arm.com>

Each time a task is {en,de}queued we might need to adapt the current
frequency to the new usage. Add triggers on {en,de}queue_task_fair() for
this purpose.  Only trigger a freq request if we are effectively waking up
or going to sleep.  Filter out load balancing related calls to reduce the
number of triggers.

cc: Ingo Molnar <mingo@redhat.com>
cc: Peter Zijlstra <peterz@infradead.org>

Signed-off-by: Juri Lelli <juri.lelli@arm.com>
---
 kernel/sched/fair.c | 42 ++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 40 insertions(+), 2 deletions(-)

Comments

Michael Turquette July 8, 2015, 3:42 p.m. UTC | #1
Hi Juri,

Quoting Morten Rasmussen (2015-07-07 11:24:24)
> From: Juri Lelli <juri.lelli@arm.com>
> 
> Each time a task is {en,de}queued we might need to adapt the current
> frequency to the new usage. Add triggers on {en,de}queue_task_fair() for
> this purpose.  Only trigger a freq request if we are effectively waking up
> or going to sleep.  Filter out load balancing related calls to reduce the
> number of triggers.
> 
> cc: Ingo Molnar <mingo@redhat.com>
> cc: Peter Zijlstra <peterz@infradead.org>
> 
> Signed-off-by: Juri Lelli <juri.lelli@arm.com>
> ---
>  kernel/sched/fair.c | 42 ++++++++++++++++++++++++++++++++++++++++--
>  1 file changed, 40 insertions(+), 2 deletions(-)
> 
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index f74e9d2..b8627c6 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -4281,7 +4281,10 @@ static inline void hrtick_update(struct rq *rq)
>  }
>  #endif
>  
> +static unsigned int capacity_margin = 1280; /* ~20% margin */

This is a 25% margin. Calling it ~20% is a bit misleading :)

Should margin be scaled for cpus that do not have max capacity == 1024?
In other words, should margin be dynamically calculated to be 20% of
*this* cpu's max capacity?

I'm imagining a corner case where a heterogeneous cpu system is set up
in such a way that adding margin that is hard-coded to 25% of 1024
almost always puts req_cap to the highest frequency, skipping some
reasonable capacity states in between.

> +
>  static bool cpu_overutilized(int cpu);
> +static unsigned long get_cpu_usage(int cpu);
>  struct static_key __sched_energy_freq __read_mostly = STATIC_KEY_INIT_FALSE;
>  
>  /*
> @@ -4332,6 +4335,26 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>                 if (!task_new && !rq->rd->overutilized &&
>                     cpu_overutilized(rq->cpu))
>                         rq->rd->overutilized = true;
> +               /*
> +                * We want to trigger a freq switch request only for tasks that
> +                * are waking up; this is because we get here also during
> +                * load balancing, but in these cases it seems wise to trigger
> +                * as single request after load balancing is done.
> +                *
> +                * XXX: how about fork()? Do we need a special flag/something
> +                *      to tell if we are here after a fork() (wakeup_task_new)?
> +                *
> +                * Also, we add a margin (same ~20% used for the tipping point)
> +                * to our request to provide some head room if p's utilization
> +                * further increases.
> +                */
> +               if (sched_energy_freq() && !task_new) {
> +                       unsigned long req_cap = get_cpu_usage(cpu_of(rq));
> +
> +                       req_cap = req_cap * capacity_margin
> +                                       >> SCHED_CAPACITY_SHIFT;

Probably a dumb question:

Can we "cheat" here and just assume that capacity and load use the same
units? That would avoid the multiplication and change your code to the
following:

	#define capacity_margin SCHED_CAPACITY_SCALE >> 2; /* 25% */
	req_cap += SCHED_CAPACITY_SCALE;

> +                       cpufreq_sched_set_cap(cpu_of(rq), req_cap);
> +               }
>         }
>         hrtick_update(rq);
>  }
> @@ -4393,6 +4416,23 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>         if (!se) {
>                 sub_nr_running(rq, 1);
>                 update_rq_runnable_avg(rq, 1);
> +               /*
> +                * We want to trigger a freq switch request only for tasks that
> +                * are going to sleep; this is because we get here also during
> +                * load balancing, but in these cases it seems wise to trigger
> +                * as single request after load balancing is done.
> +                *
> +                * Also, we add a margin (same ~20% used for the tipping point)
> +                * to our request to provide some head room if p's utilization
> +                * further increases.
> +                */
> +               if (sched_energy_freq() && task_sleep) {
> +                       unsigned long req_cap = get_cpu_usage(cpu_of(rq));
> +
> +                       req_cap = req_cap * capacity_margin
> +                                       >> SCHED_CAPACITY_SHIFT;
> +                       cpufreq_sched_set_cap(cpu_of(rq), req_cap);

Filtering out the load_balance bits is neat.

Regards,
Mike

> +               }
>         }
>         hrtick_update(rq);
>  }
> @@ -4959,8 +4999,6 @@ static int find_new_capacity(struct energy_env *eenv,
>         return idx;
>  }
>  
> -static unsigned int capacity_margin = 1280; /* ~20% margin */
> -
>  static bool cpu_overutilized(int cpu)
>  {
>         return (capacity_of(cpu) * 1024) <
> -- 
> 1.9.1
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Juri Lelli July 9, 2015, 4:52 p.m. UTC | #2
Hi Mike,

On 08/07/15 16:42, Michael Turquette wrote:
> Hi Juri,
> 
> Quoting Morten Rasmussen (2015-07-07 11:24:24)
>> From: Juri Lelli <juri.lelli@arm.com>
>>
>> Each time a task is {en,de}queued we might need to adapt the current
>> frequency to the new usage. Add triggers on {en,de}queue_task_fair() for
>> this purpose.  Only trigger a freq request if we are effectively waking up
>> or going to sleep.  Filter out load balancing related calls to reduce the
>> number of triggers.
>>
>> cc: Ingo Molnar <mingo@redhat.com>
>> cc: Peter Zijlstra <peterz@infradead.org>
>>
>> Signed-off-by: Juri Lelli <juri.lelli@arm.com>
>> ---
>>  kernel/sched/fair.c | 42 ++++++++++++++++++++++++++++++++++++++++--
>>  1 file changed, 40 insertions(+), 2 deletions(-)
>>
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index f74e9d2..b8627c6 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -4281,7 +4281,10 @@ static inline void hrtick_update(struct rq *rq)
>>  }
>>  #endif
>>  
>> +static unsigned int capacity_margin = 1280; /* ~20% margin */
> 
> This is a 25% margin. Calling it ~20% is a bit misleading :)
> 

Well, 1024 is what you get if you remove 20% from 1280. But, I
confess it wasn't clear to me either at first sight ;). Anyway,
you are right that the way I use it below, you end up adding
25% to req_cap. It is just because I didn't want to add another
margin I guess. :)

> Should margin be scaled for cpus that do not have max capacity == 1024?
> In other words, should margin be dynamically calculated to be 20% of
> *this* cpu's max capacity?
> 
> I'm imagining a corner case where a heterogeneous cpu system is set up
> in such a way that adding margin that is hard-coded to 25% of 1024
> almost always puts req_cap to the highest frequency, skipping some
> reasonable capacity states in between.
> 

But, what's below should actually ask for 25% more relative to the
current cpu usage. So, if you have, let's say, a usage of 300 (this
is both cpu and freq scaled), when you do what's below you get:

  300 * 1280 / 1024 = 375

and 375 is 300 + 25%. It is the ratio between capacity_margin and
SCHED_CAPACITY_SCALE that gives you a percentage relative to cpu usage.
Or did I get it wrong?

>> +
>>  static bool cpu_overutilized(int cpu);
>> +static unsigned long get_cpu_usage(int cpu);
>>  struct static_key __sched_energy_freq __read_mostly = STATIC_KEY_INIT_FALSE;
>>  
>>  /*
>> @@ -4332,6 +4335,26 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>>                 if (!task_new && !rq->rd->overutilized &&
>>                     cpu_overutilized(rq->cpu))
>>                         rq->rd->overutilized = true;
>> +               /*
>> +                * We want to trigger a freq switch request only for tasks that
>> +                * are waking up; this is because we get here also during
>> +                * load balancing, but in these cases it seems wise to trigger
>> +                * as single request after load balancing is done.
>> +                *
>> +                * XXX: how about fork()? Do we need a special flag/something
>> +                *      to tell if we are here after a fork() (wakeup_task_new)?
>> +                *
>> +                * Also, we add a margin (same ~20% used for the tipping point)
>> +                * to our request to provide some head room if p's utilization
>> +                * further increases.
>> +                */
>> +               if (sched_energy_freq() && !task_new) {
>> +                       unsigned long req_cap = get_cpu_usage(cpu_of(rq));
>> +
>> +                       req_cap = req_cap * capacity_margin
>> +                                       >> SCHED_CAPACITY_SHIFT;
> 
> Probably a dumb question:
> 
> Can we "cheat" here and just assume that capacity and load use the same
> units? That would avoid the multiplication and change your code to the
> following:
> 
> 	#define capacity_margin SCHED_CAPACITY_SCALE >> 2; /* 25% */
> 	req_cap += SCHED_CAPACITY_SCALE;
> 

I'd rather stick with an increase relative to the current usage
as opposed to adding 256 to every request. I fear that the latter
would end up cutting out some OPPs entirely, as you were saying above.

>> +                       cpufreq_sched_set_cap(cpu_of(rq), req_cap);
>> +               }
>>         }
>>         hrtick_update(rq);
>>  }
>> @@ -4393,6 +4416,23 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>>         if (!se) {
>>                 sub_nr_running(rq, 1);
>>                 update_rq_runnable_avg(rq, 1);
>> +               /*
>> +                * We want to trigger a freq switch request only for tasks that
>> +                * are going to sleep; this is because we get here also during
>> +                * load balancing, but in these cases it seems wise to trigger
>> +                * as single request after load balancing is done.
>> +                *
>> +                * Also, we add a margin (same ~20% used for the tipping point)
>> +                * to our request to provide some head room if p's utilization
>> +                * further increases.
>> +                */
>> +               if (sched_energy_freq() && task_sleep) {
>> +                       unsigned long req_cap = get_cpu_usage(cpu_of(rq));
>> +
>> +                       req_cap = req_cap * capacity_margin
>> +                                       >> SCHED_CAPACITY_SHIFT;
>> +                       cpufreq_sched_set_cap(cpu_of(rq), req_cap);
> 
> Filtering out the load_balance bits is neat.
> 

Also, I guess we need to do that because we still have some rate
limit on the frequency at which we can issue requests. If we move
more than one task when load balancing, we could miss some requests.

Thanks,

- Juri

> Regards,
> Mike
> 
>> +               }
>>         }
>>         hrtick_update(rq);
>>  }
>> @@ -4959,8 +4999,6 @@ static int find_new_capacity(struct energy_env *eenv,
>>         return idx;
>>  }
>>  
>> -static unsigned int capacity_margin = 1280; /* ~20% margin */
>> -
>>  static bool cpu_overutilized(int cpu)
>>  {
>>         return (capacity_of(cpu) * 1024) <
>> -- 
>> 1.9.1
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>> Please read the FAQ at  http://www.tux.org/lkml/
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Vincent Guittot Aug. 4, 2015, 1:41 p.m. UTC | #3
Hi Juri,

On 7 July 2015 at 20:24, Morten Rasmussen <morten.rasmussen@arm.com> wrote:
> From: Juri Lelli <juri.lelli@arm.com>
>
> Each time a task is {en,de}queued we might need to adapt the current
> frequency to the new usage. Add triggers on {en,de}queue_task_fair() for
> this purpose.  Only trigger a freq request if we are effectively waking up
> or going to sleep.  Filter out load balancing related calls to reduce the
> number of triggers.
>
> cc: Ingo Molnar <mingo@redhat.com>
> cc: Peter Zijlstra <peterz@infradead.org>
>
> Signed-off-by: Juri Lelli <juri.lelli@arm.com>
> ---
>  kernel/sched/fair.c | 42 ++++++++++++++++++++++++++++++++++++++++--
>  1 file changed, 40 insertions(+), 2 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index f74e9d2..b8627c6 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -4281,7 +4281,10 @@ static inline void hrtick_update(struct rq *rq)
>  }
>  #endif
>
> +static unsigned int capacity_margin = 1280; /* ~20% margin */
> +
>  static bool cpu_overutilized(int cpu);
> +static unsigned long get_cpu_usage(int cpu);
>  struct static_key __sched_energy_freq __read_mostly = STATIC_KEY_INIT_FALSE;
>
>  /*
> @@ -4332,6 +4335,26 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>                 if (!task_new && !rq->rd->overutilized &&
>                     cpu_overutilized(rq->cpu))
>                         rq->rd->overutilized = true;
> +               /*
> +                * We want to trigger a freq switch request only for tasks that
> +                * are waking up; this is because we get here also during
> +                * load balancing, but in these cases it seems wise to trigger
> +                * as single request after load balancing is done.
> +                *
> +                * XXX: how about fork()? Do we need a special flag/something
> +                *      to tell if we are here after a fork() (wakeup_task_new)?
> +                *
> +                * Also, we add a margin (same ~20% used for the tipping point)
> +                * to our request to provide some head room if p's utilization
> +                * further increases.
> +                */
> +               if (sched_energy_freq() && !task_new) {
> +                       unsigned long req_cap = get_cpu_usage(cpu_of(rq));
> +
> +                       req_cap = req_cap * capacity_margin
> +                                       >> SCHED_CAPACITY_SHIFT;
> +                       cpufreq_sched_set_cap(cpu_of(rq), req_cap);
> +               }
>         }
>         hrtick_update(rq);
>  }
> @@ -4393,6 +4416,23 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>         if (!se) {
>                 sub_nr_running(rq, 1);
>                 update_rq_runnable_avg(rq, 1);
> +               /*
> +                * We want to trigger a freq switch request only for tasks that
> +                * are going to sleep; this is because we get here also during
> +                * load balancing, but in these cases it seems wise to trigger
> +                * as single request after load balancing is done.
> +                *
> +                * Also, we add a margin (same ~20% used for the tipping point)
> +                * to our request to provide some head room if p's utilization
> +                * further increases.
> +                */
> +               if (sched_energy_freq() && task_sleep) {
> +                       unsigned long req_cap = get_cpu_usage(cpu_of(rq));
> +
> +                       req_cap = req_cap * capacity_margin
> +                                       >> SCHED_CAPACITY_SHIFT;
> +                       cpufreq_sched_set_cap(cpu_of(rq), req_cap);

Could you clarify why you want to trigger a freq switch for tasks that
are going to sleep ?
The cpu_usage should not change that much as the se_utilization of
the entity moves from utilization_load_avg to utilization_blocked_avg
of the rq and the usage and the freq are updated periodically.
It should be the same for the wake up of a task in enqueue_task_fair
above, even if it's less obvious for this latter use case because the
cpu might wake up from a long idle phase during which its
utilization_blocked_avg has not been updated. Nevertheless, a trigger of
the freq switch at wake-up of the cpu once its usage has been updated
should do the job.

So tick, migration of tasks, new tasks, entering/leaving idle state of
cpu should be enough to trigger a freq switch.

Regards,
Vincent


> +               }
>         }
>         hrtick_update(rq);
>  }
> @@ -4959,8 +4999,6 @@ static int find_new_capacity(struct energy_env *eenv,
>         return idx;
>  }
>
> -static unsigned int capacity_margin = 1280; /* ~20% margin */
> -
>  static bool cpu_overutilized(int cpu)
>  {
>         return (capacity_of(cpu) * 1024) <
> --
> 1.9.1
>
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Juri Lelli Aug. 10, 2015, 1:43 p.m. UTC | #4
Hi Vincent,

On 04/08/15 14:41, Vincent Guittot wrote:
> Hi Juri,
> 
> On 7 July 2015 at 20:24, Morten Rasmussen <morten.rasmussen@arm.com> wrote:
>> From: Juri Lelli <juri.lelli@arm.com>
>>
>> Each time a task is {en,de}queued we might need to adapt the current
>> frequency to the new usage. Add triggers on {en,de}queue_task_fair() for
>> this purpose.  Only trigger a freq request if we are effectively waking up
>> or going to sleep.  Filter out load balancing related calls to reduce the
>> number of triggers.
>>
>> cc: Ingo Molnar <mingo@redhat.com>
>> cc: Peter Zijlstra <peterz@infradead.org>
>>
>> Signed-off-by: Juri Lelli <juri.lelli@arm.com>
>> ---
>>  kernel/sched/fair.c | 42 ++++++++++++++++++++++++++++++++++++++++--
>>  1 file changed, 40 insertions(+), 2 deletions(-)
>>
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index f74e9d2..b8627c6 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -4281,7 +4281,10 @@ static inline void hrtick_update(struct rq *rq)
>>  }
>>  #endif
>>
>> +static unsigned int capacity_margin = 1280; /* ~20% margin */
>> +
>>  static bool cpu_overutilized(int cpu);
>> +static unsigned long get_cpu_usage(int cpu);
>>  struct static_key __sched_energy_freq __read_mostly = STATIC_KEY_INIT_FALSE;
>>
>>  /*
>> @@ -4332,6 +4335,26 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>>                 if (!task_new && !rq->rd->overutilized &&
>>                     cpu_overutilized(rq->cpu))
>>                         rq->rd->overutilized = true;
>> +               /*
>> +                * We want to trigger a freq switch request only for tasks that
>> +                * are waking up; this is because we get here also during
>> +                * load balancing, but in these cases it seems wise to trigger
>> +                * as single request after load balancing is done.
>> +                *
>> +                * XXX: how about fork()? Do we need a special flag/something
>> +                *      to tell if we are here after a fork() (wakeup_task_new)?
>> +                *
>> +                * Also, we add a margin (same ~20% used for the tipping point)
>> +                * to our request to provide some head room if p's utilization
>> +                * further increases.
>> +                */
>> +               if (sched_energy_freq() && !task_new) {
>> +                       unsigned long req_cap = get_cpu_usage(cpu_of(rq));
>> +
>> +                       req_cap = req_cap * capacity_margin
>> +                                       >> SCHED_CAPACITY_SHIFT;
>> +                       cpufreq_sched_set_cap(cpu_of(rq), req_cap);
>> +               }
>>         }
>>         hrtick_update(rq);
>>  }
>> @@ -4393,6 +4416,23 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>>         if (!se) {
>>                 sub_nr_running(rq, 1);
>>                 update_rq_runnable_avg(rq, 1);
>> +               /*
>> +                * We want to trigger a freq switch request only for tasks that
>> +                * are going to sleep; this is because we get here also during
>> +                * load balancing, but in these cases it seems wise to trigger
>> +                * as single request after load balancing is done.
>> +                *
>> +                * Also, we add a margin (same ~20% used for the tipping point)
>> +                * to our request to provide some head room if p's utilization
>> +                * further increases.
>> +                */
>> +               if (sched_energy_freq() && task_sleep) {
>> +                       unsigned long req_cap = get_cpu_usage(cpu_of(rq));
>> +
>> +                       req_cap = req_cap * capacity_margin
>> +                                       >> SCHED_CAPACITY_SHIFT;
>> +                       cpufreq_sched_set_cap(cpu_of(rq), req_cap);
> 
> Could you clarify why you want to trig a freq switch for tasks that
> are going to sleep ?
> The cpu_usage should not changed that much as the se_utilization of
> the entity moves from utilization_load_avg to utilization_blocked_avg
> of the rq and the usage and the freq are updated periodically.

I think we still need to cover multiple back-to-back dequeues. Suppose
that you have, let's say, 3 tasks that get enqueued at the same time.
After some time the first one goes to sleep and its utilization, as you
say, gets moved to utilization_blocked_avg. So, nothing changes, and
the trigger is superfluous (even if no freq change I guess will be
issued as we are already servicing enough capacity). However, after a
while, the second task goes to sleep. Now we still use get_cpu_usage()
and the first task contribution in utilization_blocked_avg should have
been decayed by this time. Same thing may then happen for the third task
as well. So, if we don't check if we need to scale down in
dequeue_task_fair, it seems to me that we might miss some opportunities,
as blocked contribution of other tasks could have been successively
decayed.

What you think?

Thanks,

- Juri

> It should be the same for the wake up of a task in enqueue_task_fair
> above, even if it's less obvious for this latter use case because the
> cpu might wake up from a long idle phase during which its
> utilization_blocked_avg has not been updated. Nevertheless, a trig of
> the freq switch at wake up of the cpu once its usage has been updated
> should do the job.
> 
> So tick, migration of tasks, new tasks, entering/leaving idle state of
> cpu should be enough to trig freq switch
> 
> Regards,
> Vincent
> 
> 
>> +               }
>>         }
>>         hrtick_update(rq);
>>  }
>> @@ -4959,8 +4999,6 @@ static int find_new_capacity(struct energy_env *eenv,
>>         return idx;
>>  }
>>
>> -static unsigned int capacity_margin = 1280; /* ~20% margin */
>> -
>>  static bool cpu_overutilized(int cpu)
>>  {
>>         return (capacity_of(cpu) * 1024) <
>> --
>> 1.9.1
>>
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Vincent Guittot Aug. 10, 2015, 3:07 p.m. UTC | #5
On 10 August 2015 at 15:43, Juri Lelli <juri.lelli@arm.com> wrote:
>
> Hi Vincent,
>
> On 04/08/15 14:41, Vincent Guittot wrote:
> > Hi Juri,
> >
> > On 7 July 2015 at 20:24, Morten Rasmussen <morten.rasmussen@arm.com> wrote:
> >> From: Juri Lelli <juri.lelli@arm.com>
> >>
> >> Each time a task is {en,de}queued we might need to adapt the current
> >> frequency to the new usage. Add triggers on {en,de}queue_task_fair() for
> >> this purpose.  Only trigger a freq request if we are effectively waking up
> >> or going to sleep.  Filter out load balancing related calls to reduce the
> >> number of triggers.
> >>
> >> cc: Ingo Molnar <mingo@redhat.com>
> >> cc: Peter Zijlstra <peterz@infradead.org>
> >>
> >> Signed-off-by: Juri Lelli <juri.lelli@arm.com>
> >> ---
> >>  kernel/sched/fair.c | 42 ++++++++++++++++++++++++++++++++++++++++--
> >>  1 file changed, 40 insertions(+), 2 deletions(-)
> >>
> >> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> >> index f74e9d2..b8627c6 100644
> >> --- a/kernel/sched/fair.c
> >> +++ b/kernel/sched/fair.c
> >> @@ -4281,7 +4281,10 @@ static inline void hrtick_update(struct rq *rq)
> >>  }
> >>  #endif
> >>
> >> +static unsigned int capacity_margin = 1280; /* ~20% margin */
> >> +
> >>  static bool cpu_overutilized(int cpu);
> >> +static unsigned long get_cpu_usage(int cpu);
> >>  struct static_key __sched_energy_freq __read_mostly = STATIC_KEY_INIT_FALSE;
> >>
> >>  /*
> >> @@ -4332,6 +4335,26 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
> >>                 if (!task_new && !rq->rd->overutilized &&
> >>                     cpu_overutilized(rq->cpu))
> >>                         rq->rd->overutilized = true;
> >> +               /*
> >> +                * We want to trigger a freq switch request only for tasks that
> >> +                * are waking up; this is because we get here also during
> >> +                * load balancing, but in these cases it seems wise to trigger
> >> +                * as single request after load balancing is done.
> >> +                *
> >> +                * XXX: how about fork()? Do we need a special flag/something
> >> +                *      to tell if we are here after a fork() (wakeup_task_new)?
> >> +                *
> >> +                * Also, we add a margin (same ~20% used for the tipping point)
> >> +                * to our request to provide some head room if p's utilization
> >> +                * further increases.
> >> +                */
> >> +               if (sched_energy_freq() && !task_new) {
> >> +                       unsigned long req_cap = get_cpu_usage(cpu_of(rq));
> >> +
> >> +                       req_cap = req_cap * capacity_margin
> >> +                                       >> SCHED_CAPACITY_SHIFT;
> >> +                       cpufreq_sched_set_cap(cpu_of(rq), req_cap);
> >> +               }
> >>         }
> >>         hrtick_update(rq);
> >>  }
> >> @@ -4393,6 +4416,23 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
> >>         if (!se) {
> >>                 sub_nr_running(rq, 1);
> >>                 update_rq_runnable_avg(rq, 1);
> >> +               /*
> >> +                * We want to trigger a freq switch request only for tasks that
> >> +                * are going to sleep; this is because we get here also during
> >> +                * load balancing, but in these cases it seems wise to trigger
> >> +                * as single request after load balancing is done.
> >> +                *
> >> +                * Also, we add a margin (same ~20% used for the tipping point)
> >> +                * to our request to provide some head room if p's utilization
> >> +                * further increases.
> >> +                */
> >> +               if (sched_energy_freq() && task_sleep) {
> >> +                       unsigned long req_cap = get_cpu_usage(cpu_of(rq));
> >> +
> >> +                       req_cap = req_cap * capacity_margin
> >> +                                       >> SCHED_CAPACITY_SHIFT;
> >> +                       cpufreq_sched_set_cap(cpu_of(rq), req_cap);
> >
> > Could you clarify why you want to trig a freq switch for tasks that
> > are going to sleep ?
> > The cpu_usage should not changed that much as the se_utilization of
> > the entity moves from utilization_load_avg to utilization_blocked_avg
> > of the rq and the usage and the freq are updated periodically.
>
> I think we still need to cover multiple back-to-back dequeues. Suppose
> that you have, let's say, 3 tasks that get enqueued at the same time.
> After some time the first one goes to sleep and its utilization, as you
> say, gets moved to utilization_blocked_avg. So, nothing changes, and
> the trigger is superfluous (even if no freq change I guess will be
> issued as we are already servicing enough capacity). However, after a
> while, the second task goes to sleep. Now we still use get_cpu_usage()
> and the first task contribution in utilization_blocked_avg should have
> been decayed by this time. Same thing may than happen for the third task
> as well. So, if we don't check if we need to scale down in
> dequeue_task_fair, it seems to me that we might miss some opportunities,
> as blocked contribution of other tasks could have been successively
> decayed.
>
> What you think?

The tick is used to monitor such variation of the usage (in both ways:
decay of the usage of sleeping tasks and increase of the usage of
running tasks). So in your example, if the duration between the sleep
of the 2 tasks is significant enough, the tick will handle this
variation.

Regards,
Vincent
>
> Thanks,
>
> - Juri
>
> > It should be the same for the wake up of a task in enqueue_task_fair
> > above, even if it's less obvious for this latter use case because the
> > cpu might wake up from a long idle phase during which its
> > utilization_blocked_avg has not been updated. Nevertheless, a trig of
> > the freq switch at wake up of the cpu once its usage has been updated
> > should do the job.
> >
> > So tick, migration of tasks, new tasks, entering/leaving idle state of
> > cpu should be enough to trig freq switch
> >
> > Regards,
> > Vincent
> >
> >
> >> +               }
> >>         }
> >>         hrtick_update(rq);
> >>  }
> >> @@ -4959,8 +4999,6 @@ static int find_new_capacity(struct energy_env *eenv,
> >>         return idx;
> >>  }
> >>
> >> -static unsigned int capacity_margin = 1280; /* ~20% margin */
> >> -
> >>  static bool cpu_overutilized(int cpu)
> >>  {
> >>         return (capacity_of(cpu) * 1024) <
> >> --
> >> 1.9.1
> >>
> >
>
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Juri Lelli Aug. 11, 2015, 9:08 a.m. UTC | #6
On 10/08/15 16:07, Vincent Guittot wrote:
> On 10 August 2015 at 15:43, Juri Lelli <juri.lelli@arm.com> wrote:
>>
>> Hi Vincent,
>>
>> On 04/08/15 14:41, Vincent Guittot wrote:
>>> Hi Juri,
>>>
>>> On 7 July 2015 at 20:24, Morten Rasmussen <morten.rasmussen@arm.com> wrote:
>>>> From: Juri Lelli <juri.lelli@arm.com>
>>>>
>>>> Each time a task is {en,de}queued we might need to adapt the current
>>>> frequency to the new usage. Add triggers on {en,de}queue_task_fair() for
>>>> this purpose.  Only trigger a freq request if we are effectively waking up
>>>> or going to sleep.  Filter out load balancing related calls to reduce the
>>>> number of triggers.
>>>>
>>>> cc: Ingo Molnar <mingo@redhat.com>
>>>> cc: Peter Zijlstra <peterz@infradead.org>
>>>>
>>>> Signed-off-by: Juri Lelli <juri.lelli@arm.com>
>>>> ---
>>>>  kernel/sched/fair.c | 42 ++++++++++++++++++++++++++++++++++++++++--
>>>>  1 file changed, 40 insertions(+), 2 deletions(-)
>>>>
>>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>>>> index f74e9d2..b8627c6 100644
>>>> --- a/kernel/sched/fair.c
>>>> +++ b/kernel/sched/fair.c
>>>> @@ -4281,7 +4281,10 @@ static inline void hrtick_update(struct rq *rq)
>>>>  }
>>>>  #endif
>>>>
>>>> +static unsigned int capacity_margin = 1280; /* ~20% margin */
>>>> +
>>>>  static bool cpu_overutilized(int cpu);
>>>> +static unsigned long get_cpu_usage(int cpu);
>>>>  struct static_key __sched_energy_freq __read_mostly = STATIC_KEY_INIT_FALSE;
>>>>
>>>>  /*
>>>> @@ -4332,6 +4335,26 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>>>>                 if (!task_new && !rq->rd->overutilized &&
>>>>                     cpu_overutilized(rq->cpu))
>>>>                         rq->rd->overutilized = true;
>>>> +               /*
>>>> +                * We want to trigger a freq switch request only for tasks that
>>>> +                * are waking up; this is because we get here also during
>>>> +                * load balancing, but in these cases it seems wise to trigger
>>>> +                * as single request after load balancing is done.
>>>> +                *
>>>> +                * XXX: how about fork()? Do we need a special flag/something
>>>> +                *      to tell if we are here after a fork() (wakeup_task_new)?
>>>> +                *
>>>> +                * Also, we add a margin (same ~20% used for the tipping point)
>>>> +                * to our request to provide some head room if p's utilization
>>>> +                * further increases.
>>>> +                */
>>>> +               if (sched_energy_freq() && !task_new) {
>>>> +                       unsigned long req_cap = get_cpu_usage(cpu_of(rq));
>>>> +
>>>> +                       req_cap = req_cap * capacity_margin
>>>> +                                       >> SCHED_CAPACITY_SHIFT;
>>>> +                       cpufreq_sched_set_cap(cpu_of(rq), req_cap);
>>>> +               }
>>>>         }
>>>>         hrtick_update(rq);
>>>>  }
>>>> @@ -4393,6 +4416,23 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>>>>         if (!se) {
>>>>                 sub_nr_running(rq, 1);
>>>>                 update_rq_runnable_avg(rq, 1);
>>>> +               /*
>>>> +                * We want to trigger a freq switch request only for tasks that
>>>> +                * are going to sleep; this is because we get here also during
>>>> +                * load balancing, but in these cases it seems wise to trigger
>>>> +                * as single request after load balancing is done.
>>>> +                *
>>>> +                * Also, we add a margin (same ~20% used for the tipping point)
>>>> +                * to our request to provide some head room if p's utilization
>>>> +                * further increases.
>>>> +                */
>>>> +               if (sched_energy_freq() && task_sleep) {
>>>> +                       unsigned long req_cap = get_cpu_usage(cpu_of(rq));
>>>> +
>>>> +                       req_cap = req_cap * capacity_margin
>>>> +                                       >> SCHED_CAPACITY_SHIFT;
>>>> +                       cpufreq_sched_set_cap(cpu_of(rq), req_cap);
>>>
>>> Could you clarify why you want to trig a freq switch for tasks that
>>> are going to sleep ?
>>> The cpu_usage should not changed that much as the se_utilization of
>>> the entity moves from utilization_load_avg to utilization_blocked_avg
>>> of the rq and the usage and the freq are updated periodically.
>>
>> I think we still need to cover multiple back-to-back dequeues. Suppose
>> that you have, let's say, 3 tasks that get enqueued at the same time.
>> After some time the first one goes to sleep and its utilization, as you
>> say, gets moved to utilization_blocked_avg. So, nothing changes, and
>> the trigger is superfluous (even if no freq change I guess will be
>> issued as we are already servicing enough capacity). However, after a
>> while, the second task goes to sleep. Now we still use get_cpu_usage()
>> and the first task contribution in utilization_blocked_avg should have
>> been decayed by this time. Same thing may than happen for the third task
>> as well. So, if we don't check if we need to scale down in
>> dequeue_task_fair, it seems to me that we might miss some opportunities,
>> as blocked contribution of other tasks could have been successively
>> decayed.
>>
>> What you think?
> 
> The tick is used to monitor such variation of the usage (in both way,
> decay of the usage of sleeping tasks and increase of the usage of
> running tasks). So in your example, if the duration between the sleep
> of the 2 tasks is significant enough, the tick will handle this
> variation
> 

The tick is used to decide if we need to scale up (to max OPP for the
time being), but we don't scale down. It makes more logical sense to
scale down at task deactivation, or wakeup after a long time, IMHO.

Best,

- Juri

> Regards,
> Vincent
>>
>> Thanks,
>>
>> - Juri
>>
>>> It should be the same for the wake up of a task in enqueue_task_fair
>>> above, even if it's less obvious for this latter use case because the
>>> cpu might wake up from a long idle phase during which its
>>> utilization_blocked_avg has not been updated. Nevertheless, a trig of
>>> the freq switch at wake up of the cpu once its usage has been updated
>>> should do the job.
>>>
>>> So tick, migration of tasks, new tasks, entering/leaving idle state of
>>> cpu should be enough to trig freq switch
>>>
>>> Regards,
>>> Vincent
>>>
>>>
>>>> +               }
>>>>         }
>>>>         hrtick_update(rq);
>>>>  }
>>>> @@ -4959,8 +4999,6 @@ static int find_new_capacity(struct energy_env *eenv,
>>>>         return idx;
>>>>  }
>>>>
>>>> -static unsigned int capacity_margin = 1280; /* ~20% margin */
>>>> -
>>>>  static bool cpu_overutilized(int cpu)
>>>>  {
>>>>         return (capacity_of(cpu) * 1024) <
>>>> --
>>>> 1.9.1
>>>>
>>>
>>
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Vincent Guittot Aug. 11, 2015, 11:41 a.m. UTC | #7
On 11 August 2015 at 11:08, Juri Lelli <juri.lelli@arm.com> wrote:
> On 10/08/15 16:07, Vincent Guittot wrote:
>> On 10 August 2015 at 15:43, Juri Lelli <juri.lelli@arm.com> wrote:
>>>
>>> Hi Vincent,
>>>
>>> On 04/08/15 14:41, Vincent Guittot wrote:
>>>> Hi Juri,
>>>>
>>>> On 7 July 2015 at 20:24, Morten Rasmussen <morten.rasmussen@arm.com> wrote:
>>>>> From: Juri Lelli <juri.lelli@arm.com>
>>>>>
>>>>> Each time a task is {en,de}queued we might need to adapt the current
>>>>> frequency to the new usage. Add triggers on {en,de}queue_task_fair() for
>>>>> this purpose.  Only trigger a freq request if we are effectively waking up
>>>>> or going to sleep.  Filter out load balancing related calls to reduce the
>>>>> number of triggers.
>>>>>
>>>>> cc: Ingo Molnar <mingo@redhat.com>
>>>>> cc: Peter Zijlstra <peterz@infradead.org>
>>>>>
>>>>> Signed-off-by: Juri Lelli <juri.lelli@arm.com>
>>>>> ---
>>>>>  kernel/sched/fair.c | 42 ++++++++++++++++++++++++++++++++++++++++--
>>>>>  1 file changed, 40 insertions(+), 2 deletions(-)
>>>>>
>>>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>>>>> index f74e9d2..b8627c6 100644
>>>>> --- a/kernel/sched/fair.c
>>>>> +++ b/kernel/sched/fair.c
>>>>> @@ -4281,7 +4281,10 @@ static inline void hrtick_update(struct rq *rq)
>>>>>  }
>>>>>  #endif
>>>>>
>>>>> +static unsigned int capacity_margin = 1280; /* ~20% margin */
>>>>> +
>>>>>  static bool cpu_overutilized(int cpu);
>>>>> +static unsigned long get_cpu_usage(int cpu);
>>>>>  struct static_key __sched_energy_freq __read_mostly = STATIC_KEY_INIT_FALSE;
>>>>>
>>>>>  /*
>>>>> @@ -4332,6 +4335,26 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>>>>>                 if (!task_new && !rq->rd->overutilized &&
>>>>>                     cpu_overutilized(rq->cpu))
>>>>>                         rq->rd->overutilized = true;
>>>>> +               /*
>>>>> +                * We want to trigger a freq switch request only for tasks that
>>>>> +                * are waking up; this is because we get here also during
>>>>> +                * load balancing, but in these cases it seems wise to trigger
>>>>> +                * as single request after load balancing is done.
>>>>> +                *
>>>>> +                * XXX: how about fork()? Do we need a special flag/something
>>>>> +                *      to tell if we are here after a fork() (wakeup_task_new)?
>>>>> +                *
>>>>> +                * Also, we add a margin (same ~20% used for the tipping point)
>>>>> +                * to our request to provide some head room if p's utilization
>>>>> +                * further increases.
>>>>> +                */
>>>>> +               if (sched_energy_freq() && !task_new) {
>>>>> +                       unsigned long req_cap = get_cpu_usage(cpu_of(rq));
>>>>> +
>>>>> +                       req_cap = req_cap * capacity_margin
>>>>> +                                       >> SCHED_CAPACITY_SHIFT;
>>>>> +                       cpufreq_sched_set_cap(cpu_of(rq), req_cap);
>>>>> +               }
>>>>>         }
>>>>>         hrtick_update(rq);
>>>>>  }
>>>>> @@ -4393,6 +4416,23 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>>>>>         if (!se) {
>>>>>                 sub_nr_running(rq, 1);
>>>>>                 update_rq_runnable_avg(rq, 1);
>>>>> +               /*
>>>>> +                * We want to trigger a freq switch request only for tasks that
>>>>> +                * are going to sleep; this is because we get here also during
>>>>> +                * load balancing, but in these cases it seems wise to trigger
>>>>> +                * as single request after load balancing is done.
>>>>> +                *
>>>>> +                * Also, we add a margin (same ~20% used for the tipping point)
>>>>> +                * to our request to provide some head room if p's utilization
>>>>> +                * further increases.
>>>>> +                */
>>>>> +               if (sched_energy_freq() && task_sleep) {
>>>>> +                       unsigned long req_cap = get_cpu_usage(cpu_of(rq));
>>>>> +
>>>>> +                       req_cap = req_cap * capacity_margin
>>>>> +                                       >> SCHED_CAPACITY_SHIFT;
>>>>> +                       cpufreq_sched_set_cap(cpu_of(rq), req_cap);
>>>>
>>>> Could you clarify why you want to trig a freq switch for tasks that
>>>> are going to sleep ?
>>>> The cpu_usage should not changed that much as the se_utilization of
>>>> the entity moves from utilization_load_avg to utilization_blocked_avg
>>>> of the rq and the usage and the freq are updated periodically.
>>>
>>> I think we still need to cover multiple back-to-back dequeues. Suppose
>>> that you have, let's say, 3 tasks that get enqueued at the same time.
>>> After some time the first one goes to sleep and its utilization, as you
>>> say, gets moved to utilization_blocked_avg. So, nothing changes, and
>>> the trigger is superfluous (even if no freq change I guess will be
>>> issued as we are already servicing enough capacity). However, after a
>>> while, the second task goes to sleep. Now we still use get_cpu_usage()
>>> and the first task contribution in utilization_blocked_avg should have
>>> been decayed by this time. Same thing may than happen for the third task
>>> as well. So, if we don't check if we need to scale down in
>>> dequeue_task_fair, it seems to me that we might miss some opportunities,
>>> as blocked contribution of other tasks could have been successively
>>> decayed.
>>>
>>> What you think?
>>
>> The tick is used to monitor such variation of the usage (in both way,
>> decay of the usage of sleeping tasks and increase of the usage of
>> running tasks). So in your example, if the duration between the sleep
>> of the 2 tasks is significant enough, the tick will handle this
>> variation
>>
>
> The tick is used to decide if we need to scale up (to max OPP for the
> time being), but we don't scale down. It makes more logical sense to

why don't you want to check if you need to scale down ?

> scale down at task deactivation, or wakeup after a long time, IMHO.

But waking up or going to sleep doesn't have any impact on the usage of
a cpu. The only events that impact the cpu usage are:
-task migration,
-new task
-time that elapses, which can be monitored by periodically checking the usage.
-and for nohz system when cpu enter or leave idle state

waking up and going to sleep events don't give any useful
information, and using them to trigger the monitoring of the usage
variation doesn't give you a predictable/periodic update of it, whereas
the tick will

Regards,
Vincent

>
> Best,
>
> - Juri
>
>> Regards,
>> Vincent
>>>
>>> Thanks,
>>>
>>> - Juri
>>>
>>>> It should be the same for the wake up of a task in enqueue_task_fair
>>>> above, even if it's less obvious for this latter use case because the
>>>> cpu might wake up from a long idle phase during which its
>>>> utilization_blocked_avg has not been updated. Nevertheless, a trig of
>>>> the freq switch at wake up of the cpu once its usage has been updated
>>>> should do the job.
>>>>
>>>> So tick, migration of tasks, new tasks, entering/leaving idle state of
>>>> cpu should be enough to trig freq switch
>>>>
>>>> Regards,
>>>> Vincent
>>>>
>>>>
>>>>> +               }
>>>>>         }
>>>>>         hrtick_update(rq);
>>>>>  }
>>>>> @@ -4959,8 +4999,6 @@ static int find_new_capacity(struct energy_env *eenv,
>>>>>         return idx;
>>>>>  }
>>>>>
>>>>> -static unsigned int capacity_margin = 1280; /* ~20% margin */
>>>>> -
>>>>>  static bool cpu_overutilized(int cpu)
>>>>>  {
>>>>>         return (capacity_of(cpu) * 1024) <
>>>>> --
>>>>> 1.9.1
>>>>>
>>>>
>>>
>>
>
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Juri Lelli Aug. 11, 2015, 3:07 p.m. UTC | #8
Hi Vincent,

On 11/08/15 12:41, Vincent Guittot wrote:
> On 11 August 2015 at 11:08, Juri Lelli <juri.lelli@arm.com> wrote:
>> On 10/08/15 16:07, Vincent Guittot wrote:
>>> On 10 August 2015 at 15:43, Juri Lelli <juri.lelli@arm.com> wrote:
>>>>
>>>> Hi Vincent,
>>>>
>>>> On 04/08/15 14:41, Vincent Guittot wrote:
>>>>> Hi Juri,
>>>>>
>>>>> On 7 July 2015 at 20:24, Morten Rasmussen <morten.rasmussen@arm.com> wrote:
>>>>>> From: Juri Lelli <juri.lelli@arm.com>
>>>>>>
>>>>>> Each time a task is {en,de}queued we might need to adapt the current
>>>>>> frequency to the new usage. Add triggers on {en,de}queue_task_fair() for
>>>>>> this purpose.  Only trigger a freq request if we are effectively waking up
>>>>>> or going to sleep.  Filter out load balancing related calls to reduce the
>>>>>> number of triggers.
>>>>>>
>>>>>> cc: Ingo Molnar <mingo@redhat.com>
>>>>>> cc: Peter Zijlstra <peterz@infradead.org>
>>>>>>
>>>>>> Signed-off-by: Juri Lelli <juri.lelli@arm.com>
>>>>>> ---
>>>>>>  kernel/sched/fair.c | 42 ++++++++++++++++++++++++++++++++++++++++--
>>>>>>  1 file changed, 40 insertions(+), 2 deletions(-)
>>>>>>
>>>>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>>>>>> index f74e9d2..b8627c6 100644
>>>>>> --- a/kernel/sched/fair.c
>>>>>> +++ b/kernel/sched/fair.c
>>>>>> @@ -4281,7 +4281,10 @@ static inline void hrtick_update(struct rq *rq)
>>>>>>  }
>>>>>>  #endif
>>>>>>
>>>>>> +static unsigned int capacity_margin = 1280; /* ~20% margin */
>>>>>> +
>>>>>>  static bool cpu_overutilized(int cpu);
>>>>>> +static unsigned long get_cpu_usage(int cpu);
>>>>>>  struct static_key __sched_energy_freq __read_mostly = STATIC_KEY_INIT_FALSE;
>>>>>>
>>>>>>  /*
>>>>>> @@ -4332,6 +4335,26 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>>>>>>                 if (!task_new && !rq->rd->overutilized &&
>>>>>>                     cpu_overutilized(rq->cpu))
>>>>>>                         rq->rd->overutilized = true;
>>>>>> +               /*
>>>>>> +                * We want to trigger a freq switch request only for tasks that
>>>>>> +                * are waking up; this is because we get here also during
>>>>>> +                * load balancing, but in these cases it seems wise to trigger
>>>>>> +                * as single request after load balancing is done.
>>>>>> +                *
>>>>>> +                * XXX: how about fork()? Do we need a special flag/something
>>>>>> +                *      to tell if we are here after a fork() (wakeup_task_new)?
>>>>>> +                *
>>>>>> +                * Also, we add a margin (same ~20% used for the tipping point)
>>>>>> +                * to our request to provide some head room if p's utilization
>>>>>> +                * further increases.
>>>>>> +                */
>>>>>> +               if (sched_energy_freq() && !task_new) {
>>>>>> +                       unsigned long req_cap = get_cpu_usage(cpu_of(rq));
>>>>>> +
>>>>>> +                       req_cap = req_cap * capacity_margin
>>>>>> +                                       >> SCHED_CAPACITY_SHIFT;
>>>>>> +                       cpufreq_sched_set_cap(cpu_of(rq), req_cap);
>>>>>> +               }
>>>>>>         }
>>>>>>         hrtick_update(rq);
>>>>>>  }
>>>>>> @@ -4393,6 +4416,23 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>>>>>>         if (!se) {
>>>>>>                 sub_nr_running(rq, 1);
>>>>>>                 update_rq_runnable_avg(rq, 1);
>>>>>> +               /*
>>>>>> +                * We want to trigger a freq switch request only for tasks that
>>>>>> +                * are going to sleep; this is because we get here also during
>>>>>> +                * load balancing, but in these cases it seems wise to trigger
>>>>>> +                * as single request after load balancing is done.
>>>>>> +                *
>>>>>> +                * Also, we add a margin (same ~20% used for the tipping point)
>>>>>> +                * to our request to provide some head room if p's utilization
>>>>>> +                * further increases.
>>>>>> +                */
>>>>>> +               if (sched_energy_freq() && task_sleep) {
>>>>>> +                       unsigned long req_cap = get_cpu_usage(cpu_of(rq));
>>>>>> +
>>>>>> +                       req_cap = req_cap * capacity_margin
>>>>>> +                                       >> SCHED_CAPACITY_SHIFT;
>>>>>> +                       cpufreq_sched_set_cap(cpu_of(rq), req_cap);
>>>>>
>>>>> Could you clarify why you want to trig a freq switch for tasks that
>>>>> are going to sleep ?
>>>>> The cpu_usage should not changed that much as the se_utilization of
>>>>> the entity moves from utilization_load_avg to utilization_blocked_avg
>>>>> of the rq and the usage and the freq are updated periodically.
>>>>
>>>> I think we still need to cover multiple back-to-back dequeues. Suppose
>>>> that you have, let's say, 3 tasks that get enqueued at the same time.
>>>> After some time the first one goes to sleep and its utilization, as you
>>>> say, gets moved to utilization_blocked_avg. So, nothing changes, and
>>>> the trigger is superfluous (even if no freq change I guess will be
>>>> issued as we are already servicing enough capacity). However, after a
>>>> while, the second task goes to sleep. Now we still use get_cpu_usage()
>>>> and the first task contribution in utilization_blocked_avg should have
>>>> been decayed by this time. Same thing may than happen for the third task
>>>> as well. So, if we don't check if we need to scale down in
>>>> dequeue_task_fair, it seems to me that we might miss some opportunities,
>>>> as blocked contribution of other tasks could have been successively
>>>> decayed.
>>>>
>>>> What you think?
>>>
>>> The tick is used to monitor such variation of the usage (in both way,
>>> decay of the usage of sleeping tasks and increase of the usage of
>>> running tasks). So in your example, if the duration between the sleep
>>> of the 2 tasks is significant enough, the tick will handle this
>>> variation
>>>
>>
>> The tick is used to decide if we need to scale up (to max OPP for the
>> time being), but we don't scale down. It makes more logical sense to
> 
> why don't you want to check if you need to scale down ?
> 

Well, because if I'm still executing something the cpu usage is only
subject to rise.

>> scale down at task deactivation, or wakeup after a long time, IMHO.
> 
> But waking up or going to sleep don't have any impact on the usage of
> a cpu. The only events that impact the cpu usage are:
> -task migration,

We explicitly cover this on load balancing paths.

> -new task

We cover this in enqueue_task_fair(), introducing a new flag.

> -time that elapse which can be monitored by periodically checking the usage.

Do you mean when a task utilization crosses some threshold
related to the current OPP? If that is the case, we have a
check in task_tick_fair().

> -and for nohz system when cpu enter or leave idle state
> 

We address this in dequeue_task_fair(). In particular, if
the cpu is going to be idle we don't trigger any change as
it seems not always wise to wake up a thread to just change
the OPP and then go idle; some platforms might require this
behaviour anyway, but it is probably more cpuidle/fw related?

I would also add:

- task is going to die

We address this in dequeue as well, as its contribution is
removed from usage (mod Yuyang's patches).

> waking up and going to sleep events doesn't give any useful
> information and using them to trig the monitoring of the usage
> variation doesn't give you a predictable/periodic update of it whereas
> the tick will
> 

So, one key point of this solution is to get away as much
as we can from periodic updates/sampling and move towards a
(fully) event driven approach. The event logically associated
to task_tick_fair() is when we realize that a task is going
to saturate the current capacity; in this case we trigger a
freq switch to an higher capacity. Also, if we never react
to normal wakeups (as I understand you are proposing) we might
miss some chances to adapt quickly enough. As an example, if
you have a big task that suddenly goes to sleep, and sleeps
until its decayed utilization goes almost to zero; when it
wakes up, if we don't have a trigger in enqueue_task_fair(),
we'll have to wait until the next tick to select an appropriate
(low) OPP.

Best,

- Juri

>>
>> Best,
>>
>> - Juri
>>
>>> Regards,
>>> Vincent
>>>>
>>>> Thanks,
>>>>
>>>> - Juri
>>>>
>>>>> It should be the same for the wake up of a task in enqueue_task_fair
>>>>> above, even if it's less obvious for this latter use case because the
>>>>> cpu might wake up from a long idle phase during which its
>>>>> utilization_blocked_avg has not been updated. Nevertheless, a trig of
>>>>> the freq switch at wake up of the cpu once its usage has been updated
>>>>> should do the job.
>>>>>
>>>>> So tick, migration of tasks, new tasks, entering/leaving idle state of
>>>>> cpu should be enough to trig freq switch
>>>>>
>>>>> Regards,
>>>>> Vincent
>>>>>
>>>>>
>>>>>> +               }
>>>>>>         }
>>>>>>         hrtick_update(rq);
>>>>>>  }
>>>>>> @@ -4959,8 +4999,6 @@ static int find_new_capacity(struct energy_env *eenv,
>>>>>>         return idx;
>>>>>>  }
>>>>>>
>>>>>> -static unsigned int capacity_margin = 1280; /* ~20% margin */
>>>>>> -
>>>>>>  static bool cpu_overutilized(int cpu)
>>>>>>  {
>>>>>>         return (capacity_of(cpu) * 1024) <
>>>>>> --
>>>>>> 1.9.1
>>>>>>
>>>>>
>>>>
>>>
>>
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Vincent Guittot Aug. 11, 2015, 4:37 p.m. UTC | #9
On 11 August 2015 at 17:07, Juri Lelli <juri.lelli@arm.com> wrote:
> Hi Vincent,
>
> On 11/08/15 12:41, Vincent Guittot wrote:
>> On 11 August 2015 at 11:08, Juri Lelli <juri.lelli@arm.com> wrote:
>>> On 10/08/15 16:07, Vincent Guittot wrote:
>>>> On 10 August 2015 at 15:43, Juri Lelli <juri.lelli@arm.com> wrote:
>>>>>
>>>>> Hi Vincent,
>>>>>
>>>>> On 04/08/15 14:41, Vincent Guittot wrote:
>>>>>> Hi Juri,
>>>>>>
>>>>>> On 7 July 2015 at 20:24, Morten Rasmussen <morten.rasmussen@arm.com> wrote:
>>>>>>> From: Juri Lelli <juri.lelli@arm.com>
>>>>>>>
>>>>>>> Each time a task is {en,de}queued we might need to adapt the current
>>>>>>> frequency to the new usage. Add triggers on {en,de}queue_task_fair() for
>>>>>>> this purpose.  Only trigger a freq request if we are effectively waking up
>>>>>>> or going to sleep.  Filter out load balancing related calls to reduce the
>>>>>>> number of triggers.
>>>>>>>
>>>>>>> cc: Ingo Molnar <mingo@redhat.com>
>>>>>>> cc: Peter Zijlstra <peterz@infradead.org>
>>>>>>>
>>>>>>> Signed-off-by: Juri Lelli <juri.lelli@arm.com>
>>>>>>> ---
>>>>>>>  kernel/sched/fair.c | 42 ++++++++++++++++++++++++++++++++++++++++--
>>>>>>>  1 file changed, 40 insertions(+), 2 deletions(-)
>>>>>>>
>>>>>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>>>>>>> index f74e9d2..b8627c6 100644
>>>>>>> --- a/kernel/sched/fair.c
>>>>>>> +++ b/kernel/sched/fair.c
>>>>>>> @@ -4281,7 +4281,10 @@ static inline void hrtick_update(struct rq *rq)
>>>>>>>  }
>>>>>>>  #endif
>>>>>>>
>>>>>>> +static unsigned int capacity_margin = 1280; /* ~20% margin */
>>>>>>> +
>>>>>>>  static bool cpu_overutilized(int cpu);
>>>>>>> +static unsigned long get_cpu_usage(int cpu);
>>>>>>>  struct static_key __sched_energy_freq __read_mostly = STATIC_KEY_INIT_FALSE;
>>>>>>>
>>>>>>>  /*
>>>>>>> @@ -4332,6 +4335,26 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>>>>>>>                 if (!task_new && !rq->rd->overutilized &&
>>>>>>>                     cpu_overutilized(rq->cpu))
>>>>>>>                         rq->rd->overutilized = true;
>>>>>>> +               /*
>>>>>>> +                * We want to trigger a freq switch request only for tasks that
>>>>>>> +                * are waking up; this is because we get here also during
>>>>>>> +                * load balancing, but in these cases it seems wise to trigger
>>>>>>> +                * as single request after load balancing is done.
>>>>>>> +                *
>>>>>>> +                * XXX: how about fork()? Do we need a special flag/something
>>>>>>> +                *      to tell if we are here after a fork() (wakeup_task_new)?
>>>>>>> +                *
>>>>>>> +                * Also, we add a margin (same ~20% used for the tipping point)
>>>>>>> +                * to our request to provide some head room if p's utilization
>>>>>>> +                * further increases.
>>>>>>> +                */
>>>>>>> +               if (sched_energy_freq() && !task_new) {
>>>>>>> +                       unsigned long req_cap = get_cpu_usage(cpu_of(rq));
>>>>>>> +
>>>>>>> +                       req_cap = req_cap * capacity_margin
>>>>>>> +                                       >> SCHED_CAPACITY_SHIFT;
>>>>>>> +                       cpufreq_sched_set_cap(cpu_of(rq), req_cap);
>>>>>>> +               }
>>>>>>>         }
>>>>>>>         hrtick_update(rq);
>>>>>>>  }
>>>>>>> @@ -4393,6 +4416,23 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>>>>>>>         if (!se) {
>>>>>>>                 sub_nr_running(rq, 1);
>>>>>>>                 update_rq_runnable_avg(rq, 1);
>>>>>>> +               /*
>>>>>>> +                * We want to trigger a freq switch request only for tasks that
>>>>>>> +                * are going to sleep; this is because we get here also during
>>>>>>> +                * load balancing, but in these cases it seems wise to trigger
>>>>>>> +                * as single request after load balancing is done.
>>>>>>> +                *
>>>>>>> +                * Also, we add a margin (same ~20% used for the tipping point)
>>>>>>> +                * to our request to provide some head room if p's utilization
>>>>>>> +                * further increases.
>>>>>>> +                */
>>>>>>> +               if (sched_energy_freq() && task_sleep) {
>>>>>>> +                       unsigned long req_cap = get_cpu_usage(cpu_of(rq));
>>>>>>> +
>>>>>>> +                       req_cap = req_cap * capacity_margin
>>>>>>> +                                       >> SCHED_CAPACITY_SHIFT;
>>>>>>> +                       cpufreq_sched_set_cap(cpu_of(rq), req_cap);
>>>>>>
>>>>>> Could you clarify why you want to trig a freq switch for tasks that
>>>>>> are going to sleep ?
>>>>>> The cpu_usage should not changed that much as the se_utilization of
>>>>>> the entity moves from utilization_load_avg to utilization_blocked_avg
>>>>>> of the rq and the usage and the freq are updated periodically.
>>>>>
>>>>> I think we still need to cover multiple back-to-back dequeues. Suppose
>>>>> that you have, let's say, 3 tasks that get enqueued at the same time.
>>>>> After some time the first one goes to sleep and its utilization, as you
>>>>> say, gets moved to utilization_blocked_avg. So, nothing changes, and
>>>>> the trigger is superfluous (even if no freq change I guess will be
>>>>> issued as we are already servicing enough capacity). However, after a
>>>>> while, the second task goes to sleep. Now we still use get_cpu_usage()
>>>>> and the first task contribution in utilization_blocked_avg should have
>>>>> been decayed by this time. Same thing may than happen for the third task
>>>>> as well. So, if we don't check if we need to scale down in
>>>>> dequeue_task_fair, it seems to me that we might miss some opportunities,
>>>>> as blocked contribution of other tasks could have been successively
>>>>> decayed.
>>>>>
>>>>> What you think?
>>>>
>>>> The tick is used to monitor such variation of the usage (in both way,
>>>> decay of the usage of sleeping tasks and increase of the usage of
>>>> running tasks). So in your example, if the duration between the sleep
>>>> of the 2 tasks is significant enough, the tick will handle this
>>>> variation
>>>>
>>>
>>> The tick is used to decide if we need to scale up (to max OPP for the
>>> time being), but we don't scale down. It makes more logical sense to
>>
>> why don't you want to check if you need to scale down ?
>>
>
> Well, because if I'm still executing something the cpu usage is only
> subject to raise.

This is only true for systems with NOHZ idle

>
>>> scale down at task deactivation, or wakeup after a long time, IMHO.
>>
>> But waking up or going to sleep don't have any impact on the usage of
>> a cpu. The only events that impact the cpu usage are:
>> -task migration,
>
> We explicitly cover this on load balancing paths.
>
>> -new task
>
> We cover this in enqueue_task_fair(), introducing a new flag.
>
>> -time that elapse which can be monitored by periodically checking the usage.
>
> Do you mean when a task utilization crosses some threshold
> related to the current OPP? If that is the case, we have a
> check in task_tick_fair().
>
>> -and for nohz system when cpu enter or leave idle state
>>
>
> We address this in dequeue_task_fair(). In particular, if
> the cpu is going to be idle we don't trigger any change as
> it seems not always wise to wake up a thread to just change
> the OPP and the go idle; some platforms might require this
> behaviour anyway, but it probably more cpuidle/fw related?

I would say that it's interesting to notify sched-dvfs that a cpu
becomes idle because we could decrease the opp of a cluster of cpus
that share the same clock if this cpu is the one that requires the max
capacity of the cluster (and other cpus are still running).

>
> I would also add:
>
> - task is going to die
>
> We address this in dequeue as well, as its contribution is
> removed from usage (mod Yuyang's patches).
>
>> waking up and going to sleep events doesn't give any useful
>> information and using them to trig the monitoring of the usage
>> variation doesn't give you a predictable/periodic update of it whereas
>> the tick will
>>
>
> So, one key point of this solution is to get away as much
> as we can from periodic updates/sampling and move towards a
> (fully) event driven approach. The event logically associated
> to task_tick_fair() is when we realize that a task is going
> to saturate the current capacity; in this case we trigger a
> freq switch to an higher capacity. Also, if we never react
> to normal wakeups (as I understand you are proposing) we might
> miss some chances to adapt quickly enough. As an example, if
> you have a big task that suddenly goes to sleep, and sleeps
> until its decayed utilization goes almost to zero; when it
> wakes up, if we don't have a trigger in enqueue_task_fair(),
> we'll have to wait until the next tick to select an appropriate
> (low) OPP.

I assume that the cpu is idle in this case. This situation only
happens on NOHZ idle systems because the tick is disabled and you have
to update statistics when leaving idle, as is done for the jiffies or
the cpu_load array. So you should track cpu enter/leave idle (for nohz
systems only) instead of tracking all task wake up/sleep events.

So you can either use update_cpu_load_nohz like it is already done for
cpu_load array
or you should use some conditions like below if you want to stay in
enqueue/dequeue_task_fair but task wake up or sleep event are not the
right condition
if (!(flags & ENQUEUE_WAKEUP) || rq->nr_running == 1 ) in enqueue_task_fair
and
if (!task_sleep || rq->nr_running == 0) in dequeue_task_fair

We can probably optimize this by using rq->cfs.h_nr_running instead of
rq->nr_running, as only cfs tasks really modify the usage

Regards,
Vincent

>
> Best,
>
> - Juri
>
>>>
>>> Best,
>>>
>>> - Juri
>>>
>>>> Regards,
>>>> Vincent
>>>>>
>>>>> Thanks,
>>>>>
>>>>> - Juri
>>>>>
>>>>>> It should be the same for the wake up of a task in enqueue_task_fair
>>>>>> above, even if it's less obvious for this latter use case because the
>>>>>> cpu might wake up from a long idle phase during which its
>>>>>> utilization_blocked_avg has not been updated. Nevertheless, a trig of
>>>>>> the freq switch at wake up of the cpu once its usage has been updated
>>>>>> should do the job.
>>>>>>
>>>>>> So tick, migration of tasks, new tasks, entering/leaving idle state of
>>>>>> cpu should be enough to trig freq switch
>>>>>>
>>>>>> Regards,
>>>>>> Vincent
>>>>>>
>>>>>>
>>>>>>> +               }
>>>>>>>         }
>>>>>>>         hrtick_update(rq);
>>>>>>>  }
>>>>>>> @@ -4959,8 +4999,6 @@ static int find_new_capacity(struct energy_env *eenv,
>>>>>>>         return idx;
>>>>>>>  }
>>>>>>>
>>>>>>> -static unsigned int capacity_margin = 1280; /* ~20% margin */
>>>>>>> -
>>>>>>>  static bool cpu_overutilized(int cpu)
>>>>>>>  {
>>>>>>>         return (capacity_of(cpu) * 1024) <
>>>>>>> --
>>>>>>> 1.9.1
>>>>>>>
>>>>>>
>>>>>
>>>>
>>>
>>
>
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Juri Lelli Aug. 12, 2015, 3:15 p.m. UTC | #10
On 11/08/15 17:37, Vincent Guittot wrote:
> On 11 August 2015 at 17:07, Juri Lelli <juri.lelli@arm.com> wrote:
>> Hi Vincent,
>>
>> On 11/08/15 12:41, Vincent Guittot wrote:
>>> On 11 August 2015 at 11:08, Juri Lelli <juri.lelli@arm.com> wrote:
>>>> On 10/08/15 16:07, Vincent Guittot wrote:
>>>>> On 10 August 2015 at 15:43, Juri Lelli <juri.lelli@arm.com> wrote:
>>>>>>
>>>>>> Hi Vincent,
>>>>>>
>>>>>> On 04/08/15 14:41, Vincent Guittot wrote:
>>>>>>> Hi Juri,
>>>>>>>
>>>>>>> On 7 July 2015 at 20:24, Morten Rasmussen <morten.rasmussen@arm.com> wrote:
>>>>>>>> From: Juri Lelli <juri.lelli@arm.com>
>>>>>>>>
>>>>>>>> Each time a task is {en,de}queued we might need to adapt the current
>>>>>>>> frequency to the new usage. Add triggers on {en,de}queue_task_fair() for
>>>>>>>> this purpose.  Only trigger a freq request if we are effectively waking up
>>>>>>>> or going to sleep.  Filter out load balancing related calls to reduce the
>>>>>>>> number of triggers.
>>>>>>>>
>>>>>>>> cc: Ingo Molnar <mingo@redhat.com>
>>>>>>>> cc: Peter Zijlstra <peterz@infradead.org>
>>>>>>>>
>>>>>>>> Signed-off-by: Juri Lelli <juri.lelli@arm.com>
>>>>>>>> ---
>>>>>>>>  kernel/sched/fair.c | 42 ++++++++++++++++++++++++++++++++++++++++--
>>>>>>>>  1 file changed, 40 insertions(+), 2 deletions(-)
>>>>>>>>
>>>>>>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>>>>>>>> index f74e9d2..b8627c6 100644
>>>>>>>> --- a/kernel/sched/fair.c
>>>>>>>> +++ b/kernel/sched/fair.c
>>>>>>>> @@ -4281,7 +4281,10 @@ static inline void hrtick_update(struct rq *rq)
>>>>>>>>  }
>>>>>>>>  #endif
>>>>>>>>
>>>>>>>> +static unsigned int capacity_margin = 1280; /* ~20% margin */
>>>>>>>> +
>>>>>>>>  static bool cpu_overutilized(int cpu);
>>>>>>>> +static unsigned long get_cpu_usage(int cpu);
>>>>>>>>  struct static_key __sched_energy_freq __read_mostly = STATIC_KEY_INIT_FALSE;
>>>>>>>>
>>>>>>>>  /*
>>>>>>>> @@ -4332,6 +4335,26 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>>>>>>>>                 if (!task_new && !rq->rd->overutilized &&
>>>>>>>>                     cpu_overutilized(rq->cpu))
>>>>>>>>                         rq->rd->overutilized = true;
>>>>>>>> +               /*
>>>>>>>> +                * We want to trigger a freq switch request only for tasks that
>>>>>>>> +                * are waking up; this is because we get here also during
>>>>>>>> +                * load balancing, but in these cases it seems wise to trigger
>>>>>>>> +                * as single request after load balancing is done.
>>>>>>>> +                *
>>>>>>>> +                * XXX: how about fork()? Do we need a special flag/something
>>>>>>>> +                *      to tell if we are here after a fork() (wakeup_task_new)?
>>>>>>>> +                *
>>>>>>>> +                * Also, we add a margin (same ~20% used for the tipping point)
>>>>>>>> +                * to our request to provide some head room if p's utilization
>>>>>>>> +                * further increases.
>>>>>>>> +                */
>>>>>>>> +               if (sched_energy_freq() && !task_new) {
>>>>>>>> +                       unsigned long req_cap = get_cpu_usage(cpu_of(rq));
>>>>>>>> +
>>>>>>>> +                       req_cap = req_cap * capacity_margin
>>>>>>>> +                                       >> SCHED_CAPACITY_SHIFT;
>>>>>>>> +                       cpufreq_sched_set_cap(cpu_of(rq), req_cap);
>>>>>>>> +               }
>>>>>>>>         }
>>>>>>>>         hrtick_update(rq);
>>>>>>>>  }
>>>>>>>> @@ -4393,6 +4416,23 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>>>>>>>>         if (!se) {
>>>>>>>>                 sub_nr_running(rq, 1);
>>>>>>>>                 update_rq_runnable_avg(rq, 1);
>>>>>>>> +               /*
>>>>>>>> +                * We want to trigger a freq switch request only for tasks that
>>>>>>>> +                * are going to sleep; this is because we get here also during
>>>>>>>> +                * load balancing, but in these cases it seems wise to trigger
>>>>>>>> +                * as single request after load balancing is done.
>>>>>>>> +                *
>>>>>>>> +                * Also, we add a margin (same ~20% used for the tipping point)
>>>>>>>> +                * to our request to provide some head room if p's utilization
>>>>>>>> +                * further increases.
>>>>>>>> +                */
>>>>>>>> +               if (sched_energy_freq() && task_sleep) {
>>>>>>>> +                       unsigned long req_cap = get_cpu_usage(cpu_of(rq));
>>>>>>>> +
>>>>>>>> +                       req_cap = req_cap * capacity_margin
>>>>>>>> +                                       >> SCHED_CAPACITY_SHIFT;
>>>>>>>> +                       cpufreq_sched_set_cap(cpu_of(rq), req_cap);
>>>>>>>
>>>>>>> Could you clarify why you want to trig a freq switch for tasks that
>>>>>>> are going to sleep ?
>>>>>>> The cpu_usage should not changed that much as the se_utilization of
>>>>>>> the entity moves from utilization_load_avg to utilization_blocked_avg
>>>>>>> of the rq and the usage and the freq are updated periodically.
>>>>>>
>>>>>> I think we still need to cover multiple back-to-back dequeues. Suppose
>>>>>> that you have, let's say, 3 tasks that get enqueued at the same time.
>>>>>> After some time the first one goes to sleep and its utilization, as you
>>>>>> say, gets moved to utilization_blocked_avg. So, nothing changes, and
>>>>>> the trigger is superfluous (even if no freq change I guess will be
>>>>>> issued as we are already servicing enough capacity). However, after a
>>>>>> while, the second task goes to sleep. Now we still use get_cpu_usage()
>>>>>> and the first task contribution in utilization_blocked_avg should have
>>>>>> been decayed by this time. Same thing may than happen for the third task
>>>>>> as well. So, if we don't check if we need to scale down in
>>>>>> dequeue_task_fair, it seems to me that we might miss some opportunities,
>>>>>> as blocked contribution of other tasks could have been successively
>>>>>> decayed.
>>>>>>
>>>>>> What you think?
>>>>>
>>>>> The tick is used to monitor such variation of the usage (in both way,
>>>>> decay of the usage of sleeping tasks and increase of the usage of
>>>>> running tasks). So in your example, if the duration between the sleep
>>>>> of the 2 tasks is significant enough, the tick will handle this
>>>>> variation
>>>>>
>>>>
>>>> The tick is used to decide if we need to scale up (to max OPP for the
>>>> time being), but we don't scale down. It makes more logical sense to
>>>
>>> why don't you want to check if you need to scale down ?
>>>
>>
>> Well, because if I'm still executing something the cpu usage is only
>> subject to raise.
> 
> This is only true for  system with NO HZ idle
> 

Well, even with !NO_HZ_IDLE usage only decreases when cpu is idle. But,
I think I got your point; for !NO_HZ_IDLE configurations we might end
up not scaling down frequency even if we have the tick running and
the cpu is idle. I might need some more time to think this through, but
it seems to me that we are still fine without an explicit trigger in
task_tick_fair(); if we are running a !NO_HZ_IDLE system we are probably
not so much concerned about power savings and still we react
to tasks waking up, sleeping, leaving or moving around (which seems the
real important events to me); OTOH, we might add that trigger, but this
will generate unconditional checks at tick time for NO_HZ_IDLE
configurations, for a benefit that still seems not completely
clear.

>>
>>>> scale down at task deactivation, or wakeup after a long time, IMHO.
>>>
>>> But waking up or going to sleep don't have any impact on the usage of
>>> a cpu. The only events that impact the cpu usage are:
>>> -task migration,
>>
>> We explicitly cover this on load balancing paths.
>>
>>> -new task
>>
>> We cover this in enqueue_task_fair(), introducing a new flag.
>>
>>> -time that elapse which can be monitored by periodically checking the usage.
>>
>> Do you mean when a task utilization crosses some threshold
>> related to the current OPP? If that is the case, we have a
>> check in task_tick_fair().
>>
>>> -and for nohz system when cpu enter or leave idle state
>>>
>>
>> We address this in dequeue_task_fair(). In particular, if
>> the cpu is going to be idle we don't trigger any change as
>> it seems not always wise to wake up a thread to just change
>> the OPP and the go idle; some platforms might require this
>> behaviour anyway, but it probably more cpuidle/fw related?
> 
> I would say that it's interesting to notifiy sched-dvfs that a cpu
> becomes idle because we could decrease the opp of a cluster of cpus
> that share the same clock if this cpu is the one that requires the max
> capacity of the cluster (and other cpus are still running).
> 

Well, we reset the capacity request of the cpu that is going idle.
The idea is that the next event on one of the other related cpus
will update the cluster freq correctly. If any other cpu in the
cluster is running something we keep the same frequency until
the task running on that cpu goes to sleep; this seems fine to
me because that task might end up being heavy and we saved a
back to back lower to higher OPP switch; if the task is instead
light it will probably be dequeued pretty soon, and at that time
we switch to a lower OPP (since we cleared the idle cpu request
before). Also, if the other cpus in the cluster are all idle
we'll most probably enter an idle state, so no freq switch is
most likely required.

>>
>> I would also add:
>>
>> - task is going to die
>>
>> We address this in dequeue as well, as its contribution is
>> removed from usage (mod Yuyang's patches).
>>
>>> waking up and going to sleep events doesn't give any useful
>>> information and using them to trig the monitoring of the usage
>>> variation doesn't give you a predictable/periodic update of it whereas
>>> the tick will
>>>
>>
>> So, one key point of this solution is to get away as much
>> as we can from periodic updates/sampling and move towards a
>> (fully) event driven approach. The event logically associated
>> to task_tick_fair() is when we realize that a task is going
>> to saturate the current capacity; in this case we trigger a
>> freq switch to an higher capacity. Also, if we never react
>> to normal wakeups (as I understand you are proposing) we might
>> miss some chances to adapt quickly enough. As an example, if
>> you have a big task that suddenly goes to sleep, and sleeps
>> until its decayed utilization goes almost to zero; when it
>> wakes up, if we don't have a trigger in enqueue_task_fair(),
>> we'll have to wait until the next tick to select an appropriate
>> (low) OPP.
> 
> I assume that the cpu is idle in this case. This situation only
> happens on Nohz idle system because tick is disable and you have to
> update statistics when leaving idle as it is done for the jiffies or
> the cpu_load array. So you should track cpu enter/leave idle (for nohz
> system only) instead of tracking all tasks wake up/sleep events.
> 

I think I already replied to this in what above. Did I? :)

> So you can either use update_cpu_load_nohz like it is already done for
> cpu_load array
> or you should use some conditions like below if you want to stay in
> enqueue/dequeue_task_fair but task wake up or sleep event are not the
> right condition
> if (!(flags & ENQUEUE_WAKEUP) || rq->nr_running == 1 ) in enqueue_task_fair
> and
> if (!task_sleep || rq->nr_running == 0) in dequeue_task_fair
> 
> We can probably optimized by using  rq->cfs.h_nr_running instead of
> rq->nr_running as only cfs tasks really modifies the usage
> 

I already filter out enqueues/dequeues that comes from load balancing;
and I use cfs.nr_running because, as you say, we currently work with CFS
tasks only.

Thanks,

- Juri

> Regards,
> Vincent
> 
>>
>> Best,
>>
>> - Juri
>>
>>>>
>>>> Best,
>>>>
>>>> - Juri
>>>>
>>>>> Regards,
>>>>> Vincent
>>>>>>
>>>>>> Thanks,
>>>>>>
>>>>>> - Juri
>>>>>>
>>>>>>> It should be the same for the wake up of a task in enqueue_task_fair
>>>>>>> above, even if it's less obvious for this latter use case because the
>>>>>>> cpu might wake up from a long idle phase during which its
>>>>>>> utilization_blocked_avg has not been updated. Nevertheless, a trig of
>>>>>>> the freq switch at wake up of the cpu once its usage has been updated
>>>>>>> should do the job.
>>>>>>>
>>>>>>> So tick, migration of tasks, new tasks, entering/leaving idle state of
>>>>>>> cpu should be enough to trig freq switch
>>>>>>>
>>>>>>> Regards,
>>>>>>> Vincent
>>>>>>>
>>>>>>>
>>>>>>>> +               }
>>>>>>>>         }
>>>>>>>>         hrtick_update(rq);
>>>>>>>>  }
>>>>>>>> @@ -4959,8 +4999,6 @@ static int find_new_capacity(struct energy_env *eenv,
>>>>>>>>         return idx;
>>>>>>>>  }
>>>>>>>>
>>>>>>>> -static unsigned int capacity_margin = 1280; /* ~20% margin */
>>>>>>>> -
>>>>>>>>  static bool cpu_overutilized(int cpu)
>>>>>>>>  {
>>>>>>>>         return (capacity_of(cpu) * 1024) <
>>>>>>>> --
>>>>>>>> 1.9.1
>>>>>>>>
>>>>>>>
>>>>>>
>>>>>
>>>>
>>>
>>
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Vincent Guittot Aug. 13, 2015, 12:08 p.m. UTC | #11
On 12 August 2015 at 17:15, Juri Lelli <juri.lelli@arm.com> wrote:
> On 11/08/15 17:37, Vincent Guittot wrote:
>> On 11 August 2015 at 17:07, Juri Lelli <juri.lelli@arm.com> wrote:
>>> Hi Vincent,
>>>
>>> On 11/08/15 12:41, Vincent Guittot wrote:
>>>> On 11 August 2015 at 11:08, Juri Lelli <juri.lelli@arm.com> wrote:
>>>>> On 10/08/15 16:07, Vincent Guittot wrote:
>>>>>> On 10 August 2015 at 15:43, Juri Lelli <juri.lelli@arm.com> wrote:
>>>>>>>
>>>>>>> Hi Vincent,
>>>>>>>
>>>>>>> On 04/08/15 14:41, Vincent Guittot wrote:
>>>>>>>> Hi Juri,
>>>>>>>>
>>>>>>>> On 7 July 2015 at 20:24, Morten Rasmussen <morten.rasmussen@arm.com> wrote:
>>>>>>>>> From: Juri Lelli <juri.lelli@arm.com>

 [snip]

>>>>>>>>>  }
>>>>>>>>> @@ -4393,6 +4416,23 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>>>>>>>>>         if (!se) {
>>>>>>>>>                 sub_nr_running(rq, 1);
>>>>>>>>>                 update_rq_runnable_avg(rq, 1);
>>>>>>>>> +               /*
>>>>>>>>> +                * We want to trigger a freq switch request only for tasks that
>>>>>>>>> +                * are going to sleep; this is because we get here also during
>>>>>>>>> +                * load balancing, but in these cases it seems wise to trigger
>>>>>>>>> +                * as single request after load balancing is done.
>>>>>>>>> +                *
>>>>>>>>> +                * Also, we add a margin (same ~20% used for the tipping point)
>>>>>>>>> +                * to our request to provide some head room if p's utilization
>>>>>>>>> +                * further increases.
>>>>>>>>> +                */
>>>>>>>>> +               if (sched_energy_freq() && task_sleep) {
>>>>>>>>> +                       unsigned long req_cap = get_cpu_usage(cpu_of(rq));
>>>>>>>>> +
>>>>>>>>> +                       req_cap = req_cap * capacity_margin
>>>>>>>>> +                                       >> SCHED_CAPACITY_SHIFT;
>>>>>>>>> +                       cpufreq_sched_set_cap(cpu_of(rq), req_cap);
>>>>>>>>
>>>>>>>> Could you clarify why you want to trig a freq switch for tasks that
>>>>>>>> are going to sleep ?
>>>>>>>> The cpu_usage should not changed that much as the se_utilization of
>>>>>>>> the entity moves from utilization_load_avg to utilization_blocked_avg
>>>>>>>> of the rq and the usage and the freq are updated periodically.
>>>>>>>
>>>>>>> I think we still need to cover multiple back-to-back dequeues. Suppose
>>>>>>> that you have, let's say, 3 tasks that get enqueued at the same time.
>>>>>>> After some time the first one goes to sleep and its utilization, as you
>>>>>>> say, gets moved to utilization_blocked_avg. So, nothing changes, and
>>>>>>> the trigger is superfluous (even if no freq change I guess will be
>>>>>>> issued as we are already servicing enough capacity). However, after a
>>>>>>> while, the second task goes to sleep. Now we still use get_cpu_usage()
>>>>>>> and the first task contribution in utilization_blocked_avg should have
>>>>>>> been decayed by this time. Same thing may than happen for the third task
>>>>>>> as well. So, if we don't check if we need to scale down in
>>>>>>> dequeue_task_fair, it seems to me that we might miss some opportunities,
>>>>>>> as blocked contribution of other tasks could have been successively
>>>>>>> decayed.
>>>>>>>
>>>>>>> What you think?
>>>>>>
>>>>>> The tick is used to monitor such variation of the usage (in both way,
>>>>>> decay of the usage of sleeping tasks and increase of the usage of
>>>>>> running tasks). So in your example, if the duration between the sleep
>>>>>> of the 2 tasks is significant enough, the tick will handle this
>>>>>> variation
>>>>>>
>>>>>
>>>>> The tick is used to decide if we need to scale up (to max OPP for the
>>>>> time being), but we don't scale down. It makes more logical sense to
>>>>
>>>> why don't you want to check if you need to scale down ?
>>>>
>>>
>>> Well, because if I'm still executing something the cpu usage is only
>>> subject to raise.
>>
>> This is only true for  system with NO HZ idle
>>
>
> Well, even with !NO_HZ_IDLE usage only decreases when cpu is idle. But,

Well, thanks for this obvious statement that usage only decreases when
cpu is idle but my question has never been about usage variation of
idle/running cpu but about the tick.

> I think I got your point; for !NO_HZ_IDLE configurations we might end
> up not scaling down frequency even if we have the tick running and
> the cpu is idle. I might need some more time to think this through, but
> it seems to me that we are still fine without an explicit trigger in
> task_tick_fair(); if we are running a !NO_HZ_IDLE system we are probably
> not so much concerned about power savings and still we react
> to tasks waking up, sleeping, leaving or moving around (which seems the
> real important events to me); OTOH, we might add that trigger, but this
> will generate unconditional checks at tick time for NO_HZ_IDLE

That will be far less critical than unconditionally checking during
every task wake up or sleep. A task that wakes up every 200us will
generate many more checks in the wake-up hot path of a cpu that is
already busy with another task

> configurations, for a benefit that it seems to be still not completely
> clear.
>
>>>
>>>>> scale down at task deactivation, or wakeup after a long time, IMHO.
>>>>
>>>> But waking up or going to sleep don't have any impact on the usage of
>>>> a cpu. The only events that impact the cpu usage are:
>>>> -task migration,
>>>
>>> We explicitly cover this on load balancing paths.

But tasks can migrate outside of load balancing — at wake up, for
example — and AFAICT, you don't use this event to notify the decrease
of the usage of the cpu and check whether a new OPP would fit better
with the new usage.

>>>
>>>> -new task
>>>
>>> We cover this in enqueue_task_fair(), introducing a new flag.
>>>
>>>> -time that elapse which can be monitored by periodically checking the usage.
>>>
>>> Do you mean when a task utilization crosses some threshold
>>> related to the current OPP? If that is the case, we have a
>>> check in task_tick_fair().
>>>
>>>> -and for nohz system when cpu enter or leave idle state
>>>>
>>>
>>> We address this in dequeue_task_fair(). In particular, if
>>> the cpu is going to be idle we don't trigger any change as
>>> it seems not always wise to wake up a thread to just change
>>> the OPP and the go idle; some platforms might require this
>>> behaviour anyway, but it probably more cpuidle/fw related?
>>
>> I would say that it's interesting to notifiy sched-dvfs that a cpu
>> becomes idle because we could decrease the opp of a cluster of cpus
>> that share the same clock if this cpu is the one that requires the max
>> capacity of the cluster (and other cpus are still running).
>>
>
> Well, we reset the capacity request of the cpu that is going idle.

And i'm fine with the fact that you use the cpu idle event

> The idea is that the next event on one of the other related cpus
> will update the cluster freq correctly. If any other cpu in the
> cluster is running something we keep the same frequency until
> the task running on that cpu goes to sleep; this seems fine to
> me because that task might end up being heavy and we saved a
> back to back lower to higher OPP switch; if the task is instead
> light it will probably be dequeued pretty soon, and at that time
> we switch to a lower OPP (since we cleared the idle cpu request
> before). Also, if the other cpus in the cluster are all idle
> we'll most probably enter an idle state, so no freq switch is
> most likely required.
>
>>>
>>> I would also add:
>>>
>>> - task is going to die
>>>
>>> We address this in dequeue as well, as its contribution is
>>> removed from usage (mod Yuyang's patches).
>>>
>>>> waking up and going to sleep events doesn't give any useful
>>>> information and using them to trig the monitoring of the usage
>>>> variation doesn't give you a predictable/periodic update of it whereas
>>>> the tick will
>>>>
>>>
>>> So, one key point of this solution is to get away as much
>>> as we can from periodic updates/sampling and move towards a
>>> (fully) event driven approach. The event logically associated
>>> to task_tick_fair() is when we realize that a task is going
>>> to saturate the current capacity; in this case we trigger a
>>> freq switch to an higher capacity. Also, if we never react
>>> to normal wakeups (as I understand you are proposing) we might
>>> miss some chances to adapt quickly enough. As an example, if
>>> you have a big task that suddenly goes to sleep, and sleeps
>>> until its decayed utilization goes almost to zero; when it
>>> wakes up, if we don't have a trigger in enqueue_task_fair(),

I'm not against having a trigger in enqueue; I'm against blindly
checking every task wake up in order to be sure to catch the useful
event, like the cpu-leaving-idle event of your example

>>> we'll have to wait until the next tick to select an appropriate
>>> (low) OPP.
>>
>> I assume that the cpu is idle in this case. This situation only
>> happens on Nohz idle system because tick is disable and you have to
>> update statistics when leaving idle as it is done for the jiffies or
>> the cpu_load array. So you should track cpu enter/leave idle (for nohz
>> system only) instead of tracking all tasks wake up/sleep events.
>>
>
> I think I already replied to this in what above. Did I? :)

In fact, it was not a question; I was just stating that using all wake
up / sleep events to be sure to trigger the check of cpu capacity when
the cpu leaves an idle phase (and especially a long one, as in your
example above) is wrong. You have to use the leave-idle event instead
of checking all wake up events to be sure to catch the right one. You
say that you want to get away as much as possible from periodic
updates/sampling, but that is exactly what you do with these 2 events.
Using them only enables you to periodically check if the capacity has
changed since the last time, like the tick already does. But instead of
using a periodic and controlled event, you use these random (with
regard to capacity evolution) and uncontrolled events in order to
catch useful changes. As an example, if a cpu runs a task and there is
a short running task that wakes up every 100us and runs for 100us, you
will call cpufreq_sched_set_cap 5000 times per second for no good
reason, as you already have the tick to periodically check the
evolution of the usage.

>
>> So you can either use update_cpu_load_nohz like it is already done for
>> cpu_load array
>> or you should use some conditions like below if you want to stay in
>> enqueue/dequeue_task_fair but task wake up or sleep event are not the
>> right condition
>> if (!(flags & ENQUEUE_WAKEUP) || rq->nr_running == 1 ) in enqueue_task_fair
>> and
>> if (!task_sleep || rq->nr_running == 0) in dequeue_task_fair
>>
>> We can probably optimized by using  rq->cfs.h_nr_running instead of
>> rq->nr_running as only cfs tasks really modifies the usage
>>
>
> I already filter out enqueues/dequeues that comes from load balancing;
> and I use cfs.nr_running because, as you say, we currently work with CFS
> tasks only.

But not for the enqueue where you should use it instead of all wake up events.

Just to be clear: using all enqueue/dequeue events (including task
wake up and sleep) to check a change of the usage of a cpu made a
lot of sense when Mike sent his v3 of the scheduler driven cpu
frequency selection, because the usage was not accounting for blocked
tasks at that time, so it was changing on all enqueue/dequeue events.
But this doesn't make sense in this patchset, which accounts for the
blocked tasks in the usage of the cpu, and more generally now that
Yuyang's patch set has been accepted.

Regards,
Vincent

>
> Thanks,
>
> - Juri
>
>> Regards,
>> Vincent
>>
>>>
>>> Best,
>>>
>>> - Juri
>>>
>>>>>
>>>>> Best,
>>>>>
>>>>> - Juri
>>>>>
>>>>>> Regards,
>>>>>> Vincent
>>>>>>>
>>>>>>> Thanks,
>>>>>>>
>>>>>>> - Juri
>>>>>>>
>>>>>>>> It should be the same for the wake up of a task in enqueue_task_fair
>>>>>>>> above, even if it's less obvious for this latter use case because the
>>>>>>>> cpu might wake up from a long idle phase during which its
>>>>>>>> utilization_blocked_avg has not been updated. Nevertheless, a trig of
>>>>>>>> the freq switch at wake up of the cpu once its usage has been updated
>>>>>>>> should do the job.
>>>>>>>>
>>>>>>>> So tick, migration of tasks, new tasks, entering/leaving idle state of
>>>>>>>> cpu should be enough to trig freq switch
>>>>>>>>
>>>>>>>> Regards,
>>>>>>>> Vincent
>>>>>>>>
>>>>>>>>
>>>>>>>>> +               }
>>>>>>>>>         }
>>>>>>>>>         hrtick_update(rq);
>>>>>>>>>  }
>>>>>>>>> @@ -4959,8 +4999,6 @@ static int find_new_capacity(struct energy_env *eenv,
>>>>>>>>>         return idx;
>>>>>>>>>  }
>>>>>>>>>
>>>>>>>>> -static unsigned int capacity_margin = 1280; /* ~20% margin */
>>>>>>>>> -
>>>>>>>>>  static bool cpu_overutilized(int cpu)
>>>>>>>>>  {
>>>>>>>>>         return (capacity_of(cpu) * 1024) <
>>>>>>>>> --
>>>>>>>>> 1.9.1
>>>>>>>>>
>>>>>>>>
>>>>>>>
>>>>>>
>>>>>
>>>>
>>>
>>
>
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Juri Lelli Aug. 14, 2015, 11:39 a.m. UTC | #12
Hi Vincent,

On 13/08/15 13:08, Vincent Guittot wrote:
> On 12 August 2015 at 17:15, Juri Lelli <juri.lelli@arm.com> wrote:
>> On 11/08/15 17:37, Vincent Guittot wrote:
>>> On 11 August 2015 at 17:07, Juri Lelli <juri.lelli@arm.com> wrote:
>>>> Hi Vincent,
>>>>
>>>> On 11/08/15 12:41, Vincent Guittot wrote:
>>>>> On 11 August 2015 at 11:08, Juri Lelli <juri.lelli@arm.com> wrote:
>>>>>> On 10/08/15 16:07, Vincent Guittot wrote:
>>>>>>> On 10 August 2015 at 15:43, Juri Lelli <juri.lelli@arm.com> wrote:
>>>>>>>>
>>>>>>>> Hi Vincent,
>>>>>>>>
>>>>>>>> On 04/08/15 14:41, Vincent Guittot wrote:
>>>>>>>>> Hi Juri,
>>>>>>>>>
>>>>>>>>> On 7 July 2015 at 20:24, Morten Rasmussen <morten.rasmussen@arm.com> wrote:
>>>>>>>>>> From: Juri Lelli <juri.lelli@arm.com>
> 
>  [snip]
> 
>>>>>>>>>>  }
>>>>>>>>>> @@ -4393,6 +4416,23 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>>>>>>>>>>         if (!se) {
>>>>>>>>>>                 sub_nr_running(rq, 1);
>>>>>>>>>>                 update_rq_runnable_avg(rq, 1);
>>>>>>>>>> +               /*
>>>>>>>>>> +                * We want to trigger a freq switch request only for tasks that
>>>>>>>>>> +                * are going to sleep; this is because we get here also during
>>>>>>>>>> +                * load balancing, but in these cases it seems wise to trigger
>>>>>>>>>> +                * as single request after load balancing is done.
>>>>>>>>>> +                *
>>>>>>>>>> +                * Also, we add a margin (same ~20% used for the tipping point)
>>>>>>>>>> +                * to our request to provide some head room if p's utilization
>>>>>>>>>> +                * further increases.
>>>>>>>>>> +                */
>>>>>>>>>> +               if (sched_energy_freq() && task_sleep) {
>>>>>>>>>> +                       unsigned long req_cap = get_cpu_usage(cpu_of(rq));
>>>>>>>>>> +
>>>>>>>>>> +                       req_cap = req_cap * capacity_margin
>>>>>>>>>> +                                       >> SCHED_CAPACITY_SHIFT;
>>>>>>>>>> +                       cpufreq_sched_set_cap(cpu_of(rq), req_cap);
>>>>>>>>>
>>>>>>>>> Could you clarify why you want to trig a freq switch for tasks that
>>>>>>>>> are going to sleep ?
>>>>>>>>> The cpu_usage should not changed that much as the se_utilization of
>>>>>>>>> the entity moves from utilization_load_avg to utilization_blocked_avg
>>>>>>>>> of the rq and the usage and the freq are updated periodically.
>>>>>>>>
>>>>>>>> I think we still need to cover multiple back-to-back dequeues. Suppose
>>>>>>>> that you have, let's say, 3 tasks that get enqueued at the same time.
>>>>>>>> After some time the first one goes to sleep and its utilization, as you
>>>>>>>> say, gets moved to utilization_blocked_avg. So, nothing changes, and
>>>>>>>> the trigger is superfluous (even if no freq change I guess will be
>>>>>>>> issued as we are already servicing enough capacity). However, after a
>>>>>>>> while, the second task goes to sleep. Now we still use get_cpu_usage()
>>>>>>>> and the first task contribution in utilization_blocked_avg should have
>>>>>>>> been decayed by this time. Same thing may than happen for the third task
>>>>>>>> as well. So, if we don't check if we need to scale down in
>>>>>>>> dequeue_task_fair, it seems to me that we might miss some opportunities,
>>>>>>>> as blocked contribution of other tasks could have been successively
>>>>>>>> decayed.
>>>>>>>>
>>>>>>>> What you think?
>>>>>>>
>>>>>>> The tick is used to monitor such variation of the usage (in both way,
>>>>>>> decay of the usage of sleeping tasks and increase of the usage of
>>>>>>> running tasks). So in your example, if the duration between the sleep
>>>>>>> of the 2 tasks is significant enough, the tick will handle this
>>>>>>> variation
>>>>>>>
>>>>>>
>>>>>> The tick is used to decide if we need to scale up (to max OPP for the
>>>>>> time being), but we don't scale down. It makes more logical sense to
>>>>>
>>>>> why don't you want to check if you need to scale down ?
>>>>>
>>>>
>>>> Well, because if I'm still executing something the cpu usage is only
>>>> subject to raise.
>>>
>>> This is only true for  system with NO HZ idle
>>>
>>
>> Well, even with !NO_HZ_IDLE usage only decreases when cpu is idle. But,
> 
> Well, thanks for this obvious statement that usage only decreases when
> cpu is idle but my question has never been about usage variation of
> idle/running cpu but about the tick.
> 

I'm sorry if I sounded haughty to you; of course I didn't want to, and
I apologize for that. I just wanted to state the obvious to confirm
to myself that I understood your point, as I say below. :)

>> I think I got your point; for !NO_HZ_IDLE configurations we might end
>> up not scaling down frequency even if we have the tick running and
>> the cpu is idle. I might need some more time to think this through, but
>> it seems to me that we are still fine without an explicit trigger in
>> task_tick_fair(); if we are running a !NO_HZ_IDLE system we are probably
>> not so much concerned about power savings and still we react
>> to tasks waking up, sleeping, leaving or moving around (which seems the
>> real important events to me); OTOH, we might add that trigger, but this
>> will generate unconditional checks at tick time for NO_HZ_IDLE
> 
> That will be far less critical than unconditionally check during all
> task wake up or sleep. A task that wakes up every 200us will generate
> much more check in the wake up hot path of the cpu that is already
> busy with another task
> 

We have a throttling threshold for this kind of problem, which is the
same as the transition latency exported by cpufreq drivers. Now, we
still do some operations before checking that threshold, and the check
itself might be too expensive. I guess I'll go back and profile it.

>> configurations, for a benefit that it seems to be still not completely
>> clear.
>>
>>>>
>>>>>> scale down at task deactivation, or wakeup after a long time, IMHO.
>>>>>
>>>>> But waking up or going to sleep don't have any impact on the usage of
>>>>> a cpu. The only events that impact the cpu usage are:
>>>>> -task migration,
>>>>
>>>> We explicitly cover this on load balancing paths.
> 
> But task can migrate out of the load balancing; At wake up for example
> and AFAICT, you don't use this event to notify the decrease of the
> usage of the cpu and check if a new OPP will fit better with the new
> usage.
> 

If the task gets wakeup migrated its request will be issued as part
of the enqueue on the new cpu. If the cpu it was previously running
on is idle, it has already cleared its request, so it shouldn't
need any notification.

>>>>
>>>>> -new task
>>>>
>>>> We cover this in enqueue_task_fair(), introducing a new flag.
>>>>
>>>>> -time that elapse which can be monitored by periodically checking the usage.
>>>>
>>>> Do you mean when a task utilization crosses some threshold
>>>> related to the current OPP? If that is the case, we have a
>>>> check in task_tick_fair().
>>>>
>>>>> -and for nohz system when cpu enter or leave idle state
>>>>>
>>>>
>>>> We address this in dequeue_task_fair(). In particular, if
>>>> the cpu is going to be idle we don't trigger any change as
>>>> it seems not always wise to wake up a thread to just change
>>>> the OPP and the go idle; some platforms might require this
>>>> behaviour anyway, but it probably more cpuidle/fw related?
>>>
>>> I would say that it's interesting to notifiy sched-dvfs that a cpu
>>> becomes idle because we could decrease the opp of a cluster of cpus
>>> that share the same clock if this cpu is the one that requires the max
>>> capacity of the cluster (and other cpus are still running).
>>>
>>
>> Well, we reset the capacity request of the cpu that is going idle.
> 
> And i'm fine with the fact that you use the cpu idle event
> 

Ok :).

>> The idea is that the next event on one of the other related cpus
>> will update the cluster freq correctly. If any other cpu in the
>> cluster is running something we keep the same frequency until
>> the task running on that cpu goes to sleep; this seems fine to
>> me because that task might end up being heavy and we saved a
>> back to back lower to higher OPP switch; if the task is instead
>> light it will probably be dequeued pretty soon, and at that time
>> we switch to a lower OPP (since we cleared the idle cpu request
>> before). Also, if the other cpus in the cluster are all idle
>> we'll most probably enter an idle state, so no freq switch is
>> most likely required.
>>
>>>>
>>>> I would also add:
>>>>
>>>> - task is going to die
>>>>
>>>> We address this in dequeue as well, as its contribution is
>>>> removed from usage (mod Yuyang's patches).
>>>>
>>>>> waking up and going to sleep events doesn't give any useful
>>>>> information and using them to trig the monitoring of the usage
>>>>> variation doesn't give you a predictable/periodic update of it whereas
>>>>> the tick will
>>>>>
>>>>
>>>> So, one key point of this solution is to get away as much
>>>> as we can from periodic updates/sampling and move towards a
>>>> (fully) event driven approach. The event logically associated
>>>> to task_tick_fair() is when we realize that a task is going
>>>> to saturate the current capacity; in this case we trigger a
>>>> freq switch to an higher capacity. Also, if we never react
>>>> to normal wakeups (as I understand you are proposing) we might
>>>> miss some chances to adapt quickly enough. As an example, if
>>>> you have a big task that suddenly goes to sleep, and sleeps
>>>> until its decayed utilization goes almost to zero; when it
>>>> wakes up, if we don't have a trigger in enqueue_task_fair(),
> 
> I'm not against having a trigger in enqueue, i'm against bindly
> checking all task wake up in order to be sure to catch the useful
> event like the cpu leave idle event of your example
> 

Right, agreed.

>>>> we'll have to wait until the next tick to select an appropriate
>>>> (low) OPP.
>>>
>>> I assume that the cpu is idle in this case. This situation only
>>> happens on Nohz idle system because tick is disable and you have to
>>> update statistics when leaving idle as it is done for the jiffies or
>>> the cpu_load array. So you should track cpu enter/leave idle (for nohz
>>> system only) instead of tracking all tasks wake up/sleep events.
>>>
>>
>> I think I already replied to this in what above. Did I? :)
> 
> In fact, It was not a question, i just state that using all wake up /
> sleep events to be sure to trig the check of cpu capacity when the cpu
> leave an idle phase (and especially a long one like in your example
> above), is wrong. You have to use the leave idle event instead of
> checking all wake up events to be sure to catch the right one.

It seems to me that we need to catch enqueue events even for cpus
that are already busy. Don't we have to see if we have to scale
up in case a task is enqueued after a wakeup or fork on a cpu that is
already running some other task?

Then, I agree that some other conditions might be added to check
that we are not over triggering the thing. I need to think about
those more.

> You say
> that you want to get away as much as possible the periodic
> updates/sampling but it's exactly what you do with these 2 events.
> Using them only enables you to periodically check if the capacity has
> changed since the last time like the tick already does. But instead of
> using a periodic and controlled event, you use these random (with
> regards to capacity evolution) and uncontrolled events in order to
> catch useful change. As an example, If a cpu run a task and there is a
> short running task that wakes up every 100us for running 100us, you
> will call cpufreq_sched_set_cap, 5000 times per second for no good
> reason as you already have the tick to periodically check the
> evolution of the usage.
> 

In this case it's the workload that is inherently periodic, so we
have periodic checks. Anyway, this is most probably one bailout
condition we need to add, since if both tasks are already running on
the same cpu, I guess, its utilization shouldn't change that much.

>>
>>> So you can either use update_cpu_load_nohz like it is already done for
>>> cpu_load array
>>> or you should use some conditions like below if you want to stay in
>>> enqueue/dequeue_task_fair but task wake up or sleep event are not the
>>> right condition
>>> if (!(flags & ENQUEUE_WAKEUP) || rq->nr_running == 1 ) in enqueue_task_fair
>>> and
>>> if (!task_sleep || rq->nr_running == 0) in dequeue_task_fair
>>>
>>> We can probably optimized by using  rq->cfs.h_nr_running instead of
>>> rq->nr_running as only cfs tasks really modifies the usage
>>>
>>
>> I already filter out enqueues/dequeues that comes from load balancing;
>> and I use cfs.nr_running because, as you say, we currently work with CFS
>> tasks only.
> 
> But not for the enqueue where you should use it instead of all wake up events.
> 
> Just to be clear: Using all enqueue/dequeue events (including task
> wake up and sleep) to check a change of the usage of a cpu was doing a
> lot of sense when mike has sent his v3 of the scheduler driven cpu
> frequency selection because the usage was not accounting the blocked
> tasks at that time so it was changing for all enqueue/dequeue events.
> But this doesn't make sense in this patchset that account the blocked
> tasks in the usage of the cpu and more generally now that yuyang's
> patch set has been accepted.
> 

As above, I agree that we are most probably missing some optimizations;
that's what I'm going to look at again.

Thanks,

- Juri

> Regards,
> Vincent
> 
>>
>> Thanks,
>>
>> - Juri
>>
>>> Regards,
>>> Vincent
>>>
>>>>
>>>> Best,
>>>>
>>>> - Juri
>>>>
>>>>>>
>>>>>> Best,
>>>>>>
>>>>>> - Juri
>>>>>>
>>>>>>> Regards,
>>>>>>> Vincent
>>>>>>>>
>>>>>>>> Thanks,
>>>>>>>>
>>>>>>>> - Juri
>>>>>>>>
>>>>>>>>> It should be the same for the wake up of a task in enqueue_task_fair
>>>>>>>>> above, even if it's less obvious for this latter use case because the
>>>>>>>>> cpu might wake up from a long idle phase during which its
>>>>>>>>> utilization_blocked_avg has not been updated. Nevertheless, a trig of
>>>>>>>>> the freq switch at wake up of the cpu once its usage has been updated
>>>>>>>>> should do the job.
>>>>>>>>>
>>>>>>>>> So tick, migration of tasks, new tasks, entering/leaving idle state of
>>>>>>>>> cpu should be enough to trig freq switch
>>>>>>>>>
>>>>>>>>> Regards,
>>>>>>>>> Vincent
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>> +               }
>>>>>>>>>>         }
>>>>>>>>>>         hrtick_update(rq);
>>>>>>>>>>  }
>>>>>>>>>> @@ -4959,8 +4999,6 @@ static int find_new_capacity(struct energy_env *eenv,
>>>>>>>>>>         return idx;
>>>>>>>>>>  }
>>>>>>>>>>
>>>>>>>>>> -static unsigned int capacity_margin = 1280; /* ~20% margin */
>>>>>>>>>> -
>>>>>>>>>>  static bool cpu_overutilized(int cpu)
>>>>>>>>>>  {
>>>>>>>>>>         return (capacity_of(cpu) * 1024) <
>>>>>>>>>> --
>>>>>>>>>> 1.9.1
>>>>>>>>>>
>>>>>>>>>
>>>>>>>>
>>>>>>>
>>>>>>
>>>>>
>>>>
>>>
>>
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Peter Zijlstra Aug. 15, 2015, 12:48 p.m. UTC | #13
So this OPP thing, I think that got mentioned once earlier in this patch
set, wth is that?
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Michael Turquette Aug. 16, 2015, 3:50 a.m. UTC | #14
Quoting Peter Zijlstra (2015-08-15 05:48:17)
> 
> 
> So this OPP thing, I think that got mentioned once earlier in this patch
> set, wth is that?

OPP == OPerating Point == P-state

In System-on-chip Land OPP is a very common term, roughly defined as a
frequency & voltage pair that makes up a performance state.

In other words, OPP is the P-state of the non-ACPI world.

Similarly DVFS is sometimes confused as a brand new file system, but it
is also a very standardized acronym amongst SoC vendors meaning
frequency and voltage scaling.

Regards,
Mike

> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Vincent Guittot Aug. 17, 2015, 9:43 a.m. UTC | #15
On 14 August 2015 at 13:39, Juri Lelli <juri.lelli@arm.com> wrote:
> Hi vincent,
>
> On 13/08/15 13:08, Vincent Guittot wrote:
>> On 12 August 2015 at 17:15, Juri Lelli <juri.lelli@arm.com> wrote:
>>> On 11/08/15 17:37, Vincent Guittot wrote:
>>>> On 11 August 2015 at 17:07, Juri Lelli <juri.lelli@arm.com> wrote:
>>>>> Hi Vincent,
>>>>>
>>>>> On 11/08/15 12:41, Vincent Guittot wrote:
>>>>>> On 11 August 2015 at 11:08, Juri Lelli <juri.lelli@arm.com> wrote:
>>>>>>> On 10/08/15 16:07, Vincent Guittot wrote:
>>>>>>>> On 10 August 2015 at 15:43, Juri Lelli <juri.lelli@arm.com> wrote:
>>>>>>>>>
>>>>>>>>> Hi Vincent,
>>>>>>>>>
>>>>>>>>> On 04/08/15 14:41, Vincent Guittot wrote:
>>>>>>>>>> Hi Juri,
>>>>>>>>>>
>>>>>>>>>> On 7 July 2015 at 20:24, Morten Rasmussen <morten.rasmussen@arm.com> wrote:
>>>>>>>>>>> From: Juri Lelli <juri.lelli@arm.com>
>>
>>  [snip]
>>
>>>>>>>>>>>  }
>>>>>>>>>>> @@ -4393,6 +4416,23 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
>>>>>>>>>>>         if (!se) {
>>>>>>>>>>>                 sub_nr_running(rq, 1);
>>>>>>>>>>>                 update_rq_runnable_avg(rq, 1);
>>>>>>>>>>> +               /*
>>>>>>>>>>> +                * We want to trigger a freq switch request only for tasks that
>>>>>>>>>>> +                * are going to sleep; this is because we get here also during
>>>>>>>>>>> +                * load balancing, but in these cases it seems wise to trigger
>>>>>>>>>>> +                * as single request after load balancing is done.
>>>>>>>>>>> +                *
>>>>>>>>>>> +                * Also, we add a margin (same ~20% used for the tipping point)
>>>>>>>>>>> +                * to our request to provide some head room if p's utilization
>>>>>>>>>>> +                * further increases.
>>>>>>>>>>> +                */
>>>>>>>>>>> +               if (sched_energy_freq() && task_sleep) {
>>>>>>>>>>> +                       unsigned long req_cap = get_cpu_usage(cpu_of(rq));
>>>>>>>>>>> +
>>>>>>>>>>> +                       req_cap = req_cap * capacity_margin
>>>>>>>>>>> +                                       >> SCHED_CAPACITY_SHIFT;
>>>>>>>>>>> +                       cpufreq_sched_set_cap(cpu_of(rq), req_cap);
>>>>>>>>>>
>>>>>>>>>> Could you clarify why you want to trig a freq switch for tasks that
>>>>>>>>>> are going to sleep ?
>>>>>>>>>> The cpu_usage should not changed that much as the se_utilization of
>>>>>>>>>> the entity moves from utilization_load_avg to utilization_blocked_avg
>>>>>>>>>> of the rq and the usage and the freq are updated periodically.
>>>>>>>>>
>>>>>>>>> I think we still need to cover multiple back-to-back dequeues. Suppose
>>>>>>>>> that you have, let's say, 3 tasks that get enqueued at the same time.
>>>>>>>>> After some time the first one goes to sleep and its utilization, as you
>>>>>>>>> say, gets moved to utilization_blocked_avg. So, nothing changes, and
>>>>>>>>> the trigger is superfluous (even if no freq change I guess will be
>>>>>>>>> issued as we are already servicing enough capacity). However, after a
>>>>>>>>> while, the second task goes to sleep. Now we still use get_cpu_usage()
>>>>>>>>> and the first task contribution in utilization_blocked_avg should have
>>>>>>>>> been decayed by this time. Same thing may than happen for the third task
>>>>>>>>> as well. So, if we don't check if we need to scale down in
>>>>>>>>> dequeue_task_fair, it seems to me that we might miss some opportunities,
>>>>>>>>> as blocked contribution of other tasks could have been successively
>>>>>>>>> decayed.
>>>>>>>>>
>>>>>>>>> What you think?
>>>>>>>>
>>>>>>>> The tick is used to monitor such variation of the usage (in both way,
>>>>>>>> decay of the usage of sleeping tasks and increase of the usage of
>>>>>>>> running tasks). So in your example, if the duration between the sleep
>>>>>>>> of the 2 tasks is significant enough, the tick will handle this
>>>>>>>> variation
>>>>>>>>
>>>>>>>
>>>>>>> The tick is used to decide if we need to scale up (to max OPP for the
>>>>>>> time being), but we don't scale down. It makes more logical sense to
>>>>>>
>>>>>> why don't you want to check if you need to scale down ?
>>>>>>
>>>>>
>>>>> Well, because if I'm still executing something the cpu usage is only
>>>>> subject to raise.
>>>>
>>>> This is only true for  system with NO HZ idle
>>>>
>>>
>>> Well, even with !NO_HZ_IDLE usage only decreases when cpu is idle. But,
>>
>> Well, thanks for this obvious statement that usage only decreases when
>> cpu is idle but my question has never been about usage variation of
>> idle/running cpu but about the tick.
>>
>
> I'm sorry if I sounded haughty to you, of course I didn't want too and
> I apologize for that. I just wanted to state the obvious to confirm
> myself that I understood your point, as I say below. :)
>
>>> I think I got your point; for !NO_HZ_IDLE configurations we might end
>>> up not scaling down frequency even if we have the tick running and
>>> the cpu is idle. I might need some more time to think this through, but
>>> it seems to me that we are still fine without an explicit trigger in
>>> task_tick_fair(); if we are running a !NO_HZ_IDLE system we are probably
>>> not so much concerned about power savings and still we react
>>> to tasks waking up, sleeping, leaving or moving around (which seems the
>>> real important events to me); OTOH, we might add that trigger, but this
>>> will generate unconditional checks at tick time for NO_HZ_IDLE
>>
>> That will be far less critical than unconditionally check during all
>> task wake up or sleep. A task that wakes up every 200us will generate
>> much more check in the wake up hot path of the cpu that is already
>> busy with another task
>>
>
> We have a throttling threshold for this kind of problem, which is the
> same as the transition latency exported by cpufreq drivers. Now, we
> still do some operations before checking that threshold, and the check
> itself might be too expensive. I guess I'll go back and profile it.

So, you take the cpufreq driver transition latency into account, but
you should also take the scheduler and its statistics' responsiveness,
as the transition latency can be small (less than a few hundreds of
us). As an example, the variation of the usage of a cpu is always less
than 20% over a period of 10ms, and the variation is less than 10% if
the usage is already above 50% (this is also weighted by the current
frequency). This variation is in the same range as or lower than the
margin you take when setting the current usage of a cpu. As most
systems have a tick period less than or equal to 10ms, I'm not sure
that it's worth ensuring a periodic check of the usage that is shorter
than a tick (or than the transition latency if the latter is greater
than the tick).

>
>>> configurations, for a benefit that it seems to be still not completely
>>> clear.
>>>
>>>>>
>>>>>>> scale down at task deactivation, or wakeup after a long time, IMHO.
>>>>>>
>>>>>> But waking up or going to sleep don't have any impact on the usage of
>>>>>> a cpu. The only events that impact the cpu usage are:
>>>>>> -task migration,
>>>>>
>>>>> We explicitly cover this on load balancing paths.
>>
>> But task can migrate out of the load balancing; At wake up for example
>> and AFAICT, you don't use this event to notify the decrease of the
>> usage of the cpu and check if a new OPP will fit better with the new
>> usage.
>>
>
> If the task gets wakeup migrated its request will be issued as part
> of the enqueue on the new cpu. If the cpu it was previously running
> on is idle, it has already cleared its request, so it shouldn't
> need any notification.

Such migration occurs most of the time when the source CPU is already
used by another task, so the scheduler looks for an idle sibling cpu.
In this case, you can't use the idle event of the source cpu; you have
to use the migration event. I agree that you will catch the increase
of usage of the dest cpu in the enqueue, but you don't catch the
removal of usage from the source cpu, whereas this event is a good one
IMHO to check if we can change the OPP, as the usage will have an
abrupt change
>
>>>>>
>>>>>> -new task
>>>>>
>>>>> We cover this in enqueue_task_fair(), introducing a new flag.
>>>>>
>>>>>> -time that elapse which can be monitored by periodically checking the usage.
>>>>>
>>>>> Do you mean when a task utilization crosses some threshold
>>>>> related to the current OPP? If that is the case, we have a
>>>>> check in task_tick_fair().
>>>>>
>>>>>> -and for nohz system when cpu enter or leave idle state
>>>>>>
>>>>>
>>>>> We address this in dequeue_task_fair(). In particular, if
>>>>> the cpu is going to be idle we don't trigger any change as
>>>>> it seems not always wise to wake up a thread to just change
>>>>> the OPP and the go idle; some platforms might require this
>>>>> behaviour anyway, but it probably more cpuidle/fw related?
>>>>
>>>> I would say that it's interesting to notifiy sched-dvfs that a cpu
>>>> becomes idle because we could decrease the opp of a cluster of cpus
>>>> that share the same clock if this cpu is the one that requires the max
>>>> capacity of the cluster (and other cpus are still running).
>>>>
>>>
>>> Well, we reset the capacity request of the cpu that is going idle.
>>
>> And i'm fine with the fact that you use the cpu idle event
>>
>
> Ok :).
>
>>> The idea is that the next event on one of the other related cpus
>>> will update the cluster freq correctly. If any other cpu in the
>>> cluster is running something we keep the same frequency until
>>> the task running on that cpu goes to sleep; this seems fine to
>>> me because that task might end up being heavy and we saved a
>>> back to back lower to higher OPP switch; if the task is instead
>>> light it will probably be dequeued pretty soon, and at that time
>>> we switch to a lower OPP (since we cleared the idle cpu request
>>> before). Also, if the other cpus in the cluster are all idle
>>> we'll most probably enter an idle state, so no freq switch is
>>> most likely required.
>>>
>>>>>
>>>>> I would also add:
>>>>>
>>>>> - task is going to die
>>>>>
>>>>> We address this in dequeue as well, as its contribution is
>>>>> removed from usage (mod Yuyang's patches).
>>>>>
>>>>>> waking up and going to sleep events doesn't give any useful
>>>>>> information and using them to trig the monitoring of the usage
>>>>>> variation doesn't give you a predictable/periodic update of it whereas
>>>>>> the tick will
>>>>>>
>>>>>
>>>>> So, one key point of this solution is to get away as much
>>>>> as we can from periodic updates/sampling and move towards a
>>>>> (fully) event driven approach. The event logically associated
>>>>> to task_tick_fair() is when we realize that a task is going
>>>>> to saturate the current capacity; in this case we trigger a
>>>>> freq switch to an higher capacity. Also, if we never react
>>>>> to normal wakeups (as I understand you are proposing) we might
>>>>> miss some chances to adapt quickly enough. As an example, if
>>>>> you have a big task that suddenly goes to sleep, and sleeps
>>>>> until its decayed utilization goes almost to zero; when it
>>>>> wakes up, if we don't have a trigger in enqueue_task_fair(),
>>
>> I'm not against having a trigger in enqueue, i'm against bindly
>> checking all task wake up in order to be sure to catch the useful
>> event like the cpu leave idle event of your example
>>
>
> Right, agreed.
>
>>>>> we'll have to wait until the next tick to select an appropriate
>>>>> (low) OPP.
>>>>
>>>> I assume that the cpu is idle in this case. This situation only
>>>> happens on Nohz idle system because tick is disable and you have to
>>>> update statistics when leaving idle as it is done for the jiffies or
>>>> the cpu_load array. So you should track cpu enter/leave idle (for nohz
>>>> system only) instead of tracking all tasks wake up/sleep events.
>>>>
>>>
>>> I think I already replied to this in what above. Did I? :)
>>
>> In fact, it was not a question; I just stated that using all wake up /
>> sleep events to be sure to trigger the check of cpu capacity when the cpu
>> leaves an idle phase (and especially a long one like in your example
>> above) is wrong. You have to use the leave-idle event instead of
>> checking all wake up events to be sure to catch the right one.
>
> It seems to me that we need to catch enqueue events even for cpus
> that are already busy. Don't we have to see if we have to scale
> up in case a task is enqueued after a wakeup or fork on a cpu that is
> already running some other task?

The enqueue of a waking up task doesn't change the usage of a cpu, so
there is no reason to use this event instead of a periodic one. On the
contrary, forked, migrated and new tasks will change the usage, so it's
worth using them

>
> Then, I agree that some other conditions might be added to check
> that we are not over triggering the thing. I need to think about
> those more.

ok

>
>> You say
>> that you want to get away as much as possible the periodic
>> updates/sampling but it's exactly what you do with these 2 events.
>> Using them only enables you to periodically check if the capacity has
>> changed since the last time like the tick already does. But instead of
>> using a periodic and controlled event, you use these random (with
>> regards to capacity evolution) and uncontrolled events in order to
>> catch useful changes. As an example, if a cpu runs a task and there is a
>> short running task that wakes up every 100us and runs for 100us, you
>> will call cpufreq_sched_set_cap 5000 times per second for no good
>> reason, as you already have the tick to periodically check the
>> evolution of the usage.
>>
>
> In this case it's the workload that is inherently periodic, so we
> have periodic checks. Anyway, this is most probably one bailout
> condition we need to add, since if both task are already running on
> the same cpu, I guess, its utilization shouldn't change that much.

IMHO, the tick should be enough to periodically check whether the change
of the usage value requires an OPP update with regard to the rate of
change of the usage.

>
>>>
>>>> So you can either use update_cpu_load_nohz like it is already done for
>>>> cpu_load array
>>>> or you should use some conditions like below if you want to stay in
>>>> enqueue/dequeue_task_fair but task wake up or sleep event are not the
>>>> right condition
>>>> if (!(flags & ENQUEUE_WAKEUP) || rq->nr_running == 1 ) in enqueue_task_fair
>>>> and
>>>> if (!task_sleep || rq->nr_running == 0) in dequeue_task_fair
>>>>
>>>> We can probably optimized by using  rq->cfs.h_nr_running instead of
>>>> rq->nr_running as only cfs tasks really modifies the usage
>>>>
>>>
>>> I already filter out enqueues/dequeues that comes from load balancing;
>>> and I use cfs.nr_running because, as you say, we currently work with CFS
>>> tasks only.
>>
>> But not for the enqueue where you should use it instead of all wake up events.
>>
>> Just to be clear: using all enqueue/dequeue events (including task
>> wake up and sleep) to check a change of the usage of a cpu made a
>> lot of sense when Mike sent his v3 of the scheduler driven cpu
>> frequency selection, because the usage did not account for the blocked
>> tasks at that time, so it was changing for all enqueue/dequeue events.
>> But this doesn't make sense in this patchset, which accounts for the
>> blocked tasks in the usage of the cpu, and more generally now that
>> Yuyang's patch set has been accepted.
>>
>
> As above, I agree that we are most probably missing some optimizations;
> that's what I'm going to look at again.

Thanks,
Vincent

>
> Thanks,
>
> - Juri
>
>> Regards,
>> Vincent
>>
>>>
>>> Thanks,
>>>
>>> - Juri
>>>
>>>> Regards,
>>>> Vincent
>>>>
>>>>>
>>>>> Best,
>>>>>
>>>>> - Juri
>>>>>
>>>>>>>
>>>>>>> Best,
>>>>>>>
>>>>>>> - Juri
>>>>>>>
>>>>>>>> Regards,
>>>>>>>> Vincent
>>>>>>>>>
>>>>>>>>> Thanks,
>>>>>>>>>
>>>>>>>>> - Juri
>>>>>>>>>
>>>>>>>>>> It should be the same for the wake up of a task in enqueue_task_fair
>>>>>>>>>> above, even if it's less obvious for this latter use case because the
>>>>>>>>>> cpu might wake up from a long idle phase during which its
>>>>>>>>>> utilization_blocked_avg has not been updated. Nevertheless, triggering
>>>>>>>>>> the freq switch at wake up of the cpu once its usage has been updated
>>>>>>>>>> should do the job.
>>>>>>>>>>
>>>>>>>>>> So tick, migration of tasks, new tasks, entering/leaving idle state of
>>>>>>>>>> cpu should be enough to trigger freq switches
>>>>>>>>>>
>>>>>>>>>> Regards,
>>>>>>>>>> Vincent
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>> +               }
>>>>>>>>>>>         }
>>>>>>>>>>>         hrtick_update(rq);
>>>>>>>>>>>  }
>>>>>>>>>>> @@ -4959,8 +4999,6 @@ static int find_new_capacity(struct energy_env *eenv,
>>>>>>>>>>>         return idx;
>>>>>>>>>>>  }
>>>>>>>>>>>
>>>>>>>>>>> -static unsigned int capacity_margin = 1280; /* ~20% margin */
>>>>>>>>>>> -
>>>>>>>>>>>  static bool cpu_overutilized(int cpu)
>>>>>>>>>>>  {
>>>>>>>>>>>         return (capacity_of(cpu) * 1024) <
>>>>>>>>>>> --
>>>>>>>>>>> 1.9.1
>>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>
>>>>>>>>
>>>>>>>
>>>>>>
>>>>>
>>>>
>>>
>>
>
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Rafael J. Wysocki Aug. 17, 2015, 6:22 p.m. UTC | #16
On Saturday, August 15, 2015 02:48:17 PM Peter Zijlstra wrote:
> 
> So this OPP thing, I think that got mentioned once earlier in this patch
> set, wth is that?

OPP stands for Operating Performance Points.  It is a library for representing
working clock-voltage combinations.

Described in Documentation/power/opp.txt (but may be outdated as there's some
work on it going on).

CC Viresh who's working on it right now.

Thanks,
Rafael

--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f74e9d2..b8627c6 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4281,7 +4281,10 @@  static inline void hrtick_update(struct rq *rq)
 }
 #endif
 
+static unsigned int capacity_margin = 1280; /* ~20% margin */
+
 static bool cpu_overutilized(int cpu);
+static unsigned long get_cpu_usage(int cpu);
 struct static_key __sched_energy_freq __read_mostly = STATIC_KEY_INIT_FALSE;
 
 /*
@@ -4332,6 +4335,26 @@  enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		if (!task_new && !rq->rd->overutilized &&
 		    cpu_overutilized(rq->cpu))
 			rq->rd->overutilized = true;
+		/*
+		 * We want to trigger a freq switch request only for tasks that
+		 * are waking up; this is because we get here also during
+		 * load balancing, but in these cases it seems wise to trigger
+		 * as single request after load balancing is done.
+		 *
+		 * XXX: how about fork()? Do we need a special flag/something
+		 *      to tell if we are here after a fork() (wakeup_task_new)?
+		 *
+		 * Also, we add a margin (same ~20% used for the tipping point)
+		 * to our request to provide some head room if p's utilization
+		 * further increases.
+		 */
+		if (sched_energy_freq() && !task_new) {
+			unsigned long req_cap = get_cpu_usage(cpu_of(rq));
+
+			req_cap = req_cap * capacity_margin
+					>> SCHED_CAPACITY_SHIFT;
+			cpufreq_sched_set_cap(cpu_of(rq), req_cap);
+		}
 	}
 	hrtick_update(rq);
 }
@@ -4393,6 +4416,23 @@  static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	if (!se) {
 		sub_nr_running(rq, 1);
 		update_rq_runnable_avg(rq, 1);
+		/*
+		 * We want to trigger a freq switch request only for tasks that
+		 * are going to sleep; this is because we get here also during
+		 * load balancing, but in these cases it seems wise to trigger
+		 * as single request after load balancing is done.
+		 *
+		 * Also, we add a margin (same ~20% used for the tipping point)
+		 * to our request to provide some head room if p's utilization
+		 * further increases.
+		 */
+		if (sched_energy_freq() && task_sleep) {
+			unsigned long req_cap = get_cpu_usage(cpu_of(rq));
+
+			req_cap = req_cap * capacity_margin
+					>> SCHED_CAPACITY_SHIFT;
+			cpufreq_sched_set_cap(cpu_of(rq), req_cap);
+		}
 	}
 	hrtick_update(rq);
 }
@@ -4959,8 +4999,6 @@  static int find_new_capacity(struct energy_env *eenv,
 	return idx;
 }
 
-static unsigned int capacity_margin = 1280; /* ~20% margin */
-
 static bool cpu_overutilized(int cpu)
 {
 	return (capacity_of(cpu) * 1024) <