| Message ID | 20140529095024.GF11074@laptop.programming.kicks-ass.net (mailing list archive) |
|---|---|
| State | New, archived |
On 29 May 2014 11:50, Peter Zijlstra <peterz@infradead.org> wrote:
> On Fri, May 23, 2014 at 05:53:04PM +0200, Vincent Guittot wrote:
>> If the CPU is used for handling lot of IRQs, trig a load balance to check if
>> it's worth moving its tasks on another CPU that has more capacity
>>
>> Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
>> ---
>>  kernel/sched/fair.c | 13 +++++++++++++
>>  1 file changed, 13 insertions(+)
>>
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index e8a30f9..2501e49 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -5948,6 +5948,13 @@ static bool update_sd_pick_busiest(struct lb_env *env,
>>      if (sgs->sum_nr_running > sgs->group_capacity)
>>          return true;
>>
>> +    /*
>> +     * The group capacity is reduced probably because of activity from other
>> +     * sched class or interrupts which use part of the available capacity
>> +     */
>> +    if ((sg->sgp->power_orig * 100) > (sgs->group_power * env->sd->imbalance_pct))
>> +        return true;
>> +
>>      if (sgs->group_imb)
>>          return true;
>>
>
> But we should already do this because the load numbers are scaled with
> the power/capacity figures. If one CPU gets significantly less time to run
> fair tasks, its effective load would spike and it'd get to be selected
> here anyway.
>
> Or am I missing something?

The CPU could have been picked when the capacity becomes null (which
occurs when the cpu_power goes below half the default SCHED_POWER_SCALE).
And even after that, there were some conditions in find_busiest_group
that were bypassing this busiest group.

>
>> @@ -7282,6 +7289,12 @@ static inline int nohz_kick_needed(struct rq *rq)
>>
>>          if (nr_busy > 1)
>>              goto need_kick_unlock;
>> +
>> +        if ((rq->cfs.h_nr_running >= 1)
>> +         && ((rq->cpu_power * sd->imbalance_pct) <
>> +                    (rq->cpu_power_orig * 100)))
>> +            goto need_kick_unlock;
>> +
>>      }
>>
>>      sd = rcu_dereference(per_cpu(sd_asym, cpu));
>
> OK, so there you're kicking the idle balancer to try and get another CPU
> to pull some load? That makes sense I suppose.

Yes, and especially if we have idle CPUs and one task on the CPU with
reduced capacity.

>
> That function is pretty horrible though; how about something like this
> first?

OK, I will integrate this modification in the next version.

>
> ---
>  kernel/sched/fair.c | 29 +++++++++++++----------------
>  1 file changed, 13 insertions(+), 16 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index c9617b73bcc0..47fb96e6fa83 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -7215,15 +7215,16 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
>   *   - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
>   *     domain span are idle.
>   */
> -static inline int nohz_kick_needed(struct rq *rq)
> +static inline bool nohz_kick_needed(struct rq *rq)
>  {
>      unsigned long now = jiffies;
>      struct sched_domain *sd;
>      struct sched_group_power *sgp;
>      int nr_busy, cpu = rq->cpu;
> +    bool kick = false;
>
>      if (unlikely(rq->idle_balance))
> -        return 0;
> +        return false;
>
>      /*
>       * We may be recently in ticked or tickless idle mode. At the first
> @@ -7237,38 +7238,34 @@ static inline int nohz_kick_needed(struct rq *rq)
>       * balancing.
>       */
>      if (likely(!atomic_read(&nohz.nr_cpus)))
> -        return 0;
> +        return false;
>
>      if (time_before(now, nohz.next_balance))
> -        return 0;
> +        return false;
>
>      if (rq->nr_running >= 2)
> -        goto need_kick;
> +        return true;
>
>      rcu_read_lock();
>      sd = rcu_dereference(per_cpu(sd_busy, cpu));
> -
>      if (sd) {
>          sgp = sd->groups->sgp;
>          nr_busy = atomic_read(&sgp->nr_busy_cpus);
>
> -        if (nr_busy > 1)
> -            goto need_kick_unlock;
> +        if (nr_busy > 1) {
> +            kick = true;
> +            goto unlock;
> +        }
>      }
>
>      sd = rcu_dereference(per_cpu(sd_asym, cpu));
> -
>      if (sd && (cpumask_first_and(nohz.idle_cpus_mask,
>                    sched_domain_span(sd)) < cpu))
> -        goto need_kick_unlock;
> -
> -    rcu_read_unlock();
> -    return 0;
> +        kick = true;
>
> -need_kick_unlock:
> +unlock:
>      rcu_read_unlock();
> -need_kick:
> -    return 1;
> +    return kick;
>  }
>  #else
>  static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { }
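To make the threshold used in the two new checks above concrete, here is a small standalone sketch (plain C, not kernel code) of the arithmetic; the power values are invented and imbalance_pct = 125 is assumed as a typical sched-domain default.

```c
#include <stdbool.h>
#include <stdio.h>

/*
 * Same condition as in the patch: flag the CPU when
 * cpu_power * imbalance_pct < cpu_power_orig * 100, i.e. when the
 * capacity left for CFS tasks drops below 100/imbalance_pct of the
 * original capacity (80% with the assumed imbalance_pct of 125).
 */
static bool capacity_significantly_reduced(unsigned long cpu_power,
                                           unsigned long cpu_power_orig,
                                           unsigned int imbalance_pct)
{
    return cpu_power * imbalance_pct < cpu_power_orig * 100;
}

int main(void)
{
    const unsigned int imbalance_pct = 125; /* assumed default */

    /* ~12% of the capacity stolen by RT/IRQ: below the threshold, no kick */
    printf("power=900:  %d\n", capacity_significantly_reduced(900, 1024, imbalance_pct));
    /* most of the capacity stolen: the check fires */
    printf("power=330:  %d\n", capacity_significantly_reduced(330, 1024, imbalance_pct));
    return 0;
}
```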
On Thu, May 29, 2014 at 09:37:39PM +0200, Vincent Guittot wrote:
> On 29 May 2014 11:50, Peter Zijlstra <peterz@infradead.org> wrote:
> > On Fri, May 23, 2014 at 05:53:04PM +0200, Vincent Guittot wrote:
> >> If the CPU is used for handling lot of IRQs, trig a load balance to check if
> >> it's worth moving its tasks on another CPU that has more capacity
> >>
> >> Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
> >> ---
> >>  kernel/sched/fair.c | 13 +++++++++++++
> >>  1 file changed, 13 insertions(+)
> >>
> >> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> >> index e8a30f9..2501e49 100644
> >> --- a/kernel/sched/fair.c
> >> +++ b/kernel/sched/fair.c
> >> @@ -5948,6 +5948,13 @@ static bool update_sd_pick_busiest(struct lb_env *env,
> >>      if (sgs->sum_nr_running > sgs->group_capacity)
> >>          return true;
> >>
> >> +    /*
> >> +     * The group capacity is reduced probably because of activity from other
> >> +     * sched class or interrupts which use part of the available capacity
> >> +     */
> >> +    if ((sg->sgp->power_orig * 100) > (sgs->group_power * env->sd->imbalance_pct))
> >> +        return true;
> >> +
> >>      if (sgs->group_imb)
> >>          return true;
> >>
> >
> > But we should already do this because the load numbers are scaled with
> > the power/capacity figures. If one CPU gets significantly less time to run
> > fair tasks, its effective load would spike and it'd get to be selected
> > here anyway.
> >
> > Or am I missing something?
>
> The CPU could have been picked when the capacity becomes null (which
> occurs when the cpu_power goes below half the default SCHED_POWER_SCALE).
> And even after that, there were some conditions in find_busiest_group
> that were bypassing this busiest group.

Could you detail those conditions? FWIW those make excellent Changelog
material.
On 30 May 2014 08:29, Peter Zijlstra <peterz@infradead.org> wrote:
> On Thu, May 29, 2014 at 09:37:39PM +0200, Vincent Guittot wrote:
>> On 29 May 2014 11:50, Peter Zijlstra <peterz@infradead.org> wrote:
>> > On Fri, May 23, 2014 at 05:53:04PM +0200, Vincent Guittot wrote:
>> >> If the CPU is used for handling lot of IRQs, trig a load balance to check if
>> >> it's worth moving its tasks on another CPU that has more capacity
>> >>
>> >> Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
>> >> ---
>> >>  kernel/sched/fair.c | 13 +++++++++++++
>> >>  1 file changed, 13 insertions(+)
>> >>
>> >> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> >> index e8a30f9..2501e49 100644
>> >> --- a/kernel/sched/fair.c
>> >> +++ b/kernel/sched/fair.c
>> >> @@ -5948,6 +5948,13 @@ static bool update_sd_pick_busiest(struct lb_env *env,
>> >>      if (sgs->sum_nr_running > sgs->group_capacity)
>> >>          return true;
>> >>
>> >> +    /*
>> >> +     * The group capacity is reduced probably because of activity from other
>> >> +     * sched class or interrupts which use part of the available capacity
>> >> +     */
>> >> +    if ((sg->sgp->power_orig * 100) > (sgs->group_power * env->sd->imbalance_pct))
>> >> +        return true;
>> >> +
>> >>      if (sgs->group_imb)
>> >>          return true;
>> >>
>> >
>> > But we should already do this because the load numbers are scaled with
>> > the power/capacity figures. If one CPU gets significantly less time to run
>> > fair tasks, its effective load would spike and it'd get to be selected
>> > here anyway.
>> >
>> > Or am I missing something?
>>
>> The CPU could have been picked when the capacity becomes null (which
>> occurs when the cpu_power goes below half the default SCHED_POWER_SCALE).
>> And even after that, there were some conditions in find_busiest_group
>> that were bypassing this busiest group.
>
> Could you detail those conditions? FWIW those make excellent Changelog
> material.

I need to go back to my traces and use case to be sure that I point at the
right culprit, but IIRC the imbalance was null. I will come back with more
details once I'm back in front of the boards.
On 30 May 2014 08:29, Peter Zijlstra <peterz@infradead.org> wrote:
> On Thu, May 29, 2014 at 09:37:39PM +0200, Vincent Guittot wrote:
>> On 29 May 2014 11:50, Peter Zijlstra <peterz@infradead.org> wrote:
>> > On Fri, May 23, 2014 at 05:53:04PM +0200, Vincent Guittot wrote:
>> >> If the CPU is used for handling lot of IRQs, trig a load balance to check if
>> >> it's worth moving its tasks on another CPU that has more capacity
>> >>
>> >> Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
>> >> ---
>> >>  kernel/sched/fair.c | 13 +++++++++++++
>> >>  1 file changed, 13 insertions(+)
>> >>
>> >> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> >> index e8a30f9..2501e49 100644
>> >> --- a/kernel/sched/fair.c
>> >> +++ b/kernel/sched/fair.c
>> >> @@ -5948,6 +5948,13 @@ static bool update_sd_pick_busiest(struct lb_env *env,
>> >>      if (sgs->sum_nr_running > sgs->group_capacity)
>> >>          return true;
>> >>
>> >> +    /*
>> >> +     * The group capacity is reduced probably because of activity from other
>> >> +     * sched class or interrupts which use part of the available capacity
>> >> +     */
>> >> +    if ((sg->sgp->power_orig * 100) > (sgs->group_power * env->sd->imbalance_pct))
>> >> +        return true;
>> >> +
>> >>      if (sgs->group_imb)
>> >>          return true;
>> >>
>> >
>> > But we should already do this because the load numbers are scaled with
>> > the power/capacity figures. If one CPU gets significantly less time to run
>> > fair tasks, its effective load would spike and it'd get to be selected
>> > here anyway.
>> >
>> > Or am I missing something?
>>
>> The CPU could have been picked when the capacity becomes null (which
>> occurs when the cpu_power goes below half the default SCHED_POWER_SCALE).
>> And even after that, there were some conditions in find_busiest_group
>> that were bypassing this busiest group.
>
> Could you detail those conditions? FWIW those make excellent Changelog
> material.

I have looked back into my tests and traces:

In a 1st test, the capacity of the CPU was still above half the default
value (power=538), unlike what I remembered. So it's somewhat "normal" to
keep the task on CPU0, which also handles IRQs, because sg_capacity still
returns 1.

In a 2nd test, the main task runs (most of the time) on CPU0 whereas the
max power of the latter is only 623 and the cpu_power goes below 512
(power=330) during the use case. So the sg_capacity of CPU0 is null but
the main task still stays on CPU0.

The use case (scp transfer) is made of a long running task (ssh) and a
periodic short task (scp). ssh runs on CPU0 and scp runs every 6ms on
CPU1. The newly idle load balance on CPU1 doesn't pull the long running
task, although sg_capacity is null, because sd->nr_balance_failed is
never incremented and load_balance doesn't trigger an active
load_balance. When an idle balance occurs in the middle of the newly
idle balances, the ssh long task migrates to CPU1, but as soon as it
sleeps and wakes up, it goes back to CPU0 because of wake affine, which
migrates it back (issue solved by patch 09).

Vincent
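To put numbers on the "capacity becomes null" point above: assuming the group capacity is derived by rounding cpu_power to the nearest multiple of SCHED_POWER_SCALE (1024), which matches the "below half the default" threshold mentioned in the thread, the two traces land on opposite sides of the rounding boundary. A minimal sketch of that arithmetic:

```c
#include <stdio.h>

#define SCHED_POWER_SCALE 1024UL

/*
 * Round-to-nearest division, the same idea as the kernel's
 * DIV_ROUND_CLOSEST(): the resulting capacity goes to 0 once
 * cpu_power drops below SCHED_POWER_SCALE / 2.
 */
static unsigned long sg_capacity_of(unsigned long cpu_power)
{
    return (cpu_power + SCHED_POWER_SCALE / 2) / SCHED_POWER_SCALE;
}

int main(void)
{
    /* 1st test: power=538, still above 512, capacity stays at 1 */
    printf("power=538 -> capacity=%lu\n", sg_capacity_of(538));
    /* 2nd test: power=330, below 512, capacity becomes 0 (null) */
    printf("power=330 -> capacity=%lu\n", sg_capacity_of(330));
    return 0;
}
```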
On Mon, Jun 02, 2014 at 07:06:44PM +0200, Vincent Guittot wrote:
> > Could you detail those conditions? FWIW those make excellent Changelog
> > material.
>
> I have looked back into my tests and traces:
>
> In a 1st test, the capacity of the CPU was still above half the default
> value (power=538), unlike what I remembered. So it's somewhat "normal" to
> keep the task on CPU0, which also handles IRQs, because sg_capacity still
> returns 1.

OK, so I suspect that once we move to the utilization based capacity stuff
we'll do the migration IF the task indeed requires more cpu than can be
provided by the reduced one, right?

> In a 2nd test, the main task runs (most of the time) on CPU0 whereas the
> max power of the latter is only 623 and the cpu_power goes below 512
> (power=330) during the use case. So the sg_capacity of CPU0 is null but
> the main task still stays on CPU0.
>
> The use case (scp transfer) is made of a long running task (ssh) and a
> periodic short task (scp). ssh runs on CPU0 and scp runs every 6ms on
> CPU1. The newly idle load balance on CPU1 doesn't pull the long running
> task, although sg_capacity is null, because sd->nr_balance_failed is
> never incremented and load_balance doesn't trigger an active
> load_balance. When an idle balance occurs in the middle of the newly
> idle balances, the ssh long task migrates to CPU1, but as soon as it
> sleeps and wakes up, it goes back to CPU0 because of wake affine, which
> migrates it back (issue solved by patch 09).

OK, so there are two problems here, right?

 1) we don't migrate away from cpu0
 2) if we do, we get pulled back.

And patch 9 solves 2, so maybe enhance its changelog to mention this
slightly more explicitly.

Which leaves us with 1... an interesting problem. I'm just not sure
endlessly kicking a low capacity cpu is the right fix for that.
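As a hypothetical illustration of the utilization-based criterion alluded to above (migrate only if the task needs more CPU than the reduced capacity can provide): the function name, the 1024 scale and the 60% task figure below are assumptions for this sketch, not anything from the patchset.

```c
#include <stdbool.h>
#include <stdio.h>

/*
 * Sketch of the idea: a task fits on a CPU if its average utilization
 * (scaled so that 1024 = one full CPU) is no larger than the capacity
 * left over after RT/IRQ pressure.
 */
static bool task_fits_reduced_capacity(unsigned long task_util,
                                       unsigned long cpu_power)
{
    return task_util <= cpu_power;
}

int main(void)
{
    /* a task using ~60% of a CPU vs. the 330/1024 left in the 2nd trace */
    printf("fits on power=330:  %d\n", task_fits_reduced_capacity(614, 330));
    /* the same task easily fits on an idle CPU with full capacity */
    printf("fits on power=1024: %d\n", task_fits_reduced_capacity(614, 1024));
    return 0;
}
```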
On 3 June 2014 13:15, Peter Zijlstra <peterz@infradead.org> wrote:
> On Mon, Jun 02, 2014 at 07:06:44PM +0200, Vincent Guittot wrote:
>> > Could you detail those conditions? FWIW those make excellent Changelog
>> > material.
>>
>> I have looked back into my tests and traces:
>>
>> In a 1st test, the capacity of the CPU was still above half the default
>> value (power=538), unlike what I remembered. So it's somewhat "normal" to
>> keep the task on CPU0, which also handles IRQs, because sg_capacity still
>> returns 1.
>
> OK, so I suspect that once we move to the utilization based capacity stuff
> we'll do the migration IF the task indeed requires more cpu than can be
> provided by the reduced one, right?

The current version of the patchset only checks whether the capacity of a
CPU has been significantly reduced, in which case we should look for
another CPU. But we could effectively also compare the remaining capacity
with the task load.

>
>> In a 2nd test, the main task runs (most of the time) on CPU0 whereas the
>> max power of the latter is only 623 and the cpu_power goes below 512
>> (power=330) during the use case. So the sg_capacity of CPU0 is null but
>> the main task still stays on CPU0.
>>
>> The use case (scp transfer) is made of a long running task (ssh) and a
>> periodic short task (scp). ssh runs on CPU0 and scp runs every 6ms on
>> CPU1. The newly idle load balance on CPU1 doesn't pull the long running
>> task, although sg_capacity is null, because sd->nr_balance_failed is
>> never incremented and load_balance doesn't trigger an active
>> load_balance. When an idle balance occurs in the middle of the newly
>> idle balances, the ssh long task migrates to CPU1, but as soon as it
>> sleeps and wakes up, it goes back to CPU0 because of wake affine, which
>> migrates it back (issue solved by patch 09).
>
> OK, so there are two problems here, right?
>
>  1) we don't migrate away from cpu0
>  2) if we do, we get pulled back.
>
> And patch 9 solves 2, so maybe enhance its changelog to mention this
> slightly more explicitly.
>
> Which leaves us with 1... an interesting problem. I'm just not sure
> endlessly kicking a low capacity cpu is the right fix for that.

What prevents us from migrating the task directly is the fact that
nr_balance_failed is not incremented for newly idle balancing, and it's
the only condition for active migration (except for the asym feature).

We could add an additional test in need_active_balance for the newly_idle
load balance. Something like:

    if ((sd->flags & SD_SHARE_PKG_RESOURCES) &&
        (env->src_rq->cpu_power_orig * 100) >
                (env->src_rq->cpu_power * env->sd->imbalance_pct))
        return 1;
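For illustration only, a self-contained mock of how that suggested test would evaluate with the numbers from the second trace; the struct fields, the flag value and imbalance_pct = 125 are assumptions modelled on the thread, not the real kernel definitions.

```c
#include <stdio.h>

#define SD_SHARE_PKG_RESOURCES 0x0200  /* hypothetical flag value for the mock */

/* Minimal stand-ins for the kernel structures referenced in the snippet above */
struct mock_rq {
    unsigned long cpu_power;        /* capacity left for CFS tasks */
    unsigned long cpu_power_orig;   /* original capacity of the CPU */
};

struct mock_sd {
    int flags;
    unsigned int imbalance_pct;
};

struct mock_lb_env {
    struct mock_sd *sd;
    struct mock_rq *src_rq;
};

/* The proposed extra condition for newly-idle balancing */
static int need_active_balance_newidle(struct mock_lb_env *env)
{
    struct mock_sd *sd = env->sd;

    if ((sd->flags & SD_SHARE_PKG_RESOURCES) &&
        (env->src_rq->cpu_power_orig * 100) >
                (env->src_rq->cpu_power * sd->imbalance_pct))
        return 1;

    return 0;
}

int main(void)
{
    /* Numbers from the 2nd test: power_orig=623, power=330 */
    struct mock_rq cpu0 = { .cpu_power = 330, .cpu_power_orig = 623 };
    struct mock_sd sd = { .flags = SD_SHARE_PKG_RESOURCES, .imbalance_pct = 125 };
    struct mock_lb_env env = { .sd = &sd, .src_rq = &cpu0 };

    /* 623 * 100 = 62300 > 330 * 125 = 41250, so active balance is requested */
    printf("need_active_balance = %d\n", need_active_balance_newidle(&env));
    return 0;
}
```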